summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cgroup.h15
-rw-r--r--include/linux/delayacct.h23
-rw-r--r--include/linux/hmm.h2
-rw-r--r--include/linux/huge_mm.h8
-rw-r--r--include/linux/iomap.h4
-rw-r--r--include/linux/linkage.h1
-rw-r--r--include/linux/math64.h3
-rw-r--r--include/linux/memblock.h15
-rw-r--r--include/linux/memcontrol.h15
-rw-r--r--include/linux/mm.h48
-rw-r--r--include/linux/mmu_notifier.h27
-rw-r--r--include/linux/mmzone.h4
-rw-r--r--include/linux/page-flags.h14
-rw-r--r--include/linux/pfn_t.h4
-rw-r--r--include/linux/psi.h53
-rw-r--r--include/linux/psi_types.h92
-rw-r--r--include/linux/sched.h13
-rw-r--r--include/linux/sched/loadavg.h24
-rw-r--r--include/linux/slab.h56
-rw-r--r--include/linux/swap.h15
21 files changed, 326 insertions, 114 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 22254c1fe1c5..5e1694fe035b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
+#include <linux/psi_types.h>
#ifdef CONFIG_CGROUPS
@@ -436,6 +437,9 @@ struct cgroup {
/* used to schedule release agent */
struct work_struct release_agent_work;
+ /* used to track pressure stalls */
+ struct psi_group psi;
+
/* used to store eBPF programs */
struct cgroup_bpf bpf;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b622d6608605..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return &cgrp->psi;
+}
+
static inline void cgroup_init_kthreadd(void)
{
/*
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
return NULL;
}
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+ return NULL;
+}
+
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 31c865d1842e..577d1b25fccd 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -57,7 +57,12 @@ struct task_delay_info {
u64 freepages_start;
u64 freepages_delay; /* wait for memory reclaim */
+
+ u64 thrashing_start;
+ u64 thrashing_delay; /* wait for thrashing page */
+
u32 freepages_count; /* total count of memory reclaim */
+ u32 thrashing_count; /* total count of thrash waits */
};
#endif
@@ -76,6 +81,8 @@ extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
+extern void __delayacct_thrashing_start(void);
+extern void __delayacct_thrashing_end(void);
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
@@ -156,6 +163,18 @@ static inline void delayacct_freepages_end(void)
__delayacct_freepages_end();
}
+static inline void delayacct_thrashing_start(void)
+{
+ if (current->delays)
+ __delayacct_thrashing_start();
+}
+
+static inline void delayacct_thrashing_end(void)
+{
+ if (current->delays)
+ __delayacct_thrashing_end();
+}
+
#else
static inline void delayacct_set_flag(int flag)
{}
@@ -182,6 +201,10 @@ static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
+static inline void delayacct_thrashing_start(void)
+{}
+static inline void delayacct_thrashing_end(void)
+{}
#endif /* CONFIG_TASK_DELAY_ACCT */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4c92e3ba3e16..dde947083d4e 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -107,7 +107,7 @@ enum hmm_pfn_flag_e {
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
* HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
- * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
* be mirrored by a device, because the entry will never have HMM_PFN_VALID
* set and the pfn value is undefined.
*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fdcb45999b26..4663ee96cf59 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -213,9 +213,9 @@ static inline int hpage_nr_pages(struct page *page)
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags);
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags);
+ pud_t *pud, int flags, struct dev_pagemap **pgmap);
extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
@@ -344,13 +344,13 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
}
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd, int flags)
+ unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
return NULL;
}
static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
- unsigned long addr, pud_t *pud, int flags)
+ unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
return NULL;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3555d54bf79a..9a4258154b25 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -6,6 +6,7 @@
#include <linux/bitmap.h>
#include <linux/mm.h>
#include <linux/types.h>
+#include <linux/mm_types.h>
struct address_space;
struct fiemap_extent_info;
@@ -141,7 +142,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops);
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
+ const struct iomap_ops *ops);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
loff_t start, loff_t len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index d7618c41f74c..7c47b1a471d4 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -90,6 +90,7 @@
#ifndef WEAK
#define WEAK(name) \
.weak name ASM_NL \
+ ALIGN ASM_NL \
name:
#endif
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 837f2f2d1d34..bb2c84afb80c 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -281,4 +281,7 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
+#define DIV64_U64_ROUND_UP(ll, d) \
+ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
+
#endif /* _LINUX_MATH64_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 516920549378..2acdd046df2d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -265,21 +265,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
-/**
- * for_each_resv_unavail_range - iterate through reserved and unavailable memory
- * @i: u64 used as loop variable
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- *
- * Walks over unavailable but reserved (reserved && !memory) areas of memblock.
- * Available as soon as memblock is initialized.
- * Note: because this memory does not belong to any physical node, flags and
- * nid arguments do not make sense and thus not exported as arguments.
- */
-#define for_each_resv_unavail_range(i, p_start, p_end) \
- for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
- NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
-
static inline void memblock_set_region_flags(struct memblock_region *r,
enum memblock_flags flags)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 652f602167df..7ab2120155a4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -78,7 +78,7 @@ struct mem_cgroup_reclaim_cookie {
struct mem_cgroup_id {
int id;
- atomic_t ref;
+ refcount_t ref;
};
/*
@@ -1268,10 +1268,11 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg);
+
+#ifdef CONFIG_MEMCG_KMEM
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void memcg_kmem_uncharge(struct page *page, int order);
-#ifdef CONFIG_MEMCG_KMEM
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1307,6 +1308,16 @@ extern int memcg_expand_shrinker_maps(int new_id);
extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
int nid, int shrinker_id);
#else
+
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
diff --git a/include/linux/mm.h b/include/linux/mm.h
index daa2b8f1e9a8..1e52b8fd1685 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -848,6 +848,8 @@ static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
+extern void memmap_init_zone_device(struct zone *, unsigned long,
+ unsigned long, struct dev_pagemap *);
#else
static inline bool is_zone_device_page(const struct page *page)
{
@@ -2304,6 +2306,8 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
struct list_head *uf);
+extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
+ struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
@@ -2502,11 +2506,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
@@ -2525,32 +2529,6 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
-{
- int err = vm_insert_mixed(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
-static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn)
-{
- int err = vm_insert_pfn(vma, addr, pfn);
-
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
-
- return VM_FAULT_NOPAGE;
-}
-
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
@@ -2558,16 +2536,8 @@ static inline vm_fault_t vmf_error(int err)
return VM_FAULT_SIGBUS;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int foll_flags,
- unsigned int *page_mask);
-
-static inline struct page *follow_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int foll_flags)
-{
- unsigned int unused_page_mask;
- return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
-}
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags);
#define FOLL_WRITE 0x01 /* check pte is writable */
#define FOLL_TOUCH 0x02 /* mark page accessed */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 133ba78820ee..9893a6432adf 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -2,7 +2,6 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
-#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -11,9 +10,6 @@
struct mmu_notifier;
struct mmu_notifier_ops;
-/* mmu_notifier_ops flags */
-#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
-
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -31,15 +27,6 @@ struct mmu_notifier_mm {
struct mmu_notifier_ops {
/*
- * Flags to specify behavior of callbacks for this MMU notifier.
- * Used to determine which context an operation may be called.
- *
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
- * block
- */
- int flags;
-
- /*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
* freed. This can run concurrently with other mmu notifier
@@ -153,7 +140,9 @@ struct mmu_notifier_ops {
*
* If blockable argument is set to false then the callback cannot
* sleep and has to return with -EAGAIN. 0 should be returned
- * otherwise.
+ * otherwise. Please note that if invalidate_range_start approves
+ * a non-blocking behavior then the same applies to
+ * invalidate_range_end.
*
*/
int (*invalidate_range_start)(struct mmu_notifier *mn,
@@ -181,10 +170,6 @@ struct mmu_notifier_ops {
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
- *
- * If this callback cannot block, and invalidate_range_{start,end}
- * cannot block, mmu_notifier_ops.flags should have
- * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
@@ -239,7 +224,6 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -493,11 +477,6 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
-static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- return false;
-}
-
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d4b0c79d2924..9f0caccd5833 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -161,8 +161,10 @@ enum node_stat_item {
NR_SLAB_UNRECLAIMABLE,
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
+ WORKINGSET_NODES,
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
+ WORKINGSET_RESTORE,
WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
@@ -180,7 +182,7 @@ enum node_stat_item {
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
- NR_INDIRECTLY_RECLAIMABLE_BYTES, /* measured in bytes */
+ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_VM_NODE_STAT_ITEMS
};
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74bee8cecf4c..50ce1bddaf56 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,13 +69,14 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
- PG_error,
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
+ PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
+ PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
@@ -162,6 +163,14 @@ static inline int PagePoisoned(const struct page *page)
return page->flags == PAGE_POISON_PATTERN;
}
+#ifdef CONFIG_DEBUG_VM
+void page_init_poison(struct page *page, size_t size);
+#else
+static inline void page_init_poison(struct page *page, size_t size)
+{
+}
+#endif
+
/*
* Page flags policies wrt compound pages
*
@@ -280,6 +289,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
+PAGEFLAG(Workingset, workingset, PF_HEAD)
+ TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
@@ -292,6 +303,7 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 21713dc14ce2..7bb77850c65a 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -9,8 +9,10 @@
* PFN_SG_LAST - pfn references a page and is the last scatterlist entry
* PFN_DEV - pfn is not covered by system memmap by default
* PFN_MAP - pfn has a dynamic page mapping established by a device driver
+ * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not
+ * get_user_pages
*/
-#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
+#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
diff --git a/include/linux/psi.h b/include/linux/psi.h
new file mode 100644
index 000000000000..8e0725aac0aa
--- /dev/null
+++ b/include/linux/psi.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_PSI_H
+#define _LINUX_PSI_H
+
+#include <linux/psi_types.h>
+#include <linux/sched.h>
+
+struct seq_file;
+struct css_set;
+
+#ifdef CONFIG_PSI
+
+extern bool psi_disabled;
+
+void psi_init(void);
+
+void psi_task_change(struct task_struct *task, int clear, int set);
+
+void psi_memstall_tick(struct task_struct *task, int cpu);
+void psi_memstall_enter(unsigned long *flags);
+void psi_memstall_leave(unsigned long *flags);
+
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
+#endif
+
+#else /* CONFIG_PSI */
+
+static inline void psi_init(void) {}
+
+static inline void psi_memstall_enter(unsigned long *flags) {}
+static inline void psi_memstall_leave(unsigned long *flags) {}
+
+#ifdef CONFIG_CGROUPS
+static inline int psi_cgroup_alloc(struct cgroup *cgrp)
+{
+ return 0;
+}
+static inline void psi_cgroup_free(struct cgroup *cgrp)
+{
+}
+static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
+{
+ rcu_assign_pointer(p->cgroups, to);
+}
+#endif
+
+#endif /* CONFIG_PSI */
+
+#endif /* _LINUX_PSI_H */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
new file mode 100644
index 000000000000..2cf422db5d18
--- /dev/null
+++ b/include/linux/psi_types.h
@@ -0,0 +1,92 @@
+#ifndef _LINUX_PSI_TYPES_H
+#define _LINUX_PSI_TYPES_H
+
+#include <linux/seqlock.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_PSI
+
+/* Tracked task states */
+enum psi_task_count {
+ NR_IOWAIT,
+ NR_MEMSTALL,
+ NR_RUNNING,
+ NR_PSI_TASK_COUNTS,
+};
+
+/* Task state bitmasks */
+#define TSK_IOWAIT (1 << NR_IOWAIT)
+#define TSK_MEMSTALL (1 << NR_MEMSTALL)
+#define TSK_RUNNING (1 << NR_RUNNING)
+
+/* Resources that workloads could be stalled on */
+enum psi_res {
+ PSI_IO,
+ PSI_MEM,
+ PSI_CPU,
+ NR_PSI_RESOURCES,
+};
+
+/*
+ * Pressure states for each resource:
+ *
+ * SOME: Stalled tasks & working tasks
+ * FULL: Stalled tasks & no working tasks
+ */
+enum psi_states {
+ PSI_IO_SOME,
+ PSI_IO_FULL,
+ PSI_MEM_SOME,
+ PSI_MEM_FULL,
+ PSI_CPU_SOME,
+ /* Only per-CPU, to weigh the CPU in the global average: */
+ PSI_NONIDLE,
+ NR_PSI_STATES,
+};
+
+struct psi_group_cpu {
+ /* 1st cacheline updated by the scheduler */
+
+ /* Aggregator needs to know of concurrent changes */
+ seqcount_t seq ____cacheline_aligned_in_smp;
+
+ /* States of the tasks belonging to this group */
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+
+ /* Period time sampling buckets for each state of interest (ns) */
+ u32 times[NR_PSI_STATES];
+
+ /* Time of last task change in this group (rq_clock) */
+ u64 state_start;
+
+ /* 2nd cacheline updated by the aggregator */
+
+ /* Delta detection against the sampling buckets */
+ u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
+};
+
+struct psi_group {
+ /* Protects data updated during an aggregation */
+ struct mutex stat_lock;
+
+ /* Per-cpu task state & time tracking */
+ struct psi_group_cpu __percpu *pcpu;
+
+ /* Periodic aggregation state */
+ u64 total_prev[NR_PSI_STATES - 1];
+ u64 last_update;
+ u64 next_update;
+ struct delayed_work clock_work;
+
+ /* Total stall times and sampled pressure averages */
+ u64 total[NR_PSI_STATES - 1];
+ unsigned long avg[NR_PSI_STATES - 1][3];
+};
+
+#else /* CONFIG_PSI */
+
+struct psi_group { };
+
+#endif /* CONFIG_PSI */
+
+#endif /* _LINUX_PSI_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index adfb3f9a7597..8f8a5418b627 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,6 +25,7 @@
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/signal_types.h>
+#include <linux/psi_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
@@ -706,6 +707,10 @@ struct task_struct {
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_PSI
+ unsigned sched_psi_wake_requeue:1;
+#endif
+
/* Force alignment to the next boundary: */
unsigned :0;
@@ -719,9 +724,6 @@ struct task_struct {
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
-#ifdef CONFIG_MEMCG_KMEM
- unsigned memcg_kmem_skip_account:1;
-#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
@@ -965,6 +967,10 @@ struct task_struct {
kernel_siginfo_t *last_siginfo;
struct task_io_accounting ioac;
+#ifdef CONFIG_PSI
+ /* Pressure stall state */
+ unsigned int psi_flags;
+#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
@@ -1391,6 +1397,7 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 80bc84ba5d2a..4859bea47a7b 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -22,10 +22,26 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
-#define CALC_LOAD(load,exp,n) \
- load *= exp; \
- load += n*(FIXED_1-exp); \
- load >>= FSHIFT;
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static inline unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
+}
+
+extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n);
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
extern void calc_global_load(unsigned long ticks);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ed9cbddeb4a6..918f374e7156 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -295,12 +295,43 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \
(KMALLOC_MIN_SIZE) : 16)
+/*
+ * Whenever changing this, take care of that kmalloc_type() and
+ * create_kmalloc_caches() still work as intended.
+ */
+enum kmalloc_cache_type {
+ KMALLOC_NORMAL = 0,
+ KMALLOC_RECLAIM,
+#ifdef CONFIG_ZONE_DMA
+ KMALLOC_DMA,
+#endif
+ NR_KMALLOC_TYPES
+};
+
#ifndef CONFIG_SLOB
-extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+extern struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
+
+static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
+{
+ int is_dma = 0;
+ int type_dma = 0;
+ int is_reclaimable;
+
#ifdef CONFIG_ZONE_DMA
-extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
+ is_dma = !!(flags & __GFP_DMA);
+ type_dma = is_dma * KMALLOC_DMA;
#endif
+ is_reclaimable = !!(flags & __GFP_RECLAIMABLE);
+
+ /*
+ * If an allocation is both __GFP_DMA and __GFP_RECLAIMABLE, return
+ * KMALLOC_DMA and effectively ignore __GFP_RECLAIMABLE
+ */
+ return type_dma + (is_reclaimable & !is_dma) * KMALLOC_RECLAIM;
+}
+
/*
* Figure out which kmalloc slab an allocation of a certain size
* belongs to.
@@ -501,18 +532,20 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
+#ifndef CONFIG_SLOB
+ unsigned int index;
+#endif
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
- if (!(flags & GFP_DMA)) {
- unsigned int index = kmalloc_index(size);
+ index = kmalloc_index(size);
- if (!index)
- return ZERO_SIZE_PTR;
+ if (!index)
+ return ZERO_SIZE_PTR;
- return kmem_cache_alloc_trace(kmalloc_caches[index],
- flags, size);
- }
+ return kmem_cache_alloc_trace(
+ kmalloc_caches[kmalloc_type(flags)][index],
+ flags, size);
#endif
}
return __kmalloc(size, flags);
@@ -542,13 +575,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
if (__builtin_constant_p(size) &&
- size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
+ size <= KMALLOC_MAX_CACHE_SIZE) {
unsigned int i = kmalloc_index(size);
if (!i)
return ZERO_SIZE_PTR;
- return kmem_cache_alloc_node_trace(kmalloc_caches[i],
+ return kmem_cache_alloc_node_trace(
+ kmalloc_caches[kmalloc_type(flags)][i],
flags, node, size);
}
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e692ba..38195f5c96b1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,13 +167,14 @@ enum {
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV = (1 << 6), /* its a block device */
- SWP_FILE = (1 << 7), /* set after swap_activate success */
- SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
- SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
- SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
- SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
+ SWP_FS = (1 << 8), /* swap file goes through fs */
+ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
+ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
+ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
+ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
/* add others here before... */
- SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */
+ SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */
};
#define SWAP_CLUSTER_MAX 32UL
@@ -296,7 +297,7 @@ struct vma_swap_readahead {
/* linux/mm/workingset.c */
void *workingset_eviction(struct address_space *mapping, struct page *page);
-bool workingset_refault(void *shadow);
+void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
/* Do not use directly, use workingset_lookup_update */