diff options
-rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 20 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 17 | ||||
-rw-r--r-- | mm/memcontrol.c | 5 | ||||
-rw-r--r-- | mm/vmscan.c | 76 |
4 files changed, 106 insertions, 12 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7bf3f129c68b..8ed408166890 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -606,8 +606,8 @@ on an IO device and is an example of this type. Protections ----------- -A cgroup is protected to be allocated upto the configured amount of -the resource if the usages of all its ancestors are under their +A cgroup is protected upto the configured amount of the resource +as long as the usages of all its ancestors are under their protected levels. Protections can be hard guarantees or best effort soft boundaries. Protections can also be over-committed in which case only upto the amount available to the parent is protected among @@ -1020,7 +1020,10 @@ PAGE_SIZE multiple when read back. is within its effective min boundary, the cgroup's memory won't be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer - is invoked. + is invoked. Above the effective min boundary (or + effective low boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment @@ -1042,7 +1045,10 @@ PAGE_SIZE multiple when read back. Best-effort memory protection. If the memory usage of a cgroup is within its effective low boundary, the cgroup's memory won't be reclaimed unless memory can be reclaimed - from unprotected cgroups. + from unprotected cgroups. Above the effective low boundary (or + effective min boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment @@ -2283,8 +2289,10 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it's within its low, -which makes delegation of subtrees possible. +reserve. A cgroup enjoys reclaim protection when it's within its +effective low, which makes delegation of subtrees possible. It also +enjoys having reclaim pressure proportional to its overage when +above its effective low. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f3d880b7ca1..252dbe54611d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -333,6 +333,11 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -531,6 +536,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -824,6 +831,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -981,6 +993,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 386056514ee6..dc03bf2da91d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1377,6 +1377,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e979705bbf32..6cad24bd206c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2435,17 +2435,74 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg); + + if (protection > 0) { + /* + * Scale a cgroup's reclaim pressure by proportioning its current + * usage to its memory.low or memory.min setting. + * + * This is important, as otherwise scanning aggression becomes + * extremely binary -- from nothing as we approach the memory + * protection threshold, to totally nominal as we exceed it. This + * results in requiring setting extremely liberal protection + * thresholds. It also means we simply get no protection at all if + * we set it too low, which is not ideal. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long baseline = 0; + + /* + * During the reclaim first pass, we only consider cgroups in + * excess of their protection setting, but if that doesn't produce + * free pages, we come back for a second pass where we reclaim from + * all groups. + * + * To maintain fairness in both cases, the first pass targets + * groups in proportion to their overage, and the second pass + * targets groups in proportion to their protection utilization. + * + * So on the first pass, a group whose size is 130% of its + * protection will be targeted at 30% of its size. On the second + * pass, a group whose size is at 40% of its protection will be + * targeted at 40% of its size. + */ + if (!sc->memcg_low_reclaim) + baseline = lruvec_size; + scan = lruvec_size * cgroup_size / protection - baseline; + + /* + * Don't allow the scan target to exceed the lruvec size, which + * otherwise could happen if we have >200% overage in the normal + * case, or >100% overage when sc->memcg_low_reclaim is set. + * + * This is important because other cgroups without memory.low have + * their scan target initially set to their lruvec size, so + * allowing values >100% of the lruvec size here could result in + * penalising cgroups with memory.low set even *more* than their + * peers in some cases in the case of large overages. + * + * Also, minimally target SWAP_CLUSTER_MAX pages to keep reclaim + * moving forwards. + */ + scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2465,7 +2522,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2474,7 +2531,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2735,6 +2792,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } |