// SPDX-License-Identifier: GPL-2.0-only
/*
 * Benchmark module for page_pool.
 *
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/interrupt.h>
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <net/page_pool/helpers.h>

#include "time_bench.h"

static int verbose = 1;
#define MY_POOL_SIZE 1024

/* Makes tests selectable. Useful for perf-record to analyze a single test.
 * Hint: Bash shells support writing binary numbers like: $((2#101010))
 *
 * # modprobe bench_page_pool_simple run_flags=$((2#100))
 */
static unsigned long run_flags = 0xFFFFFFFF;
module_param(run_flags, ulong, 0);
MODULE_PARM_DESC(run_flags, "Limit which bench tests run");

/* Count the bit number from the enum */
enum benchmark_bit {
	bit_run_bench_baseline,
	bit_run_bench_no_softirq01,
	bit_run_bench_no_softirq02,
	bit_run_bench_no_softirq03,
};

#define bit(b) (1 << (b))
#define enabled(b) ((run_flags & (bit(b))))

/* notice time_bench is limited to U32_MAX nr loops */
static unsigned long loops = 10000000;
module_param(loops, ulong, 0);
MODULE_PARM_DESC(loops, "Specify loops bench will run");

/* When timing at the nanosec level, we need to know the overhead
 * introduced by the for loop itself
 */
static int time_bench_for_loop(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	int i;

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		loops_cnt++;
		barrier(); /* keep compiler from optimizing out the loop */
	}
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}

static int time_bench_atomic_inc(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	atomic_t cnt;
	int i;

	atomic_set(&cnt, 0);
	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		atomic_inc(&cnt);
		barrier(); /* keep compiler from optimizing out the loop */
	}
	loops_cnt = atomic_read(&cnt);
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}

/* The ptr_ring in page_pool uses a spinlock. We need to know the minimum
 * overhead of taking+releasing a spinlock, to know the cycles that can be
 * saved by e.g. amortizing this via bulking.
 */
static int time_bench_lock(struct time_bench_record *rec, void *data)
{
	uint64_t loops_cnt = 0;
	spinlock_t lock;
	int i;

	spin_lock_init(&lock);
	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		spin_lock(&lock);
		loops_cnt++;
		barrier(); /* keep compiler from optimizing out the loop */
		spin_unlock(&lock);
	}
	time_bench_stop(rec, loops_cnt);
	return loops_cnt;
}
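/* Illustrative sketch, not part of the benchmark runs: the spinlock cost
 * measured above can be amortized by returning pages in bulk, so the
 * ptr_ring producer lock is taken once per N pages instead of once per
 * page. This assumes the page_pool_put_page_bulk() API from
 * net/core/page_pool.c (newer kernels provide a netmem-based variant).
 */
static __maybe_unused void pp_bulk_return_sketch(struct page_pool *pp,
						 struct page **pages, int n)
{
	/* One lock+unlock round-trip covers all 'n' pages */
	page_pool_put_page_bulk(pp, (void **)pages, n);
}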
/* Helper for filling some pages into the ptr_ring */
static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
{
	/* GFP_ATOMIC needed when this runs under softirq */
	gfp_t gfp_mask = GFP_ATOMIC;
	struct page **array;
	int i;

	array = kcalloc(elems, sizeof(struct page *), gfp_mask);
	if (!array)
		return;

	for (i = 0; i < elems; i++)
		array[i] = page_pool_alloc_pages(pp, gfp_mask);
	for (i = 0; i < elems; i++)
		page_pool_put_page(pp, array[i], -1, false);

	kfree(array);
}

enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };

/* Depends on the compiler inline-optimizing this function */
static int time_bench_page_pool(struct time_bench_record *rec, void *data,
				enum test_type type, const char *func)
{
	uint64_t loops_cnt = 0;
	gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */
	int i, err;

	struct page_pool *pp;
	struct page *page;

	struct page_pool_params pp_params = {
		.order = 0,
		.flags = 0,
		.pool_size = MY_POOL_SIZE,
		.nid = NUMA_NO_NODE,
		.dev = NULL, /* Only use for DMA mapping */
		.dma_dir = DMA_BIDIRECTIONAL,
	};

	pp = page_pool_create(&pp_params);
	if (IS_ERR(pp)) {
		err = PTR_ERR(pp);
		pr_warn("%s: Error(%d) creating page_pool\n", func, err);
		return 0; /* don't page_pool_destroy() an ERR_PTR */
	}
	pp_fill_ptr_ring(pp, 64);

	if (in_serving_softirq())
		pr_warn("%s(): in_serving_softirq fast-path\n", func);
	else
		pr_warn("%s(): Cannot use page_pool fast-path\n", func);

	time_bench_start(rec);
	/** Loop to measure **/
	for (i = 0; i < rec->loops; i++) {
		/* Common fast-path alloc that depends on in_serving_softirq() */
		page = page_pool_alloc_pages(pp, gfp_mask);
		if (!page)
			break;
		loops_cnt++;
		barrier(); /* keep compiler from optimizing out the loop */

		/* The benchmark's purpose is to test the different return
		 * paths. The compiler should inline-optimize the other
		 * function calls out.
		 */
		if (type == type_fast_path) {
			/* Fast-path recycling, e.g. the XDP_DROP use-case */
			page_pool_recycle_direct(pp, page);

		} else if (type == type_ptr_ring) {
			/* Normal return path */
			page_pool_put_page(pp, page, -1, false);

		} else if (type == type_page_allocator) {
			/* Test the case where pages are not recycled, but
			 * instead returned to the system page allocator
			 */
			get_page(page); /* cause no-recycling */
			page_pool_put_page(pp, page, -1, false);
			put_page(page);
		} else {
			BUILD_BUG();
		}
	}
	time_bench_stop(rec, loops_cnt);

	page_pool_destroy(pp);
	return loops_cnt;
}

static int time_bench_page_pool01_fast_path(struct time_bench_record *rec,
					    void *data)
{
	return time_bench_page_pool(rec, data, type_fast_path, __func__);
}

static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec,
					   void *data)
{
	return time_bench_page_pool(rec, data, type_ptr_ring, __func__);
}

static int time_bench_page_pool03_slow(struct time_bench_record *rec,
				       void *data)
{
	return time_bench_page_pool(rec, data, type_page_allocator, __func__);
}
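/* Sketch (an assumption, for illustration): to actually exercise the
 * recycling fast-path, the benchmark could be run from a tasklet, so
 * in_serving_softirq() is true and page_pool_recycle_direct() can hit the
 * lockless per-CPU cache. The tasklet would still need to be scheduled via
 * tasklet_schedule(&pp_tasklet) from e.g. run_benchmark_tests() below.
 */
static void pp_tasklet_handler(struct tasklet_struct *t)
{
	uint32_t nr_loops = loops;

	if (in_serving_softirq())
		time_bench_loop(nr_loops, 0, "tasklet-page_pool01", NULL,
				time_bench_page_pool01_fast_path);
}
static __maybe_unused DECLARE_TASKLET(pp_tasklet, pp_tasklet_handler);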
static int run_benchmark_tests(void)
{
	uint32_t nr_loops = loops;

	/* Baseline tests */
	if (enabled(bit_run_bench_baseline)) {
		time_bench_loop(nr_loops * 10, 0, "for_loop", NULL,
				time_bench_for_loop);
		time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL,
				time_bench_atomic_inc);
		time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock);
	}

	/* These tests cannot activate the recycling fast-path, as they run
	 * in no-softirq context
	 */
	if (enabled(bit_run_bench_no_softirq01))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL,
				time_bench_page_pool01_fast_path);
	if (enabled(bit_run_bench_no_softirq02))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL,
				time_bench_page_pool02_ptr_ring);
	if (enabled(bit_run_bench_no_softirq03))
		time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL,
				time_bench_page_pool03_slow);

	return 0;
}

static int __init bench_page_pool_simple_module_init(void)
{
	if (verbose)
		pr_info("Loaded\n");

	if (loops > U32_MAX) {
		pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n",
		       loops, U32_MAX);
		return -ECHRNG;
	}

	run_benchmark_tests();

	return 0;
}
module_init(bench_page_pool_simple_module_init);

static void __exit bench_page_pool_simple_module_exit(void)
{
	if (verbose)
		pr_info("Unloaded\n");
}
module_exit(bench_page_pool_simple_module_exit);

MODULE_DESCRIPTION("Benchmark of page_pool simple cases");
MODULE_AUTHOR("Jesper Dangaard Brouer");
MODULE_LICENSE("GPL");
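/* Usage sketch, following the run_flags hint in the header comment:
 *
 *   # modprobe bench_page_pool_simple run_flags=$((2#0010)) loops=1000000
 *
 * runs only the no-softirq-page_pool01 test (bit 1 of run_flags). The
 * time_bench results land in the kernel log, readable via dmesg.
 */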