diff --git a/kernel.spec b/kernel.spec index 82d3355e52a2197b9f77b9892355f128447f862f..2e6bb9400cf60545b76b210181abb34609945521 100644 --- a/kernel.spec +++ b/kernel.spec @@ -32,7 +32,7 @@ Name: kernel Version: 4.19.90 -Release: %{hulkrelease}.0263 +Release: %{hulkrelease}.0264 Summary: Linux Kernel License: GPLv2 URL: http://www.kernel.org/ @@ -849,6 +849,27 @@ fi %endif %changelog +* Mon Jan 29 2024 Xue Sinian - 4.19.90-2402.1.0.0264 +- sched/fair: ARM64 enables SIS_UTIL and disables SIS_PROP +- sched/fair: Fix kabi borken in sched_domain_shared +- sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg +- sched:Open the kernel configuration for cluster. +- scheduler: Disable cluster scheduling by default +- scheduler: Add boot time enabling/disabling of cluster scheduling +- scheduler: Add runtime knob sysctl_sched_cluster +- sched/topology, arch/arm64: Rebuild the sched_domain hierarchy when the CPU capacity changes +- sched/topology, drivers/base/arch_topology: Rebuild the sched_domain hierarchy when capacities change +- sysctl: add a new register_sysctl_init() interface +- scheduler: Create SDTL_SKIP flag to skip topology level +- sched/fair: Scan cluster before scanning LLC in wake-up path +- sched: Add per_cpu cluster domain info and cpus_share_lowest_cache API +- arch_topology: Make cluster topology span at least SMT CPUs +- arch_topology: Limit span of cpu_clustergroup_mask() +- topology: Remove unused cpu_cluster_mask() +- topology/sysfs: export cluster attributes only if an architectures has support +- sched: Add cluster scheduler level in core and related Kconfig for ARM64 +- topology: Represent clusters of CPUs within a die + * Tue Jan 30 2024 hongrongxuan - 4.19.90-2402.1.0.0263 - !4277 fs:/dcache.c: fix negative dentry limit not complete problem - !4288 net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv diff --git a/patches/0758-topology-Represent-clusters-of-CPUs-within-a-die.patch b/patches/0758-topology-Represent-clusters-of-CPUs-within-a-die.patch new file mode 100644 index 0000000000000000000000000000000000000000..28f8f06145db83cc4c5b20f262ff8c106586c078 --- /dev/null +++ b/patches/0758-topology-Represent-clusters-of-CPUs-within-a-die.patch @@ -0,0 +1,475 @@ +From 1b13f2fb8f8b9a9e96d0d5b2b95266a189cd0555 Mon Sep 17 00:00:00 2001 +From: Jonathan Cameron +Date: Thu, 18 Nov 2021 20:43:35 +0800 +Subject: [PATCH 01/19] topology: Represent clusters of CPUs within a die + +mainline inclusion +from mainline-v5.16-rc1 +commit c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 + +------------------------------------------------------------------------ + +Both ACPI and DT provide the ability to describe additional layers of +topology between that of individual cores and higher level constructs +such as the level at which the last level cache is shared. +In ACPI this can be represented in PPTT as a Processor Hierarchy +Node Structure [1] that is the parent of the CPU cores and in turn +has a parent Processor Hierarchy Nodes Structure representing +a higher level of topology. + +For example Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each +cluster has 4 cpus. All clusters share L3 cache data, but each cluster +has local L3 tag. On the other hand, each clusters will share some +internal system bus. 
+ ++-----------------------------------+ +---------+ +| +------+ +------+ +--------------------------+ | +| | CPU0 | | cpu1 | | +-----------+ | | +| +------+ +------+ | | | | | +| +----+ L3 | | | +| +------+ +------+ cluster | | tag | | | +| | CPU2 | | CPU3 | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +----+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | L3 | + | data | ++-----------------------------------+ | | +| +------+ +------+ | +-----------+ | | +| | | | | | | | | | +| +------+ +------+ +----+ L3 | | | +| | | tag | | | +| +------+ +------+ | | | | | +| | | | | | +-----------+ | | +| +------+ +------+ +--------------------------+ | ++-----------------------------------| | | ++-----------------------------------| | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| +----+ L3 | | | +| +------+ +------+ | | tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +---+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +--+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | +---------+ ++-----------------------------------+ + +That means spreading tasks among clusters will bring more bandwidth +while packing tasks within one cluster will lead to smaller cache +synchronization latency. So both kernel and userspace will have +a chance to leverage this topology to deploy tasks accordingly to +achieve either smaller cache latency within one cluster or an even +distribution of load among clusters for higher throughput. + +This patch exposes cluster topology to both kernel and userspace. +Libraried like hwloc will know cluster by cluster_cpus and related +sysfs attributes. PoC of HWLOC support at [2]. + +Note this patch only handle the ACPI case. + +Special consideration is needed for SMT processors, where it is +necessary to move 2 levels up the hierarchy from the leaf nodes +(thus skipping the processor core level). + +Note that arm64 / ACPI does not provide any means of identifying +a die level in the topology but that may be unrelate to the cluster +level. 
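As a quick illustration of the userspace-visible side of this change (editorial sketch, not part of the patch), the snippet below reads the new cluster_id and cluster_cpus_list attributes for cpu0. The sysfs paths are the ones added by this patch; the files only exist when the architecture and firmware (ACPI PPTT) actually describe clusters, and print_attr() is a made-up helper name.

#include <stdio.h>

/* Read one topology attribute of cpu0 and print it, or note its absence. */
static void print_attr(const char *name)
{
	char path[128], buf[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu0/topology/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%s: not exported on this kernel/arch\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", name, buf);
	fclose(f);
}

int main(void)
{
	print_attr("cluster_id");
	print_attr("cluster_cpus_list");
	return 0;
}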
+ +[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node + structure (Type 0) +[2] https://github.com/hisilicon/hwloc/tree/linux-cluster + +Signed-off-by: Jonathan Cameron +Signed-off-by: Tian Tao +Signed-off-by: Barry Song +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com +Signed-off-by: Yicong Yang +Reviewed-by: tao zeng +Signed-off-by: Zheng Zengkai + +Conflicts: + Documentation/ABI/stable/sysfs-devices-system-cpu + Documentation/admin-guide/cputopology.rst + drivers/base/arch_topology.c + drivers/base/topology.c + include/linux/arch_topology.h + include/linux/topology.h + +Signed-off-by: Jiang Yi +--- + Documentation/cputopology.txt | 26 +++++++++-- + arch/arm64/include/asm/topology.h | 5 +++ + arch/arm64/kernel/topology.c | 17 ++++++++ + drivers/acpi/pptt.c | 72 +++++++++++++++++++++++++++++++ + drivers/base/topology.c | 10 +++++ + include/linux/acpi.h | 5 +++ + include/linux/topology.h | 6 +++ + 7 files changed, 137 insertions(+), 4 deletions(-) + +diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt +index 2ff8a1e9a2db..acd55bf0c718 100644 +--- a/Documentation/cputopology.txt ++++ b/Documentation/cputopology.txt +@@ -18,6 +18,11 @@ die_id: + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + ++cluster_id: ++ the cluster ID of cpuX. Typically it is the hardware platform's ++ identifier (rather than the kernel's). The actual value is ++ architecture and platform dependent. ++ + core_id: + + the CPU core ID of cpuX. Typically it is the hardware platform's +@@ -36,6 +41,15 @@ drawer_id: + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + ++cluster_cpus: ++ ++ internal kernel map of CPUs within the same cluster ++ ++cluster_cpus_list: ++ ++ human-readable list of CPUs within the same cluster. ++ The format is like 0-3, 8-11, 14,17. ++ + thread_siblings: + + internal kernel map of cpuX's hardware threads within the same +@@ -88,11 +102,13 @@ these macros in include/asm-XXX/topology.h:: + + #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) ++ #define topology_cluster_id(cpu) + #define topology_core_id(cpu) + #define topology_book_id(cpu) + #define topology_drawer_id(cpu) + #define topology_sibling_cpumask(cpu) + #define topology_core_cpumask(cpu) ++ #define topology_cluster_cpumask(cpu) + #define topology_book_cpumask(cpu) + #define topology_drawer_cpumask(cpu) + +@@ -107,10 +123,12 @@ not defined by include/asm-XXX/topology.h: + + 1) topology_physical_package_id: -1 + 2) topology_die_id: -1 +-3) topology_core_id: 0 +-4) topology_sibling_cpumask: just the given CPU +-5) topology_core_cpumask: just the given CPU +-6) topology_die_cpumask: just the given CPU ++3) topology_cluster_id: -1 ++4) topology_core_id: 0 ++5) topology_sibling_cpumask: just the given CPU ++6) topology_core_cpumask: just the given CPU ++7) topology_cluster_cpumask: just the given CPU ++8) topology_die_cpumask: just the given CPU + + For architectures that don't support books (CONFIG_SCHED_BOOK) there are no + default definitions for topology_book_id() and topology_book_cpumask(). 
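For kernel-side consumers, the macros documented above can be used as in the minimal sketch below (illustrative only, not part of the patch; example_dump_cluster() is a hypothetical helper). On architectures that do not provide the cluster macros, the defaults listed above make this degenerate to the CPU itself with a cluster ID of -1.

#include <linux/topology.h>
#include <linux/printk.h>

/* Illustrative helper: report the cluster of @cpu and its cluster siblings. */
static void example_dump_cluster(unsigned int cpu)
{
	unsigned int sibling;

	pr_info("cpu%u: cluster_id=%d\n", cpu, topology_cluster_id(cpu));
	for_each_cpu(sibling, topology_cluster_cpumask(cpu))
		pr_info("cpu%u shares a cluster with cpu%u\n", cpu, sibling);
}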
+diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h +index 49a0fee4f89b..164e26035653 100644 +--- a/arch/arm64/include/asm/topology.h ++++ b/arch/arm64/include/asm/topology.h +@@ -7,25 +7,30 @@ + struct cpu_topology { + int thread_id; + int core_id; ++ int cluster_id; + int package_id; + int llc_id; + cpumask_t thread_sibling; + cpumask_t core_sibling; ++ cpumask_t cluster_sibling; + cpumask_t llc_sibling; + }; + + extern struct cpu_topology cpu_topology[NR_CPUS]; + + #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) ++#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) + #define topology_core_id(cpu) (cpu_topology[cpu].core_id) + #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) + #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) ++#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) + #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) + + void init_cpu_topology(void); + void store_cpu_topology(unsigned int cpuid); + void remove_cpu_topology(unsigned int cpuid); + const struct cpumask *cpu_coregroup_mask(int cpu); ++const struct cpumask *cpu_clustergroup_mask(int cpu); + + #ifdef CONFIG_NUMA + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index bf937d334b81..e4effe6f3177 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -230,6 +230,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) + return core_mask; + } + ++const struct cpumask *cpu_clustergroup_mask(int cpu) ++{ ++ return &cpu_topology[cpu].cluster_sibling; ++} ++ + static void update_siblings_masks(unsigned int cpuid) + { + struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; +@@ -247,6 +252,12 @@ static void update_siblings_masks(unsigned int cpuid) + if (cpuid_topo->package_id != cpu_topo->package_id) + continue; + ++ if (cpuid_topo->cluster_id == cpu_topo->cluster_id && ++ cpuid_topo->cluster_id != -1) { ++ cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling); ++ cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling); ++ } ++ + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + +@@ -312,6 +323,9 @@ static void clear_cpu_topology(int cpu) + cpumask_clear(&cpu_topo->llc_sibling); + cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + ++ cpumask_clear(&cpu_topo->cluster_sibling); ++ cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling); ++ + cpumask_clear(&cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); + cpumask_clear(&cpu_topo->thread_sibling); +@@ -327,6 +341,7 @@ static void __init reset_cpu_topology(void) + + cpu_topo->thread_id = -1; + cpu_topo->core_id = 0; ++ cpu_topo->cluster_id = -1; + cpu_topo->package_id = -1; + cpu_topo->llc_id = -1; + +@@ -438,6 +453,8 @@ static int __init parse_acpi_topology(void) + cpu_topology[cpu].thread_id = -1; + cpu_topology[cpu].core_id = topology_id; + } ++ topology_id = find_acpi_cpu_topology_cluster(cpu); ++ cpu_topology[cpu].cluster_id = topology_id; + topology_id = find_acpi_cpu_topology_package(cpu); + cpu_topology[cpu].package_id = topology_id; + +diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c +index 74ce9fcee830..31c9f9663464 100644 +--- a/drivers/acpi/pptt.c ++++ b/drivers/acpi/pptt.c +@@ -617,6 +617,11 @@ static struct acpi_pptt_processor *acpi_find_processor_tag(struct acpi_table_hea + return cpu; + } + ++static void acpi_pptt_warn_missing(void) ++{ ++ pr_warn_once("No PPTT table found, CPU 
and cache topology may be inaccurate\n"); ++} ++ + /** + * topology_get_acpi_cpu_tag() - Find a unique topology value for a feature + * @table: Pointer to the head of the PPTT table +@@ -871,6 +876,73 @@ int find_acpi_cpu_topology_package(unsigned int cpu) + ACPI_PPTT_PHYSICAL_PACKAGE); + } + ++/** ++ * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value ++ * @cpu: Kernel logical CPU number ++ * ++ * Determine a topology unique cluster ID for the given CPU/thread. ++ * This ID can then be used to group peers, which will have matching ids. ++ * ++ * The cluster, if present is the level of topology above CPUs. In a ++ * multi-thread CPU, it will be the level above the CPU, not the thread. ++ * It may not exist in single CPU systems. In simple multi-CPU systems, ++ * it may be equal to the package topology level. ++ * ++ * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found ++ * or there is no toplogy level above the CPU.. ++ * Otherwise returns a value which represents the package for this CPU. ++ */ ++ ++int find_acpi_cpu_topology_cluster(unsigned int cpu) ++{ ++ struct acpi_table_header *table; ++ acpi_status status; ++ struct acpi_pptt_processor *cpu_node, *cluster_node; ++ u32 acpi_cpu_id; ++ int retval; ++ int is_thread; ++ ++ status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); ++ if (ACPI_FAILURE(status)) { ++ acpi_pptt_warn_missing(); ++ return -ENOENT; ++ } ++ ++ acpi_cpu_id = get_acpi_id_for_cpu(cpu); ++ cpu_node = acpi_find_processor_node(table, acpi_cpu_id); ++ if (cpu_node == NULL || !cpu_node->parent) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ ++ is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD; ++ cluster_node = fetch_pptt_node(table, cpu_node->parent); ++ if (cluster_node == NULL) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ if (is_thread) { ++ if (!cluster_node->parent) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ cluster_node = fetch_pptt_node(table, cluster_node->parent); ++ if (cluster_node == NULL) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ } ++ if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID) ++ retval = cluster_node->acpi_processor_id; ++ else ++ retval = ACPI_PTR_DIFF(cluster_node, table); ++ ++put_table: ++ acpi_put_table(table); ++ ++ return retval; ++} ++ + /** + * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag + * @cpu: Kernel logical CPU number +diff --git a/drivers/base/topology.c b/drivers/base/topology.c +index da74231de498..7e4bdf65e27a 100644 +--- a/drivers/base/topology.c ++++ b/drivers/base/topology.c +@@ -46,6 +46,9 @@ static DEVICE_ATTR_RO(physical_package_id); + define_id_show_func(die_id); + static DEVICE_ATTR_RO(die_id); + ++define_id_show_func(cluster_id); ++static DEVICE_ATTR_RO(cluster_id); ++ + define_id_show_func(core_id); + static DEVICE_ATTR_RO(core_id); + +@@ -57,6 +60,10 @@ define_siblings_show_func(core_siblings, core_cpumask); + static DEVICE_ATTR_RO(core_siblings); + static DEVICE_ATTR_RO(core_siblings_list); + ++define_siblings_show_func(cluster_cpus, cluster_cpumask); ++static DEVICE_ATTR_RO(cluster_cpus); ++static DEVICE_ATTR_RO(cluster_cpus_list); ++ + #ifdef CONFIG_SCHED_BOOK + define_id_show_func(book_id); + static DEVICE_ATTR_RO(book_id); +@@ -76,11 +83,14 @@ static DEVICE_ATTR_RO(drawer_siblings_list); + static struct attribute *default_attrs[] = { + &dev_attr_physical_package_id.attr, + &dev_attr_die_id.attr, ++ &dev_attr_cluster_id.attr, + &dev_attr_core_id.attr, + &dev_attr_thread_siblings.attr, + 
&dev_attr_thread_siblings_list.attr, + &dev_attr_core_siblings.attr, + &dev_attr_core_siblings_list.attr, ++ &dev_attr_cluster_cpus.attr, ++ &dev_attr_cluster_cpus_list.attr, + #ifdef CONFIG_SCHED_BOOK + &dev_attr_book_id.attr, + &dev_attr_book_siblings.attr, +diff --git a/include/linux/acpi.h b/include/linux/acpi.h +index 3669c2ff26ed..0dba77b8629b 100644 +--- a/include/linux/acpi.h ++++ b/include/linux/acpi.h +@@ -1328,6 +1328,7 @@ static inline int lpit_read_residency_count_address(u64 *address) + #ifdef CONFIG_ACPI_PPTT + int acpi_pptt_cpu_is_thread(unsigned int cpu); + int find_acpi_cpu_topology(unsigned int cpu, int level); ++int find_acpi_cpu_topology_cluster(unsigned int cpu); + int find_acpi_cpu_topology_package(unsigned int cpu); + int find_acpi_cpu_topology_hetero_id(unsigned int cpu); + int find_acpi_cpu_cache_topology(unsigned int cpu, int level); +@@ -1341,6 +1342,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level) + { + return -EINVAL; + } ++static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) ++{ ++ return -EINVAL; ++} + static inline int find_acpi_cpu_topology_package(unsigned int cpu) + { + return -EINVAL; +diff --git a/include/linux/topology.h b/include/linux/topology.h +index a19771cd267d..90dd075394b2 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -188,6 +188,9 @@ static inline int cpu_to_mem(int cpu) + #ifndef topology_die_id + #define topology_die_id(cpu) ((void)(cpu), -1) + #endif ++#ifndef topology_cluster_id ++#define topology_cluster_id(cpu) ((void)(cpu), -1) ++#endif + #ifndef topology_core_id + #define topology_core_id(cpu) ((void)(cpu), 0) + #endif +@@ -197,6 +200,9 @@ static inline int cpu_to_mem(int cpu) + #ifndef topology_core_cpumask + #define topology_core_cpumask(cpu) cpumask_of(cpu) + #endif ++#ifndef topology_cluster_cpumask ++#define topology_cluster_cpumask(cpu) cpumask_of(cpu) ++#endif + + #ifdef CONFIG_SCHED_SMT + static inline const struct cpumask *cpu_smt_mask(int cpu) +-- +2.23.0 + diff --git a/patches/0759-sched-Add-cluster-scheduler-level-in-core-and-relate.patch b/patches/0759-sched-Add-cluster-scheduler-level-in-core-and-relate.patch new file mode 100644 index 0000000000000000000000000000000000000000..aecb436a85d2e5b9607a03d10b5f5019bdaa7e46 --- /dev/null +++ b/patches/0759-sched-Add-cluster-scheduler-level-in-core-and-relate.patch @@ -0,0 +1,264 @@ +From 7bc2a89b5f480371e49d18e82841fc74494c9281 Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Fri, 24 Sep 2021 20:51:03 +1200 +Subject: [PATCH 02/19] sched: Add cluster scheduler level in core and related + Kconfig for ARM64 + +mainline inclusion +from mainline-v5.16-rc1 +commit 778c558f49a2cb3dc7b18a80ff515e82aa813627 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=778c558f49a2cb3dc7b18a80ff515e82aa813627 + +------------------------------------------------------------------------ + +This patch adds scheduler level for clusters and automatically enables +the load balance among clusters. It will directly benefit a lot of +workload which loves more resources such as memory bandwidth, caches. 
+ +Testing has widely been done in two different hardware configurations of +Kunpeng920: + + 24 cores in one NUMA(6 clusters in each NUMA node); + 32 cores in one NUMA(8 clusters in each NUMA node) + +Workload is running on either one NUMA node or four NUMA nodes, thus, +this can estimate the effect of cluster spreading w/ and w/o NUMA load +balance. + +* Stream benchmark: + +4threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 29929.64 ( 0.00%) 32932.68 ( 10.03%) +MB/sec scale 29861.10 ( 0.00%) 32710.58 ( 9.54%) +MB/sec add 27034.42 ( 0.00%) 32400.68 ( 19.85%) +MB/sec triad 27225.26 ( 0.00%) 31965.36 ( 17.41%) + +6threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 40330.24 ( 0.00%) 42377.68 ( 5.08%) +MB/sec scale 40196.42 ( 0.00%) 42197.90 ( 4.98%) +MB/sec add 37427.00 ( 0.00%) 41960.78 ( 12.11%) +MB/sec triad 37841.36 ( 0.00%) 42513.64 ( 12.35%) + +12threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 52639.82 ( 0.00%) 53818.04 ( 2.24%) +MB/sec scale 52350.30 ( 0.00%) 53253.38 ( 1.73%) +MB/sec add 53607.68 ( 0.00%) 55198.82 ( 2.97%) +MB/sec triad 54776.66 ( 0.00%) 56360.40 ( 2.89%) + +Thus, it could help memory-bound workload especially under medium load. +Similar improvement is also seen in lkp-pbzip2: + +* lkp-pbzip2 benchmark + +2-96 threads (on 4NUMA * 24cores = 96cores) + lkp-pbzip2 lkp-pbzip2 + w/o patch w/ patch +Hmean tput-2 11062841.57 ( 0.00%) 11341817.51 * 2.52%* +Hmean tput-5 26815503.70 ( 0.00%) 27412872.65 * 2.23%* +Hmean tput-8 41873782.21 ( 0.00%) 43326212.92 * 3.47%* +Hmean tput-12 61875980.48 ( 0.00%) 64578337.51 * 4.37%* +Hmean tput-21 105814963.07 ( 0.00%) 111381851.01 * 5.26%* +Hmean tput-30 150349470.98 ( 0.00%) 156507070.73 * 4.10%* +Hmean tput-48 237195937.69 ( 0.00%) 242353597.17 * 2.17%* +Hmean tput-79 360252509.37 ( 0.00%) 362635169.23 * 0.66%* +Hmean tput-96 394571737.90 ( 0.00%) 400952978.48 * 1.62%* + +2-24 threads (on 1NUMA * 24cores = 24cores) + lkp-pbzip2 lkp-pbzip2 + w/o patch w/ patch +Hmean tput-2 11071705.49 ( 0.00%) 11296869.10 * 2.03%* +Hmean tput-4 20782165.19 ( 0.00%) 21949232.15 * 5.62%* +Hmean tput-6 30489565.14 ( 0.00%) 33023026.96 * 8.31%* +Hmean tput-8 40376495.80 ( 0.00%) 42779286.27 * 5.95%* +Hmean tput-12 61264033.85 ( 0.00%) 62995632.78 * 2.83%* +Hmean tput-18 86697139.39 ( 0.00%) 86461545.74 ( -0.27%) +Hmean tput-24 104854637.04 ( 0.00%) 104522649.46 * -0.32%* + +In the case of 6 threads and 8 threads, we see the greatest performance +improvement. 
+ +Similar improvement can be seen on lkp-pixz though the improvement is +smaller: + +* lkp-pixz benchmark + +2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores) + lkp-pixz lkp-pixz + w/o patch w/ patch +Hmean tput-2 6486981.16 ( 0.00%) 6561515.98 * 1.15%* +Hmean tput-4 11645766.38 ( 0.00%) 11614628.43 ( -0.27%) +Hmean tput-6 15429943.96 ( 0.00%) 15957350.76 * 3.42%* +Hmean tput-8 19974087.63 ( 0.00%) 20413746.98 * 2.20%* +Hmean tput-12 28172068.18 ( 0.00%) 28751997.06 * 2.06%* +Hmean tput-18 39413409.54 ( 0.00%) 39896830.55 * 1.23%* +Hmean tput-24 49101815.85 ( 0.00%) 49418141.47 * 0.64%* + +* SPECrate benchmark + +4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores) + Base Base + Run Time Rate + ------- --------- +4 Copies w/o 580 (w/ 570) w/o 11.1 (w/ 11.3) +8 Copies w/o 647 (w/ 605) w/o 20.0 (w/ 21.4, +7%) +16 Copies w/o 844 (w/ 844) w/o 30.6 (w/ 30.6) + +32 Copies(on 4NUMA * 32 cores = 128cores) +[w/o patch] + Base Base Base +Benchmarks Copies Run Time Rate +--------------- ------- --------- --------- +500.perlbench_r 32 584 87.2 * +502.gcc_r 32 503 90.2 * +505.mcf_r 32 745 69.4 * +520.omnetpp_r 32 1031 40.7 * +523.xalancbmk_r 32 597 56.6 * +525.x264_r 1 -- CE +531.deepsjeng_r 32 336 109 * +541.leela_r 32 556 95.4 * +548.exchange2_r 32 513 163 * +557.xz_r 32 530 65.2 * + Est. SPECrate2017_int_base 80.3 + +[w/ patch] + Base Base Base +Benchmarks Copies Run Time Rate +--------------- ------- --------- --------- +500.perlbench_r 32 580 87.8 (+0.688%) * +502.gcc_r 32 477 95.1 (+5.432%) * +505.mcf_r 32 644 80.3 (+13.574%) * +520.omnetpp_r 32 942 44.6 (+9.58%) * +523.xalancbmk_r 32 560 60.4 (+6.714%%) * +525.x264_r 1 -- CE +531.deepsjeng_r 32 337 109 (+0.000%) * +541.leela_r 32 554 95.6 (+0.210%) * +548.exchange2_r 32 515 163 (+0.000%) * +557.xz_r 32 524 66.0 (+1.227%) * + Est. SPECrate2017_int_base 83.7 (+4.062%) + +On the other hand, it is slightly helpful to CPU-bound tasks like +kernbench: + +* 24-96 threads kernbench (on 4NUMA * 24cores = 96cores) + kernbench kernbench + w/o cluster w/ cluster +Min user-24 12054.67 ( 0.00%) 12024.19 ( 0.25%) +Min syst-24 1751.51 ( 0.00%) 1731.68 ( 1.13%) +Min elsp-24 600.46 ( 0.00%) 598.64 ( 0.30%) +Min user-48 12361.93 ( 0.00%) 12315.32 ( 0.38%) +Min syst-48 1917.66 ( 0.00%) 1892.73 ( 1.30%) +Min elsp-48 333.96 ( 0.00%) 332.57 ( 0.42%) +Min user-96 12922.40 ( 0.00%) 12921.17 ( 0.01%) +Min syst-96 2143.94 ( 0.00%) 2110.39 ( 1.56%) +Min elsp-96 211.22 ( 0.00%) 210.47 ( 0.36%) +Amean user-24 12063.99 ( 0.00%) 12030.78 * 0.28%* +Amean syst-24 1755.20 ( 0.00%) 1735.53 * 1.12%* +Amean elsp-24 601.60 ( 0.00%) 600.19 ( 0.23%) +Amean user-48 12362.62 ( 0.00%) 12315.56 * 0.38%* +Amean syst-48 1921.59 ( 0.00%) 1894.95 * 1.39%* +Amean elsp-48 334.10 ( 0.00%) 332.82 * 0.38%* +Amean user-96 12925.27 ( 0.00%) 12922.63 ( 0.02%) +Amean syst-96 2146.66 ( 0.00%) 2122.20 * 1.14%* +Amean elsp-96 211.96 ( 0.00%) 211.79 ( 0.08%) + +Note this patch isn't an universal win, it might hurt those workload +which can benefit from packing. Though tasks which want to take +advantages of lower communication latency of one cluster won't +necessarily been packed in one cluster while kernel is not aware of +clusters, they have some chance to be randomly packed. But this +patch will make them more likely spread. 
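A simple way to confirm the new level on a running kernel (editorial sketch, not part of the patch) is to walk the per-CPU domain names exported under /proc/sys/kernel/sched_domain/, which assumes CONFIG_SCHED_DEBUG=y as in v4.19; with CONFIG_SCHED_CLUSTER=y a CLS entry should appear between SMT (if any) and MC.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char path[96], name[64];
	int level;

	for (level = 0; level < 8; level++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/proc/sys/kernel/sched_domain/cpu0/domain%d/name",
			 level);
		f = fopen(path, "r");
		if (!f)
			break;	/* no more domain levels for cpu0 */
		if (fgets(name, sizeof(name), f)) {
			name[strcspn(name, "\n")] = '\0';
			printf("domain%d: %s\n", level, name);
		}
		fclose(f);
	}
	return 0;
}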
+ +Signed-off-by: Barry Song +Tested-by: Yicong Yang +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Jiang Yi +--- + arch/arm64/Kconfig | 9 +++++++++ + include/linux/sched/topology.h | 7 +++++++ + include/linux/topology.h | 7 +++++++ + kernel/sched/topology.c | 5 +++++ + 4 files changed, 28 insertions(+) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index aeae4ca1e7df..323d88736d28 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -794,6 +794,15 @@ config SCHED_MC + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + ++config SCHED_CLUSTER ++ bool "Cluster scheduler support" ++ help ++ Cluster scheduler support improves the CPU scheduler's decision ++ making when dealing with machines that have clusters of CPUs. ++ Cluster usually means a couple of CPUs which are placed closely ++ by sharing mid-level caches, last-level cache tags or internal ++ busses. ++ + config SCHED_SMT + bool "SMT scheduler support" + help +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index a9032116c13e..15d2e06f690b 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -40,6 +40,13 @@ static inline int cpu_smt_flags(void) + } + #endif + ++#ifdef CONFIG_SCHED_CLUSTER ++static inline int cpu_cluster_flags(void) ++{ ++ return SD_SHARE_PKG_RESOURCES; ++} ++#endif ++ + #ifdef CONFIG_SCHED_MC + static inline int cpu_core_flags(void) + { +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 90dd075394b2..58f8a9e9d90b 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -211,6 +211,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) + } + #endif + ++#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) ++static inline const struct cpumask *cpu_cluster_mask(int cpu) ++{ ++ return topology_cluster_cpumask(cpu); ++} ++#endif ++ + static inline const struct cpumask *cpu_cpu_mask(int cpu) + { + return cpumask_of_node(cpu_to_node(cpu)); +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index ad5591520c99..5d662314c08b 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1298,6 +1298,11 @@ static struct sched_domain_topology_level default_topology[] = { + #ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + #endif ++ ++#ifdef CONFIG_SCHED_CLUSTER ++ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, ++#endif ++ + #ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + #endif +-- +2.23.0 + diff --git a/patches/0760-topology-sysfs-export-cluster-attributes-only-if-an-.patch b/patches/0760-topology-sysfs-export-cluster-attributes-only-if-an-.patch new file mode 100644 index 0000000000000000000000000000000000000000..61d51b1a645e20a34a2975e6330b1ee1edd67bcf --- /dev/null +++ b/patches/0760-topology-sysfs-export-cluster-attributes-only-if-an-.patch @@ -0,0 +1,128 @@ +From 9af57e69c315e5898098a00433fdf56613529fac Mon Sep 17 00:00:00 2001 +From: Heiko Carstens +Date: Mon, 29 Nov 2021 14:03:08 +0100 +Subject: [PATCH 03/19] topology/sysfs: export cluster attributes only if an + architectures has support + +mainline inclusion +from mainline-v5.17-rc1 +commit e795707703b32fecdd7467afcc33ff1e92416c05 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e795707703b32fecdd7467afcc33ff1e92416c05 + 
+---------------------------------------------------------------------- + +The cluster_id and cluster_cpus topology sysfs attributes have been +added with commit c5e22feffdd7 ("topology: Represent clusters of CPUs +within a die"). + +They are currently only used for x86, arm64, and riscv (via generic +arch topology), however they are still present with bogus default +values for all other architectures. Instead of enforcing such new +sysfs attributes to all architectures, make them only optional visible +if an architecture opts in by defining both the topology_cluster_id +and topology_cluster_cpumask attributes. + +This is similar to what was done when the book and drawer topology +levels were introduced: avoid useless and therefore confusing sysfs +attributes for architectures which cannot make use of them. + +This should not break any existing applications, since this is a +new interface introduced with the v5.16 merge window. + +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Heiko Carstens +Link: https://lore.kernel.org/r/20211129130309.3256168-3-hca@linux.ibm.com +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + Documentation/admin-guide/cputopology.rst + drivers/base/topology.c + include/linux/topology.h + +Signed-off-by: Jiang Yi +--- + Documentation/cputopology.txt | 3 +++ + drivers/base/topology.c | 8 ++++++++ + include/linux/topology.h | 4 ++++ + 3 files changed, 15 insertions(+) + +diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt +index acd55bf0c718..e6645ff18994 100644 +--- a/Documentation/cputopology.txt ++++ b/Documentation/cputopology.txt +@@ -94,6 +94,9 @@ Architecture-neutral, drivers/base/topology.c, exports these attributes. + However, the book and drawer related sysfs files will only be created if + CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. + ++The cluster hierarchy related sysfs files will only be created if an ++architecture provides the related macros as described below. ++ + CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, + where they reflect the cpu and cache hierarchy. 
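The opt-in works purely at the preprocessor level: an architecture that wants the cluster sysfs files defines both macros, which in turn makes include/linux/topology.h define TOPOLOGY_CLUSTER_SYSFS (see the include/linux/topology.h hunk further down). A minimal sketch of what such an arch header provides, mirroring the arm64 definitions added earlier in this series (illustration only, not part of the patch):

/* Hypothetical asm/topology.h fragment: providing both macros is what
 * enables TOPOLOGY_CLUSTER_SYSFS and hence the cluster sysfs attributes. */
#define topology_cluster_id(cpu)	(cpu_topology[cpu].cluster_id)
#define topology_cluster_cpumask(cpu)	(&cpu_topology[cpu].cluster_sibling)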
+ +diff --git a/drivers/base/topology.c b/drivers/base/topology.c +index 7e4bdf65e27a..66ebcf05030f 100644 +--- a/drivers/base/topology.c ++++ b/drivers/base/topology.c +@@ -46,8 +46,10 @@ static DEVICE_ATTR_RO(physical_package_id); + define_id_show_func(die_id); + static DEVICE_ATTR_RO(die_id); + ++#ifdef TOPOLOGY_CLUSTER_SYSFS + define_id_show_func(cluster_id); + static DEVICE_ATTR_RO(cluster_id); ++#endif + + define_id_show_func(core_id); + static DEVICE_ATTR_RO(core_id); +@@ -60,9 +62,11 @@ define_siblings_show_func(core_siblings, core_cpumask); + static DEVICE_ATTR_RO(core_siblings); + static DEVICE_ATTR_RO(core_siblings_list); + ++#ifdef TOPOLOGY_CLUSTER_SYSFS + define_siblings_show_func(cluster_cpus, cluster_cpumask); + static DEVICE_ATTR_RO(cluster_cpus); + static DEVICE_ATTR_RO(cluster_cpus_list); ++#endif + + #ifdef CONFIG_SCHED_BOOK + define_id_show_func(book_id); +@@ -83,14 +87,18 @@ static DEVICE_ATTR_RO(drawer_siblings_list); + static struct attribute *default_attrs[] = { + &dev_attr_physical_package_id.attr, + &dev_attr_die_id.attr, ++#ifdef TOPOLOGY_CLUSTER_SYSFS + &dev_attr_cluster_id.attr, ++#endif + &dev_attr_core_id.attr, + &dev_attr_thread_siblings.attr, + &dev_attr_thread_siblings_list.attr, + &dev_attr_core_siblings.attr, + &dev_attr_core_siblings_list.attr, ++#ifdef TOPOLOGY_CLUSTER_SYSFS + &dev_attr_cluster_cpus.attr, + &dev_attr_cluster_cpus_list.attr, ++#endif + #ifdef CONFIG_SCHED_BOOK + &dev_attr_book_id.attr, + &dev_attr_book_siblings.attr, +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 58f8a9e9d90b..9033a952ee68 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -182,6 +182,10 @@ static inline int cpu_to_mem(int cpu) + + #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ + ++#if defined(topology_cluster_id) && defined(topology_cluster_cpumask) ++#define TOPOLOGY_CLUSTER_SYSFS ++#endif ++ + #ifndef topology_physical_package_id + #define topology_physical_package_id(cpu) ((void)(cpu), -1) + #endif +-- +2.23.0 + diff --git a/patches/0761-topology-Remove-unused-cpu_cluster_mask.patch b/patches/0761-topology-Remove-unused-cpu_cluster_mask.patch new file mode 100644 index 0000000000000000000000000000000000000000..7536559e6f9f9cf848859aff3edcd6374cc7713d --- /dev/null +++ b/patches/0761-topology-Remove-unused-cpu_cluster_mask.patch @@ -0,0 +1,49 @@ +From 70b392a77b5436b9020266a55cb80ad048c2ffba Mon Sep 17 00:00:00 2001 +From: Dietmar Eggemann +Date: Fri, 13 May 2022 11:34:33 +0200 +Subject: [PATCH 04/19] topology: Remove unused cpu_cluster_mask() + +mainline inclusion +from mainline-v5.19-rc1 +commit 15f214f9bdb7c1f560b4bf863c5a72ff53b442a4 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15f214f9bdb7c1f560b4bf863c5a72ff53b442a4 + +------------------------------------------------------------------------ + +default_topology[] uses cpu_clustergroup_mask() for the CLS level +(guarded by CONFIG_SCHED_CLUSTER) which is currently provided by x86 +(arch/x86/kernel/smpboot.c) and arm64 (drivers/base/arch_topology.c). 
+ +Fixes: 778c558f49a2 ("sched: Add cluster scheduler level in core and related Kconfig for ARM64") +Acked-by: Barry Song +Signed-off-by: Dietmar Eggemann +Link: https://lore.kernel.org/r/20220513093433.425163-1-dietmar.eggemann@arm.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Jiang Yi +--- + include/linux/topology.h | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 9033a952ee68..9a7753fcae6a 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -215,13 +215,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) + } + #endif + +-#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) +-static inline const struct cpumask *cpu_cluster_mask(int cpu) +-{ +- return topology_cluster_cpumask(cpu); +-} +-#endif +- + static inline const struct cpumask *cpu_cpu_mask(int cpu) + { + return cpumask_of_node(cpu_to_node(cpu)); +-- +2.23.0 + diff --git a/patches/0762-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch b/patches/0762-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch new file mode 100644 index 0000000000000000000000000000000000000000..728a1e4b05c57b50cbb8765539725115f94c4274 --- /dev/null +++ b/patches/0762-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch @@ -0,0 +1,73 @@ +From 9c1386d3a2d25509d99b671e56fd4a380762d1f5 Mon Sep 17 00:00:00 2001 +From: Ionela Voinescu +Date: Mon, 4 Jul 2022 11:16:01 +0100 +Subject: [PATCH 05/19] arch_topology: Limit span of cpu_clustergroup_mask() + +mainline inclusion +from mainline-v6.0-rc1 +commit bfcc4397435dc0407099b9a805391abc05c2313b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I88UKS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bfcc4397435dc0407099b9a805391abc05c2313b + +---------------------------------------------------------------------- + +Currently the cluster identifier is not set on DT based platforms. +The reset or default value is -1 for all the CPUs. Once we assign the +cluster identifier values correctly, the cluster_sibling mask will be +populated and returned by cpu_clustergroup_mask() to contribute in the +creation of the CLS scheduling domain level, if SCHED_CLUSTER is +enabled. + +To avoid topologies that will result in questionable or incorrect +scheduling domains, impose restrictions regarding the span of clusters, +as presented to scheduling domains building code: cluster_sibling should +not span more or the same CPUs as cpu_coregroup_mask(). + +This is needed in order to obtain a strict separation between the MC and +CLS levels, and maintain the same domains for existing platforms in +the presence of CONFIG_SCHED_CLUSTER, where the new cluster information +is redundant and irrelevant for the scheduler. + +While previously the scheduling domain builder code would have removed MC +as redundant and kept CLS if SCHED_CLUSTER was enabled and the +cpu_coregroup_mask() and cpu_clustergroup_mask() spanned the same CPUs, +now CLS will be removed and MC kept. 
+ +Link: https://lore.kernel.org/r/20220704101605.1318280-18-sudeep.holla@arm.com +Cc: Darren Hart +Tested-by: Conor Dooley +Acked-by: Vincent Guittot +Signed-off-by: Ionela Voinescu +Signed-off-by: Sudeep Holla + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + arch/arm64/kernel/topology.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index e4effe6f3177..b76c92b6ac14 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -232,6 +232,14 @@ const struct cpumask *cpu_coregroup_mask(int cpu) + + const struct cpumask *cpu_clustergroup_mask(int cpu) + { ++ /* ++ * Forbid cpu_clustergroup_mask() to span more or the same CPUs as ++ * cpu_coregroup_mask(). ++ */ ++ if (cpumask_subset(cpu_coregroup_mask(cpu), ++ &cpu_topology[cpu].cluster_sibling)) ++ return get_cpu_mask(cpu); ++ + return &cpu_topology[cpu].cluster_sibling; + } + +-- +2.23.0 + diff --git a/patches/0763-arch_topology-Make-cluster-topology-span-at-least-SM.patch b/patches/0763-arch_topology-Make-cluster-topology-span-at-least-SM.patch new file mode 100644 index 0000000000000000000000000000000000000000..7a8cf46a45e327686e980aa80d2f6722fa7da390 --- /dev/null +++ b/patches/0763-arch_topology-Make-cluster-topology-span-at-least-SM.patch @@ -0,0 +1,72 @@ +From 644d6533015ec38f1abad1c1104f63262051a80e Mon Sep 17 00:00:00 2001 +From: Yicong Yang +Date: Mon, 5 Sep 2022 20:26:15 +0800 +Subject: [PATCH 06/19] arch_topology: Make cluster topology span at least SMT + CPUs + +mainline inclusion +from mainline-v6.0-rc5 +commit 5ac251c8a05ce074e5efac779debf82a15d870a3 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I88UKS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5ac251c8a05ce074e5efac779debf82a15d870a3 + +---------------------------------------------------------------------- + +Currently cpu_clustergroup_mask() will return CPU mask if cluster span more +or the same CPUs as cpu_coregroup_mask(). This will result topology borken +on non-Cluster SMT machines when building with CONFIG_SCHED_CLUSTER=y. + +Test with: +qemu-system-aarch64 -enable-kvm -machine virt \ + -net none \ + -cpu host \ + -bios ./QEMU_EFI.fd \ + -m 2G \ + -smp 48,sockets=2,cores=12,threads=2 \ + -kernel $Image \ + -initrd $Rootfs \ + -nographic + -append "rdinit=init console=ttyAMA0 sched_verbose loglevel=8" + +We'll get below error: +[ 3.084568] BUG: arch topology borken +[ 3.084570] the SMT domain not a subset of the CLS domain + +Since cluster is a level higher than SMT, fix this by making cluster +spans at least SMT CPUs. 
+ +Fixes: bfcc4397435d ("arch_topology: Limit span of cpu_clustergroup_mask()") +Cc: Sudeep Holla +Cc: Vincent Guittot +Cc: Ionela Voinescu +Cc: Greg KH +Reviewed-by: Sudeep Holla +Signed-off-by: Yicong Yang +Link: https://lore.kernel.org/r/20220905122615.12946-1-yangyicong@huawei.com +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + arch/arm64/kernel/topology.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index b76c92b6ac14..46f58120392c 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -238,7 +238,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu) + */ + if (cpumask_subset(cpu_coregroup_mask(cpu), + &cpu_topology[cpu].cluster_sibling)) +- return get_cpu_mask(cpu); ++ return topology_sibling_cpumask(cpu); + + return &cpu_topology[cpu].cluster_sibling; + } +-- +2.23.0 + diff --git a/patches/0764-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch b/patches/0764-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch new file mode 100644 index 0000000000000000000000000000000000000000..44ed2f7e345af9bd2e2436a49757c85436a0b7fd --- /dev/null +++ b/patches/0764-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch @@ -0,0 +1,176 @@ +From b34a517d3516bc566787d37458aa0a3b493f966d Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Mon, 17 Oct 2022 15:01:55 +0800 +Subject: [PATCH 07/19] sched: Add per_cpu cluster domain info and + cpus_share_lowest_cache API + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ + +---------------------------------------------------------------------- + +Add per-cpu cluster domain info and cpus_share_lowest_cache() API. +This is the preparation for the optimization of select_idle_cpu() +on platforms with cluster scheduler level. + +Tested-by: K Prateek Nayak +Signed-off-by: Barry Song +Signed-off-by: Yicong Yang +Reviewed-by: Gautham R. 
Shenoy +Reviewed-by: Tim Chen +Reviewed-by: Vincent Guittot +Signed-off-by: Jie Liu + +Conflicts: + include/linux/sched/sd_flags.h + kernel/sched/core.c + kernel/sched/sched.h + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + include/linux/sched/topology.h | 23 +++++++++++++++-------- + kernel/sched/core.c | 14 ++++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 15 +++++++++++++++ + 4 files changed, 46 insertions(+), 8 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 15d2e06f690b..55eec54e7f1e 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -25,13 +25,14 @@ + #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ + #define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ + #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +-#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ +-#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +-#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +-#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ +-#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +-#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +-#define SD_NUMA 0x4000 /* cross-node balancing */ ++#define SD_CLUSTER 0x0100 /* Domain members share CPU cluster */ ++#define SD_SHARE_POWERDOMAIN 0x0200 /* Domain members share power domain */ ++#define SD_SHARE_PKG_RESOURCES 0x0400 /* Domain members share cpu pkg resources */ ++#define SD_SERIALIZE 0x0800 /* Only a single load balancing instance */ ++#define SD_ASYM_PACKING 0x1000 /* Place busy groups earlier in the domain */ ++#define SD_PREFER_SIBLING 0x2000 /* Prefer to place tasks in a sibling domain */ ++#define SD_OVERLAP 0x4000 /* sched_domains of this level overlap */ ++#define SD_NUMA 0x8000 /* cross-node balancing */ + + #ifdef CONFIG_SCHED_SMT + static inline int cpu_smt_flags(void) +@@ -43,7 +44,7 @@ static inline int cpu_smt_flags(void) + #ifdef CONFIG_SCHED_CLUSTER + static inline int cpu_cluster_flags(void) + { +- return SD_SHARE_PKG_RESOURCES; ++ return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; + } + #endif + +@@ -180,6 +181,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); + void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + + bool cpus_share_cache(int this_cpu, int that_cpu); ++bool cpus_share_lowest_cache(int this_cpu, int that_cpu); + + typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + typedef int (*sched_domain_flags_f)(void); +@@ -227,6 +229,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) + return true; + } + ++static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) ++{ ++ return true; ++} ++ + #endif /* !CONFIG_SMP */ + + static inline int task_node(const struct task_struct *p) +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 7825ceaae0c4..bbfed1ce2372 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1851,6 +1851,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) + + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); + } ++ ++/* ++ * Whether CPUs are share lowest cache, which means LLC on non-cluster ++ * machines and LLC tag or L2 on machines with clusters. 
++ */ ++bool cpus_share_lowest_cache(int this_cpu, int that_cpu) ++{ ++ if (this_cpu == that_cpu) ++ return true; ++ ++ return per_cpu(sd_lowest_cache_id, this_cpu) == ++ per_cpu(sd_lowest_cache_id, that_cpu); ++} ++ + #endif /* CONFIG_SMP */ + + static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1d882a2b8d5f..c9019e1a6296 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1307,7 +1307,9 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_lowest_cache_id); + DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DECLARE_PER_CPU(struct sched_domain *, sd_numa); + DECLARE_PER_CPU(struct sched_domain *, sd_asym); + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 5d662314c08b..0b299f9d60cf 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -412,6 +412,8 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_lowest_cache_id); ++DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain *, sd_numa); + DEFINE_PER_CPU(struct sched_domain *, sd_asym); +@@ -445,6 +447,18 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ sd = lowest_flag_domain(cpu, SD_CLUSTER); ++ if (sd) ++ id = cpumask_first(sched_domain_span(sd)); ++ rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); ++ ++ /* ++ * This assignment should be placed after the sd_llc_id as ++ * we want this id equals to cluster id on cluster machines ++ * but equals to LLC id on non-Cluster machines. ++ */ ++ per_cpu(sd_lowest_cache_id, cpu) = id; ++ + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + +@@ -1162,6 +1176,7 @@ static struct cpumask ***sched_domains_numa_masks; + */ + #define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ ++ SD_CLUSTER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ +-- +2.23.0 + diff --git a/patches/0765-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch b/patches/0765-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch new file mode 100644 index 0000000000000000000000000000000000000000..59d8d63ca6e3a6be45a487e30add57733b27f899 --- /dev/null +++ b/patches/0765-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch @@ -0,0 +1,224 @@ +From ee8485e1b79de27b4ce5573dce128d429ccaae03 Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Mon, 17 Oct 2022 15:34:27 +0800 +Subject: [PATCH 08/19] sched/fair: Scan cluster before scanning LLC in wake-up + path + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ + +---------------------------------------------------------------------- + +For platforms having clusters like Kunpeng920, CPUs within the same cluster +have lower latency when synchronizing and accessing shared resources like +cache. 
Thus, this patch tries to find an idle cpu within the cluster of the +target CPU before scanning the whole LLC to gain lower latency. + +Testing has been done on Kunpeng920 by pinning tasks to one numa and two +numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. + +With this patch, We noticed enhancement on tbench within one numa or cross +two numa. + +On numa 0: + 6.0-rc1 patched +Hmean 1 351.20 ( 0.00%) 396.45 * 12.88%* +Hmean 2 700.43 ( 0.00%) 793.76 * 13.32%* +Hmean 4 1404.42 ( 0.00%) 1583.62 * 12.76%* +Hmean 8 2833.31 ( 0.00%) 3147.85 * 11.10%* +Hmean 16 5501.90 ( 0.00%) 6089.89 * 10.69%* +Hmean 32 10428.59 ( 0.00%) 10619.63 * 1.83%* +Hmean 64 8223.39 ( 0.00%) 8306.93 * 1.02%* +Hmean 128 7042.88 ( 0.00%) 7068.03 * 0.36%* + +On numa 0-1: + 6.0-rc1 patched +Hmean 1 363.06 ( 0.00%) 397.13 * 9.38%* +Hmean 2 721.68 ( 0.00%) 789.84 * 9.44%* +Hmean 4 1435.15 ( 0.00%) 1566.01 * 9.12%* +Hmean 8 2776.17 ( 0.00%) 3007.05 * 8.32%* +Hmean 16 5471.71 ( 0.00%) 6103.91 * 11.55%* +Hmean 32 10164.98 ( 0.00%) 11531.81 * 13.45%* +Hmean 64 17143.28 ( 0.00%) 20078.68 * 17.12%* +Hmean 128 14552.70 ( 0.00%) 15156.41 * 4.15%* +Hmean 256 12827.37 ( 0.00%) 13326.86 * 3.89%* + +Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch +in the code has not been tested but it supposed to work. + +Suggested-by: Peter Zijlstra +[https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] +Tested-by: Yicong Yang +Signed-off-by: Barry Song +Signed-off-by: Yicong Yang +Reviewed-by: Tim Chen +Reviewed-by: Chen Yu +Signed-off-by: Jie Liu + +Conflicts: + kernel/sched/fair.c + kernel/sched/sched.h + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 11 +++++++++ + 3 files changed, 58 insertions(+), 4 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6d0ec315f7be..11fc12369c7d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6800,6 +6800,30 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + #endif + ++ if (static_branch_unlikely(&sched_cluster_active)) { ++ struct sched_domain *sdc = ++ rcu_dereference(per_cpu(sd_cluster, target)); ++ ++ if (sdc) { ++ for_each_cpu_wrap(core, sched_domain_span(sdc), target) { ++ bool idle = true; ++ ++ if (!cpumask_test_cpu(core, cpus)) ++ continue; ++ ++ for_each_cpu(cpu, cpu_smt_mask(core)) { ++ cpumask_clear_cpu(cpu, cpus); ++ if (!available_idle_cpu(cpu)) ++ idle = false; ++ } ++ ++ if (idle) ++ return core; ++ } ++ cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); ++ } ++ } ++ + for_each_cpu_wrap(core, cpus, target) { + bool idle = true; + +@@ -6905,8 +6929,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + #endif + ++ if (static_branch_unlikely(&sched_cluster_active)) { ++ struct sched_domain *sdc = ++ rcu_dereference(per_cpu(sd_cluster, target)); ++ ++ if (sdc) { ++ for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { ++ if (!cpumask_test_cpu(cpu, cpus)) ++ continue; ++ if (--nr <= 0) ++ return -1; ++ if (available_idle_cpu(cpu) || ++ sched_idle_cpu(cpu)) ++ return cpu; ++ } ++ cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); ++ } ++ } ++ + for_each_cpu_wrap(cpu, cpus, target) { +- if (!--nr) ++ if (--nr <= 0) + return -1; + if 
(available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + break; +@@ -6956,11 +6998,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * If the previous CPU is cache affine and idle, don't be stupid: + */ + #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +- if (prev != target && cpus_share_cache(prev, target) && ++ if (prev != target && cpus_share_lowest_cache(prev, target) && + cpumask_test_cpu(prev, p->select_cpus) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) { + #else +- if (prev != target && cpus_share_cache(prev, target) && ++ if (prev != target && cpus_share_lowest_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) { + #endif + SET_STAT(found_idle_cpu_easy); +@@ -6971,7 +7013,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && +- cpus_share_cache(recent_used_cpu, target) && ++ cpus_share_lowest_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus)) { +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c9019e1a6296..131228b5c268 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1312,6 +1312,7 @@ DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DECLARE_PER_CPU(struct sched_domain *, sd_numa); + DECLARE_PER_CPU(struct sched_domain *, sd_asym); ++extern struct static_key_false sched_cluster_active; + + struct sched_group_capacity { + atomic_t ref; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 0b299f9d60cf..eda15f08577f 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -418,6 +418,8 @@ DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain *, sd_numa); + DEFINE_PER_CPU(struct sched_domain *, sd_asym); + ++DEFINE_STATIC_KEY_FALSE(sched_cluster_active); ++ + static void update_top_cache_domain(int cpu) + { + #ifdef CONFIG_SCHED_STEAL +@@ -1856,6 +1858,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + struct s_data d; + struct rq *rq = NULL; + int i, ret = -ENOMEM; ++ bool has_cluster = false; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) +@@ -1868,6 +1871,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + sd = NULL; + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); ++ has_cluster |= sd->flags & SD_CLUSTER; + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) +@@ -1924,6 +1928,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + } + rcu_read_unlock(); + ++ if (has_cluster) ++ static_branch_inc_cpuslocked(&sched_cluster_active); ++ + if (rq && sched_debug_enabled) { + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); +@@ -2018,8 +2025,12 @@ int sched_init_domains(const struct cpumask *cpu_map) + */ + static void detach_destroy_domains(const struct cpumask *cpu_map) + { ++ unsigned int cpu = cpumask_any(cpu_map); + int i; + ++ if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) ++ static_branch_dec_cpuslocked(&sched_cluster_active); ++ + rcu_read_lock(); + 
for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); +-- +2.23.0 + diff --git a/patches/0766-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch b/patches/0766-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch new file mode 100644 index 0000000000000000000000000000000000000000..0d82d41523f4e815111234bd1a0c3d2ac91c1583 --- /dev/null +++ b/patches/0766-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch @@ -0,0 +1,77 @@ +From 86ef1e50a2a6a271baf972c3fa5a11e91bf69533 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:38 -0800 +Subject: [PATCH 09/19] scheduler: Create SDTL_SKIP flag to skip topology level + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +A system admin may not want to use cluster scheduling. Make changes to +allow cluster topology level to be skipped when building sched domains. + +Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can +check if the cluster topology level should be skipped when building +sched domains. + +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu + +Conflicts: + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + include/linux/sched/topology.h | 1 + + kernel/sched/topology.c | 12 ++++++++++-- + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 55eec54e7f1e..ae4ba452c111 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -187,6 +187,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + typedef int (*sched_domain_flags_f)(void); + + #define SDTL_OVERLAP 0x01 ++#define SDTL_SKIP 0x02 + + struct sd_data { + struct sched_domain *__percpu *sd; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index eda15f08577f..887e2d06d98a 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1330,8 +1330,16 @@ static struct sched_domain_topology_level default_topology[] = { + static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + ++static struct sched_domain_topology_level * ++next_tl(struct sched_domain_topology_level *tl) ++{ ++ while (tl->mask && tl->flags & SDTL_SKIP) ++ ++tl; ++ return tl; ++} ++ + #define for_each_sd_topology(tl) \ +- for (tl = sched_domain_topology; tl->mask; tl++) ++ for (tl = next_tl(sched_domain_topology); tl->mask; tl = next_tl(++tl)) + + void set_sched_topology(struct sched_domain_topology_level *tl) + { +@@ -1872,7 +1880,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + has_cluster |= sd->flags & SD_CLUSTER; +- if (tl == sched_domain_topology) ++ if (tl == next_tl(sched_domain_topology)) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) + sd->flags |= SD_OVERLAP; +-- +2.23.0 + diff --git a/patches/0767-sysctl-add-a-new-register_sysctl_init-interface.patch b/patches/0767-sysctl-add-a-new-register_sysctl_init-interface.patch new file mode 100644 index 0000000000000000000000000000000000000000..9c7d3f3ef0607fb34566689a55131114eb3a2a74 --- /dev/null +++ b/patches/0767-sysctl-add-a-new-register_sysctl_init-interface.patch @@ -0,0 +1,196 @@ +From c288211ed99601e8c398e7f5db42373bcaa5f5c1 Mon Sep 17 00:00:00 2001 +From: 
Xiaoming Ni +Date: Thu, 28 Jul 2022 18:06:57 +0800 +Subject: [PATCH 10/19] sysctl: add a new register_sysctl_init() interface + +mainline inclusion +from mainline-v5.17-rc1 +commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2 + +---------------------------------------------------------------------- + +Patch series "sysctl: first set of kernel/sysctl cleanups", v2. + +Finally had time to respin the series of the work we had started last +year on cleaning up the kernel/sysct.c kitchen sink. People keeps +stuffing their sysctls in that file and this creates a maintenance +burden. So this effort is aimed at placing sysctls where they actually +belong. + +I'm going to split patches up into series as there is quite a bit of +work. + +This first set adds register_sysctl_init() for uses of registerting a +sysctl on the init path, adds const where missing to a few places, +generalizes common values so to be more easy to share, and starts the +move of a few kernel/sysctl.c out where they belong. + +The majority of rework on v2 in this first patch set is 0-day fixes. +Eric Biederman's feedback is later addressed in subsequent patch sets. + +I'll only post the first two patch sets for now. We can address the +rest once the first two patch sets get completely reviewed / Acked. + +This patch (of 9): + +The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty +dishes, this makes it very difficult to maintain. + +To help with this maintenance let's start by moving sysctls to places +where they actually belong. The proc sysctl maintainers do not want to +know what sysctl knobs you wish to add for your own piece of code, we +just care about the core logic. + +Today though folks heavily rely on tables on kernel/sysctl.c so they can +easily just extend this table with their needed sysctls. In order to +help users move their sysctls out we need to provide a helper which can +be used during code initialization. + +We special-case the initialization use of register_sysctl() since it +*is* safe to fail, given all that sysctls do is provide a dynamic +interface to query or modify at runtime an existing variable. So the +use case of register_sysctl() on init should *not* stop if the sysctls +don't end up getting registered. It would be counter productive to stop +boot if a simple sysctl registration failed. + +Provide a helper for init then, and document the recommended init levels +to use for callers of this routine. We will later use this in +subsequent patches to start slimming down kernel/sysctl.c tables and +moving sysctl registration to the code which actually needs these +sysctls. + +[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] + +Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org +Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org +Signed-off-by: Xiaoming Ni +Signed-off-by: Luis Chamberlain +Reviewed-by: Kees Cook +Cc: Iurii Zaikin +Cc: "Eric W. 
Biederman" +Cc: Peter Zijlstra +Cc: Greg Kroah-Hartman +Cc: Paul Turner +Cc: Andy Shevchenko +Cc: Sebastian Reichel +Cc: Tetsuo Handa +Cc: Petr Mladek +Cc: Sergey Senozhatsky +Cc: Qing Wang +Cc: Benjamin LaHaise +Cc: Al Viro +Cc: Jan Kara +Cc: Amir Goldstein +Cc: Stephen Kitt +Cc: Antti Palosaari +Cc: Arnd Bergmann +Cc: Benjamin Herrenschmidt +Cc: Clemens Ladisch +Cc: David Airlie +Cc: Jani Nikula +Cc: Joel Becker +Cc: Joonas Lahtinen +Cc: Joseph Qi +Cc: Julia Lawall +Cc: Lukas Middendorf +Cc: Mark Fasheh +Cc: Phillip Potter +Cc: Rodrigo Vivi +Cc: Douglas Gilbert +Cc: James E.J. Bottomley +Cc: Jani Nikula +Cc: John Ogness +Cc: Martin K. Petersen +Cc: "Rafael J. Wysocki" +Cc: Steven Rostedt (VMware) +Cc: Suren Baghdasaryan +Cc: "Theodore Ts'o" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Liu Shixin +Reviewed-by: Kefeng Wang +Signed-off-by: Zheng Zengkai + +Conflicts: + both modified: fs/proc/proc_sysctl.c + both modified: include/linux/sysctl.h + +Signed-off-by: Jiang Yi +--- + fs/proc/proc_sysctl.c | 34 ++++++++++++++++++++++++++++++++++ + include/linux/sysctl.h | 4 ++++ + 2 files changed, 38 insertions(+) + +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index c95f32b83a94..2712aa568331 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + static const struct dentry_operations proc_sys_dentry_operations; +@@ -1376,6 +1377,39 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab + } + EXPORT_SYMBOL(register_sysctl); + ++/** ++ * __register_sysctl_init() - register sysctl table to path ++ * @path: path name for sysctl base ++ * @table: This is the sysctl table that needs to be registered to the path ++ * @table_name: The name of sysctl table, only used for log printing when ++ * registration fails ++ * ++ * The sysctl interface is used by userspace to query or modify at runtime ++ * a predefined value set on a variable. These variables however have default ++ * values pre-set. Code which depends on these variables will always work even ++ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the ++ * ability to query or modify the sysctls dynamically at run time. Chances of ++ * register_sysctl() failing on init are extremely low, and so for both reasons ++ * this function does not return any error as it is used by initialization code. ++ * ++ * Context: Can only be called after your respective sysctl base path has been ++ * registered. So for instance, most base directories are registered early on ++ * init before init levels are processed through proc_sys_init() and ++ * sysctl_init(). 
++ */ ++void __init __register_sysctl_init(const char *path, struct ctl_table *table, ++ const char *table_name) ++{ ++ struct ctl_table_header *hdr = register_sysctl(path, table); ++ ++ if (unlikely(!hdr)) { ++ pr_err("failed when register_sysctl %s to %s\n", ++ table_name, path); ++ return; ++ } ++ kmemleak_not_leak(hdr); ++} ++ + static char *append_path(const char *path, char *pos, const char *name) + { + int namelen; +diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h +index b769ecfcc3bd..04c822f6e7e9 100644 +--- a/include/linux/sysctl.h ++++ b/include/linux/sysctl.h +@@ -198,6 +198,10 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + void unregister_sysctl_table(struct ctl_table_header * table); + + extern int sysctl_init(void); ++extern void __register_sysctl_init(const char *path, struct ctl_table *table, ++ const char *table_name); ++#define register_sysctl_init(path, table) \ ++ __register_sysctl_init(path, table, #table) + + extern struct ctl_table sysctl_mount_point[]; + +-- +2.23.0 + diff --git a/patches/0768-sched-topology-drivers-base-arch_topology-Rebuild-th.patch b/patches/0768-sched-topology-drivers-base-arch_topology-Rebuild-th.patch new file mode 100644 index 0000000000000000000000000000000000000000..5cb07391de6918d3b008ac9c5cd7c6d34db9b6b8 --- /dev/null +++ b/patches/0768-sched-topology-drivers-base-arch_topology-Rebuild-th.patch @@ -0,0 +1,87 @@ +From de72c7e79c7f5d295f0d42322269e24a37bb0701 Mon Sep 17 00:00:00 2001 +From: Morten Rasmussen +Date: Fri, 20 Jul 2018 14:32:32 +0100 +Subject: [PATCH 11/19] sched/topology, drivers/base/arch_topology: Rebuild the + sched_domain hierarchy when capacities change + +mainline inclusion +from mainline-v4.20-rc1 +commit bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 + +---------------------------------------------------------------------- + +The setting of SD_ASYM_CPUCAPACITY depends on the per-CPU capacities. +These might not have their final values when the hierarchy is initially +built as the values depend on cpufreq to be initialized or the values +being set through sysfs. To ensure that the flags are set correctly we +need to rebuild the sched_domain hierarchy whenever the reported per-CPU +capacity (arch_scale_cpu_capacity()) changes. + +This patch ensure that a full sched_domain rebuild happens when CPU +capacity changes occur. 
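
The capacities this rebuild reacts to are the per-CPU values maintained by the arch_topology driver, and they are also visible from userspace. A minimal sketch for inspecting them, assuming the standard /sys/devices/system/cpu/cpu<N>/cpu_capacity attribute exposed by drivers/base/arch_topology.c (illustrative only, not part of this series):

    #include <stdio.h>

    int main(void)
    {
        char path[64];
        int cpu;

        /* Walk CPUs until a cpu_capacity attribute is missing. */
        for (cpu = 0; ; cpu++) {
            unsigned long cap;
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%d/cpu_capacity", cpu);
            f = fopen(path, "r");
            if (!f)
                break;
            if (fscanf(f, "%lu", &cap) == 1)
                printf("cpu%d capacity %lu\n", cpu, cap);
            fclose(f);
        }
        return 0;
    }

On an asymmetric system the printed values differ per CPU, and it is exactly these values that may only settle after cpufreq comes up, which is why a late full rebuild of the hierarchy is needed.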
+ +Signed-off-by: Morten Rasmussen +Signed-off-by: Peter Zijlstra (Intel) +Cc: Greg Kroah-Hartman +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: dietmar.eggemann@arm.com +Cc: valentin.schneider@arm.com +Cc: vincent.guittot@linaro.org +Link: http://lkml.kernel.org/r/1532093554-30504-3-git-send-email-morten.rasmussen@arm.com +Signed-off-by: Ingo Molnar + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + drivers/base/arch_topology.c | 8 ++++++++ + include/linux/arch_topology.h | 1 + + 2 files changed, 9 insertions(+) + +diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c +index 729dded51e7b..5ef5e0198f9e 100644 +--- a/drivers/base/arch_topology.c ++++ b/drivers/base/arch_topology.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +@@ -67,6 +68,13 @@ static int register_cpu_capacity_sysctl(void) + } + subsys_initcall(register_cpu_capacity_sysctl); + ++static int update_topology; ++ ++int topology_update_cpu_topology(void) ++{ ++ return update_topology; ++} ++ + static u32 capacity_scale; + static u32 *raw_capacity; + +diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h +index 80c28bfce557..a0889776a9e0 100644 +--- a/include/linux/arch_topology.h ++++ b/include/linux/arch_topology.h +@@ -9,6 +9,7 @@ + #include + + void topology_normalize_cpu_scale(void); ++int topology_update_cpu_topology(void); + + struct device_node; + bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); +-- +2.23.0 + diff --git a/patches/0769-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch b/patches/0769-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch new file mode 100644 index 0000000000000000000000000000000000000000..ed8e456081f71ba7a119b451b1bac6c58dd345df --- /dev/null +++ b/patches/0769-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch @@ -0,0 +1,58 @@ +From 7a8dde1b4f0ca6155e72ac0c24bbadd78b1952bb Mon Sep 17 00:00:00 2001 +From: Morten Rasmussen +Date: Fri, 20 Jul 2018 14:32:33 +0100 +Subject: [PATCH 12/19] sched/topology, arch/arm64: Rebuild the sched_domain + hierarchy when the CPU capacity changes + +mainline inclusion +from mainline-v4.20-rc1 +commit 3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 + +---------------------------------------------------------------------- + +Asymmetric CPU capacity can not necessarily be determined accurately at +the time the initial sched_domain hierarchy is built during boot. It is +therefore necessary to be able to force a full rebuild of the hierarchy +later triggered by the arch_topology driver. A full rebuild requires the +arch-code to implement arch_update_cpu_topology() which isn't yet +implemented for arm64. This patch points the arm64 implementation to +arch_topology driver to ensure that full hierarchy rebuild happens when +needed. 
+ +Signed-off-by: Morten Rasmussen +Signed-off-by: Peter Zijlstra (Intel) +Cc: Catalin Marinas +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Will Deacon +Cc: dietmar.eggemann@arm.com +Cc: valentin.schneider@arm.com +Cc: vincent.guittot@linaro.org +Link: http://lkml.kernel.org/r/1532093554-30504-4-git-send-email-morten.rasmussen@arm.com +Signed-off-by: Ingo Molnar +Signed-off-by: Jiang Yi +--- + arch/arm64/include/asm/topology.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h +index 164e26035653..7f0de9ec65f7 100644 +--- a/arch/arm64/include/asm/topology.h ++++ b/arch/arm64/include/asm/topology.h +@@ -50,6 +50,9 @@ int pcibus_to_node(struct pci_bus *bus); + /* Replace task scheduler's default cpu-invariant accounting */ + #define arch_scale_cpu_capacity topology_get_cpu_scale + ++/* Enable topology flag updates */ ++#define arch_update_cpu_topology topology_update_cpu_topology ++ + #include + + #endif /* _ASM_ARM_TOPOLOGY_H */ +-- +2.23.0 + diff --git a/patches/0770-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch b/patches/0770-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch new file mode 100644 index 0000000000000000000000000000000000000000..3730c79618999f41ba8ba9ad9aeced468f826a99 --- /dev/null +++ b/patches/0770-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch @@ -0,0 +1,237 @@ +From 559cd62adbc084cfcb1fccc060fe5ecb1010d918 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:40 -0800 +Subject: [PATCH 13/19] scheduler: Add runtime knob sysctl_sched_cluster + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +Allow run time configuration of the scheduler to use cluster +scheduling. Configuration can be changed via the sysctl variable +/proc/sys/kernel/sched_cluster. Setting it to 1 enable cluster +scheduling and setting it to 0 turns it off. + +Cluster scheduling should benefit independent tasks by load balancing +them between clusters. It reaps the most benefit when the system's CPUs +are not fully busy, so we can spread the tasks out between the clusters to +reduce contention on cluster resource (e.g. L2 cache). + +However, if the system is expected to operate close to full utilization, +the system admin could turn this feature off so as not to incur +extra load balancing overhead between the cluster domains. 
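
For a concrete view of the knob, a minimal userspace check is sketched below; it assumes this series is applied and CONFIG_SCHED_CLUSTER=y, since only then does /proc/sys/kernel/sched_cluster exist:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/kernel/sched_cluster", "r");
        int enabled;

        if (!f) {
            perror("sched_cluster");
            return 1;
        }
        if (fscanf(f, "%d", &enabled) == 1)
            printf("cluster scheduling is %s\n",
                   enabled ? "enabled" : "disabled");
        fclose(f);
        return 0;
    }

Writing 0 or 1 back to the same file (as root) goes through sched_cluster_handler() below, which toggles SDTL_SKIP on the cluster topology level and then rebuilds the sched_domain hierarchy.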
+ +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu + +Conflicts: + arch/x86/kernel/smpboot.c + drivers/base/arch_topology.c + include/linux/sched/sysctl.h + +Signed-off-by: Jiang Yi +--- + arch/x86/kernel/smpboot.c | 8 +++++ + drivers/base/arch_topology.c | 10 +++++- + include/linux/sched/sysctl.h | 7 ++++ + include/linux/topology.h | 1 + + kernel/sched/core.c | 1 + + kernel/sched/sched.h | 6 ++++ + kernel/sched/topology.c | 67 ++++++++++++++++++++++++++++++++++++ + 7 files changed, 99 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index e9dd01f7d602..1993690cfd80 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -116,6 +117,13 @@ int arch_update_cpu_topology(void) + return retval; + } + ++void arch_rebuild_cpu_topology(void) ++{ ++ x86_topology_update = true; ++ rebuild_sched_domains(); ++ x86_topology_update = false; ++} ++ + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) + { + unsigned long flags; +diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c +index 5ef5e0198f9e..f601eb3238a1 100644 +--- a/drivers/base/arch_topology.c ++++ b/drivers/base/arch_topology.c +@@ -68,6 +68,7 @@ static int register_cpu_capacity_sysctl(void) + } + subsys_initcall(register_cpu_capacity_sysctl); + ++static u32 capacity_scale; + static int update_topology; + + int topology_update_cpu_topology(void) +@@ -75,7 +76,14 @@ int topology_update_cpu_topology(void) + return update_topology; + } + +-static u32 capacity_scale; ++void __weak arch_rebuild_cpu_topology(void) ++{ ++ update_topology = 1; ++ rebuild_sched_domains(); ++ pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); ++ update_topology = 0; ++} ++ + static u32 *raw_capacity; + + static int free_raw_capacity(void) +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index ad472760e97d..691037143faa 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -104,4 +104,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, + loff_t *ppos); + + extern int sysctl_umh_affinity; ++ ++#ifdef CONFIG_SCHED_CLUSTER ++extern unsigned int sysctl_sched_cluster; ++int sched_cluster_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos); ++#endif ++ + #endif /* _LINUX_SCHED_SYSCTL_H */ +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 9a7753fcae6a..63fb192f425b 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -43,6 +43,7 @@ + if (nr_cpus_node(node)) + + int arch_update_cpu_topology(void); ++void arch_rebuild_cpu_topology(void); + + /* Conform to ACPI 2.0 SLIT distance definitions */ + #define LOCAL_DISTANCE 10 +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index bbfed1ce2372..e518fc08fd41 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5968,6 +5968,7 @@ int sched_cpu_dying(unsigned int cpu) + void __init sched_init_smp(void) + { + sched_init_numa(); ++ set_sched_cluster(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 131228b5c268..7e2c49032615 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1190,6 +1190,12 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) + #endif + } + ++#ifdef CONFIG_SCHED_CLUSTER ++extern void set_sched_cluster(void); 
++#else ++static inline void set_sched_cluster(void) { } ++#endif ++ + #ifdef CONFIG_NUMA + enum numa_topology_type { + NUMA_DIRECT, +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 887e2d06d98a..8157e9fb9bfa 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1330,6 +1330,73 @@ static struct sched_domain_topology_level default_topology[] = { + static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + ++#ifdef CONFIG_SCHED_CLUSTER ++void set_sched_cluster(void) ++{ ++ struct sched_domain_topology_level *tl; ++ ++ for (tl = sched_domain_topology; tl->mask; tl++) { ++ if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { ++ if (!sysctl_sched_cluster) ++ tl->flags |= SDTL_SKIP; ++ else ++ tl->flags &= ~SDTL_SKIP; ++ break; ++ } ++ } ++} ++ ++/* set via /proc/sys/kernel/sched_cluster */ ++unsigned int __read_mostly sysctl_sched_cluster = 1; ++ ++static DEFINE_MUTEX(sched_cluster_mutex); ++int sched_cluster_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ unsigned int oldval; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ mutex_lock(&sched_cluster_mutex); ++ oldval = sysctl_sched_cluster; ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ if (!ret && write) { ++ if (oldval != sysctl_sched_cluster) { ++ set_sched_cluster(); ++ arch_rebuild_cpu_topology(); ++ } ++ } ++ mutex_unlock(&sched_cluster_mutex); ++ ++ return ret; ++} ++ ++static int zero; ++static int one = 1; ++ ++static struct ctl_table sched_cluster_sysctls[] = { ++ { ++ .procname = "sched_cluster", ++ .data = &sysctl_sched_cluster, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_cluster_handler, ++ .extra1 = (void *)&zero, ++ .extra2 = (void *)&one, ++ }, ++ {} ++}; ++ ++static int __init sched_cluster_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", sched_cluster_sysctls); ++ return 0; ++} ++late_initcall(sched_cluster_sysctl_init); ++#endif ++ + static struct sched_domain_topology_level * + next_tl(struct sched_domain_topology_level *tl) + { +-- +2.23.0 + diff --git a/patches/0771-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch b/patches/0771-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch new file mode 100644 index 0000000000000000000000000000000000000000..a380f3ab6250d7c7d57daa92f80d4600705ce016 --- /dev/null +++ b/patches/0771-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch @@ -0,0 +1,72 @@ +From 4ab20861b0c082f9721b87a8c7de2cc673205bd1 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:41 -0800 +Subject: [PATCH 14/19] scheduler: Add boot time enabling/disabling of cluster + scheduling + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +Add boot time parameter sched_cluster to enable or disable cluster +scheduling. 
Set boot parameter as follow: + + sched_cluster=0 disables cluster scheduling + sched_cluster=1 enables cluster scheduling + +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + Documentation/admin-guide/kernel-parameters.txt | 4 ++++ + kernel/sched/topology.c | 16 ++++++++++++++++ + 2 files changed, 20 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 81c3e5e6447f..cd413b202ea5 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4310,6 +4310,10 @@ + + sched_debug [KNL] Enables verbose scheduler debug messages. + ++ sched_cluster= Enable or disable cluster scheduling. ++ 0 -- disable. ++ 1 -- enable. ++ + schedstats= [KNL,X86] Enable or disable scheduled statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8157e9fb9bfa..fdc3ae9e1bc0 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1395,6 +1395,22 @@ static int __init sched_cluster_sysctl_init(void) + return 0; + } + late_initcall(sched_cluster_sysctl_init); ++ ++static int __init sched_cluster_option(char *str) ++{ ++ int enable; ++ ++ if (get_option(&str, &enable)) { ++ if (enable != 0 && enable != 1) ++ return -EINVAL; ++ ++ sysctl_sched_cluster = enable; ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++early_param("sched_cluster", sched_cluster_option); + #endif + + static struct sched_domain_topology_level * +-- +2.23.0 + diff --git a/patches/0772-scheduler-Disable-cluster-scheduling-by-default.patch b/patches/0772-scheduler-Disable-cluster-scheduling-by-default.patch new file mode 100644 index 0000000000000000000000000000000000000000..47fb3d04157dc6050a12dfbd1661c108a75b36a6 --- /dev/null +++ b/patches/0772-scheduler-Disable-cluster-scheduling-by-default.patch @@ -0,0 +1,38 @@ +From 51bf9d798b7d76c74d3c1012758220ebcf67bf38 Mon Sep 17 00:00:00 2001 +From: Yicong Yang +Date: Mon, 13 Feb 2023 10:48:54 +0800 +Subject: [PATCH 15/19] scheduler: Disable cluster scheduling by default + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S + +---------------------------------------------------------------------- + +Disable cluster scheduling by default since it's not a universal win. +User can choose to enable it through sysctl or at boot time according to +their scenario. 
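
Even with the feature off by default, the cluster layout itself is still exported by the earlier topology patches in this series, so an administrator can check what would be grouped before turning it on. A small sketch, assuming the cluster_cpus_list attribute added by the cluster-topology patch (present only when the firmware actually describes clusters):

    #include <stdio.h>

    int main(void)
    {
        char path[80], buf[256];
        int cpu;

        for (cpu = 0; ; cpu++) {
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list",
                     cpu);
            f = fopen(path, "r");
            if (!f)
                break;
            if (fgets(buf, sizeof(buf), f))
                printf("cpu%d cluster: %s", cpu, buf);
            fclose(f);
        }
        return 0;
    }

Enabling is then either sched_cluster=1 on the kernel command line or writing 1 to /proc/sys/kernel/sched_cluster at run time.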
+ +Signed-off-by: Yicong Yang +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + kernel/sched/topology.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index fdc3ae9e1bc0..1cc9ec74d24b 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1347,7 +1347,7 @@ void set_sched_cluster(void) + } + + /* set via /proc/sys/kernel/sched_cluster */ +-unsigned int __read_mostly sysctl_sched_cluster = 1; ++unsigned int __read_mostly sysctl_sched_cluster; + + static DEFINE_MUTEX(sched_cluster_mutex); + int sched_cluster_handler(struct ctl_table *table, int write, +-- +2.23.0 + diff --git a/patches/0773-sched-Open-the-kernel-configuration-for-cluster.patch b/patches/0773-sched-Open-the-kernel-configuration-for-cluster.patch new file mode 100644 index 0000000000000000000000000000000000000000..7eb4316bfe0e24d1bc7ec3f5e1fbae5f0477abff --- /dev/null +++ b/patches/0773-sched-Open-the-kernel-configuration-for-cluster.patch @@ -0,0 +1,35 @@ +From 43a06b04e4eb28df96358685784e7e1f1de03f17 Mon Sep 17 00:00:00 2001 +From: Jie Liu +Date: Mon, 24 Oct 2022 09:34:57 +0800 +Subject: [PATCH 16/19] sched:Open the kernel configuration for cluster. + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S + +---------------------------------------------------------------------- + +In the past configuration, CONFIG_SCHED_CLUSTER was not set. Now, we need +to open the configuration. + +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + arch/arm64/configs/openeuler_defconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig +index ded320901f60..8314f1fe1aec 100644 +--- a/arch/arm64/configs/openeuler_defconfig ++++ b/arch/arm64/configs/openeuler_defconfig +@@ -421,6 +421,7 @@ CONFIG_ARM64_PA_BITS=48 + CONFIG_SCHED_MC=y + # CONFIG_SCHED_SMT is not set + CONFIG_NR_CPUS=1024 ++CONFIG_SCHED_CLUSTER=y + CONFIG_HOTPLUG_CPU=y + # CONFIG_ARM64_BOOTPARAM_HOTPLUG_CPU0 is not set + CONFIG_ARM64_ERR_RECOV=y +-- +2.23.0 + diff --git a/patches/0774-sched-fair-Introduce-SIS_UTIL-to-search-idle-CPU-bas.patch b/patches/0774-sched-fair-Introduce-SIS_UTIL-to-search-idle-CPU-bas.patch new file mode 100644 index 0000000000000000000000000000000000000000..f477fe2dc4ebd2f936a904fadbc47ae49d1e9259 --- /dev/null +++ b/patches/0774-sched-fair-Introduce-SIS_UTIL-to-search-idle-CPU-bas.patch @@ -0,0 +1,553 @@ +From 28930676a127de04c7eaf442f367850be157ac0f Mon Sep 17 00:00:00 2001 +From: Chen Yu +Date: Wed, 30 Nov 2022 18:23:24 +0800 +Subject: [PATCH 17/19] sched/fair: Introduce SIS_UTIL to search idle CPU based + on sum of util_avg + +mainline inclusion +from mainline-v6.0-rc1 +commit 70fb5ccf2ebb09a0c8ebba775041567812d45f86 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M + +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=70fb5ccf2ebb09a0c8ebba775041567812d45 + +-------------------------------- + +[Problem Statement] +select_idle_cpu() might spend too much time searching for an idle CPU, +when the system is overloaded. 
+ +The following histogram is the time spent in select_idle_cpu(), +when running 224 instances of netperf on a system with 112 CPUs +per LLC domain: + +@usecs: +[0] 533 | | +[1] 5495 | | +[2, 4) 12008 | | +[4, 8) 239252 | | +[8, 16) 4041924 |@@@@@@@@@@@@@@ | +[16, 32) 12357398 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[32, 64) 14820255 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| +[64, 128) 13047682 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[128, 256) 8235013 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[256, 512) 4507667 |@@@@@@@@@@@@@@@ | +[512, 1K) 2600472 |@@@@@@@@@ | +[1K, 2K) 927912 |@@@ | +[2K, 4K) 218720 | | +[4K, 8K) 98161 | | +[8K, 16K) 37722 | | +[16K, 32K) 6715 | | +[32K, 64K) 477 | | +[64K, 128K) 7 | | + +netperf latency usecs: + +======= +case load Lat_99th std% +TCP_RR thread-224 257.39 ( 0.21) + +The time spent in select_idle_cpu() is visible to netperf and might have a negative +impact. + +[Symptom analysis] +The patch [1] from Mel Gorman has been applied to track the efficiency +of select_idle_sibling. Copy the indicators here: + +SIS Search Efficiency(se_eff%): + A ratio expressed as a percentage of runqueues scanned versus + idle CPUs found. A 100% efficiency indicates that the target, + prev or recent CPU of a task was idle at wakeup. The lower the + efficiency, the more runqueues were scanned before an idle CPU + was found. + +SIS Domain Search Efficiency(dom_eff%): + Similar, except only for the slower SIS + patch. + +SIS Fast Success Rate(fast_rate%): + Percentage of SIS that used target, prev or + recent CPUs. + +SIS Success rate(success_rate%): + Percentage of scans that found an idle CPU. + +The test is based on Aubrey's schedtests tool, including netperf, hackbench, +schbench and tbench. + +Test on vanilla kernel: +schedstat_parse.py -f netperf_vanilla.log +case load se_eff% dom_eff% fast_rate% success_rate% +TCP_RR 28 threads 99.978 18.535 99.995 100.000 +TCP_RR 56 threads 99.397 5.671 99.964 100.000 +TCP_RR 84 threads 21.721 6.818 73.632 100.000 +TCP_RR 112 threads 12.500 5.533 59.000 100.000 +TCP_RR 140 threads 8.524 4.535 49.020 100.000 +TCP_RR 168 threads 6.438 3.945 40.309 99.999 +TCP_RR 196 threads 5.397 3.718 32.320 99.982 +TCP_RR 224 threads 4.874 3.661 25.775 99.767 +UDP_RR 28 threads 99.988 17.704 99.997 100.000 +UDP_RR 56 threads 99.528 5.977 99.970 100.000 +UDP_RR 84 threads 24.219 6.992 76.479 100.000 +UDP_RR 112 threads 13.907 5.706 62.538 100.000 +UDP_RR 140 threads 9.408 4.699 52.519 100.000 +UDP_RR 168 threads 7.095 4.077 44.352 100.000 +UDP_RR 196 threads 5.757 3.775 35.764 99.991 +UDP_RR 224 threads 5.124 3.704 28.748 99.860 + +schedstat_parse.py -f schbench_vanilla.log +(each group has 28 tasks) +case load se_eff% dom_eff% fast_rate% success_rate% +normal 1 mthread 99.152 6.400 99.941 100.000 +normal 2 mthreads 97.844 4.003 99.908 100.000 +normal 3 mthreads 96.395 2.118 99.917 99.998 +normal 4 mthreads 55.288 1.451 98.615 99.804 +normal 5 mthreads 7.004 1.870 45.597 61.036 +normal 6 mthreads 3.354 1.346 20.777 34.230 +normal 7 mthreads 2.183 1.028 11.257 21.055 +normal 8 mthreads 1.653 0.825 7.849 15.549 + +schedstat_parse.py -f hackbench_vanilla.log +(each group has 28 tasks) +case load se_eff% dom_eff% fast_rate% success_rate% +process-pipe 1 group 99.991 7.692 99.999 100.000 +process-pipe 2 groups 99.934 4.615 99.997 100.000 +process-pipe 3 groups 99.597 3.198 99.987 100.000 +process-pipe 4 groups 98.378 2.464 99.958 100.000 +process-pipe 5 groups 27.474 3.653 89.811 99.800 +process-pipe 6 groups 20.201 4.098 82.763 99.570 
+process-pipe 7 groups 16.423 4.156 77.398 99.316 +process-pipe 8 groups 13.165 3.920 72.232 98.828 +process-sockets 1 group 99.977 5.882 99.999 100.000 +process-sockets 2 groups 99.927 5.505 99.996 100.000 +process-sockets 3 groups 99.397 3.250 99.980 100.000 +process-sockets 4 groups 79.680 4.258 98.864 99.998 +process-sockets 5 groups 7.673 2.503 63.659 92.115 +process-sockets 6 groups 4.642 1.584 58.946 88.048 +process-sockets 7 groups 3.493 1.379 49.816 81.164 +process-sockets 8 groups 3.015 1.407 40.845 75.500 +threads-pipe 1 group 99.997 0.000 100.000 100.000 +threads-pipe 2 groups 99.894 2.932 99.997 100.000 +threads-pipe 3 groups 99.611 4.117 99.983 100.000 +threads-pipe 4 groups 97.703 2.624 99.937 100.000 +threads-pipe 5 groups 22.919 3.623 87.150 99.764 +threads-pipe 6 groups 18.016 4.038 80.491 99.557 +threads-pipe 7 groups 14.663 3.991 75.239 99.247 +threads-pipe 8 groups 12.242 3.808 70.651 98.644 +threads-sockets 1 group 99.990 6.667 99.999 100.000 +threads-sockets 2 groups 99.940 5.114 99.997 100.000 +threads-sockets 3 groups 99.469 4.115 99.977 100.000 +threads-sockets 4 groups 87.528 4.038 99.400 100.000 +threads-sockets 5 groups 6.942 2.398 59.244 88.337 +threads-sockets 6 groups 4.359 1.954 49.448 87.860 +threads-sockets 7 groups 2.845 1.345 41.198 77.102 +threads-sockets 8 groups 2.871 1.404 38.512 74.312 + +schedstat_parse.py -f tbench_vanilla.log +case load se_eff% dom_eff% fast_rate% success_rate% +loopback 28 threads 99.976 18.369 99.995 100.000 +loopback 56 threads 99.222 7.799 99.934 100.000 +loopback 84 threads 19.723 6.819 70.215 100.000 +loopback 112 threads 11.283 5.371 55.371 99.999 +loopback 140 threads 0.000 0.000 0.000 0.000 +loopback 168 threads 0.000 0.000 0.000 0.000 +loopback 196 threads 0.000 0.000 0.000 0.000 +loopback 224 threads 0.000 0.000 0.000 0.000 + +According to the test above, if the system becomes busy, the +SIS Search Efficiency(se_eff%) drops significantly. Although some +benchmarks would finally find an idle CPU(success_rate% = 100%), it is +doubtful whether it is worth it to search the whole LLC domain. + +[Proposal] +It would be ideal to have a crystal ball to answer this question: +How many CPUs must a wakeup path walk down, before it can find an idle +CPU? Many potential metrics could be used to predict the number. +One candidate is the sum of util_avg in this LLC domain. The benefit +of choosing util_avg is that it is a metric of accumulated historic +activity, which seems to be smoother than instantaneous metrics +(such as rq->nr_running). Besides, choosing the sum of util_avg +would help predict the load of the LLC domain more precisely, because +SIS_PROP uses one CPU's idle time to estimate the total LLC domain idle +time. + +In summary, the lower the util_avg is, the more select_idle_cpu() +should scan for idle CPU, and vice versa. When the sum of util_avg +in this LLC domain hits 85% or above, the scan stops. The reason to +choose 85% as the threshold is that this is the imbalance_pct(117) +when a LLC sched group is overloaded. + +Introduce the quadratic function: + +y = SCHED_CAPACITY_SCALE - p * x^2 +and y'= y / SCHED_CAPACITY_SCALE + +x is the ratio of sum_util compared to the CPU capacity: +x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) +y' is the ratio of CPUs to be scanned in the LLC domain, +and the number of CPUs to scan is calculated by: + +nr_scan = llc_weight * y' + +Choosing quadratic function is because: +[1] Compared to the linear function, it scans more aggressively when the + sum_util is low. 
+[2] Compared to the exponential function, it is easier to calculate. +[3] It seems that there is no accurate mapping between the sum of util_avg + and the number of CPUs to be scanned. Use heuristic scan for now. + +For a platform with 112 CPUs per LLC, the number of CPUs to scan is: +sum_util% 0 5 15 25 35 45 55 65 75 85 86 ... +scan_nr 112 111 108 102 93 81 65 47 25 1 0 ... + +For a platform with 16 CPUs per LLC, the number of CPUs to scan is: +sum_util% 0 5 15 25 35 45 55 65 75 85 86 ... +scan_nr 16 15 15 14 13 11 9 6 3 0 0 ... + +Furthermore, to minimize the overhead of calculating the metrics in +select_idle_cpu(), borrow the statistics from periodic load balance. +As mentioned by Abel, on a platform with 112 CPUs per LLC, the +sum_util calculated by periodic load balance after 112 ms would +decay to about 0.5 * 0.5 * 0.5 * 0.7 = 8.75%, thus bringing a delay +in reflecting the latest utilization. But it is a trade-off. +Checking the util_avg in newidle load balance would be more frequent, +but it brings overhead - multiple CPUs write/read the per-LLC shared +variable and introduces cache contention. Tim also mentioned that, +it is allowed to be non-optimal in terms of scheduling for the +short-term variations, but if there is a long-term trend in the load +behavior, the scheduler can adjust for that. + +When SIS_UTIL is enabled, the select_idle_cpu() uses the nr_scan +calculated by SIS_UTIL instead of the one from SIS_PROP. As Peter and +Mel suggested, SIS_UTIL should be enabled by default. + +This patch is based on the util_avg, which is very sensitive to the +CPU frequency invariance. There is an issue that, when the max frequency +has been clamp, the util_avg would decay insanely fast when +the CPU is idle. Commit addca285120b ("cpufreq: intel_pstate: Handle no_turbo +in frequency invariance") could be used to mitigate this symptom, by adjusting +the arch_max_freq_ratio when turbo is disabled. But this issue is still +not thoroughly fixed, because the current code is unaware of the user-specified +max CPU frequency. + +[Test result] + +netperf and tbench were launched with 25% 50% 75% 100% 125% 150% +175% 200% of CPU number respectively. Hackbench and schbench were launched +by 1, 2 ,4, 8 groups. Each test lasts for 100 seconds and repeats 3 times. + +The following is the benchmark result comparison between +baseline:vanilla v5.19-rc1 and compare:patched kernel. Positive compare% +indicates better performance. 
+ +Each netperf test is a: +netperf -4 -H 127.0.1 -t TCP/UDP_RR -c -C -l 100 +netperf.throughput +======= +case load baseline(std%) compare%( std%) +TCP_RR 28 threads 1.00 ( 0.34) -0.16 ( 0.40) +TCP_RR 56 threads 1.00 ( 0.19) -0.02 ( 0.20) +TCP_RR 84 threads 1.00 ( 0.39) -0.47 ( 0.40) +TCP_RR 112 threads 1.00 ( 0.21) -0.66 ( 0.22) +TCP_RR 140 threads 1.00 ( 0.19) -0.69 ( 0.19) +TCP_RR 168 threads 1.00 ( 0.18) -0.48 ( 0.18) +TCP_RR 196 threads 1.00 ( 0.16) +194.70 ( 16.43) +TCP_RR 224 threads 1.00 ( 0.16) +197.30 ( 7.85) +UDP_RR 28 threads 1.00 ( 0.37) +0.35 ( 0.33) +UDP_RR 56 threads 1.00 ( 11.18) -0.32 ( 0.21) +UDP_RR 84 threads 1.00 ( 1.46) -0.98 ( 0.32) +UDP_RR 112 threads 1.00 ( 28.85) -2.48 ( 19.61) +UDP_RR 140 threads 1.00 ( 0.70) -0.71 ( 14.04) +UDP_RR 168 threads 1.00 ( 14.33) -0.26 ( 11.16) +UDP_RR 196 threads 1.00 ( 12.92) +186.92 ( 20.93) +UDP_RR 224 threads 1.00 ( 11.74) +196.79 ( 18.62) + +Take the 224 threads as an example, the SIS search metrics changes are +illustrated below: + + vanilla patched + 4544492 +237.5% 15338634 sched_debug.cpu.sis_domain_search.avg + 38539 +39686.8% 15333634 sched_debug.cpu.sis_failed.avg + 128300000 -87.9% 15551326 sched_debug.cpu.sis_scanned.avg + 5842896 +162.7% 15347978 sched_debug.cpu.sis_search.avg + +There is -87.9% less CPU scans after patched, which indicates lower overhead. +Besides, with this patch applied, there is -13% less rq lock contention +in perf-profile.calltrace.cycles-pp._raw_spin_lock.raw_spin_rq_lock_nested +.try_to_wake_up.default_wake_function.woken_wake_function. +This might help explain the performance improvement - Because this patch allows +the waking task to remain on the previous CPU, rather than grabbing other CPUs' +lock. + +Each hackbench test is a: +hackbench -g $job --process/threads --pipe/sockets -l 1000000 -s 100 +hackbench.throughput +========= +case load baseline(std%) compare%( std%) +process-pipe 1 group 1.00 ( 1.29) +0.57 ( 0.47) +process-pipe 2 groups 1.00 ( 0.27) +0.77 ( 0.81) +process-pipe 4 groups 1.00 ( 0.26) +1.17 ( 0.02) +process-pipe 8 groups 1.00 ( 0.15) -4.79 ( 0.02) +process-sockets 1 group 1.00 ( 0.63) -0.92 ( 0.13) +process-sockets 2 groups 1.00 ( 0.03) -0.83 ( 0.14) +process-sockets 4 groups 1.00 ( 0.40) +5.20 ( 0.26) +process-sockets 8 groups 1.00 ( 0.04) +3.52 ( 0.03) +threads-pipe 1 group 1.00 ( 1.28) +0.07 ( 0.14) +threads-pipe 2 groups 1.00 ( 0.22) -0.49 ( 0.74) +threads-pipe 4 groups 1.00 ( 0.05) +1.88 ( 0.13) +threads-pipe 8 groups 1.00 ( 0.09) -4.90 ( 0.06) +threads-sockets 1 group 1.00 ( 0.25) -0.70 ( 0.53) +threads-sockets 2 groups 1.00 ( 0.10) -0.63 ( 0.26) +threads-sockets 4 groups 1.00 ( 0.19) +11.92 ( 0.24) +threads-sockets 8 groups 1.00 ( 0.08) +4.31 ( 0.11) + +Each tbench test is a: +tbench -t 100 $job 127.0.0.1 +tbench.throughput +====== +case load baseline(std%) compare%( std%) +loopback 28 threads 1.00 ( 0.06) -0.14 ( 0.09) +loopback 56 threads 1.00 ( 0.03) -0.04 ( 0.17) +loopback 84 threads 1.00 ( 0.05) +0.36 ( 0.13) +loopback 112 threads 1.00 ( 0.03) +0.51 ( 0.03) +loopback 140 threads 1.00 ( 0.02) -1.67 ( 0.19) +loopback 168 threads 1.00 ( 0.38) +1.27 ( 0.27) +loopback 196 threads 1.00 ( 0.11) +1.34 ( 0.17) +loopback 224 threads 1.00 ( 0.11) +1.67 ( 0.22) + +Each schbench test is a: +schbench -m $job -t 28 -r 100 -s 30000 -c 30000 +schbench.latency_90%_us +======== +case load baseline(std%) compare%( std%) +normal 1 mthread 1.00 ( 31.22) -7.36 ( 20.25)* +normal 2 mthreads 1.00 ( 2.45) -0.48 ( 1.79) +normal 4 mthreads 1.00 ( 1.69) +0.45 ( 0.64) +normal 8 mthreads 1.00 
( 5.47) +9.81 ( 14.28) + +*Consider the Standard Deviation, this -7.36% regression might not be valid. + +Also, a OLTP workload with a commercial RDBMS has been tested, and there +is no significant change. + +There were concerns that unbalanced tasks among CPUs would cause problems. +For example, suppose the LLC domain is composed of 8 CPUs, and 7 tasks are +bound to CPU0~CPU6, while CPU7 is idle: + + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 +util_avg 1024 1024 1024 1024 1024 1024 1024 0 + +Since the util_avg ratio is 87.5%( = 7/8 ), which is higher than 85%, +select_idle_cpu() will not scan, thus CPU7 is undetected during scan. +But according to Mel, it is unlikely the CPU7 will be idle all the time +because CPU7 could pull some tasks via CPU_NEWLY_IDLE. + +lkp(kernel test robot) has reported a regression on stress-ng.sock on a +very busy system. According to the sched_debug statistics, it might be caused +by SIS_UTIL terminates the scan and chooses a previous CPU earlier, and this +might introduce more context switch, especially involuntary preemption, which +impacts a busy stress-ng. This regression has shown that, not all benchmarks +in every scenario benefit from idle CPU scan limit, and it needs further +investigation. + +Besides, there is slight regression in hackbench's 16 groups case when the +LLC domain has 16 CPUs. Prateek mentioned that we should scan aggressively +in an LLC domain with 16 CPUs. Because the cost to search for an idle one +among 16 CPUs is negligible. The current patch aims to propose a generic +solution and only considers the util_avg. Something like the below could +be applied on top of the current patch to fulfill the requirement: + + if (llc_weight <= 16) + nr_scan = nr_scan * 32 / llc_weight; + +For LLC domain with 16 CPUs, the nr_scan will be expanded to 2 times large. +The smaller the CPU number this LLC domain has, the larger nr_scan will be +expanded. This needs further investigation. + +There is also ongoing work[2] from Abel to filter out the busy CPUs during +wakeup, to further speed up the idle CPU scan. And it could be a following-up +optimization on top of this change. 
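
To make the scan-depth heuristic above easy to sanity-check, the following stand-alone program (illustrative only, not part of the patch) redoes the integer arithmetic of update_idle_cpu_scan() for a configurable LLC width; with llc_weight=112 and pct=117 it reproduces the 112-CPU scan_nr row quoted earlier:

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024ULL

    /* Mirrors the integer math in update_idle_cpu_scan(). */
    static unsigned long nr_scan(unsigned long long sum_util,
                                 unsigned long llc_weight, unsigned long pct)
    {
        unsigned long long x, tmp, y;

        x = sum_util / llc_weight;              /* equation [3] */
        tmp = x * x * pct * pct;
        tmp /= 10000 * SCHED_CAPACITY_SCALE;    /* equation [4] */
        if (tmp > SCHED_CAPACITY_SCALE)
            tmp = SCHED_CAPACITY_SCALE;
        y = SCHED_CAPACITY_SCALE - tmp;
        return (unsigned long)(y * llc_weight / SCHED_CAPACITY_SCALE);
    }

    int main(void)
    {
        const unsigned long llc_weight = 112, pct = 117;
        int util_pct;

        for (util_pct = 0; util_pct <= 100; util_pct += 5) {
            unsigned long long sum_util =
                util_pct * llc_weight * SCHED_CAPACITY_SCALE / 100;
            printf("sum_util%% %3d -> nr_scan %lu\n",
                   util_pct, nr_scan(sum_util, llc_weight, pct));
        }
        return 0;
    }

For example, 85% utilization yields nr_scan = 1 while 86% or more yields 0, matching the table.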
+ +Suggested-by: Tim Chen +Suggested-by: Peter Zijlstra +Signed-off-by: Chen Yu +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Yicong Yang +Tested-by: Mohini Narkhede +Tested-by: K Prateek Nayak +Link: https://lore.kernel.org/r/20220612163428.849378-1-yu.c.chen@intel.com +Signed-off-by: Jialin Zhang +Signed-off-by: Guan Jing +Reviewed-by: Zhang Qiao +Reviewed-by: Chen Hui +Signed-off-by: Zheng Zengkai +Signed-off-by: Xue Sinian +--- + include/linux/sched/topology.h | 1 + + kernel/sched/fair.c | 87 ++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 3 +- + 3 files changed, 90 insertions(+), 1 deletion(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index ae4ba452c111..0bc030657db4 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -80,6 +80,7 @@ struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; + int has_idle_cores; ++ int nr_idle_scan; + #if defined(CONFIG_SCHED_STEAL) && !defined(__GENKSYMS__) + struct sparsemask *cfs_overload_cpus; + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 11fc12369c7d..bebe0a3adf45 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6898,6 +6898,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + u64 time, cost; + s64 delta; + int cpu, nr = INT_MAX; ++ struct sched_domain_shared *sd_share; + + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) +@@ -6929,6 +6930,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + #endif + ++ if (sched_feat(SIS_UTIL)) { ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); ++ if (sd_share) { ++ /* because !--nr is the condition to stop scan */ ++ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; ++ /* overloaded LLC is unlikely to have idle cpu/core */ ++ if (nr == 1) ++ return -1; ++ } ++ } ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_domain *sdc = + rcu_dereference(per_cpu(sd_cluster, target)); +@@ -9372,6 +9384,77 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) + } + #endif /* CONFIG_NUMA_BALANCING */ + ++static void update_idle_cpu_scan(struct lb_env *env, ++ unsigned long sum_util) ++{ ++ struct sched_domain_shared *sd_share; ++ int llc_weight, pct; ++ u64 x, y, tmp; ++ /* ++ * Update the number of CPUs to scan in LLC domain, which could ++ * be used as a hint in select_idle_cpu(). The update of sd_share ++ * could be expensive because it is within a shared cache line. ++ * So the write of this hint only occurs during periodic load ++ * balancing, rather than CPU_NEWLY_IDLE, because the latter ++ * can fire way more frequently than the former. ++ */ ++ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ llc_weight = per_cpu(sd_llc_size, env->dst_cpu); ++ if (env->sd->span_weight != llc_weight) ++ return; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); ++ if (!sd_share) ++ return; ++ ++ /* ++ * The number of CPUs to search drops as sum_util increases, when ++ * sum_util hits 85% or above, the scan stops. ++ * The reason to choose 85% as the threshold is because this is the ++ * imbalance_pct(117) when a LLC sched group is overloaded. 
++ * ++ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] ++ * and y'= y / SCHED_CAPACITY_SCALE ++ * ++ * x is the ratio of sum_util compared to the CPU capacity: ++ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) ++ * y' is the ratio of CPUs to be scanned in the LLC domain, ++ * and the number of CPUs to scan is calculated by: ++ * ++ * nr_scan = llc_weight * y' [2] ++ * ++ * When x hits the threshold of overloaded, AKA, when ++ * x = 100 / pct, y drops to 0. According to [1], ++ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 ++ * ++ * Scale x by SCHED_CAPACITY_SCALE: ++ * x' = sum_util / llc_weight; [3] ++ * ++ * and finally [1] becomes: ++ * y = SCHED_CAPACITY_SCALE - ++ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] ++ * ++ */ ++ /* equation [3] */ ++ x = sum_util; ++ do_div(x, llc_weight); ++ ++ /* equation [4] */ ++ pct = env->sd->imbalance_pct; ++ tmp = x * x * pct * pct; ++ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); ++ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); ++ y = SCHED_CAPACITY_SCALE - tmp; ++ ++ /* equation [2] */ ++ y *= llc_weight; ++ do_div(y, SCHED_CAPACITY_SCALE); ++ if ((int)y != sd_share->nr_idle_scan) ++ WRITE_ONCE(sd_share->nr_idle_scan, (int)y); ++} ++ + /** + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. + * @env: The load balancing environment. +@@ -9385,6 +9468,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + struct sg_lb_stats tmp_sgs; + int load_idx, prefer_sibling = 0; + bool overload = false; ++ unsigned long sum_util = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; +@@ -9444,6 +9528,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + sds->total_load += sgs->group_load; + sds->total_capacity += sgs->group_capacity; + ++ sum_util += sgs->group_util; + sg = sg->next; + } while (sg != env->sd->groups); + +@@ -9464,6 +9549,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } ++ ++ update_idle_cpu_scan(env, sum_util); + } + + /** +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 515bfbcc6c99..beda3a619bb9 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -56,7 +56,8 @@ SCHED_FEAT(TTWU_QUEUE, true) + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. + */ + SCHED_FEAT(SIS_AVG_CPU, false) +-SCHED_FEAT(SIS_PROP, true) ++SCHED_FEAT(SIS_PROP, false) ++SCHED_FEAT(SIS_UTIL, true) + + #ifdef CONFIG_SCHED_STEAL + /* +-- +2.23.0 + diff --git a/patches/0775-sched-fair-Fix-kabi-borken-in-sched_domain_shared.patch b/patches/0775-sched-fair-Fix-kabi-borken-in-sched_domain_shared.patch new file mode 100644 index 0000000000000000000000000000000000000000..79a579937f227cb4170ae635ffce458abb6f6579 --- /dev/null +++ b/patches/0775-sched-fair-Fix-kabi-borken-in-sched_domain_shared.patch @@ -0,0 +1,46 @@ +From 921829ba19bdde53098b3a0c3141550c909c5a34 Mon Sep 17 00:00:00 2001 +From: Guan Jing +Date: Wed, 30 Nov 2022 18:23:25 +0800 +Subject: [PATCH 18/19] sched/fair: Fix kabi borken in sched_domain_shared + +hulk inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M +CVE: NA + +-------------------------------- + +The sched_domain_shared structure is only used as pointer, and other +drivers don't use it directly. 
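
Background for the __GENKSYMS__ guard used below: genksyms computes the exported-symbol CRCs after preprocessing with __GENKSYMS__ defined, so a member wrapped in #ifndef __GENKSYMS__ is invisible to the CRC (and hence KABI) calculation while normal builds still see it. A tiny stand-alone illustration with a made-up struct (not kernel code):

    #include <stdio.h>

    struct example_shared {
        int has_idle_cores;
    #ifndef __GENKSYMS__
        int nr_idle_scan;   /* hidden from the CRC pass, visible to real builds */
    #endif
    };

    int main(void)
    {
        /* Build once normally and once with -D__GENKSYMS__ to compare the
         * layout the CRC generator hashes with the layout real code uses. */
        printf("sizeof(struct example_shared) = %zu\n",
               sizeof(struct example_shared));
        return 0;
    }

Note that the guarded member still has to be a complete declaration with its trailing semicolon, otherwise normal (non-genksyms) builds fail to compile.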
+ +Signed-off-by: Guan Jing +Reviewed-by: zhangjialin +Reviewed-by: Zhang Qiao +Reviewed-by: Chen Hui +Signed-off-by: Zheng Zengkai +Signed-off-by: Xue Sinian +--- + include/linux/sched/topology.h | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 0bc030657db4..9bb1d067d84b 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -80,10 +80,12 @@ struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; + int has_idle_cores; +- int nr_idle_scan; + #if defined(CONFIG_SCHED_STEAL) && !defined(__GENKSYMS__) + struct sparsemask *cfs_overload_cpus; + #endif ++#ifndef __GENKSYMS__ ++ int nr_idle_scan ++#endif + }; + + struct sched_domain { +-- +2.23.0 + diff --git a/patches/0776-sched-fair-ARM64-enables-SIS_UTIL-and-disables-SIS_P.patch b/patches/0776-sched-fair-ARM64-enables-SIS_UTIL-and-disables-SIS_P.patch new file mode 100644 index 0000000000000000000000000000000000000000..3ff92e2c42fe43e9c794a3ac8c6b93c33e3099c3 --- /dev/null +++ b/patches/0776-sched-fair-ARM64-enables-SIS_UTIL-and-disables-SIS_P.patch @@ -0,0 +1,46 @@ +From d63e50cf6b8afe84463fccc775965e765d840f95 Mon Sep 17 00:00:00 2001 +From: Guan Jing +Date: Wed, 30 Nov 2022 18:23:26 +0800 +Subject: [PATCH 19/19] sched/fair:ARM64 enables SIS_UTIL and disables SIS_PROP + +hulk inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I61E4M +CVE: NA + +-------------------------------- + +When doing wakeups, attempt to limit superfluous scans of the LLC domain. +ARM64 enables SIS_UTIL and disables SIS_PROP to search idle CPU based on +sum of util_avg. + +Signed-off-by: Guan Jing +Reviewed-by: Zhang Qiao +Reviewed-by: Chen Hui +Signed-off-by: Zheng Zengkai +Signed-off-by: Xue Sinian +--- + kernel/sched/features.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index beda3a619bb9..f25619f2c0de 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -56,8 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
+ */ + SCHED_FEAT(SIS_AVG_CPU, false) ++#ifdef CONFIG_ARM64 + SCHED_FEAT(SIS_PROP, false) + SCHED_FEAT(SIS_UTIL, true) ++#else ++SCHED_FEAT(SIS_PROP, true) ++SCHED_FEAT(SIS_UTIL, false) ++#endif + + #ifdef CONFIG_SCHED_STEAL + /* +-- +2.23.0 + diff --git a/series.conf b/series.conf index f5f2522d303f5b19a6053fdcbd270ade9f6bcd49..194e934237426399927ab0255a086f719d66c6a2 100644 --- a/series.conf +++ b/series.conf @@ -758,3 +758,22 @@ patches/0754-scsi-hisi_sas_v3_hw-Remove-extra-function-calls-for-.patch patches/0755-config-arm64-Enable-dubugfs-config-of-hisi-sas.patch patches/0756-crypto-hisilicon-Add-value-profile-support-for-kerne.patch patches/0757-Revert-genirq-Increase-the-number-of-IRQ-descriptors.patch +patches/0758-topology-Represent-clusters-of-CPUs-within-a-die.patch +patches/0759-sched-Add-cluster-scheduler-level-in-core-and-relate.patch +patches/0760-topology-sysfs-export-cluster-attributes-only-if-an-.patch +patches/0761-topology-Remove-unused-cpu_cluster_mask.patch +patches/0762-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch +patches/0763-arch_topology-Make-cluster-topology-span-at-least-SM.patch +patches/0764-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch +patches/0765-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch +patches/0766-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch +patches/0767-sysctl-add-a-new-register_sysctl_init-interface.patch +patches/0768-sched-topology-drivers-base-arch_topology-Rebuild-th.patch +patches/0769-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch +patches/0770-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch +patches/0771-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch +patches/0772-scheduler-Disable-cluster-scheduling-by-default.patch +patches/0773-sched-Open-the-kernel-configuration-for-cluster.patch +patches/0774-sched-fair-Introduce-SIS_UTIL-to-search-idle-CPU-bas.patch +patches/0775-sched-fair-Fix-kabi-borken-in-sched_domain_shared.patch +patches/0776-sched-fair-ARM64-enables-SIS_UTIL-and-disables-SIS_P.patch