From 3ea471fec9d5305a657ab91947b8f89c1b5889b3 Mon Sep 17 00:00:00 2001 From: Jiang Yi Date: Tue, 7 Nov 2023 10:01:14 +0800 Subject: [PATCH] Backport the support for cluster scheduler level on Kunpeng Server Signed-off-by: Jiang Yi --- kernel.spec | 20 +- ...resent-clusters-of-CPUs-within-a-die.patch | 475 ++++++++++++++++++ ...r-scheduler-level-in-core-and-relate.patch | 264 ++++++++++ ...xport-cluster-attributes-only-if-an-.patch | 128 +++++ ...ology-Remove-unused-cpu_cluster_mask.patch | 49 ++ ...-Limit-span-of-cpu_clustergroup_mask.patch | 73 +++ ...ke-cluster-topology-span-at-least-SM.patch | 72 +++ ...u-cluster-domain-info-and-cpus_share.patch | 176 +++++++ ...cluster-before-scanning-LLC-in-wake-.patch | 224 +++++++++ ...-SDTL_SKIP-flag-to-skip-topology-lev.patch | 78 +++ ...a-new-register_sysctl_init-interface.patch | 196 ++++++++ ...rivers-base-arch_topology-Rebuild-th.patch | 87 ++++ ...rch-arm64-Rebuild-the-sched_domain-h.patch | 58 +++ ...dd-runtime-knob-sysctl_sched_cluster.patch | 237 +++++++++ ...ot-time-enabling-disabling-of-cluste.patch | 72 +++ ...isable-cluster-scheduling-by-default.patch | 38 ++ ...the-kernel-configuration-for-cluster.patch | 35 ++ series.conf | 16 + 18 files changed, 2297 insertions(+), 1 deletion(-) create mode 100644 patches/0117-topology-Represent-clusters-of-CPUs-within-a-die.patch create mode 100644 patches/0118-sched-Add-cluster-scheduler-level-in-core-and-relate.patch create mode 100644 patches/0119-topology-sysfs-export-cluster-attributes-only-if-an-.patch create mode 100644 patches/0120-topology-Remove-unused-cpu_cluster_mask.patch create mode 100644 patches/0121-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch create mode 100644 patches/0122-arch_topology-Make-cluster-topology-span-at-least-SM.patch create mode 100644 patches/0123-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch create mode 100644 patches/0124-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch create mode 100644 patches/0125-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch create mode 100644 patches/0126-sysctl-add-a-new-register_sysctl_init-interface.patch create mode 100644 patches/0127-sched-topology-drivers-base-arch_topology-Rebuild-th.patch create mode 100644 patches/0128-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch create mode 100644 patches/0129-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch create mode 100644 patches/0130-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch create mode 100644 patches/0131-scheduler-Disable-cluster-scheduling-by-default.patch create mode 100644 patches/0132-sched-Open-the-kernel-configuration-for-cluster.patch diff --git a/kernel.spec b/kernel.spec index 5f736aaa..2453253a 100644 --- a/kernel.spec +++ b/kernel.spec @@ -32,7 +32,7 @@ Name: kernel Version: 4.19.90 -Release: %{hulkrelease}.0236 +Release: %{hulkrelease}.0237 Summary: Linux Kernel License: GPLv2 URL: http://www.kernel.org/ @@ -836,6 +836,24 @@ fi %changelog +* Thu Nov 9 2023 Jiang Yi - 4.19.90-2311.1.0.0237 +- sched:Open the kernel configuration for cluster. 
+- scheduler: Disable cluster scheduling by default +- scheduler: Add boot time enabling/disabling of cluster scheduling +- scheduler: Add runtime knob sysctl_sched_cluster +- sched/topology, arch/arm64: Rebuild the sched_domain hierarchy when the CPU capacity changes +- sched/topology, drivers/base/arch_topology: Rebuild the sched_domain hierarchy when capacities change +- sysctl: add a new register_sysctl_init() interface +- scheduler: Create SDTL_SKIP flag to skip topology level +- sched/fair: Scan cluster before scanning LLC in wake-up path +- sched: Add per_cpu cluster domain info and cpus_share_lowest_cache API +- arch_topology: Make cluster topology span at least SMT CPUs +- arch_topology: Limit span of cpu_clustergroup_mask() +- topology: Remove unused cpu_cluster_mask() +- topology/sysfs: export cluster attributes only if an architectures has support +- sched: Add cluster scheduler level in core and related Kconfig for ARM64 +- topology: Represent clusters of CPUs within a die + * Wed Nov 8 2023 Yu Liao - 4.19.90-2311.1.0.0236 - kernel.spec: skip check patches that from linux master or stable diff --git a/patches/0117-topology-Represent-clusters-of-CPUs-within-a-die.patch b/patches/0117-topology-Represent-clusters-of-CPUs-within-a-die.patch new file mode 100644 index 00000000..3e4d33ab --- /dev/null +++ b/patches/0117-topology-Represent-clusters-of-CPUs-within-a-die.patch @@ -0,0 +1,475 @@ +From c94a98a1de262778bb5902d55e10e88d3a89e251 Mon Sep 17 00:00:00 2001 +From: Jonathan Cameron +Date: Thu, 18 Nov 2021 20:43:35 +0800 +Subject: [PATCH 117/132] topology: Represent clusters of CPUs within a die + +mainline inclusion +from mainline-v5.16-rc1 +commit c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 + +------------------------------------------------------------------------ + +Both ACPI and DT provide the ability to describe additional layers of +topology between that of individual cores and higher level constructs +such as the level at which the last level cache is shared. +In ACPI this can be represented in PPTT as a Processor Hierarchy +Node Structure [1] that is the parent of the CPU cores and in turn +has a parent Processor Hierarchy Nodes Structure representing +a higher level of topology. + +For example Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each +cluster has 4 cpus. All clusters share L3 cache data, but each cluster +has local L3 tag. On the other hand, each clusters will share some +internal system bus. 
+ ++-----------------------------------+ +---------+ +| +------+ +------+ +--------------------------+ | +| | CPU0 | | cpu1 | | +-----------+ | | +| +------+ +------+ | | | | | +| +----+ L3 | | | +| +------+ +------+ cluster | | tag | | | +| | CPU2 | | CPU3 | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +----+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | L3 | + | data | ++-----------------------------------+ | | +| +------+ +------+ | +-----------+ | | +| | | | | | | | | | +| +------+ +------+ +----+ L3 | | | +| | | tag | | | +| +------+ +------+ | | | | | +| | | | | | +-----------+ | | +| +------+ +------+ +--------------------------+ | ++-----------------------------------| | | ++-----------------------------------| | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| +----+ L3 | | | +| +------+ +------+ | | tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +---+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | | | ++-----------------------------------+ | | ++-----------------------------------+ | | +| +------+ +------+ +--------------------------+ | +| | | | | | +-----------+ | | +| +------+ +------+ | | | | | +| | | L3 | | | +| +------+ +------+ +--+ tag | | | +| | | | | | | | | | +| +------+ +------+ | +-----------+ | | +| | +---------+ ++-----------------------------------+ + +That means spreading tasks among clusters will bring more bandwidth +while packing tasks within one cluster will lead to smaller cache +synchronization latency. So both kernel and userspace will have +a chance to leverage this topology to deploy tasks accordingly to +achieve either smaller cache latency within one cluster or an even +distribution of load among clusters for higher throughput. + +This patch exposes cluster topology to both kernel and userspace. +Libraried like hwloc will know cluster by cluster_cpus and related +sysfs attributes. PoC of HWLOC support at [2]. + +Note this patch only handle the ACPI case. + +Special consideration is needed for SMT processors, where it is +necessary to move 2 levels up the hierarchy from the leaf nodes +(thus skipping the processor core level). + +Note that arm64 / ACPI does not provide any means of identifying +a die level in the topology but that may be unrelate to the cluster +level. 
+ +[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node + structure (Type 0) +[2] https://github.com/hisilicon/hwloc/tree/linux-cluster + +Signed-off-by: Jonathan Cameron +Signed-off-by: Tian Tao +Signed-off-by: Barry Song +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com +Signed-off-by: Yicong Yang +Reviewed-by: tao zeng +Signed-off-by: Zheng Zengkai + +Conflicts: + Documentation/ABI/stable/sysfs-devices-system-cpu + Documentation/admin-guide/cputopology.rst + drivers/base/arch_topology.c + drivers/base/topology.c + include/linux/arch_topology.h + include/linux/topology.h + +Signed-off-by: Jiang Yi +--- + Documentation/cputopology.txt | 26 +++++++++-- + arch/arm64/include/asm/topology.h | 5 +++ + arch/arm64/kernel/topology.c | 17 ++++++++ + drivers/acpi/pptt.c | 72 +++++++++++++++++++++++++++++++ + drivers/base/topology.c | 10 +++++ + include/linux/acpi.h | 5 +++ + include/linux/topology.h | 6 +++ + 7 files changed, 137 insertions(+), 4 deletions(-) + +diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt +index 2ff8a1e9a2db..acd55bf0c718 100644 +--- a/Documentation/cputopology.txt ++++ b/Documentation/cputopology.txt +@@ -18,6 +18,11 @@ die_id: + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + ++cluster_id: ++ the cluster ID of cpuX. Typically it is the hardware platform's ++ identifier (rather than the kernel's). The actual value is ++ architecture and platform dependent. ++ + core_id: + + the CPU core ID of cpuX. Typically it is the hardware platform's +@@ -36,6 +41,15 @@ drawer_id: + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + ++cluster_cpus: ++ ++ internal kernel map of CPUs within the same cluster ++ ++cluster_cpus_list: ++ ++ human-readable list of CPUs within the same cluster. ++ The format is like 0-3, 8-11, 14,17. ++ + thread_siblings: + + internal kernel map of cpuX's hardware threads within the same +@@ -88,11 +102,13 @@ these macros in include/asm-XXX/topology.h:: + + #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) ++ #define topology_cluster_id(cpu) + #define topology_core_id(cpu) + #define topology_book_id(cpu) + #define topology_drawer_id(cpu) + #define topology_sibling_cpumask(cpu) + #define topology_core_cpumask(cpu) ++ #define topology_cluster_cpumask(cpu) + #define topology_book_cpumask(cpu) + #define topology_drawer_cpumask(cpu) + +@@ -107,10 +123,12 @@ not defined by include/asm-XXX/topology.h: + + 1) topology_physical_package_id: -1 + 2) topology_die_id: -1 +-3) topology_core_id: 0 +-4) topology_sibling_cpumask: just the given CPU +-5) topology_core_cpumask: just the given CPU +-6) topology_die_cpumask: just the given CPU ++3) topology_cluster_id: -1 ++4) topology_core_id: 0 ++5) topology_sibling_cpumask: just the given CPU ++6) topology_core_cpumask: just the given CPU ++7) topology_cluster_cpumask: just the given CPU ++8) topology_die_cpumask: just the given CPU + + For architectures that don't support books (CONFIG_SCHED_BOOK) there are no + default definitions for topology_book_id() and topology_book_cpumask(). 
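As a quick illustration of how the cluster attributes documented above are consumed, a small userspace reader can be used. This is an illustrative sketch only and is not part of the backported patch series; it assumes the standard /sys/devices/system/cpu/cpuX/topology/ location used by the existing cputopology interface, and on kernels or architectures without cluster support the files are simply absent and the reads fail gracefully.

  /* Minimal sketch: print the cluster topology attributes for CPU 0. */
  #include <stdio.h>
  #include <string.h>

  static void show(const char *attr)
  {
          char path[128], buf[256];
          FILE *f;

          /* Path assumed from the existing cputopology sysfs layout. */
          snprintf(path, sizeof(path),
                   "/sys/devices/system/cpu/cpu0/topology/%s", attr);
          f = fopen(path, "r");
          if (!f) {
                  printf("%-17s: <not exported>\n", attr);
                  return;
          }
          if (fgets(buf, sizeof(buf), f)) {
                  buf[strcspn(buf, "\n")] = '\0';
                  printf("%-17s: %s\n", attr, buf);
          }
          fclose(f);
  }

  int main(void)
  {
          show("cluster_id");
          show("cluster_cpus");
          show("cluster_cpus_list");
          return 0;
  }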
+diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h +index 49a0fee4f89b..164e26035653 100644 +--- a/arch/arm64/include/asm/topology.h ++++ b/arch/arm64/include/asm/topology.h +@@ -7,25 +7,30 @@ + struct cpu_topology { + int thread_id; + int core_id; ++ int cluster_id; + int package_id; + int llc_id; + cpumask_t thread_sibling; + cpumask_t core_sibling; ++ cpumask_t cluster_sibling; + cpumask_t llc_sibling; + }; + + extern struct cpu_topology cpu_topology[NR_CPUS]; + + #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) ++#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) + #define topology_core_id(cpu) (cpu_topology[cpu].core_id) + #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) + #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) ++#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) + #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) + + void init_cpu_topology(void); + void store_cpu_topology(unsigned int cpuid); + void remove_cpu_topology(unsigned int cpuid); + const struct cpumask *cpu_coregroup_mask(int cpu); ++const struct cpumask *cpu_clustergroup_mask(int cpu); + + #ifdef CONFIG_NUMA + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index bf937d334b81..e4effe6f3177 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -230,6 +230,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) + return core_mask; + } + ++const struct cpumask *cpu_clustergroup_mask(int cpu) ++{ ++ return &cpu_topology[cpu].cluster_sibling; ++} ++ + static void update_siblings_masks(unsigned int cpuid) + { + struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; +@@ -247,6 +252,12 @@ static void update_siblings_masks(unsigned int cpuid) + if (cpuid_topo->package_id != cpu_topo->package_id) + continue; + ++ if (cpuid_topo->cluster_id == cpu_topo->cluster_id && ++ cpuid_topo->cluster_id != -1) { ++ cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling); ++ cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling); ++ } ++ + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); + +@@ -312,6 +323,9 @@ static void clear_cpu_topology(int cpu) + cpumask_clear(&cpu_topo->llc_sibling); + cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + ++ cpumask_clear(&cpu_topo->cluster_sibling); ++ cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling); ++ + cpumask_clear(&cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); + cpumask_clear(&cpu_topo->thread_sibling); +@@ -327,6 +341,7 @@ static void __init reset_cpu_topology(void) + + cpu_topo->thread_id = -1; + cpu_topo->core_id = 0; ++ cpu_topo->cluster_id = -1; + cpu_topo->package_id = -1; + cpu_topo->llc_id = -1; + +@@ -438,6 +453,8 @@ static int __init parse_acpi_topology(void) + cpu_topology[cpu].thread_id = -1; + cpu_topology[cpu].core_id = topology_id; + } ++ topology_id = find_acpi_cpu_topology_cluster(cpu); ++ cpu_topology[cpu].cluster_id = topology_id; + topology_id = find_acpi_cpu_topology_package(cpu); + cpu_topology[cpu].package_id = topology_id; + +diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c +index 879b9155b7b4..0f382545cf06 100644 +--- a/drivers/acpi/pptt.c ++++ b/drivers/acpi/pptt.c +@@ -617,6 +617,11 @@ static struct acpi_pptt_processor *acpi_find_processor_tag(struct acpi_table_hea + return cpu; + } + ++static void acpi_pptt_warn_missing(void) ++{ ++ pr_warn_once("No PPTT table found, CPU 
and cache topology may be inaccurate\n"); ++} ++ + /** + * topology_get_acpi_cpu_tag() - Find a unique topology value for a feature + * @table: Pointer to the head of the PPTT table +@@ -871,6 +876,73 @@ int find_acpi_cpu_topology_package(unsigned int cpu) + ACPI_PPTT_PHYSICAL_PACKAGE); + } + ++/** ++ * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value ++ * @cpu: Kernel logical CPU number ++ * ++ * Determine a topology unique cluster ID for the given CPU/thread. ++ * This ID can then be used to group peers, which will have matching ids. ++ * ++ * The cluster, if present is the level of topology above CPUs. In a ++ * multi-thread CPU, it will be the level above the CPU, not the thread. ++ * It may not exist in single CPU systems. In simple multi-CPU systems, ++ * it may be equal to the package topology level. ++ * ++ * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found ++ * or there is no toplogy level above the CPU.. ++ * Otherwise returns a value which represents the package for this CPU. ++ */ ++ ++int find_acpi_cpu_topology_cluster(unsigned int cpu) ++{ ++ struct acpi_table_header *table; ++ acpi_status status; ++ struct acpi_pptt_processor *cpu_node, *cluster_node; ++ u32 acpi_cpu_id; ++ int retval; ++ int is_thread; ++ ++ status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); ++ if (ACPI_FAILURE(status)) { ++ acpi_pptt_warn_missing(); ++ return -ENOENT; ++ } ++ ++ acpi_cpu_id = get_acpi_id_for_cpu(cpu); ++ cpu_node = acpi_find_processor_node(table, acpi_cpu_id); ++ if (cpu_node == NULL || !cpu_node->parent) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ ++ is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD; ++ cluster_node = fetch_pptt_node(table, cpu_node->parent); ++ if (cluster_node == NULL) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ if (is_thread) { ++ if (!cluster_node->parent) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ cluster_node = fetch_pptt_node(table, cluster_node->parent); ++ if (cluster_node == NULL) { ++ retval = -ENOENT; ++ goto put_table; ++ } ++ } ++ if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID) ++ retval = cluster_node->acpi_processor_id; ++ else ++ retval = ACPI_PTR_DIFF(cluster_node, table); ++ ++put_table: ++ acpi_put_table(table); ++ ++ return retval; ++} ++ + /** + * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag + * @cpu: Kernel logical CPU number +diff --git a/drivers/base/topology.c b/drivers/base/topology.c +index da74231de498..7e4bdf65e27a 100644 +--- a/drivers/base/topology.c ++++ b/drivers/base/topology.c +@@ -46,6 +46,9 @@ static DEVICE_ATTR_RO(physical_package_id); + define_id_show_func(die_id); + static DEVICE_ATTR_RO(die_id); + ++define_id_show_func(cluster_id); ++static DEVICE_ATTR_RO(cluster_id); ++ + define_id_show_func(core_id); + static DEVICE_ATTR_RO(core_id); + +@@ -57,6 +60,10 @@ define_siblings_show_func(core_siblings, core_cpumask); + static DEVICE_ATTR_RO(core_siblings); + static DEVICE_ATTR_RO(core_siblings_list); + ++define_siblings_show_func(cluster_cpus, cluster_cpumask); ++static DEVICE_ATTR_RO(cluster_cpus); ++static DEVICE_ATTR_RO(cluster_cpus_list); ++ + #ifdef CONFIG_SCHED_BOOK + define_id_show_func(book_id); + static DEVICE_ATTR_RO(book_id); +@@ -76,11 +83,14 @@ static DEVICE_ATTR_RO(drawer_siblings_list); + static struct attribute *default_attrs[] = { + &dev_attr_physical_package_id.attr, + &dev_attr_die_id.attr, ++ &dev_attr_cluster_id.attr, + &dev_attr_core_id.attr, + &dev_attr_thread_siblings.attr, + 
&dev_attr_thread_siblings_list.attr, + &dev_attr_core_siblings.attr, + &dev_attr_core_siblings_list.attr, ++ &dev_attr_cluster_cpus.attr, ++ &dev_attr_cluster_cpus_list.attr, + #ifdef CONFIG_SCHED_BOOK + &dev_attr_book_id.attr, + &dev_attr_book_siblings.attr, +diff --git a/include/linux/acpi.h b/include/linux/acpi.h +index 4a0142276cb8..2713d2032bff 100644 +--- a/include/linux/acpi.h ++++ b/include/linux/acpi.h +@@ -1328,6 +1328,7 @@ static inline int lpit_read_residency_count_address(u64 *address) + #ifdef CONFIG_ACPI_PPTT + int acpi_pptt_cpu_is_thread(unsigned int cpu); + int find_acpi_cpu_topology(unsigned int cpu, int level); ++int find_acpi_cpu_topology_cluster(unsigned int cpu); + int find_acpi_cpu_topology_package(unsigned int cpu); + int find_acpi_cpu_topology_hetero_id(unsigned int cpu); + int find_acpi_cpu_cache_topology(unsigned int cpu, int level); +@@ -1340,6 +1341,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level) + { + return -EINVAL; + } ++static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) ++{ ++ return -EINVAL; ++} + static inline int find_acpi_cpu_topology_package(unsigned int cpu) + { + return -EINVAL; +diff --git a/include/linux/topology.h b/include/linux/topology.h +index a19771cd267d..90dd075394b2 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -188,6 +188,9 @@ static inline int cpu_to_mem(int cpu) + #ifndef topology_die_id + #define topology_die_id(cpu) ((void)(cpu), -1) + #endif ++#ifndef topology_cluster_id ++#define topology_cluster_id(cpu) ((void)(cpu), -1) ++#endif + #ifndef topology_core_id + #define topology_core_id(cpu) ((void)(cpu), 0) + #endif +@@ -197,6 +200,9 @@ static inline int cpu_to_mem(int cpu) + #ifndef topology_core_cpumask + #define topology_core_cpumask(cpu) cpumask_of(cpu) + #endif ++#ifndef topology_cluster_cpumask ++#define topology_cluster_cpumask(cpu) cpumask_of(cpu) ++#endif + + #ifdef CONFIG_SCHED_SMT + static inline const struct cpumask *cpu_smt_mask(int cpu) +-- +2.23.0 + diff --git a/patches/0118-sched-Add-cluster-scheduler-level-in-core-and-relate.patch b/patches/0118-sched-Add-cluster-scheduler-level-in-core-and-relate.patch new file mode 100644 index 00000000..e22475fa --- /dev/null +++ b/patches/0118-sched-Add-cluster-scheduler-level-in-core-and-relate.patch @@ -0,0 +1,264 @@ +From 73796877f3e5809bf1e5803bf62f4eaf8f5f4764 Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Fri, 24 Sep 2021 20:51:03 +1200 +Subject: [PATCH 118/132] sched: Add cluster scheduler level in core and + related Kconfig for ARM64 + +mainline inclusion +from mainline-v5.16-rc1 +commit 778c558f49a2cb3dc7b18a80ff515e82aa813627 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=778c558f49a2cb3dc7b18a80ff515e82aa813627 + +------------------------------------------------------------------------ + +This patch adds scheduler level for clusters and automatically enables +the load balance among clusters. It will directly benefit a lot of +workload which loves more resources such as memory bandwidth, caches. + +Testing has widely been done in two different hardware configurations of +Kunpeng920: + + 24 cores in one NUMA(6 clusters in each NUMA node); + 32 cores in one NUMA(8 clusters in each NUMA node) + +Workload is running on either one NUMA node or four NUMA nodes, thus, +this can estimate the effect of cluster spreading w/ and w/o NUMA load +balance. 
+ +* Stream benchmark: + +4threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 29929.64 ( 0.00%) 32932.68 ( 10.03%) +MB/sec scale 29861.10 ( 0.00%) 32710.58 ( 9.54%) +MB/sec add 27034.42 ( 0.00%) 32400.68 ( 19.85%) +MB/sec triad 27225.26 ( 0.00%) 31965.36 ( 17.41%) + +6threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 40330.24 ( 0.00%) 42377.68 ( 5.08%) +MB/sec scale 40196.42 ( 0.00%) 42197.90 ( 4.98%) +MB/sec add 37427.00 ( 0.00%) 41960.78 ( 12.11%) +MB/sec triad 37841.36 ( 0.00%) 42513.64 ( 12.35%) + +12threads stream (on 1NUMA * 24cores = 24cores) + stream stream + w/o patch w/ patch +MB/sec copy 52639.82 ( 0.00%) 53818.04 ( 2.24%) +MB/sec scale 52350.30 ( 0.00%) 53253.38 ( 1.73%) +MB/sec add 53607.68 ( 0.00%) 55198.82 ( 2.97%) +MB/sec triad 54776.66 ( 0.00%) 56360.40 ( 2.89%) + +Thus, it could help memory-bound workload especially under medium load. +Similar improvement is also seen in lkp-pbzip2: + +* lkp-pbzip2 benchmark + +2-96 threads (on 4NUMA * 24cores = 96cores) + lkp-pbzip2 lkp-pbzip2 + w/o patch w/ patch +Hmean tput-2 11062841.57 ( 0.00%) 11341817.51 * 2.52%* +Hmean tput-5 26815503.70 ( 0.00%) 27412872.65 * 2.23%* +Hmean tput-8 41873782.21 ( 0.00%) 43326212.92 * 3.47%* +Hmean tput-12 61875980.48 ( 0.00%) 64578337.51 * 4.37%* +Hmean tput-21 105814963.07 ( 0.00%) 111381851.01 * 5.26%* +Hmean tput-30 150349470.98 ( 0.00%) 156507070.73 * 4.10%* +Hmean tput-48 237195937.69 ( 0.00%) 242353597.17 * 2.17%* +Hmean tput-79 360252509.37 ( 0.00%) 362635169.23 * 0.66%* +Hmean tput-96 394571737.90 ( 0.00%) 400952978.48 * 1.62%* + +2-24 threads (on 1NUMA * 24cores = 24cores) + lkp-pbzip2 lkp-pbzip2 + w/o patch w/ patch +Hmean tput-2 11071705.49 ( 0.00%) 11296869.10 * 2.03%* +Hmean tput-4 20782165.19 ( 0.00%) 21949232.15 * 5.62%* +Hmean tput-6 30489565.14 ( 0.00%) 33023026.96 * 8.31%* +Hmean tput-8 40376495.80 ( 0.00%) 42779286.27 * 5.95%* +Hmean tput-12 61264033.85 ( 0.00%) 62995632.78 * 2.83%* +Hmean tput-18 86697139.39 ( 0.00%) 86461545.74 ( -0.27%) +Hmean tput-24 104854637.04 ( 0.00%) 104522649.46 * -0.32%* + +In the case of 6 threads and 8 threads, we see the greatest performance +improvement. + +Similar improvement can be seen on lkp-pixz though the improvement is +smaller: + +* lkp-pixz benchmark + +2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores) + lkp-pixz lkp-pixz + w/o patch w/ patch +Hmean tput-2 6486981.16 ( 0.00%) 6561515.98 * 1.15%* +Hmean tput-4 11645766.38 ( 0.00%) 11614628.43 ( -0.27%) +Hmean tput-6 15429943.96 ( 0.00%) 15957350.76 * 3.42%* +Hmean tput-8 19974087.63 ( 0.00%) 20413746.98 * 2.20%* +Hmean tput-12 28172068.18 ( 0.00%) 28751997.06 * 2.06%* +Hmean tput-18 39413409.54 ( 0.00%) 39896830.55 * 1.23%* +Hmean tput-24 49101815.85 ( 0.00%) 49418141.47 * 0.64%* + +* SPECrate benchmark + +4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores) + Base Base + Run Time Rate + ------- --------- +4 Copies w/o 580 (w/ 570) w/o 11.1 (w/ 11.3) +8 Copies w/o 647 (w/ 605) w/o 20.0 (w/ 21.4, +7%) +16 Copies w/o 844 (w/ 844) w/o 30.6 (w/ 30.6) + +32 Copies(on 4NUMA * 32 cores = 128cores) +[w/o patch] + Base Base Base +Benchmarks Copies Run Time Rate +--------------- ------- --------- --------- +500.perlbench_r 32 584 87.2 * +502.gcc_r 32 503 90.2 * +505.mcf_r 32 745 69.4 * +520.omnetpp_r 32 1031 40.7 * +523.xalancbmk_r 32 597 56.6 * +525.x264_r 1 -- CE +531.deepsjeng_r 32 336 109 * +541.leela_r 32 556 95.4 * +548.exchange2_r 32 513 163 * +557.xz_r 32 530 65.2 * + Est. 
SPECrate2017_int_base 80.3 + +[w/ patch] + Base Base Base +Benchmarks Copies Run Time Rate +--------------- ------- --------- --------- +500.perlbench_r 32 580 87.8 (+0.688%) * +502.gcc_r 32 477 95.1 (+5.432%) * +505.mcf_r 32 644 80.3 (+13.574%) * +520.omnetpp_r 32 942 44.6 (+9.58%) * +523.xalancbmk_r 32 560 60.4 (+6.714%%) * +525.x264_r 1 -- CE +531.deepsjeng_r 32 337 109 (+0.000%) * +541.leela_r 32 554 95.6 (+0.210%) * +548.exchange2_r 32 515 163 (+0.000%) * +557.xz_r 32 524 66.0 (+1.227%) * + Est. SPECrate2017_int_base 83.7 (+4.062%) + +On the other hand, it is slightly helpful to CPU-bound tasks like +kernbench: + +* 24-96 threads kernbench (on 4NUMA * 24cores = 96cores) + kernbench kernbench + w/o cluster w/ cluster +Min user-24 12054.67 ( 0.00%) 12024.19 ( 0.25%) +Min syst-24 1751.51 ( 0.00%) 1731.68 ( 1.13%) +Min elsp-24 600.46 ( 0.00%) 598.64 ( 0.30%) +Min user-48 12361.93 ( 0.00%) 12315.32 ( 0.38%) +Min syst-48 1917.66 ( 0.00%) 1892.73 ( 1.30%) +Min elsp-48 333.96 ( 0.00%) 332.57 ( 0.42%) +Min user-96 12922.40 ( 0.00%) 12921.17 ( 0.01%) +Min syst-96 2143.94 ( 0.00%) 2110.39 ( 1.56%) +Min elsp-96 211.22 ( 0.00%) 210.47 ( 0.36%) +Amean user-24 12063.99 ( 0.00%) 12030.78 * 0.28%* +Amean syst-24 1755.20 ( 0.00%) 1735.53 * 1.12%* +Amean elsp-24 601.60 ( 0.00%) 600.19 ( 0.23%) +Amean user-48 12362.62 ( 0.00%) 12315.56 * 0.38%* +Amean syst-48 1921.59 ( 0.00%) 1894.95 * 1.39%* +Amean elsp-48 334.10 ( 0.00%) 332.82 * 0.38%* +Amean user-96 12925.27 ( 0.00%) 12922.63 ( 0.02%) +Amean syst-96 2146.66 ( 0.00%) 2122.20 * 1.14%* +Amean elsp-96 211.96 ( 0.00%) 211.79 ( 0.08%) + +Note this patch isn't an universal win, it might hurt those workload +which can benefit from packing. Though tasks which want to take +advantages of lower communication latency of one cluster won't +necessarily been packed in one cluster while kernel is not aware of +clusters, they have some chance to be randomly packed. But this +patch will make them more likely spread. + +Signed-off-by: Barry Song +Tested-by: Yicong Yang +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Jiang Yi +--- + arch/arm64/Kconfig | 9 +++++++++ + include/linux/sched/topology.h | 7 +++++++ + include/linux/topology.h | 7 +++++++ + kernel/sched/topology.c | 5 +++++ + 4 files changed, 28 insertions(+) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 88b8031a93b2..47aa27fcd895 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -784,6 +784,15 @@ config SCHED_MC + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + ++config SCHED_CLUSTER ++ bool "Cluster scheduler support" ++ help ++ Cluster scheduler support improves the CPU scheduler's decision ++ making when dealing with machines that have clusters of CPUs. ++ Cluster usually means a couple of CPUs which are placed closely ++ by sharing mid-level caches, last-level cache tags or internal ++ busses. 
++ + config SCHED_SMT + bool "SMT scheduler support" + help +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index a9032116c13e..15d2e06f690b 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -40,6 +40,13 @@ static inline int cpu_smt_flags(void) + } + #endif + ++#ifdef CONFIG_SCHED_CLUSTER ++static inline int cpu_cluster_flags(void) ++{ ++ return SD_SHARE_PKG_RESOURCES; ++} ++#endif ++ + #ifdef CONFIG_SCHED_MC + static inline int cpu_core_flags(void) + { +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 90dd075394b2..58f8a9e9d90b 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -211,6 +211,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) + } + #endif + ++#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) ++static inline const struct cpumask *cpu_cluster_mask(int cpu) ++{ ++ return topology_cluster_cpumask(cpu); ++} ++#endif ++ + static inline const struct cpumask *cpu_cpu_mask(int cpu) + { + return cpumask_of_node(cpu_to_node(cpu)); +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index ad5591520c99..5d662314c08b 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1298,6 +1298,11 @@ static struct sched_domain_topology_level default_topology[] = { + #ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + #endif ++ ++#ifdef CONFIG_SCHED_CLUSTER ++ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, ++#endif ++ + #ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + #endif +-- +2.23.0 + diff --git a/patches/0119-topology-sysfs-export-cluster-attributes-only-if-an-.patch b/patches/0119-topology-sysfs-export-cluster-attributes-only-if-an-.patch new file mode 100644 index 00000000..10c82afd --- /dev/null +++ b/patches/0119-topology-sysfs-export-cluster-attributes-only-if-an-.patch @@ -0,0 +1,128 @@ +From 8072124e989fd6183877494d17c098ec9f308683 Mon Sep 17 00:00:00 2001 +From: Heiko Carstens +Date: Mon, 29 Nov 2021 14:03:08 +0100 +Subject: [PATCH 119/132] topology/sysfs: export cluster attributes only if an + architectures has support + +mainline inclusion +from mainline-v5.17-rc1 +commit e795707703b32fecdd7467afcc33ff1e92416c05 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e795707703b32fecdd7467afcc33ff1e92416c05 + +---------------------------------------------------------------------- + +The cluster_id and cluster_cpus topology sysfs attributes have been +added with commit c5e22feffdd7 ("topology: Represent clusters of CPUs +within a die"). + +They are currently only used for x86, arm64, and riscv (via generic +arch topology), however they are still present with bogus default +values for all other architectures. Instead of enforcing such new +sysfs attributes to all architectures, make them only optional visible +if an architecture opts in by defining both the topology_cluster_id +and topology_cluster_cpumask attributes. + +This is similar to what was done when the book and drawer topology +levels were introduced: avoid useless and therefore confusing sysfs +attributes for architectures which cannot make use of them. + +This should not break any existing applications, since this is a +new interface introduced with the v5.16 merge window. 
+ +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Heiko Carstens +Link: https://lore.kernel.org/r/20211129130309.3256168-3-hca@linux.ibm.com +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + Documentation/admin-guide/cputopology.rst + drivers/base/topology.c + include/linux/topology.h + +Signed-off-by: Jiang Yi +--- + Documentation/cputopology.txt | 3 +++ + drivers/base/topology.c | 8 ++++++++ + include/linux/topology.h | 4 ++++ + 3 files changed, 15 insertions(+) + +diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt +index acd55bf0c718..e6645ff18994 100644 +--- a/Documentation/cputopology.txt ++++ b/Documentation/cputopology.txt +@@ -94,6 +94,9 @@ Architecture-neutral, drivers/base/topology.c, exports these attributes. + However, the book and drawer related sysfs files will only be created if + CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. + ++The cluster hierarchy related sysfs files will only be created if an ++architecture provides the related macros as described below. ++ + CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, + where they reflect the cpu and cache hierarchy. + +diff --git a/drivers/base/topology.c b/drivers/base/topology.c +index 7e4bdf65e27a..66ebcf05030f 100644 +--- a/drivers/base/topology.c ++++ b/drivers/base/topology.c +@@ -46,8 +46,10 @@ static DEVICE_ATTR_RO(physical_package_id); + define_id_show_func(die_id); + static DEVICE_ATTR_RO(die_id); + ++#ifdef TOPOLOGY_CLUSTER_SYSFS + define_id_show_func(cluster_id); + static DEVICE_ATTR_RO(cluster_id); ++#endif + + define_id_show_func(core_id); + static DEVICE_ATTR_RO(core_id); +@@ -60,9 +62,11 @@ define_siblings_show_func(core_siblings, core_cpumask); + static DEVICE_ATTR_RO(core_siblings); + static DEVICE_ATTR_RO(core_siblings_list); + ++#ifdef TOPOLOGY_CLUSTER_SYSFS + define_siblings_show_func(cluster_cpus, cluster_cpumask); + static DEVICE_ATTR_RO(cluster_cpus); + static DEVICE_ATTR_RO(cluster_cpus_list); ++#endif + + #ifdef CONFIG_SCHED_BOOK + define_id_show_func(book_id); +@@ -83,14 +87,18 @@ static DEVICE_ATTR_RO(drawer_siblings_list); + static struct attribute *default_attrs[] = { + &dev_attr_physical_package_id.attr, + &dev_attr_die_id.attr, ++#ifdef TOPOLOGY_CLUSTER_SYSFS + &dev_attr_cluster_id.attr, ++#endif + &dev_attr_core_id.attr, + &dev_attr_thread_siblings.attr, + &dev_attr_thread_siblings_list.attr, + &dev_attr_core_siblings.attr, + &dev_attr_core_siblings_list.attr, ++#ifdef TOPOLOGY_CLUSTER_SYSFS + &dev_attr_cluster_cpus.attr, + &dev_attr_cluster_cpus_list.attr, ++#endif + #ifdef CONFIG_SCHED_BOOK + &dev_attr_book_id.attr, + &dev_attr_book_siblings.attr, +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 58f8a9e9d90b..9033a952ee68 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -182,6 +182,10 @@ static inline int cpu_to_mem(int cpu) + + #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ + ++#if defined(topology_cluster_id) && defined(topology_cluster_cpumask) ++#define TOPOLOGY_CLUSTER_SYSFS ++#endif ++ + #ifndef topology_physical_package_id + #define topology_physical_package_id(cpu) ((void)(cpu), -1) + #endif +-- +2.23.0 + diff --git a/patches/0120-topology-Remove-unused-cpu_cluster_mask.patch b/patches/0120-topology-Remove-unused-cpu_cluster_mask.patch new file mode 100644 index 00000000..3ce44fe7 --- /dev/null +++ b/patches/0120-topology-Remove-unused-cpu_cluster_mask.patch @@ -0,0 +1,49 @@ +From 68d8b00dca7138b644e472c797957907b98aa535 Mon Sep 17 00:00:00 2001 +From: Dietmar 
Eggemann +Date: Fri, 13 May 2022 11:34:33 +0200 +Subject: [PATCH 120/132] topology: Remove unused cpu_cluster_mask() + +mainline inclusion +from mainline-v5.19-rc1 +commit 15f214f9bdb7c1f560b4bf863c5a72ff53b442a4 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I4GEZS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15f214f9bdb7c1f560b4bf863c5a72ff53b442a4 + +------------------------------------------------------------------------ + +default_topology[] uses cpu_clustergroup_mask() for the CLS level +(guarded by CONFIG_SCHED_CLUSTER) which is currently provided by x86 +(arch/x86/kernel/smpboot.c) and arm64 (drivers/base/arch_topology.c). + +Fixes: 778c558f49a2 ("sched: Add cluster scheduler level in core and related Kconfig for ARM64") +Acked-by: Barry Song +Signed-off-by: Dietmar Eggemann +Link: https://lore.kernel.org/r/20220513093433.425163-1-dietmar.eggemann@arm.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Jiang Yi +--- + include/linux/topology.h | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 9033a952ee68..9a7753fcae6a 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -215,13 +215,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) + } + #endif + +-#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) +-static inline const struct cpumask *cpu_cluster_mask(int cpu) +-{ +- return topology_cluster_cpumask(cpu); +-} +-#endif +- + static inline const struct cpumask *cpu_cpu_mask(int cpu) + { + return cpumask_of_node(cpu_to_node(cpu)); +-- +2.23.0 + diff --git a/patches/0121-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch b/patches/0121-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch new file mode 100644 index 00000000..b827bbfd --- /dev/null +++ b/patches/0121-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch @@ -0,0 +1,73 @@ +From 7f7344453623deceaa16f6e1cdc71064521bbfca Mon Sep 17 00:00:00 2001 +From: Ionela Voinescu +Date: Mon, 4 Jul 2022 11:16:01 +0100 +Subject: [PATCH 121/132] arch_topology: Limit span of cpu_clustergroup_mask() + +mainline inclusion +from mainline-v6.0-rc1 +commit bfcc4397435dc0407099b9a805391abc05c2313b +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I88UKS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bfcc4397435dc0407099b9a805391abc05c2313b + +---------------------------------------------------------------------- + +Currently the cluster identifier is not set on DT based platforms. +The reset or default value is -1 for all the CPUs. Once we assign the +cluster identifier values correctly, the cluster_sibling mask will be +populated and returned by cpu_clustergroup_mask() to contribute in the +creation of the CLS scheduling domain level, if SCHED_CLUSTER is +enabled. + +To avoid topologies that will result in questionable or incorrect +scheduling domains, impose restrictions regarding the span of clusters, +as presented to scheduling domains building code: cluster_sibling should +not span more or the same CPUs as cpu_coregroup_mask(). + +This is needed in order to obtain a strict separation between the MC and +CLS levels, and maintain the same domains for existing platforms in +the presence of CONFIG_SCHED_CLUSTER, where the new cluster information +is redundant and irrelevant for the scheduler. 
+ +While previously the scheduling domain builder code would have removed MC +as redundant and kept CLS if SCHED_CLUSTER was enabled and the +cpu_coregroup_mask() and cpu_clustergroup_mask() spanned the same CPUs, +now CLS will be removed and MC kept. + +Link: https://lore.kernel.org/r/20220704101605.1318280-18-sudeep.holla@arm.com +Cc: Darren Hart +Tested-by: Conor Dooley +Acked-by: Vincent Guittot +Signed-off-by: Ionela Voinescu +Signed-off-by: Sudeep Holla + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + arch/arm64/kernel/topology.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index e4effe6f3177..b76c92b6ac14 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -232,6 +232,14 @@ const struct cpumask *cpu_coregroup_mask(int cpu) + + const struct cpumask *cpu_clustergroup_mask(int cpu) + { ++ /* ++ * Forbid cpu_clustergroup_mask() to span more or the same CPUs as ++ * cpu_coregroup_mask(). ++ */ ++ if (cpumask_subset(cpu_coregroup_mask(cpu), ++ &cpu_topology[cpu].cluster_sibling)) ++ return get_cpu_mask(cpu); ++ + return &cpu_topology[cpu].cluster_sibling; + } + +-- +2.23.0 + diff --git a/patches/0122-arch_topology-Make-cluster-topology-span-at-least-SM.patch b/patches/0122-arch_topology-Make-cluster-topology-span-at-least-SM.patch new file mode 100644 index 00000000..a3f0f872 --- /dev/null +++ b/patches/0122-arch_topology-Make-cluster-topology-span-at-least-SM.patch @@ -0,0 +1,72 @@ +From 91cf59f2bfe15ac0cdcb0da47c286000f7859469 Mon Sep 17 00:00:00 2001 +From: Yicong Yang +Date: Mon, 5 Sep 2022 20:26:15 +0800 +Subject: [PATCH 122/132] arch_topology: Make cluster topology span at least + SMT CPUs + +mainline inclusion +from mainline-v6.0-rc5 +commit 5ac251c8a05ce074e5efac779debf82a15d870a3 +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I88UKS +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5ac251c8a05ce074e5efac779debf82a15d870a3 + +---------------------------------------------------------------------- + +Currently cpu_clustergroup_mask() will return CPU mask if cluster span more +or the same CPUs as cpu_coregroup_mask(). This will result topology borken +on non-Cluster SMT machines when building with CONFIG_SCHED_CLUSTER=y. + +Test with: +qemu-system-aarch64 -enable-kvm -machine virt \ + -net none \ + -cpu host \ + -bios ./QEMU_EFI.fd \ + -m 2G \ + -smp 48,sockets=2,cores=12,threads=2 \ + -kernel $Image \ + -initrd $Rootfs \ + -nographic + -append "rdinit=init console=ttyAMA0 sched_verbose loglevel=8" + +We'll get below error: +[ 3.084568] BUG: arch topology borken +[ 3.084570] the SMT domain not a subset of the CLS domain + +Since cluster is a level higher than SMT, fix this by making cluster +spans at least SMT CPUs. 
+ +Fixes: bfcc4397435d ("arch_topology: Limit span of cpu_clustergroup_mask()") +Cc: Sudeep Holla +Cc: Vincent Guittot +Cc: Ionela Voinescu +Cc: Greg KH +Reviewed-by: Sudeep Holla +Signed-off-by: Yicong Yang +Link: https://lore.kernel.org/r/20220905122615.12946-1-yangyicong@huawei.com +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + arch/arm64/kernel/topology.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c +index b76c92b6ac14..46f58120392c 100644 +--- a/arch/arm64/kernel/topology.c ++++ b/arch/arm64/kernel/topology.c +@@ -238,7 +238,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu) + */ + if (cpumask_subset(cpu_coregroup_mask(cpu), + &cpu_topology[cpu].cluster_sibling)) +- return get_cpu_mask(cpu); ++ return topology_sibling_cpumask(cpu); + + return &cpu_topology[cpu].cluster_sibling; + } +-- +2.23.0 + diff --git a/patches/0123-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch b/patches/0123-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch new file mode 100644 index 00000000..8e32c4f2 --- /dev/null +++ b/patches/0123-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch @@ -0,0 +1,176 @@ +From d99abce298f4d141832460998bd398f38aef4eee Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Mon, 17 Oct 2022 15:01:55 +0800 +Subject: [PATCH 123/132] sched: Add per_cpu cluster domain info and + cpus_share_lowest_cache API + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ + +---------------------------------------------------------------------- + +Add per-cpu cluster domain info and cpus_share_lowest_cache() API. +This is the preparation for the optimization of select_idle_cpu() +on platforms with cluster scheduler level. + +Tested-by: K Prateek Nayak +Signed-off-by: Barry Song +Signed-off-by: Yicong Yang +Reviewed-by: Gautham R. 
Shenoy +Reviewed-by: Tim Chen +Reviewed-by: Vincent Guittot +Signed-off-by: Jie Liu + +Conflicts: + include/linux/sched/sd_flags.h + kernel/sched/core.c + kernel/sched/sched.h + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + include/linux/sched/topology.h | 23 +++++++++++++++-------- + kernel/sched/core.c | 14 ++++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 15 +++++++++++++++ + 4 files changed, 46 insertions(+), 8 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 15d2e06f690b..55eec54e7f1e 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -25,13 +25,14 @@ + #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ + #define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ + #define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +-#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ +-#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +-#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +-#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ +-#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +-#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +-#define SD_NUMA 0x4000 /* cross-node balancing */ ++#define SD_CLUSTER 0x0100 /* Domain members share CPU cluster */ ++#define SD_SHARE_POWERDOMAIN 0x0200 /* Domain members share power domain */ ++#define SD_SHARE_PKG_RESOURCES 0x0400 /* Domain members share cpu pkg resources */ ++#define SD_SERIALIZE 0x0800 /* Only a single load balancing instance */ ++#define SD_ASYM_PACKING 0x1000 /* Place busy groups earlier in the domain */ ++#define SD_PREFER_SIBLING 0x2000 /* Prefer to place tasks in a sibling domain */ ++#define SD_OVERLAP 0x4000 /* sched_domains of this level overlap */ ++#define SD_NUMA 0x8000 /* cross-node balancing */ + + #ifdef CONFIG_SCHED_SMT + static inline int cpu_smt_flags(void) +@@ -43,7 +44,7 @@ static inline int cpu_smt_flags(void) + #ifdef CONFIG_SCHED_CLUSTER + static inline int cpu_cluster_flags(void) + { +- return SD_SHARE_PKG_RESOURCES; ++ return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; + } + #endif + +@@ -180,6 +181,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); + void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + + bool cpus_share_cache(int this_cpu, int that_cpu); ++bool cpus_share_lowest_cache(int this_cpu, int that_cpu); + + typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + typedef int (*sched_domain_flags_f)(void); +@@ -227,6 +229,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) + return true; + } + ++static inline bool cpus_share_lowest_cache(int this_cpu, int that_cpu) ++{ ++ return true; ++} ++ + #endif /* !CONFIG_SMP */ + + static inline int task_node(const struct task_struct *p) +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 7825ceaae0c4..bbfed1ce2372 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1851,6 +1851,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) + + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); + } ++ ++/* ++ * Whether CPUs are share lowest cache, which means LLC on non-cluster ++ * machines and LLC tag or L2 on machines with clusters. 
++ */ ++bool cpus_share_lowest_cache(int this_cpu, int that_cpu) ++{ ++ if (this_cpu == that_cpu) ++ return true; ++ ++ return per_cpu(sd_lowest_cache_id, this_cpu) == ++ per_cpu(sd_lowest_cache_id, that_cpu); ++} ++ + #endif /* CONFIG_SMP */ + + static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1d882a2b8d5f..c9019e1a6296 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1307,7 +1307,9 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_lowest_cache_id); + DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DECLARE_PER_CPU(struct sched_domain *, sd_numa); + DECLARE_PER_CPU(struct sched_domain *, sd_asym); + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 5d662314c08b..0b299f9d60cf 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -412,6 +412,8 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_lowest_cache_id); ++DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain *, sd_numa); + DEFINE_PER_CPU(struct sched_domain *, sd_asym); +@@ -445,6 +447,18 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ sd = lowest_flag_domain(cpu, SD_CLUSTER); ++ if (sd) ++ id = cpumask_first(sched_domain_span(sd)); ++ rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd); ++ ++ /* ++ * This assignment should be placed after the sd_llc_id as ++ * we want this id equals to cluster id on cluster machines ++ * but equals to LLC id on non-Cluster machines. ++ */ ++ per_cpu(sd_lowest_cache_id, cpu) = id; ++ + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + +@@ -1162,6 +1176,7 @@ static struct cpumask ***sched_domains_numa_masks; + */ + #define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ ++ SD_CLUSTER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ +-- +2.23.0 + diff --git a/patches/0124-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch b/patches/0124-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch new file mode 100644 index 00000000..35772e29 --- /dev/null +++ b/patches/0124-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch @@ -0,0 +1,224 @@ +From 30fc24beff2ab85a2e29cbd483fb62f4a99b5283 Mon Sep 17 00:00:00 2001 +From: Barry Song +Date: Mon, 17 Oct 2022 15:34:27 +0800 +Subject: [PATCH 124/132] sched/fair: Scan cluster before scanning LLC in + wake-up path + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/20220915073423.25535-1-yangyicong@huawei.com/ + +---------------------------------------------------------------------- + +For platforms having clusters like Kunpeng920, CPUs within the same cluster +have lower latency when synchronizing and accessing shared resources like +cache. 
Thus, this patch tries to find an idle cpu within the cluster of the +target CPU before scanning the whole LLC to gain lower latency. + +Testing has been done on Kunpeng920 by pinning tasks to one numa and two +numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs. + +With this patch, We noticed enhancement on tbench within one numa or cross +two numa. + +On numa 0: + 6.0-rc1 patched +Hmean 1 351.20 ( 0.00%) 396.45 * 12.88%* +Hmean 2 700.43 ( 0.00%) 793.76 * 13.32%* +Hmean 4 1404.42 ( 0.00%) 1583.62 * 12.76%* +Hmean 8 2833.31 ( 0.00%) 3147.85 * 11.10%* +Hmean 16 5501.90 ( 0.00%) 6089.89 * 10.69%* +Hmean 32 10428.59 ( 0.00%) 10619.63 * 1.83%* +Hmean 64 8223.39 ( 0.00%) 8306.93 * 1.02%* +Hmean 128 7042.88 ( 0.00%) 7068.03 * 0.36%* + +On numa 0-1: + 6.0-rc1 patched +Hmean 1 363.06 ( 0.00%) 397.13 * 9.38%* +Hmean 2 721.68 ( 0.00%) 789.84 * 9.44%* +Hmean 4 1435.15 ( 0.00%) 1566.01 * 9.12%* +Hmean 8 2776.17 ( 0.00%) 3007.05 * 8.32%* +Hmean 16 5471.71 ( 0.00%) 6103.91 * 11.55%* +Hmean 32 10164.98 ( 0.00%) 11531.81 * 13.45%* +Hmean 64 17143.28 ( 0.00%) 20078.68 * 17.12%* +Hmean 128 14552.70 ( 0.00%) 15156.41 * 4.15%* +Hmean 256 12827.37 ( 0.00%) 13326.86 * 3.89%* + +Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch +in the code has not been tested but it supposed to work. + +Suggested-by: Peter Zijlstra +[https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net] +Tested-by: Yicong Yang +Signed-off-by: Barry Song +Signed-off-by: Yicong Yang +Reviewed-by: Tim Chen +Reviewed-by: Chen Yu +Signed-off-by: Jie Liu + +Conflicts: + kernel/sched/fair.c + kernel/sched/sched.h + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 11 +++++++++ + 3 files changed, 58 insertions(+), 4 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1c78e2f29901..8ff0f87f1a76 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6796,6 +6796,30 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + #endif + ++ if (static_branch_unlikely(&sched_cluster_active)) { ++ struct sched_domain *sdc = ++ rcu_dereference(per_cpu(sd_cluster, target)); ++ ++ if (sdc) { ++ for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { ++ bool idle = true; ++ ++ if (!cpumask_test_cpu(cpu, cpus)) ++ continue; ++ ++ for_each_cpu(cpu, cpu_smt_mask(core)) { ++ cpumask_clear_cpu(cpu, cpus); ++ if (!available_idle_cpu(cpu)) ++ idle = false; ++ } ++ ++ if (idle) ++ return core; ++ } ++ cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); ++ } ++ } ++ + for_each_cpu_wrap(core, cpus, target) { + bool idle = true; + +@@ -6901,8 +6925,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t + cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + #endif + ++ if (static_branch_unlikely(&sched_cluster_active)) { ++ struct sched_domain *sdc = ++ rcu_dereference(per_cpu(sd_cluster, target)); ++ ++ if (sdc) { ++ for_each_cpu_wrap(cpu, sched_domain_span(sdc), target) { ++ if (!cpumask_test_cpu(cpu, cpus)) ++ continue; ++ if (--nr <= 0) ++ return -1; ++ if (available_idle_cpu(cpu) || ++ sched_idle_cpu(cpu)) ++ return cpu; ++ } ++ cpumask_andnot(cpus, cpus, sched_domain_span(sdc)); ++ } ++ } ++ + for_each_cpu_wrap(cpu, cpus, target) { +- if (!--nr) ++ if (--nr <= 0) + return -1; + if (available_idle_cpu(cpu) 
|| sched_idle_cpu(cpu)) + break; +@@ -6952,11 +6994,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * If the previous CPU is cache affine and idle, don't be stupid: + */ + #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +- if (prev != target && cpus_share_cache(prev, target) && ++ if (prev != target && cpus_share_lowest_cache(prev, target) && + cpumask_test_cpu(prev, p->select_cpus) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) { + #else +- if (prev != target && cpus_share_cache(prev, target) && ++ if (prev != target && cpus_share_lowest_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev))) { + #endif + SET_STAT(found_idle_cpu_easy); +@@ -6967,7 +7009,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && +- cpus_share_cache(recent_used_cpu, target) && ++ cpus_share_lowest_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + cpumask_test_cpu(p->recent_used_cpu, p->select_cpus)) { +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c9019e1a6296..131228b5c268 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1312,6 +1312,7 @@ DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster); + DECLARE_PER_CPU(struct sched_domain *, sd_numa); + DECLARE_PER_CPU(struct sched_domain *, sd_asym); ++extern struct static_key_false sched_cluster_active; + + struct sched_group_capacity { + atomic_t ref; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 0b299f9d60cf..eda15f08577f 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -418,6 +418,8 @@ DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain *, sd_numa); + DEFINE_PER_CPU(struct sched_domain *, sd_asym); + ++DEFINE_STATIC_KEY_FALSE(sched_cluster_active); ++ + static void update_top_cache_domain(int cpu) + { + #ifdef CONFIG_SCHED_STEAL +@@ -1856,6 +1858,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + struct s_data d; + struct rq *rq = NULL; + int i, ret = -ENOMEM; ++ bool has_cluster = false; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) +@@ -1868,6 +1871,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + sd = NULL; + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); ++ has_cluster |= sd->flags & SD_CLUSTER; + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) +@@ -1924,6 +1928,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + } + rcu_read_unlock(); + ++ if (has_cluster) ++ static_branch_inc_cpuslocked(&sched_cluster_active); ++ + if (rq && sched_debug_enabled) { + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); +@@ -2018,8 +2025,12 @@ int sched_init_domains(const struct cpumask *cpu_map) + */ + static void detach_destroy_domains(const struct cpumask *cpu_map) + { ++ unsigned int cpu = cpumask_any(cpu_map); + int i; + ++ if (rcu_access_pointer(per_cpu(sd_cluster, cpu))) ++ static_branch_dec_cpuslocked(&sched_cluster_active); ++ + rcu_read_lock(); + for_each_cpu(i, cpu_map) + 
cpu_attach_domain(NULL, &def_root_domain, i); +-- +2.23.0 + diff --git a/patches/0125-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch b/patches/0125-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch new file mode 100644 index 00000000..acf8ffea --- /dev/null +++ b/patches/0125-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch @@ -0,0 +1,78 @@ +From 302c73229a43f4c11f262c971129db0d12f4b8a4 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:38 -0800 +Subject: [PATCH 125/132] scheduler: Create SDTL_SKIP flag to skip topology + level + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +A system admin may not want to use cluster scheduling. Make changes to +allow cluster topology level to be skipped when building sched domains. + +Create SDTL_SKIP bit on the sched_domain_topology_level flag so we can +check if the cluster topology level should be skipped when building +sched domains. + +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu + +Conflicts: + kernel/sched/topology.c + +Signed-off-by: Jiang Yi +--- + include/linux/sched/topology.h | 1 + + kernel/sched/topology.c | 12 ++++++++++-- + 2 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 55eec54e7f1e..ae4ba452c111 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -187,6 +187,7 @@ typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + typedef int (*sched_domain_flags_f)(void); + + #define SDTL_OVERLAP 0x01 ++#define SDTL_SKIP 0x02 + + struct sd_data { + struct sched_domain *__percpu *sd; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index eda15f08577f..887e2d06d98a 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1330,8 +1330,16 @@ static struct sched_domain_topology_level default_topology[] = { + static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + ++static struct sched_domain_topology_level * ++next_tl(struct sched_domain_topology_level *tl) ++{ ++ while (tl->mask && tl->flags & SDTL_SKIP) ++ ++tl; ++ return tl; ++} ++ + #define for_each_sd_topology(tl) \ +- for (tl = sched_domain_topology; tl->mask; tl++) ++ for (tl = next_tl(sched_domain_topology); tl->mask; tl = next_tl(++tl)) + + void set_sched_topology(struct sched_domain_topology_level *tl) + { +@@ -1872,7 +1880,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + has_cluster |= sd->flags & SD_CLUSTER; +- if (tl == sched_domain_topology) ++ if (tl == next_tl(sched_domain_topology)) + *per_cpu_ptr(d.sd, i) = sd; + if (tl->flags & SDTL_OVERLAP) + sd->flags |= SD_OVERLAP; +-- +2.23.0 + diff --git a/patches/0126-sysctl-add-a-new-register_sysctl_init-interface.patch b/patches/0126-sysctl-add-a-new-register_sysctl_init-interface.patch new file mode 100644 index 00000000..422158ca --- /dev/null +++ b/patches/0126-sysctl-add-a-new-register_sysctl_init-interface.patch @@ -0,0 +1,196 @@ +From 39518365ec710c299de64d5fb28ce80f22869536 Mon Sep 17 00:00:00 2001 +From: Xiaoming Ni +Date: Thu, 28 Jul 2022 18:06:57 +0800 +Subject: [PATCH 126/132] sysctl: add a new register_sysctl_init() interface + +mainline inclusion +from 
mainline-v5.17-rc1 +commit 3ddd9a808cee7284931312f2f3e854c9617f44b2 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ddd9a808cee7284931312f2f3e854c9617f44b2 + +---------------------------------------------------------------------- + +Patch series "sysctl: first set of kernel/sysctl cleanups", v2. + +Finally had time to respin the series of the work we had started last +year on cleaning up the kernel/sysct.c kitchen sink. People keeps +stuffing their sysctls in that file and this creates a maintenance +burden. So this effort is aimed at placing sysctls where they actually +belong. + +I'm going to split patches up into series as there is quite a bit of +work. + +This first set adds register_sysctl_init() for uses of registerting a +sysctl on the init path, adds const where missing to a few places, +generalizes common values so to be more easy to share, and starts the +move of a few kernel/sysctl.c out where they belong. + +The majority of rework on v2 in this first patch set is 0-day fixes. +Eric Biederman's feedback is later addressed in subsequent patch sets. + +I'll only post the first two patch sets for now. We can address the +rest once the first two patch sets get completely reviewed / Acked. + +This patch (of 9): + +The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty +dishes, this makes it very difficult to maintain. + +To help with this maintenance let's start by moving sysctls to places +where they actually belong. The proc sysctl maintainers do not want to +know what sysctl knobs you wish to add for your own piece of code, we +just care about the core logic. + +Today though folks heavily rely on tables on kernel/sysctl.c so they can +easily just extend this table with their needed sysctls. In order to +help users move their sysctls out we need to provide a helper which can +be used during code initialization. + +We special-case the initialization use of register_sysctl() since it +*is* safe to fail, given all that sysctls do is provide a dynamic +interface to query or modify at runtime an existing variable. So the +use case of register_sysctl() on init should *not* stop if the sysctls +don't end up getting registered. It would be counter productive to stop +boot if a simple sysctl registration failed. + +Provide a helper for init then, and document the recommended init levels +to use for callers of this routine. We will later use this in +subsequent patches to start slimming down kernel/sysctl.c tables and +moving sysctl registration to the code which actually needs these +sysctls. + +[mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] + +Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org +Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org +Signed-off-by: Xiaoming Ni +Signed-off-by: Luis Chamberlain +Reviewed-by: Kees Cook +Cc: Iurii Zaikin +Cc: "Eric W. 
Biederman" +Cc: Peter Zijlstra +Cc: Greg Kroah-Hartman +Cc: Paul Turner +Cc: Andy Shevchenko +Cc: Sebastian Reichel +Cc: Tetsuo Handa +Cc: Petr Mladek +Cc: Sergey Senozhatsky +Cc: Qing Wang +Cc: Benjamin LaHaise +Cc: Al Viro +Cc: Jan Kara +Cc: Amir Goldstein +Cc: Stephen Kitt +Cc: Antti Palosaari +Cc: Arnd Bergmann +Cc: Benjamin Herrenschmidt +Cc: Clemens Ladisch +Cc: David Airlie +Cc: Jani Nikula +Cc: Joel Becker +Cc: Joonas Lahtinen +Cc: Joseph Qi +Cc: Julia Lawall +Cc: Lukas Middendorf +Cc: Mark Fasheh +Cc: Phillip Potter +Cc: Rodrigo Vivi +Cc: Douglas Gilbert +Cc: James E.J. Bottomley +Cc: Jani Nikula +Cc: John Ogness +Cc: Martin K. Petersen +Cc: "Rafael J. Wysocki" +Cc: Steven Rostedt (VMware) +Cc: Suren Baghdasaryan +Cc: "Theodore Ts'o" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Liu Shixin +Reviewed-by: Kefeng Wang +Signed-off-by: Zheng Zengkai + +Conflicts: + both modified: fs/proc/proc_sysctl.c + both modified: include/linux/sysctl.h + +Signed-off-by: Jiang Yi +--- + fs/proc/proc_sysctl.c | 34 ++++++++++++++++++++++++++++++++++ + include/linux/sysctl.h | 4 ++++ + 2 files changed, 38 insertions(+) + +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index c95f32b83a94..2712aa568331 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + static const struct dentry_operations proc_sys_dentry_operations; +@@ -1376,6 +1377,39 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab + } + EXPORT_SYMBOL(register_sysctl); + ++/** ++ * __register_sysctl_init() - register sysctl table to path ++ * @path: path name for sysctl base ++ * @table: This is the sysctl table that needs to be registered to the path ++ * @table_name: The name of sysctl table, only used for log printing when ++ * registration fails ++ * ++ * The sysctl interface is used by userspace to query or modify at runtime ++ * a predefined value set on a variable. These variables however have default ++ * values pre-set. Code which depends on these variables will always work even ++ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the ++ * ability to query or modify the sysctls dynamically at run time. Chances of ++ * register_sysctl() failing on init are extremely low, and so for both reasons ++ * this function does not return any error as it is used by initialization code. ++ * ++ * Context: Can only be called after your respective sysctl base path has been ++ * registered. So for instance, most base directories are registered early on ++ * init before init levels are processed through proc_sys_init() and ++ * sysctl_init(). 
++ */ ++void __init __register_sysctl_init(const char *path, struct ctl_table *table, ++ const char *table_name) ++{ ++ struct ctl_table_header *hdr = register_sysctl(path, table); ++ ++ if (unlikely(!hdr)) { ++ pr_err("failed when register_sysctl %s to %s\n", ++ table_name, path); ++ return; ++ } ++ kmemleak_not_leak(hdr); ++} ++ + static char *append_path(const char *path, char *pos, const char *name) + { + int namelen; +diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h +index b769ecfcc3bd..04c822f6e7e9 100644 +--- a/include/linux/sysctl.h ++++ b/include/linux/sysctl.h +@@ -198,6 +198,10 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + void unregister_sysctl_table(struct ctl_table_header * table); + + extern int sysctl_init(void); ++extern void __register_sysctl_init(const char *path, struct ctl_table *table, ++ const char *table_name); ++#define register_sysctl_init(path, table) \ ++ __register_sysctl_init(path, table, #table) + + extern struct ctl_table sysctl_mount_point[]; + +-- +2.23.0 + diff --git a/patches/0127-sched-topology-drivers-base-arch_topology-Rebuild-th.patch b/patches/0127-sched-topology-drivers-base-arch_topology-Rebuild-th.patch new file mode 100644 index 00000000..4594b3d4 --- /dev/null +++ b/patches/0127-sched-topology-drivers-base-arch_topology-Rebuild-th.patch @@ -0,0 +1,87 @@ +From 965864f440f01ae1212284dfff617ef3e435569b Mon Sep 17 00:00:00 2001 +From: Morten Rasmussen +Date: Fri, 20 Jul 2018 14:32:32 +0100 +Subject: [PATCH 127/132] sched/topology, drivers/base/arch_topology: Rebuild + the sched_domain hierarchy when capacities change + +mainline inclusion +from mainline-v4.20-rc1 +commit bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3 + +---------------------------------------------------------------------- + +The setting of SD_ASYM_CPUCAPACITY depends on the per-CPU capacities. +These might not have their final values when the hierarchy is initially +built as the values depend on cpufreq to be initialized or the values +being set through sysfs. To ensure that the flags are set correctly we +need to rebuild the sched_domain hierarchy whenever the reported per-CPU +capacity (arch_scale_cpu_capacity()) changes. + +This patch ensure that a full sched_domain rebuild happens when CPU +capacity changes occur. 
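+
+For illustration only: the helper below is not part of this patch (the real
+setter, arch_rebuild_cpu_topology(), is added later in this series), but it
+shows the intended use of the update_topology flag introduced below. The flag
+is raised around a forced rebuild so that arch_update_cpu_topology() reports
+a change for exactly one rebuild cycle:
+
+	/* Sketch only; would live next to update_topology in arch_topology.c.
+	 * force_sched_domain_rebuild() is a hypothetical name. */
+	static void force_sched_domain_rebuild(void)
+	{
+		update_topology = 1;
+		rebuild_sched_domains();	/* ends up in partition_sched_domains() */
+		update_topology = 0;
+	}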
+ +Signed-off-by: Morten Rasmussen +Signed-off-by: Peter Zijlstra (Intel) +Cc: Greg Kroah-Hartman +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: dietmar.eggemann@arm.com +Cc: valentin.schneider@arm.com +Cc: vincent.guittot@linaro.org +Link: http://lkml.kernel.org/r/1532093554-30504-3-git-send-email-morten.rasmussen@arm.com +Signed-off-by: Ingo Molnar + +Conflicts: + drivers/base/arch_topology.c + +Signed-off-by: Jiang Yi +--- + drivers/base/arch_topology.c | 8 ++++++++ + include/linux/arch_topology.h | 1 + + 2 files changed, 9 insertions(+) + +diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c +index 729dded51e7b..5ef5e0198f9e 100644 +--- a/drivers/base/arch_topology.c ++++ b/drivers/base/arch_topology.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +@@ -67,6 +68,13 @@ static int register_cpu_capacity_sysctl(void) + } + subsys_initcall(register_cpu_capacity_sysctl); + ++static int update_topology; ++ ++int topology_update_cpu_topology(void) ++{ ++ return update_topology; ++} ++ + static u32 capacity_scale; + static u32 *raw_capacity; + +diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h +index 80c28bfce557..a0889776a9e0 100644 +--- a/include/linux/arch_topology.h ++++ b/include/linux/arch_topology.h +@@ -9,6 +9,7 @@ + #include + + void topology_normalize_cpu_scale(void); ++int topology_update_cpu_topology(void); + + struct device_node; + bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); +-- +2.23.0 + diff --git a/patches/0128-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch b/patches/0128-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch new file mode 100644 index 00000000..29e47d41 --- /dev/null +++ b/patches/0128-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch @@ -0,0 +1,58 @@ +From 006c23c028bec016ef2dd7210932b3296330c840 Mon Sep 17 00:00:00 2001 +From: Morten Rasmussen +Date: Fri, 20 Jul 2018 14:32:33 +0100 +Subject: [PATCH 128/132] sched/topology, arch/arm64: Rebuild the sched_domain + hierarchy when the CPU capacity changes + +mainline inclusion +from mainline-v4.20-rc1 +commit 3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3 + +---------------------------------------------------------------------- + +Asymmetric CPU capacity can not necessarily be determined accurately at +the time the initial sched_domain hierarchy is built during boot. It is +therefore necessary to be able to force a full rebuild of the hierarchy +later triggered by the arch_topology driver. A full rebuild requires the +arch-code to implement arch_update_cpu_topology() which isn't yet +implemented for arm64. This patch points the arm64 implementation to +arch_topology driver to ensure that full hierarchy rebuild happens when +needed. 
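+
+As a simplified illustration of the consumer side (condensed from
+partition_sched_domains() in kernel/sched/topology.c; can_reuse_domain() is a
+made-up name and not part of this patch): once the define below is in place,
+an arch-reported topology change prevents any existing sched_domain from
+being reused, so the whole hierarchy is rebuilt with the updated capacities:
+
+	static bool can_reuse_domain(const struct cpumask *cur,
+				     const struct cpumask *new)
+	{
+		/* any reported topology change forces a full rebuild */
+		if (arch_update_cpu_topology())
+			return false;
+		return cpumask_equal(cur, new);
+	}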
+ +Signed-off-by: Morten Rasmussen +Signed-off-by: Peter Zijlstra (Intel) +Cc: Catalin Marinas +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Will Deacon +Cc: dietmar.eggemann@arm.com +Cc: valentin.schneider@arm.com +Cc: vincent.guittot@linaro.org +Link: http://lkml.kernel.org/r/1532093554-30504-4-git-send-email-morten.rasmussen@arm.com +Signed-off-by: Ingo Molnar +Signed-off-by: Jiang Yi +--- + arch/arm64/include/asm/topology.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h +index 164e26035653..7f0de9ec65f7 100644 +--- a/arch/arm64/include/asm/topology.h ++++ b/arch/arm64/include/asm/topology.h +@@ -50,6 +50,9 @@ int pcibus_to_node(struct pci_bus *bus); + /* Replace task scheduler's default cpu-invariant accounting */ + #define arch_scale_cpu_capacity topology_get_cpu_scale + ++/* Enable topology flag updates */ ++#define arch_update_cpu_topology topology_update_cpu_topology ++ + #include + + #endif /* _ASM_ARM_TOPOLOGY_H */ +-- +2.23.0 + diff --git a/patches/0129-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch b/patches/0129-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch new file mode 100644 index 00000000..6b46431f --- /dev/null +++ b/patches/0129-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch @@ -0,0 +1,237 @@ +From 54062cb62d4060a08ebe460e82eb94a49da70f6a Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:40 -0800 +Subject: [PATCH 129/132] scheduler: Add runtime knob sysctl_sched_cluster + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +Allow run time configuration of the scheduler to use cluster +scheduling. Configuration can be changed via the sysctl variable +/proc/sys/kernel/sched_cluster. Setting it to 1 enable cluster +scheduling and setting it to 0 turns it off. + +Cluster scheduling should benefit independent tasks by load balancing +them between clusters. It reaps the most benefit when the system's CPUs +are not fully busy, so we can spread the tasks out between the clusters to +reduce contention on cluster resource (e.g. L2 cache). + +However, if the system is expected to operate close to full utilization, +the system admin could turn this feature off so as not to incur +extra load balancing overhead between the cluster domains. 
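+
+A minimal userspace sketch (not part of this patch; set_cluster_sched() is a
+hypothetical helper, equivalent to "echo 1 > /proc/sys/kernel/sched_cluster"):
+
+	#include <stdio.h>
+
+	/* returns 0 on success; needs root and a kernel with CONFIG_SCHED_CLUSTER=y */
+	static int set_cluster_sched(int enable)
+	{
+		FILE *f = fopen("/proc/sys/kernel/sched_cluster", "w");
+
+		if (!f)
+			return -1;
+		fprintf(f, "%d\n", !!enable);
+		return fclose(f);
+	}
+
+	int main(void)
+	{
+		return set_cluster_sched(1) ? 1 : 0;
+	}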
+ +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu + +Conflicts: + arch/x86/kernel/smpboot.c + drivers/base/arch_topology.c + include/linux/sched/sysctl.h + +Signed-off-by: Jiang Yi +--- + arch/x86/kernel/smpboot.c | 8 +++++ + drivers/base/arch_topology.c | 10 +++++- + include/linux/sched/sysctl.h | 7 ++++ + include/linux/topology.h | 1 + + kernel/sched/core.c | 1 + + kernel/sched/sched.h | 6 ++++ + kernel/sched/topology.c | 67 ++++++++++++++++++++++++++++++++++++ + 7 files changed, 99 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index e9dd01f7d602..1993690cfd80 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -116,6 +117,13 @@ int arch_update_cpu_topology(void) + return retval; + } + ++void arch_rebuild_cpu_topology(void) ++{ ++ x86_topology_update = true; ++ rebuild_sched_domains(); ++ x86_topology_update = false; ++} ++ + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) + { + unsigned long flags; +diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c +index 5ef5e0198f9e..f601eb3238a1 100644 +--- a/drivers/base/arch_topology.c ++++ b/drivers/base/arch_topology.c +@@ -68,6 +68,7 @@ static int register_cpu_capacity_sysctl(void) + } + subsys_initcall(register_cpu_capacity_sysctl); + ++static u32 capacity_scale; + static int update_topology; + + int topology_update_cpu_topology(void) +@@ -75,7 +76,14 @@ int topology_update_cpu_topology(void) + return update_topology; + } + +-static u32 capacity_scale; ++void __weak arch_rebuild_cpu_topology(void) ++{ ++ update_topology = 1; ++ rebuild_sched_domains(); ++ pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); ++ update_topology = 0; ++} ++ + static u32 *raw_capacity; + + static int free_raw_capacity(void) +diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h +index ad472760e97d..691037143faa 100644 +--- a/include/linux/sched/sysctl.h ++++ b/include/linux/sched/sysctl.h +@@ -104,4 +104,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, + loff_t *ppos); + + extern int sysctl_umh_affinity; ++ ++#ifdef CONFIG_SCHED_CLUSTER ++extern unsigned int sysctl_sched_cluster; ++int sched_cluster_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos); ++#endif ++ + #endif /* _LINUX_SCHED_SYSCTL_H */ +diff --git a/include/linux/topology.h b/include/linux/topology.h +index 9a7753fcae6a..63fb192f425b 100644 +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -43,6 +43,7 @@ + if (nr_cpus_node(node)) + + int arch_update_cpu_topology(void); ++void arch_rebuild_cpu_topology(void); + + /* Conform to ACPI 2.0 SLIT distance definitions */ + #define LOCAL_DISTANCE 10 +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index bbfed1ce2372..e518fc08fd41 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5968,6 +5968,7 @@ int sched_cpu_dying(unsigned int cpu) + void __init sched_init_smp(void) + { + sched_init_numa(); ++ set_sched_cluster(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 131228b5c268..7e2c49032615 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1190,6 +1190,12 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) + #endif + } + ++#ifdef CONFIG_SCHED_CLUSTER ++extern void set_sched_cluster(void); 
++#else ++static inline void set_sched_cluster(void) { } ++#endif ++ + #ifdef CONFIG_NUMA + enum numa_topology_type { + NUMA_DIRECT, +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 887e2d06d98a..8157e9fb9bfa 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1330,6 +1330,73 @@ static struct sched_domain_topology_level default_topology[] = { + static struct sched_domain_topology_level *sched_domain_topology = + default_topology; + ++#ifdef CONFIG_SCHED_CLUSTER ++void set_sched_cluster(void) ++{ ++ struct sched_domain_topology_level *tl; ++ ++ for (tl = sched_domain_topology; tl->mask; tl++) { ++ if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { ++ if (!sysctl_sched_cluster) ++ tl->flags |= SDTL_SKIP; ++ else ++ tl->flags &= ~SDTL_SKIP; ++ break; ++ } ++ } ++} ++ ++/* set via /proc/sys/kernel/sched_cluster */ ++unsigned int __read_mostly sysctl_sched_cluster = 1; ++ ++static DEFINE_MUTEX(sched_cluster_mutex); ++int sched_cluster_handler(struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ unsigned int oldval; ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ mutex_lock(&sched_cluster_mutex); ++ oldval = sysctl_sched_cluster; ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ if (!ret && write) { ++ if (oldval != sysctl_sched_cluster) { ++ set_sched_cluster(); ++ arch_rebuild_cpu_topology(); ++ } ++ } ++ mutex_unlock(&sched_cluster_mutex); ++ ++ return ret; ++} ++ ++static int zero; ++static int one = 1; ++ ++static struct ctl_table sched_cluster_sysctls[] = { ++ { ++ .procname = "sched_cluster", ++ .data = &sysctl_sched_cluster, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_cluster_handler, ++ .extra1 = (void *)&zero, ++ .extra2 = (void *)&one, ++ }, ++ {} ++}; ++ ++static int __init sched_cluster_sysctl_init(void) ++{ ++ register_sysctl_init("kernel", sched_cluster_sysctls); ++ return 0; ++} ++late_initcall(sched_cluster_sysctl_init); ++#endif ++ + static struct sched_domain_topology_level * + next_tl(struct sched_domain_topology_level *tl) + { +-- +2.23.0 + diff --git a/patches/0130-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch b/patches/0130-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch new file mode 100644 index 00000000..c690683f --- /dev/null +++ b/patches/0130-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch @@ -0,0 +1,72 @@ +From 57ed48e6a2f3f540d27a795c875e957a72272245 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Fri, 3 Dec 2021 12:32:41 -0800 +Subject: [PATCH 130/132] scheduler: Add boot time enabling/disabling of + cluster scheduling + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S +Reference: https://lore.kernel.org/lkml/cover.1638563225.git.tim.c.chen@linux.intel.com/ + +---------------------------------------------------------------------- + +Add boot time parameter sched_cluster to enable or disable cluster +scheduling. 
Set boot parameter as follow: + + sched_cluster=0 disables cluster scheduling + sched_cluster=1 enables cluster scheduling + +Signed-off-by: Tim Chen +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + Documentation/admin-guide/kernel-parameters.txt | 4 ++++ + kernel/sched/topology.c | 16 ++++++++++++++++ + 2 files changed, 20 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 81c3e5e6447f..cd413b202ea5 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4310,6 +4310,10 @@ + + sched_debug [KNL] Enables verbose scheduler debug messages. + ++ sched_cluster= Enable or disable cluster scheduling. ++ 0 -- disable. ++ 1 -- enable. ++ + schedstats= [KNL,X86] Enable or disable scheduled statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8157e9fb9bfa..fdc3ae9e1bc0 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1395,6 +1395,22 @@ static int __init sched_cluster_sysctl_init(void) + return 0; + } + late_initcall(sched_cluster_sysctl_init); ++ ++static int __init sched_cluster_option(char *str) ++{ ++ int enable; ++ ++ if (get_option(&str, &enable)) { ++ if (enable != 0 && enable != 1) ++ return -EINVAL; ++ ++ sysctl_sched_cluster = enable; ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++early_param("sched_cluster", sched_cluster_option); + #endif + + static struct sched_domain_topology_level * +-- +2.23.0 + diff --git a/patches/0131-scheduler-Disable-cluster-scheduling-by-default.patch b/patches/0131-scheduler-Disable-cluster-scheduling-by-default.patch new file mode 100644 index 00000000..f2e27545 --- /dev/null +++ b/patches/0131-scheduler-Disable-cluster-scheduling-by-default.patch @@ -0,0 +1,38 @@ +From cab126264ad499e2b122e2926b8dc24ca2eb4a7c Mon Sep 17 00:00:00 2001 +From: Yicong Yang +Date: Mon, 13 Feb 2023 10:48:54 +0800 +Subject: [PATCH 131/132] scheduler: Disable cluster scheduling by default + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S + +---------------------------------------------------------------------- + +Disable cluster scheduling by default since it's not a universal win. +User can choose to enable it through sysctl or at boot time according to +their scenario. 
+ +Signed-off-by: Yicong Yang +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + kernel/sched/topology.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index fdc3ae9e1bc0..1cc9ec74d24b 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1347,7 +1347,7 @@ void set_sched_cluster(void) + } + + /* set via /proc/sys/kernel/sched_cluster */ +-unsigned int __read_mostly sysctl_sched_cluster = 1; ++unsigned int __read_mostly sysctl_sched_cluster; + + static DEFINE_MUTEX(sched_cluster_mutex); + int sched_cluster_handler(struct ctl_table *table, int write, +-- +2.23.0 + diff --git a/patches/0132-sched-Open-the-kernel-configuration-for-cluster.patch b/patches/0132-sched-Open-the-kernel-configuration-for-cluster.patch new file mode 100644 index 00000000..ea12d9ba --- /dev/null +++ b/patches/0132-sched-Open-the-kernel-configuration-for-cluster.patch @@ -0,0 +1,35 @@ +From 9714cea606654bd9b4118c7604db5854ae68e626 Mon Sep 17 00:00:00 2001 +From: Jie Liu +Date: Mon, 24 Oct 2022 09:34:57 +0800 +Subject: [PATCH 132/132] sched:Open the kernel configuration for cluster. + +kunpeng inclusion +category: feature +bugzilla: https://gitee.com/openeuler/kernel/issues/I5W44S + +---------------------------------------------------------------------- + +In the past configuration, CONFIG_SCHED_CLUSTER was not set. Now, we need +to open the configuration. + +Signed-off-by: Jie Liu +Signed-off-by: Jiang Yi +--- + arch/arm64/configs/openeuler_defconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig +index b04256636d4b..5f818e5e1790 100644 +--- a/arch/arm64/configs/openeuler_defconfig ++++ b/arch/arm64/configs/openeuler_defconfig +@@ -420,6 +420,7 @@ CONFIG_ARM64_PA_BITS=48 + CONFIG_SCHED_MC=y + # CONFIG_SCHED_SMT is not set + CONFIG_NR_CPUS=1024 ++CONFIG_SCHED_CLUSTER=y + CONFIG_HOTPLUG_CPU=y + CONFIG_ARM64_ERR_RECOV=y + CONFIG_MPAM=y +-- +2.23.0 + diff --git a/series.conf b/series.conf index c02d844c..fc6d3050 100644 --- a/series.conf +++ b/series.conf @@ -117,3 +117,19 @@ patches/0113-perf-auxtrace-arm-Refactor-event-list-iteration-in-a.patch patches/0114-perf-auxtrace-arm64-Add-support-for-HiSilicon-PCIe-T.patch patches/0115-perf-auxtrace-arm64-Add-support-for-parsing-HiSilico.patch patches/0116-Fix-the-header-file-location-error-and-adjust-the-fu.patch +patches/0117-topology-Represent-clusters-of-CPUs-within-a-die.patch +patches/0118-sched-Add-cluster-scheduler-level-in-core-and-relate.patch +patches/0119-topology-sysfs-export-cluster-attributes-only-if-an-.patch +patches/0120-topology-Remove-unused-cpu_cluster_mask.patch +patches/0121-arch_topology-Limit-span-of-cpu_clustergroup_mask.patch +patches/0122-arch_topology-Make-cluster-topology-span-at-least-SM.patch +patches/0123-sched-Add-per_cpu-cluster-domain-info-and-cpus_share.patch +patches/0124-sched-fair-Scan-cluster-before-scanning-LLC-in-wake-.patch +patches/0125-scheduler-Create-SDTL_SKIP-flag-to-skip-topology-lev.patch +patches/0126-sysctl-add-a-new-register_sysctl_init-interface.patch +patches/0127-sched-topology-drivers-base-arch_topology-Rebuild-th.patch +patches/0128-sched-topology-arch-arm64-Rebuild-the-sched_domain-h.patch +patches/0129-scheduler-Add-runtime-knob-sysctl_sched_cluster.patch +patches/0130-scheduler-Add-boot-time-enabling-disabling-of-cluste.patch +patches/0131-scheduler-Disable-cluster-scheduling-by-default.patch 
+patches/0132-sched-Open-the-kernel-configuration-for-cluster.patch -- Gitee