From 973e6254e1746bc65dbdd702b24e8731ed6f084a Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Thu, 5 Feb 2026 14:29:10 +0800 Subject: [PATCH 001/124] anolis: Revert "KVM: arm64: Disable MPAM visibility by default and ignore VMM writes" ANBZ: #31060 This reverts commit 4e76efda1f0aaad82f967e5ed955e12f6efb1dbd. Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 53 ++------------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 49625da2c089..eb9eb9f71bea 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1541,7 +1541,6 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1689,13 +1688,6 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, val &= ~ID_AA64PFR0_EL1_AMU_MASK; - /* - * MPAM is disabled by default as KVM also needs a set of PARTID to - * program the MPAMVPMx_EL2 PARTID remapping registers with. But some - * older kernels let the guest see the ID bit. - */ - val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - return val; } @@ -1806,42 +1798,6 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } -static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 user_val) -{ - u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; - - /* - * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits - * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to - * guests, but didn't add trap handling. KVM doesn't support MPAM and - * always returns an UNDEF for these registers. The guest must see 0 - * for this field. 
- * - * But KVM must also accept values from user-space that were provided - * by KVM. On CPUs that support MPAM, permit user-space to write - * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. - */ - if ((hw_val & mpam_mask) == (user_val & mpam_mask)) - user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; - - return set_id_reg(vcpu, rd, user_val); -} - -static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 user_val) -{ - u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); - u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; - - /* See set_id_aa64pfr0_el1 for comment about MPAM */ - if ((hw_val & mpam_mask) == (user_val & mpam_mask)) - user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; - - return set_id_reg(vcpu, rd, user_val); -} - /* * cpufeature ID register user accessors * @@ -2348,20 +2304,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, - .set_user = set_id_aa64pfr0_el1, + .set_user = set_id_reg, .reset = read_sanitised_id_aa64pfr0_el1, .val = ~(ID_AA64PFR0_EL1_AMU | - ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, - { SYS_DESC(SYS_ID_AA64PFR1_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_aa64pfr1_el1, - .reset = kvm_read_sanitised_id_reg, }, + ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), -- Gitee From f9fdf74351de9827feeef788d1f841c117b33024 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:00 -0400 Subject: [PATCH 002/124] KVM: arm64: Disable fields that KVM doesn't know how to handle in ID_AA64PFR1_EL1 ANBZ: #31060 commit ffe68b2d19a5a84440fea99a732cfc3b157559eb upstream. For some of the fields in the ID_AA64PFR1_EL1 register, KVM doesn't know how to handle them right now. 
So explicitly disable them in the register accessor, then those fields value will be masked to 0 even if on the hardware the field value is 1. This is safe because from a UAPI point of view that read_sanitised_ftr_reg() doesn't yet return a nonzero value for any of those fields. This will benifit the migration if the host and VM have different values when restoring a VM. Those fields include RNDR_trap, NMI, MTE_frac, GCS, THE, MTEX, DF2, PFAR. Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-2-shahuang@redhat.com Signed-off-by: Marc Zyngier Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index eb9eb9f71bea..06eba1aa7cbf 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1541,6 +1541,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) -- Gitee From 8e1d777e00d07424450835ffb53a233f0da56d98 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:01 -0400 Subject: [PATCH 003/124] KVM: arm64: Use kvm_has_feat() to check if FEAT_SSBS is advertised to the guest ANBZ: #31060 commit e8d164974cfa46fe5ec87869c8a7113641f322d5 upstream. Currently KVM use cpus_have_final_cap() to check if FEAT_SSBS is advertised to the guest. 
But if FEAT_SSBS is writable and isn't advertised to the guest, this is wrong. Update it to use kvm_has_feat() to check if FEAT_SSBS is advertised to the guest, thus the KVM can do the right thing if FEAT_SSBS isn't advertised to the guest. Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-3-shahuang@redhat.com Signed-off-by: Marc Zyngier Signed-off-by: Wei Chen --- arch/arm64/kvm/hypercalls.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 3dffe74928ad..fcedd031f5c9 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -318,7 +318,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) * to the guest, and hide SSBS so that the * guest stays protected. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) break; fallthrough; case SPECTRE_UNAFFECTED: @@ -459,7 +459,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) * Convert the workaround level into an easy-to-compare number, where higher * values mean better protection. */ -static int get_kernel_wa_level(u64 regid) +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: @@ -480,7 +480,7 @@ static int get_kernel_wa_level(u64 regid) * don't have any FW mitigation if SSBS is there at * all times. 
*/ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; fallthrough; case SPECTRE_UNAFFECTED: @@ -517,7 +517,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; case KVM_REG_ARM_STD_BMAP: val = READ_ONCE(smccc_feat->std_bmap); @@ -619,7 +619,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; - if (get_kernel_wa_level(reg->id) < val) + if (get_kernel_wa_level(vcpu, reg->id) < val) return -EINVAL; return 0; @@ -655,7 +655,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the * other way around. */ - if (get_kernel_wa_level(reg->id) < wa_level) + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) return -EINVAL; return 0; -- Gitee From 26cfa3873868efdeb4a8ac7348235fe02e7ee84b Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:02 -0400 Subject: [PATCH 004/124] KVM: arm64: Allow userspace to change ID_AA64PFR1_EL1 ANBZ: #31060 commit 78c4446b5f957fb14737582e503b1b25f66edc45 upstream. Allow userspace to change the guest-visible value of the register with different way of handling: - Since the RAS and MPAM is not writable in the ID_AA64PFR0_EL1 register, RAS_frac and MPAM_frac are also not writable in the ID_AA64PFR1_EL1 register. - The MTE is controlled by a separate UAPI (KVM_CAP_ARM_MTE) with an internal flag (KVM_ARCH_FLAG_MTE_ENABLED). So it's not writable. 
- For those fields which KVM doesn't know how to handle, they are not exposed to the guest (being disabled in the register read accessor), those fields value will always be 0. Those fields don't have a known behavior now, so don't advertise them to the userspace. Thus still not writable. Those fields include SME, RNDR_trap, NMI, GCS, THE, DF2, PFAR, MTE_frac, MTEX. - The BT, SSBS, CSV2_frac don't introduce any new registers which KVM doesn't know how to handle, they can be written without ill effect. So let them writable. Besides, we don't do the crosscheck in KVM about the CSV2_frac even if it depends on the value of CSV2, it should be made sure by the VMM instead of KVM. Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-4-shahuang@redhat.com Signed-off-by: Marc Zyngier Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 06eba1aa7cbf..3c2705d4f218 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2320,7 +2320,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, - ID_SANITISED(ID_AA64PFR1_EL1), + ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_DF2 | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_RAS_frac | + ID_AA64PFR1_EL1_MTE)), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), -- Gitee From c411c846419d4e5b0a2b53d25b512191a69dbe90 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:03 -0400 Subject: [PATCH 005/124] KVM: selftests: aarch64: Add writable test for ID_AA64PFR1_EL1 ANBZ: #31060 commit 
dc9b5d7e0bd40e68a94013766b27be3dda10c006 upstream. Add writable test for the ID_AA64PFR1_EL1 register. Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-5-shahuang@redhat.com Signed-off-by: Marc Zyngier Signed-off-by: Wei Chen --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index cc162edcb466..3a3fffb07956 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -133,6 +133,13 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { REG_FTR_END, }; +static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, CSV2_frac, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, SSBS, ID_AA64PFR1_EL1_SSBS_NI), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, BT, 0), + REG_FTR_END, +}; + static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), @@ -198,6 +205,7 @@ static struct test_feature_reg test_regs[] = { TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), TEST_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2_el1), TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), + TEST_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1_el1), TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), @@ -469,9 +477,9 @@ int main(void) ftr_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + 
ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs); + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs); ksft_set_plan(ftr_cnt); -- Gitee From d61df9610772d13915a9bc6bb2842339f145da72 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:11 +0000 Subject: [PATCH 006/124] arm64/sysreg: Convert existing MPAM sysregs and add the remaining entries ANBZ: #31060 commit 83732ce6a056c4bb242d64fd25e1fc78f35e6a74 upstream. Move the existing MPAM system register defines from sysreg.h to tools/sysreg and add the remaining system registers. Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-2-joey.gouly@arm.com Signed-off-by: Oliver Upton [ changes for backport ] Upstream has used generated sysregs for MPAM, so we remove redefine macros of MPAM in: arch/arm64/include/asm/sysreg.h Signed-off-by: Wei Chen --- arch/arm64/include/asm/sysreg.h | 12 --- arch/arm64/tools/sysreg | 161 ++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d1700f6594f5..fcbae547498a 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -506,18 +506,6 @@ #define SYS_MAIR_EL2 sys_reg(3, 4, 10, 2, 0) #define SYS_AMAIR_EL2 sys_reg(3, 4, 10, 3, 0) -#define SYS_MPAMHCR_EL2 sys_reg(3, 4, 10, 4, 0) -#define SYS_MPAMVPMV_EL2 sys_reg(3, 4, 10, 4, 1) -#define SYS_MPAM2_EL2 sys_reg(3, 4, 10, 5, 0) -#define __SYS__MPAMVPMx_EL2(x) sys_reg(3, 4, 10, 6, x) -#define SYS_MPAMVPM0_EL2 __SYS__MPAMVPMx_EL2(0) -#define SYS_MPAMVPM1_EL2 __SYS__MPAMVPMx_EL2(1) -#define SYS_MPAMVPM2_EL2 __SYS__MPAMVPMx_EL2(2) -#define SYS_MPAMVPM3_EL2 
__SYS__MPAMVPMx_EL2(3) -#define SYS_MPAMVPM4_EL2 __SYS__MPAMVPMx_EL2(4) -#define SYS_MPAMVPM5_EL2 __SYS__MPAMVPMx_EL2(5) -#define SYS_MPAMVPM6_EL2 __SYS__MPAMVPMx_EL2(6) -#define SYS_MPAMVPM7_EL2 __SYS__MPAMVPMx_EL2(7) #define SYS_VBAR_EL2 sys_reg(3, 4, 12, 0, 0) #define SYS_RVBAR_EL2 sys_reg(3, 4, 12, 0, 1) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index fdeec84c22ad..9e0a5f49db4d 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2992,6 +2992,126 @@ Field 1 E2SPE Field 0 E0HSPE EndSysreg +Sysreg MPAMHCR_EL2 3 4 10 4 0 +Res0 63:32 +Field 31 TRAP_MPAMIDR_EL1 +Res0 30:9 +Field 8 GSTAPP_PLK +Res0 7:2 +Field 1 EL1_VPMEN +Field 0 EL0_VPMEN +EndSysreg + +Sysreg MPAMVPMV_EL2 3 4 10 4 1 +Res0 63:32 +Field 31 VPM_V31 +Field 30 VPM_V30 +Field 29 VPM_V29 +Field 28 VPM_V28 +Field 27 VPM_V27 +Field 26 VPM_V26 +Field 25 VPM_V25 +Field 24 VPM_V24 +Field 23 VPM_V23 +Field 22 VPM_V22 +Field 21 VPM_V21 +Field 20 VPM_V20 +Field 19 VPM_V19 +Field 18 VPM_V18 +Field 17 VPM_V17 +Field 16 VPM_V16 +Field 15 VPM_V15 +Field 14 VPM_V14 +Field 13 VPM_V13 +Field 12 VPM_V12 +Field 11 VPM_V11 +Field 10 VPM_V10 +Field 9 VPM_V9 +Field 8 VPM_V8 +Field 7 VPM_V7 +Field 6 VPM_V6 +Field 5 VPM_V5 +Field 4 VPM_V4 +Field 3 VPM_V3 +Field 2 VPM_V2 +Field 1 VPM_V1 +Field 0 VPM_V0 +EndSysreg + +Sysreg MPAM2_EL2 3 4 10 5 0 +Field 63 MPAMEN +Res0 62:59 +Field 58 TIDR +Res0 57 +Field 56 ALTSP_HFC +Field 55 ALTSP_EL2 +Field 54 ALTSP_FRCD +Res0 53:51 +Field 50 EnMPAMSM +Field 49 TRAPMPAM0EL1 +Field 48 TRAPMPAM1EL1 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAMVPM0_EL2 3 4 10 6 0 +Field 63:48 PhyPARTID3 +Field 47:32 PhyPARTID2 +Field 31:16 PhyPARTID1 +Field 15:0 PhyPARTID0 +EndSysreg + +Sysreg MPAMVPM1_EL2 3 4 10 6 1 +Field 63:48 PhyPARTID7 +Field 47:32 PhyPARTID6 +Field 31:16 PhyPARTID5 +Field 15:0 PhyPARTID4 +EndSysreg + +Sysreg MPAMVPM2_EL2 3 4 10 6 2 +Field 63:48 PhyPARTID11 +Field 47:32 PhyPARTID10 +Field 31:16 
PhyPARTID9 +Field 15:0 PhyPARTID8 +EndSysreg + +Sysreg MPAMVPM3_EL2 3 4 10 6 3 +Field 63:48 PhyPARTID15 +Field 47:32 PhyPARTID14 +Field 31:16 PhyPARTID13 +Field 15:0 PhyPARTID12 +EndSysreg + +Sysreg MPAMVPM4_EL2 3 4 10 6 4 +Field 63:48 PhyPARTID19 +Field 47:32 PhyPARTID18 +Field 31:16 PhyPARTID17 +Field 15:0 PhyPARTID16 +EndSysreg + +Sysreg MPAMVPM5_EL2 3 4 10 6 5 +Field 63:48 PhyPARTID23 +Field 47:32 PhyPARTID22 +Field 31:16 PhyPARTID21 +Field 15:0 PhyPARTID20 +EndSysreg + +Sysreg MPAMVPM6_EL2 3 4 10 6 6 +Field 63:48 PhyPARTID27 +Field 47:32 PhyPARTID26 +Field 31:16 PhyPARTID25 +Field 15:0 PhyPARTID24 +EndSysreg + +Sysreg MPAMVPM7_EL2 3 4 10 6 7 +Field 63:48 PhyPARTID31 +Field 47:32 PhyPARTID30 +Field 31:16 PhyPARTID29 +Field 15:0 PhyPARTID28 +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -3028,6 +3148,10 @@ Sysreg FAR_EL12 3 5 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg MPAM1_EL12 3 5 10 5 0 +Fields MPAM1_ELx +EndSysreg + Sysreg CONTEXTIDR_EL12 3 5 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -3165,6 +3289,22 @@ Res0 1 Field 0 EN EndSysreg +Sysreg MPAMIDR_EL1 3 0 10 4 4 +Res0 63:62 +Field 61 HAS_SDEFLT +Field 60 HAS_FORCE_NS +Field 59 SP4 +Field 58 HAS_TIDR +Field 57 HAS_ALTSP +Res0 56:40 +Field 39:32 PMG_MAX +Res0 31:21 +Field 20:18 VPMR_MAX +Field 17 HAS_HCR +Res0 16 +Field 15:0 PARTID_MAX +EndSysreg + Sysreg LORID_EL1 3 0 10 4 7 Res0 63:24 Field 23:16 LD @@ -3172,6 +3312,27 @@ Res0 15:8 Field 7:0 LR EndSysreg +Sysreg MPAM1_EL1 3 0 10 5 0 +Field 63 MPAMEN +Res0 62:61 +Field 60 FORCED_NS +Res0 59:55 +Field 54 ALTSP_FRCD +Res0 53:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAM0_EL1 3 0 10 5 1 +Res0 63:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS -- Gitee From a74d7751caaf3b426f4a817d80a14e82b8c85ec1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:12 
+0000 Subject: [PATCH 007/124] arm64: head.S: Initialise MPAM EL2 registers and disable traps ANBZ: #31060 commit 23b33d1e168cfcc96666f025beb3bccfcb58403a upstream. Add code to head.S's el2_setup to detect MPAM and disable any EL2 traps. This register resets to an unknown value, setting it to the default parititons/pmg before we enable the MMU is the best thing to do. Kexec/kdump will depend on this if the previous kernel left the CPU configured with a restrictive configuration. If linux is booted at the highest implemented exception level el2_setup will clear the enable bit, disabling MPAM. This code can't be enabled until a subsequent patch adds the Kconfig and cpufeature boiler plate. Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-3-joey.gouly@arm.com Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- arch/arm64/include/asm/el2_setup.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 81aa0a4d2817..735bfe239702 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -275,6 +275,19 @@ msr spsr_el2, x0 .endm +.macro __init_el2_mpam + /* Memory Partitioning And Monitoring: disable EL2 traps */ + mrs x1, id_aa64pfr0_el1 + ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 + cbz x0, .Lskip_mpam_\@ // skip if no MPAM + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 +.Lskip_mpam_\@: +.endm + /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. 
Note that everything gets initialised as @@ -293,6 +306,7 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr + __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt -- Gitee From 856904c30a3acb54b3c6b71bae0ea094a39644df Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:13 +0000 Subject: [PATCH 008/124] arm64: cpufeature: discover CPU support for MPAM ANBZ: #31060 commit 09e6b306f3bad803a9743e40da6a644d66d19928 upstream. ARMv8.4 adds support for 'Memory Partitioning And Monitoring' (MPAM) which describes an interface to cache and bandwidth controls wherever they appear in the system. Add support to detect MPAM. Like SVE, MPAM has an extra id register that describes some more properties, including the virtualisation support, which is optional. Detect this separately so we can detect mismatched/insane systems, but still use MPAM on the host even if the virtualisation support is missing. MPAM needs enabling at the highest implemented exception level, otherwise the register accesses trap. The 'enabled' flag is accessible to lower exception levels, but its in a register that traps when MPAM isn't enabled. The cpufeature 'matches' hook is extended to test this on one of the CPUs, so that firmware can emulate MPAM as disabled if it is reserved for use by secure world. Secondary CPUs that appear late could trip cpufeature's 'lower safe' behaviour after the MPAM properties have been advertised to user-space. Add a verify call to ensure late secondaries match the existing CPUs. 
(If you have a boot failure that bisects here its likely your CPUs advertise MPAM in the id registers, but firmware failed to either enable or MPAM, or emulate the trap as if it were disabled) Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Acked-by: Catalin Marinas Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-4-joey.gouly@arm.com Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- .../arch/arm64/cpu-feature-registers.rst | 2 + arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/cpufeature.h | 17 ++++ arch/arm64/kernel/cpufeature.c | 96 +++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 3 + arch/arm64/tools/cpucaps | 2 + 6 files changed, 121 insertions(+) diff --git a/Documentation/arch/arm64/cpu-feature-registers.rst b/Documentation/arch/arm64/cpu-feature-registers.rst index 44f9bd78539d..253e9743de2f 100644 --- a/Documentation/arch/arm64/cpu-feature-registers.rst +++ b/Documentation/arch/arm64/cpu-feature-registers.rst @@ -152,6 +152,8 @@ infrastructure: +------------------------------+---------+---------+ | DIT | [51-48] | y | +------------------------------+---------+---------+ + | MPAM | [43-40] | n | + +------------------------------+---------+---------+ | SVE | [35-32] | y | +------------------------------+---------+---------+ | GIC | [27-24] | n | diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 6deab764c779..999756176068 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -47,6 +47,7 @@ struct cpuinfo_arm64 { u64 reg_revidr; u64 reg_gmid; u64 reg_smidr; + u64 reg_mpamidr; u64 reg_id_aa64dfr0; u64 reg_id_aa64dfr1; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 4cf52e377142..6169279b0ca6 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -620,6 +620,13 @@ static inline bool id_aa64pfr1_sme(u64 pfr1) return val > 
0; } +static inline bool id_aa64pfr0_mpam(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT); + + return val > 0; +} + static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); @@ -849,6 +856,16 @@ static inline bool system_supports_gcs(void) alternative_has_cap_unlikely(ARM64_HAS_GCS); } +static inline bool system_supports_mpam(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM); +} + +static __always_inline bool system_supports_mpam_hcr(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM_HCR); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1cdd81cf3efc..349eb3ebdaab 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -700,6 +700,14 @@ static const struct arm64_ftr_bits ftr_smcr[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), + ARM64_FTR_END, +}; + /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of @@ -820,6 +828,9 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr), ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), + /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -1146,6 
+1157,9 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) vec_init_vq_map(ARM64_VEC_SME); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1412,6 +1426,11 @@ void update_cpu_features(int cpu, vec_update_vq_map(ARM64_VEC_SME); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the @@ -2402,6 +2421,36 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. 
+ */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2897,6 +2946,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_nv1, ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) }, + { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, #ifdef CONFIG_ARM64_POE { .desc = "Stage-1 Permission Overlay Extension (S1POE)", @@ -3460,6 +3523,36 @@ static void verify_hyp_capabilities(void) } #endif +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if 
(cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. @@ -3486,6 +3579,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f8adc6c92652..c4512a59be79 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -505,6 +505,9 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 8686cff7063d..3123696a5fd4 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -60,6 +60,8 @@ HW_DBM KVM_HVHE KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE +MPAM +MPAM_HCR MTE MTE_ASYMM SME -- Gitee From 98d7a4e75df013dc67ea3e7dd0510524e816491d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:14 +0000 Subject: [PATCH 009/124] KVM: arm64: Fix missing traps of guest accesses to the MPAM registers ANBZ: #31060 commit 31ff96c38ea393d9707f1d95b4bf8d372cf32177 upstream. commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to guests, but didn't add trap handling. If you are unlucky, this results in an MPAM aware guest being delivered an undef during boot. 
The host prints: | kvm [97]: Unsupported guest sys_reg access at: ffff800080024c64 [00000005] | { Op0( 3), Op1( 0), CRn(10), CRm( 5), Op2( 0), func_read }, Which results in: | Internal error: Oops - Undefined instruction: 0000000002000000 [#1] PREEMPT SMP | Modules linked in: | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.6.0-rc7-00559-gd89c186d50b2 #14616 | Hardware name: linux,dummy-virt (DT) | pstate: 00000005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : test_has_mpam+0x18/0x30 | lr : test_has_mpam+0x10/0x30 | sp : ffff80008000bd90 ... | Call trace: | test_has_mpam+0x18/0x30 | update_cpu_capabilities+0x7c/0x11c | setup_cpu_features+0x14/0xd8 | smp_cpus_done+0x24/0xb8 | smp_init+0x7c/0x8c | kernel_init_freeable+0xf8/0x280 | kernel_init+0x24/0x1e0 | ret_from_fork+0x10/0x20 | Code: 910003fd 97ffffde 72001c00 54000080 (d538a500) | ---[ end trace 0000000000000000 ]--- | Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b | ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]--- Add the support to enable the traps, and handle the three guest accessible registers by injecting an UNDEF. This stops KVM from spamming the host log, but doesn't yet hide the feature from the id registers. With MPAM v1.0 we can trap the MPAMIDR_EL1 register only if ARM64_HAS_MPAM_HCR, with v1.1 an additional MPAM2_EL2.TIDR bit traps MPAMIDR_EL1 on platforms that don't have MPAMHCR_EL2. Enable one of these if either is supported. If neither is supported, the guest can discover that the CPU has MPAM support, and how many PARTID etc the host has ... but it can't influence anything, so its harmless. 
Fixes: 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") CC: Anshuman Khandual Link: https://lore.kernel.org/linux-arm-kernel/20200925160102.118858-1-james.morse@arm.com/ Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-5-joey.gouly@arm.com Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/hyp/include/hyp/switch.h | 31 +++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 14 +++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 6169279b0ca6..dc636045a9ea 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -856,7 +856,7 @@ static inline bool system_supports_gcs(void) alternative_has_cap_unlikely(ARM64_HAS_GCS); } -static inline bool system_supports_mpam(void) +static __always_inline bool system_supports_mpam(void) { return alternative_has_cap_unlikely(ARM64_MPAM); } diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index c11010965b8e..96c7aad7cc43 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -108,6 +108,7 @@ #define HCRX_GUEST_FLAGS (HCRX_EL2_SMPME | HCRX_EL2_TCR2En) #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) +#define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 2b6c9724d55f..64b489177b27 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -172,6 +172,35 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) 
write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2); } +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -212,6 +241,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) } __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -231,6 +261,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3c2705d4f218..155d5975ffdb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2471,8 +2471,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { 
SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, @@ -2743,6 +2746,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(MAIR_EL2, access_rw, reset_val, 0), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), -- Gitee From cfec6b917c56827447bc18b0e0c323fa90c6eedd Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:15 +0000 Subject: [PATCH 010/124] KVM: arm64: Add a macro for creating filtered sys_reg_descs entries ANBZ: #31060 commit 7da540e29dea6016ed55d16450d3133c70761d21 upstream. The sys_reg_descs array holds function pointers and reset value for managing the user-space and guest view of system registers. These are mostly created by a set of macro's as only some combinations of behaviour are needed. If a register needs special treatment, its sys_reg_descs entry is open-coded. This is true of some id registers where the value provided by user-space is validated by some helpers. Before adding another one of these, add a helper that covers the existing special cases. 'ID_FILTERED' expects helpers to set the user-space value, and retrieve the modified reset value. Like ID_WRITABLE() this uses id_visibility(), which should have no functional change for the registers converted to use ID_FILTERED(). 
read_sanitised_id_aa64dfr0_el1() and read_sanitised_id_aa64pfr0_el1() have been refactored to be called from kvm_read_sanitised_id_reg(), to try be consistent with ID_WRITABLE(). Signed-off-by: James Morse Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-6-joey.gouly@arm.com [ changes of backport ] Drop ID_AA64DFR0_EL1_DoubleLock_MASK and ID_AA64DFR0_EL1_WRPs_MASK for id_aa64dfr0_el1 ID_FILTERED, as ANCK 6.6 doesn't support them. Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 63 +++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 155d5975ffdb..700610ee79ad 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1523,6 +1523,9 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) @@ -1536,6 +1539,12 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = read_sanitised_ftr_reg(id); switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR0_EL1: + val = sanitise_id_aa64pfr0_el1(vcpu, val); + break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); @@ -1664,11 +1673,8 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = 
read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; @@ -1709,11 +1715,8 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, (val); \ }) -static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* @@ -1806,6 +1809,12 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 val) +{ + return set_id_reg(vcpu, rd, val); +} + /* * cpufeature ID register user accessors * @@ -2130,6 +2139,15 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .val = mask, \ } +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = (mask), \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 @@ -2309,17 +2327,14 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_reg, - .reset = read_sanitised_id_aa64pfr0_el1, - .val = ~(ID_AA64PFR0_EL1_AMU | - ID_AA64PFR0_EL1_SVE | - ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | - ID_AA64PFR0_EL1_AdvSIMD | - ID_AA64PFR0_EL1_FP), }, + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_GIC | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), 
ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | @@ -2341,13 +2356,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_aa64dfr0_el1, - .reset = read_sanitised_id_aa64dfr0_el1, - .val = ID_AA64DFR0_EL1_PMUVer_MASK | - ID_AA64DFR0_EL1_DebugVer_MASK, }, + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), -- Gitee From 7886697d7fde0cf7ea90e2554da694b32f80c1d2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:16 +0000 Subject: [PATCH 011/124] KVM: arm64: Disable MPAM visibility by default and ignore VMM writes ANBZ: #31060 commit 6685f5d572c22e1003e7c0d089afe1c64340ab1f upstream. commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to guests, but didn't add trap handling. A previous patch supplied the missing trap handling. Existing VMs that have the MPAM field of ID_AA64PFR0_EL1 set need to be migratable, but there is little point enabling the MPAM CPU interface on new VMs until there is something a guest can do with it. Clear the MPAM field from the guest's ID_AA64PFR0_EL1 and on hardware that supports MPAM, politely ignore the VMMs attempts to set this bit. Guests exposed to this bug have the sanitised value of the MPAM field, so only the correct value needs to be ignored. This means the field can continue to be used to block migration to incompatible hardware (between MPAM=1 and MPAM=5), and the VMM can't rely on the field being ignored. 
Signed-off-by: James Morse Co-developed-by: Joey Gouly Signed-off-by: Joey Gouly Reviewed-by: Gavin Shan Tested-by: Shameer Kolothum Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-7-joey.gouly@arm.com Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 45 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 700610ee79ad..fc19351666bb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1558,6 +1558,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1702,6 +1703,13 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) val &= ~ID_AA64PFR0_EL1_AMU_MASK; + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + return val; } @@ -1810,9 +1818,39 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, } static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, u64 val) + const struct sys_reg_desc *rd, u64 user_val) { - return set_id_reg(vcpu, rd, val); + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. 
+ * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + return set_id_reg(vcpu, rd, user_val); } /* @@ -2335,7 +2373,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP)), - ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | ID_AA64PFR1_EL1_THE | -- Gitee From da550b18f17e2ee9c2b46edd2735d80c3fbf8f08 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 30 Oct 2024 16:03:17 +0000 Subject: [PATCH 012/124] KVM: arm64: selftests: Test ID_AA64PFR0.MPAM isn't completely ignored ANBZ: #31060 commit 75cd027cbcb161e277209e20df14f0818c62d9e7 upstream. The ID_AA64PFR0.MPAM bit was previously accidentally exposed to guests, and is ignored by KVM. KVM will always present the guest with 0 here, and trap the MPAM system registers to inject an undef. But, this value is still needed to prevent migration when the value is incompatible with the target hardware. Add a kvm unit test to try and write multiple values to ID_AA64PFR0.MPAM. Only the hardware value previously exposed should be ignored, all other values should be rejected. 
Signed-off-by: James Morse Signed-off-by: Joey Gouly Tested-by: Shameer Kolothum Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241030160317.2528209-8-joey.gouly@arm.com Signed-off-by: Oliver Upton Signed-off-by: Wei Chen --- .../selftests/kvm/aarch64/set_id_regs.c | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 3a3fffb07956..e37a10ceec90 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -428,6 +428,101 @@ static void test_user_set_reg(struct kvm_vcpu *vcpu, bool aarch64_only) } } +#define MPAM_IDREG_TEST 6 +static void test_user_set_mpam_reg(struct kvm_vcpu *vcpu) +{ + uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + struct reg_mask_range range = { + .addr = (__u64)masks, + }; + uint64_t val; + int idx, err; + + /* + * If ID_AA64PFR0.MPAM is _not_ officially modifiable and is zero, + * check that if it can be set to 1, (i.e. it is supported by the + * hardware), that it can't be set to other values. + */ + + /* Get writable masks for feature ID registers */ + memset(range.reserved, 0, sizeof(range.reserved)); + vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + + /* Writeable? Nothing to test! */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR0_EL1); + if ((masks[idx] & ID_AA64PFR0_EL1_MPAM_MASK) == ID_AA64PFR0_EL1_MPAM_MASK) { + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + /* Try to set MPAM=0. This should always be possible. 
*/ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=0 worked\n"); + + /* Try to set MPAM=1 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR0_EL1.MPAM is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM=1 was writable\n"); + + /* Try to set MPAM=2 */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + val |= FIELD_PREP(ID_AA64PFR0_EL1_MPAM_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM value should not be ignored\n"); + + /* And again for ID_AA64PFR1_EL1.MPAM_frac */ + idx = encoding_to_range_idx(SYS_ID_AA64PFR1_EL1); + if ((masks[idx] & ID_AA64PFR1_EL1_MPAM_frac_MASK) == ID_AA64PFR1_EL1_MPAM_frac_MASK) { + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is officially writable, nothing to test\n"); + return; + } + + /* Get the id register value */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), &val); + + /* Try to set MPAM_frac=0. This should always be possible. 
*/ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 0); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_fail("ID_AA64PFR0_EL1.MPAM_frac=0 was not accepted\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=0 worked\n"); + + /* Try to set MPAM_frac=1 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 1); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_skip("ID_AA64PFR1_EL1.MPAM_frac is not writable, nothing to test\n"); + else + ksft_test_result_pass("ID_AA64PFR0_EL1.MPAM_frac=1 was writable\n"); + + /* Try to set MPAM_frac=2 */ + val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + val |= FIELD_PREP(ID_AA64PFR1_EL1_MPAM_frac_MASK, 2); + err = __vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1), val); + if (err) + ksft_test_result_pass("ID_AA64PFR1_EL1.MPAM_frac not arbitrarily modifiable\n"); + else + ksft_test_result_fail("ID_AA64PFR1_EL1.MPAM_frac value should not be ignored\n"); +} + static void test_guest_reg_read(struct kvm_vcpu *vcpu) { bool done = false; @@ -479,11 +574,14 @@ int main(void) ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs); + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + + MPAM_IDREG_TEST; ksft_set_plan(ftr_cnt); test_user_set_reg(vcpu, aarch64_only); + test_user_set_mpam_reg(vcpu); + test_guest_reg_read(vcpu); kvm_vm_free(vm); -- Gitee From 066d227c7927a67ec2ebdc9343a14b4a83f254f4 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Mon, 2 Jun 2025 12:33:21 +0800 Subject: [PATCH 013/124] arm64: Add override for MPAM ANBZ: #31060 commit 10f885d63a0efd50b0d22bf27eb3cf727838e99e upstream. 
As the message of the commit 09e6b306f3ba ("arm64: cpufeature: discover CPU support for MPAM") already states, if a buggy firmware fails to either enable MPAM or emulate the trap as if it were disabled, the kernel will just fail to boot. While upgrading the firmware should be the best solution, we have some hardware of which the vendor have made no response 2 months after we requested a firmware update. Allow overriding it so our devices don't become some e-waste. Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Cc: Shameer Kolothum Cc: Mingcong Bai Cc: Shaopeng Tan Cc: Ben Horgan Signed-off-by: Xi Ruoyao Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250602043723.216338-1-xry111@xry111.site Signed-off-by: Will Deacon Signed-off-by: Wei Chen --- .../admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/include/asm/el2_setup.h | 24 ++++++++----------- arch/arm64/kernel/cpufeature.c | 7 ++++-- arch/arm64/kernel/cpuinfo.c | 7 ++++-- arch/arm64/kernel/idreg-override.c | 3 +++ 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 506c7e7b8b89..beda032be1dc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -426,6 +426,9 @@ arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support + arm64.nompam [ARM64] Unconditionally disable Memory Partitioning And + Monitoring support + arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension support diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 735bfe239702..a00e9d695273 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -275,19 +275,6 @@ msr spsr_el2, x0 .endm -.macro __init_el2_mpam - /* Memory Partitioning And Monitoring: disable EL2 traps */ - mrs x1, id_aa64pfr0_el1 - ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, 
#4 - cbz x0, .Lskip_mpam_\@ // skip if no MPAM - msr_s SYS_MPAM2_EL2, xzr // use the default partition - // and disable lower traps - mrs_s x0, SYS_MPAMIDR_EL1 - tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg - msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 -.Lskip_mpam_\@: -.endm - /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -306,7 +293,6 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr - __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt @@ -351,6 +337,16 @@ #endif .macro finalise_el2_state + check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 + +.Linit_mpam_\@: + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 + +.Lskip_mpam_\@: check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 349eb3ebdaab..5e26020a66e5 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1157,8 +1157,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) vec_init_vq_map(ARM64_VEC_SME); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1426,7 +1428,8 @@ void update_cpu_features(int cpu, vec_update_vq_map(ARM64_VEC_SME); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + if 
(id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index c4512a59be79..1fc0a147e76c 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -505,8 +505,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) - info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 3addc09f8746..9f1a845c892b 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -75,6 +75,7 @@ static const struct ftr_set_desc pfr0 __initconst = { .override = &id_aa64pfr0_override, .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -101,6 +102,7 @@ static const struct ftr_set_desc pfr1 __initconst = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -185,6 +187,7 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_nokaslr(char *unused) -- Gitee From 521b91c82f3804c9ab2d5fc349e00d86a6810618 Mon Sep 17 
00:00:00 2001 From: Punit Agrawal Date: Wed, 7 Feb 2024 19:54:20 +0000 Subject: [PATCH 014/124] ACPICA: MPAM: Correct the typo in struct acpi_mpam_msc_node member ANBZ: #31060 commit cf94e10a037c337559bf6c5486cc1abdf4900a08 upstream. ACPICA commit 3da3f7d776d17e9bfbb15de88317de8d7397ce38 A member of the struct acpi_mpam_msc_node that represents a Memory System Controller node structure - num_resource_nodes has a typo. Fix the typo No functional change. Link: https://github.com/acpica/acpica/commit/3da3f7d7 Signed-off-by: Punit Agrawal Signed-off-by: Rafael J. Wysocki Signed-off-by: Wei Chen --- include/acpi/actbl2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 1414bc008257..2fd68292f944 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -1637,7 +1637,7 @@ struct acpi_mpam_msc_node { u32 max_nrdy_usec; u64 hardware_id_linked_device; u32 instance_id_linked_device; - u32 num_resouce_nodes; + u32 num_resource_nodes; }; struct acpi_table_mpam { -- Gitee From f27e7d8442a6492d51008fc82710c294805d5eb0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:31 +0000 Subject: [PATCH 015/124] ACPI / PPTT: Add a helper to fill a cpumask from a processor container ANBZ: #31060 commit 796e29b857aed89f83f70f2c199585c45db5dc0f upstream. The ACPI MPAM table uses the UID of a processor container specified in the PPTT to indicate the subset of CPUs and cache topology that can access each MPAM System Component (MSC). This information is not directly useful to the kernel. The equivalent cpumask is needed instead. Add a helper to find the processor container by its id, then walk the possible CPUs to fill a cpumask with the CPUs that have this processor container as a parent. 
CC: Dave Martin Reviewed-by: Sudeep Holla Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Hanjun Guo Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/acpi/pptt.c | 84 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 3 ++ 2 files changed, 87 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 54676e3d82dd..b8248c0092fe 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -817,3 +817,87 @@ int find_acpi_cpu_topology_hetero_id(unsigned int cpu) return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, ACPI_PPTT_ACPI_IDENTICAL); } + +/** + * acpi_pptt_get_child_cpus() - Find all the CPUs below a PPTT + * processor hierarchy node + * + * @table_hdr: A reference to the PPTT table + * @parent_node: A pointer to the processor hierarchy node in the + * table_hdr + * @cpus: A cpumask to fill with the CPUs below @parent_node + * + * Walks up the PPTT from every possible CPU to find if the provided + * @parent_node is a parent of this CPU. 
+ */ +static void acpi_pptt_get_child_cpus(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *parent_node, + cpumask_t *cpus) +{ + struct acpi_pptt_processor *cpu_node; + u32 acpi_id; + int cpu; + + cpumask_clear(cpus); + + for_each_possible_cpu(cpu) { + acpi_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table_hdr, acpi_id); + + while (cpu_node) { + if (cpu_node == parent_node) { + cpumask_set_cpu(cpu, cpus); + break; + } + cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); + } + } +} + +/** + * acpi_pptt_get_cpus_from_container() - Populate a cpumask with all CPUs in a + * processor container + * @acpi_cpu_id: The UID of the processor container + * @cpus: The resulting CPU mask + * + * Find the specified Processor Container, and fill @cpus with all the cpus + * below it. + * + * Not all 'Processor Hierarchy' entries in the PPTT are either a CPU + * or a Processor Container, they may exist purely to describe a + * Private resource. CPUs have to be leaves, so a Processor Container + * is a non-leaf that has the 'ACPI Processor ID valid' flag set. 
+ */ +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) +{ + struct acpi_table_header *table_hdr; + struct acpi_subtable_header *entry; + unsigned long table_end; + u32 proc_sz; + + cpumask_clear(cpus); + + table_hdr = acpi_get_pptt(); + if (!table_hdr) + return; + + table_end = (unsigned long)table_hdr + table_hdr->length; + entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, + sizeof(struct acpi_table_pptt)); + proc_sz = sizeof(struct acpi_pptt_processor); + while ((unsigned long)entry + proc_sz <= table_end) { + if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) { + struct acpi_pptt_processor *cpu_node; + + cpu_node = (struct acpi_pptt_processor *)entry; + if (cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID && + !acpi_pptt_leaf_node(table_hdr, cpu_node) && + cpu_node->acpi_processor_id == acpi_cpu_id) { + acpi_pptt_get_child_cpus(table_hdr, cpu_node, cpus); + break; + } + } + entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, + entry->length); + } +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2131bf9f5bc1..29dc5527ba4c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1504,6 +1504,7 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1525,6 +1526,8 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } #endif #ifdef CONFIG_ARM64 -- Gitee From d9e413f1a6f1c0953ef079e57786cc7da970e8cf Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:32 +0000 Subject: [PATCH 016/124] ACPI / PPTT: Stop acpi_count_levels() expecting callers to clear 
levels ANBZ: #31060 commit eeec7845e966f9278973c02573e3587e6733a4dd upstream. In acpi_count_levels(), the initial value of *levels passed by the caller is really an implementation detail of acpi_count_levels(), so it is unreasonable to expect the callers of this function to know what to pass in for this parameter. The only sensible initial value is 0, which is what the only upstream caller (acpi_get_cache_info()) passes. Use a local variable for the starting cache level in acpi_count_levels(), and pass the result back to the caller via the function return value. Get rid of the levels parameter, which has no remaining purpose. Fix acpi_get_cache_info() to match. Suggested-by: Jonathan Cameron Signed-off-by: James Morse Reviewed-by: Lorenzo Pieralisi Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Hanjun Guo Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/acpi/pptt.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index b8248c0092fe..2856254e29d7 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -177,14 +177,14 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, } /** - * acpi_count_levels() - Given a PPTT table, and a CPU node, count the cache - * levels and split cache levels (data/instruction). + * acpi_count_levels() - Given a PPTT table, and a CPU node, count the + * total number of levels and split cache levels (data/instruction). * @table_hdr: Pointer to the head of the PPTT table * @cpu_node: processor node we wish to count caches for - * @levels: Number of levels if success. * @split_levels: Number of split cache levels (data/instruction) if * success. 
Can by NULL. * + * Return: number of levels. * Given a processor node containing a processing unit, walk into it and count * how many levels exist solely for it, and then walk up each level until we hit * the root node (ignore the package level because it may be possible to have @@ -192,14 +192,18 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, * split cache levels (data/instruction) that exist at each level on the way * up. */ -static void acpi_count_levels(struct acpi_table_header *table_hdr, - struct acpi_pptt_processor *cpu_node, - unsigned int *levels, unsigned int *split_levels) +static int acpi_count_levels(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu_node, + unsigned int *split_levels) { + int current_level = 0; + do { - acpi_find_cache_level(table_hdr, cpu_node, levels, split_levels, 0, 0); + acpi_find_cache_level(table_hdr, cpu_node, ¤t_level, split_levels, 0, 0); cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); } while (cpu_node); + + return current_level; } /** @@ -645,7 +649,7 @@ int acpi_get_cache_info(unsigned int cpu, unsigned int *levels, if (!cpu_node) return -ENOENT; - acpi_count_levels(table, cpu_node, levels, split_levels); + *levels = acpi_count_levels(table, cpu_node, split_levels); pr_debug("Cache Setup: last_level=%d split_levels=%d\n", *levels, split_levels ? *split_levels : -1); -- Gitee From e0049bad8e2579011110d2562693f03f6f104385 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:33 +0000 Subject: [PATCH 017/124] ACPI / PPTT: Add acpi_pptt_cache_v1_full to use pptt cache as one structure ANBZ: #31060 commit cfc085af8398479e855b86236a21e1d870d51184 upstream. In actbl2.h, acpi_pptt_cache describes the fields in the original Cache Type Structure. In PPTT table version 3 a new field was added at the end, cache_id. This is described in acpi_pptt_cache_v1 but rather than including all v1 fields it just includes this one. 
In lieu of this being fixed in acpica, introduce acpi_pptt_cache_v1_full to contain all the fields of the Cache Type Structure . Update the existing code to use this new struct. This simplifies the code and removes a non-standard use of ACPI_ADD_PTR. Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Hanjun Guo Reviewed-by: Jeremy Linton Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/acpi/pptt.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 2856254e29d7..ef39b176dc00 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -21,6 +21,25 @@ #include #include +/* + * The acpi_pptt_cache_v1 in actbl2.h, which is imported from acpica, + * only contains the cache_id field rather than all the fields of the + * Cache Type Structure. Use this alternative structure until it is + * resolved in acpica. 
+ */ +struct acpi_pptt_cache_v1_full { + struct acpi_subtable_header header; + u16 reserved; + u32 flags; + u32 next_level_of_cache; + u32 size; + u32 number_of_sets; + u8 associativity; + u8 attributes; + u16 line_size; + u32 cache_id; +} __packed; + static struct acpi_subtable_header *fetch_pptt_subtable(struct acpi_table_header *table_hdr, u32 pptt_ref) { @@ -56,6 +75,18 @@ static struct acpi_pptt_cache *fetch_pptt_cache(struct acpi_table_header *table_ return (struct acpi_pptt_cache *)fetch_pptt_subtable(table_hdr, pptt_ref); } +static struct acpi_pptt_cache_v1_full *upgrade_pptt_cache(struct acpi_pptt_cache *cache) +{ + if (cache->header.length < sizeof(struct acpi_pptt_cache_v1_full)) + return NULL; + + /* No use for v1 if the only additional field is invalid */ + if (!(cache->flags & ACPI_PPTT_CACHE_ID_VALID)) + return NULL; + + return (struct acpi_pptt_cache_v1_full *)cache; +} + static struct acpi_subtable_header *acpi_get_pptt_resource(struct acpi_table_header *table_hdr, struct acpi_pptt_processor *node, int resource) @@ -355,7 +386,6 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta * @this_leaf: Kernel cache info structure being updated * @found_cache: The PPTT node describing this cache instance * @cpu_node: A unique reference to describe this cache instance - * @revision: The revision of the PPTT table * * The ACPI spec implies that the fields in the cache structures are used to * extend and correct the information probed from the hardware. 
Lets only @@ -365,10 +395,9 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta */ static void update_cache_properties(struct cacheinfo *this_leaf, struct acpi_pptt_cache *found_cache, - struct acpi_pptt_processor *cpu_node, - u8 revision) + struct acpi_pptt_processor *cpu_node) { - struct acpi_pptt_cache_v1* found_cache_v1; + struct acpi_pptt_cache_v1_full *found_cache_v1; this_leaf->fw_token = cpu_node; if (found_cache->flags & ACPI_PPTT_SIZE_PROPERTY_VALID) @@ -418,9 +447,8 @@ static void update_cache_properties(struct cacheinfo *this_leaf, found_cache->flags & ACPI_PPTT_CACHE_TYPE_VALID) this_leaf->type = CACHE_TYPE_UNIFIED; - if (revision >= 3 && (found_cache->flags & ACPI_PPTT_CACHE_ID_VALID)) { - found_cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, - found_cache, sizeof(struct acpi_pptt_cache)); + found_cache_v1 = upgrade_pptt_cache(found_cache); + if (found_cache_v1) { this_leaf->id = found_cache_v1->cache_id; this_leaf->attributes |= CACHE_ID; } @@ -445,8 +473,7 @@ static void cache_setup_acpi_cpu(struct acpi_table_header *table, pr_debug("found = %p %p\n", found_cache, cpu_node); if (found_cache) update_cache_properties(this_leaf, found_cache, - ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table)), - table->revision); + ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table))); index++; } -- Gitee From 7478593c52069a1b887f018210485480510694d6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:34 +0000 Subject: [PATCH 018/124] ACPI / PPTT: Find cache level by cache-id ANBZ: #31060 commit 41a7bb39fede8ecc053c261b86cdfadea45b7b10 upstream. The MPAM table identifies caches by id. The MPAM driver also wants to know the cache level to determine if the platform is of the shape that can be managed via resctrl. Cacheinfo has this information, but only for CPUs that are online. Waiting for all CPUs to come online is a problem for platforms where CPUs are brought online late by user-space. 
Add a helper that walks every possible cache, until it finds the one identified by cache-id, then return the level. Signed-off-by: James Morse Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/acpi/pptt.c | 66 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 5 ++++ 2 files changed, 71 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index ef39b176dc00..da49b56a1ef2 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -932,3 +932,69 @@ void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) entry->length); } } + +/** + * find_acpi_cache_level_from_id() - Get the level of the specified cache + * @cache_id: The id field of the cache + * + * Determine the level relative to any CPU for the cache identified by + * cache_id. This allows the property to be found even if the CPUs are offline. + * + * The returned level can be used to group caches that are peers. + * + * The PPTT table must be rev 3 or later. + * + * If one CPU's L2 is shared with another CPU as L3, this function will return + * an unpredictable value. + * + * Return: -ENOENT if the PPTT doesn't exist, the revision isn't supported or + * the cache cannot be found. + * Otherwise returns a value which represents the level of the specified cache. 
+ */ +int find_acpi_cache_level_from_id(u32 cache_id) +{ + int cpu; + struct acpi_table_header *table; + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + return level; + } + level++; + } while (!empty); + } + + return -ENOENT; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 29dc5527ba4c..09d8c915ab3b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1505,6 +1505,7 @@ int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1528,6 +1529,10 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) } static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} #endif #ifdef CONFIG_ARM64 -- Gitee From 1b8fae113b20f3002ce996ef93d5fbf403f65784 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:35 +0000 Subject: [PATCH 019/124] ACPI / PPTT: Add a 
helper to fill a cpumask from a cache_id ANBZ: #31060 commit a39a723a6f1ed9a1602ccf8dd56392402afa7339 upstream. MPAM identifies CPUs by the cache_id in the PPTT cache structure. The driver needs to know which CPUs are associated with the cache. The CPUs may not all be online, so cacheinfo does not have the information. Add a helper to pull this information out of the PPTT. CC: Rohit Mathew Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/acpi/pptt.c | 65 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 6 ++++ 2 files changed, 71 insertions(+) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index da49b56a1ef2..de5f8c018333 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -998,3 +998,68 @@ int find_acpi_cache_level_from_id(u32 cache_id) return -ENOENT; } + +/** + * acpi_pptt_get_cpumask_from_cache_id() - Get the cpus associated with the + * specified cache + * @cache_id: The id field of the cache + * @cpus: Where to build the cpumask + * + * Determine which CPUs are below this cache in the PPTT. This allows the property + * to be found even if the CPUs are offline. + * + * The PPTT table must be rev 3 or later, + * + * Return: -ENOENT if the PPTT doesn't exist, or the cache cannot be found. + * Otherwise returns 0 and sets the cpus in the provided cpumask. 
+ */ +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) +{ + int cpu; + struct acpi_table_header *table; + + cpumask_clear(cpus); + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + cpumask_set_cpu(cpu, cpus); + } + level++; + } while (!empty); + } + + return 0; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 09d8c915ab3b..7dcdfe0d19af 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1506,6 +1506,7 @@ int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1533,6 +1534,11 @@ static inline int find_acpi_cache_level_from_id(u32 cache_id) { return -ENOENT; } +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif #ifdef CONFIG_ARM64 -- Gitee From 4472f75ce419fa6e124a0900cc9635412a0eef2c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:36 +0000 Subject: [PATCH 020/124] 
arm64: kconfig: Add Kconfig entry for MPAM ANBZ: #31060 commit d8bf01d80919e81a06dca77556dcfb351fa99b0c upstream. The bulk of the MPAM driver lives outside the arch code because it largely manages MMIO devices that generate interrupts. The driver needs a Kconfig symbol to enable it. As MPAM is only found on arm64 platforms, the arm64 tree is the most natural home for the Kconfig option. This Kconfig option will later be used by the arch code to enable or disable the MPAM context-switch code, and to register properties of CPUs with the MPAM driver. Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo CC: Dave Martin Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2ebdd77b0b25..ed61863089ea 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2093,6 +2093,29 @@ config ARM64_TLB_RANGE The feature introduces new assembly instructions, and they were support when binutils >= 2.30. +config ARM64_MPAM + bool "Enable support for MPAM" + help + Memory System Resource Partitioning and Monitoring (MPAM) is an + optional extension to the Arm architecture that allows each + transaction issued to the memory system to be labelled with a + Partition identifier (PARTID) and Performance Monitoring Group + identifier (PMG). + + Memory system components, such as the caches, can be configured with + policies to control how much of various physical resources (such as + memory bandwidth or cache memory) the transactions labelled with each + PARTID can consume. 
Depending on the capabilities of the hardware, + the PARTID and PMG can also be used as filtering criteria to measure + the memory system resource consumption of different parts of a + workload. + + Use of this extension requires CPU support, support in the + Memory System Components (MSC), and a description from firmware + of where the MSCs are in the address space. + + MPAM is exposed to user-space via the resctrl pseudo filesystem. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" -- Gitee From fec7c251f42de85edb4c02b9ac2696c30c7a8e55 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:37 +0000 Subject: [PATCH 021/124] platform: Define platform_device_put cleanup handler ANBZ: #31060 commit f5915600cc4ca0338a37d5a8a4032e25d939156b upstream. Define a cleanup helper for use with __free to destroy platform devices automatically when the pointer goes out of scope. This is only intended to be used in error cases and so should be used with return_ptr() or no_free_ptr() directly to avoid the automatic destruction on success. A first use of this is introduced in a subsequent commit. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- include/linux/platform_device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7a41c72c1959..1ddc35623b4c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -232,6 +232,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); -- Gitee From ce003133a91ba14ba97c24a60fed56cf95ea6c7b Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:38 +0000 Subject: [PATCH 022/124] ACPI: Define acpi_put_table cleanup handler and acpi_get_table_pointer() helper ANBZ: #31060 commit 96f4a4d53e6660d9b62e8d739388267fbb660e9f upstream. Define a cleanup helper for use with __free to release the acpi table when the pointer goes out of scope. Also, introduce the helper acpi_get_table_pointer() to simplify a commonly used pattern involving acpi_get_table(). These are first used in a subsequent commit. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- include/linux/acpi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7dcdfe0d19af..a99d3a052a81 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include #include #include /* for struct resource */ #include @@ -224,6 +225,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, -- Gitee From dfa3d288964076169e6a9c2b29f28fb95253caa8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:39 +0000 Subject: [PATCH 023/124] ACPI / MPAM: Parse the MPAM table ANBZ: #31060 commit 115c5325beae7199219ab7c12ec2a2af8dea6c3c upstream. Add code to parse the arm64 specific MPAM table, looking up the cache level from the PPTT and feeding the end result into the MPAM driver. This happens in two stages. Platform devices are created first for the MSC devices. Once the driver probes it calls acpi_mpam_parse_resources() to discover the RIS entries the MSC contains. 
For now the MPAM hook mpam_ris_create() is stubbed out, but will update the MPAM driver with optional discovered data about the RIS entries. CC: Carl Worth Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en Reviewed-by: Lorenzo Pieralisi Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 1 + drivers/acpi/arm64/Kconfig | 3 + drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/mpam.c | 411 ++++++++++++++++++++++++++++++++++++ drivers/acpi/tables.c | 2 +- include/linux/arm_mpam.h | 47 +++++ 6 files changed, 464 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/arm64/mpam.c create mode 100644 include/linux/arm_mpam.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ed61863089ea..5f2dc989f6f2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2095,6 +2095,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index b3ed6212244c..f2fd79f22e7d 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -21,3 +21,6 @@ config ACPI_AGDI config ACPI_APMT bool + +config ACPI_MPAM + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 726944648c9b..a9e9c7e89b51 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_ACPI_AGDI) += agdi.o obj-$(CONFIG_ACPI_IORT) += iort.o obj-$(CONFIG_ACPI_GTDT) += gtdt.o +obj-$(CONFIG_ACPI_MPAM) += mpam.o 
obj-$(CONFIG_ACPI_APMT) += apmt.o obj-$(CONFIG_ARM_AMBA) += amba.o obj-y += dma.o init.o diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c new file mode 100644 index 000000000000..84963a20c3e7 --- /dev/null +++ b/drivers/acpi/arm64/mpam.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ + +#define pr_fmt(fmt) "ACPI MPAM: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Flags for acpi_table_mpam_msc.*_interrupt_flags. + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IRQ_MODE BIT(0) +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK BIT(3) +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) + +/* + * Encodings for the MSC node body interface type field. + * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet. 
+ */ +#define ACPI_MPAM_MSC_IFACE_MMIO 0x00 +#define ACPI_MPAM_MSC_IFACE_PCC 0x0a + +static bool _is_ppi_partition(u32 flags) +{ + u32 aff_type, is_ppi; + bool ret; + + is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags); + if (!is_ppi) + return false; + + aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags); + ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER); + if (ret) + pr_err_once("Partitioned interrupts not supported\n"); + + return ret; +} + +static int acpi_mpam_register_irq(struct platform_device *pdev, + u32 intid, u32 flags) +{ + int irq; + u32 int_type; + int trigger; + + if (!intid) + return -EINVAL; + + if (_is_ppi_partition(flags)) + return -EINVAL; + + trigger = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE, flags); + int_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags); + if (int_type != ACPI_MPAM_MSC_IRQ_TYPE_WIRED) + return -EINVAL; + + irq = acpi_register_gsi(&pdev->dev, intid, trigger, ACPI_ACTIVE_HIGH); + if (irq < 0) + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", intid); + + return irq; +} + +static void acpi_mpam_parse_irqs(struct platform_device *pdev, + struct acpi_mpam_msc_node *tbl_msc, + struct resource *res, int *res_idx) +{ + u32 flags, intid; + int irq; + + intid = tbl_msc->overflow_interrupt; + flags = tbl_msc->overflow_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); + + intid = tbl_msc->error_interrupt; + flags = tbl_msc->error_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); +} + +static int acpi_mpam_parse_resource(struct mpam_msc *msc, + struct acpi_mpam_resource_node *res) +{ + int level, nid; + u32 cache_id; + + switch (res->locator_type) { + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: + cache_id = res->locator.cache_locator.cache_reference; + level = 
find_acpi_cache_level_from_id(cache_id); + if (level <= 0) { + pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); + return -EINVAL; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + level, cache_id); + case ACPI_MPAM_LOCATION_TYPE_MEMORY: + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); + if (nid == NUMA_NO_NODE) { + pr_debug("Bad proximity domain %lld, using node 0 instead\n", + res->locator.memory_locator.proximity_domain); + nid = 0; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, + MPAM_CLASS_ID_DEFAULT, nid); + default: + /* These get discovered later and are treated as unknown */ + return 0; + } +} + +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + int i, err; + char *ptr, *table_end; + struct acpi_mpam_resource_node *resource; + + table_end = (char *)tbl_msc + tbl_msc->length; + ptr = (char *)(tbl_msc + 1); + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { + u64 max_deps, remaining_table; + + if (ptr + sizeof(*resource) > table_end) + return -EINVAL; + + resource = (struct acpi_mpam_resource_node *)ptr; + + remaining_table = table_end - ptr; + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); + if (resource->num_functional_deps > max_deps) { + pr_debug("MSC has impossible number of functional dependencies\n"); + return -EINVAL; + } + + err = acpi_mpam_parse_resource(msc, resource); + if (err) + return err; + + ptr += sizeof(*resource); + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); + } + + return 0; +} + +/* + * Creates the device power management link and returns true if the + * acpi id is valid and usable for cpu affinity. This is the case + * when the linked device is a processor or a processor container. 
+ */ +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, + struct platform_device *pdev, + u32 *acpi_id) +{ + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1] = { 0 }; + bool acpi_id_valid = false; + struct acpi_device *buddy; + char uid[11]; + int len; + + memcpy(hid, &tbl_msc->hardware_id_linked_device, + sizeof(tbl_msc->hardware_id_linked_device)); + + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { + *acpi_id = tbl_msc->instance_id_linked_device; + acpi_id_valid = true; + } + + len = snprintf(uid, sizeof(uid), "%u", + tbl_msc->instance_id_linked_device); + if (len >= sizeof(uid)) { + pr_debug("Failed to convert uid of device for power management."); + return acpi_id_valid; + } + + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); + if (buddy) { + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); + acpi_dev_put(buddy); + } + + return acpi_id_valid; +} + +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, + enum mpam_msc_iface *iface) +{ + switch (tbl_msc->interface_type) { + case ACPI_MPAM_MSC_IFACE_MMIO: + *iface = MPAM_IFACE_MMIO; + return 0; + case ACPI_MPAM_MSC_IFACE_PCC: + *iface = MPAM_IFACE_PCC; + return 0; + default: + return -EINVAL; + } +} + +static struct platform_device * __init acpi_mpam_parse_msc(struct acpi_mpam_msc_node *tbl_msc) +{ + struct platform_device *pdev __free(platform_device_put) = + platform_device_alloc("mpam_msc", tbl_msc->identifier); + int next_res = 0, next_prop = 0, err; + /* pcc, nrdy, affinity and a sentinel */ + struct property_entry props[4] = { 0 }; + /* mmio, 2xirq, no sentinel. 
*/ + struct resource res[3] = { 0 }; + struct acpi_device *companion; + enum mpam_msc_iface iface; + char uid[16]; + u32 acpi_id; + + if (!pdev) + return ERR_PTR(-ENOMEM); + + /* Some power management is described in the namespace: */ + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); + if (err > 0 && err < sizeof(uid)) { + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); + if (companion) { + ACPI_COMPANION_SET(&pdev->dev, companion); + acpi_dev_put(companion); + } else { + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); + } + } + + if (decode_interface_type(tbl_msc, &iface)) { + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); + return ERR_PTR(-EINVAL); + } + + if (iface == MPAM_IFACE_MMIO) { + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, + tbl_msc->mmio_size, + "MPAM:MSC"); + } else if (iface == MPAM_IFACE_PCC) { + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", + tbl_msc->base_address); + } + + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); + + WARN_ON_ONCE(next_res > ARRAY_SIZE(res)); + err = platform_device_add_resources(pdev, res, next_res); + if (err) + return ERR_PTR(err); + + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", + tbl_msc->max_nrdy_usec); + + /* + * The MSC's CPU affinity is described via its linked power + * management device, but only if it points at a Processor or + * Processor Container. + */ + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", acpi_id); + + WARN_ON_ONCE(next_prop > ARRAY_SIZE(props) - 1); + err = device_create_managed_software_node(&pdev->dev, props, NULL); + if (err) + return ERR_PTR(err); + + /* + * Stash the table entry for acpi_mpam_parse_resources() to discover + * what this MSC controls. 
+ */ + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); + if (err) + return ERR_PTR(err); + + err = platform_device_add(pdev); + if (err) + return ERR_PTR(err); + + return_ptr(pdev); +} + +static int __init acpi_mpam_parse(void) +{ + char *table_end, *table_offset; + struct acpi_mpam_msc_node *tbl_msc; + struct platform_device *pdev; + + if (acpi_disabled || !system_supports_mpam()) + return 0; + + struct acpi_table_header *table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_MPAM, 0); + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) { + pr_debug("MPAM ACPI table revision %d not supported\n", table->revision); + return 0; + } + + table_offset = (char *)(table + 1); + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + if (table_offset + sizeof(*tbl_msc) > table_end || + table_offset + tbl_msc->length > table_end) { + pr_err("MSC entry overlaps end of ACPI table\n"); + return -EINVAL; + } + table_offset += tbl_msc->length; + + /* + * If any of the reserved fields are set, make no attempt to + * parse the MSC structure. This MSC will still be counted by + * acpi_mpam_count_msc(), meaning the MPAM driver can't probe + * against all MSC, and will never be enabled. There is no way + * to enable it safely, because we cannot determine safe + * system-wide partid and pmg ranges in this situation. + */ + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { + pr_err_once("Unrecognised MSC, MPAM not usable\n"); + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); + continue; + } + + if (!tbl_msc->mmio_size) { + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); + continue; + } + + pdev = acpi_mpam_parse_msc(tbl_msc); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + } + + return 0; +} + +/** + * acpi_mpam_count_msc() - Count the number of MSC described by firmware. 
 + * + * Returns the number of MSCs, zero when there are none (or MPAM is not + * usable), or a negative error value for a malformed table. + * + * This can be called before or in parallel with acpi_mpam_parse(). + */ +int acpi_mpam_count_msc(void) +{ + char *table_end, *table_offset; + struct acpi_mpam_msc_node *tbl_msc; + int count = 0; + + if (acpi_disabled || !system_supports_mpam()) + return 0; + + struct acpi_table_header *table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_MPAM, 0); + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) + return 0; + + table_offset = (char *)(table + 1); + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + + if (table_offset + sizeof(*tbl_msc) > table_end) + return -EINVAL; + if (tbl_msc->length < sizeof(*tbl_msc)) + return -EINVAL; + if (tbl_msc->length > table_end - table_offset) + return -EINVAL; + table_offset += tbl_msc->length; + + if (!tbl_msc->mmio_size) + continue; + + count++; + } + + return count; +} + +/* + * Call after ACPI devices have been created, which happens behind acpi_scan_init() + * called from subsys_initcall(). PCC requires the mailbox driver, which is + * initialised from postcore_initcall(). 
+ */ +subsys_initcall_sync(acpi_mpam_parse); diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index fdfbfa1f9e19..e23ec976d298 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -432,7 +432,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst = { ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, - ACPI_SIG_NBFT }; + ACPI_SIG_NBFT, ACPI_SIG_MPAM}; #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 000000000000..4b7f335181e0 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include +#include + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. 
SMMU */ +}; + +#define MPAM_CLASS_ID_DEFAULT 255 + +#ifdef CONFIG_ACPI_MPAM +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} + +#endif /* __LINUX_ARM_MPAM_H */ -- Gitee From 63799690835ad6028bdff2e45b08c9a95343e19e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:40 +0000 Subject: [PATCH 024/124] arm_mpam: Add probe/remove for mpam msc driver and kbuild boiler plate ANBZ: #31060 commit f04046f2577a5c76167333ca99d3903ee5331ba0 upstream. Probing MPAM is convoluted. MSCs that are integrated with a CPU may only be accessible from those CPUs, and they may not be online. Touching the hardware early is pointless as MPAM can't be used until the system-wide common values for num_partid and num_pmg have been discovered. Start with driver probe/remove and mapping the MSC. 
Cc: Carl Worth Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas [Adjust the return value of mpam_msc_drv_remove for 6.6 platform driver] Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 1 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/resctrl/Kconfig | 15 +++ drivers/resctrl/Makefile | 4 + drivers/resctrl/mpam_devices.c | 192 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 49 ++++++++ 7 files changed, 264 insertions(+) create mode 100644 drivers/resctrl/Kconfig create mode 100644 drivers/resctrl/Makefile create mode 100644 drivers/resctrl/mpam_devices.c create mode 100644 drivers/resctrl/mpam_internal.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5f2dc989f6f2..fadf4017ae15 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2095,6 +2095,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" + select ARM64_MPAM_DRIVER if EXPERT # does nothing yet select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an diff --git a/drivers/Kconfig b/drivers/Kconfig index efb66e25fa2d..572436262798 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -243,4 +243,6 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" +source "drivers/resctrl/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 5d9e2232267c..15f4df087e2e 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -197,6 +197,7 @@ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ +obj-y += resctrl/ obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig new file mode 100644 index 
000000000000..5f7f748e611e --- /dev/null +++ b/drivers/resctrl/Kconfig @@ -0,0 +1,15 @@ +menuconfig ARM64_MPAM_DRIVER + bool "MPAM driver" + depends on ARM64 && ARM64_MPAM && EXPERT + help + Memory System Resource Partitioning and Monitoring (MPAM) driver for + System IP, e.g. caches and memory controllers. + +if ARM64_MPAM_DRIVER + +config ARM64_MPAM_DRIVER_DEBUG + bool "Enable debug messages from the MPAM driver" + help + Say yes here to enable debug messages from the MPAM driver. + +endif diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile new file mode 100644 index 000000000000..898199dcf80d --- /dev/null +++ b/drivers/resctrl/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o +mpam-y += mpam_devices.o + +ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c new file mode 100644 index 000000000000..c88893e4a6db --- /dev/null +++ b/drivers/resctrl/mpam_devices.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpam_internal.h" + +/* + * mpam_list_lock protects the SRCU lists when writing. Once the + * mpam_enabled key is enabled these lists are read-only, + * unless the error interrupt disables the driver. + */ +static DEFINE_MUTEX(mpam_list_lock); +static LIST_HEAD(mpam_all_msc); + +struct srcu_struct mpam_srcu; + +/* + * Number of MSCs that have been probed. Once all MSCs have been probed MPAM + * can be enabled. + */ +static atomic_t mpam_num_msc; + +/* + * An MSC can control traffic from a set of CPUs, but may only be accessible + * from a (hopefully wider) set of CPUs. The common reason for this is power + * management. 
If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the + * corresponding cache may also be powered off. By making accesses from + * one of those CPUs, we ensure we don't access a cache that's powered off. + */ +static void update_msc_accessibility(struct mpam_msc *msc) +{ + u32 affinity_id; + int err; + + err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", + &affinity_id); + if (err) + cpumask_copy(&msc->accessibility, cpu_possible_mask); + else + acpi_pptt_get_cpus_from_container(affinity_id, &msc->accessibility); +} + +static void mpam_msc_destroy(struct mpam_msc *msc) +{ + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&msc->all_msc_list); + platform_set_drvdata(pdev, NULL); +} + +static int mpam_msc_drv_remove(struct platform_device *pdev) +{ + struct mpam_msc *msc = platform_get_drvdata(pdev); + + mutex_lock(&mpam_list_lock); + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + + synchronize_srcu(&mpam_srcu); + + return 0; +} + +static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + u32 tmp; + struct mpam_msc *msc; + struct resource *msc_res; + struct device *dev = &pdev->dev; + + lockdep_assert_held(&mpam_list_lock); + + msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); + if (!msc) + return ERR_PTR(-ENOMEM); + + err = devm_mutex_init(dev, &msc->probe_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->part_sel_lock); + if (err) + return ERR_PTR(err); + + msc->id = pdev->id; + msc->pdev = pdev; + INIT_LIST_HEAD_RCU(&msc->all_msc_list); + INIT_LIST_HEAD_RCU(&msc->ris); + + update_msc_accessibility(msc); + if (cpumask_empty(&msc->accessibility)) { + dev_err_once(dev, "MSC is not accessible from any CPU!"); + return ERR_PTR(-EINVAL); + } + + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) + msc->iface = MPAM_IFACE_MMIO; + else + msc->iface = MPAM_IFACE_PCC; + + if (msc->iface == MPAM_IFACE_MMIO) { + void 
__iomem *io; + + io = devm_platform_get_and_ioremap_resource(pdev, 0, + &msc_res); + if (IS_ERR(io)) { + dev_err_once(dev, "Failed to map MSC base address\n"); + return ERR_CAST(io); + } + msc->mapped_hwpage_sz = msc_res->end - msc_res->start; + msc->mapped_hwpage = io; + } else { + return ERR_PTR(-EINVAL); + } + + list_add_rcu(&msc->all_msc_list, &mpam_all_msc); + platform_set_drvdata(pdev, msc); + + return msc; +} + +static int fw_num_msc; + +static int mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + struct mpam_msc *msc = NULL; + void *plat_data = pdev->dev.platform_data; + + mutex_lock(&mpam_list_lock); + msc = do_mpam_msc_drv_probe(pdev); + mutex_unlock(&mpam_list_lock); + + if (IS_ERR(msc)) + return PTR_ERR(msc); + + /* Create RIS entries described by firmware */ + err = acpi_mpam_parse_resources(msc, plat_data); + if (err) { + mpam_msc_drv_remove(pdev); + return err; + } + + if (atomic_add_return(1, &mpam_num_msc) == fw_num_msc) + pr_info("Discovered all MSCs\n"); + + return 0; +} + +static struct platform_driver mpam_msc_driver = { + .driver = { + .name = "mpam_msc", + }, + .probe = mpam_msc_drv_probe, + .remove = mpam_msc_drv_remove, +}; + +static int __init mpam_msc_driver_init(void) +{ + if (!system_supports_mpam()) + return -EOPNOTSUPP; + + init_srcu_struct(&mpam_srcu); + + fw_num_msc = acpi_mpam_count_msc(); + if (fw_num_msc <= 0) { + pr_err("No MSC devices found in firmware\n"); + return -EINVAL; + } + + return platform_driver_register(&mpam_msc_driver); +} +subsys_initcall(mpam_msc_driver_init); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h new file mode 100644 index 000000000000..540066903eca --- /dev/null +++ b/drivers/resctrl/mpam_internal.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2025 Arm Ltd. 
+ +#ifndef MPAM_INTERNAL_H +#define MPAM_INTERNAL_H + +#include +#include +#include +#include +#include + +struct platform_device; + +struct mpam_msc { + /* member of mpam_all_msc */ + struct list_head all_msc_list; + + int id; + struct platform_device *pdev; + + /* Not modified after mpam_is_enabled() becomes true */ + enum mpam_msc_iface iface; + u32 nrdy_usec; + cpumask_t accessibility; + + /* + * probe_lock is only taken during discovery. After discovery these + * properties become read-only and the lists are protected by SRCU. + */ + struct mutex probe_lock; + unsigned long ris_idxs; + u32 ris_max; + + /* mpam_msc_ris of this component */ + struct list_head ris; + + /* + * part_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_PART_SEL. (including the ID registers that vary + * by RIS). + * If needed, take msc->probe_lock first. + */ + struct mutex part_sel_lock; + + void __iomem *mapped_hwpage; + size_t mapped_hwpage_sz; +}; +#endif /* MPAM_INTERNAL_H */ -- Gitee From e7f90b80f45eb1cd13a86a4223c28d376e6e9311 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:41 +0000 Subject: [PATCH 025/124] arm_mpam: Add the class and component structures for firmware described ris ANBZ: #31060 commit 01fb4b8224726aa0f2170b63e4685cf0eec85d8d upstream. An MSC is a container of resources, each identified by their RIS index. Some RIS are described by firmware to provide their position in the system. Others are discovered when the driver probes the hardware. To configure a resource it needs to be found by its class, e.g. 'L2'. There are two kinds of grouping, a class is a set of components, which are visible to user-space as there are likely to be multiple instances of the L2 cache. (e.g. one per cluster or package) Add support for creating and destroying structures to allow a hierarchy of resources to be created. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 392 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 94 ++++++++ include/linux/arm_mpam.h | 5 + 3 files changed, 491 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index c88893e4a6db..d178ef5284dc 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -36,6 +36,383 @@ struct srcu_struct mpam_srcu; */ static atomic_t mpam_num_msc; +/* + * An MSC is a physical container for controls and monitors, each identified by + * their RIS index. These share a base-address, interrupts and some MMIO + * registers. A vMSC is a virtual container for RIS in an MSC that control or + * monitor the same thing. Members of a vMSC are all RIS in the same MSC, but + * not all RIS in an MSC share a vMSC. + * + * Components are a group of vMSC that control or monitor the same thing but + * are from different MSC, so have different base-address, interrupts etc. + * Classes are the set components of the same type. + * + * The features of a vMSC is the union of the RIS it contains. + * The features of a Class and Component are the common subset of the vMSC + * they contain. + * + * e.g. The system cache may have bandwidth controls on multiple interfaces, + * for regulating traffic from devices independently of traffic from CPUs. + * If these are two RIS in one MSC, they will be treated as controlling + * different things, and will not share a vMSC/component/class. + * + * e.g. The L2 may have one MSC and two RIS, one for cache-controls another + * for bandwidth. These two RIS are members of the same vMSC. 
+ * + * e.g. The set of RIS that make up the L2 are grouped as a component. These + * are sometimes termed slices. They should be configured the same, as if there + * were only one. + * + * e.g. The SoC probably has more than one L2, each attached to a distinct set + * of CPUs. All the L2 components are grouped as a class. + * + * When creating an MSC, struct mpam_msc is added to the all mpam_all_msc list, + * then linked via struct mpam_ris to a vmsc, component and class. + * The same MSC may exist under different class->component->vmsc paths, but the + * RIS index will be unique. + */ +LIST_HEAD(mpam_classes); + +/* List of all objects that can be free()d after synchronise_srcu() */ +static LLIST_HEAD(mpam_garbage); + +static inline void init_garbage(struct mpam_garbage *garbage) +{ + init_llist_node(&garbage->llist); +} + +#define add_to_garbage(x) \ +do { \ + __typeof__(x) _x = (x); \ + _x->garbage.to_free = _x; \ + llist_add(&_x->garbage.llist, &mpam_garbage); \ +} while (0) + +static void mpam_free_garbage(void) +{ + struct mpam_garbage *iter, *tmp; + struct llist_node *to_free = llist_del_all(&mpam_garbage); + + if (!to_free) + return; + + synchronize_srcu(&mpam_srcu); + + llist_for_each_entry_safe(iter, tmp, to_free, llist) { + if (iter->pdev) + devm_kfree(&iter->pdev->dev, iter->to_free); + else + kfree(iter->to_free); + } +} + +static struct mpam_class * +mpam_class_alloc(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + class = kzalloc(sizeof(*class), GFP_KERNEL); + if (!class) + return ERR_PTR(-ENOMEM); + init_garbage(&class->garbage); + + INIT_LIST_HEAD_RCU(&class->components); + /* Affinity is updated when ris are added */ + class->level = level_idx; + class->type = type; + INIT_LIST_HEAD_RCU(&class->classes_list); + + list_add_rcu(&class->classes_list, &mpam_classes); + + return class; +} + +static void mpam_class_destroy(struct mpam_class *class) +{ + 
lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&class->classes_list); + add_to_garbage(class); +} + +static struct mpam_class * +mpam_class_find(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type == type && class->level == level_idx) + return class; + } + + return mpam_class_alloc(level_idx, type); +} + +static struct mpam_component * +mpam_component_alloc(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + comp = kzalloc(sizeof(*comp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + init_garbage(&comp->garbage); + + comp->comp_id = id; + INIT_LIST_HEAD_RCU(&comp->vmsc); + /* Affinity is updated when RIS are added */ + INIT_LIST_HEAD_RCU(&comp->class_list); + comp->class = class; + + list_add_rcu(&comp->class_list, &class->components); + + return comp; +} + +static void mpam_component_destroy(struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&comp->class_list); + add_to_garbage(comp); + + if (list_empty(&class->components)) + mpam_class_destroy(class); +} + +static struct mpam_component * +mpam_component_find(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(comp, &class->components, class_list) { + if (comp->comp_id == id) + return comp; + } + + return mpam_component_alloc(class, id); +} + +static struct mpam_vmsc * +mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + vmsc = kzalloc(sizeof(*vmsc), GFP_KERNEL); + if (!vmsc) + return ERR_PTR(-ENOMEM); + init_garbage(&vmsc->garbage); + + INIT_LIST_HEAD_RCU(&vmsc->ris); + INIT_LIST_HEAD_RCU(&vmsc->comp_list); + vmsc->comp = comp; + vmsc->msc = msc; 
+ + list_add_rcu(&vmsc->comp_list, &comp->vmsc); + + return vmsc; +} + +static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) +{ + struct mpam_component *comp = vmsc->comp; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&vmsc->comp_list); + add_to_garbage(vmsc); + + if (list_empty(&comp->vmsc)) + mpam_component_destroy(comp); +} + +static struct mpam_vmsc * +mpam_vmsc_find(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + if (vmsc->msc->id == msc->id) + return vmsc; + } + + return mpam_vmsc_alloc(comp, msc); +} + +/* + * The cacheinfo structures are only populated when CPUs are online. + * This helper walks the acpi tables to include offline CPUs too. + */ +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity) +{ + return acpi_pptt_get_cpumask_from_cache_id(cache_id, affinity); +} + +/* + * cpumask_of_node() only knows about online CPUs. This can't tell us whether + * a class is represented on all possible CPUs. 
+ */ +static void get_cpumask_from_node_id(u32 node_id, cpumask_t *affinity) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (node_id == cpu_to_node(cpu)) + cpumask_set_cpu(cpu, affinity); + } +} + +static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, + enum mpam_class_types type, + struct mpam_class *class, + struct mpam_component *comp) +{ + int err; + + switch (type) { + case MPAM_CLASS_CACHE: + err = mpam_get_cpumask_from_cache_id(comp->comp_id, class->level, + affinity); + if (err) { + dev_warn_once(&msc->pdev->dev, + "Failed to determine CPU affinity\n"); + return err; + } + + if (cpumask_empty(affinity)) + dev_warn_once(&msc->pdev->dev, "no CPUs associated with cache node\n"); + + break; + case MPAM_CLASS_MEMORY: + get_cpumask_from_node_id(comp->comp_id, affinity); + /* affinity may be empty for CPU-less memory nodes */ + break; + case MPAM_CLASS_UNKNOWN: + return 0; + } + + cpumask_and(affinity, affinity, &msc->accessibility); + + return 0; +} + +static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + int err; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class; + struct mpam_component *comp; + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + if (ris_idx > MPAM_MSC_MAX_NUM_RIS) + return -EINVAL; + + if (test_and_set_bit(ris_idx, &msc->ris_idxs)) + return -EBUSY; + + ris = devm_kzalloc(&msc->pdev->dev, sizeof(*ris), GFP_KERNEL); + if (!ris) + return -ENOMEM; + init_garbage(&ris->garbage); + ris->garbage.pdev = pdev; + + class = mpam_class_find(class_id, type); + if (IS_ERR(class)) + return PTR_ERR(class); + + comp = mpam_component_find(class, component_id); + if (IS_ERR(comp)) { + if (list_empty(&class->components)) + mpam_class_destroy(class); + return PTR_ERR(comp); + } + + vmsc = mpam_vmsc_find(comp, msc); + if (IS_ERR(vmsc)) { + if (list_empty(&comp->vmsc)) + 
mpam_component_destroy(comp); + return PTR_ERR(vmsc); + } + + err = mpam_ris_get_affinity(msc, &ris->affinity, type, class, comp); + if (err) { + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); + return err; + } + + ris->ris_idx = ris_idx; + INIT_LIST_HEAD_RCU(&ris->msc_list); + INIT_LIST_HEAD_RCU(&ris->vmsc_list); + ris->vmsc = vmsc; + + cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity); + cpumask_or(&class->affinity, &class->affinity, &ris->affinity); + list_add_rcu(&ris->vmsc_list, &vmsc->ris); + list_add_rcu(&ris->msc_list, &msc->ris); + + return 0; +} + +static void mpam_ris_destroy(struct mpam_msc_ris *ris) +{ + struct mpam_vmsc *vmsc = ris->vmsc; + struct mpam_msc *msc = vmsc->msc; + struct mpam_component *comp = vmsc->comp; + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + /* + * It is assumed affinities don't overlap. If they do the class becomes + * unusable immediately. + */ + cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); + cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); + clear_bit(ris->ris_idx, &msc->ris_idxs); + list_del_rcu(&ris->msc_list); + list_del_rcu(&ris->vmsc_list); + add_to_garbage(ris); + + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); +} + +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id) +{ + int err; + + mutex_lock(&mpam_list_lock); + err = mpam_ris_create_locked(msc, ris_idx, type, class_id, + component_id); + mutex_unlock(&mpam_list_lock); + if (err) + mpam_free_garbage(); + + return err; +} + /* * An MSC can control traffic from a set of CPUs, but may only be accessible * from a (hopefully wider) set of CPUs. The common reason for this is power @@ -56,14 +433,25 @@ static void update_msc_accessibility(struct mpam_msc *msc) acpi_pptt_get_cpus_from_container(affinity_id, &msc->accessibility); } +/* + * There are two ways of reaching a struct mpam_msc_ris. 
Via the + * class->component->vmsc->ris, or via the msc. + * When destroying the msc, the other side needs unlinking and cleaning up too. + */ static void mpam_msc_destroy(struct mpam_msc *msc) { struct platform_device *pdev = msc->pdev; + struct mpam_msc_ris *ris, *tmp; lockdep_assert_held(&mpam_list_lock); + list_for_each_entry_safe(ris, tmp, &msc->ris, msc_list) + mpam_ris_destroy(ris); + list_del_rcu(&msc->all_msc_list); platform_set_drvdata(pdev, NULL); + + add_to_garbage(msc); } static int mpam_msc_drv_remove(struct platform_device *pdev) @@ -76,6 +464,8 @@ static int mpam_msc_drv_remove(struct platform_device *pdev) synchronize_srcu(&mpam_srcu); + mpam_free_garbage(); + return 0; } @@ -92,6 +482,8 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); if (!msc) return ERR_PTR(-ENOMEM); + init_garbage(&msc->garbage); + msc->garbage.pdev = pdev; err = devm_mutex_init(dev, &msc->probe_lock); if (err) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 540066903eca..8f7a28d2c021 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -7,11 +7,30 @@ #include #include #include +#include #include +#include #include +#define MPAM_MSC_MAX_NUM_RIS 16 + struct platform_device; +/* + * Structures protected by SRCU may not be freed for a surprising amount of + * time (especially if perf is running). To ensure the MPAM error interrupt can + * tear down all the structures, build a list of objects that can be garbage + * collected once synchronize_srcu() has returned. + * If pdev is non-NULL, use devm_kfree(). 
+ */ +struct mpam_garbage { + /* member of mpam_garbage */ + struct llist_node llist; + + void *to_free; + struct platform_device *pdev; +}; + struct mpam_msc { /* member of mpam_all_msc */ struct list_head all_msc_list; @@ -45,5 +64,80 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + + struct mpam_garbage garbage; +}; + +struct mpam_class { + /* mpam_components in this class */ + struct list_head components; + + cpumask_t affinity; + + u8 level; + enum mpam_class_types type; + + /* member of mpam_classes */ + struct list_head classes_list; + + struct mpam_garbage garbage; +}; + +struct mpam_component { + u32 comp_id; + + /* mpam_vmsc in this component */ + struct list_head vmsc; + + cpumask_t affinity; + + /* member of mpam_class:components */ + struct list_head class_list; + + /* parent: */ + struct mpam_class *class; + + struct mpam_garbage garbage; +}; + +struct mpam_vmsc { + /* member of mpam_component:vmsc_list */ + struct list_head comp_list; + + /* mpam_msc_ris in this vmsc */ + struct list_head ris; + + /* All RIS in this vMSC are members of this MSC */ + struct mpam_msc *msc; + + /* parent: */ + struct mpam_component *comp; + + struct mpam_garbage garbage; +}; + +struct mpam_msc_ris { + u8 ris_idx; + + cpumask_t affinity; + + /* member of mpam_vmsc:ris */ + struct list_head vmsc_list; + + /* member of mpam_msc:ris */ + struct list_head msc_list; + + /* parent: */ + struct mpam_vmsc *vmsc; + + struct mpam_garbage garbage; }; + +/* List of all classes - protected by srcu*/ +extern struct srcu_struct mpam_srcu; +extern struct list_head mpam_classes; + +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity); + #endif /* MPAM_INTERNAL_H */ diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 4b7f335181e0..13a8ac5c2cbd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -37,11 +37,16 @@ static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, 
static inline int acpi_mpam_count_msc(void) { return -EINVAL; } #endif +#ifdef CONFIG_ARM64_MPAM_DRIVER +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); +#else static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id) { return -EINVAL; } +#endif #endif /* __LINUX_ARM_MPAM_H */ -- Gitee From 88d1e980240edd5e9ac5ca5821396e3bae28bbc3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:42 +0000 Subject: [PATCH 026/124] arm_mpam: Add MPAM MSC register layout definitions ANBZ: #31060 commit aa64b9e110515610b6498df0f8fce9b1c6c44f72 upstream. Memory Partitioning and Monitoring (MPAM) has memory mapped devices (MSCs) with an identity/configuration page. Add the definitions for these registers as offset within the page(s). Link: https://developer.arm.com/documentation/ihi0099/aa/ Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 267 ++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8f7a28d2c021..51f791cc207b 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -140,4 +140,271 @@ extern struct list_head mpam_classes; int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +/* + * MPAM MSCs have the following register layout. See: + * Arm Memory System Resource Partitioning and Monitoring (MPAM) System + * Component Specification. 
+ * https://developer.arm.com/documentation/ihi0099/aa/ + */ +#define MPAM_ARCHITECTURE_V1 0x10 + +/* Memory mapped control pages */ +/* ID Register offsets in the memory mapped page */ +#define MPAMF_IDR 0x0000 /* features id register */ +#define MPAMF_IIDR 0x0018 /* implementer id register */ +#define MPAMF_AIDR 0x0020 /* architectural id register */ +#define MPAMF_IMPL_IDR 0x0028 /* imp-def partitioning */ +#define MPAMF_CPOR_IDR 0x0030 /* cache-portion partitioning */ +#define MPAMF_CCAP_IDR 0x0038 /* cache-capacity partitioning */ +#define MPAMF_MBW_IDR 0x0040 /* mem-bw partitioning */ +#define MPAMF_PRI_IDR 0x0048 /* priority partitioning */ +#define MPAMF_MSMON_IDR 0x0080 /* performance monitoring features */ +#define MPAMF_CSUMON_IDR 0x0088 /* cache-usage monitor */ +#define MPAMF_MBWUMON_IDR 0x0090 /* mem-bw usage monitor */ +#define MPAMF_PARTID_NRW_IDR 0x0050 /* partid-narrowing */ + +/* Configuration and Status Register offsets in the memory mapped page */ +#define MPAMCFG_PART_SEL 0x0100 /* partid to configure */ +#define MPAMCFG_CPBM 0x1000 /* cache-portion config */ +#define MPAMCFG_CMAX 0x0108 /* cache-capacity config */ +#define MPAMCFG_CMIN 0x0110 /* cache-capacity config */ +#define MPAMCFG_CASSOC 0x0118 /* cache-associativity config */ +#define MPAMCFG_MBW_MIN 0x0200 /* min mem-bw config */ +#define MPAMCFG_MBW_MAX 0x0208 /* max mem-bw config */ +#define MPAMCFG_MBW_WINWD 0x0220 /* mem-bw accounting window config */ +#define MPAMCFG_MBW_PBM 0x2000 /* mem-bw portion bitmap config */ +#define MPAMCFG_PRI 0x0400 /* priority partitioning config */ +#define MPAMCFG_MBW_PROP 0x0500 /* mem-bw stride config */ +#define MPAMCFG_INTPARTID 0x0600 /* partid-narrowing config */ + +#define MSMON_CFG_MON_SEL 0x0800 /* monitor selector */ +#define MSMON_CFG_CSU_FLT 0x0810 /* cache-usage monitor filter */ +#define MSMON_CFG_CSU_CTL 0x0818 /* cache-usage monitor config */ +#define MSMON_CFG_MBWU_FLT 0x0820 /* mem-bw monitor filter */ +#define MSMON_CFG_MBWU_CTL 
0x0828 /* mem-bw monitor config */ +#define MSMON_CSU 0x0840 /* current cache-usage */ +#define MSMON_CSU_CAPTURE 0x0848 /* last cache-usage value captured */ +#define MSMON_MBWU 0x0860 /* current mem-bw usage value */ +#define MSMON_MBWU_CAPTURE 0x0868 /* last mem-bw value captured */ +#define MSMON_MBWU_L 0x0880 /* current long mem-bw usage value */ +#define MSMON_MBWU_L_CAPTURE 0x0890 /* last long mem-bw value captured */ +#define MSMON_CAPT_EVNT 0x0808 /* signal a capture event */ +#define MPAMF_ESR 0x00F8 /* error status register */ +#define MPAMF_ECR 0x00F0 /* error control register */ + +/* MPAMF_IDR - MPAM features ID register */ +#define MPAMF_IDR_PARTID_MAX GENMASK(15, 0) +#define MPAMF_IDR_PMG_MAX GENMASK(23, 16) +#define MPAMF_IDR_HAS_CCAP_PART BIT(24) +#define MPAMF_IDR_HAS_CPOR_PART BIT(25) +#define MPAMF_IDR_HAS_MBW_PART BIT(26) +#define MPAMF_IDR_HAS_PRI_PART BIT(27) +#define MPAMF_IDR_EXT BIT(28) +#define MPAMF_IDR_HAS_IMPL_IDR BIT(29) +#define MPAMF_IDR_HAS_MSMON BIT(30) +#define MPAMF_IDR_HAS_PARTID_NRW BIT(31) +#define MPAMF_IDR_HAS_RIS BIT(32) +#define MPAMF_IDR_HAS_EXTD_ESR BIT(38) +#define MPAMF_IDR_HAS_ESR BIT(39) +#define MPAMF_IDR_RIS_MAX GENMASK(59, 56) + +/* MPAMF_MSMON_IDR - MPAM performance monitoring ID register */ +#define MPAMF_MSMON_IDR_MSMON_CSU BIT(16) +#define MPAMF_MSMON_IDR_MSMON_MBWU BIT(17) +#define MPAMF_MSMON_IDR_HAS_LOCAL_CAPT_EVNT BIT(31) + +/* MPAMF_CPOR_IDR - MPAM features cache portion partitioning ID register */ +#define MPAMF_CPOR_IDR_CPBM_WD GENMASK(15, 0) + +/* MPAMF_CCAP_IDR - MPAM features cache capacity partitioning ID register */ +#define MPAMF_CCAP_IDR_CMAX_WD GENMASK(5, 0) +#define MPAMF_CCAP_IDR_CASSOC_WD GENMASK(12, 8) +#define MPAMF_CCAP_IDR_HAS_CASSOC BIT(28) +#define MPAMF_CCAP_IDR_HAS_CMIN BIT(29) +#define MPAMF_CCAP_IDR_NO_CMAX BIT(30) +#define MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM BIT(31) + +/* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ +#define MPAMF_MBW_IDR_BWA_WD 
GENMASK(5, 0) +#define MPAMF_MBW_IDR_HAS_MIN BIT(10) +#define MPAMF_MBW_IDR_HAS_MAX BIT(11) +#define MPAMF_MBW_IDR_HAS_PBM BIT(12) +#define MPAMF_MBW_IDR_HAS_PROP BIT(13) +#define MPAMF_MBW_IDR_WINDWR BIT(14) +#define MPAMF_MBW_IDR_BWPBM_WD GENMASK(28, 16) + +/* MPAMF_PRI_IDR - MPAM features priority partitioning ID register */ +#define MPAMF_PRI_IDR_HAS_INTPRI BIT(0) +#define MPAMF_PRI_IDR_INTPRI_0_IS_LOW BIT(1) +#define MPAMF_PRI_IDR_INTPRI_WD GENMASK(9, 4) +#define MPAMF_PRI_IDR_HAS_DSPRI BIT(16) +#define MPAMF_PRI_IDR_DSPRI_0_IS_LOW BIT(17) +#define MPAMF_PRI_IDR_DSPRI_WD GENMASK(25, 20) + +/* MPAMF_CSUMON_IDR - MPAM cache storage usage monitor ID register */ +#define MPAMF_CSUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_CAPT BIT(24) +#define MPAMF_CSUMON_IDR_HAS_CEVNT_OFLW BIT(25) +#define MPAMF_CSUMON_IDR_HAS_OFSR BIT(26) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_LNKG BIT(27) +#define MPAMF_CSUMON_IDR_HAS_XCL BIT(29) +#define MPAMF_CSUMON_IDR_CSU_RO BIT(30) +#define MPAMF_CSUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_MBWUMON_IDR - MPAM memory bandwidth usage monitor ID register */ +#define MPAMF_MBWUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_MBWUMON_IDR_HAS_RWBW BIT(28) +#define MPAMF_MBWUMON_IDR_LWD BIT(29) +#define MPAMF_MBWUMON_IDR_HAS_LONG BIT(30) +#define MPAMF_MBWUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_PARTID_NRW_IDR - MPAM PARTID narrowing ID register */ +#define MPAMF_PARTID_NRW_IDR_INTPARTID_MAX GENMASK(15, 0) + +/* MPAMF_IIDR - MPAM implementation ID register */ +#define MPAMF_IIDR_IMPLEMENTER GENMASK(11, 0) +#define MPAMF_IIDR_REVISION GENMASK(15, 12) +#define MPAMF_IIDR_VARIANT GENMASK(19, 16) +#define MPAMF_IIDR_PRODUCTID GENMASK(31, 20) + +/* MPAMF_AIDR - MPAM architecture ID register */ +#define MPAMF_AIDR_ARCH_MINOR_REV GENMASK(3, 0) +#define MPAMF_AIDR_ARCH_MAJOR_REV GENMASK(7, 4) + +/* MPAMCFG_PART_SEL - MPAM partition configuration selection register */ +#define MPAMCFG_PART_SEL_PARTID_SEL GENMASK(15, 0) +#define 
MPAMCFG_PART_SEL_INTERNAL BIT(16) +#define MPAMCFG_PART_SEL_RIS GENMASK(27, 24) + +/* MPAMCFG_CASSOC - MPAM cache maximum associativity partition configuration register */ +#define MPAMCFG_CASSOC_CASSOC GENMASK(15, 0) + +/* MPAMCFG_CMAX - MPAM cache capacity configuration register */ +#define MPAMCFG_CMAX_SOFTLIM BIT(31) +#define MPAMCFG_CMAX_CMAX GENMASK(15, 0) + +/* MPAMCFG_CMIN - MPAM cache capacity configuration register */ +#define MPAMCFG_CMIN_CMIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MIN - MPAM memory minimum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MIN_MIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MAX - MPAM memory maximum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MAX_MAX GENMASK(15, 0) +#define MPAMCFG_MBW_MAX_HARDLIM BIT(31) + +/* + * MPAMCFG_MBW_WINWD - MPAM memory bandwidth partitioning window width + * register + */ +#define MPAMCFG_MBW_WINWD_US_FRAC GENMASK(7, 0) +#define MPAMCFG_MBW_WINWD_US_INT GENMASK(23, 8) + +/* MPAMCFG_PRI - MPAM priority partitioning configuration register */ +#define MPAMCFG_PRI_INTPRI GENMASK(15, 0) +#define MPAMCFG_PRI_DSPRI GENMASK(31, 16) + +/* + * MPAMCFG_MBW_PROP - Memory bandwidth proportional stride partitioning + * configuration register + */ +#define MPAMCFG_MBW_PROP_STRIDEM1 GENMASK(15, 0) +#define MPAMCFG_MBW_PROP_EN BIT(31) + +/* + * MPAMCFG_INTPARTID - MPAM internal partition narrowing configuration register + */ +#define MPAMCFG_INTPARTID_INTPARTID GENMASK(15, 0) +#define MPAMCFG_INTPARTID_INTERNAL BIT(16) + +/* MSMON_CFG_MON_SEL - Memory system performance monitor selection register */ +#define MSMON_CFG_MON_SEL_MON_SEL GENMASK(15, 0) +#define MSMON_CFG_MON_SEL_RIS GENMASK(27, 24) + +/* MPAMF_ESR - MPAM Error Status Register */ +#define MPAMF_ESR_PARTID_MON GENMASK(15, 0) +#define MPAMF_ESR_PMG GENMASK(23, 16) +#define MPAMF_ESR_ERRCODE GENMASK(27, 24) +#define MPAMF_ESR_OVRWR BIT(31) +#define MPAMF_ESR_RIS GENMASK(35, 32) + +/* MPAMF_ECR - MPAM 
Error Control Register */ +#define MPAMF_ECR_INTEN BIT(0) + +/* Error conditions in accessing memory mapped registers */ +#define MPAM_ERRCODE_NONE 0 +#define MPAM_ERRCODE_PARTID_SEL_RANGE 1 +#define MPAM_ERRCODE_REQ_PARTID_RANGE 2 +#define MPAM_ERRCODE_MSMONCFG_ID_RANGE 3 +#define MPAM_ERRCODE_REQ_PMG_RANGE 4 +#define MPAM_ERRCODE_MONITOR_RANGE 5 +#define MPAM_ERRCODE_INTPARTID_RANGE 6 +#define MPAM_ERRCODE_UNEXPECTED_INTERNAL 7 +#define MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL 8 +#define MPAM_ERRCODE_RIS_NO_CONTROL 9 +#define MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL 10 +#define MPAM_ERRCODE_RIS_NO_MONITOR 11 + +/* + * MSMON_CFG_CSU_CTL - Memory system performance monitor configure cache storage + * usage monitor control register + * MSMON_CFG_MBWU_CTL - Memory system performance monitor configure memory + * bandwidth usage monitor control register + */ +#define MSMON_CFG_x_CTL_TYPE GENMASK(7, 0) +#define MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L BIT(15) +#define MSMON_CFG_x_CTL_MATCH_PARTID BIT(16) +#define MSMON_CFG_x_CTL_MATCH_PMG BIT(17) +#define MSMON_CFG_MBWU_CTL_SCLEN BIT(19) +#define MSMON_CFG_x_CTL_SUBTYPE GENMASK(22, 20) +#define MSMON_CFG_x_CTL_OFLOW_FRZ BIT(24) +#define MSMON_CFG_x_CTL_OFLOW_INTR BIT(25) +#define MSMON_CFG_x_CTL_OFLOW_STATUS BIT(26) +#define MSMON_CFG_x_CTL_CAPT_RESET BIT(27) +#define MSMON_CFG_x_CTL_CAPT_EVNT GENMASK(30, 28) +#define MSMON_CFG_x_CTL_EN BIT(31) + +#define MSMON_CFG_MBWU_CTL_TYPE_MBWU 0x42 +#define MSMON_CFG_CSU_CTL_TYPE_CSU 0x43 + +/* + * MSMON_CFG_CSU_FLT - Memory system performance monitor configure cache storage + * usage monitor filter register + * MSMON_CFG_MBWU_FLT - Memory system performance monitor configure memory + * bandwidth usage monitor filter register + */ +#define MSMON_CFG_x_FLT_PARTID GENMASK(15, 0) +#define MSMON_CFG_x_FLT_PMG GENMASK(23, 16) + +#define MSMON_CFG_MBWU_FLT_RWBW GENMASK(31, 30) +#define MSMON_CFG_CSU_FLT_XCL BIT(31) + +/* + * MSMON_CSU - Memory system performance monitor cache storage usage monitor + * 
register + * MSMON_CSU_CAPTURE - Memory system performance monitor cache storage usage + * capture register + * MSMON_MBWU - Memory system performance monitor memory bandwidth usage + * monitor register + * MSMON_MBWU_CAPTURE - Memory system performance monitor memory bandwidth usage + * capture register + */ +#define MSMON___VALUE GENMASK(30, 0) +#define MSMON___NRDY BIT(31) +#define MSMON___L_NRDY BIT(63) +#define MSMON___L_VALUE GENMASK(43, 0) +#define MSMON___LWD_VALUE GENMASK(62, 0) + +/* + * MSMON_CAPT_EVNT - Memory system performance monitoring capture event + * generation register + */ +#define MSMON_CAPT_EVNT_NOW BIT(0) + #endif /* MPAM_INTERNAL_H */ -- Gitee From 6f2f340167b01686e4cdcee052da5ed4a54c912e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:43 +0000 Subject: [PATCH 027/124] arm_mpam: Add cpuhp callbacks to probe MSC hardware ANBZ: #31060 commit 8f8d0ac1da7885c0d619636f93e0983239dc145c upstream. Because an MSC can only by accessed from the CPUs in its cpu-affinity set we need to be running on one of those CPUs to probe the MSC hardware. Do this work in the cpuhp callback. Probing the hardware will only happen before MPAM is enabled, walk all the MSCs and probe those we can reach that haven't already been probed as each CPU's online call is made. This adds the low-level MSC register read accessors. Once all MSCs reported by the firmware have been probed from a CPU in their respective cpu-affinity set, the probe-time cpuhp callbacks are replaced. The replacement callbacks will ultimately need to handle save/restore of the runtime MSC state across power transitions, but for now there is nothing to do in them: so do nothing. The architecture's context switch code will be enabled by a static-key, this can be set by mpam_enable(), but must be done from process context, not a cpuhp callback because both take the cpuhp lock. 
Whenever a new MSC has been probed, the mpam_enable() work is scheduled to test if all the MSCs have been probed. If probing fails, mpam_disable() is scheduled to unregister the cpuhp callbacks and free memory. CC: Lecopzer Chen Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 176 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 5 + 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d178ef5284dc..63acd3fec7eb 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -4,8 +4,10 @@ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include +#include #include #include +#include #include #include #include @@ -17,6 +19,7 @@ #include #include #include +#include #include "mpam_internal.h" @@ -36,6 +39,25 @@ struct srcu_struct mpam_srcu; */ static atomic_t mpam_num_msc; +static int mpam_cpuhp_state; +static DEFINE_MUTEX(mpam_cpuhp_state_lock); + +/* + * mpam is enabled once all devices have been probed from CPU online callbacks, + * scheduled via this work_struct. If access to an MSC depends on a CPU that + * was not brought online at boot, this can happen surprisingly late. + */ +static DECLARE_WORK(mpam_enable_work, &mpam_enable); + +/* + * All mpam error interrupts indicate a software bug. On receipt, disable the + * driver. 
+ */ +static DECLARE_WORK(mpam_broken_work, &mpam_disable); + +/* When mpam is disabled, the printed reason to aid debugging */ +static char *mpam_disable_reason; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -106,6 +128,21 @@ static void mpam_free_garbage(void) } } +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) +{ + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + return readl_relaxed(msc->mapped_hwpage + reg); +} + +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) + static struct mpam_class * mpam_class_alloc(u8 level_idx, enum mpam_class_types type) { @@ -413,6 +450,86 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, return err; } +static int mpam_msc_hw_probe(struct mpam_msc *msc) +{ + u64 idr; + struct device *dev = &msc->pdev->dev; + + lockdep_assert_held(&msc->probe_lock); + + idr = __mpam_read_reg(msc, MPAMF_AIDR); + if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) { + dev_err_once(dev, "MSC does not match MPAM architecture v1.x\n"); + return -EIO; + } + + msc->probed = true; + + return 0; +} + +static int mpam_cpu_online(unsigned int cpu) +{ + return 0; +} + +/* Before mpam is enabled, try to probe new MSC */ +static int mpam_discovery_cpu_online(unsigned int cpu) +{ + int err = 0; + struct mpam_msc *msc; + bool new_device_probed = false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + mutex_lock(&msc->probe_lock); + if (!msc->probed) + err = mpam_msc_hw_probe(msc); + mutex_unlock(&msc->probe_lock); + + if (err) + break; + new_device_probed = 
true; + } + + if (new_device_probed && !err) + schedule_work(&mpam_enable_work); + if (err) { + mpam_disable_reason = "error during probing"; + schedule_work(&mpam_broken_work); + } + + return err; +} + +static int mpam_cpu_offline(unsigned int cpu) +{ + return 0; +} + +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), + int (*offline)(unsigned int offline), + char *name) +{ + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + + mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, name, online, + offline); + if (mpam_cpuhp_state <= 0) { + pr_err("Failed to register cpuhp callbacks"); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); +} + /* * An MSC can control traffic from a set of CPUs, but may only be accessible * from a (hopefully wider) set of CPUs. The common reason for this is power @@ -553,7 +670,8 @@ static int mpam_msc_drv_probe(struct platform_device *pdev) } if (atomic_add_return(1, &mpam_num_msc) == fw_num_msc) - pr_info("Discovered all MSCs\n"); + mpam_register_cpuhp_callbacks(mpam_discovery_cpu_online, NULL, + "mpam:drv_probe"); return 0; } @@ -566,6 +684,62 @@ static struct platform_driver mpam_msc_driver = { .remove = mpam_msc_drv_remove, }; +static void mpam_enable_once(void) +{ + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, + "mpam:online"); + + pr_info("MPAM enabled\n"); +} + +void mpam_disable(struct work_struct *ignored) +{ + struct mpam_msc *msc, *tmp; + + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); + + mutex_lock(&mpam_list_lock); + list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list) + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + mpam_free_garbage(); + + pr_err_once("MPAM disabled due to %s\n", mpam_disable_reason); +} + +/* + * Enable mpam 
once all devices have been probed. + * Scheduled by mpam_discovery_cpu_online() once all devices have been created. + * Also scheduled when new devices are probed when new CPUs come online. + */ +void mpam_enable(struct work_struct *work) +{ + static atomic_t once; + struct mpam_msc *msc; + bool all_devices_probed = true; + + /* Have we probed all the hw devices? */ + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mutex_lock(&msc->probe_lock); + if (!msc->probed) + all_devices_probed = false; + mutex_unlock(&msc->probe_lock); + + if (!all_devices_probed) + break; + } + + if (all_devices_probed && !atomic_fetch_inc(&once)) + mpam_enable_once(); +} + static int __init mpam_msc_driver_init(void) { if (!system_supports_mpam()) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 51f791cc207b..4e1538d29783 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -48,6 +48,7 @@ struct mpam_msc { * properties become read-only and the lists are protected by SRCU. */ struct mutex probe_lock; + bool probed; unsigned long ris_idxs; u32 ris_max; @@ -137,6 +138,10 @@ struct mpam_msc_ris { extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; +/* Scheduled work callback to enable mpam once all MSC have been probed */ +void mpam_enable(struct work_struct *work); +void mpam_disable(struct work_struct *work); + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); -- Gitee From cd01038ed94a1487be1617978da777f2c412c7f8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:44 +0000 Subject: [PATCH 028/124] arm_mpam: Probe hardware to find the supported partid/pmg values ANBZ: #31060 commit bd221f9f82afb616887e0b88b43fbb937479d744 upstream. 
CPUs can generate traffic with a range of PARTID and PMG values, but each MSC may also have its own maximum size for these fields. Before MPAM can be used, the driver needs to probe each RIS on each MSC, to find the system-wide smallest value that can be used. The limits from requestors (e.g. CPUs) also need taking into account. While doing this, RIS entries that firmware didn't describe are created under MPAM_CLASS_UNKNOWN. This adds the low level MSC write accessors. While we're here, implement the mpam_register_requestor() call for the arch code to register the CPU limits. Future callers of this will tell us about the SMMU and ITS. Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 148 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 6 ++ include/linux/arm_mpam.h | 14 +++ 3 files changed, 167 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 63acd3fec7eb..6de84634733f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,15 @@ static atomic_t mpam_num_msc; static int mpam_cpuhp_state; static DEFINE_MUTEX(mpam_cpuhp_state_lock); +/* + * The smallest common values for any CPU or MSC in the system. + * Generating traffic outside this range will result in screaming interrupts. 
+ */ +u16 mpam_partid_max; +u8 mpam_pmg_max; +static bool partid_max_init, partid_max_published; +static DEFINE_SPINLOCK(partid_max_lock); + /* * mpam is enabled once all devices have been probed from CPU online callbacks, * scheduled via this work_struct. If access to an MSC depends on a CPU that @@ -143,6 +153,70 @@ static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) #define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) +static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + writel_relaxed(val, msc->mapped_hwpage + reg); +} + +static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) + +static u64 mpam_msc_read_idr(struct mpam_msc *msc) +{ + u64 idr_high = 0, idr_low; + + lockdep_assert_held(&msc->part_sel_lock); + + idr_low = mpam_read_partsel_reg(msc, IDR); + if (FIELD_GET(MPAMF_IDR_EXT, idr_low)) + idr_high = mpam_read_partsel_reg(msc, IDR + 4); + + return (idr_high << 32) | idr_low; +} + +static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) +{ + lockdep_assert_held(&msc->part_sel_lock); + + mpam_write_partsel_reg(msc, PART_SEL, partsel); +} + +static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, partid); + + __mpam_part_sel_raw(partsel, msc); +} + +int mpam_register_requestor(u16 partid_max, u8 pmg_max) +{ + guard(spinlock)(&partid_max_lock); + if (!partid_max_init) { + mpam_partid_max = partid_max; + mpam_pmg_max = pmg_max; + partid_max_init = true; + } else if (!partid_max_published) { + mpam_partid_max = 
min(mpam_partid_max, partid_max); + mpam_pmg_max = min(mpam_pmg_max, pmg_max); + } else { + /* New requestors can't lower the values */ + if (partid_max < mpam_partid_max || pmg_max < mpam_pmg_max) + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL(mpam_register_requestor); + static struct mpam_class * mpam_class_alloc(u8 level_idx, enum mpam_class_types type) { @@ -450,9 +524,35 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, return err; } +static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, + u8 ris_idx) +{ + int err; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + if (!test_bit(ris_idx, &msc->ris_idxs)) { + err = mpam_ris_create_locked(msc, ris_idx, MPAM_CLASS_UNKNOWN, + 0, 0); + if (err) + return ERR_PTR(err); + } + + list_for_each_entry(ris, &msc->ris, msc_list) { + if (ris->ris_idx == ris_idx) + return ris; + } + + return ERR_PTR(-ENOENT); +} + static int mpam_msc_hw_probe(struct mpam_msc *msc) { u64 idr; + u16 partid_max; + u8 ris_idx, pmg_max; + struct mpam_msc_ris *ris; struct device *dev = &msc->pdev->dev; lockdep_assert_held(&msc->probe_lock); @@ -463,6 +563,40 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return -EIO; } + /* Grab an IDR value to find out how many RIS there are */ + mutex_lock(&msc->part_sel_lock); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); + + /* Use these values so partid/pmg always starts with a valid value */ + msc->partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + msc->pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + + for (ris_idx = 0; ris_idx <= msc->ris_max; ris_idx++) { + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + msc->partid_max = min(msc->partid_max, partid_max); + msc->pmg_max = 
min(msc->pmg_max, pmg_max); + + mutex_lock(&mpam_list_lock); + ris = mpam_get_or_create_ris(msc, ris_idx); + mutex_unlock(&mpam_list_lock); + if (IS_ERR(ris)) + return PTR_ERR(ris); + } + + spin_lock(&partid_max_lock); + mpam_partid_max = min(mpam_partid_max, msc->partid_max); + mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); + spin_unlock(&partid_max_lock); + msc->probed = true; return 0; @@ -686,10 +820,20 @@ static struct platform_driver mpam_msc_driver = { static void mpam_enable_once(void) { + /* + * Once the cpuhp callbacks have been changed, mpam_partid_max can no + * longer change. + */ + spin_lock(&partid_max_lock); + partid_max_published = true; + spin_unlock(&partid_max_lock); + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); - pr_info("MPAM enabled\n"); + /* Use printk() to avoid the pr_fmt adding the function name. */ + printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n", + mpam_partid_max + 1, mpam_pmg_max + 1); } void mpam_disable(struct work_struct *ignored) @@ -755,4 +899,6 @@ static int __init mpam_msc_driver_init(void) return platform_driver_register(&mpam_msc_driver); } + +/* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 4e1538d29783..768a58a3ab27 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -49,6 +49,8 @@ struct mpam_msc { */ struct mutex probe_lock; bool probed; + u16 partid_max; + u8 pmg_max; unsigned long ris_idxs; u32 ris_max; @@ -138,6 +140,10 @@ struct mpam_msc_ris { extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; +/* System wide partid/pmg values */ +extern u16 mpam_partid_max; +extern u8 mpam_pmg_max; + /* Scheduled work callback to enable mpam once all MSC have been probed */ void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); diff --git 
a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 13a8ac5c2cbd..7f00c5285a32 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,4 +49,18 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestors features to be used. + * + * Returns an error if the registration is too late, and a larger PARTID/PMG + * value has been advertised to user-space. In this case the requestor should + * not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + #endif /* __LINUX_ARM_MPAM_H */ -- Gitee From 8f5b7e4259666dfd646fa858e1e5fec58238254a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:45 +0000 Subject: [PATCH 029/124] arm_mpam: Add helpers for managing the locking around the mon_sel registers ANBZ: #31060 commit d02beb06ca2a624e17004659c79d26a23484aa8b upstream. The MSC MON_SEL register needs to be accessed from hardirq for the overflow interrupt, and when taking an IPI to access these registers on platforms where MSC are not accessible from every CPU. This makes an irqsave spinlock the obvious lock to protect these registers. On systems with SCMI or PCC mailboxes it must be able to sleep, meaning a mutex must be used. The SCMI or PCC platforms can't support an overflow interrupt, and can't access the registers from hardirq context. Clearly these two can't exist for one MSC at the same time. Add helpers for the MON_SEL locking. For now, use a irqsave spinlock and only support 'real' MMIO platforms. In the future this lock will be split in two allowing SCMI/PCC platforms to take a mutex. 
Because there are contexts where the SCMI/PCC platforms can't make an access, mpam_mon_sel_lock() needs to be able to fail. Do this now, so that all the error handling on these paths is present. This allows the relevant paths to fail if they are needed on a platform where this isn't possible, instead of having to make explicit checks of the interface type. Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 2 ++ drivers/resctrl/mpam_internal.h | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6de84634733f..5bf6669cb48e 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -744,6 +745,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (err) return ERR_PTR(err); + mpam_mon_sel_lock_init(msc); msc->id = pdev->id; msc->pdev = pdev; INIT_LIST_HEAD_RCU(&msc->all_msc_list); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 768a58a3ab27..97f02cf92d7a 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #define MPAM_MSC_MAX_NUM_RIS 16 @@ -65,12 +66,52 @@ struct mpam_msc { */ struct mutex part_sel_lock; + /* + * mon_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_MON_SEL, and the mbwu_state. 
+ * Access to mon_sel is needed from both process and interrupt contexts, + * but is complicated by firmware-backed platforms that can't make any + * access unless they can sleep. + * Always use the mpam_mon_sel_lock() helpers. + * Accesses to mon_sel need to be able to fail if they occur in the wrong + * context. + * If needed, take msc->probe_lock first. + */ + raw_spinlock_t _mon_sel_lock; + unsigned long _mon_sel_flags; + void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; struct mpam_garbage garbage; }; +/* Returning false here means accesses to mon_sel must fail and report an error. */ +static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +{ + /* Locking will require updating to support a firmware backed interface */ + if (WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO)) + return false; + + raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); + return true; +} + +static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +{ + raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); +} + +static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) +{ + lockdep_assert_held_once(&msc->_mon_sel_lock); +} + +static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) +{ + raw_spin_lock_init(&msc->_mon_sel_lock); +} + struct mpam_class { /* mpam_components in this class */ struct list_head components; -- Gitee From 071cacba8ce1b63558fc5d3d2319eb613345dec4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:46 +0000 Subject: [PATCH 030/124] arm_mpam: Probe the hardware features resctrl supports ANBZ: #31060 commit 8c90dc68a5de4349ef9ba51449fb0a29cd690547 upstream. Expand the probing support with the control and monitor types we can use with resctrl. 
CC: Dave Martin Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 149 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 33 +++++++ 2 files changed, 182 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 5bf6669cb48e..c4d14dad3bb2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -170,6 +170,22 @@ static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 va #define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) +static inline u32 _mpam_read_monsel_reg(struct mpam_msc *msc, u16 reg) +{ + mpam_mon_sel_lock_held(msc); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_monsel_reg(msc, reg) _mpam_read_monsel_reg(msc, MSMON_##reg) + +static inline void _mpam_write_monsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + mpam_mon_sel_lock_held(msc); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_monsel_reg(msc, reg, val) _mpam_write_monsel_reg(msc, MSMON_##reg, val) + static u64 mpam_msc_read_idr(struct mpam_msc *msc) { u64 idr_high = 0, idr_low; @@ -548,6 +564,133 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +/* + * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour + * of NRDY, software can use this bit for any purpose" - so hardware might not + * implement this - but it isn't RES0. + * + * Try and see what values stick in this bit. If we can write either value, + * its probably not implemented by hardware. 
+ */ +static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +{ + u32 now; + u64 mon_sel; + bool can_set, can_clear; + struct mpam_msc *msc = ris->vmsc->msc; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return false; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + + _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_set = now & MSMON___NRDY; + + _mpam_write_monsel_reg(msc, mon_reg, 0); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_clear = !(now & MSMON___NRDY); + mpam_mon_sel_unlock(msc); + + return (!can_set || !can_clear); +} + +#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ + _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) + +static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) +{ + int err; + struct mpam_msc *msc = ris->vmsc->msc; + struct device *dev = &msc->pdev->dev; + struct mpam_props *props = &ris->props; + + lockdep_assert_held(&msc->probe_lock); + lockdep_assert_held(&msc->part_sel_lock); + + /* Cache Portion partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { + u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + if (props->cpbm_wd) + mpam_set_feature(mpam_feat_cpor_part, props); + } + + /* Memory bandwidth partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { + u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + + /* portion bitmap resolution */ + props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); + if (props->mbw_pbm_bits && + FIELD_GET(MPAMF_MBW_IDR_HAS_PBM, mbw_features)) + mpam_set_feature(mpam_feat_mbw_part, props); + + props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + mpam_set_feature(mpam_feat_mbw_max, props); + } + + /* 
Performance Monitoring */ + if (FIELD_GET(MPAMF_IDR_HAS_MSMON, ris->idr)) { + u32 msmon_features = mpam_read_partsel_reg(msc, MSMON_IDR); + + /* + * If the firmware max-nrdy-us property is missing, the + * CSU counters can't be used. Should we wait forever? + */ + err = device_property_read_u32(&msc->pdev->dev, + "arm,not-ready-us", + &msc->nrdy_usec); + + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_CSU, msmon_features)) { + u32 csumonidr; + + csumonidr = mpam_read_partsel_reg(msc, CSUMON_IDR); + props->num_csu_mon = FIELD_GET(MPAMF_CSUMON_IDR_NUM_MON, csumonidr); + if (props->num_csu_mon) { + bool hw_managed; + + mpam_set_feature(mpam_feat_msmon_csu, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); + } + + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. + */ + if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { + bool hw_managed; + u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); + + props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); + if (props->num_mbwu_mon) + mpam_set_feature(mpam_feat_msmon_mbwu, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + + /* + * Don't warn about any missing firmware property for + * MBWU NRDY - it doesn't make any sense! 
+ */ + } + } +} + static int mpam_msc_hw_probe(struct mpam_msc *msc) { u64 idr; @@ -591,6 +734,12 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) mutex_unlock(&mpam_list_lock); if (IS_ERR(ris)) return PTR_ERR(ris); + ris->idr = idr; + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + mpam_ris_hw_probe(ris); + mutex_unlock(&msc->part_sel_lock); } spin_lock(&partid_max_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 97f02cf92d7a..cdaa019367e9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -5,12 +5,14 @@ #define MPAM_INTERNAL_H #include +#include #include #include #include #include #include #include +#include #include #define MPAM_MSC_MAX_NUM_RIS 16 @@ -112,6 +114,33 @@ static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) raw_spin_lock_init(&msc->_mon_sel_lock); } +/* Bits for mpam features bitmaps */ +enum mpam_device_features { + mpam_feat_cpor_part, + mpam_feat_mbw_part, + mpam_feat_mbw_min, + mpam_feat_mbw_max, + mpam_feat_msmon, + mpam_feat_msmon_csu, + mpam_feat_msmon_csu_hw_nrdy, + mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_hw_nrdy, + MPAM_FEATURE_LAST +}; + +struct mpam_props { + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u16 cpbm_wd; + u16 mbw_pbm_bits; + u16 bwa_wd; + u16 num_csu_mon; + u16 num_mbwu_mon; +}; + +#define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) +#define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -151,6 +180,8 @@ struct mpam_vmsc { /* mpam_msc_ris in this vmsc */ struct list_head ris; + struct mpam_props props; + /* All RIS in this vMSC are members of this MSC */ struct mpam_msc *msc; @@ -162,6 +193,8 @@ struct mpam_vmsc { struct mpam_msc_ris { u8 ris_idx; + u64 idr; + struct mpam_props props; cpumask_t affinity; -- Gitee From 31f8f640ec50ab84c1e25303a2799086103c1ffd Mon Sep 17 00:00:00 2001 From: 
James Morse Date: Wed, 19 Nov 2025 12:22:47 +0000 Subject: [PATCH 031/124] arm_mpam: Merge supported features during mpam_enable() into mpam_class ANBZ: #31060 commit c10ca83a778304f976cbea60bbbb2f1fac003f5c upstream. To make a decision about whether to expose an mpam class as a resctrl resource we need to know its overall supported features and properties. Once we've probed all the resources, we can walk the tree and produce overall values by merging the bitmaps. This eliminates features that are only supported by some MSC that make up a component or class. If bitmap properties are mismatched within a component we cannot support the mismatched feature. Care has to be taken as vMSC may hold mismatched RIS. Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 214 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 3 + 2 files changed, 217 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index c4d14dad3bb2..d31681a8e003 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -969,6 +969,216 @@ static struct platform_driver mpam_msc_driver = { .remove = mpam_msc_drv_remove, }; +/* Any of these features mean the BWA_WD field is valid. 
*/ +static bool mpam_has_bwa_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_mbw_min, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_max, props)) + return true; + return false; +} + +#define MISMATCHED_HELPER(parent, child, helper, field, alias) \ + helper(parent) && \ + ((helper(child) && (parent)->field != (child)->field) || \ + (!helper(child) && !(alias))) + +#define MISMATCHED_FEAT(parent, child, feat, field, alias) \ + mpam_has_feature((feat), (parent)) && \ + ((mpam_has_feature((feat), (child)) && (parent)->field != (child)->field) || \ + (!mpam_has_feature((feat), (child)) && !(alias))) + +#define CAN_MERGE_FEAT(parent, child, feat, alias) \ + (alias) && !mpam_has_feature((feat), (parent)) && \ + mpam_has_feature((feat), (child)) + +/* + * Combine two props fields. + * If this is for controls that alias the same resource, it is safe to just + * copy the values over. If two aliasing controls implement the same scheme + * a safe value must be picked. + * For non-aliasing controls, these control different resources, and the + * resulting safe value must be compatible with both. When merging values in + * the tree, all the aliasing resources must be handled first. + * On mismatch, parent is modified. 
+ */ +static void __props_mismatch(struct mpam_props *parent, + struct mpam_props *child, bool alias) +{ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cpor_part, alias)) { + parent->cpbm_wd = child->cpbm_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cpor_part, + cpbm_wd, alias)) { + pr_debug("cleared cpor_part\n"); + mpam_clear_feature(mpam_feat_cpor_part, parent); + parent->cpbm_wd = 0; + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_part, alias)) { + parent->mbw_pbm_bits = child->mbw_pbm_bits; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_part, + mbw_pbm_bits, alias)) { + pr_debug("cleared mbw_part\n"); + mpam_clear_feature(mpam_feat_mbw_part, parent); + parent->mbw_pbm_bits = 0; + } + + /* bwa_wd is a count of bits, fewer bits means less precision */ + if (alias && !mpam_has_bwa_wd_feature(parent) && + mpam_has_bwa_wd_feature(child)) { + parent->bwa_wd = child->bwa_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, + bwa_wd, alias)) { + pr_debug("took the min bwa_wd\n"); + parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); + } + + /* For num properties, take the minimum */ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { + parent->num_csu_mon = child->num_csu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_csu, + num_csu_mon, alias)) { + pr_debug("took the min num_csu_mon\n"); + parent->num_csu_mon = min(parent->num_csu_mon, + child->num_csu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_mbwu, alias)) { + parent->num_mbwu_mon = child->num_mbwu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_mbwu, + num_mbwu_mon, alias)) { + pr_debug("took the min num_mbwu_mon\n"); + parent->num_mbwu_mon = min(parent->num_mbwu_mon, + child->num_mbwu_mon); + } + + if (alias) { + /* Merge features for aliased resources */ + bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } else { + /* Clear missing features for non 
aliasing */ + bitmap_and(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } +} + +/* + * If a vmsc doesn't match class feature/configuration, do the right thing(tm). + * For 'num' properties we can just take the minimum. + * For properties where the mismatched unused bits would make a difference, we + * nobble the class feature, as we can't configure all the resources. + * e.g. The L3 cache is composed of two resources with 13 and 17 portion + * bitmaps respectively. + */ +static void +__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) +{ + struct mpam_props *cprops = &class->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify class */ + + dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", + (long)cprops->features, (long)vprops->features); + + /* Take the safe value for any common features */ + __props_mismatch(cprops, vprops, false); +} + +static void +__vmsc_props_mismatch(struct mpam_vmsc *vmsc, struct mpam_msc_ris *ris) +{ + struct mpam_props *rprops = &ris->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify vmsc */ + + dev_dbg(dev, "Merging features for vmsc:0x%lx |= ris:0x%lx\n", + (long)vprops->features, (long)rprops->features); + + /* + * Merge mismatched features - Copy any features that aren't common, + * but take the safe value for any common features. + */ + __props_mismatch(vprops, rprops, true); +} + +/* + * Copy the first component's first vMSC's properties and features to the + * class. __class_props_mismatch() will remove conflicts. + * It is not possible to have a class with no components, or a component with + * no resources. The vMSC properties have already been built. 
+ */ +static void mpam_enable_init_class_features(struct mpam_class *class) +{ + struct mpam_vmsc *vmsc; + struct mpam_component *comp; + + comp = list_first_entry(&class->components, + struct mpam_component, class_list); + vmsc = list_first_entry(&comp->vmsc, + struct mpam_vmsc, comp_list); + + class->props = vmsc->props; +} + +static void mpam_enable_merge_vmsc_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + __vmsc_props_mismatch(vmsc, ris); + class->nrdy_usec = max(class->nrdy_usec, + vmsc->msc->nrdy_usec); + } + } +} + +static void mpam_enable_merge_class_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) + __class_props_mismatch(class, vmsc); +} + +/* + * Merge all the common resource features into class. + * vmsc features are bitwise-or'd together by mpam_enable_merge_vmsc_features() + * as the first step so that mpam_enable_init_class_features() can initialise + * the class with a representative set of features. + * Next the mpam_enable_merge_class_features() bitwise-and's all the vmsc + * features to form the class features. + * Other features are the min/max as appropriate. + * + * To avoid walking the whole tree twice, the class->nrdy_usec property is + * updated when working with the vmsc as it is a max(), and doesn't need + * initialising first. 
+ */ +static void mpam_enable_merge_features(struct list_head *all_classes_list) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, all_classes_list, classes_list) { + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_vmsc_features(comp); + + mpam_enable_init_class_features(class); + + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_class_features(comp); + } +} + static void mpam_enable_once(void) { /* @@ -979,6 +1189,10 @@ static void mpam_enable_once(void) partid_max_published = true; spin_unlock(&partid_max_lock); + mutex_lock(&mpam_list_lock); + mpam_enable_merge_features(&mpam_classes); + mutex_unlock(&mpam_list_lock); + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index cdaa019367e9..4749ac223adc 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -140,6 +140,7 @@ struct mpam_props { #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) +#define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) struct mpam_class { /* mpam_components in this class */ @@ -147,6 +148,8 @@ struct mpam_class { cpumask_t affinity; + struct mpam_props props; + u32 nrdy_usec; u8 level; enum mpam_class_types type; -- Gitee From cd0f23ae94ca7b898e30dbc48cbfdd0310989856 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:48 +0000 Subject: [PATCH 032/124] arm_mpam: Reset MSC controls from cpuhp callbacks ANBZ: #31060 commit f188a36ca2416e8090453eacbabd2925b20eb906 upstream. When a CPU comes online, it may bring a newly accessible MSC with it. Only the default partid has its value reset by hardware, and even then the MSC might not have been reset since its config was previously dirtied. e.g. 
Kexec. Any in-use partid must have its configuration restored, or reset. In-use partids may be held in caches and evicted later. MSC are also reset when CPUs are taken offline to cover cases where firmware doesn't reset the MSC over reboot using UEFI, or kexec where there is no firmware involvement. If the configuration for a RIS has not been touched since it was brought online, it does not need resetting again. To reset, write the maximum values for all discovered controls. CC: Rohit Mathew Signed-off-by: James Morse Reviewed-by: Fenghua Yu Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Gavin Shan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 109 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 3 + 2 files changed, 112 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d31681a8e003..007c834fb29e 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -752,8 +753,104 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return 0; } +static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) +{ + u32 num_words, msb; + u32 bm = ~0; + int i; + + lockdep_assert_held(&msc->part_sel_lock); + + if (wd == 0) + return; + + /* + * Write all ~0 to all but the last 32bit-word, which may + * have fewer bits... + */ + num_words = DIV_ROUND_UP(wd, 32); + for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) + __mpam_write_reg(msc, reg, bm); + + /* + * ....and then the last (maybe) partial 32bit word. When wd is a + * multiple of 32, msb should be 31 to write a full 32bit word. 
+ */ + msb = (wd - 1) % 32; + bm = GENMASK(msb, 0); + __mpam_write_reg(msc, reg, bm); +} + +static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) +{ + struct mpam_msc *msc = ris->vmsc->msc; + struct mpam_props *rprops = &ris->props; + + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris->ris_idx, partid, msc); + + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) + mpam_write_partsel_reg(msc, MBW_MIN, 0); + + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + + mutex_unlock(&msc->part_sel_lock); +} + +static void mpam_reset_ris(struct mpam_msc_ris *ris) +{ + u16 partid, partid_max; + + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + if (ris->in_reset_state) + return; + + spin_lock(&partid_max_lock); + partid_max = mpam_partid_max; + spin_unlock(&partid_max_lock); + for (partid = 0; partid <= partid_max; partid++) + mpam_reset_ris_partid(ris, partid); +} + +static void mpam_reset_msc(struct mpam_msc *msc, bool online) +{ + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { + mpam_reset_ris(ris); + + /* + * Set in_reset_state when coming online. The reset state + * for non-zero partid may be lost while the CPUs are offline. 
+ */ + ris->in_reset_state = online; + } +} + static int mpam_cpu_online(unsigned int cpu) { + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (atomic_fetch_inc(&msc->online_refs) == 0) + mpam_reset_msc(msc, true); + } + return 0; } @@ -792,6 +889,18 @@ static int mpam_discovery_cpu_online(unsigned int cpu) static int mpam_cpu_offline(unsigned int cpu) { + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (atomic_dec_and_test(&msc->online_refs)) + mpam_reset_msc(msc, false); + } + return 0; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 4749ac223adc..dec485cd8a91 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -5,6 +5,7 @@ #define MPAM_INTERNAL_H #include +#include #include #include #include @@ -45,6 +46,7 @@ struct mpam_msc { enum mpam_msc_iface iface; u32 nrdy_usec; cpumask_t accessibility; + atomic_t online_refs; /* * probe_lock is only taken during discovery. After discovery these @@ -198,6 +200,7 @@ struct mpam_msc_ris { u8 ris_idx; u64 idr; struct mpam_props props; + bool in_reset_state; cpumask_t affinity; -- Gitee From 537616afb8d792b9fddf96aa3fed5c05a415052d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:49 +0000 Subject: [PATCH 033/124] arm_mpam: Add a helper to touch an MSC from any CPU ANBZ: #31060 commit 475228d15dd653584b840b8e6c5828cdc3884b1c upstream. Resetting RIS entries from the cpuhp callback is easy as the callback occurs on the correct CPU. This won't be true for any other caller that wants to reset or configure an MSC. Add a helper that schedules the provided function if necessary. 
Callers should take the cpuhp lock to prevent the cpuhp callbacks from changing the MSC state. Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 37 +++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 007c834fb29e..e4c4722cdd52 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -806,20 +806,51 @@ static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) mutex_unlock(&msc->part_sel_lock); } -static void mpam_reset_ris(struct mpam_msc_ris *ris) +/* + * Called via smp_call_on_cpu() to prevent migration, while still being + * pre-emptible. + */ +static int mpam_reset_ris(void *arg) { u16 partid, partid_max; + struct mpam_msc_ris *ris = arg; WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); if (ris->in_reset_state) - return; + return 0; spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); for (partid = 0; partid <= partid_max; partid++) mpam_reset_ris_partid(ris, partid); + + return 0; +} + +/* + * Get the preferred CPU for this MSC. If it is accessible from this CPU, + * this CPU is preferred. This can be preempted/migrated, it will only result + * in more work. 
+ */ +static int mpam_get_msc_preferred_cpu(struct mpam_msc *msc) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_cpu(cpu, &msc->accessibility)) + return cpu; + + return cpumask_first_and(&msc->accessibility, cpu_online_mask); +} + +static int mpam_touch_msc(struct mpam_msc *msc, int (*fn)(void *a), void *arg) +{ + lockdep_assert_irqs_enabled(); + lockdep_assert_cpus_held(); + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + return smp_call_on_cpu(mpam_get_msc_preferred_cpu(msc), fn, arg, true); } static void mpam_reset_msc(struct mpam_msc *msc, bool online) @@ -827,7 +858,7 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) struct mpam_msc_ris *ris; list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { - mpam_reset_ris(ris); + mpam_touch_msc(msc, &mpam_reset_ris, ris); /* * Set in_reset_state when coming online. The reset state -- Gitee From 76793ed52c822d22acf70e8c0e04e2564a246aa5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:50 +0000 Subject: [PATCH 034/124] arm_mpam: Extend reset logic to allow devices to be reset any time ANBZ: #31060 commit 3bd04fe7d807bbdcfe75b29ca82fae4e2d7dc524 upstream. cpuhp callbacks aren't the only time the MSC configuration may need to be reset. Resctrl has an API call to reset a class. If an MPAM error interrupt arrives it indicates the driver has misprogrammed an MSC. The safest thing to do is reset all the MSCs and disable MPAM. Add a helper to reset RIS via their class. Call this from mpam_disable(), which can be scheduled from the error interrupt handler. 
Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 57 ++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e4c4722cdd52..8ff35e28c0da 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -808,15 +808,13 @@ static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) /* * Called via smp_call_on_cpu() to prevent migration, while still being - * pre-emptible. + * pre-emptible. Caller must hold mpam_srcu. */ static int mpam_reset_ris(void *arg) { u16 partid, partid_max; struct mpam_msc_ris *ris = arg; - WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); - if (ris->in_reset_state) return 0; @@ -1341,8 +1339,55 @@ static void mpam_enable_once(void) mpam_partid_max + 1, mpam_pmg_max + 1); } +static void mpam_reset_component_locked(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!ris->in_reset_state) + mpam_touch_msc(msc, mpam_reset_ris, ris); + ris->in_reset_state = true; + } + } +} + +static void mpam_reset_class_locked(struct mpam_class *class) +{ + struct mpam_component *comp; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + 
srcu_read_lock_held(&mpam_srcu)) + mpam_reset_component_locked(comp); +} + +static void mpam_reset_class(struct mpam_class *class) +{ + cpus_read_lock(); + mpam_reset_class_locked(class); + cpus_read_unlock(); +} + +/* + * Called in response to an error IRQ. + * All of MPAMs errors indicate a software bug, restore any modified + * controls to their reset values. + */ void mpam_disable(struct work_struct *ignored) { + int idx; + struct mpam_class *class; struct mpam_msc *msc, *tmp; mutex_lock(&mpam_cpuhp_state_lock); @@ -1352,6 +1397,12 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) + mpam_reset_class(class); + srcu_read_unlock(&mpam_srcu, idx); + mutex_lock(&mpam_list_lock); list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list) mpam_msc_destroy(msc); -- Gitee From 74a67e3a1d5dd610ac121a799ff8340025dfba48 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:51 +0000 Subject: [PATCH 035/124] arm_mpam: Register and enable IRQs ANBZ: #31060 commit 49aa621c4dcaf8e3cfeb9e73d07a9746b889f9e8 upstream. Register and enable error IRQs. All the MPAM error interrupts indicate a software bug, e.g. out of range partid. If the error interrupt is ever signalled, attempt to disable MPAM. Only the irq handler accesses the MPAMF_ESR register, so no locking is needed. The work to disable MPAM after an error needs to happen at process context as it takes mutex. It also unregisters the interrupts, meaning it can't be done from the threaded part of a threaded interrupt. Instead, mpam_disable() gets scheduled. Enabling the IRQs in the MSC may involve cross calling to a CPU that can access the MSC. Once the IRQ is requested, the mpam_disable() path can be called asynchronously, which will walk structures sized by max_partid. Ensure this size is fixed before the interrupt is requested. 
CC: Rohit Mathew Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Rohit Mathew Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 280 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 13 ++ 2 files changed, 293 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8ff35e28c0da..7d474ebc5c27 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -200,6 +203,35 @@ static u64 mpam_msc_read_idr(struct mpam_msc *msc) return (idr_high << 32) | idr_low; } +static void mpam_msc_clear_esr(struct mpam_msc *msc) +{ + u64 esr_low = __mpam_read_reg(msc, MPAMF_ESR); + + if (!esr_low) + return; + + /* + * Clearing the high/low bits of MPAMF_ESR can not be atomic. + * Clear the top half first, so that the pending error bits in the + * lower half prevent hardware from updating either half of the + * register. 
+ */ + if (msc->has_extd_esr) + __mpam_write_reg(msc, MPAMF_ESR + 4, 0); + __mpam_write_reg(msc, MPAMF_ESR, 0); +} + +static u64 mpam_msc_read_esr(struct mpam_msc *msc) +{ + u64 esr_high = 0, esr_low; + + esr_low = __mpam_read_reg(msc, MPAMF_ESR); + if (msc->has_extd_esr) + esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4); + + return (esr_high << 32) | esr_low; +} + static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) { lockdep_assert_held(&msc->part_sel_lock); @@ -729,6 +761,7 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); msc->partid_max = min(msc->partid_max, partid_max); msc->pmg_max = min(msc->pmg_max, pmg_max); + msc->has_extd_esr = FIELD_GET(MPAMF_IDR_HAS_EXTD_ESR, idr); mutex_lock(&mpam_list_lock); ris = mpam_get_or_create_ris(msc, ris_idx); @@ -743,6 +776,9 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) mutex_unlock(&msc->part_sel_lock); } + /* Clear any stale errors */ + mpam_msc_clear_esr(msc); + spin_lock(&partid_max_lock); mpam_partid_max = min(mpam_partid_max, msc->partid_max); mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); @@ -866,6 +902,13 @@ static void mpam_reset_msc(struct mpam_msc *msc, bool online) } } +static void _enable_percpu_irq(void *_irq) +{ + int *irq = _irq; + + enable_percpu_irq(*irq, IRQ_TYPE_NONE); +} + static int mpam_cpu_online(unsigned int cpu) { struct mpam_msc *msc; @@ -876,6 +919,9 @@ static int mpam_cpu_online(unsigned int cpu) if (!cpumask_test_cpu(cpu, &msc->accessibility)) continue; + if (msc->reenable_error_ppi) + _enable_percpu_irq(&msc->reenable_error_ppi); + if (atomic_fetch_inc(&msc->online_refs) == 0) mpam_reset_msc(msc, true); } @@ -926,6 +972,9 @@ static int mpam_cpu_offline(unsigned int cpu) if (!cpumask_test_cpu(cpu, &msc->accessibility)) continue; + if (msc->reenable_error_ppi) + disable_percpu_irq(msc->reenable_error_ppi); + if (atomic_dec_and_test(&msc->online_refs)) mpam_reset_msc(msc, false); } @@ -952,6 +1001,42 @@ static void 
mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), mutex_unlock(&mpam_cpuhp_state_lock); } +static int __setup_ppi(struct mpam_msc *msc) +{ + int cpu; + + msc->error_dev_id = alloc_percpu(struct mpam_msc *); + if (!msc->error_dev_id) + return -ENOMEM; + + for_each_cpu(cpu, &msc->accessibility) + *per_cpu_ptr(msc->error_dev_id, cpu) = msc; + + return 0; +} + +static int mpam_msc_setup_error_irq(struct mpam_msc *msc) +{ + int irq; + + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + return 0; + + /* Allocate and initialise the percpu device pointer for PPI */ + if (irq_is_percpu(irq)) + return __setup_ppi(msc); + + /* sanity check: shared interrupts can be routed anywhere? */ + if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) { + pr_err_once("msc:%u is a private resource with a shared error interrupt", + msc->id); + return -EINVAL; + } + + return 0; +} + /* * An MSC can control traffic from a set of CPUs, but may only be accessible * from a (hopefully wider) set of CPUs. 
The common reason for this is power @@ -1032,6 +1117,9 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (err) return ERR_PTR(err); + err = devm_mutex_init(dev, &msc->error_irq_lock); + if (err) + return ERR_PTR(err); mpam_mon_sel_lock_init(msc); msc->id = pdev->id; msc->pdev = pdev; @@ -1044,6 +1132,10 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) return ERR_PTR(-EINVAL); } + err = mpam_msc_setup_error_irq(msc); + if (err) + return ERR_PTR(err); + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) msc->iface = MPAM_IFACE_MMIO; else @@ -1317,8 +1409,177 @@ static void mpam_enable_merge_features(struct list_head *all_classes_list) } } +static char *mpam_errcode_names[16] = { + [MPAM_ERRCODE_NONE] = "No error", + [MPAM_ERRCODE_PARTID_SEL_RANGE] = "PARTID_SEL_Range", + [MPAM_ERRCODE_REQ_PARTID_RANGE] = "Req_PARTID_Range", + [MPAM_ERRCODE_MSMONCFG_ID_RANGE] = "MSMONCFG_ID_RANGE", + [MPAM_ERRCODE_REQ_PMG_RANGE] = "Req_PMG_Range", + [MPAM_ERRCODE_MONITOR_RANGE] = "Monitor_Range", + [MPAM_ERRCODE_INTPARTID_RANGE] = "intPARTID_Range", + [MPAM_ERRCODE_UNEXPECTED_INTERNAL] = "Unexpected_INTERNAL", + [MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL] = "Undefined_RIS_PART_SEL", + [MPAM_ERRCODE_RIS_NO_CONTROL] = "RIS_No_Control", + [MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL] = "Undefined_RIS_MON_SEL", + [MPAM_ERRCODE_RIS_NO_MONITOR] = "RIS_No_Monitor", + [12 ... 
15] = "Reserved" +}; + +static int mpam_enable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN); + + return 0; +} + +/* This can run in mpam_disable(), and the interrupt handler on the same CPU */ +static int mpam_disable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, 0); + + return 0; +} + +static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) +{ + u64 reg; + u16 partid; + u8 errcode, pmg, ris; + + if (WARN_ON_ONCE(!msc) || + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &msc->accessibility))) + return IRQ_NONE; + + reg = mpam_msc_read_esr(msc); + + errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg); + if (!errcode) + return IRQ_NONE; + + /* Clear level triggered irq */ + mpam_msc_clear_esr(msc); + + partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg); + pmg = FIELD_GET(MPAMF_ESR_PMG, reg); + ris = FIELD_GET(MPAMF_ESR_RIS, reg); + + pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n", + msc->id, mpam_errcode_names[errcode], partid, pmg, + ris); + + /* Disable this interrupt. */ + mpam_disable_msc_ecr(msc); + + /* + * Schedule the teardown work. Don't use a threaded IRQ as we can't + * unregister the interrupt from the threaded part of the handler. 
+ */ + mpam_disable_reason = "hardware error interrupt"; + schedule_work(&mpam_broken_work); + + return IRQ_HANDLED; +} + +static irqreturn_t mpam_ppi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = *(struct mpam_msc **)dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static irqreturn_t mpam_spi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static int mpam_register_irqs(void) +{ + int err, irq; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + /* The MPAM spec says the interrupt can be SPI, PPI or LPI */ + /* We anticipate sharing the interrupt with other MSCs */ + if (irq_is_percpu(irq)) { + err = request_percpu_irq(irq, &mpam_ppi_handler, + "mpam:msc:error", + msc->error_dev_id); + if (err) + return err; + + msc->reenable_error_ppi = irq; + smp_call_function_many(&msc->accessibility, + &_enable_percpu_irq, &irq, + true); + } else { + err = devm_request_irq(&msc->pdev->dev, irq, + &mpam_spi_handler, IRQF_SHARED, + "mpam:msc:error", msc); + if (err) + return err; + } + + mutex_lock(&msc->error_irq_lock); + msc->error_irq_req = true; + mpam_touch_msc(msc, mpam_enable_msc_ecr, msc); + msc->error_irq_hw_enabled = true; + mutex_unlock(&msc->error_irq_lock); + } + + return 0; +} + +static void mpam_unregister_irqs(void) +{ + int irq; + struct mpam_msc *msc; + + guard(cpus_read_lock)(); + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + mutex_lock(&msc->error_irq_lock); + if (msc->error_irq_hw_enabled) { + mpam_touch_msc(msc, mpam_disable_msc_ecr, msc); + msc->error_irq_hw_enabled = false; + } + + if 
(msc->error_irq_req) { + if (irq_is_percpu(irq)) { + msc->reenable_error_ppi = 0; + free_percpu_irq(irq, msc->error_dev_id); + } else { + devm_free_irq(&msc->pdev->dev, irq, msc); + } + msc->error_irq_req = false; + } + mutex_unlock(&msc->error_irq_lock); + } +} + static void mpam_enable_once(void) { + int err; + /* * Once the cpuhp callbacks have been changed, mpam_partid_max can no * longer change. @@ -1327,9 +1588,26 @@ static void mpam_enable_once(void) partid_max_published = true; spin_unlock(&partid_max_lock); + /* + * If all the MSC have been probed, enabling the IRQs happens next. + * That involves cross-calling to a CPU that can reach the MSC, and + * the locks must be taken in this order: + */ + cpus_read_lock(); mutex_lock(&mpam_list_lock); mpam_enable_merge_features(&mpam_classes); + + err = mpam_register_irqs(); + mutex_unlock(&mpam_list_lock); + cpus_read_unlock(); + + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + mpam_disable_reason = "Failed to enable."; + schedule_work(&mpam_broken_work); + return; + } mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -1397,6 +1675,8 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + mpam_unregister_irqs(); + idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, srcu_read_lock_held(&mpam_srcu)) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index dec485cd8a91..fa9d9a176a54 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -46,6 +46,11 @@ struct mpam_msc { enum mpam_msc_iface iface; u32 nrdy_usec; cpumask_t accessibility; + bool has_extd_esr; + + int reenable_error_ppi; + struct mpam_msc * __percpu *error_dev_id; + atomic_t online_refs; /* @@ -59,6 +64,14 @@ struct mpam_msc { unsigned long ris_idxs; u32 ris_max; + /* + * error_irq_lock is taken when registering/unregistering the error + * interrupt and maniupulating the 
below flags. + */ + struct mutex error_irq_lock; + bool error_irq_req; + bool error_irq_hw_enabled; + /* mpam_msc_ris of this component */ struct list_head ris; -- Gitee From b48d92bf3e83bac7d98cad2cfa9c7463b53daee2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:52 +0000 Subject: [PATCH 036/124] arm_mpam: Use a static key to indicate when mpam is enabled ANBZ: #31060 commit 3796f75aa7958d26b93a2508de5fc1e0b2f8a853 upstream. Once all the MSC have been probed, the system wide usable number of PARTID is known and the configuration arrays can be allocated. After this point, checking all the MSC have been probed is pointless, and the cpuhp callbacks should restore the configuration, instead of just resetting the MSC. Add a static key to enable this behaviour. This will also allow MPAM to be disabled in response to an error, and the architecture code to enable/disable the context switch of the MPAM system registers. Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 7d474ebc5c27..27ab754240f5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,6 +29,8 @@ #include "mpam_internal.h" +DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ + /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, @@ -936,6 +938,9 @@ static int mpam_discovery_cpu_online(unsigned int cpu) struct mpam_msc *msc; bool new_device_probed = false; + if (mpam_is_enabled()) + return 0; + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -1475,6 +1480,10 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) /* Disable this interrupt. */ mpam_disable_msc_ecr(msc); + /* Are we racing with the thread disabling MPAM? */ + if (!mpam_is_enabled()) + return IRQ_HANDLED; + /* * Schedule the teardown work. Don't use a threaded IRQ as we can't * unregister the interrupt from the threaded part of the handler. @@ -1609,6 +1618,7 @@ static void mpam_enable_once(void) return; } + static_branch_enable(&mpam_enabled); mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -1675,6 +1685,8 @@ void mpam_disable(struct work_struct *ignored) } mutex_unlock(&mpam_cpuhp_state_lock); + static_branch_disable(&mpam_enabled); + mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index fa9d9a176a54..93a629f6e15a 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,13 @@ struct platform_device; +DECLARE_STATIC_KEY_FALSE(mpam_enabled); + +static inline bool mpam_is_enabled(void) +{ + return static_branch_likely(&mpam_enabled); +} + /* * Structures protected by SRCU may not be freed for a surprising amount of * time (especially if perf is running). 
To ensure the MPAM error interrupt can -- Gitee From c81de68c8d2cb1e5f78bd4b2cbfc13f3b76ff542 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:53 +0000 Subject: [PATCH 037/124] arm_mpam: Allow configuration to be applied and restored during cpu online ANBZ: #31060 commit 09b89d2a72f37b078198cbb09d5b9e13ba9d68b9 upstream. When CPUs come online the MSC's original configuration should be restored. Add struct mpam_config to hold the configuration. For each component, this has a bitmap of features that have been changed from the reset values. The mpam_config is also used on RIS reset where all bits are set to ensure all features are reset. Once the maximum partid is known, allocate a configuration array for each component, and reprogram each RIS configuration from this. CC: Dave Martin Signed-off-by: James Morse Cc: Fujitsu Fujitsu Cc: Peter Newman peternewman@google.com Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 288 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 27 +++ 2 files changed, 290 insertions(+), 25 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 27ab754240f5..d64f81dcb4d4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -145,6 +145,16 @@ static void mpam_free_garbage(void) } } +/* + * Once mpam is enabled, new requestors cannot further reduce the available + * partid. Assert that the size is fixed, and new requestors will be turned + * away. 
+ */ +static void mpam_assert_partid_sizes_fixed(void) +{ + WARN_ON_ONCE(!partid_max_published); +} + static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) { WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); @@ -338,12 +348,16 @@ mpam_component_alloc(struct mpam_class *class, int id) return comp; } +static void __destroy_component_cfg(struct mpam_component *comp); + static void mpam_component_destroy(struct mpam_component *comp) { struct mpam_class *class = comp->class; lockdep_assert_held(&mpam_list_lock); + __destroy_component_cfg(comp); + list_del_rcu(&comp->class_list); add_to_garbage(comp); @@ -819,31 +833,57 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } -static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) +/* Called via IPI. Call while holding an SRCU reference */ +static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) { struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; - WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); - mutex_lock(&msc->part_sel_lock); __mpam_part_sel(ris->ris_idx, partid, msc); - if (mpam_has_feature(mpam_feat_cpor_part, rprops)) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + if (mpam_has_feature(mpam_feat_cpor_part, rprops) && + mpam_has_feature(mpam_feat_cpor_part, cfg)) { + if (cfg->reset_cpbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + else + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + } - if (mpam_has_feature(mpam_feat_mbw_part, rprops)) - mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + if (mpam_has_feature(mpam_feat_mbw_part, rprops) && + mpam_has_feature(mpam_feat_mbw_part, cfg)) { + if (cfg->reset_mbw_pbm) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + else + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + } - if (mpam_has_feature(mpam_feat_mbw_min, 
rprops)) + if (mpam_has_feature(mpam_feat_mbw_min, rprops) && + mpam_has_feature(mpam_feat_mbw_min, cfg)) mpam_write_partsel_reg(msc, MBW_MIN, 0); - if (mpam_has_feature(mpam_feat_mbw_max, rprops)) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + if (mpam_has_feature(mpam_feat_mbw_max, rprops) && + mpam_has_feature(mpam_feat_mbw_max, cfg)) { + if (cfg->reset_mbw_max) + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + else + mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + } mutex_unlock(&msc->part_sel_lock); } +static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) +{ + *reset_cfg = (struct mpam_config) { + .reset_cpbm = true, + .reset_mbw_pbm = true, + .reset_mbw_max = true, + }; + bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); +} + /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. @@ -851,16 +891,19 @@ static void mpam_reset_ris_partid(struct mpam_msc_ris *ris, u16 partid) static int mpam_reset_ris(void *arg) { u16 partid, partid_max; + struct mpam_config reset_cfg; struct mpam_msc_ris *ris = arg; if (ris->in_reset_state) return 0; + mpam_init_reset_cfg(&reset_cfg); + spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); for (partid = 0; partid <= partid_max; partid++) - mpam_reset_ris_partid(ris, partid); + mpam_reprogram_ris_partid(ris, partid, &reset_cfg); return 0; } @@ -889,19 +932,58 @@ static int mpam_touch_msc(struct mpam_msc *msc, int (*fn)(void *a), void *arg) return smp_call_on_cpu(mpam_get_msc_preferred_cpu(msc), fn, arg, true); } -static void mpam_reset_msc(struct mpam_msc *msc, bool online) +struct mpam_write_config_arg { + struct mpam_msc_ris *ris; + struct mpam_component *comp; + u16 partid; +}; + +static int __write_config(void *arg) +{ + struct mpam_write_config_arg *c = arg; + + mpam_reprogram_ris_partid(c->ris, c->partid, &c->comp->cfg[c->partid]); + + return 0; +} + +static void 
mpam_reprogram_msc(struct mpam_msc *msc) { + u16 partid; + bool reset; + struct mpam_config *cfg; struct mpam_msc_ris *ris; + struct mpam_write_config_arg arg; + + /* + * No lock for mpam_partid_max as partid_max_published has been + * set by mpam_enabled(), so the values can no longer change. + */ + mpam_assert_partid_sizes_fixed(); - list_for_each_entry_srcu(ris, &msc->ris, msc_list, srcu_read_lock_held(&mpam_srcu)) { - mpam_touch_msc(msc, &mpam_reset_ris, ris); + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_is_enabled() && !ris->in_reset_state) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + ris->in_reset_state = true; + continue; + } - /* - * Set in_reset_state when coming online. The reset state - * for non-zero partid may be lost while the CPUs are offline. - */ - ris->in_reset_state = online; + arg.comp = ris->vmsc->comp; + arg.ris = ris; + reset = true; + for (partid = 0; partid <= mpam_partid_max; partid++) { + cfg = &ris->vmsc->comp->cfg[partid]; + if (!bitmap_empty(cfg->features, MPAM_FEATURE_LAST)) + reset = false; + + arg.partid = partid; + mpam_touch_msc(msc, __write_config, &arg); + } + ris->in_reset_state = reset; } + mutex_unlock(&msc->cfg_lock); } static void _enable_percpu_irq(void *_irq) @@ -925,7 +1007,7 @@ static int mpam_cpu_online(unsigned int cpu) _enable_percpu_irq(&msc->reenable_error_ppi); if (atomic_fetch_inc(&msc->online_refs) == 0) - mpam_reset_msc(msc, true); + mpam_reprogram_msc(msc); } return 0; @@ -980,8 +1062,22 @@ static int mpam_cpu_offline(unsigned int cpu) if (msc->reenable_error_ppi) disable_percpu_irq(msc->reenable_error_ppi); - if (atomic_dec_and_test(&msc->online_refs)) - mpam_reset_msc(msc, false); + if (atomic_dec_and_test(&msc->online_refs)) { + struct mpam_msc_ris *ris; + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); 
+ + /* + * The reset state for non-zero partid may be + * lost while the CPUs are offline. + */ + ris->in_reset_state = false; + } + mutex_unlock(&msc->cfg_lock); + } } return 0; @@ -1125,6 +1221,11 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) err = devm_mutex_init(dev, &msc->error_irq_lock); if (err) return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->cfg_lock); + if (err) + return ERR_PTR(err); + mpam_mon_sel_lock_init(msc); msc->id = pdev->id; msc->pdev = pdev; @@ -1585,6 +1686,72 @@ static void mpam_unregister_irqs(void) } } +static void __destroy_component_cfg(struct mpam_component *comp) +{ + add_to_garbage(comp->cfg); +} + +static void mpam_reset_component_cfg(struct mpam_component *comp) +{ + int i; + struct mpam_props *cprops = &comp->class->props; + + mpam_assert_partid_sizes_fixed(); + + if (!comp->cfg) + return; + + for (i = 0; i <= mpam_partid_max; i++) { + comp->cfg[i] = (struct mpam_config) {}; + if (cprops->cpbm_wd) + comp->cfg[i].cpbm = GENMASK(cprops->cpbm_wd - 1, 0); + if (cprops->mbw_pbm_bits) + comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); + if (cprops->bwa_wd) + comp->cfg[i].mbw_max = GENMASK(15, 16 - cprops->bwa_wd); + } +} + +static int __allocate_component_cfg(struct mpam_component *comp) +{ + mpam_assert_partid_sizes_fixed(); + + if (comp->cfg) + return 0; + + comp->cfg = kcalloc(mpam_partid_max + 1, sizeof(*comp->cfg), GFP_KERNEL); + if (!comp->cfg) + return -ENOMEM; + + /* + * The array is free()d in one go, so only cfg[0]'s structure needs + * to be initialised. 
+ */ + init_garbage(&comp->cfg[0].garbage); + + mpam_reset_component_cfg(comp); + + return 0; +} + +static int mpam_allocate_config(void) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + list_for_each_entry(comp, &class->components, class_list) { + int err = __allocate_component_cfg(comp); + if (err) + return err; + } + } + + return 0; +} + static void mpam_enable_once(void) { int err; @@ -1604,15 +1771,25 @@ static void mpam_enable_once(void) */ cpus_read_lock(); mutex_lock(&mpam_list_lock); - mpam_enable_merge_features(&mpam_classes); + do { + mpam_enable_merge_features(&mpam_classes); - err = mpam_register_irqs(); + err = mpam_register_irqs(); + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + break; + } + err = mpam_allocate_config(); + if (err) { + pr_err("Failed to allocate configuration arrays.\n"); + break; + } + } while (0); mutex_unlock(&mpam_list_lock); cpus_read_unlock(); if (err) { - pr_warn("Failed to register irqs: %d\n", err); mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); return; @@ -1632,6 +1809,9 @@ static void mpam_reset_component_locked(struct mpam_component *comp) struct mpam_vmsc *vmsc; lockdep_assert_cpus_held(); + mpam_assert_partid_sizes_fixed(); + + mpam_reset_component_cfg(comp); guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, @@ -1732,6 +1912,64 @@ void mpam_enable(struct work_struct *work) mpam_enable_once(); } +#define maybe_update_config(cfg, feature, newcfg, member, changes) do { \ + if (mpam_has_feature(feature, newcfg) && \ + (newcfg)->member != (cfg)->member) { \ + (cfg)->member = (newcfg)->member; \ + mpam_set_feature(feature, cfg); \ + \ + (changes) = true; \ + } \ +} while (0) + +static bool mpam_update_config(struct mpam_config *cfg, + const struct mpam_config *newcfg) +{ + bool has_changes = false; + + maybe_update_config(cfg, 
mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + + return has_changes; +} + +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg) +{ + struct mpam_write_config_arg arg; + struct mpam_msc_ris *ris; + struct mpam_vmsc *vmsc; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + /* Don't pass in the current config! */ + WARN_ON_ONCE(&comp->cfg[partid] == cfg); + + if (!mpam_update_config(&comp->cfg[partid], cfg)) + return 0; + + arg.comp = comp; + arg.partid = partid; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + msc = vmsc->msc; + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg.ris = ris; + mpam_touch_msc(msc, __write_config, &arg); + } + mutex_unlock(&msc->cfg_lock); + } + + return 0; +} + static int __init mpam_msc_driver_init(void) { if (!system_supports_mpam()) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 93a629f6e15a..b8fdbd7ab7a5 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -91,6 +91,9 @@ struct mpam_msc { */ struct mutex part_sel_lock; + /* cfg_lock protects the msc configuration. */ + struct mutex cfg_lock; + /* * mon_sel_lock protects access to the MSC hardware registers that are * affected by MPAMCFG_MON_SEL, and the mbwu_state. @@ -182,6 +185,21 @@ struct mpam_class { struct mpam_garbage garbage; }; +struct mpam_config { + /* Which configuration values are valid. 
*/ + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u32 cpbm; + u32 mbw_pbm; + u16 mbw_max; + + bool reset_cpbm; + bool reset_mbw_pbm; + bool reset_mbw_max; + + struct mpam_garbage garbage; +}; + struct mpam_component { u32 comp_id; @@ -190,6 +208,12 @@ struct mpam_component { cpumask_t affinity; + /* + * Array of configuration values, indexed by partid. + * Read from cpuhp callbacks, hold the cpuhp lock when writing. + */ + struct mpam_config *cfg; + /* member of mpam_class:components */ struct list_head class_list; @@ -249,6 +273,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg); + int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); -- Gitee From bf4a0dde8e75b3086503989d8ca73cd7baf28db5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:54 +0000 Subject: [PATCH 038/124] arm_mpam: Probe and reset the rest of the features ANBZ: #31060 commit 880df85d8673f8e2395f139d3618661366e5d4d8 upstream. MPAM supports more features than are going to be exposed to resctrl. For partid other than 0, the reset values of these controls isn't known. Discover the rest of the features so they can be reset to avoid any side effects when resctrl is in use. PARTID narrowing allows MSC/RIS to support less configuration space than is usable. If this feature is found on a class of device we are likely to use, then reduce the partid_max to make it usable. This allows us to map a PARTID to itself. 
CC: Rohit Mathew CC: Zeng Heng CC: Dave Martin Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 188 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 18 +++ 2 files changed, 206 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d64f81dcb4d4..9533d2d04ff3 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -259,6 +259,15 @@ static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) __mpam_part_sel_raw(partsel, msc); } +static void __mpam_intpart_sel(u8 ris_idx, u16 intpartid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, intpartid) | + MPAMCFG_PART_SEL_INTERNAL; + + __mpam_part_sel_raw(partsel, msc); +} + int mpam_register_requestor(u16 partid_max, u8 pmg_max) { guard(spinlock)(&partid_max_lock); @@ -656,10 +665,34 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) struct mpam_msc *msc = ris->vmsc->msc; struct device *dev = &msc->pdev->dev; struct mpam_props *props = &ris->props; + struct mpam_class *class = ris->vmsc->comp->class; lockdep_assert_held(&msc->probe_lock); lockdep_assert_held(&msc->part_sel_lock); + /* Cache Capacity Partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { + u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + mpam_set_feature(mpam_feat_cmax_softlim, props); + + if (props->cmax_wd && + 
!FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmax, props); + + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmin, props); + + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + if (props->cassoc_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cassoc, props); + } + /* Cache Portion partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); @@ -682,6 +715,31 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) + mpam_set_feature(mpam_feat_mbw_min, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) + mpam_set_feature(mpam_feat_mbw_prop, props); + } + + /* Priority partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_PRI_PART, ris->idr)) { + u32 pri_features = mpam_read_partsel_reg(msc, PRI_IDR); + + props->intpri_wd = FIELD_GET(MPAMF_PRI_IDR_INTPRI_WD, pri_features); + if (props->intpri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_INTPRI, pri_features)) { + mpam_set_feature(mpam_feat_intpri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_INTPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_intpri_part_0_low, props); + } + + props->dspri_wd = FIELD_GET(MPAMF_PRI_IDR_DSPRI_WD, pri_features); + if (props->dspri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_DSPRI, pri_features)) { + mpam_set_feature(mpam_feat_dspri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_DSPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_dspri_part_0_low, props); + } } /* Performance Monitoring */ @@ -706,6 +764,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) 
mpam_set_feature(mpam_feat_msmon_csu, props); + if (FIELD_GET(MPAMF_CSUMON_IDR_HAS_XCL, csumonidr)) + mpam_set_feature(mpam_feat_msmon_csu_xcl, props); + /* Is NRDY hardware managed? */ hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); if (hw_managed) @@ -727,6 +788,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) if (props->num_mbwu_mon) mpam_set_feature(mpam_feat_msmon_mbwu, props); + if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + /* Is NRDY hardware managed? */ hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); if (hw_managed) @@ -738,6 +802,21 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) */ } } + + /* + * RIS with PARTID narrowing don't have enough storage for one + * configuration per PARTID. If these are in a class we could use, + * reduce the supported partid_max to match the number of intpartid. + * If the class is unknown, just ignore it. + */ + if (FIELD_GET(MPAMF_IDR_HAS_PARTID_NRW, ris->idr) && + class->type != MPAM_CLASS_UNKNOWN) { + u32 nrwidr = mpam_read_partsel_reg(msc, PARTID_NRW_IDR); + u16 partid_max = FIELD_GET(MPAMF_PARTID_NRW_IDR_INTPARTID_MAX, nrwidr); + + mpam_set_feature(mpam_feat_partid_nrw, props); + msc->partid_max = min(msc->partid_max, partid_max); + } } static int mpam_msc_hw_probe(struct mpam_msc *msc) @@ -837,12 +916,28 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) { + u32 pri_val = 0; + u16 cmax = MPAMCFG_CMAX_CMAX; struct mpam_msc *msc = ris->vmsc->msc; struct mpam_props *rprops = &ris->props; + u16 dspri = GENMASK(rprops->dspri_wd, 0); + u16 intpri = GENMASK(rprops->intpri_wd, 0); mutex_lock(&msc->part_sel_lock); __mpam_part_sel(ris->ris_idx, partid, msc); + if (mpam_has_feature(mpam_feat_partid_nrw, rprops)) { + /* Update the intpartid mapping */ + mpam_write_partsel_reg(msc, INTPARTID, + 
MPAMCFG_INTPARTID_INTERNAL | partid); + + /* + * Then switch to the 'internal' partid to update the + * configuration. + */ + __mpam_intpart_sel(ris->ris_idx, partid, msc); + } + if (mpam_has_feature(mpam_feat_cpor_part, rprops) && mpam_has_feature(mpam_feat_cpor_part, cfg)) { if (cfg->reset_cpbm) @@ -871,6 +966,35 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); } + if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && + mpam_has_feature(mpam_feat_mbw_prop, cfg)) + mpam_write_partsel_reg(msc, MBW_PROP, 0); + + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) + mpam_write_partsel_reg(msc, CMAX, cmax); + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) + mpam_write_partsel_reg(msc, CMIN, 0); + + if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) + mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); + + if (mpam_has_feature(mpam_feat_intpri_part, rprops) || + mpam_has_feature(mpam_feat_dspri_part, rprops)) { + /* aces high? */ + if (!mpam_has_feature(mpam_feat_intpri_part_0_low, rprops)) + intpri = 0; + if (!mpam_has_feature(mpam_feat_dspri_part_0_low, rprops)) + dspri = 0; + + if (mpam_has_feature(mpam_feat_intpri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_INTPRI, intpri); + if (mpam_has_feature(mpam_feat_dspri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_DSPRI, dspri); + + mpam_write_partsel_reg(msc, PRI, pri_val); + } + mutex_unlock(&msc->part_sel_lock); } @@ -1312,6 +1436,18 @@ static bool mpam_has_bwa_wd_feature(struct mpam_props *props) return true; if (mpam_has_feature(mpam_feat_mbw_max, props)) return true; + if (mpam_has_feature(mpam_feat_mbw_prop, props)) + return true; + return false; +} + +/* Any of these features mean the CMAX_WD field is valid. 
*/ +static bool mpam_has_cmax_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) + return true; + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) + return true; return false; } @@ -1370,6 +1506,23 @@ static void __props_mismatch(struct mpam_props *parent, parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); } + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { + parent->cmax_wd = child->cmax_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, + cmax_wd, alias)) { + pr_debug("%s took the min cmax_wd\n", __func__); + parent->cmax_wd = min(parent->cmax_wd, child->cmax_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cmax_cassoc, alias)) { + parent->cassoc_wd = child->cassoc_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cmax_cassoc, + cassoc_wd, alias)) { + pr_debug("%s cleared cassoc_wd\n", __func__); + mpam_clear_feature(mpam_feat_cmax_cassoc, parent); + parent->cassoc_wd = 0; + } + /* For num properties, take the minimum */ if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { parent->num_csu_mon = child->num_csu_mon; @@ -1389,6 +1542,41 @@ static void __props_mismatch(struct mpam_props *parent, child->num_mbwu_mon); } + if (CAN_MERGE_FEAT(parent, child, mpam_feat_intpri_part, alias)) { + parent->intpri_wd = child->intpri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_intpri_part, + intpri_wd, alias)) { + pr_debug("%s took the min intpri_wd\n", __func__); + parent->intpri_wd = min(parent->intpri_wd, child->intpri_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_dspri_part, alias)) { + parent->dspri_wd = child->dspri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_dspri_part, + dspri_wd, alias)) { + pr_debug("%s took the min dspri_wd\n", __func__); + parent->dspri_wd = min(parent->dspri_wd, child->dspri_wd); + } + + /* TODO: alias support for these two */ + /* {int,ds}pri may not have differing 0-low 
behaviour */ + if (mpam_has_feature(mpam_feat_intpri_part, parent) && + (!mpam_has_feature(mpam_feat_intpri_part, child) || + mpam_has_feature(mpam_feat_intpri_part_0_low, parent) != + mpam_has_feature(mpam_feat_intpri_part_0_low, child))) { + pr_debug("%s cleared intpri_part\n", __func__); + mpam_clear_feature(mpam_feat_intpri_part, parent); + mpam_clear_feature(mpam_feat_intpri_part_0_low, parent); + } + if (mpam_has_feature(mpam_feat_dspri_part, parent) && + (!mpam_has_feature(mpam_feat_dspri_part, child) || + mpam_has_feature(mpam_feat_dspri_part_0_low, parent) != + mpam_has_feature(mpam_feat_dspri_part_0_low, child))) { + pr_debug("%s cleared dspri_part\n", __func__); + mpam_clear_feature(mpam_feat_dspri_part, parent); + mpam_clear_feature(mpam_feat_dspri_part_0_low, parent); + } + if (alias) { /* Merge features for aliased resources */ bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index b8fdbd7ab7a5..618e5355a95e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -143,14 +143,28 @@ static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) /* Bits for mpam features bitmaps */ enum mpam_device_features { mpam_feat_cpor_part, + mpam_feat_cmax_softlim, + mpam_feat_cmax_cmax, + mpam_feat_cmax_cmin, + mpam_feat_cmax_cassoc, mpam_feat_mbw_part, mpam_feat_mbw_min, mpam_feat_mbw_max, + mpam_feat_mbw_prop, + mpam_feat_intpri_part, + mpam_feat_intpri_part_0_low, + mpam_feat_dspri_part, + mpam_feat_dspri_part_0_low, mpam_feat_msmon, mpam_feat_msmon_csu, + mpam_feat_msmon_csu_capture, + mpam_feat_msmon_csu_xcl, mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_capture, + mpam_feat_msmon_mbwu_rwbw, mpam_feat_msmon_mbwu_hw_nrdy, + mpam_feat_partid_nrw, MPAM_FEATURE_LAST }; @@ -160,6 +174,10 @@ struct mpam_props { u16 cpbm_wd; u16 mbw_pbm_bits; u16 bwa_wd; + u16 cmax_wd; + u16 cassoc_wd; + u16 
intpri_wd; + u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; }; -- Gitee From b8f60e165e4c2a5405261094e88c8cc07df0d40d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:55 +0000 Subject: [PATCH 039/124] arm_mpam: Add helpers to allocate monitors ANBZ: #31060 commit c891bae66423bc69a680ca1de34940132e2c8ace upstream. MPAM's MSC support a number of monitors, each of which supports bandwidth counters, or cache-storage-utilisation counters. To use a counter, a monitor needs to be configured. Add helpers to allocate and free CSU or MBWU monitors. Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 2 ++ drivers/resctrl/mpam_internal.h | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9533d2d04ff3..9a2c1eb6c332 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -305,6 +305,8 @@ mpam_class_alloc(u8 level_idx, enum mpam_class_types type) class->level = level_idx; class->type = type; INIT_LIST_HEAD_RCU(&class->classes_list); + ida_init(&class->ida_csu_mon); + ida_init(&class->ida_mbwu_mon); list_add_rcu(&class->classes_list, &mpam_classes); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 618e5355a95e..8bbc67df6d97 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -200,6 +200,9 @@ struct mpam_class { /* member of mpam_classes */ struct list_head classes_list; + struct ida ida_csu_mon; + struct ida ida_mbwu_mon; + struct mpam_garbage garbage; }; @@ -279,6 +282,38 @@ struct 
mpam_msc_ris { struct mpam_garbage garbage; }; +static inline int mpam_alloc_csu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_csu_mon, cprops->num_csu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_csu_mon(struct mpam_class *class, int csu_mon) +{ + ida_free(&class->ida_csu_mon, csu_mon); +} + +static inline int mpam_alloc_mbwu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_mbwu_mon, cprops->num_mbwu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_mbwu_mon(struct mpam_class *class, int mbwu_mon) +{ + ida_free(&class->ida_mbwu_mon, mbwu_mon); +} + /* List of all classes - protected by srcu*/ extern struct srcu_struct mpam_srcu; extern struct list_head mpam_classes; -- Gitee From 14219b8d66b7e58ed42be3fc3acd421d23e2327f Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:56 +0000 Subject: [PATCH 040/124] arm_mpam: Add mpam_msmon_read() to read monitor value ANBZ: #31060 commit 823e7c3712c584641b4ef890a8af34884c677197 upstream. Reading a monitor involves configuring what you want to monitor, and reading the value. Components made up of multiple MSC may need values from each MSC. MSCs may take time to configure, returning 'not ready'. The maximum 'not ready' time should have been provided by firmware. Add mpam_msmon_read() to hide all this. If (one of) the MSC returns not ready, then wait the full timeout value before trying again. 
CC: Shanker Donthineni Cc: Shaopeng Tan (Fujitsu) Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 235 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 19 +++ 2 files changed, 254 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9a2c1eb6c332..a4fedec87fae 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -886,6 +886,241 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) return 0; } +struct mon_read { + struct mpam_msc_ris *ris; + struct mon_cfg *ctx; + enum mpam_device_features type; + u64 *val; + int err; +}; + +static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mon_cfg *ctx = m->ctx; + + /* + * For CSU counters its implementation-defined what happens when not + * filtering by partid. 
+ */ + *ctl_val = MSMON_CFG_x_CTL_MATCH_PARTID; + + *flt_val = FIELD_PREP(MSMON_CFG_x_FLT_PARTID, ctx->partid); + + if (m->ctx->match_pmg) { + *ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + *flt_val |= FIELD_PREP(MSMON_CFG_x_FLT_PMG, ctx->pmg); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val |= MSMON_CFG_CSU_CTL_TYPE_CSU; + + if (mpam_has_feature(mpam_feat_msmon_csu_xcl, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_CSU_FLT_XCL, ctx->csu_exclude_clean); + + break; + case mpam_feat_msmon_mbwu: + *ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_MBWU_FLT_RWBW, ctx->opts); + + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); + break; + case mpam_feat_msmon_mbwu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +/* Remove values set by the hardware to prevent apparent mismatches. */ +static inline void clean_msmon_ctl_val(u32 *cur_ctl) +{ + *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; +} + +static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, + u32 flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + /* + * Write the ctl_val with the enable bit cleared, reset the counter, + * then enable counter. 
+ */ + switch (m->type) { + case mpam_feat_msmon_csu: + mpam_write_monsel_reg(msc, CFG_CSU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CSU, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + break; + case mpam_feat_msmon_mbwu: + mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + /* Counting monitors require NRDY to be reset by software */ + mpam_write_monsel_reg(msc, MBWU, 0); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static void __ris_msmon_read(void *arg) +{ + u64 now; + bool nrdy = false; + bool config_mismatch; + struct mon_read *m = arg; + struct mon_cfg *ctx = m->ctx; + struct mpam_msc_ris *ris = m->ris; + struct mpam_props *rprops = &ris->props; + struct mpam_msc *msc = m->ris->vmsc->msc; + u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; + + if (!mpam_mon_sel_lock(msc)) { + m->err = -EIO; + return; + } + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, ctx->mon) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + /* + * Read the existing configuration to avoid re-writing the same values. + * This saves waiting for 'nrdy' on subsequent reads. 
+ */ + read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); + clean_msmon_ctl_val(&cur_ctl); + gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); + config_mismatch = cur_flt != flt_val || + cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); + + if (config_mismatch) + write_msmon_ctl_flt_vals(m, ctl_val, flt_val); + + switch (m->type) { + case mpam_feat_msmon_csu: + now = mpam_read_monsel_reg(msc, CSU); + if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + break; + case mpam_feat_msmon_mbwu: + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + break; + default: + m->err = -EINVAL; + } + mpam_mon_sel_unlock(msc); + + if (nrdy) { + m->err = -EBUSY; + return; + } + + now = FIELD_GET(MSMON___VALUE, now); + *m->val += now; +} + +static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) +{ + int err, any_err = 0; + struct mpam_vmsc *vmsc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg->ris = ris; + + err = smp_call_function_any(&msc->accessibility, + __ris_msmon_read, arg, + true); + if (!err && arg->err) + err = arg->err; + + /* + * Save one error to be returned to the caller, but + * keep reading counters so that get reprogrammed. On + * platforms with NRDY this lets us wait once. 
+ */ + if (err) + any_err = err; + } + } + + return any_err; +} + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features type, u64 *val) +{ + int err; + struct mon_read arg; + u64 wait_jiffies = 0; + struct mpam_props *cprops = &comp->class->props; + + might_sleep(); + + if (!mpam_is_enabled()) + return -EIO; + + if (!mpam_has_feature(type, cprops)) + return -EOPNOTSUPP; + + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + if (err == -EBUSY && comp->class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec); + + while (wait_jiffies) + wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); + + if (err == -EBUSY) { + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + } + + return err; +} + static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) { u32 num_words, msb; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8bbc67df6d97..12f0a5b7f39e 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -186,6 +186,22 @@ struct mpam_props { #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) +/* The values for MSMON_CFG_MBWU_FLT.RWBW */ +enum mon_filter_options { + COUNT_BOTH = 0, + COUNT_WRITE = 1, + COUNT_READ = 2, +}; + +struct mon_cfg { + u16 mon; + u8 pmg; + bool match_pmg; + bool csu_exclude_clean; + u32 partid; + enum mon_filter_options opts; +}; + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -329,6 +345,9 @@ void mpam_disable(struct work_struct *work); int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features, u64 *val); + 
int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); -- Gitee From 11ab73dd957b3463f2bd58b98c20b1fcca845ae9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:57 +0000 Subject: [PATCH 041/124] arm_mpam: Track bandwidth counter state for power management ANBZ: #31060 commit 41e8a14950e1732af51cfec8fa09f8ded02a5ca9 upstream. Bandwidth counters need to run continuously to correctly reflect the bandwidth. Save the counter state when the hardware is reset due to CPU hotplug. Add struct mbwu_state to track the bandwidth counter. Support for tracking overflow with the same structure will be added in a subsequent commit. Cc: Zeng Heng Reviewed-by: Gavin Shan Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 126 +++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 21 +++++- 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index a4fedec87fae..91fc396f3bc7 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -993,6 +993,7 @@ static void __ris_msmon_read(void *arg) struct mon_read *m = arg; struct mon_cfg *ctx = m->ctx; struct mpam_msc_ris *ris = m->ris; + struct msmon_mbwu_state *mbwu_state; struct mpam_props *rprops = &ris->props; struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; @@ -1023,11 +1024,21 @@ static void __ris_msmon_read(void *arg) now = mpam_read_monsel_reg(msc, CSU); if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); break; case mpam_feat_msmon_mbwu: now = 
mpam_read_monsel_reg(msc, MBWU); if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + + if (nrdy) + break; + + mbwu_state = &ris->mbwu_state[ctx->mon]; + + /* Include bandwidth consumed before the last hardware reset */ + now += mbwu_state->correction; break; default: m->err = -EINVAL; @@ -1039,7 +1050,6 @@ static void __ris_msmon_read(void *arg) return; } - now = FIELD_GET(MSMON___VALUE, now); *m->val += now; } @@ -1235,6 +1245,67 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mutex_unlock(&msc->part_sel_lock); } +/* Call with msc cfg_lock held */ +static int mpam_restore_mbwu_state(void *_ris) +{ + int i; + struct mon_read mwbu_arg; + struct mpam_msc_ris *ris = _ris; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + if (ris->mbwu_state[i].enabled) { + mwbu_arg.ris = ris; + mwbu_arg.ctx = &ris->mbwu_state[i].cfg; + mwbu_arg.type = mpam_feat_msmon_mbwu; + + __ris_msmon_read(&mwbu_arg); + } + } + + return 0; +} + +/* Call with MSC cfg_lock held */ +static int mpam_save_mbwu_state(void *arg) +{ + int i; + u64 val; + struct mon_cfg *cfg; + u32 cur_flt, cur_ctl, mon_sel; + struct mpam_msc_ris *ris = arg; + struct msmon_mbwu_state *mbwu_state; + struct mpam_msc *msc = ris->vmsc->msc; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + mbwu_state = &ris->mbwu_state[i]; + cfg = &mbwu_state->cfg; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return -EIO; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + cur_flt = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); + + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + + cfg->mon = i; + cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); + cfg->match_pmg = 
FIELD_GET(MSMON_CFG_x_CTL_MATCH_PMG, cur_ctl); + cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); + mbwu_state->correction += val; + mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); + mpam_mon_sel_unlock(msc); + } + + return 0; +} + static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) { *reset_cfg = (struct mpam_config) { @@ -1343,6 +1414,9 @@ static void mpam_reprogram_msc(struct mpam_msc *msc) mpam_touch_msc(msc, __write_config, &arg); } ris->in_reset_state = reset; + + if (mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + mpam_touch_msc(msc, &mpam_restore_mbwu_state, ris); } mutex_unlock(&msc->cfg_lock); } @@ -1436,6 +1510,9 @@ static int mpam_cpu_offline(unsigned int cpu) * lost while the CPUs are offline. */ ris->in_reset_state = false; + + if (mpam_is_enabled()) + mpam_touch_msc(msc, &mpam_save_mbwu_state, ris); } mutex_unlock(&msc->cfg_lock); } @@ -2113,7 +2190,22 @@ static void mpam_unregister_irqs(void) static void __destroy_component_cfg(struct mpam_component *comp) { + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + add_to_garbage(comp->cfg); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + msc = vmsc->msc; + + if (mpam_mon_sel_lock(msc)) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) + add_to_garbage(ris->mbwu_state); + mpam_mon_sel_unlock(msc); + } + } } static void mpam_reset_component_cfg(struct mpam_component *comp) @@ -2139,6 +2231,8 @@ static void mpam_reset_component_cfg(struct mpam_component *comp) static int __allocate_component_cfg(struct mpam_component *comp) { + struct mpam_vmsc *vmsc; + mpam_assert_partid_sizes_fixed(); if (comp->cfg) @@ -2156,6 +2250,36 @@ static int __allocate_component_cfg(struct mpam_component *comp) mpam_reset_component_cfg(comp); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + struct mpam_msc *msc; + struct mpam_msc_ris *ris; + struct msmon_mbwu_state *mbwu_state; + + if 
(!vmsc->props.num_mbwu_mon) + continue; + + msc = vmsc->msc; + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + if (!ris->props.num_mbwu_mon) + continue; + + mbwu_state = kcalloc(ris->props.num_mbwu_mon, + sizeof(*ris->mbwu_state), + GFP_KERNEL); + if (!mbwu_state) { + __destroy_component_cfg(comp); + return -ENOMEM; + } + + init_garbage(&mbwu_state[0].garbage); + + if (mpam_mon_sel_lock(msc)) { + ris->mbwu_state = mbwu_state; + mpam_mon_sel_unlock(msc); + } + } + } + return 0; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 12f0a5b7f39e..12ce80bc7ff7 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -91,7 +91,10 @@ struct mpam_msc { */ struct mutex part_sel_lock; - /* cfg_lock protects the msc configuration. */ + /* + * cfg_lock protects the msc configuration and guards against mbwu_state + * save and restore racing. + */ struct mutex cfg_lock; /* @@ -202,6 +205,19 @@ struct mon_cfg { enum mon_filter_options opts; }; +/* Changes to msmon_mbwu_state are protected by the msc's mon_sel_lock. */ +struct msmon_mbwu_state { + bool enabled; + struct mon_cfg cfg; + + /* + * The value to add to the new reading to account for power management. + */ + u64 correction; + + struct mpam_garbage garbage; +}; + struct mpam_class { /* mpam_components in this class */ struct list_head components; @@ -295,6 +311,9 @@ struct mpam_msc_ris { /* parent: */ struct mpam_vmsc *vmsc; + /* msmon mbwu configuration is preserved over reset */ + struct msmon_mbwu_state *mbwu_state; + struct mpam_garbage garbage; }; -- Gitee From 6fc350fa1ae612d37c36a32193d85d35f670aa44 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:58 +0000 Subject: [PATCH 042/124] arm_mpam: Consider overflow in bandwidth counter state ANBZ: #31060 commit b35363793291e36c91d4a5b62d7ae7079c70d826 upstream. 
Use the overflow status bit to track overflow on each bandwidth counter read and add the counter size to the correction when overflow is detected. This assumes that only a single overflow has occurred since the last read of the counter. Overflow interrupts, on hardware that supports them could be used to remove this limitation. Cc: Zeng Heng Reviewed-by: Gavin Shan Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 24 ++++++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 3 ++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 91fc396f3bc7..d3681fe750e4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -985,11 +985,18 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } +static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +{ + /* TODO: scaling, and long counters */ + return BIT_ULL(hweight_long(MSMON___VALUE)); +} + static void __ris_msmon_read(void *arg) { u64 now; bool nrdy = false; bool config_mismatch; + bool overflow; struct mon_read *m = arg; struct mon_cfg *ctx = m->ctx; struct mpam_msc_ris *ris = m->ris; @@ -1011,13 +1018,20 @@ static void __ris_msmon_read(void *arg) * This saves waiting for 'nrdy' on subsequent reads. 
*/ read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); + overflow = cur_ctl & MSMON_CFG_x_CTL_OFLOW_STATUS; + clean_msmon_ctl_val(&cur_ctl); gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); config_mismatch = cur_flt != flt_val || cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); - if (config_mismatch) + if (config_mismatch) { write_msmon_ctl_flt_vals(m, ctl_val, flt_val); + overflow = false; + } else if (overflow) { + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, + cur_ctl & ~MSMON_CFG_x_CTL_OFLOW_STATUS); + } switch (m->type) { case mpam_feat_msmon_csu: @@ -1037,7 +1051,13 @@ static void __ris_msmon_read(void *arg) mbwu_state = &ris->mbwu_state[ctx->mon]; - /* Include bandwidth consumed before the last hardware reset */ + if (overflow) + mbwu_state->correction += mpam_msmon_overflow_val(m->type); + + /* + * Include bandwidth consumed before the last hardware reset and + * a counter size increment for each overflow. + */ now += mbwu_state->correction; break; default: diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 12ce80bc7ff7..218e2f48c7bf 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -211,7 +211,8 @@ struct msmon_mbwu_state { struct mon_cfg cfg; /* - * The value to add to the new reading to account for power management. + * The value to add to the new reading to account for power management, + * and overflow. */ u64 correction; -- Gitee From a5b4f502e05c0ed1847e1b10ed48c0ca0a5ea399 Mon Sep 17 00:00:00 2001 From: Rohit Mathew Date: Wed, 19 Nov 2025 12:22:59 +0000 Subject: [PATCH 043/124] arm_mpam: Probe for long/lwd mbwu counters ANBZ: #31060 commit fdc29a141d6364645509cb20129cba1f84e4c10f upstream. mpam v0.1 and versions above v1.0 support optional long counter for memory bandwidth monitoring. The MPAMF_MBWUMON_IDR register has fields indicating support for long counters. Probe these feature bits. 
The mpam_feat_msmon_mbwu feature is used to indicate that bandwidth monitors are supported, instead of muddling this with which size of bandwidth monitors, add an explicit 31 bit counter feature. Signed-off-by: Rohit Mathew [ morse: Added 31bit counter feature to simplify later logic ] Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 35 ++++++++++++++++++++++----------- drivers/resctrl/mpam_internal.h | 3 +++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index d3681fe750e4..c12e99dc275c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -783,25 +783,36 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); } if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { - bool hw_managed; + bool has_long, hw_managed; u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); - if (props->num_mbwu_mon) + if (props->num_mbwu_mon) { mpam_set_feature(mpam_feat_msmon_mbwu, props); - if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) - mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); - /* Is NRDY hardware managed? 
*/ - hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); - if (hw_managed) - mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + has_long = FIELD_GET(MPAMF_MBWUMON_IDR_HAS_LONG, mbwumon_idr); + if (has_long) { + if (FIELD_GET(MPAMF_MBWUMON_IDR_LWD, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_63counter, props); + else + mpam_set_feature(mpam_feat_msmon_mbwu_44counter, props); + } else { + mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); + } - /* - * Don't warn about any missing firmware property for - * MBWU NRDY - it doesn't make any sense! - */ + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + + /* + * Don't warn about any missing firmware property for + * MBWU NRDY - it doesn't make any sense! + */ + } } } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 218e2f48c7bf..693a315c4710 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -164,6 +164,9 @@ enum mpam_device_features { mpam_feat_msmon_csu_xcl, mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_31counter, + mpam_feat_msmon_mbwu_44counter, + mpam_feat_msmon_mbwu_63counter, mpam_feat_msmon_mbwu_capture, mpam_feat_msmon_mbwu_rwbw, mpam_feat_msmon_mbwu_hw_nrdy, -- Gitee From dd0a4770b01ed5568f30bca5c30ad849d154ee49 Mon Sep 17 00:00:00 2001 From: Rohit Mathew Date: Wed, 19 Nov 2025 12:23:00 +0000 Subject: [PATCH 044/124] arm_mpam: Use long MBWU counters if supported ANBZ: #31060 commit 9e5afb7c32830bcd123976a7729ef4e2dff0cd77 upstream. Now that the larger counter sizes are probed, make use of them. Callers of mpam_msmon_read() may not know (or care!) about the different counter sizes. Allow them to specify mpam_feat_msmon_mbwu and have the driver pick the counter to use. Only 32bit accesses to the MSC are required to be supported by the spec, but these registers are 64bits. 
The lower half may overflow into the higher half between two 32bit reads. To avoid this, use a helper that reads the top half multiple times to check for overflow. Signed-off-by: Rohit Mathew [morse: merged multiple patches from Rohit, added explicit counter selection ] Signed-off-by: James Morse Cc: Peter Newman Reviewed-by: Ben Horgan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 145 ++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 19 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index c12e99dc275c..0a1c89688adc 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -905,6 +905,50 @@ struct mon_read { int err; }; +static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) +{ + return (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, &ris->props) || + mpam_has_feature(mpam_feat_msmon_mbwu_44counter, &ris->props)); +} + +static u64 mpam_msc_read_mbwu_l(struct mpam_msc *msc) +{ + int retry = 3; + u32 mbwu_l_low; + u64 mbwu_l_high1, mbwu_l_high2; + + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + do { + mbwu_l_high1 = mbwu_l_high2; + mbwu_l_low = __mpam_read_reg(msc, MSMON_MBWU_L); + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + + retry--; + } while (mbwu_l_high1 != mbwu_l_high2 && retry > 0); + + if (mbwu_l_high1 == mbwu_l_high2) + return (mbwu_l_high1 << 32) | mbwu_l_low; + + pr_warn("Failed to read a stable value\n"); + return MSMON___L_NRDY; +} + +static void 
mpam_msc_zero_mbwu_l(struct mpam_msc *msc) +{ + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + __mpam_write_reg(msc, MSMON_MBWU_L, 0); + __mpam_write_reg(msc, MSMON_MBWU_L + 4, 0); +} + static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, u32 *flt_val) { @@ -931,7 +975,9 @@ static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, *flt_val |= FIELD_PREP(MSMON_CFG_CSU_FLT_XCL, ctx->csu_exclude_clean); break; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: *ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) @@ -953,7 +999,9 @@ static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); break; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); break; @@ -966,6 +1014,9 @@ static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, static inline void clean_msmon_ctl_val(u32 *cur_ctl) { *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; + + if (FIELD_GET(MSMON_CFG_x_CTL_TYPE, *cur_ctl) == MSMON_CFG_MBWU_CTL_TYPE_MBWU) + *cur_ctl &= ~MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; } static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, @@ -984,12 +1035,17 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, mpam_write_monsel_reg(msc, CSU, 0); mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); break; - case mpam_feat_msmon_mbwu: + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case 
mpam_feat_msmon_mbwu_63counter: mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); /* Counting monitors require NRDY to be reset by software */ - mpam_write_monsel_reg(msc, MBWU, 0); + if (m->type == mpam_feat_msmon_mbwu_31counter) + mpam_write_monsel_reg(msc, MBWU, 0); + else + mpam_msc_zero_mbwu_l(m->ris->vmsc->msc); break; default: pr_warn("Unexpected monitor type %d\n", m->type); @@ -998,8 +1054,17 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, static u64 mpam_msmon_overflow_val(enum mpam_device_features type) { - /* TODO: scaling, and long counters */ - return BIT_ULL(hweight_long(MSMON___VALUE)); + /* TODO: implement scaling counters */ + switch (type) { + case mpam_feat_msmon_mbwu_63counter: + return BIT_ULL(hweight_long(MSMON___LWD_VALUE)); + case mpam_feat_msmon_mbwu_44counter: + return BIT_ULL(hweight_long(MSMON___L_VALUE)); + case mpam_feat_msmon_mbwu_31counter: + return BIT_ULL(hweight_long(MSMON___VALUE)); + default: + return 0; + } } static void __ris_msmon_read(void *arg) @@ -1029,7 +1094,12 @@ static void __ris_msmon_read(void *arg) * This saves waiting for 'nrdy' on subsequent reads. 
*/ read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); - overflow = cur_ctl & MSMON_CFG_x_CTL_OFLOW_STATUS; + + if (mpam_feat_msmon_mbwu_31counter == m->type) + overflow = cur_ctl & MSMON_CFG_x_CTL_OFLOW_STATUS; + else if (mpam_feat_msmon_mbwu_44counter == m->type || + mpam_feat_msmon_mbwu_63counter == m->type) + overflow = cur_ctl & MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; clean_msmon_ctl_val(&cur_ctl); gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); @@ -1041,7 +1111,9 @@ static void __ris_msmon_read(void *arg) overflow = false; } else if (overflow) { mpam_write_monsel_reg(msc, CFG_MBWU_CTL, - cur_ctl & ~MSMON_CFG_x_CTL_OFLOW_STATUS); + cur_ctl & + ~(MSMON_CFG_x_CTL_OFLOW_STATUS | + MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L)); } switch (m->type) { @@ -1051,11 +1123,24 @@ static void __ris_msmon_read(void *arg) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); break; - case mpam_feat_msmon_mbwu: - now = mpam_read_monsel_reg(msc, MBWU); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; - now = FIELD_GET(MSMON___VALUE, now); + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + if (m->type != mpam_feat_msmon_mbwu_31counter) { + now = mpam_msc_read_mbwu_l(msc); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___L_NRDY; + + if (m->type == mpam_feat_msmon_mbwu_63counter) + now = FIELD_GET(MSMON___LWD_VALUE, now); + else + now = FIELD_GET(MSMON___L_VALUE, now); + } else { + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + } if (nrdy) break; @@ -1118,13 +1203,26 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) return any_err; } +static enum mpam_device_features mpam_msmon_choose_counter(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if 
(mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops)) + return mpam_feat_msmon_mbwu_63counter; + if (mpam_has_feature(mpam_feat_msmon_mbwu_44counter, cprops)) + return mpam_feat_msmon_mbwu_44counter; + + return mpam_feat_msmon_mbwu_31counter; +} + int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features type, u64 *val) { int err; struct mon_read arg; u64 wait_jiffies = 0; - struct mpam_props *cprops = &comp->class->props; + struct mpam_class *class = comp->class; + struct mpam_props *cprops = &class->props; might_sleep(); @@ -1134,6 +1232,9 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, if (!mpam_has_feature(type, cprops)) return -EOPNOTSUPP; + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + arg = (struct mon_read) { .ctx = ctx, .type = type, @@ -1142,8 +1243,8 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, *val = 0; err = _msmon_read(comp, &arg); - if (err == -EBUSY && comp->class->nrdy_usec) - wait_jiffies = usecs_to_jiffies(comp->class->nrdy_usec); + if (err == -EBUSY && class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(class->nrdy_usec); while (wait_jiffies) wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); @@ -1282,12 +1383,13 @@ static int mpam_restore_mbwu_state(void *_ris) int i; struct mon_read mwbu_arg; struct mpam_msc_ris *ris = _ris; + struct mpam_class *class = ris->vmsc->comp->class; for (i = 0; i < ris->props.num_mbwu_mon; i++) { if (ris->mbwu_state[i].enabled) { mwbu_arg.ris = ris; mwbu_arg.ctx = &ris->mbwu_state[i].cfg; - mwbu_arg.type = mpam_feat_msmon_mbwu; + mwbu_arg.type = mpam_msmon_choose_counter(class); __ris_msmon_read(&mwbu_arg); } @@ -1322,8 +1424,13 @@ static int mpam_save_mbwu_state(void *arg) cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); - val = mpam_read_monsel_reg(msc, MBWU); - mpam_write_monsel_reg(msc, MBWU, 0); + if 
(mpam_ris_has_mbwu_long_counter(ris)) { + val = mpam_msc_read_mbwu_l(msc); + mpam_msc_zero_mbwu_l(msc); + } else { + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + } cfg->mon = i; cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); -- Gitee From 2f1f3cb0bc1b2941fc3126f6d590d4a5ff394ffd Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:23:01 +0000 Subject: [PATCH 045/124] arm_mpam: Add helper to reset saved mbwu state ANBZ: #31060 commit 201d96ca4c867695880450930258cd5c97f099d4 upstream. resctrl expects to reset the bandwidth counters when the filesystem is mounted. To allow this, add a helper that clears the saved mbwu state. Instead of cross calling to each CPU that can access the component MSC to write to the counter, set a flag that causes it to be zero'd on the the next read. This is easily done by forcing a configuration update. Signed-off-by: James Morse Cc: Peter Newman Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 48 ++++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 2 ++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0a1c89688adc..63d3b5e1030c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1075,6 +1075,7 @@ static void __ris_msmon_read(void *arg) bool overflow; struct mon_read *m = arg; struct mon_cfg *ctx = m->ctx; + bool reset_on_next_read = false; struct mpam_msc_ris *ris = m->ris; struct msmon_mbwu_state *mbwu_state; struct mpam_props *rprops = &ris->props; @@ -1089,6 +1090,20 @@ static void __ris_msmon_read(void *arg) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, 
ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + switch (m->type) { + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mbwu_state = &ris->mbwu_state[ctx->mon]; + if (mbwu_state) { + reset_on_next_read = mbwu_state->reset_on_next_read; + mbwu_state->reset_on_next_read = false; + } + break; + default: + break; + } + /* * Read the existing configuration to avoid re-writing the same values. * This saves waiting for 'nrdy' on subsequent reads. @@ -1106,7 +1121,7 @@ static void __ris_msmon_read(void *arg) config_mismatch = cur_flt != flt_val || cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); - if (config_mismatch) { + if (config_mismatch || reset_on_next_read) { write_msmon_ctl_flt_vals(m, ctl_val, flt_val); overflow = false; } else if (overflow) { @@ -1263,6 +1278,37 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, return err; } +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + if (!mpam_is_enabled()) + return; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) + continue; + + msc = vmsc->msc; + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + continue; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + continue; + + ris->mbwu_state[ctx->mon].correction = 0; + ris->mbwu_state[ctx->mon].reset_on_next_read = true; + mpam_mon_sel_unlock(msc); + } + } +} + static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) { u32 num_words, msb; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 693a315c4710..18d53c07b3d7 100644 --- a/drivers/resctrl/mpam_internal.h +++ 
b/drivers/resctrl/mpam_internal.h @@ -211,6 +211,7 @@ struct mon_cfg { /* Changes to msmon_mbwu_state are protected by the msc's mon_sel_lock. */ struct msmon_mbwu_state { bool enabled; + bool reset_on_next_read; struct mon_cfg cfg; /* @@ -370,6 +371,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, enum mpam_device_features, u64 *val); +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); -- Gitee From 29a0876d2e08ff611ba286b7f688c7385585e1db Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:23:02 +0000 Subject: [PATCH 046/124] arm_mpam: Add kunit test for bitmap reset ANBZ: #31060 commit e3565d1fd4dcf2c7ee6912094066e47c7500eaf2 upstream. The bitmap reset code has been a source of bugs. Add a unit test. This currently has to be built in, as the rest of the driver is builtin. Suggested-by: Jonathan Cameron Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/Kconfig | 9 ++++ drivers/resctrl/mpam_devices.c | 4 ++ drivers/resctrl/test_mpam_devices.c | 69 +++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 drivers/resctrl/test_mpam_devices.c diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 5f7f748e611e..c808e0470394 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -12,4 +12,13 @@ config ARM64_MPAM_DRIVER_DEBUG help Say yes here to enable debug messages from the MPAM driver. 
+config MPAM_KUNIT_TEST + bool "KUnit tests for MPAM driver " if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Enable this option to run tests in the MPAM driver. + + If unsure, say N. + endif diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 63d3b5e1030c..6934c4724654 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2721,3 +2721,7 @@ static int __init mpam_msc_driver_init(void) /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_devices.c" +#endif diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c new file mode 100644 index 000000000000..0cfb41b665c4 --- /dev/null +++ b/drivers/resctrl/test_mpam_devices.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_devices.c */ + +#include + +static void test_mpam_reset_msc_bitmap(struct kunit *test) +{ + char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); + struct mpam_msc fake_msc = {}; + u32 *test_result; + + if (!buf) + return; + + fake_msc.mapped_hwpage = buf; + fake_msc.mapped_hwpage_sz = SZ_16K; + cpumask_copy(&fake_msc.accessibility, cpu_possible_mask); + + /* Satisfy lockdep checks */ + mutex_init(&fake_msc.part_sel_lock); + mutex_lock(&fake_msc.part_sel_lock); + + test_result = (u32 *)(buf + MPAMCFG_CPBM); + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 0); + KUNIT_EXPECT_EQ(test, test_result[0], 0); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 1); + KUNIT_EXPECT_EQ(test, test_result[0], 1); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 16); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffff); + 
KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 32); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 33); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 1); + test_result[0] = 0; + test_result[1] = 0; + + mutex_unlock(&fake_msc.part_sel_lock); +} + +static struct kunit_case mpam_devices_test_cases[] = { + KUNIT_CASE(test_mpam_reset_msc_bitmap), + {} +}; + +static struct kunit_suite mpam_devices_test_suite = { + .name = "mpam_devices_test_suite", + .test_cases = mpam_devices_test_cases, +}; + +kunit_test_suites(&mpam_devices_test_suite); -- Gitee From b875b88f42fadd0b9f4fab7e15c6d681b3366c11 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:23:03 +0000 Subject: [PATCH 047/124] arm_mpam: Add kunit tests for props_mismatch() ANBZ: #31060 commit 2557e0eafec1547aa9e0e768d2376e66252dada4 upstream. When features are mismatched between MSC the way features are combined to the class determines whether resctrl can support this SoC. Add some tests to illustrate the sort of thing that is expected to work, and those that must be removed. 
Signed-off-by: James Morse Reviewed-by: Ben Horgan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 14 +- drivers/resctrl/test_mpam_devices.c | 320 ++++++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 18d53c07b3d7..e79c3c47259c 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -23,6 +23,12 @@ struct platform_device; DECLARE_STATIC_KEY_FALSE(mpam_enabled); +#ifdef CONFIG_MPAM_KUNIT_TEST +#define PACKED_FOR_KUNIT __packed +#else +#define PACKED_FOR_KUNIT +#endif + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -186,7 +192,13 @@ struct mpam_props { u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; -}; + +/* + * Kunit tests use memset() to set up feature combinations that should be + * removed, and will false-positive if the compiler introduces padding that + * isn't cleared during sanitisation. + */ +} PACKED_FOR_KUNIT; #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) #define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c index 0cfb41b665c4..3e8d564a0c64 100644 --- a/drivers/resctrl/test_mpam_devices.c +++ b/drivers/resctrl/test_mpam_devices.c @@ -4,6 +4,324 @@ #include +/* + * This test catches fields that aren't being sanitised - but can't tell you + * which one... 
+ */ +static void test__props_mismatch(struct kunit *test) +{ + struct mpam_props parent = { 0 }; + struct mpam_props child; + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, false); + + memset(&child, 0, sizeof(child)); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, true); + + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); +} + +static struct list_head fake_classes_list; +static struct mpam_class fake_class = { 0 }; +static struct mpam_component fake_comp1 = { 0 }; +static struct mpam_component fake_comp2 = { 0 }; +static struct mpam_vmsc fake_vmsc1 = { 0 }; +static struct mpam_vmsc fake_vmsc2 = { 0 }; +static struct mpam_msc fake_msc1 = { 0 }; +static struct mpam_msc fake_msc2 = { 0 }; +static struct mpam_msc_ris fake_ris1 = { 0 }; +static struct mpam_msc_ris fake_ris2 = { 0 }; +static struct platform_device fake_pdev = { 0 }; + +static inline void reset_fake_hierarchy(void) +{ + INIT_LIST_HEAD(&fake_classes_list); + + memset(&fake_class, 0, sizeof(fake_class)); + fake_class.level = 3; + fake_class.type = MPAM_CLASS_CACHE; + INIT_LIST_HEAD_RCU(&fake_class.components); + INIT_LIST_HEAD(&fake_class.classes_list); + + memset(&fake_comp1, 0, sizeof(fake_comp1)); + memset(&fake_comp2, 0, sizeof(fake_comp2)); + fake_comp1.comp_id = 1; + fake_comp2.comp_id = 2; + INIT_LIST_HEAD(&fake_comp1.vmsc); + INIT_LIST_HEAD(&fake_comp1.class_list); + INIT_LIST_HEAD(&fake_comp2.vmsc); + INIT_LIST_HEAD(&fake_comp2.class_list); + + memset(&fake_vmsc1, 0, sizeof(fake_vmsc1)); + memset(&fake_vmsc2, 0, sizeof(fake_vmsc2)); + INIT_LIST_HEAD(&fake_vmsc1.ris); + INIT_LIST_HEAD(&fake_vmsc1.comp_list); + fake_vmsc1.msc = &fake_msc1; + INIT_LIST_HEAD(&fake_vmsc2.ris); + INIT_LIST_HEAD(&fake_vmsc2.comp_list); + fake_vmsc2.msc = &fake_msc2; + + memset(&fake_ris1, 0, sizeof(fake_ris1)); + memset(&fake_ris2, 0, sizeof(fake_ris2)); + fake_ris1.ris_idx = 1; 
+ INIT_LIST_HEAD(&fake_ris1.msc_list); + fake_ris2.ris_idx = 2; + INIT_LIST_HEAD(&fake_ris2.msc_list); + + fake_msc1.pdev = &fake_pdev; + fake_msc2.pdev = &fake_pdev; + + list_add(&fake_class.classes_list, &fake_classes_list); +} + +static void test_mpam_enable_merge_features(struct kunit *test) +{ + reset_fake_hierarchy(); + + mutex_lock(&mpam_list_lock); + + /* One Class+Comp, two RIS in one vMSC with common features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two RIS in one vMSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* Multiple RIS within one MSC controlling the same 
resource can be mismatched */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_vmsc1.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + KUNIT_EXPECT_EQ(test, fake_vmsc1.props.cmax_wd, 4); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + 
mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with incompatible overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 5; + fake_ris2.props.cpbm_wd = 3; + fake_ris1.props.mbw_pbm_bits = 5; + fake_ris2.props.mbw_pbm_bits = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. 
+ */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_mbw_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.mbw_pbm_bits, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features that need tweaking */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_mbw_min, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_min, &fake_ris2.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris2.props); + fake_ris1.props.bwa_wd = 5; + fake_ris2.props.bwa_wd = 3; + fake_ris1.props.cmax_wd = 5; + fake_ris2.props.cmax_wd = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * RIS with different control properties need to be sanitised so the + * class has the common set of properties. 
+ */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmax, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.bwa_wd, 3); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 3); + + reset_fake_hierarchy(); + + /* One Class Two Comp with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class Two Comp with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + 
fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple components can't control the same resource, mismatched features can + * not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + mutex_unlock(&mpam_list_lock); +} + static void test_mpam_reset_msc_bitmap(struct kunit *test) { char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); @@ -58,6 +376,8 @@ static void test_mpam_reset_msc_bitmap(struct kunit *test) static struct kunit_case mpam_devices_test_cases[] = { KUNIT_CASE(test_mpam_reset_msc_bitmap), + KUNIT_CASE(test_mpam_enable_merge_features), + KUNIT_CASE(test__props_mismatch), {} }; -- Gitee From 16a2c5e2d1dfa0f9fe7397e58b0377dc298a04ab Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:23:04 +0000 Subject: [PATCH 048/124] MAINTAINERS: new entry for MPAM Driver ANBZ: #31060 commit ce1e1421f8d8cdb5e05e13dbb516caedd67e5ee8 upstream. Create a maintainer entry for the new MPAM Driver. Add myself and James Morse as maintainers. James created the driver and I have taken up the later versions of his series. 
Cc: James Morse Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d384fb7630b2..1af176b5eaeb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14619,6 +14619,16 @@ S: Maintained F: Documentation/driver-api/tty/moxa-smartio.rst F: drivers/tty/mxser.* +MPAM DRIVER +M: James Morse +M: Ben Horgan +R: Reinette Chatre +R: Fenghua Yu +S: Maintained +F: drivers/resctrl/mpam_* +F: drivers/resctrl/test_mpam_* +F: include/linux/arm_mpam.h + MR800 AVERMEDIA USB FM RADIO DRIVER M: Alexey Klimov L: linux-media@vger.kernel.org -- Gitee From 376a254ad0fd7bbc2e7895d9027c14e6e3fb081e Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 19 Dec 2025 18:11:03 +0000 Subject: [PATCH 049/124] arm_mpam: Stop using uninitialized variables in __ris_msmon_read() ANBZ: #31060 commit c2803bd580db226008aabf2fb2f0c9a7d3b5d0de upstream. Dan has reported two uses of uninitialized variables in __ris_msmon_read(). If an unknown monitor type is encountered then the local variable, now, is used uninitialized. Fix this by returning early on error. If a non-mbwu monitor is being read then the local variable, overflow, is not initialized but still read. Initialize it to false as overflow is not relevant for csu monitors. 
Fixes: 823e7c3712c5 ("arm_mpam: Add mpam_msmon_read() to read monitor value") Fixes: 9e5afb7c3283 ("arm_mpam: Use long MBWU counters if supported") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202512091519.RBwiJcSq-lkp@intel.com/ Closes: https://lore.kernel.org/r/202512100547.N7QPYgfb-lkp@intel.com/ Signed-off-by: Ben Horgan Reviewed-by: Jonathan Cameron Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6934c4724654..9ff579d01ba6 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1072,7 +1072,7 @@ static void __ris_msmon_read(void *arg) u64 now; bool nrdy = false; bool config_mismatch; - bool overflow; + bool overflow = false; struct mon_read *m = arg; struct mon_cfg *ctx = m->ctx; bool reset_on_next_read = false; @@ -1176,10 +1176,11 @@ static void __ris_msmon_read(void *arg) } mpam_mon_sel_unlock(msc); - if (nrdy) { + if (nrdy) m->err = -EBUSY; + + if (m->err) return; - } *m->val += now; } -- Gitee From c16e7b71c0f8269e472f387885ad57fe67b91a75 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 12 Jan 2026 16:58:28 +0000 Subject: [PATCH 050/124] arm_mpam: Remove duplicate linux/srcu.h header ANBZ: #31060 commit b5a69c4869211a6ab61a95f5cc987b25f383dbc3 upstream. ./drivers/resctrl/mpam_internal.h: linux/srcu.h is included more than once. 
Reviewed-by: Jonathan Cameron Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=27328 Signed-off-by: Jiapeng Chong Acked-by: James Morse [BH: Keep alphabetical order] Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e79c3c47259c..17cdc3080d58 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include -- Gitee From f621b6f438dbc16da477f285e3c1ada64ed948a7 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 12 Jan 2026 16:58:29 +0000 Subject: [PATCH 051/124] arm_mpam: Use non-atomic bitops when modifying feature bitmap ANBZ: #31060 commit b9f5c38e4af1a094384650d2fc79fb992d6d5e64 upstream. In the test__props_mismatch() kunit test we rely on the struct mpam_props being packed to ensure memcmp doesn't consider packing. Making it packed reduces the alignment of the features bitmap and so breaks a requirement for the use of atomics. As we don't rely on the set/clear of these bits being atomic, just make them non-atomic. 
Reviewed-by: Jonathan Cameron Signed-off-by: Ben Horgan Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Reviewed-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Signed-off-by: Catalin Marinas Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 17cdc3080d58..e8971842b124 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -200,8 +200,12 @@ struct mpam_props { } PACKED_FOR_KUNIT; #define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) -#define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) -#define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) +/* + * The non-atomic get/set operations are used because if struct mpam_props is + * packed, the alignment requirements for atomics aren't met. + */ +#define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features) +#define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features) /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { -- Gitee From 76a112c3b07690fa8f01af4e329b6fd2320a37ee Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:48 -0800 Subject: [PATCH 052/124] x86,fs/resctrl: Improve domain type checking ANBZ: #31060 commit 03eb578b37659e10bed14c2d9e7cc45dfe24123b upstream. Every resctrl resource has a list of domain structures. struct rdt_ctrl_domain and struct rdt_mon_domain both begin with struct rdt_domain_hdr with rdt_domain_hdr::type used in validity checks before accessing the domain of a particular type. Add the resource id to struct rdt_domain_hdr in preparation for a new monitoring domain structure that will be associated with a new monitoring resource. Improve existing domain validity checks with a new helper domain_header_is_valid() that checks both domain type and resource id. 
domain_header_is_valid() should be used before every call to container_of() that accesses a domain structure. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 10 ++++++---- fs/resctrl/ctrlmondata.c | 2 +- include/linux/resctrl.h | 9 +++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bb6e757f2d46..d8cb41a1048f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -458,7 +458,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -475,6 +475,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_CTRL_DOMAIN; + d->hdr.rid = r->rid; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); @@ -514,7 +515,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_mon_domain, hdr); @@ -532,6 +533,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = r->rid; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); @@ -592,7 +594,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct 
rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -638,7 +640,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_mon_domain, hdr); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0d0ef54fc4de..f248eaf50d3c 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -649,7 +649,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". */ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + if (!hdr || !domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, resid)) { ret = -ENOENT; goto out; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653..dfc91c5e8483 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -131,15 +131,24 @@ enum resctrl_domain_type { * @list: all instances of this resource * @id: unique id for this instance * @type: type of this instance + * @rid: resource id for this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; int id; enum resctrl_domain_type type; + enum resctrl_res_level rid; struct cpumask cpu_mask; }; +static inline bool domain_header_is_valid(struct rdt_domain_hdr *hdr, + enum resctrl_domain_type type, + enum resctrl_res_level rid) +{ + return !WARN_ON_ONCE(hdr->type != type || hdr->rid != rid); +} + /** * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource * @hdr: common header for different domain types -- Gitee From a38e2e9964c9f7f2a92d2b71d46c5c5ef0259cfd Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 
09:20:49 -0800 Subject: [PATCH 053/124] x86/resctrl: Move L3 initialization into new helper function ANBZ: #31060 commit 0d6447623d788806b5504182032a0837ffa2174c upstream. Carve out the resource monitoring domain init code into a separate helper in order to be able to initialize new types of monitoring domains besides the usual L3 ones. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 64 ++++++++++++++++-------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d8cb41a1048f..2fc2b3010a47 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -495,37 +495,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) } } -static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct list_head *add_pos = NULL; struct rdt_hw_mon_domain *hw_dom; - struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; struct cacheinfo *ci; int err; - lockdep_assert_held(&domain_list_lock); - - if (id < 0) { - pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->mon_scope, r->name); - return; - } - - hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); - if (hdr) { - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) - return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - - cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - return; - } - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); if 
(!hw_dom) return; @@ -533,7 +509,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->hdr.rid = r->rid; + d->hdr.rid = RDT_RESOURCE_L3; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); @@ -543,10 +519,6 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - arch_mon_domain_online(r, d); if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { @@ -564,6 +536,38 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) } } +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_domain_hdr *hdr; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) + cpumask_set_cpu(cpu, &hdr->cpu_mask); + + switch (r->rid) { + case RDT_RESOURCE_L3: + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + if (!hdr) + l3_mon_domain_setup(cpu, id, r, add_pos); + break; + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; + } +} + static void domain_add_cpu(int cpu, struct rdt_resource *r) { if (r->alloc_capable) -- Gitee From 0a56fde6a4cf1829a11267f3aaa42a9a43c1e502 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:50 -0800 Subject: [PATCH 054/124] x86/resctrl: Refactor domain_remove_cpu_mon() ready for new domain types ANBZ: #31060 commit 
6396fc5351ea9130a72f6a2fc58eb7298ce6c15a upstream. New telemetry events will be associated with a new package scoped resource with a new domain structure. Refactor domain_remove_cpu_mon() so all the L3 domain processing is separate from the general domain action of clearing the CPU bit in the mask. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 2fc2b3010a47..8bfc33b43e13 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -625,9 +625,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; lockdep_assert_held(&domain_list_lock); @@ -644,20 +642,29 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - hw_dom = resctrl_to_arch_mon_dom(d); + switch (r->rid) { + case RDT_RESOURCE_L3: { + struct rdt_hw_mon_domain *hw_dom; + struct rdt_mon_domain *d; - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); resctrl_offline_mon_domain(r, d); - list_del_rcu(&d->hdr.list); + list_del_rcu(&hdr->list); synchronize_rcu(); 
mon_domain_free(hw_dom); - - return; + break; + } + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } -- Gitee From da624a44f5fd5d51fcabe6c084ac3c2d80344d06 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:51 -0800 Subject: [PATCH 055/124] x86/resctrl: Clean up domain_remove_cpu_ctrl() ANBZ: #31060 commit c1b630573c8ca51a89bd480f7eeaf8754c7609f2 upstream. For symmetry with domain_remove_cpu_mon() refactor domain_remove_cpu_ctrl() to take an early return when removing a CPU does not empty the domain. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8bfc33b43e13..7ec3e7dfee38 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -598,28 +598,27 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) + return; + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); hw_dom = resctrl_to_arch_ctrl_dom(d); - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_ctrl_domain(r, d); - list_del_rcu(&d->hdr.list); - synchronize_rcu(); - - /* - * rdt_ctrl_domain "d" is going to be freed below, so clear - * its pointer from pseudo_lock_region struct. - */ - if (d->plr) - d->plr->d = NULL; - ctrl_domain_free(hw_dom); + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&hdr->list); + synchronize_rcu(); - return; - } + /* + * rdt_ctrl_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. 
+ */ + if (d->plr) + d->plr->d = NULL; + ctrl_domain_free(hw_dom); } static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) -- Gitee From 504a2b9e71685d17fe6a2af42e2a6d243786d84b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:52 -0800 Subject: [PATCH 056/124] x86,fs/resctrl: Refactor domain create/remove using struct rdt_domain_hdr ANBZ: #31060 commit 97fec06d35b2c1ce6d80cf3b01bfddd82c720a2d upstream. Up until now, all monitoring events were associated with the L3 resource and it made sense to use the L3 specific "struct rdt_mon_domain *" argument to functions operating on domains. Telemetry events will be tied to a new resource with its instances represented by a new domain structure that, just like struct rdt_mon_domain, starts with the generic struct rdt_domain_hdr. Prepare to support domains belonging to different resources by changing the calling convention of functions operating on domains. Pass the generic header and use that to find the domain specific structure where needed. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 4 +- fs/resctrl/ctrlmondata.c | 14 ++++-- fs/resctrl/internal.h | 2 +- fs/resctrl/rdtgroup.c | 69 +++++++++++++++++++++--------- include/linux/resctrl.h | 4 +- 5 files changed, 63 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7ec3e7dfee38..4b71819a3a23 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -528,7 +528,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_add_tail_rcu(&d->hdr.list, add_pos); - err = resctrl_online_mon_domain(r, d); + err = resctrl_online_mon_domain(r, &d->hdr); if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); @@ -655,7 +655,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); - resctrl_offline_mon_domain(r, d); + resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); synchronize_rcu(); mon_domain_free(hw_dom); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index f248eaf50d3c..a2ea6a66fa67 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -547,14 +547,21 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, } void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) { + struct rdt_mon_domain *d = NULL; int cpu; /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); + if (hdr) { + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + d = container_of(hdr, 
struct rdt_mon_domain, hdr); + } + /* * Setup the parameters to pass to mon_event_count() to read the data. */ @@ -649,12 +656,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". */ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || !domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, resid)) { + if (!hdr) { ret = -ENOENT; goto out; } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evtid, false); } checkresult: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index cf1fd82dc5a9..22fdb3a9b6f4 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -362,7 +362,7 @@ void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first); int resctrl_mon_resource_init(void); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 41ce4b377af4..31997ede64b4 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3164,17 +3164,22 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn * when last domain being summed is removed. */ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; + struct rdt_mon_domain *d; char subname[32]; bool snc_mode; char name[32]; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : hdr->id); if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3184,15 +3189,20 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp, bool do_sum) { struct rmid_read rr = {0}; + struct rdt_mon_domain *d; struct mon_data *priv; struct mon_evt *mevt; int ret, domid; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; @@ -3206,23 +3216,28 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return ret; if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt->evtid, true); } return 0; } static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, + struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp) { struct kernfs_node *kn, *ckn; + struct rdt_mon_domain *d; char name[32]; bool snc_mode; int ret = 0; lockdep_assert_held(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); @@ -3240,13 +3255,13 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + ret = mon_add_all_files(kn, hdr, r, prgrp, snc_mode); if (ret) goto out_destroy; } if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); if (IS_ERR(ckn)) { ret = -EINVAL; @@ -3257,7 +3272,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (ret) goto out_destroy; - ret = mon_add_all_files(ckn, d, r, prgrp, false); + ret = mon_add_all_files(ckn, hdr, r, prgrp, false); if (ret) goto out_destroy; } @@ -3275,7 +3290,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, * and "monitor" groups with given domain id. */ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3283,12 +3298,12 @@ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, crgrp); } } } @@ -3297,14 +3312,14 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_mon_domain *dom; + struct rdt_domain_hdr *hdr; int ret; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->mon_domains, 
hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + list_for_each_entry(hdr, &r->mon_domains, list) { + ret = mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); if (ret) return ret; } @@ -4188,16 +4203,23 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain mutex_unlock(&rdtgroup_mutex); } -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { + struct rdt_mon_domain *d; + mutex_lock(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + /* * If resctrl is mounted, remove all the * per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); + rmdir_mondata_subdir_allrdtgrp(r, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -4215,7 +4237,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d } domain_destroy_mon_state(d); - +out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -4288,12 +4310,17 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d return err; } -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - int err; + struct rdt_mon_domain *d; + int err = -EINVAL; mutex_lock(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_mon_domain, hdr); err = domain_setup_mon_state(r, d); if (err) goto out_unlock; @@ -4314,7 +4341,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) * If resctrl is mounted, add per domain monitor data directories. 
*/ if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); + mkdir_mondata_subdir_allrdtgrp(r, hdr); out_unlock: mutex_unlock(&rdtgroup_mutex); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index dfc91c5e8483..0b55809af5d7 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -504,9 +504,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); -- Gitee From 7de8d5b5cb9439d64476a5874c83d687140e6a59 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:53 -0800 Subject: [PATCH 057/124] fs/resctrl: Split L3 dependent parts out of __mon_event_count() ANBZ: #31060 commit ad5c2ff75e0c53d2588dfc10eb87458e759b6bbe upstream. Carve out the L3 resource specific event reading code into a separate helper to support reading event data from a new monitoring resource. 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/monitor.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 572a9925bd6c..b5e0db38c8bf 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -413,7 +413,7 @@ static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; @@ -494,6 +494,17 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return ret; } +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + switch (rr->r->rid) { + case RDT_RESOURCE_L3: + return __l3_mon_event_count(rdtgrp, rr); + default: + rr->err = -EINVAL; + return -EINVAL; + } +} + /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). -- Gitee From 4bb49f12b0cc2f0a476d88eb4fae872ae3837e27 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:54 -0800 Subject: [PATCH 058/124] x86,fs/resctrl: Use struct rdt_domain_hdr when reading counters ANBZ: #31060 commit 6b10cf7b6ea857cdf9570e21c077a05803f60575 upstream. Convert the whole call sequence from mon_event_read() to resctrl_arch_rmid_read() to pass resource independent struct rdt_domain_hdr instead of an L3 specific domain structure to prepare for monitoring events in other resources. This additional layer of indirection obscures which aspects of event counting depend on a valid domain. 
Event initialization, support for assignable counters, and normal event counting implicitly depend on a valid domain while summing of domains does not. Split summing domains from the core event counting handling to make their respective dependencies obvious. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/monitor.c | 12 +++- fs/resctrl/ctrlmondata.c | 9 +-- fs/resctrl/internal.h | 18 +++--- fs/resctrl/monitor.c | 85 ++++++++++++++++++--------- include/linux/resctrl.h | 4 +- 5 files changed, 78 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 52c14a7d36da..393379477289 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -237,19 +237,25 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, return chunks * hw_res->mon_scale; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - int cpu = cpumask_any(&d->hdr.cpu_mask); + struct rdt_hw_mon_domain *hw_dom; struct arch_mbm_state *am; + struct rdt_mon_domain *d; u64 msr_val; u32 prmid; + int cpu; int ret; resctrl_arch_rmid_read_context_check(); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index a2ea6a66fa67..ad347ab4ed29 
100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -550,25 +550,18 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) { - struct rdt_mon_domain *d = NULL; int cpu; /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (hdr) { - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - } - /* * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; rr->r = r; - rr->d = d; + rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && resctrl_is_mbm_event(evtid)) { diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 22fdb3a9b6f4..698ed84fd073 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -106,24 +106,26 @@ struct mon_data { * resource group then its event count is summed with the count from all * its child resource groups. * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id + * @hdr: Header of domain that the counter should be read from. If NULL then + * sum all domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing + * domains. * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it * is an MBM event. * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. 
- * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). + * @val: Returned value of event counter. If @rgrp is a parent resource + * group, @val includes the sum of event counts from its child + * resource groups. If @hdr is NULL, @val includes the sum of all + * domains in @r sharing @ci.id, (summed across child resource groups + * if @rgrp is a parent resource group). * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_mon_domain *d; + struct rdt_domain_hdr *hdr; enum resctrl_event_id evtid; bool first; struct cacheinfo *ci; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b5e0db38c8bf..e1c12201388f 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -159,7 +159,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) break; entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val, arch_mon_ctx)) { rmid_dirty = true; @@ -421,11 +421,16 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct rdt_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; - int err, ret; u64 tval = 0; + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + rr->err = -EIO; + return -EINVAL; + } + d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -434,31 +439,50 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (rr->first) { if (rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + 
resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evtid); else - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evtid); + m = get_mbm_state(d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - if (rr->is_mbm_cntr) - rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return -EINVAL; + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; +} + +static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; + struct rdt_mon_domain *d; + u64 tval = 0; + int err, ret; + + /* + * Summing across domains is only done for systems that implement + * Sub-NUMA Cluster. There is no overlap with systems that support + * assignable counters. + */ + if (rr->is_mbm_cntr) { + pr_warn_once("Summing domains using assignable counters is not supported\n"); + rr->err = -EINVAL; + return -EINVAL; } /* Summing domains that share a cache, must be on a CPU for that cache. 
*/ @@ -476,12 +500,8 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - if (rr->is_mbm_cntr) - err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -498,7 +518,10 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: - return __l3_mon_event_count(rdtgrp, rr); + if (rr->hdr) + return __l3_mon_event_count(rdtgrp, rr); + else + return __l3_mon_event_count_sum(rdtgrp, rr); default: rr->err = -EINVAL; return -EINVAL; @@ -522,9 +545,13 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; + struct rdt_mon_domain *d; struct mbm_state *m; - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + m = get_mbm_state(d, closid, rmid, rr->evtid); if (WARN_ON_ONCE(!m)) return; @@ -697,7 +724,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * struct rmid_read rr = {0}; rr.r = r; - rr.d = d; + rr.hdr = &d->hdr; rr.evtid = evtid; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 0b55809af5d7..1a33d5e6ae23 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -514,7 +514,7 @@ void resctrl_offline_cpu(unsigned int cpu); * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. 
* @r: resource that the counter should be read from. - * @d: domain that the counter should be read from. + * @hdr: Header of domain that the counter should be read from. * @closid: closid that matches the rmid. Depending on the architecture, the * counter may match traffic of both @closid and @rmid, or @rmid * only. @@ -535,7 +535,7 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *arch_mon_ctx); -- Gitee From c08f35b033caf20c81c9bb5f0b3701212f90e58b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:55 -0800 Subject: [PATCH 059/124] x86,fs/resctrl: Rename struct rdt_mon_domain and rdt_hw_mon_domain ANBZ: #31060 commit 4bc3ef46ff41d5e7ba557e56e9cd2031527cd7f8 upstream. The upcoming telemetry event monitoring is not tied to the L3 resource and will have a new domain structure. Rename the L3 resource specific domain data structures to include "l3_" in their names to avoid confusion between the different resource specific domain structures: rdt_mon_domain -> rdt_l3_mon_domain rdt_hw_mon_domain -> rdt_hw_l3_mon_domain No functional change. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 14 +++--- arch/x86/kernel/cpu/resctrl/internal.h | 16 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++++------- fs/resctrl/ctrlmondata.c | 2 +- fs/resctrl/internal.h | 8 +-- fs/resctrl/monitor.c | 70 +++++++++++++------------- fs/resctrl/rdtgroup.c | 40 +++++++-------- include/linux/resctrl.h | 22 ++++---- 8 files changed, 104 insertions(+), 104 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b71819a3a23..9bc9be1408b7 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -362,7 +362,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) +static void mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -399,7 +399,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -497,8 +497,8 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - struct rdt_hw_mon_domain *hw_dom; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct cacheinfo *ci; int err; @@ -647,13 +647,13 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) switch (r->rid) { case RDT_RESOURCE_L3: { - struct 
rdt_hw_mon_domain *hw_dom; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 6da9bd1a188b..5b3ccebdd39c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -63,17 +63,17 @@ struct rdt_hw_ctrl_domain { }; /** - * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share - * a resource for a monitor function - * @d_resctrl: Properties exposed to the resctrl file system + * struct rdt_hw_l3_mon_domain - Arch private attributes of a set of CPUs sharing + * RDT_RESOURCE_L3 monitoring + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_states: Per-event pointer to the MBM event's saved state. * An MBM event's state is an array of struct arch_mbm_state * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ -struct rdt_hw_mon_domain { - struct rdt_mon_domain d_resctrl; +struct rdt_hw_l3_mon_domain { + struct rdt_l3_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; @@ -82,9 +82,9 @@ static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctr return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); } -static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) +static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3_mon_domain *r) { - return container_of(r, struct rdt_hw_mon_domain, d_resctrl); + return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } /** @@ -138,7 +138,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r extern struct rdt_hw_resource rdt_resources_all[]; -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 393379477289..9eb2388d78e4 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -108,7 +108,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) * * In RMID sharing mode there are fewer "logical RMID" values available * to accumulate data ("physical RMIDs" are divided evenly between SNC - * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for * each SNC node. * * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 
@@ -156,7 +156,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -170,11 +170,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return state ? &state[rmid] : NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u32 prmid; @@ -193,9 +193,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. 
*/ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); enum resctrl_event_id eventid; int idx; @@ -216,10 +216,10 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct arch_mbm_state *am; u64 chunks; @@ -241,9 +241,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct arch_mbm_state *am; - struct rdt_mon_domain *d; u64 msr_val; u32 prmid; int cpu; @@ -253,7 +253,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); hw_dom = resctrl_to_arch_mon_dom(d); cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, rmid); @@ -307,11 +307,11 @@ static int __cntr_id_read(u32 cntr_id, u64 *val) return 0; } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum 
resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); @@ -323,7 +323,7 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, } } -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val) { @@ -353,7 +353,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * must adjust RMID counter numbers based on SNC node. See * logical_rmid_to_physical_rmid() for code that does this. */ -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { if (snc_nodes_per_l3_cache > 1) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); @@ -515,7 +515,7 @@ static void resctrl_abmc_set_one_amd(void *arg) */ static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -554,11 +554,11 @@ static void resctrl_abmc_config_one_amd(void *info) /* * Send an IPI to the domain to assign the counter to RMID, event pair. 
*/ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); union l3_qos_abmc_cfg abmc_cfg = { 0 }; struct arch_mbm_state *am; diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ad347ab4ed29..b74c69f2d54e 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -596,9 +596,9 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; enum resctrl_event_id evtid; + struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; int domid, cpu, ret = 0; struct rdt_resource *r; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 698ed84fd073..d9e291d94926 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -369,7 +369,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); @@ -377,14 +377,14 @@ void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); +bool has_busy_rmid(struct rdt_l3_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); +void __check_limbo(struct rdt_l3_mon_domain *d, bool 
force_free); void resctrl_file_fflags_init(const char *config, unsigned long fflags); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e1c12201388f..9edbe9805d33 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -130,7 +130,7 @@ static void limbo_release_entry(struct rmid_entry *entry) * decrement the count. If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -188,7 +188,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_mon_domain *d) +bool has_busy_rmid(struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -289,7 +289,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -342,7 +342,7 @@ void free_rmid(u32 closid, u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, +static struct mbm_state *get_mbm_state(struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); @@ -362,7 +362,7 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, * Return: * Valid counter ID on success, or -ENOENT on failure. 
*/ -static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -389,7 +389,7 @@ static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * Valid counter ID on success, or -ENOSPC on failure. */ -static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -408,7 +408,7 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, /* * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. */ -static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +static void mbm_cntr_free(struct rdt_l3_mon_domain *d, int cntr_id) { memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } @@ -418,7 +418,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; u64 tval = 0; @@ -427,7 +427,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) rr->err = -EIO; return -EINVAL; } - d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); if (rr->is_mbm_cntr) { cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); @@ -470,7 +470,7 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u64 tval = 0; int err, ret; @@ -545,12 +545,12 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 
cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct mbm_state *m; if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(rr->hdr, struct rdt_mon_domain, hdr); + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); m = get_mbm_state(d, closid, rmid, rr->evtid); if (WARN_ON_ONCE(!m)) return; @@ -650,7 +650,7 @@ static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_l3_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; @@ -718,7 +718,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; @@ -750,7 +750,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp) { /* @@ -771,12 +771,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; cpus_read_lock(); 
mutex_lock(&rdtgroup_mutex); - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + d = container_of(work, struct rdt_l3_mon_domain, cqm_limbo.work); __check_limbo(d, false); @@ -799,7 +799,7 @@ void cqm_handle_limbo(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -816,7 +816,7 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct list_head *head; struct rdt_resource *r; @@ -831,7 +831,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); + d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp); @@ -865,7 +865,7 @@ void mbm_handle_overflow(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -1120,7 +1120,7 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf * mbm_cntr_free_all() - Clear all the counter ID configuration details in the * domain @d. Called when mbm_assign_mode is changed. 
*/ -static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); } @@ -1129,7 +1129,7 @@ static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) * resctrl_reset_rmid_all() - Reset all non-architecture states for all the * supported RMIDs. */ -static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); enum resctrl_event_id evt; @@ -1150,7 +1150,7 @@ static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * Assign the counter if @assign is true else unassign the counter. Reset the * associated non-architectural state. */ -static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1170,7 +1170,7 @@ static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain * * Return: * 0 on success, < 0 on failure. */ -static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1205,7 +1205,7 @@ static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_dom * Return: * 0 on success, < 0 on failure. 
*/ -static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static int rdtgroup_assign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1255,7 +1255,7 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. */ -static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1276,7 +1276,7 @@ static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_d * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign * the counters from all the domains if @d is NULL else unassign from @d. 
*/ -static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static void rdtgroup_unassign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1351,7 +1351,7 @@ static int resctrl_parse_mem_transactions(char *tok, u32 *val) static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id; list_for_each_entry(d, &r->mon_domains, hdr.list) { @@ -1457,7 +1457,7 @@ ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int ret = 0; bool enable; @@ -1530,7 +1530,7 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1554,7 +1554,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; u32 cntrs, i; int ret = 0; @@ -1595,7 +1595,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; int ret = 0; @@ -1658,7 +1658,7 @@ static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *n return NULL; } -static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, +static int 
rdtgroup_modify_assign_state(char *assign, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int ret = 0; @@ -1684,7 +1684,7 @@ static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, char *event, char *tok) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; unsigned long dom_id = 0; char *dom_str, *id_str; struct mon_evt *mevt; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 31997ede64b4..c5523efb2a02 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1618,7 +1618,7 @@ static void mondata_config_read(struct resctrl_mon_config_info *mon_info) static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1666,7 +1666,7 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, } static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) + struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { struct resctrl_mon_config_info mon_info = {0}; @@ -1707,8 +1707,8 @@ static void mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { char *dom_str = NULL, *id_str; + struct rdt_l3_mon_domain *d; unsigned long dom_id, val; - struct rdt_mon_domain *d; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); @@ -2716,7 +2716,7 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; struct rdt_resource *r; int ret; @@ -3167,7 +3167,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { struct 
rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; char subname[32]; bool snc_mode; char name[32]; @@ -3175,7 +3175,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : hdr->id); if (snc_mode) @@ -3193,8 +3193,8 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, struct rdt_resource *r, struct rdtgroup *prgrp, bool do_sum) { + struct rdt_l3_mon_domain *d; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct mon_data *priv; struct mon_evt *mevt; int ret, domid; @@ -3202,7 +3202,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; @@ -3227,7 +3227,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { struct kernfs_node *kn, *ckn; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; char name[32]; bool snc_mode; int ret = 0; @@ -3237,7 +3237,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); snc_mode = r->mon_scope == RESCTRL_L3_NODE; sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); @@ -4181,7 +4181,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_mon_domain *d) +static void domain_destroy_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4205,14 +4205,14 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; mutex_lock(&rdtgroup_mutex); if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); /* * If resctrl is mounted, remove all the @@ -4254,7 +4254,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * * Returns 0 for success, or -ENOMEM. */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4312,7 +4312,7 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int err = -EINVAL; mutex_lock(&rdtgroup_mutex); @@ -4320,7 +4320,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; - d = container_of(hdr, struct rdt_mon_domain, hdr); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); err = domain_setup_mon_state(r, d); if (err) goto out_unlock; @@ -4367,10 +4367,10 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) } } -static 
struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) +static struct rdt_l3_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -4386,7 +4386,7 @@ static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, void resctrl_offline_cpu(unsigned int cpu) { struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; mutex_lock(&rdtgroup_mutex); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 1a33d5e6ae23..a1df26798161 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -178,7 +178,7 @@ struct mbm_cntr_cfg { }; /** - * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource + * struct rdt_l3_mon_domain - group of CPUs sharing RDT_RESOURCE_L3 monitoring * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold @@ -192,7 +192,7 @@ struct mbm_cntr_cfg { * @cntr_cfg: array of assignable counters' configuration (indexed * by counter ID) */ -struct rdt_mon_domain { +struct rdt_l3_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; @@ -364,10 +364,10 @@ struct resctrl_cpu_defaults { }; struct resctrl_mon_config_info { - struct rdt_resource *r; - struct rdt_mon_domain *d; - u32 evtid; - u32 mon_config; + struct rdt_resource *r; + struct rdt_l3_mon_domain *d; + u32 evtid; + u32 mon_config; }; /** @@ -582,7 +582,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, * * This can be called from any CPU. 
*/ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -595,7 +595,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /** * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its @@ -621,7 +621,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); * * This can be called from any CPU. */ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); @@ -644,7 +644,7 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val); @@ -659,7 +659,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. 
*/ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); -- Gitee From f0b0c7a12dc4fdd657323bf35b4575bb3be289c1 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:56 -0800 Subject: [PATCH 060/124] x86,fs/resctrl: Rename some L3 specific functions ANBZ: #31060 commit 9c214d10c50990c7a61b95887493df9ae713eec5 upstream. With the arrival of monitor events tied to new domains associated with a different resource it would be clearer if the L3 resource specific functions are more accurately named. Rename three groups of functions: Functions that allocate/free architecture per-RMID MBM state information: arch_domain_mbm_alloc() -> l3_mon_domain_mbm_alloc() mon_domain_free() -> l3_mon_domain_free() Functions that allocate/free filesystem per-RMID MBM state information: domain_setup_mon_state() -> domain_setup_l3_mon_state() domain_destroy_mon_state() -> domain_destroy_l3_mon_state() Initialization/exit: rdt_get_mon_l3_config() -> rdt_get_l3_mon_config() resctrl_mon_resource_init() -> resctrl_l3_mon_resource_init() resctrl_mon_resource_exit() -> resctrl_l3_mon_resource_exit() Ensure kernel-doc descriptions of these functions' return values are present and correctly formatted. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 20 +++++++++++--------- arch/x86/kernel/cpu/resctrl/internal.h | 2 +- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- fs/resctrl/internal.h | 6 +++--- fs/resctrl/monitor.c | 8 ++++---- fs/resctrl/rdtgroup.c | 24 ++++++++++++------------ 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 9bc9be1408b7..e12d970051e9 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -362,7 +362,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) +static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -395,11 +395,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * } /** - * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays + * + * Return: 0 for success, or -ENOMEM. 
*/ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) +static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -513,7 +515,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); return; } d->ci_id = ci->id; @@ -521,8 +523,8 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { - mon_domain_free(hw_dom); + if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { + l3_mon_domain_free(hw_dom); return; } @@ -532,7 +534,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); } } @@ -658,7 +660,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) resctrl_offline_mon_domain(r, hdr); list_del_rcu(&hdr->list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); break; } default: @@ -908,7 +910,7 @@ static __init bool get_rdt_mon_resources(void) if (!ret) return false; - return !rdt_get_mon_l3_config(r); + return !rdt_get_l3_mon_config(r); } static __init void __check_quirks_intel(void) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5b3ccebdd39c..6c9e75440c10 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -211,7 +211,7 @@ union l3_qos_abmc_cfg { void rdt_ctrl_update(void *arg); -int rdt_get_mon_l3_config(struct rdt_resource *r); +int rdt_get_l3_mon_config(struct rdt_resource *r); bool rdt_cpu_has(int 
flag); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 9eb2388d78e4..8760073a8cb2 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -423,7 +423,7 @@ static __init int snc_get_config(void) return ret; } -int __init rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_l3_mon_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index d9e291d94926..88b4489b68e1 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -357,7 +357,9 @@ int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -void resctrl_mon_resource_exit(void); +int resctrl_l3_mon_resource_init(void); + +void resctrl_l3_mon_resource_exit(void); void mon_event_count(void *info); @@ -367,8 +369,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first); -int resctrl_mon_resource_init(void); - void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9edbe9805d33..d5ae0ef4c947 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1780,7 +1780,7 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, } /** - * resctrl_mon_resource_init() - Initialise global monitoring structures. + * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 
@@ -1789,9 +1789,9 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, * Resctrl's cpuhp callbacks may be called before this point to bring a domain * online. * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. */ -int resctrl_mon_resource_init(void) +int resctrl_l3_mon_resource_init(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); int ret; @@ -1841,7 +1841,7 @@ int resctrl_mon_resource_init(void) return 0; } -void resctrl_mon_resource_exit(void) +void resctrl_l3_mon_resource_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index c5523efb2a02..0d5a816dd3ef 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4181,7 +4181,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_l3_mon_domain *d) +static void domain_destroy_l3_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4236,13 +4236,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h cancel_delayed_work(&d->cqm_limbo); } - domain_destroy_mon_state(d); + domain_destroy_l3_mon_state(d); out_unlock: mutex_unlock(&rdtgroup_mutex); } /** - * domain_setup_mon_state() - Initialise domain monitoring structures. + * domain_setup_l3_mon_state() - Initialise domain monitoring structures. * @r: The resource for the newly online domain. * @d: The newly online domain. * @@ -4250,11 +4250,11 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * Called when the first CPU of a domain comes online, regardless of whether * the filesystem is mounted. * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). + * resctrl_l3_mon_resource_init(). * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. 
*/ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4321,7 +4321,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - err = domain_setup_mon_state(r, d); + err = domain_setup_l3_mon_state(r, d); if (err) goto out_unlock; @@ -4436,13 +4436,13 @@ int resctrl_init(void) thread_throttle_mode_init(); - ret = resctrl_mon_resource_init(); + ret = resctrl_l3_mon_resource_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4477,7 +4477,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4513,7 +4513,7 @@ static bool resctrl_online_domains_exist(void) * When called by the architecture code, all CPUs and resctrl domains must be * offline. This ensures the limbo and overflow handlers are not scheduled to * run, meaning the data structures they access can be freed by - * resctrl_mon_resource_exit(). + * resctrl_l3_mon_resource_exit(). * * After resctrl_exit() returns, the architecture code should return an * error from all resctrl_arch_ functions that can do this. @@ -4540,5 +4540,5 @@ void resctrl_exit(void) * it can be used to umount resctrl. 
*/ - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); } -- Gitee From ee6d07ec857615fa6815593984ecd35fd60ebd59 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:57 -0800 Subject: [PATCH 061/124] fs/resctrl: Make event details accessible to functions when reading events ANBZ: #31060 commit dd110880e80d35ad07e460e7a8da007c8058e7bf upstream. Reading monitoring event data from MMIO requires more context than the event id to be able to read the correct memory location. struct mon_evt is the appropriate place for this event specific context. Prepare for addition of extra fields to struct mon_evt by changing the calling conventions to pass a pointer to the mon_evt structure instead of just the event id. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/ctrlmondata.c | 18 +++++++++--------- fs/resctrl/internal.h | 10 +++++----- fs/resctrl/monitor.c | 22 +++++++++++----------- fs/resctrl/rdtgroup.c | 6 +++--- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index b74c69f2d54e..c3656812848b 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -548,7 +548,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) + cpumask_t *cpumask, struct mon_evt *evt, int first) { int cpu; @@ -559,15 +559,15 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * Setup the parameters to pass to mon_event_count() to read the data. 
*/ rr->rgrp = rdtgrp; - rr->evtid = evtid; + rr->evt = evt; rr->r = r; rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && - resctrl_is_mbm_event(evtid)) { + resctrl_is_mbm_event(evt->evtid)) { rr->is_mbm_cntr = true; } else { - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evt->evtid); if (IS_ERR(rr->arch_mon_ctx)) { rr->err = -EINVAL; return; @@ -588,14 +588,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); if (rr->arch_mon_ctx) - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - enum resctrl_event_id evtid; struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; @@ -603,6 +602,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int domid, cpu, ret = 0; struct rdt_resource *r; struct cacheinfo *ci; + struct mon_evt *evt; struct mon_data *md; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -619,7 +619,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md->rid; domid = md->domid; - evtid = md->evtid; + evt = md->evt; r = resctrl_arch_get_resource(resid); if (md->sum) { @@ -637,7 +637,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) continue; rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, - &ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evt, false); goto checkresult; } } @@ -653,7 +653,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) ret = -ENOENT; goto out; } - mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evt, false); } checkresult: diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 88b4489b68e1..12a2ab7e3c9b 100644 --- 
a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -81,7 +81,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. - * @evtid: Event id associated with the event file. + * @evt: Event structure associated with the event file. * @sum: Set when event must be summed across multiple * domains. * @domid: When @sum is zero this is the domain to which @@ -95,7 +95,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; struct mon_data { struct list_head list; enum resctrl_res_level rid; - enum resctrl_event_id evtid; + struct mon_evt *evt; int domid; bool sum; }; @@ -108,7 +108,7 @@ struct mon_data { * @r: Resource describing the properties of the event being read. * @hdr: Header of domain that the counter should be read from. If NULL then * sum all domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. + * @evt: Which monitor event to read. * @first: Initialize MBM counter when true. * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing * domains. 
@@ -126,7 +126,7 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_domain_hdr *hdr; - enum resctrl_event_id evtid; + struct mon_evt *evt; bool first; struct cacheinfo *ci; bool is_mbm_cntr; @@ -367,7 +367,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); + cpumask_t *cpumask, struct mon_evt *evt, int first); void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d5ae0ef4c947..340b847ab397 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -430,7 +430,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evt->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -439,10 +439,10 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (rr->first) { if (rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evtid); + resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evt->evtid); else - resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evtid); - m = get_mbm_state(d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evt->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; @@ -453,10 +453,10 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return -EINVAL; if (rr->is_mbm_cntr) rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); + rr->evt->evtid, &tval); else rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, 
rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -501,7 +501,7 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -551,7 +551,7 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); - m = get_mbm_state(d, closid, rmid, rr->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (WARN_ON_ONCE(!m)) return; @@ -725,11 +725,11 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domai rr.r = r; rr.hdr = &d->hdr; - rr.evtid = evtid; + rr.evt = &mon_event_all[evtid]; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; } else { - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, evtid); if (IS_ERR(rr.arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", PTR_ERR(rr.arch_mon_ctx)); @@ -747,7 +747,7 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domai mbm_bw_count(rdtgrp, &rr); if (rr.arch_mon_ctx) - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + resctrl_arch_mon_ctx_free(rr.r, evtid, rr.arch_mon_ctx); } static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0d5a816dd3ef..68c7a12f16a3 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3038,7 +3038,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, list_for_each_entry(priv, &mon_data_kn_priv_list, list) { if (priv->rid == rid && 
priv->domid == domid && - priv->sum == do_sum && priv->evtid == mevt->evtid) + priv->sum == do_sum && priv->evt == mevt) return priv; } @@ -3049,7 +3049,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, priv->rid = rid; priv->domid = domid; priv->sum = do_sum; - priv->evtid = mevt->evtid; + priv->evt = mevt; list_add_tail(&priv->list, &mon_data_kn_priv_list); return priv; @@ -3216,7 +3216,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, return ret; if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt->evtid, true); + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } return 0; -- Gitee From e1ca182b8f79f89ea7da6113da578e8c711b621d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:58 -0800 Subject: [PATCH 062/124] x86,fs/resctrl: Handle events that can be read from any CPU ANBZ: #31060 commit ab0308aee3819a3eccde42f9eb5bb01d6733be38 upstream. resctrl assumes that monitor events can only be read from a CPU in the cpumask_t set of each domain. This is true for x86 events accessed with an MSR interface, but may not be true for other access methods such as MMIO. Introduce and use flag mon_evt::any_cpu, settable by architecture, that indicates there are no restrictions on which CPU can read that event. This flag is not supported by the L3 event reading that requires to be run on a CPU that belongs to the L3 domain of the event being read. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 6 +++--- fs/resctrl/ctrlmondata.c | 6 ++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/monitor.c | 4 +++- include/linux/resctrl.h | 2 +- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e12d970051e9..80cae8c421c8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index c3656812848b..883be6f0810f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -574,6 +574,11 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, } } + if (evt->any_cpu) { + mon_event_count(rr); + goto out_ctx_free; + } + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); /* @@ -587,6 +592,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); +out_ctx_free: if (rr->arch_mon_ctx) resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 12a2ab7e3c9b..40b76eaa33d0 
100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -61,6 +61,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * READS_TO_REMOTE_MEM) being tracked by @evtid. * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable + * @any_cpu: true if the event can be read from any CPU * @enabled: true if the event is enabled */ struct mon_evt { @@ -69,6 +70,7 @@ struct mon_evt { char *name; u32 evt_cfg; bool configurable; + bool any_cpu; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 340b847ab397..8c76ac133bca 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -518,6 +518,7 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: + WARN_ON_ONCE(rr->evt->any_cpu); if (rr->hdr) return __l3_mon_event_count(rdtgrp, rr); else @@ -987,7 +988,7 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { }, }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu) { if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) return; @@ -996,6 +997,7 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) return; } + mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a1df26798161..255d76b7d54f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,7 +409,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); -- Gitee From dfcce8332f1a1123777e242a71f783b90a2fa85a 
Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:20:59 -0800 Subject: [PATCH 063/124] x86,fs/resctrl: Support binary fixed point event counters ANBZ: #31060 commit e37c9a3dc9f9645532780d5ef34ea3b8fcf9ddef upstream. resctrl assumes that all monitor events can be displayed as unsigned decimal integers. Hardware architecture counters may provide some telemetry events with greater precision where the event is not a simple count, but is a measurement of some sort (e.g. Joules for energy consumed). Add a new argument to resctrl_enable_mon_event() for architecture code to inform the file system that the value for a counter is a fixed-point value with a specific number of binary places. Only allow the architecture to use floating point format on events that the file system has marked with mon_evt::is_floating_point which reflects the contract with user space on how the event values are displayed. Display fixed point values rounded to ceil(binary_bits * log10(2)) decimal places. Special case for zero binary bits to print "{value}.0". 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 6 +-- fs/resctrl/ctrlmondata.c | 74 ++++++++++++++++++++++++++++++ fs/resctrl/internal.h | 8 ++++ fs/resctrl/monitor.c | 10 +++- include/linux/resctrl.h | 3 +- 5 files changed, 95 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 80cae8c421c8..2734d353c02a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 883be6f0810f..f329e5dfac47 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -597,6 +598,77 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); } +/* + * Decimal place precision to use for each number of fixed-point + * binary bits computed from ceil(binary_bits * log10(2)) except + * binary_bits == 0 which will print "value.0" + */ +static const unsigned int decplaces[MAX_BINARY_BITS + 1] = { + [0] = 1, + [1] = 1, + [2] = 1, + [3] 
= 1, + [4] = 2, + [5] = 2, + [6] = 2, + [7] = 3, + [8] = 3, + [9] = 3, + [10] = 4, + [11] = 4, + [12] = 4, + [13] = 4, + [14] = 5, + [15] = 5, + [16] = 5, + [17] = 6, + [18] = 6, + [19] = 6, + [20] = 7, + [21] = 7, + [22] = 7, + [23] = 7, + [24] = 8, + [25] = 8, + [26] = 8, + [27] = 9 +}; + +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val) +{ + unsigned long long frac = 0; + + if (binary_bits) { + /* Mask off the integer part of the fixed-point value. */ + frac = val & GENMASK_ULL(binary_bits - 1, 0); + + /* + * Multiply by 10^{desired decimal places}. The integer part of + * the fixed point value is now almost what is needed. + */ + frac *= int_pow(10ull, decplaces[binary_bits]); + + /* + * Round to nearest by adding a value that would be a "1" in the + * binary_bits + 1 place. Integer part of fixed point value is + * now the needed value. + */ + frac += 1ull << (binary_bits - 1); + + /* + * Extract the integer part of the value. This is the decimal + * representation of the original fixed-point fractional value. + */ + frac >>= binary_bits; + } + + /* + * "frac" is now in the range [0 .. 10^decplaces). I.e. string + * representation will fit into chosen number of decimal places. 
+ */ + seq_printf(m, "%llu.%0*llu\n", val >> binary_bits, decplaces[binary_bits], frac); +} + int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -674,6 +746,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) seq_puts(m, "Unavailable\n"); else if (rr.err == -ENOENT) seq_puts(m, "Unassigned\n"); + else if (evt->is_floating_point) + print_event_value(m, evt->binary_bits, rr.val); else seq_printf(m, "%llu\n", rr.val); diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 40b76eaa33d0..f5189b6771a0 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable * @any_cpu: true if the event can be read from any CPU + * @is_floating_point: event values are displayed in floating point format + * @binary_bits: number of fixed-point binary bits from architecture, + * only valid if @is_floating_point is true * @enabled: true if the event is enabled */ struct mon_evt { @@ -71,6 +74,8 @@ struct mon_evt { u32 evt_cfg; bool configurable; bool any_cpu; + bool is_floating_point; + unsigned int binary_bits; bool enabled; }; @@ -79,6 +84,9 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) +/* Limit for mon_evt::binary_bits */ +#define MAX_BINARY_BITS 27 + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. 
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 8c76ac133bca..844cf6875f60 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -988,16 +988,22 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { }, }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits) { - if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || + binary_bits > MAX_BINARY_BITS)) return; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); return; } + if (binary_bits && !mon_event_all[eventid].is_floating_point) { + pr_warn("Event %d may not be floating point\n", eventid); + return; + } mon_event_all[eventid].any_cpu = any_cpu; + mon_event_all[eventid].binary_bits = binary_bits; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 255d76b7d54f..775818b4ac7b 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,7 +409,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu); +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); -- Gitee From ac6e31b325230900227ea088f22bf74f13e71ccc Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:25 -0800 Subject: [PATCH 064/124] x86,fs/resctrl: Add an architectural hook called for first mount ANBZ: #31060 commit 39208e73a40e0e81a5b12ddc11157c0a414df307 upstream. 
Enumeration of Intel telemetry events is an asynchronous process involving several mutually dependent drivers added as auxiliary devices during the device_initcall() phase of Linux boot. The process finishes after the probe functions of these drivers completes. But this happens after resctrl_arch_late_init() is executed. Tracing the enumeration process shows that it does complete a full seven seconds before the earliest possible mount of the resctrl file system (when included in /etc/fstab for automatic mount by systemd). Add a hook for use by telemetry event enumeration and initialization and run it once at the beginning of resctrl mount without any locks held. The architecture is responsible for any required locking. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20260105191711.GBaVwON5nZn-uO6Sqg@fat_crate.local Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++++ fs/resctrl/rdtgroup.c | 3 +++ include/linux/resctrl.h | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 2734d353c02a..74d9372acc4b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -720,6 +720,10 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } +void resctrl_arch_pre_mount(void) +{ +} + enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 68c7a12f16a3..19cc6e0102cf 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -2720,6 +2721,8 @@ static int rdt_get_tree(struct fs_context *fc) struct rdt_resource *r; int ret; + DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 775818b4ac7b..0d8d47fcd733 100644 --- a/include/linux/resctrl.h +++ 
b/include/linux/resctrl.h @@ -511,6 +511,12 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); +/* + * Architecture hook called at beginning of first file system mount attempt. + * No locks are held. + */ +void resctrl_arch_pre_mount(void); + /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. -- Gitee From 0b0e619d51ba4f10457f814365080e6c8d222d7f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:01 -0800 Subject: [PATCH 065/124] x86,fs/resctrl: Add and initialize a resource for package scope monitoring ANBZ: #31060 commit 2e53ad66686a46b141c3395719afeee3057ffe2f upstream. Add a new PERF_PKG resource and introduce package level scope for monitoring telemetry events so that CPU hotplug notifiers can build domains at the package granularity. Use the physical package ID available via topology_physical_package_id() to identify the monitoring domains with package level scope. This enables user space to use: /sys/devices/system/cpu/cpuX/topology/physical_package_id to identify the monitoring domain a CPU is associated with. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 10 ++++++++++ fs/resctrl/internal.h | 2 ++ fs/resctrl/rdtgroup.c | 2 ++ include/linux/resctrl.h | 2 ++ 4 files changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 74d9372acc4b..3a2dca362c11 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -99,6 +99,14 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, + [RDT_RESOURCE_PERF_PKG] = + { + .r_resctrl = { + .name = "PERF_PKG", + .mon_scope = RESCTRL_PACKAGE, + .mon_domains = mon_domain_init(RDT_RESOURCE_PERF_PKG), + }, + }, }; u32 resctrl_arch_system_num_rmid_idx(void) @@ -434,6 +442,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) return get_cpu_cacheinfo_id(cpu, scope); case RESCTRL_L3_NODE: return cpu_to_node(cpu); + case RESCTRL_PACKAGE: + return topology_physical_package_id(cpu); default: break; } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index f5189b6771a0..1409a5885f86 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -255,6 +255,8 @@ struct rdtgroup { #define RFTYPE_ASSIGN_CONFIG BIT(11) +#define RFTYPE_RES_PERF_PKG BIT(12) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 19cc6e0102cf..0e23c0eaaf03 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2331,6 +2331,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) case RDT_RESOURCE_MBA: case RDT_RESOURCE_SMBA: return RFTYPE_RES_MB; + case RDT_RESOURCE_PERF_PKG: + return RFTYPE_RES_PERF_PKG; } return WARN_ON_ONCE(1); diff --git a/include/linux/resctrl.h 
b/include/linux/resctrl.h index 0d8d47fcd733..9eaad4befd0f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -53,6 +53,7 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_PERF_PKG, /* Must be the last */ RDT_NUM_RESOURCES, @@ -267,6 +268,7 @@ enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, RESCTRL_L3_NODE, + RESCTRL_PACKAGE, }; /** -- Gitee From 6ad92b76da074d8151d2dd2f81e58e3c5b132bd8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:02 -0800 Subject: [PATCH 066/124] fs/resctrl: Emphasize that L3 monitoring resource is required for summing domains ANBZ: #31060 commit db64994d115e7c2cd72fec11b854467e97169379 upstream. The feature to sum event data across multiple domains supports systems with Sub-NUMA Cluster (SNC) mode enabled. The top-level monitoring files in each "mon_L3_XX" directory provide the sum of data across all SNC nodes sharing an L3 cache instance while the "mon_sub_L3_YY" sub-directories provide the event data of the individual nodes. SNC is only associated with the L3 resource and domains and as a result the flow handling the sum of event data implicitly assumes it is working with the L3 resource and domains. Reading of telemetry events does not require summing event data so this feature can remain dedicated to SNC and keep the implicit assumption of working with the L3 resource and domains. Add a WARN to where the implicit assumption of working with the L3 resource is made and add comments on how the structure controlling the event sum feature is used. 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/ctrlmondata.c | 8 +++++++- fs/resctrl/internal.h | 4 ++-- fs/resctrl/rdtgroup.c | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index f329e5dfac47..0c6f9a823066 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -673,7 +673,6 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - struct rdt_l3_mon_domain *d; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; struct rdtgroup *rdtgrp; @@ -701,6 +700,13 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) r = resctrl_arch_get_resource(resid); if (md->sum) { + struct rdt_l3_mon_domain *d; + + if (WARN_ON_ONCE(resid != RDT_RESOURCE_L3)) { + ret = -EINVAL; + goto out; + } + /* * This file requires summing across all domains that share * the L3 cache id that was provided in the "domid" field of the diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 1409a5885f86..56fd43068ab4 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -92,8 +92,8 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. * @evt: Event structure associated with the event file. - * @sum: Set when event must be summed across multiple - * domains. + * @sum: Set for RDT_RESOURCE_L3 when event must be summed + * across multiple domains. * @domid: When @sum is zero this is the domain to which * the event file belongs. 
When @sum is one this * is the id of the L3 cache that all domains to be diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0e23c0eaaf03..09b52aa2a773 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3031,7 +3031,8 @@ static void rmdir_all_sub(void) * @rid: The resource id for the event file being created. * @domid: The domain id for the event file being created. * @mevt: The type of event file being created. - * @do_sum: Whether SNC summing monitors are being created. + * @do_sum: Whether SNC summing monitors are being created. Only set + * when @rid == RDT_RESOURCE_L3. */ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, struct mon_evt *mevt, -- Gitee From 0281ff5e8b9e84f934c79700703a798058343f75 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:26 -0800 Subject: [PATCH 067/124] x86/resctrl: Discover hardware telemetry events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31060 commit 1fb2daa60de640efb13f907d43d72d28763f696c upstream. Each CPU collects data for telemetry events that it sends to the nearest telemetry event aggregator either when the value of MSR_IA32_PQR_ASSOC.RMID changes, or when a two millisecond timer expires. There is a feature type ("energy" or "perf"), GUID, and MMIO region associated with each aggregator. This combination links to an XML description of the set of telemetry events tracked by the aggregator. XML files are published by Intel in a GitHub repository¹. The telemetry event aggregators maintain per-RMID per-event counts of the total seen for all the CPUs. There may be multiple telemetry event aggregators per package. There are separate sets of aggregators for each feature type. Aggregators in a set may have different GUIDs. All aggregators with the same feature type and GUID are symmetric keeping counts for the same set of events for the CPUs that provide data to them. 
The XML file for each aggregator provides the following information: 0) Feature type of the events ("perf" or "energy") 1) Which telemetry events are tracked by the aggregator. 2) The order in which the event counters appear for each RMID. 3) The value type of each event counter (integer or fixed-point). 4) The number of RMIDs supported. 5) Which additional aggregator status registers are included. 6) The total size of the MMIO region for an aggregator. Introduce struct event_group that condenses the relevant information from an XML file. Hereafter an "event group" refers to a group of events of a particular feature type (event_group::pfname set to "energy" or "perf") with a particular GUID. Use event_group::pfname to determine the feature id needed to obtain the aggregator details. It will later be used in console messages and with the rdt= boot parameter. The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events. This driver provides intel_pmt_get_regions_by_feature() to list all available telemetry event aggregators of a given feature type. The list includes the "guid", the base address in MMIO space for the region where the event counters are exposed, and the package id where all the CPUs that report to this aggregator are located. Call INTEL_PMT_TELEMETRY's intel_pmt_get_regions_by_feature() for each event group to obtain a private copy of that event group's aggregator data. Duplicate the aggregator data between event groups that have the same feature type but different GUID. Further processing on this private copy will be unique to the event group. 
¹https://github.com/intel/Intel-PMT [ bp: Zap text explaining the code, s/guid/GUID/g ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/Kconfig | 13 +++ arch/x86/kernel/cpu/resctrl/Makefile | 1 + arch/x86/kernel/cpu/resctrl/core.c | 4 + arch/x86/kernel/cpu/resctrl/intel_aet.c | 109 ++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 8 ++ 5 files changed, 135 insertions(+) create mode 100644 arch/x86/kernel/cpu/resctrl/intel_aet.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d74eef96d693..484fda0192fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -515,6 +515,19 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_CPU_RESCTRL_INTEL_AET + bool "Intel Application Energy Telemetry" + depends on X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + help + Enable per-RMID telemetry events in resctrl. + + Intel feature that collects per-RMID execution data + about energy consumption, measure of frequency independent + activity and other performance metrics. Data is aggregated + per package. + + Say N if unsure. 
+ if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index d8a04b195da2..273ddfa30836 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET) += intel_aet.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o # To allow define_trace.h's recursive include: diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3a2dca362c11..f6f46e42fb23 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -732,6 +732,8 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) void resctrl_arch_pre_mount(void) { + if (!intel_aet_get_events()) + return; } enum { @@ -1102,6 +1104,8 @@ late_initcall(resctrl_arch_late_init); static void __exit resctrl_arch_exit(void) { + intel_aet_exit(); + cpuhp_remove_state(rdt_online); resctrl_exit(); diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c new file mode 100644 index 000000000000..404564739bef --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Intel Application Energy Telemetry + * + * Copyright (C) 2025 Intel Corporation + * + * Author: + * Tony Luck + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. + * @pfname: PMT feature name ("energy" or "perf") of this event group. 
+ * @pfg: Points to the aggregated telemetry space information + * returned by the intel_pmt_get_regions_by_feature() + * call to the INTEL_PMT_TELEMETRY driver that contains + * data for all telemetry regions of type @pfname. + * Valid if the system supports the event group, + * NULL otherwise. + */ +struct event_group { + /* Data fields for additional structures to manage this group. */ + const char *pfname; + struct pmt_feature_group *pfg; +}; + +static struct event_group *known_event_groups[] = { +}; + +#define for_each_event_group(_peg) \ + for (_peg = known_event_groups; \ + _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ + _peg++) + +/* Stub for now */ +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + return false; +} + +static enum pmt_feature_id lookup_pfid(const char *pfname) +{ + if (!strcmp(pfname, "energy")) + return FEATURE_PER_RMID_ENERGY_TELEM; + else if (!strcmp(pfname, "perf")) + return FEATURE_PER_RMID_PERF_TELEM; + + pr_warn("Unknown PMT feature name '%s'\n", pfname); + + return FEATURE_INVALID; +} + +/* + * Request a copy of struct pmt_feature_group for each event group. If there is + * one, the returned structure has an array of telemetry_region structures, + * each element of the array describes one telemetry aggregator. The + * telemetry aggregators may have different GUIDs so obtain duplicate struct + * pmt_feature_group for event groups with same feature type but different + * GUID. Post-processing ensures an event group can only use the telemetry + * aggregators that match its GUID. An event group keeps a pointer to its + * struct pmt_feature_group to indicate that its events are successfully + * enabled. 
+ */ +bool intel_aet_get_events(void) +{ + struct pmt_feature_group *p; + enum pmt_feature_id pfid; + struct event_group **peg; + bool ret = false; + + for_each_event_group(peg) { + pfid = lookup_pfid((*peg)->pfname); + p = intel_pmt_get_regions_by_feature(pfid); + if (IS_ERR_OR_NULL(p)) + continue; + if (enable_events(*peg, p)) { + (*peg)->pfg = p; + ret = true; + } else { + intel_pmt_put_feature_group(p); + } + } + + return ret; +} + +void __exit intel_aet_exit(void) +{ + struct event_group **peg; + + for_each_event_group(peg) { + if ((*peg)->pfg) { + intel_pmt_put_feature_group((*peg)->pfg); + (*peg)->pfg = NULL; + } + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 6c9e75440c10..bdef7f91a2b9 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -220,4 +220,12 @@ void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); +#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET +bool intel_aet_get_events(void); +void __exit intel_aet_exit(void); +#else +static inline bool intel_aet_get_events(void) { return false; } +static inline void __exit intel_aet_exit(void) { } +#endif + #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ -- Gitee From 77f187bc47e10f89be5fb105383bb405b0bd3262 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:04 -0800 Subject: [PATCH 068/124] x86,fs/resctrl: Fill in details of events for performance and energy GUIDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #31060 commit 8f6b6ad69b50bf16bb762ffafbfa44a4884f9a17 upstream. The telemetry event aggregators of the Intel Clearwater Forest CPU support two RMID-based feature types: "energy" with GUID 0x26696143¹, and "perf" with GUID 0x26557651². The event counter offsets in an aggregator's MMIO space are arranged in groups for each RMID. 
E.g., the "energy" counters for GUID 0x26696143 are arranged like this: MMIO offset:0x0000 Counter for RMID 0 PMT_EVENT_ENERGY MMIO offset:0x0008 Counter for RMID 0 PMT_EVENT_ACTIVITY MMIO offset:0x0010 Counter for RMID 1 PMT_EVENT_ENERGY MMIO offset:0x0018 Counter for RMID 1 PMT_EVENT_ACTIVITY ... MMIO offset:0x23F0 Counter for RMID 575 PMT_EVENT_ENERGY MMIO offset:0x23F8 Counter for RMID 575 PMT_EVENT_ACTIVITY After all counters there are three status registers that provide indications of how many times an aggregator was unable to process event counts, the time stamp for the most recent loss of data, and the time stamp of the most recent successful update. MMIO offset:0x2400 AGG_DATA_LOSS_COUNT MMIO offset:0x2408 AGG_DATA_LOSS_TIMESTAMP MMIO offset:0x2410 LAST_UPDATE_TIMESTAMP Define event_group structures for both of these aggregator types and define the events tracked by the aggregators in the file system code. PMT_EVENT_ENERGY and PMT_EVENT_ACTIVITY are produced in fixed point format. File system code must output them as floating point values. ¹https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml ²https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml [ bp: Massage commit message. 
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 66 +++++++++++++++++++++++++ fs/resctrl/monitor.c | 35 +++++++------ include/linux/resctrl_types.h | 11 +++++ 3 files changed, 97 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 404564739bef..8e042b530c91 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -11,15 +11,33 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include #include #include #include #include +#include #include +#include #include "internal.h" +/** + * struct pmt_event - Telemetry event. + * @id: Resctrl event id. + * @idx: Counter index within each per-RMID block of counters. + * @bin_bits: Zero for integer valued events, else number bits in fraction + * part of fixed-point. + */ +struct pmt_event { + enum resctrl_event_id id; + unsigned int idx; + unsigned int bin_bits; +}; + +#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits } + /** * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. * @pfname: PMT feature name ("energy" or "perf") of this event group. @@ -29,14 +47,62 @@ * data for all telemetry regions of type @pfname. * Valid if the system supports the event group, * NULL otherwise. + * @guid: Unique number per XML description file. + * @mmio_size: Number of bytes of MMIO registers for this group. + * @num_events: Number of events in this group. + * @evts: Array of event descriptors. */ struct event_group { /* Data fields for additional structures to manage this group. */ const char *pfname; struct pmt_feature_group *pfg; + + /* Remaining fields initialized from XML file. 
*/ + u32 guid; + size_t mmio_size; + unsigned int num_events; + struct pmt_event evts[] __counted_by(num_events); +}; + +#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \ + (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64)) + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml + */ +static struct event_group energy_0x26696143 = { + .pfname = "energy", + .guid = 0x26696143, + .mmio_size = XML_MMIO_SIZE(576, 2, 3), + .num_events = 2, + .evts = { + EVT(PMT_EVENT_ENERGY, 0, 18), + EVT(PMT_EVENT_ACTIVITY, 1, 18), + } +}; + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml + */ +static struct event_group perf_0x26557651 = { + .pfname = "perf", + .guid = 0x26557651, + .mmio_size = XML_MMIO_SIZE(576, 7, 3), + .num_events = 7, + .evts = { + EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0), + EVT(PMT_EVENT_C1_RES, 1, 0), + EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0), + EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0), + EVT(PMT_EVENT_AUTO_C6_RES, 4, 0), + EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0), + EVT(PMT_EVENT_UOPS_RETIRED, 6, 0), + } }; static struct event_group *known_event_groups[] = { + &energy_0x26696143, + &perf_0x26557651, }; #define for_each_event_group(_peg) \ diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 844cf6875f60..9729acacdc19 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -965,27 +965,32 @@ static void dom_data_exit(struct rdt_resource *r) mutex_unlock(&rdtgroup_mutex); } +#define MON_EVENT(_eventid, _name, _res, _fp) \ + [_eventid] = { \ + .name = _name, \ + .evtid = _eventid, \ + .rid = _res, \ + .is_floating_point = _fp, \ +} + /* * All available events. Architecture code marks the ones that * are supported by a system using resctrl_enable_mon_event() * to set .enabled. 
*/ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { - [QOS_L3_OCCUP_EVENT_ID] = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_TOTAL_EVENT_ID] = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_LOCAL_EVENT_ID] = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, + MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(PMT_EVENT_ENERGY, "core_energy", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_STALLS_LLC_HIT, "stalls_llc_hit", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_C1_RES, "c1_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_CORE_CYCLES, "unhalted_core_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_STALLS_LLC_MISS, "stalls_llc_miss", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_AUTO_C6_RES, "c6_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_REF_CYCLES, "unhalted_ref_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits) diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index acfe07860b34..a5f56faa18d2 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -50,6 +50,17 @@ enum resctrl_event_id { QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, + /* Intel Telemetry Events */ + PMT_EVENT_ENERGY, + PMT_EVENT_ACTIVITY, + PMT_EVENT_STALLS_LLC_HIT, + PMT_EVENT_C1_RES, + PMT_EVENT_UNHALTED_CORE_CYCLES, + 
PMT_EVENT_STALLS_LLC_MISS, + PMT_EVENT_AUTO_C6_RES, + PMT_EVENT_UNHALTED_REF_CYCLES, + PMT_EVENT_UOPS_RETIRED, + /* Must be the last */ QOS_NUM_EVENTS, }; -- Gitee From 94eca335cfda0c2201c74c7b4ab8c5260cbf7c04 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:05 -0800 Subject: [PATCH 069/124] x86,fs/resctrl: Add architectural event pointer ANBZ: #31060 commit 8ccb1f8fa6a3dfde32cf33e7ded3558014e6cca2 upstream. The resctrl file system layer passes the domain, RMID, and event id to the architecture to fetch an event counter. Fetching a telemetry event counter requires additional information that is private to the architecture, for example, the offset into MMIO space from where the counter should be read. Add mon_evt::arch_priv that architecture can use for any private data related to the event. The resctrl filesystem initializes mon_evt::arch_priv when the architecture enables the event and passes it back to architecture when needing to fetch an event counter. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 6 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 14 ++++++++++---- include/linux/resctrl.h | 7 +++++-- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index f6f46e42fb23..b8b565257d22 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -909,15 +909,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - 
resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 8760073a8cb2..c422196c4566 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -239,7 +239,7 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) + void *arch_priv, u64 *val, void *ignored) { struct rdt_hw_l3_mon_domain *hw_dom; struct rdt_l3_mon_domain *d; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 56fd43068ab4..2715afa5951b 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -66,6 +66,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @binary_bits: number of fixed-point binary bits from architecture, * only valid if @is_floating_point is true * @enabled: true if the event is enabled + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). 
*/ struct mon_evt { enum resctrl_event_id evtid; @@ -77,6 +80,7 @@ struct mon_evt { bool is_floating_point; unsigned int binary_bits; bool enabled; + void *arch_priv; }; extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9729acacdc19..af43a33ce4cb 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -137,9 +137,11 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) struct rmid_entry *entry; u32 idx, cur_idx = 1; void *arch_mon_ctx; + void *arch_priv; bool rmid_dirty; u64 val = 0; + arch_priv = mon_event_all[QOS_L3_OCCUP_EVENT_ID].arch_priv; arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); if (IS_ERR(arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", @@ -160,7 +162,7 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) entry = __rmid_entry(idx); if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, + QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { rmid_dirty = true; } else { @@ -456,7 +458,8 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) rr->evt->evtid, &tval); else rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, - rr->evt->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -501,7 +504,8 @@ static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *r if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, - rr->evt->evtid, &tval, rr->arch_mon_ctx); + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -993,7 +997,8 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool 
any_cpu, unsigned int binary_bits) +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv) { if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || binary_bits > MAX_BINARY_BITS)) @@ -1009,6 +1014,7 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsig mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].binary_bits = binary_bits; + mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9eaad4befd0f..fda7b0ccfd8b 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -412,7 +412,7 @@ u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, - unsigned int binary_bits); + unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); @@ -529,6 +529,9 @@ void resctrl_arch_pre_mount(void); * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). * @val: result of the counter read in bytes. 
* @arch_mon_ctx: An architecture specific value from * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies @@ -546,7 +549,7 @@ void resctrl_arch_pre_mount(void); */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx); + void *arch_priv, u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts -- Gitee From cf434d026636b2a6a3a6b1d494350d9e7a8353d5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:06 -0800 Subject: [PATCH 070/124] x86/resctrl: Find and enable usable telemetry events ANBZ: #31060 commit 7e6df9614546ae7eb1f1b2074d7b6039bb01540d upstream. Every event group has a private copy of the data of all telemetry event aggregators (aka "telemetry regions") tracking its feature type. Included may be regions that have the same feature type but tracking different GUID from the event group's. Traverse the event group's telemetry region data and mark all regions that are not usable by the event group as unusable by clearing those regions' MMIO addresses. A region is considered unusable if: 1) GUID does not match the GUID of the event group. 2) Package ID is invalid. 3) The enumerated size of the MMIO region does not match the expected value from the XML description file. Hereafter any telemetry region with an MMIO address is considered valid for the event group it is associated with. Enable all the event group's events as long as there is at least one usable region from where data for its events can be read. Enabling of an event can fail if the same event has already been enabled as part of another event group. It should never happen that the same event is described by different GUID supported by the same system so just WARN (via resctrl_enable_mon_event()) and skip the event. 
Note that it is architecturally possible that some telemetry events are only supported by a subset of the packages in the system. It is not expected that systems will ever do this. If they do the user will see event files in resctrl that always return "Unavailable". Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 63 ++++++++++++++++++++++++- fs/resctrl/monitor.c | 10 ++-- include/linux/resctrl.h | 2 +- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 8e042b530c91..7d0bd7b070a7 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include "internal.h" @@ -110,12 +112,69 @@ static struct event_group *known_event_groups[] = { _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ _peg++) -/* Stub for now */ -static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) { + if (tr->guid != e->guid) + return true; + if (tr->plat_info.package_id >= topology_max_packages()) { + pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id, + tr->guid); + return true; + } + if (tr->size != e->mmio_size) { + pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. 
Expected %zu bytes.\n", + tr->size, e->guid, e->mmio_size); + return true; + } + return false; } +static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p) +{ + bool usable_regions = false; + + for (int i = 0; i < p->count; i++) { + if (skip_telem_region(&p->regions[i], e)) { + /* + * Clear the address field of regions that did not pass the checks in + * skip_telem_region() so they will not be used by intel_aet_read_event(). + * This is safe to do because intel_pmt_get_regions_by_feature() allocates + * a new pmt_feature_group structure to return to each caller and only makes + * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group() + * returns the structure. + */ + p->regions[i].addr = NULL; + + continue; + } + usable_regions = true; + } + + return usable_regions; +} + +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int skipped_events = 0; + + if (!group_has_usable_regions(e, p)) + return false; + + for (int j = 0; j < e->num_events; j++) { + if (!resctrl_enable_mon_event(e->evts[j].id, true, + e->evts[j].bin_bits, &e->evts[j])) + skipped_events++; + } + if (e->num_events == skipped_events) { + pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid); + return false; + } + + return true; +} + static enum pmt_feature_id lookup_pfid(const char *pfname) { if (!strcmp(pfname, "energy")) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index af43a33ce4cb..9af08b673e39 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -997,25 +997,27 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits, void *arch_priv) { if 
(WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || binary_bits > MAX_BINARY_BITS)) - return; + return false; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); - return; + return false; } if (binary_bits && !mon_event_all[eventid].is_floating_point) { pr_warn("Event %d may not be floating point\n", eventid); - return; + return false; } mon_event_all[eventid].any_cpu = any_cpu; mon_event_all[eventid].binary_bits = binary_bits; mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; + + return true; } bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fda7b0ccfd8b..c8fd506648bb 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -411,7 +411,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); -- Gitee From 01c318a15a4ced160aff1c7e535b7175104bc42f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:07 -0800 Subject: [PATCH 071/124] x86/resctrl: Read telemetry events ANBZ: #31060 commit 51541f6ca7718d8278e12fe80af80033268743b2 upstream. Introduce intel_aet_read_event() to read telemetry events for resource RDT_RESOURCE_PERF_PKG. There may be multiple aggregators tracking each package, so scan all of them and add up all counters. Aggregators may return an invalid data indication if they have received no records for a given RMID. The user will see "Unavailable" if none of the aggregators on a package provide valid counts. Resctrl now uses readq() so depends on X86_64. Update Kconfig. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/resctrl/intel_aet.c | 51 +++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 5 +++ arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++ fs/resctrl/monitor.c | 14 +++++++ 5 files changed, 75 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 484fda0192fc..dd0faa285d98 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -517,7 +517,7 @@ config X86_CPU_RESCTRL config X86_CPU_RESCTRL_INTEL_AET bool "Intel Application Energy Telemetry" - depends on X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + depends on X86_64 && X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y help Enable per-RMID telemetry events in resctrl. diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 7d0bd7b070a7..96d627e2c52d 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -11,11 +11,15 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include #include +#include #include +#include #include #include #include +#include #include #include #include @@ -232,3 +236,50 @@ void __exit intel_aet_exit(void) } } } + +#define DATA_VALID BIT_ULL(63) +#define DATA_BITS GENMASK_ULL(62, 0) + +/* + * Read counter for an event on a domain (summing all aggregators on the + * domain). If an aggregator hasn't received any data for a specific RMID, + * the MMIO read indicates that data is not valid. Return success if at + * least one aggregator has valid data. 
+ */ +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + struct pmt_event *pevt = arch_priv; + struct event_group *e; + bool valid = false; + u64 total = 0; + u64 evtcount; + void *pevt0; + u32 idx; + + pevt0 = pevt - pevt->idx; + e = container_of(pevt0, struct event_group, evts); + idx = rmid * e->num_events; + idx += pevt->idx; + + if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) { + pr_warn_once("MMIO index %u out of range\n", idx); + return -EIO; + } + + for (int i = 0; i < e->pfg->count; i++) { + if (!e->pfg->regions[i].addr) + continue; + if (e->pfg->regions[i].plat_info.package_id != domid) + continue; + evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64)); + if (!(evtcount & DATA_VALID)) + continue; + total += evtcount & DATA_BITS; + valid = true; + } + + if (valid) + *val = total; + + return valid ? 0 : -EINVAL; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index bdef7f91a2b9..a7b4ff6e9957 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -223,9 +223,14 @@ void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET bool intel_aet_get_events(void); void __exit intel_aet_exit(void); +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } +static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + return -EINVAL; +} #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c422196c4566..53d6f642f486 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -250,6 +250,10 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, int ret; 
resctrl_arch_rmid_read_context_check(); + + if (r->rid == RDT_RESOURCE_PERF_PKG) + return intel_aet_read_event(hdr->id, rmid, arch_priv, val); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9af08b673e39..8a4c2ae72740 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -527,6 +527,20 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return __l3_mon_event_count(rdtgrp, rr); else return __l3_mon_event_count_sum(rdtgrp, rr); + case RDT_RESOURCE_PERF_PKG: { + u64 tval = 0; + + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, rdtgrp->closid, + rdtgrp->mon.rmid, rr->evt->evtid, + rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } default: rr->err = -EINVAL; return -EINVAL; -- Gitee From 554db63bff31f4f7366bf5050d31f74445c65705 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:08 -0800 Subject: [PATCH 072/124] fs/resctrl: Refactor mkdir_mondata_subdir() ANBZ: #31060 commit 0ec1db4cac8239bb32da87586c3638200b65dd8c upstream. Population of a monitor group's mon_data directory is unreasonably complicated because of the support for Sub-NUMA Cluster (SNC) mode. Split out the SNC code into a helper function to make it easier to add support for a new telemetry resource. Move all the duplicated code to make and set owner of domain directories into the mon_add_all_files() helper and rename to _mkdir_mondata_subdir(). 
Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/rdtgroup.c | 108 +++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 09b52aa2a773..6af00af7d6ec 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3195,57 +3195,65 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_domain_hdr *hdr, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) +/* + * Create a directory for a domain and populate it with monitor files. Create + * summing monitors when @hdr is NULL. No need to initialize summing monitors. + */ +static struct kernfs_node *_mkdir_mondata_subdir(struct kernfs_node *parent_kn, char *name, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, + struct rdtgroup *prgrp, int domid) { - struct rdt_l3_mon_domain *d; struct rmid_read rr = {0}; + struct kernfs_node *kn; struct mon_data *priv; struct mon_evt *mevt; - int ret, domid; + int ret; - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - return -EINVAL; + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - d = container_of(hdr, struct rdt_l3_mon_domain, hdr); for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; - domid = do_sum ? 
d->ci_id : d->hdr.id; - priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); - if (WARN_ON_ONCE(!priv)) - return -EINVAL; + priv = mon_get_kn_priv(r->rid, domid, mevt, !hdr); + if (WARN_ON_ONCE(!priv)) { + ret = -EINVAL; + goto out_destroy; + } ret = mon_addfile(kn, mevt->name, priv); if (ret) - return ret; + goto out_destroy; - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) + if (hdr && resctrl_is_mbm_event(mevt->evtid)) mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } - return 0; + return kn; +out_destroy: + kernfs_remove(kn); + return ERR_PTR(ret); } -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_domain_hdr *hdr, - struct rdt_resource *r, struct rdtgroup *prgrp) +static int mkdir_mondata_subdir_snc(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) { - struct kernfs_node *kn, *ckn; + struct kernfs_node *ckn, *kn; struct rdt_l3_mon_domain *d; char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return -EINVAL; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* @@ -3254,41 +3262,41 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, */ kernfs_put(kn); } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + kn = _mkdir_mondata_subdir(parent_kn, name, NULL, r, prgrp, d->ci_id); if (IS_ERR(kn)) return PTR_ERR(kn); + } - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, hdr, r, prgrp, snc_mode); - if (ret) - goto out_destroy; + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); + ckn = _mkdir_mondata_subdir(kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(ckn)) { + kernfs_remove(kn); + return PTR_ERR(ckn); } - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } + kernfs_activate(kn); + return 0; +} - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn; + char name[32]; - ret = mon_add_all_files(ckn, hdr, r, prgrp, false); - if (ret) - goto out_destroy; - } + lockdep_assert_held(&rdtgroup_mutex); + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) + return mkdir_mondata_subdir_snc(parent_kn, hdr, r, prgrp); + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + kn = _mkdir_mondata_subdir(parent_kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(kn)) + return PTR_ERR(kn); kernfs_activate(kn); return 0; - -out_destroy: - kernfs_remove(kn); - return ret; } /* -- Gitee From d8c420f1b11c8b7fc933112fe43edad0b4328baf Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:09 -0800 Subject: [PATCH 073/124] fs/resctrl: Refactor rmdir_mondata_subdir_allrdtgrp() ANBZ: #31060 commit 
93d9fd89995181d7ff420752328cc8b4b228f100 upstream. Clearing a monitor group's mon_data directory is complicated because of the support for Sub-NUMA Cluster (SNC) mode. Refactor the SNC case into a helper function to make it easier to add support for a new telemetry resource. Suggested-by: Reinette Chatre Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/rdtgroup.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 6af00af7d6ec..51dbbc7e34c9 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3164,28 +3164,24 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn } /* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. + * Remove files and directories for one SNC node. If it is the last node + * sharing an L3 cache, then remove the upper level directory containing + * the "sum" files too. */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain_hdr *hdr) +static void rmdir_mondata_subdir_allrdtgrp_snc(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; struct rdt_l3_mon_domain *d; char subname[32]; - bool snc_mode; char name[32]; if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) return; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci_id : hdr->id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3195,6 +3191,30 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) { + rmdir_mondata_subdir_allrdtgrp_snc(r, hdr); + return; + } + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + /* * Create a directory for a domain and populate it with monitor files. Create * summing monitors when @hdr is NULL. No need to initialize summing monitors. -- Gitee From f58c7d5e002828f607a1df91f3d2147f0a368038 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:10 -0800 Subject: [PATCH 074/124] x86,fs/resctrl: Handle domain creation/deletion for RDT_RESOURCE_PERF_PKG ANBZ: #31060 commit f4e0cd80d3e7c31327459008b01d63804838a89d upstream. The L3 resource has several requirements for domains. There are per-domain structures that hold the 64-bit values of counters, and elements to keep track of the overflow and limbo threads. None of these are needed for the PERF_PKG resource. The hardware counters are wide enough that they do not wrap around for decades. 
Define a new rdt_perf_pkg_mon_domain structure which just consists of the standard rdt_domain_hdr to keep track of domain id and CPU mask. Update resctrl_online_mon_domain() for RDT_RESOURCE_PERF_PKG. The only action needed for this resource is to create and populate domain directories if a domain is added while resctrl is mounted. Similarly resctrl_offline_mon_domain() only needs to remove domain directories. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 17 +++++++++++++++ arch/x86/kernel/cpu/resctrl/intel_aet.c | 29 +++++++++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 13 +++++++++++ fs/resctrl/rdtgroup.c | 17 ++++++++++----- 4 files changed, 71 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b8b565257d22..ac09f583c27a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -574,6 +574,10 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) if (!hdr) l3_mon_domain_setup(cpu, id, r, add_pos); break; + case RDT_RESOURCE_PERF_PKG: + if (!hdr) + intel_aet_mon_domain_setup(cpu, id, r, add_pos); + break; default: pr_warn_once("Unknown resource rid=%d\n", r->rid); break; @@ -673,6 +677,19 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) l3_mon_domain_free(hw_dom); break; } + case RDT_RESOURCE_PERF_PKG: { + struct rdt_perf_pkg_mon_domain *pkgd; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG)) + return; + + pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); + synchronize_rcu(); + kfree(pkgd); + break; + } default: pr_warn_once("Unknown resource rid=%d\n", r->rid); break; diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c 
b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 96d627e2c52d..9351fe5b645a 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -14,15 +14,20 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include +#include +#include #include #include +#include #include #include #include @@ -283,3 +288,27 @@ int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) return valid ? 0 : -EINVAL; } + +void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) +{ + struct rdt_perf_pkg_mon_domain *d; + int err; + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_PERF_PKG; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, &d->hdr); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + kfree(d); + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a7b4ff6e9957..2b5be3024037 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -87,6 +87,14 @@ static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3 return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } +/** + * struct rdt_perf_pkg_mon_domain - CPUs sharing an package scoped resctrl monitor resource + * @hdr: common header for different domain types + */ +struct rdt_perf_pkg_mon_domain { + struct rdt_domain_hdr hdr; +}; + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -224,6 +232,8 @@ void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); bool intel_aet_get_events(void); void __exit intel_aet_exit(void); int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); +void 
intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } @@ -231,6 +241,9 @@ static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 { return -EINVAL; } + +static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) { } #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 51dbbc7e34c9..274fe9b1433e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4243,11 +4243,6 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h mutex_lock(&rdtgroup_mutex); - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) - goto out_unlock; - - d = container_of(hdr, struct rdt_l3_mon_domain, hdr); - /* * If resctrl is mounted, remove all the * per domain monitor data directories. 
@@ -4255,6 +4250,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, hdr); + if (r->rid != RDT_RESOURCE_L3) + goto out_unlock; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { @@ -4351,6 +4353,9 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr mutex_lock(&rdtgroup_mutex); + if (r->rid != RDT_RESOURCE_L3) + goto mkdir; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) goto out_unlock; @@ -4368,6 +4373,8 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); +mkdir: + err = 0; /* * If the filesystem is not mounted then only the default resource group * exists. Creation of its directories is deferred until mount time -- Gitee From e2174c38581804ee8d39cadb5d2f3f8ca871450d Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:11 -0800 Subject: [PATCH 075/124] x86/resctrl: Add energy/perf choices to rdt boot option ANBZ: #31060 commit 842e7f97d71a4116a650ec0045d6444b4377b512 upstream. Legacy resctrl features are enumerated by X86_FEATURE_* flags. These may be overridden by quirks to disable features in the case of errata. Users can use kernel command line options to either disable a feature, or to force enable a feature that was disabled by a quirk. A different approach is needed for hardware features that do not have an X86_FEATURE_* flag. 
Update parsing of the "rdt=" boot parameter to call the telemetry driver directly to handle new "perf" and "energy" options that controls activation of telemetry monitoring of the named type. By itself a "perf" or "energy" option controls the forced enabling or disabling (with ! prefix) of all event groups of the named type. A ":guid" suffix allows for fine grained control per event group. [ bp: s/intel_aet_option/intel_handle_aet_option/g ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- .../admin-guide/kernel-parameters.txt | 7 +++- arch/x86/kernel/cpu/resctrl/core.c | 2 + arch/x86/kernel/cpu/resctrl/intel_aet.c | 38 +++++++++++++++++++ arch/x86/kernel/cpu/resctrl/internal.h | 2 + 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index beda032be1dc..1c19a23748e7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5556,9 +5556,14 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec, abmc. + mba, smba, bmec, abmc, energy[:guid], + perf[:guid]. E.g. 
to turn on cmt and turn off mba use: rdt=cmt,!mba + To turn off all energy telemetry monitoring and ensure that + perf telemetry monitoring associated with guid 0x12345 + is enabled use: + rdt=!energy,perf:0x12345 reboot= [KNL] Format (x86 or x86_64): diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index ac09f583c27a..dfcc3df46d25 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -806,6 +806,8 @@ static int __init set_rdt_options(char *str) force_off = *tok == '!'; if (force_off) tok++; + if (intel_handle_aet_option(force_off, tok)) + continue; for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { if (strcmp(tok, o->name) == 0) { if (force_off) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index 9351fe5b645a..dc25e8d2527d 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -52,12 +52,17 @@ struct pmt_event { /** * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. * @pfname: PMT feature name ("energy" or "perf") of this event group. + * Used by boot rdt= option. * @pfg: Points to the aggregated telemetry space information * returned by the intel_pmt_get_regions_by_feature() * call to the INTEL_PMT_TELEMETRY driver that contains * data for all telemetry regions of type @pfname. * Valid if the system supports the event group, * NULL otherwise. + * @force_off: True when "rdt" command line or architecture code disables + * this event group. + * @force_on: True when "rdt" command line overrides disable of this + * event group. * @guid: Unique number per XML description file. * @mmio_size: Number of bytes of MMIO registers for this group. * @num_events: Number of events in this group. @@ -67,6 +72,7 @@ struct event_group { /* Data fields for additional structures to manage this group. 
*/ const char *pfname; struct pmt_feature_group *pfg; + bool force_off, force_on; /* Remaining fields initialized from XML file. */ u32 guid; @@ -121,6 +127,35 @@ static struct event_group *known_event_groups[] = { _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ _peg++) +bool intel_handle_aet_option(bool force_off, char *tok) +{ + struct event_group **peg; + bool ret = false; + u32 guid = 0; + char *name; + + if (!tok) + return false; + + name = strsep(&tok, ":"); + if (tok && kstrtou32(tok, 16, &guid)) + return false; + + for_each_event_group(peg) { + if (strcmp(name, (*peg)->pfname)) + continue; + if (guid && (*peg)->guid != guid) + continue; + if (force_off) + (*peg)->force_off = true; + else + (*peg)->force_on = true; + ret = true; + } + + return ret; +} + static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) { if (tr->guid != e->guid) @@ -168,6 +203,9 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; int skipped_events = 0; + if (e->force_off) + return false; + if (!group_has_usable_regions(e, p)) return false; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 2b5be3024037..2d76a02872e0 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -234,6 +234,7 @@ void __exit intel_aet_exit(void); int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos); +bool intel_handle_aet_option(bool force_off, char *tok); #else static inline bool intel_aet_get_events(void) { return false; } static inline void __exit intel_aet_exit(void) { } @@ -244,6 +245,7 @@ static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource 
*r, struct list_head *add_pos) { } +static inline bool intel_handle_aet_option(bool force_off, char *tok) { return false; } #endif #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ -- Gitee From 5ccc9c9d6bfcf41aad777df35daf29274ef227e5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:12 -0800 Subject: [PATCH 076/124] x86/resctrl: Handle number of RMIDs supported by RDT_RESOURCE_PERF_PKG ANBZ: #31060 commit 67640e333b983298be624a41c43e3a8ed4713a73 upstream. There are now three meanings for "number of RMIDs": 1) The number for legacy features enumerated by CPUID leaf 0xF. This is the maximum number of distinct values that can be loaded into MSR_IA32_PQR_ASSOC. Note that systems with Sub-NUMA Cluster mode enabled will force scaling down the CPUID enumerated value by the number of SNC nodes per L3-cache. 2) The number of registers in MMIO space for each event. This is enumerated in the XML files and is the value initialized into event_group::num_rmid. 3) The number of "hardware counters" (this isn't a strictly accurate description of how things work, but serves as a useful analogy that does describe the limitations) feeding to those MMIO registers. This is enumerated in telemetry_region::num_rmids returned by intel_pmt_get_regions_by_feature(). Event groups with insufficient "hardware counters" to track all RMIDs are difficult for users to use, since the system may reassign "hardware counters" at any time. This means that users cannot reliably collect two consecutive event counts to compute the rate at which events are occurring. Disable such event groups by default. The user may override this with a command line "rdt=" option. In this case limit an under-resourced event group's number of possible monitor resource groups to the lowest number of "hardware counters". 
Scan all enabled event groups and assign the RDT_RESOURCE_PERF_PKG resource "num_rmid" value to the smallest of these values as this value will be used later to compare against the number of RMIDs supported by other resources to determine how many monitoring resource groups are supported. N.B. Change type of resctrl_mon::num_rmid to u32 to match its usage and the type of event_group::num_rmid so that min(r->num_rmid, e->num_rmid) won't complain about mixing signed and unsigned types. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/intel_aet.c | 53 ++++++++++++++++++++++++- fs/resctrl/rdtgroup.c | 2 +- include/linux/resctrl.h | 2 +- 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index dc25e8d2527d..aba997135003 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,10 +61,14 @@ struct pmt_event { * Valid if the system supports the event group, * NULL otherwise. * @force_off: True when "rdt" command line or architecture code disables - * this event group. + * this event group due to insufficient RMIDs. * @force_on: True when "rdt" command line overrides disable of this * event group. * @guid: Unique number per XML description file. + * @num_rmid: Number of RMIDs supported by this group. May be + * adjusted downwards if enumeration from + * intel_pmt_get_regions_by_feature() indicates fewer + * RMIDs can be tracked simultaneously. * @mmio_size: Number of bytes of MMIO registers for this group. * @num_events: Number of events in this group. * @evts: Array of event descriptors. @@ -76,6 +81,7 @@ struct event_group { /* Remaining fields initialized from XML file. 
*/ u32 guid; + u32 num_rmid; size_t mmio_size; unsigned int num_events; struct pmt_event evts[] __counted_by(num_events); @@ -90,6 +96,7 @@ struct event_group { static struct event_group energy_0x26696143 = { .pfname = "energy", .guid = 0x26696143, + .num_rmid = 576, .mmio_size = XML_MMIO_SIZE(576, 2, 3), .num_events = 2, .evts = { @@ -104,6 +111,7 @@ static struct event_group energy_0x26696143 = { static struct event_group perf_0x26557651 = { .pfname = "perf", .guid = 0x26557651, + .num_rmid = 576, .mmio_size = XML_MMIO_SIZE(576, 7, 3), .num_events = 7, .evts = { @@ -198,6 +206,23 @@ static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_g return usable_regions; } +static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p) +{ + struct telemetry_region *tr; + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + tr = &p->regions[i]; + if (tr->num_rmids < e->num_rmid) { + e->force_off = true; + return false; + } + } + + return true; +} + static bool enable_events(struct event_group *e, struct pmt_feature_group *p) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; @@ -209,6 +234,27 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) if (!group_has_usable_regions(e, p)) return false; + /* + * Only enable event group with insufficient RMIDs if the user requested + * it from the kernel command line. + */ + if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) { + pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n", + r->name, e->pfname, e->guid); + return false; + } + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + /* + * e->num_rmid only adjusted lower if user (via rdt= kernel + * parameter) forces an event group with insufficient RMID + * to be enabled. 
+ */ + e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids); + } + for (int j = 0; j < e->num_events; j++) { if (!resctrl_enable_mon_event(e->evts[j].id, true, e->evts[j].bin_bits, &e->evts[j])) @@ -219,6 +265,11 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) return false; } + if (r->mon.num_rmid) + r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid); + else + r->mon.num_rmid = e->num_rmid; + return true; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 274fe9b1433e..d15ec6d41de8 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1136,7 +1136,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->mon.num_rmid); + seq_printf(seq, "%u\n", r->mon.num_rmid); return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index c8fd506648bb..2e467cfb4a23 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -292,7 +292,7 @@ enum resctrl_schema_fmt { * events of monitor groups created via mkdir. */ struct resctrl_mon { - int num_rmid; + u32 num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; -- Gitee From f92420cb474bd0a1f910de96f0a1bd8378d3f95f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:13 -0800 Subject: [PATCH 077/124] fs/resctrl: Move allocation/free of closid_num_dirty_rmid[] ANBZ: #31060 commit ee7f6af79f0916b6c49e15edd4cba020b3e4c4ac upstream. closid_num_dirty_rmid[] and rmid_ptrs[] are allocated together during resctrl initialization and freed together during resctrl exit. Telemetry events are enumerated on resctrl mount so only at resctrl mount will the number of RMID supported by all monitoring resources and needed as size for rmid_ptrs[] be known. Separate closid_num_dirty_rmid[] and rmid_ptrs[] allocation and free in preparation for rmid_ptrs[] to be allocated on resctrl mount. 
Keep the rdtgroup_mutex protection around the allocation and free of closid_num_dirty_rmid[] as ARM needs this to guarantee memory ordering. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/monitor.c | 79 ++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 8a4c2ae72740..bbf8c9037887 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -906,36 +906,14 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del static int dom_data_init(struct rdt_resource *r) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; int err = 0, i; u32 idx; mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. 
- */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } err = -ENOMEM; goto out_unlock; } @@ -971,11 +949,6 @@ static void dom_data_exit(struct rdt_resource *r) if (!r->mon_capable) goto out_unlock; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - kfree(rmid_ptrs); rmid_ptrs = NULL; @@ -1814,6 +1787,45 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 num_closid = resctrl_arch_get_num_closid(r); + u32 *tmp; + + /* For ARM memory ordering access to closid_num_dirty_rmid */ + mutex_lock(&rdtgroup_mutex); + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + mutex_unlock(&rdtgroup_mutex); + return -ENOMEM; + } + + closid_num_dirty_rmid = tmp; + + mutex_unlock(&rdtgroup_mutex); + } + + return 0; +} + +static void closid_num_dirty_rmid_free(void) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + mutex_lock(&rdtgroup_mutex); + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + mutex_unlock(&rdtgroup_mutex); + } +} + /** * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. 
* @@ -1834,10 +1846,16 @@ int resctrl_l3_mon_resource_init(void) if (!r->mon_capable) return 0; - ret = dom_data_init(r); + ret = closid_num_dirty_rmid_alloc(r); if (ret) return ret; + ret = dom_data_init(r); + if (ret) { + closid_num_dirty_rmid_free(); + return ret; + } + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", @@ -1880,5 +1898,10 @@ void resctrl_l3_mon_resource_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + if (!r->mon_capable) + return; + + closid_num_dirty_rmid_free(); + dom_data_exit(r); } -- Gitee From f98df9dd73d0b452b0162079ef02183063d978e4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:14 -0800 Subject: [PATCH 078/124] x86,fs/resctrl: Compute number of RMIDs as minimum across resources ANBZ: #31060 commit 0ecc988b0232259cbdb2b7e452bda74f550f0911 upstream. resctrl assumes that only the L3 resource supports monitor events, so it simply takes the rdt_resource::num_rmid from RDT_RESOURCE_L3 as the system's number of RMIDs. The addition of telemetry events in a different resource breaks that assumption. Compute the number of available RMIDs as the minimum value across all mon_capable resources (analogous to how the number of CLOSIDs is computed across alloc_capable resources). Note that mount time enumeration of the telemetry resource means that this number can be reduced. If this happens, then some memory will be wasted as the allocations for rdt_l3_mon_domain::mbm_states[] and rdt_l3_mon_domain::rmid_busy_llc created during resctrl initialization will be larger than needed. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 15 +++++++++++++-- fs/resctrl/rdtgroup.c | 6 ++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index dfcc3df46d25..cb172a60e927 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -109,12 +109,23 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { }, }; +/** + * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs + * (minimum across all mon_capable resource) + * + * Return: Number of supported RMIDs at time of call. Note that mount time + * enumeration of resources may reduce the number. + */ u32 resctrl_arch_system_num_rmid_idx(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 num_rmids = U32_MAX; + struct rdt_resource *r; + + for_each_mon_capable_rdt_resource(r) + num_rmids = min(num_rmids, r->mon.num_rmid); /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->mon.num_rmid; + return num_rmids == U32_MAX ? 0 : num_rmids; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index d15ec6d41de8..5913ac4b3acd 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4288,6 +4288,12 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h * During boot this may be called before global allocations have been made by * resctrl_l3_mon_resource_init(). * + * Called during CPU online that may run as soon as CPU online callbacks + * are set up during resctrl initialization. The number of supported RMIDs + * may be reduced if additional mon_capable resources are enumerated + * at mount time. 
This means the rdt_l3_mon_domain::mbm_states[] and + * rdt_l3_mon_domain::rmid_busy_llc allocations may be larger than needed. + * * Return: 0 for success, or -ENOMEM. */ static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) -- Gitee From 5e9907019f625685df2dfa7b7723f6245810a547 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:15 -0800 Subject: [PATCH 079/124] fs/resctrl: Move RMID initialization to first mount ANBZ: #31060 commit d0891647fbc6e931f27517364cbc4ee1811d76db upstream. L3 monitor features are enumerated during resctrl initialization and rmid_ptrs[] that tracks all RMIDs and depends on the number of supported RMIDs is allocated during this time. Telemetry monitor features are enumerated during first resctrl mount and may support a different number of RMIDs compared to L3 monitor features. Delay allocation and initialization of rmid_ptrs[] until first mount. Since the number of RMIDs cannot change on later mounts, keep the same set of rmid_ptrs[] until resctrl_exit(). This is required because the limbo handler keeps running after resctrl is unmounted and needs to access rmid_ptrs[] as it keeps tracking busy RMIDs after unmount. 
Rename routines to match what they now do: dom_data_init() -> setup_rmid_lru_list() dom_data_exit() -> free_rmid_lru_list() Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- fs/resctrl/internal.h | 4 ++++ fs/resctrl/monitor.c | 54 ++++++++++++++++++++----------------------- fs/resctrl/rdtgroup.c | 5 ++++ 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 2715afa5951b..ea833ca459b6 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -369,6 +369,10 @@ int closids_supported(void); void closid_free(int closid); +int setup_rmid_lru_list(void); + +void free_rmid_lru_list(void); + int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index bbf8c9037887..0cd5476a483a 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -903,20 +903,29 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } -static int dom_data_init(struct rdt_resource *r) +int setup_rmid_lru_list(void) { - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry = NULL; - int err = 0, i; + u32 idx_limit; u32 idx; + int i; - mutex_lock(&rdtgroup_mutex); + if (!resctrl_arch_mon_capable()) + return 0; + /* + * Called on every mount, but the number of RMIDs cannot change + * after the first mount, so keep using the same set of rmid_ptrs[] + * until resctrl_exit(). Note that the limbo handler continues to + * access rmid_ptrs[] after resctrl is unmounted. 
+ */ + if (rmid_ptrs) + return 0; + + idx_limit = resctrl_arch_system_num_rmid_idx(); rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - err = -ENOMEM; - goto out_unlock; - } + if (!rmid_ptrs) + return -ENOMEM; for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; @@ -929,30 +938,24 @@ static int dom_data_init(struct rdt_resource *r) /* * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). + * control group, which was setup earlier in rdtgroup_setup_default(). */ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); entry = __rmid_entry(idx); list_del(&entry->list); -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; + return 0; } -static void dom_data_exit(struct rdt_resource *r) +void free_rmid_lru_list(void) { - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; + if (!resctrl_arch_mon_capable()) + return; + mutex_lock(&rdtgroup_mutex); kfree(rmid_ptrs); rmid_ptrs = NULL; - -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -1830,7 +1833,8 @@ static void closid_num_dirty_rmid_free(void) * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID + * with the cleanest set of RMIDs. * Called once during boot after the struct rdt_resource's have been configured * but before the filesystem is mounted. 
* Resctrl's cpuhp callbacks may be called before this point to bring a domain @@ -1850,12 +1854,6 @@ int resctrl_l3_mon_resource_init(void) if (ret) return ret; - ret = dom_data_init(r); - if (ret) { - closid_num_dirty_rmid_free(); - return ret; - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", @@ -1902,6 +1900,4 @@ void resctrl_l3_mon_resource_exit(void) return; closid_num_dirty_rmid_free(); - - dom_data_exit(r); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5913ac4b3acd..3e3ab762a64a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2735,6 +2735,10 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = setup_rmid_lru_list(); + if (ret) + goto out; + ret = rdtgroup_setup_root(ctx); if (ret) goto out; @@ -4588,4 +4592,5 @@ void resctrl_exit(void) */ resctrl_l3_mon_resource_exit(); + free_rmid_lru_list(); } -- Gitee From e53cf8592ae4344188070de7e9f3a8e74d83e202 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 8 Jan 2026 09:42:27 -0800 Subject: [PATCH 080/124] x86/resctrl: Enable RDT_RESOURCE_PERF_PKG ANBZ: #31060 commit 4bbfc90122e974ccbd9aa80c964413052b9519f3 upstream. Since telemetry events are enumerated on resctrl mount the RDT_RESOURCE_PERF_PKG resource is not considered "monitoring capable" during early resctrl initialization. This means that the domain list for RDT_RESOURCE_PERF_PKG is not built when the CPU hotplug notifiers are registered and run for the first time right after resctrl initialization. Mark the RDT_RESOURCE_PERF_PKG as "monitoring capable" upon successful telemetry event enumeration to ensure future CPU hotplug events include this resource and initialize its domain list for CPUs that are already online. 
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- arch/x86/kernel/cpu/resctrl/core.c | 16 ++++++++++++++++ arch/x86/kernel/cpu/resctrl/intel_aet.c | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index cb172a60e927..cd240b5e840f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -760,8 +760,24 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) void resctrl_arch_pre_mount(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int cpu; + if (!intel_aet_get_events()) return; + + /* + * Late discovery of telemetry events means the domains for the + * resource were not built. Do that now. + */ + cpus_read_lock(); + mutex_lock(&domain_list_lock); + r->mon_capable = true; + rdt_mon_capable = true; + for_each_online_cpu(cpu) + domain_add_cpu_mon(cpu, r); + mutex_unlock(&domain_list_lock); + cpus_read_unlock(); } enum { diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c index aba997135003..89b8b619d5d5 100644 --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -270,6 +270,12 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p) else r->mon.num_rmid = e->num_rmid; + if (skipped_events) + pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name, + e->pfname, e->guid, skipped_events); + else + pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid); + return true; } -- Gitee From 3997d71169ec3ebe81254ece97436ba0b8aaec5e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Dec 2025 09:21:19 -0800 Subject: [PATCH 081/124] x86,fs/resctrl: Update documentation for telemetry events ANBZ: #31060 commit 
a8848c4b43ad00c8a18db080206e3ffa53a08b91 upstream. Update resctrl filesystem documentation with the details about the resctrl files that support telemetry events. [ bp: Drop the debugfs hunk of the documentation until a better debugging solution is found. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/20251217172121.12030-1-tony.luck@intel.com Signed-off-by: Wei Chen --- Documentation/filesystems/resctrl.rst | 66 ++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index bd5901338f29..5a6e2258bc39 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -168,13 +168,12 @@ with respect to allocation: bandwidth percentages are directly applied to the threads running on the core -If RDT monitoring is available there will be an "L3_MON" directory +If L3 monitoring is available there will be an "L3_MON" directory with the following files: "num_rmids": - The number of RMIDs available. This is the - upper bound for how many "CTRL_MON" + "MON" - groups can be created. + The number of RMIDs supported by hardware for + L3 monitoring events. "mon_features": Lists the monitoring events if @@ -400,6 +399,24 @@ with the following files: bytes) at which a previously used LLC_occupancy counter can be considered for re-use. +If telemetry monitoring is available there will be a "PERF_PKG_MON" directory +with the following files: + +"num_rmids": + The number of RMIDs for telemetry monitoring events. + + On Intel resctrl will not enable telemetry events if the number of + RMIDs that can be tracked concurrently is lower than the total number + of RMIDs supported. Telemetry events can be force-enabled with the + "rdt=" kernel parameter, but this may reduce the number of + monitoring groups that can be created. 
+ +"mon_features": + Lists the telemetry monitoring events that are enabled on this system. + +The upper bound for how many "CTRL_MON" + "MON" groups can be created +is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. + Finally, in the top level of the "info" directory there is a file named "last_cmd_status". This is reset with every "command" issued via the file system (making new directories or writing to any of the @@ -505,15 +522,40 @@ When control is enabled all CTRL_MON groups will also contain: When monitoring is enabled all MON groups will also contain: "mon_data": - This contains a set of files organized by L3 domain and by - RDT event. E.g. on a system with two L3 domains there will - be subdirectories "mon_L3_00" and "mon_L3_01". Each of these - directories have one file per event (e.g. "llc_occupancy", - "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these - files provide a read out of the current value of the event for - all tasks in the group. In CTRL_MON groups these files provide - the sum for all tasks in the CTRL_MON group and all tasks in + This contains directories for each monitor domain. + + If L3 monitoring is enabled, there will be a "mon_L3_XX" directory for + each instance of an L3 cache. Each directory contains files for the enabled + L3 events (e.g. "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes"). + + If telemetry monitoring is enabled, there will be a "mon_PERF_PKG_YY" + directory for each physical processor package. Each directory contains + files for the enabled telemetry events (e.g. "core_energy", "activity", + "uops_retired", etc.) + + The info/`*`/mon_features files provide the full list of enabled + event/file names. + + "core energy" reports a floating point number for the energy (in Joules) + consumed by cores (registers, arithmetic units, TLB and L1/L2 caches) + during execution of instructions summed across all logical CPUs on a + package for the current monitoring group.
+ + "activity" also reports a floating point value (in Farads). This provides + an estimate of work done independent of the frequency that the CPUs used + for execution. + + Note that "core energy" and "activity" only measure energy/activity in the + "core" of the CPU (arithmetic units, TLB, L1 and L2 caches, etc.). They + do not include L3 cache, memory, I/O devices etc. + + All other events report decimal integer values. + + In a MON group these files provide a read out of the current value of + the event for all tasks in the group. In CTRL_MON groups these files + provide the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. + On systems with Sub-NUMA Cluster (SNC) enabled there are extra directories for each node (located within the "mon_L3_XX" directory for the L3 cache they occupy). These are named "mon_sub_L3_YY" -- Gitee From 030615698a2974491561ee2944420caa8aeaf6e5 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Fri, 13 Feb 2026 15:50:54 +0800 Subject: [PATCH 082/124] arm_mpam: Ensure in_reset_state is false after applying configuration ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-2-ben.horgan@arm.com The per-RIS flag, in_reset_state, indicates whether or not the MSC registers are in reset state, and allows avoiding resetting when they are already in reset state. However, when mpam_apply_config() updates the configuration it doesn't update the in_reset_state flag and so even after the configuration update in_reset_state can be true and mpam_reset_ris() will skip the actual register restoration on subsequent resets. Once resctrl has a MPAM backend it will use resctrl_arch_reset_all_ctrls() to reset the MSC configuration on unmount and, if the in_reset_state flag is bogusly true, fail to reset the MSC configuration. The resulting non-reset MSC configuration can lead to persistent performance restrictions even after resctrl is unmounted. 
Fix by clearing in_reset_state to false immediately after successful configuration application, ensuring that the next reset operation properly restores MSC register defaults. Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Zeng Heng Acked-by: Ben Horgan [Horgan: rewrite commit message to not be specific to resctrl unmount] Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9ff579d01ba6..454ce21f7318 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2697,6 +2697,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, srcu_read_lock_held(&mpam_srcu)) { arg.ris = ris; mpam_touch_msc(msc, __write_config, &arg); + ris->in_reset_state = false; } mutex_unlock(&msc->cfg_lock); } -- Gitee From b411a5df8827a435e6907fb6904aef37baacc63b Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Tue, 3 Mar 2026 14:56:03 +0000 Subject: [PATCH 083/124] arm_mpam: Reset when feature configuration bit unset ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-3-ben.horgan@arm.com To indicate that the configuration, of the controls used by resctrl, in a RIS need resetting to driver defaults the reset flags in mpam_config are set. However, these flags are only ever set temporarily at RIS scope in mpam_reset_ris() and hence mpam_cpu_online() will never reset these controls to default. As the hardware reset is unknown this leads to unknown configuration when the control values haven't been configured away from the defaults. Use the policy that an unset feature configuration bit means reset. In this way the mpam_config in the component can encode that it should be in reset state and mpam_reprogram_msc() will reset controls as needed. 
Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 40 ++++++++++------------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 454ce21f7318..1a7de235e106 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1364,17 +1364,15 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, __mpam_intpart_sel(ris->ris_idx, partid, msc); } - if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); - else + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) { + if (mpam_has_feature(mpam_feat_cpor_part, cfg)) mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); } - if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) { + if (!mpam_has_feature(mpam_feat_mbw_part, cfg)) mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); else mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); @@ -1384,16 +1382,14 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_min, cfg)) mpam_write_partsel_reg(msc, MBW_MIN, 0); - if (mpam_has_feature(mpam_feat_mbw_max, rprops) && - mpam_has_feature(mpam_feat_mbw_max, cfg)) { - if (cfg->reset_mbw_max) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); - else + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + else + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); } - if
(mpam_has_feature(mpam_feat_mbw_prop, rprops) && - mpam_has_feature(mpam_feat_mbw_prop, cfg)) + if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) @@ -1491,16 +1487,6 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) -{ - *reset_cfg = (struct mpam_config) { - .reset_cpbm = true, - .reset_mbw_pbm = true, - .reset_mbw_max = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); -} - /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. @@ -1508,14 +1494,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) static int mpam_reset_ris(void *arg) { u16 partid, partid_max; - struct mpam_config reset_cfg; + struct mpam_config reset_cfg = {}; struct mpam_msc_ris *ris = arg; if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); - spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); -- Gitee From 7c9ecbb801bfdf253b79e273fff08dd9161820f8 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 15 Dec 2025 12:05:33 +0000 Subject: [PATCH 084/124] arm64/sysreg: Add MPAMSM_EL1 register ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-4-ben.horgan@arm.com The MPAMSM_EL1 register determines the MPAM configuration for an SMCU. Add the register definition. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/tools/sysreg | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9e0a5f49db4d..61c1eeccb977 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -3333,6 +3333,14 @@ Field 31:16 PARTID_D Field 15:0 PARTID_I EndSysreg +Sysreg MPAMSM_EL1 3 0 10 5 3 +Res0 63:48 +Field 47:40 PMG_D +Res0 39:32 +Field 31:16 PARTID_D +Res0 15:0 +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS -- Gitee From 6c337519f8c92e9a06aa1348eb0250d974719bd2 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 17 Apr 2025 12:13:16 +0100 Subject: [PATCH 085/124] KVM: arm64: Preserve host MPAM configuration when changing traps ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-5-ben.horgan@arm.com When KVM enables or disables MPAM traps to EL2 it clears all other bits in MPAM2_EL2. Notably, it clears the partition ids (PARTIDs) and performance monitoring groups (PMGs). Avoid changing these bits in anticipation of adding support for MPAM in the kernel. Otherwise, on a VHE system with the host running at EL2 where MPAM2_EL2 and MPAM1_EL1 access the same register, any attempt to use MPAM to monitor or partition resources for kernel space would be foiled by running a KVM guest. Additionally, MPAM2_EL2.EnMPAMSM is always set to 0 which causes MPAMSM_EL1 to always trap. Keep EnMPAMSM set to 1 when not in a guest so that the kernel can use MPAMSM_EL1. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kvm/hyp/include/hyp/switch.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 64b489177b27..7de6a4074442 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -174,7 +174,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -184,18 +185,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); -- Gitee From 7907b0a09fd0765c2418d52ff0306aceb5aa1334 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 15 Dec 2025 13:22:09 +0000 Subject: [PATCH 086/124] KVM: arm64: Make MPAMSM_EL1 accesses UNDEF ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-6-ben.horgan@arm.com 
The MPAMSM_EL1 register controls the MPAM labeling for an SMCU, Streaming Mode Compute Unit. As there is no MPAM support in KVM, make sure MPAMSM_EL1 accesses trigger an UNDEF. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kvm/sys_regs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index fc19351666bb..ee84b5ca01cd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2526,6 +2526,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_MPAMSM_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, -- Gitee From 24ad5c58493cecf8e80c9439a0434588a7836cf1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:24 +0000 Subject: [PATCH 087/124] arm64: mpam: Context switch the MPAM registers ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-7-ben.horgan@arm.com MPAM allows traffic in the SoC to be labeled by the OS, these labels are used to apply policy in caches and bandwidth regulators, and to monitor traffic in the SoC. The label is made up of a PARTID and PMG value. The x86 equivalent calls these CLOSID and RMID, but they don't map precisely. MPAM has two CPU system registers that is used to hold the PARTID and PMG values that traffic generated at each exception level will use. These can be set per-task by the resctrl file system. (resctrl is the defacto interface for controlling this stuff). Add a helper to switch this. 
struct task_struct's separate CLOSID and RMID fields are insufficient to implement resctrl using MPAM, as resctrl can change the PARTID (CLOSID) and PMG (sort of like the RMID) separately. On x86, the rmid is an independent number, so a race that writes a mismatched closid and rmid into hardware is benign. On arm64, the pmg bits extend the partid. (i.e. partid-5 has a pmg-0 that is not the same as partid-6's pmg-0). In this case, mismatching the values will 'dirty' a pmg value that resctrl believes is clean, and is not tracking with its 'limbo' code. To avoid this, the partid and pmg are always read and written as a pair. This requires a new u64 field. In struct task_struct there are two u32, rmid and closid for the x86 case, but as we can't use them here do something else. Add this new field, mpam_partid_pmg, to struct thread_info to avoid adding more architecture specific code to struct task_struct. Always use READ_ONCE()/WRITE_ONCE() when accessing this field. Resctrl allows a per-cpu 'default' value to be set, this overrides the values when scheduling a task in the default control-group, which has PARTID 0. The way 'code data prioritisation' gets emulated means the register value for the default group needs to be a variable. The current system register value is kept in a per-cpu variable to avoid writing to the system register if the value isn't going to change. Writes to this register may reset the hardware state for regulating bandwidth. Finally, there is no reason to context switch these registers unless there is a driver changing the values in struct task_struct. Hide the whole thing behind a static key. This also allows the driver to disable MPAM in response to errors reported by hardware. Move the existing static key to belong to the arch code, as in the future the MPAM driver may become a loadable module. All this should depend on whether there is an MPAM driver, hide it behind CONFIG_ARM64_MPAM. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal CC: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 2 + arch/arm64/include/asm/mpam.h | 67 ++++++++++++++++++++++++++++ arch/arm64/include/asm/thread_info.h | 3 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/mpam.c | 13 ++++++ arch/arm64/kernel/process.c | 7 +++ drivers/resctrl/mpam_devices.c | 2 - drivers/resctrl/mpam_internal.h | 4 +- 8 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/mpam.h create mode 100644 arch/arm64/kernel/mpam.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fadf4017ae15..f971917f9a61 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2118,6 +2118,8 @@ config ARM64_MPAM MPAM is exposed to user-space via the resctrl pseudo filesystem. + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 000000000000..0747e0526927 --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include + +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. 
+ */ +extern u64 arm64_mpam_global_default; + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in mpam_thread_switch(). Ensure only one of the old + * or new values are used. Particular care should be taken with the pmg field as + * mpam_thread_switch() may read a partid and pmg that don't match, causing this + * value to be stored with cache allocations, despite being considered 'free' by + * resctrl. + */ +#ifdef CONFIG_ARM64_MPAM +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!static_branch_likely(&mpam_enabled)) + return; + + if (regval == READ_ONCE(arm64_mpam_global_default)) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#else +static inline void mpam_thread_switch(struct task_struct *tsk) {} +#endif /* CONFIG_ARM64_MPAM */ + +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 872b8e810a37..f36803eb2ebd 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif u32 cpu; }; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index e3bab76856a3..d3cc618ea348 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ 
-72,6 +72,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_SDEI_WATCHDOG) += watchdog_sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 000000000000..9866d2ca0faa --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include + +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a44f0e97f6d7..c3b83023851f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -577,6 +578,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 1a7de235e106..e410fd3b210b 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,8 +29,6 @@ #include "mpam_internal.h" -DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ - /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e8971842b124..4632985bcca6 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -16,12 +16,12 @@ #include #include +#include + #define MPAM_MSC_MAX_NUM_RIS 16 struct platform_device; -DECLARE_STATIC_KEY_FALSE(mpam_enabled); - #ifdef CONFIG_MPAM_KUNIT_TEST #define PACKED_FOR_KUNIT __packed #else -- Gitee From ad4a213a572dd5681c23cc14a7c6a7f1f7b8a160 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:25 +0000 Subject: [PATCH 088/124] arm64: mpam: Re-initialise MPAM regs when CPU comes online ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-8-ben.horgan@arm.com Now that the MPAM system registers are expected to have values that change, reprogram them based on the previous value when a CPU is brought online. Previously MPAM's 'default PARTID' of 0 was always used for MPAM in kernel-space as this is the PARTID that hardware guarantees to reset. Because there are a limited number of PARTID, this value is exposed to user-space, meaning resctrl changes to the resctrl default group would also affect kernel threads. Instead, use the task's PARTID value for kernel work on behalf of user-space too. The default of 0 is kept for both user-space and kernel-space when MPAM is not enabled. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kernel/cpufeature.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5e26020a66e5..a0456fe1b17a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -2437,13 +2438,17 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. - */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool -- Gitee From f2a77ae8dcc4d9139e407366cf8c326e8b61cccb Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 19 Jan 2026 14:40:26 +0000 Subject: [PATCH 089/124] arm64: mpam: Drop the CONFIG_EXPERT restriction ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-9-ben.horgan@arm.com In anticipation of MPAM being useful remove the CONFIG_EXPERT restriction. 
Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 2 +- drivers/resctrl/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f971917f9a61..959743928dec 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2095,7 +2095,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" - select ARM64_MPAM_DRIVER if EXPERT # does nothing yet + select ARM64_MPAM_DRIVER select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c808e0470394..c34e059c6e41 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,6 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" - depends on ARM64 && ARM64_MPAM && EXPERT + depends on ARM64 && ARM64_MPAM help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. -- Gitee From d3999e0912ee188ac9017b421590c4fac1c49727 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:26 +0000 Subject: [PATCH 090/124] arm64: mpam: Advertise the CPUs MPAM limits to the driver ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-10-ben.horgan@arm.com Requestors need to populate the MPAM fields for any traffic they send on the interconnect. For the CPUs these values are taken from the corresponding MPAMy_ELx register. Each requestor may have a limit on the largest PARTID or PMG value that can be used. The MPAM driver has to determine the system-wide minimum supported PARTID and PMG values. To do this, the driver needs to be told what each requestor's limit is. CPUs are special, but this infrastructure is also needed for the SMMU and GIC ITS. 
Call the helper to tell the MPAM driver what the CPUs can do. The return value can be ignored by the arch code as it runs well before the MPAM driver starts probing. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kernel/mpam.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 9866d2ca0faa..e6feff2324ac 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -3,6 +3,7 @@ #include +#include #include #include @@ -11,3 +12,14 @@ DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) -- Gitee From 25cc32de02ce0025581e7ce70e6e5828058737fe Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:27 +0000 Subject: [PATCH 091/124] arm64: mpam: Add cpu_pm notifier to restore MPAM sysregs ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-11-ben.horgan@arm.com The MPAM system registers will be lost if the CPU is reset during PSCI's CPU_SUSPEND. Add a PM notifier to restore them. mpam_thread_switch(current) can't be used as this won't make any changes if the in-memory copy says the register already has the correct value. In reality the system register is UNKNOWN out of reset. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kernel/mpam.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index e6feff2324ac..48ec0ffd5999 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,12 +14,44 @@ DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. 
+ */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + static int __init arm64_mpam_register_cpus(void) { u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); return mpam_register_requestor(partid_max, pmg_max); } /* Must occur before mpam_msc_driver_init() from subsys_initcall() */ -- Gitee From 3c9c34d9549a0b80ff7f0b601a7ac549b1f1c3e6 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 15 Dec 2025 12:14:40 +0000 Subject: [PATCH 092/124] arm64: mpam: Initialise and context switch the MPAMSM_EL1 register ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-12-ben.horgan@arm.com The MPAMSM_EL1 sets the MPAM labels, PMG and PARTID, for loads and stores generated by a shared SMCU. Disable the traps so the kernel can use it and set it to the same configuration as the per-EL cpu MPAM configuration. If an SMCU is not shared with other cpus then it is implementation defined whether the configuration from MPAMSM_EL1 is used or that from the appropriate MPAMy_ELx. As we set the same, PMG_D and PARTID_D, configuration for MPAM0_EL1, MPAM1_EL1 and MPAMSM_EL1 the resulting configuration is the same regardless. The range of valid configurations for the PARTID and PMG in MPAMSM_EL1 is not currently specified in Arm Architectural Reference Manual but the architect has confirmed that it is intended to be the same as that for the cpu configuration in the MPAMy_ELx registers. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/include/asm/el2_setup.h | 3 ++- arch/arm64/include/asm/mpam.h | 2 ++ arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/mpam.c | 4 ++++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index a00e9d695273..a5fdfc57ce56 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -340,7 +340,8 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 .Linit_mpam_\@: - msr_s SYS_MPAM2_EL2, xzr // use the default partition + mov x0, #MPAM2_EL2_EnMPAMSM_MASK + msr_s SYS_MPAM2_EL2, x0 // use the default partition, // and disable lower traps mrs_s x0, SYS_MPAMIDR_EL1 tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 0747e0526927..6bccbfdccb87 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -53,6 +53,8 @@ static inline void mpam_thread_switch(struct task_struct *tsk) return; write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a0456fe1b17a..a10cbbc931a8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2445,6 +2445,8 @@ cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | 
MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 48ec0ffd5999..3a490de4fa12 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -28,6 +28,10 @@ static int mpam_pm_notifier(struct notifier_block *self, */ regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } isb(); write_sysreg_s(regval, SYS_MPAM0_EL1); -- Gitee From 4bdea49f911182c3a35756f5fd495e7901d78048 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:28 +0000 Subject: [PATCH 093/124] arm64: mpam: Add helpers to change a task or cpu's MPAM PARTID/PMG values ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-13-ben.horgan@arm.com Care must be taken when modifying the PARTID and PMG of a task in any per-task structure as writing these values may race with the task being scheduled in, and reading the modified values. Add helpers to set the task properties, and the CPU default value. These use WRITE_ONCE() that pairs with the READ_ONCE() in mpam_get_regval() to avoid causing torn values. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Cc: Dave Martin Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/include/asm/mpam.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 6bccbfdccb87..05aa71200f61 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include @@ -22,6 +23,23 @@ DECLARE_PER_CPU(u64, arm64_mpam_current); */ extern u64 arm64_mpam_global_default; +#ifdef CONFIG_ARM64_MPAM +static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i) +{ + return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) | + FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) | + FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); +} + +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + /* * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, * which may race with reads in mpam_thread_switch(). Ensure only one of the old @@ -30,12 +48,20 @@ extern u64 arm64_mpam_global_default; * value to be stored with cache allocations, despite being considered 'free' by * resctrl. 
*/ -#ifdef CONFIG_ARM64_MPAM static inline u64 mpam_get_regval(struct task_struct *tsk) { return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); } +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +} + static inline void mpam_thread_switch(struct task_struct *tsk) { u64 oldregval; -- Gitee From e82020fb06566cac3246842438d145dcade311f7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:29 +0000 Subject: [PATCH 094/124] KVM: arm64: Force guest EL1 to use user-space's partid configuration ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-14-ben.horgan@arm.com While we trap the guest's attempts to read/write the MPAM control registers, the hardware continues to use them. Guest-EL0 uses KVM's user-space's configuration, as the value is left in the register, and guest-EL1 uses either the host kernel's configuration, or in the case of VHE, the UNKNOWN reset value of MPAM1_EL1. We want to force the guest-EL1 to use KVM's user-space's MPAM configuration. On nVHE rely on MPAM0_EL1 and MPAM1_EL1 always being programmed the same and on VHE copy MPAM0_EL1 into the guest's MPAM1_EL1. There is no need to restore as this is out of context once TGE is set. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index ae763061909a..33bcaa03da89 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -51,6 +51,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); +/* + * The _EL0 value was written by the host's context switch and belongs to the + * VMM. Copy this into the guest's _EL1 register. + */ +static inline void __mpam_guest_load(void) +{ + u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I; + + if (system_supports_mpam()) { + u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN; + + write_sysreg_el1(val, SYS_MPAM1); + } +} + /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * @@ -89,6 +104,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); + __mpam_guest_load(); __sysreg_restore_el1_state(guest_ctxt); vcpu_set_flag(vcpu, SYSREGS_ON_CPU); -- Gitee From 559a7e7d40ae87b33efef9cdfa689e623e07ad26 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:30 +0000 Subject: [PATCH 095/124] arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-15-ben.horgan@arm.com resctrl has its own data structures to describe its resources. 
We can't use these directly as we play tricks with the 'MBA' resource, picking the MPAM controls or monitors that best apply. We may export the same component as both L3 and MBA. Add mpam_resctrl_res[] as the array of class->resctrl mappings we are exporting, and add the cpuhp hooks that allocated and free the resctrl domain structures. Only the mpam control feature are considered here and monitor support will be added later. While we're here, plumb in a few other obvious things. CONFIG_ARM_CPU_RESCTRL is used to allow this code to be built even though it can't yet be linked against resctrl. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/Makefile | 1 + drivers/resctrl/mpam_devices.c | 12 ++ drivers/resctrl/mpam_internal.h | 21 +++ drivers/resctrl/mpam_resctrl.c | 324 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 3 + 5 files changed, 361 insertions(+) create mode 100644 drivers/resctrl/mpam_resctrl.c diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 898199dcf80d..40beaf999582 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e410fd3b210b..1a66a6235f65 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1612,6 +1612,9 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } + if (mpam_is_enabled()) + return mpam_resctrl_online_cpu(cpu); + return 0; } @@ -1655,6 +1658,9 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; + if 
(mpam_is_enabled()) + mpam_resctrl_offline_cpu(cpu); + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -2505,6 +2511,12 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + if (err) { mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 4632985bcca6..28ac501e1ac3 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -337,6 +338,16 @@ struct mpam_msc_ris { struct mpam_garbage garbage; }; +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + struct rdt_ctrl_domain resctrl_ctrl_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -391,6 +402,16 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +#endif /* CONFIG_RESCTRL_FS */ + /* * MPAM MSCs have the following register layout. 
See: * Arm Memory System Resource Partitioning and Monitoring (MPAM) System diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 000000000000..e698b534e3db --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ +static DEFINE_MUTEX(domain_list_lock); + +bool resctrl_arch_alloc_capable(void) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. 
+ */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) +{ + /* TODO: initialise the resctrl resources */ + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + /* TODO: repaint domain ids to match the L3 domain ids */ + /* Otherwise, expose the ID used by the firmware table code. */ + return comp->comp_id; +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. 
+ */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_held(&domain_list_lock); + + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del_rcu(&hdr->list); + synchronize_rcu(); + return true; + } + + return false; +} + +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err; + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + + /* class has no component for this CPU */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (r->alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + return dom; + +free_domain: + kfree(dom); + dom = ERR_PTR(err); + + return dom; +} + 
+static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + return NULL; +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res); + } else { + if (r->alloc_capable) { + struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; + + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + } + if (IS_ERR(dom)) + return PTR_ERR(dom); + } + + resctrl_online_cpu(cpu); + + return 0; +} + +void mpam_resctrl_offline_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + resctrl_offline_cpu(cpu); + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + if (r->alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } else { + ctrl_dom_empty = true; + } + + if (ctrl_dom_empty) + kfree(dom); + } +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; 
+ + cpus_read_lock(); + for_each_mpam_resctrl_control(res, rid) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + res->resctrl_res.rid = rid; + } + + /* TODO: pick MPAM classes to map to resctrl resources */ + + /* Initialise the resctrl structures from the classes */ + for_each_mpam_resctrl_control(res, rid) { + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res); + if (err) { + pr_debug("Failed to initialise rid %u\n", rid); + break; + } + } + cpus_read_unlock(); + + if (err) { + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; + } + + if (!resctrl_arch_alloc_capable()) { + pr_debug("No alloc(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable()); + return -EOPNOTSUPP; + } + + /* TODO: call resctrl_init() */ + + return 0; +} diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7f00c5285a32..2c7d1413a401 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,6 +49,9 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- Gitee From 1fc2bd221e5f8a0502f424a15538d72d8c4a4997 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:31 +0000 Subject: [PATCH 096/124] arm_mpam: resctrl: Pick the caches we will use as resctrl resources ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-16-ben.horgan@arm.com Systems with MPAM support may have a variety of control types at any point of their system layout. We can only expose certain types of control, and only if they exist at particular locations. Start with the well-known caches. These have to be depth 2 or 3 and support MPAM's cache portion bitmap controls, with a number of portions fewer than resctrl's limit. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 91 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e698b534e3db..b41b72200590 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -65,9 +65,95 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* resctrl uses u32 for all bitmap configurations */ + return class->props.cpbm_wd <= 32; +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { - /* TODO: initialise the resctrl resources */ + struct mpam_class *class = res->class; + struct rdt_resource *r = &res->resctrl_res; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + r->cdp_capable = true; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->cdp_capable = true; + } + + /* + * Which bits are shared with other ...things... Unknown + * devices use partid-0 which uses all the bitmap fields. Until + * we have configured the SMMU and GIC not to do this 'all the + * bits' is the correct answer here. 
+ */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->alloc_capable = true; + break; + default: + return -EINVAL; + } return 0; } @@ -292,7 +378,8 @@ int mpam_resctrl_setup(void) res->resctrl_res.rid = rid; } - /* TODO: pick MPAM classes to map to resctrl resources */ + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { -- Gitee From 864bf75b17c764e7c9e9536446db00c9037b817e Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:32 +0000 Subject: [PATCH 097/124] arm_mpam: resctrl: Implement resctrl_arch_reset_all_ctrls() ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-17-ben.horgan@arm.com We already have a helper for resetting an mpam class and component. Hook it up to resctrl_arch_reset_all_ctrls() and the domain offline path. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 2 +- drivers/resctrl/mpam_internal.h | 3 +++ drivers/resctrl/mpam_resctrl.c | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 1a66a6235f65..05b2ea003379 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2556,7 +2556,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp) } } -static void mpam_reset_class_locked(struct mpam_class *class) +void mpam_reset_class_locked(struct mpam_class *class) { struct mpam_component *comp; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 28ac501e1ac3..e2704f678af5 100644 --- a/drivers/resctrl/mpam_internal.h +++ 
b/drivers/resctrl/mpam_internal.h @@ -392,6 +392,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +/* Reset all the RIS in a class under cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); + int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index b41b72200590..66cd6d813733 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,19 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, enum resctrl_res_level rid, struct rdt_domain_hdr *hdr) -- Gitee From f82aea325303deb10f2fb8e5395a8896da4e70fe Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:33 +0000 Subject: [PATCH 098/124] arm_mpam: resctrl: Add resctrl_arch_get_config() ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-18-ben.horgan@arm.com Implement resctrl_arch_get_config() by testing the live configuration for a CPOR bitmap. For any other configuration type return the default. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 66cd6d813733..1d9004179374 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,49 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return resctrl_get_default_ctrl(r); + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + default: + return resctrl_get_default_ctrl(r); + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + return resctrl_get_default_ctrl(r); + + switch (configured_by) { + case mpam_feat_cpor_part: + return cfg->cpbm; + default: + return resctrl_get_default_ctrl(r); + } +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; -- Gitee From d20d9b1f12b998494cdf4c099b2f2c9de508321f Mon Sep 17 00:00:00 2001 From: James Morse 
Date: Fri, 5 Dec 2025 21:58:34 +0000 Subject: [PATCH 099/124] arm_mpam: resctrl: Implement helpers to update configuration ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-19-ben.horgan@arm.com resctrl has two helpers for updating the configuration. resctrl_arch_update_one() updates a single value, and is used by the software-controller to apply feedback to the bandwidth controls, it has to be called on one of the CPUs in the resctrl:domain. resctrl_arch_update_domains() copies multiple staged configurations, it can be called from anywhere. Both helpers should update any changes to the underlying hardware. Implement resctrl_arch_update_domains() to use resctrl_arch_update_one(). Neither need to be called on a specific CPU as the mpam driver will send IPIs as needed. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 1d9004179374..827e777a67a5 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -213,6 +213,76 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + /* + * No need to check the CPU as mpam_apply_config() doesn't care, and + * resctrl_arch_update_domains() relies on this. 
+ */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. + */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + default: + return -EINVAL; + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err; + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { + for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { + struct resctrl_staged_config *cfg = &d->staged_config[t]; + + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return 0; +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; -- Gitee From 3cd878d6d6a8e9e4507e73325cd1fd560b369d2f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:35 +0000 Subject: [PATCH 100/124] arm_mpam: resctrl: Add plumbing against arm64 task and cpu hooks ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-20-ben.horgan@arm.com arm64 provides helpers for changing a task's and a cpu's mpam partid/pmg values. These are used to back a number of resctrl_arch_ functions. Connect them up. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 58 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 63 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 827e777a67a5..8615c653e0c4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,8 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +static bool cdp_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -57,6 +60,61 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. 
+ */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 2c7d1413a401..5a78299ec464 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -52,6 +52,11 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
-- Gitee From 248f6c569ba27d1160ce70b80522a631e6a629ea Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:36 +0000 Subject: [PATCH 101/124] arm_mpam: resctrl: Add CDP emulation ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-21-ben.horgan@arm.com Intel RDT's CDP feature allows the cache to use a different control value depending on whether the accesses was for instruction fetch or a data access. MPAM's equivalent feature is the other way up: the CPU assigns a different partid label to traffic depending on whether it was instruction fetch or a data access, which causes the cache to use a different control value based solely on the partid. MPAM can emulate CDP, with the side effect that the alternative partid is seen by all MSC, it can't be enabled per-MSC. Add the resctrl hooks to turn this on or off. Add the helpers that match a closid against a task, which need to be aware that the value written to hardware is not the same as the one resctrl is using. Update the 'arm64_mpam_global_default' variable the arch code uses during context switch to know when the per-cpu value should be used instead. Also, update these per-cpu values and sync the resulting mpam partid/pmg configuration to hardware. resctrl can enable CDP for L2 caches, L3 caches or both. When it is enabled by one and not the other MPAM globally enabled CDP but hides the effect on the other cache resource. This hiding is possible as CPOR is the only supported cache control and that uses a resource bitmap; two partids with the same bitmap act as one. Awkwardly, the MB controls don't implement CDP and CDP can't be hidden as the memory bandwidth control is a maximum per partid which can't be modelled with more partids. If the total maximum is used for both the data and instruction partids then then the maximum may be exceeded and if it is split in two then the one using more bandwidth will hit a lower limit. 
Hence, hide the MB controls completely if CDP is enabled for any resource. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Cc: Dave Martin Cc: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/include/asm/mpam.h | 1 + drivers/resctrl/mpam_internal.h | 1 + drivers/resctrl/mpam_resctrl.c | 122 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 2 + 4 files changed, 126 insertions(+) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 05aa71200f61..70d396e7b6da 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e2704f678af5..57c3d9b962b9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -346,6 +346,7 @@ struct mpam_resctrl_dom { struct mpam_resctrl_res { struct mpam_class *class; struct rdt_resource resctrl_res; + bool cdp_enabled; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 8615c653e0c4..903d1a0f564f 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -35,6 +35,10 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. 
+ */ static bool cdp_enabled; bool resctrl_arch_alloc_capable(void) @@ -50,6 +54,74 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + return mpam_resctrl_controls[rid].cdp_enabled; +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. + */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + int cpu; + + /* + * resctrl_arch_set_cdp_enabled() is only called with enable set to + * false on error and unmount. + */ + cdp_enabled = enable; + mpam_resctrl_controls[rid].cdp_enabled = enable; + + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. 
*/ + if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + + if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && + mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + + if (enable) { + if (mpam_partid_max < 1) + return -EINVAL; + + partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA); + partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE); + } + + mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0); + WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); + + resctrl_reset_task_closids(); + + for_each_possible_cpu(cpu) + mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); + on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + /* * MSC may raise an error interrupt if it sees an out or range partid/pmg, * and go on to truncate the value. 
Regardless of what the hardware supports, @@ -115,6 +187,30 @@ void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) } } +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) @@ -247,6 +343,14 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. 
+ * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + partid = resctrl_get_config_index(closid, type); cfg = &dom->ctrl_comp->cfg[partid]; @@ -274,6 +378,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + int err; u32 partid; struct mpam_config cfg; struct mpam_props *cprops; @@ -291,6 +396,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + if (mpam_resctrl_hide_cdp(r->rid)) + t = CDP_DATA; + partid = resctrl_get_config_index(closid, t); if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { pr_debug("Not alloc capable or computed PARTID out of range\n"); @@ -313,6 +421,20 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, return -EINVAL; } + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. 
+ */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); } diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 5a78299ec464..d329b1dc148b 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -56,6 +56,8 @@ void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); /** * mpam_register_requestor() - Register a requestor with the MPAM driver -- Gitee From a3bbd8e8c41f23f47898ebef221587ebff111f8a Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 11 Mar 2026 10:49:02 +0000 Subject: [PATCH 102/124] arm_mpam: resctrl: Hide CDP emulation behind CONFIG_EXPERT ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-22-ben.horgan@arm.com When CDP is not enabled, the 'rmid_entry's in the limbo list, rmid_busy_llc, map directly to a (PARTID,PMG) pair and when CDP is enabled the mapping is to two different pairs. As the limbo list is reused between mounts and CDP disabled on unmount this can lead to stale mapping and the limbo handler will then make monitor reads with potentially out of range PARTID. This may then cause an MPAM error interrupt and the driver will disable MPAM. No problems are expected if you just mount the resctrl file system once with CDP enabled and never unmount it. Hide CDP emulation behind CONFIG_EXPERT to protect the unwary. 
Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 903d1a0f564f..cab3e9ccb5c7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -82,6 +82,18 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; int cpu; + if (!IS_ENABLED(CONFIG_EXPERT) && enable) { + /* + * If the resctrl fs is mounted more than once, sequentially, + * then CDP can lead to the use of out of range PARTIDs. + */ + pr_warn("CDP not supported\n"); + return -EOPNOTSUPP; + } + + if (enable) + pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n"); + /* * resctrl_arch_set_cdp_enabled() is only called with enable set to * false on error and unmount. -- Gitee From ad1fd7765108b210a8d2164d780cc24af1fd25c3 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 5 Dec 2025 21:58:38 +0000 Subject: [PATCH 103/124] arm_mpam: resctrl: Convert to/from MPAMs fixed-point formats ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-23-ben.horgan@arm.com MPAM uses a fixed-point formats for some hardware controls. Resctrl provides the bandwidth controls as a percentage. Add helpers to convert between these. Ensure bwa_wd is at most 16 to make it clear higher values have no meaning. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 7 +++++ drivers/resctrl/mpam_resctrl.c | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 05b2ea003379..9964d854d7e7 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -713,6 +713,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_mbw_part, props); props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. + */ + props->bwa_wd = min(props->bwa_wd, 16); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index cab3e9ccb5c7..adaec522c1a1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -242,6 +243,56 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. + * + * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 + * as it reports the width of fields that are at most 16 bits. When + * fewer than 16 bits are valid the least significant bits are + * ignored. The implied binary point is kept between bits 15 and 16 and + * so the valid bits are leftmost. 
+ * + * See ARM IHI0099B.a "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format" for more information. + * + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. + * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). + */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ static void mpam_resctrl_pick_caches(void) { -- Gitee From 885d94e2b9cbfc40bf57bd22a5b3a9a741a66203 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 9 Jan 2026 12:30:09 +0000 Subject: [PATCH 104/124] arm_mpam: resctrl: Add rmid index helpers ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-24-ben.horgan@arm.com Because MPAM's pmg aren't identical to RDT's rmid, resctrl handles some data structures by index. This allows x86 to map indexes to RMID, and MPAM to map them to partid-and-pmg. Add the helpers to do this. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Suggested-by: James Morse Reviewed-by: Jonathan Cameron Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 16 ++++++++++++++++ include/linux/arm_mpam.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index adaec522c1a1..940446395ae1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -145,6 +145,22 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return (mpam_pmg_max + 1) * (mpam_partid_max + 1); +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return closid * (mpam_pmg_max + 1) + rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *closid = idx / (mpam_pmg_max + 1); + *rmid = idx % (mpam_pmg_max + 1); +} + void resctrl_arch_sched_in(struct task_struct *tsk) { lockdep_assert_preemption_disabled(); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index d329b1dc148b..7d23c90f077d 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -58,6 +58,9 @@ void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); /** * mpam_register_requestor() - Register a requestor with the MPAM driver -- Gitee From a9af276895c80fdc026551798be66e309ca72b84 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 8 Jan 2026 16:59:00 +0000 
Subject: [PATCH 105/124] arm_mpam: resctrl: Wait for cacheinfo to be ready ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-25-ben.horgan@arm.com In order to calculate the rmid realloc threshold the size of the cache needs to be known. Cache domains will also be named after the cache id. So that this information can be extracted from cacheinfo we need to wait for it to be ready. The cacheinfo information is populated in device_initcall() so we wait for that. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse [horgan: split out from another patch] Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 940446395ae1..93c8a9608ed4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -42,6 +43,13 @@ static DEFINE_MUTEX(domain_list_lock); */ static bool cdp_enabled; +/* + * We use cacheinfo to discover the size of the caches and their id. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. 
+ */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -757,6 +765,8 @@ int mpam_resctrl_setup(void) struct mpam_resctrl_res *res; enum resctrl_res_level rid; + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); @@ -794,3 +804,12 @@ int mpam_resctrl_setup(void) return 0; } + +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); -- Gitee From dd273549693ca6113044578e492ab3a49189449d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:39 +0000 Subject: [PATCH 106/124] arm_mpam: resctrl: Add support for 'MB' resource ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-26-ben.horgan@arm.com resctrl supports 'MB', as a percentage throttling of traffic from the L3. This is the control that mba_sc uses, so ideally the class chosen should be as close as possible to the counters used for mbm_total. If there is a single L3, it's the last cache, and the topology of the memory matches then the traffic at the memory controller will be equivalent to that at egress of the L3. If these conditions are met allow the memory class to back MB. MB's percentage control should be backed either with the fixed point fraction MBW_MAX or bandwidth portion bitmaps. The bandwidth portion bitmaps is not used as its tricky to pick which bits to use to avoid contention, and may be possible to expose this as something other than a percentage in the future. 
Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse > Signed-off-by: Ben Horgan [backport changes] devel-6.6 does not support automatic cleanup / scope-based resource management. Need to adapt cpumask_var_t __free(free_cpumask_var) for devel-6.6. Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 293 ++++++++++++++++++++++++++++++++- 1 file changed, 292 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 93c8a9608ed4..e5437fdc2ac9 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -267,6 +267,33 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_max(cprops); +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) + return 0; + + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. 
+ */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); +} + /* * Each fixed-point hardware value architecturally represents a range * of values: the full range 0% - 100% is split contiguously into @@ -317,6 +344,172 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return val; } +static u32 get_mba_min(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) { + WARN_ON_ONCE(1); + return 0; + } + + return mbw_max_to_percent(0, cprops); +} + +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); +} + +static bool __free_cpumask_var(cpumask_var_t cpumask, bool ret) +{ + free_cpumask_var(cpumask); + return ret; +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the world's a Xeon, and all counters are on the + * L3. We allow some mapping counters on other classes. This requires + * that the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper which can tell us about offline CPUs ... but getting the cache_id + * to start with relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the victim component list and compare the affinity mask with the + * corresponding L3. 
The topology matches if each victim:component's affinity + * mask is the same as the CPU's corresponding L3's. These lists/masks are + * computed from firmware tables so don't change at runtime. + */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + + lockdep_assert_cpus_held(); + + cpumask_var_t tmp_cpumask; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return __free_cpumask_var(tmp_cpumask, false); + } + + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return __free_cpumask_var(tmp_cpumask, false); + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return __free_cpumask_var(tmp_cpumask, false); + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return __free_cpumask_var(tmp_cpumask, false); + } + } + + return __free_cpumask_var(tmp_cpumask, true); +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. 
+ */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", class->level); + return false; + } + + cpumask_var_t tmp_cpumask; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", + class->level); + return __free_cpumask_var(tmp_cpumask, false); + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return __free_cpumask_var(tmp_cpumask, false); + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return __free_cpumask_var(tmp_cpumask, false); + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return __free_cpumask_var(tmp_cpumask, false); + } + + if (num_possible_nodes() > 1) { + pr_debug("There is more than one numa node\n"); + return __free_cpumask_var(tmp_cpumask, false); + } + +#ifdef CONFIG_HMEM_REPORTING + if (node_devices[cpu_to_node(cpu)]->cache_dev) { + pr_debug("There is a memory side cache\n"); + return __free_cpumask_var(tmp_cpumask, false); + } +#endif + + return __free_cpumask_var(tmp_cpumask, true); +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { @@ -358,9 +551,68 @@ static void mpam_resctrl_pick_caches(void) } } +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { + pr_debug("class %u is a cache but not the L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", + class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", + class->level); + continue; + } + + if (!traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + continue; + } + + /* + * Pick a resource to be MBA that as close as possible to + * the L3. mbm_total counts the bandwidth leaving the L3 + * cache and MBA should correspond as closely as possible + * for proper operation of mba_sc. 
+ */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", + candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; struct rdt_resource *r = &res->resctrl_res; switch (r->rid) { @@ -392,6 +644,19 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->cache.shareable_bits = resctrl_get_default_ctrl(r); r->alloc_capable = true; break; + case RDT_RESOURCE_MBA: + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_min(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + r->alloc_capable = true; + break; default: return -EINVAL; } @@ -406,7 +671,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; - /* TODO: repaint domain ids to match the L3 domain ids */ + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + /* Otherwise, expose the ID used by the firmware table code. 
*/ return comp->comp_id; } @@ -446,6 +721,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = mpam_feat_cpor_part; break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; default: return resctrl_get_default_ctrl(r); } @@ -457,6 +738,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, switch (configured_by) { case mpam_feat_cpor_part: return cfg->cpbm; + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); default: return resctrl_get_default_ctrl(r); } @@ -504,6 +787,13 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; default: return -EINVAL; } @@ -775,6 +1065,7 @@ int mpam_resctrl_setup(void) /* Find some classes to use for controls */ mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { -- Gitee From a63c3289f6a2066dc0afe5be4e71e3546af77ea0 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 5 Dec 2025 21:58:40 +0000 Subject: [PATCH 107/124] arm_mpam: resctrl: Add kunit test for control format conversions ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-27-ben.horgan@arm.com resctrl specifies the format of the control schemes, and these don't match the hardware. Some of the conversions are a bit hairy - add some kunit tests. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: Dave Martin [morse: squashed enough of Dave's fixes in here that it's his patch now!] Signed-off-by: James Morse Signed-off-by: Ben Horgan [ backport changes ] GENMASK_U32 is introduced in 6.7, we have to use GENMASK in 6.6. Adapt test_all_bwa_wd_gen_params for 6.6 Kunit framework. Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 4 + drivers/resctrl/test_mpam_resctrl.c | 314 ++++++++++++++++++++++++++++ 2 files changed, 318 insertions(+) create mode 100644 drivers/resctrl/test_mpam_resctrl.c diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e5437fdc2ac9..878365c548f6 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1104,3 +1104,7 @@ static int __init __cacheinfo_ready(void) return 0; } device_initcall_sync(__cacheinfo_ready); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 000000000000..5ba1db42a7bf --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. 
+/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from the union of ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only) and ARM IHI0099B.a + * "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format": + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value 
conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, based on param (not test->param_value!) + */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_MAX */ + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... 
*/ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision. 
+ * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! */ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(const void *prev, char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) 
+ */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * 
reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); -- Gitee From 63ad4f2e92823a4f633c93aae5be81e9a392615c Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 19 Feb 2026 16:08:13 +0000 Subject: [PATCH 108/124] arm_mpam: resctrl: Add monitor initialisation and domain boilerplate ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-28-ben.horgan@arm.com Add the boilerplate that tells resctrl about the mpam monitors that are available. resctrl expects all (non-telemetry) monitors to be on the L3 and so advertise them there and invent an L3 resctrl resource if required. The L3 cache itself has to exist as the cache ids are used as the domain ids. 
Bring the resctrl monitor domains online and offline based on the cpus they contain. Support for specific monitor types is left to later. Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 15 +++ drivers/resctrl/mpam_resctrl.c | 231 ++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 11 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 57c3d9b962b9..d58428ba2005 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -340,7 +340,16 @@ struct mpam_msc_ris { struct mpam_resctrl_dom { struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; }; struct mpam_resctrl_res { @@ -349,6 +358,12 @@ struct mpam_resctrl_res { bool cdp_enabled; }; +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 878365c548f6..1ec929d71cf9 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -34,6 +34,23 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; rid < RDT_NUM_RESOURCES; \ rid++, res = &mpam_resctrl_controls[rid]) +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. 
+ * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. Restrict the events considered + * to those supported by MPAM. + * Class pointer may be NULL. + */ +#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; + +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); @@ -63,6 +80,15 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_mon_capable(void) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; @@ -89,6 +115,8 @@ static void resctrl_reset_task_closids(void) int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) { u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; int cpu; if (!IS_ENABLED(CONFIG_EXPERT) && enable) { @@ -110,6 +138,11 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) cdp_enabled = enable; mpam_resctrl_controls[rid].cdp_enabled = enable; + if (enable) + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2; + else + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. 
*/ if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; @@ -686,6 +719,56 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1) + return 0; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + /* + * num-rmid is the upper bound for the number of monitoring groups that + * can exist simultaneously, including the default monitoring group for + * each control group. Hence, advertise the whole rmid_idx space even + * though each control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture to correctly + * interpret this value. 
+ */ + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + if (resctrl_enable_mon_event(type, false, 0, NULL)) + l3->mon_capable = true; + + return 0; +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -913,11 +996,26 @@ static void mpam_resctrl_domain_insert(struct list_head *list, list_add_tail_rcu(&new->list, pos); } +static struct mpam_component *find_component(struct mpam_class *class, int cpu) +{ + struct mpam_component *comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp->affinity)) + return comp; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; @@ -957,8 +1055,56 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } else { pr_debug("Skipped control domain online - no controls\n"); } + + if (r->mon_capable) { + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + /* + * Even if the monitor domain is backed by a different + * component, the L3 component IDs need to be used... only + * there may be no ctrl_comp for the L3. + * Search each event's class list for a component with + * overlapping CPUs and set up the dom->mon_comp array. 
+ */ + + for_each_mpam_resctrl_mon(mon, eventid) { + struct mpam_component *mon_comp; + + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[eventid] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } + + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + return dom; +offline_ctrl_domain: + if (r->alloc_capable) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } free_domain: kfree(dom); dom = ERR_PTR(err); @@ -966,6 +1112,35 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) return dom; } +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. 
+ */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + int cache_id; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + lockdep_assert_cpus_held(); + + if (!l3->class) + return NULL; + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id < 0) + return NULL; + + list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (dom->resctrl_mon_dom.hdr.id == cache_id) + return dom; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { @@ -979,7 +1154,11 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return dom; } - return NULL; + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. */ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); } int mpam_resctrl_online_cpu(unsigned int cpu) @@ -1004,6 +1183,11 @@ int mpam_resctrl_online_cpu(unsigned int cpu) mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); } + if (r->mon_capable) { + struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; + + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } } if (IS_ERR(dom)) return PTR_ERR(dom); @@ -1024,8 +1208,9 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - bool ctrl_dom_empty; + bool ctrl_dom_empty, mon_dom_empty; struct rdt_resource *r = &res->resctrl_res; if (!res->class) @@ -1044,7 +1229,16 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) ctrl_dom_empty = true; } - if (ctrl_dom_empty) + if (r->mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } else { + 
mon_dom_empty = true; + } + + if (ctrl_dom_empty && mon_dom_empty) kfree(dom); } } @@ -1054,12 +1248,15 @@ int mpam_resctrl_setup(void) int err = 0; struct mpam_resctrl_res *res; enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = rid; } @@ -1075,25 +1272,37 @@ int mpam_resctrl_setup(void) err = mpam_resctrl_control_init(res); if (err) { pr_debug("Failed to initialise rid %u\n", rid); - break; + goto internal_error; } } - cpus_read_unlock(); - if (err) { - pr_debug("Internal error %d - resctrl not supported\n", err); - return err; + for_each_mpam_resctrl_mon(mon, eventid) { + if (!mon->class) + continue; // dummy resource + + err = mpam_resctrl_monitor_init(mon, eventid); + if (err) { + pr_debug("Failed to initialise event %u\n", eventid); + goto internal_error; + } } - if (!resctrl_arch_alloc_capable()) { - pr_debug("No alloc(%u) found - resctrl not supported\n", - resctrl_arch_alloc_capable()); + cpus_read_unlock(); + + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); return -EOPNOTSUPP; } /* TODO: call resctrl_init() */ return 0; + +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; } static int __init __cacheinfo_ready(void) -- Gitee From e257356e9b826911c67e3bf7bd81c683c695c2d3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:41 +0000 Subject: [PATCH 109/124] arm_mpam: resctrl: Add support for csu counters ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-29-ben.horgan@arm.com resctrl exposes a counter via a file named 
llc_occupancy. This isn't really a counter as its value goes up and down, this is a snapshot of the cache storage usage monitor. Add some picking code which will only find an L3. The resctrl counter file is called llc_occupancy but we don't check it is the last one as it is already identified as L3. Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 83 ++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 1ec929d71cf9..be29ff81e527 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -311,6 +311,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops) return mba_class_use_mbw_max(cprops); } +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. 
@@ -642,6 +664,64 @@ static void mpam_resctrl_pick_mba(void) } } +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } + + if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + /* The name of the resource is L3... */ + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a cache but not the L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", + class->level); + continue; + } + + if (cache_has_usable_csu(class)) { + pr_debug("class %u has usable CSU", + class->level); + + /* CSU counters only make sense on a cache. 
*/ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + break; + default: + break; + } + } + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; @@ -1276,6 +1356,9 @@ int mpam_resctrl_setup(void) } } + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + for_each_mpam_resctrl_mon(mon, eventid) { if (!mon->class) continue; // dummy resource -- Gitee From f5ce68ea47398f9ce42c473dc1648078ef161642 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:47 +0000 Subject: [PATCH 110/124] arm_mpam: resctrl: Allow resctrl to allocate monitors ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-30-ben.horgan@arm.com When resctrl wants to read a domain's 'QOS_L3_OCCUP', it needs to allocate a monitor on the corresponding resource. Monitors are allocated by class instead of component. Add helpers to allocate a CSU monitor. These helper return an out of range value for MBM counters. Allocating a montitor context is expected to block until hardware resources become available. This only makes sense for QOS_L3_OCCUP as unallocated MBM counters are losing data. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan [ backport changes ] Use kmalloc(sizeof(*ret), GFP_KERNEL) instead of kmalloc_obj(*ret). 
Signed-off-by: Wei Chen --- drivers/resctrl/mpam_internal.h | 14 ++++++- drivers/resctrl/mpam_resctrl.c | 67 +++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d58428ba2005..5ebbd6322597 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -29,6 +29,14 @@ struct platform_device; #define PACKED_FOR_KUNIT #endif +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. + */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -216,7 +224,11 @@ enum mon_filter_options { }; struct mon_cfg { - u16 mon; + /* + * mon must be large enough to hold out of range values like + * USE_PRE_ALLOCATED + */ + u32 mon; u8 pmg; bool match_pmg; bool csu_exclude_clean; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index be29ff81e527..b4a61842e054 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -22,6 +22,8 @@ #include "mpam_internal.h" +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + /* * The classes we've picked to map to resctrl resources, wrapped * in with their resctrl structure. 
@@ -289,6 +291,71 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7d23c90f077d..e1461e32af75 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include 
+#include #include struct mpam_msc; @@ -62,6 +63,10 @@ u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); u32 resctrl_arch_system_num_rmid_idx(void); +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. -- Gitee From 21b7aac815c448c47e0ea7a7ee1ccea50be65a1f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:48 +0000 Subject: [PATCH 111/124] arm_mpam: resctrl: Add resctrl_arch_rmid_read() ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-31-ben.horgan@arm.com resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 82 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 87 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index b4a61842e054..ca3dcdf5f052 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -356,6 +356,88 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } +static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + cfg = (struct mon_cfg) { + .mon = mon_idx, + .match_pmg = true, + .partid = closid, + .pmg = rmid, + }; + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 code_val = 0, data_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &code_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &data_val); + if (err) + return err; + + *val += code_val + data_val; + return 0; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, rmid, val); +} + +/* MBWU when not in ABMC mode (not 
supported), and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); + mon_comp = l3_dom->mon_comp[eventid]; + + if (eventid != QOS_L3_OCCUP_EVENT_ID) + return -EINVAL; + + mon_type = mpam_feat_msmon_csu; + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index e1461e32af75..86d5e326d2bd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,11 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
-- Gitee From 9b3941830122646719c9adcc569cf05514d6cc79 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:54 +0000 Subject: [PATCH 112/124] arm_mpam: resctrl: Update the rmid reallocation limit ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-32-ben.horgan@arm.com resctrl's limbo code needs to be told when the data left in a cache is small enough for the partid+pmg value to be re-allocated. x86 uses the cache size divided by the number of rmid users the cache may have. Do the same, but for the smallest cache, and with the number of partid-and-pmg users. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ca3dcdf5f052..31798806079e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -438,6 +438,42 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, closid, rmid, val); } +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. 
+ */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -862,6 +898,9 @@ static void mpam_resctrl_pick_counters(void) /* CSU counters only make sense on a cache. */ switch (class->type) { case MPAM_CLASS_CACHE: + if (update_rmid_limits(class)) + break; + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); break; default: -- Gitee From 2db57993fbae852072656afc4384b2f3fdde775d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:50 +0000 Subject: [PATCH 113/124] arm_mpam: resctrl: Add empty definitions for assorted resctrl functions ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-33-ben.horgan@arm.com A few resctrl features and hooks need to be provided, but aren't needed or supported on MPAM platforms. resctrl has individual hooks to separately enable and disable the closid/partid and rmid/pmg context switching code. For MPAM this is all the same thing, as the value in struct task_struct is used to cache the value that should be written to hardware. arm64's context switching code is enabled once MPAM is usable, but doesn't touch the hardware unless the value has changed. For now event configuration is not supported, and can be turned off by returning 'false' from resctrl_arch_is_evt_configurable(). The new io_alloc feature is not supported either, always return false from the enable helper to indicate and fail the enable. Add this, and empty definitions for the other hooks. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_resctrl.c | 65 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 9 +++++ 2 files changed, 74 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 31798806079e..3e31d02c817a 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -91,6 +91,71 @@ bool resctrl_arch_mon_capable(void) return l3->mon_capable; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return -EINVAL; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +bool 
resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +void resctrl_arch_pre_mount(void) +{ +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 86d5e326d2bd..f92a36187a52 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; -- Gitee From 19cd2a2e63b5fc06ea1bd90653e96357b54a1c2b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:51 +0000 Subject: [PATCH 114/124] arm64: mpam: Select ARCH_HAS_CPU_RESCTRL ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-34-ben.horgan@arm.com Enough MPAM support is present to enable ARCH_HAS_CPU_RESCTRL. Let it rip^Wlink! ARCH_HAS_CPU_RESCTRL indicates resctrl can be enabled. It is enabled by the arch code simply because it has 'arch' in its name. This removes ARM_CPU_RESCTRL as a mimic of X86_CPU_RESCTRL. While here, move the ACPI dependency to the driver's Kconfig file. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/resctrl.h | 2 ++ drivers/resctrl/Kconfig | 7 +++++++ drivers/resctrl/Makefile | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/resctrl.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 959743928dec..219331f22aa3 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2096,7 +2096,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" select ARM64_MPAM_DRIVER - select ACPI_MPAM if ACPI + select ARCH_HAS_CPU_RESCTRL help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 000000000000..b506e95cf6e3 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c34e059c6e41..672abea3b03c 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,7 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. @@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST If unsure, say N. 
endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 40beaf999582..4f6d0e81f9b8 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o -mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG -- Gitee From 828908efcef870efd118af08c4a47697d493edd9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:58:52 +0000 Subject: [PATCH 115/124] arm_mpam: resctrl: Call resctrl_init() on platforms that can support resctrl ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-35-ben.horgan@arm.com Now that MPAM links against resctrl, call resctrl_init() to register the filesystem and setup resctrl's structures. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++--- drivers/resctrl/mpam_internal.h | 4 +++ drivers/resctrl/mpam_resctrl.c | 63 ++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 9964d854d7e7..0189ee4099b6 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -73,6 +73,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +/* + * Whether resctrl has been setup. Used by cpuhp in preference to + * mpam_is_enabled(). 
The disable call after an error interrupt makes + * mpam_is_enabled() false before the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -1619,7 +1627,7 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) return mpam_resctrl_online_cpu(cpu); return 0; @@ -1665,7 +1673,7 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) mpam_resctrl_offline_cpu(cpu); guard(srcu)(&mpam_srcu); @@ -2531,6 +2539,7 @@ static void mpam_enable_once(void) } static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -2590,24 +2599,39 @@ static void mpam_reset_class(struct mpam_class *class) void mpam_disable(struct work_struct *ignored) { int idx; + bool do_resctrl_exit; struct mpam_class *class; struct mpam_msc *msc, *tmp; + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + mutex_lock(&mpam_cpuhp_state_lock); if (mpam_cpuhp_state) { cpuhp_remove_state(mpam_cpuhp_state); mpam_cpuhp_state = 0; } + + /* + * Removing the cpuhp state called mpam_cpu_offline() and told resctrl + * all the CPUs are offline. 
+ */ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; mutex_unlock(&mpam_cpuhp_state_lock); - static_branch_disable(&mpam_enabled); + if (do_resctrl_exit) + mpam_resctrl_exit(); mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, - srcu_read_lock_held(&mpam_srcu)) + srcu_read_lock_held(&mpam_srcu)) { mpam_reset_class(class); + if (do_resctrl_exit) + mpam_resctrl_teardown_class(class); + } srcu_read_unlock(&mpam_srcu, idx); mutex_lock(&mpam_list_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 5ebbd6322597..ce9e0e0483fb 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -435,12 +435,16 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); void mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ /* diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 3e31d02c817a..5ebc56c515a7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -69,6 +69,12 @@ static bool cdp_enabled; static bool cacheinfo_ready; static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. 
+ */ +static bool resctrl_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -360,6 +366,9 @@ static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return -EINVAL; + if (!mon->class) return -EINVAL; @@ -402,6 +411,9 @@ static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return; + if (!mon->class) return; @@ -488,6 +500,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, resctrl_arch_rmid_read_context_check(); + if (!mpam_is_enabled()) + return -EINVAL; + if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; @@ -1174,6 +1189,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + /* * No need to check the CPU as mpam_apply_config() doesn't care, and * resctrl_arch_update_domains() relies on this. 
@@ -1239,6 +1257,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { struct resctrl_staged_config *cfg = &d->staged_config[t]; @@ -1631,7 +1652,11 @@ int mpam_resctrl_setup(void) return -EOPNOTSUPP; } - /* TODO: call resctrl_init() */ + err = resctrl_init(); + if (err) + return err; + + WRITE_ONCE(resctrl_enabled, true); return 0; @@ -1641,6 +1666,42 @@ int mpam_resctrl_setup(void) return err; } +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). + */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + might_sleep(); + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == class) { + res->class = NULL; + break; + } + } + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == class) { + mon->class = NULL; + break; + } + } +} + static int __init __cacheinfo_ready(void) { cacheinfo_ready = true; -- Gitee From b368316fff83ca30fefd7d5302619b7905d9183a Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Dec 2025 21:58:57 +0000 Subject: [PATCH 116/124] arm_mpam: Add quirk framework ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-36-ben.horgan@arm.com The MPAM specification includes the MPAMF_IIDR, which serves to uniquely identify the MSC implementation through a combination of implementer details, product ID, variant, and revision. Certain hardware issues/errata can be resolved using software workarounds. 
Introduce a quirk framework to allow workarounds to be enabled based on the MPAMF_IIDR value. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: Shanker Donthineni Co-developed-by: James Morse Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 25 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0189ee4099b6..bb10359f2473 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -630,6 +630,30 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static const struct mpam_quirk mpam_quirks[] = { + { NULL } /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + int err = 0; + + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + err = quirk->init(msc, quirk); + + if (err) + continue; + + mpam_set_quirk(quirk->workaround, msc); + } +} + /* * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour * of NRDY, software can use this bit for any purpose" - so hardware might not @@ -864,8 +888,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) /* Grab an IDR value to find out how many RIS there are */ mutex_lock(&msc->part_sel_lock); idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); mutex_unlock(&msc->part_sel_lock); + mpam_enable_quirks(msc); + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); /* Use these values so partid/pmg always starts with a valid value */ @@ -1976,6 +2003,7 @@ static bool mpam_has_cmax_wd_feature(struct 
mpam_props *props) * resulting safe value must be compatible with both. When merging values in * the tree, all the aliasing resources must be handled first. * On mismatch, parent is modified. + * Quirks on an MSC will apply to all MSC in that class. */ static void __props_mismatch(struct mpam_props *parent, struct mpam_props *child, bool alias) @@ -2095,6 +2123,7 @@ static void __props_mismatch(struct mpam_props *parent, * nobble the class feature, as we can't configure all the resources. * e.g. The L3 cache is composed of two resources with 13 and 17 portion * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. */ static void __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) @@ -2108,6 +2137,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", (long)cprops->features, (long)vprops->features); + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + /* Take the safe value for any common features */ __props_mismatch(cprops, vprops, false); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index ce9e0e0483fb..e28a168419d4 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -85,6 +85,8 @@ struct mpam_msc { u8 pmg_max; unsigned long ris_idxs; u32 ris_max; + u32 iidr; + u16 quirks; /* * error_irq_lock is taken when registering/unregistering the error @@ -216,6 +218,28 @@ struct mpam_props { #define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features) +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + MPAM_QUIRK_LAST +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk); + + u32 iidr; + u32 
iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, @@ -259,6 +283,7 @@ struct mpam_class { struct mpam_props props; u32 nrdy_usec; + u16 quirks; u8 level; enum mpam_class_types type; -- Gitee From 85cb8285e14b9b50fa24e59745f9c62241858027 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Dec 2025 21:58:58 +0000 Subject: [PATCH 117/124] arm_mpam: Add workaround for T241-MPAM-1 ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-37-ben.horgan@arm.com The MPAM bandwidth partitioning controls will not be correctly configured, and hardware will retain default configuration register values, meaning generally that bandwidth will remain unprovisioned. To address the issue, follow the below steps after updating the MBW_MIN and/or MBW_MAX registers. - Perform 64b reads from all 12 bridge MPAM shadow registers at offsets (0x360048 + slice*0x10000 + partid*8). These registers are read-only. - Continue iterating until all 12 shadow register values match in a loop. pr_warn_once if the values fail to match within the loop count 1000. - Perform 64b writes with the value 0x0 to the two spare registers at offsets 0x1b0000 and 0x1c0000. In the hardware, writes to the MPAMCFG_MBW_MAX MPAMCFG_MBW_MIN registers are transformed into broadcast writes to the 12 shadow registers. The final two writes to the spare registers cause a final rank of downstream micro-architectural MPAM registers to be updated from the shadow copies. The intervening loop to read the 12 shadow registers helps avoid a race condition where writes to the spare registers occur before all shadow registers have been updated. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Signed-off-by: Shanker Donthineni Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 88 +++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 9 +++ 3 files changed, 99 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 7a054c5834ad..48bf8e2877fb 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -240,6 +240,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index bb10359f2473..4c10bd78768f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,6 +29,16 @@ #include "mpam_internal.h" +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; + /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, @@ -630,7 +640,45 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return -EINVAL; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return -EINVAL; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return -EINVAL; + + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); + + return 0; +} + static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = mpam_enable_quirk_nvidia_t241_1, + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, { NULL } /* Sentinel */ }; @@ -1378,6 +1426,44 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value */ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + T241_SHADOW_REG_OFF(sidx, partid)); + if 
(val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1457,6 +1543,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, PRI, pri_val); } + mpam_quirk_post_config_change(ris, partid, cfg); + mutex_unlock(&msc->part_sel_lock); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e28a168419d4..e38954a735d8 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -130,6 +130,9 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + /* Values only used on some platforms for quirks */ + u32 t241_id; + struct mpam_garbage garbage; }; @@ -220,6 +223,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, MPAM_QUIRK_LAST }; @@ -240,6 +244,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) +#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, -- Gitee From 7966c9233ad46fbdbf124914483fab3c31bafa33 Mon Sep 17 
00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Dec 2025 21:58:59 +0000 Subject: [PATCH 118/124] arm_mpam: Add workaround for T241-MPAM-4 ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-38-ben.horgan@arm.com In the T241 implementation of memory-bandwidth partitioning, in the absence of contention for bandwidth, the minimum bandwidth setting can affect the amount of achieved bandwidth. Specifically, the achieved bandwidth in the absence of contention can settle to any value between the values of MPAMCFG_MBW_MIN and MPAMCFG_MBW_MAX. Also, if MPAMCFG_MBW_MIN is set to zero (below 0.78125%), once a core enters a throttled state, it will never leave that state. The first issue is not a concern if the MPAM software allows programming MPAMCFG_MBW_MIN through the sysfs interface. This patch ensures that MBW_MIN=1 (0.78125%) is programmed whenever MPAMCFG_MBW_MIN=0 is requested. In the scenario where the resctrl doesn't support the MBW_MIN interface via sysfs, to achieve bandwidth closer to MBW_MAX in the absence of contention, software should configure a relatively narrow gap between MBW_MIN and MBW_MAX. The recommendation is to use a 5% gap to mitigate the problem. Clear the MBW_MIN feature from the class to ensure we don't accidentally change behaviour when resctrl adds support for an MBW_MIN interface. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Signed-off-by: Shanker Donthineni Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 55 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 48bf8e2877fb..778de2aef711 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -242,6 +242,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 4c10bd78768f..71cb3767d9ce 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -679,6 +679,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_SCRUB_SHADOW_REGS, }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, { NULL } /* Sentinel */ }; @@ -1464,6 +1470,37 @@ static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, mpam_apply_t241_erratum(ris, partid); } +static u16 
mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1504,9 +1541,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); } - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, 0); + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { if (mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -2292,6 +2338,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e38954a735d8..010f4e16a637 100644 --- 
a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -224,6 +224,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, MPAM_QUIRK_LAST }; -- Gitee From 5d2c3e5d3829b8ac66c93e891452849386fe73e2 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 5 Dec 2025 21:59:00 +0000 Subject: [PATCH 119/124] arm_mpam: Add workaround for T241-MPAM-6 ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-39-ben.horgan@arm.com The registers MSMON_MBWU_L and MSMON_MBWU return the number of requests rather than the number of bytes transferred. Bandwidth resource monitoring is performed at the last level cache, where each request arrives in 64-byte granularity. The current implementation returns the number of transactions received at the last level cache but does not provide the value in bytes. Scaling by 64 gives an accurate byte count to match the MPAM specification for the MSMON_MBWU and MSMON_MBWU_L registers. This patch fixes the issue by reporting the actual number of bytes instead of the number of transactions from __ris_msmon_read(). Tested-by: Gavin Shan Tested-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Signed-off-by: Shanker Donthineni Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ drivers/resctrl/mpam_devices.c | 26 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 778de2aef711..0c5b14cfef87 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -244,6 +244,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 71cb3767d9ce..348d02511854 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -685,6 +685,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_FORCE_MBW_MIN_TO_ONE, }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, { NULL } /* Sentinel */ }; @@ -1146,7 +1152,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } -static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +static u64 __mpam_msmon_overflow_val(enum mpam_device_features type) { /* TODO: implement scaling counters */ switch (type) { @@ -1161,6 +1167,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type) } } +static u64 mpam_msmon_overflow_val(enum mpam_device_features type, + struct mpam_msc *msc) +{ + u64 overflow_val = __mpam_msmon_overflow_val(type); + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + type != mpam_feat_msmon_mbwu_63counter) + overflow_val *= 64; + + return overflow_val; +} + static void __ris_msmon_read(void *arg) { u64 now; @@ -1251,13 +1269,17 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___VALUE, now); } + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, 
msc) && + m->type != mpam_feat_msmon_mbwu_63counter) + now *= 64; + if (nrdy) break; mbwu_state = &ris->mbwu_state[ctx->mon]; if (overflow) - mbwu_state->correction += mpam_msmon_overflow_val(m->type); + mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc); /* * Include bandwidth consumed before the last hardware reset and diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 010f4e16a637..1c9e07955fc8 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -225,6 +225,7 @@ struct mpam_props { enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, MPAM_QUIRK_LAST }; -- Gitee From 44e23f374910201788ff21851e9057a24954213b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Dec 2025 21:59:01 +0000 Subject: [PATCH 120/124] arm_mpam: Quirk CMN-650's CSU NRDY behaviour ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-40-ben.horgan@arm.com CMN-650 is afflicted with an erratum where the CSU NRDY bit never clears. This tells us the monitor never finishes scanning the cache. The erratum document says to wait the maximum time, then ignore the field. Add a flag to indicate whether this is the final attempt to read the counter, and when this quirk is applied, ignore the NRDY field. This means accesses to this counter will always retry, even if the counter was previously programmed to the same values. The counter value is not expected to be stable, it drifts up and down with each allocation and eviction. The CSU register provides the value for a point in time. 
Reviewed-by: Zeng Heng Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- Documentation/arch/arm64/silicon-errata.rst | 3 +++ drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 0c5b14cfef87..d3b9900518c1 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -207,6 +207,9 @@ stable kernels. | ARM | GIC-700 | #2941627 | ARM64_ERRATUM_2941627 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 348d02511854..ee55d46dafac 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -691,6 +691,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_MBW_COUNTER_SCALE_64, }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = MPAM_IIDR_ARM_CMN_650, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, { NULL } /* Sentinel */ }; @@ -1003,6 +1009,7 @@ struct mon_read { enum mpam_device_features type; u64 *val; int err; + bool waited_timeout; }; static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) @@ -1249,6 +1256,10 @@ static void __ris_msmon_read(void *arg) if 
(mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + break; case mpam_feat_msmon_mbwu_31counter: case mpam_feat_msmon_mbwu_44counter: @@ -1386,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, .ctx = ctx, .type = type, .val = val, + .waited_timeout = true, }; *val = 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1c9e07955fc8..dbb99d9b0795 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -226,6 +226,7 @@ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, MPAM_QUIRK_LAST }; @@ -251,6 +252,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) +#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, -- Gitee From a2255a357f1fece05d221110ff3e6742c32e3023 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Mon, 2 Feb 2026 15:41:41 +0000 Subject: [PATCH 121/124] arm64: mpam: Add initial MPAM documentation ANBZ: #31060 cherry-picked from https://lore.kernel.org/r/20260313144617.3420416-41-ben.horgan@arm.com MPAM (Memory Partitioning and Monitoring) is now exposed to user-space via resctrl. Add some documentation so the user knows what features to expect. 
Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Signed-off-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: Wei Chen --- Documentation/arch/arm64/index.rst | 2 + Documentation/arch/arm64/mpam.rst | 72 ++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 Documentation/arch/arm64/mpam.rst diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index d08e924204bf..79fe180a3dad 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -19,6 +19,8 @@ ARM64 Architecture legacy_instructions memory memory-tagging-extension + mops + mpam perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst new file mode 100644 index 000000000000..570f51a8d4eb --- /dev/null +++ b/Documentation/arch/arm64/mpam.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +MPAM +==== + +What is MPAM +============ +MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory +system components such as the caches or memory controllers that allow memory +traffic to be labelled, partitioned and monitored. + +Traffic is labelled by the CPU, based on the control or monitor group the +current task is assigned to using resctrl. Partitioning policy can be set +using the schemata file in resctrl, and monitor values read via resctrl. +See Documentation/filesystems/resctrl.rst for more details. + +This allows tasks that share memory system resources, such as caches, to be +isolated from each other according to the partitioning policy (so called noisy +neighbours). + +Supported Platforms +=================== +Use of this feature requires CPU support, support in the memory system +components, and a description from firmware of where the MPAM device controls +are in the MMIO address space. (e.g. the 'MPAM' ACPI table). 
+ +The MMIO device that provides MPAM controls/monitors for a memory system +component is called a memory system component (MSC). + +Because the user interface to MPAM is via resctrl, only MPAM features that are +compatible with resctrl can be exposed to user-space. + +MSC are considered as a group based on the topology. MSC that correspond with +the L3 cache are considered together; it is not possible to mix MSC between L2 +and L3 to 'cover' a resctrl schema. + +The supported features are: + +* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose + CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this + level that also supports the feature. Mismatched big/little platforms are + not supported as resctrl's controls would then also depend on task + placement. + +* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache. + resctrl uses the L3 cache-id to identify where the memory bandwidth + control is applied. For this reason the platform must have an L3 cache + with cache-id's supplied by firmware. (It doesn't need to support MPAM.) + + To be exported as the 'MB' schema, the topology of the group of MSC chosen + must match the topology of the L3 cache so that the cache-id's can be + repainted. For example: Platforms with Memory bandwidth maximum controls + on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these + nodes do not have a corresponding L3 cache. If the memory bandwidth + control is on the memory rather than the L3 then there must be a single + global L3 as otherwise it is unknown which L3 the traffic came from. There + must be no caches between the L3 and the memory so that the two ends of + the path have equivalent traffic. + + When the MPAM driver finds multiple groups of MSC it can use for the 'MB' + schema, it prefers the group closest to the L3 cache. 
+ +* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided + there is at least one CSU monitor on each MSC that makes up the L3 group. + Exposing CSU counters from other caches or devices is not supported. + +Reporting Bugs +============== +If you are not seeing the counters or controls you expect please share the +debug messages produced when enabling dynamic debug and booting with: +dyndbg="file mpam_resctrl.c +pl" -- Gitee From 6dbc2f3613a2ee604f2885d299f125cfda7153c3 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Wed, 11 Mar 2026 22:04:39 +0800 Subject: [PATCH 122/124] anolis: KVM: arm64: Adapt mpam to allow KVM to be built as a module ANBZ: #31060 We need to export mpam_enabled for KVM to support module mode. Signed-off-by: Wei Chen --- arch/arm64/kernel/mpam.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 3a490de4fa12..3fc8ae90cd8c 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -9,6 +9,7 @@ #include DEFINE_STATIC_KEY_FALSE(mpam_enabled); +EXPORT_SYMBOL_FOR_KVM(mpam_enabled); DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); -- Gitee From 6baa63a9fbca18f18fe3d558401ca6a36f0470c9 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Tue, 7 Apr 2026 21:16:26 +0800 Subject: [PATCH 123/124] anolis: configs: arm64: Enable ARM64_MPAM ANBZ: #31060 Add MPAM configs to anolis configs. 
Signed-off-by: Wei Chen --- anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM | 1 + anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG | 1 + .../L2-OPTIONAL/{x86 => default}/CONFIG_ARCH_HAS_CPU_RESCTRL | 0 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_ARCH_HAS_CPU_RESCTRL | 1 + 10 files changed, 9 insertions(+) create mode 100644 anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM create mode 100644 anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG rename anolis/configs/L2-OPTIONAL/{x86 => default}/CONFIG_ARCH_HAS_CPU_RESCTRL (100%) create mode 100644 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_ARCH_HAS_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_ARCH_HAS_CPU_RESCTRL diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM b/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM new file mode 100644 index 000000000000..45957b7b4ea2 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/arm64/CONFIG_ARM64_MPAM @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM=y diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ 
b/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM new file mode 100644 index 000000000000..e93cbd36cedc --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ACPI_MPAM @@ -0,0 +1 @@ +CONFIG_ACPI_MPAM=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER new file mode 100644 index 000000000000..9e4b32224138 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM_DRIVER=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG new file mode 100644 index 000000000000..76eca7c2ff09 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_DRIVER_DEBUG @@ -0,0 +1 @@ +# CONFIG_ARM64_MPAM_DRIVER_DEBUG is not set diff --git a/anolis/configs/L2-OPTIONAL/x86/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/default/CONFIG_ARCH_HAS_CPU_RESCTRL similarity index 100% rename from anolis/configs/L2-OPTIONAL/x86/CONFIG_ARCH_HAS_CPU_RESCTRL rename to anolis/configs/L2-OPTIONAL/default/CONFIG_ARCH_HAS_CPU_RESCTRL diff --git a/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set diff --git 
a/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_ARCH_HAS_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_ARCH_HAS_CPU_RESCTRL new file mode 100644 index 000000000000..dd3c6353e127 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_ARCH_HAS_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_ARCH_HAS_CPU_RESCTRL is not set -- Gitee From 8d32f3334ae3888e5b25d14ecdf6b2441a8fca03 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Tue, 7 Apr 2026 21:26:00 +0800 Subject: [PATCH 124/124] anolis: configs: arm64: Enable RESCTRL_FS for MPAM ANBZ: #31060 Add MPAM configs to anolis configs: Move CONFIG_RESCTRL_FS from x86 to default. Move CONFIG_PROC_CPU_RESCTRL from x86 to default. 
Signed-off-by: Wei Chen --- anolis/configs/L0-MANDATORY/{x86 => default}/CONFIG_RESCTRL_FS | 0 .../configs/L0-MANDATORY/{arm64 => loongarch}/CONFIG_RESCTRL_FS | 0 anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS | 1 + anolis/configs/L0-MANDATORY/sw_64-6b/CONFIG_RESCTRL_FS | 1 + anolis/configs/L0-MANDATORY/sw_64-8a/CONFIG_RESCTRL_FS | 1 + anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS | 1 + .../L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID | 1 + .../configs/L2-OPTIONAL/{x86 => default}/CONFIG_PROC_CPU_RESCTRL | 0 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_PROC_CPU_RESCTRL | 1 + anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_PROC_CPU_RESCTRL | 1 + 12 files changed, 9 insertions(+) rename anolis/configs/L0-MANDATORY/{x86 => default}/CONFIG_RESCTRL_FS (100%) rename anolis/configs/L0-MANDATORY/{arm64 => loongarch}/CONFIG_RESCTRL_FS (100%) create mode 100644 anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L0-MANDATORY/sw_64-6b/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L0-MANDATORY/sw_64-8a/CONFIG_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS create mode 100644 anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID rename anolis/configs/L2-OPTIONAL/{x86 => default}/CONFIG_PROC_CPU_RESCTRL (100%) create mode 100644 anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_PROC_CPU_RESCTRL create mode 100644 anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_PROC_CPU_RESCTRL diff --git a/anolis/configs/L0-MANDATORY/x86/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/default/CONFIG_RESCTRL_FS similarity index 100% rename from anolis/configs/L0-MANDATORY/x86/CONFIG_RESCTRL_FS rename to 
anolis/configs/L0-MANDATORY/default/CONFIG_RESCTRL_FS diff --git a/anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/loongarch/CONFIG_RESCTRL_FS similarity index 100% rename from anolis/configs/L0-MANDATORY/arm64/CONFIG_RESCTRL_FS rename to anolis/configs/L0-MANDATORY/loongarch/CONFIG_RESCTRL_FS diff --git a/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/riscv/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L0-MANDATORY/sw_64-6b/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/sw_64-6b/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/sw_64-6b/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L0-MANDATORY/sw_64-8a/CONFIG_RESCTRL_FS b/anolis/configs/L0-MANDATORY/sw_64-8a/CONFIG_RESCTRL_FS new file mode 100644 index 000000000000..2147c5f9ea80 --- /dev/null +++ b/anolis/configs/L0-MANDATORY/sw_64-8a/CONFIG_RESCTRL_FS @@ -0,0 +1 @@ +# CONFIG_RESCTRL_FS is not set diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS new file mode 100644 index 000000000000..c91ce4ffbafa --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_ARM64_MPAM_RESCTRL_FS @@ -0,0 +1 @@ +CONFIG_ARM64_MPAM_RESCTRL_FS=y diff --git a/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID new file mode 100644 index 000000000000..8cddb03cb135 --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/arm64/CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID @@ -0,0 +1 @@ +CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID=y diff --git a/anolis/configs/L2-OPTIONAL/x86/CONFIG_PROC_CPU_RESCTRL 
b/anolis/configs/L2-OPTIONAL/default/CONFIG_PROC_CPU_RESCTRL similarity index 100% rename from anolis/configs/L2-OPTIONAL/x86/CONFIG_PROC_CPU_RESCTRL rename to anolis/configs/L2-OPTIONAL/default/CONFIG_PROC_CPU_RESCTRL diff --git a/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/loongarch/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/riscv/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/sw_64-6b/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set diff --git a/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_PROC_CPU_RESCTRL b/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_PROC_CPU_RESCTRL new file mode 100644 index 000000000000..b4dd102b0a4e --- /dev/null +++ b/anolis/configs/L2-OPTIONAL/sw_64-8a/CONFIG_PROC_CPU_RESCTRL @@ -0,0 +1 @@ +# CONFIG_PROC_CPU_RESCTRL is not set -- Gitee