From ea0b4c6f072e906979893dfd3a1b2115dcf8fcb8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Nov 2019 10:45:07 +0100 Subject: [PATCH 001/257] KVM: x86: create mmu/ subdirectory commit c50d8ae3a1274f32c9033bbb0e1c5b3115da2112 upstream Preparatory work for shattering mmu.c into multiple files. Besides making it easier to follow, this will also make it possible to write unit tests for various parts. Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/kvm/Makefile | 4 ++-- arch/x86/kvm/{ => mmu}/mmu.c | 0 arch/x86/kvm/{ => mmu}/page_track.c | 0 arch/x86/kvm/{ => mmu}/paging_tmpl.h | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename arch/x86/kvm/{ => mmu}/mmu.c (100%) rename arch/x86/kvm/{ => mmu}/page_track.c (100%) rename arch/x86/kvm/{ => mmu}/paging_tmpl.h (100%) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..b19ef421084d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o +kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c similarity index 100% rename from arch/x86/kvm/mmu.c rename to arch/x86/kvm/mmu/mmu.c diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c similarity index 100% rename from arch/x86/kvm/page_track.c rename to arch/x86/kvm/mmu/page_track.c diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h similarity index 100% rename from arch/x86/kvm/paging_tmpl.h rename to arch/x86/kvm/mmu/paging_tmpl.h -- Gitee From 05b663b81108b80d4ad127dcd96cdc59fcc0fea4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:57:01 -0800 Subject: [PATCH 002/257] KVM: SVM: Refactor logging of NPT enabled/disabled commit 213e0e1f500b5f035d4c8441025a74478818a64f upstream Tweak SVM's logging of NPT enabled/disabled to handle the logging in a single pr_info() in preparation for merging kvm_enable_tdp() and kvm_disable_tdp() into a single function. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/kvm/svm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4e2502855b26..229a76551fdb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1423,16 +1423,14 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - if (npt_enabled && !npt) { - printk(KERN_INFO "kvm: Nested Paging disabled\n"); + if (npt_enabled && !npt) npt_enabled = false; - } - if (npt_enabled) { - printk(KERN_INFO "kvm: Nested Paging enabled\n"); + if (npt_enabled) kvm_enable_tdp(); - } else + else kvm_disable_tdp(); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ?
"en" : "dis"); if (nrips) { if (!boot_cpu_has(X86_FEATURE_NRIPS)) -- Gitee From dbda533aa53f742b7cfe888aa9cc352ec5a590bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:57:02 -0800 Subject: [PATCH 003/257] KVM: x86/mmu: Merge kvm_{enable,disable}_tdp() into a common function commit bde7723559586d6afd18fa1717fc143531d4c77d upstream Combine kvm_enable_tdp() and kvm_disable_tdp() into a single function, kvm_configure_mmu(), in preparation for doing additional configuration during hardware setup. And because having separate helpers is silly. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/mmu/mmu.c | 13 +++---------- arch/x86/kvm/svm.c | 5 +---- arch/x86/kvm/vmx/vmx.c | 4 +--- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e3c13b874919..0f188c3cfd31 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1456,8 +1456,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); -void kvm_enable_tdp(void); -void kvm_disable_tdp(void); +void kvm_configure_mmu(bool enable_tdp); static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 83de01b0b534..fcdcb22f1902 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5694,18 +5694,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_enable_tdp(void) +void kvm_configure_mmu(bool enable_tdp) { - tdp_enabled = true; + tdp_enabled = enable_tdp; } -EXPORT_SYMBOL_GPL(kvm_enable_tdp); - -void kvm_disable_tdp(void) -{ - tdp_enabled = false; -} -EXPORT_SYMBOL_GPL(kvm_disable_tdp); - +EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 229a76551fdb..9c51a44e0652 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1426,10 +1426,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - if (npt_enabled) - kvm_enable_tdp(); - else - kvm_disable_tdp(); + kvm_configure_mmu(npt_enabled); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); if (nrips) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef3a4cf65cfe..ddd1cd63f139 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5418,7 +5418,6 @@ static void vmx_enable_tdp(void) VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); - kvm_enable_tdp(); } /* @@ -7800,8 +7799,7 @@ static __init int hardware_setup(void) if (enable_ept) vmx_enable_tdp(); - else - kvm_disable_tdp(); + kvm_configure_mmu(enable_ept); /* * Only enable PML when hardware supports PML feature, and both EPT -- Gitee From 7978f84033fedb041f3a7ec921bb913059acb7b2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Mar 2020 10:41:51 +0100 Subject: [PATCH 004/257] kVM SVM: Move SVM related files to own sub-directory commit 46a010dd6896045484c7749b3d30b685c649eb18 upstream Move svm.c and pmu_amd.c into their own arch/x86/kvm/svm/ subdirectory. Signed-off-by: Joerg Roedel Message-Id: <20200324094154.32352-2-joro@8bytes.org> Signed-off-by: Paolo Bonzini Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/{pmu_amd.c => svm/pmu.c} | 0 arch/x86/kvm/{ => svm}/svm.c | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename arch/x86/kvm/{pmu_amd.c => svm/pmu.c} (100%) rename arch/x86/kvm/{ => svm}/svm.c (100%) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b19ef421084d..c855aeb9ad7e 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,7 +13,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o -kvm-amd-y += svm.o pmu_amd.o +kvm-amd-y += svm/svm.o svm/pmu.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/svm/pmu.c similarity index 100% rename from arch/x86/kvm/pmu_amd.c rename to arch/x86/kvm/svm/pmu.c diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm/svm.c similarity index 100% rename from arch/x86/kvm/svm.c rename to arch/x86/kvm/svm/svm.c -- Gitee From 726432a3205ec25c2d1ea5c9d0716b7d64e68460 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:20 -0700 Subject: [PATCH 005/257] KVM: x86: Dynamically calculate TDP level from max level and MAXPHYADDR commit d468d94b7bafa7a2dd9bc72e5f7868469be3f7c4 upstream Calculate the desired TDP level on the fly using the max TDP level and MAXPHYADDR instead of doing the same when CPUID is updated. This avoids the hidden dependency on cpuid_maxphyaddr() in vmx_get_tdp_level() and also standardizes the "use 5-level paging iff MAXPHYADDR > 48" behavior across x86. [Backport Changes] 1. In files arch/x86/include/asm/kvm_host.h and arch/x86/kvm/cupid.c, removal related to 'tdp_level' was skipped, This is because, the commit e93fd3b3e89e9 that added 'tdp_level', has not been applied to the current source tree. Doing so avoids unnecessary backporting of intermediate refactors. 2. In file arch/x86/kvm/mmu/mmu.c, intermediate changes starting with commit afaf0b2f9b801, that transitioned kvm_x86_ops from a pointer to an instance of the struct, followed by changes in e93fd3b3e89e9, were skipped. Instead, the call to kvm_x86_ops->get_tdp_level(vcpu) was directly replaced with kvm_mmu_get_tdp_level(vcpu) to reflect the final upstream state. This approach avoids unnecessary backporting of intermediate refactors. 3. 
In arch/x86/kvm/svm/svm.c, the function get_npt_level() was renamed to get_max_npt_level() in the current upstream commit d468d94b7bafa7. Accordingly, all existing get_npt_level() calls were updated to use the renamed get_max_npt_level() function. 4. In arch/x86/kvm/vmx/vmx.c, the function get_ept_level() was directly replaced with vmx_get_max_tdp_level() in the current commit. The function was intermediately renamed to vmx_get_ept_level() in commit 0047fcade4cb1. However, this intermediate step was skipped to avoid backporting additional unnecessary changes. As part of this change, all pre-existing get_ept_level() calls were replaced with the new vmx_get_max_tdp_level() function call. 5. In file arch/x86/kvm/x86.c, upstream changes originally made to kvm_arch_vcpu_create() were applied to kvm_arch_vcpu_init() instead, as commit 95a0d01eef7a1 (which relocates vCPU initialization) is absent in the current source tree. Upstream commit d468d94b7bafa7 makes a call to get_max_tdp_level() via an instance of the struct kvm_x86_ops. However, the change of kvm_x86_ops from a pointer to a struct instance was introduced as part of commit afaf0b2f9b801, which is absent in the current source tree. Hence, a call to get_max_tdp_level() is made via a pointer to kvm_x86_ops. This does not change the functional behavior of the code. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 15 ++++++++++++--- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/vmx/vmx.c | 11 ++++------- arch/x86/kvm/x86.c | 1 + 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0f188c3cfd31..c2795bb0f44a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -650,6 +650,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; + int max_tdp_level; /* emulate context */ @@ -1101,7 +1102,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_tdp_level)(struct kvm_vcpu *vcpu); + int (*get_max_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fcdcb22f1902..3bab9869a452 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5043,13 +5043,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary.
*/ + if (vcpu->arch.max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return vcpu->arch.max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_x86_ops->get_tdp_level(vcpu); + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -5070,7 +5079,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; - context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -5795,7 +5804,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. */ - if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9c51a44e0652..d847701bd1ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -739,7 +739,7 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } -static int get_npt_level(struct kvm_vcpu *vcpu) +static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; @@ -3068,7 +3068,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); + vcpu->arch.mmu->shadow_root_level = get_max_npt_level(); reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -7369,7 +7369,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_tdp_level = get_npt_level, + .get_max_tdp_level = get_max_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ddd1cd63f139..153f8418eecc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3049,12 +3049,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static int get_ept_level(struct kvm_vcpu *vcpu) +static int vmx_get_max_tdp_level(void) { - /* Nested EPT currently only supports 4-level walks. */ - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return 4; - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + if (cpu_has_vmx_ept_5levels()) return 5; return 4; } @@ -3063,7 +3060,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) { u64 eptp = VMX_EPTP_MT_WB; - eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + eptp |= (vmx_get_max_tdp_level() == 5) ?
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) @@ -7963,7 +7960,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, + .get_max_tdp_level = vmx_get_max_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2b1d8081121..5b8ac4ff9db3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9553,6 +9553,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.max_tdp_level = kvm_x86_ops->get_max_tdp_level(); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; -- Gitee From d3c60ab519edf1179e2c532b23f909948955a3ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jul 2020 20:41:22 -0700 Subject: [PATCH 006/257] KVM: x86: Specify max TDP level via kvm_configure_mmu() commit 83013059bdc5486dcbcd33b5c4abf8431766e06c upstream Capture the max TDP level during kvm_configure_mmu() instead of using a kvm_x86_ops hook to do it at every vCPU creation. [Backport Changes] 1. In files arch/x86/kvm/vmx/vmx.c and arch/x86/kvm/svm/svm.c, the function declaration of kvm_configure_mmu() is directly updated to the latest upstream version from commit 703c335d06934. Similarly, in file arch/x86/kvm/mmu/mmu.c, the function definition is directly updated to the latest upstream version from commit 1d92d2e8e7069 to avoid introducing unnecessary dependencies. 2. In file arch/x86/kvm/mmu/mmu.c, the variable max_huge_page_level is added from commit 1d92d2e8e7069, as it is required by the updated kvm_configure_mmu(). 3. In file arch/x86/kvm/vmx/vmx.c, within the function hardware_setup(), the conditional logic that sets the variable ept_lpage_level based on enable_ept and CPU capabilities was backported from commit 703c335d06934. This variable is then passed to the kvm_configure_mmu() function as part of its updated parameters. 4. In file arch/x86/kvm/x86.c, the conversion of kvm_x86_ops from a pointer to an instance of the struct (commit afaf0b2f9b801) has not been applied in the current source code. Therefore, in the function kvm_arch_vcpu_init(), the pointer-based call kvm_x86_ops->get_max_tdp_level was removed instead of kvm_x86_ops.get_max_tdp_level.
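As a reading aid, the huge-page-level fallback that the updated kvm_configure_mmu() applies can be sketched as below. This helper does not exist in the tree (the name pick_max_huge_page_level is illustrative); it only mirrors the logic visible in the mmu.c hunk that follows, where host_has_gbpages stands in for boot_cpu_has(X86_FEATURE_GBPAGES):

	/* Sketch (hypothetical helper): max_huge_page_level selection. */
	static int pick_max_huge_page_level(bool enable_tdp, int tdp_huge_page_level,
					    bool host_has_gbpages)
	{
		if (enable_tdp)
			return tdp_huge_page_level;	/* vendor-reported TDP limit */
		if (host_has_gbpages)
			return PG_LEVEL_1G;		/* shadow paging with 1GB pages */
		return PG_LEVEL_2M;			/* shadow paging fallback */
	}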
Signed-off-by: Sean Christopherson Message-Id: <20200716034122.5998-10-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_host.h | 5 ++--- arch/x86/kvm/mmu/mmu.c | 25 ++++++++++++++++++++++--- arch/x86/kvm/svm/svm.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 13 ++++++++++--- arch/x86/kvm/x86.c | 1 - 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2795bb0f44a..d508ee91bcf2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1102,7 +1102,6 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_max_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); @@ -1457,8 +1456,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); -void kvm_configure_mmu(bool enable_tdp); - static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { @@ -1471,6 +1468,8 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3bab9869a452..501f7048dda5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -86,6 +86,9 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); */ bool tdp_enabled = false; +static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; + enum { AUDIT_PRE_PAGE_FAULT, AUDIT_POST_PAGE_FAULT, @@ -5046,10 +5049,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* Use 5-level TDP if and only if it's useful/necessary. */ - if (vcpu->arch.max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; - return vcpu->arch.max_tdp_level; + return max_tdp_level; } static union kvm_mmu_role @@ -5703,9 +5706,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; + + /* + * max_huge_page_level reflects KVM's MMU capabilities irrespective + * of kernel support, e.g. KVM may be capable of using 1GB pages when + * the kernel is not. But, KVM never creates a page size greater than + * what is used by the kernel for any given HVA, i.e. the kernel's + * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
+ */ + if (tdp_enabled) + max_huge_page_level = tdp_huge_page_level; + else if (boot_cpu_has(X86_FEATURE_GBPAGES)) + max_huge_page_level = PG_LEVEL_1G; + else + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d847701bd1ac..1834af166629 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1426,7 +1426,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled); + kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { @@ -7369,7 +7369,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_max_tdp_level = get_max_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 153f8418eecc..cdd88e0477ee 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7704,7 +7704,7 @@ static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i; + int r, i, ept_lpage_level; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7796,7 +7796,15 @@ static __init int hardware_setup(void) if (enable_ept) vmx_enable_tdp(); - kvm_configure_mmu(enable_ept); + if (!enable_ept) + ept_lpage_level = 0; + else if (cpu_has_vmx_ept_1g_page()) + ept_lpage_level = PG_LEVEL_1G; + else if (cpu_has_vmx_ept_2m_page()) + ept_lpage_level = PG_LEVEL_2M; + else + ept_lpage_level = PG_LEVEL_4K; + kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT -- Gitee From 870550fcab7abef562fd6a9855e6417f117293c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Mar 2021 18:16:37 -0800 Subject: [PATCH 007/257] KVM: SVM: Connect 'npt' module param to KVM's internal 'npt_enabled' commit 99840a75454b66d69d2a450ab04e6438d75eba48 upstream Directly connect the 'npt' param to the 'npt_enabled' variable so that runtime adjustments to npt_enabled are reflected in sysfs. Move the !PAE restriction to a runtime check to ensure NPT is forced off if the host is using 2-level paging, and add a comment explicitly stating why NPT requires a 64-bit kernel or a kernel with PAE enabled. Opportunistically switch the param to octal permissions. [Backport Changes] 1. In file arch/x86/kvm/svm/svm.c, the static declaration of variable 'npt_enabled' has been removed instead of the non-static declaration of 'npt_enabled'. This is because commit 883b0a91f41ab, which converts 'npt_enabled' from a static to a non-static declaration, has not been applied to the current source tree.
Doing so avoids unnecessary backporting of intermediate refactors. Signed-off-by: Sean Christopherson Message-Id: <20210305021637.3768573-1-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- arch/x86/kvm/svm/svm.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1834af166629..ea31666833f6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -296,13 +296,6 @@ static const struct svm_direct_access_msrs { { .index = MSR_INVALID, .always = false }, }; -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static bool npt_enabled = true; -#else -static bool npt_enabled; -#endif - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * pause_filter_count: On processors that support Pause filtering(indicated @@ -351,9 +344,12 @@ module_param(pause_filter_count_shrink, ushort, 0444); static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; module_param(pause_filter_count_max, ushort, 0444); -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); +/* + * Use nested page tables by default. Note, NPT may get forced off by + * svm_hardware_setup() if it's unsupported by hardware or the host kernel. + */ +bool npt_enabled = true; +module_param_named(npt, npt_enabled, bool, 0444); /* allow nested virtualization in KVM/SVM */ static int nested = true; @@ -1420,10 +1416,15 @@ static __init int svm_hardware_setup(void) goto err; } - if (!boot_cpu_has(X86_FEATURE_NPT)) + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) npt_enabled = false; - if (npt_enabled && !npt) + if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); -- Gitee From 6d6420f035e2bed8d472de01f610e2755b57f384 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 18 Aug 2021 11:55:47 -0500 Subject: [PATCH 008/257] KVM: x86: Allow CPU to force vendor-specific TDP level commit 746700d21fd52399c97aeb7791584bbf5426983c upstream AMD future CPUs will require a 5-level NPT if host CR4.LA57 is set. To prevent kvm_mmu_get_tdp_level() from incorrectly changing NPT level on behalf of CPUs, add a new parameter in kvm_configure_mmu() to force a fixed TDP level.
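For illustration only, the resulting level selection in kvm_mmu_get_tdp_level() reduces to the sketch below; the helper name example_tdp_level is hypothetical, and the logic mirrors the mmu.c hunk that follows:

	/* Sketch: a forced root level wins; otherwise 5-level TDP only if useful. */
	static int example_tdp_level(int tdp_root_level, int max_tdp_level,
				     int maxphyaddr)
	{
		if (tdp_root_level)
			return tdp_root_level;	/* architecture-forced level */
		if (max_tdp_level == 5 && maxphyaddr <= 48)
			return 4;		/* a 5-level walk buys nothing here */
		return max_tdp_level;
	}

With this change, SVM passes get_max_npt_level() as both the forced and the maximum level, while VMX passes 0 so that no level is forced.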
Signed-off-by: Wei Huang Message-Id: <20210818165549.3771014-2-wei.huang2@amd.com> Signed-off-by: Paolo Bonzini Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_host.h | 5 ++--- arch/x86/kvm/mmu/mmu.c | 10 ++++++++-- arch/x86/kvm/svm/svm.c | 4 +++- arch/x86/kvm/vmx/vmx.c | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d508ee91bcf2..09cfc9c36c2d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -650,7 +650,6 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; - int max_tdp_level; /* emulate context */ @@ -1468,8 +1467,8 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 501f7048dda5..b5be61336695 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -87,6 +87,7 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -5048,6 +5049,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5706,10 +5711,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ea31666833f6..179a2301c4ce 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1427,7 +1427,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ?
"en" : "dis"); if (nrips) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cdd88e0477ee..189f6598ea63 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7804,7 +7804,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT -- Gitee From 482371b41eed80f6b60642a47c481e13cedd4c91 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 6 Mar 2020 16:19:54 +0300 Subject: [PATCH 009/257] i2c: designware: Detect the FIFO size in the common code commit 1f1a714658307a1a5ec65b0a23d87a87da64c86f upstream The problem with detecting the FIFO depth in the platform driver is that in order to implement this we have to access the controller IC_COMP_PARAM_1 register. Currently it's done before the i2c_dw_set_reg_access() method execution, which is errors prone since the method determines the registers endianness and access mode and we can't use dw_readl/dw_writel accessors before this information is retrieved. We also can't move the i2c_dw_set_reg_access() function invocation to after the master/slave probe functions call (when endianness and access mode are determined), since the FIFO depth information is used by them for initializations. So in order to fix the problem we have no choice but to move the FIFO size detection methods to the common code and call it at the probe stage. Signed-off-by: Serge Semin Signed-off-by: Alexey Malahov Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/i2c/busses/i2c-designware-common.c | 22 +++++++++++++++++++ drivers/i2c/busses/i2c-designware-core.h | 1 + drivers/i2c/busses/i2c-designware-master.c | 2 ++ drivers/i2c/busses/i2c-designware-platdrv.c | 24 --------------------- drivers/i2c/busses/i2c-designware-slave.c | 2 ++ 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index 8e275c8c63e6..998d14e96c58 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -351,6 +351,28 @@ int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev) return -EIO; } +void i2c_dw_set_fifo_size(struct dw_i2c_dev *dev) +{ + u32 param, tx_fifo_depth, rx_fifo_depth; + + /* + * Try to detect the FIFO depth if not set by interface driver, + * the depth could be from 2 to 256 from HW spec. 
+ */ + param = dw_readl(dev, DW_IC_COMP_PARAM_1); + tx_fifo_depth = ((param >> 16) & 0xff) + 1; + rx_fifo_depth = ((param >> 8) & 0xff) + 1; + if (!dev->tx_fifo_depth) { + dev->tx_fifo_depth = tx_fifo_depth; + dev->rx_fifo_depth = rx_fifo_depth; + } else if (tx_fifo_depth >= 2) { + dev->tx_fifo_depth = min_t(u32, dev->tx_fifo_depth, + tx_fifo_depth); + dev->rx_fifo_depth = min_t(u32, dev->rx_fifo_depth, + rx_fifo_depth); + } +} + u32 i2c_dw_func(struct i2c_adapter *adap) { struct dw_i2c_dev *dev = i2c_get_adapdata(adap); diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 8a9e492716a4..bc16a51c0588 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -300,6 +300,7 @@ int i2c_dw_acquire_lock(struct dw_i2c_dev *dev); void i2c_dw_release_lock(struct dw_i2c_dev *dev); int i2c_dw_wait_bus_not_busy(struct dw_i2c_dev *dev); int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev); +void i2c_dw_set_fifo_size(struct dw_i2c_dev *dev); u32 i2c_dw_func(struct i2c_adapter *adap); void i2c_dw_disable(struct dw_i2c_dev *dev); void i2c_dw_disable_int(struct dw_i2c_dev *dev); diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 05bf8c9b332c..be265b575855 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -713,6 +713,8 @@ int i2c_dw_probe(struct dw_i2c_dev *dev) if (ret) return ret; + i2c_dw_set_fifo_size(dev); + ret = dev->init(dev); if (ret) return ret; diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 871c714093a4..7d2359db2d2a 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -220,28 +220,6 @@ static void i2c_dw_configure_slave(struct dw_i2c_dev *dev) dev->mode = DW_IC_SLAVE; } -static void dw_i2c_set_fifo_size(struct dw_i2c_dev *dev) -{ - u32 param, tx_fifo_depth, rx_fifo_depth; - - /* - * Try to detect the FIFO depth if not set by interface driver, - * the depth could be from 2 to 256 from HW spec.
- */ - param = i2c_dw_read_comp_param(dev); - tx_fifo_depth = ((param >> 16) & 0xff) + 1; - rx_fifo_depth = ((param >> 8) & 0xff) + 1; - if (!dev->tx_fifo_depth) { - dev->tx_fifo_depth = tx_fifo_depth; - dev->rx_fifo_depth = rx_fifo_depth; - } else if (tx_fifo_depth >= 2) { - dev->tx_fifo_depth = min_t(u32, dev->tx_fifo_depth, - tx_fifo_depth); - dev->rx_fifo_depth = min_t(u32, dev->rx_fifo_depth, - rx_fifo_depth); - } -} - static void dw_i2c_plat_pm_cleanup(struct dw_i2c_dev *dev) { pm_runtime_disable(dev->dev); @@ -372,8 +350,6 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) div_u64(clk_khz * t->sda_hold_ns + 500000, 1000000); } - dw_i2c_set_fifo_size(dev); - adap = &dev->adapter; adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index f5f001738df5..0fc3aa31d46a 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -260,6 +260,8 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev) if (ret) return ret; + i2c_dw_set_fifo_size(dev); + ret = dev->init(dev); if (ret) return ret; -- Gitee From 02fee04a24bc1993fe855a0d063c4eb2bae41a9b Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 6 Mar 2020 16:19:56 +0300 Subject: [PATCH 010/257] i2c: designware: Discard i2c_dw_read_comp_param() function commit d816f216c36445d1f9180b32ac30a3094317d6bb upstream There is no code left in the kernel which would be using the function. So just remove it. Signed-off-by: Serge Semin Signed-off-by: Alexey Malahov Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/i2c/busses/i2c-designware-common.c | 6 ------ drivers/i2c/busses/i2c-designware-core.h | 1 - 2 files changed, 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index 998d14e96c58..7872756e544f 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -395,11 +395,5 @@ void i2c_dw_disable_int(struct dw_i2c_dev *dev) dw_writel(dev, 0, DW_IC_INTR_MASK); } -u32 i2c_dw_read_comp_param(struct dw_i2c_dev *dev) -{ - return dw_readl(dev, DW_IC_COMP_PARAM_1); -} -EXPORT_SYMBOL_GPL(i2c_dw_read_comp_param); - MODULE_DESCRIPTION("Synopsys DesignWare I2C bus adapter core") MODULE_LICENSE("GPL"); diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index bc16a51c0588..31d611a24b83 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -317,7 +317,6 @@ static inline void __i2c_dw_disable_nowait(struct dw_i2c_dev *dev) void __i2c_dw_disable(struct dw_i2c_dev *dev); -extern u32 i2c_dw_read_comp_param(struct dw_i2c_dev *dev); extern int i2c_dw_probe(struct dw_i2c_dev *dev); #if IS_ENABLED(CONFIG_I2C_DESIGNWARE_SLAVE) extern int i2c_dw_probe_slave(struct dw_i2c_dev *dev); -- Gitee From 64f8da02af2920ee8a6c7774841710e945d29148 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 28 May 2020 12:33:18 +0300 Subject: [PATCH 011/257] i2c: designware: Convert driver to using regmap API commit 0daede80f870025141cbb16e6a826d5e7f43f4a5 upstream Seeing that the DW I2C driver is using flags-based accessors with two conditional clauses, it would be better to replace them with the regmap API IO methods and to initialize the regmap object with read/write callbacks specific to the controller registers map implementation.
This will also be handy for drivers with a non-standard registers mapping (like the DW I2C block embedded into the Baikal-T1 System Controller, whose glue-driver is a part of this series). As before, the driver tries to detect the mapping setup at probe stage and creates a regmap object accordingly, which will be used by the rest of the code to correctly access the controller registers. In two places it was appropriate to convert the hand-written read-modify-write and read-poll-loop design patterns to the corresponding regmap API ready-to-use methods. Note the regmap IO methods' return value is checked only at the probe stage. The rest of the code won't do this because basically we have an MMIO-based regmap, so none of the read/write methods can fail (this also won't be needed for the Baikal-T1-specific I2C controller). [Backport Changes] 1. In drivers/i2c/busses/i2c-designware-common.c: the upstream commit 0daede80f87002 uses writel_relaxed/readl_relaxed and writew_relaxed/readw_relaxed calls in dw_reg_read(), dw_reg_write(), dw_reg_read_swab(), dw_reg_write_swab(), dw_reg_read_word() and dw_reg_write_word(). However, a more recent upstream commit f726eaa787e9f has changed these calls from writel_relaxed() to writel(), readl_relaxed() to readl(), writew_relaxed() to writew() and readw_relaxed() to readw(). A part of this change has already been made to the OpenCloud distro via commit 31540607045b59. Therefore, to align with OpenCloud's changes, the writel/readl and writew/readw calls are kept in all of the aforementioned functions. 2. In drivers/i2c/busses/i2c-designware-common.c, within the __i2c_dw_disable() function, a non-upstream if condition, if (dw_readl(dev, DW_IC_STATUS) & DW_IC_STATUS_MASTER_ACTIVITY), introduced in OpenCloud commit 31540607045b59, was updated to use regmap_read and regmap_write instead of dw_readl and dw_writel, which aligns with upstream changes. 3. In file drivers/i2c/busses/i2c-designware-master.c, within the i2c_dw_recv_len() function, non-upstream function calls (dw_readl(dev, DW_IC_INTR_MASK) and dw_writel(dev, intr_mask, DW_IC_INTR_MASK)), introduced in OpenCloud commit 31540607045b59, were updated to use regmap_read and regmap_write, respectively, thereby aligning with upstream changes.
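For reference, the probe-time regmap creation introduced here boils down to the following sketch; the helper name example_init_regmap is hypothetical, the callback and field names come from the hunks below, and error handling is trimmed:

	/* Sketch: MMIO-backed regmap with controller-specific accessors. */
	static int example_init_regmap(struct dw_i2c_dev *dev)
	{
		struct regmap_config cfg = {
			.reg_bits   = 32,
			.val_bits   = 32,
			.reg_stride = 4,
			.reg_read   = dw_reg_read,	/* plain readl() wrapper */
			.reg_write  = dw_reg_write,	/* plain writel() wrapper */
		};

		/* Swab/16-bit variants are picked after probing DW_IC_COMP_TYPE. */
		dev->map = devm_regmap_init(dev->dev, NULL, dev, &cfg);
		return PTR_ERR_OR_ZERO(dev->map);
	}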
Suggested-by: Andy Shevchenko Signed-off-by: Serge Semin Tested-by: Jarkko Nikula Acked-by: Jarkko Nikula Reviewed-by: Andy Shevchenko [wsa: fix type of 'rx_valid' and remove outdated kdoc var description] Signed-off-by: Wolfram Sang Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-designware-common.c | 182 +++++++++++++++------ drivers/i2c/busses/i2c-designware-core.h | 22 ++- drivers/i2c/busses/i2c-designware-master.c | 131 ++++++++------- drivers/i2c/busses/i2c-designware-slave.c | 77 +++++---- 5 files changed, 253 insertions(+), 160 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 99f368d354f5..6a2250078189 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -523,6 +523,7 @@ config I2C_DAVINCI config I2C_DESIGNWARE_CORE tristate + select REGMAP config I2C_DESIGNWARE_PLATFORM tristate "Synopsys DesignWare Platform" diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index 7872756e544f..eaa0fadc9f11 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "i2c-designware-core.h" @@ -53,66 +54,122 @@ static char *abort_sources[] = { "incorrect slave-transmitter mode configuration", }; -u32 dw_readl(struct dw_i2c_dev *dev, int offset) +static int dw_reg_read(void *context, unsigned int reg, unsigned int *val) { - u32 value; + struct dw_i2c_dev *dev = context; - if (dev->flags & ACCESS_16BIT) - value = readw(dev->base + offset) | - (readw(dev->base + offset + 2) << 16); - else - value = readl(dev->base + offset); + *val = readl(dev->base + reg); - if (dev->flags & ACCESS_SWAP) - return swab32(value); - else - return value; + return 0; } -void dw_writel(struct dw_i2c_dev *dev, u32 b, int offset) +static int dw_reg_write(void *context, unsigned int reg, unsigned int val) { - if (dev->flags & ACCESS_SWAP) - b = swab32(b); - - if (dev->flags & ACCESS_16BIT) { - writew((u16)b, dev->base + offset); - writew((u16)(b >> 16), dev->base + offset + 2); - } else { - writel(b, dev->base + offset); - } + struct dw_i2c_dev *dev = context; + + writel(val, dev->base + reg); + + return 0; +} + +static int dw_reg_read_swab(void *context, unsigned int reg, unsigned int *val) +{ + struct dw_i2c_dev *dev = context; + + *val = swab32(readl(dev->base + reg)); + + return 0; +} + +static int dw_reg_write_swab(void *context, unsigned int reg, unsigned int val) +{ + struct dw_i2c_dev *dev = context; + + writel(swab32(val), dev->base + reg); + + return 0; +} + +static int dw_reg_read_word(void *context, unsigned int reg, unsigned int *val) +{ + struct dw_i2c_dev *dev = context; + + *val = readw(dev->base + reg) | + (readw(dev->base + reg + 2) << 16); + + return 0; +} + +static int dw_reg_write_word(void *context, unsigned int reg, unsigned int val) +{ + struct dw_i2c_dev *dev = context; + + writew(val, dev->base + reg); + writew(val >> 16, dev->base + reg + 2); + + return 0; } /** - * i2c_dw_set_reg_access() - Set register access flags + * i2c_dw_init_regmap() - Initialize registers map * @dev: device private data * - * Autodetects needed register access mode and sets access flags accordingly. - * This must be called before doing any other register access. + * Autodetects needed register access mode and creates the regmap with + * corresponding read/write callbacks.
This must be called before doing any + * other register access. */ -int i2c_dw_set_reg_access(struct dw_i2c_dev *dev) +int i2c_dw_init_regmap(struct dw_i2c_dev *dev) { + struct regmap_config map_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .disable_locking = true, + .reg_read = dw_reg_read, + .reg_write = dw_reg_write, + .max_register = DW_IC_COMP_TYPE, + }; u32 reg; int ret; + /* + * Skip detecting the registers map configuration if the regmap has + * already been provided by a higher code. + */ + if (dev->map) + return 0; + ret = i2c_dw_acquire_lock(dev); if (ret) return ret; - reg = dw_readl(dev, DW_IC_COMP_TYPE); + reg = readl(dev->base + DW_IC_COMP_TYPE); i2c_dw_release_lock(dev); if (reg == swab32(DW_IC_COMP_TYPE_VALUE)) { - /* Configure register endianess access */ - dev->flags |= ACCESS_SWAP; + map_cfg.reg_read = dw_reg_read_swab; + map_cfg.reg_write = dw_reg_write_swab; } else if (reg == (DW_IC_COMP_TYPE_VALUE & 0x0000ffff)) { - /* Configure register access mode 16bit */ - dev->flags |= ACCESS_16BIT; + map_cfg.reg_read = dw_reg_read_word; + map_cfg.reg_write = dw_reg_write_word; } else if (reg != DW_IC_COMP_TYPE_VALUE) { dev_err(dev->dev, "Unknown Synopsys component type: 0x%08x\n", reg); return -ENODEV; } + /* + * Note we'll check the return value of the regmap IO accessors only + * at the probe stage. The rest of the code won't do this because + * basically we have MMIO-based regmap so non of the read/write methods + * can fail. + */ + dev->map = devm_regmap_init(dev->dev, NULL, dev, &map_cfg); + if (IS_ERR(dev->map)) { + dev_err(dev->dev, "Failed to init the registers map\n"); + return PTR_ERR(dev->map); + } + return 0; } @@ -181,11 +238,17 @@ int i2c_dw_set_sda_hold(struct dw_i2c_dev *dev) return ret; /* Configure SDA Hold Time if required */ - reg = dw_readl(dev, DW_IC_COMP_VERSION); + ret = regmap_read(dev->map, DW_IC_COMP_VERSION, &reg); + if (ret) + goto err_release_lock; + if (reg >= DW_IC_SDA_HOLD_MIN_VERS) { if (!dev->sda_hold_time) { /* Keep previous hold time setting if no one set it */ - dev->sda_hold_time = dw_readl(dev, DW_IC_SDA_HOLD); + ret = regmap_read(dev->map, DW_IC_SDA_HOLD, + &dev->sda_hold_time); + if (ret) + goto err_release_lock; } /* @@ -209,22 +272,25 @@ int i2c_dw_set_sda_hold(struct dw_i2c_dev *dev) dev->sda_hold_time = 0; } +err_release_lock: i2c_dw_release_lock(dev); - return 0; + return ret; } void __i2c_dw_disable(struct dw_i2c_dev *dev) { int timeout = 100; + u32 status; /* * Workaround: When a slave goes offline and the master tries to send * it data, the bus gets stuck. Issuing abort seems to work. */ - if (dw_readl(dev, DW_IC_STATUS) & DW_IC_STATUS_MASTER_ACTIVITY) { + regmap_read(dev->map, DW_IC_STATUS, &status); + if (status & DW_IC_STATUS_MASTER_ACTIVITY) { /* Issue abort. */ - dw_writel(dev, DW_IC_ENABLE_ENABLE | DW_IC_ENABLE_ABORT, DW_IC_ENABLE); + regmap_write(dev->map, DW_IC_ENABLE, DW_IC_ENABLE_ENABLE | DW_IC_ENABLE_ABORT); usleep_range(50000, 100000); } @@ -234,7 +300,8 @@ void __i2c_dw_disable(struct dw_i2c_dev *dev) * The enable status register may be unimplemented, but * in that case this test reads zero and exits the loop.
*/ - if ((dw_readl(dev, DW_IC_ENABLE_STATUS) & 1) == 0) + regmap_read(dev->map, DW_IC_ENABLE_STATUS, &status); + if ((status & 1) == 0) return; /* @@ -310,22 +377,23 @@ void i2c_dw_release_lock(struct dw_i2c_dev *dev) */ int i2c_dw_wait_bus_not_busy(struct dw_i2c_dev *dev) { - int timeout = TIMEOUT; + u32 status; + int ret; - while (dw_readl(dev, DW_IC_STATUS) & DW_IC_STATUS_ACTIVITY) { - if (timeout <= 0) { - dev_warn(dev->dev, "timeout waiting for bus ready\n"); - i2c_recover_bus(&dev->adapter); + ret = regmap_read_poll_timeout(dev->map, DW_IC_STATUS, status, + !(status & DW_IC_STATUS_ACTIVITY), + 1100, 20000); + if (ret) { + dev_warn(dev->dev, "timeout waiting for bus ready\n"); - if (dw_readl(dev, DW_IC_STATUS) & DW_IC_STATUS_ACTIVITY) - return -ETIMEDOUT; - return 0; - } - timeout--; - usleep_range(1000, 1100); + i2c_recover_bus(&dev->adapter); + + regmap_read(dev->map, DW_IC_STATUS, &status); + if (!(status & DW_IC_STATUS_ACTIVITY)) + ret = 0; } - return 0; + return ret; } int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev) @@ -351,15 +419,19 @@ int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev) return -EIO; } -void i2c_dw_set_fifo_size(struct dw_i2c_dev *dev) +int i2c_dw_set_fifo_size(struct dw_i2c_dev *dev) { u32 param, tx_fifo_depth, rx_fifo_depth; + int ret; /* * Try to detect the FIFO depth if not set by interface driver, * the depth could be from 2 to 256 from HW spec. */ - param = dw_readl(dev, DW_IC_COMP_PARAM_1); + ret = regmap_read(dev->map, DW_IC_COMP_PARAM_1, &param); + if (ret) + return ret; + tx_fifo_depth = ((param >> 16) & 0xff) + 1; rx_fifo_depth = ((param >> 8) & 0xff) + 1; if (!dev->tx_fifo_depth) { @@ -371,6 +443,8 @@ void i2c_dw_set_fifo_size(struct dw_i2c_dev *dev) dev->rx_fifo_depth = min_t(u32, dev->rx_fifo_depth, rx_fifo_depth); } + + return 0; } u32 i2c_dw_func(struct i2c_adapter *adap) @@ -382,17 +456,19 @@ u32 i2c_dw_func(struct i2c_adapter *adap) void i2c_dw_disable(struct dw_i2c_dev *dev) { + u32 dummy; + /* Disable controller */ __i2c_dw_disable(dev); /* Disable all interupts */ - dw_writel(dev, 0, DW_IC_INTR_MASK); - dw_readl(dev, DW_IC_CLR_INTR); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); + regmap_read(dev->map, DW_IC_CLR_INTR, &dummy); } void i2c_dw_disable_int(struct dw_i2c_dev *dev) { - dw_writel(dev, 0, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); } MODULE_DESCRIPTION("Synopsys DesignWare I2C bus adapter core"); diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 31d611a24b83..57060001ce19 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -10,6 +10,7 @@ */ #include +#include #define DW_IC_DEFAULT_FUNCTIONALITY (I2C_FUNC_I2C | \ I2C_FUNC_SMBUS_BYTE | \ @@ -123,8 +124,6 @@ #define STATUS_WRITE_IN_PROGRESS 0x1 #define STATUS_READ_IN_PROGRESS 0x2 -#define TIMEOUT 20 /* ms */ - /* * operation modes */ @@ -177,7 +176,9 @@ /** * struct dw_i2c_dev - private i2c-designware data * @dev: driver model device node + * @map: IO registers map * @base: IO registers pointer + * @ext: Extended IO registers pointer * @cmd_complete: tx completion indicator * @clk: input reference clock * @pclk: clock required to access the registers @@ -227,6 +228,7 @@ */ struct dw_i2c_dev { struct device *dev; + struct regmap *map; void __iomem *base; void __iomem *ext; struct completion cmd_complete; @@ -279,18 +281,14 @@ struct dw_i2c_dev { bool suspended; }; -#define ACCESS_SWAP 0x00000001 -#define ACCESS_16BIT 0x00000002 -#define ACCESS_INTR_MASK 0x00000004
-#define ACCESS_NO_IRQ_SUSPEND 0x00000008 +#define ACCESS_INTR_MASK 0x00000001 +#define ACCESS_NO_IRQ_SUSPEND 0x00000002 #define MODEL_CHERRYTRAIL 0x00000100 #define MODEL_MSCC_OCELOT 0x00000200 #define MODEL_MASK 0x00000f00 -u32 dw_readl(struct dw_i2c_dev *dev, int offset); -void dw_writel(struct dw_i2c_dev *dev, u32 b, int offset); -int i2c_dw_set_reg_access(struct dw_i2c_dev *dev); +int i2c_dw_init_regmap(struct dw_i2c_dev *dev); u32 i2c_dw_scl_hcnt(u32 ic_clk, u32 tSYMBOL, u32 tf, int cond, int offset); u32 i2c_dw_scl_lcnt(u32 ic_clk, u32 tLOW, u32 tf, int offset); int i2c_dw_set_sda_hold(struct dw_i2c_dev *dev); @@ -300,19 +298,19 @@ int i2c_dw_acquire_lock(struct dw_i2c_dev *dev); void i2c_dw_release_lock(struct dw_i2c_dev *dev); int i2c_dw_wait_bus_not_busy(struct dw_i2c_dev *dev); int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev); -void i2c_dw_set_fifo_size(struct dw_i2c_dev *dev); +int i2c_dw_set_fifo_size(struct dw_i2c_dev *dev); u32 i2c_dw_func(struct i2c_adapter *adap); void i2c_dw_disable(struct dw_i2c_dev *dev); void i2c_dw_disable_int(struct dw_i2c_dev *dev); static inline void __i2c_dw_enable(struct dw_i2c_dev *dev) { - dw_writel(dev, 1, DW_IC_ENABLE); + regmap_write(dev->map, DW_IC_ENABLE, 1); } static inline void __i2c_dw_disable_nowait(struct dw_i2c_dev *dev) { - dw_writel(dev, 0, DW_IC_ENABLE); + regmap_write(dev->map, DW_IC_ENABLE, 0); } void __i2c_dw_disable(struct dw_i2c_dev *dev); diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index be265b575855..af92405003d4 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "i2c-designware-core.h" @@ -25,11 +26,11 @@ static void i2c_dw_configure_fifo_master(struct dw_i2c_dev *dev) { /* Configure Tx/Rx FIFO threshold levels */ - dw_writel(dev, dev->tx_fifo_depth / 2, DW_IC_TX_TL); - dw_writel(dev, 0, DW_IC_RX_TL); + regmap_write(dev->map, DW_IC_TX_TL, dev->tx_fifo_depth / 2); + regmap_write(dev->map, DW_IC_RX_TL, 0); /* Configure the I2C master */ - dw_writel(dev, dev->master_cfg, DW_IC_CON); + regmap_write(dev->map, DW_IC_CON, dev->master_cfg); } static int i2c_dw_set_timings_master(struct dw_i2c_dev *dev) @@ -44,8 +45,11 @@ static int i2c_dw_set_timings_master(struct dw_i2c_dev *dev) ret = i2c_dw_acquire_lock(dev); if (ret) return ret; - comp_param1 = dw_readl(dev, DW_IC_COMP_PARAM_1); + + ret = regmap_read(dev->map, DW_IC_COMP_PARAM_1, &comp_param1); i2c_dw_release_lock(dev); + if (ret) + return ret; /* Set standard and fast speed dividers for high/low periods */ sda_falling_time = t->sda_fall_ns ?: 300; /* ns */ @@ -162,22 +166,22 @@ static int i2c_dw_init_master(struct dw_i2c_dev *dev) __i2c_dw_disable(dev); /* Write standard speed timing parameters */ - dw_writel(dev, dev->ss_hcnt, DW_IC_SS_SCL_HCNT); - dw_writel(dev, dev->ss_lcnt, DW_IC_SS_SCL_LCNT); + regmap_write(dev->map, DW_IC_SS_SCL_HCNT, dev->ss_hcnt); + regmap_write(dev->map, DW_IC_SS_SCL_LCNT, dev->ss_lcnt); /* Write fast mode/fast mode plus timing parameters */ - dw_writel(dev, dev->fs_hcnt, DW_IC_FS_SCL_HCNT); - dw_writel(dev, dev->fs_lcnt, DW_IC_FS_SCL_LCNT); + regmap_write(dev->map, DW_IC_FS_SCL_HCNT, dev->fs_hcnt); + regmap_write(dev->map, DW_IC_FS_SCL_LCNT, dev->fs_lcnt); /* Write high speed timing parameters if supported */ if (dev->hs_hcnt && dev->hs_lcnt) { - dw_writel(dev, dev->hs_hcnt, DW_IC_HS_SCL_HCNT); - dw_writel(dev, dev->hs_lcnt, DW_IC_HS_SCL_LCNT); +
regmap_write(dev->map, DW_IC_HS_SCL_HCNT, dev->hs_hcnt); + regmap_write(dev->map, DW_IC_HS_SCL_LCNT, dev->hs_lcnt); } /* Write SDA hold time if supported */ if (dev->sda_hold_time) - dw_writel(dev, dev->sda_hold_time, DW_IC_SDA_HOLD); + regmap_write(dev->map, DW_IC_SDA_HOLD, dev->sda_hold_time); i2c_dw_configure_fifo_master(dev); i2c_dw_release_lock(dev); @@ -188,15 +192,15 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) { struct i2c_msg *msgs = dev->msgs; - u32 ic_con, ic_tar = 0; + u32 ic_con = 0, ic_tar = 0; + u32 dummy; /* Disable the adapter */ __i2c_dw_disable(dev); /* If the slave address is ten bit address, enable 10BITADDR */ - ic_con = dw_readl(dev, DW_IC_CON); if (msgs[dev->msg_write_idx].flags & I2C_M_TEN) { - ic_con |= DW_IC_CON_10BITADDR_MASTER; + ic_con = DW_IC_CON_10BITADDR_MASTER; /* * If I2C_DYNAMIC_TAR_UPDATE is set, the 10-bit addressing * mode has to be enabled via bit 12 of IC_TAR register. @@ -204,17 +208,17 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) * detected from registers. */ ic_tar = DW_IC_TAR_10BITADDR_MASTER; - } else { - ic_con &= ~DW_IC_CON_10BITADDR_MASTER; } - dw_writel(dev, ic_con, DW_IC_CON); + regmap_update_bits(dev->map, DW_IC_CON, DW_IC_CON_10BITADDR_MASTER, + ic_con); /* * Set the slave (target) address and enable 10-bit addressing mode * if applicable. */ - dw_writel(dev, msgs[dev->msg_write_idx].addr | ic_tar, DW_IC_TAR); + regmap_write(dev->map, DW_IC_TAR, + msgs[dev->msg_write_idx].addr | ic_tar); /* Enforce disabled interrupts (due to HW issues) */ i2c_dw_disable_int(dev); @@ -223,11 +227,11 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev) __i2c_dw_enable(dev); /* Dummy read to avoid the register getting stuck on Bay Trail */ - dw_readl(dev, DW_IC_ENABLE_STATUS); + regmap_read(dev->map, DW_IC_ENABLE_STATUS, &dummy); /* Clear and enable interrupts */ - dw_readl(dev, DW_IC_CLR_INTR); - dw_writel(dev, DW_IC_INTR_MASTER_MASK, DW_IC_INTR_MASK); + regmap_read(dev->map, DW_IC_CLR_INTR, &dummy); + regmap_write(dev->map, DW_IC_INTR_MASK, DW_IC_INTR_MASTER_MASK); } /* @@ -246,6 +250,7 @@ i2c_dw_xfer_msg(struct dw_i2c_dev *dev) u32 buf_len = dev->tx_buf_len; u8 *buf = dev->tx_buf; bool need_restart = false; + unsigned int flr; intr_mask = DW_IC_INTR_MASTER_MASK; @@ -278,8 +283,11 @@ i2c_dw_xfer_msg(struct dw_i2c_dev *dev) need_restart = true; } - tx_limit = dev->tx_fifo_depth - dw_readl(dev, DW_IC_TXFLR); - rx_limit = dev->rx_fifo_depth - dw_readl(dev, DW_IC_RXFLR); + regmap_read(dev->map, DW_IC_TXFLR, &flr); + tx_limit = dev->tx_fifo_depth - flr; + + regmap_read(dev->map, DW_IC_RXFLR, &flr); + rx_limit = dev->rx_fifo_depth - flr; while (buf_len > 0 && tx_limit > 0 && rx_limit > 0) { u32 cmd = 0; @@ -312,11 +320,14 @@ i2c_dw_xfer_msg(struct dw_i2c_dev *dev) if (dev->rx_outstanding >= dev->rx_fifo_depth) break; - dw_writel(dev, cmd | 0x100, DW_IC_DATA_CMD); + regmap_write(dev->map, DW_IC_DATA_CMD, + cmd | 0x100); rx_limit--; dev->rx_outstanding++; - } else - dw_writel(dev, cmd | *buf++, DW_IC_DATA_CMD); + } else { + regmap_write(dev->map, DW_IC_DATA_CMD, + cmd | *buf++); + } tx_limit--; buf_len--; } @@ -352,7 +363,7 @@ i2c_dw_xfer_msg(struct dw_i2c_dev *dev) if (dev->msg_err) intr_mask = 0; - dw_writel(dev, intr_mask, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_INTR_MASK, intr_mask); } static u8 @@ -375,9 +386,9 @@ i2c_dw_recv_len(struct dw_i2c_dev *dev, u8 len) * Received buffer length, re-enable TX_EMPTY interrupt * to resume the SMBUS transaction.
*/ - intr_mask = dw_readl(dev, DW_IC_INTR_MASK); + regmap_read(dev->map, DW_IC_INTR_MASK, &intr_mask); intr_mask |= DW_IC_INTR_TX_EMPTY; - dw_writel(dev, intr_mask, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_INTR_MASK, intr_mask); return len; } @@ -386,10 +397,10 @@ static void i2c_dw_read(struct dw_i2c_dev *dev) { struct i2c_msg *msgs = dev->msgs; - int rx_valid; + unsigned int rx_valid; for (; dev->msg_read_idx < dev->msgs_num; dev->msg_read_idx++) { - u32 len; + u32 len, tmp; u8 *buf; if (!(msgs[dev->msg_read_idx].flags & I2C_M_RD)) @@ -403,18 +414,18 @@ i2c_dw_read(struct dw_i2c_dev *dev) buf = dev->rx_buf; } - rx_valid = dw_readl(dev, DW_IC_RXFLR); + regmap_read(dev->map, DW_IC_RXFLR, &rx_valid); for (; len > 0 && rx_valid > 0; len--, rx_valid--) { u32 flags = msgs[dev->msg_read_idx].flags; - *buf = dw_readl(dev, DW_IC_DATA_CMD); + regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); /* Ensure length byte is a valid value */ if (flags & I2C_M_RECV_LEN && - *buf <= I2C_SMBUS_BLOCK_MAX && *buf > 0) { - len = i2c_dw_recv_len(dev, *buf); + tmp <= I2C_SMBUS_BLOCK_MAX && tmp > 0) { + len = i2c_dw_recv_len(dev, tmp); } - buf++; + *buf++ = tmp; dev->rx_outstanding--; } @@ -532,7 +543,7 @@ static const struct i2c_adapter_quirks i2c_dw_quirks = { static u32 i2c_dw_read_clear_intrbits(struct dw_i2c_dev *dev) { - u32 stat; + u32 stat, dummy; /* * The IC_INTR_STAT register just indicates "enabled" interrupts. @@ -540,47 +551,47 @@ static u32 i2c_dw_read_clear_intrbits(struct dw_i2c_dev *dev) * in the IC_RAW_INTR_STAT register. * * That is, - * stat = dw_readl(IC_INTR_STAT); + * stat = readl(IC_INTR_STAT); * equals to, - * stat = dw_readl(IC_RAW_INTR_STAT) & dw_readl(IC_INTR_MASK); + * stat = readl(IC_RAW_INTR_STAT) & readl(IC_INTR_MASK); * * The raw version might be useful for debugging purposes. */ - stat = dw_readl(dev, DW_IC_INTR_STAT); + regmap_read(dev->map, DW_IC_INTR_STAT, &stat); /* * Do not use the IC_CLR_INTR register to clear interrupts, or * you'll miss some interrupts, triggered during the period from - * dw_readl(IC_INTR_STAT) to dw_readl(IC_CLR_INTR). + * readl(IC_INTR_STAT) to readl(IC_CLR_INTR). * * Instead, use the separately-prepared IC_CLR_* registers. */ if (stat & DW_IC_INTR_RX_UNDER) - dw_readl(dev, DW_IC_CLR_RX_UNDER); + regmap_read(dev->map, DW_IC_CLR_RX_UNDER, &dummy); if (stat & DW_IC_INTR_RX_OVER) - dw_readl(dev, DW_IC_CLR_RX_OVER); + regmap_read(dev->map, DW_IC_CLR_RX_OVER, &dummy); if (stat & DW_IC_INTR_TX_OVER) - dw_readl(dev, DW_IC_CLR_TX_OVER); + regmap_read(dev->map, DW_IC_CLR_TX_OVER, &dummy); if (stat & DW_IC_INTR_RD_REQ) - dw_readl(dev, DW_IC_CLR_RD_REQ); + regmap_read(dev->map, DW_IC_CLR_RD_REQ, &dummy); if (stat & DW_IC_INTR_TX_ABRT) { /* * The IC_TX_ABRT_SOURCE register is cleared whenever * the IC_CLR_TX_ABRT is read. Preserve it beforehand. 
*/ - dev->abort_source = dw_readl(dev, DW_IC_TX_ABRT_SOURCE); - dw_readl(dev, DW_IC_CLR_TX_ABRT); + regmap_read(dev->map, DW_IC_TX_ABRT_SOURCE, &dev->abort_source); + regmap_read(dev->map, DW_IC_CLR_TX_ABRT, &dummy); } if (stat & DW_IC_INTR_RX_DONE) - dw_readl(dev, DW_IC_CLR_RX_DONE); + regmap_read(dev->map, DW_IC_CLR_RX_DONE, &dummy); if (stat & DW_IC_INTR_ACTIVITY) - dw_readl(dev, DW_IC_CLR_ACTIVITY); + regmap_read(dev->map, DW_IC_CLR_ACTIVITY, &dummy); if (stat & DW_IC_INTR_STOP_DET) - dw_readl(dev, DW_IC_CLR_STOP_DET); + regmap_read(dev->map, DW_IC_CLR_STOP_DET, &dummy); if (stat & DW_IC_INTR_START_DET) - dw_readl(dev, DW_IC_CLR_START_DET); + regmap_read(dev->map, DW_IC_CLR_START_DET, &dummy); if (stat & DW_IC_INTR_GEN_CALL) - dw_readl(dev, DW_IC_CLR_GEN_CALL); + regmap_read(dev->map, DW_IC_CLR_GEN_CALL, &dummy); return stat; } @@ -602,7 +613,7 @@ static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) * Anytime TX_ABRT is set, the contents of the tx/rx * buffers are flushed. Make sure to skip them. */ - dw_writel(dev, 0, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_INTR_MASK, 0); goto tx_aborted; } @@ -623,9 +634,9 @@ static int i2c_dw_irq_handler_master(struct dw_i2c_dev *dev) complete(&dev->cmd_complete); else if (unlikely(dev->flags & ACCESS_INTR_MASK)) { /* Workaround to trigger pending interrupt */ - stat = dw_readl(dev, DW_IC_INTR_MASK); + regmap_read(dev->map, DW_IC_INTR_MASK, &stat); i2c_dw_disable_int(dev); - dw_writel(dev, stat, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_INTR_MASK, stat); } return 0; @@ -636,8 +647,8 @@ static irqreturn_t i2c_dw_isr(int this_irq, void *dev_id) struct dw_i2c_dev *dev = dev_id; u32 stat, enabled; - enabled = dw_readl(dev, DW_IC_ENABLE); - stat = dw_readl(dev, DW_IC_RAW_INTR_STAT); + regmap_read(dev->map, DW_IC_ENABLE, &enabled); + regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &stat); dev_dbg(dev->dev, "enabled=%#x stat=%#x\n", enabled, stat); if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY)) return IRQ_NONE; @@ -705,7 +716,7 @@ int i2c_dw_probe(struct dw_i2c_dev *dev) dev->disable = i2c_dw_disable; dev->disable_int = i2c_dw_disable_int; - ret = i2c_dw_set_reg_access(dev); + ret = i2c_dw_init_regmap(dev); if (ret) return ret; @@ -713,7 +724,9 @@ int i2c_dw_probe(struct dw_i2c_dev *dev) if (ret) return ret; - i2c_dw_set_fifo_size(dev); + ret = i2c_dw_set_fifo_size(dev); + if (ret) + return ret; ret = dev->init(dev); if (ret) diff --git a/drivers/i2c/busses/i2c-designware-slave.c b/drivers/i2c/busses/i2c-designware-slave.c index 0fc3aa31d46a..aab5b3473b07 100644 --- a/drivers/i2c/busses/i2c-designware-slave.c +++ b/drivers/i2c/busses/i2c-designware-slave.c @@ -14,18 +14,19 @@ #include #include #include +#include #include "i2c-designware-core.h" static void i2c_dw_configure_fifo_slave(struct dw_i2c_dev *dev) { /* Configure Tx/Rx FIFO threshold levels. */ - dw_writel(dev, 0, DW_IC_TX_TL); - dw_writel(dev, 0, DW_IC_RX_TL); + regmap_write(dev->map, DW_IC_TX_TL, 0); + regmap_write(dev->map, DW_IC_RX_TL, 0); /* Configure the I2C slave. 
*/ - dw_writel(dev, dev->slave_cfg, DW_IC_CON); - dw_writel(dev, DW_IC_INTR_SLAVE_MASK, DW_IC_INTR_MASK); + regmap_write(dev->map, DW_IC_CON, dev->slave_cfg); + regmap_write(dev->map, DW_IC_INTR_MASK, DW_IC_INTR_SLAVE_MASK); } /** @@ -49,7 +50,7 @@ static int i2c_dw_init_slave(struct dw_i2c_dev *dev) /* Write SDA hold time if supported */ if (dev->sda_hold_time) - dw_writel(dev, dev->sda_hold_time, DW_IC_SDA_HOLD); + regmap_write(dev->map, DW_IC_SDA_HOLD, dev->sda_hold_time); i2c_dw_configure_fifo_slave(dev); i2c_dw_release_lock(dev); @@ -72,7 +73,7 @@ static int i2c_dw_reg_slave(struct i2c_client *slave) * the address to which the DW_apb_i2c responds. */ __i2c_dw_disable_nowait(dev); - dw_writel(dev, slave->addr, DW_IC_SAR); + regmap_write(dev->map, DW_IC_SAR, slave->addr); dev->slave = slave; __i2c_dw_enable(dev); @@ -103,7 +104,7 @@ static int i2c_dw_unreg_slave(struct i2c_client *slave) static u32 i2c_dw_read_clear_intrbits_slave(struct dw_i2c_dev *dev) { - u32 stat; + u32 stat, dummy; /* * The IC_INTR_STAT register just indicates "enabled" interrupts. @@ -111,39 +112,39 @@ static u32 i2c_dw_read_clear_intrbits_slave(struct dw_i2c_dev *dev) * in the IC_RAW_INTR_STAT register. * * That is, - * stat = dw_readl(IC_INTR_STAT); + * stat = readl(IC_INTR_STAT); * equals to, - * stat = dw_readl(IC_RAW_INTR_STAT) & dw_readl(IC_INTR_MASK); + * stat = readl(IC_RAW_INTR_STAT) & readl(IC_INTR_MASK); * * The raw version might be useful for debugging purposes. */ - stat = dw_readl(dev, DW_IC_INTR_STAT); + regmap_read(dev->map, DW_IC_INTR_STAT, &stat); /* * Do not use the IC_CLR_INTR register to clear interrupts, or * you'll miss some interrupts, triggered during the period from - * dw_readl(IC_INTR_STAT) to dw_readl(IC_CLR_INTR). + * readl(IC_INTR_STAT) to readl(IC_CLR_INTR). * * Instead, use the separately-prepared IC_CLR_* registers. 
*/ if (stat & DW_IC_INTR_TX_ABRT) - dw_readl(dev, DW_IC_CLR_TX_ABRT); + regmap_read(dev->map, DW_IC_CLR_TX_ABRT, &dummy); if (stat & DW_IC_INTR_RX_UNDER) - dw_readl(dev, DW_IC_CLR_RX_UNDER); + regmap_read(dev->map, DW_IC_CLR_RX_UNDER, &dummy); if (stat & DW_IC_INTR_RX_OVER) - dw_readl(dev, DW_IC_CLR_RX_OVER); + regmap_read(dev->map, DW_IC_CLR_RX_OVER, &dummy); if (stat & DW_IC_INTR_TX_OVER) - dw_readl(dev, DW_IC_CLR_TX_OVER); + regmap_read(dev->map, DW_IC_CLR_TX_OVER, &dummy); if (stat & DW_IC_INTR_RX_DONE) - dw_readl(dev, DW_IC_CLR_RX_DONE); + regmap_read(dev->map, DW_IC_CLR_RX_DONE, &dummy); if (stat & DW_IC_INTR_ACTIVITY) - dw_readl(dev, DW_IC_CLR_ACTIVITY); + regmap_read(dev->map, DW_IC_CLR_ACTIVITY, &dummy); if (stat & DW_IC_INTR_STOP_DET) - dw_readl(dev, DW_IC_CLR_STOP_DET); + regmap_read(dev->map, DW_IC_CLR_STOP_DET, &dummy); if (stat & DW_IC_INTR_START_DET) - dw_readl(dev, DW_IC_CLR_START_DET); + regmap_read(dev->map, DW_IC_CLR_START_DET, &dummy); if (stat & DW_IC_INTR_GEN_CALL) - dw_readl(dev, DW_IC_CLR_GEN_CALL); + regmap_read(dev->map, DW_IC_CLR_GEN_CALL, &dummy); return stat; } @@ -155,14 +156,14 @@ static u32 i2c_dw_read_clear_intrbits_slave(struct dw_i2c_dev *dev) static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) { - u32 raw_stat, stat, enabled; - u8 val, slave_activity; + u32 raw_stat, stat, enabled, tmp; + u8 val = 0, slave_activity; - stat = dw_readl(dev, DW_IC_INTR_STAT); - enabled = dw_readl(dev, DW_IC_ENABLE); - raw_stat = dw_readl(dev, DW_IC_RAW_INTR_STAT); - slave_activity = ((dw_readl(dev, DW_IC_STATUS) & - DW_IC_STATUS_SLAVE_ACTIVITY) >> 6); + regmap_read(dev->map, DW_IC_INTR_STAT, &stat); + regmap_read(dev->map, DW_IC_ENABLE, &enabled); + regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &raw_stat); + regmap_read(dev->map, DW_IC_STATUS, &tmp); + slave_activity = ((tmp & DW_IC_STATUS_SLAVE_ACTIVITY) >> 6); if (!enabled || !(raw_stat & ~DW_IC_INTR_ACTIVITY) || !dev->slave) return 0; @@ -177,7 +178,8 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) if (stat & DW_IC_INTR_RD_REQ) { if (slave_activity) { if (stat & DW_IC_INTR_RX_FULL) { - val = dw_readl(dev, DW_IC_DATA_CMD); + regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); + val = tmp; if (!i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, @@ -185,24 +187,24 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) dev_vdbg(dev->dev, "Byte %X acked!", val); } - dw_readl(dev, DW_IC_CLR_RD_REQ); + regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); stat = i2c_dw_read_clear_intrbits_slave(dev); } else { - dw_readl(dev, DW_IC_CLR_RD_REQ); - dw_readl(dev, DW_IC_CLR_RX_UNDER); + regmap_read(dev->map, DW_IC_CLR_RD_REQ, &tmp); + regmap_read(dev->map, DW_IC_CLR_RX_UNDER, &tmp); stat = i2c_dw_read_clear_intrbits_slave(dev); } if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_REQUESTED, &val)) - dw_writel(dev, val, DW_IC_DATA_CMD); + regmap_write(dev->map, DW_IC_DATA_CMD, val); } } if (stat & DW_IC_INTR_RX_DONE) { if (!i2c_slave_event(dev->slave, I2C_SLAVE_READ_PROCESSED, &val)) - dw_readl(dev, DW_IC_CLR_RX_DONE); + regmap_read(dev->map, DW_IC_CLR_RX_DONE, &tmp); i2c_slave_event(dev->slave, I2C_SLAVE_STOP, &val); stat = i2c_dw_read_clear_intrbits_slave(dev); @@ -210,7 +212,8 @@ static int i2c_dw_irq_handler_slave(struct dw_i2c_dev *dev) } if (stat & DW_IC_INTR_RX_FULL) { - val = dw_readl(dev, DW_IC_DATA_CMD); + regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); + val = tmp; if (!i2c_slave_event(dev->slave, I2C_SLAVE_WRITE_RECEIVED, &val)) dev_vdbg(dev->dev, "Byte %X acked!", val); @@ -252,7 +255,7 @@ int 
i2c_dw_probe_slave(struct dw_i2c_dev *dev)
 dev->disable = i2c_dw_disable;
 dev->disable_int = i2c_dw_disable_int;
- ret = i2c_dw_set_reg_access(dev);
+ ret = i2c_dw_init_regmap(dev);
 if (ret)
 return ret;
@@ -260,7 +263,9 @@ int i2c_dw_probe_slave(struct dw_i2c_dev *dev)
 if (ret)
 return ret;
- i2c_dw_set_fifo_size(dev);
+ ret = i2c_dw_set_fifo_size(dev);
+ if (ret)
+ return ret;
 ret = dev->init(dev);
 if (ret)
-- Gitee

From 182445ddad6893d0b501b557cd082bb33d231779 Mon Sep 17 00:00:00 2001
From: Shyam Sundar S K
Date: Tue, 24 Jan 2023 16:41:27 +0530
Subject: [PATCH 012/257] i2c: designware: add a new bit check for IC_CON control

commit 60a1f9f28660657f863a3846a64b6c3cbb18f07d upstream

On some AMD platforms, based on the new designware datasheet, BIOS sets the BIT(11) within the IC_CON register to advertise the "bus clear feature capability".

AMD/Designware datasheet says:
Bit(11) BUS_CLEAR_FEATURE_CTRL. Read-write, Volatile. Reset: 0.
Description: In Master mode:
- 1'b1: Bus Clear Feature is enabled.
- 1'b0: Bus Clear Feature is Disabled.
In Slave mode, this register bit is not applicable.

On AMD platform designs:
1. BIOS programs the BUS_CLEAR_FEATURE_CTRL and enables the detection of SCL/SDA stuck low.
2. Whenever the stuck low is detected, the SMU FW shall do the bus recovery procedure.

Currently, the way in which the "master_cfg" is built in the driver, it overrides the BUS_CLEAR_FEATURE_CTRL advertised by BIOS, and the SMU FW cannot initiate the bus recovery if the stuck low is detected. Hence add a check in i2c_dw_probe_master() so that, if the BIOS advertises the bus clear feature, the driver does not ignore it and adapts accordingly.

Reviewed-by: Andy Shevchenko Signed-off-by: Shyam Sundar S K Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2
---
 drivers/i2c/busses/i2c-designware-core.h | 1 +
 drivers/i2c/busses/i2c-designware-master.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h
index 57060001ce19..bf48ff6fa516 100644
--- a/drivers/i2c/busses/i2c-designware-core.h
+++ b/drivers/i2c/busses/i2c-designware-core.h
@@ -31,6 +31,7 @@
 #define DW_IC_CON_STOP_DET_IFADDRESSED 0x80
 #define DW_IC_CON_TX_EMPTY_CTRL 0x100
 #define DW_IC_CON_RX_FIFO_FULL_HLD_CTRL 0x200
+#define DW_IC_CON_BUS_CLEAR_CTRL BIT(11)
 /*
 * Registers offset
 */
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index af92405003d4..1335ee5e3137 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -708,6 +708,7 @@ int i2c_dw_probe(struct dw_i2c_dev *dev)
 {
 struct i2c_adapter *adap = &dev->adapter;
 unsigned long irq_flags;
+ unsigned int ic_con;
 int ret;
 init_completion(&dev->cmd_complete);
@@ -728,6 +729,25 @@ int i2c_dw_probe(struct dw_i2c_dev *dev)
 if (ret)
 return ret;
+ /* Lock the bus for accessing DW_IC_CON */
+ ret = i2c_dw_acquire_lock(dev);
+ if (ret)
+ return ret;
+
+ /*
+ * On AMD platforms BIOS advertises the bus clear feature
+ * and enables the SCL/SDA stuck low. SMU FW does the
+ * bus recovery process. Driver should not ignore this BIOS
+ * advertisement of bus clear feature.
+ */
+ ret = regmap_read(dev->map, DW_IC_CON, &ic_con);
+ i2c_dw_release_lock(dev);
+ if (ret)
+ return ret;
+
+ if (ic_con & DW_IC_CON_BUS_CLEAR_CTRL)
+ dev->master_cfg |= DW_IC_CON_BUS_CLEAR_CTRL;
+
 ret = dev->init(dev);
 if (ret)
 return ret;
-- Gitee

From 86a867101646609f2a381b23d2df810f56c43ed5 Mon Sep 17 00:00:00 2001
From: Ashish Kalra
Date: Thu, 17 Oct 2019 22:35:11 +0000
Subject: [PATCH 013/257] crypto: ccp - Retry SEV INIT command in case of integrity check failure.

commit 1d55fdc85799372ab3b0d2a6928e73439f8149aa upstream

The SEV INIT command loads the SEV related persistent data from NVS and initializes the platform context. The firmware validates the persistent state. If validation fails, the firmware will reset the persistent state and return an integrity check failure status. At this point, a subsequent INIT command should succeed, so retry the command. The INIT command retry is only done during driver initialization.

Additional enums along with SEV_RET_SECURE_DATA_INVALID are added to sev_ret_code to maintain continuity and relevance of enum values.

Signed-off-by: Ashish Kalra Acked-by: David Rientjes Reviewed-by: Brijesh Singh Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2
---
 drivers/crypto/ccp/psp-dev.c | 12 ++++++++++++
 include/uapi/linux/psp-sev.h | 3 +++
 2 files changed, 15 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 5acf6ae5af66..48ca5305d62d 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -1068,6 +1068,18 @@ void psp_pci_init(void)
 /* Initialize the platform */
 rc = sev_platform_init(&error);
+ if (rc && (error == SEV_RET_SECURE_DATA_INVALID)) {
+ /*
+ * INIT command returned an integrity check failure
+ * status code, meaning that firmware load and
+ * validation of SEV related persistent data has
+ * failed and persistent state has been erased.
+ * Retrying INIT command here should succeed.
+ */
+ dev_dbg(sp->dev, "SEV: retrying INIT command");
+ rc = sev_platform_init(&error);
+ }
+
 if (rc) {
 dev_err(sp->dev, "SEV: failed to INIT error %#x\n", error);
 return;
diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index 592a0c1b77c9..0549a5c622bf 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -58,6 +58,9 @@ typedef enum {
 SEV_RET_HWSEV_RET_PLATFORM,
 SEV_RET_HWSEV_RET_UNSAFE,
 SEV_RET_UNSUPPORTED,
+ SEV_RET_INVALID_PARAM,
+ SEV_RET_RESOURCE_LIMIT,
+ SEV_RET_SECURE_DATA_INVALID,
 SEV_RET_MAX,
} sev_ret_code;
-- Gitee

From 68e52a61c9a58e683c245f51f43706d1e995c60b Mon Sep 17 00:00:00 2001
From: Brijesh Singh
Date: Tue, 12 Nov 2019 13:58:34 -0600
Subject: [PATCH 014/257] crypto: ccp - add SEV command privilege separation

commit ec310caf13b5505c268cfa526b7b28152a879d1e upstream

Currently, there is no privilege separation of the SEV commands; you can run them all or none of them. This is less than ideal because it means that a compromise of the code which launches VMs could make permanent changes to the SEV certificate chain which will affect others.

These commands are required to attest the VM environment:
- SEV_PDH_CERT_EXPORT
- SEV_PLATFORM_STATUS
- SEV_GET_{ID,ID2}

These commands manage the SEV certificate chain:
- SEV_PEK_CERT_IMPORT
- SEV_FACTORY_RESET
- SEV_PEK_GEN
- SEV_PEK_CSR
- SEV_PDH_GEN

Let's add the CAP_SYS_ADMIN check for the group of commands that alter the SEV certificate chain, to provide some level of privilege separation.
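As a minimal sketch of the guard each certificate-chain handler gains (illustrative only: the handler name below is hypothetical, while capable(), CAP_SYS_ADMIN, SEV_CMD_FACTORY_RESET and __sev_do_cmd_locked() are the calls the diff actually uses):

static int sev_ioctl_do_cert_chain_op(struct sev_issue_cmd *argp)
{
	/* Commands that mutate the certificate chain become root-only. */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Attestation-only commands keep working without this privilege. */
	return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, NULL, &argp->error);
}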
Cc: Herbert Xu Cc: Gary Hook Cc: Erdem Aktas Cc: Tom Lendacky Tested-by: David Rientjes Co-developed-by: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Brijesh Singh Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- drivers/crypto/ccp/psp-dev.c | 29 ++++++++++++++++++++++------- drivers/crypto/ccp/psp-dev.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 48ca5305d62d..336c8d8c8a41 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -298,6 +298,9 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp) { int state, rc; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* * The SEV spec requires that FACTORY_RESET must be issued in * UNINIT state. Before we go further lets check if any guest is @@ -342,6 +345,9 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) { int rc; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (psp_master->sev_state == SEV_STATE_UNINIT) { rc = __sev_platform_init_locked(&argp->error); if (rc) @@ -358,6 +364,9 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) void *blob = NULL; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) return -EFAULT; @@ -544,6 +553,9 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp) void *pek_blob, *oca_blob; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) return -EFAULT; @@ -699,6 +711,16 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp) struct sev_data_pdh_cert_export *data; int ret; + /* If platform is not in INIT state then transition it to INIT. */ + if (psp_master->sev_state != SEV_STATE_INIT) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = __sev_platform_init_locked(&argp->error); + if (ret) + return ret; + } + if (copy_from_user(&input, (void __user *)argp->data, sizeof(input))) return -EFAULT; @@ -745,13 +767,6 @@ static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp) data->cert_chain_len = input.cert_chain_len; cmd: - /* If platform is not in INIT state then transition it to INIT. */ - if (psp_master->sev_state != SEV_STATE_INIT) { - ret = __sev_platform_init_locked(&argp->error); - if (ret) - goto e_free_cert; - } - ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error); /* If we query the length, FW responded with expected data. */ diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index 82a084f02990..dd516b35ba86 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "sp-dev.h" -- Gitee From 0ffe213bec50ad8b0b92fe681c3ff92a53d6cfae Mon Sep 17 00:00:00 2001 From: "Hook, Gary" Date: Mon, 21 Oct 2019 13:44:44 +0000 Subject: [PATCH 015/257] crypto: ccp - Verify access to device registers before initializing commit 03f008c52b76114b83483de2cf15ed36fc34930c upstream Check early whether device registers can be accessed. Some BIOSes have a broken security policy that prevents access to the device registers, and return values from ioread() can be misinterpreted. If a read of a feature register returns a -1, we may not be able to access any device register, so report the problem and suggestion, and return. For the PSP, the feature register is checked. For the CCP, the queue register is checked. 
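The check itself is a single comparison; a condensed sketch of the idiom both hunks below apply (the base/offset names here are placeholders, and the exact return convention differs between the CCP and PSP paths):

	u32 val = ioread32(base + reg_offset);

	/*
	 * A BIOS that blocks the BAR makes every MMIO read return all 1s,
	 * so an all-1s feature/queue value means the register space is
	 * unreachable, not that every feature bit is set.
	 */
	if (val == 0xffffffff) {
		dev_notice(dev, "unable to access the device\n");
		return -ENODEV;	/* fail this sub-device, not the module load */
	}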
Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- drivers/crypto/ccp/ccp-dev-v5.c | 12 ++++++++++++ drivers/crypto/ccp/psp-dev.c | 18 ++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c index 57eb53b8ac21..5cd883cfec9f 100644 --- a/drivers/crypto/ccp/ccp-dev-v5.c +++ b/drivers/crypto/ccp/ccp-dev-v5.c @@ -789,6 +789,18 @@ static int ccp5_init(struct ccp_device *ccp) /* Find available queues */ qmr = ioread32(ccp->io_regs + Q_MASK_REG); + /* + * Check for a access to the registers. If this read returns + * 0xffffffff, it's likely that the system is running a broken + * BIOS which disallows access to the device. Stop here and fail + * the initialization (but not the load, as the PSP could get + * properly initialized). + */ + if (qmr == 0xffffffff) { + dev_notice(dev, "ccp: unable to access the device: you might be running a broken BIOS.\n"); + return 1; + } + for (i = 0; (i < MAX_HW_QUEUES) && (ccp->cmd_q_count < ccp->max_q_count); i++) { if (!(qmr & (1 << i))) continue; diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 336c8d8c8a41..4d8cde575b41 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -948,8 +948,22 @@ static int sev_misc_init(struct psp_device *psp) static int psp_check_sev_support(struct psp_device *psp) { - /* Check if device supports SEV feature */ - if (!(ioread32(psp->io_regs + psp->vdata->feature_reg) & 1)) { + unsigned int val = ioread32(psp->io_regs + psp->vdata->feature_reg); + + /* + * Check for a access to the registers. If this read returns + * 0xffffffff, it's likely that the system is running a broken + * BIOS which disallows access to the device. Stop here and + * fail the PSP initialization (but not the load, as the CCP + * could get properly initialized). + */ + if (val == 0xffffffff) { + dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); + return -ENODEV; + } + + if (!(val & 1)) { + /* Device does not support the SEV feature */ dev_dbg(psp->dev, "psp does not support SEV\n"); return -ENODEV; } -- Gitee From ff70d7ef45145e4be68ba9b871f302e429b52b09 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:48:58 +0530 Subject: [PATCH 016/257] crypto: ccp - rename psp-dev files to sev-dev commit 9b67d08dbc1751ab15d972a63a4d9132e7e7442f upstream This is a preliminary patch for creating a generic PSP device driver file, which will have support for both SEV and TEE (Trusted Execution Environment) interface. This patch does not introduce any new functionality, but simply renames psp-dev.c and psp-dev.h files to sev-dev.c and sev-dev.h files respectively. 
Cc: Ard Biesheuvel Cc: Tom Lendacky Cc: Jens Wiklander Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- drivers/crypto/ccp/Makefile | 2 +- drivers/crypto/ccp/{psp-dev.c => sev-dev.c} | 6 +++--- drivers/crypto/ccp/{psp-dev.h => sev-dev.h} | 8 ++++---- drivers/crypto/ccp/sp-pci.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) rename drivers/crypto/ccp/{psp-dev.c => sev-dev.c} (99%) rename drivers/crypto/ccp/{psp-dev.h => sev-dev.h} (90%) diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 6b86f1e6d634..9dafcf2c08f8 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -8,7 +8,7 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \ ccp-dmaengine.o ccp-$(CONFIG_CRYPTO_DEV_CCP_DEBUGFS) += ccp-debugfs.o ccp-$(CONFIG_PCI) += sp-pci.o -ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o +ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += sev-dev.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/sev-dev.c similarity index 99% rename from drivers/crypto/ccp/psp-dev.c rename to drivers/crypto/ccp/sev-dev.c index 4d8cde575b41..8bf0cf8afab4 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * AMD Platform Security Processor (PSP) interface + * AMD Secure Encrypted Virtualization (SEV) interface * - * Copyright (C) 2016,2018 Advanced Micro Devices, Inc. + * Copyright (C) 2016,2019 Advanced Micro Devices, Inc. * * Author: Brijesh Singh */ @@ -22,7 +22,7 @@ #include #include "sp-dev.h" -#include "psp-dev.h" +#include "sev-dev.h" #define DEVICE_NAME "sev" #define SEV_FW_FILE "amd/sev.fw" diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/sev-dev.h similarity index 90% rename from drivers/crypto/ccp/psp-dev.h rename to drivers/crypto/ccp/sev-dev.h index dd516b35ba86..e86164785daf 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -2,13 +2,13 @@ /* * AMD Platform Security Processor (PSP) interface driver * - * Copyright (C) 2017-2018 Advanced Micro Devices, Inc. + * Copyright (C) 2017-2019 Advanced Micro Devices, Inc. * * Author: Brijesh Singh */ -#ifndef __PSP_DEV_H__ -#define __PSP_DEV_H__ +#ifndef __SEV_DEV_H__ +#define __SEV_DEV_H__ #include #include @@ -64,4 +64,4 @@ struct psp_device { u8 build; }; -#endif /* __PSP_DEV_H */ +#endif /* __SEV_DEV_H */ diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 39a5f10e61a2..85085f49cea5 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -22,7 +22,7 @@ #include #include "ccp-dev.h" -#include "psp-dev.h" +#include "sev-dev.h" #define MSIX_VECTORS 2 -- Gitee From 68f2747dba11f944384adb01dbbfbace04cb2fa8 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:48:59 +0530 Subject: [PATCH 017/257] crypto: ccp - create a generic psp-dev file commit b93566f1bb54e02a1ff1e3b4782073be1886744e upstream The PSP (Platform Security Processor) provides support for key management commands in Secure Encrypted Virtualization (SEV) mode, along with software-based Trusted Execution Environment (TEE) to enable third-party Trusted Applications. Therefore, introduce psp-dev.c and psp-dev.h files, which can invoke SEV (or TEE) initialization based on platform feature support. 
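In outline, the new psp-dev.c owns the shared plumbing and hands off to the per-feature code; a condensed sketch of the resulting call flow, taken from the diff below:

	/* psp-dev.c: the generic probe delegates feature setup. */
	ret = sev_dev_init(psp);
	if (ret)
		goto e_irq;

	/* sev-dev.c: the sub-driver hooks into the shared IRQ demux. */
	psp_set_sev_irq_handler(psp, sev_irq_handler, sev);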
TEE interface support will be introduced in a later patch. [Backport Changes] The deviation in the patch is due to a difference in line numbers. Cc: Ard Biesheuvel Cc: Tom Lendacky Cc: Jens Wiklander Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 194 +++++++++++++++++++++++++ drivers/crypto/ccp/psp-dev.h | 52 +++++++ drivers/crypto/ccp/sev-dev.c | 273 +++++++++++++---------------------- drivers/crypto/ccp/sev-dev.h | 36 ++--- drivers/crypto/ccp/sp-pci.c | 2 +- 6 files changed, 367 insertions(+), 193 deletions(-) create mode 100644 drivers/crypto/ccp/psp-dev.c create mode 100644 drivers/crypto/ccp/psp-dev.h diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 9dafcf2c08f8..3b29ea4a3583 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -8,7 +8,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \ ccp-dmaengine.o ccp-$(CONFIG_CRYPTO_DEV_CCP_DEBUGFS) += ccp-debugfs.o ccp-$(CONFIG_PCI) += sp-pci.o -ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += sev-dev.o +ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ + sev-dev.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c new file mode 100644 index 000000000000..2cd7a5ea4156 --- /dev/null +++ b/drivers/crypto/ccp/psp-dev.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Platform Security Processor (PSP) interface + * + * Copyright (C) 2016,2019 Advanced Micro Devices, Inc. + * + * Author: Brijesh Singh + */ + +#include +#include + +#include "sp-dev.h" +#include "psp-dev.h" +#include "sev-dev.h" + +struct psp_device *psp_master; + +static struct psp_device *psp_alloc_struct(struct sp_device *sp) +{ + struct device *dev = sp->dev; + struct psp_device *psp; + + psp = devm_kzalloc(dev, sizeof(*psp), GFP_KERNEL); + if (!psp) + return NULL; + + psp->dev = dev; + psp->sp = sp; + + snprintf(psp->name, sizeof(psp->name), "psp-%u", sp->ord); + + return psp; +} + +static irqreturn_t psp_irq_handler(int irq, void *data) +{ + struct psp_device *psp = data; + unsigned int status; + + /* Read the interrupt status: */ + status = ioread32(psp->io_regs + psp->vdata->intsts_reg); + + /* invoke subdevice interrupt handlers */ + if (status) { + if (psp->sev_irq_handler) + psp->sev_irq_handler(irq, psp->sev_irq_data, status); + } + + /* Clear the interrupt status by writing the same value we read. */ + iowrite32(status, psp->io_regs + psp->vdata->intsts_reg); + + return IRQ_HANDLED; +} + +static int psp_check_sev_support(struct psp_device *psp) +{ + unsigned int val = ioread32(psp->io_regs + psp->vdata->feature_reg); + + /* + * Check for a access to the registers. If this read returns + * 0xffffffff, it's likely that the system is running a broken + * BIOS which disallows access to the device. Stop here and + * fail the PSP initialization (but not the load, as the CCP + * could get properly initialized). 
+ */ + if (val == 0xffffffff) { + dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); + return -ENODEV; + } + + if (!(val & 1)) { + /* Device does not support the SEV feature */ + dev_dbg(psp->dev, "psp does not support SEV\n"); + return -ENODEV; + } + + return 0; +} + +int psp_dev_init(struct sp_device *sp) +{ + struct device *dev = sp->dev; + struct psp_device *psp; + int ret; + + ret = -ENOMEM; + psp = psp_alloc_struct(sp); + if (!psp) + goto e_err; + + sp->psp_data = psp; + + psp->vdata = (struct psp_vdata *)sp->dev_vdata->psp_vdata; + if (!psp->vdata) { + ret = -ENODEV; + dev_err(dev, "missing driver data\n"); + goto e_err; + } + + psp->io_regs = sp->io_map; + + ret = psp_check_sev_support(psp); + if (ret) + goto e_disable; + + /* Disable and clear interrupts until ready */ + iowrite32(0, psp->io_regs + psp->vdata->inten_reg); + iowrite32(-1, psp->io_regs + psp->vdata->intsts_reg); + + /* Request an irq */ + ret = sp_request_psp_irq(psp->sp, psp_irq_handler, psp->name, psp); + if (ret) { + dev_err(dev, "psp: unable to allocate an IRQ\n"); + goto e_err; + } + + ret = sev_dev_init(psp); + if (ret) + goto e_irq; + + if (sp->set_psp_master_device) + sp->set_psp_master_device(sp); + + /* Enable interrupt */ + iowrite32(-1, psp->io_regs + psp->vdata->inten_reg); + + dev_notice(dev, "psp enabled\n"); + + return 0; + +e_irq: + sp_free_psp_irq(psp->sp, psp); +e_err: + sp->psp_data = NULL; + + dev_notice(dev, "psp initialization failed\n"); + + return ret; + +e_disable: + sp->psp_data = NULL; + + return ret; +} + +void psp_dev_destroy(struct sp_device *sp) +{ + struct psp_device *psp = sp->psp_data; + + if (!psp) + return; + + sev_dev_destroy(psp); + + sp_free_psp_irq(sp, psp); +} + +void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, + void *data) +{ + psp->sev_irq_data = data; + psp->sev_irq_handler = handler; +} + +void psp_clear_sev_irq_handler(struct psp_device *psp) +{ + psp_set_sev_irq_handler(psp, NULL, NULL); +} + +struct psp_device *psp_get_master_device(void) +{ + struct sp_device *sp = sp_get_psp_master_device(); + + return sp ? sp->psp_data : NULL; +} + +void psp_pci_init(void) +{ + psp_master = psp_get_master_device(); + + if (!psp_master) + return; + + sev_pci_init(); +} + +void psp_pci_exit(void) +{ + if (!psp_master) + return; + + sev_pci_exit(); +} diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h new file mode 100644 index 000000000000..7c014acdd69c --- /dev/null +++ b/drivers/crypto/ccp/psp-dev.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) interface driver + * + * Copyright (C) 2017-2019 Advanced Micro Devices, Inc. 
+ * + * Author: Brijesh Singh + */ + +#ifndef __PSP_DEV_H__ +#define __PSP_DEV_H__ + +#include +#include +#include +#include + +#include "sp-dev.h" + +#define PSP_CMDRESP_RESP BIT(31) +#define PSP_CMDRESP_ERR_MASK 0xffff + +#define MAX_PSP_NAME_LEN 16 + +extern struct psp_device *psp_master; + +typedef void (*psp_irq_handler_t)(int, void *, unsigned int); + +struct psp_device { + struct list_head entry; + + struct psp_vdata *vdata; + char name[MAX_PSP_NAME_LEN]; + + struct device *dev; + struct sp_device *sp; + + void __iomem *io_regs; + + psp_irq_handler_t sev_irq_handler; + void *sev_irq_data; + + void *sev_data; +}; + +void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, + void *data); +void psp_clear_sev_irq_handler(struct psp_device *psp); + +struct psp_device *psp_get_master_device(void); + +#endif /* __PSP_DEV_H */ diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 8bf0cf8afab4..12e3501df186 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -21,7 +21,7 @@ #include #include -#include "sp-dev.h" +#include "psp-dev.h" #include "sev-dev.h" #define DEVICE_NAME "sev" @@ -30,7 +30,6 @@ static DEFINE_MUTEX(sev_cmd_mutex); static struct sev_misc_dev *misc_dev; -static struct psp_device *psp_master; static int psp_cmd_timeout = 100; module_param(psp_cmd_timeout, int, 0644); @@ -49,68 +48,45 @@ static int psp_timeout; static inline bool sev_version_greater_or_equal(u8 maj, u8 min) { - if (psp_master->api_major > maj) - return true; - if (psp_master->api_major == maj && psp_master->api_minor >= min) - return true; - return false; -} - -static struct psp_device *psp_alloc_struct(struct sp_device *sp) -{ - struct device *dev = sp->dev; - struct psp_device *psp; + struct sev_device *sev = psp_master->sev_data; - psp = devm_kzalloc(dev, sizeof(*psp), GFP_KERNEL); - if (!psp) - return NULL; - - psp->dev = dev; - psp->sp = sp; + if (sev->api_major > maj) + return true; - snprintf(psp->name, sizeof(psp->name), "psp-%u", sp->ord); + if (sev->api_major == maj && sev->api_minor >= min) + return true; - return psp; + return false; } -static irqreturn_t psp_irq_handler(int irq, void *data) +static void sev_irq_handler(int irq, void *data, unsigned int status) { - struct psp_device *psp = data; - unsigned int status; + struct sev_device *sev = data; int reg; - /* Read the interrupt status: */ - status = ioread32(psp->io_regs + psp->vdata->intsts_reg); - /* Check if it is command completion: */ - if (!(status & PSP_CMD_COMPLETE)) - goto done; + if (!(status & SEV_CMD_COMPLETE)) + return; /* Check if it is SEV command completion: */ - reg = ioread32(psp->io_regs + psp->vdata->cmdresp_reg); + reg = ioread32(sev->io_regs + sev->psp->vdata->cmdresp_reg); if (reg & PSP_CMDRESP_RESP) { - psp->sev_int_rcvd = 1; - wake_up(&psp->sev_int_queue); + sev->int_rcvd = 1; + wake_up(&sev->int_queue); } - -done: - /* Clear the interrupt status by writing the same value we read. 
*/ - iowrite32(status, psp->io_regs + psp->vdata->intsts_reg); - - return IRQ_HANDLED; } -static int sev_wait_cmd_ioc(struct psp_device *psp, +static int sev_wait_cmd_ioc(struct sev_device *sev, unsigned int *reg, unsigned int timeout) { int ret; - ret = wait_event_timeout(psp->sev_int_queue, - psp->sev_int_rcvd, timeout * HZ); + ret = wait_event_timeout(sev->int_queue, + sev->int_rcvd, timeout * HZ); if (!ret) return -ETIMEDOUT; - *reg = ioread32(psp->io_regs + psp->vdata->cmdresp_reg); + *reg = ioread32(sev->io_regs + sev->psp->vdata->cmdresp_reg); return 0; } @@ -154,42 +130,45 @@ static int sev_cmd_buffer_len(int cmd) static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) { struct psp_device *psp = psp_master; + struct sev_device *sev; unsigned int phys_lsb, phys_msb; unsigned int reg, ret = 0; - if (!psp) + if (!psp || !psp->sev_data) return -ENODEV; if (psp_dead) return -EBUSY; + sev = psp->sev_data; + /* Get the physical address of the command buffer */ phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0; - dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", cmd, phys_msb, phys_lsb, psp_timeout); print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); - iowrite32(phys_lsb, psp->io_regs + psp->vdata->cmdbuff_addr_lo_reg); - iowrite32(phys_msb, psp->io_regs + psp->vdata->cmdbuff_addr_hi_reg); + iowrite32(phys_lsb, sev->io_regs + psp->vdata->cmdbuff_addr_lo_reg); + iowrite32(phys_msb, sev->io_regs + psp->vdata->cmdbuff_addr_hi_reg); - psp->sev_int_rcvd = 0; + sev->int_rcvd = 0; reg = cmd; - reg <<= PSP_CMDRESP_CMD_SHIFT; - reg |= PSP_CMDRESP_IOC; - iowrite32(reg, psp->io_regs + psp->vdata->cmdresp_reg); + reg <<= SEV_CMDRESP_CMD_SHIFT; + reg |= SEV_CMDRESP_IOC; + iowrite32(reg, sev->io_regs + psp->vdata->cmdresp_reg); /* wait for command completion */ - ret = sev_wait_cmd_ioc(psp, ®, psp_timeout); + ret = sev_wait_cmd_ioc(sev, ®, psp_timeout); if (ret) { if (psp_ret) *psp_ret = 0; - dev_err(psp->dev, "sev command %#x timed out, disabling PSP \n", cmd); + dev_err(sev->dev, "sev command %#x timed out, disabling PSP\n", cmd); psp_dead = true; return ret; @@ -201,7 +180,7 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) *psp_ret = reg & PSP_CMDRESP_ERR_MASK; if (reg & PSP_CMDRESP_ERR_MASK) { - dev_dbg(psp->dev, "sev command %#x failed (%#010x)\n", + dev_dbg(sev->dev, "sev command %#x failed (%#010x)\n", cmd, reg & PSP_CMDRESP_ERR_MASK); ret = -EIO; } @@ -226,20 +205,23 @@ static int sev_do_cmd(int cmd, void *data, int *psp_ret) static int __sev_platform_init_locked(int *error) { struct psp_device *psp = psp_master; + struct sev_device *sev; int rc = 0; - if (!psp) + if (!psp || !psp->sev_data) return -ENODEV; - if (psp->sev_state == SEV_STATE_INIT) + sev = psp->sev_data; + + if (sev->state == SEV_STATE_INIT) return 0; - rc = __sev_do_cmd_locked(SEV_CMD_INIT, &psp->init_cmd_buf, error); + rc = __sev_do_cmd_locked(SEV_CMD_INIT, &sev->init_cmd_buf, error); if (rc) return rc; - psp->sev_state = SEV_STATE_INIT; - dev_dbg(psp->dev, "SEV firmware initialized\n"); + sev->state = SEV_STATE_INIT; + dev_dbg(sev->dev, "SEV firmware initialized\n"); return rc; } @@ -258,14 +240,15 @@ EXPORT_SYMBOL_GPL(sev_platform_init); static int __sev_platform_shutdown_locked(int *error) { + struct sev_device *sev = psp_master->sev_data; int ret; ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, NULL, 
error); if (ret) return ret; - psp_master->sev_state = SEV_STATE_UNINIT; - dev_dbg(psp_master->dev, "SEV firmware shutdown\n"); + sev->state = SEV_STATE_UNINIT; + dev_dbg(sev->dev, "SEV firmware shutdown\n"); return ret; } @@ -283,14 +266,15 @@ static int sev_platform_shutdown(int *error) static int sev_get_platform_state(int *state, int *error) { + struct sev_device *sev = psp_master->sev_data; int rc; rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, - &psp_master->status_cmd_buf, error); + &sev->status_cmd_buf, error); if (rc) return rc; - *state = psp_master->status_cmd_buf.state; + *state = sev->status_cmd_buf.state; return rc; } @@ -328,7 +312,8 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp) static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp) { - struct sev_user_data_status *data = &psp_master->status_cmd_buf; + struct sev_device *sev = psp_master->sev_data; + struct sev_user_data_status *data = &sev->status_cmd_buf; int ret; ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error); @@ -343,12 +328,13 @@ static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp) static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) { + struct sev_device *sev = psp_master->sev_data; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (psp_master->sev_state == SEV_STATE_UNINIT) { + if (sev->state == SEV_STATE_UNINIT) { rc = __sev_platform_init_locked(&argp->error); if (rc) return rc; @@ -359,6 +345,7 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) { + struct sev_device *sev = psp_master->sev_data; struct sev_user_data_pek_csr input; struct sev_data_pek_csr *data; void *blob = NULL; @@ -395,7 +382,7 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) data->len = input.length; cmd: - if (psp_master->sev_state == SEV_STATE_UNINIT) { + if (sev->state == SEV_STATE_UNINIT) { ret = __sev_platform_init_locked(&argp->error); if (ret) goto e_free_blob; @@ -438,21 +425,22 @@ EXPORT_SYMBOL_GPL(psp_copy_user_blob); static int sev_get_api_version(void) { + struct sev_device *sev = psp_master->sev_data; struct sev_user_data_status *status; int error = 0, ret; - status = &psp_master->status_cmd_buf; + status = &sev->status_cmd_buf; ret = sev_platform_status(status, &error); if (ret) { - dev_err(psp_master->dev, + dev_err(sev->dev, "SEV: failed to get status. 
Error: %#x\n", error); return 1; } - psp_master->api_major = status->api_major; - psp_master->api_minor = status->api_minor; - psp_master->build = status->build; - psp_master->sev_state = status->state; + sev->api_major = status->api_major; + sev->api_minor = status->api_minor; + sev->build = status->build; + sev->state = status->state; return 0; } @@ -548,6 +536,7 @@ static int sev_update_firmware(struct device *dev) static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp) { + struct sev_device *sev = psp_master->sev_data; struct sev_user_data_pek_cert_import input; struct sev_data_pek_cert_import *data; void *pek_blob, *oca_blob; @@ -584,7 +573,7 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp) data->oca_cert_len = input.oca_cert_len; /* If platform is not in INIT state then transition it to INIT */ - if (psp_master->sev_state != SEV_STATE_INIT) { + if (sev->state != SEV_STATE_INIT) { ret = __sev_platform_init_locked(&argp->error); if (ret) goto e_free_oca; @@ -706,13 +695,14 @@ static int sev_ioctl_do_get_id(struct sev_issue_cmd *argp) static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp) { + struct sev_device *sev = psp_master->sev_data; struct sev_user_data_pdh_cert_export input; void *pdh_blob = NULL, *cert_blob = NULL; struct sev_data_pdh_cert_export *data; int ret; /* If platform is not in INIT state then transition it to INIT. */ - if (psp_master->sev_state != SEV_STATE_INIT) { + if (sev->state != SEV_STATE_INIT) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -807,7 +797,7 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg) struct sev_issue_cmd input; int ret = -EFAULT; - if (!psp_master) + if (!psp_master || !psp_master->sev_data) return -ENODEV; if (ioctl != SEV_ISSUE_CMD) @@ -906,9 +896,9 @@ static void sev_exit(struct kref *ref) misc_deregister(&misc_dev->misc); } -static int sev_misc_init(struct psp_device *psp) +static int sev_misc_init(struct sev_device *sev) { - struct device *dev = psp->dev; + struct device *dev = sev->dev; int ret; /* @@ -939,115 +929,61 @@ static int sev_misc_init(struct psp_device *psp) kref_get(&misc_dev->refcount); } - init_waitqueue_head(&psp->sev_int_queue); - psp->sev_misc = misc_dev; + init_waitqueue_head(&sev->int_queue); + sev->misc = misc_dev; dev_dbg(dev, "registered SEV device\n"); return 0; } -static int psp_check_sev_support(struct psp_device *psp) -{ - unsigned int val = ioread32(psp->io_regs + psp->vdata->feature_reg); - - /* - * Check for a access to the registers. If this read returns - * 0xffffffff, it's likely that the system is running a broken - * BIOS which disallows access to the device. Stop here and - * fail the PSP initialization (but not the load, as the CCP - * could get properly initialized). 
- */ - if (val == 0xffffffff) { - dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); - return -ENODEV; - } - - if (!(val & 1)) { - /* Device does not support the SEV feature */ - dev_dbg(psp->dev, "psp does not support SEV\n"); - return -ENODEV; - } - - return 0; -} - -int psp_dev_init(struct sp_device *sp) +int sev_dev_init(struct psp_device *psp) { - struct device *dev = sp->dev; - struct psp_device *psp; - int ret; + struct device *dev = psp->dev; + struct sev_device *sev; + int ret = -ENOMEM; - ret = -ENOMEM; - psp = psp_alloc_struct(sp); - if (!psp) + sev = devm_kzalloc(dev, sizeof(*sev), GFP_KERNEL); + if (!sev) goto e_err; - sp->psp_data = psp; + psp->sev_data = sev; - psp->vdata = (struct psp_vdata *)sp->dev_vdata->psp_vdata; - if (!psp->vdata) { - ret = -ENODEV; - dev_err(dev, "missing driver data\n"); - goto e_err; - } + sev->dev = dev; + sev->psp = psp; - psp->io_regs = sp->io_map; + sev->io_regs = psp->io_regs; - ret = psp_check_sev_support(psp); - if (ret) - goto e_disable; + psp_set_sev_irq_handler(psp, sev_irq_handler, sev); - /* Disable and clear interrupts until ready */ - iowrite32(0, psp->io_regs + psp->vdata->inten_reg); - iowrite32(-1, psp->io_regs + psp->vdata->intsts_reg); - - /* Request an irq */ - ret = sp_request_psp_irq(psp->sp, psp_irq_handler, psp->name, psp); - if (ret) { - dev_err(dev, "psp: unable to allocate an IRQ\n"); - goto e_err; - } - - ret = sev_misc_init(psp); + ret = sev_misc_init(sev); if (ret) goto e_irq; - if (sp->set_psp_master_device) - sp->set_psp_master_device(sp); - - /* Enable interrupt */ - iowrite32(-1, psp->io_regs + psp->vdata->inten_reg); - - dev_notice(dev, "psp enabled\n"); + dev_notice(dev, "sev enabled\n"); return 0; e_irq: - sp_free_psp_irq(psp->sp, psp); + psp_clear_sev_irq_handler(psp); e_err: - sp->psp_data = NULL; + psp->sev_data = NULL; - dev_notice(dev, "psp initialization failed\n"); - - return ret; - -e_disable: - sp->psp_data = NULL; + dev_notice(dev, "sev initialization failed\n"); return ret; } -void psp_dev_destroy(struct sp_device *sp) +void sev_dev_destroy(struct psp_device *psp) { - struct psp_device *psp = sp->psp_data; + struct sev_device *sev = psp->sev_data; - if (!psp) + if (!sev) return; - if (psp->sev_misc) + if (sev->misc) kref_put(&misc_dev->refcount, sev_exit); - sp_free_psp_irq(sp, psp); + psp_clear_sev_irq_handler(psp); } int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd, @@ -1056,21 +992,18 @@ int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd, if (!filep || filep->f_op != &sev_fops) return -EBADF; - return sev_do_cmd(cmd, data, error); + return sev_do_cmd(cmd, data, error); } EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user); -void psp_pci_init(void) +void sev_pci_init(void) { - struct sp_device *sp; + struct sev_device *sev = psp_master->sev_data; int error, rc; - sp = sp_get_psp_master_device(); - if (!sp) + if (!sev) return; - psp_master = sp->psp_data; - psp_timeout = psp_probe_timeout; if (sev_get_api_version()) @@ -1086,13 +1019,13 @@ void psp_pci_init(void) * firmware in INIT or WORKING state. 
*/ - if (psp_master->sev_state != SEV_STATE_UNINIT) { + if (sev->state != SEV_STATE_UNINIT) { sev_platform_shutdown(NULL); - psp_master->sev_state = SEV_STATE_UNINIT; + sev->state = SEV_STATE_UNINIT; } if (sev_version_greater_or_equal(0, 15) && - sev_update_firmware(psp_master->dev) == 0) + sev_update_firmware(sev->dev) == 0) sev_get_api_version(); /* Initialize the platform */ @@ -1105,27 +1038,27 @@ void psp_pci_init(void) * failed and persistent state has been erased. * Retrying INIT command here should succeed. */ - dev_dbg(sp->dev, "SEV: retrying INIT command"); + dev_dbg(sev->dev, "SEV: retrying INIT command"); rc = sev_platform_init(&error); } if (rc) { - dev_err(sp->dev, "SEV: failed to INIT error %#x\n", error); + dev_err(sev->dev, "SEV: failed to INIT error %#x\n", error); return; } - dev_info(sp->dev, "SEV API:%d.%d build:%d\n", psp_master->api_major, - psp_master->api_minor, psp_master->build); + dev_info(sev->dev, "SEV API:%d.%d build:%d\n", sev->api_major, + sev->api_minor, sev->build); return; err: - psp_master = NULL; + psp_master->sev_data = NULL; } -void psp_pci_exit(void) +void sev_pci_exit(void) { - if (!psp_master) + if (!psp_master->sev_data) return; sev_platform_shutdown(NULL); diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index e86164785daf..3d84ac380bdc 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -25,37 +25,25 @@ #include #include -#include "sp-dev.h" - -#define PSP_CMD_COMPLETE BIT(1) - -#define PSP_CMDRESP_CMD_SHIFT 16 -#define PSP_CMDRESP_IOC BIT(0) -#define PSP_CMDRESP_RESP BIT(31) -#define PSP_CMDRESP_ERR_MASK 0xffff - -#define MAX_PSP_NAME_LEN 16 +#define SEV_CMD_COMPLETE BIT(1) +#define SEV_CMDRESP_CMD_SHIFT 16 +#define SEV_CMDRESP_IOC BIT(0) struct sev_misc_dev { struct kref refcount; struct miscdevice misc; }; -struct psp_device { - struct list_head entry; - - struct psp_vdata *vdata; - char name[MAX_PSP_NAME_LEN]; - +struct sev_device { struct device *dev; - struct sp_device *sp; + struct psp_device *psp; void __iomem *io_regs; - int sev_state; - unsigned int sev_int_rcvd; - wait_queue_head_t sev_int_queue; - struct sev_misc_dev *sev_misc; + int state; + unsigned int int_rcvd; + wait_queue_head_t int_queue; + struct sev_misc_dev *misc; struct sev_user_data_status status_cmd_buf; struct sev_data_init init_cmd_buf; @@ -64,4 +52,10 @@ struct psp_device { u8 build; }; +int sev_dev_init(struct psp_device *psp); +void sev_dev_destroy(struct psp_device *psp); + +void sev_pci_init(void); +void sev_pci_exit(void); + #endif /* __SEV_DEV_H */ diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 85085f49cea5..39a5f10e61a2 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -22,7 +22,7 @@ #include #include "ccp-dev.h" -#include "sev-dev.h" +#include "psp-dev.h" #define MSIX_VECTORS 2 -- Gitee From 3ba185d409554a2a3a4aaf4b109a92fdb2fb409f Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:49:00 +0530 Subject: [PATCH 018/257] crypto: ccp - move SEV vdata to a dedicated data structure commit 6eb0cc72bcbe9cc5b9ccd41d7226929767e41311 upstream PSP can support both SEV and TEE interface. Therefore, move SEV specific registers to a dedicated data structure. TEE interface specific registers will be added in a later patch. 
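Concretely, each platform generation then describes its SEV mailbox registers in a sev_vdata instance that the psp_vdata points at; the v1 values below are exactly the ones sp-pci.c wires up in this patch's diff:

static const struct sev_vdata sevv1 = {
	.cmdresp_reg		= 0x10580,
	.cmdbuff_addr_lo_reg	= 0x105e0,
	.cmdbuff_addr_hi_reg	= 0x105e4,
};

static const struct psp_vdata pspv1 = {
	.sev		= &sevv1,
	.feature_reg	= 0x105fc,
	.inten_reg	= 0x10610,
	.intsts_reg	= 0x10614,
};

Command submission then indexes through the per-interface table, e.g. iowrite32(reg, sev->io_regs + sev->vdata->cmdresp_reg).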
Cc: Ard Biesheuvel Cc: Tom Lendacky Cc: Jens Wiklander Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- drivers/crypto/ccp/sev-dev.c | 17 ++++++++++++----- drivers/crypto/ccp/sev-dev.h | 2 ++ drivers/crypto/ccp/sp-dev.h | 6 +++++- drivers/crypto/ccp/sp-pci.c | 16 ++++++++++++---- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 12e3501df186..b00dcc2be897 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -69,7 +69,7 @@ static void sev_irq_handler(int irq, void *data, unsigned int status) return; /* Check if it is SEV command completion: */ - reg = ioread32(sev->io_regs + sev->psp->vdata->cmdresp_reg); + reg = ioread32(sev->io_regs + sev->vdata->cmdresp_reg); if (reg & PSP_CMDRESP_RESP) { sev->int_rcvd = 1; wake_up(&sev->int_queue); @@ -86,7 +86,7 @@ static int sev_wait_cmd_ioc(struct sev_device *sev, if (!ret) return -ETIMEDOUT; - *reg = ioread32(sev->io_regs + sev->psp->vdata->cmdresp_reg); + *reg = ioread32(sev->io_regs + sev->vdata->cmdresp_reg); return 0; } @@ -152,15 +152,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); - iowrite32(phys_lsb, sev->io_regs + psp->vdata->cmdbuff_addr_lo_reg); - iowrite32(phys_msb, sev->io_regs + psp->vdata->cmdbuff_addr_hi_reg); + iowrite32(phys_lsb, sev->io_regs + sev->vdata->cmdbuff_addr_lo_reg); + iowrite32(phys_msb, sev->io_regs + sev->vdata->cmdbuff_addr_hi_reg); sev->int_rcvd = 0; reg = cmd; reg <<= SEV_CMDRESP_CMD_SHIFT; reg |= SEV_CMDRESP_IOC; - iowrite32(reg, sev->io_regs + psp->vdata->cmdresp_reg); + iowrite32(reg, sev->io_regs + sev->vdata->cmdresp_reg); /* wait for command completion */ ret = sev_wait_cmd_ioc(sev, ®, psp_timeout); @@ -953,6 +953,13 @@ int sev_dev_init(struct psp_device *psp) sev->io_regs = psp->io_regs; + sev->vdata = (struct sev_vdata *)psp->vdata->sev; + if (!sev->vdata) { + ret = -ENODEV; + dev_err(dev, "sev: missing driver data\n"); + goto e_err; + } + psp_set_sev_irq_handler(psp, sev_irq_handler, sev); ret = sev_misc_init(sev); diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 3d84ac380bdc..dd5c4fe82914 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -40,6 +40,8 @@ struct sev_device { void __iomem *io_regs; + struct sev_vdata *vdata; + int state; unsigned int int_rcvd; wait_queue_head_t int_queue; diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 53c12562d31e..0394c752a7c8 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -39,10 +39,14 @@ struct ccp_vdata { const unsigned int rsamax; }; -struct psp_vdata { +struct sev_vdata { const unsigned int cmdresp_reg; const unsigned int cmdbuff_addr_lo_reg; const unsigned int cmdbuff_addr_hi_reg; +}; + +struct psp_vdata { + const struct sev_vdata *sev; const unsigned int feature_reg; const unsigned int inten_reg; const unsigned int intsts_reg; diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 39a5f10e61a2..788983580e01 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -264,19 +264,27 @@ static int sp_pci_resume(struct pci_dev *pdev) #endif #ifdef CONFIG_CRYPTO_DEV_SP_PSP -static const struct psp_vdata pspv1 = { +static const struct 
sev_vdata sevv1 = { .cmdresp_reg = 0x10580, .cmdbuff_addr_lo_reg = 0x105e0, .cmdbuff_addr_hi_reg = 0x105e4, +}; + +static const struct sev_vdata sevv2 = { + .cmdresp_reg = 0x10980, + .cmdbuff_addr_lo_reg = 0x109e0, + .cmdbuff_addr_hi_reg = 0x109e4, +}; + +static const struct psp_vdata pspv1 = { + .sev = &sevv1, .feature_reg = 0x105fc, .inten_reg = 0x10610, .intsts_reg = 0x10614, }; static const struct psp_vdata pspv2 = { - .cmdresp_reg = 0x10980, - .cmdbuff_addr_lo_reg = 0x109e0, - .cmdbuff_addr_hi_reg = 0x109e4, + .sev = &sevv2, .feature_reg = 0x109fc, .inten_reg = 0x10690, .intsts_reg = 0x10694, -- Gitee From 8f87460d95ef9af4e0a3e48d5b19eaa65d088523 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:49:01 +0530 Subject: [PATCH 019/257] crypto: ccp - check whether PSP supports SEV or TEE before initialization commit f100ab62b68922c343a8efc84e83d2275c1ade47 upstream Read PSP feature register to check for TEE (Trusted Execution Environment) support. If neither SEV nor TEE is supported by PSP, then skip PSP initialization. Cc: Tom Lendacky Cc: Jens Wiklander Cc: Ard Biesheuvel Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/crypto/ccp/psp-dev.c | 46 ++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 2cd7a5ea4156..3bedf7254a97 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -53,7 +53,7 @@ static irqreturn_t psp_irq_handler(int irq, void *data) return IRQ_HANDLED; } -static int psp_check_sev_support(struct psp_device *psp) +static unsigned int psp_get_capability(struct psp_device *psp) { unsigned int val = ioread32(psp->io_regs + psp->vdata->feature_reg); @@ -66,11 +66,17 @@ static int psp_check_sev_support(struct psp_device *psp) */ if (val == 0xffffffff) { dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); - return -ENODEV; + return 0; } - if (!(val & 1)) { - /* Device does not support the SEV feature */ + return val; +} + +static int psp_check_sev_support(struct psp_device *psp, + unsigned int capability) +{ + /* Check if device supports SEV feature */ + if (!(capability & 1)) { dev_dbg(psp->dev, "psp does not support SEV\n"); return -ENODEV; } @@ -78,10 +84,36 @@ static int psp_check_sev_support(struct psp_device *psp) return 0; } +static int psp_check_tee_support(struct psp_device *psp, + unsigned int capability) +{ + /* Check if device supports TEE feature */ + if (!(capability & 2)) { + dev_dbg(psp->dev, "psp does not support TEE\n"); + return -ENODEV; + } + + return 0; +} + +static int psp_check_support(struct psp_device *psp, + unsigned int capability) +{ + int sev_support = psp_check_sev_support(psp, capability); + int tee_support = psp_check_tee_support(psp, capability); + + /* Return error if device neither supports SEV nor TEE */ + if (sev_support && tee_support) + return -ENODEV; + + return 0; +} + int psp_dev_init(struct sp_device *sp) { struct device *dev = sp->dev; struct psp_device *psp; + unsigned int capability; int ret; ret = -ENOMEM; @@ -100,7 +132,11 @@ int psp_dev_init(struct sp_device *sp) psp->io_regs = sp->io_map; - ret = psp_check_sev_support(psp); + capability = psp_get_capability(psp); + if (!capability) + goto e_disable; + + ret = psp_check_support(psp, capability); if (ret) goto e_disable; -- Gitee 
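Condensed, the gating added by the patch above is one register read and two bit tests; a sketch only, since the real code splits this across psp_get_capability() and the psp_check_*_support() helpers:

	unsigned int capability = ioread32(psp->io_regs + psp->vdata->feature_reg);

	if (capability == 0xffffffff)	/* BIOS blocked the register space */
		return -ENODEV;

	/* Bit 0 advertises SEV, bit 1 advertises TEE; either will do. */
	if (!(capability & 1) && !(capability & 2))
		return -ENODEV;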
From 89c625b3e619cc1c9110811375821ed809a4c7d8 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:49:02 +0530 Subject: [PATCH 020/257] crypto: ccp - add TEE support for Raven Ridge commit 33960acccfbd7f24d443cb3d0312ac28abe62bae upstream Adds a PCI device entry for Raven Ridge. Raven Ridge is an APU with a dedicated AMD Secure Processor having Trusted Execution Environment (TEE) support. The TEE provides a secure environment for running Trusted Applications (TAs) which implement security-sensitive parts of a feature. This patch configures AMD Secure Processor's TEE interface by initializing a ring buffer (shared memory between Rich OS and Trusted OS) which can hold multiple command buffer entries. The TEE interface is facilitated by a set of CPU to PSP mailbox registers. The next patch will address how commands are submitted to the ring buffer. [Backport Changes] 1. In drivers/crypto/ccp/sp-pci.c, within the sp_pci_table (an instance of struct pci_device_id),the CCP/PSP PCI device 0x14CA was initially added at index 4 (instead of index 5) in OpenCloud commit 31540607045b5, along with its corresponding dev_vdata entry, because support for device 0x15DF was not yet present in the OpenCloud source code. Since the current patch introduces this missing device 0x15DF at index 4, the entry for device 0x14CA has been moved to index 5 to restore the correct upstream ordering from commit 3438de03e98a, in both struct sp_pci_table and dev_vdata entry. Cc: Jens Wiklander Cc: Tom Lendacky Cc: Ard Biesheuvel Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 39 +++++- drivers/crypto/ccp/psp-dev.h | 8 ++ drivers/crypto/ccp/sp-dev.h | 11 +- drivers/crypto/ccp/sp-pci.c | 29 ++++- drivers/crypto/ccp/tee-dev.c | 238 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/tee-dev.h | 109 ++++++++++++++++ 7 files changed, 432 insertions(+), 5 deletions(-) create mode 100644 drivers/crypto/ccp/tee-dev.c create mode 100644 drivers/crypto/ccp/tee-dev.h diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 3b29ea4a3583..db362fe472ea 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -9,7 +9,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \ ccp-$(CONFIG_CRYPTO_DEV_CCP_DEBUGFS) += ccp-debugfs.o ccp-$(CONFIG_PCI) += sp-pci.o ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ - sev-dev.o + sev-dev.o \ + tee-dev.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 3bedf7254a97..e95e7aa5dbf1 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -13,6 +13,7 @@ #include "sp-dev.h" #include "psp-dev.h" #include "sev-dev.h" +#include "tee-dev.h" struct psp_device *psp_master; @@ -45,6 +46,9 @@ static irqreturn_t psp_irq_handler(int irq, void *data) if (status) { if (psp->sev_irq_handler) psp->sev_irq_handler(irq, psp->sev_irq_data, status); + + if (psp->tee_irq_handler) + psp->tee_irq_handler(irq, psp->tee_irq_data, status); } /* Clear the interrupt status by writing the same value we read. 
*/ @@ -109,6 +113,25 @@ static int psp_check_support(struct psp_device *psp, return 0; } +static int psp_init(struct psp_device *psp, unsigned int capability) +{ + int ret; + + if (!psp_check_sev_support(psp, capability)) { + ret = sev_dev_init(psp); + if (ret) + return ret; + } + + if (!psp_check_tee_support(psp, capability)) { + ret = tee_dev_init(psp); + if (ret) + return ret; + } + + return 0; +} + int psp_dev_init(struct sp_device *sp) { struct device *dev = sp->dev; @@ -151,7 +174,7 @@ int psp_dev_init(struct sp_device *sp) goto e_err; } - ret = sev_dev_init(psp); + ret = psp_init(psp, capability); if (ret) goto e_irq; @@ -189,6 +212,8 @@ void psp_dev_destroy(struct sp_device *sp) sev_dev_destroy(psp); + tee_dev_destroy(psp); + sp_free_psp_irq(sp, psp); } @@ -204,6 +229,18 @@ void psp_clear_sev_irq_handler(struct psp_device *psp) psp_set_sev_irq_handler(psp, NULL, NULL); } +void psp_set_tee_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, + void *data) +{ + psp->tee_irq_data = data; + psp->tee_irq_handler = handler; +} + +void psp_clear_tee_irq_handler(struct psp_device *psp) +{ + psp_set_tee_irq_handler(psp, NULL, NULL); +} + struct psp_device *psp_get_master_device(void) { struct sp_device *sp = sp_get_psp_master_device(); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index 7c014acdd69c..ef38e4135d81 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -40,13 +40,21 @@ struct psp_device { psp_irq_handler_t sev_irq_handler; void *sev_irq_data; + psp_irq_handler_t tee_irq_handler; + void *tee_irq_data; + void *sev_data; + void *tee_data; }; void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, void *data); void psp_clear_sev_irq_handler(struct psp_device *psp); +void psp_set_tee_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, + void *data); +void psp_clear_tee_irq_handler(struct psp_device *psp); + struct psp_device *psp_get_master_device(void); #endif /* __PSP_DEV_H */ diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 0394c752a7c8..423594608ad1 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -2,7 +2,7 @@ /* * AMD Secure Processor driver * - * Copyright (C) 2017-2018 Advanced Micro Devices, Inc. + * Copyright (C) 2017-2019 Advanced Micro Devices, Inc. * * Author: Tom Lendacky * Author: Gary R Hook @@ -45,8 +45,17 @@ struct sev_vdata { const unsigned int cmdbuff_addr_hi_reg; }; +struct tee_vdata { + const unsigned int cmdresp_reg; + const unsigned int cmdbuff_addr_lo_reg; + const unsigned int cmdbuff_addr_hi_reg; + const unsigned int ring_wptr_reg; + const unsigned int ring_rptr_reg; +}; + struct psp_vdata { const struct sev_vdata *sev; + const struct tee_vdata *tee; const unsigned int feature_reg; const unsigned int inten_reg; const unsigned int intsts_reg; diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 788983580e01..26061407a641 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -2,7 +2,7 @@ /* * AMD Secure Processor device driver * - * Copyright (C) 2013,2018 Advanced Micro Devices, Inc. + * Copyright (C) 2013,2019 Advanced Micro Devices, Inc. 
* * Author: Tom Lendacky * Author: Gary R Hook @@ -276,6 +276,14 @@ static const struct sev_vdata sevv2 = { .cmdbuff_addr_hi_reg = 0x109e4, }; +static const struct tee_vdata teev1 = { + .cmdresp_reg = 0x10544, + .cmdbuff_addr_lo_reg = 0x10548, + .cmdbuff_addr_hi_reg = 0x1054c, + .ring_wptr_reg = 0x10550, + .ring_rptr_reg = 0x10554, +}; + static const struct psp_vdata pspv1 = { .sev = &sevv1, .feature_reg = 0x105fc, @@ -289,6 +297,13 @@ static const struct psp_vdata pspv2 = { .inten_reg = 0x10690, .intsts_reg = 0x10694, }; + +static const struct psp_vdata pspv3 = { + .tee = &teev1, + .feature_reg = 0x109fc, + .inten_reg = 0x10690, + .intsts_reg = 0x10694, +}; #endif static const struct sp_dev_vdata dev_vdata[] = { @@ -324,6 +339,15 @@ static const struct sp_dev_vdata dev_vdata[] = { }, { /* 4 */ .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + .ccp_vdata = &ccpv5a, +#endif +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + .psp_vdata = &pspv3, +#endif + }, + { /* 5 */ + .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP .psp_vdata = &pspv2, #endif @@ -334,7 +358,8 @@ static const struct pci_device_id sp_pci_table[] = { { PCI_VDEVICE(AMD, 0x1456), (kernel_ulong_t)&dev_vdata[1] }, { PCI_VDEVICE(AMD, 0x1468), (kernel_ulong_t)&dev_vdata[2] }, { PCI_VDEVICE(AMD, 0x1486), (kernel_ulong_t)&dev_vdata[3] }, - { PCI_VDEVICE(AMD, 0x14CA), (kernel_ulong_t)&dev_vdata[4] }, + { PCI_VDEVICE(AMD, 0x15DF), (kernel_ulong_t)&dev_vdata[4] }, + { PCI_VDEVICE(AMD, 0x14CA), (kernel_ulong_t)&dev_vdata[5] }, /* Last entry must be zero */ { 0, } }; diff --git a/drivers/crypto/ccp/tee-dev.c b/drivers/crypto/ccp/tee-dev.c new file mode 100644 index 000000000000..ccbc2ce59d51 --- /dev/null +++ b/drivers/crypto/ccp/tee-dev.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: MIT +/* + * AMD Trusted Execution Environment (TEE) interface + * + * Author: Rijo Thomas + * Author: Devaraj Rangasamy + * + * Copyright 2019 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "psp-dev.h" +#include "tee-dev.h" + +static bool psp_dead; + +static int tee_alloc_ring(struct psp_tee_device *tee, int ring_size) +{ + struct ring_buf_manager *rb_mgr = &tee->rb_mgr; + void *start_addr; + + if (!ring_size) + return -EINVAL; + + /* We need actual physical address instead of DMA address, since + * Trusted OS running on AMD Secure Processor will map this region + */ + start_addr = (void *)__get_free_pages(GFP_KERNEL, get_order(ring_size)); + if (!start_addr) + return -ENOMEM; + + rb_mgr->ring_start = start_addr; + rb_mgr->ring_size = ring_size; + rb_mgr->ring_pa = __psp_pa(start_addr); + + return 0; +} + +static void tee_free_ring(struct psp_tee_device *tee) +{ + struct ring_buf_manager *rb_mgr = &tee->rb_mgr; + + if (!rb_mgr->ring_start) + return; + + free_pages((unsigned long)rb_mgr->ring_start, + get_order(rb_mgr->ring_size)); + + rb_mgr->ring_start = NULL; + rb_mgr->ring_size = 0; + rb_mgr->ring_pa = 0; +} + +static int tee_wait_cmd_poll(struct psp_tee_device *tee, unsigned int timeout, + unsigned int *reg) +{ + /* ~10ms sleep per loop => nloop = timeout * 100 */ + int nloop = timeout * 100; + + while (--nloop) { + *reg = ioread32(tee->io_regs + tee->vdata->cmdresp_reg); + if (*reg & PSP_CMDRESP_RESP) + return 0; + + usleep_range(10000, 10100); + } + + dev_err(tee->dev, "tee: command timed out, disabling PSP\n"); + psp_dead = true; + + return -ETIMEDOUT; +} + +static +struct tee_init_ring_cmd *tee_alloc_cmd_buffer(struct psp_tee_device *tee) +{ + struct tee_init_ring_cmd *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return NULL; + + cmd->hi_addr = upper_32_bits(tee->rb_mgr.ring_pa); + cmd->low_addr = lower_32_bits(tee->rb_mgr.ring_pa); + cmd->size = tee->rb_mgr.ring_size; + + dev_dbg(tee->dev, "tee: ring address: high = 0x%x low = 0x%x size = %u\n", + cmd->hi_addr, cmd->low_addr, cmd->size); + + return cmd; +} + +static inline void tee_free_cmd_buffer(struct tee_init_ring_cmd *cmd) +{ + kfree(cmd); +} + +static int tee_init_ring(struct psp_tee_device *tee) +{ + int ring_size = MAX_RING_BUFFER_ENTRIES * sizeof(struct tee_ring_cmd); + struct tee_init_ring_cmd *cmd; + phys_addr_t cmd_buffer; + unsigned int reg; + int ret; + + BUILD_BUG_ON(sizeof(struct tee_ring_cmd) != 1024); + + ret = tee_alloc_ring(tee, ring_size); + if (ret) { + dev_err(tee->dev, "tee: ring allocation failed %d\n", ret); + return ret; + } + + tee->rb_mgr.wptr = 0; + + cmd = tee_alloc_cmd_buffer(tee); + if (!cmd) { + tee_free_ring(tee); + return -ENOMEM; + } + + cmd_buffer = __psp_pa((void *)cmd); + + /* Send command buffer details to Trusted OS by writing to + * CPU-PSP message registers + */ + + iowrite32(lower_32_bits(cmd_buffer), + tee->io_regs + tee->vdata->cmdbuff_addr_lo_reg); + iowrite32(upper_32_bits(cmd_buffer), + tee->io_regs + tee->vdata->cmdbuff_addr_hi_reg); + iowrite32(TEE_RING_INIT_CMD, + tee->io_regs + tee->vdata->cmdresp_reg); + + ret = tee_wait_cmd_poll(tee, TEE_DEFAULT_TIMEOUT, ®); + if (ret) { + dev_err(tee->dev, "tee: ring init command timed out\n"); + tee_free_ring(tee); + goto free_buf; + } + + if (reg & PSP_CMDRESP_ERR_MASK) { + dev_err(tee->dev, "tee: ring init command failed (%#010x)\n", + reg & PSP_CMDRESP_ERR_MASK); + tee_free_ring(tee); + ret = -EIO; + } + +free_buf: + tee_free_cmd_buffer(cmd); + + return ret; +} + +static void tee_destroy_ring(struct psp_tee_device *tee) +{ + unsigned int reg; + int ret; + + if (!tee->rb_mgr.ring_start) + return; + + if (psp_dead) + goto 
free_ring; + + iowrite32(TEE_RING_DESTROY_CMD, + tee->io_regs + tee->vdata->cmdresp_reg); + + ret = tee_wait_cmd_poll(tee, TEE_DEFAULT_TIMEOUT, ®); + if (ret) { + dev_err(tee->dev, "tee: ring destroy command timed out\n"); + } else if (reg & PSP_CMDRESP_ERR_MASK) { + dev_err(tee->dev, "tee: ring destroy command failed (%#010x)\n", + reg & PSP_CMDRESP_ERR_MASK); + } + +free_ring: + tee_free_ring(tee); +} + +int tee_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct psp_tee_device *tee; + int ret; + + ret = -ENOMEM; + tee = devm_kzalloc(dev, sizeof(*tee), GFP_KERNEL); + if (!tee) + goto e_err; + + psp->tee_data = tee; + + tee->dev = dev; + tee->psp = psp; + + tee->io_regs = psp->io_regs; + + tee->vdata = (struct tee_vdata *)psp->vdata->tee; + if (!tee->vdata) { + ret = -ENODEV; + dev_err(dev, "tee: missing driver data\n"); + goto e_err; + } + + ret = tee_init_ring(tee); + if (ret) { + dev_err(dev, "tee: failed to init ring buffer\n"); + goto e_err; + } + + dev_notice(dev, "tee enabled\n"); + + return 0; + +e_err: + psp->tee_data = NULL; + + dev_notice(dev, "tee initialization failed\n"); + + return ret; +} + +void tee_dev_destroy(struct psp_device *psp) +{ + struct psp_tee_device *tee = psp->tee_data; + + if (!tee) + return; + + tee_destroy_ring(tee); +} diff --git a/drivers/crypto/ccp/tee-dev.h b/drivers/crypto/ccp/tee-dev.h new file mode 100644 index 000000000000..b3db0fcb550c --- /dev/null +++ b/drivers/crypto/ccp/tee-dev.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Author: Rijo Thomas + * Author: Devaraj Rangasamy + * + */ + +/* This file describes the TEE communication interface between host and AMD + * Secure Processor + */ + +#ifndef __TEE_DEV_H__ +#define __TEE_DEV_H__ + +#include +#include + +#define TEE_DEFAULT_TIMEOUT 10 +#define MAX_BUFFER_SIZE 992 + +/** + * enum tee_ring_cmd_id - TEE interface commands for ring buffer configuration + * @TEE_RING_INIT_CMD: Initialize ring buffer + * @TEE_RING_DESTROY_CMD: Destroy ring buffer + * @TEE_RING_MAX_CMD: Maximum command id + */ +enum tee_ring_cmd_id { + TEE_RING_INIT_CMD = 0x00010000, + TEE_RING_DESTROY_CMD = 0x00020000, + TEE_RING_MAX_CMD = 0x000F0000, +}; + +/** + * struct tee_init_ring_cmd - Command to init TEE ring buffer + * @low_addr: bits [31:0] of the physical address of ring buffer + * @hi_addr: bits [63:32] of the physical address of ring buffer + * @size: size of ring buffer in bytes + */ +struct tee_init_ring_cmd { + u32 low_addr; + u32 hi_addr; + u32 size; +}; + +#define MAX_RING_BUFFER_ENTRIES 32 + +/** + * struct ring_buf_manager - Helper structure to manage ring buffer. 
+ * @ring_start: starting address of ring buffer + * @ring_size: size of ring buffer in bytes + * @ring_pa: physical address of ring buffer + * @wptr: index to the last written entry in ring buffer + */ +struct ring_buf_manager { + void *ring_start; + u32 ring_size; + phys_addr_t ring_pa; + u32 wptr; +}; + +struct psp_tee_device { + struct device *dev; + struct psp_device *psp; + void __iomem *io_regs; + struct tee_vdata *vdata; + struct ring_buf_manager rb_mgr; +}; + +/** + * enum tee_cmd_state - TEE command states for the ring buffer interface + * @TEE_CMD_STATE_INIT: initial state of command when sent from host + * @TEE_CMD_STATE_PROCESS: command being processed by TEE environment + * @TEE_CMD_STATE_COMPLETED: command processing completed + */ +enum tee_cmd_state { + TEE_CMD_STATE_INIT, + TEE_CMD_STATE_PROCESS, + TEE_CMD_STATE_COMPLETED, +}; + +/** + * struct tee_ring_cmd - Structure of the command buffer in TEE ring + * @cmd_id: refers to &enum tee_cmd_id. Command id for the ring buffer + * interface + * @cmd_state: refers to &enum tee_cmd_state + * @status: status of TEE command execution + * @res0: reserved region + * @pdata: private data (currently unused) + * @res1: reserved region + * @buf: TEE command specific buffer + */ +struct tee_ring_cmd { + u32 cmd_id; + u32 cmd_state; + u32 status; + u32 res0[1]; + u64 pdata; + u32 res1[2]; + u8 buf[MAX_BUFFER_SIZE]; + + /* Total size: 1024 bytes */ +} __packed; + +int tee_dev_init(struct psp_device *psp); +void tee_dev_destroy(struct psp_device *psp); + +#endif /* __TEE_DEV_H__ */ -- Gitee From 217948d0c0d7fcb9c871a8de967edec155dc778d Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Fri, 27 Dec 2019 10:54:01 +0530 Subject: [PATCH 021/257] tee: add AMD-TEE driver commit 757cc3e9ff1d72d014096399d6e2bf03974d9da1 upstream Adds AMD-TEE driver. * targets AMD APUs which has AMD Secure Processor with software-based Trusted Execution Environment (TEE) support * registers with TEE subsystem * defines tee_driver_ops function callbacks * kernel allocated memory is used as shared memory between normal world and secure world. 
* acts as REE (Rich Execution Environment) communication agent, which uses the services of AMD Secure Processor driver to submit commands for processing in TEE environment Cc: Ard Biesheuvel Cc: Tom Lendacky Acked-by: Jens Wiklander Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Reviewed-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/tee/Kconfig | 2 +- drivers/tee/Makefile | 1 + drivers/tee/amdtee/Kconfig | 8 + drivers/tee/amdtee/Makefile | 5 + drivers/tee/amdtee/amdtee_if.h | 183 ++++++++++ drivers/tee/amdtee/amdtee_private.h | 159 +++++++++ drivers/tee/amdtee/call.c | 373 ++++++++++++++++++++ drivers/tee/amdtee/core.c | 510 ++++++++++++++++++++++++++++ drivers/tee/amdtee/shm_pool.c | 93 +++++ include/uapi/linux/tee.h | 1 + 10 files changed, 1334 insertions(+), 1 deletion(-) create mode 100644 drivers/tee/amdtee/Kconfig create mode 100644 drivers/tee/amdtee/Makefile create mode 100644 drivers/tee/amdtee/amdtee_if.h create mode 100644 drivers/tee/amdtee/amdtee_private.h create mode 100644 drivers/tee/amdtee/call.c create mode 100644 drivers/tee/amdtee/core.c create mode 100644 drivers/tee/amdtee/shm_pool.c diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 676ffcb64985..19c796692aa5 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -14,7 +14,7 @@ if TEE menu "TEE drivers" source "drivers/tee/optee/Kconfig" - +source "drivers/tee/amdtee/Kconfig" endmenu endif diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile index 21f51fd88b07..68da044afbfa 100644 --- a/drivers/tee/Makefile +++ b/drivers/tee/Makefile @@ -4,3 +4,4 @@ tee-objs += tee_core.o tee-objs += tee_shm.o tee-objs += tee_shm_pool.o obj-$(CONFIG_OPTEE) += optee/ +obj-$(CONFIG_AMDTEE) += amdtee/ diff --git a/drivers/tee/amdtee/Kconfig b/drivers/tee/amdtee/Kconfig new file mode 100644 index 000000000000..4e32b6413b41 --- /dev/null +++ b/drivers/tee/amdtee/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: MIT +# AMD-TEE Trusted Execution Environment Configuration +config AMDTEE + tristate "AMD-TEE" + default m + depends on CRYPTO_DEV_SP_PSP + help + This implements AMD's Trusted Execution Environment (TEE) driver. diff --git a/drivers/tee/amdtee/Makefile b/drivers/tee/amdtee/Makefile new file mode 100644 index 000000000000..ff1485266117 --- /dev/null +++ b/drivers/tee/amdtee/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: MIT +obj-$(CONFIG_AMDTEE) += amdtee.o +amdtee-objs += core.o +amdtee-objs += call.o +amdtee-objs += shm_pool.o diff --git a/drivers/tee/amdtee/amdtee_if.h b/drivers/tee/amdtee/amdtee_if.h new file mode 100644 index 000000000000..ff48c3e47375 --- /dev/null +++ b/drivers/tee/amdtee/amdtee_if.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: MIT */ + +/* + * Copyright 2019 Advanced Micro Devices, Inc. + */ + +/* + * This file has definitions related to Host and AMD-TEE Trusted OS interface. + * These definitions must match the definitions on the TEE side. 
+ */ + +#ifndef AMDTEE_IF_H +#define AMDTEE_IF_H + +#include + +/***************************************************************************** + ** TEE Param + ******************************************************************************/ +#define TEE_MAX_PARAMS 4 + +/** + * struct memref - memory reference structure + * @buf_id: buffer ID of the buffer mapped by TEE_CMD_ID_MAP_SHARED_MEM + * @offset: offset in bytes from beginning of the buffer + * @size: data size in bytes + */ +struct memref { + u32 buf_id; + u32 offset; + u32 size; +}; + +struct value { + u32 a; + u32 b; +}; + +/* + * Parameters passed to open_session or invoke_command + */ +union tee_op_param { + struct memref mref; + struct value val; +}; + +struct tee_operation { + u32 param_types; + union tee_op_param params[TEE_MAX_PARAMS]; +}; + +/* Must be same as in GP TEE specification */ +#define TEE_OP_PARAM_TYPE_NONE 0 +#define TEE_OP_PARAM_TYPE_VALUE_INPUT 1 +#define TEE_OP_PARAM_TYPE_VALUE_OUTPUT 2 +#define TEE_OP_PARAM_TYPE_VALUE_INOUT 3 +#define TEE_OP_PARAM_TYPE_INVALID 4 +#define TEE_OP_PARAM_TYPE_MEMREF_INPUT 5 +#define TEE_OP_PARAM_TYPE_MEMREF_OUTPUT 6 +#define TEE_OP_PARAM_TYPE_MEMREF_INOUT 7 + +#define TEE_PARAM_TYPE_GET(t, i) (((t) >> ((i) * 4)) & 0xF) +#define TEE_PARAM_TYPES(t0, t1, t2, t3) \ + ((t0) | ((t1) << 4) | ((t2) << 8) | ((t3) << 12)) + +/***************************************************************************** + ** TEE Commands + *****************************************************************************/ + +/* + * The shared memory between rich world and secure world may be physically + * non-contiguous. Below structures are meant to describe a shared memory region + * via scatter/gather (sg) list + */ + +/** + * struct tee_sg_desc - sg descriptor for a physically contiguous buffer + * @low_addr: [in] bits[31:0] of buffer's physical address. Must be 4KB aligned + * @hi_addr: [in] bits[63:32] of the buffer's physical address + * @size: [in] size in bytes (must be multiple of 4KB) + */ +struct tee_sg_desc { + u32 low_addr; + u32 hi_addr; + u32 size; +}; + +/** + * struct tee_sg_list - structure describing a scatter/gather list + * @count: [in] number of sg descriptors + * @size: [in] total size of all buffers in the list. 
Must be multiple of 4KB + * @buf: [in] list of sg buffer descriptors + */ +#define TEE_MAX_SG_DESC 64 +struct tee_sg_list { + u32 count; + u32 size; + struct tee_sg_desc buf[TEE_MAX_SG_DESC]; +}; + +/** + * struct tee_cmd_map_shared_mem - command to map shared memory + * @buf_id: [out] return buffer ID value + * @sg_list: [in] list describing memory to be mapped + */ +struct tee_cmd_map_shared_mem { + u32 buf_id; + struct tee_sg_list sg_list; +}; + +/** + * struct tee_cmd_unmap_shared_mem - command to unmap shared memory + * @buf_id: [in] buffer ID of memory to be unmapped + */ +struct tee_cmd_unmap_shared_mem { + u32 buf_id; +}; + +/** + * struct tee_cmd_load_ta - load Trusted Application (TA) binary into TEE + * @low_addr: [in] bits [31:0] of the physical address of the TA binary + * @hi_addr: [in] bits [63:32] of the physical address of the TA binary + * @size: [in] size of TA binary in bytes + * @ta_handle: [out] return handle of the loaded TA + */ +struct tee_cmd_load_ta { + u32 low_addr; + u32 hi_addr; + u32 size; + u32 ta_handle; +}; + +/** + * struct tee_cmd_unload_ta - command to unload TA binary from TEE environment + * @ta_handle: [in] handle of the loaded TA to be unloaded + */ +struct tee_cmd_unload_ta { + u32 ta_handle; +}; + +/** + * struct tee_cmd_open_session - command to call TA_OpenSessionEntryPoint in TA + * @ta_handle: [in] handle of the loaded TA + * @session_info: [out] pointer to TA allocated session data + * @op: [in/out] operation parameters + * @return_origin: [out] origin of return code after TEE processing + */ +struct tee_cmd_open_session { + u32 ta_handle; + u32 session_info; + struct tee_operation op; + u32 return_origin; +}; + +/** + * struct tee_cmd_close_session - command to call TA_CloseSessionEntryPoint() + * in TA + * @ta_handle: [in] handle of the loaded TA + * @session_info: [in] pointer to TA allocated session data + */ +struct tee_cmd_close_session { + u32 ta_handle; + u32 session_info; +}; + +/** + * struct tee_cmd_invoke_cmd - command to call TA_InvokeCommandEntryPoint() in + * TA + * @ta_handle: [in] handle of the loaded TA + * @cmd_id: [in] TA command ID + * @session_info: [in] pointer to TA allocated session data + * @op: [in/out] operation parameters + * @return_origin: [out] origin of return code after TEE processing + */ +struct tee_cmd_invoke_cmd { + u32 ta_handle; + u32 cmd_id; + u32 session_info; + struct tee_operation op; + u32 return_origin; +}; + +#endif /*AMDTEE_IF_H*/ diff --git a/drivers/tee/amdtee/amdtee_private.h b/drivers/tee/amdtee/amdtee_private.h new file mode 100644 index 000000000000..d7f798c3394b --- /dev/null +++ b/drivers/tee/amdtee/amdtee_private.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: MIT */ + +/* + * Copyright 2019 Advanced Micro Devices, Inc. 
+ */ + +#ifndef AMDTEE_PRIVATE_H +#define AMDTEE_PRIVATE_H + +#include +#include +#include +#include +#include +#include "amdtee_if.h" + +#define DRIVER_NAME "amdtee" +#define DRIVER_AUTHOR "AMD-TEE Linux driver team" + +/* Some GlobalPlatform error codes used in this driver */ +#define TEEC_SUCCESS 0x00000000 +#define TEEC_ERROR_GENERIC 0xFFFF0000 +#define TEEC_ERROR_BAD_PARAMETERS 0xFFFF0006 +#define TEEC_ERROR_COMMUNICATION 0xFFFF000E + +#define TEEC_ORIGIN_COMMS 0x00000002 + +/* Maximum number of sessions which can be opened with a Trusted Application */ +#define TEE_NUM_SESSIONS 32 + +#define TA_LOAD_PATH "/amdtee" +#define TA_PATH_MAX 60 + +/** + * struct amdtee - main service struct + * @teedev: client device + * @pool: shared memory pool + */ +struct amdtee { + struct tee_device *teedev; + struct tee_shm_pool *pool; +}; + +/** + * struct amdtee_session - Trusted Application (TA) session related information. + * @ta_handle: handle to Trusted Application (TA) loaded in TEE environment + * @refcount: counter to keep track of sessions opened for the TA instance + * @session_info: an array pointing to TA allocated session data. + * @sess_mask: session usage bit-mask. If a particular bit is set, then the + * corresponding @session_info entry is in use or valid. + * + * Session structure is updated on open_session and this information is used for + * subsequent operations with the Trusted Application. + */ +struct amdtee_session { + struct list_head list_node; + u32 ta_handle; + struct kref refcount; + u32 session_info[TEE_NUM_SESSIONS]; + DECLARE_BITMAP(sess_mask, TEE_NUM_SESSIONS); + spinlock_t lock; /* synchronizes access to @sess_mask */ +}; + +/** + * struct amdtee_context_data - AMD-TEE driver context data + * @sess_list: Keeps track of sessions opened in current TEE context + */ +struct amdtee_context_data { + struct list_head sess_list; +}; + +struct amdtee_driver_data { + struct amdtee *amdtee; +}; + +struct shmem_desc { + void *kaddr; + u64 size; +}; + +/** + * struct amdtee_shm_data - Shared memory data + * @kaddr: Kernel virtual address of shared memory + * @buf_id: Buffer id of memory mapped by TEE_CMD_ID_MAP_SHARED_MEM + */ +struct amdtee_shm_data { + struct list_head shm_node; + void *kaddr; + u32 buf_id; +}; + +struct amdtee_shm_context { + struct list_head shmdata_list; +}; + +#define LOWER_TWO_BYTE_MASK 0x0000FFFF + +/** + * set_session_id() - Sets the session identifier. + * @ta_handle: [in] handle of the loaded Trusted Application (TA) + * @session_index: [in] Session index. Range: 0 to (TEE_NUM_SESSIONS - 1). + * @session: [out] Pointer to session id + * + * Lower two bytes of the session identifier represents the TA handle and the + * upper two bytes is session index. 
+ */ +static inline void set_session_id(u32 ta_handle, u32 session_index, + u32 *session) +{ + *session = (session_index << 16) | (LOWER_TWO_BYTE_MASK & ta_handle); +} + +static inline u32 get_ta_handle(u32 session) +{ + return session & LOWER_TWO_BYTE_MASK; +} + +static inline u32 get_session_index(u32 session) +{ + return (session >> 16) & LOWER_TWO_BYTE_MASK; +} + +int amdtee_open_session(struct tee_context *ctx, + struct tee_ioctl_open_session_arg *arg, + struct tee_param *param); + +int amdtee_close_session(struct tee_context *ctx, u32 session); + +int amdtee_invoke_func(struct tee_context *ctx, + struct tee_ioctl_invoke_arg *arg, + struct tee_param *param); + +int amdtee_cancel_req(struct tee_context *ctx, u32 cancel_id, u32 session); + +int amdtee_map_shmem(struct tee_shm *shm); + +void amdtee_unmap_shmem(struct tee_shm *shm); + +int handle_load_ta(void *data, u32 size, + struct tee_ioctl_open_session_arg *arg); + +int handle_unload_ta(u32 ta_handle); + +int handle_open_session(struct tee_ioctl_open_session_arg *arg, u32 *info, + struct tee_param *p); + +int handle_close_session(u32 ta_handle, u32 info); + +int handle_map_shmem(u32 count, struct shmem_desc *start, u32 *buf_id); + +void handle_unmap_shmem(u32 buf_id); + +int handle_invoke_cmd(struct tee_ioctl_invoke_arg *arg, u32 sinfo, + struct tee_param *p); + +struct tee_shm_pool *amdtee_config_shm(void); + +u32 get_buffer_id(struct tee_shm *shm); +#endif /*AMDTEE_PRIVATE_H*/ diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c new file mode 100644 index 000000000000..87ccad256686 --- /dev/null +++ b/drivers/tee/amdtee/call.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2019 Advanced Micro Devices, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include "amdtee_if.h" +#include "amdtee_private.h" + +static int tee_params_to_amd_params(struct tee_param *tee, u32 count, + struct tee_operation *amd) +{ + int i, ret = 0; + u32 type; + + if (!count) + return 0; + + if (!tee || !amd || count > TEE_MAX_PARAMS) + return -EINVAL; + + amd->param_types = 0; + for (i = 0; i < count; i++) { + /* AMD TEE does not support meta parameter */ + if (tee[i].attr > TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT) + return -EINVAL; + + amd->param_types |= ((tee[i].attr & 0xF) << i * 4); + } + + for (i = 0; i < count; i++) { + type = TEE_PARAM_TYPE_GET(amd->param_types, i); + pr_debug("%s: type[%d] = 0x%x\n", __func__, i, type); + + if (type == TEE_OP_PARAM_TYPE_INVALID) + return -EINVAL; + + if (type == TEE_OP_PARAM_TYPE_NONE) + continue; + + /* It is assumed that all values are within 2^32-1 */ + if (type > TEE_OP_PARAM_TYPE_VALUE_INOUT) { + u32 buf_id = get_buffer_id(tee[i].u.memref.shm); + + amd->params[i].mref.buf_id = buf_id; + amd->params[i].mref.offset = tee[i].u.memref.shm_offs; + amd->params[i].mref.size = tee[i].u.memref.size; + pr_debug("%s: bufid[%d] = 0x%x, offset[%d] = 0x%x, size[%d] = 0x%x\n", + __func__, + i, amd->params[i].mref.buf_id, + i, amd->params[i].mref.offset, + i, amd->params[i].mref.size); + } else { + if (tee[i].u.value.c) + pr_warn("%s: Discarding value c", __func__); + + amd->params[i].val.a = tee[i].u.value.a; + amd->params[i].val.b = tee[i].u.value.b; + pr_debug("%s: a[%d] = 0x%x, b[%d] = 0x%x\n", __func__, + i, amd->params[i].val.a, + i, amd->params[i].val.b); + } + } + return ret; +} + +static int amd_params_to_tee_params(struct tee_param *tee, u32 count, + struct tee_operation *amd) +{ + int i, ret = 0; + u32 type; + + if (!count) + return 0; + + if (!tee || !amd 
|| count > TEE_MAX_PARAMS) + return -EINVAL; + + /* Assumes amd->param_types is valid */ + for (i = 0; i < count; i++) { + type = TEE_PARAM_TYPE_GET(amd->param_types, i); + pr_debug("%s: type[%d] = 0x%x\n", __func__, i, type); + + if (type == TEE_OP_PARAM_TYPE_INVALID || + type > TEE_OP_PARAM_TYPE_MEMREF_INOUT) + return -EINVAL; + + if (type == TEE_OP_PARAM_TYPE_NONE || + type == TEE_OP_PARAM_TYPE_VALUE_INPUT || + type == TEE_OP_PARAM_TYPE_MEMREF_INPUT) + continue; + + /* + * It is assumed that buf_id remains unchanged for + * both open_session and invoke_cmd call + */ + if (type > TEE_OP_PARAM_TYPE_MEMREF_INPUT) { + tee[i].u.memref.shm_offs = amd->params[i].mref.offset; + tee[i].u.memref.size = amd->params[i].mref.size; + pr_debug("%s: bufid[%d] = 0x%x, offset[%d] = 0x%x, size[%d] = 0x%x\n", + __func__, + i, amd->params[i].mref.buf_id, + i, amd->params[i].mref.offset, + i, amd->params[i].mref.size); + } else { + /* field 'c' not supported by AMD TEE */ + tee[i].u.value.a = amd->params[i].val.a; + tee[i].u.value.b = amd->params[i].val.b; + tee[i].u.value.c = 0; + pr_debug("%s: a[%d] = 0x%x, b[%d] = 0x%x\n", + __func__, + i, amd->params[i].val.a, + i, amd->params[i].val.b); + } + } + return ret; +} + +int handle_unload_ta(u32 ta_handle) +{ + struct tee_cmd_unload_ta cmd = {0}; + int ret = 0; + u32 status; + + if (!ta_handle) + return -EINVAL; + + cmd.ta_handle = ta_handle; + + ret = psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, (void *)&cmd, + sizeof(cmd), &status); + if (!ret && status != 0) { + pr_err("unload ta: status = 0x%x\n", status); + ret = -EBUSY; + } + + return ret; +} + +int handle_close_session(u32 ta_handle, u32 info) +{ + struct tee_cmd_close_session cmd = {0}; + int ret = 0; + u32 status; + + if (ta_handle == 0) + return -EINVAL; + + cmd.ta_handle = ta_handle; + cmd.session_info = info; + + ret = psp_tee_process_cmd(TEE_CMD_ID_CLOSE_SESSION, (void *)&cmd, + sizeof(cmd), &status); + if (!ret && status != 0) { + pr_err("close session: status = 0x%x\n", status); + ret = -EBUSY; + } + + return ret; +} + +void handle_unmap_shmem(u32 buf_id) +{ + struct tee_cmd_unmap_shared_mem cmd = {0}; + int ret = 0; + u32 status; + + cmd.buf_id = buf_id; + + ret = psp_tee_process_cmd(TEE_CMD_ID_UNMAP_SHARED_MEM, (void *)&cmd, + sizeof(cmd), &status); + if (!ret) + pr_debug("unmap shared memory: buf_id %u status = 0x%x\n", + buf_id, status); +} + +int handle_invoke_cmd(struct tee_ioctl_invoke_arg *arg, u32 sinfo, + struct tee_param *p) +{ + struct tee_cmd_invoke_cmd cmd = {0}; + int ret = 0; + + if (!arg || (!p && arg->num_params)) + return -EINVAL; + + arg->ret_origin = TEEC_ORIGIN_COMMS; + + if (arg->session == 0) { + arg->ret = TEEC_ERROR_BAD_PARAMETERS; + return -EINVAL; + } + + ret = tee_params_to_amd_params(p, arg->num_params, &cmd.op); + if (ret) { + pr_err("invalid Params. 
Abort invoke command\n"); + arg->ret = TEEC_ERROR_BAD_PARAMETERS; + return ret; + } + + cmd.ta_handle = get_ta_handle(arg->session); + cmd.cmd_id = arg->func; + cmd.session_info = sinfo; + + ret = psp_tee_process_cmd(TEE_CMD_ID_INVOKE_CMD, (void *)&cmd, + sizeof(cmd), &arg->ret); + if (ret) { + arg->ret = TEEC_ERROR_COMMUNICATION; + } else { + ret = amd_params_to_tee_params(p, arg->num_params, &cmd.op); + if (unlikely(ret)) { + pr_err("invoke command: failed to copy output\n"); + arg->ret = TEEC_ERROR_GENERIC; + return ret; + } + arg->ret_origin = cmd.return_origin; + pr_debug("invoke command: RO = 0x%x ret = 0x%x\n", + arg->ret_origin, arg->ret); + } + + return ret; +} + +int handle_map_shmem(u32 count, struct shmem_desc *start, u32 *buf_id) +{ + struct tee_cmd_map_shared_mem *cmd; + phys_addr_t paddr; + int ret = 0, i; + u32 status; + + if (!count || !start || !buf_id) + return -EINVAL; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + /* Size must be page aligned */ + for (i = 0; i < count ; i++) { + if (!start[i].kaddr || (start[i].size & (PAGE_SIZE - 1))) { + ret = -EINVAL; + goto free_cmd; + } + + if ((u64)start[i].kaddr & (PAGE_SIZE - 1)) { + pr_err("map shared memory: page unaligned. addr 0x%llx", + (u64)start[i].kaddr); + ret = -EINVAL; + goto free_cmd; + } + } + + cmd->sg_list.count = count; + + /* Create buffer list */ + for (i = 0; i < count ; i++) { + paddr = __psp_pa(start[i].kaddr); + cmd->sg_list.buf[i].hi_addr = upper_32_bits(paddr); + cmd->sg_list.buf[i].low_addr = lower_32_bits(paddr); + cmd->sg_list.buf[i].size = start[i].size; + cmd->sg_list.size += cmd->sg_list.buf[i].size; + + pr_debug("buf[%d]:hi addr = 0x%x\n", i, + cmd->sg_list.buf[i].hi_addr); + pr_debug("buf[%d]:low addr = 0x%x\n", i, + cmd->sg_list.buf[i].low_addr); + pr_debug("buf[%d]:size = 0x%x\n", i, cmd->sg_list.buf[i].size); + pr_debug("list size = 0x%x\n", cmd->sg_list.size); + } + + *buf_id = 0; + + ret = psp_tee_process_cmd(TEE_CMD_ID_MAP_SHARED_MEM, (void *)cmd, + sizeof(*cmd), &status); + if (!ret && !status) { + *buf_id = cmd->buf_id; + pr_debug("mapped buffer ID = 0x%x\n", *buf_id); + } else { + pr_err("map shared memory: status = 0x%x\n", status); + ret = -ENOMEM; + } + +free_cmd: + kfree(cmd); + + return ret; +} + +int handle_open_session(struct tee_ioctl_open_session_arg *arg, u32 *info, + struct tee_param *p) +{ + struct tee_cmd_open_session cmd = {0}; + int ret = 0; + + if (!arg || !info || (!p && arg->num_params)) + return -EINVAL; + + arg->ret_origin = TEEC_ORIGIN_COMMS; + + if (arg->session == 0) { + arg->ret = TEEC_ERROR_GENERIC; + return -EINVAL; + } + + ret = tee_params_to_amd_params(p, arg->num_params, &cmd.op); + if (ret) { + pr_err("invalid Params. 
Abort open session\n"); + arg->ret = TEEC_ERROR_BAD_PARAMETERS; + return ret; + } + + cmd.ta_handle = get_ta_handle(arg->session); + *info = 0; + + ret = psp_tee_process_cmd(TEE_CMD_ID_OPEN_SESSION, (void *)&cmd, + sizeof(cmd), &arg->ret); + if (ret) { + arg->ret = TEEC_ERROR_COMMUNICATION; + } else { + ret = amd_params_to_tee_params(p, arg->num_params, &cmd.op); + if (unlikely(ret)) { + pr_err("open session: failed to copy output\n"); + arg->ret = TEEC_ERROR_GENERIC; + return ret; + } + arg->ret_origin = cmd.return_origin; + *info = cmd.session_info; + pr_debug("open session: session info = 0x%x\n", *info); + } + + pr_debug("open session: ret = 0x%x RO = 0x%x\n", arg->ret, + arg->ret_origin); + + return ret; +} + +int handle_load_ta(void *data, u32 size, struct tee_ioctl_open_session_arg *arg) +{ + struct tee_cmd_load_ta cmd = {0}; + phys_addr_t blob; + int ret = 0; + + if (size == 0 || !data || !arg) + return -EINVAL; + + blob = __psp_pa(data); + if (blob & (PAGE_SIZE - 1)) { + pr_err("load TA: page unaligned. blob 0x%llx", blob); + return -EINVAL; + } + + cmd.hi_addr = upper_32_bits(blob); + cmd.low_addr = lower_32_bits(blob); + cmd.size = size; + + ret = psp_tee_process_cmd(TEE_CMD_ID_LOAD_TA, (void *)&cmd, + sizeof(cmd), &arg->ret); + if (ret) { + arg->ret_origin = TEEC_ORIGIN_COMMS; + arg->ret = TEEC_ERROR_COMMUNICATION; + } else { + set_session_id(cmd.ta_handle, 0, &arg->session); + } + + pr_debug("load TA: TA handle = 0x%x, RO = 0x%x, ret = 0x%x\n", + cmd.ta_handle, arg->ret_origin, arg->ret); + + return 0; +} diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c new file mode 100644 index 000000000000..dd360f378ef0 --- /dev/null +++ b/drivers/tee/amdtee/core.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2019 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "amdtee_private.h" +#include "../tee_private.h" + +static struct amdtee_driver_data *drv_data; +static DEFINE_MUTEX(session_list_mutex); +static struct amdtee_shm_context shmctx; + +static void amdtee_get_version(struct tee_device *teedev, + struct tee_ioctl_version_data *vers) +{ + struct tee_ioctl_version_data v = { + .impl_id = TEE_IMPL_ID_AMDTEE, + .impl_caps = 0, + .gen_caps = TEE_GEN_CAP_GP, + }; + *vers = v; +} + +static int amdtee_open(struct tee_context *ctx) +{ + struct amdtee_context_data *ctxdata; + + ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL); + if (!ctxdata) + return -ENOMEM; + + INIT_LIST_HEAD(&ctxdata->sess_list); + INIT_LIST_HEAD(&shmctx.shmdata_list); + + ctx->data = ctxdata; + return 0; +} + +static void release_session(struct amdtee_session *sess) +{ + int i = 0; + + /* Close any open session */ + for (i = 0; i < TEE_NUM_SESSIONS; ++i) { + /* Check if session entry 'i' is valid */ + if (!test_bit(i, sess->sess_mask)) + continue; + + handle_close_session(sess->ta_handle, sess->session_info[i]); + } + + /* Unload Trusted Application once all sessions are closed */ + handle_unload_ta(sess->ta_handle); + kfree(sess); +} + +static void amdtee_release(struct tee_context *ctx) +{ + struct amdtee_context_data *ctxdata = ctx->data; + + if (!ctxdata) + return; + + while (true) { + struct amdtee_session *sess; + + sess = list_first_entry_or_null(&ctxdata->sess_list, + struct amdtee_session, + list_node); + + if (!sess) + break; + + list_del(&sess->list_node); + release_session(sess); + } + kfree(ctxdata); + + ctx->data = NULL; +} + +/** + * alloc_session() - Allocate a session structure + * @ctxdata: TEE Context data structure + * @session: Session ID for which 'struct amdtee_session' structure is to be + * allocated. + * + * Scans the TEE context's session list to check if TA is already loaded in to + * TEE. If yes, returns the 'session' structure for that TA. Else allocates, + * initializes a new 'session' structure and adds it to context's session list. + * + * The caller must hold a mutex. + * + * Returns: + * 'struct amdtee_session *' on success and NULL on failure. 
+ */ +static struct amdtee_session *alloc_session(struct amdtee_context_data *ctxdata, + u32 session) +{ + struct amdtee_session *sess; + u32 ta_handle = get_ta_handle(session); + + /* Scan session list to check if TA is already loaded in to TEE */ + list_for_each_entry(sess, &ctxdata->sess_list, list_node) + if (sess->ta_handle == ta_handle) { + kref_get(&sess->refcount); + return sess; + } + + /* Allocate a new session and add to list */ + sess = kzalloc(sizeof(*sess), GFP_KERNEL); + if (sess) { + sess->ta_handle = ta_handle; + kref_init(&sess->refcount); + spin_lock_init(&sess->lock); + list_add(&sess->list_node, &ctxdata->sess_list); + } + + return sess; +} + +/* Requires mutex to be held */ +static struct amdtee_session *find_session(struct amdtee_context_data *ctxdata, + u32 session) +{ + u32 ta_handle = get_ta_handle(session); + u32 index = get_session_index(session); + struct amdtee_session *sess; + + list_for_each_entry(sess, &ctxdata->sess_list, list_node) + if (ta_handle == sess->ta_handle && + test_bit(index, sess->sess_mask)) + return sess; + + return NULL; +} + +u32 get_buffer_id(struct tee_shm *shm) +{ + u32 buf_id = 0; + struct amdtee_shm_data *shmdata; + + list_for_each_entry(shmdata, &shmctx.shmdata_list, shm_node) + if (shmdata->kaddr == shm->kaddr) { + buf_id = shmdata->buf_id; + break; + } + + return buf_id; +} + +static DEFINE_MUTEX(drv_mutex); +static int copy_ta_binary(struct tee_context *ctx, void *ptr, void **ta, + size_t *ta_size) +{ + const struct firmware *fw; + char fw_name[TA_PATH_MAX]; + struct { + u32 lo; + u16 mid; + u16 hi_ver; + u8 seq_n[8]; + } *uuid = ptr; + int n = 0, rc = 0; + + n = snprintf(fw_name, TA_PATH_MAX, + "%s/%08x-%04x-%04x-%02x%02x%02x%02x%02x%02x%02x%02x.bin", + TA_LOAD_PATH, uuid->lo, uuid->mid, uuid->hi_ver, + uuid->seq_n[0], uuid->seq_n[1], + uuid->seq_n[2], uuid->seq_n[3], + uuid->seq_n[4], uuid->seq_n[5], + uuid->seq_n[6], uuid->seq_n[7]); + if (n < 0 || n >= TA_PATH_MAX) { + pr_err("failed to get firmware name\n"); + return -EINVAL; + } + + mutex_lock(&drv_mutex); + n = request_firmware(&fw, fw_name, &ctx->teedev->dev); + if (n) { + pr_err("failed to load firmware %s\n", fw_name); + rc = -ENOMEM; + goto unlock; + } + + *ta_size = roundup(fw->size, PAGE_SIZE); + *ta = (void *)__get_free_pages(GFP_KERNEL, get_order(*ta_size)); + if (IS_ERR(*ta)) { + pr_err("%s: get_free_pages failed 0x%llx\n", __func__, + (u64)*ta); + rc = -ENOMEM; + goto rel_fw; + } + + memcpy(*ta, fw->data, fw->size); +rel_fw: + release_firmware(fw); +unlock: + mutex_unlock(&drv_mutex); + return rc; +} + +int amdtee_open_session(struct tee_context *ctx, + struct tee_ioctl_open_session_arg *arg, + struct tee_param *param) +{ + struct amdtee_context_data *ctxdata = ctx->data; + struct amdtee_session *sess = NULL; + u32 session_info; + void *ta = NULL; + size_t ta_size; + int rc = 0, i; + + if (arg->clnt_login != TEE_IOCTL_LOGIN_PUBLIC) { + pr_err("unsupported client login method\n"); + return -EINVAL; + } + + rc = copy_ta_binary(ctx, &arg->uuid[0], &ta, &ta_size); + if (rc) { + pr_err("failed to copy TA binary\n"); + return rc; + } + + /* Load the TA binary into TEE environment */ + handle_load_ta(ta, ta_size, arg); + if (arg->ret == TEEC_SUCCESS) { + mutex_lock(&session_list_mutex); + sess = alloc_session(ctxdata, arg->session); + mutex_unlock(&session_list_mutex); + } + + if (arg->ret != TEEC_SUCCESS) + goto out; + + if (!sess) { + rc = -ENOMEM; + goto out; + } + + /* Find an empty session index for the given TA */ + spin_lock(&sess->lock); + i = 
find_first_zero_bit(sess->sess_mask, TEE_NUM_SESSIONS); + if (i < TEE_NUM_SESSIONS) + set_bit(i, sess->sess_mask); + spin_unlock(&sess->lock); + + if (i >= TEE_NUM_SESSIONS) { + pr_err("reached maximum session count %d\n", TEE_NUM_SESSIONS); + rc = -ENOMEM; + goto out; + } + + /* Open session with loaded TA */ + handle_open_session(arg, &session_info, param); + + if (arg->ret == TEEC_SUCCESS) { + sess->session_info[i] = session_info; + set_session_id(sess->ta_handle, i, &arg->session); + } else { + pr_err("open_session failed %d\n", arg->ret); + spin_lock(&sess->lock); + clear_bit(i, sess->sess_mask); + spin_unlock(&sess->lock); + } +out: + free_pages((u64)ta, get_order(ta_size)); + return rc; +} + +static void destroy_session(struct kref *ref) +{ + struct amdtee_session *sess = container_of(ref, struct amdtee_session, + refcount); + + /* Unload the TA from TEE */ + handle_unload_ta(sess->ta_handle); + mutex_lock(&session_list_mutex); + list_del(&sess->list_node); + mutex_unlock(&session_list_mutex); + kfree(sess); +} + +int amdtee_close_session(struct tee_context *ctx, u32 session) +{ + struct amdtee_context_data *ctxdata = ctx->data; + u32 i, ta_handle, session_info; + struct amdtee_session *sess; + + pr_debug("%s: sid = 0x%x\n", __func__, session); + + /* + * Check that the session is valid and clear the session + * usage bit + */ + mutex_lock(&session_list_mutex); + sess = find_session(ctxdata, session); + if (sess) { + ta_handle = get_ta_handle(session); + i = get_session_index(session); + session_info = sess->session_info[i]; + spin_lock(&sess->lock); + clear_bit(i, sess->sess_mask); + spin_unlock(&sess->lock); + } + mutex_unlock(&session_list_mutex); + + if (!sess) + return -EINVAL; + + /* Close the session */ + handle_close_session(ta_handle, session_info); + + kref_put(&sess->refcount, destroy_session); + + return 0; +} + +int amdtee_map_shmem(struct tee_shm *shm) +{ + struct shmem_desc shmem; + struct amdtee_shm_data *shmnode; + int rc, count; + u32 buf_id; + + if (!shm) + return -EINVAL; + + shmnode = kmalloc(sizeof(*shmnode), GFP_KERNEL); + if (!shmnode) + return -ENOMEM; + + count = 1; + shmem.kaddr = shm->kaddr; + shmem.size = shm->size; + + /* + * Send a MAP command to TEE and get the corresponding + * buffer Id + */ + rc = handle_map_shmem(count, &shmem, &buf_id); + if (rc) { + pr_err("map_shmem failed: ret = %d\n", rc); + kfree(shmnode); + return rc; + } + + shmnode->kaddr = shm->kaddr; + shmnode->buf_id = buf_id; + list_add(&shmnode->shm_node, &shmctx.shmdata_list); + + pr_debug("buf_id :[%x] kaddr[%p]\n", shmnode->buf_id, shmnode->kaddr); + + return 0; +} + +void amdtee_unmap_shmem(struct tee_shm *shm) +{ + u32 buf_id; + struct amdtee_shm_data *shmnode = NULL; + + if (!shm) + return; + + buf_id = get_buffer_id(shm); + /* Unmap the shared memory from TEE */ + handle_unmap_shmem(buf_id); + + list_for_each_entry(shmnode, &shmctx.shmdata_list, shm_node) + if (buf_id == shmnode->buf_id) { + list_del(&shmnode->shm_node); + kfree(shmnode); + break; + } +} + +int amdtee_invoke_func(struct tee_context *ctx, + struct tee_ioctl_invoke_arg *arg, + struct tee_param *param) +{ + struct amdtee_context_data *ctxdata = ctx->data; + struct amdtee_session *sess; + u32 i, session_info; + + /* Check that the session is valid */ + mutex_lock(&session_list_mutex); + sess = find_session(ctxdata, arg->session); + if (sess) { + i = get_session_index(arg->session); + session_info = sess->session_info[i]; + } + mutex_unlock(&session_list_mutex); + + if (!sess) + return -EINVAL; + + 
handle_invoke_cmd(arg, session_info, param); + + return 0; +} + +int amdtee_cancel_req(struct tee_context *ctx, u32 cancel_id, u32 session) +{ + return -EINVAL; +} + +static const struct tee_driver_ops amdtee_ops = { + .get_version = amdtee_get_version, + .open = amdtee_open, + .release = amdtee_release, + .open_session = amdtee_open_session, + .close_session = amdtee_close_session, + .invoke_func = amdtee_invoke_func, + .cancel_req = amdtee_cancel_req, +}; + +static const struct tee_desc amdtee_desc = { + .name = DRIVER_NAME "-clnt", + .ops = &amdtee_ops, + .owner = THIS_MODULE, +}; + +static int __init amdtee_driver_init(void) +{ + struct amdtee *amdtee = NULL; + struct tee_device *teedev; + struct tee_shm_pool *pool = ERR_PTR(-EINVAL); + int rc; + + drv_data = kzalloc(sizeof(*drv_data), GFP_KERNEL); + if (IS_ERR(drv_data)) + return -ENOMEM; + + amdtee = kzalloc(sizeof(*amdtee), GFP_KERNEL); + if (IS_ERR(amdtee)) { + rc = -ENOMEM; + goto err_kfree_drv_data; + } + + pool = amdtee_config_shm(); + if (IS_ERR(pool)) { + pr_err("shared pool configuration error\n"); + rc = PTR_ERR(pool); + goto err_kfree_amdtee; + } + + teedev = tee_device_alloc(&amdtee_desc, NULL, pool, amdtee); + if (IS_ERR(teedev)) { + rc = PTR_ERR(teedev); + goto err; + } + amdtee->teedev = teedev; + + rc = tee_device_register(amdtee->teedev); + if (rc) + goto err; + + amdtee->pool = pool; + + drv_data->amdtee = amdtee; + + pr_info("amd-tee driver initialization successful\n"); + return 0; + +err: + tee_device_unregister(amdtee->teedev); + if (pool) + tee_shm_pool_free(pool); + +err_kfree_amdtee: + kfree(amdtee); + +err_kfree_drv_data: + kfree(drv_data); + drv_data = NULL; + + pr_err("amd-tee driver initialization failed\n"); + return rc; +} +module_init(amdtee_driver_init); + +static void __exit amdtee_driver_exit(void) +{ + struct amdtee *amdtee; + + if (!drv_data || !drv_data->amdtee) + return; + + amdtee = drv_data->amdtee; + + tee_device_unregister(amdtee->teedev); + tee_shm_pool_free(amdtee->pool); +} +module_exit(amdtee_driver_exit); + +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION("AMD-TEE driver"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/tee/amdtee/shm_pool.c b/drivers/tee/amdtee/shm_pool.c new file mode 100644 index 000000000000..065854e2db18 --- /dev/null +++ b/drivers/tee/amdtee/shm_pool.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2019 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include "amdtee_private.h" + +static int pool_op_alloc(struct tee_shm_pool_mgr *poolm, struct tee_shm *shm, + size_t size) +{ + unsigned int order = get_order(size); + unsigned long va; + int rc; + + va = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!va) + return -ENOMEM; + + shm->kaddr = (void *)va; + shm->paddr = __psp_pa((void *)va); + shm->size = PAGE_SIZE << order; + + /* Map the allocated memory in to TEE */ + rc = amdtee_map_shmem(shm); + if (rc) { + free_pages(va, order); + shm->kaddr = NULL; + return rc; + } + + return 0; +} + +static void pool_op_free(struct tee_shm_pool_mgr *poolm, struct tee_shm *shm) +{ + /* Unmap the shared memory from TEE */ + amdtee_unmap_shmem(shm); + free_pages((unsigned long)shm->kaddr, get_order(shm->size)); + shm->kaddr = NULL; +} + +static void pool_op_destroy_poolmgr(struct tee_shm_pool_mgr *poolm) +{ + kfree(poolm); +} + +static const struct tee_shm_pool_mgr_ops pool_ops = { + .alloc = pool_op_alloc, + .free = pool_op_free, + .destroy_poolmgr = pool_op_destroy_poolmgr, +}; + +static struct tee_shm_pool_mgr *pool_mem_mgr_alloc(void) +{ + struct tee_shm_pool_mgr *mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); + + if (!mgr) + return ERR_PTR(-ENOMEM); + + mgr->ops = &pool_ops; + + return mgr; +} + +struct tee_shm_pool *amdtee_config_shm(void) +{ + struct tee_shm_pool_mgr *priv_mgr; + struct tee_shm_pool_mgr *dmabuf_mgr; + void *rc; + + rc = pool_mem_mgr_alloc(); + if (IS_ERR(rc)) + return rc; + priv_mgr = rc; + + rc = pool_mem_mgr_alloc(); + if (IS_ERR(rc)) { + tee_shm_pool_mgr_destroy(priv_mgr); + return rc; + } + dmabuf_mgr = rc; + + rc = tee_shm_pool_alloc(priv_mgr, dmabuf_mgr); + if (IS_ERR(rc)) { + tee_shm_pool_mgr_destroy(priv_mgr); + tee_shm_pool_mgr_destroy(dmabuf_mgr); + } + + return rc; +} diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 4b9eb064d7e7..6596f3a09e54 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -56,6 +56,7 @@ * TEE Implementation ID */ #define TEE_IMPL_ID_OPTEE 1 +#define TEE_IMPL_ID_AMDTEE 2 /* * OP-TEE specific capabilities -- Gitee From d7836d3d93f012c4bafd93dee5d50a34188d8dcc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 19 Jul 2022 11:13:28 -0500 Subject: [PATCH 022/257] crypto: ccp - Add support for new CCP/PSP device ID commit 96ec8dfdd094b7b2b015e092d581835a732949ff upstream Add a new CCP/PSP PCI device ID. This uses same register offsets as the previously supported structure. 
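[Illustrative sketch]
Because the new ID reuses existing register offsets, the whole change reduces to table plumbing: the PCI core matches the device ID, and .driver_data carries the address of a dev_vdata[] entry from which sp_pci_probe() picks the register map. The pattern, condensed from the hunks below (not a new hunk itself):

	/* dev_vdata[6] points at pspv3, i.e. the already-known offsets */
	{ PCI_VDEVICE(AMD, 0x15C7), (kernel_ulong_t)&dev_vdata[6] },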
Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Acked-by: Rijo Thomas Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/crypto/ccp/sp-pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 26061407a641..d4933960b36e 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -350,6 +350,12 @@ static const struct sp_dev_vdata dev_vdata[] = { .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP .psp_vdata = &pspv2, +#endif + }, + { /* 6 */ + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + .psp_vdata = &pspv3, #endif }, }; @@ -360,6 +366,7 @@ static const struct pci_device_id sp_pci_table[] = { { PCI_VDEVICE(AMD, 0x1486), (kernel_ulong_t)&dev_vdata[3] }, { PCI_VDEVICE(AMD, 0x15DF), (kernel_ulong_t)&dev_vdata[4] }, { PCI_VDEVICE(AMD, 0x14CA), (kernel_ulong_t)&dev_vdata[5] }, + { PCI_VDEVICE(AMD, 0x15C7), (kernel_ulong_t)&dev_vdata[6] }, /* Last entry must be zero */ { 0, } }; -- Gitee From e29cc42d2db1396bce0b7226102fb98c1495b3be Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 28 Sep 2022 13:45:05 -0500 Subject: [PATCH 023/257] crypto: ccp - Add support for TEE for PCI ID 0x14CA commit 10da230a4df1dfe32a58eb09246f5ffe82346f27 upstream SoCs containing 0x14CA are present both in datacenter parts that support SEV as well as client parts that support TEE. Cc: stable@vger.kernel.org # 5.15+ Tested-by: Rijo-john Thomas Signed-off-by: Mario Limonciello Acked-by: Tom Lendacky Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/crypto/ccp/sp-pci.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index d4933960b36e..aa7a94a49e34 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -304,6 +304,15 @@ static const struct psp_vdata pspv3 = { .inten_reg = 0x10690, .intsts_reg = 0x10694, }; + +static const struct psp_vdata pspv4 = { + .sev = &sevv2, + .tee = &teev1, + .feature_reg = 0x109fc, + .inten_reg = 0x10690, + .intsts_reg = 0x10694, +}; + #endif static const struct sp_dev_vdata dev_vdata[] = { @@ -349,7 +358,7 @@ static const struct sp_dev_vdata dev_vdata[] = { { /* 5 */ .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP - .psp_vdata = &pspv2, + .psp_vdata = &pspv4, #endif }, { /* 6 */ -- Gitee From 646b5713a56696ddb49139b304d846d24bd297b6 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 18 May 2023 22:24:13 -0500 Subject: [PATCH 024/257] crypto: ccp - Add support for PCI device 0x17E0 commit 4aa0931be8f0a2b1571dd24611b26602d2285a1a upstream PCI device 0x17E0 includes new TEE offsets, doesn't support a platform mailbox, and does support platform doorbell so introduce a new structure to represent it. [Backport Changes] 1. In file drivers/crypto/ccp/sp-dev.h, the struct platform_access_vdata definition was picked from commit 7ccc4f4e2e50, because it is required for supporting the newly introduced PCI device 0x17E0. 
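[Illustrative sketch]
The per-interface pointers in psp_vdata are what make this extensible: a layout like the pspv5 added below sets only the blocks the SoC actually exposes (.tee and .platform_access), leaves the rest NULL, and each sub-driver bails out when its block is absent. The consumer side already follows that pattern, e.g. in tee_dev_init() earlier in this series:

	/* Sketch of the NULL-keyed dispatch; tee_dev_init() does exactly
	 * this, so a part without a TEE block simply never initializes it.
	 */
	tee->vdata = (struct tee_vdata *)psp->vdata->tee;
	if (!tee->vdata)
		return -ENODEV;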
Signed-off-by: Mario Limonciello Signed-off-by: Herbert Xu Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- drivers/crypto/ccp/sp-dev.h | 10 ++++++++++ drivers/crypto/ccp/sp-pci.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 423594608ad1..e5f4fa58df2c 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -53,9 +53,19 @@ struct tee_vdata { const unsigned int ring_rptr_reg; }; +struct platform_access_vdata { + const unsigned int cmdresp_reg; + const unsigned int cmdbuff_addr_lo_reg; + const unsigned int cmdbuff_addr_hi_reg; + const unsigned int doorbell_button_reg; + const unsigned int doorbell_cmd_reg; + +}; + struct psp_vdata { const struct sev_vdata *sev; const struct tee_vdata *tee; + const struct platform_access_vdata *platform_access; const unsigned int feature_reg; const unsigned int inten_reg; const unsigned int intsts_reg; diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index aa7a94a49e34..a986d5e21ccc 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -282,6 +282,20 @@ static const struct tee_vdata teev1 = { .cmdbuff_addr_hi_reg = 0x1054c, .ring_wptr_reg = 0x10550, .ring_rptr_reg = 0x10554, + +}; + +static const struct tee_vdata teev2 = { + .cmdresp_reg = 0x10944, /* C2PMSG_17 */ + .cmdbuff_addr_lo_reg = 0x10948, /* C2PMSG_18 */ + .cmdbuff_addr_hi_reg = 0x1094c, /* C2PMSG_19 */ + .ring_wptr_reg = 0x10950, /* C2PMSG_20 */ + .ring_rptr_reg = 0x10954, /* C2PMSG_21 */ +}; + +static const struct platform_access_vdata pa_v2 = { + .doorbell_button_reg = 0x10a24, /* C2PMSG_73 */ + .doorbell_cmd_reg = 0x10a40, /* C2PMSG_80 */ }; static const struct psp_vdata pspv1 = { @@ -313,6 +327,14 @@ static const struct psp_vdata pspv4 = { .intsts_reg = 0x10694, }; +static const struct psp_vdata pspv5 = { + .tee = &teev2, + .platform_access = &pa_v2, + .feature_reg = 0x109fc, /* C2PMSG_63 */ + .inten_reg = 0x10510, /* P2CMSG_INTEN */ + .intsts_reg = 0x10514, /* P2CMSG_INTSTS */ +}; + #endif static const struct sp_dev_vdata dev_vdata[] = { @@ -365,6 +387,12 @@ static const struct sp_dev_vdata dev_vdata[] = { .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP .psp_vdata = &pspv3, +#endif + }, + { /* 7 */ + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + .psp_vdata = &pspv5, #endif }, }; @@ -376,6 +404,7 @@ static const struct pci_device_id sp_pci_table[] = { { PCI_VDEVICE(AMD, 0x15DF), (kernel_ulong_t)&dev_vdata[4] }, { PCI_VDEVICE(AMD, 0x14CA), (kernel_ulong_t)&dev_vdata[5] }, { PCI_VDEVICE(AMD, 0x15C7), (kernel_ulong_t)&dev_vdata[6] }, + { PCI_VDEVICE(AMD, 0x17E0), (kernel_ulong_t)&dev_vdata[7] }, /* Last entry must be zero */ { 0, } }; -- Gitee From 17b14335002b6112ae8924b6c25579893c31aaa4 Mon Sep 17 00:00:00 2001 From: John Allen Date: Thu, 18 May 2023 22:24:14 -0500 Subject: [PATCH 025/257] crypto: ccp - Add support for PCI device 0x156E commit bb4185e595e476d4ceccd366bca39762823c5a2e upstream Add a new CCP/PSP PCI device ID and new PSP register offsets. 
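[Illustrative sketch]
The C2PMSG_n annotations in the vdata tables of the last two patches follow a fixed stride on these parts: offset = 0x10900 + 4 * n. That reproduces every commented C2PMSG register here, e.g. C2PMSG_17 -> 0x10944 (teev2 cmdresp), C2PMSG_63 -> 0x109fc (feature_reg), C2PMSG_73 -> 0x10a24 (pa_v2 doorbell button). A helper making the arithmetic explicit (an assumption derived from the offsets above; the driver hard-codes each offset rather than using such a macro):

	#define C2PMSG(n)	(0x10900 + 4 * (n))	/* hypothetical helper */

	/* C2PMSG(63) == 0x109fc, matching .feature_reg in pspv4..pspv6 */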
Signed-off-by: John Allen Signed-off-by: Mario Limonciello Signed-off-by: Herbert Xu Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- drivers/crypto/ccp/sp-pci.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index a986d5e21ccc..6e70af6d91ed 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -335,6 +335,14 @@ static const struct psp_vdata pspv5 = { .intsts_reg = 0x10514, /* P2CMSG_INTSTS */ }; +static const struct psp_vdata pspv6 = { + .sev = &sevv2, + .tee = &teev2, + .feature_reg = 0x109fc, /* C2PMSG_63 */ + .inten_reg = 0x10510, /* P2CMSG_INTEN */ + .intsts_reg = 0x10514, /* P2CMSG_INTSTS */ +}; + #endif static const struct sp_dev_vdata dev_vdata[] = { @@ -393,6 +401,12 @@ static const struct sp_dev_vdata dev_vdata[] = { .bar = 2, #ifdef CONFIG_CRYPTO_DEV_SP_PSP .psp_vdata = &pspv5, +#endif + }, + { /* 8 */ + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_PSP + .psp_vdata = &pspv6, #endif }, }; @@ -405,6 +419,7 @@ static const struct pci_device_id sp_pci_table[] = { { PCI_VDEVICE(AMD, 0x14CA), (kernel_ulong_t)&dev_vdata[5] }, { PCI_VDEVICE(AMD, 0x15C7), (kernel_ulong_t)&dev_vdata[6] }, { PCI_VDEVICE(AMD, 0x17E0), (kernel_ulong_t)&dev_vdata[7] }, + { PCI_VDEVICE(AMD, 0x156E), (kernel_ulong_t)&dev_vdata[8] }, /* Last entry must be zero */ { 0, } }; -- Gitee From 8e3653c9c743aeedb423d96fe8fedc76c0a97a28 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 28 Sep 2019 16:53:56 +0200 Subject: [PATCH 026/257] x86/microcode/amd: Fix two -Wunused-but-set-variable warnings commit 2b730952066cd022d1f46e801f06ca6ca9878823 upstream The dummy variable is the high part of the microcode revision MSR which is defined as reserved. Mark it unused so that W=1 builds don't trigger the above warning. No functional changes. [Backport Changes] The deviation in the patch is due to a difference in line numbers. Signed-off-by: Borislav Petkov Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190928162559.26294-1-bp@alien8.de Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/kernel/cpu/microcode/amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a738f294ab7c..2a9ec49684eb 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -593,8 +593,8 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(unsigned int cpu) { - u32 rev, dummy; struct microcode_amd *mc; + u32 rev, dummy __always_unused; mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; @@ -700,7 +700,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy; + u32 rev, dummy __always_unused; BUG_ON(raw_smp_processor_id() != cpu); -- Gitee From 4d077410748aa09dcaa432648ba196bedd7b9e82 Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Fri, 6 May 2022 15:53:59 -0700 Subject: [PATCH 027/257] x86/microcode/intel: Expose collect_cpu_info_early() for IFS commit d3287fb0d3c8afdfd4870a6cd4a852abc9008b3b upstream IFS is a CPU feature that allows a binary blob, similar to microcode, to be loaded and consumed to perform low level validation of CPU circuitry. In fact, it carries the same Processor Signature (family/model/stepping) details that are contained in Intel microcode blobs. 
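Concretely, that signature is the EAX output of CPUID leaf 1. The standalone sketch below mirrors what the kernel's x86_family(), x86_model() and x86_stepping() helpers compute; the sample value is illustrative, not tied to any particular part:

    /* Sketch of processor-signature decoding (CPUID(1).EAX). */
    #include <stdio.h>

    static unsigned int sig_family(unsigned int sig)
    {
        unsigned int fam = (sig >> 8) & 0xf;

        if (fam == 0xf)                 /* extended family kicks in */
            fam += (sig >> 20) & 0xff;
        return fam;
    }

    static unsigned int sig_model(unsigned int sig)
    {
        unsigned int model = (sig >> 4) & 0xf;

        if (sig_family(sig) >= 0x6)     /* extended model kicks in */
            model += ((sig >> 16) & 0xf) << 4;
        return model;
    }

    int main(void)
    {
        unsigned int sig = 0x000806ec;  /* sample value only */

        printf("family 0x%x model 0x%x stepping 0x%x\n",
               sig_family(sig), sig_model(sig), sig & 0xf);
        return 0;
    }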
In support of an IFS driver to trigger loading, validation, and running of these test blobs, make the functionality of cpu_signatures_match() and collect_cpu_info_early() available outside of the microcode driver. Add an "intel_" prefix and drop the "_early" suffix from collect_cpu_info_early() and EXPORT_SYMBOL_GPL() it. Add declaration to x86 <asm/cpu.h>. Make cpu_signatures_match() an inline function in x86 <asm/cpu.h>, and also give it an "intel_" prefix. No functional change intended. Reviewed-by: Dan Williams Signed-off-by: Jithu Joseph Co-developed-by: Tony Luck Signed-off-by: Tony Luck Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220506225410.1652287-2-tony.luck@intel.com Signed-off-by: Hans de Goede Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/cpu.h | 18 ++++++++ arch/x86/kernel/cpu/intel.c | 32 +++++++++++++++ arch/x86/kernel/cpu/microcode/intel.c | 59 ++++----------------------- 3 files changed, 57 insertions(+), 52 deletions(-) diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index adc6cc86b062..258996697e58 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -40,4 +40,22 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); +struct ucode_cpu_info; + +int intel_cpu_collect_info(struct ucode_cpu_info *uci); + +static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) +{ + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; +} + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0f6f230323b9..4b5a819bb6aa 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -188,6 +188,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 896f456b3f5c..de7068461e38 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise.
*/ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -495,7 +450,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -597,7 +552,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -630,7 +585,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -699,7 +654,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) -- Gitee From 5c1a6b40d2aa041e7a606651d9b6d8eeac1bea69 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 19 Dec 2022 22:06:55 +0100 Subject: [PATCH 028/257] x86/microcode/AMD: Rename a couple of functions commit 61de9b7036f26448a1916291c456f62dd6bf07ea upstream - Rename apply_microcode_early_amd() to early_apply_microcode(): simplify the name so that it is clear what it does and when does it do it. - Rename __load_ucode_amd() to find_blobs_in_containers(): the new name actually explains what it does. Document some. No functional changes. 
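For background, the containers in question are microcode container files concatenated into the initrd, and the scan picks the blob matching the running family. The sketch below illustrates only the walk itself; the fixed-size layout is invented for illustration and is not the real container format:

    /* Conceptual sketch only: walk concatenated "containers" in a buffer
     * and return the first one for the wanted family. */
    #include <stddef.h>
    #include <stdio.h>

    struct fake_container {     /* hypothetical stand-in layout */
        unsigned int family;
        size_t size;            /* size of this container, incl. header */
    };

    static const struct fake_container *
    find_container(const void *buf, size_t len, unsigned int family)
    {
        const char *p = buf, *end = p + len;

        while (p + sizeof(struct fake_container) <= end) {
            const struct fake_container *c = (const void *)p;

            if (c->family == family)
                return c;
            p += c->size;       /* skip to the next concatenated container */
        }
        return NULL;
    }

    int main(void)
    {
        struct fake_container blobs[2] = {
            { .family = 0x17, .size = sizeof(blobs[0]) },
            { .family = 0x19, .size = sizeof(blobs[1]) },
        };
        const struct fake_container *c =
            find_container(blobs, sizeof(blobs), 0x19);

        printf("found container for family 0x%x\n", c ? c->family : 0);
        return 0;
    }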
Signed-off-by: Borislav Petkov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20221219210656.5140-1-bp@alien8.de Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/kernel/cpu/microcode/amd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2a9ec49684eb..84d07bdae4c4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -417,8 +417,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool -apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; @@ -481,7 +480,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; struct cpio_data cp; @@ -518,11 +517,11 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); } void load_ucode_amd_ap(unsigned int cpuid_1_eax) @@ -553,11 +552,11 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) } } - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -828,6 +827,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, return 0; } +/* Scan the blob in @data and add microcode patches to the cache. */ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { -- Gitee From 9f63a5807ee1caa1baca9e464037fa17a06f08bf Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 23:40:48 +0100 Subject: [PATCH 029/257] x86/CPU/AMD: Drop now unused CPU erratum checking function commit 05f5f73936fa4c1bc0a852702edf53789398d278 upstream Bye bye. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-14-bp@alien8.de Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/kernel/cpu/amd.c | 56 --------------------------------------- 1 file changed, 56 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 84ca77a2f943..5bc10df599df 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -34,62 +34,6 @@ */ static u32 nodes_per_socket = 1; -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). 
- * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; -- Gitee From ad70470421209ae1324eae349266af0196f40d48 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 7 Jun 2023 21:01:06 +0200 Subject: [PATCH 030/257] x86/microcode/AMD: Rip out static buffers commit 05e91e72113833385fb8c9a33bda9dbd93e27609 upstream Load straight from the containers (initrd or builtin, for example). There's no need to cache the patch per node. This even simplifies the code a bit with the opportunity for more cleanups later. [Backport Changes] 1. In arch/x86/kernel/cpu/microcode/core.c, within the function load_ucode_ap(), function call load_ucode_amd_ap() was renamed to load_ucode_amd_early() in the current upstream commit 05e91e721138. Accordingly, the existing load_ucode_amd_ap() call in switch case X86_VENDOR_HYGON, added in commit 871c9d196405b, was also updated to use the new load_ucode_amd_early() call. Updating this ensures continued support for Hygon family CPUs. 
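The reload path that survives this diet keeps the usual "apply only if newer" rule: read the current patch level, compare it with the cached patch's ID, and apply only on an upgrade. A minimal standalone sketch of that check, with the MSR accesses stubbed out:

    /* Sketch of the revision gate in the reworked reload path; cur_rev
     * and the patch ID are made-up values, and apply_patch() stands in
     * for the real microcode write. */
    #include <stdio.h>

    struct patch { unsigned int patch_id; };

    static unsigned int cur_rev = 0x08301034;   /* stand-in for rdmsr */

    static int apply_patch(const struct patch *p)
    {
        cur_rev = p->patch_id;                  /* stand-in for the update */
        return 0;                               /* 0 == success */
    }

    static void reload_ucode(const struct patch *p)
    {
        if (!p)
            return;
        if (cur_rev < p->patch_id && !apply_patch(p))
            printf("reload patch_level=0x%08x\n", cur_rev);
    }

    int main(void)
    {
        struct patch newer = { .patch_id = 0x08301055 };

        reload_ucode(&newer);
        return 0;
    }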
Signed-off-by: Borislav Petkov (AMD) Tested-by: John Allen Link: https://lore.kernel.org/r/20230720202813.3269888-1-john.allen@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/microcode_amd.h | 6 +- arch/x86/kernel/cpu/microcode/amd.c | 90 +++++++++------------------- arch/x86/kernel/cpu/microcode/core.c | 6 +- 3 files changed, 32 insertions(+), 70 deletions(-) diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index c6d83cee427d..df42512087e2 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -44,13 +44,11 @@ struct microcode_amd { #define PATCH_MAX_SIZE (3 * PAGE_SIZE) #ifdef CONFIG_MICROCODE_AMD -extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(unsigned int family); +extern void load_ucode_amd_early(unsigned int cpuid_1_eax); extern int __init save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); #else -static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(unsigned int family) {} +static inline void load_ucode_amd_early(unsigned int cpuid_1_eax) {} static inline int __init save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) {} diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 84d07bdae4c4..ff11a3777bae 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -56,9 +56,6 @@ struct cont_desc { static u32 ucode_new_rev; -/* One blob per node. */ -static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/x86/microcode.rst @@ -417,20 +414,17 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. 
*/ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) { struct cont_desc desc = { 0 }; - u8 (*patch)[PATCH_MAX_SIZE]; struct microcode_amd *mc; u32 rev, dummy, *new_rev; bool ret = false; #ifdef CONFIG_X86_32 new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -454,9 +448,6 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, boo if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; ret = true; - - if (save_patch) - memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE)); } return ret; @@ -513,7 +504,7 @@ static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret = cp; } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +static void apply_ucode_from_containers(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; @@ -521,42 +512,12 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size); } -void load_ucode_amd_ap(unsigned int cpuid_1_eax) +void load_ucode_amd_early(unsigned int cpuid_1_eax) { - struct microcode_amd *mc; - struct cpio_data cp; - u32 *new_rev, rev, dummy; - - if (IS_ENABLED(CONFIG_X86_32)) { - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - } else { - mc = (struct microcode_amd *)amd_ucode_patch; - new_rev = &ucode_new_rev; - } - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - /* - * Check whether a new patch has been saved already. Also, allow application of - * the same revision in order to pick up SMT-thread-specific configuration even - * if the sibling SMT thread already has an up-to-date revision. 
- */ - if (*new_rev && rev <= mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - *new_rev = mc->hdr.patch_id; - return; - } - } - - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) - return; - - early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); + return apply_ucode_from_containers(cpuid_1_eax); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -590,22 +551,6 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return 0; } -void reload_ucode_amd(unsigned int cpu) -{ - struct microcode_amd *mc; - u32 rev, dummy __always_unused; - - mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } - } -} static u16 __find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -670,6 +615,28 @@ static struct ucode_patch *find_patch(unsigned int cpu) return cache_find_patch(equiv_id); } +void reload_ucode_amd(unsigned int cpu) +{ + u32 rev, dummy __always_unused; + struct microcode_amd *mc; + struct ucode_patch *p; + + p = find_patch(cpu); + if (!p) + return; + + mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + if (rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); + } + } +} + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -890,9 +857,6 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz continue; ret = UCODE_NEW; - - memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); - memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); } return ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 4eced47ed17c..1b2fd31d0aa7 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -205,7 +205,7 @@ void __init load_ucode_bsp(void) if (intel) load_ucode_intel_bsp(); else - load_ucode_amd_bsp(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); } static bool check_loader_disabled_ap(void) @@ -233,10 +233,10 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_ap(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); break; case X86_VENDOR_HYGON: - load_ucode_amd_ap(cpuid_1_eax); + load_ucode_amd_early(cpuid_1_eax); break; default: break; -- Gitee From ed96def278038e8308f18243a490267134d3a7e4 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 8 Aug 2023 22:52:42 -0500 Subject: [PATCH 031/257] x86/amd_nb: Add PCI IDs for AMD Family 1Ah-based models commit c64016609b6f66b753b5f37929a191477fa584c0 upstream Add new PCI Device IDs required to support AMD's new Family 1Ah-based models 00h-1Fh, 20h and 40h-4Fh. [ bp: Zap a useless sentence. 
] Co-developed-by: Mario Limonciello Signed-off-by: Mario Limonciello Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230809035244.2722455-2-avadhut.naik@amd.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- arch/x86/kernel/amd_nb.c | 8 ++++++++ include/linux/pci_ids.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index d1afddf9a140..cdeba6e3b2a7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -26,7 +26,10 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a +#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 +#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 #define PCI_DEVICE_ID_HYGON_18H_M05H_ROOT 0x14a0 #define PCI_DEVICE_ID_HYGON_18H_M10H_ROOT 0x14c0 @@ -48,6 +51,8 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, {} }; @@ -70,6 +75,8 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -88,6 +95,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, {} }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2a87de3c0b3e..e0c6235ef0be 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -556,6 +556,8 @@ #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0 +#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3 +#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- Gitee From 413e5d2ce740b09a41a4c58e4bd21c138ec4a0c6 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 8 Aug 2023 22:52:43 -0500 Subject: [PATCH 032/257] hwmon: (k10temp) Add thermal support for AMD Family 1Ah-based models commit 3cd9da416d5b625d52053c816ee91daf4e97007c upstream Add thermal info support for AMD Family 1Ah-based models. Support is provided on a per-socket granularity. 
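For reference, the Zen path that Family 1Ah is routed through (read_tempreg_nb_zen) derives the reported temperature from the upper bits of the SMU temperature register. A sketch of that decoding, assuming the usual Zen layout (bits 31:21 in 0.125 degC steps, with a 49 degC offset when the range-select bit is set); the register value is fabricated:

    /* Sketch, not driver code: decode a Zen reported-temperature value
     * into millidegrees C. Constants assume the common Zen layout. */
    #include <stdio.h>

    #define CUR_TEMP_SHIFT      21
    #define CUR_TEMP_RANGE_SEL  (1u << 19)

    static int zen_temp_millideg(unsigned int regval)
    {
        int temp = (regval >> CUR_TEMP_SHIFT) * 125;    /* 0.125 degC units */

        if (regval & CUR_TEMP_RANGE_SEL)
            temp -= 49000;                              /* extended range */
        return temp;
    }

    int main(void)
    {
        unsigned int regval = 400u << CUR_TEMP_SHIFT;   /* 400 * 0.125 = 50 degC */

        printf("%d millidegrees C\n", zen_temp_millideg(regval));
        return 0;
    }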
Co-developed-by: Mario Limonciello Signed-off-by: Mario Limonciello Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20230809035244.2722455-3-avadhut.naik@amd.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/hwmon/k10temp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index e08bb92d3451..47595a5beca1 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -66,7 +66,7 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4 -/* Common for Zen CPU families (Family 17h and 18h and 19h) */ +/* Common for Zen CPU families (Family 17h and 18h and 19h and 1Ah) */ #define ZEN_REPORTED_TEMP_CTRL_BASE 0x00059800 #define ZEN_CCD_TEMP(offset, x) (ZEN_REPORTED_TEMP_CTRL_BASE + \ @@ -512,6 +512,10 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id) break; } + } else if (boot_cpu_data.x86 == 0x1a) { + data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK; + data->read_tempreg = read_tempreg_nb_zen; + data->is_zen = true; } else { data->read_htcreg = read_htcreg_pci; data->read_tempreg = read_tempreg_pci; @@ -552,6 +556,8 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_HYGON_18H_M05H_DF_F3) }, -- Gitee From 04d18a1d05c6294d4be8d889a013b840d685794b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Mar 2020 14:13:55 +0100 Subject: [PATCH 033/257] EDAC: Convert to new X86 CPU match macros commit 298426211c4b36e1e2475deb941f8fa59d6686c6 upstream The new macro set has a consistent namespace and uses C99 initializers instead of the grufty C89 ones. [Backport Changes] 1. In file drivers/edac/i10nm_base.c, within the struct x86_cpu_id instance, i.e., i10nm_cpuids, while replacing the legacy struct initializers with X86_MATCH_INTEL_FAM6_MODEL, the i10nm_cfg pointer was retained. 2. In file drivers/edac/skx_base.c, while updating the entries for the skx_cpuids instance of struct x86_cpu_id, to use X86_MATCH_INTEL_FAM6_MODEL, the skx_cfg pointer was also retained. 
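To see the equivalence, the new macro is roughly a designated-initializer spelling of the old positional row removed below. The snippet uses local mimic types, not the real <asm/cpu_device_id.h> definitions, purely to illustrate the expansion; the ICELAKE_X model value is copied from the kernel's Intel family list:

    /* Mimic types for illustration only. */
    struct mimic_x86_cpu_id {
        unsigned short vendor, family, model, feature;
        unsigned long driver_data;
    };

    #define MIMIC_VENDOR_INTEL      0
    #define MIMIC_FAM6_ICELAKE_X    0x6A
    #define MIMIC_FEATURE_ANY       0

    #define MIMIC_MATCH_INTEL_FAM6_MODEL(model, data) {     \
        .vendor      = MIMIC_VENDOR_INTEL,                  \
        .family      = 6,                                   \
        .model       = MIMIC_FAM6_##model,                  \
        .feature     = MIMIC_FEATURE_ANY,                   \
        .driver_data = (unsigned long)(data),               \
    }

    static int i10nm_cfg0;      /* placeholder payload */

    static const struct mimic_x86_cpu_id ids[] = {
        MIMIC_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &i10nm_cfg0),
        { 0 }
    };

    int main(void) { return !(ids[0].model == 0x6A); }

The C99 form fills the same five fields the old C89 row did, but by name, which is what makes the conversion mechanical.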
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Acked-by: Tony Luck Link: https://lkml.kernel.org/r/20200320131509.673579000@linutronix.de Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 14 +++++++------- drivers/edac/i10nm_base.c | 8 ++++---- drivers/edac/pnd2_edac.c | 4 ++-- drivers/edac/sb_edac.c | 14 +++++++------- drivers/edac/skx_base.c | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5659ab477eba..cee9e4db681c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3885,13 +3885,13 @@ static void setup_pci_device(void) } static const struct x86_cpu_id amd64_cpuids[] = { - { X86_VENDOR_AMD, 0xF, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_AMD, 0x10, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_AMD, 0x15, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_AMD, 0x16, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_AMD, 0x17, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_HYGON, 0x18, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, - { X86_VENDOR_AMD, 0x19, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, + X86_MATCH_VENDOR_FAM(AMD, 0x0F, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x10, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x15, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x16, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL), + X86_MATCH_VENDOR_FAM(HYGON, 0x18, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), { } }; MODULE_DEVICE_TABLE(x86cpu, amd64_cpuids); diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 6ab09bf26bab..96a790d04531 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -255,10 +255,10 @@ static struct res_config i10nm_cfg1 = { }; static const struct x86_cpu_id i10nm_cpuids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_TREMONT_D, 0, (kernel_ulong_t)&i10nm_cfg0 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_X, 0, (kernel_ulong_t)&i10nm_cfg0 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ICELAKE_D, 0, (kernel_ulong_t)&i10nm_cfg1 }, - { } + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &i10nm_cfg0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &i10nm_cfg0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &i10nm_cfg1), + {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index ef6f882a870b..c6194b97b667 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1538,8 +1538,8 @@ static struct dunit_ops dnv_ops = { }; static const struct x86_cpu_id pnd2_cpuids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT, 0, (kernel_ulong_t)&apl_ops }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_D, 0, (kernel_ulong_t)&dnv_ops }, + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &apl_ops), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &dnv_ops), { } }; MODULE_DEVICE_TABLE(x86cpu, pnd2_cpuids); diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 58a772ef437b..ecbc7b9645f3 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3423,13 +3423,13 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) } static const struct x86_cpu_id sbridge_cpuids[] = { - INTEL_CPU_FAM6(SANDYBRIDGE_X, pci_dev_descr_sbridge_table), - INTEL_CPU_FAM6(IVYBRIDGE_X, pci_dev_descr_ibridge_table), - INTEL_CPU_FAM6(HASWELL_X, pci_dev_descr_haswell_table), - INTEL_CPU_FAM6(BROADWELL_X, pci_dev_descr_broadwell_table), - INTEL_CPU_FAM6(BROADWELL_D, pci_dev_descr_broadwell_table), - 
INTEL_CPU_FAM6(XEON_PHI_KNL, pci_dev_descr_knl_table), - INTEL_CPU_FAM6(XEON_PHI_KNM, pci_dev_descr_knl_table), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &pci_dev_descr_sbridge_table), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &pci_dev_descr_ibridge_table), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &pci_dev_descr_haswell_table), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &pci_dev_descr_broadwell_table), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &pci_dev_descr_broadwell_table), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &pci_dev_descr_knl_table), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &pci_dev_descr_knl_table), { } }; MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 9130da0f7421..1c9df95ac96e 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -164,7 +164,7 @@ static struct res_config skx_cfg = { }; static const struct x86_cpu_id skx_cpuids[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X, 0, (kernel_ulong_t)&skx_cfg}, + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_cfg), { } }; MODULE_DEVICE_TABLE(x86cpu, skx_cpuids); -- Gitee From 6d162e6702f37f31c9ab2c275a8fbf9778bed7a3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 12 Dec 2020 15:20:28 +0100 Subject: [PATCH 034/257] EDAC/amd64: Do not load on family 0x15, model 0x13 commit 6c13d7ff81e6d2f01f62ccbfa49d1b8d87f274d0 upstream Those were only laptops and are very very unlikely to have ECC memory. Currently, when the driver attempts to load, it issues: EDAC amd64: Error: F1 not found: device 0x1601 (broken BIOS?) because the PCI device is the wrong one (it uses the F15h default one). So do not load the driver on them as that is pointless. Reported-by: Don Curtis Signed-off-by: Borislav Petkov Tested-by: Don Curtis Link: http://bugzilla.opensuse.org/show_bug.cgi?id=1179763 Link: https://lkml.kernel.org/r/20201218160622.20146-1-bp@alien8.de Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cee9e4db681c..926ed7deb2c7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3547,10 +3547,13 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) fam_type = &family_types[F15_M60H_CPUS]; pvt->ops = &family_types[F15_M60H_CPUS].ops; break; + /* Richland is only client */ + } else if (pvt->model == 0x13) { + return NULL; + } else { + fam_type = &family_types[F15_CPUS]; + pvt->ops = &family_types[F15_CPUS].ops; } - - fam_type = &family_types[F15_CPUS]; - pvt->ops = &family_types[F15_CPUS].ops; break; case 0x16: @@ -3786,6 +3789,7 @@ static int probe_one_instance(unsigned int nid) pvt->mc_node_id = nid; pvt->F3 = F3; + ret = -ENODEV; fam_type = per_family_init(pvt); if (!fam_type) goto err_enable; -- Gitee From 27c4751dfde5acc98aef94b610793090d848f21a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 15 Dec 2020 17:01:31 +0000 Subject: [PATCH 035/257] EDAC/amd64: Tone down messages about missing PCI IDs commit 6a4afe38787d7556948b026652a6770e46ce0f6d upstream Give these messages a debug severity as they are really only useful to the module developers. Also, drop the "(broken BIOS?)" phrase, since this can cause churn for BIOS folks. The PCI IDs needed by the module, at least on modern systems, are fixed in hardware. 
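Note that edac_dbg() output is compiled in only on CONFIG_EDAC_DEBUG builds and further gated by a runtime debug level, so module developers still see these lookup failures while ordinary users do not. A toy model of such a leveled macro (the names and threshold default here are illustrative, and it relies on the GNU-style ##__VA_ARGS__ extension):

    #include <stdio.h>

    #define DEBUG_BUILD 1           /* stand-in for a debug config option */
    static int debug_level = 1;     /* stand-in for a runtime knob */

    #define dbg(level, fmt, ...)                                    \
        do {                                                        \
            if (DEBUG_BUILD && (level) <= debug_level)              \
                fprintf(stderr, "dbg%d: " fmt, level,               \
                        ##__VA_ARGS__);                             \
        } while (0)

    int main(void)
    {
        dbg(1, "F1 not found: device 0x%x\n", 0x1601);  /* printed */
        dbg(3, "verbose detail\n");                     /* suppressed */
        return 0;
    }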
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201215170131.8496-1-Yazen.Ghannam@amd.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 926ed7deb2c7..8ebe1a99a014 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2842,7 +2842,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) if (pvt->umc) { pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F0) { - amd64_err("F0 not found, device 0x%x (broken BIOS?)\n", pci_id1); + edac_dbg(1, "F0 not found, device 0x%x\n", pci_id1); return -ENODEV; } @@ -2851,7 +2851,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) pci_dev_put(pvt->F0); pvt->F0 = NULL; - amd64_err("F6 not found: device 0x%x (broken BIOS?)\n", pci_id2); + edac_dbg(1, "F6 not found: device 0x%x\n", pci_id2); return -ENODEV; } @@ -2868,7 +2868,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F1) { - amd64_err("F1 not found: device 0x%x (broken BIOS?)\n", pci_id1); + edac_dbg(1, "F1 not found: device 0x%x\n", pci_id1); return -ENODEV; } @@ -2878,7 +2878,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) pci_dev_put(pvt->F1); pvt->F1 = NULL; - amd64_err("F2 not found: device 0x%x (broken BIOS?)\n", pci_id2); + edac_dbg(1, "F2 not found: device 0x%x\n", pci_id2); return -ENODEV; } -- Gitee From ab25a71b712a0bfd2f2f780a4768759945f1b65f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 13 Jan 2021 20:13:30 +0100 Subject: [PATCH 036/257] EDAC/amd64: Issue probing messages only on properly detected hardware commit 4cbcb73b1c7a73a13b336e0c048601d6e8eecaa8 upstream amd64_edac was converted to CPU family autoprobing (from PCI device IDs) to not have to add a new PCI device ID each time a new platform is shipped but to support the whole family out-of-the-box. However, this caused a lot of noise in dmesg even when the machine doesn't have ECC DIMMs or ECC has been disabled in the BIOS: EDAC MC: Ver: 3.0.0 EDAC amd64: F17h detected (node 0). EDAC amd64: Node 0: DRAM ECC disabled. EDAC amd64: F17h detected (node 1). EDAC amd64: Node 1: DRAM ECC disabled. EDAC amd64: F17h detected (node 2). EDAC amd64: Node 2: DRAM ECC disabled. EDAC amd64: F17h detected (node 3). EDAC amd64: Node 3: DRAM ECC disabled. EDAC amd64: F17h detected (node 4). EDAC amd64: Node 4: DRAM ECC disabled. EDAC amd64: F17h detected (node 5). EDAC amd64: Node 5: DRAM ECC disabled. EDAC amd64: F17h detected (node 6). EDAC amd64: Node 6: DRAM ECC disabled. EDAC amd64: F17h detected (node 7). EDAC amd64: Node 7: DRAM ECC disabled. or even $ grep EDAC dmesg.log | sed 's/\[.*\] //' | sort | uniq -c 128 EDAC amd64: F17h detected (node 0). 128 EDAC amd64: Node 0: DRAM ECC disabled. 1 EDAC MC: Ver: 3.0.0 on a big machine. Yap, that's once per CPU for 128 of them. So move the init messages after all probing has succeeded to avoid unnecessary spew in dmesg. 
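The underlying fix is the common "log after success" pattern: defer the detection message until the whole per-node probe has succeeded, so ECC-less or failing nodes stay silent. A condensed sketch, with error handling and names simplified:

    /* Illustrative only: report detection once probing succeeds. */
    #include <stdio.h>

    static int ecc_enabled(int nid) { return nid == 0; }    /* pretend node 0 has ECC */

    static int probe_one_instance(int nid)
    {
        if (!ecc_enabled(nid))
            return -19;     /* -ENODEV: bail quietly */

        /* ... gather hardware info, register the MC ... */

        printf("EDAC amd64: F17h detected (node %d).\n", nid);  /* only now */
        return 0;
    }

    int main(void)
    {
        for (int nid = 0; nid < 8; nid++)
            probe_one_instance(nid);
        return 0;
    }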
Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210119164141.17417-1-bp@alien8.de Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8ebe1a99a014..723ab29bfde9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3449,8 +3449,7 @@ static bool ecc_enabled(struct amd64_pvt *pvt) MSR_IA32_MCG_CTL, nid); } - amd64_info("Node %d: DRAM ECC %s.\n", - nid, (ecc_en ? "enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); if (!ecc_en || !nb_mce_en) return false; @@ -3652,11 +3651,6 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) return NULL; } - amd64_info("%s %sdetected (node %d).\n", fam_type->ctl_name, - (pvt->fam == 0xf ? - (pvt->ext_model >= K8_REV_F ? "revF or later " - : "revE or earlier ") - : ""), pvt->mc_node_id); return fam_type; } @@ -3830,6 +3824,12 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } + amd64_info("%s %sdetected (node %d).\n", fam_type->ctl_name, + (pvt->fam == 0xf ? + (pvt->ext_model >= K8_REV_F ? "revF or later " + : "revE or earlier ") + : ""), pvt->mc_node_id); + dump_misc_regs(pvt); return ret; -- Gitee From 6f88ab864120ef966e8b52690252747a5ea05c8a Mon Sep 17 00:00:00 2001 From: Marc Bevand Date: Tue, 21 Dec 2021 15:31:12 -0800 Subject: [PATCH 037/257] EDAC/amd64: Add support for family 19h, models 50h-5fh commit 0b8bf9cb142da59a14622bba168ebcd6d0a54499 upstream Add the new family 19h models 50h-5fh PCI IDs (device 18h functions 0 and 6) to support Ryzen 5000 APUs ("Cezanne"). Signed-off-by: Marc Bevand Signed-off-by: Borislav Petkov Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20211221233112.556927-1-m@zorinaq.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 15 +++++++++++++++ drivers/edac/amd64_edac.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 723ab29bfde9..221c21057ef3 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2510,6 +2510,16 @@ static struct amd64_family_type family_types[] = { .dbam_to_cs = f17_addr_mask_to_cs_size, } }, + [F19_M50H_CPUS] = { + .ctl_name = "F19h_M50h", + .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, + .f6_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F6, + .max_mcs = 2, + .ops = { + .early_channel_count = f17_early_channel_count, + .dbam_to_cs = f17_addr_mask_to_cs_size, + } + }, }; /* @@ -3635,6 +3645,11 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) pvt->ops = &family_types[F17_M70H_CPUS].ops; fam_type->ctl_name = "F19h_M20h"; break; + } else if (pvt->model >= 0x50 && pvt->model <= 0x5f) { + fam_type = &family_types[F19_M50H_CPUS]; + pvt->ops = &family_types[F19_M50H_CPUS].ops; + fam_type->ctl_name = "F19h_M50h"; + break; } else if (pvt->model >= 0xa0 && pvt->model <= 0xaf) { fam_type = &family_types[F19_M10H_CPUS]; pvt->ops = &family_types[F19_M10H_CPUS].ops; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ac2d939fb984..59ec8851a06e 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -128,6 +128,8 @@ #define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3 +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 
0x1670 #define PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0 0x14b0 #define PCI_DEVICE_ID_HYGON_18H_M06H_DF_F6 0x14b6 @@ -311,6 +313,7 @@ enum amd_families { F18_M10H_CPUS, F19_CPUS, F19_M10H_CPUS, + F19_M50H_CPUS, NUM_FAMILIES, }; -- Gitee From de5115b4b972fc927b576ca94c6a47806198aa3a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:03:58 +0000 Subject: [PATCH 038/257] EDAC/amd64: Don't set up EDAC PCI control on Family 17h+ commit fdce765a13384cf411d456472d396d8db2b3114f upstream EDAC PCI control is used to detect/report legacy PCI errors like "Parity" and "SERROR". Modern AMD systems use PCIe Advanced Error Reporting (AER), and legacy PCI errors should not be reported. Remove EDAC PCI control setup on AMD Family 17h and later systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-2-yazen.ghannam@amd.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 221c21057ef3..0da152c607b5 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3968,12 +3968,12 @@ static int __init amd64_edac_init(void) if (report_gart_errors) amd_report_gart_errors(true); - if (boot_cpu_data.x86 >= 0x17) + if (boot_cpu_data.x86 >= 0x17) { amd_register_ecc_decoder(decode_umc_error); - else + } else { amd_register_ecc_decoder(decode_bus_error); - - setup_pci_device(); + setup_pci_device(); + } #ifdef CONFIG_X86_32 amd64_err("%s on 32-bit is unsupported. USE AT YOUR OWN RISK!\n", EDAC_MOD_STR); -- Gitee From 43ec28c80c631f6a552ba65d3657a305d0300196 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:03:59 +0000 Subject: [PATCH 039/257] EDAC/amd64: Remove scrub rate control for Family 17h and later commit 6e241bc93c9f0ae6f76ed65b44132948ca70fd76 upstream The scrub registers on AMD Family 17h and later may be inaccessible to the OS. Furthermore, hardware designers recommend that the scrubbing feature is managed by the firmware. Remove support for the sdram_scrub_rate interface for AMD Family 17h systems and later by not setting the scrub function pointers. The EDAC MC core will then not expose the scrub files in sysfs. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-3-yazen.ghannam@amd.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 33 +++++---------------------------- drivers/edac/amd64_edac.h | 2 -- 2 files changed, 5 insertions(+), 30 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 0da152c607b5..ac1e7778dd6e 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -196,21 +196,6 @@ static inline int amd64_read_dct_pci_cfg(struct amd64_pvt *pvt, u8 dct, * other archs, we might not have access to the caches directly. */ -static inline void __f17h_set_scrubval(struct amd64_pvt *pvt, u32 scrubval) -{ - /* - * Fam17h supports scrub values between 0x5 and 0x14. Also, the values - * are shifted down by 0x5, so scrubval 0x5 is written to the register - * as 0x0, scrubval 0x6 as 0x1, etc. 
- */ - if (scrubval >= 0x5 && scrubval <= 0x14) { - scrubval -= 0x5; - pci_write_bits32(pvt->F6, F17H_SCR_LIMIT_ADDR, scrubval, 0xF); - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 1, 0x1); - } else { - pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 0, 0x1); - } -} /* * Scan the scrub rate mapping table for a close or matching bandwidth value to * issue. If requested is too big, then use last maximum value found. @@ -243,9 +228,7 @@ static int __set_scrub_rate(struct amd64_pvt *pvt, u32 new_bw, u32 min_rate) scrubval = scrubrates[i].scrubval; - if (pvt->umc) { - __f17h_set_scrubval(pvt, scrubval); - } else if (pvt->fam == 0x15 && pvt->model == 0x60) { + if (pvt->fam == 0x15 && pvt->model == 0x60) { f15h_select_dct(pvt, 0); pci_write_bits32(pvt->F2, F15H_M60H_SCRCTRL, scrubval, 0x001F); f15h_select_dct(pvt, 1); @@ -285,16 +268,7 @@ static int get_scrub_rate(struct mem_ctl_info *mci) int i, retval = -EINVAL; u32 scrubval = 0; - if (pvt->umc) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_BASE_ADDR, &scrubval); - if (scrubval & BIT(0)) { - amd64_read_pci_cfg(pvt->F6, F17H_SCR_LIMIT_ADDR, &scrubval); - scrubval &= 0xF; - scrubval += 0x5; - } else { - scrubval = 0; - } - } else if (pvt->fam == 0x15) { + if (pvt->fam == 0x15) { /* Erratum #505 */ if (pvt->model < 0x10) f15h_select_dct(pvt, 0); @@ -3521,6 +3495,9 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci) mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; + if (pvt->fam >= 0x17) + return; + /* memory scrubber interface */ mci->set_sdram_scrub_rate = set_scrub_rate; mci->get_sdram_scrub_rate = get_scrub_rate; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 59ec8851a06e..6c91ae6d9ffd 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -220,8 +220,6 @@ #define DCT_SEL_HI 0x114 #define F15H_M60H_SCRCTRL 0x1C8 -#define F17H_SCR_BASE_ADDR 0x48 -#define F17H_SCR_LIMIT_ADDR 0x4C /* * Function 3 - Misc Control -- Gitee From 47dfc861c6c91b882328c991cd5d580abf27244f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:00 +0000 Subject: [PATCH 040/257] EDAC/amd64: Remove PCI Function 6 commit 6229235f7c6616c96ac06fd1094520b037410f46 upstream PCI Function 6 is used on Family 17h and later to access scrub registers. With scrub access removed, this function has no other use. Remove all Function 6 PCI IDs and related code. [Backport Changes] 1. In drivers/edac/amd64_edac.c, since the current commit removes scrub register access via Function 6, the Function 6 PCI IDs and related code were also removed for Hygon 18h CPUs in the family_types instance of struct amd64_family_type, to maintain consistency with the updated source code. 2. In drivers/edac/amd64_edac.h, the macros PCI_DEVICE_ID_HYGON_18H_M06H_DF_F6 and PCI_DEVICE_ID_HYGON_18H_M10H_DF_F6 were also removed as part of the cleanup. These IDs were specific to Hygon CPUs using Function 6, which are no longer supported. 
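For context, F0/F1/F2/F3/F6 here are PCI functions of one device (same bus and slot, different function numbers), so the driver's "related function" lookup amounts to matching vendor and device IDs within the slot that F3 occupies. A stubbed, userspace-only illustration of that idea, reusing device IDs that appear in this driver (0x1650/0x1653/0x1656 for the Family 19h DF functions):

    #include <stdio.h>

    struct fake_pci_dev {
        unsigned char bus, slot, fn;
        unsigned short vendor, device;
    };

    static struct fake_pci_dev devs[] = {
        { 0, 24, 0, 0x1022, 0x1650 },   /* 18.0: DF function 0 */
        { 0, 24, 3, 0x1022, 0x1653 },   /* 18.3: DF function 3 */
        { 0, 24, 6, 0x1022, 0x1656 },   /* 18.6: DF function 6 */
    };

    static struct fake_pci_dev *
    get_related_function(unsigned short vendor, unsigned short device,
                         const struct fake_pci_dev *f3)
    {
        for (unsigned int i = 0; i < sizeof(devs) / sizeof(devs[0]); i++)
            if (devs[i].vendor == vendor && devs[i].device == device &&
                devs[i].bus == f3->bus && devs[i].slot == f3->slot)
                return &devs[i];
        return NULL;
    }

    int main(void)
    {
        struct fake_pci_dev *f6 = get_related_function(0x1022, 0x1656, &devs[1]);

        if (f6)
            printf("F6 at %02x:%02x.%x\n", f6->bus, f6->slot, f6->fn);
        return 0;
    }

With Function 6 unused, that extra lookup (and its failure handling) simply disappears from the probe path.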
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-4-yazen.ghannam@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 24 +----------------------- drivers/edac/amd64_edac.h | 14 ++------------ 2 files changed, 3 insertions(+), 35 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ac1e7778dd6e..e48b1157934b 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2396,7 +2396,6 @@ static struct amd64_family_type family_types[] = { [F17_CPUS] = { .ctl_name = "F17h", .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2406,7 +2405,6 @@ static struct amd64_family_type family_types[] = { [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2416,7 +2414,6 @@ static struct amd64_family_type family_types[] = { [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F6, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2426,7 +2423,6 @@ static struct amd64_family_type family_types[] = { [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2436,7 +2432,6 @@ static struct amd64_family_type family_types[] = { [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2446,7 +2441,6 @@ static struct amd64_family_type family_types[] = { [F18_M06H_CPUS] = { .ctl_name = "F18h_M06h", .f0_id = PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0, - .f6_id = PCI_DEVICE_ID_HYGON_18H_M06H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2456,7 +2450,6 @@ static struct amd64_family_type family_types[] = { [F18_M10H_CPUS] = { .ctl_name = "F18h_M10h", .f0_id = PCI_DEVICE_ID_HYGON_18H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_HYGON_18H_M10H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2466,7 +2459,6 @@ static struct amd64_family_type family_types[] = { [F19_CPUS] = { .ctl_name = "F19h", .f0_id = PCI_DEVICE_ID_AMD_19H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_DF_F6, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2476,7 +2468,6 @@ static struct amd64_family_type family_types[] = { [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", .f0_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F6, .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { @@ -2487,7 +2478,6 @@ static struct amd64_family_type family_types[] = { [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, - .f6_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F6, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2818,7 +2808,7 @@ static void decode_umc_error(int node_id, struct mce *m) /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. - * Reserve F0 and F6 on systems with a UMC. + * Reserve F0 on systems with a UMC. 
*/ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) @@ -2830,21 +2820,11 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) return -ENODEV; } - pvt->F6 = pci_get_related_function(pvt->F3->vendor, pci_id2, pvt->F3); - if (!pvt->F6) { - pci_dev_put(pvt->F0); - pvt->F0 = NULL; - - edac_dbg(1, "F6 not found: device 0x%x\n", pci_id2); - return -ENODEV; - } - if (!pci_ctl_dev) pci_ctl_dev = &pvt->F0->dev; edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); - edac_dbg(1, "F6: %s\n", pci_name(pvt->F6)); return 0; } @@ -2880,7 +2860,6 @@ static void free_mc_sibling_devs(struct amd64_pvt *pvt) { if (pvt->umc) { pci_dev_put(pvt->F0); - pci_dev_put(pvt->F6); } else { pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); @@ -3667,7 +3646,6 @@ static int hw_info_get(struct amd64_pvt *pvt) return -ENOMEM; pci_id1 = fam_type->f0_id; - pci_id2 = fam_type->f6_id; } else { pci_id1 = fam_type->f1_id; pci_id2 = fam_type->f2_id; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 6c91ae6d9ffd..75ed2645abfc 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -115,26 +115,16 @@ #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 #define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 -#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F6 0x1496 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F6 0x144e #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F6 0x1446 #define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 -#define PCI_DEVICE_ID_AMD_19H_DF_F6 0x1656 #define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F6 0x14b3 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F6 0x1670 #define PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0 0x14b0 -#define PCI_DEVICE_ID_HYGON_18H_M06H_DF_F6 0x14b6 #define PCI_DEVICE_ID_HYGON_18H_M10H_DF_F0 0x14d0 -#define PCI_DEVICE_ID_HYGON_18H_M10H_DF_F6 0x14d6 /* * Function 1 - Address Map @@ -361,7 +351,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct pci_dev *F0, *F1, *F2, *F3, *F6; + struct pci_dev *F0, *F1, *F2, *F3; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ @@ -516,7 +506,7 @@ struct amd64_family_flags { struct amd64_family_type { const char *ctl_name; - u16 f0_id, f1_id, f2_id, f6_id; + u16 f0_id, f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; struct amd64_family_flags flags; -- Gitee From 9f7c8698898b52c5512997432168b5f783a2303f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:01 +0000 Subject: [PATCH 041/257] EDAC/amd64: Remove PCI Function 0 commit cf981562e6270397a2b0dc871a1f11ccc2a687e8 upstream PCI Function 0 is used on Family 17h and later only to read the "dhar" value. This value is printed and provided through a module-specific debug sysfs file. The value is not used for any Family 17h and later code, and it does not have any apparent debug value on these systems. Remove "dhar", Function 0 PCI IDs, and all related code. [Backport Changes] 1. 
In drivers/edac/amd64_edac.c, since the current commit removes Function 0 support for accessing the "dhar" value, the Function 0 PCI IDs and related code were also removed for Hygon 18h CPUs in the family_types instance of struct amd64_family_type, to maintain consistency with the updated source code. 2. In drivers/edac/amd64_edac.h, the macros PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0 and PCI_DEVICE_ID_HYGON_18H_M10H_DF_F0 were also removed as part of the cleanup. These IDs were specific to Hygon CPUs using Function 0, which are no longer supported. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-5-yazen.ghannam@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 40 +++++---------------------------------- drivers/edac/amd64_edac.h | 15 ++------------- 2 files changed, 7 insertions(+), 48 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e48b1157934b..9f43ed360526 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -906,9 +906,6 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) debug_display_dimm_sizes_df(pvt, i); } - - edac_dbg(1, "F0x104 (DRAM Hole Address): 0x%08x, base: 0x%08x\n", - pvt->dhar, dhar_base(pvt)); } /* Display and decode various NB registers for debug purposes. */ @@ -943,6 +940,8 @@ static void __dump_misc_regs(struct amd64_pvt *pvt) /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); + + edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); } /* Display and decode various NB registers for debug purposes. */ @@ -953,8 +952,6 @@ static void dump_misc_regs(struct amd64_pvt *pvt) else __dump_misc_regs(pvt); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? 
"yes" : "no"); - amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -2395,7 +2392,6 @@ static struct amd64_family_type family_types[] = { }, [F17_CPUS] = { .ctl_name = "F17h", - .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2404,7 +2400,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2413,7 +2408,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M30H_DF_F0, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2422,7 +2416,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M60H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2431,7 +2424,6 @@ static struct amd64_family_type family_types[] = { }, [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", - .f0_id = PCI_DEVICE_ID_AMD_17H_M70H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2440,7 +2432,6 @@ static struct amd64_family_type family_types[] = { }, [F18_M06H_CPUS] = { .ctl_name = "F18h_M06h", - .f0_id = PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2449,7 +2440,6 @@ static struct amd64_family_type family_types[] = { }, [F18_M10H_CPUS] = { .ctl_name = "F18h_M10h", - .f0_id = PCI_DEVICE_ID_HYGON_18H_M10H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2458,7 +2448,6 @@ static struct amd64_family_type family_types[] = { }, [F19_CPUS] = { .ctl_name = "F19h", - .f0_id = PCI_DEVICE_ID_AMD_19H_DF_F0, .max_mcs = 8, .ops = { .early_channel_count = f17_early_channel_count, @@ -2467,7 +2456,6 @@ static struct amd64_family_type family_types[] = { }, [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M10H_DF_F0, .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { @@ -2477,7 +2465,6 @@ static struct amd64_family_type family_types[] = { }, [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", - .f0_id = PCI_DEVICE_ID_AMD_19H_M50H_DF_F0, .max_mcs = 2, .ops = { .early_channel_count = f17_early_channel_count, @@ -2808,26 +2795,12 @@ static void decode_umc_error(int node_id, struct mce *m) /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. - * Reserve F0 on systems with a UMC. 
*/ static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) { - if (pvt->umc) { - pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); - if (!pvt->F0) { - edac_dbg(1, "F0 not found, device 0x%x\n", pci_id1); - return -ENODEV; - } - - if (!pci_ctl_dev) - pci_ctl_dev = &pvt->F0->dev; - - edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); - edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); - + if (pvt->umc) return 0; - } /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); @@ -2859,7 +2832,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) static void free_mc_sibling_devs(struct amd64_pvt *pvt) { if (pvt->umc) { - pci_dev_put(pvt->F0); + return; } else { pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); @@ -2972,7 +2945,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) if (pvt->umc) { __read_mc_regs_df(pvt); - amd64_read_pci_cfg(pvt->F0, DF_DHAR, &pvt->dhar); goto skip; } @@ -3644,8 +3616,6 @@ static int hw_info_get(struct amd64_pvt *pvt) pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); if (!pvt->umc) return -ENOMEM; - - pci_id1 = fam_type->f0_id; } else { pci_id1 = fam_type->f1_id; pci_id2 = fam_type->f2_id; @@ -3662,7 +3632,7 @@ static int hw_info_get(struct amd64_pvt *pvt) static void hw_info_put(struct amd64_pvt *pvt) { - if (pvt->F0 || pvt->F1) + if (pvt->F1) free_mc_sibling_devs(pvt); kfree(pvt->umc); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 75ed2645abfc..8d184726ed6d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -114,17 +114,6 @@ #define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 -#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F0 0x1490 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F0 0x1448 -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F0 0x1440 -#define PCI_DEVICE_ID_AMD_19H_DF_F0 0x1650 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F0 0x14ad -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F0 0x166a - -#define PCI_DEVICE_ID_HYGON_18H_M06H_DF_F0 0x14b0 -#define PCI_DEVICE_ID_HYGON_18H_M10H_DF_F0 0x14d0 /* * Function 1 - Address Map @@ -351,7 +340,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct pci_dev *F0, *F1, *F2, *F3; + struct pci_dev *F1, *F2, *F3; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ @@ -506,7 +495,7 @@ struct amd64_family_flags { struct amd64_family_type { const char *ctl_name; - u16 f0_id, f1_id, f2_id; + u16 f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; struct amd64_family_flags flags; -- Gitee From cdf1eeaa6b863c79f4587746ef2add82a7d8a9a1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:02 +0000 Subject: [PATCH 042/257] EDAC/amd64: Remove early_channel_count() commit c4605bde334367b22bbf43cbbef0d1b7c75433dc upstream The early_channel_count() function seems to have been useful in the past for knowing how many EDAC mci structures to populate. However, this is no longer needed as the maximum channel count for a system is used instead. Remove the early_channel_count() helper functions and related code. Use the size of the channel layer when iterating over channel structures. [Backport Changes] 1. 
In drivers/edac/amd64_edac.c, since the current commit removes the early_channel_count hooks across all CPUs, the corresponding early_channel_count hook for Hygon 18h CPUs was also removed from the family_types instance of struct amd64_family_type, ensuring consistency with the updated source code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-6-yazen.ghannam@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 118 +------------------------------------- drivers/edac/amd64_edac.h | 2 - 2 files changed, 2 insertions(+), 118 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9f43ed360526..c0f7967f394a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1193,24 +1193,6 @@ static void determine_memory_type(struct amd64_pvt *pvt) pvt->dram_type = (pvt->dclr0 & BIT(16)) ? MEM_DDR3 : MEM_RDDR3; } -/* Get the number of DCT channels the memory controller is using. */ -static int k8_early_channel_count(struct amd64_pvt *pvt) -{ - int flag; - - if (pvt->ext_model >= K8_REV_F) - /* RevF (NPT) and later */ - flag = pvt->dclr0 & WIDTH_128; - else - /* RevE and earlier */ - flag = pvt->dclr0 & REVE_WIDTH_128; - - /* not used */ - pvt->dclr1 = 0; - - return (flag) ? 2 : 1; -} - /* On F10h and later ErrAddr is MC4_ADDR[47:1] */ static u64 get_error_address(struct amd64_pvt *pvt, struct mce *m) { @@ -1462,69 +1444,6 @@ static int k8_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, } } -/* - * Get the number of DCT channels in use. - * - * Return: - * number of Memory Channels in operation - * Pass back: - * contents of the DCL0_LOW register - */ -static int f1x_early_channel_count(struct amd64_pvt *pvt) -{ - int i, j, channels = 0; - - /* On F10h, if we are in 128 bit mode, then we are using 2 channels */ - if (pvt->fam == 0x10 && (pvt->dclr0 & WIDTH_128)) - return 2; - - /* - * Need to check if in unganged mode: In such, there are 2 channels, - * but they are not in 128 bit mode and thus the above 'dclr0' status - * bit will be OFF. - * - * Need to check DCT0[0] and DCT1[0] to see if only one of them has - * their CSEnable bit on. If so, then SINGLE DIMM case. - */ - edac_dbg(0, "Data width is not 128 bits - need more decoding\n"); - - /* - * Check DRAM Bank Address Mapping values for each DIMM to see if there - * is more than just one DIMM present in unganged mode. Need to check - * both controllers since DIMMs can be placed in either one. - */ - for (i = 0; i < 2; i++) { - u32 dbam = (i ?
pvt->dbam1 : pvt->dbam0); - - for (j = 0; j < 4; j++) { - if (DBAM_DIMM(j, dbam) > 0) { - channels++; - break; - } - } - } - - if (channels > 2) - channels = 2; - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - -static int f17_early_channel_count(struct amd64_pvt *pvt) -{ - int i, channels = 0; - - /* SDP Control bit 31 (SdpInit) is clear for unused UMC channels */ - for_each_umc(i) - channels += !!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT); - - amd64_info("MCT channel count: %d\n", channels); - - return channels; -} - static int ddr3_cs_size(unsigned i, bool dct_width) { unsigned shift = 0; @@ -2319,7 +2238,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, .max_mcs = 2, .ops = { - .early_channel_count = k8_early_channel_count, .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, .dbam_to_cs = k8_dbam_to_chip_select, } @@ -2330,7 +2248,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f10_dbam_to_chip_select, } @@ -2341,7 +2258,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_dbam_to_chip_select, } @@ -2352,7 +2268,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2363,7 +2278,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f15_m60h_dbam_to_chip_select, } @@ -2374,7 +2288,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2385,7 +2298,6 @@ static struct amd64_family_type family_types[] = { .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, .max_mcs = 2, .ops = { - .early_channel_count = f1x_early_channel_count, .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, .dbam_to_cs = f16_dbam_to_chip_select, } @@ -2394,7 +2306,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2402,7 +2313,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M10h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2410,7 +2320,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M30h", .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2418,7 +2327,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M60h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2426,7 +2334,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F17h_M70h", .max_mcs = 2, .ops = { - 
.early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2434,7 +2341,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F18h_M06h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2442,7 +2348,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F18h_M10h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2450,7 +2355,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F19h", .max_mcs = 8, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2459,7 +2363,6 @@ static struct amd64_family_type family_types[] = { .max_mcs = 12, .flags.zn_regs_v2 = 1, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -2467,7 +2370,6 @@ static struct amd64_family_type family_types[] = { .ctl_name = "F19h_M50h", .max_mcs = 2, .ops = { - .early_channel_count = f17_early_channel_count, .dbam_to_cs = f17_addr_mask_to_cs_size, } }, @@ -3167,7 +3069,7 @@ static int init_csrows(struct mem_ctl_info *mci) : EDAC_SECDED; } - for (j = 0; j < pvt->channel_count; j++) { + for (j = 0; j < fam_type->max_mcs; j++) { dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; @@ -3642,28 +3544,12 @@ static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; - int ret = -EINVAL; + int ret = -ENOMEM; - /* - * We need to determine how many memory channels there are. Then use - * that information for calculating the size of the dynamic instance - * tables in the 'mci' structure. - */ - pvt->channel_count = pvt->ops->early_channel_count(pvt); - if (pvt->channel_count < 0) - return ret; - - ret = -ENOMEM; layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; layers[0].size = pvt->csels[0].b_cnt; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - - /* - * Always allocate two channels since we can have setups with DIMMs on - * only one channel. Also, this simplifies handling later for the price - * of a couple of KBs tops. - */ layers[1].size = fam_type->max_mcs; layers[1].is_virt_csrow = false; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8d184726ed6d..359deeff62c5 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -348,7 +348,6 @@ struct amd64_pvt { u8 stepping; /* ... stepping */ int ext_model; /* extended model value of this node */ - int channel_count; /* Raw registers */ u32 dclr0; /* DRAM Configuration Low DCT0 reg */ @@ -476,7 +475,6 @@ extern const struct attribute_group amd64_edac_inj_group; * functions and per device encoding/decoding logic. */ struct low_ops { - int (*early_channel_count) (struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, struct err_info *); int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, -- Gitee From 1c8a5debb293776bbeb6b70a9d75ab7adde1eaf3 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:03 +0000 Subject: [PATCH 043/257] EDAC/amd64: Rename debug_display_dimm_sizes() commit 00e4feb8c0476bbde3c8bf4a593e0f82ca9a4df6 upstream Use the "dct" and "umc" prefixes for legacy and modern versions respectively. Also, move the "dct" version to avoid a forward declaration, and fixup some checkpatch warnings in the process. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-7-yazen.ghannam@amd.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 128 +++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 65 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c0f7967f394a..313788a2748a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -749,7 +749,65 @@ static unsigned long determine_edac_cap(struct amd64_pvt *pvt) return edac_cap; } -static void debug_display_dimm_sizes(struct amd64_pvt *, u8); +/* + * debug routine to display the memory sizes of all logical DIMMs and its + * CSROWs + */ +static void dct_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) +{ + u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; + u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; + int dimm, size0, size1; + + if (pvt->fam == 0xf) { + /* K8 families < revF not supported yet */ + if (pvt->ext_model < K8_REV_F) + return; + + WARN_ON(ctrl != 0); + } + + if (pvt->fam == 0x10) { + dbam = (ctrl && !dct_ganging_enabled(pvt)) ? pvt->dbam1 + : pvt->dbam0; + dcsb = (ctrl && !dct_ganging_enabled(pvt)) ? + pvt->csels[1].csbases : + pvt->csels[0].csbases; + } else if (ctrl) { + dbam = pvt->dbam0; + dcsb = pvt->csels[1].csbases; + } + edac_dbg(1, "F2x%d80 (DRAM Bank Address Mapping): 0x%08x\n", + ctrl, dbam); + + edac_printk(KERN_DEBUG, EDAC_MC, "DCT%d chip selects:\n", ctrl); + + /* Dump memory sizes for DIMM and its CSROWs */ + for (dimm = 0; dimm < 4; dimm++) { + size0 = 0; + if (dcsb[dimm * 2] & DCSB_CS_ENABLE) + /* + * For F15m60h, we need multiplier for LRDIMM cs_size + * calculation. We pass dimm value to the dbam_to_cs + * mapper so we can find the multiplier from the + * corresponding DCSM. + */ + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, + DBAM_DIMM(dimm, dbam), + dimm); + + size1 = 0; + if (dcsb[dimm * 2 + 1] & DCSB_CS_ENABLE) + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, + DBAM_DIMM(dimm, dbam), + dimm); + + amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", + dimm * 2, size0, + dimm * 2 + 1, size1); + } +} + static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) { @@ -824,7 +882,7 @@ static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } -static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) +static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) { int dimm, size0, size1, cs0, cs1, cs_mode; @@ -904,7 +962,7 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) i, 1 << ((tmp >> 4) & 0x3)); } - debug_display_dimm_sizes_df(pvt, i); + umc_debug_display_dimm_sizes(pvt, i); } } @@ -929,13 +987,13 @@ static void __dump_misc_regs(struct amd64_pvt *pvt) (pvt->fam == 0xf) ? 
k8_dhar_offset(pvt) : f10_dhar_offset(pvt)); - debug_display_dimm_sizes(pvt, 0); + dct_debug_display_dimm_sizes(pvt, 0); /* everything below this point is Fam10h and above */ if (pvt->fam == 0xf) return; - debug_display_dimm_sizes(pvt, 1); + dct_debug_display_dimm_sizes(pvt, 1); /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) @@ -2171,66 +2229,6 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); } -/* - * debug routine to display the memory sizes of all logical DIMMs and its - * CSROWs - */ -static void debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) -{ - int dimm, size0, size1; - u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; - u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; - - if (pvt->fam == 0xf) { - /* K8 families < revF not supported yet */ - if (pvt->ext_model < K8_REV_F) - return; - else - WARN_ON(ctrl != 0); - } - - if (pvt->fam == 0x10) { - dbam = (ctrl && !dct_ganging_enabled(pvt)) ? pvt->dbam1 - : pvt->dbam0; - dcsb = (ctrl && !dct_ganging_enabled(pvt)) ? - pvt->csels[1].csbases : - pvt->csels[0].csbases; - } else if (ctrl) { - dbam = pvt->dbam0; - dcsb = pvt->csels[1].csbases; - } - edac_dbg(1, "F2x%d80 (DRAM Bank Address Mapping): 0x%08x\n", - ctrl, dbam); - - edac_printk(KERN_DEBUG, EDAC_MC, "DCT%d chip selects:\n", ctrl); - - /* Dump memory sizes for DIMM and its CSROWs */ - for (dimm = 0; dimm < 4; dimm++) { - - size0 = 0; - if (dcsb[dimm*2] & DCSB_CS_ENABLE) - /* - * For F15m60h, we need multiplier for LRDIMM cs_size - * calculation. We pass dimm value to the dbam_to_cs - * mapper so we can find the multiplier from the - * corresponding DCSM. - */ - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, - DBAM_DIMM(dimm, dbam), - dimm); - - size1 = 0; - if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE) - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, - DBAM_DIMM(dimm, dbam), - dimm); - - amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0, - dimm * 2 + 1, size1); - } -} - static struct amd64_family_type family_types[] = { [K8_CPUS] = { .ctl_name = "K8", -- Gitee From c48aef40c9a7ac72948c50e3d3f3cad717478f39 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:04 +0000 Subject: [PATCH 044/257] EDAC/amd64: Split get_csrow_nr_pages() into dct/umc functions commit c0984666fde9cf7da4c1123443b469f44e94dd84 upstream Split get_csrow_nr_pages() into a legacy and modern versions in preparation for further legacy/modern refactoring. Also, rename f17_get_cs_mode() to match the new convention. 
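As a note on the arithmetic both split helpers share (see the hunks below): dbam_to_cs() returns the chip select size in MB, and the nr_pages <<= 20 - PAGE_SHIFT step converts MB to kernel pages. A minimal standalone sketch of that conversion, using hypothetical values and assuming the usual 4 KiB x86 page size:

	/* Sketch only, not part of this patch: MB-to-pages conversion. */
	#define PAGE_SHIFT	12	/* 4 KiB pages */

	static unsigned int mb_to_pages(unsigned int size_mb)
	{
		/* 1 MB = 2^20 bytes; one page = 2^PAGE_SHIFT bytes. */
		return size_mb << (20 - PAGE_SHIFT);
	}

	/* Example: a 16384 MB chip select gives 16384 << 8 = 4194304 pages. */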
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-8-yazen.ghannam@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 40 +++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 313788a2748a..599e7ec71799 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -850,7 +850,7 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) #define CS_EVEN (CS_EVEN_PRIMARY | CS_EVEN_SECONDARY) #define CS_ODD (CS_ODD_PRIMARY | CS_ODD_SECONDARY) -static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) +static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) { u8 base, count = 0; int cs_mode = 0; @@ -892,7 +892,7 @@ static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) cs0 = dimm * 2; cs1 = dimm * 2 + 1; - cs_mode = f17_get_cs_mode(dimm, ctrl, pvt); + cs_mode = umc_get_cs_mode(dimm, ctrl, pvt); size0 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs0); size1 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs1); @@ -2934,24 +2934,36 @@ static void read_mc_regs(struct amd64_pvt *pvt) * encompasses * */ -static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) +static u32 dct_get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) { u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; - int csrow_nr = csrow_nr_orig; u32 cs_mode, nr_pages; - if (!pvt->umc) { - csrow_nr >>= 1; - cs_mode = DBAM_DIMM(csrow_nr, dbam); - } else { - cs_mode = f17_get_cs_mode(csrow_nr >> 1, dct, pvt); - } + csrow_nr >>= 1; + cs_mode = DBAM_DIMM(csrow_nr, dbam); nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); nr_pages <<= 20 - PAGE_SHIFT; edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", - csrow_nr_orig, dct, cs_mode); + csrow_nr, dct, cs_mode); + edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); + + return nr_pages; +} + +static u32 umc_get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) +{ + int csrow_nr = csrow_nr_orig; + u32 cs_mode, nr_pages; + + cs_mode = umc_get_cs_mode(csrow_nr >> 1, dct, pvt); + + nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); + nr_pages <<= 20 - PAGE_SHIFT; + + edac_dbg(0, "csrow: %d, channel: %d, cs_mode %d\n", + csrow_nr_orig, dct, cs_mode); edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; @@ -2990,7 +3002,7 @@ static int init_csrows_df(struct mem_ctl_info *mci) edac_dbg(1, "MC node: %d, csrow: %d\n", pvt->mc_node_id, cs); - dimm->nr_pages = get_csrow_nr_pages(pvt, umc, cs); + dimm->nr_pages = umc_get_csrow_nr_pages(pvt, umc, cs); dimm->mtype = pvt->umc[umc].dram_type; dimm->edac_mode = edac_mode; dimm->dtype = dev_type; @@ -3046,13 +3058,13 @@ static int init_csrows(struct mem_ctl_info *mci) pvt->mc_node_id, i); if (row_dct0) { - nr_pages = get_csrow_nr_pages(pvt, 0, i); + nr_pages = dct_get_csrow_nr_pages(pvt, 0, i); csrow->channels[0]->dimm->nr_pages = nr_pages; } /* K8 has only one DCT */ if (pvt->fam != 0xf && row_dct1) { - int row_dct1_pages = get_csrow_nr_pages(pvt, 1, i); + int row_dct1_pages = dct_get_csrow_nr_pages(pvt, 1, i); csrow->channels[1]->dimm->nr_pages = row_dct1_pages; nr_pages += row_dct1_pages; -- Gitee From 5495864a85081a83304f35a5bfd15bc33d2fdf13 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 27 Jan 2023 17:04:05 +0000 Subject: [PATCH 045/257] EDAC/amd64: Drop dbam_to_cs() for Family 17h and 
later commit a2e59ab8e93301466cb3ff3f2b7b65eb1a4e8786 upstream The same function is used to calculate chip select size for all Zen-based family/models. Therefore, a family/model function pointer is not necessary. Drop the dbam_to_cs() function pointer for Family 17h and later systems. Also, move the Family 17h function to avoid a forward declaration. Rename it to indicate that the UMC Address Mask is used rather than the legacy DBAM value. [Backport Changes] 1. In drivers/edac/amd64_edac.c, since the current commit removes the dbam_to_cs function pointer for all CPUs starting from Family 17h, the corresponding dbam_to_cs entry for Hygon 18h CPUs was also removed from the family_types instance of struct amd64_family_type, ensuring consistency with the updated source code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-9-yazen.ghannam@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 192 ++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 111 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 599e7ec71799..48110cb5ce61 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -882,6 +882,84 @@ static int umc_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt) return cs_mode; } +static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, + unsigned int cs_mode, int csrow_nr) +{ + u32 addr_mask_orig, addr_mask_deinterleaved; + u32 msb, weight, num_zero_bits; + int cs_mask_nr = csrow_nr; + int dimm, size = 0; + + /* No Chip Selects are enabled. */ + if (!cs_mode) + return size; + + /* Requested size of an even CS but none are enabled. */ + if (!(cs_mode & CS_EVEN) && !(csrow_nr & 1)) + return size; + + /* Requested size of an odd CS but none are enabled. */ + if (!(cs_mode & CS_ODD) && (csrow_nr & 1)) + return size; + + /* + * Family 17h introduced systems with one mask per DIMM, + * and two Chip Selects per DIMM. + * + * CS0 and CS1 -> MASK0 / DIMM0 + * CS2 and CS3 -> MASK1 / DIMM1 + * + * Family 19h Model 10h introduced systems with one mask per Chip Select, + * and two Chip Selects per DIMM. + * + * CS0 -> MASK0 -> DIMM0 + * CS1 -> MASK1 -> DIMM0 + * CS2 -> MASK2 -> DIMM1 + * CS3 -> MASK3 -> DIMM1 + * + * Keep the mask number equal to the Chip Select number for newer systems, + * and shift the mask number for older systems. + */ + dimm = csrow_nr >> 1; + + if (!fam_type->flags.zn_regs_v2) + cs_mask_nr >>= 1; + + /* Asymmetric dual-rank DIMM support. */ + if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) + addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; + else + addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; + + /* + * The number of zero bits in the mask is equal to the number of bits + * in a full mask minus the number of bits in the current mask. + * + * The MSB is the number of bits in the full mask because BIT[0] is + * always 0. + * + * In the special 3 Rank interleaving case, a single bit is flipped + * without swapping with the most significant bit. This can be handled + * by keeping the MSB where it is and ignoring the single zero bit. + */ + msb = fls(addr_mask_orig) - 1; + weight = hweight_long(addr_mask_orig); + num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); + + /* Take the number of zero bits off from the top of the mask. 
*/ + addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); + + edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); + edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); + edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); + + /* Register [31:1] = Address [39:9]. Size is in kBs here. */ + size = (addr_mask_deinterleaved >> 2) + 1; + + /* Return size in MBs. */ + return size >> 10; +} + static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) { int dimm, size0, size1, cs0, cs1, cs_mode; @@ -894,8 +972,8 @@ static void umc_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) cs_mode = umc_get_cs_mode(dimm, ctrl, pvt); - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs0); - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs1); + size0 = umc_addr_mask_to_cs_size(pvt, ctrl, cs_mode, cs0); + size1 = umc_addr_mask_to_cs_size(pvt, ctrl, cs_mode, cs1); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", cs0, size0, @@ -1629,84 +1707,6 @@ static int f16_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, return ddr3_cs_size(cs_mode, false); } -static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, - unsigned int cs_mode, int csrow_nr) -{ - u32 addr_mask_orig, addr_mask_deinterleaved; - u32 msb, weight, num_zero_bits; - int cs_mask_nr = csrow_nr; - int dimm, size = 0; - - /* No Chip Selects are enabled. */ - if (!cs_mode) - return size; - - /* Requested size of an even CS but none are enabled. */ - if (!(cs_mode & CS_EVEN) && !(csrow_nr & 1)) - return size; - - /* Requested size of an odd CS but none are enabled. */ - if (!(cs_mode & CS_ODD) && (csrow_nr & 1)) - return size; - - /* - * Family 17h introduced systems with one mask per DIMM, - * and two Chip Selects per DIMM. - * - * CS0 and CS1 -> MASK0 / DIMM0 - * CS2 and CS3 -> MASK1 / DIMM1 - * - * Family 19h Model 10h introduced systems with one mask per Chip Select, - * and two Chip Selects per DIMM. - * - * CS0 -> MASK0 -> DIMM0 - * CS1 -> MASK1 -> DIMM0 - * CS2 -> MASK2 -> DIMM1 - * CS3 -> MASK3 -> DIMM1 - * - * Keep the mask number equal to the Chip Select number for newer systems, - * and shift the mask number for older systems. - */ - dimm = csrow_nr >> 1; - - if (!fam_type->flags.zn_regs_v2) - cs_mask_nr >>= 1; - - /* Asymmetric dual-rank DIMM support. */ - if ((csrow_nr & 1) && (cs_mode & CS_ODD_SECONDARY)) - addr_mask_orig = pvt->csels[umc].csmasks_sec[cs_mask_nr]; - else - addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr]; - - /* - * The number of zero bits in the mask is equal to the number of bits - * in a full mask minus the number of bits in the current mask. - * - * The MSB is the number of bits in the full mask because BIT[0] is - * always 0. - * - * In the special 3 Rank interleaving case, a single bit is flipped - * without swapping with the most significant bit. This can be handled - * by keeping the MSB where it is and ignoring the single zero bit. - */ - msb = fls(addr_mask_orig) - 1; - weight = hweight_long(addr_mask_orig); - num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE); - - /* Take the number of zero bits off from the top of the mask. */ - addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1); - - edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm); - edac_dbg(1, " Original AddrMask: 0x%x\n", addr_mask_orig); - edac_dbg(1, " Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved); - - /* Register [31:1] = Address [39:9]. Size is in kBs here. */ - size = (addr_mask_deinterleaved >> 2) + 1; - - /* Return size in MBs. 
*/ - return size >> 10; -} - static void read_dram_ctl_register(struct amd64_pvt *pvt) { @@ -2303,73 +2303,43 @@ static struct amd64_family_type family_types[] = { [F17_CPUS] = { .ctl_name = "F17h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F17_M10H_CPUS] = { .ctl_name = "F17h_M10h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F17_M30H_CPUS] = { .ctl_name = "F17h_M30h", .max_mcs = 8, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F17_M60H_CPUS] = { .ctl_name = "F17h_M60h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F17_M70H_CPUS] = { .ctl_name = "F17h_M70h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F18_M06H_CPUS] = { .ctl_name = "F18h_M06h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F18_M10H_CPUS] = { .ctl_name = "F18h_M10h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F19_CPUS] = { .ctl_name = "F19h", .max_mcs = 8, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F19_M10H_CPUS] = { .ctl_name = "F19h_M10h", .max_mcs = 12, .flags.zn_regs_v2 = 1, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, [F19_M50H_CPUS] = { .ctl_name = "F19h_M50h", .max_mcs = 2, - .ops = { - .dbam_to_cs = f17_addr_mask_to_cs_size, - } }, }; @@ -2959,7 +2929,7 @@ static u32 umc_get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_or cs_mode = umc_get_cs_mode(csrow_nr >> 1, dct, pvt); - nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); + nr_pages = umc_addr_mask_to_cs_size(pvt, dct, cs_mode, csrow_nr); nr_pages <<= 20 - PAGE_SHIFT; edac_dbg(0, "csrow: %d, channel: %d, cs_mode %d\n", -- Gitee From 662e02b70fb597da4fdae666bce3e15716ae0852 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Fri, 27 Jan 2023 17:04:07 +0000 Subject: [PATCH 046/257] EDAC/amd64: Merge struct amd64_family_type into struct amd64_pvt commit ed623d55eef46d0ca7557b9665b4693931bfb507 upstream Future AMD systems will support heterogeneous "AMD Node" types, e.g. CPU and GPU types. Therefore, a global family type shared across all AMD nodes is no longer appropriate. Move struct low_ops routines and members of struct amd64_family_type to struct amd64_pvt. Currently, there are many code branches that split between "modern" and "legacy" systems. Another code branch will be needed in order to cover GPU cases. However, rather than introduce another branching case in multiple functions, the current branching code should be switched to a set of function pointers. This change makes the code more readable and simplifies adding support for new families/models. In order to reuse code, define two sets of function pointers. Use one for modern systems (Family 17h and later). This will not change between current CPU families. Use another set of function pointers for legacy systems (before Family 17h). Use the Family 16h versions as default for the legacy ops since these are the latest, and adjust the function pointers as needed for older families. [ Yazen: rebased/reworked patch and reworded commit message. ] [ bp: Fix rev8 or later check. ] [Backport Changes] 1. In drivers/edac/amd64_edac.c, within function read_umc_base_mask(), the existing variable "umc_base" was retained, and the call to get_umc_reg() was updated with the latest changes as per the current commit. 
This retention is due to the previously applied commit 36d1aba7a1e65, which introduced logic that pre-assigned "umc_base" with the output of either get_umc_base_f18h_m4h() or get_umc_base() based on the CPU family. This differs from the upstream version, which directly uses get_umc_base(umc). The current approach integrates the latest upstream changes while preserving Hygon-specific handling. 2. In drivers/edac/amd64_edac.c, within the function determine_memory_type_df(), the condition check was updated to use pvt->flags.zn_regs_v2 instead of fam_type->flags.zn_regs_v2, as per the current upstream commit. The surrounding logic involving Hygon support, including the hygon_f18h_m4h() check added in commit 36d1aba7a1e65, was retained to ensure compatibility. This preserves Hygon-specific behavior while aligning with the updated upstream condition handling. 3. In drivers/edac/amd64_edac.c, the family_types instance of struct amd64_family_type was removed by the current upstream commit. Therefore, the entries for Hygon 18h CPUs (F18_M06H_CPUS and F18_M10H_CPUS) were also removed to maintain consistency with the updated source code. 4. In drivers/edac/amd64_edac.c, within the per_family_init() function, the if/else blocks added as part of handling Family 18h (pvt->fam == 0x18) were also replaced with an equivalent switch (pvt->model) structure, similar to changes in upstream where each model-specific condition was moved into its own case statement. This change simplifies the logic and aligns with the refactored approach introduced in the current upstream patch, while preserving functionality for Hygon CPUs. 5. In drivers/edac/amd64_edac.h, the enum amd_families was removed entirely in the current upstream commit. Therefore, the Hygon-specific enumerators (F18_M06H_CPUS and F18_M10H_CPUS) which were part of enum amd_families were also removed to maintain consistency with the updated source code. Signed-off-by: Muralidhara M K Co-developed-by: Naveen Krishna Chatradhi Signed-off-by: Naveen Krishna Chatradhi Co-developed-by: Yazen Ghannam Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230127170419.1824692-11-yazen.ghannam@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 368 ++++++++++++++------------------------ drivers/edac/amd64_edac.h | 64 +++---- 2 files changed, 154 insertions(+), 278 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 48110cb5ce61..91e6a155ebdd 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -16,11 +16,9 @@ module_param(ecc_enable_override, int, 0644); static struct msr __percpu *msrs; -static struct amd64_family_type *fam_type; - -static inline u32 get_umc_reg(u32 reg) +static inline u32 get_umc_reg(struct amd64_pvt *pvt, u32 reg) { - if (!fam_type->flags.zn_regs_v2) + if (!pvt->flags.zn_regs_v2) return reg; switch (reg) { @@ -449,7 +447,7 @@ static void get_cs_base_and_mask(struct amd64_pvt *pvt, int csrow, u8 dct, for (i = 0; i < pvt->csels[dct].m_cnt; i++) #define for_each_umc(i) \ - for (i = 0; i < fam_type->max_mcs; i++) + for (i = 0; i < pvt->max_mcs; i++) /* * @input_addr is an InputAddr associated with the node given by mci. Return the @@ -922,7 +920,7 @@ static int umc_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc, */ dimm = csrow_nr >> 1; - if (!fam_type->flags.zn_regs_v2) + if (!pvt->flags.zn_regs_v2) cs_mask_nr >>= 1; /* Asymmetric dual-rank DIMM support.
*/ @@ -1034,7 +1032,7 @@ static void __dump_misc_regs_df(struct amd64_pvt *pvt) if (umc->dram_type == MEM_LRDDR4 || umc->dram_type == MEM_LRDDR5) { amd_smn_read(pvt->mc_node_id, - umc_base + get_umc_reg(UMCCH_ADDR_CFG), + umc_base + get_umc_reg(pvt, UMCCH_ADDR_CFG), &tmp); edac_dbg(1, "UMC%d LRDIMM %dx rank multiply\n", i, 1 << ((tmp >> 4) & 0x3)); @@ -1107,7 +1105,7 @@ static void prep_chip_selects(struct amd64_pvt *pvt) for_each_umc(umc) { pvt->csels[umc].b_cnt = 4; - pvt->csels[umc].m_cnt = fam_type->flags.zn_regs_v2 ? 4 : 2; + pvt->csels[umc].m_cnt = pvt->flags.zn_regs_v2 ? 4 : 2; } } else { @@ -1156,7 +1154,7 @@ static void read_umc_base_mask(struct amd64_pvt *pvt) } umc_mask_reg = umc_base + UMCCH_ADDR_MASK; - umc_mask_reg_sec = umc_base + get_umc_reg(UMCCH_ADDR_MASK_SEC); + umc_mask_reg_sec = umc_base + get_umc_reg(pvt, UMCCH_ADDR_MASK_SEC); for_each_chip_select_mask(cs, umc, pvt) { mask = &pvt->csels[umc].csmasks[cs]; @@ -1244,7 +1242,7 @@ static void determine_memory_type_df(struct amd64_pvt *pvt) * Check if the system supports the "DDR Type" field in UMC Config * and has DDR5 DIMMs in use. */ - if ((fam_type->flags.zn_regs_v2 || + if ((pvt->flags.zn_regs_v2 || hygon_f18h_m4h() || hygon_f18h_m10h()) && ((umc->umc_cfg & GENMASK(2, 0)) == 0x1)) { @@ -2229,120 +2227,6 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); } -static struct amd64_family_type family_types[] = { - [K8_CPUS] = { - .ctl_name = "K8", - .f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, - .f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow, - .dbam_to_cs = k8_dbam_to_chip_select, - } - }, - [F10_CPUS] = { - .ctl_name = "F10h", - .f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP, - .f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f10_dbam_to_chip_select, - } - }, - [F15_CPUS] = { - .ctl_name = "F15h", - .f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f15_dbam_to_chip_select, - } - }, - [F15_M30H_CPUS] = { - .ctl_name = "F15h_M30h", - .f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F15_M60H_CPUS] = { - .ctl_name = "F15h_M60h", - .f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f15_m60h_dbam_to_chip_select, - } - }, - [F16_CPUS] = { - .ctl_name = "F16h", - .f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F16_M30H_CPUS] = { - .ctl_name = "F16h_M30h", - .f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1, - .f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2, - .max_mcs = 2, - .ops = { - .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, - .dbam_to_cs = f16_dbam_to_chip_select, - } - }, - [F17_CPUS] = { - .ctl_name = "F17h", - .max_mcs = 2, - }, - [F17_M10H_CPUS] = { - .ctl_name = "F17h_M10h", - .max_mcs = 2, - }, - [F17_M30H_CPUS] = { - .ctl_name = "F17h_M30h", - .max_mcs = 8, - }, - [F17_M60H_CPUS] = { - .ctl_name = "F17h_M60h", - 
.max_mcs = 2, - }, - [F17_M70H_CPUS] = { - .ctl_name = "F17h_M70h", - .max_mcs = 2, - }, - [F18_M06H_CPUS] = { - .ctl_name = "F18h_M06h", - .max_mcs = 2, - }, - [F18_M10H_CPUS] = { - .ctl_name = "F18h_M10h", - .max_mcs = 2, - }, - [F19_CPUS] = { - .ctl_name = "F19h", - .max_mcs = 8, - }, - [F19_M10H_CPUS] = { - .ctl_name = "F19h_M10h", - .max_mcs = 12, - .flags.zn_regs_v2 = 1, - }, - [F19_M50H_CPUS] = { - .ctl_name = "F19h_M50h", - .max_mcs = 2, - }, -}; - /* * These are tables of eigenvectors (one per line) which can be used for the * construction of the syndrome tables. The modified syndrome search algorithm @@ -2780,7 +2664,7 @@ static void __read_mc_regs_df(struct amd64_pvt *pvt) umc = &pvt->umc[i]; - amd_smn_read(nid, umc_base + get_umc_reg(UMCCH_DIMM_CFG), &umc->dimm_cfg); + amd_smn_read(nid, umc_base + get_umc_reg(pvt, UMCCH_DIMM_CFG), &umc->dimm_cfg); amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg); amd_smn_read(nid, umc_base + UMCCH_SDP_CTRL, &umc->sdp_ctrl); amd_smn_read(nid, umc_base + UMCCH_ECC_CTRL, &umc->ecc_ctrl); @@ -3049,7 +2933,7 @@ static int init_csrows(struct mem_ctl_info *mci) : EDAC_SECDED; } - for (j = 0; j < fam_type->max_mcs; j++) { + for (j = 0; j < pvt->max_mcs; j++) { dimm = csrow->channels[j]->dimm; dimm->mtype = pvt->dram_type; dimm->edac_mode = edac_mode; @@ -3324,7 +3208,7 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci) mci->edac_cap = determine_edac_cap(pvt); mci->mod_name = EDAC_MOD_STR; - mci->ctl_name = fam_type->ctl_name; + mci->ctl_name = pvt->ctl_name; mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; @@ -3336,147 +3220,168 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci) mci->get_sdram_scrub_rate = get_scrub_rate; } -/* - * returns a pointer to the family descriptor on success, NULL otherwise. - */ -static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) +static struct low_ops umc_ops = { +}; + +/* Use Family 16h versions for defaults and adjust as needed below. */ +static struct low_ops dct_ops = { + .map_sysaddr_to_csrow = f1x_map_sysaddr_to_csrow, + .dbam_to_cs = f16_dbam_to_chip_select, +}; + +static int per_family_init(struct amd64_pvt *pvt) { pvt->ext_model = boot_cpu_data.x86_model >> 4; pvt->stepping = boot_cpu_data.x86_stepping; pvt->model = boot_cpu_data.x86_model; pvt->fam = boot_cpu_data.x86; + pvt->max_mcs = 2; + + /* + * Decide on which ops group to use here and do any family/model + * overrides below. + */ + if (pvt->fam >= 0x17) + pvt->ops = &umc_ops; + else + pvt->ops = &dct_ops; switch (pvt->fam) { case 0xf: - fam_type = &family_types[K8_CPUS]; - pvt->ops = &family_types[K8_CPUS].ops; + pvt->ctl_name = (pvt->ext_model >= K8_REV_F) ? 
+ "K8 revF or later" : "K8 revE or earlier"; + pvt->f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP; + pvt->f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL; + pvt->ops->map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow; + pvt->ops->dbam_to_cs = k8_dbam_to_chip_select; break; case 0x10: - fam_type = &family_types[F10_CPUS]; - pvt->ops = &family_types[F10_CPUS].ops; + pvt->ctl_name = "F10h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP; + pvt->f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM; + pvt->ops->dbam_to_cs = f10_dbam_to_chip_select; break; case 0x15: - if (pvt->model == 0x30) { - fam_type = &family_types[F15_M30H_CPUS]; - pvt->ops = &family_types[F15_M30H_CPUS].ops; + switch (pvt->model) { + case 0x30: + pvt->ctl_name = "F15h_M30h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2; break; - } else if (pvt->model == 0x60) { - fam_type = &family_types[F15_M60H_CPUS]; - pvt->ops = &family_types[F15_M60H_CPUS].ops; + case 0x60: + pvt->ctl_name = "F15h_M60h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2; + pvt->ops->dbam_to_cs = f15_m60h_dbam_to_chip_select; + break; + case 0x13: + /* Richland is only client */ + return -ENODEV; + default: + pvt->ctl_name = "F15h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2; + pvt->ops->dbam_to_cs = f15_dbam_to_chip_select; break; - /* Richland is only client */ - } else if (pvt->model == 0x13) { - return NULL; - } else { - fam_type = &family_types[F15_CPUS]; - pvt->ops = &family_types[F15_CPUS].ops; } break; case 0x16: - if (pvt->model == 0x30) { - fam_type = &family_types[F16_M30H_CPUS]; - pvt->ops = &family_types[F16_M30H_CPUS].ops; + switch (pvt->model) { + case 0x30: + pvt->ctl_name = "F16h_M30h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2; + break; + default: + pvt->ctl_name = "F16h"; + pvt->f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1; + pvt->f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2; break; } - fam_type = &family_types[F16_CPUS]; - pvt->ops = &family_types[F16_CPUS].ops; break; case 0x17: - if (pvt->model >= 0x10 && pvt->model <= 0x2f) { - fam_type = &family_types[F17_M10H_CPUS]; - pvt->ops = &family_types[F17_M10H_CPUS].ops; + switch (pvt->model) { + case 0x10 ... 0x2f: + pvt->ctl_name = "F17h_M10h"; break; - } else if (pvt->model >= 0x30 && pvt->model <= 0x3f) { - fam_type = &family_types[F17_M30H_CPUS]; - pvt->ops = &family_types[F17_M30H_CPUS].ops; + case 0x30 ... 0x3f: + pvt->ctl_name = "F17h_M30h"; + pvt->max_mcs = 8; break; - } else if (pvt->model >= 0x60 && pvt->model <= 0x6f) { - fam_type = &family_types[F17_M60H_CPUS]; - pvt->ops = &family_types[F17_M60H_CPUS].ops; + case 0x60 ... 0x6f: + pvt->ctl_name = "F17h_M60h"; break; - } else if (pvt->model >= 0x70 && pvt->model <= 0x7f) { - fam_type = &family_types[F17_M70H_CPUS]; - pvt->ops = &family_types[F17_M70H_CPUS].ops; + case 0x70 ... 
0x7f: + pvt->ctl_name = "F17h_M70h"; + break; + default: + pvt->ctl_name = "F17h"; break; } - fam_type = &family_types[F17_CPUS]; - pvt->ops = &family_types[F17_CPUS].ops; break; case 0x18: - if (pvt->model == 0x4) { - fam_type = &family_types[F17_M30H_CPUS]; - pvt->ops = &family_types[F17_M30H_CPUS].ops; - family_types[F17_M30H_CPUS].max_mcs = 3; - family_types[F17_M30H_CPUS].ctl_name = "F18h_M04h"; - break; - } else if (pvt->model == 0x5) { - fam_type = &family_types[F17_M30H_CPUS]; - pvt->ops = &family_types[F17_M30H_CPUS].ops; - family_types[F17_M30H_CPUS].max_mcs = 1; - family_types[F17_M30H_CPUS].ctl_name = "F18h_M05h"; - break; - } else if (pvt->model == 0x6) { - fam_type = &family_types[F18_M06H_CPUS]; - pvt->ops = &family_types[F18_M06H_CPUS].ops; - break; - } else if (pvt->model == 0x7) { - fam_type = &family_types[F18_M06H_CPUS]; - pvt->ops = &family_types[F18_M06H_CPUS].ops; - family_types[F18_M06H_CPUS].ctl_name = "F18h_M07h"; - break; - } else if (pvt->model == 0x8) { - fam_type = &family_types[F18_M06H_CPUS]; - pvt->ops = &family_types[F18_M06H_CPUS].ops; - family_types[F18_M06H_CPUS].ctl_name = "F18h_M08h"; - break; - } else if (pvt->model == 0x10) { - fam_type = &family_types[F18_M10H_CPUS]; - pvt->ops = &family_types[F18_M10H_CPUS].ops; - family_types[F18_M10H_CPUS].ctl_name = "F18h_M10h"; - break; - } - fam_type = &family_types[F17_CPUS]; - pvt->ops = &family_types[F17_CPUS].ops; - family_types[F17_CPUS].ctl_name = "F18h"; + switch (pvt->model) { + case 0x04: + pvt->ctl_name = "F18h_M04h"; + pvt->max_mcs = 3; + break; + case 0x05: + pvt->ctl_name = "F18h_M05h"; + pvt->max_mcs = 1; + break; + case 0x06: + pvt->ctl_name = "F18h_M06h"; + pvt->max_mcs = 2; + break; + case 0x07: + pvt->ctl_name = "F18h_M07h"; + break; + case 0x08: + pvt->ctl_name = "F18h_M08h"; + break; + case 0x10: + pvt->ctl_name = "F18h_M10h"; + pvt->max_mcs = 2; + break; + } break; case 0x19: - if (pvt->model >= 0x10 && pvt->model <= 0x1f) { - fam_type = &family_types[F19_M10H_CPUS]; - pvt->ops = &family_types[F19_M10H_CPUS].ops; + switch (pvt->model) { + case 0x00 ... 0x0f: + pvt->ctl_name = "F19h"; + pvt->max_mcs = 8; + break; + case 0x10 ... 0x1f: + pvt->ctl_name = "F19h_M10h"; + pvt->max_mcs = 12; + pvt->flags.zn_regs_v2 = 1; break; - } else if (pvt->model >= 0x20 && pvt->model <= 0x2f) { - fam_type = &family_types[F17_M70H_CPUS]; - pvt->ops = &family_types[F17_M70H_CPUS].ops; - fam_type->ctl_name = "F19h_M20h"; + case 0x20 ... 0x2f: + pvt->ctl_name = "F19h_M20h"; break; - } else if (pvt->model >= 0x50 && pvt->model <= 0x5f) { - fam_type = &family_types[F19_M50H_CPUS]; - pvt->ops = &family_types[F19_M50H_CPUS].ops; - fam_type->ctl_name = "F19h_M50h"; + case 0x50 ... 0x5f: + pvt->ctl_name = "F19h_M50h"; break; - } else if (pvt->model >= 0xa0 && pvt->model <= 0xaf) { - fam_type = &family_types[F19_M10H_CPUS]; - pvt->ops = &family_types[F19_M10H_CPUS].ops; - fam_type->ctl_name = "F19h_MA0h"; + case 0xa0 ... 
0xaf: + pvt->ctl_name = "F19h_MA0h"; + pvt->max_mcs = 12; + pvt->flags.zn_regs_v2 = 1; break; } - fam_type = &family_types[F19_CPUS]; - pvt->ops = &family_types[F19_CPUS].ops; - family_types[F19_CPUS].ctl_name = "F19h"; break; default: amd64_err("Unsupported family!\n"); - return NULL; + return -ENODEV; } - return fam_type; + return 0; } static const struct attribute_group *amd64_edac_attr_groups[] = { @@ -3495,12 +3400,12 @@ static int hw_info_get(struct amd64_pvt *pvt) int ret = -EINVAL; if (pvt->fam >= 0x17) { - pvt->umc = kcalloc(fam_type->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); + pvt->umc = kcalloc(pvt->max_mcs, sizeof(struct amd64_umc), GFP_KERNEL); if (!pvt->umc) return -ENOMEM; } else { - pci_id1 = fam_type->f1_id; - pci_id2 = fam_type->f2_id; + pci_id1 = pvt->f1_id; + pci_id2 = pvt->f2_id; } ret = reserve_mc_sibling_devs(pvt, pci_id1, pci_id2); @@ -3530,7 +3435,7 @@ static int init_one_instance(struct amd64_pvt *pvt) layers[0].size = pvt->csels[0].b_cnt; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - layers[1].size = fam_type->max_mcs; + layers[1].size = pvt->max_mcs; layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); @@ -3560,7 +3465,7 @@ static bool instance_has_memory(struct amd64_pvt *pvt) bool cs_enabled = false; int cs = 0, dct = 0; - for (dct = 0; dct < fam_type->max_mcs; dct++) { + for (dct = 0; dct < pvt->max_mcs; dct++) { for_each_chip_select(cs, dct, pvt) cs_enabled |= csrow_enabled(cs, dct, pvt); } @@ -3589,9 +3494,8 @@ static int probe_one_instance(unsigned int nid) pvt->mc_node_id = nid; pvt->F3 = F3; - ret = -ENODEV; - fam_type = per_family_init(pvt); - if (!fam_type) + ret = per_family_init(pvt); + if (ret < 0) goto err_enable; ret = hw_info_get(pvt); @@ -3630,11 +3534,7 @@ static int probe_one_instance(unsigned int nid) goto err_enable; } - amd64_info("%s %sdetected (node %d).\n", fam_type->ctl_name, - (pvt->fam == 0xf ? - (pvt->ext_model >= K8_REV_F ? "revF or later " - : "revE or earlier ") - : ""), pvt->mc_node_id); + amd64_info("%s detected (node %d).\n", pvt->ctl_name, pvt->mc_node_id); dump_misc_regs(pvt); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 359deeff62c5..2bb817940727 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -273,27 +273,6 @@ #define UMC_SDP_INIT BIT(31) -enum amd_families { - K8_CPUS = 0, - F10_CPUS, - F15_CPUS, - F15_M30H_CPUS, - F15_M60H_CPUS, - F16_CPUS, - F16_M30H_CPUS, - F17_CPUS, - F17_M10H_CPUS, - F17_M30H_CPUS, - F17_M60H_CPUS, - F17_M70H_CPUS, - F18_M06H_CPUS, - F18_M10H_CPUS, - F19_CPUS, - F19_M10H_CPUS, - F19_M50H_CPUS, - NUM_FAMILIES, -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -336,6 +315,16 @@ struct amd64_umc { enum mem_type dram_type; }; +struct amd64_family_flags { + /* + * Indicates that the system supports the new register offsets, etc. + * first introduced with Family 19h Model 10h. + */ + __u64 zn_regs_v2 : 1, + + __reserved : 63; +}; + struct amd64_pvt { struct low_ops *ops; @@ -377,6 +366,12 @@ struct amd64_pvt { /* x4, x8, or x16 syndromes in use */ u8 ecc_sym_sz; + const char *ctl_name; + u16 f1_id, f2_id; + /* Maximum number of memory controllers per die/node. 
*/ + u8 max_mcs; + + struct amd64_family_flags flags; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -475,29 +470,10 @@ extern const struct attribute_group amd64_edac_inj_group; * functions and per device encoding/decoding logic. */ struct low_ops { - void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, - struct err_info *); - int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, - unsigned cs_mode, int cs_mask_nr); -}; - -struct amd64_family_flags { - /* - * Indicates that the system supports the new register offsets, etc. - * first introduced with Family 19h Model 10h. - */ - __u64 zn_regs_v2 : 1, - - __reserved : 63; -}; - -struct amd64_family_type { - const char *ctl_name; - u16 f1_id, f2_id; - /* Maximum number of memory controllers per die/node. */ - u8 max_mcs; - struct amd64_family_flags flags; - struct low_ops ops; + void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, u64 sys_addr, + struct err_info *err); + int (*dbam_to_cs)(struct amd64_pvt *pvt, u8 dct, + unsigned int cs_mode, int cs_mask_nr); }; int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, -- Gitee From ad7d87b32eeb493f49fef1eece77963b42a1dbba Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Tue, 8 Aug 2023 22:52:44 -0500 Subject: [PATCH 047/257] EDAC/amd64: Add support for AMD family 1Ah models 00h-1Fh and 40h-4Fh commit c4d07c371283cb0453c8ce187551e4d064cc407e upstream Add support for family 1Ah-based models 00h-1Fh and 40h-4Fh. [ bp: Simplify. ] Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230809035244.2722455-4-avadhut.naik@amd.com Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/edac/amd64_edac.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 91e6a155ebdd..cac8c8f74a3f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3376,6 +3376,20 @@ static int per_family_init(struct amd64_pvt *pvt) } break; + case 0x1A: + switch (pvt->model) { + case 0x00 ... 0x1f: + pvt->ctl_name = "F1Ah"; + pvt->max_mcs = 12; + pvt->flags.zn_regs_v2 = 1; + break; + case 0x40 ... 0x4f: + pvt->ctl_name = "F1Ah_M40h"; + pvt->flags.zn_regs_v2 = 1; + break; + } + break; + default: amd64_err("Unsupported family!\n"); return -ENODEV; @@ -3602,6 +3616,7 @@ static const struct x86_cpu_id amd64_cpuids[] = { X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL), X86_MATCH_VENDOR_FAM(HYGON, 0x18, NULL), X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), + X86_MATCH_VENDOR_FAM(AMD, 0x1A, NULL), { } }; MODULE_DEVICE_TABLE(x86cpu, amd64_cpuids); -- Gitee From 4c7dce3b07ef64ffe1936436e0f0b003c4032805 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 27 Oct 2023 14:24:16 +0200 Subject: [PATCH 048/257] x86/barrier: Do not serialize MSR accesses on AMD commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 upstream AMD does not have the requirement for a synchronization barrier when accessing a certain group of MSRs. Do not incur that unnecessary penalty there. There will be a CPUID bit which explicitly states that an MFENCE is not needed. Once that bit is added to the APM, this will be extended with it. While at it, move it to processor.h to avoid include hell. Untangling that file properly is a matter for another day.
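To make the ordering requirement concrete, here is an illustrative sketch (not code from this patch; the function name and the simplified ICR encoding are hypothetical) of the x2APIC IPI path where the fence sits:

	/* Illustrative only: why x2apic_send_IPI() issues the barrier. */
	static void x2apic_ipi_sketch(u32 dest_apicid, unsigned int vector)
	{
		u64 icr = ((u64)dest_apicid << 32) | vector;	/* simplified ICR */

		/* Prior stores must be globally visible before the IPI fires. */
		weak_wrmsr_fence();

		/* 0x830 is the x2APIC ICR MSR (APIC_BASE_MSR 0x800 + 0x30). */
		wrmsr(0x830, (u32)icr, (u32)(icr >> 32));
	}

Per this patch, AMD CPUs do not need the extra fence for this group of MSRs, so it can be patched out on them.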
Some notes on the performance aspect of why this is relevant, courtesy of Kishon Vijay Abraham: On an AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM shows that the x2AVIC IPI rate is 3% to 4% lower than the AVIC IPI rate. The ipi-bench is modified so that the IPIs are sent between two vCPUs in the same CCX. This also requires pinning the vCPUs to physical cores to prevent any latencies. This simulates the use case of pinning vCPUs to the threads of a single CCX to avoid interrupt IPI latency. In order to avoid run-to-run variance (for both x2AVIC and AVIC), the following configuration is applied: 1) Disable Power States in BIOS (to prevent the system from entering a lower power state) 2) Run the system at a fixed frequency of 2500MHz (to prevent the system from raising the frequency under load) With the above configuration: *) Performance measured using ipi-bench for AVIC: Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] *) Performance measured using ipi-bench for x2AVIC: Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] From the above, x2AVIC latency is ~4% higher than AVIC. However, x2AVIC performance is expected to be better than or equivalent to AVIC. Upon analyzing the perf captures, it is observed that significant time is spent in weak_wrmsr_fence() invoked by x2apic_send_IPI(). With the fix to skip weak_wrmsr_fence(): *) Performance measured using ipi-bench for x2AVIC: Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] Comparing the performance of x2AVIC with and without the fix, it can be seen that performance improves by ~4%. Performance captured using an unmodified ipi-bench with the 'mesh-ipi' option, with and without weak_wrmsr_fence() on a Zen4 system, also showed a significant improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores CCX or CCD boundaries and just picks a random vCPU. Average throughput (10 iterations) with weak_wrmsr_fence(): Cumulative throughput: 4933374 IPI/s Average throughput (10 iterations) without weak_wrmsr_fence(): Cumulative throughput: 6355156 IPI/s [1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench [Backport Changes] 1. In file arch/x86/include/asm/alternative.h, new macro definitions (ALT_NOT, ALT_FLAGS_SHIFT and ALT_FLAG_NOT) have been introduced. These definitions are needed by the newly introduced weak_wrmsr_fence() implementation in arch/x86/include/asm/processor.h. These macros were originally introduced in upstream commit 5d1dd961e743. Since applying the full commit introduces additional dependencies and conflicts, only the required definitions were backported to avoid unnecessary changes. 
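[Illustration] A minimal user-space model of how ALT_NOT() encodes the "patch when the feature is NOT set" condition consumed by alternative(). The constants mirror the alternative.h hunk below; the feature-word layout (feature number in the low 16 bits, flags above ALT_FLAGS_SHIFT) is inferred from these definitions and is not a complete model of the alternatives machinery. With this encoding, alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)) patches the fence out at boot on CPUs where the feature bit is clear, i.e. on AMD and Hygon.

	#include <stdio.h>

	#define BIT(n)            (1u << (n))
	#define ALT_FLAGS_SHIFT   16
	#define ALT_FLAG_NOT      BIT(0)
	#define ALT_NOT(feature)  ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))

	int main(void)
	{
		/* X86_FEATURE_APIC_MSRS_FENCE = word 11, bit 27 */
		unsigned int feature = 11 * 32 + 27;
		unsigned int enc = ALT_NOT(feature);

		/* Low 16 bits carry the feature number; bit 16 flags the inversion */
		printf("encoded 0x%x: feature %u, inverted %u\n",
		       enc, enc & 0xffff, (enc >> ALT_FLAGS_SHIFT) & ALT_FLAG_NOT);
		return 0;
	}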
Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/include/asm/alternative.h | 4 ++++ arch/x86/include/asm/barrier.h | 18 ------------------ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/processor.h | 18 ++++++++++++++++++ arch/x86/kernel/cpu/amd.c | 3 +++ arch/x86/kernel/cpu/common.c | 7 +++++++ arch/x86/kernel/cpu/hygon.c | 3 +++ 7 files changed, 36 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13adca37c99a..f4a52222cb22 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -9,6 +9,10 @@ #include #include +#define ALT_FLAGS_SHIFT 16 +#define ALT_FLAG_NOT BIT(0) +#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) + /* * Alternative inline assembly for SMP. * diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 4819d5e5a335..7f828fe49797 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -84,22 +84,4 @@ do { \ #include -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. - */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 34ff851bd2fa..554a12d2bd58 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -311,6 +311,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+18) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ #define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ #define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ #define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ac48297094ea..258e54676f1a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -987,4 +987,22 @@ enum taa_mitigations { TAA_MITIGATION_TSX_DISABLED, }; +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. 
+ */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5bc10df599df..b953bfc739b3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1190,6 +1190,9 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e667cc3ffeec..0497b8b228d7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1615,6 +1615,13 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif + + /* + * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and + * Hygon will clear it in ->c_init() below. + */ + set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 3da9c951ac31..94ad45d491ad 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -367,6 +367,9 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); + + /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) -- Gitee From f31a39a9957f6d58b42666796c8b4b21d76e0eb3 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:34:36 -0600 Subject: [PATCH 049/257] cpupower: Update msr_pstate union struct naming commit 629d512d682de2259179046e2364f1f1ff4232e3 upstream The msr_pstate union struct named fam17h_bits is misleading since this is the struct to use for all families >= 0x17, not just for family 0x17. Rename the bits structs to be 'pstate' (for pre family 17h CPUs) and 'pstatedef' (for CPUs since fam 17h) to align closer with PPR/BDKG (1) naming. There are no functional changes as part of this update. 
1: AMD Processor Programming Reference (PPR) and BIOS and Kernel Developer's Guide (BKDG) available at: http://developer.amd.com/resources/developer-guides-manuals Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Reviewed-by: skhan@linuxfoundation.org Signed-off-by: Shuah Khan Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- tools/power/cpupower/utils/helpers/amd.c | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 7c4f83a8c973..34368436bbd6 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -13,7 +13,8 @@ #define MSR_AMD_PSTATE 0xc0010064 #define MSR_AMD_PSTATE_LIMIT 0xc0010061 -union msr_pstate { +union core_pstate { + /* pre fam 17h: */ struct { unsigned fid:6; unsigned did:3; @@ -26,7 +27,8 @@ union msr_pstate { unsigned idddiv:2; unsigned res3:21; unsigned en:1; - } bits; + } pstate; + /* since fam 17h: */ struct { unsigned fid:8; unsigned did:6; @@ -35,36 +37,36 @@ union msr_pstate { unsigned idddiv:2; unsigned res1:31; unsigned en:1; - } fam17h_bits; + } pstatedef; unsigned long long val; }; -static int get_did(int family, union msr_pstate pstate) +static int get_did(int family, union core_pstate pstate) { int t; if (family == 0x12) t = pstate.val & 0xf; else if (family == 0x17 || family == 0x18) - t = pstate.fam17h_bits.did; + t = pstate.pstatedef.did; else - t = pstate.bits.did; + t = pstate.pstate.did; return t; } -static int get_cof(int family, union msr_pstate pstate) +static int get_cof(int family, union core_pstate pstate) { int t; int fid, did, cof; did = get_did(family, pstate); if (family == 0x17 || family == 0x18) { - fid = pstate.fam17h_bits.fid; + fid = pstate.pstatedef.fid; cof = 200 * fid / did; } else { t = 0x10; - fid = pstate.bits.fid; + fid = pstate.pstate.fid; if (family == 0x11) t = 0x8; cof = (100 * (fid + t)) >> did; @@ -89,7 +91,7 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, int boost_states, unsigned long *pstates, int *no) { int i, psmax, pscur; - union msr_pstate pstate; + union core_pstate pstate; unsigned long long val; /* Only read out frequencies from HW when CPU might be boostable @@ -119,9 +121,9 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, } if (read_msr(cpu, MSR_AMD_PSTATE + i, &pstate.val)) return -1; - if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en)) + if ((cpu_family == 0x17) && (!pstate.pstatedef.en)) continue; - else if (!pstate.bits.en) + else if (!pstate.pstate.en) continue; pstates[i] = get_cof(cpu_family, pstate); -- Gitee From 0a8288ce0c097673fbfea2c6c89dc05a21f2461b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Jan 2021 11:34:42 -0600 Subject: [PATCH 050/257] cpupower: Correct macro name for CPB caps flag commit 7a136a8fcd7ef14c63d07667e81c4dcac77e0a13 upstream The name is Core Performance Boost (CPB) for the cpuid flag. Correct cpuid caps flag to use this name (instead of CBP). 
Signed-off-by: Robert Richter Signed-off-by: Nathan Fontenot Signed-off-by: Shuah Khan Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- tools/power/cpupower/utils/helpers/cpuid.c | 2 +- tools/power/cpupower/utils/helpers/helpers.h | 2 +- tools/power/cpupower/utils/helpers/misc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index 5cc39d4e23ed..ebb3cc2e23d0 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -130,7 +130,7 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info) cpu_info->vendor == X86_VENDOR_HYGON) { if (ext_cpuid_level >= 0x80000007 && (cpuid_edx(0x80000007) & (1 << 9))) - cpu_info->caps |= CPUPOWER_CAP_AMD_CBP; + cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; } if (cpu_info->vendor == X86_VENDOR_INTEL) { diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 357b19bb136e..421ce2464761 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -64,7 +64,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_INV_TSC 0x00000001 #define CPUPOWER_CAP_APERF 0x00000002 -#define CPUPOWER_CAP_AMD_CBP 0x00000004 +#define CPUPOWER_CAP_AMD_CPB 0x00000004 #define CPUPOWER_CAP_PERF_BIAS 0x00000008 #define CPUPOWER_CAP_HAS_TURBO_RATIO 0x00000010 #define CPUPOWER_CAP_IS_SNB 0x00000020 diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index f406adc40bad..3113e551bd8a 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -18,7 +18,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, if (ret) return ret; - if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CBP) { + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) { *support = 1; /* AMD Family 0x17 does not utilize PCI D18F4 like prior -- Gitee From 8721ad34db45e3c025430d2acd659d51c3f08c85 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:34:49 -0600 Subject: [PATCH 051/257] cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag commit a0255a76bf3a78d322adfe4eb4e73eb83998f61a upstream Add a check in get_cpu_info() for the ability to read frequencies from hardware and set the CPUPOWER_CAP_AMD_HW_PSTATE cpuid flag. The cpuid flag is set when CPUID_80000007_EDX[7] is set, which is all families >= 10h. The check excludes family 14h because HW pstate reporting was not implemented on family 14h. This is intended to reduce family checks in the main code paths. 
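[Illustration] A condensed, self-contained sketch of the capability detection this patch adds, mirroring the cpuid.c hunk below: edx stands for the EDX output of CPUID leaf 0x80000007, and the family 14h exclusion reflects the missing HW pstate reporting noted above.

	#define CPUPOWER_CAP_AMD_CPB		0x00000004
	#define CPUPOWER_CAP_AMD_HW_PSTATE	0x00000100

	static unsigned int amd_caps(unsigned int edx, unsigned int family)
	{
		unsigned int caps = 0;

		if (edx & (1 << 9))		/* Core Performance Boost */
			caps |= CPUPOWER_CAP_AMD_CPB;

		/* Bit 7: HW pstate readout; not implemented on family 0x14 */
		if ((edx & (1 << 7)) && family != 0x14)
			caps |= CPUPOWER_CAP_AMD_HW_PSTATE;

		return caps;
	}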
Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Reviewed-by: skhan@linuxfoundation.org Signed-off-by: Shuah Khan Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- tools/power/cpupower/utils/helpers/amd.c | 9 ++++----- tools/power/cpupower/utils/helpers/cpuid.c | 12 +++++++++--- tools/power/cpupower/utils/helpers/helpers.h | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 34368436bbd6..8b69c7ff639a 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -94,11 +94,10 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, union core_pstate pstate; unsigned long long val; - /* Only read out frequencies from HW when CPU might be boostable - to keep the code as short and clean as possible. - Otherwise frequencies are exported via ACPI tables. - */ - if (cpu_family < 0x10 || cpu_family == 0x14) + /* Only read out frequencies from HW if HW Pstate is supported, + * otherwise frequencies are exported via ACPI tables. + */ + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_HW_PSTATE)) return -1; if (read_msr(cpu, MSR_AMD_PSTATE_LIMIT, &val)) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index ebb3cc2e23d0..ebf8b28615a4 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -128,9 +128,15 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info) /* AMD or Hygon Boost state enable/disable register */ if (cpu_info->vendor == X86_VENDOR_AMD || cpu_info->vendor == X86_VENDOR_HYGON) { - if (ext_cpuid_level >= 0x80000007 && - (cpuid_edx(0x80000007) & (1 << 9))) - cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; + if (ext_cpuid_level >= 0x80000007) { + if (cpuid_edx(0x80000007) & (1 << 9)) + cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; + + if ((cpuid_edx(0x80000007) & (1 << 7)) && + cpu_info->family != 0x14) + /* HW pstate was not implemented in family 0x14 */ + cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE; + } } if (cpu_info->vendor == X86_VENDOR_INTEL) { diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 421ce2464761..39bc7d4286ed 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -69,6 +69,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_HAS_TURBO_RATIO 0x00000010 #define CPUPOWER_CAP_IS_SNB 0x00000020 #define CPUPOWER_CAP_INTEL_IDA 0x00000040 +#define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 #define CPUPOWER_AMD_CPBDIS 0x02000000 -- Gitee From 0ba60fcafeff0238dd5e5eb98c650602508d2424 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:35:17 -0600 Subject: [PATCH 052/257] cpupower: Update family checks when decoding HW pstates commit 23765b82a808da416b70b41d711468e723531e6a upstream The family checks in get_cof() and get_did() need to use the correct MSR format depending on the family. Add a cpupower capability for using the pstatedef (family 17h and newer) to control this instead of direct family checks. 
Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- tools/power/cpupower/utils/helpers/amd.c | 8 ++++---- tools/power/cpupower/utils/helpers/cpuid.c | 6 +++++- tools/power/cpupower/utils/helpers/helpers.h | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 8b69c7ff639a..4c0fb37517f8 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -45,10 +45,10 @@ static int get_did(int family, union core_pstate pstate) { int t; - if (family == 0x12) - t = pstate.val & 0xf; - else if (family == 0x17 || family == 0x18) + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) t = pstate.pstatedef.did; + else if (family == 0x12) + t = pstate.val & 0xf; else t = pstate.pstate.did; @@ -61,7 +61,7 @@ static int get_cof(int family, union core_pstate pstate) int fid, did, cof; did = get_did(family, pstate); - if (family == 0x17 || family == 0x18) { + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) { fid = pstate.pstatedef.fid; cof = 200 * fid / did; } else { diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index ebf8b28615a4..929f1dcb1695 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -133,9 +133,13 @@ int get_cpu_info(struct cpupower_cpu_info *cpu_info) cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; if ((cpuid_edx(0x80000007) & (1 << 7)) && - cpu_info->family != 0x14) + cpu_info->family != 0x14) { /* HW pstate was not implemented in family 0x14 */ cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE; + + if (cpu_info->family >= 0x17) + cpu_info->caps |= CPUPOWER_CAP_AMD_PSTATEDEF; + } } } diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 39bc7d4286ed..7d2b510c0d7e 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -70,6 +70,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_IS_SNB 0x00000020 #define CPUPOWER_CAP_INTEL_IDA 0x00000040 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 +#define CPUPOWER_CAP_AMD_PSTATEDEF 0x00000200 #define CPUPOWER_AMD_CPBDIS 0x02000000 -- Gitee From 38f307ea253e525a16662234ba41441a192b80b9 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Tue, 30 Apr 2024 14:07:06 +0530 Subject: [PATCH 053/257] tools/power/cpupower: Fix Pstate frequency reporting on AMD Family 1Ah CPUs commit 43cad521c6d228ea0c51e248f8e5b3a6295a2849 upstream Update cpupower's P-State frequency calculation and reporting with AMD Family 1Ah+ processors, when using the acpi-cpufreq driver. This is due to a change in the PStateDef MSR layout in AMD Family 1Ah+. 
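[Illustration] A simplified decode of the core frequency (CoF, in MHz) across the three PStateDef layouts handled by amd.c after this patch. Bit positions follow the union definitions in the diff below; the did==0 guard and the family 11h/12h quirks are deliberately elided, so treat this as a sketch rather than a drop-in replacement for get_cof().

	static int core_freq_mhz(unsigned long long val, unsigned int family)
	{
		if (family >= 0x1a) {
			unsigned int fid = val & 0xfff;		/* fid: bits 11:0 */

			return fid > 0x0f ? fid * 5 : 0;	/* CoF = fid * 5 MHz */
		} else if (family >= 0x17) {
			unsigned int fid = val & 0xff;		/* fid: bits 7:0 */
			unsigned int did = (val >> 8) & 0x3f;	/* did: bits 13:8 */

			return 200 * fid / did;
		} else {
			unsigned int fid = val & 0x3f;		/* fid: bits 5:0 */
			unsigned int did = (val >> 6) & 0x7;	/* did: bits 8:6 */

			return (100 * (fid + 0x10)) >> did;
		}
	}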
Tested on 4th and 5th Gen AMD EPYC systems Signed-off-by: Ananth Narayan Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Signed-off-by: Shuah Khan Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- tools/power/cpupower/utils/helpers/amd.c | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 4c0fb37517f8..6c693a026b83 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -38,6 +38,16 @@ union core_pstate { unsigned res1:31; unsigned en:1; } pstatedef; + /* since fam 1Ah: */ + struct { + unsigned fid:12; + unsigned res1:2; + unsigned vid:8; + unsigned iddval:8; + unsigned idddiv:2; + unsigned res2:31; + unsigned en:1; + } pstatedef2; unsigned long long val; }; @@ -45,6 +55,10 @@ static int get_did(int family, union core_pstate pstate) { int t; + /* Fam 1Ah onward do not use did */ + if (cpupower_cpu_info.family >= 0x1A) + return 0; + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) t = pstate.pstatedef.did; else if (family == 0x12) @@ -58,12 +72,18 @@ static int get_did(int family, union core_pstate pstate) static int get_cof(int family, union core_pstate pstate) { int t; - int fid, did, cof; + int fid, did, cof = 0; did = get_did(family, pstate); if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) { - fid = pstate.pstatedef.fid; - cof = 200 * fid / did; + if (cpupower_cpu_info.family >= 0x1A) { + fid = pstate.pstatedef2.fid; + if (fid > 0x0f) + cof = (fid * 5); + } else { + fid = pstate.pstatedef.fid; + cof = 200 * fid / did; + } } else { t = 0x10; fid = pstate.pstate.fid; -- Gitee From b799c10fe814f7113eb4fe20891e0ec616b1fba1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Mar 2020 17:34:32 -0800 Subject: [PATCH 054/257] KVM: x86: Add helpers to perform CPUID-based guest vendor check commit 15608ed03f10a1414188612b0ee9653e3c78bbd9 upstream Add helpers to provide CPUID-based guest vendor checks, i.e. to do the ugly register comparisons. Use the new helpers to check for an AMD guest vendor in guest_cpuid_is_amd() as well as in the existing emulator flows. Using the new helpers fixes a _very_ theoretical bug where guest_cpuid_is_amd() would get a false positive on a non-AMD virtual CPU with a vendor string beginning with "Auth" due to the previous logic only checking EBX. It also fixes a marginally less theoretical bug where guest_cpuid_is_amd() would incorrectly return false for a guest CPU with "AMDisbetter!" as its vendor string. [Backport Changes] 1. In the current commit, changes intended for arch/x86/kvm/kvm_emulate.h were instead applied to the existing header file: arch/x86/include/asm/kvm_emulate.h, since the upstream commit 2f728d66e8a7 (KVM: x86: Move kvm_emulate.h into KVM's private directory), which relocates kvm_emulate.h as part of a broader header reorganization, has not been applied in the current source tree. Backporting commit 2f728d66e8a7 introduces additional dependencies and unnecessary conflicts. So, the necessary changes to kvm_emulate.h were applied at its original location, maintaining compatibility with the existing structure. 
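[Illustration] The "ugly register comparisons" operate on the vendor string as CPUID leaf 0 returns it: twelve bytes packed little-endian into EBX, EDX, ECX (in that order). The sketch below shows how "AuthenticAMD" maps onto the X86EMUL_CPUID_VENDOR_AuthenticAMD_* constants in the diff that follows, and why checking EBX alone (just "Auth") could false-positive.

	#include <stdio.h>
	#include <string.h>

	static unsigned int pack_le(const char *s)
	{
		unsigned int v;

		memcpy(&v, s, 4);	/* assumes a little-endian host */
		return v;
	}

	int main(void)
	{
		const char *vendor = "AuthenticAMD";

		/* Expected: ebx=0x68747541 edx=0x69746e65 ecx=0x444d4163 */
		printf("ebx=0x%08x edx=0x%08x ecx=0x%08x\n",
		       pack_le(vendor), pack_le(vendor + 4), pack_le(vendor + 8));
		return 0;
	}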
Fixes: a0c0feb57992c ("KVM: x86: reserve bit 8 of non-leaf PDPEs and PML4Es in 64-bit mode on AMD") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_emulate.h | 24 ++++++++++++++++++++ arch/x86/kvm/cpuid.h | 2 +- arch/x86/kvm/emulate.c | 36 +++++++----------------------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 77cf6c11f66b..757d66bb848d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -374,6 +374,30 @@ struct x86_emulate_ctxt { #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 +static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx) +{ + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; +} + +static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx) +{ + return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) || + (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx); +} + +static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx) +{ + return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && + ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && + edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx; +} + enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 7dec43b2c420..878e10d582f1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -118,7 +118,7 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0, 0); - return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; + return best && is_guest_vendor_amd(best->ebx, best->ecx, best->edx); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1a9fa2903852..f116f56ab83b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2756,9 +2756,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) eax = ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx - && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx - && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; + return is_guest_vendor_intel(ebx, ecx, edx); } static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) @@ -2777,34 +2775,16 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) ecx = 0x00000000; ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. + * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a + * 64bit guest with a 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD response - CPUs of + * AMD can't behave like Intel. 
*/ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + if (is_guest_vendor_intel(ebx, ecx, edx)) return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - - /* Hygon ("HygonGenuine") */ - if (ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && - ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && - edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx) + if (is_guest_vendor_amd(ebx, ecx, edx) || + is_guest_vendor_hygon(ebx, ecx, edx)) return true; /* -- Gitee From 589504f380808cedea70c8938f770d054d5bdad5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Mar 2020 17:34:33 -0800 Subject: [PATCH 055/257] KVM x86: Extend AMD specific guest behavior to Hygon virtual CPUs commit 23493d0a1731205a4bccaa0e283da642ca6f6e29 upstream Extend guest_cpuid_is_amd() to cover Hygon virtual CPUs and rename it accordingly. Hygon CPUs use an AMD-based core and so have the same basic behavior as AMD CPUs. Fixes: b8f4abb652146 ("x86/kvm: Add Hygon Dhyana support to KVM") Cc: Pu Wen Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 6 ++++-- arch/x86/kvm/mmu/mmu.c | 3 ++- arch/x86/kvm/x86.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index db3838667466..b693067f809e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1024,7 +1024,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * requested. AMD CPUID semantics returns all zeroes for any * undefined leaf, whether or not the leaf is in range. 
*/ - if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + if (!entry && check_limit && !guest_cpuid_is_amd_or_hygon(vcpu) && !cpuid_function_in_range(vcpu, function)) { max = kvm_find_cpuid_entry(vcpu, 0, 0); if (max) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 878e10d582f1..facab37dcc5c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -113,12 +113,14 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8 *reg &= ~bit(x86_feature); } -static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) +static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0, 0); - return best && is_guest_vendor_amd(best->ebx, best->ecx, best->edx); + return best && + (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || + is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b5be61336695..9b7c92cfb3f5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4648,7 +4648,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, cpuid_maxphyaddr(vcpu), context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), guest_cpuid_is_amd(vcpu)); + is_pse(vcpu), + guest_cpuid_is_amd_or_hygon(vcpu)); } static void diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2b1d8081121..33215ddcf299 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2535,7 +2535,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; -- Gitee From f8c34d0682be3794d6f371638ddcb307d0f921e5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 25 Sep 2023 17:34:47 +0000 Subject: [PATCH 056/257] KVM: x86: Mask LVTPC when handling a PMI commit a16eb25b09c02a54c1c1b449d4b6cfa2cf3f013a upstream Per the SDM, "When the local APIC handles a performance-monitoring counters interrupt, it automatically sets the mask flag in the LVT performance counter register." Add this behavior to KVM's local APIC emulation. Failure to mask the LVTPC entry results in spurious PMIs, e.g. when running Linux as a guest, PMI handlers that do a "late_ack" spew a large number of "dazed and confused" spurious NMI warnings. 
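[Illustration] A minimal model of the SDM behavior being emulated, assuming only the APIC_LVT_MASKED bit position from apicdef.h: once a PMI is delivered through the LVT performance counter entry, the local APIC sets the mask bit, and no further PMIs arrive until the guest's handler clears it. The delivery function is a stand-in, not KVM code.

	#define APIC_LVT_MASKED		(1 << 16)

	static unsigned int lvtpc;	/* stand-in for the emulated LVTPC register */

	static void deliver_pmi(void)
	{
		if (lvtpc & APIC_LVT_MASKED)
			return;			/* entry masked: PMI is not delivered */

		/* ...inject the vector into the vCPU here... */

		lvtpc |= APIC_LVT_MASKED;	/* auto-mask on delivery, per the SDM */
	}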
Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") Cc: stable@vger.kernel.org Signed-off-by: Jim Mattson Tested-by: Mingwei Zhang Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20230925173448.3518223-3-mizhang@google.com [sean: massage changelog, correct Fixes] Signed-off-by: Sean Christopherson Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/kvm/lapic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 23480d8e4ef1..319ed873a111 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2244,13 +2244,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; + int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + + r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + if (r && lvt_type == APIC_LVTPC) + kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); + return r; } return 0; } -- Gitee From c5d076e0601dd87659f496b919c821904b5461de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 5 Apr 2024 16:55:54 -0700 Subject: [PATCH 057/257] KVM: x86: Snapshot if a vCPU's vendor model is AMD vs. Intel compatible commit fd706c9b1674e2858766bfbf7430534c2b26fbef upstream Add kvm_vcpu_arch.is_amd_compatible to cache if a vCPU's vendor model is compatible with AMD, i.e. if the vCPU vendor is AMD or Hygon, along with helpers to check if a vCPU is compatible AMD vs. Intel. To handle Intel vs. AMD behavior related to masking the LVTPC entry, KVM will need to check for vendor compatibility on every PMI injection, i.e. querying for AMD will soon be a moderately hot path. Note! This subtly (or maybe not-so-subtly) makes "Intel compatible" KVM's default behavior, both if userspace omits (or never sets) CPUID 0x0 and if userspace sets a completely unknown vendor. One could argue that KVM should treat such vCPUs as not being compatible with Intel *or* AMD, but that would add useless complexity to KVM. KVM needs to do *something* in the face of vendor specific behavior, and so unless KVM conjured up a magic third option, choosing to treat unknown vendors as neither Intel nor AMD means that checks on AMD compatibility would yield Intel behavior, and checks for Intel compatibility would yield AMD behavior. And that's far worse as it would effectively yield random behavior depending on whether KVM checked for AMD vs. Intel vs. !AMD vs. !Intel. And practically speaking, all x86 CPUs follow either Intel or AMD architecture, i.e. "supporting" an unknown third architecture adds no value. Deliberately don't convert any of the existing guest_cpuid_is_intel() checks, as the Intel side of things is messier due to some flows explicitly checking for exactly vendor==Intel, versus some flows assuming anything that isn't "AMD compatible" gets Intel behavior. The Intel code will be cleaned up in the future. 
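[Illustration] The consequence of caching the flag as a single boolean, sketched with stand-in types: because "Intel compatible" is defined as the complement of "AMD compatible", a vCPU with an unset or unknown vendor string defaults to Intel behavior, exactly as the message above argues.

	struct vcpu { int is_amd_compatible; };

	static int guest_is_amd_compatible(const struct vcpu *v)
	{
		return v->is_amd_compatible;	/* cached: vendor is AMD or Hygon */
	}

	static int guest_is_intel_compatible(const struct vcpu *v)
	{
		return !guest_is_amd_compatible(v);	/* unknown vendors land here */
	}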
Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/cpuid.h | 10 ++++++++++ arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 09cfc9c36c2d..a324193effda 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -648,6 +648,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + bool is_amd_compatible; int maxphyaddr; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b693067f809e..9c28cc22dae0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -144,6 +144,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) } } + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); kvm_mmu_reset_context(vcpu); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index facab37dcc5c..d732dfa8dea0 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -123,6 +123,16 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); } +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.is_amd_compatible; +} + +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) +{ + return !guest_cpuid_is_amd_compatible(vcpu); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9b7c92cfb3f5..15d41308590a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4649,7 +4649,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33215ddcf299..a99588f68c99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2535,7 +2535,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; -- Gitee From eda39e3ae678810fbe6cdeb84cb803e1b8055f40 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 5 Apr 2024 16:55:55 -0700 Subject: [PATCH 058/257] KVM: x86/pmu: Do not mask LVTPC when handling a PMI on AMD platforms commit 49ff3b4aec51e3abfc9369997cc603319b02af9a upstream On AMD and Hygon platforms, the local APIC does not automatically set the mask bit of the LVTPC register when handling a PMI and there is no need to clear it in the kernel's PMI handler. For guests, the mask bit is currently set by kvm_apic_local_deliver() and unless it is cleared by the guest kernel's PMI handler, PMIs stop arriving and break use-cases like sampling with perf record. This does not affect non-PerfMonV2 guests because PMIs are handled in the guest kernel by x86_pmu_handle_irq() which always clears the LVTPC mask bit irrespective of the vendor. 
Before: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (1 samples) ] After: $ perf record -e cycles:u true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (19 samples) ] Fixes: a16eb25b09c0 ("KVM: x86: Mask LVTPC when handling a PMI") Cc: stable@vger.kernel.org Signed-off-by: Sandipan Das Reviewed-by: Jim Mattson [sean: use is_intel_compatible instead of !is_amd_or_hygon()] Signed-off-by: Sean Christopherson Message-ID: <20240405235603.1173076-3-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/kvm/lapic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 319ed873a111..6691729ef081 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2252,7 +2252,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } -- Gitee From 9c2f8f448aa370badf26b6122b0c4fc95c5669dc Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Thu, 9 Jan 2020 18:23:18 +0530 Subject: [PATCH 059/257] tee: amdtee: remove unused variable initialization commit 5ae63958a6dea78467c95f1669a4f0affa59935b upstream Remove unused variable initialization from driver code. If enabled as a compiler option, compiler may throw warning for unused assignments. Reported-by: Dan Carpenter Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Signed-off-by: Rijo Thomas Signed-off-by: Herbert Xu Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/call.c | 14 +++++++------- drivers/tee/amdtee/core.c | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c index 87ccad256686..096dd4d92d39 100644 --- a/drivers/tee/amdtee/call.c +++ b/drivers/tee/amdtee/call.c @@ -124,8 +124,8 @@ static int amd_params_to_tee_params(struct tee_param *tee, u32 count, int handle_unload_ta(u32 ta_handle) { struct tee_cmd_unload_ta cmd = {0}; - int ret = 0; u32 status; + int ret; if (!ta_handle) return -EINVAL; @@ -145,8 +145,8 @@ int handle_unload_ta(u32 ta_handle) int handle_close_session(u32 ta_handle, u32 info) { struct tee_cmd_close_session cmd = {0}; - int ret = 0; u32 status; + int ret; if (ta_handle == 0) return -EINVAL; @@ -167,8 +167,8 @@ int handle_close_session(u32 ta_handle, u32 info) void handle_unmap_shmem(u32 buf_id) { struct tee_cmd_unmap_shared_mem cmd = {0}; - int ret = 0; u32 status; + int ret; cmd.buf_id = buf_id; @@ -183,7 +183,7 @@ int handle_invoke_cmd(struct tee_ioctl_invoke_arg *arg, u32 sinfo, struct tee_param *p) { struct tee_cmd_invoke_cmd cmd = {0}; - int ret = 0; + int ret; if (!arg || (!p && arg->num_params)) return -EINVAL; @@ -229,7 +229,7 @@ int handle_map_shmem(u32 count, struct shmem_desc *start, u32 *buf_id) { struct tee_cmd_map_shared_mem *cmd; phys_addr_t paddr; - int ret = 0, i; + int ret, i; u32 status; if (!count || !start || !buf_id) @@ -294,7 +294,7 @@ int handle_open_session(struct tee_ioctl_open_session_arg *arg, u32 *info, struct tee_param *p) { struct tee_cmd_open_session cmd = {0}; - int ret = 0; + int ret; if (!arg || !info || (!p && 
arg->num_params)) return -EINVAL; @@ -342,7 +342,7 @@ int handle_load_ta(void *data, u32 size, struct tee_ioctl_open_session_arg *arg) { struct tee_cmd_load_ta cmd = {0}; phys_addr_t blob; - int ret = 0; + int ret; if (size == 0 || !data || !arg) return -EINVAL; diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index dd360f378ef0..d768774e7875 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -49,7 +49,7 @@ static int amdtee_open(struct tee_context *ctx) static void release_session(struct amdtee_session *sess) { - int i = 0; + int i; /* Close any open session */ for (i = 0; i < TEE_NUM_SESSIONS; ++i) { @@ -172,7 +172,7 @@ static int copy_ta_binary(struct tee_context *ctx, void *ptr, void **ta, u16 hi_ver; u8 seq_n[8]; } *uuid = ptr; - int n = 0, rc = 0; + int n, rc = 0; n = snprintf(fw_name, TA_PATH_MAX, "%s/%08x-%04x-%04x-%02x%02x%02x%02x%02x%02x%02x%02x.bin", @@ -218,9 +218,9 @@ int amdtee_open_session(struct tee_context *ctx, struct amdtee_context_data *ctxdata = ctx->data; struct amdtee_session *sess = NULL; u32 session_info; - void *ta = NULL; size_t ta_size; - int rc = 0, i; + int rc, i; + void *ta; if (arg->clnt_login != TEE_IOCTL_LOGIN_PUBLIC) { pr_err("unsupported client login method\n"); @@ -367,8 +367,8 @@ int amdtee_map_shmem(struct tee_shm *shm) void amdtee_unmap_shmem(struct tee_shm *shm) { + struct amdtee_shm_data *shmnode; u32 buf_id; - struct amdtee_shm_data *shmnode = NULL; if (!shm) return; @@ -433,9 +433,9 @@ static const struct tee_desc amdtee_desc = { static int __init amdtee_driver_init(void) { - struct amdtee *amdtee = NULL; struct tee_device *teedev; - struct tee_shm_pool *pool = ERR_PTR(-EINVAL); + struct tee_shm_pool *pool; + struct amdtee *amdtee; int rc; drv_data = kzalloc(sizeof(*drv_data), GFP_KERNEL); -- Gitee From d5ad8bc691b1fd7bfc015e7c00d4878fee9322c9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jan 2020 15:48:52 +0000 Subject: [PATCH 060/257] tee: fix memory allocation failure checks on drv_data and amdtee commit 48d625e4c4cec813cdd1e439864a8ffc0b5081f1 upstream Currently the memory allocation failure checks on drv_data and amdtee are using IS_ERR rather than checking for a null pointer. Fix these checks to use the conventional null pointer check. 
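[Illustration] Why the original check never fired, in miniature: kzalloc() reports failure with a NULL pointer, while IS_ERR() only recognizes the topmost 4095 pointer values used for encoded errnos (simplified here from linux/err.h), so the failed allocation sailed past the old test and was dereferenced anyway.

	#define MAX_ERRNO	4095

	static inline int is_err(const void *ptr)	/* simplified IS_ERR() */
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	/*
	 * is_err(NULL) == 0: an allocation failure is not an error pointer,
	 * which is why the fix switches to the plain !ptr check.
	 */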
Addresses-Coverity: ("Dereference null return") Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Signed-off-by: Colin Ian King Reviewed-by: Rijo Thomas Acked-by: Jens Wiklander Signed-off-by: Herbert Xu Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index d768774e7875..5629c46b5e80 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -439,11 +439,11 @@ static int __init amdtee_driver_init(void) int rc; drv_data = kzalloc(sizeof(*drv_data), GFP_KERNEL); - if (IS_ERR(drv_data)) + if (!drv_data) return -ENOMEM; amdtee = kzalloc(sizeof(*amdtee), GFP_KERNEL); - if (IS_ERR(amdtee)) { + if (!amdtee) { rc = -ENOMEM; goto err_kfree_drv_data; } -- Gitee From 529f886a527c96b4dd92263da4d1759ec5eee9ef Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Wed, 22 Jan 2020 17:12:38 +0800 Subject: [PATCH 061/257] tee: amdtee: amdtee depends on CRYPTO_DEV_CCP_DD commit 872d92dec353a8d30fa186892cd5ea3e17ca75d3 upstream If CRYPTO_DEV_CCP_DD=m and AMDTEE=y, the following error is seen while building call.c or core.c drivers/tee/amdtee/call.o: In function `handle_unload_ta': call.c:(.text+0x35f): undefined reference to `psp_tee_process_cmd' drivers/tee/amdtee/core.o: In function `amdtee_driver_init': core.c:(.init.text+0xf): undefined reference to `psp_check_tee_status Fix the config dependency for AMDTEE here. Reported-by: Hulk Robot Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Signed-off-by: Hongbo Yao Reviewed-by: Rijo Thomas Acked-by: Jens Wiklander Signed-off-by: Herbert Xu Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tee/amdtee/Kconfig b/drivers/tee/amdtee/Kconfig index 4e32b6413b41..191f9715fa9a 100644 --- a/drivers/tee/amdtee/Kconfig +++ b/drivers/tee/amdtee/Kconfig @@ -3,6 +3,6 @@ config AMDTEE tristate "AMD-TEE" default m - depends on CRYPTO_DEV_SP_PSP + depends on CRYPTO_DEV_SP_PSP && CRYPTO_DEV_CCP_DD help This implements AMD's Trusted Execution Environment (TEE) driver. -- Gitee From 95c740439e58f4901ddafe076e39b263797e9a2d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Feb 2020 13:51:39 +0300 Subject: [PATCH 062/257] tee: amdtee: fix memory leak in amdtee_open_session() commit b83685bceedbeed33a6adc2d0579a011708d2b18 upstream On these error paths the "sess" variable isn't freed. It's a refcounted pointer so we need to call kref_put(). I re-arranged the code a bit so the error case is always handled before the success case and the error paths are indented two tabs. 
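[Illustration] The abstract shape of the fix, with a plain counter standing in for struct kref and calloc/free for the session lifetime: once the session has been allocated and published, every error path must drop its reference so the release path can free it, instead of just returning.

	#include <stdlib.h>

	struct session { int refcount; };

	static void put_session(struct session *s)
	{
		if (--s->refcount == 0)
			free(s);		/* stand-in for destroy_session() */
	}

	static int open_session(int open_failed)
	{
		struct session *s = calloc(1, sizeof(*s));

		if (!s)
			return -1;
		s->refcount = 1;

		if (open_failed) {		/* e.g. session table exhausted */
			put_session(s);		/* the fix: drop the ref, then bail */
			return -1;
		}
		return 0;
	}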
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Reviewed-by: Rijo Thomas Signed-off-by: Dan Carpenter Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 48 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 5629c46b5e80..73a69ecb633b 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -211,6 +211,19 @@ static int copy_ta_binary(struct tee_context *ctx, void *ptr, void **ta, return rc; } +static void destroy_session(struct kref *ref) +{ + struct amdtee_session *sess = container_of(ref, struct amdtee_session, + refcount); + + /* Unload the TA from TEE */ + handle_unload_ta(sess->ta_handle); + mutex_lock(&session_list_mutex); + list_del(&sess->list_node); + mutex_unlock(&session_list_mutex); + kfree(sess); +} + int amdtee_open_session(struct tee_context *ctx, struct tee_ioctl_open_session_arg *arg, struct tee_param *param) @@ -235,15 +248,13 @@ int amdtee_open_session(struct tee_context *ctx, /* Load the TA binary into TEE environment */ handle_load_ta(ta, ta_size, arg); - if (arg->ret == TEEC_SUCCESS) { - mutex_lock(&session_list_mutex); - sess = alloc_session(ctxdata, arg->session); - mutex_unlock(&session_list_mutex); - } - if (arg->ret != TEEC_SUCCESS) goto out; + mutex_lock(&session_list_mutex); + sess = alloc_session(ctxdata, arg->session); + mutex_unlock(&session_list_mutex); + if (!sess) { rc = -ENOMEM; goto out; @@ -258,40 +269,29 @@ int amdtee_open_session(struct tee_context *ctx, if (i >= TEE_NUM_SESSIONS) { pr_err("reached maximum session count %d\n", TEE_NUM_SESSIONS); + kref_put(&sess->refcount, destroy_session); rc = -ENOMEM; goto out; } /* Open session with loaded TA */ handle_open_session(arg, &session_info, param); - - if (arg->ret == TEEC_SUCCESS) { - sess->session_info[i] = session_info; - set_session_id(sess->ta_handle, i, &arg->session); - } else { + if (arg->ret != TEEC_SUCCESS) { pr_err("open_session failed %d\n", arg->ret); spin_lock(&sess->lock); clear_bit(i, sess->sess_mask); spin_unlock(&sess->lock); + kref_put(&sess->refcount, destroy_session); + goto out; } + + sess->session_info[i] = session_info; + set_session_id(sess->ta_handle, i, &arg->session); out: free_pages((u64)ta, get_order(ta_size)); return rc; } -static void destroy_session(struct kref *ref) -{ - struct amdtee_session *sess = container_of(ref, struct amdtee_session, - refcount); - - /* Unload the TA from TEE */ - handle_unload_ta(sess->ta_handle); - mutex_lock(&session_list_mutex); - list_del(&sess->list_node); - mutex_unlock(&session_list_mutex); - kfree(sess); -} - int amdtee_close_session(struct tee_context *ctx, u32 session) { struct amdtee_context_data *ctxdata = ctx->data; -- Gitee From 370c80393df33d6b64b99ab65c0151adb9d94a62 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 27 Feb 2020 19:19:54 +0300 Subject: [PATCH 063/257] tee: amdtee: out of bounds read in find_session() commit 36fa3e50085e3858dd506e4431b9abd1bcb1f542 upstream The "index" is a user provided value from 0-USHRT_MAX. If it's over TEE_NUM_SESSIONS (31) then it results in an out of bounds read when we call test_bit(index, sess->sess_mask). 
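[Illustration] The bug in miniature: "index" derives from a user-supplied session id, but the per-session bitmask only holds TEE_NUM_SESSIONS bits, so probing beyond that reads out of bounds. The open-coded test_bit() below is illustrative, not the kernel implementation.

	#define TEE_NUM_SESSIONS	31
	#define BITS_PER_LONG_		(8 * sizeof(unsigned long))

	static int session_in_use(const unsigned long *sess_mask, unsigned int index)
	{
		if (index >= TEE_NUM_SESSIONS)	/* the added guard */
			return 0;

		return (sess_mask[index / BITS_PER_LONG_] >>
			(index % BITS_PER_LONG_)) & 1;
	}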
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Acked-by: Rijo Thomas Signed-off-by: Dan Carpenter Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 73a69ecb633b..89f2226475e1 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -138,6 +138,9 @@ static struct amdtee_session *find_session(struct amdtee_context_data *ctxdata, u32 index = get_session_index(session); struct amdtee_session *sess; + if (index >= TEE_NUM_SESSIONS) + return NULL; + list_for_each_entry(sess, &ctxdata->sess_list, list_node) if (ta_handle == sess->ta_handle && test_bit(index, sess->sess_mask)) -- Gitee From a5abcba2ab977a97aedb2662b4b5dbe14d684d1c Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Nov 2020 11:56:09 +0530 Subject: [PATCH 064/257] tee: amdtee: fix memory leak due to reset of global shm list commit ff1f855804cdbbb6db7b9b6df6cab783d1a40d66 upstream The driver maintains a list of shared memory buffers along with their mapped buffer id's in a global linked list. These buffers need to be unmapped after use by the user-space client. The global shared memory list is initialized to zero entries in the function amdtee_open(). This clearing of list entries can be a source for memory leak on secure side if the global linked list previously held some mapped buffer entries allocated from another TEE context. Fix potential memory leak issue by moving global shared memory list to AMD-TEE driver context data structure. Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Reviewed-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/amdtee_private.h | 7 +++---- drivers/tee/amdtee/core.c | 18 +++++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/tee/amdtee/amdtee_private.h b/drivers/tee/amdtee/amdtee_private.h index d7f798c3394b..97df16a17285 100644 --- a/drivers/tee/amdtee/amdtee_private.h +++ b/drivers/tee/amdtee/amdtee_private.h @@ -64,9 +64,12 @@ struct amdtee_session { /** * struct amdtee_context_data - AMD-TEE driver context data * @sess_list: Keeps track of sessions opened in current TEE context + * @shm_list: Keeps track of buffers allocated and mapped in current TEE + * context */ struct amdtee_context_data { struct list_head sess_list; + struct list_head shm_list; }; struct amdtee_driver_data { @@ -89,10 +92,6 @@ struct amdtee_shm_data { u32 buf_id; }; -struct amdtee_shm_context { - struct list_head shmdata_list; -}; - #define LOWER_TWO_BYTE_MASK 0x0000FFFF /** diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 89f2226475e1..3238170dedc0 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -19,7 +19,6 @@ static struct amdtee_driver_data *drv_data; static DEFINE_MUTEX(session_list_mutex); -static struct amdtee_shm_context shmctx; static void amdtee_get_version(struct tee_device *teedev, struct tee_ioctl_version_data *vers) @@ -41,7 +40,7 @@ static int amdtee_open(struct tee_context *ctx) return -ENOMEM; INIT_LIST_HEAD(&ctxdata->sess_list); - INIT_LIST_HEAD(&shmctx.shmdata_list); + INIT_LIST_HEAD(&ctxdata->shm_list); ctx->data = ctxdata; return 0; @@ -151,10 +150,11 @@ static struct amdtee_session *find_session(struct amdtee_context_data *ctxdata, u32 get_buffer_id(struct tee_shm *shm) { - u32 buf_id = 0; + struct amdtee_context_data 
*ctxdata = shm->ctx->data; struct amdtee_shm_data *shmdata; + u32 buf_id = 0; - list_for_each_entry(shmdata, &shmctx.shmdata_list, shm_node) + list_for_each_entry(shmdata, &ctxdata->shm_list, shm_node) if (shmdata->kaddr == shm->kaddr) { buf_id = shmdata->buf_id; break; @@ -332,8 +332,9 @@ int amdtee_close_session(struct tee_context *ctx, u32 session) int amdtee_map_shmem(struct tee_shm *shm) { - struct shmem_desc shmem; + struct amdtee_context_data *ctxdata; struct amdtee_shm_data *shmnode; + struct shmem_desc shmem; int rc, count; u32 buf_id; @@ -361,7 +362,8 @@ int amdtee_map_shmem(struct tee_shm *shm) shmnode->kaddr = shm->kaddr; shmnode->buf_id = buf_id; - list_add(&shmnode->shm_node, &shmctx.shmdata_list); + ctxdata = shm->ctx->data; + list_add(&shmnode->shm_node, &ctxdata->shm_list); pr_debug("buf_id :[%x] kaddr[%p]\n", shmnode->buf_id, shmnode->kaddr); @@ -370,6 +372,7 @@ int amdtee_map_shmem(struct tee_shm *shm) void amdtee_unmap_shmem(struct tee_shm *shm) { + struct amdtee_context_data *ctxdata; struct amdtee_shm_data *shmnode; u32 buf_id; @@ -380,7 +383,8 @@ void amdtee_unmap_shmem(struct tee_shm *shm) /* Unmap the shared memory from TEE */ handle_unmap_shmem(buf_id); - list_for_each_entry(shmnode, &shmctx.shmdata_list, shm_node) + ctxdata = shm->ctx->data; + list_for_each_entry(shmnode, &ctxdata->shm_list, shm_node) if (buf_id == shmnode->buf_id) { list_del(&shmnode->shm_node); kfree(shmnode); -- Gitee From a20c813f2ecdfe040ece7e2f8814d28d4f0acabf Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Nov 2020 11:56:10 +0530 Subject: [PATCH 065/257] tee: amdtee: synchronize access to shm list commit be353be27874f40837327d9a39e3ad2149ab66d3 upstream Synchronize access to shm or shared memory buffer list to prevent race conditions due to concurrent updates to shared shm list by multiple threads. 
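[Illustration] A user-space model of the locking pattern this patch introduces, using a pthread mutex and a singly linked list in place of the kernel primitives: every walk of the per-context shm list happens under shm_mutex, so a concurrent unmap cannot free a node mid-traversal.

	#include <pthread.h>

	struct shm_data { struct shm_data *next; void *kaddr; unsigned int buf_id; };

	struct tee_ctx {
		pthread_mutex_t shm_mutex;
		struct shm_data *shm_list;
	};

	static unsigned int lookup_buffer_id(struct tee_ctx *c, const void *kaddr)
	{
		struct shm_data *n;
		unsigned int buf_id = 0;

		pthread_mutex_lock(&c->shm_mutex);
		for (n = c->shm_list; n; n = n->next)
			if (n->kaddr == kaddr) {
				buf_id = n->buf_id;
				break;
			}
		pthread_mutex_unlock(&c->shm_mutex);

		return buf_id;
	}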
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Reviewed-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/amdtee_private.h | 1 + drivers/tee/amdtee/core.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/tee/amdtee/amdtee_private.h b/drivers/tee/amdtee/amdtee_private.h index 97df16a17285..337c8d82f74e 100644 --- a/drivers/tee/amdtee/amdtee_private.h +++ b/drivers/tee/amdtee/amdtee_private.h @@ -70,6 +70,7 @@ struct amdtee_session { struct amdtee_context_data { struct list_head sess_list; struct list_head shm_list; + struct mutex shm_mutex; /* synchronizes access to @shm_list */ }; struct amdtee_driver_data { diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 3238170dedc0..c0fe240d7042 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -41,6 +41,7 @@ static int amdtee_open(struct tee_context *ctx) INIT_LIST_HEAD(&ctxdata->sess_list); INIT_LIST_HEAD(&ctxdata->shm_list); + mutex_init(&ctxdata->shm_mutex); ctx->data = ctxdata; return 0; @@ -84,6 +85,7 @@ static void amdtee_release(struct tee_context *ctx) list_del(&sess->list_node); release_session(sess); } + mutex_destroy(&ctxdata->shm_mutex); kfree(ctxdata); ctx->data = NULL; @@ -154,11 +156,13 @@ u32 get_buffer_id(struct tee_shm *shm) struct amdtee_shm_data *shmdata; u32 buf_id = 0; + mutex_lock(&ctxdata->shm_mutex); list_for_each_entry(shmdata, &ctxdata->shm_list, shm_node) if (shmdata->kaddr == shm->kaddr) { buf_id = shmdata->buf_id; break; } + mutex_unlock(&ctxdata->shm_mutex); return buf_id; } @@ -363,7 +367,9 @@ int amdtee_map_shmem(struct tee_shm *shm) shmnode->kaddr = shm->kaddr; shmnode->buf_id = buf_id; ctxdata = shm->ctx->data; + mutex_lock(&ctxdata->shm_mutex); list_add(&shmnode->shm_node, &ctxdata->shm_list); + mutex_unlock(&ctxdata->shm_mutex); pr_debug("buf_id :[%x] kaddr[%p]\n", shmnode->buf_id, shmnode->kaddr); @@ -384,12 +390,14 @@ void amdtee_unmap_shmem(struct tee_shm *shm) handle_unmap_shmem(buf_id); ctxdata = shm->ctx->data; + mutex_lock(&ctxdata->shm_mutex); list_for_each_entry(shmnode, &ctxdata->shm_list, shm_node) if (buf_id == shmnode->buf_id) { list_del(&shmnode->shm_node); kfree(shmnode); break; } + mutex_unlock(&ctxdata->shm_mutex); } int amdtee_invoke_func(struct tee_context *ctx, -- Gitee From 36bbc8c9263976bbae981e921d6b730b5378b291 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 14 Apr 2021 23:08:27 +0530 Subject: [PATCH 066/257] tee: amdtee: unload TA only when its refcount becomes 0 commit 9f015b3765bf593b3ed5d3b588e409dc0ffa9f85 upstream Same Trusted Application (TA) can be loaded in multiple TEE contexts. If it is a single instance TA, the TA should not get unloaded from AMD Secure Processor, while it is still in use in another TEE context. Therefore reference count TA and unload it when the count becomes zero. 
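[Illustration] A condensed model of the reference counting added in call.c below; the list bookkeeping and ta_refcount_mutex are elided, and unload_ta() here is a stand-in for handle_unload_ta() issuing TEE_CMD_ID_UNLOAD_TA.

	struct ta_data { unsigned int ta_handle; unsigned int refcount; };

	/* Returns the remaining count; 0 means the caller may unload the TA. */
	static unsigned int put_ta_ref(struct ta_data *ta)
	{
		return --ta->refcount;
	}

	static void unload_ta(struct ta_data *ta)
	{
		if (put_ta_ref(ta))
			return;		/* still in use by another context */

		/* ...send TEE_CMD_ID_UNLOAD_TA to the AMD Secure Processor... */
	}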
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Reviewed-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Dan Carpenter Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/amdtee_private.h | 13 ++++ drivers/tee/amdtee/call.c | 94 ++++++++++++++++++++++++++--- drivers/tee/amdtee/core.c | 15 +++-- 3 files changed, 106 insertions(+), 16 deletions(-) diff --git a/drivers/tee/amdtee/amdtee_private.h b/drivers/tee/amdtee/amdtee_private.h index 337c8d82f74e..6d0f7062bb87 100644 --- a/drivers/tee/amdtee/amdtee_private.h +++ b/drivers/tee/amdtee/amdtee_private.h @@ -21,6 +21,7 @@ #define TEEC_SUCCESS 0x00000000 #define TEEC_ERROR_GENERIC 0xFFFF0000 #define TEEC_ERROR_BAD_PARAMETERS 0xFFFF0006 +#define TEEC_ERROR_OUT_OF_MEMORY 0xFFFF000C #define TEEC_ERROR_COMMUNICATION 0xFFFF000E #define TEEC_ORIGIN_COMMS 0x00000002 @@ -93,6 +94,18 @@ struct amdtee_shm_data { u32 buf_id; }; +/** + * struct amdtee_ta_data - Keeps track of all TAs loaded in AMD Secure + * Processor + * @ta_handle: Handle to TA loaded in TEE + * @refcount: Reference count for the loaded TA + */ +struct amdtee_ta_data { + struct list_head list_node; + u32 ta_handle; + u32 refcount; +}; + #define LOWER_TWO_BYTE_MASK 0x0000FFFF /** diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c index 096dd4d92d39..07f36ac834c8 100644 --- a/drivers/tee/amdtee/call.c +++ b/drivers/tee/amdtee/call.c @@ -121,15 +121,69 @@ static int amd_params_to_tee_params(struct tee_param *tee, u32 count, return ret; } +static DEFINE_MUTEX(ta_refcount_mutex); +static struct list_head ta_list = LIST_HEAD_INIT(ta_list); + +static u32 get_ta_refcount(u32 ta_handle) +{ + struct amdtee_ta_data *ta_data; + u32 count = 0; + + /* Caller must hold a mutex */ + list_for_each_entry(ta_data, &ta_list, list_node) + if (ta_data->ta_handle == ta_handle) + return ++ta_data->refcount; + + ta_data = kzalloc(sizeof(*ta_data), GFP_KERNEL); + if (ta_data) { + ta_data->ta_handle = ta_handle; + ta_data->refcount = 1; + count = ta_data->refcount; + list_add(&ta_data->list_node, &ta_list); + } + + return count; +} + +static u32 put_ta_refcount(u32 ta_handle) +{ + struct amdtee_ta_data *ta_data; + u32 count = 0; + + /* Caller must hold a mutex */ + list_for_each_entry(ta_data, &ta_list, list_node) + if (ta_data->ta_handle == ta_handle) { + count = --ta_data->refcount; + if (count == 0) { + list_del(&ta_data->list_node); + kfree(ta_data); + break; + } + } + + return count; +} + int handle_unload_ta(u32 ta_handle) { struct tee_cmd_unload_ta cmd = {0}; - u32 status; + u32 status, count; int ret; if (!ta_handle) return -EINVAL; + mutex_lock(&ta_refcount_mutex); + + count = put_ta_refcount(ta_handle); + + if (count) { + pr_debug("unload ta: not unloading %u count %u\n", + ta_handle, count); + ret = -EBUSY; + goto unlock; + } + cmd.ta_handle = ta_handle; ret = psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, (void *)&cmd, @@ -137,8 +191,12 @@ int handle_unload_ta(u32 ta_handle) if (!ret && status != 0) { pr_err("unload ta: status = 0x%x\n", status); ret = -EBUSY; + } else { + pr_debug("unloaded ta handle %u\n", ta_handle); } +unlock: + mutex_unlock(&ta_refcount_mutex); return ret; } @@ -340,7 +398,8 @@ int handle_open_session(struct tee_ioctl_open_session_arg *arg, u32 *info, int handle_load_ta(void *data, u32 size, struct tee_ioctl_open_session_arg *arg) { - struct tee_cmd_load_ta cmd = {0}; + struct tee_cmd_unload_ta unload_cmd = {}; + struct tee_cmd_load_ta load_cmd = {}; phys_addr_t blob; int ret; @@ -353,21 +412,36 @@ 
int handle_load_ta(void *data, u32 size, struct tee_ioctl_open_session_arg *arg) return -EINVAL; } - cmd.hi_addr = upper_32_bits(blob); - cmd.low_addr = lower_32_bits(blob); - cmd.size = size; + load_cmd.hi_addr = upper_32_bits(blob); + load_cmd.low_addr = lower_32_bits(blob); + load_cmd.size = size; - ret = psp_tee_process_cmd(TEE_CMD_ID_LOAD_TA, (void *)&cmd, - sizeof(cmd), &arg->ret); + mutex_lock(&ta_refcount_mutex); + + ret = psp_tee_process_cmd(TEE_CMD_ID_LOAD_TA, (void *)&load_cmd, + sizeof(load_cmd), &arg->ret); if (ret) { arg->ret_origin = TEEC_ORIGIN_COMMS; arg->ret = TEEC_ERROR_COMMUNICATION; - } else { - set_session_id(cmd.ta_handle, 0, &arg->session); + } else if (arg->ret == TEEC_SUCCESS) { + ret = get_ta_refcount(load_cmd.ta_handle); + if (!ret) { + arg->ret_origin = TEEC_ORIGIN_COMMS; + arg->ret = TEEC_ERROR_OUT_OF_MEMORY; + + /* Unload the TA on error */ + unload_cmd.ta_handle = load_cmd.ta_handle; + psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, + (void *)&unload_cmd, + sizeof(unload_cmd), &ret); + } else { + set_session_id(load_cmd.ta_handle, 0, &arg->session); + } } + mutex_unlock(&ta_refcount_mutex); pr_debug("load TA: TA handle = 0x%x, RO = 0x%x, ret = 0x%x\n", - cmd.ta_handle, arg->ret_origin, arg->ret); + load_cmd.ta_handle, arg->ret_origin, arg->ret); return 0; } diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index c0fe240d7042..de3c774839cf 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -58,10 +58,9 @@ static void release_session(struct amdtee_session *sess) continue; handle_close_session(sess->ta_handle, sess->session_info[i]); + handle_unload_ta(sess->ta_handle); } - /* Unload Trusted Application once all sessions are closed */ - handle_unload_ta(sess->ta_handle); kfree(sess); } @@ -223,8 +222,6 @@ static void destroy_session(struct kref *ref) struct amdtee_session *sess = container_of(ref, struct amdtee_session, refcount); - /* Unload the TA from TEE */ - handle_unload_ta(sess->ta_handle); mutex_lock(&session_list_mutex); list_del(&sess->list_node); mutex_unlock(&session_list_mutex); @@ -237,7 +234,7 @@ int amdtee_open_session(struct tee_context *ctx, { struct amdtee_context_data *ctxdata = ctx->data; struct amdtee_session *sess = NULL; - u32 session_info; + u32 session_info, ta_handle; size_t ta_size; int rc, i; void *ta; @@ -258,11 +255,14 @@ int amdtee_open_session(struct tee_context *ctx, if (arg->ret != TEEC_SUCCESS) goto out; + ta_handle = get_ta_handle(arg->session); + mutex_lock(&session_list_mutex); sess = alloc_session(ctxdata, arg->session); mutex_unlock(&session_list_mutex); if (!sess) { + handle_unload_ta(ta_handle); rc = -ENOMEM; goto out; } @@ -276,6 +276,7 @@ int amdtee_open_session(struct tee_context *ctx, if (i >= TEE_NUM_SESSIONS) { pr_err("reached maximum session count %d\n", TEE_NUM_SESSIONS); + handle_unload_ta(ta_handle); kref_put(&sess->refcount, destroy_session); rc = -ENOMEM; goto out; @@ -288,12 +289,13 @@ int amdtee_open_session(struct tee_context *ctx, spin_lock(&sess->lock); clear_bit(i, sess->sess_mask); spin_unlock(&sess->lock); + handle_unload_ta(ta_handle); kref_put(&sess->refcount, destroy_session); goto out; } sess->session_info[i] = session_info; - set_session_id(sess->ta_handle, i, &arg->session); + set_session_id(ta_handle, i, &arg->session); out: free_pages((u64)ta, get_order(ta_size)); return rc; @@ -328,6 +330,7 @@ int amdtee_close_session(struct tee_context *ctx, u32 session) /* Close the session */ handle_close_session(ta_handle, session_info); + handle_unload_ta(ta_handle); 
kref_put(&sess->refcount, destroy_session); -- Gitee From bd62b77dfbd1bd01ab20a93611948c6058d51153 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Nov 2021 17:54:04 +0300 Subject: [PATCH 067/257] tee: amdtee: fix an IS_ERR() vs NULL bug commit 9d7482771fac8d8e38e763263f2ca0ca12dd22c6 upstream The __get_free_pages() function does not return error pointers; it returns NULL. Fix this condition to avoid a NULL dereference. Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Signed-off-by: Dan Carpenter Acked-by: Rijo Thomas Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index de3c774839cf..0f0e86cca2e5 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -202,9 +202,8 @@ static int copy_ta_binary(struct tee_context *ctx, void *ptr, void **ta, *ta_size = roundup(fw->size, PAGE_SIZE); *ta = (void *)__get_free_pages(GFP_KERNEL, get_order(*ta_size)); - if (IS_ERR(*ta)) { - pr_err("%s: get_free_pages failed 0x%llx\n", __func__, - (u64)*ta); + if (!*ta) { + pr_err("%s: get_free_pages failed\n", __func__); rc = -ENOMEM; goto rel_fw; } -- Gitee From a338dfc55d6e782a127af06a7bce0c8f2e7f0dc5 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Tue, 28 Feb 2023 15:11:20 +0530 Subject: [PATCH 068/257] tee: amdtee: fix race condition in amdtee_open_session commit f8502fba45bd30e1a6a354d9d898bc99d1a11e6d upstream There is a potential race condition in amdtee_open_session that may lead to use-after-free. For instance, in amdtee_open_session() after sess->sess_mask is set, and before setting: sess->session_info[i] = session_info; if amdtee_close_session() closes this same session, then the 'sess' data structure will be released, causing a kernel panic when 'sess' is accessed within amdtee_open_session(). The solution is to set the bit in sess->sess_mask as the last step in amdtee_open_session().
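Restated as a sketch, the fix is the publish-last idiom: fully initialize the slot, then set the sess_mask bit under sess->lock, so a concurrent close can never observe a half-initialized session (condensed from the diff below):

spin_lock(&sess->lock);
i = find_first_zero_bit(sess->sess_mask, TEE_NUM_SESSIONS);
if (i < TEE_NUM_SESSIONS) {
	sess->session_info[i] = session_info;	/* initialize first ... */
	set_session_id(ta_handle, i, &arg->session);
	set_bit(i, sess->sess_mask);		/* ... publish last */
}
spin_unlock(&sess->lock);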
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Cc: stable@vger.kernel.org Signed-off-by: Rijo Thomas Acked-by: Sumit Garg Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 0f0e86cca2e5..7f403bca3a51 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -266,35 +266,34 @@ int amdtee_open_session(struct tee_context *ctx, goto out; } + /* Open session with loaded TA */ + handle_open_session(arg, &session_info, param); + if (arg->ret != TEEC_SUCCESS) { + pr_err("open_session failed %d\n", arg->ret); + handle_unload_ta(ta_handle); + kref_put(&sess->refcount, destroy_session); + goto out; + } + /* Find an empty session index for the given TA */ spin_lock(&sess->lock); i = find_first_zero_bit(sess->sess_mask, TEE_NUM_SESSIONS); - if (i < TEE_NUM_SESSIONS) + if (i < TEE_NUM_SESSIONS) { + sess->session_info[i] = session_info; + set_session_id(ta_handle, i, &arg->session); set_bit(i, sess->sess_mask); + } spin_unlock(&sess->lock); if (i >= TEE_NUM_SESSIONS) { pr_err("reached maximum session count %d\n", TEE_NUM_SESSIONS); + handle_close_session(ta_handle, session_info); handle_unload_ta(ta_handle); kref_put(&sess->refcount, destroy_session); rc = -ENOMEM; goto out; } - /* Open session with loaded TA */ - handle_open_session(arg, &session_info, param); - if (arg->ret != TEEC_SUCCESS) { - pr_err("open_session failed %d\n", arg->ret); - spin_lock(&sess->lock); - clear_bit(i, sess->sess_mask); - spin_unlock(&sess->lock); - handle_unload_ta(ta_handle); - kref_put(&sess->refcount, destroy_session); - goto out; - } - - sess->session_info[i] = session_info; - set_session_id(ta_handle, i, &arg->session); out: free_pages((u64)ta, get_order(ta_size)); return rc; -- Gitee From 34a5295f4bdfe6b022b1d5611ee2b10439d80dbb Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Tue, 9 May 2023 13:02:40 +0530 Subject: [PATCH 069/257] tee: amdtee: Add return_origin to 'struct tee_cmd_load_ta' commit 436eeae0411acdfc54521ddea80ee76d4ae8a7ea upstream After TEE has completed processing of TEE_CMD_ID_LOAD_TA, set proper value in 'return_origin' argument passed by open_session() call. To do so, add 'return_origin' field to the structure tee_cmd_load_ta. The Trusted OS shall update return_origin as part of TEE processing. This change to 'struct tee_cmd_load_ta' interface requires a similar update in AMD-TEE Trusted OS's TEE_CMD_ID_LOAD_TA interface. This patch has been verified on Phoenix Birman setup. On older APUs, return_origin value will be 0. 
Cc: stable@vger.kernel.org Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Tested-by: Sourabh Das Signed-off-by: Rijo Thomas Acked-by: Sumit Garg Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/amdtee_if.h | 10 ++++++---- drivers/tee/amdtee/call.c | 30 +++++++++++++++++------------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/tee/amdtee/amdtee_if.h b/drivers/tee/amdtee/amdtee_if.h index ff48c3e47375..e2014e21530a 100644 --- a/drivers/tee/amdtee/amdtee_if.h +++ b/drivers/tee/amdtee/amdtee_if.h @@ -118,16 +118,18 @@ struct tee_cmd_unmap_shared_mem { /** * struct tee_cmd_load_ta - load Trusted Application (TA) binary into TEE - * @low_addr: [in] bits [31:0] of the physical address of the TA binary - * @hi_addr: [in] bits [63:32] of the physical address of the TA binary - * @size: [in] size of TA binary in bytes - * @ta_handle: [out] return handle of the loaded TA + * @low_addr: [in] bits [31:0] of the physical address of the TA binary + * @hi_addr: [in] bits [63:32] of the physical address of the TA binary + * @size: [in] size of TA binary in bytes + * @ta_handle: [out] return handle of the loaded TA + * @return_origin: [out] origin of return code after TEE processing */ struct tee_cmd_load_ta { u32 low_addr; u32 hi_addr; u32 size; u32 ta_handle; + u32 return_origin; }; /** diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c index 07f36ac834c8..63d428423e90 100644 --- a/drivers/tee/amdtee/call.c +++ b/drivers/tee/amdtee/call.c @@ -423,19 +423,23 @@ int handle_load_ta(void *data, u32 size, struct tee_ioctl_open_session_arg *arg) if (ret) { arg->ret_origin = TEEC_ORIGIN_COMMS; arg->ret = TEEC_ERROR_COMMUNICATION; - } else if (arg->ret == TEEC_SUCCESS) { - ret = get_ta_refcount(load_cmd.ta_handle); - if (!ret) { - arg->ret_origin = TEEC_ORIGIN_COMMS; - arg->ret = TEEC_ERROR_OUT_OF_MEMORY; - - /* Unload the TA on error */ - unload_cmd.ta_handle = load_cmd.ta_handle; - psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, - (void *)&unload_cmd, - sizeof(unload_cmd), &ret); - } else { - set_session_id(load_cmd.ta_handle, 0, &arg->session); + } else { + arg->ret_origin = load_cmd.return_origin; + + if (arg->ret == TEEC_SUCCESS) { + ret = get_ta_refcount(load_cmd.ta_handle); + if (!ret) { + arg->ret_origin = TEEC_ORIGIN_COMMS; + arg->ret = TEEC_ERROR_OUT_OF_MEMORY; + + /* Unload the TA on error */ + unload_cmd.ta_handle = load_cmd.ta_handle; + psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, + (void *)&unload_cmd, + sizeof(unload_cmd), &ret); + } else { + set_session_id(load_cmd.ta_handle, 0, &arg->session); + } } } mutex_unlock(&ta_refcount_mutex); -- Gitee From 4372c66bd6875b85e40abe76a02e13e51e36d93a Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Fri, 29 Sep 2023 12:30:24 +0530 Subject: [PATCH 070/257] tee: amdtee: fix use-after-free vulnerability in amdtee_close_session commit f4384b3e54ea813868bb81a861bf5b2406e15d8f upstream There is a potential race condition in amdtee_close_session that may cause use-after-free in amdtee_open_session. For instance, if a session has refcount == 1, and one thread tries to free this session via: kref_put(&sess->refcount, destroy_session); the reference count will get decremented, and the next step would be to call destroy_session(). However, if in another thread, amdtee_open_session() is called before destroy_session() has completed execution, alloc_session() may return 'sess' that will be freed up later in destroy_session() leading to use-after-free in amdtee_open_session. 
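A simplified sketch of the window, written as an illustrative interleaving rather than runnable code:

/* Thread A: closing the session */
kref_put(&sess->refcount, destroy_session);
/* refcount drops 1 -> 0; destroy_session() has not yet taken
 * session_list_mutex, so 'sess' is still on the session list */

/* Thread B: opening a session in that window */
mutex_lock(&session_list_mutex);
sess = alloc_session(ctxdata, arg->session);	/* can still find the dying 'sess' */
mutex_unlock(&session_list_mutex);

/* Thread A resumes: list_del() + kfree(sess); B now holds a stale pointer */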
To fix this issue, treat decrement of sess->refcount and removal of 'sess' from session list in destroy_session() as a critical section, so that it is executed atomically. Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver") Cc: stable@vger.kernel.org Signed-off-by: Rijo Thomas Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/tee/amdtee/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c index 7f403bca3a51..c969eb13885e 100644 --- a/drivers/tee/amdtee/core.c +++ b/drivers/tee/amdtee/core.c @@ -216,12 +216,12 @@ static int copy_ta_binary(struct tee_context *ctx, void *ptr, void **ta, return rc; } +/* mutex must be held by caller */ static void destroy_session(struct kref *ref) { struct amdtee_session *sess = container_of(ref, struct amdtee_session, refcount); - mutex_lock(&session_list_mutex); list_del(&sess->list_node); mutex_unlock(&session_list_mutex); kfree(sess); @@ -271,7 +271,8 @@ int amdtee_open_session(struct tee_context *ctx, if (arg->ret != TEEC_SUCCESS) { pr_err("open_session failed %d\n", arg->ret); handle_unload_ta(ta_handle); - kref_put(&sess->refcount, destroy_session); + kref_put_mutex(&sess->refcount, destroy_session, + &session_list_mutex); goto out; } @@ -289,7 +290,8 @@ int amdtee_open_session(struct tee_context *ctx, pr_err("reached maximum session count %d\n", TEE_NUM_SESSIONS); handle_close_session(ta_handle, session_info); handle_unload_ta(ta_handle); - kref_put(&sess->refcount, destroy_session); + kref_put_mutex(&sess->refcount, destroy_session, + &session_list_mutex); rc = -ENOMEM; goto out; } @@ -330,7 +332,7 @@ int amdtee_close_session(struct tee_context *ctx, u32 session) handle_close_session(ta_handle, session_info); handle_unload_ta(ta_handle); - kref_put(&sess->refcount, destroy_session); + kref_put_mutex(&sess->refcount, destroy_session, &session_list_mutex); return 0; } -- Gitee From 50b3a4f23605a5529ae6d0d5510d6aa29a9218ae Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Wed, 4 Dec 2019 11:49:03 +0530 Subject: [PATCH 071/257] crypto: ccp - provide in-kernel API to submit TEE commands commit 632b0b5301f67ce54b840d55950707003a489151 upstream Extend the functionality of AMD Secure Processor (SP) driver by providing an in-kernel API to submit commands to TEE ring buffer for processing by Trusted OS running on AMD Secure Processor. Following TEE commands are supported by Trusted OS: * TEE_CMD_ID_LOAD_TA : Load Trusted Application (TA) binary into TEE environment * TEE_CMD_ID_UNLOAD_TA : Unload TA binary from TEE environment * TEE_CMD_ID_OPEN_SESSION : Open session with loaded TA * TEE_CMD_ID_CLOSE_SESSION : Close session with loaded TA * TEE_CMD_ID_INVOKE_CMD : Invoke a command with loaded TA * TEE_CMD_ID_MAP_SHARED_MEM : Map shared memory * TEE_CMD_ID_UNMAP_SHARED_MEM : Unmap shared memory Linux AMD-TEE driver will use this API to submit command buffers for processing in Trusted Execution Environment. The AMD-TEE driver shall be introduced in a separate patch. 
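A hedged sketch of how a kernel client is expected to drive this API; the command struct comes from the AMD-TEE driver's private header, and the helper name is illustrative:

#include <linux/errno.h>
#include <linux/psp-tee.h>

static int unload_ta_example(u32 ta_handle)
{
	struct tee_cmd_unload_ta cmd = { .ta_handle = ta_handle };
	u32 status;
	int ret;

	ret = psp_tee_process_cmd(TEE_CMD_ID_UNLOAD_TA, &cmd, sizeof(cmd),
				  &status);
	if (ret)	/* transport failure: -EINVAL/-ENODEV/-EBUSY/-ETIMEDOUT */
		return ret;

	if (status)	/* command reached TEE, but Trusted OS reported an error */
		return -EIO;

	return 0;
}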
Cc: Jens Wiklander Cc: Tom Lendacky Cc: Ard Biesheuvel Co-developed-by: Devaraj Rangasamy Signed-off-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Acked-by: Gary R Hook Signed-off-by: Herbert Xu Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/crypto/ccp/tee-dev.c | 126 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/tee-dev.h | 1 + include/linux/psp-tee.h | 73 ++++++++++++++++++++ 3 files changed, 200 insertions(+) create mode 100644 include/linux/psp-tee.h diff --git a/drivers/crypto/ccp/tee-dev.c b/drivers/crypto/ccp/tee-dev.c index ccbc2ce59d51..555c8a7c5684 100644 --- a/drivers/crypto/ccp/tee-dev.c +++ b/drivers/crypto/ccp/tee-dev.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "psp-dev.h" #include "tee-dev.h" @@ -38,6 +39,7 @@ static int tee_alloc_ring(struct psp_tee_device *tee, int ring_size) rb_mgr->ring_start = start_addr; rb_mgr->ring_size = ring_size; rb_mgr->ring_pa = __psp_pa(start_addr); + mutex_init(&rb_mgr->mutex); return 0; } @@ -55,6 +57,7 @@ static void tee_free_ring(struct psp_tee_device *tee) rb_mgr->ring_start = NULL; rb_mgr->ring_size = 0; rb_mgr->ring_pa = 0; + mutex_destroy(&rb_mgr->mutex); } static int tee_wait_cmd_poll(struct psp_tee_device *tee, unsigned int timeout, @@ -236,3 +239,126 @@ void tee_dev_destroy(struct psp_device *psp) tee_destroy_ring(tee); } + +static int tee_submit_cmd(struct psp_tee_device *tee, enum tee_cmd_id cmd_id, + void *buf, size_t len, struct tee_ring_cmd **resp) +{ + struct tee_ring_cmd *cmd; + u32 rptr, wptr; + int nloop = 1000, ret = 0; + + *resp = NULL; + + mutex_lock(&tee->rb_mgr.mutex); + + wptr = tee->rb_mgr.wptr; + + /* Check if ring buffer is full */ + do { + rptr = ioread32(tee->io_regs + tee->vdata->ring_rptr_reg); + + if (!(wptr + sizeof(struct tee_ring_cmd) == rptr)) + break; + + dev_info(tee->dev, "tee: ring buffer full. rptr = %u wptr = %u\n", + rptr, wptr); + + /* Wait if ring buffer is full */ + mutex_unlock(&tee->rb_mgr.mutex); + schedule_timeout_interruptible(msecs_to_jiffies(10)); + mutex_lock(&tee->rb_mgr.mutex); + + } while (--nloop); + + if (!nloop && (wptr + sizeof(struct tee_ring_cmd) == rptr)) { + dev_err(tee->dev, "tee: ring buffer full. rptr = %u wptr = %u\n", + rptr, wptr); + ret = -EBUSY; + goto unlock; + } + + /* Pointer to empty data entry in ring buffer */ + cmd = (struct tee_ring_cmd *)(tee->rb_mgr.ring_start + wptr); + + /* Write command data into ring buffer */ + cmd->cmd_id = cmd_id; + cmd->cmd_state = TEE_CMD_STATE_INIT; + memset(&cmd->buf[0], 0, sizeof(cmd->buf)); + memcpy(&cmd->buf[0], buf, len); + + /* Update local copy of write pointer */ + tee->rb_mgr.wptr += sizeof(struct tee_ring_cmd); + if (tee->rb_mgr.wptr >= tee->rb_mgr.ring_size) + tee->rb_mgr.wptr = 0; + + /* Trigger interrupt to Trusted OS */ + iowrite32(tee->rb_mgr.wptr, tee->io_regs + tee->vdata->ring_wptr_reg); + + /* The response is provided by Trusted OS in same + * location as submitted data entry within ring buffer. 
+ */ + *resp = cmd; + +unlock: + mutex_unlock(&tee->rb_mgr.mutex); + + return ret; +} + +static int tee_wait_cmd_completion(struct psp_tee_device *tee, + struct tee_ring_cmd *resp, + unsigned int timeout) +{ + /* ~5ms sleep per loop => nloop = timeout * 200 */ + int nloop = timeout * 200; + + while (--nloop) { + if (resp->cmd_state == TEE_CMD_STATE_COMPLETED) + return 0; + + usleep_range(5000, 5100); + } + + dev_err(tee->dev, "tee: command 0x%x timed out, disabling PSP\n", + resp->cmd_id); + + psp_dead = true; + + return -ETIMEDOUT; +} + +int psp_tee_process_cmd(enum tee_cmd_id cmd_id, void *buf, size_t len, + u32 *status) +{ + struct psp_device *psp = psp_get_master_device(); + struct psp_tee_device *tee; + struct tee_ring_cmd *resp; + int ret; + + if (!buf || !status || !len || len > sizeof(resp->buf)) + return -EINVAL; + + *status = 0; + + if (!psp || !psp->tee_data) + return -ENODEV; + + if (psp_dead) + return -EBUSY; + + tee = psp->tee_data; + + ret = tee_submit_cmd(tee, cmd_id, buf, len, &resp); + if (ret) + return ret; + + ret = tee_wait_cmd_completion(tee, resp, TEE_DEFAULT_TIMEOUT); + if (ret) + return ret; + + memcpy(buf, &resp->buf[0], len); + *status = resp->status; + + return 0; +} +EXPORT_SYMBOL(psp_tee_process_cmd); diff --git a/drivers/crypto/ccp/tee-dev.h b/drivers/crypto/ccp/tee-dev.h index b3db0fcb550c..f09960112115 100644 --- a/drivers/crypto/ccp/tee-dev.h +++ b/drivers/crypto/ccp/tee-dev.h @@ -54,6 +54,7 @@ struct tee_init_ring_cmd { * @wptr: index to the last written entry in ring buffer */ struct ring_buf_manager { + struct mutex mutex; /* synchronizes access to ring buffer */ void *ring_start; u32 ring_size; phys_addr_t ring_pa; diff --git a/include/linux/psp-tee.h b/include/linux/psp-tee.h new file mode 100644 index 000000000000..63bb2212fce0 --- /dev/null +++ b/include/linux/psp-tee.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: MIT */ +/* + * AMD Trusted Execution Environment (TEE) interface + * + * Author: Rijo Thomas + * + * Copyright 2019 Advanced Micro Devices, Inc. + * + */ + +#ifndef __PSP_TEE_H_ +#define __PSP_TEE_H_ + +#include +#include + +/* This file defines the Trusted Execution Environment (TEE) interface commands + * and the API exported by AMD Secure Processor driver to communicate with + * AMD-TEE Trusted OS. + */ + +/** + * enum tee_cmd_id - TEE Interface Command IDs + * @TEE_CMD_ID_LOAD_TA: Load Trusted Application (TA) binary into + * TEE environment + * @TEE_CMD_ID_UNLOAD_TA: Unload TA binary from TEE environment + * @TEE_CMD_ID_OPEN_SESSION: Open session with loaded TA + * @TEE_CMD_ID_CLOSE_SESSION: Close session with loaded TA + * @TEE_CMD_ID_INVOKE_CMD: Invoke a command with loaded TA + * @TEE_CMD_ID_MAP_SHARED_MEM: Map shared memory + * @TEE_CMD_ID_UNMAP_SHARED_MEM: Unmap shared memory + */ +enum tee_cmd_id { + TEE_CMD_ID_LOAD_TA = 1, + TEE_CMD_ID_UNLOAD_TA, + TEE_CMD_ID_OPEN_SESSION, + TEE_CMD_ID_CLOSE_SESSION, + TEE_CMD_ID_INVOKE_CMD, + TEE_CMD_ID_MAP_SHARED_MEM, + TEE_CMD_ID_UNMAP_SHARED_MEM, +}; + +#ifdef CONFIG_CRYPTO_DEV_SP_PSP +/** + * psp_tee_process_cmd() - Process command in Trusted Execution Environment + * @cmd_id: TEE command ID (&enum tee_cmd_id) + * @buf: Command buffer for TEE processing. On success, is updated + * with the response + * @len: Length of command buffer in bytes + * @status: On success, holds the TEE command execution status + * + * This function submits a command to the Trusted OS for processing in the + * TEE environment and waits for a response or until the command times out. 
+ * + * Returns: + * 0 if TEE successfully processed the command + * -%ENODEV if PSP device not available + * -%EINVAL if invalid input + * -%ETIMEDOUT if TEE command timed out + * -%EBUSY if PSP device is not responsive + */ +int psp_tee_process_cmd(enum tee_cmd_id cmd_id, void *buf, size_t len, + u32 *status); + +#else /* !CONFIG_CRYPTO_DEV_SP_PSP */ + +static inline int psp_tee_process_cmd(enum tee_cmd_id cmd_id, void *buf, + size_t len, u32 *status) +{ + return -ENODEV; +} +#endif /* CONFIG_CRYPTO_DEV_SP_PSP */ +#endif /* __PSP_TEE_H_ */ -- Gitee From d3fa41f5cba1806650def89cef067cdec8b68515 Mon Sep 17 00:00:00 2001 From: Rijo Thomas Date: Mon, 15 Mar 2021 13:55:29 +0530 Subject: [PATCH 072/257] crypto: ccp - fix command queuing to TEE ring buffer commit 00aa6e65aa04e500a11a2c91e92a11c37b9e234d upstream Multiple threads or clients can submit a command to the TEE ring buffer. This patch helps to synchronize command submission to the ring. One thread shall write a command to a TEE ring buffer entry only if: - Trusted OS has notified that the TEE command for the given entry has been processed and driver has copied the TEE response into client buffer. - The command entry is empty and can be written into. After a command has been written to the TEE ring buffer, the global wptr (mutex protected) shall be incremented for use by next client. If PSP became unresponsive while processing TEE request from a client, then further command submission to queue will be disabled. Fixes: 33960acccfbd (crypto: ccp - add TEE support for Raven Ridge) Reviewed-by: Devaraj Rangasamy Signed-off-by: Rijo Thomas Signed-off-by: Herbert Xu Signed-off-by: Abhishek Rajput Signed-off-by: mohanasv2 --- drivers/crypto/ccp/tee-dev.c | 49 +++++++++++++++++++++++++----------- drivers/crypto/ccp/tee-dev.h | 20 +++++++++++++-- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/drivers/crypto/ccp/tee-dev.c b/drivers/crypto/ccp/tee-dev.c index 555c8a7c5684..758473a3e1be 100644 --- a/drivers/crypto/ccp/tee-dev.c +++ b/drivers/crypto/ccp/tee-dev.c @@ -36,6 +36,7 @@ static int tee_alloc_ring(struct psp_tee_device *tee, int ring_size) if (!start_addr) return -ENOMEM; + memset(start_addr, 0x0, ring_size); rb_mgr->ring_start = start_addr; rb_mgr->ring_size = ring_size; rb_mgr->ring_pa = __psp_pa(start_addr); @@ -244,41 +245,54 @@ static int tee_submit_cmd(struct psp_tee_device *tee, enum tee_cmd_id cmd_id, void *buf, size_t len, struct tee_ring_cmd **resp) { struct tee_ring_cmd *cmd; - u32 rptr, wptr; int nloop = 1000, ret = 0; + u32 rptr; *resp = NULL; mutex_lock(&tee->rb_mgr.mutex); - wptr = tee->rb_mgr.wptr; - - /* Check if ring buffer is full */ + /* Loop until empty entry found in ring buffer */ do { + /* Get pointer to ring buffer command entry */ + cmd = (struct tee_ring_cmd *) + (tee->rb_mgr.ring_start + tee->rb_mgr.wptr); + rptr = ioread32(tee->io_regs + tee->vdata->ring_rptr_reg); - if (!(wptr + sizeof(struct tee_ring_cmd) == rptr)) + /* Check if ring buffer is full or command entry is waiting + * for response from TEE + */ + if (!(tee->rb_mgr.wptr + sizeof(struct tee_ring_cmd) == rptr || + cmd->flag == CMD_WAITING_FOR_RESPONSE)) break; - dev_info(tee->dev, "tee: ring buffer full. rptr = %u wptr = %u\n", - rptr, wptr); + dev_dbg(tee->dev, "tee: ring buffer full. 
rptr = %u wptr = %u\n", + rptr, tee->rb_mgr.wptr); - /* Wait if ring buffer is full */ + /* Wait if ring buffer is full or TEE is processing data */ mutex_unlock(&tee->rb_mgr.mutex); schedule_timeout_interruptible(msecs_to_jiffies(10)); mutex_lock(&tee->rb_mgr.mutex); } while (--nloop); - if (!nloop && (wptr + sizeof(struct tee_ring_cmd) == rptr)) { - dev_err(tee->dev, "tee: ring buffer full. rptr = %u wptr = %u\n", - rptr, wptr); + if (!nloop && + (tee->rb_mgr.wptr + sizeof(struct tee_ring_cmd) == rptr || + cmd->flag == CMD_WAITING_FOR_RESPONSE)) { + dev_err(tee->dev, "tee: ring buffer full. rptr = %u wptr = %u response flag %u\n", + rptr, tee->rb_mgr.wptr, cmd->flag); ret = -EBUSY; goto unlock; } - /* Pointer to empty data entry in ring buffer */ - cmd = (struct tee_ring_cmd *)(tee->rb_mgr.ring_start + wptr); + /* Do not submit command if PSP got disabled while processing any + * command in another thread + */ + if (psp_dead) { + ret = -EBUSY; + goto unlock; + } /* Write command data into ring buffer */ cmd->cmd_id = cmd_id; @@ -286,6 +300,9 @@ static int tee_submit_cmd(struct psp_tee_device *tee, enum tee_cmd_id cmd_id, memset(&cmd->buf[0], 0, sizeof(cmd->buf)); memcpy(&cmd->buf[0], buf, len); + /* Indicate driver is waiting for response */ + cmd->flag = CMD_WAITING_FOR_RESPONSE; + /* Update local copy of write pointer */ tee->rb_mgr.wptr += sizeof(struct tee_ring_cmd); if (tee->rb_mgr.wptr >= tee->rb_mgr.ring_size) @@ -353,12 +370,16 @@ int psp_tee_process_cmd(enum tee_cmd_id cmd_id, void *buf, size_t len, return ret; ret = tee_wait_cmd_completion(tee, resp, TEE_DEFAULT_TIMEOUT); - if (ret) + if (ret) { + resp->flag = CMD_RESPONSE_TIMEDOUT; return ret; + } memcpy(buf, &resp->buf[0], len); *status = resp->status; + resp->flag = CMD_RESPONSE_COPIED; + return 0; } EXPORT_SYMBOL(psp_tee_process_cmd); diff --git a/drivers/crypto/ccp/tee-dev.h b/drivers/crypto/ccp/tee-dev.h index f09960112115..49d26158b71e 100644 --- a/drivers/crypto/ccp/tee-dev.h +++ b/drivers/crypto/ccp/tee-dev.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: MIT */ /* - * Copyright 2019 Advanced Micro Devices, Inc. + * Copyright (C) 2019,2021 Advanced Micro Devices, Inc. * * Author: Rijo Thomas * Author: Devaraj Rangasamy @@ -18,7 +18,7 @@ #include #define TEE_DEFAULT_TIMEOUT 10 -#define MAX_BUFFER_SIZE 992 +#define MAX_BUFFER_SIZE 988 /** * enum tee_ring_cmd_id - TEE interface commands for ring buffer configuration @@ -81,6 +81,20 @@ enum tee_cmd_state { TEE_CMD_STATE_COMPLETED, }; +/** + * enum cmd_resp_state - TEE command's response status maintained by driver + * @CMD_RESPONSE_INVALID: initial state when no command is written to ring + * @CMD_WAITING_FOR_RESPONSE: driver waiting for response from TEE + * @CMD_RESPONSE_TIMEDOUT: failed to get response from TEE + * @CMD_RESPONSE_COPIED: driver has copied response from TEE + */ +enum cmd_resp_state { + CMD_RESPONSE_INVALID, + CMD_WAITING_FOR_RESPONSE, + CMD_RESPONSE_TIMEDOUT, + CMD_RESPONSE_COPIED, +}; + /** * struct tee_ring_cmd - Structure of the command buffer in TEE ring * @cmd_id: refers to &enum tee_cmd_id. 
Command id for the ring buffer @@ -91,6 +105,7 @@ enum tee_cmd_state { * @pdata: private data (currently unused) * @res1: reserved region * @buf: TEE command specific buffer + * @flag: refers to &enum cmd_resp_state */ struct tee_ring_cmd { u32 cmd_id; u32 cmd_state; u64 pdata; u32 res1[2]; u8 buf[MAX_BUFFER_SIZE]; + u32 flag; /* Total size: 1024 bytes */ } __packed; -- Gitee From 7a5c4f6c7c9e3675771e74388a59e0b4ee69d06b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 8 Sep 2020 11:01:27 -0700 Subject: [PATCH 073/257] perf: Stop using deprecated bpf_program__title() commit 8081ede1f73139687a75147d529cebcc5e4149c5 upstream Switch from the deprecated bpf_program__title() API to bpf_program__section_name(). Also drop unnecessary error checks because neither bpf_program__title() nor bpf_program__section_name() can fail or return NULL. Fixes: 521095842027 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name"") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Tobias Klauser Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20200908180127.1249-1-andriin@fb.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/bpf-loader.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 10c187b8b8ea..fe6767ecfbba 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -328,12 +328,6 @@ config_bpf_program(struct bpf_program *prog) probe_conf.no_inlines = false; probe_conf.force_add = false; - config_str = bpf_program__title(prog, false); - if (IS_ERR(config_str)) { - pr_debug("bpf: unable to get title for program\n"); - return PTR_ERR(config_str); - } - priv = calloc(sizeof(*priv), 1); if (!priv) { pr_debug("bpf: failed to alloc priv\n"); @@ -341,6 +335,7 @@ config_bpf_program(struct bpf_program *prog) } pev = &priv->pev; + config_str = bpf_program__section_name(prog); pr_debug("bpf: config program '%s'\n", config_str); err = parse_prog_config(config_str, &main_str, &is_tp, pev); if (err) @@ -454,10 +449,7 @@ preproc_gen_prologue(struct bpf_program *prog, int n, if (err) { const char *title; - title = bpf_program__title(prog, false); - if (!title) - title = "[unknown]"; - + title = bpf_program__section_name(prog); pr_debug("Failed to generate prologue for program %s\n", title); return err; -- Gitee From ef82d3dd277eb3289f6a6988a48b2a0fe50aa8a5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 22 Jan 2021 12:40:46 -0800 Subject: [PATCH 074/257] x86/cpufeatures: Assign dedicated feature word for CPUID_0x8000001F[EAX] commit fb35d30fe5b06cc24444f0405da8fbe0be5330d1 upstream Collect the scattered SME/SEV related feature flags into a dedicated word. There are now five recognized features in CPUID.0x8000001F.EAX, with at least one more on the horizon (SEV-SNP). Using a dedicated word allows KVM to use its automagic CPUID adjustment logic when reporting the set of supported features to userspace. No functional change intended. [Backport changes] 1. Skipped deletion of SEV-ES and VM_PAGE_FLUSH related code in arch/x86/include/asm/cpufeatures.h and arch/x86/kernel/cpu/scattered.c, as these features are not available in kernel 5.4. However, the required macros (X86_FEATURE_VM_PAGE_FLUSH and X86_FEATURE_SEV_ES) are added in the appropriate locations as per the current upstream commit to avoid future conflicts.
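To illustrate the effect, a short sketch mirroring the hunks below: the whole leaf is captured once during capability detection, after which each SME/SEV flag is an ordinary word/bit feature test instead of a scattered-bit translation:

/* fill: done once in get_cpu_cap(), per the common.c hunk below */
if (c->extended_cpuid_level >= 0x8000001f)
	c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);

/* use: any CPUID.0x8000001F:EAX bit is now a plain feature test */
if (boot_cpu_has(X86_FEATURE_SEV))
	pr_info("AMD SEV is supported\n");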
Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov Reviewed-by: Brijesh Singh Link: https://lkml.kernel.org/r/20210122204047.2860075-2-seanjc@google.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/cpufeature.h | 7 +++++-- arch/x86/include/asm/cpufeatures.h | 15 +++++++++++---- arch/x86/include/asm/disabled-features.h | 3 ++- arch/x86/include/asm/required-features.h | 3 ++- arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/scattered.c | 3 --- tools/arch/x86/include/asm/disabled-features.h | 3 ++- tools/arch/x86/include/asm/required-features.h | 3 ++- 8 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 619c1f80a2ab..00d329a990e5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 554a12d2bd58..8719506ee207 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -94,7 +94,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -220,7 +220,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! 
( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ #define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ @@ -230,7 +230,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! ( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -402,6 +402,13 @@ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 210bc1350e87..251a84d87ec2 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -90,6 +90,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6847d85400a8..fa5700097f64 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0497b8b228d7..236de8d51553 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -959,6 +959,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index a03e309a0ac5..37f716eaf0e6 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,9 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { 
X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 8e1d0bb46361..f6d8002d11e0 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -84,6 +84,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 6847d85400a8..fa5700097f64 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ -- Gitee From 1197a50a50e350e92c76e4e826a26c848150fed7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:53 +0530 Subject: [PATCH 075/257] x86/cpufeatures: Add PerfMonV2 feature bit commit d6d0c7f681fda1d07e005c8f653e578b77a0eb40 upstream CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Additionally, Core PMCs can be managed using new global control and status registers. For better utilization of feature words, PerfMonV2 is added as a scattered feature bit. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c70e497e22f18e7f05b025bb64ca21cc12b17792.1650515382.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8719506ee207..690da46120b6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -230,7 +230,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. 
*/ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 37f716eaf0e6..a5872551f3f6 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,6 +40,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- Gitee From 61bf929f46d6bd7bdb8fc5f1032dd36a46ce61c0 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:54 +0530 Subject: [PATCH 076/257] x86/msr: Add PerfCntrGlobal* registers commit 089be16d5992dd0bc6df15ef12042fd1023ded9a upstream Add MSR definitions that will be used to enable the new AMD Performance Monitoring Version 2 (PerfMonV2) features. These include: * Performance Counter Global Control (PerfCntrGlobalCtl) * Performance Counter Global Status (PerfCntrGlobalStatus) * Performance Counter Global Status Clear (PerfCntrGlobalStatusClr) The new Performance Counter Global Control and Status MSRs provide an interface for enabling or disabling multiple counters at the same time and for testing overflow without probing the individual registers for each PMC. The availability of these registers is indicated through the PerfMonV2 feature bit of CPUID leaf 0x80000022 EAX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cdc0d8f75bd519848731b5c64d924f5a0619a573.1650515382.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f779f80bb7c5..752a1a82a77b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -497,6 +497,11 @@ #define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 #define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -- Gitee From 696a589fbd26b2551d8651b58e42cab31d072e87 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:55 +0530 Subject: [PATCH 077/257] perf/x86/amd/core: Detect PerfMonV2 support commit 21d59e3e2c403c83ba196a5857d517054124168e upstream AMD Performance Monitoring Version 2 (PerfMonV2) introduces some new Core PMU features such as detection of the number of available PMCs and managing PMCs using global registers namely, PerfCntrGlobalCtl and PerfCntrGlobalStatus. Clearing PerfCntrGlobalCtl and PerfCntrGlobalStatus ensures that all PMCs are inactive and have no pending overflows when CPUs are onlined or offlined. The PMU version (x86_pmu.version) now indicates PerfMonV2 support and will be used to bypass the new features on unsupported processors. 
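As a condensed sketch of the global-control idiom these patches adopt (the mask mirrors amd_pmu_global_cntr_mask in the diff below; illustrative, not the exact driver code):

#include <asm/msr.h>

static void amd_pmu_quiesce_all(int num_counters)
{
	u64 mask = (1ULL << num_counters) - 1;

	/* One MSR write gates every Core PMC ... */
	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);

	/* ... and one write acknowledges all pending overflows. */
	wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, mask);
}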
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dc8672ecbddff394e088ca8abf94b089b8ecc2e7.1650515382.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f6d8808b7b3c..cc85b6303d21 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -18,6 +18,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -510,6 +513,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -555,6 +570,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -574,6 +590,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } /* @@ -966,6 +984,15 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. -- Gitee From 880335fa780d54133177f8c1cd4f0ea24a667699 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:56 +0530 Subject: [PATCH 078/257] perf/x86/amd/core: Detect available counters commit 56e026a7ca3f92b8e44359e1f705febd1833f701 upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Core PMCs. This offers more flexibility if the counts change in later processor families. 
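The detection step boils down to the following sketch (mirroring the diff below; the union layout is added to perf_event.h by this same patch):

union cpuid_0x80000022_ebx ebx;

if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) {
	ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);	/* leaf 0x80000022 */
	x86_pmu.num_counters = ebx.split.num_core_pmc;		/* EBX bits 3:0 */
}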
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/68a6d9688df189267db26530378870edd34f7b06.1650515382.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 6 ++++++ arch/x86/include/asm/perf_event.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index cc85b6303d21..f552340026ec 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -967,6 +967,7 @@ static __initconst const struct x86_pmu amd_pmu = { static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -987,9 +988,14 @@ static int __init amd_core_pmu_init(void) /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + /* Update PMU version for later usage */ x86_pmu.version = 2; + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9c0200ede242..95c9f25ce189 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -166,6 +166,18 @@ union cpuid10_edx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -345,6 +357,11 @@ struct pebs_lbr { struct pebs_lbr_entry lbr[0]; /* Variable length */ }; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + /* * IBS cpuid feature detection */ -- Gitee From 4e84c10419cdc62bb861bb5c58985bd441904df3 Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:12:54 +0300 Subject: [PATCH 079/257] perf/x86/intel: Implement LBR callstack context synchronization commit 421ca868ea3b7c1ca1a541ed6dff3c101a563b95 upstream Implement intel_pmu_lbr_swap_task_ctx() method updating counters of the events that requested LBR callstack data on a sample. The counter can be zero for the case when task context belongs to a thread that has just come from a block on a futex and the context contains saved (lbr_stack_state == LBR_VALID) LBR register values. For the values to be restored at LBR registers on the next thread's switch-in event it swaps the counter value with the one that is expected to be non zero at the previous equivalent task perf event context. Swap operation type ensures the previous task perf event context stays consistent with the amount of events that requested LBR callstack data on a sample. 
Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/261ac742-9022-c3f4-5885-1eae7415b091@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 23 +++++++++++++++++++++++ arch/x86/events/perf_event.h | 3 +++ 2 files changed, 26 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 73dbd30f82d3..f1a2af32f3a5 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -423,6 +423,29 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + + swap(prev->task_ctx_data, next->task_ctx_data); + + /* + * Architecture specific synchronization makes sense in + * case both prev->task_ctx_data and next->task_ctx_data + * pointers are allocated. + */ + + prev_ctx_data = next->task_ctx_data; + next_ctx_data = prev->task_ctx_data; + + if (!prev_ctx_data || !next_ctx_data) + return; + + swap(prev_ctx_data->lbr_callstack_users, + next_ctx_data->lbr_callstack_users); +} + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 39db37ee8ffa..b3e13f72922b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1093,6 +1093,9 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); void intel_ds_init(void); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next); + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); -- Gitee From a27ffa946f8fd111d4cb4580a38d6ef96c150efe Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Sun, 10 Nov 2019 17:44:53 +0800 Subject: [PATCH 080/257] perf/x86/amd: Remove set but not used variable 'active' commit 8f05c1ff8bfb8cbae0898e5dc6791927d1e5c503 upstream '-Wunused-but-set-variable' triggers this warning: arch/x86/events/amd/core.c: In function amd_pmu_handle_irq: arch/x86/events/amd/core.c:656:6: warning: variable active set but not used [-Wunused-but-set-variable] GCC is right, 'active' is not used anymore. This variable was introduced earlier this year and then removed in: df4d29732fdad perf/x86/amd: Change/fix NMI latency mitigation to use a timestamp [ mingo: Improved the changelog, fixed build warning caused by this fix, improved surrounding code. 
] Reported-by: Hulk Robot Signed-off-by: Zheng Yongjun Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191110094453.113001-1-zhengyongjun3@huawei.com Signed-off-by: Ingo Molnar Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f552340026ec..941d034627c2 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -689,15 +689,7 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int active, handled; - - /* - * Obtain the active count before calling x86_pmu_handle_irq() since - * it is possible that x86_pmu_handle_irq() may make a counter - * inactive (through x86_pmu_stop). - */ - active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + int handled; /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); @@ -707,8 +699,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) * NMIs will be claimed if arriving within that window. */ if (handled) { - this_cpu_write(perf_nmi_tstamp, - jiffies + perf_nmi_window); + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); return handled; } -- Gitee From 46aea45a990c2b42da0ff9286b4b7b020283c2fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Dec 2019 20:45:12 -0800 Subject: [PATCH 081/257] perf/x86: Provide stubs of KVM helpers for non-Intel CPUs commit 616c59b52342b0370ab822ce88fa0ff7f3671e4a upstream Provide stubs for perf_guest_get_msrs() and intel_pt_handle_vmx() when building without support for Intel CPUs, i.e. CPU_SUP_INTEL=n. Lack of stubs is not currently a problem as the only user, KVM_INTEL, takes a dependency on CPU_SUP_INTEL=y. Provide the stubs for all CPUs so that KVM_INTEL can be built for any CPU with compatible hardware support, e.g. Centaur and Zhaoxin CPUs. Note, the existing stub for perf_guest_get_msrs() is essentially dead code as KVM selects CONFIG_PERF_EVENTS, i.e. the only user guarantees the full implementation is built. [Backport Changes] In arch/x86/include/asm/perf_event.h, the upstream patch introduces a conditional block that checks the state of the CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL macros before declaring perf_guest_get_msrs(). However, this conditional block already exists in the current codebase, added in commit c62d6b571d57. The function declaration was therefore appended within the existing conditional block to avoid duplication.
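The stub pattern itself generalizes; a hedged sketch with hypothetical names (CONFIG_FOO, foo_get_state() and struct foo_state are invented for illustration and exist nowhere in this tree):

  /*
   * foo.h: callers always see a definition and need no #ifdefs of
   * their own.  Configurations that build the real implementation get
   * the extern declaration; everyone else gets a cheap inline stub
   * with "nothing available" semantics, just like the perf stubs here.
   */
  struct foo_state;

  #ifdef CONFIG_FOO
  extern struct foo_state *foo_get_state(int *nr);
  #else
  static inline struct foo_state *foo_get_state(int *nr)
  {
          *nr = 0;
          return NULL;
  }
  #endif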
Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20191221044513.21680-19-sean.j.christopherson@intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/include/asm/perf_event.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 95c9f25ce189..ebb2b78ba8af 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -472,17 +472,10 @@ struct x86_pmu_lbr { unsigned int info; }; -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); extern int x86_perf_rdpmc_index(struct perf_event *event); #else -static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - *nr = 0; - return NULL; -} - static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { memset(cap, 0, sizeof(*cap)); @@ -494,15 +487,26 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); #else static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; } +static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + *nr = 0; + return NULL; +} #endif #ifdef CONFIG_CPU_SUP_INTEL extern void intel_pt_handle_vmx(int on); +#else +static inline void intel_pt_handle_vmx(int on) +{ + +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -- Gitee From e1496189af98ad34a214e5b66baf8760d27a5679 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 27 Jan 2020 08:53:54 -0800 Subject: [PATCH 082/257] perf/core: Add new branch sample type for HW index of raw branch records commit bbfd5e4fab63703375eafaf241a0c696024a59e1 upstream The low level index is the index in the underlying hardware buffer of the most recently captured taken branch, which is always saved in branch_entries[0]. It is very useful for reconstructing the call stack. For example, in Intel LBR call stack mode, the depth of the reconstructed LBR call stack is limited to the number of LBR registers. With the low level index information, the perf tool may stitch the stacks of two samples, so the reconstructed LBR call stack is no longer bound by that hardware limit. Add a new branch sample type to retrieve the low level index of raw branch records. The low level index is between -1 (unknown) and the maximum depth, which can be retrieved from /sys/devices/cpu/caps/branches. The low level index information is dumped into the PERF_SAMPLE_BRANCH_STACK output only when the new branch sample type is set. The perf tool should check attr.branch_sample_type and apply the corresponding format to PERF_SAMPLE_BRANCH_STACK samples; otherwise, some use cases may break. For example, users may parse a perf.data file that includes the new branch sample type with an old version of the perf tool (without the check) and probably get incorrect information without any warning.
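For illustration, a user-space consumer that honors the check described above might decode the branch-stack region of a sample as follows (a sketch only, not perf tool code: parse_branch_stack() and struct parsed_branches are invented for the example, headers are assumed to define PERF_SAMPLE_BRANCH_HW_INDEX, and error handling is omitted):

  #include <stdint.h>
  #include <linux/perf_event.h>

  struct parsed_branches {
          uint64_t nr;
          int64_t hw_idx;                   /* -1 means unknown */
          struct perf_branch_entry *entries;
  };

  static void parse_branch_stack(uint64_t *p, uint64_t branch_sample_type,
                                 struct parsed_branches *out)
  {
          out->nr = *p++;

          /*
           * hw_idx is present only when the new bit was requested at
           * event open time; older perf.data files do not contain it.
           */
          if (branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)
                  out->hw_idx = (int64_t)*p++;
          else
                  out->hw_idx = -1;

          out->entries = (struct perf_branch_entry *)p;
  }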
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200127165355.27495-2-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/intel/lbr.c | 3 +++ include/linux/perf_event.h | 12 ++++++++++++ include/uapi/linux/perf_event.h | 8 +++++++- kernel/events/core.c | 10 ++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6f013e418834..58de6cec907b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -529,6 +529,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) } } cpuhw->bhrb_stack.nr = u_index; + cpuhw->bhrb_stack.hw_idx = -1ULL; return; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f1a2af32f3a5..784f4c48e113 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -605,6 +605,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = -1ULL; } /* @@ -700,6 +701,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = -1ULL; } void intel_pmu_lbr_read(void) @@ -1141,6 +1143,7 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + cpuc->lbr_stack.hw_idx = -1ULL; for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 07fc4b3e3ce4..f7792394ccf8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -92,14 +92,26 @@ struct perf_raw_record { /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. 
*/ struct perf_branch_stack { __u64 nr; + __u64 hw_idx; struct perf_branch_entry entries[0]; }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ceccd980ffcf..507eddef8715 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -180,6 +180,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -207,6 +209,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -849,7 +853,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER diff --git a/kernel/events/core.c b/kernel/events/core.c index 2043b0f729a2..d478c9789ff6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6403,6 +6403,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6491,6 +6496,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6680,6 +6687,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } -- Gitee From e0a608f19e918a56fdeb9ab662c41e092fcd654f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 27 Jan 2020 08:53:55 -0800 Subject: [PATCH 083/257] perf/x86/intel: Output LBR TOS information correctly commit db278b90c326ce5895be09b6171f5ff3df1e3cca upstream For Intel LBR, the LBR Top-of-Stack (TOS) information is the HW index of raw branch record for the most recent branch. For non-adaptive PEBS and non-PEBS, the TOS information can be directly retrieved from TOS MSR read in intel_pmu_lbr_read(). For adaptive PEBS, the LBR information stored in PEBS record doesn't include the TOS information. For single PEBS, TOS can be directly read from MSR, because the PMI is triggered immediately after PEBS is written. TOS MSR is still unchanged. For large PEBS, TOS MSR has stale value. Set -1ULL to indicate that the TOS information is not available. 
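The rule reduces to a small decision; a simplified sketch (the parameters stand in for the n_pebs/n_large_pebs fields of the kernel's struct cpu_hw_events and for the TOS MSR read):

  #include <stdint.h>

  /*
   * Single PEBS: the PMI fires right after the record is written, so
   * the TOS MSR still matches the recorded LBRs.  Large PEBS: many
   * records are buffered before the PMI, so the TOS MSR is stale by
   * the time the records are drained.
   */
  static uint64_t pebs_lbr_hw_idx(int n_pebs, int n_large_pebs,
                                  uint64_t (*read_tos)(void))
  {
          if (n_pebs == n_large_pebs)
                  return (uint64_t)-1;      /* unknown, i.e. -1ULL */

          return read_tos();
  }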
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200127165355.27495-3-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 784f4c48e113..83a12e961df4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -605,7 +605,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; - cpuc->lbr_stack.hw_idx = -1ULL; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -701,7 +701,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) out++; } cpuc->lbr_stack.nr = out; - cpuc->lbr_stack.hw_idx = -1ULL; + cpuc->lbr_stack.hw_idx = tos; } void intel_pmu_lbr_read(void) @@ -1143,7 +1143,13 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) int i; cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - cpuc->lbr_stack.hw_idx = -1ULL; + + /* Cannot get TOS for large PEBS */ + if (cpuc->n_pebs == cpuc->n_large_pebs) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); + for (i = 0; i < x86_pmu.lbr_nr; i++) { u64 info = lbr->lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; -- Gitee From 95e04f75e9366bade2332d9c82d0d97e3d4b60ad Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:07 +0300 Subject: [PATCH 084/257] perf evsel: Add support for synthesized sample type commit e11869a065e36f3d22a575ccfb1097c262bb4f6e upstream For reporting purposes, an evsel sample can have a callchain synthesized from AUX area data. Add support for keeping track of synthesized sample types. Note, the recorded sample_type cannot be changed because it is needed to continue to parse events. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-11-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/util/evsel.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ddc5ee6f6592..54becc97c688 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -100,6 +100,14 @@ struct evsel { perf_evsel__sb_cb_t *cb; void *data; } side_band; + /* + * For reporting purposes, an evsel sample can have a callchain + * synthesized from AUX area data. Keep track of synthesized sample + * types here. Note, the recorded sample_type cannot be changed because + * it is needed to continue to parse events. + * See also evsel__has_callchain(). + */ + __u64 synth_sample_type; }; struct perf_missing_features { @@ -384,7 +392,12 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel) static inline bool evsel__has_callchain(const struct evsel *evsel) { - return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; + /* + * For reporting purposes, an evsel sample can have a recorded callchain + * or a callchain synthesized from AUX area data. 
+ */ + return evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN || + evsel->synth_sample_type & PERF_SAMPLE_CALLCHAIN; } struct perf_env *perf_evsel__env(struct evsel *evsel); -- Gitee From 3e82681f4285e97eeb59c40b78270f78907e6862 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 29 Apr 2020 18:07:47 +0300 Subject: [PATCH 085/257] perf evsel: Add support for synthesized branch stack sample type commit 6cd2cbfc6865589c64ac37ec48937e93725622f1 upstream Allow for a synthesized branch stack to be added to samples. As with synthesized call chains, the sample type cannot be changed because it is needed to continue to parse events. So add and use helper function evsel__has_br_stack() to indicate a branch stack, whether original or synthesized. Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200429150751.12570-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/util/evsel.h | 10 ++++++++++ tools/perf/util/session.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 54becc97c688..88882e55294e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -400,6 +400,16 @@ static inline bool evsel__has_callchain(const struct evsel *evsel) evsel->synth_sample_type & PERF_SAMPLE_CALLCHAIN; } +static inline bool evsel__has_br_stack(const struct evsel *evsel) +{ + /* + * For reporting purposes, an evsel sample can have a recorded branch + * stack or a branch stack synthesized from AUX area data. + */ + return evsel->core.attr.sample_type & PERF_SAMPLE_BRANCH_STACK || + evsel->synth_sample_type & PERF_SAMPLE_BRANCH_STACK; +} + struct perf_env *perf_evsel__env(struct evsel *evsel); int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5de955d60a04..17caadb33e6c 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1236,7 +1236,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (evsel__has_callchain(evsel)) callchain__printf(evsel, sample); - if (sample_type & PERF_SAMPLE_BRANCH_STACK) + if (evsel__has_br_stack(evsel)) branch_stack__printf(sample, perf_evsel__has_branch_callstack(evsel)); if (sample_type & PERF_SAMPLE_REGS_USER) -- Gitee From 81c1a99ef9838e08f34abe4874906da36c315aff Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:07 -0700 Subject: [PATCH 086/257] x86/cpufeatures: Add Architectural LBRs feature bit commit bd657aa3dd8514e62486ce7f90b5e484c18d684d upstream CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports Architectural LBRs. The "X86_FEATURE_..., word 18" is already mirrored from CPUID "0x00000007:0 (EDX)". Add X86_FEATURE_ARCH_LBR under the "word 18" section. The feature will appear as "arch_lbr" in /proc/cpuinfo. The Architectural Last Branch Records (LBR) feature enables recording of software path history by logging taken branches and other control flows. The feature will be supported in the perf_events subsystem. 
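The same enumeration bit can be probed from user space; a minimal sketch (not part of this patch) using the compiler-provided cpuid.h helper:

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* CPUID.(EAX=07H, ECX=0):EDX[19] enumerates Arch LBR. */
          if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                  return 1;

          printf("Architectural LBR: %ssupported\n",
                 (edx & (1u << 19)) ? "" : "not ");
          return 0;
  }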
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 690da46120b6..947b7d449dd6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -396,6 +396,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -- Gitee From 35dfb2e93a7d96b633817c09443aebf3fcf5aa12 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:08 -0700 Subject: [PATCH 087/257] perf/x86/intel/lbr: Add a function pointer for LBR reset commit 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 upstream The method to reset Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer is introduced for LBR reset. The enum of LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the corresponding functions at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR reset function is set as default. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/core.c | 7 +++++++ arch/x86/events/intel/lbr.c | 20 +++----------------- arch/x86/events/perf_event.h | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bf891caafa3c..1f155efa52d0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4277,6 +4277,8 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4321,6 +4323,8 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, }; static __init void intel_clovertown_quirk(void) @@ -4960,6 +4964,9 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 83a12e961df4..7626caa6f235 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = 
LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b3e13f72922b..0ef76ac07336 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -204,6 +204,17 @@ struct intel_excl_cntrs { struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, @@ -727,6 +738,8 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + void (*lbr_reset)(void); + /* * Intel PT/LBR/BTS are exclusive */ @@ -1102,6 +1115,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); -- Gitee From 05b877dae53982cb3f3eb205110a5c962ce10796 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:09 -0700 Subject: [PATCH 088/257] perf/x86/intel/lbr: Add a function pointer for LBR read commit c301b1d80ed5b806834fe0f739f028f65fb4fb16 upstream The method to read Architectural LBRs is different from previous model-specific LBR. Perf has to implement a different function. A function pointer for LBR read is introduced. Perf should initialize the corresponding function at boot time, and avoid checking lbr_format at run time. The current 64-bit LBR read function is set as default. 
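The shape of this boot-time dispatch, shared by the reset and read pointers, can be sketched as follows (names are simplified; this is not the kernel code itself):

  struct pmu_ops {
          void (*lbr_read)(void *cpuc);
  };

  static void lbr_read_32(void *cpuc) { /* legacy 32-bit format */ }
  static void lbr_read_64(void *cpuc) { /* 64-bit format, the default */ }

  static void pmu_init(struct pmu_ops *ops, int lbr_format_is_32)
  {
          ops->lbr_read = lbr_read_64;      /* default, set up front */
          if (lbr_format_is_32)
                  ops->lbr_read = lbr_read_32;

          /* Hot paths now call ops->lbr_read(cpuc) with no format check. */
  }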
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/core.c | 6 +++++- arch/x86/events/intel/lbr.c | 9 +++------ arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1f155efa52d0..3a9c142521da 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4279,6 +4279,7 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4325,6 +4326,7 @@ static __initconst const struct x86_pmu intel_pmu = { .aux_output_match = intel_pmu_aux_output_match, .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, }; static __init void intel_clovertown_quirk(void) @@ -4964,8 +4966,10 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } intel_ds_init(); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7626caa6f235..34eb9a652c1a 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -562,7 +562,7 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -599,7 +599,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -704,10 +704,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0ef76ac07336..39681d7bf422 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -739,6 +739,7 @@ struct x86_pmu { bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); /* * Intel PT/LBR/BTS are exclusive @@ -1129,6 +1130,10 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); -- Gitee From 99eff8beff28905e049c70fc4ef85dbaa72edd6a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:10 -0700 Subject: [PATCH 089/257] perf/x86/intel/lbr: Add the function pointers for LBR save and restore commit 799571bf38fc2b4b744fa448184b5915739b10fd upstream The MSRs of Architectural LBR are different from previous model-specific LBR. 
Perf has to implement different functions to save and restore them. The function pointers for LBR save and restore are introduced. Perf should initialize the corresponding functions at boot time. The generic optimizations, e.g. avoiding restoring the LBRs if no one else touched them, still apply to Architectural LBRs. The related code is not moved to model-specific functions. The current model-specific LBR functions are set as default. [Backport Changes] In file arch/x86/events/intel/lbr.c, in function __intel_pmu_lbr_save(), the rdmsrl() macro with MSR_LBR_SELECT, which had an extra leading whitespace added in commit c62d6b571d57, was removed as per the upstream patch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/core.c | 4 ++ arch/x86/events/intel/lbr.c | 79 ++++++++++++++++++++++-------------- arch/x86/events/perf_event.h | 6 +++ 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3a9c142521da..adb44cbc0a1f 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4280,6 +4280,8 @@ static __initconst const struct x86_pmu core_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4327,6 +4329,8 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_reset = intel_pmu_lbr_reset_64, .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 34eb9a652c1a..b2b8dc973057 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -323,31 +323,13 @@ static inline u64 rdlbr_to(unsigned int idx) return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +void intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { @@ -368,24 +350,48 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; if (cpuc->lbr_select) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; - int i; + u64 tos; - if (task_ctx->lbr_callstack_users == 0) { + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + tos 
= task_ctx->tos; + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Did not enter C6 + */ + if ((task_ctx == cpuc->last_task_ctx) && + (task_ctx->log_id == cpuc->last_log_id) && + rdlbr_from(tos)) { task_ctx->lbr_stack_state = LBR_NONE; return; } + x86_pmu.lbr_restore(task_ctx); + + task_ctx->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; + u64 tos, from; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { @@ -400,13 +406,26 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) } task_ctx->valid_lbrs = i; task_ctx->tos = tos; + + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(task_ctx); + task_ctx->lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; cpuc->last_log_id = ++task_ctx->log_id; - - if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 39681d7bf422..943d9e649dc5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -740,6 +740,8 @@ struct x86_pmu { void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); /* * Intel PT/LBR/BTS are exclusive @@ -1134,6 +1136,10 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); -- Gitee From e655e628b156b2ac8921cbf21986bddf5acdf158 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:11 -0700 Subject: [PATCH 090/257] perf/x86/intel/lbr: Factor out a new struct for generic optimization commit 530bfff6480307d210734222a54d56af7f908957 upstream To reduce the overhead of a context switch with LBR enabled, some generic optimizations were introduced, e.g. avoiding restore LBR if no one else touched them. The generic optimizations can also be used by Architecture LBR later. Currently, the fields for the generic optimizations are part of structure x86_perf_task_context, which will be deprecated by Architecture LBR. A new structure should be introduced for the common fields of generic optimization, which can be shared between Architecture LBR and model-specific LBR. Both 'valid_lbrs' and 'tos' are also used by the generic optimizations, but they are not moved into the new structure, because Architecture LBR is stack-like. The 'valid_lbrs' which records the index of the valid LBR is not required anymore. The TOS MSR will be removed. LBR registers may be cleared in the deep Cstate. If so, the generic optimizations should not be applied. Perf has to unconditionally restore the LBR registers. A generic function is required to detect the reset due to the deep Cstate. lbr_is_reset_in_cstate() is introduced. Currently, for the model-specific LBR, the TOS MSR is used to detect the reset. 
There will be another method introduced for Architecture LBR later. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 38 ++++++++++++++++++++---------------- arch/x86/events/perf_event.h | 10 +++++++--- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b2b8dc973057..bba9939635b6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static __always_inline bool +lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +{ + return !rdlbr_from(task_ctx->tos); +} + static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 tos; - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { + if (task_ctx->opt.lbr_callstack_users == 0 || + task_ctx->opt.lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } - tos = task_ctx->tos; /* * Does not restore the LBR registers, if * - No one else touched them, and - * - Did not enter C6 + * - Was not cleared in Cstate */ if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; + (task_ctx->opt.log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(task_ctx)) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_restore(task_ctx); - task_ctx->lbr_stack_state = LBR_NONE; + task_ctx->opt.lbr_stack_state = LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + if (task_ctx->opt.lbr_callstack_users == 0) { + task_ctx->opt.lbr_stack_state = LBR_NONE; return; } x86_pmu.lbr_save(task_ctx); - task_ctx->lbr_stack_state = LBR_VALID; + task_ctx->opt.lbr_stack_state = LBR_VALID; cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + cpuc->last_log_id = ++task_ctx->opt.log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, @@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, - next_ctx_data->lbr_callstack_users); + swap(prev_ctx_data->opt.lbr_callstack_users, + next_ctx_data->opt.lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; + task_ctx->opt.lbr_callstack_users++; } /* @@ -543,7 +547,7 @@ void intel_pmu_lbr_del(struct perf_event *event) if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; + task_ctx->opt.lbr_callstack_users--; } if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 943d9e649dc5..062ab3db6070 100644 --- 
a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -779,6 +779,12 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; +struct x86_perf_task_context_opt { + int lbr_callstack_users; + int lbr_stack_state; + int log_id; +}; + struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; @@ -786,9 +792,7 @@ struct x86_perf_task_context { u64 lbr_sel; int tos; int valid_lbrs; - int lbr_callstack_users; - int lbr_stack_state; - int log_id; + struct x86_perf_task_context_opt opt; }; #define x86_add_quirk(func_) \ -- Gitee From ef599b83b902ebce2b4492f3f59691fc4983443f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:12 -0700 Subject: [PATCH 091/257] perf/x86/intel/lbr: Use dynamic data structure for task_ctx commit f42be8651a7a9d5cb165e5d176fc0b09621b4f4d upstream The type of task_ctx is hardcoded as struct x86_perf_task_context, which doesn't apply for Architecture LBR. For example, Architecture LBR doesn't have the TOS MSR. The number of LBR entries is variable. A new struct will be introduced for Architecture LBR. Perf has to determine the type of task_ctx at run time. The type of task_ctx pointer is changed to 'void *', which will be determined at run time. The generic LBR optimization can be shared between Architecture LBR and model-specific LBR. Both need to access the structure for the generic LBR optimization. A helper task_context_opt() is introduced to retrieve the pointer of the structure at run time. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-7-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 59 ++++++++++++++++-------------------- arch/x86/events/perf_event.h | 7 ++++- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index bba9939635b6..e62baa996474 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -355,18 +355,17 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static __always_inline bool -lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx) +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(task_ctx->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0 || - task_ctx->opt.lbr_stack_state == LBR_NONE) { + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; } @@ -376,16 +375,16 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) * - No one else touched them, and * - Was not cleared in Cstate */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->opt.log_id == cpuc->last_log_id) && - !lbr_is_reset_in_cstate(task_ctx)) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_restore(task_ctx); + x86_pmu.lbr_restore(ctx); - task_ctx->opt.lbr_stack_state = LBR_NONE; + task_context_opt(ctx)->lbr_stack_state = 
LBR_NONE; } void intel_pmu_lbr_save(void *ctx) @@ -415,27 +414,27 @@ void intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_ctx->opt.lbr_callstack_users == 0) { - task_ctx->opt.lbr_stack_state = LBR_NONE; + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } - x86_pmu.lbr_save(task_ctx); + x86_pmu.lbr_save(ctx); - task_ctx->opt.lbr_stack_state = LBR_VALID; + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->opt.log_id; + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + void *prev_ctx_data, *next_ctx_data; swap(prev->task_ctx_data, next->task_ctx_data); @@ -451,14 +450,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->opt.lbr_callstack_users, - next_ctx_data->opt.lbr_callstack_users); + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -495,7 +494,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -505,10 +503,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -539,16 +535,13 @@ void intel_pmu_lbr_add(struct perf_event *event) void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->opt.lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 062ab3db6070..c4c6908c112d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -274,7 +274,7 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; int lbr_select; @@ -843,6 +843,11 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct 
x86_pmu x86_pmu __read_mostly; +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && -- Gitee From 28b58c0146ee540412a7fecece1f50d958bbe680 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:13 -0700 Subject: [PATCH 092/257] x86/msr-index: Add bunch of MSRs for Arch LBR commit d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f upstream Add Arch LBR related MSRs and the new LBR INFO bits in MSR-index. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/include/asm/msr-index.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 752a1a82a77b..b5fcde5114f9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -192,7 +192,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 -- Gitee From da282aa6b3437cdba185245d22f8b182d31d423e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:14 -0700 Subject: [PATCH 093/257] perf/x86: Expose CPUID enumeration bits for arch LBR commit af6cf129706b2f79e12f97e62d977e7f653cdfd1 upstream The LBR capabilities of Architecture LBR are retrieved from the CPUID enumeration once at boot time. The capabilities have to be saved for future usage. Several new fields are added into structure x86_pmu to indicate the capabilities. The fields will be used in the following patches. 
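As an aside, once the 'msr' module is loaded these registers can be inspected from user space; a hedged sketch that reads MSR_ARCH_LBR_DEPTH (0x14cf, as defined above) on CPU 0, requires root, and relies on pread() failing when the CPU lacks the MSR:

  #include <stdio.h>
  #include <stdint.h>
  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          uint64_t depth;
          int fd = open("/dev/cpu/0/msr", O_RDONLY);

          if (fd < 0)
                  return 1;

          /* The msr driver uses the file offset as the MSR address. */
          if (pread(fd, &depth, sizeof(depth), 0x14cf) != sizeof(depth)) {
                  close(fd);
                  return 1;
          }

          printf("MSR_ARCH_LBR_DEPTH: %llu\n", (unsigned long long)depth);
          close(fd);
          return 0;
  }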
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/perf_event.h | 13 ++++++++++ arch/x86/include/asm/perf_event.h | 40 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index c4c6908c112d..47e252964fa5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -738,6 +738,19 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); void (*lbr_save)(void *ctx); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ebb2b78ba8af..cbf824d85357 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -166,6 +166,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + /* * AMD "Extended Performance Monitoring and Debug" CPUID * detection/enumeration details: -- Gitee From e4c9d4cb419fcb04821ae8118deca38dbf6057f4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:15 -0700 Subject: [PATCH 094/257] perf/x86/intel/lbr: Support LBR_CTL commit 49d8184f2036ff5b8d1eea3d61bac8b23420eca7 upstream An IA32_LBR_CTL is introduced for Architecture LBR to enable and config LBR registers to replace the previous LBR_SELECT. All the related members in struct cpu_hw_events and struct x86_pmu have to be renamed. Some new macros are added to reflect the layout of LBR_CTL. The mapping from PERF_SAMPLE_BRANCH_* to the corresponding bits in LBR_CTL MSR is saved in lbr_ctl_map now, which is not a const value. The value relies on the CPUID enumeration. For the previous model-specific LBR, most of the bits in LBR_SELECT operate in the suppressed mode. For the bits in LBR_CTL, the polarity is inverted. For the previous model-specific LBR format 5 (LBR_FORMAT_INFO), if the NO_CYCLES and NO_FLAGS type are set, the flag LBR_NO_INFO will be set to avoid the unnecessary LBR_INFO MSR read. Although Architecture LBR also has a dedicated LBR_INFO MSR, perf doesn't need to check and set the flag LBR_NO_INFO. 
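The cpuid28_* unions correspond to CPUID leaf 0x1C (28 decimal). A user-space probe of the same enumeration might look like the sketch below (illustrative only; the bit positions follow the unions introduced by this patch):

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid_count(0x1c, 0, &eax, &ebx, &ecx, &edx))
                  return 1;

          printf("depth mask:  0x%02x\n", eax & 0xff);  /* lbr_depth_mask */
          printf("call stack:  %u\n", (ebx >> 2) & 1);  /* lbr_call_stack */
          printf("branch type: %u\n", (ecx >> 2) & 1);  /* lbr_br_type */
          return 0;
  }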
For Architecture LBR, XSAVES instruction will be used as the default way to read the LBR MSRs all together. The overhead which the flag tries to avoid doesn't exist anymore. Dropping the flag can save the extra check for the flag in the lbr_read() later, and make the code cleaner. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-10-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 43 ++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 15 ++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e62baa996474..77425624752c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -132,6 +132,44 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); /* @@ -820,6 +858,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. 
So LBR_SELECT should be set to diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 47e252964fa5..8d769a02a7e8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -272,7 +272,10 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; void *last_task_ctx; int last_log_id; @@ -733,8 +736,14 @@ struct x86_pmu { */ unsigned int lbr_tos, lbr_from, lbr_to, lbr_nr; /* LBR base regs and size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ -- Gitee From 81abd7764c3cde1d0c737ad50b52528c640311c4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:16 -0700 Subject: [PATCH 095/257] perf/x86/intel/lbr: Unify the stored format of LBR information commit 5624986dc61b81a77fb6136bc232593483d1c254 upstream Current LBR information in the structure x86_perf_task_context is stored in a different format from the PEBS LBR record and Architecture LBR, which prevents the sharing of the common codes. Use the format of the PEBS LBR record as a unified format. Use a generic name lbr_entry to replace pebs_lbr_entry. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/ds.c | 6 +++--- arch/x86/events/intel/lbr.c | 20 ++++++++++---------- arch/x86/events/perf_event.h | 6 ++---- arch/x86/include/asm/perf_event.h | 6 +----- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6da90c6f3390..7c7319154e1b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -956,7 +956,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1597,10 +1597,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 77425624752c..b8baaf15c5f4 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); + wrlbr_from(lbr_idx, task_ctx->lbr[i].from); + 
wrlbr_to(lbr_idx, task_ctx->lbr[i].to); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx) from = rdlbr_from(lbr_idx); if (!from) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); + task_ctx->lbr[i].from = from; + task_ctx->lbr[i].to = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; @@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; + u64 info = lbr[i].info; struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; + e->from = lbr[i].from; + e->to = lbr[i].to; e->mispred = !!(info & LBR_INFO_MISPRED); e->predicted = !(info & LBR_INFO_MISPRED); e->in_tx = !!(info & LBR_INFO_IN_TX); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8d769a02a7e8..a1594cdb6ff2 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -808,13 +808,11 @@ struct x86_perf_task_context_opt { }; struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; u64 lbr_sel; int tos; int valid_lbrs; struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; #define x86_add_quirk(func_) \ @@ -1136,7 +1134,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cbf824d85357..a3c2e3b0c4c8 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -389,14 +389,10 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { +struct lbr_entry { u64 from, to, info; }; -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; - /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ -- Gitee From 5a23f635eb6cc0f4fa077794e0ee5fcb742e08f1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:17 -0700 Subject: [PATCH 096/257] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline commit 020d91e5f32da4f4b929b3a6e680135fd526107c upstream The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context switch and NMI handler. They should be always inline to achieve better performance. However, the CONFIG_OPTIMIZE_INLINING allows the compiler to uninline functions marked 'inline'. Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force inline the wrappers. 
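To make the distinction concrete, a small sketch (not part of the patch; rd_hint() and rd_forced() are made-up names): with CONFIG_OPTIMIZE_INLINING=y the compiler may emit an 'inline' function out of line, so every LBR access becomes a real call, while __always_inline forces expansion at each call site:

static inline u64 rd_hint(unsigned int idx)		/* hint only: may stay out of line */
{
	u64 val;

	rdmsrl(x86_pmu.lbr_to + idx, val);		/* one MSR read per LBR slot */
	return val;
}

static __always_inline u64 rd_forced(unsigned int idx)	/* always expanded inline */
{
	u64 val;

	rdmsrl(x86_pmu.lbr_to + idx, val);
	return val;
}

On the context-switch and NMI paths the wrappers run once per LBR entry, so the saved call overhead adds up.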
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b8baaf15c5f4..21f4f071f2c0 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx) return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx) { u64 val; -- Gitee From a75f59b1dd6e7f88fd79d51d8fec939d596e07b2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:18 -0700 Subject: [PATCH 097/257] perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all() commit fda1f99f34a8f0975086bcfef34da865009995c1 upstream The previous model-specific LBR and Architecture LBR (legacy way) use a similar method to save/restore the LBR information, which directly accesses the LBR registers. The codes which read/write a set of LBR registers can be shared between them. Factor out two functions which are used to read/write a set of LBR registers. Add lbr_info into structure x86_pmu, and use it to replace the hardcoded LBR INFO MSR, because the LBR INFO MSR address of the previous model-specific LBR is different from Architecture LBR. The MSR address should be assigned at boot time. For now, only Sky Lake and later platforms have the LBR INFO MSR. 
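For orientation, a sketch of what the boot-time assignment buys (the helper restates rdlbr_info() from the diff below; the MSR constants are the ones used in this series):

/*
 * x86_pmu.lbr_info is set once at boot:
 *   model-specific LBR (Skylake and later): x86_pmu.lbr_info = MSR_LBR_INFO_0;
 *   Architectural LBR (a later patch):      x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0;
 * so every access is base + index instead of a hardcoded MSR:
 */
static __always_inline u64 rdlbr_info(unsigned int idx)
{
	u64 val;

	rdmsrl(x86_pmu.lbr_info + idx, val);
	return val;
}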
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 66 +++++++++++++++++++++++++++--------- arch/x86/events/perf_event.h | 2 +- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21f4f071f2c0..d3d129c7d7ef 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_to + idx, val); } +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + static __always_inline u64 rdlbr_from(unsigned int idx) { u64 val; @@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx) return val; } +static __always_inline u64 rdlbr_info(unsigned int idx) +{ + u64 val; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx); + if (need_info) + lbr->info = rdlbr_info(idx); + + return true; +} + void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; int i; @@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void *ctx) mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr[i].from); - wrlbr_to(lbr_idx, task_ctx->lbr[i].to); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); @@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; unsigned lbr_idx, mask; - u64 tos, from; + u64 tos; int i; mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr[i].from = from; - task_ctx->lbr[i].to = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; @@ 
-689,7 +722,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -1336,6 +1369,7 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; @@ -1421,7 +1455,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->nr = x86_pmu.lbr_nr; lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; - lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a1594cdb6ff2..a6318aafd041 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -735,7 +735,7 @@ struct x86_pmu { * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, - lbr_nr; /* LBR base regs and size */ + lbr_info, lbr_nr; /* LBR base regs and size */ union { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ u64 lbr_ctl_mask; /* LBR_CTL valid bits */ -- Gitee From 48b724a8de467173ff6a3c370ed0629fcb189621 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:19 -0700 Subject: [PATCH 098/257] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr commit 631618a0dca31dc23dcce38cf345c6139bd8a1e9 upstream The way the LBR information is stored from a PEBS LBR record can be reused for Architecture LBR, because - The LBR information is stored like a stack. Entry 0 is always the youngest branch. - The layout of the LBR INFO MSR is similar. The LBR information may be retrieved from either the LBR registers (non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support both methods. Explicitly check for an invalid entry (all 0s), which avoids unnecessary MSR accesses for a non-PEBS event. For a PEBS event, the check should slightly improve performance as well. The invalid entries are cut off, so intel_pmu_lbr_filter() doesn't need to check and filter them out. The function cannot be shared with the current model-specific LBR read, because the direction of LBR growth is opposite.
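The opposite growth directions can be sketched as follows (a sketch, not part of the patch; nr is the LBR depth and tos the top-of-stack value):

static void lbr_index_orders(unsigned int nr, unsigned int tos)
{
	unsigned int i;

	for (i = 0; i < nr; i++) {
		/* model-specific LBR: ring walked backwards from the TOS MSR */
		unsigned int ring_idx = (tos - i) & (nr - 1);
		/* Arch LBR / PEBS record: stack, entry 0 is always the youngest */
		unsigned int stack_idx = i;

		(void)ring_idx;
		(void)stack_idx;
	}
}

This difference in index order is why intel_pmu_store_lbr() is added as a new function rather than shared with intel_pmu_lbr_read_64().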
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 82 +++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index d3d129c7d7ef..0d7a85903964 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, u64 val) wrmsrl(x86_pmu.lbr_info + idx, val); } -static __always_inline u64 rdlbr_from(unsigned int idx) +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static __always_inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static __always_inline u64 rdlbr_info(unsigned int idx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->info; + rdmsrl(x86_pmu.lbr_info + idx, val); return val; @@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) static inline bool rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) { - u64 from = rdlbr_from(idx); + u64 from = rdlbr_from(idx, NULL); /* Don't read invalid entry */ if (!from) return false; lbr->from = from; - lbr->to = rdlbr_to(idx); + lbr->to = rdlbr_to(idx, NULL); if (need_info) - lbr->info = rdlbr_info(idx); + lbr->info = rdlbr_info(idx, NULL); return true; } @@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx) static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { - return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } static void __intel_pmu_lbr_restore(void *ctx) @@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - info = rdlbr_info(lbr_idx); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. 
+ */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + e->from = from; + e->to = to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->type = 0; + e->reserved = 0; + } + + cpuc->lbr_stack.nr = i; +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1215,9 +1260,6 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; /* Cannot get TOS for large PEBS */ if (cpuc->n_pebs == cpuc->n_large_pebs) @@ -1225,19 +1267,7 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - - e->from = lbr[i].from; - e->to = lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } -- Gitee From 9b98568999d2d893b2cd2d5b13cfa181fecf3796 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:20 -0700 Subject: [PATCH 099/257] perf/x86/intel/lbr: Support Architectural LBR commit 47125db27e47e9d44c878bf8925aa057824bb0d5 upstream Last Branch Records (LBR) enable recording of software path history by logging taken branches and other control flow within architectural registers. Intel CPUs have had model-specific LBRs for quite some time, but this evolves them into an architectural feature. The main improvements implemented for Architectural LBR include: - The Linux kernel can support the LBR features without knowing the model number of the current CPU. - Architectural LBR capabilities can be enumerated by CPUID. The lbr_ctl_map is based on the CPUID enumeration. - The possible LBR depth can be retrieved from CPUID enumeration. The max value is written to the new MSR_ARCH_LBR_DEPTH as the number of LBR entries. - A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs, which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR. - Each LBR record or entry still comprises three MSRs, IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_INFO, but they become architectural MSRs. - Architectural LBR is stack-like now. Entry 0 is always the youngest branch, entry 1 the next youngest... The TOS MSR has been removed. The way to enable/disable Architectural LBR is similar to the previous model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but some modifications are required, which include: - MSR_ARCH_LBR_CTL is used to enable and configure the Architectural LBR. - When checking the value of the IA32_DEBUGCTL MSR, ignore DEBUGCTLMSR_LBR (bit 0) for Architectural LBR; it has no meaning there and always reads as 0. - FREEZE_LBRS_ON_PMI has to be explicitly set/cleared, because MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for Architectural LBR. - Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for Architectural LBR. Some dedicated Architectural LBR functions are implemented to reset/read/save/restore LBR.
- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR entries, which is a lot faster and can improve the context switch latency. - For read, the branch type information can be retrieved from the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to OTHER_BRANCH type. The software decoding is still required for the OTHER_BRANCH case. LBR records are stored in the age order as well. Reuse intel_pmu_store_lbr(). Check the CPUID enumeration before accessing the corresponding bits in LBR_INFO. - For save/restore, applying the fast reset (writing ARCH_LBR_DEPTH). Reading 'lbr_from' of entry 0 instead of the TOS MSR to check if the LBR registers are reset in the deep C-state. If 'the deep C-state reset' bit is not set in CPUID enumeration, ignoring the check. XSAVE support for Architectural LBR will be implemented later. The number of LBR entries cannot be hardcoded anymore, which should be retrieved from CPUID enumeration. A new structure x86_perf_task_context_arch_lbr is introduced for Architectural LBR. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com Signed-off-by: Mukesh-Ogare Signed-off-by: mohanasv2 --- arch/x86/events/intel/core.c | 3 + arch/x86/events/intel/lbr.c | 251 +++++++++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 10 ++ 3 files changed, 253 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index adb44cbc0a1f..091420463569 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4975,6 +4975,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_read = intel_pmu_lbr_read_32; } + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 0d7a85903964..e4e249a78451 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -172,6 +172,14 @@ enum { static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -195,27 +203,40 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. 
*/ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); @@ -241,6 +262,12 @@ void intel_pmu_lbr_reset_64(void) } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -439,8 +466,28 @@ void intel_pmu_lbr_restore(void *ctx) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_restore(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } @@ -494,6 +541,22 @@ void intel_pmu_lbr_save(void *ctx) rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. 
*/ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -786,6 +849,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, struct lbr_entry *entries) { @@ -810,18 +906,23 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->from = from; e->to = to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); e->in_tx = !!(info & LBR_INFO_IN_TX); e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->type = 0; + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); e->reserved = 0; } cpuc->lbr_stack.nr = i; } +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1197,6 +1298,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -1209,7 +1331,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1221,8 +1343,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? 
X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1261,8 +1394,9 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - /* Cannot get TOS for large PEBS */ - if (cpuc->n_pebs == cpuc->n_large_pebs) + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) cpuc->lbr_stack.hw_idx = -1ULL; else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); @@ -1324,6 +1458,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1471,6 +1625,81 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void __init intel_pmu_arch_lbr_init(void) +{ + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + 
arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + /** * x86_perf_get_lbr - get the LBR records information * diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a6318aafd041..9fd34a4d2aaa 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -815,6 +815,11 @@ struct x86_perf_task_context { struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -865,6 +870,9 @@ extern struct x86_pmu x86_pmu __read_mostly; static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + return &((struct x86_perf_task_context *)ctx)->opt; } @@ -1185,6 +1193,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); -- Gitee From 30e7c982657c8d6403b91117c60ee02a9ae6a61d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Aug 2020 15:57:41 +0200 Subject: [PATCH 100/257] static_call: Add basic static call infrastructure commit 115284d89a436e9b66da0c6c4f6efded806874b2 upstream Static calls are a replacement for global function pointers. They use code patching to allow direct calls to be used instead of indirect calls. They give the flexibility of function pointers, but with improved performance. This is especially important for cases where retpolines would otherwise be used, as retpolines can significantly impact performance. The concept and code are an extension of previous work done by Ard Biesheuvel and Steven Rostedt: https://lkml.kernel.org/r/20181005081333.15018-1-ard.biesheuvel@linaro.org https://lkml.kernel.org/r/20181006015110.653946300@goodmis.org There are two implementations, depending on arch support: 1) out-of-line: patched trampolines (CONFIG_HAVE_STATIC_CALL) 2) basic function pointers For more details, see the comments in include/linux/static_call.h. 
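As a quick orientation, a minimal (hypothetical) conversion from a global function pointer to a static call; my_hook, func_a and func_b are illustrative names, the macros are those introduced below:

int func_a(int arg);
int func_b(int arg);

DEFINE_STATIC_CALL(my_hook, func_a);		/* default target: func_a() */

int do_work(int arg)
{
	return static_call(my_hook)(arg);	/* direct call or trampoline, no retpoline */
}

void switch_to_b(void)
{
	static_call_update(my_hook, &func_b);	/* re-patch the target to func_b() */
}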
[peterz: simplified interface] Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Link: https://lore.kernel.org/r/20200818135804.623259796@infradead.org Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/Kconfig | 3 + include/linux/static_call.h | 156 ++++++++++++++++++++++++++++++ include/linux/static_call_types.h | 15 +++ 3 files changed, 174 insertions(+) create mode 100644 include/linux/static_call.h create mode 100644 include/linux/static_call_types.h diff --git a/arch/Kconfig b/arch/Kconfig index 80682c6f6d12..7adde6e58dce 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -973,6 +973,9 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG address translations. Page table walkers that clear the accessed bit may use this capability to reduce their search space. +config HAVE_STATIC_CALL + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/static_call.h b/include/linux/static_call.h new file mode 100644 index 000000000000..d8892dff2e91 --- /dev/null +++ b/include/linux/static_call.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_STATIC_CALL_H +#define _LINUX_STATIC_CALL_H + +/* + * Static call support + * + * Static calls use code patching to hard-code function pointers into direct + * branch instructions. They give the flexibility of function pointers, but + * with improved performance. This is especially important for cases where + * retpolines would otherwise be used, as retpolines can significantly impact + * performance. + * + * + * API overview: + * + * DECLARE_STATIC_CALL(name, func); + * DEFINE_STATIC_CALL(name, func); + * static_call(name)(args...); + * static_call_update(name, func); + * + * Usage example: + * + * # Start with the following functions (with identical prototypes): + * int func_a(int arg1, int arg2); + * int func_b(int arg1, int arg2); + * + * # Define a 'my_name' reference, associated with func_a() by default + * DEFINE_STATIC_CALL(my_name, func_a); + * + * # Call func_a() + * static_call(my_name)(arg1, arg2); + * + * # Update 'my_name' to point to func_b() + * static_call_update(my_name, &func_b); + * + * # Call func_b() + * static_call(my_name)(arg1, arg2); + * + * + * Implementation details: + * + * This requires some arch-specific code (CONFIG_HAVE_STATIC_CALL). + * Otherwise basic indirect calls are used (with function pointers). + * + * Each static_call() site calls into a trampoline associated with the name. + * The trampoline has a direct branch to the default function. Updates to a + * name will modify the trampoline's branch destination. + * + * If the arch has CONFIG_HAVE_STATIC_CALL_INLINE, then the call sites + * themselves will be patched at runtime to call the functions directly, + * rather than calling through the trampoline. This requires objtool or a + * compiler plugin to detect all the static_call() sites and annotate them + * in the .static_call_sites section. + */ + +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/static_call_types.h> + +#ifdef CONFIG_HAVE_STATIC_CALL +#include <asm/static_call.h> + +/* + * Either @site or @tramp can be NULL. + */ +extern void arch_static_call_transform(void *site, void *tramp, void *func); + +#define STATIC_CALL_TRAMP_ADDR(name) &STATIC_CALL_TRAMP(name) + +/* + * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from + * the symbol table so that objtool can reference it when it generates the + * .static_call_sites section.
+ */ +#define __static_call(name) \ +({ \ + __ADDRESSABLE(STATIC_CALL_KEY(name)); \ + &STATIC_CALL_TRAMP(name); \ +}) + +#else +#define STATIC_CALL_TRAMP_ADDR(name) NULL +#endif + + +#define DECLARE_STATIC_CALL(name, func) \ + extern struct static_call_key STATIC_CALL_KEY(name); \ + extern typeof(func) STATIC_CALL_TRAMP(name); + +#define static_call_update(name, func) \ +({ \ + BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name))); \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), func); \ +}) + +#if defined(CONFIG_HAVE_STATIC_CALL) + +struct static_call_key { + void *func; +}; + +#define DEFINE_STATIC_CALL(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = _func, \ + }; \ + ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func) + +#define static_call(name) __static_call(name) + +static inline +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + cpus_read_lock(); + WRITE_ONCE(key->func, func); + arch_static_call_transform(NULL, tramp, func); + cpus_read_unlock(); +} + +#define EXPORT_STATIC_CALL(name) \ + EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ + EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) + +#define EXPORT_STATIC_CALL_GPL(name) \ + EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ + EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) + +#else /* Generic implementation */ + +struct static_call_key { + void *func; +}; + +#define DEFINE_STATIC_CALL(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = _func, \ + } + +#define static_call(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +static inline +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + WRITE_ONCE(key->func, func); +} + +#define EXPORT_STATIC_CALL(name) EXPORT_SYMBOL(STATIC_CALL_KEY(name)) +#define EXPORT_STATIC_CALL_GPL(name) EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)) + +#endif /* CONFIG_HAVE_STATIC_CALL */ + +#endif /* _LINUX_STATIC_CALL_H */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h new file mode 100644 index 000000000000..5ed249dc47d3 --- /dev/null +++ b/include/linux/static_call_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _STATIC_CALL_TYPES_H +#define _STATIC_CALL_TYPES_H + +#include <linux/stringify.h> + +#define STATIC_CALL_KEY_PREFIX __SCK__ +#define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name) + +#define STATIC_CALL_TRAMP_PREFIX __SCT__ +#define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX) +#define STATIC_CALL_TRAMP(name) __PASTE(STATIC_CALL_TRAMP_PREFIX, name) +#define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name)) + +#endif /* _STATIC_CALL_TYPES_H */ -- Gitee From 6a8897c1e8f7a4a674c95bbbbbd5052f34c0167a Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 8 Sep 2020 16:47:39 -0500 Subject: [PATCH 101/257] perf/x86/amd/ibs: Support 27-bit extended Op/cycle counter commit 8b0bed7d410f48499d72af2e2bcd890daad94e0d upstream IBS hardware with the OpCntExt feature gets a 7-bit wider internal counter. Both the maximum and current count bitfields in the IBS_OP_CTL register are extended to support reading and writing it. No changes are necessary to the driver for handling the extra contiguous current count bits (IbsOpCurCnt), as the driver already passes through 32 bits of that field.
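For reference, the op-count fields of IBS_OP_CTL with OpCntExt, summarizing the masks used in the diff below (a sketch, not an authoritative hardware description):

/*
 * IBS_OP_CTL count fields with the OpCntExt extension:
 *   bits 15:0   IbsOpMaxCnt[19:4]  - maximum count, programmed as period >> 4
 *   bits 26:20  IbsOpMaxCnt[26:20] - upper 7 bits, separate from bits 15:0
 *   bits 58:32  IbsOpCurCnt        - current count, contiguous
 */
#define IBS_OP_MAX_CNT		0x0000FFFFULL
#define IBS_OP_MAX_CNT_EXT_MASK	(0x7FULL<<20)

Because MaxCnt is stored in units of 16, ((period >> 4) & (0x7f << 16)) << 4 equals period & IBS_OP_MAX_CNT_EXT_MASK, which is why the upper bits can be copied into the register unshifted.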
However, the driver has to do some extra bit manipulation when converting from a period to the non-contiguous (although conveniently aligned) extra bits in the IbsOpMaxCnt bitfield. This decreases IBS Op interrupt overhead when the period is over 1,048,560 (0xffff0), which would previously activate the driver's software counter. That threshold is now 134,217,712 (0x7fffff0). Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200908214740.18097-7-kim.phillips@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 42 +++++++++++++++++++++++-------- arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2e930d8c04d9..b08d009aa9d8 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -351,10 +351,13 @@ static u64 get_ibs_op_count(u64 config) * and the lower 7 bits of CurCnt are randomized. * Otherwise CurCnt has the full 27-bit current counter value. */ - if (config & IBS_OP_VAL) + if (config & IBS_OP_VAL) { count = (config & IBS_OP_MAX_CNT) << 4; - else if (ibs_caps & IBS_CAPS_RDWROPCNT) + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += config & IBS_OP_MAX_CNT_EXT_MASK; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { count = (config & IBS_OP_CUR_CNT) >> 32; + } return count; } @@ -415,7 +418,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; + u64 period, config = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -424,13 +427,19 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + /* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it. 
*/ set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + perf_ibs_enable_event(perf_ibs, hwc, config); perf_event_update_userpage(event); } @@ -599,7 +608,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, *config, period; + u64 *buf, *config, period, new_config = 0; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -706,13 +715,17 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (throttle) { perf_ibs_stop(event, 0); } else { - period >>= 4; - - if ((ibs_caps & IBS_CAPS_RDWROPCNT) && - (*config & IBS_OP_CNT_CTL)) - period |= *config & IBS_OP_CUR_CNT_RAND; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; - perf_ibs_enable_event(perf_ibs, hwc, period); + perf_ibs_enable_event(perf_ibs, hwc, new_config); } perf_event_update_userpage(event); @@ -789,6 +802,13 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; *attr++ = &format_attr_cnt_ctl.attr; } + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index a3c2e3b0c4c8..3ceae0473115 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -450,6 +450,7 @@ struct lbr_entry { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ #define IBS_RIP_INVALID (1ULL<<38) #ifdef CONFIG_X86_LOCAL_APIC -- Gitee From d07f3b048c35703cb06d13103ff871d5dc30d61e Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 21 Sep 2020 09:43:30 -0500 Subject: [PATCH 102/257] perf/amd/uncore: Inform the user how many counters each uncore PMU has commit 9ed9647dc0671486f9e998b7258f75167a9c4697 upstream Previously, the uncore driver would say "NB counters detected" on F17h machines, which don't have NorthBridge (NB) counters. They have Data Fabric (DF) counters. Just use the pmu.name to inform users which pmu to use and its associated counter count. 
F17h dmesg BEFORE: amd_uncore: AMD NB counters detected amd_uncore: AMD LLC counters detected F17h dmesg AFTER: amd_uncore: 4 amd_df counters detected amd_uncore: 6 amd_l3 counters detected Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200921144330.6331-5-kim.phillips@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index af14e78a1697..5def365af921 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -663,9 +663,10 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("%s NB counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_nb_pmu.name); + ret = 0; } @@ -706,9 +707,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("%s LLC counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); ret = 0; } -- Gitee From 4a6918ba679ab249264dbd1d1a133fb1fef21541 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:47 -0500 Subject: [PATCH 103/257] perf/amd/uncore: Allow the driver to be built as a module commit 05485745ad482c1910a45f23a5c255f6a0df0f46 upstream Add support to build the AMD uncore driver as a module. This is in order to facilitate development without having to reboot the kernel in most cases. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-8-kim.phillips@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/Kconfig | 10 ++++++++++ arch/x86/events/amd/Makefile | 5 +++-- arch/x86/events/amd/uncore.c | 28 +++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 4a809c6cbd2f..03b8674b192c 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -34,4 +34,14 @@ config PERF_EVENTS_AMD_POWER (CPUID Fn8000_0007_EDX[12]) interface to calculate the average power consumption on Family 15h processors. +config PERF_EVENTS_AMD_UNCORE + tristate "AMD Uncore performance events" + depends on PERF_EVENTS && CPU_SUP_AMD + default y + help + Include support for AMD uncore performance events for use with + e.g., perf stat -e amd_l3/.../,amd_df/.../. + + To compile this driver as a module, choose M here: the + module will be called 'amd-uncore'. 
endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index fe8795a67385..6cbe38d5fd9d 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o +amd-uncore-objs := uncore.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += iommu.o endif - diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 5def365af921..3c13f0968ccb 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -402,6 +402,7 @@ static struct pmu amd_nb_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu amd_llc_pmu = { @@ -415,6 +416,7 @@ static struct pmu amd_llc_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) @@ -747,4 +749,28 @@ static int __init amd_uncore_init(void) return ret; } -device_initcall(amd_uncore_init); + +static void __exit amd_uncore_exit(void) +{ + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE); + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); + + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + perf_pmu_unregister(&amd_llc_pmu); + free_percpu(amd_uncore_llc); + amd_uncore_llc = NULL; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + perf_pmu_unregister(&amd_nb_pmu); + free_percpu(amd_uncore_nb); + amd_uncore_nb = NULL; + } +} + +module_init(amd_uncore_init); +module_exit(amd_uncore_exit); + +MODULE_DESCRIPTION("AMD Uncore Driver"); +MODULE_LICENSE("GPL v2"); -- Gitee From 01140cb2e54832f624160de15d2d54c3374696b4 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:48 -0500 Subject: [PATCH 104/257] perf/x86/amd/ibs: Add bitfield definitions in new header commit 6a371bafe613b7746c3d3ac486bdb3035f77e029 upstream Add <asm/amd-ibs.h> with bitfield definitions for IBS MSRs, and demonstrate usage within the driver. Also move 'struct perf_ibs_data' where it can be shared with the perf tool that will soon be using it. No functional changes.
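For illustration, reading the op max count through the new union instead of open-coded masks and shifts (a sketch with a hypothetical helper name; it mirrors the get_ibs_op_count() change in the diff below):

static u64 read_op_max_count(void)
{
	union ibs_op_ctl op_ctl;
	u64 count = 0;

	rdmsrl(MSR_AMD64_IBSOPCTL, op_ctl.val);
	if (op_ctl.op_val) {				/* sample valid */
		count = op_ctl.opmaxcnt << 4;		/* stored in units of 16 */
		if (ibs_caps & IBS_CAPS_OPCNTEXT)
			count += op_ctl.opmaxcnt_ext << 20;	/* upper 7 bits */
	}
	return count;
}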
Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-9-kim.phillips@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 23 +++--- arch/x86/include/asm/amd-ibs.h | 132 +++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 14 deletions(-) create mode 100644 arch/x86/include/asm/amd-ibs.h diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index b08d009aa9d8..11e8b493e015 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -26,6 +26,7 @@ static u32 ibs_caps; #include #include +#include #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -100,15 +101,6 @@ struct perf_ibs { u64 (*get_count)(u64 config); }; -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - static int perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { @@ -339,11 +331,14 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, static u64 get_ibs_fetch_count(u64 config) { - return (config & IBS_FETCH_CNT) >> 12; + union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; + + return fetch_ctl.fetch_cnt << 4; } static u64 get_ibs_op_count(u64 config) { + union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; u64 count = 0; /* @@ -351,12 +346,12 @@ static u64 get_ibs_op_count(u64 config) * and the lower 7 bits of CurCnt are randomized. * Otherwise CurCnt has the full 27-bit current counter value. */ - if (config & IBS_OP_VAL) { - count = (config & IBS_OP_MAX_CNT) << 4; + if (op_ctl.op_val) { + count = op_ctl.opmaxcnt << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT) - count += config & IBS_OP_MAX_CNT_EXT_MASK; + count += op_ctl.opmaxcnt_ext << 20; } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { - count = (config & IBS_OP_CUR_CNT) >> 32; + count = op_ctl.opcurcnt; } return count; diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h new file mode 100644 index 000000000000..46e1df45efc0 --- /dev/null +++ b/arch/x86/include/asm/amd-ibs.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * From PPR Vol 1 for AMD Family 19h Model 01h B1 + * 55898 Rev 0.35 - Feb 5, 2021 + */ + +#include + +/* + * IBS Hardware MSRs + */ + +/* MSR 0xc0011030: IBS Fetch Control */ +union ibs_fetch_ctl { + __u64 val; + struct { + __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */ + fetch_cnt:16, /* 16-31: instruction fetch count */ + fetch_lat:16, /* 32-47: instruction fetch latency */ + fetch_en:1, /* 48: instruction fetch enable */ + fetch_val:1, /* 49: instruction fetch valid */ + fetch_comp:1, /* 50: instruction fetch complete */ + ic_miss:1, /* 51: i-cache miss */ + phy_addr_valid:1,/* 52: physical address valid */ + l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size + * (needs IbsPhyAddrValid) */ + l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */ + l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */ + rand_en:1, /* 57: random tagging enable */ + fetch_l2_miss:1,/* 58: L2 miss for sampled fetch + * (needs IbsFetchComp) */ + reserved:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011033: IBS Execution Control */ +union ibs_op_ctl { + __u64 val; + struct { + __u64 opmaxcnt:16, /* 0-15: periodic op max. 
count */ + reserved0:1, /* 16: reserved */ + op_en:1, /* 17: op sampling enable */ + op_val:1, /* 18: op sample valid */ + cnt_ctl:1, /* 19: periodic op counter control */ + opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ + reserved1:5, /* 27-31: reserved */ + opcurcnt:27, /* 32-58: periodic op counter current count */ + reserved2:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011035: IBS Op Data */ +union ibs_op_data { + __u64 val; + struct { + __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */ + tag_to_ret_ctr:16, /* 16-31: op tag to retire count */ + reserved1:2, /* 32-33: reserved */ + op_return:1, /* 34: return op */ + op_brn_taken:1, /* 35: taken branch op */ + op_brn_misp:1, /* 36: mispredicted branch op */ + op_brn_ret:1, /* 37: branch op retired */ + op_rip_invalid:1, /* 38: RIP is invalid */ + op_brn_fuse:1, /* 39: fused branch op */ + op_microcode:1, /* 40: microcode op */ + reserved2:23; /* 41-63: reserved */ + }; +}; + +/* MSR 0xc0011036: IBS Op Data 2 */ +union ibs_op_data2 { + __u64 val; + struct { + __u64 data_src:3, /* 0-2: data source */ + reserved0:1, /* 3: reserved */ + rmt_node:1, /* 4: destination node */ + cache_hit_st:1, /* 5: cache hit state */ + reserved1:58; /* 6-63: reserved */ + }; +}; + +/* MSR 0xc0011037: IBS Op Data 3 */ +union ibs_op_data3 { + __u64 val; + struct { + __u64 ld_op:1, /* 0: load op */ + st_op:1, /* 1: store op */ + dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */ + dc_l2tlb_miss:1, /* 3: data cache L2TLB miss */ + dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */ + dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */ + dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */ + dc_miss:1, /* 7: data cache miss */ + dc_mis_acc:1, /* 8: misaligned access */ + reserved:4, /* 9-12: reserved */ + dc_wc_mem_acc:1, /* 13: write combining memory access */ + dc_uc_mem_acc:1, /* 14: uncacheable memory access */ + dc_locked_op:1, /* 15: locked operation */ + dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */ + dc_lin_addr_valid:1, /* 17: data cache linear address valid */ + dc_phy_addr_valid:1, /* 18: data cache physical address valid */ + dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */ + l2_miss:1, /* 20: L2 cache miss */ + sw_pf:1, /* 21: software prefetch */ + op_mem_width:4, /* 22-25: load/store size in bytes */ + op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */ + dc_miss_lat:16, /* 32-47: data cache miss latency */ + tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */ + }; +}; + +/* MSR 0xc001103c: IBS Fetch Control Extended */ +union ic_ibs_extd_ctl { + __u64 val; + struct { + __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */ + reserved:48; /* 16-63: reserved */ + }; +}; + +/* + * IBS driver related + */ + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; -- Gitee From b6372fbf241daba0b0b09852728408c0ea4ce067 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 8 Dec 2021 14:11:20 -0600 Subject: [PATCH 105/257] x86: perf: Move RDPMC event flag to a common definition commit 369461ce8fb6c8156206c7110d7da48e9fbc41bb upstream In preparation to enable user counter access on arm64 and to move some of the user access handling to perf core, create a common event flag for user counter access and convert x86 to use it. Since the architecture-specific flags start at the LSB, the common flags start at the MSB.
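The resulting layout of hw_perf_event::flags can be sketched like this (user_read_allowed() is a hypothetical helper; the BUILD_BUG_ON just demonstrates the non-overlap invariant):

/*
 * Arch-specific flags grow up from bit 0 and must stay within
 * PERF_EVENT_FLAG_ARCH (0x0000ffff); common flags grow down from bit 31.
 */
static inline bool user_read_allowed(struct perf_event *event)
{
	BUILD_BUG_ON(PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH);
	return !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
}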
Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Kan Liang Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: linux-perf-users@vger.kernel.org Reviewed-by: Mark Rutland Reviewed-by: Thomas Gleixner Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211208201124.310740-2-robh@kernel.org Signed-off-by: Will Deacon Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/core.c | 10 +++++----- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 9 +++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6f32db1a251d..822a62251749 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2219,7 +2219,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } @@ -2231,7 +2231,7 @@ static void refresh_pce(void *ignored) static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* @@ -2253,7 +2253,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) @@ -2264,7 +2264,7 @@ static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) @@ -2427,7 +2427,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9fd34a4d2aaa..b1bd1783cd5a 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -72,7 +72,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ #define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ + #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f7792394ccf8..7f5001086ecb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -127,6 +127,15 @@ struct hw_perf_event_extra { int idx; /* index in shared_regs->regs[] */ }; +/** + * hw_perf_event::flag values + * + * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific + * usage. 
+ */ +#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 + /** * struct hw_perf_event - performance event hardware details: */ -- Gitee From ec0de4fb9122a49ecd7df5caa0757de3109090d7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:05 -0700 Subject: [PATCH 106/257] perf/core: Add perf_clear_branch_entry_bitfields() helper commit bfe4daf850f45d92dcd3da477f0b0456620294c3 upstream Make it simpler to reset all the info fields on the perf_branch_entry by adding a helper inline function. The goal is to centralize the initialization to avoid missing a field in case more are added. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-2-eranian@google.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 36 +++++++++++++++++------------------- include/linux/perf_event.h | 16 ++++++++++++++++ 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e4e249a78451..e01c48eaaf6b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -724,6 +724,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -739,15 +740,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -763,6 +760,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -834,15 +832,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -904,6 +901,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -912,7 +911,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = 
get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7f5001086ecb..2d85c2bfedc8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1033,6 +1033,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by the caller. + */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, -- Gitee From 4e3ee4e1a4a5e373aca4f8c2788423bd822105c3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:06 -0700 Subject: [PATCH 107/257] x86/cpufeatures: Add AMD Fam19h Branch Sampling feature commit a77d41ac3a0f41c80120ec5b8b08ab284fec950a upstream Add a cpu feature for the AMD Fam19h Branch Sampling feature, reported as bit 31 of EBX on CPUID leaf function 0x80000008. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-3-eranian@google.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 947b7d449dd6..9e87d60f1bdd 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -333,6 +333,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -- Gitee From 45e1c28fcb3b22aa806c204b6c66c1f17ecb7bf7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:07 -0700 Subject: [PATCH 108/257] perf/x86/amd: Add AMD Fam19h Branch Sampling support commit ada543459cab7f653dcacdaba4011a8bb19c627c upstream Add support for the AMD Fam19h 16-deep branch sampling feature as described in the AMD PPR Fam19h Model 01h Revision B1. This is a model-specific extension; it is not an architected AMD feature. Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR registers. There is no branch type filtering. All control flow changes are captured. BRS relies on specific programming of the core PMU of Fam19h. In particular, the following requirements must be met: - the sampling period must be greater than 16 (the BRS depth) - the sampling period must be fixed, i.e., not use frequency mode BRS interacts with the NMI interrupt as well. Because enabling BRS is expensive, it is only activated after P event occurrences, where P is the desired sampling period. At P occurrences of the event, the counter overflows, the CPU catches the interrupt, activates BRS for 16 branches until it saturates, and then delivers the NMI to the kernel. Between the overflow and the time BRS is activated, more branches may execute, skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by reducing the sampling period by 16 (subsequent patch). BRS is integrated into perf_events seamlessly via the same PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry records in the sampling buffer. No prediction information is supported. The branches are stored in reverse order of execution. The most recent branch is the first entry in each record. No modification to the perf tool is necessary. BRS can be used with any sampling event. However, it is recommended to use the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS captures. $ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test $ perf report -D 56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0 ... branch stack: nr:16 ..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0 ..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0 ..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0 ..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0 ..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0 ..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0 ..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0 ..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0 ..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0 ..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0 ..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0 ..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0 ..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0 ..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0 ..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0 ..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0 ... thread: test:18230 ...... dso: test [Backport Changes] In file arch/x86/events/perf_event.h, the removal of the PERF_X86_EVENT_PEBS_STLAT macro definition was skipped, as the macro is not introduced yet in the current source tree. 
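To make the programming constraints described above concrete, a user-space event configuration that BRS accepts could look like the following sketch (illustrative only: the 0xc4 raw event code and the period/filter rules come from the hunks below, while the period value itself is arbitrary):

	struct perf_event_attr attr = {
		.size               = sizeof(attr),
		.type               = PERF_TYPE_RAW,
		.config             = 0xc4,	/* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
		.sample_period      = 1000037,	/* fixed period, must be > 16 (BRS depth) */
		.freq               = 0,	/* frequency mode is rejected for BRS */
		.sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
		.branch_sample_type = PERF_SAMPLE_BRANCH_ANY,	/* no finer filtering */
	};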
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/brs.c | 317 +++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 233 ++++++++++++++++++++++- arch/x86/events/core.c | 10 +- arch/x86/events/perf_event.h | 100 ++++++++-- arch/x86/include/asm/msr-index.h | 4 + 6 files changed, 645 insertions(+), 21 deletions(-) create mode 100644 arch/x86/events/amd/brs.c diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..cf323ffab5cd 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..3c13c484c637 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. 
+ */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. 
+ * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 941d034627c2..2604343dd371 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -328,8 +328,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -346,7 +354,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. 
+ */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * looser: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurrences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -369,7 +436,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -571,6 +638,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; amd_pmu_cpu_reset(cpu); + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -628,6 +697,8 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; + amd_brs_disable_all(); + x86_pmu_disable_all(); /* @@ -652,6 +723,30 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -669,6 +764,18 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -689,11 +796,31 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int handled; + int pmu_enabled; + + 
/* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS if in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -915,6 +1042,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one, and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by choosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it, but only on counters 0 & 1, to enforce + * the multiplexing required to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? 
&amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -923,12 +1095,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -938,6 +1117,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -956,6 +1137,37 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { union cpuid_0x80000022_ebx ebx; @@ -1022,6 +1234,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. 
+ */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 822a62251749..aba34228231d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1259,6 +1259,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1624,11 +1628,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b1bd1783cd5a..232db8465cfe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -65,21 +65,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 
0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -320,6 +322,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -942,6 +946,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1032,6 +1041,50 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -1040,6 +1093,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b5fcde5114f9..220448e729b5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -682,6 +682,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- Gitee From dfe1d35236b6d6e4241841ddd5f3f62658ec4879 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:57 +0530 Subject: [PATCH 109/257] perf/x86/amd/core: Add PerfMonV2 counter control commit 9622e67e3980c01872490de0925e5c6c23247c94 upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to manage the Core PMCs using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). Currently, all PMCs have dedicated control (PERF_CTL) and counter (PERF_CTR) registers. 
For a given PMC, the enable (En) bit of its PERF_CTL register is used to start or stop counting. The Performance Counter Global Control (PerfCntrGlobalCtl) register has enable (PerfCntrEn) bits for each PMC. For a PMC to start counting, both PERF_CTL and PerfCntrGlobalCtl enable bits must be set. If either of those are cleared, the PMC stops counting. In x86_pmu_{en,dis}able_all(), the PERF_CTL registers of all active PMCs are written to in a loop. Ideally, PMCs counting the same event that were started and stopped at the same time should record the same counts. Due to delays in between writes to the PERF_CTL registers across loop iterations, the PMCs cannot be enabled or disabled at the same instant and hence, record slightly different counts. This is fixed by enabling or disabling all active PMCs at the same time with a single write to the PerfCntrGlobalCtl register. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dfe8e934074aaabc6ba748dfaccd0a77c974bb82.1650515382.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 50 ++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2604343dd371..c691617e4080 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -663,6 +663,11 @@ static void amd_pmu_cpu_dead(int cpu) amd_pmu_cpu_reset(cpu); } +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -692,15 +697,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - amd_brs_disable_all(); - - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -747,6 +748,26 @@ static void amd_pmu_enable_all(int added) } } +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. 
+ */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -764,6 +785,20 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -1200,6 +1235,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.num_counters = ebx.split.num_core_pmc; amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; } /* -- Gitee From 884eb4a4770b6cca1dde515ee293e237cebf5c53 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:58 +0530 Subject: [PATCH 110/257] perf/x86/amd/core: Add PerfMonV2 overflow handling commit 7685665c390dc68c2d9a74e8445f41494cc8f6cf upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to process Core PMC overflows in the NMI handler using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). In x86_pmu_handle_irq(), overflows are detected by testing the contents of the PERF_CTR register for each active PMC in a loop. The new scheme instead inspects the overflow bits of the global status register. The Performance Counter Global Status (PerfCntrGlobalStatus) register has overflow (PerfCntrOvfl) bits for each PMC. This is, however, a read-only MSR. To acknowledge that overflows have been processed, the NMI handler must clear the bits by writing to the PerfCntrGlobalStatusClr register. In x86_pmu_handle_irq(), PMCs counting the same event that are started and stopped at the same time record slightly different counts due to delays in between reads from the PERF_CTR registers. This is fixed by stopping and starting the PMCs at the same time, with a single write to the Performance Counter Global Control (PerfCntrGlobalCtl) register upon entering and before exiting the NMI handler. [Backport Changes] Backporting static_call(), as used in the upstream patch, would require backporting the entire static call infrastructure along with 100+ dependencies. Instead of using static_call(), we check for PerfMonV2 support using the version field of the struct x86_pmu instance. If the version is 2 or higher, we call amd_pmu_test_overflow_status(). For older versions, we call amd_pmu_test_overflow_topbit(). These changes are based on the assignment and invocation of the static call upstream: the DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit) macro initially sets amd_pmu_test_overflow to amd_pmu_test_overflow_topbit() in arch/x86/events/amd/core.c. When PerfMonV2 is enabled, amd_pmu_test_overflow is updated to point to amd_pmu_test_overflow_status() using static_call_update() within the amd_core_pmu_init() function in arch/x86/events/amd/core.c. Therefore, for version 2 and above, the code uses amd_pmu_test_overflow_status(), while for lower versions, it uses amd_pmu_test_overflow_topbit(). 
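Expressed as a single helper, the backported dispatch is equivalent to the following sketch (the function name is hypothetical; the backport actually open-codes this check inside amd_pmu_wait_on_overflow(), as the hunk below shows):

	static bool amd_pmu_test_overflow(int idx)
	{
		/* PerfMonV2: consult the global overflow status MSR */
		if (x86_pmu.version >= 2)
			return amd_pmu_test_overflow_status(idx);

		/* Legacy: infer overflow from the counter's top (sign) bit */
		return amd_pmu_test_overflow_topbit(idx);
	}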
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/f20b7e4da0b0a83bdbe05857f354146623bc63ab.1650515382.git.sandipan.das@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 148 ++++++++++++++++++++++++++++++++++--- 1 file changed, 136 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c691617e4080..d60a16644e59 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "../perf_event.h" @@ -668,6 +669,43 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -680,7 +718,6 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) - break; + if (x86_pmu.version >= 2) { + if (!amd_pmu_test_overflow_status(idx)) + break; + } else { + if (!amd_pmu_test_overflow_topbit(idx)) + break; + } /* Might be in IRQ context, so can't sleep */ udelay(1); @@ -829,6 +870,24 @@ static void amd_pmu_del_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. 
+ */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static int amd_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -856,20 +915,84 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (pmu_enabled) amd_pmu_enable_all(0); + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + /* - * If a counter was handled, record a timestamp such that un-handled - * NMIs will be claimed if arriving within that window. + * Save the PMU state as it needs to be restored when leaving the + * handler */ - if (handled) { - this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; - return handled; + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; } - if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) - return NMI_DONE; + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); - return NMI_HANDLED; + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); } static struct event_constraint * @@ -1240,6 +1363,7 @@ static int __init amd_core_pmu_init(void) x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; } /* -- Gitee From da6d618e833764435c7b42c0acbae62217a9d485 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:07 +0530 Subject: [PATCH 111/257] perf/amd/ibs: Cascade pmu init functions' return value commit 39b2ca75eec8a33e2ffdb8aa0c4840ec3e3b472c upstream IBS pmu initialization code ignores return value provided by callee functions. Fix it. 
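The fix follows the usual goto-unwind pattern; in outline (a condensed view of the hunk below):

	ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
	if (ret)
		goto err_op;	/* unregister the already-registered fetch pmu */

	ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
	if (ret)
		goto err_nmi;	/* unregister the op pmu, then fall through to err_op */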
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-2-ravi.bangoria@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 11e8b493e015..2704ec1e42a3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -777,9 +777,10 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + int ret; /* * Some chips fail to reset the fetch count when it is written; instead @@ -791,7 +792,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ret) + return ret; if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -804,15 +807,35 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1082,9 +1105,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ -- Gitee From ddcbd342496dff4d5e79f1b17ab2ce8e3a53f028 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:08 +0530 Subject: [PATCH 112/257] perf/amd/ibs: Use ->is_visible callback for dynamic attributes commit 2a7a7e658682bfd7501dc6b4c9d365aa6c79788a upstream Currently, some attributes are added at build time whereas others are added at boot time, depending on IBS pmu capabilities. Instead, we can just add all attribute groups at build time but hide individual groups at boot time using the more appropriate ->is_visible() callback. Also, struct perf_ibs has a bunch of fields for pmu attributes which just pass on the pointer and do nothing else. Remove them. 
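For background on the mechanism: an attribute group wired up through pmu.attr_update can carry an ->is_visible() callback that sysfs evaluates at boot, so the group can be compiled in unconditionally and simply hidden when the capability is absent. A representative callback from the hunks below:

	static umode_t
	cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
	{
		/* expose cnt_ctl only when the CPU reports the OpCnt capability */
		return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
	}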
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-3-ravi.bangoria@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 78 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2704ec1e42a3..ece4f6a7d24b 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -94,10 +94,6 @@ struct perf_ibs { unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; @@ -528,16 +524,61 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); -static struct attribute *ibs_fetch_format_attrs[] = { +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? 
attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, NULL, }; @@ -561,7 +602,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -587,7 +627,6 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; @@ -757,17 +796,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -779,7 +807,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) static __init int perf_event_ibs_init(void) { - struct attribute **attr = ibs_op_format_attrs; int ret; /* @@ -792,14 +819,14 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ret) return ret; - if (ibs_caps & IBS_CAPS_OPCNT) { + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; @@ -807,6 +834,9 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); if (ret) goto err_op; -- Gitee From 129205ef8f452f7d9c12468923f10b3477f41455 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:09 +0530 Subject: [PATCH 113/257] perf/amd/ibs: Add support for L3 miss filtering commit ba5d35b442c65f32d38ef61f732218274c6dcf4c upstream IBS L3 miss filtering works by tagging an instruction on IBS counter overflow and generating an NMI if the tagged instruction causes an L3 miss. Samples without an L3 miss are discarded and the counter is reset with a random value (between 1-15 for the fetch pmu and 1-127 for the op pmu). This helps in reducing sampling overhead when the user is interested only in such samples. One of the use cases of such filtered samples is to feed data to a page-migration daemon in tiered memory systems. Add support for L3 miss filtering in the IBS driver via the new pmu attribute "l3missonly". 
Example usage: # perf record -a -e ibs_op/l3missonly=1/ --raw-samples sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-4-ravi.bangoria@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 67 +++++++++++++++++++++++++++---- arch/x86/include/asm/perf_event.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ece4f6a7d24b..2dc8b7ec030a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -544,22 +544,46 @@ static const struct attribute_group *empty_attr_groups[] = { PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, }; +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, NULL, }; +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + NULL, +}; + static umode_t cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -571,14 +595,26 @@ static struct attribute *cnt_ctl_attrs[] = { NULL, }; +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, .is_visible = cnt_ctl_is_visible, }; +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, + &group_op_l3missonly, NULL, }; @@ -805,10 +841,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - int ret; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. 
@@ -819,12 +853,17 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - if (ret) - return ret; + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} +static __init int perf_ibs_op_init(void) +{ if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -834,10 +873,24 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); if (ret) goto err_op; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 3ceae0473115..0d13b29c392f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -419,6 +419,7 @@ struct lbr_entry { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -432,6 +433,7 @@ struct lbr_entry { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -448,6 +450,7 @@ struct lbr_entry { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ -- Gitee From 2b1eb7c8ce19bf9914e66c57c08e1780e087db23 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:10 +0530 Subject: [PATCH 114/257] perf/amd/ibs: Advertise zen4_ibs_extensions as pmu capability attribute commit 838de1d843fc9b6161e0e1c6308a8c027d08606d upstream PMU driver can advertise certain feature via capability attribute('caps' sysfs directory) which can be consumed by userspace tools like perf. Add zen4_ibs_extensions capability attribute for IBS pmus. This attribute will be enabled when CPUID_Fn8000001B_EAX[11] is set. 
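Because the capability is exposed as a plain sysfs attribute, userspace can probe for it without decoding CPUID itself. A hedged sketch matching the ls output shown below:

  #include <unistd.h>

  /* Returns non-zero if Zen4 IBS extensions (e.g. l3missonly) are advertised. */
  static int have_zen4_ibs_extensions(void)
  {
  	return access("/sys/bus/event_source/devices/ibs_op/caps/zen4_ibs_extensions",
  		      F_OK) == 0;
  }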
With patch on Zen4: $ ls /sys/bus/event_source/devices/ibs_op/caps zen4_ibs_extensions Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-5-ravi.bangoria@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/ibs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2dc8b7ec030a..c251bc44c088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -537,8 +537,14 @@ static struct attribute_group empty_format_group = { .attrs = attrs_empty, }; +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + static const struct attribute_group *empty_attr_groups[] = { &empty_format_group, + &empty_caps_group, NULL, }; @@ -546,6 +552,7 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -563,6 +570,11 @@ static struct attribute *fetch_l3missonly_attrs[] = { NULL, }; +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -574,13 +586,21 @@ static struct attribute_group group_fetch_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, + &empty_caps_group, NULL, }; static const struct attribute_group *fetch_attr_update[] = { &group_fetch_l3missonly, + &group_zen4_ibs_extensions, NULL, }; @@ -615,6 +635,7 @@ static struct attribute_group group_op_l3missonly = { static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; -- Gitee From d8bce7c0725bf3d3d034de1d938500d3f4dd18a7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 18 May 2022 14:13:27 +0530 Subject: [PATCH 115/257] perf/x86/amd/core: Fix reloading events for SVM commit bae19fdd7e9e759580ac4693d2df3bc23ab415d7 upstream Commit 1018faa6cf23 ("perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled") addresses an issue in which the Host-Only bit in the counter control registers needs to be masked off when SVM is not enabled. The events need to be reloaded whenever SVM is enabled or disabled for a CPU and this requires the PERF_CTL registers to be reprogrammed using {enable,disable}_all(). However, PerfMonV2 variants of these functions do not reprogram the PERF_CTL registers. Hence, the legacy enable_all() function should also be called. 
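The underlying reason is that perf_ctr_virt_mask is only consumed when the per-counter PERF_CTL registers are rewritten, a step the PerfMonV2 helpers skip in favour of the global control MSR. Roughly, as a simplified sketch of the per-event enable path (not the exact kernel code):

  /*
   * Per-counter programming applies the virtualization mask, so only a
   * path that rewrites PERF_CTL picks up a changed perf_ctr_virt_mask.
   */
  static inline void enable_event_sketch(struct hw_perf_event *hwc, u64 enable_mask)
  {
  	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);

  	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
  }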
Fixes: 9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control") Reported-by: Like Xu Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220518084327.464005-1-sandipan.das@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d60a16644e59..9b95649bf16a 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1445,6 +1445,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1452,8 +1470,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1470,7 +1487,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); -- Gitee From 0e2769519577ce9868aac2729b514bfd392e0227 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 19 May 2022 15:33:30 +0530 Subject: [PATCH 116/257] perf/x86/amd/uncore: Use dynamic events array commit 39621c5808f5dda75d03dc4b2d4d2b13a5a1c34b upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, the number of available counters for a given uncore PMU may not be fixed across families and models and has to be determined at runtime. The per-cpu uncore PMU data currently uses a fixed-sized array for event information. Make it dynamic based on the number of available counters. 
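The idea, reduced to a hedged sketch (essentially what the amd_uncore_events_alloc() helper in the diff below does), is to size the per-cpu event array at runtime and keep it on the memory node of the cpu being brought up:

  /* Runtime-sized replacement for the fixed events[MAX_COUNTERS] array. */
  static struct perf_event **events_alloc_sketch(unsigned int num, unsigned int cpu)
  {
  	return kzalloc_node(num * sizeof(struct perf_event *), GFP_KERNEL,
  			    cpu_to_node(cpu));
  }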
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/21eea0cb6de9d14f78d52d1d62637ae02bc900f5.1652954372.git.sandipan.das@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 38 +++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 3c13f0968ccb..75f194f895f2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -21,7 +21,6 @@ #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 #define NUM_COUNTERS_L3 6 -#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 @@ -46,7 +45,7 @@ struct amd_uncore { u32 msr_base; cpumask_t *active_mask; struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; + struct perf_event **events; struct hlist_node node; }; @@ -425,11 +424,19 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) cpu_to_node(cpu)); } +static inline struct perf_event ** +amd_uncore_events_alloc(unsigned int num, unsigned int cpu) +{ + return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, + cpu_to_node(cpu)); +} + static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_llc; + struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; if (amd_uncore_nb) { + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; @@ -439,11 +446,15 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); + if (!uncore_nb->events) + goto fail; uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } if (amd_uncore_llc) { + *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; uncore_llc = amd_uncore_alloc(cpu); if (!uncore_llc) goto fail; @@ -453,6 +464,9 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_llc->active_mask = &amd_llc_active_mask; uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); + if (!uncore_llc->events) + goto fail; uncore_llc->id = -1; *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } @@ -460,9 +474,16 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) return 0; fail: - if (amd_uncore_nb) - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; - kfree(uncore_nb); + if (uncore_nb) { + kfree(uncore_nb->events); + kfree(uncore_nb); + } + + if (uncore_llc) { + kfree(uncore_llc->events); + kfree(uncore_llc); + } + return -ENOMEM; } @@ -595,8 +616,11 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) if (cpu == uncore->cpu) cpumask_clear_cpu(cpu, uncore->active_mask); - if (!--uncore->refcnt) + if (!--uncore->refcnt) { + kfree(uncore->events); kfree(uncore); + } + *per_cpu_ptr(uncores, cpu) = NULL; } -- Gitee From 913c2a739a7b7b343dec61082c30195feabedf1f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 19 May 2022 15:33:31 +0530 Subject: [PATCH 117/257] perf/x86/amd/uncore: Use attr_update for format attributes commit 847f3268bb644ee852732f8e3b5748e4319244b7 upstream Use the attr_update attribute group introduced by commit f3a3a8257e5a ("perf/core: Add attr_groups_update into struct pmu") and the is_visible() callback to populate the family specific attributes for uncore events.
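In essence, each family-unique format attribute moves into its own attribute_group whose is_visible() callback keys off the detected family, and those groups are attached through pmu->attr_update. A hedged skeleton of the pattern (names are illustrative):

  static umode_t f19h_only_visible(struct kobject *kobj,
  				 struct attribute *attr, int i)
  {
  	return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0;
  }

  static struct attribute_group f19h_format_group = {
  	.name       = "format",
  	.attrs      = f19h_format_attrs,	/* the family-unique attributes */
  	.is_visible = f19h_only_visible,
  };

  static const struct attribute_group *l3_attr_update_sketch[] = {
  	&f19h_format_group,
  	NULL,	/* assigned to pmu->attr_update before registration */
  };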
The changes apply to attributes that are unique to families such as slicemask for Family 17h and coreid for Family 19h. The addition of common attributes such as event and umask, whose formats change across families, remain unchanged. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/a5e4f4dd5c459199fc497e82b858ba09dc91c064.1652954372.git.sandipan.das@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 68 ++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 75f194f895f2..ccc820669da2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -279,6 +279,19 @@ hygon_f18h_m6h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, i attr->mode : 0; } +static umode_t +amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ? + attr->mode : 0; +} + +static umode_t +amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0; +} + static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -334,20 +347,33 @@ DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(slicemask4, slicemask, "config:28-31"); /* F18h L3 */ DEFINE_UNCORE_FORMAT_ATTR(threadmask32, threadmask, "config:32-63"); /* F18h L3 */ +/* Common DF and NB attributes */ static struct attribute *amd_uncore_df_format_attr[] = { - &format_attr_event12.attr, /* event14 if F17h+ */ - &format_attr_umask.attr, + &format_attr_event12.attr, /* event */ + &format_attr_umask.attr, /* umask */ NULL, }; +/* Common L2 and L3 attributes */ static struct attribute *amd_uncore_l3_format_attr[] = { - &format_attr_event12.attr, /* event8 if F17h+ */ - &format_attr_umask.attr, - NULL, /* slicemask if F17h, coreid if F19h */ - NULL, /* threadmask8 if F17h, enallslices if F19h */ - NULL, /* enallcores if F19h */ - NULL, /* sliceid if F19h */ - NULL, /* threadmask2 if F19h */ + &format_attr_event12.attr, /* event */ + &format_attr_umask.attr, /* umask */ + NULL, /* threadmask */ + NULL, +}; + +/* F17h unique L3 attributes */ +static struct attribute *amd_f17h_uncore_l3_format_attr[] = { + &format_attr_slicemask.attr, /* slicemask */ + NULL, +}; + +/* F19h unique L3 attributes */ +static struct attribute *amd_f19h_uncore_l3_format_attr[] = { + &format_attr_coreid.attr, /* coreid */ + &format_attr_enallslices.attr, /* enallslices */ + &format_attr_enallcores.attr, /* enallcores */ + &format_attr_sliceid.attr, /* sliceid */ NULL, }; @@ -373,6 +399,18 @@ static struct attribute_group hygon_f18h_m6h_uncore_l3_format_group = { .is_visible = hygon_f18h_m6h_uncore_is_visible, }; +static struct attribute_group amd_f17h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f17h_uncore_l3_format_attr, + .is_visible = amd_f17h_uncore_is_visible, +}; + +static struct attribute_group amd_f19h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f19h_uncore_l3_format_attr, + .is_visible = amd_f19h_uncore_is_visible, +}; + static const struct attribute_group *amd_uncore_df_attr_groups[] = { &amd_uncore_attr_group, &amd_uncore_df_format_group, @@ -390,6 +428,12 @@ static const struct attribute_group *hygon_uncore_l3_attr_update[] = { NULL, }; +static const struct 
attribute_group *amd_uncore_l3_attr_update[] = { + &amd_f17h_uncore_l3_format_group, + &amd_f19h_uncore_l3_format_group, + NULL, +}; + static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_df_attr_groups, @@ -407,6 +451,7 @@ static struct pmu amd_nb_pmu = { static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_l3_attr_groups, + .attr_update = amd_uncore_l3_attr_update, .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, @@ -700,16 +745,11 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86 >= 0x19) { *l3_attr++ = &format_attr_event8.attr; *l3_attr++ = &format_attr_umask.attr; - *l3_attr++ = &format_attr_coreid.attr; - *l3_attr++ = &format_attr_enallslices.attr; - *l3_attr++ = &format_attr_enallcores.attr; - *l3_attr++ = &format_attr_sliceid.attr; *l3_attr++ = &format_attr_threadmask2.attr; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0x17) { *l3_attr++ = &format_attr_event8.attr; *l3_attr++ = &format_attr_umask.attr; - *l3_attr++ = &format_attr_slicemask.attr; *l3_attr++ = &format_attr_threadmask8.attr; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 == 0x18) { -- Gitee From cc4ee4895e0748f8c91db04101484662d932dd58 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 19 May 2022 15:33:32 +0530 Subject: [PATCH 118/257] perf/x86/amd/uncore: Detect available DF counters commit 16b48c3f5ed85b8017526b1acacf5115461b489a upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Data Fabric (DF) PMCs. This offers more flexibility if the counts change in later processor families. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/bac7b2806561e03f2acc7fdc9db94f102df80e1d.1652954372.git.sandipan.das@amd.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 10 ++++++++++ arch/x86/include/asm/perf_event.h | 3 +++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ccc820669da2..0c3f4250a78c 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -30,6 +30,7 @@ #undef pr_fmt #define pr_fmt(fmt) "amd_uncore: " fmt +static int pmu_version; static int num_counters_llc; static int num_counters_nb; static bool l3_mask; @@ -684,6 +685,7 @@ static int __init amd_uncore_init(void) { struct attribute **df_attr = amd_uncore_df_format_attr; struct attribute **l3_attr = amd_uncore_l3_format_attr; + union cpuid_0x80000022_ebx ebx; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && @@ -693,6 +695,9 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) + pmu_version = 2; + num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; if (boot_cpu_data.x86 >= 0x17) { @@ -734,6 +739,11 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; + if (pmu_version >= 2) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + num_counters_nb = ebx.split.num_df_pmc; + } + pr_info("%d %s %s counters detected\n", num_counters_nb, boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? 
"HYGON" : "", amd_nb_pmu.name); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0d13b29c392f..e29a52eb5fdc 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -214,6 +214,9 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; + unsigned int reserved:6; + /* Number of Data Fabric Counters */ + unsigned int num_df_pmc:6; } split; unsigned int full; }; -- Gitee From 92ff9356cdac71265c017943c9e9d3fae42b4481 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 19 May 2022 15:33:33 +0530 Subject: [PATCH 119/257] perf/x86/amd/uncore: Add PerfMonV2 DF event format commit c390241a93260b377c84ad9e7cd5242adf667aac upstream If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use bits 0-7, 32-37 as EventSelect and bits 8-15, 24-27 as UnitMask for Data Fabric (DF) events. [Backport changes] 1. In arch/x86/events/amd/uncore.c, the upstream patch replaces the boot_cpu_data.x86 >= 0x17 check with a PMU version check to select the appropriate Data Fabric (DF) format attributes. However, the current kernel also includes a check for Hygon hardware, which is not present in the upstream code. To align with upstream while preserving existing Hygon functionality, the changes were applied only within the AMD-specific code path. 2. In arch/x86/events/amd/uncore.c, within the amd_uncore_init() function, the attribute `format_attr_umask.attr` has been replaced with `format_attr_umask8.attr` for Hygon-specific code part also. Since, which is not covered by the upstream patch. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ffc24d5a3375b1d6e457d88e83241114de5c1942.1652954372.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 27 ++++++++++++++++++--------- arch/x86/include/asm/perf_event.h | 13 +++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 0c3f4250a78c..103f20db6d2e 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -238,6 +238,9 @@ static int amd_uncore_event_init(struct perf_event *event) boot_cpu_data.x86_model == 0x10) event_mask = HYGON_F18H_M6H_RAW_EVENT_MASK_NB; } + if (pmu_version >= 2 && is_nb_event(event)) + event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; + /* * NB and Last level cache counters (MSRs) are shared across all cores * that share the same NB / Last level cache. 
On family 16h and below, @@ -333,11 +336,13 @@ static struct device_attribute format_attr_##_var = \ DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ DEFINE_UNCORE_FORMAT_ATTR(event14f18h, event, "config:0-7,32-35,61-62"); /* F18h DF */ DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(umask10f18h, umask, "config:8-17"); /* F18h M4h DF */ DEFINE_UNCORE_FORMAT_ATTR(umask12f18h, umask, "config:8-19"); /* F18h M6h DF */ +DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */ DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ @@ -351,14 +356,14 @@ DEFINE_UNCORE_FORMAT_ATTR(threadmask32, threadmask, "config:32-63"); /* F18h /* Common DF and NB attributes */ static struct attribute *amd_uncore_df_format_attr[] = { &format_attr_event12.attr, /* event */ - &format_attr_umask.attr, /* umask */ + &format_attr_umask8.attr, /* umask */ NULL, }; /* Common L2 and L3 attributes */ static struct attribute *amd_uncore_l3_format_attr[] = { &format_attr_event12.attr, /* event */ - &format_attr_umask.attr, /* umask */ + &format_attr_umask8.attr, /* umask */ NULL, /* threadmask */ NULL, }; @@ -714,9 +719,13 @@ static int __init amd_uncore_init(void) } if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0x17) { - *df_attr = &format_attr_event14.attr; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD){ + if (pmu_version >= 2) { + *df_attr++ = &format_attr_event14v2.attr; + *df_attr++ = &format_attr_umask12.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *df_attr = &format_attr_event14.attr; + } } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 == 0x18) { *df_attr++ = &format_attr_event14f18h.attr; @@ -754,17 +763,17 @@ static int __init amd_uncore_init(void) if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { if (boot_cpu_data.x86 >= 0x19) { *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_umask8.attr; *l3_attr++ = &format_attr_threadmask2.attr; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0x17) { *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_umask8.attr; *l3_attr++ = &format_attr_threadmask8.attr; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 == 0x18) { *l3_attr++ = &format_attr_event8.attr; - *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_umask8.attr; if (boot_cpu_data.x86_model >= 0x6 && boot_cpu_data.x86_model <= 0xf) { *l3_attr++ = &format_attr_threadmask32.attr; amd_llc_pmu.attr_update = hygon_uncore_l3_attr_update; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e29a52eb5fdc..9ed1e1ae8bcb 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -98,6 +98,7 @@ #define AMD64_RAW_EVENT_MASK_NB \ (AMD64_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK) + #define HYGON_F18H_M4H_EVENTSEL_UMASK_NB 0x0003FF00ULL 
#define HYGON_F18H_M6H_EVENTSEL_UMASK_NB 0x000FFF00ULL @@ -111,6 +112,18 @@ (HYGON_F18H_EVENTSEL_EVENT | \ HYGON_F18H_M6H_EVENTSEL_UMASK_NB) +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \ + (AMD64_EVENTSEL_EVENT | \ + GENMASK_ULL(37, 36)) + +#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \ + (ARCH_PERFMON_EVENTSEL_UMASK | \ + GENMASK_ULL(27, 24)) + +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ + AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 -- Gitee From 3dd8040d9170053868205a1a0009fbd1cf6872a4 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 19 May 2022 15:33:34 +0530 Subject: [PATCH 120/257] perf/x86/amd/uncore: Add PerfMonV2 RDPMC assignments commit f0fe9f3c7abcc946a676dfb295478c50312523c2 upstream The current RDPMC assignment scheme maps four DF PMCs and six L3 PMCs from index 6 to 15. If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, there may be additional DF counters available which are mapped starting from index 16 i.e. just after the L3 counters. Update the RDPMC assignments accordingly. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/1359379ef34da760f108b075ac138ab082caa3ba.1652954372.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/uncore.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 103f20db6d2e..2d7840ea0743 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -158,6 +158,16 @@ static int amd_uncore_add(struct perf_event *event, int flags) hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + /* + * The first four DF counters are accessible via RDPMC index 6 to 9 + * followed by the L3 counters from index 10 to 15. For processors + * with more than four DF counters, the DF RDPMC assignments become + * discontiguous as the additional counters are accessible starting + * from index 16. + */ + if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) + hwc->event_base_rdpmc += NUM_COUNTERS_L3; + if (flags & PERF_EF_START) amd_uncore_start(event, PERF_EF_RELOAD); -- Gitee From 81ed4a89c86bff6409aa1b499f288a9c810a800f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Sep 2019 11:50:15 -0300 Subject: [PATCH 121/257] perf env: Add routine to read the env->cpuid from the running machine commit f1cedfb82858c8a7ec21e45d0ce7b6e2ce9edea0 upstream In 'perf top' we use that cpuid when initializing the per arch annotation init routines (e.g. x86__annotate_init()) and in that case (live mode, 'perf top') we need to obtain it from the running machine, not from a perf.data file header. Provide a means to do that. Will be used by 'perf top' in a followup patch. 
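A hedged usage sketch of the new helper in a live session; the cpuid string format is arch-defined, on x86 it is typically "vendor,family,model,stepping":

  struct perf_env env = { .cpuid = NULL };

  if (perf_env__read_cpuid(&env) == 0)
  	printf("running on: %s\n", env.cpuid);	/* e.g. "AuthenticAMD,25,1,1" */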
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-h2wb3sx7u7znx6lqfezrh7ca@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/env.c | 16 ++++++++++++++++ tools/perf/util/env.h | 1 + 2 files changed, 17 insertions(+) diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index ef64e197bc8d..63882f7ba781 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -2,6 +2,7 @@ #include "cpumap.h" #include "debug.h" #include "env.h" +#include "util/header.h" #include #include #include "bpf-event.h" @@ -260,6 +261,21 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; } +int perf_env__read_cpuid(struct perf_env *env) +{ + char cpuid[128]; + int err = get_cpuid(cpuid, sizeof(cpuid)); + + if (err) + return err; + + free(env->cpuid); + env->cpuid = strdup(cpuid); + if (env->cpuid == NULL) + return ENOMEM; + return 0; +} + static int perf_env__read_arch(struct perf_env *env) { struct utsname uts; diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 37028215d4a5..9d4d13771764 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -104,6 +104,7 @@ void perf_env__exit(struct perf_env *env); int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]); +int perf_env__read_cpuid(struct perf_env *env); int perf_env__read_cpu_topology_map(struct perf_env *env); void cpu_cache_level__free(struct cpu_cache_level *cache); -- Gitee From d18c663529b27ee8aa432a288db3fde9b0ef5bc9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 25 Oct 2019 17:08:33 +0300 Subject: [PATCH 122/257] perf/aux: Allow using AUX data in perf samples commit a4faf00d994c40e64f656805ac375c65e324eefb upstream AUX data can be used to annotate perf events such as performance counters or tracepoints/breakpoints by including it in sample records when PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging and profiling by providing, for example, a history of instruction flow leading up to the event's overflow. The implementation makes use of grouping an AUX event with all the events that wish to take samples of the AUX data, such that the former is the group leader. The samplees should also specify the desired size of the AUX sample via attr.aux_sample_size. AUX capable PMUs need to explicitly add support for sampling, because it relies on a new callback to take a snapshot of the buffer without touching the event states. [Backport changes] 1. In kernel/events/core.c, the difference in the patch is due to the addition of curly braces around the if condition block. This change was necessary to accommodate the additional statement inside the block. There are no functional changes intended; the modification is purely structural to ensure correct code organization.
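From the user-space side, the feature is driven entirely through perf_event_attr. A hedged sketch (requires uapi headers that carry PERF_SAMPLE_AUX and aux_sample_size; the AUX PMU's dynamic type, e.g. intel_pt's, must be read from sysfs):

  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <string.h>

  /*
   * Open an AUX-capable leader and a sampling group member whose
   * PERF_RECORD_SAMPLEs embed snapshots of the leader's AUX buffer.
   */
  static int open_aux_sampling_group(unsigned int aux_pmu_type, int cpu)
  {
  	struct perf_event_attr aux, samplee;
  	int leader;

  	memset(&aux, 0, sizeof(aux));
  	aux.size = sizeof(aux);
  	aux.type = aux_pmu_type;	/* dynamic PMU type read from sysfs */

  	memset(&samplee, 0, sizeof(samplee));
  	samplee.size            = sizeof(samplee);
  	samplee.type            = PERF_TYPE_HARDWARE;
  	samplee.config          = PERF_COUNT_HW_CPU_CYCLES;
  	samplee.sample_period   = 100000;
  	samplee.sample_type     = PERF_SAMPLE_AUX;
  	samplee.aux_sample_size = 4096;	/* bytes of AUX data per sample */

  	leader = syscall(__NR_perf_event_open, &aux, -1, cpu, -1, 0);
  	if (leader < 0)
  		return -1;

  	/* group_fd = leader: the samplee snapshots the leader's AUX buffer */
  	return syscall(__NR_perf_event_open, &samplee, -1, cpu, leader, 0);
  }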
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: mathieu.poirier@linaro.org Link: https://lkml.kernel.org/r/20191025140835.53665-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- include/linux/perf_event.h | 19 ++++ include/uapi/linux/perf_event.h | 10 +- kernel/events/core.c | 173 +++++++++++++++++++++++++++++++- kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 36 +++++++ 5 files changed, 234 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d85c2bfedc8..09e0cca1eaaf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -278,6 +278,8 @@ struct perf_event; #define PERF_PMU_CAP_NO_EXCLUDE 0x80 #define PERF_PMU_CAP_AUX_OUTPUT 0x100 +struct perf_output_handle; + /** * struct pmu - generic performance monitoring unit */ @@ -452,6 +454,19 @@ struct pmu { */ void (*free_aux) (void *aux); /* optional */ + /* + * Take a snapshot of the AUX buffer without touching the event + * state, so that preempting ->start()/->stop() callbacks does + * not interfere with their logic. Called in PMI context. + * + * Returns the size of AUX data copied to the output handle. + * + * Optional. + */ + long (*snapshot_aux) (struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size); + /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the @@ -999,6 +1014,7 @@ struct perf_sample_data { u32 reserved; } cpu_entry; struct perf_callchain_entry *callchain; + u64 aux_size; /* * regs_user may point to task_pt_regs or to regs_user_copy, depending @@ -1395,6 +1411,9 @@ extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); +extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 507eddef8715..7fa9f958cd9b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -141,8 +141,9 @@ enum perf_event_sample_format { PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -304,6 +305,7 @@ enum perf_event_read_format { /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -428,7 +430,9 @@ struct perf_event_attr { */ __u32 aux_watermark; __u16 sample_max_stack; - __u16 __reserved_2; /* align to __u64 */ + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; }; /* @@ 
-870,6 +874,8 @@ enum perf_event_type { * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index d478c9789ff6..6befff36dbc0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1947,6 +1947,11 @@ static void perf_put_aux_event(struct perf_event *event) } } +static bool perf_need_aux_event(struct perf_event *event) +{ + return !!event->attr.aux_output || !!event->attr.aux_sample_size; +} + static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { @@ -1959,7 +1964,17 @@ static int perf_get_aux_event(struct perf_event *event, if (!group_leader) return 0; - if (!perf_aux_output_match(event, group_leader)) + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) @@ -6234,6 +6249,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) +{ + struct perf_event *sampler = event->aux_event; + struct ring_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). + */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +long perf_pmu_snapshot_aux(struct ring_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + unsigned long pad; + struct ring_buffer *rb; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler->parent ? 
sampler->parent : sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. + */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; + + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -6560,6 +6691,13 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6754,6 +6892,35 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); + + if (sample_type & PERF_SAMPLE_AUX) { + u64 size; + + header->size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). + */ + size = min_t(size_t, U16_MAX - header->size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header->size > U16_MAX); + header->size += size; + } + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. 
+ */ + WARN_ON_ONCE(header->size & 7); } static __always_inline int @@ -10720,7 +10887,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -11291,7 +11458,7 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) { + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; goto err_locked; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6e87b358e082..00a0d3f08de3 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -50,6 +50,7 @@ struct ring_buffer { unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; + int aux_in_sampling; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ffb59a4ef4ff..07ab081952f3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle) } EXPORT_SYMBOL_GPL(perf_get_aux); +/* + * Copy out AUX data from an AUX handle. + */ +long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to) +{ + unsigned long tocopy, remainder, len = 0; + struct ring_buffer *rb = aux_handle->rb; + void *addr; + + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + + do { + tocopy = PAGE_SIZE - offset_in_page(from); + if (to > from) + tocopy = min(tocopy, to - from); + if (!tocopy) + break; + + addr = rb->aux_pages[from >> PAGE_SHIFT]; + addr += offset_in_page(from); + + remainder = perf_output_copy(handle, addr, tocopy); + if (remainder) + return -EFAULT; + + len += tocopy; + from += tocopy; + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + } while (to != from); + + return len; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) -- Gitee From e4147eb3dff060b2f435d431be4b79c468d6d785 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 1 Apr 2020 13:16:09 +0300 Subject: [PATCH 123/257] perf evsel: Move and globalize perf_evsel__find_pmu() and perf_evsel__is_aux_event() commit e12ee9f7513cb5dbe8b12aac030dfbeff35b3766 upstream Move and globalize 2 functions from the auxtrace specific sources so that they can be reused. 
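Both helpers lean on the perf_pmu__scan() iterator, which starts from NULL and returns each discovered PMU in turn. A hedged sketch of the idiom:

  struct perf_pmu *pmu = NULL;

  while ((pmu = perf_pmu__scan(pmu)) != NULL) {
  	if (pmu->auxtrace)
  		pr_debug("AUX-capable pmu: %s (type %u)\n", pmu->name, pmu->type);
  }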
Signed-off-by: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20200401101613.6201-13-adrian.hunter@intel.com [ Move to pmu.c, as moving to evsel.h breaks the python binding ] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/auxtrace.c | 19 ------------------- tools/perf/util/evsel.h | 3 +++ tools/perf/util/pmu.c | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 23a8c152e568..4086d5848734 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -59,25 +59,6 @@ #include "symbol/kallsyms.h" #include -static struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) -{ - struct perf_pmu *pmu = NULL; - - while ((pmu = perf_pmu__scan(pmu)) != NULL) { - if (pmu->type == evsel->core.attr.type) - break; - } - - return pmu; -} - -static bool perf_evsel__is_aux_event(struct evsel *evsel) -{ - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); - - return pmu && pmu->auxtrace; -} - /* * Make a group from 'leader' to 'last', requiring that the events were not * already grouped to a different leader. diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 88882e55294e..da65be7ca536 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -152,6 +152,9 @@ int perf_evsel__object_config(size_t object_size, int (*init)(struct evsel *evsel), void (*fini)(struct evsel *evsel)); +struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel); +bool perf_evsel__is_aux_event(struct evsel *evsel); + struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx); static inline struct evsel *evsel__new(struct perf_event_attr *attr) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 99227038063b..36e3a777d20d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -18,6 +18,7 @@ #include #include #include "debug.h" +#include "evsel.h" #include "pmu.h" #include "parse-events.h" #include "header.h" @@ -882,6 +883,25 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) return NULL; } +struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmu__scan(pmu)) != NULL) { + if (pmu->type == evsel->core.attr.type) + break; + } + + return pmu; +} + +bool perf_evsel__is_aux_event(struct evsel *evsel) +{ + struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); + + return pmu && pmu->auxtrace; +} + struct perf_pmu *perf_pmu__find(const char *name) { struct perf_pmu *pmu; -- Gitee From 105d2254eec8bc362a6395c0f7142d5fcb5fdc7f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 29 Aug 2019 13:31:48 +0200 Subject: [PATCH 124/257] perf env: Add perf_env__numa_node() commit 389799a7a1e86c55f38897e679762efadcc9dedd upstream To speed up cpu to node lookup, add perf_env__numa_node(), which builds a cpu array on the first lookup that holds the numa node for each stored cpu.
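A hedged usage sketch: the first call builds the map, after which each lookup is a bounds-checked array access cheap enough for per-sample use:

  /* Illustrative per-node sample accounting on top of the new helper. */
  static void account_sample_node(struct perf_env *env, int cpu, u64 *node_hits)
  {
  	int node = perf_env__numa_node(env, cpu);

  	if (node >= 0)	/* -1: cpu missing from the recorded topology */
  		node_hits[node]++;
  }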
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: Joe Mario Cc: Kan Liang Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20190904073415.723-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/env.c | 40 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/env.h | 6 ++++++ 2 files changed, 46 insertions(+) diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 63882f7ba781..350d91faa5d2 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -184,6 +184,7 @@ void perf_env__exit(struct perf_env *env) zfree(&env->sibling_threads); zfree(&env->pmu_mappings); zfree(&env->cpu); + zfree(&env->numa_map); for (i = 0; i < env->nr_numa_nodes; i++) perf_cpu_map__put(env->numa_nodes[i].map); @@ -358,3 +359,42 @@ const char *perf_env__arch(struct perf_env *env) return normalize_arch(arch_name); } + + +int perf_env__numa_node(struct perf_env *env, int cpu) +{ + if (!env->nr_numa_map) { + struct numa_node *nn; + int i, nr = 0; + + for (i = 0; i < env->nr_numa_nodes; i++) { + nn = &env->numa_nodes[i]; + nr = max(nr, perf_cpu_map__max(nn->map)); + } + + nr++; + + /* + * We initialize the numa_map array to prepare + * it for missing cpus, which return node -1 + */ + env->numa_map = malloc(nr * sizeof(int)); + if (!env->numa_map) + return -1; + + for (i = 0; i < nr; i++) + env->numa_map[i] = -1; + + env->nr_numa_map = nr; + + for (i = 0; i < env->nr_numa_nodes; i++) { + int tmp, j; + + nn = &env->numa_nodes[i]; + perf_cpu_map__for_each_cpu(j, tmp, nn->map) + env->numa_map[j] = i; + } + } + + return cpu >= 0 && cpu < env->nr_numa_map ? env->numa_map[cpu] : -1; +} diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 9d4d13771764..b6c716dfc776 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -87,6 +87,10 @@ struct perf_env { struct rb_root btfs; u32 btfs_cnt; } bpf_progs; + + /* For fast cpu to numa node lookup via perf_env__numa_node */ + int *numa_map; + int nr_numa_map; }; enum perf_compress_type { @@ -120,4 +124,6 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env, __u32 prog_id); bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); + +int perf_env__numa_node(struct perf_env *env, int cpu); #endif /* __PERF_ENV_H */ -- Gitee From e3b8227f0a3e1de3dcaf12387b058e20c6cdf957 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:15:07 -0500 Subject: [PATCH 125/257] perf env: Add perf_env__cpuid, perf_env__{nr_}pmu_mappings commit 9fe8895a27a840095281864803b128ddc26dcd30 upstream To be used by IBS raw data display: It needs the recorder's cpuid in order to determine which errata workarounds to apply to the data, and the pmu_mappings are needed in order to figure out which PMU sample type is IBS Fetch vs. IBS Op. When not available from perf.data, we assume local operation, and retrieve cpuid and pmu mappings directly from the running system. 
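The mappings are stored as consecutive NUL-terminated "type:name" strings, nr_pmu_mappings entries in all, which a consumer can walk like this (a hedged sketch):

  const char *s = perf_env__pmu_mappings(env);
  int i, nr = perf_env__nr_pmu_mappings(env);

  for (i = 0; s && i < nr; i++) {
  	printf("pmu mapping: %s\n", s);	/* e.g. "8:ibs_fetch" */
  	s += strlen(s) + 1;		/* step over the embedded NUL */
  }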
Signed-off-by: Kim Phillips Cc: Alexander Shishkin Cc: Boris Ostrovsky Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Joao Martins Cc: Konrad Rzeszutek Wilk Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Link: https //lore.kernel.org/r/20210817221509.88391-2-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/env.c | 78 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/env.h | 5 +++ 2 files changed, 83 insertions(+) diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 350d91faa5d2..6b8252dd42c1 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -11,6 +11,7 @@ #include #include #include +#include "strbuf.h" struct perf_env perf_env; @@ -262,6 +263,45 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; } +int perf_env__read_pmu_mappings(struct perf_env *env) +{ + struct perf_pmu *pmu = NULL; + u32 pmu_num = 0; + struct strbuf sb; + + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name) + continue; + pmu_num++; + } + if (!pmu_num) { + pr_debug("pmu mappings not available\n"); + return -ENOENT; + } + env->nr_pmu_mappings = pmu_num; + + if (strbuf_init(&sb, 128 * pmu_num) < 0) + return -ENOMEM; + + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name) + continue; + if (strbuf_addf(&sb, "%u:%s", pmu->type, pmu->name) < 0) + goto error; + /* include a NULL character at the end */ + if (strbuf_add(&sb, "", 1) < 0) + goto error; + } + + env->pmu_mappings = strbuf_detach(&sb, NULL); + + return 0; + +error: + strbuf_release(&sb); + return -1; +} + int perf_env__read_cpuid(struct perf_env *env) { char cpuid[128]; @@ -360,6 +400,44 @@ const char *perf_env__arch(struct perf_env *env) return normalize_arch(arch_name); } +const char *perf_env__cpuid(struct perf_env *env) +{ + int status; + + if (!env || !env->cpuid) { /* Assume local operation */ + status = perf_env__read_cpuid(env); + if (status) + return NULL; + } + + return env->cpuid; +} + +int perf_env__nr_pmu_mappings(struct perf_env *env) +{ + int status; + + if (!env || !env->nr_pmu_mappings) { /* Assume local operation */ + status = perf_env__read_pmu_mappings(env); + if (status) + return 0; + } + + return env->nr_pmu_mappings; +} + +const char *perf_env__pmu_mappings(struct perf_env *env) +{ + int status; + + if (!env || !env->pmu_mappings) { /* Assume local operation */ + status = perf_env__read_pmu_mappings(env); + if (status) + return NULL; + } + + return env->pmu_mappings; +} int perf_env__numa_node(struct perf_env *env, int cpu) { diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index b6c716dfc776..0b862b4a01e3 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -109,11 +109,16 @@ void perf_env__exit(struct perf_env *env); int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]); int perf_env__read_cpuid(struct perf_env *env); +int perf_env__read_pmu_mappings(struct perf_env *env); +int perf_env__nr_pmu_mappings(struct perf_env *env); +const char *perf_env__pmu_mappings(struct perf_env *env); + int perf_env__read_cpu_topology_map(struct perf_env *env); void cpu_cache_level__free(struct cpu_cache_level *cache); const char *perf_env__arch(struct perf_env *env); +const char *perf_env__cpuid(struct perf_env *env); const char *perf_env__raw_arch(struct perf_env *env); int perf_env__nr_cpus_avail(struct perf_env *env); -- Gitee From b6496d1fb47b67fcc243991e0a3a8a6692ee3703 Mon Sep 17 00:00:00 
2001 From: Kim Phillips Date: Mon, 4 Oct 2021 16:41:13 -0500 Subject: [PATCH 126/257] perf evsel: Make evsel__env() always return a valid env commit 7b830875d22d68f9a49a02a23c24272513ddb392 upstream It's possible to have an evsel and evsel->evlist populated without an evsel->evlist->env, when, e.g., cmd_record is in its error path. Future patches will add support for evsel__open_strerror to be able to customize error messaging based on perf_env__{arch,cpuid}, so let's have evsel__env return &perf_env instead of NULL in that case. Reviewed-by: Kajol Jain Signed-off-by: Kim Phillips Cc: Alexander Shishkin Cc: Boris Ostrovsky Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Joao Martins Cc: Konrad Rzeszutek Wilk Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Link: https://lore.kernel.org/r/20211004214114.188477-1-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/evsel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 39912d9b51b3..797534dbee08 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2547,7 +2547,7 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, struct perf_env *perf_evsel__env(struct evsel *evsel) { - if (evsel && evsel->evlist) + if (evsel && evsel->evlist && evsel->evlist->env) return evsel->evlist->env; return &perf_env; } -- Gitee From 52d6543bb62b75f90a3c2a9923cdb75067615e5b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Apr 2020 10:24:04 -0300 Subject: [PATCH 127/257] perf record: Move sb_evlist to 'struct record' commit bc477d7983e345262757568ec27be0395dc2fe73 upstream Where state related to a 'perf record' session is grouped. 
Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Song Liu Link: http://lore.kernel.org/lkml/20200429131106.27974-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index ceae6168d329..a0d247edf55a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -80,6 +80,7 @@ struct record { struct auxtrace_record *itr; struct evlist *evlist; struct perf_session *session; + struct evlist *sb_evlist; int realtime_prio; bool no_buildid; bool no_buildid_set; @@ -1360,7 +1361,6 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) struct perf_data *data = &rec->data; struct perf_session *session; bool disabled = false, draining = false; - struct evlist *sb_evlist = NULL; int fd; float ratio = 0; @@ -1472,9 +1472,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) } if (!opts->no_bpf_event) - bpf_event__add_sb_event(&sb_evlist, &session->header.env); + bpf_event__add_sb_event(&rec->sb_evlist, &session->header.env); - if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) { + if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); opts->no_bpf_event = true; } @@ -1748,7 +1748,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) perf_session__delete(session); if (!opts->no_bpf_event) - perf_evlist__stop_sb_thread(sb_evlist); + perf_evlist__stop_sb_thread(rec->sb_evlist); return status; } -- Gitee From f47f3addfc61a36a5393083f574dbb7613029850 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Apr 2020 10:40:54 -0300 Subject: [PATCH 128/257] perf top: Move sb_evlist to 'struct perf_top' commit ca6c9c8b107f9788662117587cd24bbb19cea94d upstream Where state related to a 'perf top' session is grouped. 
Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Song Liu Link: http://lore.kernel.org/lkml/20200429131106.27974-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-top.c | 7 +++---- tools/perf/util/top.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index a30d62186f5e..c228f872ace4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1542,7 +1542,6 @@ int cmd_top(int argc, const char **argv) OPTS_EVSWITCH(&top.evswitch), OPT_END() }; - struct evlist *sb_evlist = NULL; const char * const top_usage[] = { "perf top []", NULL @@ -1683,9 +1682,9 @@ int cmd_top(int argc, const char **argv) } if (!top.record_opts.no_bpf_event) - bpf_event__add_sb_event(&sb_evlist, &perf_env); + bpf_event__add_sb_event(&top.sb_evlist, &perf_env); - if (perf_evlist__start_sb_thread(sb_evlist, target)) { + if (perf_evlist__start_sb_thread(top.sb_evlist, target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); opts->no_bpf_event = true; } @@ -1693,7 +1692,7 @@ int cmd_top(int argc, const char **argv) status = __cmd_top(&top); if (!opts->no_bpf_event) - perf_evlist__stop_sb_thread(sb_evlist); + perf_evlist__stop_sb_thread(top.sb_evlist); out_delete_evlist: evlist__delete(top.evlist); diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index f117d4f4821e..7bea36a61645 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -18,7 +18,7 @@ struct perf_session; struct perf_top { struct perf_tool tool; - struct evlist *evlist; + struct evlist *evlist, *sb_evlist; struct record_opts record_opts; struct annotation_options annotation_opts; struct evswitch evswitch; -- Gitee From 90fd80f5a88384141a26f33d6513c33a5c44ccde Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 24 Apr 2020 12:24:51 -0300 Subject: [PATCH 129/257] perf bpf: Decouple creating the evlist from adding the SB event commit b38d85ef49cf6af9d1deaaf01daf0986d47e6c7a upstream Renaming bpf_event__add_sb_event() to evlist__add_sb_event() and requiring that the evlist be allocated beforehand. This will allow using the same side band thread and evlist to be used for multiple purposes in addition to react to PERF_RECORD_BPF_EVENT soon after they are generated. Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Song Liu Link: http://lore.kernel.org/lkml/20200429131106.27974-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 17 ++++++++++++++--- tools/perf/builtin-top.c | 15 +++++++++++++-- tools/perf/util/bpf-event.c | 3 +-- tools/perf/util/bpf-event.h | 7 +++---- tools/perf/util/evlist.c | 21 ++++----------------- tools/perf/util/evlist.h | 2 +- 6 files changed, 36 insertions(+), 29 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a0d247edf55a..997b03ba6dad 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1463,16 +1463,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) goto out_child; } + err = -1; if (!rec->no_buildid && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { pr_err("Couldn't generate buildids. 
" "Use --no-buildid to profile anyway.\n"); - err = -1; goto out_child; } - if (!opts->no_bpf_event) - bpf_event__add_sb_event(&rec->sb_evlist, &session->header.env); + if (!opts->no_bpf_event) { + rec->sb_evlist = evlist__new(); + + if (rec->sb_evlist == NULL) { + pr_err("Couldn't create side band evlist.\n."); + goto out_child; + } + + if (evlist__add_bpf_sb_event(rec->sb_evlist, &session->header.env)) { + pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); + goto out_child; + } + } if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c228f872ace4..65fe4f5f5de2 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1681,8 +1681,19 @@ int cmd_top(int argc, const char **argv) goto out_delete_evlist; } - if (!top.record_opts.no_bpf_event) - bpf_event__add_sb_event(&top.sb_evlist, &perf_env); + if (!top.record_opts.no_bpf_event) { + top.sb_evlist = evlist__new(); + + if (top.sb_evlist == NULL) { + pr_err("Couldn't create side band evlist.\n."); + goto out_delete_evlist; + } + + if (evlist__add_bpf_sb_event(top.sb_evlist, &perf_env)) { + pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n."); + goto out_delete_evlist; + } + } if (perf_evlist__start_sb_thread(top.sb_evlist, target)) { pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n"); diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 782c0c8a9a83..96e0a31a38f6 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -422,8 +422,7 @@ static int bpf_event__sb_cb(union perf_event *event, void *data) return 0; } -int bpf_event__add_sb_event(struct evlist **evlist, - struct perf_env *env) +int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env) { struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h index 81fdc88e6c1a..68f315c3df5b 100644 --- a/tools/perf/util/bpf-event.h +++ b/tools/perf/util/bpf-event.h @@ -33,8 +33,7 @@ struct btf_node { #ifdef HAVE_LIBBPF_SUPPORT int machine__process_bpf(struct machine *machine, union perf_event *event, struct perf_sample *sample); -int bpf_event__add_sb_event(struct evlist **evlist, - struct perf_env *env); +int evlist__add_bpf_sb_event(struct evlist *evlist, struct perf_env *env); void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info, struct perf_env *env, FILE *fp); @@ -46,8 +45,8 @@ static inline int machine__process_bpf(struct machine *machine __maybe_unused, return 0; } -static inline int bpf_event__add_sb_event(struct evlist **evlist __maybe_unused, - struct perf_env *env __maybe_unused) +static inline int evlist__add_bpf_sb_event(struct evlist *evlist __maybe_unused, + struct perf_env *env __maybe_unused) { return 0; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 505b890ac85c..b110deb88425 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1672,39 +1672,26 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, return leader; } -int perf_evlist__add_sb_event(struct evlist **evlist, +int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, perf_evsel__sb_cb_t cb, void *data) { struct evsel *evsel; - bool new_evlist = (*evlist) == NULL; - - if (*evlist == 
NULL) - *evlist = evlist__new(); - if (*evlist == NULL) - return -1; if (!attr->sample_id_all) { pr_warning("enabling sample_id_all for all side band events\n"); attr->sample_id_all = 1; } - evsel = perf_evsel__new_idx(attr, (*evlist)->core.nr_entries); + evsel = perf_evsel__new_idx(attr, evlist->core.nr_entries); if (!evsel) - goto out_err; + return -1; evsel->side_band.cb = cb; evsel->side_band.data = data; - evlist__add(*evlist, evsel); + evlist__add(evlist, evsel); return 0; - -out_err: - if (new_evlist) { - evlist__delete(*evlist); - *evlist = NULL; - } - return -1; } static void *perf_evlist__poll_thread(void *arg) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index d89e72bd1c81..10fc888d6127 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -107,7 +107,7 @@ int __perf_evlist__add_default_attrs(struct evlist *evlist, int perf_evlist__add_dummy(struct evlist *evlist); -int perf_evlist__add_sb_event(struct evlist **evlist, +int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, perf_evsel__sb_cb_t cb, void *data); -- Gitee From f87eb8da59c8059e4914b2c5c3d3709ad0e07b0c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:09 +0200 Subject: [PATCH 130/257] libperf: Add perf_mmap__init() function commit 353120b48d4f61288e4745b0c8a191784b11c0f4 upstream Add perf_mmap__init() function to initialize 'struct perf_mmap' objects. Add it to a new mmap.c source file, that will carry all the mmap related functions. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/lib/Build | 1 + tools/perf/lib/include/internal/mmap.h | 2 ++ tools/perf/lib/mmap.c | 9 +++++++++ tools/perf/util/evlist.c | 5 ++--- 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 tools/perf/lib/mmap.c diff --git a/tools/perf/lib/Build b/tools/perf/lib/Build index c31f1c111f8f..2ef9a4ec6d99 100644 --- a/tools/perf/lib/Build +++ b/tools/perf/lib/Build @@ -3,6 +3,7 @@ libperf-y += cpumap.o libperf-y += threadmap.o libperf-y += evsel.o libperf-y += evlist.o +libperf-y += mmap.o libperf-y += zalloc.o libperf-y += xyarray.o libperf-y += lib.o diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index ba1e519c15b9..e25890de6a55 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -29,4 +29,6 @@ struct perf_mmap { char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; +void perf_mmap__init(struct perf_mmap *map, bool overwrite); + #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c new file mode 100644 index 000000000000..3da6177510e6 --- /dev/null +++ b/tools/perf/lib/mmap.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +void perf_mmap__init(struct perf_mmap *map, bool overwrite) +{ + map->fd = -1; + map->overwrite = overwrite; + refcount_set(&map->refcnt, 0); +} diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index b110deb88425..e3de04e41e81 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -605,8 +605,6 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, return NULL; for (i = 0; i < evlist->core.nr_mmaps; i++) { - map[i].core.fd = -1; - map[i].core.overwrite = overwrite; /* * When the perf_mmap() call is 
made we grab one refcount, plus * one extra to let perf_mmap__consume() get the last @@ -616,8 +614,9 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and * thus does perf_mmap__get() on it. */ - refcount_set(&map[i].core.refcnt, 0); + perf_mmap__init(&map[i].core, overwrite); } + return map; } -- Gitee From e54106b77cebf1c226dc6f53241b40c5a7884899 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:10 +0200 Subject: [PATCH 131/257] libperf: Add 'struct perf_mmap_param' commit e440979faf6ac8048e1792af383df6af78dd1cb0 upstream Add libperf's version of mmap params 'struct perf_mmap_param' object with the basics: 'prot' and 'mask'. Encapsulate it in the current 'struct mmap_params' object. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/lib/include/internal/mmap.h | 5 +++++ tools/perf/util/evlist.c | 14 +++++++++----- tools/perf/util/mmap.c | 4 ++-- tools/perf/util/mmap.h | 3 ++- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index e25890de6a55..b26806b36bb6 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -29,6 +29,11 @@ struct perf_mmap { char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; +struct perf_mmap_param { + int prot; + int mask; +}; + void perf_mmap__init(struct perf_mmap *map, bool overwrite); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index e3de04e41e81..430bfea8b3e5 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -643,7 +643,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, int fd; int cpu; - mp->prot = PROT_READ | PROT_WRITE; + mp->core.prot = PROT_READ | PROT_WRITE; if (evsel->core.attr.write_backward) { output = _output_overwrite; maps = evlist->overwrite_mmap; @@ -656,7 +656,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, if (evlist->bkw_mmap_state == BKW_MMAP_NOTREADY) perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING); } - mp->prot &= ~PROT_WRITE; + mp->core.prot &= ~PROT_WRITE; } if (evsel->core.system_wide && thread) @@ -897,8 +897,12 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, * Its value is decided by evsel's write_backward. * So &mp should not be passed through const pointer. 
*/ - struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = affinity, .flush = flush, - .comp_level = comp_level }; + struct mmap_params mp = { + .nr_cblocks = nr_cblocks, + .affinity = affinity, + .flush = flush, + .comp_level = comp_level + }; if (!evlist->mmap) evlist->mmap = evlist__alloc_mmap(evlist, false); @@ -910,7 +914,7 @@ int evlist__mmap_ex(struct evlist *evlist, unsigned int pages, evlist->core.mmap_len = evlist__mmap_size(pages); pr_debug("mmap size %zuB\n", evlist->core.mmap_len); - mp.mask = evlist->core.mmap_len - page_size - 1; + mp.core.mask = evlist->core.mmap_len - page_size - 1; auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->core.mmap_len, auxtrace_pages, auxtrace_overwrite); diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index a35dc57d5995..a496ced5ed2a 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -370,8 +370,8 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) */ refcount_set(&map->core.refcnt, 2); map->core.prev = 0; - map->core.mask = mp->mask; - map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, + map->core.mask = mp->core.mask; + map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->core.prot, MAP_SHARED, fd, 0); if (map->core.base == MAP_FAILED) { pr_debug2("failed to mmap perf event ring buffer, error %d\n", diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index e567c1c875bd..4ff75d8aeb05 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -37,7 +37,8 @@ struct mmap { }; struct mmap_params { - int prot, mask, nr_cblocks, affinity, flush, comp_level; + struct perf_mmap_param core; + int nr_cblocks, affinity, flush, comp_level; struct auxtrace_mmap_params auxtrace_mp; }; -- Gitee From 6e96d77d842390ec612e930eca7fcc7b719ee28f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:12 +0200 Subject: [PATCH 132/257] libperf: Adopt perf_mmap__mmap_len() function from tools/perf commit bf59b3053e63783520c2810fc3f676553bc7eedd upstream Move perf_mmap__mmap_len() from tools/perf to libperf, it will be used in the following patches. And rename the existing perf's function to mmap__mmap_len(). 
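The helper encodes the ring buffer layout: one control page followed by a power-of-two data area described by the mask. A worked example with assumed numbers (4096-byte pages; perf reads the real size at runtime):

    size_t page_size = 4096;            /* assumed for the example */
    int mask = 8 * page_size - 1;       /* eight data pages -> 0x7fff */
    size_t len = mask + 1 + page_size;  /* 32768 + 4096 = 36864 bytes mapped */

This mirrors the perf_mmap__mmap_len() body in the diff below, and the inverse computation already in evlist__mmap_ex(), where mask = mmap_len - page_size - 1.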
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 4 ++-- tools/perf/lib/include/internal/mmap.h | 2 ++ tools/perf/lib/mmap.c | 6 ++++++ tools/perf/util/mmap.c | 20 ++++++++++---------- tools/perf/util/mmap.h | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 997b03ba6dad..41b1c12e22ba 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -277,7 +277,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size if (record__comp_enabled(aio->rec)) { size = zstd_compress(aio->rec->session, aio->data + aio->size, - perf_mmap__mmap_len(map) - aio->size, + mmap__mmap_len(map) - aio->size, buf, size); } else { memcpy(aio->data + aio->size, buf, size); @@ -489,7 +489,7 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size) struct record *rec = to; if (record__comp_enabled(rec)) { - size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size); + size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size); bf = map->data; } diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index b26806b36bb6..e7a67260940c 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -34,6 +34,8 @@ struct perf_mmap_param { int mask; }; +size_t perf_mmap__mmap_len(struct perf_mmap *map); + void perf_mmap__init(struct perf_mmap *map, bool overwrite); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index 3da6177510e6..cc4284da4d99 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include void perf_mmap__init(struct perf_mmap *map, bool overwrite) { @@ -7,3 +8,8 @@ void perf_mmap__init(struct perf_mmap *map, bool overwrite) map->overwrite = overwrite; refcount_set(&map->refcnt, 0); } + +size_t perf_mmap__mmap_len(struct perf_mmap *map) +{ + return map->mask + 1 + page_size; +} diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index a496ced5ed2a..a8e81c4cbae8 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -23,9 +23,9 @@ #include "../perf.h" #include /* page_size */ -size_t perf_mmap__mmap_len(struct mmap *map) +size_t mmap__mmap_len(struct mmap *map) { - return map->core.mask + 1 + page_size; + return perf_mmap__mmap_len(&map->core); } /* When check_messup is true, 'end' must points to a good entry */ @@ -170,7 +170,7 @@ static int perf_mmap__aio_enabled(struct mmap *map) #ifdef HAVE_LIBNUMA_SUPPORT static int perf_mmap__aio_alloc(struct mmap *map, int idx) { - map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE, + map->aio.data[idx] = mmap(NULL, mmap__mmap_len(map), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (map->aio.data[idx] == MAP_FAILED) { map->aio.data[idx] = NULL; @@ -183,7 +183,7 @@ static int perf_mmap__aio_alloc(struct mmap *map, int idx) static void perf_mmap__aio_free(struct mmap *map, int idx) { if (map->aio.data[idx]) { - munmap(map->aio.data[idx], perf_mmap__mmap_len(map)); + munmap(map->aio.data[idx], mmap__mmap_len(map)); map->aio.data[idx] = NULL; } } @@ -196,7 +196,7 @@ static int 
perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) { data = map->aio.data[idx]; - mmap_len = perf_mmap__mmap_len(map); + mmap_len = mmap__mmap_len(map); node_mask = 1UL << cpu__get_node(cpu); if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) { pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n", @@ -210,7 +210,7 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) #else /* !HAVE_LIBNUMA_SUPPORT */ static int perf_mmap__aio_alloc(struct mmap *map, int idx) { - map->aio.data[idx] = malloc(perf_mmap__mmap_len(map)); + map->aio.data[idx] = malloc(mmap__mmap_len(map)); if (map->aio.data[idx] == NULL) return -1; @@ -315,11 +315,11 @@ void perf_mmap__munmap(struct mmap *map) { perf_mmap__aio_munmap(map); if (map->data != NULL) { - munmap(map->data, perf_mmap__mmap_len(map)); + munmap(map->data, mmap__mmap_len(map)); map->data = NULL; } if (map->core.base != NULL) { - munmap(map->core.base, perf_mmap__mmap_len(map)); + munmap(map->core.base, mmap__mmap_len(map)); map->core.base = NULL; map->core.fd = -1; refcount_set(&map->core.refcnt, 0); @@ -371,7 +371,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) refcount_set(&map->core.refcnt, 2); map->core.prev = 0; map->core.mask = mp->core.mask; - map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->core.prot, + map->core.base = mmap(NULL, mmap__mmap_len(map), mp->core.prot, MAP_SHARED, fd, 0); if (map->core.base == MAP_FAILED) { pr_debug2("failed to mmap perf event ring buffer, error %d\n", @@ -389,7 +389,7 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) map->comp_level = mp->comp_level; if (map->comp_level && !perf_mmap__aio_enabled(map)) { - map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE, + map->data = mmap(NULL, mmap__mmap_len(map), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (map->data == MAP_FAILED) { pr_debug2("failed to mmap data buffer, error %d\n", diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 4ff75d8aeb05..2b97dc6d9ee2 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -67,7 +67,7 @@ union perf_event *perf_mmap__read_event(struct mmap *map); int perf_mmap__push(struct mmap *md, void *to, int push(struct mmap *map, void *to, void *buf, size_t size)); -size_t perf_mmap__mmap_len(struct mmap *map); +size_t mmap__mmap_len(struct mmap *map); int perf_mmap__read_init(struct mmap *md); void perf_mmap__read_done(struct mmap *map); -- Gitee From a0cc9d32f15bf92c747a70a4098654bc3d6b9ea2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:12 +0200 Subject: [PATCH 133/257] libperf: Adopt perf_mmap__mmap() function from tools/perf commit 32c261c070c222858148c2171698d2954242ddd9 upstream Move perf_mmap__mmap() from tools/perf to libperf, it will be used in the following patches. And rename the existing perf's function to mmap__mmap(). 
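A minimal usage sketch of the adopted function; this is internal libperf API, and fd, cpu and page_size are assumed to come from an earlier perf_event_open() and sysconf() (refcounting, which perf handles around this call, is elided):

    struct perf_mmap map;
    struct perf_mmap_param mp = {
        .prot = PROT_READ | PROT_WRITE,
        .mask = 8 * page_size - 1,      /* eight data pages */
    };

    perf_mmap__init(&map, false);       /* non-overwrite ring */
    if (perf_mmap__mmap(&map, &mp, fd, cpu) < 0)
        return -1;                      /* map.base is left NULL on failure */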
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/lib/include/internal/mmap.h | 2 ++ tools/perf/lib/mmap.c | 18 ++++++++++++++++++ tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 12 +++--------- tools/perf/util/mmap.h | 2 +- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index e7a67260940c..7067b70c6f61 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -37,5 +37,7 @@ struct perf_mmap_param { size_t perf_mmap__mmap_len(struct perf_mmap *map); void perf_mmap__init(struct perf_mmap *map, bool overwrite); +int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, + int fd, int cpu); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index cc4284da4d99..b216a7db857f 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include @@ -13,3 +14,20 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map) { return map->mask + 1 + page_size; } + +int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, + int fd, int cpu) +{ + map->prev = 0; + map->mask = mp->mask; + map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, + MAP_SHARED, fd, 0); + if (map->base == MAP_FAILED) { + map->base = NULL; + return -1; + } + + map->fd = fd; + map->cpu = cpu; + return 0; +} diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 430bfea8b3e5..3df73d2f4dfc 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -671,7 +671,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, if (*output == -1) { *output = fd; - if (perf_mmap__mmap(&maps[idx], mp, *output, evlist_cpu) < 0) + if (mmap__mmap(&maps[idx], mp, *output, evlist_cpu) < 0) return -1; } else { if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index a8e81c4cbae8..acef6e3f6b80 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -353,7 +353,7 @@ static void perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params CPU_SET(map->core.cpu, &map->affinity_mask); } -int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) +int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) { /* * The last one will be done at perf_mmap__consume(), so that we @@ -369,18 +369,12 @@ int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) * perf_evlist__filter_pollfd(). 
*/ refcount_set(&map->core.refcnt, 2); - map->core.prev = 0; - map->core.mask = mp->core.mask; - map->core.base = mmap(NULL, mmap__mmap_len(map), mp->core.prot, - MAP_SHARED, fd, 0); - if (map->core.base == MAP_FAILED) { + + if (perf_mmap__mmap(&map->core, &mp->core, fd, cpu)) { pr_debug2("failed to mmap perf event ring buffer, error %d\n", errno); - map->core.base = NULL; return -1; } - map->core.fd = fd; - map->core.cpu = cpu; perf_mmap__setup_affinity_mask(map, mp); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 2b97dc6d9ee2..a60e6ead7255 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -42,7 +42,7 @@ struct mmap_params { struct auxtrace_mmap_params auxtrace_mp; }; -int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); +int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); void perf_mmap__munmap(struct mmap *map); void perf_mmap__get(struct mmap *map); -- Gitee From 0c4f7bd24a9afaeaf388987b0cde043ccfb62a42 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:13 +0200 Subject: [PATCH 134/257] libperf: Adopt perf_mmap__get() function from tools/perf commit e75710f063e29ae7715c57b45eb27c2d504b32ca upstream Move perf_mmap__get() from tools/perf to libperf in the internal header internal/mmap.h. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 2 +- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/lib/mmap.c | 5 +++++ tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 5 ----- tools/perf/util/mmap.h | 1 - 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 41b1c12e22ba..4973bee8a84f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -294,7 +294,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size * after started aio request completion or at record__aio_push() * if the request failed to start. 
*/ - perf_mmap__get(map); + perf_mmap__get(&map->core); } aio->size += size; diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 7067b70c6f61..2e68974bffb4 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -39,5 +39,6 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map); void perf_mmap__init(struct perf_mmap *map, bool overwrite); int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, int fd, int cpu); +void perf_mmap__get(struct perf_mmap *map); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index b216a7db857f..b765e0505bb6 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -31,3 +31,8 @@ int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, map->cpu = cpu; return 0; } + +void perf_mmap__get(struct perf_mmap *map) +{ + refcount_inc(&map->refcnt); +} diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3df73d2f4dfc..a40d42d79f5c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -677,7 +677,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0) return -1; - perf_mmap__get(&maps[idx]); + perf_mmap__get(&maps[idx].core); } revent = perf_evlist__should_poll(evlist, evsel) ? POLLIN : 0; diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index acef6e3f6b80..be691b58d8ab 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -110,11 +110,6 @@ static bool perf_mmap__empty(struct mmap *map) return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base; } -void perf_mmap__get(struct mmap *map) -{ - refcount_inc(&map->core.refcnt); -} - void perf_mmap__put(struct mmap *map) { BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index a60e6ead7255..a73402ee8fe0 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -45,7 +45,6 @@ struct mmap_params { int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); void perf_mmap__munmap(struct mmap *map); -void perf_mmap__get(struct mmap *map); void perf_mmap__put(struct mmap *map); void perf_mmap__consume(struct mmap *map); -- Gitee From fef5a973a52e61540efe78559063d18901cc4a59 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:14 +0200 Subject: [PATCH 135/257] libperf: Adopt perf_mmap__munmap() function from tools/perf commit 59d7ea620b58fa7d107834a81528e3098f1c27b0 upstream Move perf_mmap__munmap() from tools/perf to libperf, to internal header internal/mmap.h. It will be used in the following patches. And rename the existing perf's function to mmap__munmap(). 
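Note that the libperf version in the diff below checks both the map pointer and map->base, so unmapping a ring that was initialized but never successfully mapped is a safe no-op; a small sketch under that assumption:

    struct perf_mmap map;

    perf_mmap__init(&map, false);
    /* perf_mmap__mmap() not called, or it failed: base is still NULL */
    perf_mmap__munmap(&map);            /* nothing mapped, nothing to do */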
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-7-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/lib/include/internal/mmap.h | 1 + tools/perf/lib/mmap.c | 10 ++++++++++ tools/perf/util/evlist.c | 4 ++-- tools/perf/util/mmap.c | 11 +++-------- tools/perf/util/mmap.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 2e68974bffb4..5c2ca9ab12cd 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -39,6 +39,7 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map); void perf_mmap__init(struct perf_mmap *map, bool overwrite); int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, int fd, int cpu); +void perf_mmap__munmap(struct perf_mmap *map); void perf_mmap__get(struct perf_mmap *map); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index b765e0505bb6..6eb228d89206 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -32,6 +32,16 @@ int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, return 0; } +void perf_mmap__munmap(struct perf_mmap *map) +{ + if (map && map->base != NULL) { + munmap(map->base, perf_mmap__mmap_len(map)); + map->base = NULL; + map->fd = -1; + refcount_set(&map->refcnt, 0); + } +} + void perf_mmap__get(struct perf_mmap *map) { refcount_inc(&map->refcnt); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a40d42d79f5c..75711d2429b6 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -577,11 +577,11 @@ static void evlist__munmap_nofree(struct evlist *evlist) if (evlist->mmap) for (i = 0; i < evlist->core.nr_mmaps; i++) - perf_mmap__munmap(&evlist->mmap[i]); + mmap__munmap(&evlist->mmap[i]); if (evlist->overwrite_mmap) for (i = 0; i < evlist->core.nr_mmaps; i++) - perf_mmap__munmap(&evlist->overwrite_mmap[i]); + mmap__munmap(&evlist->overwrite_mmap[i]); } void evlist__munmap(struct evlist *evlist) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index be691b58d8ab..2c73b5bcf74e 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -115,7 +115,7 @@ void perf_mmap__put(struct mmap *map) BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0); if (refcount_dec_and_test(&map->core.refcnt)) - perf_mmap__munmap(map); + mmap__munmap(map); } void perf_mmap__consume(struct mmap *map) @@ -306,19 +306,14 @@ static void perf_mmap__aio_munmap(struct mmap *map __maybe_unused) } #endif -void perf_mmap__munmap(struct mmap *map) +void mmap__munmap(struct mmap *map) { + perf_mmap__munmap(&map->core); perf_mmap__aio_munmap(map); if (map->data != NULL) { munmap(map->data, mmap__mmap_len(map)); map->data = NULL; } - if (map->core.base != NULL) { - munmap(map->core.base, mmap__mmap_len(map)); - map->core.base = NULL; - map->core.fd = -1; - refcount_set(&map->core.refcnt, 0); - } auxtrace_mmap__munmap(&map->auxtrace_mmap); } diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index a73402ee8fe0..6a18b2990059 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -43,7 +43,7 @@ struct mmap_params { }; int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); -void perf_mmap__munmap(struct mmap *map); +void mmap__munmap(struct mmap *map); void 
perf_mmap__put(struct mmap *map); -- Gitee From 15fa591d0d461760e5e42dde6980527f10316ada Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:16 +0200 Subject: [PATCH 136/257] libperf: Adopt perf_mmap__put() function from tools/perf commit 80e53d1148231d7d4fdc4cd89e5393616b33bf82 upstream Move perf_mmap__put() from tools/perf to libperf. Once perf_mmap__put() is moved, we need a way to call application related unmap code (AIO and aux related code for perf), when the map goes away. Add the perf_mmap::unmap callback to do that. The unmap path from perf is:

  perf_mmap__put (libperf)
    perf_mmap__munmap (libperf)
      map->unmap_cb -> perf_mmap__unmap_cb (perf)
        mmap__munmap (perf)

Committer notes: Add missing linux/kernel.h to tools/perf/lib/mmap.c to get the BUG_ON definition. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-8-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 4 ++-- tools/perf/lib/include/internal/mmap.h | 31 ++++++++++++++++---------- tools/perf/lib/mmap.c | 15 ++++++++++++- tools/perf/util/evlist.c | 17 +++++++++----- tools/perf/util/mmap.c | 11 +-------- tools/perf/util/mmap.h | 2 -- 6 files changed, 48 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4973bee8a84f..077a0faa9b98 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -198,7 +198,7 @@ static int record__aio_complete(struct mmap *md, struct aiocb *cblock) * every aio write request started in record__aio_push() so * decrement it because the request is now complete. */ - perf_mmap__put(md); + perf_mmap__put(&md->core); rc = 1; } else { /* @@ -333,7 +333,7 @@ static int record__aio_push(struct record *rec, struct mmap *map, off_t *off) * map->refcount is decremented in record__aio_complete() after * aio write operation finishes successfully. */ - perf_mmap__put(map); + perf_mmap__put(&map->core); } return ret; diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index 5c2ca9ab12cd..bf9cc7d005ab 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -10,23 +10,28 @@ /* perf sample has 16 bits size limit */ #define PERF_SAMPLE_MAX_SIZE (1 << 16) +struct perf_mmap; + +typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map); + /** * struct perf_mmap - perf's ring buffer mmap details * * @refcnt - e.g. 
code using PERF_EVENT_IOC_SET_OUTPUT to share this */ struct perf_mmap { - void *base; - int mask; - int fd; - int cpu; - refcount_t refcnt; - u64 prev; - u64 start; - u64 end; - bool overwrite; - u64 flush; - char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); + void *base; + int mask; + int fd; + int cpu; + refcount_t refcnt; + u64 prev; + u64 start; + u64 end; + bool overwrite; + u64 flush; + libperf_unmap_cb_t unmap_cb; + char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); }; struct perf_mmap_param { @@ -36,10 +41,12 @@ struct perf_mmap_param { size_t perf_mmap__mmap_len(struct perf_mmap *map); -void perf_mmap__init(struct perf_mmap *map, bool overwrite); +void perf_mmap__init(struct perf_mmap *map, bool overwrite, + libperf_unmap_cb_t unmap_cb); int perf_mmap__mmap(struct perf_mmap *map, struct perf_mmap_param *mp, int fd, int cpu); void perf_mmap__munmap(struct perf_mmap *map); void perf_mmap__get(struct perf_mmap *map); +void perf_mmap__put(struct perf_mmap *map); #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index 6eb228d89206..89c1e0e8b897 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -2,11 +2,14 @@ #include #include #include +#include -void perf_mmap__init(struct perf_mmap *map, bool overwrite) +void perf_mmap__init(struct perf_mmap *map, bool overwrite, + libperf_unmap_cb_t unmap_cb) { map->fd = -1; map->overwrite = overwrite; + map->unmap_cb = unmap_cb; refcount_set(&map->refcnt, 0); } @@ -40,9 +43,19 @@ void perf_mmap__munmap(struct perf_mmap *map) map->fd = -1; refcount_set(&map->refcnt, 0); } + if (map && map->unmap_cb) + map->unmap_cb(map); } void perf_mmap__get(struct perf_mmap *map) { refcount_inc(&map->refcnt); } + +void perf_mmap__put(struct perf_mmap *map) +{ + BUG_ON(map->base && refcount_read(&map->refcnt) == 0); + + if (refcount_dec_and_test(&map->refcnt)) + perf_mmap__munmap(map); +} diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 75711d2429b6..a0907b3d6800 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -409,7 +409,7 @@ static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, struct mmap *map = fda->priv[fd].ptr; if (map) - perf_mmap__put(map); + perf_mmap__put(&map->core); } int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask) @@ -577,11 +577,11 @@ static void evlist__munmap_nofree(struct evlist *evlist) if (evlist->mmap) for (i = 0; i < evlist->core.nr_mmaps; i++) - mmap__munmap(&evlist->mmap[i]); + perf_mmap__munmap(&evlist->mmap[i].core); if (evlist->overwrite_mmap) for (i = 0; i < evlist->core.nr_mmaps; i++) - mmap__munmap(&evlist->overwrite_mmap[i]); + perf_mmap__munmap(&evlist->overwrite_mmap[i].core); } void evlist__munmap(struct evlist *evlist) @@ -591,6 +591,13 @@ void evlist__munmap(struct evlist *evlist) zfree(&evlist->overwrite_mmap); } +static void perf_mmap__unmap_cb(struct perf_mmap *map) +{ + struct mmap *m = container_of(map, struct mmap, core); + + mmap__munmap(m); +} + static struct mmap *evlist__alloc_mmap(struct evlist *evlist, bool overwrite) { @@ -614,7 +621,7 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist, * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and * thus does perf_mmap__get() on it. 
*/ - perf_mmap__init(&map[i].core, overwrite); + perf_mmap__init(&map[i].core, overwrite, perf_mmap__unmap_cb); } return map; @@ -691,7 +698,7 @@ static int evlist__mmap_per_evsel(struct evlist *evlist, int idx, */ if (!evsel->core.system_wide && perf_evlist__add_pollfd(&evlist->core, fd, &maps[idx], revent) < 0) { - perf_mmap__put(&maps[idx]); + perf_mmap__put(&maps[idx].core); return -1; } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 2c73b5bcf74e..9f150d50cea5 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -110,14 +110,6 @@ static bool perf_mmap__empty(struct mmap *map) return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base; } -void perf_mmap__put(struct mmap *map) -{ - BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0); - - if (refcount_dec_and_test(&map->core.refcnt)) - mmap__munmap(map); -} - void perf_mmap__consume(struct mmap *map) { if (!map->core.overwrite) { @@ -127,7 +119,7 @@ void perf_mmap__consume(struct mmap *map) } if (refcount_read(&map->core.refcnt) == 1 && perf_mmap__empty(map)) - perf_mmap__put(map); + perf_mmap__put(&map->core); } int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, @@ -308,7 +300,6 @@ static void perf_mmap__aio_munmap(struct mmap *map __maybe_unused) void mmap__munmap(struct mmap *map) { - perf_mmap__munmap(&map->core); perf_mmap__aio_munmap(map); if (map->data != NULL) { munmap(map->data, mmap__mmap_len(map)); diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 6a18b2990059..78e3c4436ce8 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -45,8 +45,6 @@ struct mmap_params { int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); void mmap__munmap(struct mmap *map); -void perf_mmap__put(struct mmap *map); - void perf_mmap__consume(struct mmap *map); static inline u64 perf_mmap__read_head(struct mmap *mm) -- Gitee From 39ff9141703156864ddedeea15c47a1088ead3e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:16 +0200 Subject: [PATCH 137/257] perf tools: Use perf_mmap way to detect aux mmap commit 1d40ae4e1784bfa1646fd153ca022db21511284f upstream We will move this code to libperf shortly, so we need to free it of 'struct auxtrace_mmap' usage, because it won't be available in libperf (for now). The perf_event_mmap_page::aux_size is set when the aux mmap is mapped, so the check is equivalent. 
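A sketch of the equivalence being relied on; the control page is the first page of the ring buffer mapping:

    struct perf_event_mmap_page *pc = map->core.base;

    /*
     * aux_size is filled in when the aux area is mmapped, so this
     * matches the old !map->auxtrace_mmap.base test.
     */
    bool have_aux = pc->aux_size != 0;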
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-9-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/util/mmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 9f150d50cea5..f246dd403507 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -107,7 +107,9 @@ union perf_event *perf_mmap__read_event(struct mmap *map) static bool perf_mmap__empty(struct mmap *map) { - return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base; + struct perf_event_mmap_page *pc = map->core.base; + + return perf_mmap__read_head(map) == map->core.prev && !pc->aux_size; } void perf_mmap__consume(struct mmap *map) -- Gitee From 2aa6fd261a4cb58912a0cd8073a587c8a801a689 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:17 +0200 Subject: [PATCH 138/257] libperf: Adopt perf_mmap__consume() function from tools/perf commit 7728fa0cfaeb7d25b12c8865c733359cc8e5fb13 upstream Move perf_mmap__consume() vrom tools/perf to libperf and export it in the perf/mmap.h header. Move also the needed helpers perf_mmap__write_tail(), perf_mmap__read_head() and perf_mmap__empty(). Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-10-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 3 +- tools/perf/builtin-kvm.c | 5 +-- tools/perf/builtin-top.c | 3 +- tools/perf/builtin-trace.c | 3 +- tools/perf/lib/Makefile | 5 +-- tools/perf/lib/include/internal/mmap.h | 2 ++ tools/perf/lib/include/perf/mmap.h | 11 +++++++ tools/perf/lib/libperf.map | 1 + tools/perf/lib/mmap.c | 32 ++++++++++++++++++++ tools/perf/tests/backward-ring-buffer.c | 1 + tools/perf/tests/bpf.c | 1 + tools/perf/tests/code-reading.c | 3 +- tools/perf/tests/keep-tracking.c | 3 +- tools/perf/tests/mmap-basic.c | 3 +- tools/perf/tests/openat-syscall-tp-fields.c | 3 +- tools/perf/tests/perf-record.c | 3 +- tools/perf/tests/sw-clock.c | 3 +- tools/perf/tests/switch-tracking.c | 3 +- tools/perf/tests/task-exit.c | 3 +- tools/perf/util/evlist.c | 3 +- tools/perf/util/mmap.c | 32 +++++--------------- tools/perf/util/mmap.h | 12 -------- tools/perf/util/python.c | 3 +- 23 files changed, 87 insertions(+), 54 deletions(-) create mode 100644 tools/perf/lib/include/perf/mmap.h diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index fa947952c16a..3397898824f6 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "debug.h" #include "parse-events.h" @@ -139,7 +140,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe comm2_time = sample.time; } next_event: - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 58a9e0989491..0c04c4c6c1eb 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -46,6 +46,7 @@ #include #include #include +#include static const char *get_filename_for_perf_kvm(void) { @@ -766,7 +767,7 @@ static s64 perf_kvm__mmap_read_idx(struct 
perf_kvm_stat *kvm, int idx, while ((event = perf_mmap__read_event(md)) != NULL) { err = perf_evlist__parse_sample_timestamp(evlist, event, ×tamp); if (err) { - perf_mmap__consume(md); + perf_mmap__consume(&md->core); pr_err("Failed to parse sample\n"); return -1; } @@ -776,7 +777,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, * FIXME: Here we can't consume the event, as perf_session__queue_event will * point to it, and it'll get possibly overwritten by the kernel. */ - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (err) { pr_err("Failed to enqueue sample: %d\n", err); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 65fe4f5f5de2..1c380edb8c66 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -82,6 +82,7 @@ #include #include +#include static volatile int done; static volatile int resize; @@ -885,7 +886,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) if (ret) break; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (top->qe.rotate) { pthread_mutex_lock(&top->qe.mutex); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 6052eb057821..56cc5fa99ff8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -77,6 +77,7 @@ #include #include +#include #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 @@ -3509,7 +3510,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (err) goto out_disable; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (interrupted) goto out_disable; diff --git a/tools/perf/lib/Makefile b/tools/perf/lib/Makefile index 85ccb8c439a4..0889c9c3ec19 100644 --- a/tools/perf/lib/Makefile +++ b/tools/perf/lib/Makefile @@ -172,8 +172,9 @@ install_headers: $(call do_install,include/perf/cpumap.h,$(prefix)/include/perf,644); \ $(call do_install,include/perf/threadmap.h,$(prefix)/include/perf,644); \ $(call do_install,include/perf/evlist.h,$(prefix)/include/perf,644); \ - $(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644); - $(call do_install,include/perf/event.h,$(prefix)/include/perf,644); + $(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644); \ + $(call do_install,include/perf/event.h,$(prefix)/include/perf,644); \ + $(call do_install,include/perf/mmap.h,$(prefix)/include/perf,644); install_pkgconfig: $(LIBPERF_PC) $(call QUIET_INSTALL, $(LIBPERF_PC)) \ diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h index bf9cc7d005ab..ee536c4441bb 100644 --- a/tools/perf/lib/include/internal/mmap.h +++ b/tools/perf/lib/include/internal/mmap.h @@ -49,4 +49,6 @@ void perf_mmap__munmap(struct perf_mmap *map); void perf_mmap__get(struct perf_mmap *map); void perf_mmap__put(struct perf_mmap *map); +u64 perf_mmap__read_head(struct perf_mmap *map); + #endif /* __LIBPERF_INTERNAL_MMAP_H */ diff --git a/tools/perf/lib/include/perf/mmap.h b/tools/perf/lib/include/perf/mmap.h new file mode 100644 index 000000000000..d3678d1834d9 --- /dev/null +++ b/tools/perf/lib/include/perf/mmap.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LIBPERF_MMAP_H +#define __LIBPERF_MMAP_H + +#include + +struct perf_mmap; + +LIBPERF_API void perf_mmap__consume(struct perf_mmap *map); + +#endif /* __LIBPERF_MMAP_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index ab8dbde1136c..d7b327f224e2 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -40,6 +40,7 @@ LIBPERF_0.0.1 { 
perf_evlist__next; perf_evlist__set_maps; perf_evlist__poll; + perf_mmap__consume; local: *; }; diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index 89c1e0e8b897..4cada1c89fdb 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include +#include #include #include #include @@ -59,3 +62,32 @@ void perf_mmap__put(struct perf_mmap *map) if (refcount_dec_and_test(&map->refcnt)) perf_mmap__munmap(map); } + +static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) +{ + ring_buffer_write_tail(md->base, tail); +} + +u64 perf_mmap__read_head(struct perf_mmap *map) +{ + return ring_buffer_read_head(map->base); +} + +static bool perf_mmap__empty(struct perf_mmap *map) +{ + struct perf_event_mmap_page *pc = map->base; + + return perf_mmap__read_head(map) == map->prev && !pc->aux_size; +} + +void perf_mmap__consume(struct perf_mmap *map) +{ + if (!map->overwrite) { + u64 old = map->prev; + + perf_mmap__write_tail(map, old); + } + + if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map)) + perf_mmap__put(map); +} diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 5128f727c0ef..89aa98af3d63 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -13,6 +13,7 @@ #include "util/mmap.h" #include #include +#include #define NR_ITERS 111 diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 8669bb85e7c7..cd65469cd01f 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tests.h" #include "llvm.h" #include "debug.h" diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index f5764a3890b9..b5a57bb54c25 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "debug.h" #include "dso.h" @@ -430,7 +431,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, while ((event = perf_mmap__read_event(md)) != NULL) { ret = process_event(machine, evlist, event, state); - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (ret < 0) return ret; } diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 92c7d591bcac..31c005e07b17 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "debug.h" #include "parse-events.h" @@ -46,7 +47,7 @@ static int find_comm(struct evlist *evlist, const char *comm) (pid_t)event->comm.tid == getpid() && strcmp(event->comm.comm, comm) == 0) found += 1; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); } diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 3a22dce991ba..b176acc4f52e 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * This test will generate random numbers of calls to some getpid syscalls, @@ -139,7 +140,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse goto out_delete_evlist; } nr_events[evsel->idx]++; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index 2b5c46813053..bbf8ba320721 100644 --- 
a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -13,6 +13,7 @@ #include "debug.h" #include "util/mmap.h" #include +#include #ifndef O_DIRECTORY #define O_DIRECTORY 00200000 @@ -103,7 +104,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest ++nr_events; if (type != PERF_RECORD_SAMPLE) { - perf_mmap__consume(md); + perf_mmap__consume(&md->core); continue; } diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 437426be29e9..6ebbcc65749e 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -6,6 +6,7 @@ #include #include +#include #include "evlist.h" #include "evsel.h" #include "debug.h" @@ -276,7 +277,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus ++errs; } - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); } diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 84519df87f30..1aeb558010c1 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -15,6 +15,7 @@ #include "util/mmap.h" #include "util/thread_map.h" #include +#include #define NR_LOOPS 10000000 @@ -117,7 +118,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) total_periods += sample.period; nr_samples++; next_event: - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index ffa592e0020e..55728b3da057 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "debug.h" #include "parse-events.h" @@ -275,7 +276,7 @@ static int process_events(struct evlist *evlist, while ((event = perf_mmap__read_event(md)) != NULL) { cnt += 1; ret = add_event(evlist, &events, event); - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (ret < 0) goto out_free_nodes; } diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index d85c9f608564..a0f1deacfc62 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -12,6 +12,7 @@ #include #include #include +#include static int exited; static int nr_exit; @@ -126,7 +127,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused if (event->header.type == PERF_RECORD_EXIT) nr_exit++; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); } perf_mmap__read_done(md); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a0907b3d6800..1952f2b7ee45 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -1741,7 +1742,7 @@ static void *perf_evlist__poll_thread(void *arg) else pr_warning("cannot locate proper evsel for the side band event\n"); - perf_mmap__consume(map); + perf_mmap__consume(&map->core); got_data = true; } perf_mmap__read_done(map); diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index f246dd403507..abe7cbe6c95f 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -13,6 +13,7 @@ #include #include #include // sysconf() +#include #ifdef HAVE_LIBNUMA_SUPPORT #include #endif @@ -95,7 +96,7 @@ union perf_event *perf_mmap__read_event(struct mmap *map) /* non-overwirte doesn't pause the ringbuffer */ if (!map->core.overwrite) - map->core.end = perf_mmap__read_head(map); + map->core.end = perf_mmap__read_head(&map->core); 
event = perf_mmap__read(map, &map->core.start, map->core.end); @@ -105,25 +106,6 @@ union perf_event *perf_mmap__read_event(struct mmap *map) return event; } -static bool perf_mmap__empty(struct mmap *map) -{ - struct perf_event_mmap_page *pc = map->core.base; - - return perf_mmap__read_head(map) == map->core.prev && !pc->aux_size; -} - -void perf_mmap__consume(struct mmap *map) -{ - if (!map->core.overwrite) { - u64 old = map->core.prev; - - perf_mmap__write_tail(map, old); - } - - if (refcount_read(&map->core.refcnt) == 1 && perf_mmap__empty(map)) - perf_mmap__put(&map->core); -} - int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, struct auxtrace_mmap_params *mp __maybe_unused, void *userpg __maybe_unused, @@ -420,7 +402,7 @@ static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end) */ static int __perf_mmap__read_init(struct mmap *md) { - u64 head = perf_mmap__read_head(md); + u64 head = perf_mmap__read_head(&md->core); u64 old = md->core.prev; unsigned char *data = md->core.base + page_size; unsigned long size; @@ -437,7 +419,7 @@ static int __perf_mmap__read_init(struct mmap *md) WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); md->core.prev = head; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); return -EAGAIN; } @@ -466,7 +448,7 @@ int perf_mmap__read_init(struct mmap *map) int perf_mmap__push(struct mmap *md, void *to, int push(struct mmap *map, void *to, void *buf, size_t size)) { - u64 head = perf_mmap__read_head(md); + u64 head = perf_mmap__read_head(&md->core); unsigned char *data = md->core.base + page_size; unsigned long size; void *buf; @@ -499,7 +481,7 @@ int perf_mmap__push(struct mmap *md, void *to, } md->core.prev = head; - perf_mmap__consume(md); + perf_mmap__consume(&md->core); out: return rc; } @@ -518,5 +500,5 @@ void perf_mmap__read_done(struct mmap *map) if (!refcount_read(&map->core.refcnt)) return; - map->core.prev = perf_mmap__read_head(map); + map->core.prev = perf_mmap__read_head(&map->core); } diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 78e3c4436ce8..89fb93267ff1 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -45,18 +45,6 @@ struct mmap_params { int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu); void mmap__munmap(struct mmap *map); -void perf_mmap__consume(struct mmap *map); - -static inline u64 perf_mmap__read_head(struct mmap *mm) -{ - return ring_buffer_read_head(mm->core.base); -} - -static inline void perf_mmap__write_tail(struct mmap *md, u64 tail) -{ - ring_buffer_write_tail(md->core.base, tail); -} - union perf_event *perf_mmap__read_forward(struct mmap *map); union perf_event *perf_mmap__read_event(struct mmap *map); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 02460362256d..82a4fa6c87bd 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "evlist.h" #include "callchain.h" #include "evsel.h" @@ -1045,7 +1046,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, err = perf_evsel__parse_sample(evsel, event, &pevent->sample); /* Consume the even only after we parsed it out. 
*/ - perf_mmap__consume(md); + perf_mmap__consume(&md->core); if (err) return PyErr_Format(PyExc_OSError, -- Gitee From 5a91299cd43fa86b82fc999dd28bb964043f1411 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:18 +0200 Subject: [PATCH 139/257] libperf: Adopt perf_mmap__read_init() from tools/perf commit 7c4d41824f9afc659ba425a41018546531cffd72 upstream Move perf_mmap__read_init() from tools/perf to libperf and export it in perf/mmap.h header. And add pr_debug2()/pr_debug3() macros support, because the code is using them. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-11-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/include/perf/core.h | 2 + tools/perf/lib/include/perf/mmap.h | 1 + tools/perf/lib/internal.h | 2 + tools/perf/lib/libperf.map | 1 + tools/perf/lib/mmap.c | 84 ++++++++++++++++++++ tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 82 +------------------ tools/perf/util/mmap.h | 1 - tools/perf/util/python.c | 2 +- 23 files changed, 107 insertions(+), 98 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 3397898824f6..6a0c3ff78e01 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -118,7 +118,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 0c04c4c6c1eb..b6a8078dd446 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -760,7 +760,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, *mmap_time = ULLONG_MAX; md = &evlist->mmap[idx]; - err = perf_mmap__read_init(md); + err = perf_mmap__read_init(&md->core); if (err < 0) return (err == -EAGAIN) ? 0 : -1; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1c380edb8c66..1f0785d9b583 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -872,7 +872,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) union perf_event *event; md = opts->overwrite ? 
&evlist->overwrite_mmap[idx] : &evlist->mmap[idx]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) return; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 56cc5fa99ff8..416d93dee556 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3500,7 +3500,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) struct mmap *md; md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/lib/include/perf/core.h b/tools/perf/lib/include/perf/core.h index cfd70e720c1c..2a80e4b6f819 100644 --- a/tools/perf/lib/include/perf/core.h +++ b/tools/perf/lib/include/perf/core.h @@ -12,6 +12,8 @@ enum libperf_print_level { LIBPERF_WARN, LIBPERF_INFO, LIBPERF_DEBUG, + LIBPERF_DEBUG2, + LIBPERF_DEBUG3, }; typedef int (*libperf_print_fn_t)(enum libperf_print_level level, diff --git a/tools/perf/lib/include/perf/mmap.h b/tools/perf/lib/include/perf/mmap.h index d3678d1834d9..646e9052b003 100644 --- a/tools/perf/lib/include/perf/mmap.h +++ b/tools/perf/lib/include/perf/mmap.h @@ -7,5 +7,6 @@ struct perf_mmap; LIBPERF_API void perf_mmap__consume(struct perf_mmap *map); +LIBPERF_API int perf_mmap__read_init(struct perf_mmap *map); #endif /* __LIBPERF_MMAP_H */ diff --git a/tools/perf/lib/internal.h b/tools/perf/lib/internal.h index dc92f241732e..37db745e1502 100644 --- a/tools/perf/lib/internal.h +++ b/tools/perf/lib/internal.h @@ -14,5 +14,7 @@ do { \ #define pr_warning(fmt, ...) __pr(LIBPERF_WARN, fmt, ##__VA_ARGS__) #define pr_info(fmt, ...) __pr(LIBPERF_INFO, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) __pr(LIBPERF_DEBUG, fmt, ##__VA_ARGS__) +#define pr_debug2(fmt, ...) __pr(LIBPERF_DEBUG2, fmt, ##__VA_ARGS__) +#define pr_debug3(fmt, ...) 
__pr(LIBPERF_DEBUG3, fmt, ##__VA_ARGS__) #endif /* __LIBPERF_INTERNAL_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index d7b327f224e2..bc3fbb213a3e 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -41,6 +41,7 @@ LIBPERF_0.0.1 { perf_evlist__set_maps; perf_evlist__poll; perf_mmap__consume; + perf_mmap__read_init; local: *; }; diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index 4cada1c89fdb..fdbc6c550dea 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -1,11 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include +#include #include #include #include #include #include #include +#include "internal.h" void perf_mmap__init(struct perf_mmap *map, bool overwrite, libperf_unmap_cb_t unmap_cb) @@ -91,3 +95,83 @@ void perf_mmap__consume(struct perf_mmap *map) if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map)) perf_mmap__put(map); } + +static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end) +{ + struct perf_event_header *pheader; + u64 evt_head = *start; + int size = mask + 1; + + pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start); + pheader = (struct perf_event_header *)(buf + (*start & mask)); + while (true) { + if (evt_head - *start >= (unsigned int)size) { + pr_debug("Finished reading overwrite ring buffer: rewind\n"); + if (evt_head - *start > (unsigned int)size) + evt_head -= pheader->size; + *end = evt_head; + return 0; + } + + pheader = (struct perf_event_header *)(buf + (evt_head & mask)); + + if (pheader->size == 0) { + pr_debug("Finished reading overwrite ring buffer: get start\n"); + *end = evt_head; + return 0; + } + + evt_head += pheader->size; + pr_debug3("move evt_head: %"PRIx64"\n", evt_head); + } + WARN_ONCE(1, "Shouldn't get here\n"); + return -1; +} + +/* + * Report the start and end of the available data in ringbuffer + */ +static int __perf_mmap__read_init(struct perf_mmap *md) +{ + u64 head = perf_mmap__read_head(md); + u64 old = md->prev; + unsigned char *data = md->base + page_size; + unsigned long size; + + md->start = md->overwrite ? head : old; + md->end = md->overwrite ? old : head; + + if ((md->end - md->start) < md->flush) + return -EAGAIN; + + size = md->end - md->start; + if (size > (unsigned long)(md->mask) + 1) { + if (!md->overwrite) { + WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); + + md->prev = head; + perf_mmap__consume(md); + return -EAGAIN; + } + + /* + * Backward ring buffer is full. We still have a chance to read + * most of data from it. + */ + if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end)) + return -EINVAL; + } + + return 0; +} + +int perf_mmap__read_init(struct perf_mmap *map) +{ + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. 
+ */ + if (!refcount_read(&map->refcnt)) + return -ENOENT; + + return __perf_mmap__read_init(map); +} diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 89aa98af3d63..7ea60b6df366 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -38,7 +38,7 @@ static int count_samples(struct evlist *evlist, int *sample_count, struct mmap *map = &evlist->overwrite_mmap[i]; union perf_event *event; - perf_mmap__read_init(map); + perf_mmap__read_init(&map->core); while ((event = perf_mmap__read_event(map)) != NULL) { const u32 type = event->header.type; diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index cd65469cd01f..f871d817cdeb 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -186,7 +186,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), struct mmap *md; md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index b5a57bb54c25..cf992e0b27ff 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -426,7 +426,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 31c005e07b17..e85da7e77269 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -39,7 +39,7 @@ static int find_comm(struct evlist *evlist, const char *comm) found = 0; for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { if (event->header.type == PERF_RECORD_COMM && diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index b176acc4f52e..77f42f0ac15d 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -114,7 +114,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse } md = &evlist->mmap[0]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) goto out_init; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index bbf8ba320721..d6a563120d93 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -93,7 +93,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest struct mmap *md; md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 6ebbcc65749e..2587cb8b2c0f 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -171,7 +171,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus struct mmap *md; md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = 
perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 1aeb558010c1..808669507c30 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -100,7 +100,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) evlist__disable(evlist); md = &evlist->mmap[0]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) goto out_init; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 55728b3da057..bedfdec34972 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -270,7 +270,7 @@ static int process_events(struct evlist *evlist, for (i = 0; i < evlist->core.nr_mmaps; i++) { md = &evlist->mmap[i]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) continue; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index a0f1deacfc62..e743df610880 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -120,7 +120,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused retry: md = &evlist->mmap[0]; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) goto out_init; while ((event = perf_mmap__read_event(md)) != NULL) { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1952f2b7ee45..5dd30b7e4597 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1732,7 +1732,7 @@ static void *perf_evlist__poll_thread(void *arg) struct mmap *map = &evlist->mmap[i]; union perf_event *event; - if (perf_mmap__read_init(map)) + if (perf_mmap__read_init(&map->core)) continue; while ((event = perf_mmap__read_event(map)) != NULL) { struct evsel *evsel = perf_evlist__event2evsel(evlist, event); diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index abe7cbe6c95f..59379118c2f1 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -365,86 +365,6 @@ int mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu) return perf_mmap__aio_mmap(map, mp); } -static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end) -{ - struct perf_event_header *pheader; - u64 evt_head = *start; - int size = mask + 1; - - pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start); - pheader = (struct perf_event_header *)(buf + (*start & mask)); - while (true) { - if (evt_head - *start >= (unsigned int)size) { - pr_debug("Finished reading overwrite ring buffer: rewind\n"); - if (evt_head - *start > (unsigned int)size) - evt_head -= pheader->size; - *end = evt_head; - return 0; - } - - pheader = (struct perf_event_header *)(buf + (evt_head & mask)); - - if (pheader->size == 0) { - pr_debug("Finished reading overwrite ring buffer: get start\n"); - *end = evt_head; - return 0; - } - - evt_head += pheader->size; - pr_debug3("move evt_head: %"PRIx64"\n", evt_head); - } - WARN_ONCE(1, "Shouldn't get here\n"); - return -1; -} - -/* - * Report the start and end of the available data in ringbuffer - */ -static int __perf_mmap__read_init(struct mmap *md) -{ - u64 head = perf_mmap__read_head(&md->core); - u64 old = md->core.prev; - unsigned char *data = md->core.base + page_size; - unsigned long size; - - md->core.start = md->core.overwrite ? head : old; - md->core.end = md->core.overwrite ? 
old : head; - - if ((md->core.end - md->core.start) < md->core.flush) - return -EAGAIN; - - size = md->core.end - md->core.start; - if (size > (unsigned long)(md->core.mask) + 1) { - if (!md->core.overwrite) { - WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); - - md->core.prev = head; - perf_mmap__consume(&md->core); - return -EAGAIN; - } - - /* - * Backward ring buffer is full. We still have a chance to read - * most of data from it. - */ - if (overwrite_rb_find_range(data, md->core.mask, &md->core.start, &md->core.end)) - return -EINVAL; - } - - return 0; -} - -int perf_mmap__read_init(struct mmap *map) -{ - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&map->core.refcnt)) - return -ENOENT; - - return __perf_mmap__read_init(map); -} - int perf_mmap__push(struct mmap *md, void *to, int push(struct mmap *map, void *to, void *buf, size_t size)) { @@ -454,7 +374,7 @@ int perf_mmap__push(struct mmap *md, void *to, void *buf; int rc = 0; - rc = perf_mmap__read_init(md); + rc = perf_mmap__read_init(&md->core); if (rc < 0) return (rc == -EAGAIN) ? 1 : -1; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 89fb93267ff1..6d818ef51f05 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -54,6 +54,5 @@ int perf_mmap__push(struct mmap *md, void *to, size_t mmap__mmap_len(struct mmap *map); -int perf_mmap__read_init(struct mmap *md); void perf_mmap__read_done(struct mmap *map); #endif /*__PERF_MMAP_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 82a4fa6c87bd..64eec2a239d4 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1023,7 +1023,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, if (!md) return NULL; - if (perf_mmap__read_init(md) < 0) + if (perf_mmap__read_init(&md->core) < 0) goto end; event = perf_mmap__read_event(md); -- Gitee From 625e17abc83ebda6d67c9eed23715f5073d27732 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:19 +0200 Subject: [PATCH 140/257] libperf: Adopt perf_mmap__read_done() from tools/perf commit 32fdc2ca7e2ae8ae5d0ff660ca7783acd8ee6396 upstream Move perf_mmap__read_done() from tools/perf to libperf and export it in the perf/mmap.h header.
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-12-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/include/perf/mmap.h | 1 + tools/perf/lib/libperf.map | 1 + tools/perf/lib/mmap.c | 17 +++++++++++++++++ tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 17 ----------------- tools/perf/util/mmap.h | 1 - 20 files changed, 34 insertions(+), 33 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 6a0c3ff78e01..c90d925f7ae6 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -142,7 +142,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe next_event: perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } if (!comm1_time || !comm2_time) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index b6a8078dd446..4c087a8c9fed 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -794,7 +794,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, break; } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); return n; } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1f0785d9b583..66e18bae2a22 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -896,7 +896,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) } } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } static void perf_top__mmap_read(struct perf_top *top) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 416d93dee556..d631847072a5 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3520,7 +3520,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) draining = true; } } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } if (trace->nr_events == before) { diff --git a/tools/perf/lib/include/perf/mmap.h b/tools/perf/lib/include/perf/mmap.h index 646e9052b003..4f946e7f724b 100644 --- a/tools/perf/lib/include/perf/mmap.h +++ b/tools/perf/lib/include/perf/mmap.h @@ -8,5 +8,6 @@ struct perf_mmap; LIBPERF_API void perf_mmap__consume(struct perf_mmap *map); LIBPERF_API int perf_mmap__read_init(struct perf_mmap *map); +LIBPERF_API void perf_mmap__read_done(struct perf_mmap *map); #endif /* __LIBPERF_MMAP_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index bc3fbb213a3e..7e3ea2e9c917 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -42,6 +42,7 @@ LIBPERF_0.0.1 { perf_evlist__poll; perf_mmap__consume; perf_mmap__read_init; + perf_mmap__read_done; local: *; }; diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index fdbc6c550dea..97297cba44e3 
100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -175,3 +175,20 @@ int perf_mmap__read_init(struct perf_mmap *map) return __perf_mmap__read_init(map); } + +/* + * Mandatory for overwrite mode + * The direction of overwrite mode is backward. + * The last perf_mmap__read() will set tail to map->core.prev. + * Need to correct the map->core.prev to head which is the end of next read. + */ +void perf_mmap__read_done(struct perf_mmap *map) +{ + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!refcount_read(&map->refcnt)) + return; + + map->prev = perf_mmap__read_head(map); +} diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 7ea60b6df366..917ef9e21757 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -54,7 +54,7 @@ static int count_samples(struct evlist *evlist, int *sample_count, return TEST_FAIL; } } - perf_mmap__read_done(map); + perf_mmap__read_done(&map->core); } return TEST_OK; } diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index f871d817cdeb..fd5b7eb480f8 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -195,7 +195,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), if (type == PERF_RECORD_SAMPLE) count ++; } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } if (count != expect) { diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index cf992e0b27ff..9947cda29bad 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -435,7 +435,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, if (ret < 0) return ret; } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } return 0; } diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index e85da7e77269..e950907f6f57 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -49,7 +49,7 @@ static int find_comm(struct evlist *evlist, const char *comm) found += 1; perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } return found; } diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 77f42f0ac15d..bb15d405a42c 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -142,7 +142,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse nr_events[evsel->idx]++; perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); out_init: err = 0; diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index d6a563120d93..c95eb1bbf396 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -124,7 +124,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest goto out_ok; } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } if (nr_events == before) diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 2587cb8b2c0f..92a53be3b32b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -279,7 +279,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } /* diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 
808669507c30..ace20921ad55 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -120,7 +120,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) next_event: perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); out_init: if ((u64) nr_samples == total_periods) { diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index bedfdec34972..8400fb17c170 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -280,7 +280,7 @@ static int process_events(struct evlist *evlist, if (ret < 0) goto out_free_nodes; } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); } events_array = calloc(cnt, sizeof(struct event_node)); diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index e743df610880..270dc8b46909 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -129,7 +129,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused perf_mmap__consume(&md->core); } - perf_mmap__read_done(md); + perf_mmap__read_done(&md->core); out_init: if (!exited || !nr_exit) { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5dd30b7e4597..776ab9d3c18f 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1745,7 +1745,7 @@ static void *perf_evlist__poll_thread(void *arg) perf_mmap__consume(&map->core); got_data = true; } - perf_mmap__read_done(map); + perf_mmap__read_done(&map->core); } if (draining && !got_data) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 59379118c2f1..2dedef9b06fd 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -405,20 +405,3 @@ int perf_mmap__push(struct mmap *md, void *to, out: return rc; } - -/* - * Mandatory for overwrite mode - * The direction of overwrite mode is backward. - * The last perf_mmap__read() will set tail to map->core.prev. - * Need to correct the map->core.prev to head which is the end of next read. - */ -void perf_mmap__read_done(struct mmap *map) -{ - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&map->core.refcnt)) - return; - - map->core.prev = perf_mmap__read_head(&map->core); -} diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 6d818ef51f05..0b15702be1a5 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -54,5 +54,4 @@ int perf_mmap__push(struct mmap *md, void *to, size_t mmap__mmap_len(struct mmap *map); -void perf_mmap__read_done(struct mmap *map); #endif /*__PERF_MMAP_H */ -- Gitee From 53b1c14cbcd3c257f0374713e329d7a3458eb67d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Oct 2019 14:53:20 +0200 Subject: [PATCH 141/257] libperf: Adopt perf_mmap__read_event() from tools/perf commit 151ed5d70da87720022e4171227733a008b3c719 upstream Move perf_mmap__read_event() from tools/perf to libperf and export it in the perf/mmap.h header. 
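With read_init, read_done and now read_event exported, a ring buffer can be drained through libperf alone. Below is a minimal consumer sketch of the sequence documented in the usage comment carried by the moved code; the header name and the surrounding evlist/mmap setup are assumptions, and 'map' stands for an already mmap'ed ring:

#include <perf/mmap.h>

/* Sketch: drain one ring buffer with the API exported above. */
static void drain_ring(struct perf_mmap *map)
{
	union perf_event *event;

	/* Snapshot the readable [start, end) window; < 0 (-EAGAIN) means no data. */
	if (perf_mmap__read_init(map) < 0)
		return;

	while ((event = perf_mmap__read_event(map)) != NULL) {
		/* ... process 'event' here ... */
		perf_mmap__consume(map);	/* advance the tail */
	}

	/* Mandatory for overwrite mode: resync 'prev' with the head. */
	perf_mmap__read_done(map);
}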
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20191007125344.14268-13-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: PvsNarasimha Signed-off-by: mohanasv2 --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/lib/include/perf/mmap.h | 2 + tools/perf/lib/libperf.map | 1 + tools/perf/lib/mmap.c | 79 ++++++++++++++++++++ tools/perf/tests/backward-ring-buffer.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/mmap.c | 77 ------------------- tools/perf/util/mmap.h | 2 - tools/perf/util/python.c | 2 +- 21 files changed, 98 insertions(+), 95 deletions(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index c90d925f7ae6..909ead08a6f6 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -121,7 +121,7 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { struct perf_sample sample; if (event->header.type != PERF_RECORD_COMM || diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 4c087a8c9fed..858da896b518 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -764,7 +764,7 @@ static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx, if (err < 0) return (err == -EAGAIN) ? 
0 : -1; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { err = perf_evlist__parse_sample_timestamp(evlist, event, &timestamp); if (err) { perf_mmap__consume(&md->core); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 66e18bae2a22..8bd526b8c143 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -875,7 +875,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) if (perf_mmap__read_init(&md->core) < 0) return; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { int ret; ret = perf_evlist__parse_sample_timestamp(evlist, event, &last_timestamp); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d631847072a5..d196b287a801 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3503,7 +3503,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { ++trace->nr_events; err = trace__deliver_event(trace, event); diff --git a/tools/perf/lib/include/perf/mmap.h b/tools/perf/lib/include/perf/mmap.h index 4f946e7f724b..9508ad90d8b9 100644 --- a/tools/perf/lib/include/perf/mmap.h +++ b/tools/perf/lib/include/perf/mmap.h @@ -5,9 +5,11 @@ #include struct perf_mmap; +union perf_event; LIBPERF_API void perf_mmap__consume(struct perf_mmap *map); LIBPERF_API int perf_mmap__read_init(struct perf_mmap *map); LIBPERF_API void perf_mmap__read_done(struct perf_mmap *map); +LIBPERF_API union perf_event *perf_mmap__read_event(struct perf_mmap *map); #endif /* __LIBPERF_MMAP_H */ diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map index 7e3ea2e9c917..8bb0d73e0c6c 100644 --- a/tools/perf/lib/libperf.map +++ b/tools/perf/lib/libperf.map @@ -43,6 +43,7 @@ LIBPERF_0.0.1 { perf_mmap__consume; perf_mmap__read_init; perf_mmap__read_done; + perf_mmap__read_event; local: *; }; diff --git a/tools/perf/lib/mmap.c b/tools/perf/lib/mmap.c index 97297cba44e3..0752c193b0fb 100644 --- a/tools/perf/lib/mmap.c +++ b/tools/perf/lib/mmap.c @@ -3,9 +3,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -192,3 +194,80 @@ void perf_mmap__read_done(struct perf_mmap *map) map->prev = perf_mmap__read_head(map); } + +/* When check_messup is true, 'end' must points to a good entry */ +static union perf_event *perf_mmap__read(struct perf_mmap *map, + u64 *startp, u64 end) +{ + unsigned char *data = map->base + page_size; + union perf_event *event = NULL; + int diff = end - *startp; + + if (diff >= (int)sizeof(event->header)) { + size_t size; + + event = (union perf_event *)&data[*startp & map->mask]; + size = event->header.size; + + if (size < sizeof(event->header) || diff < (int)size) + return NULL; + + /* + * Event straddles the mmap boundary -- header should always + * be inside due to u64 alignment of output.
+ */ + if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) { + unsigned int offset = *startp; + unsigned int len = min(sizeof(*event), size), cpy; + void *dst = map->event_copy; + + do { + cpy = min(map->mask + 1 - (offset & map->mask), len); + memcpy(dst, &data[offset & map->mask], cpy); + offset += cpy; + dst += cpy; + len -= cpy; + } while (len); + + event = (union perf_event *)map->event_copy; + } + + *startp += size; + } + + return event; +} + +/* + * Read event from ring buffer one by one. + * Return one event for each call. + * + * Usage: + * perf_mmap__read_init() + * while(event = perf_mmap__read_event()) { + * //process the event + * perf_mmap__consume() + * } + * perf_mmap__read_done() + */ +union perf_event *perf_mmap__read_event(struct perf_mmap *map) +{ + union perf_event *event; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!refcount_read(&map->refcnt)) + return NULL; + + /* non-overwirte doesn't pause the ringbuffer */ + if (!map->overwrite) + map->end = perf_mmap__read_head(map); + + event = perf_mmap__read(map, &map->start, map->end); + + if (!map->overwrite) + map->prev = map->start; + + return event; +} diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 917ef9e21757..15cea518f5ad 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -39,7 +39,7 @@ static int count_samples(struct evlist *evlist, int *sample_count, union perf_event *event; perf_mmap__read_init(&map->core); - while ((event = perf_mmap__read_event(map)) != NULL) { + while ((event = perf_mmap__read_event(&map->core)) != NULL) { const u32 type = event->header.type; switch (type) { diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index fd5b7eb480f8..a439868738bd 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -189,7 +189,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { const u32 type = event->header.type; if (type == PERF_RECORD_SAMPLE) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 9947cda29bad..1f017e1b2a55 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -429,7 +429,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { ret = process_event(machine, evlist, event, state); perf_mmap__consume(&md->core); if (ret < 0) diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index e950907f6f57..50a0c9fcde7d 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -41,7 +41,7 @@ static int find_comm(struct evlist *evlist, const char *comm) md = &evlist->mmap[i]; if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { if (event->header.type == PERF_RECORD_COMM && (pid_t)event->comm.pid == getpid() && (pid_t)event->comm.tid == getpid() && diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index bb15d405a42c..5f4c0dbb4715 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ 
-117,7 +117,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse if (perf_mmap__read_init(&md->core) < 0) goto out_init; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { struct perf_sample sample; if (event->header.type != PERF_RECORD_SAMPLE) { diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index c95eb1bbf396..c6b2d7aab608 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -96,7 +96,7 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { const u32 type = event->header.type; int tp_flags; struct perf_sample sample; diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 92a53be3b32b..2195fc205e72 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -174,7 +174,7 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { const u32 type = event->header.type; const char *name = perf_event__name(type); diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index ace20921ad55..bfb9986093d8 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -103,7 +103,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) if (perf_mmap__read_init(&md->core) < 0) goto out_init; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { struct perf_sample sample; if (event->header.type != PERF_RECORD_SAMPLE) diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 8400fb17c170..fcb0d03dba4e 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -273,7 +273,7 @@ static int process_events(struct evlist *evlist, if (perf_mmap__read_init(&md->core) < 0) continue; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { cnt += 1; ret = add_event(evlist, &events, event); perf_mmap__consume(&md->core); diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 270dc8b46909..adaff9044331 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -123,7 +123,7 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused if (perf_mmap__read_init(&md->core) < 0) goto out_init; - while ((event = perf_mmap__read_event(md)) != NULL) { + while ((event = perf_mmap__read_event(&md->core)) != NULL) { if (event->header.type == PERF_RECORD_EXIT) nr_exit++; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 776ab9d3c18f..4a4928b7244b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1734,7 +1734,7 @@ static void *perf_evlist__poll_thread(void *arg) if (perf_mmap__read_init(&map->core)) continue; - while ((event = perf_mmap__read_event(map)) != NULL) { + while ((event = perf_mmap__read_event(&map->core)) != NULL) { struct evsel *evsel = perf_evlist__event2evsel(evlist, event); if (evsel && evsel->side_band.cb) diff --git 
a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 2dedef9b06fd..2a8bf0ab861c 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -29,83 +29,6 @@ size_t mmap__mmap_len(struct mmap *map) return perf_mmap__mmap_len(&map->core); } -/* When check_messup is true, 'end' must points to a good entry */ -static union perf_event *perf_mmap__read(struct mmap *map, - u64 *startp, u64 end) -{ - unsigned char *data = map->core.base + page_size; - union perf_event *event = NULL; - int diff = end - *startp; - - if (diff >= (int)sizeof(event->header)) { - size_t size; - - event = (union perf_event *)&data[*startp & map->core.mask]; - size = event->header.size; - - if (size < sizeof(event->header) || diff < (int)size) - return NULL; - - /* - * Event straddles the mmap boundary -- header should always - * be inside due to u64 alignment of output. - */ - if ((*startp & map->core.mask) + size != ((*startp + size) & map->core.mask)) { - unsigned int offset = *startp; - unsigned int len = min(sizeof(*event), size), cpy; - void *dst = map->core.event_copy; - - do { - cpy = min(map->core.mask + 1 - (offset & map->core.mask), len); - memcpy(dst, &data[offset & map->core.mask], cpy); - offset += cpy; - dst += cpy; - len -= cpy; - } while (len); - - event = (union perf_event *)map->core.event_copy; - } - - *startp += size; - } - - return event; -} - -/* - * Read event from ring buffer one by one. - * Return one event for each call. - * - * Usage: - * perf_mmap__read_init() - * while(event = perf_mmap__read_event()) { - * //process the event - * perf_mmap__consume() - * } - * perf_mmap__read_done() - */ -union perf_event *perf_mmap__read_event(struct mmap *map) -{ - union perf_event *event; - - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&map->core.refcnt)) - return NULL; - - /* non-overwirte doesn't pause the ringbuffer */ - if (!map->core.overwrite) - map->core.end = perf_mmap__read_head(&map->core); - - event = perf_mmap__read(map, &map->core.start, map->core.end); - - if (!map->core.overwrite) - map->core.prev = map->core.start; - - return event; -} - int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, struct auxtrace_mmap_params *mp __maybe_unused, void *userpg __maybe_unused, diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 0b15702be1a5..bee4e83f7109 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -47,8 +47,6 @@ void mmap__munmap(struct mmap *map); union perf_event *perf_mmap__read_forward(struct mmap *map); -union perf_event *perf_mmap__read_event(struct mmap *map); - int perf_mmap__push(struct mmap *md, void *to, int push(struct mmap *map, void *to, void *buf, size_t size)); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 64eec2a239d4..25118605f3f8 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1026,7 +1026,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, if (perf_mmap__read_init(&md->core) < 0) goto end; - event = perf_mmap__read_event(md); + event = perf_mmap__read_event(&md->core); if (event != NULL) { PyObject *pyevent = pyrf_event__new(event); struct pyrf_event *pevent = (struct pyrf_event *)pyevent; -- Gitee From 8679a33e9053728947545110b95bc74bef95f460 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 5 May 2020 12:18:21 -0300 Subject: [PATCH 142/257] perf evlist: Move the sideband thread routines to separate object commit 9a39994467d493eba38d8f69e42fd9c31cb1da9a upstream To avoid dragging 
more stuff into the perf python binding in the following csets. Reported-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/Build | 1 + tools/perf/util/evlist.c | 117 ---------------------------- tools/perf/util/sideband_evlist.c | 125 ++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 117 deletions(-) create mode 100644 tools/perf/util/sideband_evlist.c diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 013ec02ad872..2b198f645e46 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -9,6 +9,7 @@ perf-y += db-export.o perf-y += env.o perf-y += event.o perf-y += evlist.o +perf-y += sideband_evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o perf-y += perf_event_attr_fprintf.o diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4a4928b7244b..4e3a4f352619 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1682,120 +1682,3 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, } return leader; } - -int perf_evlist__add_sb_event(struct evlist *evlist, - struct perf_event_attr *attr, - perf_evsel__sb_cb_t cb, - void *data) -{ - struct evsel *evsel; - - if (!attr->sample_id_all) { - pr_warning("enabling sample_id_all for all side band events\n"); - attr->sample_id_all = 1; - } - - evsel = perf_evsel__new_idx(attr, evlist->core.nr_entries); - if (!evsel) - return -1; - - evsel->side_band.cb = cb; - evsel->side_band.data = data; - evlist__add(evlist, evsel); - return 0; -} - -static void *perf_evlist__poll_thread(void *arg) -{ - struct evlist *evlist = arg; - bool draining = false; - int i, done = 0; - /* - * In order to read symbols from other namespaces perf to needs to call - * setns(2). This isn't permitted if the struct_fs has multiple users. - * unshare(2) the fs so that we may continue to setns into namespaces - * that we're observing when, for instance, reading the build-ids at - * the end of a 'perf record' session. 
- */ - unshare(CLONE_FS); - - while (!done) { - bool got_data = false; - - if (evlist->thread.done) - draining = true; - - if (!draining) - evlist__poll(evlist, 1000); - - for (i = 0; i < evlist->core.nr_mmaps; i++) { - struct mmap *map = &evlist->mmap[i]; - union perf_event *event; - - if (perf_mmap__read_init(&map->core)) - continue; - while ((event = perf_mmap__read_event(&map->core)) != NULL) { - struct evsel *evsel = perf_evlist__event2evsel(evlist, event); - - if (evsel && evsel->side_band.cb) - evsel->side_band.cb(event, evsel->side_band.data); - else - pr_warning("cannot locate proper evsel for the side band event\n"); - - perf_mmap__consume(&map->core); - got_data = true; - } - perf_mmap__read_done(&map->core); - } - - if (draining && !got_data) - break; - } - return NULL; -} - -int perf_evlist__start_sb_thread(struct evlist *evlist, - struct target *target) -{ - struct evsel *counter; - - if (!evlist) - return 0; - - if (perf_evlist__create_maps(evlist, target)) - goto out_delete_evlist; - - evlist__for_each_entry(evlist, counter) { - if (evsel__open(counter, evlist->core.cpus, - evlist->core.threads) < 0) - goto out_delete_evlist; - } - - if (evlist__mmap(evlist, UINT_MAX)) - goto out_delete_evlist; - - evlist__for_each_entry(evlist, counter) { - if (evsel__enable(counter)) - goto out_delete_evlist; - } - - evlist->thread.done = 0; - if (pthread_create(&evlist->thread.th, NULL, perf_evlist__poll_thread, evlist)) - goto out_delete_evlist; - - return 0; - -out_delete_evlist: - evlist__delete(evlist); - evlist = NULL; - return -1; -} - -void perf_evlist__stop_sb_thread(struct evlist *evlist) -{ - if (!evlist) - return; - evlist->thread.done = 1; - pthread_join(evlist->thread.th, NULL); - evlist__delete(evlist); -} diff --git a/tools/perf/util/sideband_evlist.c b/tools/perf/util/sideband_evlist.c new file mode 100644 index 000000000000..073d201bb6ea --- /dev/null +++ b/tools/perf/util/sideband_evlist.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "util/debug.h" +#include "util/evlist.h" +#include "util/evsel.h" +#include "util/mmap.h" +#include +#include +#include +#include +#include +#include + +int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, + perf_evsel__sb_cb_t cb, void *data) +{ + struct evsel *evsel; + + if (!attr->sample_id_all) { + pr_warning("enabling sample_id_all for all side band events\n"); + attr->sample_id_all = 1; + } + + evsel = perf_evsel__new_idx(attr, evlist->core.nr_entries); + if (!evsel) + return -1; + + evsel->side_band.cb = cb; + evsel->side_band.data = data; + evlist__add(evlist, evsel); + return 0; +} + +static void *perf_evlist__poll_thread(void *arg) +{ + struct evlist *evlist = arg; + bool draining = false; + int i, done = 0; + /* + * In order to read symbols from other namespaces perf to needs to call + * setns(2). This isn't permitted if the struct_fs has multiple users. + * unshare(2) the fs so that we may continue to setns into namespaces + * that we're observing when, for instance, reading the build-ids at + * the end of a 'perf record' session. 
+ */ + unshare(CLONE_FS); + + while (!done) { + bool got_data = false; + + if (evlist->thread.done) + draining = true; + + if (!draining) + evlist__poll(evlist, 1000); + + for (i = 0; i < evlist->core.nr_mmaps; i++) { + struct mmap *map = &evlist->mmap[i]; + union perf_event *event; + + if (perf_mmap__read_init(&map->core)) + continue; + while ((event = perf_mmap__read_event(&map->core)) != NULL) { + struct evsel *evsel = perf_evlist__event2evsel(evlist, event); + + if (evsel && evsel->side_band.cb) + evsel->side_band.cb(event, evsel->side_band.data); + else + pr_warning("cannot locate proper evsel for the side band event\n"); + + perf_mmap__consume(&map->core); + got_data = true; + } + perf_mmap__read_done(&map->core); + } + + if (draining && !got_data) + break; + } + return NULL; +} + +int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target) +{ + struct evsel *counter; + + if (!evlist) + return 0; + + if (perf_evlist__create_maps(evlist, target)) + goto out_delete_evlist; + + evlist__for_each_entry(evlist, counter) { + if (evsel__open(counter, evlist->core.cpus, evlist->core.threads) < 0) + goto out_delete_evlist; + } + + if (evlist__mmap(evlist, UINT_MAX)) + goto out_delete_evlist; + + evlist__for_each_entry(evlist, counter) { + if (evsel__enable(counter)) + goto out_delete_evlist; + } + + evlist->thread.done = 0; + if (pthread_create(&evlist->thread.th, NULL, perf_evlist__poll_thread, evlist)) + goto out_delete_evlist; + + return 0; + +out_delete_evlist: + evlist__delete(evlist); + evlist = NULL; + return -1; +} + +void perf_evlist__stop_sb_thread(struct evlist *evlist) +{ + if (!evlist) + return; + evlist->thread.done = 1; + pthread_join(evlist->thread.th, NULL); + evlist__delete(evlist); +} -- Gitee From 17fe7f7ea4a67543ad4195752a7a1424e134f6b8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Mon, 30 Sep 2019 17:36:23 -0700 Subject: [PATCH 143/257] perf tools: Avoid 'sample_reg_masks' being const + weak commit 42466b9f29b415c254dc4c2f4618e2a96951a406 upstream Being const + weak breaks with some compilers that constant-propagate from the weak symbol. This behavior is outside of the specification, but in LLVM is chosen to match GCC's behavior. LLVM's implementation was set in this patch: https://github.com/llvm/llvm-project/commit/f49573d1eedcf1e44893d5a062ac1b72c8419646 A const + weak symbol is set to be weak_odr: https://llvm.org/docs/LangRef.html ODR is one definition rule, and given there is one constant definition constant-propagation is possible. It is possible to get this code to miscompile with LLVM when applying link time optimization. As compilers become more aggressive, this is likely to break in more instances. Move the definition of sample_reg_masks to the conditional part of perf_regs.h and guard usage with HAVE_PERF_REGS_SUPPORT. This avoids the weak symbol. Fix an issue when HAVE_PERF_REGS_SUPPORT isn't defined from patch v1. In v3, add perf_regs.c for architectures that HAVE_PERF_REGS_SUPPORT but don't declare sample_regs_masks. Further notes: Jiri asked: "Is this just a precaution or you actualy saw some breakage?" Ian answered: "We saw a breakage with clang with thinlto enabled for linking. Our compiler team had recently seen, and were surprised by, a similar issue and were able to dig out the weak ODR issue." 
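To make the hazard concrete, here is a minimal three-file illustration (hypothetical names, not code from this patch) of how a const + weak default can be folded away before the strong override is seen:

/* regs_default.c: the overridable default, i.e. the pattern removed here */
const int sample_nr_regs __attribute__((weak)) = 0;

/* regs_arch.c: arch-specific strong definition, meant to win at link time */
const int sample_nr_regs = 16;

/* user.c */
extern const int sample_nr_regs;

int have_sample_regs(void)
{
	/*
	 * Under LTO the compiler may take the const weak definition as
	 * the one definition (weak_odr) and constant-propagate 0 here,
	 * so this can miscompile to 'return 0' even though the strong
	 * definition is linked in.
	 */
	return sample_nr_regs != 0;
}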
Signed-off-by: Ian Rogers Reviewed-by: Nick Desaulniers Acked-by: Jiri Olsa Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexey Budankov Cc: Andi Kleen Cc: clang-built-linux@googlegroups.com Cc: Guo Ren Cc: Kan Liang Cc: linux-riscv@lists.infradead.org Cc: Mao Han Cc: Mark Rutland Cc: Namhyung Kim Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20191001003623.255186-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/arch/arm/util/Build | 2 ++ tools/perf/arch/arm/util/perf_regs.c | 6 ++++++ tools/perf/arch/arm64/util/Build | 1 + tools/perf/arch/arm64/util/perf_regs.c | 6 ++++++ tools/perf/arch/csky/util/Build | 2 ++ tools/perf/arch/csky/util/perf_regs.c | 6 ++++++ tools/perf/arch/riscv/util/Build | 2 ++ tools/perf/arch/riscv/util/perf_regs.c | 6 ++++++ tools/perf/arch/s390/util/Build | 1 + tools/perf/arch/s390/util/perf_regs.c | 6 ++++++ tools/perf/util/parse-regs-options.c | 8 ++++++-- tools/perf/util/perf_regs.c | 4 ---- tools/perf/util/perf_regs.h | 4 ++-- 13 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 tools/perf/arch/arm/util/perf_regs.c create mode 100644 tools/perf/arch/arm64/util/perf_regs.c create mode 100644 tools/perf/arch/csky/util/perf_regs.c create mode 100644 tools/perf/arch/riscv/util/perf_regs.c create mode 100644 tools/perf/arch/s390/util/perf_regs.c diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build index 296f0eac5e18..37fc63708966 100644 --- a/tools/perf/arch/arm/util/Build +++ b/tools/perf/arch/arm/util/Build @@ -1,3 +1,5 @@ +perf-y += perf_regs.o + perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/arm/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index 4ed1ebdc5dc0..264ac3c74408 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -1,4 +1,5 @@ perf-y += header.o +perf-y += perf_regs.o perf-y += sym-handling.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/arm64/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/arch/csky/util/Build b/tools/perf/arch/csky/util/Build index 1160bb2332ba..7d3050134ae0 100644 --- a/tools/perf/arch/csky/util/Build +++ b/tools/perf/arch/csky/util/Build @@ -1,2 +1,4 @@ +perf-y += perf_regs.o + perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/csky/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff 
--git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build index 1160bb2332ba..7d3050134ae0 100644 --- a/tools/perf/arch/riscv/util/Build +++ b/tools/perf/arch/riscv/util/Build @@ -1,2 +1,4 @@ +perf-y += perf_regs.o + perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/riscv/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index 22797f043b84..3d9d0f4f72ca 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -1,5 +1,6 @@ perf-y += header.o perf-y += kvm-stat.o +perf-y += perf_regs.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c new file mode 100644 index 000000000000..2864e2e3776d --- /dev/null +++ b/tools/perf/arch/s390/util/perf_regs.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG_END +}; diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c index 869ef7e22bd9..a4a100425b3a 100644 --- a/tools/perf/util/parse-regs-options.c +++ b/tools/perf/util/parse-regs-options.c @@ -13,7 +13,7 @@ static int __parse_regs(const struct option *opt, const char *str, int unset, bool intr) { uint64_t *mode = (uint64_t *)opt->value; - const struct sample_reg *r; + const struct sample_reg *r = NULL; char *s, *os = NULL, *p; int ret = -1; uint64_t mask; @@ -46,19 +46,23 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr) if (!strcmp(s, "?")) { fprintf(stderr, "available registers: "); +#ifdef HAVE_PERF_REGS_SUPPORT for (r = sample_reg_masks; r->name; r++) { if (r->mask & mask) fprintf(stderr, "%s ", r->name); } +#endif fputc('\n', stderr); /* just printing available regs */ goto error; } +#ifdef HAVE_PERF_REGS_SUPPORT for (r = sample_reg_masks; r->name; r++) { if ((r->mask & mask) && !strcasecmp(s, r->name)) break; } - if (!r->name) { +#endif + if (!r || !r->name) { ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n", s, intr ? 
"-I" : "--user-regs="); goto error; diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index 2774cec1f15f..5ee47ae1509c 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -3,10 +3,6 @@ #include "perf_regs.h" #include "event.h" -const struct sample_reg __weak sample_reg_masks[] = { - SMPL_REG_END -}; - int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused, char **new_op __maybe_unused) { diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h index ec7640cc4c91..a45499126184 100644 --- a/tools/perf/util/perf_regs.h +++ b/tools/perf/util/perf_regs.h @@ -15,8 +15,6 @@ struct sample_reg { #define SMPL_REG2(n, b) { .name = #n, .mask = 3ULL << (b) } #define SMPL_REG_END { .name = NULL } -extern const struct sample_reg sample_reg_masks[]; - enum { SDT_ARG_VALID = 0, SDT_ARG_SKIP, @@ -27,6 +25,8 @@ uint64_t arch__intr_reg_mask(void); uint64_t arch__user_reg_mask(void); #ifdef HAVE_PERF_REGS_SUPPORT +extern const struct sample_reg sample_reg_masks[]; + #include #define DWARF_MINIMAL_REGS ((1ULL << PERF_REG_IP) | (1ULL << PERF_REG_SP)) -- Gitee From d07e984723679fe9e49ff4879782f8f3b22f086a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 5 Mar 2020 23:11:08 -0800 Subject: [PATCH 144/257] tools: Fix off-by 1 relative directory includes commit 441b62acd9c809e87bab45ad1d82b1b3b77cb4f0 upstream This is currently working due to extra include paths in the build. Committer testing: $ cd tools/include/uapi/asm/ Before this patch: $ ls -la ../../arch/x86/include/uapi/asm/errno.h ls: cannot access '../../arch/x86/include/uapi/asm/errno.h': No such file or directory $ After this patch; $ ls -la ../../../arch/x86/include/uapi/asm/errno.h -rw-rw-r--. 1 acme acme 31 Feb 20 12:42 ../../../arch/x86/include/uapi/asm/errno.h $ Check that that is still under tools/, i.e. hasn't escaped into the main kernel sources: $ cd ../../../arch/x86/include/uapi/asm/ $ pwd /home/acme/git/perf/tools/arch/x86/include/uapi/asm $ [Backport changes] 1.In tools/perf/arch/x86/util/intel-pt.c, the header file #include "../../util/evsel_config.h" is not present in the current kernel version. Including it would require backporting over 20 additional dependencies, which is not feasible at this stage. Therefore, this header inclusion has been intentionally omitted to avoid introducing complex dependency chains. 
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexios Zavras Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Igor Lubashev Cc: Kan Liang Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wei Li Link: http://lore.kernel.org/lkml/20200306071110.130202-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/include/uapi/asm/errno.h | 14 ++++++------ tools/perf/arch/arm64/util/arm-spe.c | 20 ++++++++--------- tools/perf/arch/arm64/util/perf_regs.c | 2 +- tools/perf/arch/powerpc/util/perf_regs.c | 4 ++-- tools/perf/arch/x86/util/auxtrace.c | 14 ++++++------ tools/perf/arch/x86/util/event.c | 12 +++++----- tools/perf/arch/x86/util/header.c | 4 ++-- tools/perf/arch/x86/util/intel-bts.c | 24 ++++++++++---------- tools/perf/arch/x86/util/intel-pt.c | 28 ++++++++++++------------ tools/perf/arch/x86/util/machine.c | 6 ++--- tools/perf/arch/x86/util/perf_regs.c | 8 +++---- tools/perf/arch/x86/util/pmu.c | 6 ++--- 12 files changed, 71 insertions(+), 71 deletions(-) diff --git a/tools/include/uapi/asm/errno.h b/tools/include/uapi/asm/errno.h index ce3c5945a1c4..637189ec1ab9 100644 --- a/tools/include/uapi/asm/errno.h +++ b/tools/include/uapi/asm/errno.h @@ -1,18 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #if defined(__i386__) || defined(__x86_64__) -#include "../../arch/x86/include/uapi/asm/errno.h" +#include "../../../arch/x86/include/uapi/asm/errno.h" #elif defined(__powerpc__) -#include "../../arch/powerpc/include/uapi/asm/errno.h" +#include "../../../arch/powerpc/include/uapi/asm/errno.h" #elif defined(__sparc__) -#include "../../arch/sparc/include/uapi/asm/errno.h" +#include "../../../arch/sparc/include/uapi/asm/errno.h" #elif defined(__alpha__) -#include "../../arch/alpha/include/uapi/asm/errno.h" +#include "../../../arch/alpha/include/uapi/asm/errno.h" #elif defined(__mips__) -#include "../../arch/mips/include/uapi/asm/errno.h" +#include "../../../arch/mips/include/uapi/asm/errno.h" #elif defined(__ia64__) -#include "../../arch/ia64/include/uapi/asm/errno.h" +#include "../../../arch/ia64/include/uapi/asm/errno.h" #elif defined(__xtensa__) -#include "../../arch/xtensa/include/uapi/asm/errno.h" +#include "../../../arch/xtensa/include/uapi/asm/errno.h" #else #include #endif diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 8d6821d9c3f6..27653be24447 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -11,17 +11,17 @@ #include #include -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evsel.h" -#include "../../util/evlist.h" -#include "../../util/session.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evsel.h" +#include "../../../util/evlist.h" +#include "../../../util/session.h" #include // page_size -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/auxtrace.h" -#include "../../util/record.h" -#include "../../util/arm-spe.h" +#include "../../../util/pmu.h" +#include "../../../util/debug.h" +#include "../../../util/auxtrace.h" +#include "../../../util/record.h" +#include "../../../util/arm-spe.h" #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c index 
2864e2e3776d..2833e101a7c6 100644 --- a/tools/perf/arch/arm64/util/perf_regs.c +++ b/tools/perf/arch/arm64/util/perf_regs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../util/perf_regs.h" +#include "../../../util/perf_regs.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG_END diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index e9c436eeffc9..0a5242900248 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -4,8 +4,8 @@ #include #include -#include "../../util/perf_regs.h" -#include "../../util/debug.h" +#include "../../../util/perf_regs.h" +#include "../../../util/debug.h" #include diff --git a/tools/perf/arch/x86/util/auxtrace.c b/tools/perf/arch/x86/util/auxtrace.c index 96f4a2c11893..330d03216b0e 100644 --- a/tools/perf/arch/x86/util/auxtrace.c +++ b/tools/perf/arch/x86/util/auxtrace.c @@ -7,13 +7,13 @@ #include #include -#include "../../util/header.h" -#include "../../util/debug.h" -#include "../../util/pmu.h" -#include "../../util/auxtrace.h" -#include "../../util/intel-pt.h" -#include "../../util/intel-bts.h" -#include "../../util/evlist.h" +#include "../../../util/header.h" +#include "../../../util/debug.h" +#include "../../../util/pmu.h" +#include "../../../util/auxtrace.h" +#include "../../../util/intel-pt.h" +#include "../../../util/intel-bts.h" +#include "../../../util/evlist.h" static struct auxtrace_record *auxtrace_record__init_intel(struct evlist *evlist, diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c index d357c625c09f..91d938e88b42 100644 --- a/tools/perf/arch/x86/util/event.c +++ b/tools/perf/arch/x86/util/event.c @@ -3,12 +3,12 @@ #include #include -#include "../../util/event.h" -#include "../../util/synthetic-events.h" -#include "../../util/machine.h" -#include "../../util/tool.h" -#include "../../util/map.h" -#include "../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/synthetic-events.h" +#include "../../../util/machine.h" +#include "../../../util/tool.h" +#include "../../../util/map.h" +#include "../../../util/debug.h" #if defined(__x86_64__) diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c index aa6deb463bf3..578c8c568ffd 100644 --- a/tools/perf/arch/x86/util/header.c +++ b/tools/perf/arch/x86/util/header.c @@ -7,8 +7,8 @@ #include #include -#include "../../util/debug.h" -#include "../../util/header.h" +#include "../../../util/debug.h" +#include "../../../util/header.h" static inline void cpuid(unsigned int op, unsigned int *a, unsigned int *b, unsigned int *c, diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 7d22dbdf0f1d..c7cc05dd2852 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -11,18 +11,18 @@ #include #include -#include "../../util/cpumap.h" -#include "../../util/event.h" -#include "../../util/evsel.h" -#include "../../util/evlist.h" -#include "../../util/mmap.h" -#include "../../util/session.h" -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/record.h" -#include "../../util/tsc.h" -#include "../../util/auxtrace.h" -#include "../../util/intel-bts.h" +#include "../../../util/cpumap.h" +#include "../../../util/event.h" +#include "../../../util/evsel.h" +#include "../../../util/evlist.h" +#include "../../../util/mmap.h" +#include "../../../util/session.h" +#include "../../../util/pmu.h" +#include 
"../../../util/debug.h" +#include "../../../util/record.h" +#include "../../../util/tsc.h" +#include "../../../util/auxtrace.h" +#include "../../../util/intel-bts.h" #include // page_size #define KiB(x) ((x) * 1024) diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 761140ff14f2..00356704d23d 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -13,22 +13,22 @@ #include #include -#include "../../util/session.h" -#include "../../util/event.h" -#include "../../util/evlist.h" -#include "../../util/evsel.h" -#include "../../util/cpumap.h" -#include "../../util/mmap.h" +#include "../../../util/session.h" +#include "../../../util/event.h" +#include "../../../util/evlist.h" +#include "../../../util/evsel.h" +#include "../../../util/cpumap.h" +#include "../../../util/mmap.h" #include -#include "../../util/parse-events.h" -#include "../../util/pmu.h" -#include "../../util/debug.h" -#include "../../util/auxtrace.h" -#include "../../util/record.h" -#include "../../util/target.h" -#include "../../util/tsc.h" +#include "../../../util/parse-events.h" +#include "../../../util/pmu.h" +#include "../../../util/debug.h" +#include "../../../util/auxtrace.h" +#include "../../../util/record.h" +#include "../../../util/target.h" +#include "../../../util/tsc.h" #include // page_size -#include "../../util/intel-pt.h" +#include "../../../util/intel-pt.h" #define KiB(x) ((x) * 1024) #define MiB(x) ((x) * 1024 * 1024) diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c index e17e080e76f4..31679c35d493 100644 --- a/tools/perf/arch/x86/util/machine.c +++ b/tools/perf/arch/x86/util/machine.c @@ -5,9 +5,9 @@ #include #include // page_size -#include "../../util/machine.h" -#include "../../util/map.h" -#include "../../util/symbol.h" +#include "../../../util/machine.h" +#include "../../../util/map.h" +#include "../../../util/symbol.h" #include #include diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c index c218b83e063b..fca81b39b09f 100644 --- a/tools/perf/arch/x86/util/perf_regs.c +++ b/tools/perf/arch/x86/util/perf_regs.c @@ -5,10 +5,10 @@ #include #include -#include "../../perf-sys.h" -#include "../../util/perf_regs.h" -#include "../../util/debug.h" -#include "../../util/event.h" +#include "../../../perf-sys.h" +#include "../../../util/perf_regs.h" +#include "../../../util/debug.h" +#include "../../../util/event.h" const struct sample_reg sample_reg_masks[] = { SMPL_REG(AX, PERF_REG_X86_AX), diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c index e33ef5bc31c5..d48d608517fd 100644 --- a/tools/perf/arch/x86/util/pmu.c +++ b/tools/perf/arch/x86/util/pmu.c @@ -4,9 +4,9 @@ #include #include -#include "../../util/intel-pt.h" -#include "../../util/intel-bts.h" -#include "../../util/pmu.h" +#include "../../../util/intel-pt.h" +#include "../../../util/intel-bts.h" +#include "../../../util/pmu.h" struct perf_event_attr *perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused) { -- Gitee From 728c857eed650d944f020545f2360e85163dbdf2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 7 Oct 2019 16:43:03 -0300 Subject: [PATCH 145/257] perf evlist: Factor out asprintf routine to build a tracepoint pid filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 05cea4492c9dd28439cc73de1047ab3b26033736 upstream Will be used to append such lists to existing filters. 
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-798vlyqfqw938ehoe8etivx1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/evlist.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 4e3a4f352619..3a4d373ae1f7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1044,6 +1044,9 @@ int perf_evlist__set_tp_filter(struct evlist *evlist, const char *filter) struct evsel *evsel; int err = 0; + if (filter == NULL) + return -1; + evlist__for_each_entry(evlist, evsel) { if (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) continue; @@ -1056,16 +1059,15 @@ int perf_evlist__set_tp_filter(struct evlist *evlist, const char *filter) return err; } -int perf_evlist__set_tp_filter_pids(struct evlist *evlist, size_t npids, pid_t *pids) +static char *asprintf__tp_filter_pids(size_t npids, pid_t *pids) { char *filter; - int ret = -1; size_t i; for (i = 0; i < npids; ++i) { if (i == 0) { if (asprintf(&filter, "common_pid != %d", pids[i]) < 0) - return -1; + return NULL; } else { char *tmp; @@ -1077,8 +1079,17 @@ int perf_evlist__set_tp_filter_pids(struct evlist *evlist, size_t npids, pid_t * } } - ret = perf_evlist__set_tp_filter(evlist, filter); + return filter; out_free: + free(filter); + return NULL; +} + +int perf_evlist__set_tp_filter_pids(struct evlist *evlist, size_t npids, pid_t *pids) +{ + char *filter = asprintf__tp_filter_pids(npids, pids); + int ret = perf_evlist__set_tp_filter(evlist, filter); + free(filter); return ret; } -- Gitee From 56e2ce9195c7a1d3fb68d68ea0b48767fca913d6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 7 Oct 2019 16:52:17 -0300 Subject: [PATCH 146/257] perf evlist: Introduce append_tp_filter() method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 53c92f73389d049d72b2e1d1cbc81c007241d422 upstream Will be used by 'perf trace' to support 'perf trace --filter', we need to append to any pre-existing filter. When parse_filter() gets invoked to process --filter, it'll set the filter to that specified on the command line, later on, when we filter out 'perf trace' own pid to avoid an event feedback loop, we need to preserve the command line filter put in place by parse_filter(). 
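A minimal sketch of the intended call site (the pid value here is hypothetical):

    /* keep the user's --filter expression, then also exclude our own pid */
    err = perf_evlist__append_tp_filter(evlist, "common_pid != 1234");
    if (err)
        pr_err("failed to append tracepoint filter\n");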
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-h9rot08qmxlnfmte0holt68x@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/evlist.c | 20 ++++++++++++++++++++ tools/perf/util/evlist.h | 2 ++ 2 files changed, 22 insertions(+) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 3a4d373ae1f7..49c528dfe1e9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1059,6 +1059,26 @@ int perf_evlist__set_tp_filter(struct evlist *evlist, const char *filter) return err; } +int perf_evlist__append_tp_filter(struct evlist *evlist, const char *filter) +{ + struct evsel *evsel; + int err = 0; + + if (filter == NULL) + return -1; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.type != PERF_TYPE_TRACEPOINT) + continue; + + err = perf_evsel__append_tp_filter(evsel, filter); + if (err) + break; + } + + return err; +} + static char *asprintf__tp_filter_pids(size_t npids, pid_t *pids) { char *filter; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 10fc888d6127..15de58642a90 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -133,6 +133,8 @@ int perf_evlist__set_tp_filter(struct evlist *evlist, const char *filter); int perf_evlist__set_tp_filter_pid(struct evlist *evlist, pid_t pid); int perf_evlist__set_tp_filter_pids(struct evlist *evlist, size_t npids, pid_t *pids); +int perf_evlist__append_tp_filter(struct evlist *evlist, const char *filter); + struct evsel * perf_evlist__find_tracepoint_by_id(struct evlist *evlist, int id); -- Gitee From d322d1e07007aac6da47507f6e562a1a8ac7ac17 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 14 Oct 2019 20:10:50 -0300 Subject: [PATCH 147/257] perf string: Export asprintf__tp_filter_pids() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit da949f507a73df5b5ad5031cb23b82a4d81846eb upstream Will be used directly in 'perf trace' for setting up the command line argv array to pass to cmd_record, as this was how 'perf trace record' was implemented, following the model used in 'perf kvm record', 'perf sched record', etc. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Luis Cláudio Gonçalves Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-w3cuwjs63lxf5zpryy3145uv@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/evlist.c | 3 ++- tools/perf/util/string2.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 49c528dfe1e9..8da83ae90b54 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -21,6 +21,7 @@ #include "../perf.h" #include "asm/bug.h" #include "bpf-event.h" +#include "util/string2.h" #include #include #include @@ -1079,7 +1080,7 @@ int perf_evlist__append_tp_filter(struct evlist *evlist, const char *filter) return err; } -static char *asprintf__tp_filter_pids(size_t npids, pid_t *pids) +char *asprintf__tp_filter_pids(size_t npids, pid_t *pids) { char *filter; size_t i; diff --git a/tools/perf/util/string2.h b/tools/perf/util/string2.h index 708805f5573e..73df616ced43 100644 --- a/tools/perf/util/string2.h +++ b/tools/perf/util/string2.h @@ -4,6 +4,7 @@ #include #include +#include // pid_t #include #include @@ -32,6 +33,8 @@ static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int return asprintf_expr_inout_ints(var, false, nints, ints); } +char *asprintf__tp_filter_pids(size_t npids, pid_t *pids); + char *strpbrk_esc(char *str, const char *stopset); char *strdup_esc(const char *str); -- Gitee From 763d3971db68a9db391ed69c16c4d9bd23a8037a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 5 May 2020 11:49:08 -0300 Subject: [PATCH 148/257] perf tools: Move routines that probe for perf API features to separate file commit 40c7d2460e03b0916c5fcc5edbedae05b4b571fc upstream Trying to disentangle this a bit further, unfortunately it uses parse_events(), its interesting to have it separated anyway, so do it. 
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/arch/arm/util/cs-etm.c | 1 + tools/perf/arch/x86/util/intel-pt.c | 1 + tools/perf/builtin-record.c | 1 + tools/perf/util/Build | 1 + tools/perf/util/auxtrace.c | 1 + tools/perf/util/evlist.c | 1 + tools/perf/util/evlist.h | 4 - tools/perf/util/intel-pt.c | 1 + tools/perf/util/perf_api_probe.c | 164 ++++++++++++++++++++++++++++ tools/perf/util/perf_api_probe.h | 14 +++ tools/perf/util/record.c | 155 +------------------------- 11 files changed, 186 insertions(+), 158 deletions(-) create mode 100644 tools/perf/util/perf_api_probe.c create mode 100644 tools/perf/util/perf_api_probe.h diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index b4fe7e41f847..4a9daac1a145 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -23,6 +23,7 @@ #include "../../util/event.h" #include "../../util/evlist.h" #include "../../util/evsel.h" +#include "../../util/perf_api_probe.h" #include "../../util/evsel_config.h" #include "../../util/pmu.h" #include "../../util/cs-etm.h" diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 00356704d23d..fcc18501b9e4 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -24,6 +24,7 @@ #include "../../../util/pmu.h" #include "../../../util/debug.h" #include "../../../util/auxtrace.h" +#include "../../../util/perf_api_probe.h" #include "../../../util/record.h" #include "../../../util/target.h" #include "../../../util/tsc.h" diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 077a0faa9b98..f34c450a9743 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -34,6 +34,7 @@ #include "util/tsc.h" #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" +#include "util/perf_api_probe.h" #include "util/llvm-utils.h" #include "util/bpf-loader.h" #include "util/trigger.h" diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 2b198f645e46..cf6b6be27e86 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -86,6 +86,7 @@ perf-y += counts.o perf-y += stat.o perf-y += stat-shadow.o perf-y += stat-display.o +perf-y += perf_api_probe.o perf-y += record.o perf-y += srcline.o perf-y += srccode.o diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 4086d5848734..efec868c74c3 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -33,6 +33,7 @@ #include "evsel.h" #include "evsel_config.h" #include "symbol.h" +#include "util/perf_api_probe.h" #include "util/synthetic-events.h" #include "thread_map.h" #include "asm/bug.h" diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 8da83ae90b54..201cf1786f98 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -22,6 +22,7 @@ #include "asm/bug.h" #include "bpf-event.h" #include "util/string2.h" +#include "util/perf_api_probe.h" #include #include #include diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 15de58642a90..6fb6114880d9 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -163,10 +163,6 @@ void evlist__close(struct evlist *evlist); struct callchain_param; void perf_evlist__set_id_pos(struct evlist *evlist); -bool perf_can_sample_identifier(void); -bool perf_can_record_switch_events(void); -bool perf_can_record_cpu_wide(void); -bool 
perf_can_aux_sample(void); void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, struct callchain_param *callchain); int record_opts__config(struct record_opts *opts); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index b40832419a27..a76d9886dbf2 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -33,6 +33,7 @@ #include "tsc.h" #include "intel-pt.h" #include "config.h" +#include "util/perf_api_probe.h" #include "util/synthetic-events.h" #include "time-utils.h" diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c new file mode 100644 index 000000000000..1337965673d7 --- /dev/null +++ b/tools/perf/util/perf_api_probe.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "perf-sys.h" +#include "util/cloexec.h" +#include "util/evlist.h" +#include "util/evsel.h" +#include "util/parse-events.h" +#include "util/perf_api_probe.h" +#include +#include + +typedef void (*setup_probe_fn_t)(struct evsel *evsel); + +static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str) +{ + struct evlist *evlist; + struct evsel *evsel; + unsigned long flags = perf_event_open_cloexec_flag(); + int err = -EAGAIN, fd; + static pid_t pid = -1; + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + if (parse_events(evlist, str, NULL)) + goto out_delete; + + evsel = evlist__first(evlist); + + while (1) { + fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); + if (fd < 0) { + if (pid == -1 && errno == EACCES) { + pid = 0; + continue; + } + goto out_delete; + } + break; + } + close(fd); + + fn(evsel); + + fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); + if (fd < 0) { + if (errno == EINVAL) + err = -EINVAL; + goto out_delete; + } + close(fd); + err = 0; + +out_delete: + evlist__delete(evlist); + return err; +} + +static bool perf_probe_api(setup_probe_fn_t fn) +{ + const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL}; + struct perf_cpu_map *cpus; + int cpu, ret, i = 0; + + cpus = perf_cpu_map__new(NULL); + if (!cpus) + return false; + cpu = cpus->map[0]; + perf_cpu_map__put(cpus); + + do { + ret = perf_do_probe_api(fn, cpu, try[i++]); + if (!ret) + return true; + } while (ret == -EAGAIN && try[i]); + + return false; +} + +static void perf_probe_sample_identifier(struct evsel *evsel) +{ + evsel->core.attr.sample_type |= PERF_SAMPLE_IDENTIFIER; +} + +static void perf_probe_comm_exec(struct evsel *evsel) +{ + evsel->core.attr.comm_exec = 1; +} + +static void perf_probe_context_switch(struct evsel *evsel) +{ + evsel->core.attr.context_switch = 1; +} + +bool perf_can_sample_identifier(void) +{ + return perf_probe_api(perf_probe_sample_identifier); +} + +bool perf_can_comm_exec(void) +{ + return perf_probe_api(perf_probe_comm_exec); +} + +bool perf_can_record_switch_events(void) +{ + return perf_probe_api(perf_probe_context_switch); +} + +bool perf_can_record_cpu_wide(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .exclude_kernel = 1, + }; + struct perf_cpu_map *cpus; + int cpu, fd; + + cpus = perf_cpu_map__new(NULL); + if (!cpus) + return false; + cpu = cpus->map[0]; + perf_cpu_map__put(cpus); + + fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); + if (fd < 0) + return false; + close(fd); + + return true; +} + +/* + * Architectures are expected to know if AUX area sampling is supported by the + * hardware. Here we check for kernel support. 
+ */ +bool perf_can_aux_sample(void) +{ + struct perf_event_attr attr = { + .size = sizeof(struct perf_event_attr), + .exclude_kernel = 1, + /* + * Non-zero value causes the kernel to calculate the effective + * attribute size up to that byte. + */ + .aux_sample_size = 1, + }; + int fd; + + fd = sys_perf_event_open(&attr, -1, 0, -1, 0); + /* + * If the kernel attribute is big enough to contain aux_sample_size + * then we assume that it is supported. We are relying on the kernel to + * validate the attribute size before anything else that could be wrong. + */ + if (fd < 0 && errno == E2BIG) + return false; + if (fd >= 0) + close(fd); + + return true; +} diff --git a/tools/perf/util/perf_api_probe.h b/tools/perf/util/perf_api_probe.h new file mode 100644 index 000000000000..706c3c6426e2 --- /dev/null +++ b/tools/perf/util/perf_api_probe.h @@ -0,0 +1,14 @@ + +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_API_PROBE_H +#define __PERF_API_PROBE_H + +#include + +bool perf_can_aux_sample(void); +bool perf_can_comm_exec(void); +bool perf_can_record_cpu_wide(void); +bool perf_can_record_switch_events(void); +bool perf_can_sample_identifier(void); + +#endif // __PERF_API_PROBE_H diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 7def66168503..5084ad6db1b3 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -10,163 +10,10 @@ #include #include #include "cloexec.h" +#include "util/perf_api_probe.h" #include "record.h" #include "../perf-sys.h" -typedef void (*setup_probe_fn_t)(struct evsel *evsel); - -static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str) -{ - struct evlist *evlist; - struct evsel *evsel; - unsigned long flags = perf_event_open_cloexec_flag(); - int err = -EAGAIN, fd; - static pid_t pid = -1; - - evlist = evlist__new(); - if (!evlist) - return -ENOMEM; - - if (parse_events(evlist, str, NULL)) - goto out_delete; - - evsel = evlist__first(evlist); - - while (1) { - fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); - if (fd < 0) { - if (pid == -1 && errno == EACCES) { - pid = 0; - continue; - } - goto out_delete; - } - break; - } - close(fd); - - fn(evsel); - - fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags); - if (fd < 0) { - if (errno == EINVAL) - err = -EINVAL; - goto out_delete; - } - close(fd); - err = 0; - -out_delete: - evlist__delete(evlist); - return err; -} - -static bool perf_probe_api(setup_probe_fn_t fn) -{ - const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL}; - struct perf_cpu_map *cpus; - int cpu, ret, i = 0; - - cpus = perf_cpu_map__new(NULL); - if (!cpus) - return false; - cpu = cpus->map[0]; - perf_cpu_map__put(cpus); - - do { - ret = perf_do_probe_api(fn, cpu, try[i++]); - if (!ret) - return true; - } while (ret == -EAGAIN && try[i]); - - return false; -} - -static void perf_probe_sample_identifier(struct evsel *evsel) -{ - evsel->core.attr.sample_type |= PERF_SAMPLE_IDENTIFIER; -} - -static void perf_probe_comm_exec(struct evsel *evsel) -{ - evsel->core.attr.comm_exec = 1; -} - -static void perf_probe_context_switch(struct evsel *evsel) -{ - evsel->core.attr.context_switch = 1; -} - -bool perf_can_sample_identifier(void) -{ - return perf_probe_api(perf_probe_sample_identifier); -} - -static bool perf_can_comm_exec(void) -{ - return perf_probe_api(perf_probe_comm_exec); -} - -bool perf_can_record_switch_events(void) -{ - return perf_probe_api(perf_probe_context_switch); -} - -bool perf_can_record_cpu_wide(void) -{ - struct perf_event_attr attr = { - 
.type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_CPU_CLOCK, - .exclude_kernel = 1, - }; - struct perf_cpu_map *cpus; - int cpu, fd; - - cpus = perf_cpu_map__new(NULL); - if (!cpus) - return false; - cpu = cpus->map[0]; - perf_cpu_map__put(cpus); - - fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); - if (fd < 0) - return false; - close(fd); - - return true; -} - -/* - * Architectures are expected to know if AUX area sampling is supported by the - * hardware. Here we check for kernel support. - */ -bool perf_can_aux_sample(void) -{ - struct perf_event_attr attr = { - .size = sizeof(struct perf_event_attr), - .exclude_kernel = 1, - /* - * Non-zero value causes the kernel to calculate the effective - * attribute size up to that byte. - */ - .aux_sample_size = 1, - }; - int fd; - - fd = sys_perf_event_open(&attr, -1, 0, -1, 0); - /* - * If the kernel attribute is big enough to contain aux_sample_size - * then we assume that it is supported. We are relying on the kernel to - * validate the attribute size before anything else that could be wrong. - */ - if (fd < 0 && errno == E2BIG) - return false; - if (fd >= 0) - close(fd); - - return true; -} - void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, struct callchain_param *callchain) { -- Gitee From d8123382de3dd70a7619d1174a85562473a11c34 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 27 Apr 2020 17:54:27 -0300 Subject: [PATCH 149/257] perf evlist: Allow reusing the side band thread for more purposes commit 976be84504b8285d43dc890b02ceff432cd0dd4b upstream I.e. so far we had just one event in that side band thread, a dummy one with attr.bpf_event set, so that 'perf record' can go ahead and ask the kernel for further information about BPF programs being loaded. Allow for more than one event to be there, so that we can use it as well for the upcoming --switch-output-event feature. 
Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Namhyung Kim Cc: Song Liu Link: http://lore.kernel.org/lkml/20200429131106.27974-6-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/evlist.h | 1 + tools/perf/util/sideband_evlist.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 6fb6114880d9..b791f6473097 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -111,6 +111,7 @@ int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, perf_evsel__sb_cb_t cb, void *data); +void evlist__set_cb(struct evlist *evlist, perf_evsel__sb_cb_t cb, void *data); int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target); void perf_evlist__stop_sb_thread(struct evlist *evlist); diff --git a/tools/perf/util/sideband_evlist.c b/tools/perf/util/sideband_evlist.c index 073d201bb6ea..1d6f470d64c4 100644 --- a/tools/perf/util/sideband_evlist.c +++ b/tools/perf/util/sideband_evlist.c @@ -4,6 +4,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/mmap.h" +#include "util/perf_api_probe.h" #include #include #include @@ -80,6 +81,19 @@ static void *perf_evlist__poll_thread(void *arg) return NULL; } +void evlist__set_cb(struct evlist *evlist, perf_evsel__sb_cb_t cb, void *data) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + evsel->core.attr.sample_id_all = 1; + evsel->core.attr.watermark = 1; + evsel->core.attr.wakeup_watermark = 1; + evsel->side_band.cb = cb; + evsel->side_band.data = data; + } +} + int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target) { struct evsel *counter; @@ -90,6 +104,15 @@ int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target) if (perf_evlist__create_maps(evlist, target)) goto out_delete_evlist; + if (evlist->core.nr_entries > 1) { + bool can_sample_identifier = perf_can_sample_identifier(); + + evlist__for_each_entry(evlist, counter) + perf_evsel__set_sample_id(counter, can_sample_identifier); + + perf_evlist__set_id_pos(evlist); + } + evlist__for_each_entry(evlist, counter) { if (evsel__open(counter, evlist->core.cpus, evlist->core.threads) < 0) goto out_delete_evlist; -- Gitee From 0fb62c5640f82f5f98c693e0ac521c0fa34621ee Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 15 Nov 2019 14:42:22 +0200 Subject: [PATCH 150/257] perf pmu: When using default config, record which bits of config were changed by the user commit a1ac7de6902c1ea6def7a743f1d2e6ba429684b3 upstream Default config for a PMU is defined before selected events are parsed. That allows the user-entered config to override the default config. However that does not allow for changing the default config based on other options. For example, if the user chooses AUX area sampling mode, in the case of Intel PT, the psb_period needs to be small for sampling, so there is a need to set the default psb_period to 0 (2 KiB) in that case. However that should not override a value set by the user. To allow for that, when using default config, record which bits of config were changed by the user. 
Signed-off-by: Adrian Hunter Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20191115124225.5247-13-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/util/evsel.c | 2 ++ tools/perf/util/evsel_config.h | 2 ++ tools/perf/util/parse-events.c | 42 +++++++++++++++++++++++++++++++++- tools/perf/util/pmu.c | 10 ++++++++ tools/perf/util/pmu.h | 1 + 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 797534dbee08..36238cc5cf80 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -849,6 +849,8 @@ static void apply_config_terms(struct evsel *evsel, case PERF_EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE: /* Already applied by auxtrace */ break; + case PERF_EVSEL__CONFIG_TERM_CFG_CHG: + break; default: break; } diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index 6e654ede8fbe..1f8d2fe0b66e 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -26,6 +26,7 @@ enum evsel_term_type { PERF_EVSEL__CONFIG_TERM_PERCORE, PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT, PERF_EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE, + PERF_EVSEL__CONFIG_TERM_CFG_CHG, }; struct perf_evsel_config_term { @@ -46,6 +47,7 @@ struct perf_evsel_config_term { bool percore; bool aux_output; u32 aux_sample_size; + u64 cfg_chg; } val; bool weak; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 229ed0713280..011189a8aa0d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1273,7 +1273,40 @@ do { \ break; } } -#undef ADD_EVSEL_CONFIG + return 0; +} + +/* + * Add PERF_EVSEL__CONFIG_TERM_CFG_CHG where cfg_chg will have a bit set for + * each bit of attr->config that the user has changed. + */ +static int get_config_chgs(struct perf_pmu *pmu, struct list_head *head_config, + struct list_head *head_terms) +{ + struct parse_events_term *term; + u64 bits = 0; + int type; + + list_for_each_entry(term, head_config, list) { + switch (term->type_term) { + case PARSE_EVENTS__TERM_TYPE_USER: + type = perf_pmu__format_type(&pmu->format, term->config); + if (type != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + bits |= perf_pmu__format_bits(&pmu->format, term->config); + break; + case PARSE_EVENTS__TERM_TYPE_CONFIG: + bits = ~(u64)0; + break; + default: + break; + } + } + + if (bits) + ADD_CONFIG_TERM(CFG_CHG, cfg_chg, bits); + +#undef ADD_CONFIG_TERM return 0; } @@ -1402,6 +1435,13 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (get_config_terms(head_config, &config_terms)) return -ENOMEM; + /* + * When using default config, record which bits of attr->config were + * changed by the user. + */ + if (pmu->default_config && get_config_chgs(pmu, head_config, &config_terms)) + return -ENOMEM; + if (perf_pmu__config(pmu, &attr, head_config, parse_state->error)) { struct perf_evsel_config_term *pos, *tmp; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 36e3a777d20d..5f1ece607471 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -945,6 +945,16 @@ __u64 perf_pmu__format_bits(struct list_head *formats, const char *name) return bits; } +int perf_pmu__format_type(struct list_head *formats, const char *name) +{ + struct perf_pmu_format *format = pmu_find_format(formats, name); + + if (!format) + return -1; + + return format->value; +} + /* * Sets value based on the format definition (format parameter) * and unformated value (value parameter). 
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index ed9154b182bf..d5e87aa9fd5b 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -71,6 +71,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct list_head *head_terms, bool zero, struct parse_events_error *error); __u64 perf_pmu__format_bits(struct list_head *formats, const char *name); +int perf_pmu__format_type(struct list_head *formats, const char *name); int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms, struct perf_pmu_info *info); struct list_head *perf_pmu__alias(struct perf_pmu *pmu, -- Gitee From 21a7a47b2268443d9fe24229a5b2cb11fc832c69 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 17 Jan 2020 13:52:50 +0800 Subject: [PATCH 151/257] perf parse: Refactor 'struct perf_evsel_config_term' commit e884602b57c07fae54ff357e4b996b2053b47c1e upstream The struct perf_evsel_config_term::val is a union which contains fields 'callgraph', 'drv_cfg' and 'branch' as string pointers. This leads to the complex code logic for handling every type's string separately, and it's hard to release string as a general way. This patch refactors the structure to add a common field 'str' in the 'val' union as string pointer and remove the other three fields 'callgraph', 'drv_cfg' and 'branch'. Without passing field name, the patch simplifies the string handling with macro ADD_CONFIG_TERM_STR() for string pointer assignment. This patch fixes multiple warnings of line over 80 characters detected by checkpatch tool. Signed-off-by: Leo Yan Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Mathieu Poirier Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20200117055251.24058-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/util/evsel.c | 6 +-- tools/perf/util/evsel_config.h | 4 +- tools/perf/util/parse-events.c | 62 ++++++++++++++++++++----------- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 4a9daac1a145..b8df8054dd5e 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -227,7 +227,7 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, if (term->type != PERF_EVSEL__CONFIG_TERM_DRV_CFG) continue; - sink = term->val.drv_cfg; + sink = term->val.str; snprintf(path, PATH_MAX, "sinks/%s", sink); ret = perf_pmu__scan_file(pmu, path, "%x", &hash); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 36238cc5cf80..ec0af4dd87ea 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -808,12 +808,12 @@ static void apply_config_terms(struct evsel *evsel, perf_evsel__reset_sample_bit(evsel, TIME); break; case PERF_EVSEL__CONFIG_TERM_CALLGRAPH: - callgraph_buf = term->val.callgraph; + callgraph_buf = term->val.str; break; case PERF_EVSEL__CONFIG_TERM_BRANCH: - if (term->val.branch && strcmp(term->val.branch, "no")) { + if (term->val.str && strcmp(term->val.str, "no")) { perf_evsel__set_sample_bit(evsel, BRANCH_STACK); - parse_branch_str(term->val.branch, + parse_branch_str(term->val.str, &attr->branch_sample_type); } else perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index 
1f8d2fe0b66e..b4a65201e4f7 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -36,18 +36,16 @@ struct perf_evsel_config_term { u64 period; u64 freq; bool time; - char *callgraph; - char *drv_cfg; u64 stack_user; int max_stack; bool inherit; bool overwrite; - char *branch; unsigned long max_events; bool percore; bool aux_output; u32 aux_sample_size; u64 cfg_chg; + char *str; } val; bool weak; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 011189a8aa0d..e52056fcc7f4 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1201,8 +1201,7 @@ static int config_attr(struct perf_event_attr *attr, static int get_config_terms(struct list_head *head_config, struct list_head *head_terms __maybe_unused) { -#define ADD_CONFIG_TERM(__type, __name, __val) \ -do { \ +#define ADD_CONFIG_TERM(__type) \ struct perf_evsel_config_term *__t; \ \ __t = zalloc(sizeof(*__t)); \ @@ -1211,9 +1210,19 @@ do { \ \ INIT_LIST_HEAD(&__t->list); \ __t->type = PERF_EVSEL__CONFIG_TERM_ ## __type; \ - __t->val.__name = __val; \ __t->weak = term->weak; \ - list_add_tail(&__t->list, head_terms); \ + list_add_tail(&__t->list, head_terms) + +#define ADD_CONFIG_TERM_VAL(__type, __name, __val) \ +do { \ + ADD_CONFIG_TERM(__type); \ + __t->val.__name = __val; \ +} while (0) + +#define ADD_CONFIG_TERM_STR(__type, __val) \ +do { \ + ADD_CONFIG_TERM(__type); \ + __t->val.str = __val; \ } while (0) struct parse_events_term *term; @@ -1221,53 +1230,62 @@ do { \ list_for_each_entry(term, head_config, list) { switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: - ADD_CONFIG_TERM(PERIOD, period, term->val.num); + ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: - ADD_CONFIG_TERM(FREQ, freq, term->val.num); + ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_TIME: - ADD_CONFIG_TERM(TIME, time, term->val.num); + ADD_CONFIG_TERM_VAL(TIME, time, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: - ADD_CONFIG_TERM(CALLGRAPH, callgraph, term->val.str); + ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: - ADD_CONFIG_TERM(BRANCH, branch, term->val.str); + ADD_CONFIG_TERM_STR(BRANCH, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_STACKSIZE: - ADD_CONFIG_TERM(STACK_USER, stack_user, term->val.num); + ADD_CONFIG_TERM_VAL(STACK_USER, stack_user, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_INHERIT: - ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(INHERIT, inherit, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_NOINHERIT: - ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1); + ADD_CONFIG_TERM_VAL(INHERIT, inherit, + term->val.num ? 0 : 1); break; case PARSE_EVENTS__TERM_TYPE_MAX_STACK: - ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num); + ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: - ADD_CONFIG_TERM(MAX_EVENTS, max_events, term->val.num); + ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: - ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: - ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1); + ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, + term->val.num ? 
0 : 1); break; case PARSE_EVENTS__TERM_TYPE_DRV_CFG: - ADD_CONFIG_TERM(DRV_CFG, drv_cfg, term->val.str); + ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_PERCORE: - ADD_CONFIG_TERM(PERCORE, percore, - term->val.num ? true : false); + ADD_CONFIG_TERM_VAL(PERCORE, percore, + term->val.num ? true : false); break; case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: - ADD_CONFIG_TERM(AUX_OUTPUT, aux_output, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: - ADD_CONFIG_TERM(AUX_SAMPLE_SIZE, aux_sample_size, term->val.num); + ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size, + term->val.num); break; default: break; @@ -1304,7 +1322,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct list_head *head_config, } if (bits) - ADD_CONFIG_TERM(CFG_CHG, cfg_chg, bits); + ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits); #undef ADD_CONFIG_TERM return 0; -- Gitee From 4e8e60fdd177eeba97621ff54f17eb7059015b5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Mar 2020 10:50:58 -0300 Subject: [PATCH 152/257] tools headers UAPI: Update tools's copy of linux/perf_event.h commit 6339998d22ecae5d6435dd87b4904ff6e16bfe56 upstream To get the changes in: bbfd5e4fab63 ("perf/core: Add new branch sample type for HW index of raw branch records") This silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h This update is a prerequisite to adding support for the HW index of raw branch records. Acked-by: Kan Liang Cc: Adrian Hunter Cc: Alexey Budankov Cc: Andi Kleen Cc: Jiri Olsa Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra (Intel) Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200304134902.GB12612@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Arukonda Rahul Signed-off-by: mohanasv2 --- tools/include/uapi/linux/perf_event.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index db439100de3a..29ec1a54bbaa 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -181,6 +181,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -208,6 +210,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -853,7 +857,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER -- Gitee From 607d86bba4698a192c3be6ab1eb81b876b89cd75 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 28 Feb 2020 08:30:01 -0800 Subject: [PATCH 153/257] perf evsel: Support 
PERF_SAMPLE_BRANCH_HW_INDEX commit d3f85437ad6a55113882d730beaa75759452da8f upstream A new branch sample type PERF_SAMPLE_BRANCH_HW_INDEX has been introduced in latest kernel. Enable HW_INDEX by default in LBR call stack mode. If kernel doesn't support the sample type, switching it off. Add HW_INDEX in attr_fprintf as well. User can check whether the branch sample type is set via debug information or header. Committer testing: First collect some samples with LBR callchains, system wide, for a few seconds: # perf record --call-graph lbr -a sleep 5 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.625 MB perf.data (224 samples) ] # Now lets use 'perf evlist -v' to look at the branch_sample_type: # perf evlist -v cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES|HW_INDEX # So the machine has the kernel feature, and it was correctly added to perf_event_attr.branch_sample_type, for the default 'cycles' event. If we do it in another machine, where the kernel lacks the HW_INDEX feature, we get: # perf record --call-graph lbr -a sleep 2s [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.690 MB perf.data (499 samples) ] # perf evlist -v cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|BRANCH_STACK, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, precise_ip: 3, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES # No HW_INDEX in attr.branch_sample_type. Signed-off-by: Kan Liang Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexey Budankov Cc: Andi Kleen Cc: Jiri Olsa Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200228163011.19358-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/util/evsel.c | 15 ++++++++++++--- tools/perf/util/evsel.h | 1 + tools/perf/util/perf_event_attr_fprintf.c | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ec0af4dd87ea..ff0a5da9d99d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -712,7 +712,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_NO_CYCLES | - PERF_SAMPLE_BRANCH_NO_FLAGS; + PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_HW_INDEX; } } else pr_warning("Cannot use LBR callstack with branch stack. 
" @@ -763,7 +764,8 @@ perf_evsel__reset_callgraph(struct evsel *evsel, if (param->record_mode == CALLCHAIN_LBR) { perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | - PERF_SAMPLE_BRANCH_CALL_STACK); + PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_HW_INDEX); } if (param->record_mode == CALLCHAIN_DWARF) { perf_evsel__reset_sample_bit(evsel, REGS_USER); @@ -1662,6 +1664,8 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, evsel->core.attr.ksymbol = 0; if (perf_missing_features.bpf) evsel->core.attr.bpf_event = 0; + if (perf_missing_features.branch_hw_idx) + evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX; retry_sample_id: if (perf_missing_features.sample_id_all) evsel->core.attr.sample_id_all = 0; @@ -1773,7 +1777,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. */ - if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { + if (!perf_missing_features.branch_hw_idx && + (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) { + perf_missing_features.branch_hw_idx = true; + pr_debug2("switching off branch HW index support\n"); + goto fallback_missing_features; + } else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { perf_missing_features.aux_output = true; pr_debug2("Kernel has no attr.aux_output support, bailing out\n"); goto out_close; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index da65be7ca536..529f3bfd8a5d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -123,6 +123,7 @@ struct perf_missing_features { bool ksymbol; bool bpf; bool aux_output; + bool branch_hw_idx; }; extern struct perf_missing_features perf_missing_features; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 651203126c71..355d3458d4e6 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -50,6 +50,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value) bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + bit_name(HW_INDEX), { .name = NULL, } }; #undef bit_name -- Gitee From 800aec9566cb48d664553d260e40ff80b441bbf4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 25 Mar 2020 21:45:28 +0900 Subject: [PATCH 154/257] perf/core: Add PERF_RECORD_CGROUP event commit 96aaab686505c449e24d76e76507290dcc30e008 upstream To support cgroup tracking, add CGROUP event to save a link between cgroup path and id number. This is needed since cgroups can go away when userspace tries to read the cgroup info (from the id) later. The attr.cgroup bit was also added to enable cgroup tracking from userspace. This event will be generated when a new cgroup becomes active. Userspace might need to synthesize those events for existing cgroups. 
Committer testing: From the resulting kernel, using /sys/kernel/btf/vmlinux: $ pahole perf_event_attr | grep -w cgroup -B5 -A1 __u64 write_backward:1; /* 40:27 8 */ __u64 namespaces:1; /* 40:28 8 */ __u64 ksymbol:1; /* 40:29 8 */ __u64 bpf_event:1; /* 40:30 8 */ __u64 aux_output:1; /* 40:31 8 */ __u64 cgroup:1; /* 40:32 8 */ __u64 __reserved_1:31; /* 40:33 8 */ $ Reported-by: kbuild test robot Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo [staticize perf_event_cgroup function] Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Johannes Weiner Cc: Mark Rutland Cc: Zefan Li Link: http://lore.kernel.org/lkml/20200325124536.2800725-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- include/uapi/linux/perf_event.h | 13 +++- kernel/events/core.c | 111 ++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7fa9f958cd9b..b77d271b06b1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -381,7 +381,8 @@ struct perf_event_attr { ksymbol : 1, /* include ksymbol events */ bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ - __reserved_1 : 32; + cgroup : 1, /* include cgroup events */ + __reserved_1 : 31; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1012,6 +1013,16 @@ enum perf_event_type { */ PERF_RECORD_BPF_EVENT = 18, + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6befff36dbc0..e58c1dcd075c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -392,6 +392,7 @@ static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; +static atomic_t nr_cgroup_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4468,6 +4469,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -7572,6 +7575,105 @@ void perf_event_namespaces(struct task_struct *task) NULL); } +/* + * cgroup tracking + */ +#ifdef CONFIG_CGROUP_PERF + +struct perf_cgroup_event { + char *path; + int path_size; + struct { + struct perf_event_header header; + u64 id; + char path[]; + } event_id; +}; + +static int perf_event_cgroup_match(struct perf_event *event) +{ + return event->attr.cgroup; +} + +static void perf_event_cgroup_output(struct perf_event *event, void *data) +{ + struct perf_cgroup_event *cgroup_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u16 header_size = cgroup_event->event_id.header.size; + int ret; + + if (!perf_event_cgroup_match(event)) + return; + + perf_event_header__init_id(&cgroup_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + cgroup_event->event_id.header.size); + if (ret) + goto out; + + perf_output_put(&handle, cgroup_event->event_id); + __output_copy(&handle, cgroup_event->path, 
cgroup_event->path_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + cgroup_event->event_id.header.size = header_size; +} + +static void perf_event_cgroup(struct cgroup *cgrp) +{ + struct perf_cgroup_event cgroup_event; + char path_enomem[16] = "//enomem"; + char *pathname; + size_t size; + + if (!atomic_read(&nr_cgroup_events)) + return; + + cgroup_event = (struct perf_cgroup_event){ + .event_id = { + .header = { + .type = PERF_RECORD_CGROUP, + .misc = 0, + .size = sizeof(cgroup_event.event_id), + }, + .id = cgroup_id(cgrp), + }, + }; + + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (pathname == NULL) { + cgroup_event.path = path_enomem; + } else { + /* just to be sure to have enough space for alignment */ + cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); + cgroup_event.path = pathname; + } + + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(cgroup_event.path) + 1; + while (!IS_ALIGNED(size, sizeof(u64))) + cgroup_event.path[size++] = '\0'; + + cgroup_event.event_id.header.size += size; + cgroup_event.path_size = size; + + perf_iterate_sb(perf_event_cgroup_output, + &cgroup_event, + NULL); + + kfree(pathname); +} + +#endif + /* * mmap tracking */ @@ -10588,6 +10690,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -12589,6 +12693,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) kfree(jc); } +static int perf_cgroup_css_online(struct cgroup_subsys_state *css) +{ + perf_event_cgroup(css->cgroup); + return 0; +} + static int __perf_cgroup_move(void *info) { struct task_struct *task = info; @@ -12610,6 +12720,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, + .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can -- Gitee From 1cdf61e94c0bf753ca7b6baca9e7e822e8e8fd0e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 25 Mar 2020 21:45:29 +0900 Subject: [PATCH 155/257] perf/core: Add PERF_SAMPLE_CGROUP feature commit 6546b19f95acc986807de981402bbac6b3a94b0f upstream The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information in the sample. It will add a 64-bit id to identify current cgroup and it's the file handle in the cgroup file system. Userspace should use this information with PERF_RECORD_CGROUP event to match which cgroup it belongs. I put it before PERF_SAMPLE_AUX for simplicity since it just needs a 64-bit word. But if we want bigger samples, I can work on that direction too. 
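A minimal sketch of requesting the new bit from userspace, assuming a kernel with this patch (older kernels reject the unknown bit with EINVAL); open_cycles_with_cgroup_id() is an illustrative helper, not code from this series:

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  static int open_cycles_with_cgroup_id(pid_t pid)
  {
          struct perf_event_attr attr = {
                  .size          = sizeof(attr),
                  .type          = PERF_TYPE_HARDWARE,
                  .config        = PERF_COUNT_HW_CPU_CYCLES,
                  .sample_period = 100000,
                  /* each sample now carries a u64 cgroup id */
                  .sample_type   = PERF_SAMPLE_TID | PERF_SAMPLE_CGROUP,
                  /* side-band PERF_RECORD_CGROUP events map id to path */
                  .cgroup        = 1,
          };

          return syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
  }

In the resulting sample the id is a single u64, emitted in the enum order above, after phys_addr and before any AUX data.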
Committer testing: $ pahole perf_sample_data | grep -w cgroup -B5 -A5 /* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */ struct perf_regs regs_intr; /* 312 16 */ /* --- cacheline 5 boundary (320 bytes) was 8 bytes ago --- */ u64 stack_user_size; /* 328 8 */ u64 phys_addr; /* 336 8 */ u64 cgroup; /* 344 8 */ /* size: 384, cachelines: 6, members: 22 */ /* padding: 32 */ }; $ Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Johannes Weiner Cc: Mark Rutland Cc: Zefan Li Link: http://lore.kernel.org/lkml/20200325124536.2800725-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 3 ++- init/Kconfig | 3 ++- kernel/events/core.c | 22 ++++++++++++++++++++++ 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 09e0cca1eaaf..3f4f867a26d1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1027,6 +1027,7 @@ struct perf_sample_data { u64 stack_user_size; u64 phys_addr; + u64 cgroup; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b77d271b06b1..dc33e3051819 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -142,8 +142,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, - PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; diff --git a/init/Kconfig b/init/Kconfig index 3aa73f27028f..57f06ffcf26e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1120,7 +1120,8 @@ config CGROUP_PERF help This option extends the perf per-cpu mode to restrict monitoring to threads which belong to the cgroup specified and run on the - designated cpu. + designated cpu. Or this can be used to have cgroup ID in samples + so that it can monitor performance events among cgroups. Say N if unsure. 
diff --git a/kernel/events/core.c b/kernel/events/core.c index e58c1dcd075c..644dcf01d171 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1760,6 +1760,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + size += sizeof(data->cgroup); + event->header_size = size; } @@ -6694,6 +6697,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6896,6 +6902,16 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); +#ifdef CONFIG_CGROUP_PERF + if (sample_type & PERF_SAMPLE_CGROUP) { + struct cgroup *cgrp; + + /* protected by RCU */ + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; + data->cgroup = cgroup_id(cgrp); + } +#endif + if (sample_type & PERF_SAMPLE_AUX) { u64 size; @@ -11060,6 +11076,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + +#ifndef CONFIG_CGROUP_PERF + if (attr->sample_type & PERF_SAMPLE_CGROUP) + return -EINVAL; +#endif + out: return ret; -- Gitee From 0ea484ec037174a143fda72e4fb68bb52e9d4a79 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 27 Mar 2020 11:07:17 -0300 Subject: [PATCH 156/257] tools headers UAPI: Update tools's copy of linux/perf_event.h commit 03590fb409bce428bce0b4535a488dedfbed0c00 upstream To get the changes in: 6546b19f95ac ("perf/core: Add PERF_SAMPLE_CGROUP feature") 96aaab686505 ("perf/core: Add PERF_RECORD_CGROUP event") This silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h This update is a prerequisite to adding support for the HW index of raw branch records. 
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Cc: http://lore.kernel.org/lkml/20200325124536.2800725-4-namhyung@kernel.org [ split from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/include/uapi/linux/perf_event.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 29ec1a54bbaa..bc8c4816ba38 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -142,8 +142,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, - PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -381,7 +382,8 @@ struct perf_event_attr { ksymbol : 1, /* include ksymbol events */ bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ - __reserved_1 : 32; + cgroup : 1, /* include cgroup events */ + __reserved_1 : 31; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1012,6 +1014,16 @@ enum perf_event_type { */ PERF_RECORD_BPF_EVENT = 18, + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + PERF_RECORD_MAX, /* non-ABI */ }; -- Gitee From ab6afbfc2ea3cd340562db54ca52db020903d787 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 8 Nov 2019 15:11:28 +0530 Subject: [PATCH 157/257] perf tool: Provide an option to print perf_event_open args and return value commit ccd26741f5e6bdf2c18ea38b32e6a347c5648a97 upstream Perf record with verbose=2 already prints this information along with whole lot of other traces which requires lot of scrolling. Introduce an option to print only perf_event_open() arguments and return value. 
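To help decode the trace below: each printed line is one raw perf_event_open() call, and the flags value is the syscall's last argument, e.g. 0x8 is PERF_FLAG_FD_CLOEXEC. A minimal model of the call being traced (a plain syscall wrapper, not the perf tool's own plumbing):

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                             int cpu, int group_fd, unsigned long flags)
  {
          /* "flags 0x8" in the trace below is PERF_FLAG_FD_CLOEXEC */
          return syscall(SYS_perf_event_open, attr, pid, cpu,
                         group_fd, flags);
  }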
Sample o/p: $ perf --debug perf-event-open=1 record -- ls > /dev/null ------------------------------------------------------------ perf_event_attr: size 112 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD read_format ID disabled 1 inherit 1 exclude_kernel 1 mmap 1 comm 1 freq 1 enable_on_exec 1 task 1 precise_ip 3 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ksymbol 1 bpf_event 1 ------------------------------------------------------------ sys_perf_event_open: pid 4308 cpu 0 group_fd -1 flags 0x8 = 4 sys_perf_event_open: pid 4308 cpu 1 group_fd -1 flags 0x8 = 5 sys_perf_event_open: pid 4308 cpu 2 group_fd -1 flags 0x8 = 6 sys_perf_event_open: pid 4308 cpu 3 group_fd -1 flags 0x8 = 8 sys_perf_event_open: pid 4308 cpu 4 group_fd -1 flags 0x8 = 9 sys_perf_event_open: pid 4308 cpu 5 group_fd -1 flags 0x8 = 10 sys_perf_event_open: pid 4308 cpu 6 group_fd -1 flags 0x8 = 11 sys_perf_event_open: pid 4308 cpu 7 group_fd -1 flags 0x8 = 12 ------------------------------------------------------------ perf_event_attr: type 1 size 112 config 0x9 watermark 1 sample_id_all 1 bpf_event 1 { wakeup_events, wakeup_watermark } 1 ------------------------------------------------------------ sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 sys_perf_event_open failed, error -13 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (9 samples) ] Committer notes: Just like the 'verbose' variable this new 'debug_peo_args' needs to be added to util/python.c, since we don't link the debug.o file in the python binding, which ended up making 'perf test python' fail with: # perf test -v python 18: 'import perf' in python : --- start --- test child forked, pid 19237 Traceback (most recent call last): File "", line 1, in ImportError: /tmp/build/perf/python/perf.so: undefined symbol: debug_peo_args test child finished with -1 ---- end ---- 'import perf' in python: FAILED! # After adding that new variable to util/python.c: # perf test -v python 18: 'import perf' in python : --- start --- test child forked, pid 22364 test child finished with 0 ---- end ---- 'import perf' in python: Ok # Signed-off-by: Ravi Bangoria Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191108094128.28769-1-ravi.bangoria@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf.txt | 2 ++ tools/perf/util/debug.c | 2 ++ tools/perf/util/debug.h | 9 ++++++++ tools/perf/util/evsel.c | 36 +++++++++++++++---------------- tools/perf/util/python.c | 1 + 5 files changed, 32 insertions(+), 18 deletions(-) diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt index 401f0ed67439..3f37ded13f8c 100644 --- a/tools/perf/Documentation/perf.txt +++ b/tools/perf/Documentation/perf.txt @@ -24,6 +24,8 @@ OPTIONS data-convert - data convert command debug messages stderr - write debug output (option -v) to stderr in browser mode + perf-event-open - Print perf_event_open() arguments and + return value --buildid-dir:: Setup buildid cache directory. 
It has higher priority than diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 682146d04379..1ab270c2254c 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -24,6 +24,7 @@ #include int verbose; +int debug_peo_args; bool dump_trace = false, quiet = false; int debug_ordered_events; static int redirect_to_stderr; @@ -180,6 +181,7 @@ static struct debug_variable { { .name = "ordered-events", .ptr = &debug_ordered_events}, { .name = "stderr", .ptr = &redirect_to_stderr}, { .name = "data-convert", .ptr = &debug_data_convert }, + { .name = "perf-event-open", .ptr = &debug_peo_args }, { .name = NULL, } }; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index d25ae1c4cee9..f1734abd98dd 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -8,6 +8,7 @@ #include extern int verbose; +extern int debug_peo_args; extern bool quiet, dump_trace; extern int debug_ordered_events; extern int debug_data_convert; @@ -30,6 +31,14 @@ extern int debug_data_convert; #define pr_debug3(fmt, ...) pr_debugN(3, pr_fmt(fmt), ##__VA_ARGS__) #define pr_debug4(fmt, ...) pr_debugN(4, pr_fmt(fmt), ##__VA_ARGS__) +/* Special macro to print perf_event_open arguments/return value. */ +#define pr_debug2_peo(fmt, ...) { \ + if (debug_peo_args) \ + pr_debugN(0, pr_fmt(fmt), ##__VA_ARGS__); \ + else \ + pr_debugN(2, pr_fmt(fmt), ##__VA_ARGS__); \ +} + #define pr_time_N(n, var, t, fmt, ...) \ eprintf_time(n, var, t, fmt, ##__VA_ARGS__) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ff0a5da9d99d..f26f6fd2169c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1547,7 +1547,7 @@ static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, static void display_attr(struct perf_event_attr *attr) { - if (verbose >= 2) { + if (verbose >= 2 || debug_peo_args) { fprintf(stderr, "%.60s\n", graph_dotted_line); fprintf(stderr, "perf_event_attr:\n"); perf_event_attr__fprintf(stderr, attr, __open_attr__fprintf, NULL); @@ -1563,7 +1563,7 @@ static int perf_event_open(struct evsel *evsel, int fd; while (1) { - pr_debug2("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx", + pr_debug2_peo("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx", pid, cpu, group_fd, flags); fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, group_fd, flags); @@ -1583,9 +1583,9 @@ static int perf_event_open(struct evsel *evsel, break; } - pr_debug2("\nsys_perf_event_open failed, error %d\n", -ENOTSUP); + pr_debug2_peo("\nsys_perf_event_open failed, error %d\n", -ENOTSUP); evsel->core.attr.precise_ip--; - pr_debug2("decreasing precise_ip by one (%d)\n", evsel->core.attr.precise_ip); + pr_debug2_peo("decreasing precise_ip by one (%d)\n", evsel->core.attr.precise_ip); display_attr(&evsel->core.attr); } @@ -1706,12 +1706,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, continue; } - pr_debug2("\nsys_perf_event_open failed, error %d\n", + pr_debug2_peo("\nsys_perf_event_open failed, error %d\n", err); goto try_fallback; } - pr_debug2(" = %d\n", fd); + pr_debug2_peo(" = %d\n", fd); if (evsel->bpf_fd >= 0) { int evt_fd = fd; @@ -1784,58 +1784,58 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, goto fallback_missing_features; } else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) { perf_missing_features.aux_output = true; - pr_debug2("Kernel has no attr.aux_output support, bailing out\n"); + pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n"); goto out_close; } 
else if (!perf_missing_features.bpf && evsel->core.attr.bpf_event) { perf_missing_features.bpf = true; - pr_debug2("switching off bpf_event\n"); + pr_debug2_peo("switching off bpf_event\n"); goto fallback_missing_features; } else if (!perf_missing_features.ksymbol && evsel->core.attr.ksymbol) { perf_missing_features.ksymbol = true; - pr_debug2("switching off ksymbol\n"); + pr_debug2_peo("switching off ksymbol\n"); goto fallback_missing_features; } else if (!perf_missing_features.write_backward && evsel->core.attr.write_backward) { perf_missing_features.write_backward = true; - pr_debug2("switching off write_backward\n"); + pr_debug2_peo("switching off write_backward\n"); goto out_close; } else if (!perf_missing_features.clockid_wrong && evsel->core.attr.use_clockid) { perf_missing_features.clockid_wrong = true; - pr_debug2("switching off clockid\n"); + pr_debug2_peo("switching off clockid\n"); goto fallback_missing_features; } else if (!perf_missing_features.clockid && evsel->core.attr.use_clockid) { perf_missing_features.clockid = true; - pr_debug2("switching off use_clockid\n"); + pr_debug2_peo("switching off use_clockid\n"); goto fallback_missing_features; } else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) { perf_missing_features.cloexec = true; - pr_debug2("switching off cloexec flag\n"); + pr_debug2_peo("switching off cloexec flag\n"); goto fallback_missing_features; } else if (!perf_missing_features.mmap2 && evsel->core.attr.mmap2) { perf_missing_features.mmap2 = true; - pr_debug2("switching off mmap2\n"); + pr_debug2_peo("switching off mmap2\n"); goto fallback_missing_features; } else if (!perf_missing_features.exclude_guest && (evsel->core.attr.exclude_guest || evsel->core.attr.exclude_host)) { perf_missing_features.exclude_guest = true; - pr_debug2("switching off exclude_guest, exclude_host\n"); + pr_debug2_peo("switching off exclude_guest, exclude_host\n"); goto fallback_missing_features; } else if (!perf_missing_features.sample_id_all) { perf_missing_features.sample_id_all = true; - pr_debug2("switching off sample_id_all\n"); + pr_debug2_peo("switching off sample_id_all\n"); goto retry_sample_id; } else if (!perf_missing_features.lbr_flags && (evsel->core.attr.branch_sample_type & (PERF_SAMPLE_BRANCH_NO_CYCLES | PERF_SAMPLE_BRANCH_NO_FLAGS))) { perf_missing_features.lbr_flags = true; - pr_debug2("switching off branch sample type no (cycles/flags)\n"); + pr_debug2_peo("switching off branch sample type no (cycles/flags)\n"); goto fallback_missing_features; } else if (!perf_missing_features.group_read && evsel->core.attr.inherit && (evsel->core.attr.read_format & PERF_FORMAT_GROUP) && perf_evsel__is_group_leader(evsel)) { perf_missing_features.group_read = true; - pr_debug2("switching off group read\n"); + pr_debug2_peo("switching off group read\n"); goto fallback_missing_features; } out_close: diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 25118605f3f8..83212c65848b 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -65,6 +65,7 @@ struct perf_env perf_env; * implementing 'verbose' and 'eprintf'. */ int verbose; +int debug_peo_args; int eprintf(int level, int var, const char *fmt, ...); -- Gitee From 71640710eebf8619e184076efbd0731b9ab31e35 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 25 Mar 2020 21:45:34 +0900 Subject: [PATCH 158/257] perf record: Add --all-cgroups option commit 8fb4b67939e169fca68174e9ac7be79fe9a04498 upstream The --all-cgroups option is to enable cgroup profiling support. 
It tells kernel to record CGROUP events in the ring buffer so that perf report can identify task/cgroup association later. [root@seventh ~]# perf record --all-cgroups --namespaces /wb/cgtest [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.042 MB perf.data (558 samples) ] [root@seventh ~]# perf report --stdio -s cgroup_id,cgroup,pid # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 558 of event 'cycles' # Event count (approx.): 458017341 # # Overhead cgroup id (dev/inode) Cgroup Pid:Command # ........ ..................... .......... ............... # 33.15% 4/0xeffffffb /sub 9615:looper0 32.83% 4/0xf00002f5 /sub/cgrp2 9620:looper2 32.79% 4/0xf00002f4 /sub/cgrp1 9619:looper1 0.35% 4/0xf00002f5 /sub/cgrp2 9618:cgtest 0.34% 4/0xf00002f4 /sub/cgrp1 9617:cgtest 0.32% 4/0xeffffffb / 9615:looper0 0.11% 4/0xeffffffb /sub 9617:cgtest 0.10% 4/0xeffffffb /sub 9618:cgtest # # (Tip: Sample related events with: perf record -e '{cycles,instructions}:S') # [root@seventh ~]# Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200325124536.2800725-8-namhyung@kernel.org Link: http://lore.kernel.org/lkml/20200402015249.3800462-1-namhyung@kernel.org [ Extracted the HAVE_FILE_HANDLE from the followup patch ] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-record.txt | 5 ++++- tools/perf/builtin-record.c | 11 +++++++++++ tools/perf/util/evsel.c | 11 ++++++++++- tools/perf/util/evsel.h | 1 + tools/perf/util/record.h | 1 + 5 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 414a9f8b658d..63d9ac3995c2 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -389,7 +389,10 @@ displayed with the weight and local_weight sort keys. This currently works for abort events and some memory events in precise mode on modern Intel CPUs. --namespaces:: -Record events of type PERF_RECORD_NAMESPACES. +Record events of type PERF_RECORD_NAMESPACES. This enables 'cgroup_id' sort key. + +--all-cgroups:: +Record events of type PERF_RECORD_CGROUP. This enables 'cgroup' sort key. --transaction:: Record transaction flags for transaction related events. 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f34c450a9743..3ab5be1b7c52 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1374,6 +1374,15 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (rec->opts.record_namespaces) tool->namespace_events = true; + if (rec->opts.record_cgroup) { +#ifdef HAVE_FILE_HANDLE + tool->cgroup_events = true; +#else + pr_err("cgroup tracking is not supported\n"); + return -1; +#endif + } + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) { signal(SIGUSR2, snapshot_sig_handler); if (rec->opts.auxtrace_snapshot_mode) @@ -2248,6 +2257,8 @@ static struct option __record_options[] = { "per thread proc mmap processing timeout in ms"), OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces, "Record namespaces events"), + OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup, + "Record cgroup events"), OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events, "Record context switch events"), OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f26f6fd2169c..04938694da6e 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1104,6 +1104,11 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (opts->record_namespaces) attr->namespaces = track; + if (opts->record_cgroup) { + attr->cgroup = track && !perf_missing_features.cgroup; + perf_evsel__set_sample_bit(evsel, CGROUP); + } + if (opts->record_switch_events) attr->context_switch = track; @@ -1777,7 +1782,11 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. */ - if (!perf_missing_features.branch_hw_idx && + if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) { + perf_missing_features.cgroup = true; + pr_debug2_peo("Kernel has no cgroup sampling support, bailing out\n"); + goto out_close; + } else if (!perf_missing_features.branch_hw_idx && (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) { perf_missing_features.branch_hw_idx = true; pr_debug2("switching off branch HW index support\n"); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 529f3bfd8a5d..9d6050831eca 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -124,6 +124,7 @@ struct perf_missing_features { bool bpf; bool aux_output; bool branch_hw_idx; + bool cgroup; }; extern struct perf_missing_features perf_missing_features; diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h index 941f9ae49e18..ae3db7a272be 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -34,6 +34,7 @@ struct record_opts { bool auxtrace_snapshot_on_exit; bool auxtrace_sample_mode; bool record_namespaces; + bool record_cgroup; bool record_switch_events; bool all_kernel; bool all_user; -- Gitee From e8ab411ef221d3dc0d134e256570a7c6e9dabc33 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 Apr 2020 16:12:15 -0300 Subject: [PATCH 159/257] perf evsel: Rename *perf_evsel__*set_sample_*() to *evsel__*set_sample_*() commit 862b2f8fbc5b3a13d096b06560b6408f93388cf9 upstream As they are not 'struct evsel' methods, not part of tools/lib/perf/, aka libperf, to whom the perf_ prefix belongs. 
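For context, these helpers are thin token-pasting wrappers over attr.sample_type, so call sites read the same after the rename; a sketch of the expansion, mirroring the evsel.h hunk below:

  /* Illustrative model of the renamed macro (see evsel.h below). */
  #define evsel__set_sample_bit(evsel, bit) \
          __evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit)

  /* A call site such as:
   *         evsel__set_sample_bit(evsel, TIME);
   * therefore expands to:
   *         __evsel__set_sample_bit(evsel, PERF_SAMPLE_TIME);
   * which ORs PERF_SAMPLE_TIME into evsel->core.attr.sample_type.
   */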
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/arch/arm/util/cs-etm.c | 4 +- tools/perf/arch/arm64/util/arm-spe.c | 12 ++-- tools/perf/arch/x86/util/intel-bts.c | 2 +- tools/perf/arch/x86/util/intel-pt.c | 20 +++---- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-kvm.c | 18 +++--- tools/perf/tests/hists_cumulate.c | 8 +-- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/perf-record.c | 6 +- tools/perf/tests/switch-tracking.c | 10 ++-- tools/perf/util/auxtrace.c | 4 +- tools/perf/util/evlist.c | 4 +- tools/perf/util/evsel.c | 88 ++++++++++++++-------------- tools/perf/util/evsel.h | 17 +++--- tools/perf/util/record.c | 2 +- tools/perf/util/sideband_evlist.c | 2 +- 17 files changed, 100 insertions(+), 103 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index b8df8054dd5e..8cefba31143a 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -402,7 +402,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, * when a context switch happened. */ if (!perf_cpu_map__empty(cpus)) { - perf_evsel__set_sample_bit(cs_etm_evsel, CPU); + evsel__set_sample_bit(cs_etm_evsel, CPU); err = cs_etm_set_option(itr, cs_etm_evsel, ETM_OPT_CTXTID | ETM_OPT_TS); @@ -426,7 +426,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, /* In per-cpu case, always need the time of mmap events etc */ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); } out: diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 27653be24447..e3593063b3d1 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -120,9 +120,9 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, */ perf_evlist__to_front(evlist, arm_spe_evsel); - perf_evsel__set_sample_bit(arm_spe_evsel, CPU); - perf_evsel__set_sample_bit(arm_spe_evsel, TIME); - perf_evsel__set_sample_bit(arm_spe_evsel, TID); + evsel__set_sample_bit(arm_spe_evsel, CPU); + evsel__set_sample_bit(arm_spe_evsel, TIME); + evsel__set_sample_bit(arm_spe_evsel, TID); /* Add dummy event to keep tracking */ err = parse_events(evlist, "dummy:u", NULL); @@ -134,9 +134,9 @@ static int arm_spe_recording_options(struct auxtrace_record *itr, tracking_evsel->core.attr.freq = 0; tracking_evsel->core.attr.sample_period = 1; - perf_evsel__set_sample_bit(tracking_evsel, TIME); - perf_evsel__set_sample_bit(tracking_evsel, CPU); - perf_evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); + evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, CPU); + evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); return 0; } diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index c7cc05dd2852..138d0058a6ab 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -219,7 +219,7 @@ static int intel_bts_recording_options(struct auxtrace_record *itr, * AUX event. 
*/ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(intel_bts_evsel, CPU); + evsel__set_sample_bit(intel_bts_evsel, CPU); } /* Add dummy event to keep tracking */ diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index fcc18501b9e4..4b4dffd2b4c4 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -420,8 +420,8 @@ static int intel_pt_track_switches(struct evlist *evlist) evsel = evlist__last(evlist); - perf_evsel__set_sample_bit(evsel, CPU); - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TIME); evsel->core.system_wide = true; evsel->no_aux_samples = true; @@ -729,10 +729,10 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, switch_evsel->no_aux_samples = true; switch_evsel->immediate = true; - perf_evsel__set_sample_bit(switch_evsel, TID); - perf_evsel__set_sample_bit(switch_evsel, TIME); - perf_evsel__set_sample_bit(switch_evsel, CPU); - perf_evsel__reset_sample_bit(switch_evsel, BRANCH_STACK); + evsel__set_sample_bit(switch_evsel, TID); + evsel__set_sample_bit(switch_evsel, TIME); + evsel__set_sample_bit(switch_evsel, CPU); + evsel__reset_sample_bit(switch_evsel, BRANCH_STACK); opts->record_switch_events = false; ptr->have_sched_switch = 3; @@ -766,7 +766,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, * AUX event. */ if (!perf_cpu_map__empty(cpus)) - perf_evsel__set_sample_bit(intel_pt_evsel, CPU); + evsel__set_sample_bit(intel_pt_evsel, CPU); } /* Add dummy event to keep tracking */ @@ -790,11 +790,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr, /* In per-cpu case, always need the time of mmap events etc */ if (!perf_cpu_map__empty(cpus)) { - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); /* And the CPU for switch events */ - perf_evsel__set_sample_bit(tracking_evsel, CPU); + evsel__set_sample_bit(tracking_evsel, CPU); } - perf_evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); + evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK); } /* diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 8db8fc9bddef..2cfb6e5da381 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -433,7 +433,7 @@ static int __cmd_annotate(struct perf_annotate *ann) total_nr_samples += nr_samples; hists__collapse_resort(hists, NULL); /* Don't sort callchain */ - perf_evsel__reset_sample_bit(pos, CALLCHAIN); + evsel__reset_sample_bit(pos, CALLCHAIN); perf_evsel__output_resort(pos, NULL); if (symbol_conf.event_group && diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 265682296836..9dc27770d848 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1052,7 +1052,7 @@ static void data_process(void) data__fprintf(); /* Don't sort callchain for perf diff */ - perf_evsel__reset_sample_bit(evsel_base, CALLCHAIN); + evsel__reset_sample_bit(evsel_base, CALLCHAIN); hists__process(hists_base); } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 858da896b518..20366ba14de1 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1033,16 +1033,16 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) struct perf_event_attr *attr = &pos->core.attr; /* make sure these *are* set */ - perf_evsel__set_sample_bit(pos, TID); - perf_evsel__set_sample_bit(pos, TIME); - perf_evsel__set_sample_bit(pos, CPU); - 
perf_evsel__set_sample_bit(pos, RAW); + evsel__set_sample_bit(pos, TID); + evsel__set_sample_bit(pos, TIME); + evsel__set_sample_bit(pos, CPU); + evsel__set_sample_bit(pos, RAW); /* make sure these are *not*; want as small a sample as possible */ - perf_evsel__reset_sample_bit(pos, PERIOD); - perf_evsel__reset_sample_bit(pos, IP); - perf_evsel__reset_sample_bit(pos, CALLCHAIN); - perf_evsel__reset_sample_bit(pos, ADDR); - perf_evsel__reset_sample_bit(pos, READ); + evsel__reset_sample_bit(pos, PERIOD); + evsel__reset_sample_bit(pos, IP); + evsel__reset_sample_bit(pos, CALLCHAIN); + evsel__reset_sample_bit(pos, ADDR); + evsel__reset_sample_bit(pos, READ); attr->mmap = 0; attr->comm = 0; attr->task = 0; diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 6367c8f6ca22..7a542f1c1c78 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -280,7 +280,7 @@ static int test1(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = false; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -427,7 +427,7 @@ static int test2(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = false; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -485,7 +485,7 @@ static int test3(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = false; symbol_conf.cumulate_callchain = true; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); callchain_register_param(&callchain_param); @@ -669,7 +669,7 @@ static int test4(struct evsel *evsel, struct machine *machine) symbol_conf.use_callchain = true; symbol_conf.cumulate_callchain = true; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + evsel__set_sample_bit(evsel, CALLCHAIN); setup_sorting(NULL); diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 5f4c0dbb4715..e9d530d7e7de 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -86,7 +86,7 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse } evsels[i]->core.attr.wakeup_events = 1; - perf_evsel__set_sample_id(evsels[i], false); + evsel__set_sample_id(evsels[i], false); evlist__add(evlist, evsels[i]); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 2195fc205e72..83adfd846ccd 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -106,9 +106,9 @@ int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unus * Config the evsels, setting attr->comm on the first one, etc. 
*/ evsel = evlist__first(evlist); - perf_evsel__set_sample_bit(evsel, CPU); - perf_evsel__set_sample_bit(evsel, TID); - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TID); + evsel__set_sample_bit(evsel, TIME); perf_evlist__config(evlist, &opts, NULL); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index fcb0d03dba4e..b08c8a0898d3 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -394,8 +394,8 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ switch_evsel = evlist__last(evlist); - perf_evsel__set_sample_bit(switch_evsel, CPU); - perf_evsel__set_sample_bit(switch_evsel, TIME); + evsel__set_sample_bit(switch_evsel, CPU); + evsel__set_sample_bit(switch_evsel, TIME); switch_evsel->core.system_wide = true; switch_evsel->no_aux_samples = true; @@ -412,8 +412,8 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ goto out_err; } - perf_evsel__set_sample_bit(cycles_evsel, CPU); - perf_evsel__set_sample_bit(cycles_evsel, TIME); + evsel__set_sample_bit(cycles_evsel, CPU); + evsel__set_sample_bit(cycles_evsel, TIME); /* Fourth event */ err = parse_events(evlist, "dummy:u", NULL); @@ -429,7 +429,7 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_ tracking_evsel->core.attr.freq = 0; tracking_evsel->core.attr.sample_period = 1; - perf_evsel__set_sample_bit(tracking_evsel, TIME); + evsel__set_sample_bit(tracking_evsel, TIME); /* Config events */ perf_evlist__config(evlist, &opts, NULL); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index efec868c74c3..ea11526bef07 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -702,10 +702,10 @@ static int auxtrace_validate_aux_sample_size(struct evlist *evlist, pr_err("Cannot add AUX area sampling because group leader is not an AUX area event\n"); return -EINVAL; } - perf_evsel__set_sample_bit(evsel, AUX); + evsel__set_sample_bit(evsel, AUX); opts->auxtrace_sample_mode = true; } else { - perf_evsel__reset_sample_bit(evsel, AUX); + evsel__reset_sample_bit(evsel, AUX); } } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 201cf1786f98..03a066ce1f6d 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1006,7 +1006,7 @@ void __perf_evlist__set_sample_bit(struct evlist *evlist, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) - __perf_evsel__set_sample_bit(evsel, bit); + __evsel__set_sample_bit(evsel, bit); } void __perf_evlist__reset_sample_bit(struct evlist *evlist, @@ -1015,7 +1015,7 @@ void __perf_evlist__reset_sample_bit(struct evlist *evlist, struct evsel *evsel; evlist__for_each_entry(evlist, evsel) - __perf_evsel__reset_sample_bit(evsel, bit); + __evsel__reset_sample_bit(evsel, bit); } int perf_evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 04938694da6e..e86db917bfb9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -184,7 +184,7 @@ void perf_evsel__calc_id_pos(struct evsel *evsel) evsel->is_pos = __perf_evsel__calc_is_pos(evsel->core.attr.sample_type); } -void __perf_evsel__set_sample_bit(struct evsel *evsel, +void __evsel__set_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit) { if (!(evsel->core.attr.sample_type & bit)) { @@ -194,7 
+194,7 @@ void __perf_evsel__set_sample_bit(struct evsel *evsel, } } -void __perf_evsel__reset_sample_bit(struct evsel *evsel, +void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit) { if (evsel->core.attr.sample_type & bit) { @@ -204,14 +204,14 @@ void __perf_evsel__reset_sample_bit(struct evsel *evsel, } } -void perf_evsel__set_sample_id(struct evsel *evsel, +void evsel__set_sample_id(struct evsel *evsel, bool can_sample_identifier) { if (can_sample_identifier) { - perf_evsel__reset_sample_bit(evsel, ID); - perf_evsel__set_sample_bit(evsel, IDENTIFIER); + evsel__reset_sample_bit(evsel, ID); + evsel__set_sample_bit(evsel, IDENTIFIER); } else { - perf_evsel__set_sample_bit(evsel, ID); + evsel__set_sample_bit(evsel, ID); } evsel->core.attr.read_format |= PERF_FORMAT_ID; } @@ -693,7 +693,7 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, bool function = perf_evsel__is_function_event(evsel); struct perf_event_attr *attr = &evsel->core.attr; - perf_evsel__set_sample_bit(evsel, CALLCHAIN); + evsel__set_sample_bit(evsel, CALLCHAIN); attr->sample_max_stack = param->max_stack; @@ -708,7 +708,7 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, "to get user callchain information. " "Falling back to framepointers.\n"); } else { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_NO_CYCLES | @@ -722,8 +722,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel, if (param->record_mode == CALLCHAIN_DWARF) { if (!function) { - perf_evsel__set_sample_bit(evsel, REGS_USER); - perf_evsel__set_sample_bit(evsel, STACK_USER); + evsel__set_sample_bit(evsel, REGS_USER); + evsel__set_sample_bit(evsel, STACK_USER); if (opts->sample_user_regs && DWARF_MINIMAL_REGS != PERF_REGS_MASK) { attr->sample_regs_user |= DWARF_MINIMAL_REGS; pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, " @@ -760,16 +760,16 @@ perf_evsel__reset_callgraph(struct evsel *evsel, { struct perf_event_attr *attr = &evsel->core.attr; - perf_evsel__reset_sample_bit(evsel, CALLCHAIN); + evsel__reset_sample_bit(evsel, CALLCHAIN); if (param->record_mode == CALLCHAIN_LBR) { - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_CALL_STACK | PERF_SAMPLE_BRANCH_HW_INDEX); } if (param->record_mode == CALLCHAIN_DWARF) { - perf_evsel__reset_sample_bit(evsel, REGS_USER); - perf_evsel__reset_sample_bit(evsel, STACK_USER); + evsel__reset_sample_bit(evsel, REGS_USER); + evsel__reset_sample_bit(evsel, STACK_USER); } } @@ -793,32 +793,32 @@ static void apply_config_terms(struct evsel *evsel, if (!(term->weak && opts->user_interval != ULLONG_MAX)) { attr->sample_period = term->val.period; attr->freq = 0; - perf_evsel__reset_sample_bit(evsel, PERIOD); + evsel__reset_sample_bit(evsel, PERIOD); } break; case PERF_EVSEL__CONFIG_TERM_FREQ: if (!(term->weak && opts->user_freq != UINT_MAX)) { attr->sample_freq = term->val.freq; attr->freq = 1; - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); } break; case PERF_EVSEL__CONFIG_TERM_TIME: if (term->val.time) - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, TIME); else - perf_evsel__reset_sample_bit(evsel, TIME); + evsel__reset_sample_bit(evsel, TIME); break; case 
PERF_EVSEL__CONFIG_TERM_CALLGRAPH: callgraph_buf = term->val.str; break; case PERF_EVSEL__CONFIG_TERM_BRANCH: if (term->val.str && strcmp(term->val.str, "no")) { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); parse_branch_str(term->val.str, &attr->branch_sample_type); } else - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); break; case PERF_EVSEL__CONFIG_TERM_STACK_USER: dump_size = term->val.stack_user; @@ -897,8 +897,8 @@ static void apply_config_terms(struct evsel *evsel, /* set perf-event callgraph */ if (param.enabled) { if (sample_address) { - perf_evsel__set_sample_bit(evsel, ADDR); - perf_evsel__set_sample_bit(evsel, DATA_SRC); + evsel__set_sample_bit(evsel, ADDR); + evsel__set_sample_bit(evsel, DATA_SRC); evsel->core.attr.mmap_data = track; } perf_evsel__config_callchain(evsel, opts, ¶m); @@ -965,17 +965,17 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, attr->inherit = !opts->no_inherit; attr->write_backward = opts->overwrite ? 1 : 0; - perf_evsel__set_sample_bit(evsel, IP); - perf_evsel__set_sample_bit(evsel, TID); + evsel__set_sample_bit(evsel, IP); + evsel__set_sample_bit(evsel, TID); if (evsel->sample_read) { - perf_evsel__set_sample_bit(evsel, READ); + evsel__set_sample_bit(evsel, READ); /* * We need ID even in case of single event, because * PERF_SAMPLE_READ process ID specific data. */ - perf_evsel__set_sample_id(evsel, false); + evsel__set_sample_id(evsel, false); /* * Apply group format only if we belong to group @@ -994,7 +994,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (!attr->sample_period || (opts->user_freq != UINT_MAX || opts->user_interval != ULLONG_MAX)) { if (opts->freq) { - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); attr->freq = 1; attr->sample_freq = opts->freq; } else { @@ -1033,7 +1033,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, } if (opts->sample_address) { - perf_evsel__set_sample_bit(evsel, ADDR); + evsel__set_sample_bit(evsel, ADDR); attr->mmap_data = track; } @@ -1050,16 +1050,16 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (opts->sample_intr_regs && !evsel->no_aux_samples) { attr->sample_regs_intr = opts->sample_intr_regs; - perf_evsel__set_sample_bit(evsel, REGS_INTR); + evsel__set_sample_bit(evsel, REGS_INTR); } if (opts->sample_user_regs && !evsel->no_aux_samples) { attr->sample_regs_user |= opts->sample_user_regs; - perf_evsel__set_sample_bit(evsel, REGS_USER); + evsel__set_sample_bit(evsel, REGS_USER); } if (target__has_cpu(&opts->target) || opts->sample_cpu) - perf_evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, CPU); /* * When the user explicitly disabled time don't force it here. 
@@ -1068,31 +1068,31 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, (!perf_missing_features.sample_id_all && (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu || opts->sample_time_set))) - perf_evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples && !evsel->no_aux_samples) { - perf_evsel__set_sample_bit(evsel, TIME); - perf_evsel__set_sample_bit(evsel, RAW); - perf_evsel__set_sample_bit(evsel, CPU); + evsel__set_sample_bit(evsel, TIME); + evsel__set_sample_bit(evsel, RAW); + evsel__set_sample_bit(evsel, CPU); } if (opts->sample_address) - perf_evsel__set_sample_bit(evsel, DATA_SRC); + evsel__set_sample_bit(evsel, DATA_SRC); if (opts->sample_phys_addr) - perf_evsel__set_sample_bit(evsel, PHYS_ADDR); + evsel__set_sample_bit(evsel, PHYS_ADDR); if (opts->no_buffering) { attr->watermark = 0; attr->wakeup_events = 1; } if (opts->branch_stack && !evsel->no_aux_samples) { - perf_evsel__set_sample_bit(evsel, BRANCH_STACK); + evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = opts->branch_stack; } if (opts->sample_weight) - perf_evsel__set_sample_bit(evsel, WEIGHT); + evsel__set_sample_bit(evsel, WEIGHT); attr->task = track; attr->mmap = track; @@ -1106,14 +1106,14 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (opts->record_cgroup) { attr->cgroup = track && !perf_missing_features.cgroup; - perf_evsel__set_sample_bit(evsel, CGROUP); + evsel__set_sample_bit(evsel, CGROUP); } if (opts->record_switch_events) attr->context_switch = track; if (opts->sample_transaction) - perf_evsel__set_sample_bit(evsel, TRANSACTION); + evsel__set_sample_bit(evsel, TRANSACTION); if (opts->running_time) { evsel->core.attr.read_format |= @@ -1176,9 +1176,9 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, /* The --period option takes the precedence. */ if (opts->period_set) { if (opts->period) - perf_evsel__set_sample_bit(evsel, PERIOD); + evsel__set_sample_bit(evsel, PERIOD); else - perf_evsel__reset_sample_bit(evsel, PERIOD); + evsel__reset_sample_bit(evsel, PERIOD); } /* @@ -1187,7 +1187,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, * if BRANCH_STACK bit is set. 
*/ if (opts->initial_delay && is_dummy_event(evsel)) - perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); + evsel__reset_sample_bit(evsel, BRANCH_STACK); } int perf_evsel__set_filter(struct evsel *evsel, const char *filter) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 9d6050831eca..1c9d021c0304 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -213,19 +213,16 @@ const char *perf_evsel__name(struct evsel *evsel); const char *perf_evsel__group_name(struct evsel *evsel); int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size); -void __perf_evsel__set_sample_bit(struct evsel *evsel, - enum perf_event_sample_format bit); -void __perf_evsel__reset_sample_bit(struct evsel *evsel, - enum perf_event_sample_format bit); +void __evsel__set_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit); +void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_format bit); -#define perf_evsel__set_sample_bit(evsel, bit) \ - __perf_evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit) +#define evsel__set_sample_bit(evsel, bit) \ + __evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit) -#define perf_evsel__reset_sample_bit(evsel, bit) \ - __perf_evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit) +#define evsel__reset_sample_bit(evsel, bit) \ + __evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit) -void perf_evsel__set_sample_id(struct evsel *evsel, - bool use_sample_identifier); +void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier); int perf_evsel__set_filter(struct evsel *evsel, const char *filter); int perf_evsel__append_tp_filter(struct evsel *evsel, const char *filter); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 5084ad6db1b3..24e3c867d3a7 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -62,7 +62,7 @@ void perf_evlist__config(struct evlist *evlist, struct record_opts *opts, if (sample_id) { evlist__for_each_entry(evlist, evsel) - perf_evsel__set_sample_id(evsel, use_sample_identifier); + evsel__set_sample_id(evsel, use_sample_identifier); } perf_evlist__set_id_pos(evlist); diff --git a/tools/perf/util/sideband_evlist.c b/tools/perf/util/sideband_evlist.c index 1d6f470d64c4..c51a8dfce08b 100644 --- a/tools/perf/util/sideband_evlist.c +++ b/tools/perf/util/sideband_evlist.c @@ -108,7 +108,7 @@ int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target) bool can_sample_identifier = perf_can_sample_identifier(); evlist__for_each_entry(evlist, counter) - perf_evsel__set_sample_id(counter, can_sample_identifier); + evsel__set_sample_id(counter, can_sample_identifier); perf_evlist__set_id_pos(evlist); } -- Gitee From b00e7a2776195941bd2ef22ba65d469c172be9fb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Oct 2020 06:57:46 -0700 Subject: [PATCH 160/257] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE commit 8d97e71811aaafe4abf611dc24822fd6e73df1a1 upstream Current perf can report both virtual addresses and physical addresses, but not the MMU page size. Without the MMU page size information of the utilized page, users cannot decide whether to promote/demote large pages to optimize memory usage. Add a new sample type for the data MMU page size. Current perf already has a facility to collect data virtual addresses. A page walker is required to walk the pages tables and calculate the MMU page size from a given virtual address. On some platforms, e.g., X86, the page walker is invoked in an NMI handler. So the page walker must be NMI-safe and low overhead. 
Besides, the page walker must work for both user and kernel virtual addresses. The existing generic page walker, e.g., walk_page_range_novma(), is somewhat complex and is not guaranteed to be NMI-safe, while follow_page() only handles user virtual addresses. Add a new function perf_get_page_size() to walk the page tables and calculate the MMU page size. In the function: - Interrupts have to be disabled to prevent any teardown of the page tables. - For user space threads, current->mm is used for the page walk. For kernel threads and the like, current->mm is NULL, so init_mm is used instead. active_mm is not used here because it can be NULL. Quote from Peter Zijlstra, "context_switch() can set prev->active_mm to NULL when it transfers it to @next. It does this before @current is updated. So an NMI that comes in between this active_mm swizzling and updating @current will see !active_mm." - The MMU page size is calculated from the page table level. The method should work for all architectures, but it has only been verified on X86. If there are perf-supporting architectures where the method doesn't work, they can be fixed later separately; reporting a wrong page size is not fatal for the architecture. Some features still under discussion may affect the method in the future. Quote from Dave Hansen, "There are lots of weird things folks are trying to do with the page tables, like Address Space Isolation. For instance, if you get a perf NMI when running userspace, current->mm->pgd is *different* than the PGD that was in use when userspace was running. It's close enough today, but it might not stay that way." If that case arises later, the symptom would be long runs of consecutive page walk errors; in the worst case many page-size '0' values are returned, which again is not fatal. The perf tool implements a check to detect this situation, and if it ever triggers, a kernel patch can be made accordingly.
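A hedged sketch of the userspace side, assuming a kernel with this patch; the cycles event is a stand-in (a memory event would be typical) and open_sample_with_page_size() is a made-up helper. The kernel resolves the size from the sampled data address, which it collects even when PERF_SAMPLE_ADDR itself is not requested, as the perf_prepare_sample() comment below explains:

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/perf_event.h>

  static int open_sample_with_page_size(void)
  {
          struct perf_event_attr attr = {
                  .size          = sizeof(attr),
                  .type          = PERF_TYPE_HARDWARE,
                  .config        = PERF_COUNT_HW_CPU_CYCLES,
                  .sample_period = 4000,
                  .precise_ip    = 2,   /* data addresses want precise samples */
                  /* each sample gains a u64 MMU page size; 0 means unknown */
                  .sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                                   PERF_SAMPLE_DATA_PAGE_SIZE,
          };

          /* pid 0, cpu -1: profile the calling process on any CPU */
          return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
  }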
Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.liang@linux.intel.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +- kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3f4f867a26d1..6ed8f63260cb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,6 +1028,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; + u64 data_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dc33e3051819..c413bca7971f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,8 +143,9 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, - PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -878,6 +879,7 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 644dcf01d171..3e30058c96ce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "internal.h" @@ -1763,6 +1764,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + event->header_size = size; } @@ -6700,6 +6704,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6757,6 +6764,94 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +#ifdef CONFIG_MMU + +/* + * Return the MMU page size of a given virtual address + */ +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_leaf(*p4d)) + return 1ULL << P4D_SHIFT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return 0; + + if (pud_leaf(*pud)) + return 1ULL << PUD_SHIFT; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_leaf(*pmd)) + return 1ULL << PMD_SHIFT; + + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + pte_unmap(pte); + return PAGE_SIZE; +} + +#else + +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +#endif + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers 
must disable IRQs, + * which prevents any tear down of the page tables. + */ + local_irq_save(flags); + + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. + */ + mm = &init_mm; + } + + size = __perf_get_page_size(mm, addr); + + local_irq_restore(flags); + + return size; +} + static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * @@ -6912,6 +7007,14 @@ void perf_prepare_sample(struct perf_event_header *header, } #endif + /* + * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't + * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, + * but the value will not dump to the userspace. + */ + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- Gitee From 86ef86c8404dd7dde111d7db09372a5dad9175bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 30 Nov 2020 09:27:52 -0800 Subject: [PATCH 161/257] tools headers UAPI: Update tools's copy of linux/perf_event.h commit 47d982202f8cfaac6f208c9109fa15cb6a0181f7 upstream To get the changes in: commit 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE") commit 995f088efebe ("perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE") This silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Stephane Eranian Cc: Will Deacon Link: http://lore.kernel.org/lkml/20201130172803.2676-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/include/uapi/linux/perf_event.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index bc8c4816ba38..74bf68b3f16b 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -143,8 +143,10 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, - PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -878,6 +880,8 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, -- Gitee From c736c5357d3281443808e1b3209ed744554aeec4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 25 Mar 2020 21:45:30 +0900 Subject: [PATCH 162/257] perf tools: Basic support for CGROUP event commit ba78c1c5461c2fc2f57b777e971b3a9ec0df5666 upstream Implement basic functionality to support cgroup tracking. Each cgroup can be identified by inode number which can be read from userspace too. The actual cgroup processing will come in the later patch. [Backport changes] 1. 
In the upstream commit, the event.h file is located at tools/lib/perf/include/perf/event.h, whereas in the current kernel it resides at tools/perf/lib/include/perf/event.h. We skipped a patch that moved these files (and their content) from tools/perf/lib/* to tools/lib/perf/* because applying it caused significant conflicts and additional deviations. Since that patch only involved file moves without any functional changes, there is no change in the code apart from the path. So the changes from upstream were incorporated directly into the files under tools/perf/lib/include/perf/event.h. 2. The patch also shows differences due to a change in file order between the upstream commit and the backported patch. In the upstream commit, the file order is 1.tools/lib/perf/include/perf/event.h 2.tools/perf/builtin-diff.c 3.tools/perf/builtin-report.c 4.... whereas in the backported patch, the file order is 1.tools/perf/builtin-diff.c 2.tools/perf/builtin-report.c 3.tools/perf/lib/include/perf/event.h 4.... All upstream changes have been correctly backported to the respective files. However, the diff tool compares files on a line-by-line basis; thus, the changed order causes it to report deviations even though there are no actual code differences in the files builtin-diff.c and builtin-report.c. Reported-by: kernel test robot Signed-off-by: Namhyung Kim Cc: Adrian Hunter [ fix perf test failure on sampling parsing ] Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200325124536.2800725-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/builtin-diff.c | 1 + tools/perf/builtin-report.c | 1 + tools/perf/lib/include/perf/event.h | 7 +++++++ tools/perf/tests/sample-parsing.c | 6 +++++- tools/perf/util/event.c | 18 ++++++++++++++++++ tools/perf/util/event.h | 6 ++++++ tools/perf/util/evsel.c | 6 ++++++ tools/perf/util/machine.c | 12 ++++++++++++ tools/perf/util/machine.h | 3 +++ tools/perf/util/perf_event_attr_fprintf.c | 2 ++ tools/perf/util/session.c | 4 ++++ tools/perf/util/synthetic-events.c | 8 ++++++++ tools/perf/util/tool.h | 1 + 13 files changed, 74 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 9dc27770d848..48ce1f99c1ed 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -448,6 +448,7 @@ static struct perf_diff pdiff = { .fork = perf_event__process_fork, .lost = perf_event__process_lost, .namespaces = perf_event__process_namespaces, + .cgroup = perf_event__process_cgroup, .ordered_events = true, .ordering_requires_timestamps = true, }, diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index dc228bdf2bbc..f5c5e0cdfcb3 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1050,6 +1050,7 @@ int cmd_report(int argc, const char **argv) .mmap2 = perf_event__process_mmap2, .comm = perf_event__process_comm, .namespaces = perf_event__process_namespaces, + .cgroup = perf_event__process_cgroup, .exit = perf_event__process_exit, .fork = perf_event__process_fork, .lost = perf_event__process_lost, diff --git a/tools/perf/lib/include/perf/event.h b/tools/perf/lib/include/perf/event.h index 18106899cb4e..69b44d2cc0f5 100644 --- a/tools/perf/lib/include/perf/event.h +++ b/tools/perf/lib/include/perf/event.h @@ -105,6 +105,12 @@ struct perf_record_bpf_event { __u8 tag[BPF_TAG_SIZE]; // prog tag }; +struct perf_record_cgroup { + struct perf_event_header header; + __u64 id;
+ char path[PATH_MAX]; +}; + struct perf_record_sample { struct perf_event_header header; __u64 array[]; @@ -352,6 +358,7 @@ union perf_event { struct perf_record_mmap2 mmap2; struct perf_record_comm comm; struct perf_record_namespaces namespaces; + struct perf_record_cgroup cgroup; struct perf_record_fork fork; struct perf_record_lost lost; struct perf_record_lost_samples lost_samples; diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 2a01dd1d725c..12b8278a0e35 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -150,6 +150,9 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_PHYS_ADDR) COMP(phys_addr); + if (type & PERF_SAMPLE_CGROUP) + COMP(cgroup); + if (type & PERF_SAMPLE_AUX) { COMP(aux_sample.size); if (memcmp(s1->aux_sample.data, s2->aux_sample.data, @@ -228,6 +231,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .regs = regs, }, .phys_addr = 113, + .cgroup = 114, .aux_sample = { .size = sizeof(aux_data), .data = (void *)aux_data, @@ -331,7 +335,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u * were added. Please actually update the test rather than just change * the condition below. */ - if (PERF_SAMPLE_MAX > PERF_SAMPLE_AUX << 1) { + if (PERF_SAMPLE_MAX > PERF_SAMPLE_CGROUP << 1) { pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n"); return -1; } diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index bfaa9afdb8b4..49b809419a12 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -54,6 +54,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_NAMESPACES] = "NAMESPACES", [PERF_RECORD_KSYMBOL] = "KSYMBOL", [PERF_RECORD_BPF_EVENT] = "BPF_EVENT", + [PERF_RECORD_CGROUP] = "CGROUP", [PERF_RECORD_HEADER_ATTR] = "ATTR", [PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE", [PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA", @@ -180,6 +181,12 @@ size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp) return ret; } +size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp) +{ + return fprintf(fp, " cgroup: %" PRI_lu64 " %s\n", + event->cgroup.id, event->cgroup.path); +} + int perf_event__process_comm(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, @@ -196,6 +203,14 @@ int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused, return machine__process_namespaces_event(machine, event, sample); } +int perf_event__process_cgroup(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + return machine__process_cgroup_event(machine, event, sample); +} + int perf_event__process_lost(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, @@ -417,6 +432,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp) case PERF_RECORD_NAMESPACES: ret += perf_event__fprintf_namespaces(event, fp); break; + case PERF_RECORD_CGROUP: + ret += perf_event__fprintf_cgroup(event, fp); + break; case PERF_RECORD_MMAP2: ret += perf_event__fprintf_mmap2(event, fp); break; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 85223159737c..0ad3ba22817d 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -135,6 +135,7 @@ struct perf_sample { u32 raw_size; u64 data_src; u64 phys_addr; + u64 cgroup; u32 flags; u16 insn_len; u8 cpumode; @@ 
-321,6 +322,10 @@ int perf_event__process_namespaces(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine); +int perf_event__process_cgroup(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); int perf_event__process_mmap(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -376,6 +381,7 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp); size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp); size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp); size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_cgroup(union perf_event *event, FILE *fp); size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp); size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp); size_t perf_event__fprintf(union perf_event *event, FILE *fp); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index e86db917bfb9..cfa91984bb29 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2245,6 +2245,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, array++; } + data->cgroup = 0; + if (type & PERF_SAMPLE_CGROUP) { + data->cgroup = *array; + array++; + } + if (type & PERF_SAMPLE_AUX) { OVERFLOW_CHECK_u64(array); sz = *array++; diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 8c3addc2e9e1..c02ec8fcce55 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -650,6 +650,16 @@ int machine__process_namespaces_event(struct machine *machine __maybe_unused, return err; } +int machine__process_cgroup_event(struct machine *machine __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused) +{ + if (dump_trace) + perf_event__fprintf_cgroup(event, stdout); + + return 0; +} + int machine__process_lost_event(struct machine *machine __maybe_unused, union perf_event *event, struct perf_sample *sample __maybe_unused) { @@ -1885,6 +1895,8 @@ int machine__process_event(struct machine *machine, union perf_event *event, ret = machine__process_mmap_event(machine, event, sample); break; case PERF_RECORD_NAMESPACES: ret = machine__process_namespaces_event(machine, event, sample); break; + case PERF_RECORD_CGROUP: + ret = machine__process_cgroup_event(machine, event, sample); break; case PERF_RECORD_MMAP2: ret = machine__process_mmap2_event(machine, event, sample); break; case PERF_RECORD_FORK: diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 18e13c0ccd6a..4f734337661a 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -127,6 +127,9 @@ int machine__process_switch_event(struct machine *machine, int machine__process_namespaces_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); +int machine__process_cgroup_event(struct machine *machine, + union perf_event *event, + struct perf_sample *sample); int machine__process_mmap_event(struct machine *machine, union perf_event *event, struct perf_sample *sample); int machine__process_mmap2_event(struct machine *machine, union perf_event *event, diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 355d3458d4e6..b94fa07f5d32 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -35,6 +35,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) 
bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(AUX), + bit_name(CGROUP), { .name = NULL, } }; #undef bit_name @@ -132,6 +133,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, PRINT_ATTRf(ksymbol, p_unsigned); PRINT_ATTRf(bpf_event, p_unsigned); PRINT_ATTRf(aux_output, p_unsigned); + PRINT_ATTRf(cgroup, p_unsigned); PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); PRINT_ATTRf(bp_type, p_unsigned); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 17caadb33e6c..f163e408353a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -467,6 +467,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->comm = process_event_stub; if (tool->namespaces == NULL) tool->namespaces = process_event_stub; + if (tool->cgroup == NULL) + tool->cgroup = process_event_stub; if (tool->fork == NULL) tool->fork = process_event_stub; if (tool->exit == NULL) @@ -1431,6 +1433,8 @@ static int machines__deliver_event(struct machines *machines, return tool->comm(tool, event, sample, machine); case PERF_RECORD_NAMESPACES: return tool->namespaces(tool, event, sample, machine); + case PERF_RECORD_CGROUP: + return tool->cgroup(tool, event, sample, machine); case PERF_RECORD_FORK: return tool->fork(tool, event, sample, machine); case PERF_RECORD_EXIT: diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index e357f2b268de..7f967c274b04 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1228,6 +1228,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, if (type & PERF_SAMPLE_PHYS_ADDR) result += sizeof(u64); + if (type & PERF_SAMPLE_CGROUP) + result += sizeof(u64); + if (type & PERF_SAMPLE_AUX) { result += sizeof(u64); result += sample->aux_sample.size; @@ -1401,6 +1404,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo array++; } + if (type & PERF_SAMPLE_CGROUP) { + *array = sample->cgroup; + array++; + } + if (type & PERF_SAMPLE_AUX) { sz = sample->aux_sample.size; *array++ = sz; diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 2abbf668b8de..472ef5eb4068 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -46,6 +46,7 @@ struct perf_tool { mmap2, comm, namespaces, + cgroup, fork, exit, lost, -- Gitee From a7cf7645a47561a4154db56d2bf4a889481834d3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 30 Nov 2020 09:27:53 -0800 Subject: [PATCH 163/257] perf record: Support new sample type for data page size commit 542b88fd12769bf5be307b11ca0f94a6140bba82 upstream Support new sample type PERF_SAMPLE_DATA_PAGE_SIZE for page size. Add new option --data-page-size to record sample data page size. 
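Under the hood the new option just sets one more bit in attr.sample_type before the event is opened. As a rough illustration (not part of this patch, and assuming uapi headers new enough to define PERF_SAMPLE_DATA_PAGE_SIZE), the equivalent raw perf_event_open() setup would look like the sketch below; PERF_SAMPLE_ADDR is requested as well because the page size is looked up from the sampled data address:

    /* Hedged sketch: open one sampling event that asks the kernel for
     * the data page size of each sampled address. Error handling and
     * ring-buffer mmap are omitted. */
    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    static int open_data_page_size_event(void)
    {
    	struct perf_event_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);
    	attr.type = PERF_TYPE_HARDWARE;
    	attr.config = PERF_COUNT_HW_CPU_CYCLES;
    	attr.sample_period = 100003;
    	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
    			   PERF_SAMPLE_DATA_PAGE_SIZE;

    	/* pid 0 = self, cpu -1 = any, no group leader, no flags */
    	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }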
Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Stephane Eranian Cc: Will Deacon Link: http://lore.kernel.org/lkml/20201130172803.2676-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-record.txt | 3 +++ tools/perf/builtin-record.c | 2 ++ tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 9 +++++++++ tools/perf/util/perf_event_attr_fprintf.c | 2 +- tools/perf/util/record.h | 1 + tools/perf/util/synthetic-events.c | 8 ++++++++ 7 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 63d9ac3995c2..91b4ffe3b914 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -287,6 +287,9 @@ OPTIONS --phys-data:: Record the sample physical addresses. +--data-page-size:: + Record the sampled data address data page size. + -T:: --timestamp:: Record the sample timestamps. Use it with 'perf report -D' to see the diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3ab5be1b7c52..33de5d56e91c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2203,6 +2203,8 @@ static struct option __record_options[] = { OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, "Record the sample physical addresses"), + OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size, + "Record the sampled data address data page size"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 0ad3ba22817d..01b0cb05db8f 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -135,6 +135,7 @@ struct perf_sample { u32 raw_size; u64 data_src; u64 phys_addr; + u64 data_page_size; u64 cgroup; u32 flags; u16 insn_len; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cfa91984bb29..52af2a1a978c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1109,6 +1109,9 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, evsel__set_sample_bit(evsel, CGROUP); } + if (opts->sample_data_page_size) + evsel__set_sample_bit(evsel, DATA_PAGE_SIZE); + if (opts->record_switch_events) attr->context_switch = track; @@ -2251,6 +2254,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, array++; } + data->data_page_size = 0; + if (type & PERF_SAMPLE_DATA_PAGE_SIZE) { + data->data_page_size = *array; + array++; + } + if (type & PERF_SAMPLE_AUX) { OVERFLOW_CHECK_u64(array); sz = *array++; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index b94fa07f5d32..68188c7c188a 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -35,7 +35,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(AUX), - bit_name(CGROUP), + bit_name(CGROUP), bit_name(DATA_PAGE_SIZE), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/record.h 
b/tools/perf/util/record.h index ae3db7a272be..99034d644322 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -22,6 +22,7 @@ struct record_opts { bool raw_samples; bool sample_address; bool sample_phys_addr; + bool sample_data_page_size; bool sample_weight; bool sample_time; bool sample_time_set; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 7f967c274b04..41df28804296 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1231,6 +1231,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, if (type & PERF_SAMPLE_CGROUP) result += sizeof(u64); + if (type & PERF_SAMPLE_DATA_PAGE_SIZE) + result += sizeof(u64); + if (type & PERF_SAMPLE_AUX) { result += sizeof(u64); result += sample->aux_sample.size; @@ -1409,6 +1412,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo array++; } + if (type & PERF_SAMPLE_DATA_PAGE_SIZE) { + *array = sample->data_page_size; + array++; + } + if (type & PERF_SAMPLE_AUX) { sz = sample->aux_sample.size; *array++ = sz; -- Gitee From e4ac2046c8e76af3f1480f7dad6014e34f96eec2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 7 Dec 2020 14:04:05 -0300 Subject: [PATCH 164/257] perf evsel: Emit warning about kernel not supporting the data page size sample_type bit commit 456ef4c11c06f0b8c53acaf796d77d2033f079f2 upstream Before we had this unhelpful message: $ perf record --data-page-size sleep 1 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cycles:u). /bin/dmesg | grep -i perf may provide additional information. $ Add support to the perf_missing_features variable to remember what caused evsel__open() to fail and then use that information in evsel__open_strerror(). $ perf record --data-page-size sleep 1 Error: Asking for the data page size isn't supported by this kernel. $ Cc: Kan Liang Cc: Namhyung Kim Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Stephane Eranian Cc: Will Deacon Link: http://lore.kernel.org/lkml/20201207170759.GB129853@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/util/evsel.c | 9 ++++++++- tools/perf/util/evsel.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 52af2a1a978c..8c92521a7e42 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1785,7 +1785,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. 
*/ - if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) { + if (!perf_missing_features.data_page_size && + (evsel->core.attr.sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)) { + perf_missing_features.data_page_size = true; + pr_debug2_peo("Kernel has no PERF_SAMPLE_DATA_PAGE_SIZE support, bailing out\n"); + goto out_close; + } else if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) { perf_missing_features.cgroup = true; pr_debug2_peo("Kernel has no cgroup sampling support, bailing out\n"); goto out_close; @@ -2560,6 +2565,8 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, "We found oprofile daemon running, please stop it and try again."); break; case EINVAL: + if (evsel->core.attr.sample_type & PERF_SAMPLE_DATA_PAGE_SIZE && perf_missing_features.data_page_size) + return scnprintf(msg, size, "Asking for the data page size isn't supported by this kernel."); if (evsel->core.attr.write_backward && perf_missing_features.write_backward) return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel."); if (perf_missing_features.clockid) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 1c9d021c0304..4f42464daac4 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -125,6 +125,7 @@ struct perf_missing_features { bool aux_output; bool branch_hw_idx; bool cgroup; + bool data_page_size; }; extern struct perf_missing_features perf_missing_features; -- Gitee From 9ef0cc93dc5443dfb62c9ebcd15f7025b8575815 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 5 Jan 2021 11:57:49 -0800 Subject: [PATCH 165/257] perf record: Add support for PERF_SAMPLE_CODE_PAGE_SIZE commit c1de7f3d84ca324c7cda85c3ce27b11741af2124 upstream Adds the infrastructure to sample the code address page size. Introduce a new --code-page-size option for perf record. Signed-off-by: Kan Liang Originally-by: Stephane Eranian Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Ingo Molnar Cc: Mark Rutland Cc: Michael Ellerman Cc: Will Deacon Link: https://lore.kernel.org/r/20210105195752.43489-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-record.txt | 3 +++ tools/perf/builtin-record.c | 2 ++ tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 18 +++++++++++++++++- tools/perf/util/evsel.h | 1 + tools/perf/util/perf_event_attr_fprintf.c | 2 +- tools/perf/util/record.h | 1 + tools/perf/util/synthetic-events.c | 8 ++++++++ 8 files changed, 34 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 91b4ffe3b914..5a666750bd4b 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -290,6 +290,9 @@ OPTIONS --data-page-size:: Record the sampled data address data page size. +--code-page-size:: + Record the sampled code address (ip) page size + -T:: --timestamp:: Record the sample timestamps. 
Use it with 'perf report -D' to see the diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 33de5d56e91c..1d9c1dc20789 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -2205,6 +2205,8 @@ static struct option __record_options[] = { "Record the sample physical addresses"), OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size, "Record the sampled data address data page size"), + OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size, + "Record the sampled code address (ip) page size"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 01b0cb05db8f..c8d31205a3dd 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -136,6 +136,7 @@ struct perf_sample { u64 data_src; u64 phys_addr; u64 data_page_size; + u64 code_page_size; u64 cgroup; u32 flags; u16 insn_len; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 8c92521a7e42..0f08f9577d4c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1112,6 +1112,9 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, if (opts->sample_data_page_size) evsel__set_sample_bit(evsel, DATA_PAGE_SIZE); + if (opts->sample_code_page_size) + evsel__set_sample_bit(evsel, CODE_PAGE_SIZE); + if (opts->record_switch_events) attr->context_switch = track; @@ -1785,7 +1788,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. */ - if (!perf_missing_features.data_page_size && + if (!perf_missing_features.code_page_size && + (evsel->core.attr.sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)) { + perf_missing_features.code_page_size = true; + pr_debug2_peo("Kernel has no PERF_SAMPLE_CODE_PAGE_SIZE support, bailing out\n"); + goto out_close; + } else if (!perf_missing_features.data_page_size && (evsel->core.attr.sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)) { perf_missing_features.data_page_size = true; pr_debug2_peo("Kernel has no PERF_SAMPLE_DATA_PAGE_SIZE support, bailing out\n"); @@ -2265,6 +2273,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, array++; } + data->code_page_size = 0; + if (type & PERF_SAMPLE_CODE_PAGE_SIZE) { + data->code_page_size = *array; + array++; + } + if (type & PERF_SAMPLE_AUX) { OVERFLOW_CHECK_u64(array); sz = *array++; @@ -2565,6 +2579,8 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, "We found oprofile daemon running, please stop it and try again."); break; case EINVAL: + if (evsel->core.attr.sample_type & PERF_SAMPLE_CODE_PAGE_SIZE && perf_missing_features.code_page_size) + return scnprintf(msg, size, "Asking for the code page size isn't supported by this kernel."); if (evsel->core.attr.sample_type & PERF_SAMPLE_DATA_PAGE_SIZE && perf_missing_features.data_page_size) return scnprintf(msg, size, "Asking for the data page size isn't supported by this kernel."); if (evsel->core.attr.write_backward && perf_missing_features.write_backward) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4f42464daac4..4d7f46c785a1 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -126,6 +126,7 @@ struct perf_missing_features { bool branch_hw_idx; bool cgroup; bool data_page_size; + bool code_page_size; }; extern struct perf_missing_features 
perf_missing_features; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index 68188c7c188a..a97a95f3b1be 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -35,7 +35,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(AUX), - bit_name(CGROUP), bit_name(DATA_PAGE_SIZE), + bit_name(CGROUP), bit_name(DATA_PAGE_SIZE), bit_name(CODE_PAGE_SIZE), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h index 99034d644322..e6a552a7882e 100644 --- a/tools/perf/util/record.h +++ b/tools/perf/util/record.h @@ -23,6 +23,7 @@ struct record_opts { bool sample_address; bool sample_phys_addr; bool sample_data_page_size; + bool sample_code_page_size; bool sample_weight; bool sample_time; bool sample_time_set; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 41df28804296..f78edda3986e 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1234,6 +1234,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, if (type & PERF_SAMPLE_DATA_PAGE_SIZE) result += sizeof(u64); + if (type & PERF_SAMPLE_CODE_PAGE_SIZE) + result += sizeof(u64); + if (type & PERF_SAMPLE_AUX) { result += sizeof(u64); result += sample->aux_sample.size; @@ -1417,6 +1420,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo array++; } + if (type & PERF_SAMPLE_CODE_PAGE_SIZE) { + *array = sample->code_page_size; + array++; + } + if (type & PERF_SAMPLE_AUX) { sz = sample->aux_sample.size; *array++ = sz; -- Gitee From bf4a45812918a961f24583bb12ffc20d089dea98 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 1 Oct 2020 06:57:49 -0700 Subject: [PATCH 166/257] perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE commit 995f088efebe1eba0282a6ffa12411b37f8990c2 upstream When studying code layout, it is useful to capture the page size of the sampled code address. Add a new sample type for code page size. The new sample type requires collecting the ip. The code page size can be calculated from the NMI-safe perf_get_page_size(). For large PEBS, it's very unlikely that the mapping is gone for the earlier PEBS records. Enable the feature for the large PEBS. The worst case is that page-size '0' is returned. 
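Together with PERF_SAMPLE_DATA_PAGE_SIZE, a consumer walking a PERF_RECORD_SAMPLE now finds the two page-size words at the tail of the record, in ABI order, before any AUX payload. A minimal decode sketch (names are illustrative; it assumes every earlier sample_type field has already been consumed):

    #include <linux/perf_event.h>	/* __u64, PERF_SAMPLE_* */

    /* 'p' points just past the previously defined sample fields. */
    static const __u64 *decode_page_sizes(const __u64 *p, __u64 sample_type,
    				      __u64 *data_psize, __u64 *code_psize)
    {
    	*data_psize = 0;
    	*code_psize = 0;

    	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
    		*data_psize = *p++;	/* 0 if the mapping was gone */
    	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
    		*code_psize = *p++;

    	return p;	/* PERF_SAMPLE_AUX data follows, if requested */
    }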
Signed-off-by: Kan Liang Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-5-kan.liang@linux.intel.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 11 ++++++++++- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 232db8465cfe..9a377bd754f9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -134,7 +134,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6ed8f63260cb..78c8abdad2e3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1029,6 +1029,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; u64 data_page_size; + u64 code_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c413bca7971f..625f5ba89283 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -144,8 +144,9 @@ enum perf_event_sample_format { PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, - PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -880,6 +881,7 @@ enum perf_event_type { * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e30058c96ce..b7b3a9a361dd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1767,6 +1767,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -6707,6 +6710,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6887,7 +6893,7 @@ void perf_prepare_sample(struct perf_event_header *header, __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_IP) + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { @@ -7015,6 +7021,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + data->code_page_size = perf_get_page_size(data->ip); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- Gitee From 
0d827c8537dbee0eb53208d7e87668c9cf225d94 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Oct 2020 06:57:47 -0700 Subject: [PATCH 167/257] perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE commit 76a5433f95f32d8a17c9f836be2084ed947c466b upstream The new sample type, PERF_SAMPLE_DATA_PAGE_SIZE, requires the virtual address. Update the data->addr if the sample type is set. The large PEBS is disabled with the sample type, because perf doesn't support munmap tracking yet. The PEBS buffer for large PEBS cannot be flushed for each munmap. A wrong page size may be calculated. The large PEBS can be enabled later separately when munmap tracking is supported. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-3-kan.liang@linux.intel.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- arch/x86/events/intel/ds.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7c7319154e1b..09b0c637833c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -963,7 +963,8 @@ static void adaptive_pebs_record_size_update(void) #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_DATA_PAGE_SIZE) static u64 pebs_update_adaptive_cfg(struct perf_event *event) { @@ -1339,6 +1340,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_DATA_PAGE_SIZE) + static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -1453,7 +1458,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, } - if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && + if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; @@ -1581,7 +1586,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = get_data_src(event, meminfo->aux); - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; if (sample_type & PERF_SAMPLE_TRANSACTION) -- Gitee From 35318b1b35f591b4c613aa3a28e7447a3dc1b621 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:07 -0800 Subject: [PATCH 168/257] perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT commit 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c upstream The current PERF_SAMPLE_WEIGHT sample type is very useful to express the cost of an action represented by the sample. This allows the profiler to scale the samples to be more informative to the programmer. It could also help to locate a hotspot, e.g., when profiling by memory latencies, the expensive loads appear higher up in the histograms. But the current PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This could be a problem if users want two or more factors to contribute to the weight. For example, the Golden Cove core PMU can provide both the instruction latency and the cache latency information as factors for memory profiling. For current X86 platforms, although meminfo::latency is defined as a u64, only the lower 32 bits include the valid data in practice (no memory access could last longer than 4G cycles).
The higher 32 bits can be used to store new factors. Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new sample weight structure. It shares the same space as the PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type. - For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT sample type. PowerPC can re-struct the weight field similarly later. - For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now. The following patches will apply the new factors for the PERF_SAMPLE_WEIGHT_STRUCT sample type. The field in the union perf_sample_weight should be shared among different architectures. A generic name is required, but it's hard to abstract a name that applies to all architectures. For example, on X86, the fields are to store all kinds of latency. While on PowerPC, it stores MMCRA[TECX/TECM], which should not be latency. So a general name prefix 'var$NUM' is used here. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/intel/ds.c | 17 ++++++------- include/linux/perf_event.h | 4 ++-- include/uapi/linux/perf_event.h | 42 +++++++++++++++++++++++++++++++-- kernel/events/core.c | 11 +++++---- 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 58de6cec907b..f5f973077c28 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2136,7 +2136,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 09b0c637833c..0fc8ce368e88 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -962,7 +962,8 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ PERF_SAMPLE_TRANSACTION | \ PERF_SAMPLE_DATA_PAGE_SIZE) @@ -989,7 +990,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1371,8 +1372,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1464,8 +1465,8 @@ static void 
setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1579,8 +1580,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = meminfo->latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 78c8abdad2e3..2aa48df4e89f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -992,7 +992,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; - u64 weight; + union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1047,7 +1047,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->raw = NULL; data->br_stack = NULL; data->period = period; - data->weight = 0; + data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 625f5ba89283..81c8bfa18e90 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -145,12 +145,14 @@ enum perf_event_sample_format { PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -872,7 +874,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1207,4 +1226,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index b7b3a9a361dd..418cbf41c7fd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1746,8 +1746,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type 
& PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -6675,8 +6675,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -11193,6 +11193,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; out: return ret; -- Gitee From 200596bd66deefac4720297c03f8a025278b5293 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:05 -0800 Subject: [PATCH 169/257] tools headers uapi: Update tools's copy of linux/perf_event.h commit 81898ef1303d8fb5a3256b09b3140b4eee83dad8 upstream To get the changes in these csets: 2a6c6b7d7ad346f0 ("perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT") 61b985e3e775a3a7 ("perf/x86/intel: Add perf core PMU support for Sapphire Rapids") This cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Committer notes: Picked by hand as I had already merged the MMAP buildid patch that also touches perf_event.h and is also only in {acme,tip}/perf/core, not yet upstream. 
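For readers of the synced header: union perf_sample_weight overlays the old u64, so one helper can serve both layouts. A hedged sketch (the helper name is made up; on x86, var1_dw carries the latency value that used to occupy the full word, and the union needs uapi headers that already define PERF_SAMPLE_WEIGHT_STRUCT):

    #include <linux/perf_event.h>

    /* Sketch: interpret a raw weight word under either sample type. */
    static __u64 effective_weight(__u64 raw, __u64 sample_type)
    {
    	union perf_sample_weight w = { .full = raw };

    	if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
    		return w.var1_dw;	/* low 32 bits, e.g. load latency */

    	return w.full;			/* legacy PERF_SAMPLE_WEIGHT */
    }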
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/include/uapi/linux/perf_event.h | 54 +++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 74bf68b3f16b..d4d99f945d13 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -145,12 +145,14 @@ enum perf_event_sample_format { PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -872,7 +874,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1086,14 +1105,16 @@ union perf_mem_data_src { mem_lvl_num:4, /* memory hierarchy level number */ mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ - mem_rsvd:24; + mem_blk:3, /* access blocked */ + mem_rsvd:21; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:24, + __u64 mem_rsvd:21, + mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ mem_lvl_num:4, /* memory hierarchy level number */ @@ -1176,6 +1197,12 @@ union perf_mem_data_src { #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ #define PERF_MEM_TLB_SHIFT 26 +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) @@ -1207,4 +1234,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- Gitee From b37ab05f44b675d734e748f31ee1b797bde2484c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:09 -0800 Subject: [PATCH 170/257] perf tools: Support PERF_SAMPLE_WEIGHT_STRUCT commit ea8d0ed6eae37b01953a29bca98112d9e2507a84 upstream The new sample type, PERF_SAMPLE_WEIGHT_STRUCT, is an alternative of the 
PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. The new sample type shares the same space as the PERF_SAMPLE_WEIGHT sample type. The lower 32 bits are exactly the same for both sample types. The higher 32 bits may be different for different architectures. Add an arch-specific arch_evsel__set_sample_weight() to set the new sample type for X86. Only store the lower 32 bits for the sample->weight if the new sample type is applied. In practice, no memory access could last longer than 4G cycles. No data will be lost. If the kernel doesn't support the new sample type, fall back to the PERF_SAMPLE_WEIGHT sample type. There is no impact on other architectures. Committer notes: Fixup related to PERF_SAMPLE_CODE_PAGE_SIZE, present in acme/perf/core but not upstream yet. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/evsel.c | 8 +++++++ tools/perf/util/evsel.c | 28 +++++++++++++++++++---- tools/perf/util/evsel.h | 3 +++ tools/perf/util/intel-pt.c | 22 +++++++++++++++--- tools/perf/util/perf_event_attr_fprintf.c | 1 + tools/perf/util/session.c | 2 +- tools/perf/util/synthetic-events.c | 6 +++-- 8 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 tools/perf/arch/x86/util/evsel.c diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 47f9c56e744f..1504b356f827 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -6,6 +6,7 @@ perf-y += perf_regs.o perf-y += group.o perf-y += machine.o perf-y += event.o +perf-y += evsel.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c new file mode 100644 index 000000000000..2f733cdc8dbb --- /dev/null +++ b/tools/perf/arch/x86/util/evsel.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "util/evsel.h" + +void arch_evsel__set_sample_weight(struct evsel *evsel) +{ + evsel__set_sample_bit(evsel, WEIGHT_STRUCT); +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0f08f9577d4c..a1425c23ef06 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -925,6 +925,11 @@ struct perf_evsel_config_term *__perf_evsel__get_config_term(struct evsel *evsel return found_term; } +void __weak arch_evsel__set_sample_weight(struct evsel *evsel) +{ + evsel__set_sample_bit(evsel, WEIGHT); +} + /* * The enable_on_exec/disabled value strategy: * @@ -1092,7 +1097,7 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, } if (opts->sample_weight) - evsel__set_sample_bit(evsel, WEIGHT); + arch_evsel__set_sample_weight(evsel); attr->task = track; attr->mmap = track; @@ -1654,6 +1659,10 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, } fallback_missing_features: + if (perf_missing_features.weight_struct) { + evsel__set_sample_bit(evsel, WEIGHT); + evsel__reset_sample_bit(evsel, WEIGHT_STRUCT); + } if (perf_missing_features.clockid_wrong) evsel->core.attr.clockid = CLOCK_MONOTONIC; /* should always work */ if
(perf_missing_features.clockid) { @@ -1788,7 +1797,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. */ - if (!perf_missing_features.code_page_size && + if (!perf_missing_features.weight_struct && + (evsel->core.attr.sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) { + perf_missing_features.weight_struct = true; + pr_debug2("switching off weight struct support\n"); + goto fallback_missing_features; + } else if (!perf_missing_features.code_page_size && (evsel->core.attr.sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)) { perf_missing_features.code_page_size = true; pr_debug2_peo("Kernel has no PERF_SAMPLE_CODE_PAGE_SIZE support, bailing out\n"); @@ -2220,9 +2234,15 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, } } - if (type & PERF_SAMPLE_WEIGHT) { + if (type & PERF_SAMPLE_WEIGHT_TYPE) { + union perf_sample_weight weight; + OVERFLOW_CHECK_u64(array); - data->weight = *array; + weight.full = *array; + if (type & PERF_SAMPLE_WEIGHT) + data->weight = weight.full; + else + data->weight = weight.var1_dw; array++; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4d7f46c785a1..9b71dcd74dee 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -127,6 +127,7 @@ struct perf_missing_features { bool cgroup; bool data_page_size; bool code_page_size; + bool weight_struct; }; extern struct perf_missing_features perf_missing_features; @@ -230,6 +231,8 @@ int perf_evsel__set_filter(struct evsel *evsel, const char *filter); int perf_evsel__append_tp_filter(struct evsel *evsel, const char *filter); int perf_evsel__append_addr_filter(struct evsel *evsel, const char *filter); +void arch_evsel__set_sample_weight(struct evsel *evsel); + int evsel__enable(struct evsel *evsel); int evsel__disable(struct evsel *evsel); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index a76d9886dbf2..bdb4b2586a7d 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1803,13 +1803,29 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq) if (sample_type & PERF_SAMPLE_ADDR && items->has_mem_access_address) sample.addr = items->mem_access_address; - if (sample_type & PERF_SAMPLE_WEIGHT) { + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { /* * Refer kernel's setup_pebs_adaptive_sample_data() and * intel_hsw_weight(). */ - if (items->has_mem_access_latency) - sample.weight = items->mem_access_latency; + if (items->has_mem_access_latency) { + u64 weight = items->mem_access_latency >> 32; + + /* + * Starts from SPR, the mem access latency field + * contains both cache latency [47:32] and instruction + * latency [15:0]. The cache latency is the same as the + * mem access latency on previous platforms. + * + * In practice, no memory access could last than 4G + * cycles. Use latency >> 32 to distinguish the + * different format of the mem access latency field. 
+ */ + if (weight > 0) + sample.weight = weight & 0xffff; + else + sample.weight = items->mem_access_latency; + } if (!sample.weight && items->has_tsx_aux_info) { /* Cycles last block */ sample.weight = (u32)items->tsx_aux_info; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index a97a95f3b1be..533377743dbc 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -36,6 +36,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(AUX), bit_name(CGROUP), bit_name(DATA_PAGE_SIZE), bit_name(CODE_PAGE_SIZE), + bit_name(WEIGHT_STRUCT), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f163e408353a..e8a4516070b3 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1250,7 +1250,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_STACK_USER) stack_user__printf(&sample->user_stack); - if (sample_type & PERF_SAMPLE_WEIGHT) + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) printf("... weight: %" PRIu64 "\n", sample->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index f78edda3986e..2c0f7ec3f8b7 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1206,7 +1206,7 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, } } - if (type & PERF_SAMPLE_WEIGHT) + if (type & PERF_SAMPLE_WEIGHT_TYPE) result += sizeof(u64); if (type & PERF_SAMPLE_DATA_SRC) @@ -1379,8 +1379,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo } } - if (type & PERF_SAMPLE_WEIGHT) { + if (type & PERF_SAMPLE_WEIGHT_TYPE) { *array = sample->weight; + if (type & PERF_SAMPLE_WEIGHT_STRUCT) + *array &= 0xffffffff; array++; } -- Gitee From 49da5ffcd0e8d3654b5df1e8da3b04a5eb389559 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 6 May 2020 13:27:04 -0300 Subject: [PATCH 171/257] perf evsel: Rename perf_evsel__new*() to evsel__new*() commit 8f6725a2c95c8b8719a95e72d68698b68401980f upstream As these are 'struct evsel' methods, not part of tools/lib/perf/, aka libperf, to whom the perf_ prefix belongs. 
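In practice the change is mechanical; a call site after the rename looks like the sketch below (illustrative function, mirroring the tracepoint-constructor calls touched in this patch):

    #include <linux/err.h>
    #include "util/evsel.h"

    static struct evsel *sched_switch_evsel(void)
    {
    	/* was: perf_evsel__newtp("sched", "sched_switch") */
    	struct evsel *evsel = evsel__newtp("sched", "sched_switch");

    	return IS_ERR(evsel) ? NULL : evsel;	/* e.g. tracefs missing */
    }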
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/builtin-trace.c | 10 +++++----- tools/perf/tests/evsel-tp-sched.c | 8 ++++---- tools/perf/tests/mmap-basic.c | 4 ++-- tools/perf/tests/openat-syscall-all-cpus.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 4 ++-- tools/perf/tests/openat-syscall.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/util/evlist.c | 8 ++++---- tools/perf/util/evsel.c | 6 +++--- tools/perf/util/evsel.h | 12 ++++++------ tools/perf/util/parse-events.c | 5 ++--- tools/perf/util/sideband_evlist.c | 2 +- 12 files changed, 32 insertions(+), 33 deletions(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d196b287a801..6e2d13c5c950 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -394,11 +394,11 @@ static int perf_evsel__init_raw_syscall_tp(struct evsel *evsel, void *handler) static struct evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler) { - struct evsel *evsel = perf_evsel__newtp("raw_syscalls", direction); + struct evsel *evsel = evsel__newtp("raw_syscalls", direction); /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */ if (IS_ERR(evsel)) - evsel = perf_evsel__newtp("syscalls", direction); + evsel = evsel__newtp("syscalls", direction); if (IS_ERR(evsel)) return NULL; @@ -2706,7 +2706,7 @@ static bool evlist__add_vfs_getname(struct evlist *evlist) return found; } -static struct evsel *perf_evsel__new_pgfault(u64 config) +static struct evsel *evsel__new_pgfault(u64 config) { struct evsel *evsel; struct perf_event_attr attr = { @@ -3333,7 +3333,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) } if ((trace->trace_pgfaults & TRACE_PFMAJ)) { - pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); + pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); if (pgfault_maj == NULL) goto out_error_mem; perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); @@ -3341,7 +3341,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) } if ((trace->trace_pgfaults & TRACE_PFMIN)) { - pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); + pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); if (pgfault_min == NULL) goto out_error_mem; perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c index 261e6eaaee99..3805e5a74e17 100644 --- a/tools/perf/tests/evsel-tp-sched.c +++ b/tools/perf/tests/evsel-tp-sched.c @@ -35,11 +35,11 @@ static int perf_evsel__test_field(struct evsel *evsel, const char *name, int test__perf_evsel__tp_sched_test(struct test *test __maybe_unused, int subtest __maybe_unused) { - struct evsel *evsel = perf_evsel__newtp("sched", "sched_switch"); + struct evsel *evsel = evsel__newtp("sched", "sched_switch"); int ret = 0; if (IS_ERR(evsel)) { - pr_debug("perf_evsel__newtp failed with %ld\n", PTR_ERR(evsel)); + pr_debug("evsel__newtp failed with %ld\n", PTR_ERR(evsel)); return -1; } @@ -66,10 +66,10 @@ int test__perf_evsel__tp_sched_test(struct test *test __maybe_unused, int subtes evsel__delete(evsel); - evsel = perf_evsel__newtp("sched", "sched_wakeup"); + evsel = evsel__newtp("sched", "sched_wakeup"); if (IS_ERR(evsel)) { - pr_debug("perf_evsel__newtp failed with %ld\n", PTR_ERR(evsel)); + pr_debug("evsel__newtp failed with %ld\n", PTR_ERR(evsel)); 
return -1; } diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index e9d530d7e7de..3fcd6e56ecb3 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -79,9 +79,9 @@ int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unuse char name[64]; snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); - evsels[i] = perf_evsel__newtp("syscalls", name); + evsels[i] = evsel__newtp("syscalls", name); if (IS_ERR(evsels[i])) { - pr_debug("perf_evsel__new(%s)\n", name); + pr_debug("evsel__new(%s)\n", name); goto out_delete_evlist; } diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index 93c176523e38..e6013b99d2c7 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -44,7 +44,7 @@ int test__openat_syscall_event_on_all_cpus(struct test *test __maybe_unused, int CPU_ZERO(&cpu_set); - evsel = perf_evsel__newtp("syscalls", "sys_enter_openat"); + evsel = evsel__newtp("syscalls", "sys_enter_openat"); if (IS_ERR(evsel)) { tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "syscalls", "sys_enter_openat"); pr_debug("%s\n", errbuf); diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index c6b2d7aab608..d6b3ddf0c9ff 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -46,9 +46,9 @@ int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest goto out; } - evsel = perf_evsel__newtp("syscalls", "sys_enter_openat"); + evsel = evsel__newtp("syscalls", "sys_enter_openat"); if (IS_ERR(evsel)) { - pr_debug("%s: perf_evsel__newtp\n", __func__); + pr_debug("%s: evsel__newtp\n", __func__); goto out_delete_evlist; } diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c index 5ebffae18605..7e9e57a1275b 100644 --- a/tools/perf/tests/openat-syscall.c +++ b/tools/perf/tests/openat-syscall.c @@ -27,7 +27,7 @@ int test__openat_syscall_event(struct test *test __maybe_unused, int subtest __m return -1; } - evsel = perf_evsel__newtp("syscalls", "sys_enter_openat"); + evsel = evsel__newtp("syscalls", "sys_enter_openat"); if (IS_ERR(evsel)) { tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "syscalls", "sys_enter_openat"); pr_debug("%s\n", errbuf); diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index bfb9986093d8..4b9b731977c8 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -56,7 +56,7 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) evsel = evsel__new(&attr); if (evsel == NULL) { - pr_debug("perf_evsel__new\n"); + pr_debug("evsel__new\n"); goto out_delete_evlist; } evlist__add(evlist, evsel); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 03a066ce1f6d..92a94b52e51c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -213,7 +213,7 @@ void perf_evlist__set_leader(struct evlist *evlist) int __perf_evlist__add_default(struct evlist *evlist, bool precise) { - struct evsel *evsel = perf_evsel__new_cycles(precise); + struct evsel *evsel = evsel__new_cycles(precise); if (evsel == NULL) return -ENOMEM; @@ -229,7 +229,7 @@ int perf_evlist__add_dummy(struct evlist *evlist) .config = PERF_COUNT_SW_DUMMY, .size = sizeof(attr), /* to capture ABI version */ }; - struct evsel *evsel = perf_evsel__new_idx(&attr, evlist->core.nr_entries); + struct evsel *evsel = 
evsel__new_idx(&attr, evlist->core.nr_entries); if (evsel == NULL) return -ENOMEM; @@ -246,7 +246,7 @@ static int evlist__add_attrs(struct evlist *evlist, size_t i; for (i = 0; i < nr_attrs; i++) { - evsel = perf_evsel__new_idx(attrs + i, evlist->core.nr_entries + i); + evsel = evsel__new_idx(attrs + i, evlist->core.nr_entries + i); if (evsel == NULL) goto out_delete_partial_list; list_add_tail(&evsel->core.node, &head); @@ -305,7 +305,7 @@ perf_evlist__find_tracepoint_by_name(struct evlist *evlist, int perf_evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name, void *handler) { - struct evsel *evsel = perf_evsel__newtp(sys, name); + struct evsel *evsel = evsel__newtp(sys, name); if (IS_ERR(evsel)) return -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a1425c23ef06..cdd8738e9cdf 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -259,7 +259,7 @@ void evsel__init(struct evsel *evsel, evsel->pmu_name = NULL; } -struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) +struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx) { struct evsel *evsel = zalloc(perf_evsel__object.size); @@ -292,7 +292,7 @@ static bool perf_event_can_profile_kernel(void) return perf_event_paranoid_check(1); } -struct evsel *perf_evsel__new_cycles(bool precise) +struct evsel *evsel__new_cycles(bool precise) { struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, @@ -334,7 +334,7 @@ struct evsel *perf_evsel__new_cycles(bool precise) /* * Returns pointer with encoded error via interface. */ -struct evsel *perf_evsel__newtp_idx(const char *sys, const char *name, int idx) +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx) { struct evsel *evsel = zalloc(perf_evsel__object.size); int err = -ENOMEM; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 9b71dcd74dee..4aa78039cef7 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -160,24 +160,24 @@ int perf_evsel__object_config(size_t object_size, struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel); bool perf_evsel__is_aux_event(struct evsel *evsel); -struct evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx); +struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx); static inline struct evsel *evsel__new(struct perf_event_attr *attr) { - return perf_evsel__new_idx(attr, 0); + return evsel__new_idx(attr, 0); } -struct evsel *perf_evsel__newtp_idx(const char *sys, const char *name, int idx); +struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx); /* * Returns pointer with encoded error via interface. 
*/ -static inline struct evsel *perf_evsel__newtp(const char *sys, const char *name) +static inline struct evsel *evsel__newtp(const char *sys, const char *name) { - return perf_evsel__newtp_idx(sys, name, 0); + return evsel__newtp_idx(sys, name, 0); } -struct evsel *perf_evsel__new_cycles(bool precise); +struct evsel *evsel__new_cycles(bool precise); struct tep_event *event_format__new(const char *sys, const char *name); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index e52056fcc7f4..378700519613 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -343,7 +343,7 @@ __add_event(struct list_head *list, int *idx, event_attr_init(attr); - evsel = perf_evsel__new_idx(attr, *idx); + evsel = evsel__new_idx(attr, *idx); if (!evsel) return NULL; @@ -526,9 +526,8 @@ static int add_tracepoint(struct list_head *list, int *idx, struct parse_events_error *err, struct list_head *head_config) { - struct evsel *evsel; + struct evsel *evsel = evsel__newtp_idx(sys_name, evt_name, (*idx)++); - evsel = perf_evsel__newtp_idx(sys_name, evt_name, (*idx)++); if (IS_ERR(evsel)) { tracepoint_error(err, PTR_ERR(evsel), sys_name, evt_name); return PTR_ERR(evsel); diff --git a/tools/perf/util/sideband_evlist.c b/tools/perf/util/sideband_evlist.c index c51a8dfce08b..4d01b4f80b3c 100644 --- a/tools/perf/util/sideband_evlist.c +++ b/tools/perf/util/sideband_evlist.c @@ -22,7 +22,7 @@ int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *att attr->sample_id_all = 1; } - evsel = perf_evsel__new_idx(attr, evlist->core.nr_entries); + evsel = evsel__new_idx(attr, evlist->core.nr_entries); if (!evsel) return -1; -- Gitee From c7ee4f2104d4c645963010a5d9a38c19a2117ca2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 22 Apr 2020 10:36:15 -0700 Subject: [PATCH 172/257] perf record: Add dummy event during system wide synthesis commit 0a892c1c947230f9373c9a3c94df1babe989861f upstream During the processing of /proc during event synthesis new processes may start. Add a dummy event if /proc is to be processed, to capture mmaps for starting processes. This reuses the existing logic for initial-delay. v3 fixes the attr test of test-record-C0 v2 fixes the dummy event configuration and a branch stack issue. Suggested-by: Stephane Eranian Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20200422173615.59436-1-irogers@google.com [ split from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Gokul K Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 19 +++++++--- tools/perf/tests/attr/system-wide-dummy | 50 +++++++++++++++++++++++++ tools/perf/tests/attr/test-record-C0 | 12 +++++- 3 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 tools/perf/tests/attr/system-wide-dummy diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1d9c1dc20789..e2994c3a8f2c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -757,19 +757,28 @@ static int record__open(struct record *rec) int rc = 0; /* - * For initial_delay we need to add a dummy event so that we can track - * PERF_RECORD_MMAP while we wait for the initial delay to enable the - * real events, the ones asked by the user. + * For initial_delay or system wide, we need to add a dummy event so + * that we can track PERF_RECORD_MMAP to cover the delay of waiting or + * event synthesis. 
*/ - if (opts->initial_delay) { + if (opts->initial_delay || target__has_cpu(&opts->target)) { if (perf_evlist__add_dummy(evlist)) return -ENOMEM; + /* Disable tracking of mmaps on lead event. */ pos = evlist__first(evlist); pos->tracking = 0; + /* Set up dummy event. */ pos = evlist__last(evlist); pos->tracking = 1; - pos->core.attr.enable_on_exec = 1; + /* + * Enable the dummy event when the process is forked for + * initial_delay, immediately for system wide. + */ + if (opts->initial_delay) + pos->core.attr.enable_on_exec = 1; + else + pos->immediate = 1; } perf_evlist__config(evlist, opts, &callchain_param); diff --git a/tools/perf/tests/attr/system-wide-dummy b/tools/perf/tests/attr/system-wide-dummy new file mode 100644 index 000000000000..eba723cc0d38 --- /dev/null +++ b/tools/perf/tests/attr/system-wide-dummy @@ -0,0 +1,50 @@ +# Event added by system-wide or CPU perf-record to handle the race of +# processes starting while /proc is processed. +[event] +fd=1 +group_fd=-1 +cpu=* +pid=-1 +flags=8 +type=1 +size=120 +config=9 +sample_period=4000 +sample_type=455 +read_format=4 +# Event will be enabled right away. +disabled=0 +inherit=1 +pinned=0 +exclusive=0 +exclude_user=0 +exclude_kernel=0 +exclude_hv=0 +exclude_idle=0 +mmap=1 +comm=1 +freq=1 +inherit_stat=0 +enable_on_exec=0 +task=1 +watermark=0 +precise_ip=0 +mmap_data=0 +sample_id_all=1 +exclude_host=0 +exclude_guest=0 +exclude_callchain_kernel=0 +exclude_callchain_user=0 +mmap2=1 +comm_exec=1 +context_switch=0 +write_backward=0 +namespaces=0 +use_clockid=0 +wakeup_events=0 +bp_type=0 +config1=0 +config2=0 +branch_sample_type=0 +sample_regs_user=0 +sample_stack_user=0 diff --git a/tools/perf/tests/attr/test-record-C0 b/tools/perf/tests/attr/test-record-C0 index 93818054ae20..317730b906dd 100644 --- a/tools/perf/tests/attr/test-record-C0 +++ b/tools/perf/tests/attr/test-record-C0 @@ -9,6 +9,14 @@ cpu=0 # no enable on exec for CPU attached enable_on_exec=0 -# PERF_SAMPLE_IP | PERF_SAMPLE_TID PERF_SAMPLE_TIME | # PERF_SAMPLE_PERIOD +# PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | +# PERF_SAMPLE_ID | PERF_SAMPLE_PERIOD # + PERF_SAMPLE_CPU added by -C 0 -sample_type=391 +sample_type=455 + +# Dummy event handles mmaps, comm and task. +mmap=0 +comm=0 +task=0 + +[event:system-wide-dummy] -- Gitee From 5610f808f7430260dc75ace309ec4ebbef186823 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 17 Jun 2020 09:16:20 -0300 Subject: [PATCH 173/257] perf evlist: Fix the class prefix for 'struct evlist' 'add' evsel methods commit e251abee87cf9c49a2ec1b143bd71f92b71557c1 upstream To differentiate from libperf's 'struct perf_evlist' methods. 
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-record.c | 4 ++-- tools/perf/builtin-stat.c | 16 +++++++--------- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 3 +-- tools/perf/util/evlist.c | 17 +++++++---------- tools/perf/util/evlist.h | 17 ++++++++--------- 7 files changed, 27 insertions(+), 34 deletions(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 20366ba14de1..a7ee11cd57c0 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1320,7 +1320,7 @@ static struct evlist *kvm_live_event_list(void) *name = '\0'; name++; - if (perf_evlist__add_newtp(evlist, sys, name, NULL)) { + if (evlist__add_newtp(evlist, sys, name, NULL)) { pr_err("Failed to add %s tracepoint to the list\n", *events_tp); free(tp); goto out; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index e2994c3a8f2c..047b30ca33b4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -762,7 +762,7 @@ static int record__open(struct record *rec) * event synthesis. */ if (opts->initial_delay || target__has_cpu(&opts->target)) { - if (perf_evlist__add_dummy(evlist)) + if (evlist__add_dummy(evlist)) return -ENOMEM; /* Disable tracking of mmaps on lead event. */ @@ -2466,7 +2466,7 @@ int cmd_record(int argc, const char **argv) record.opts.tail_synthesize = true; if (rec->evlist->core.nr_entries == 0 && - __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) { + __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) { pr_err("Not enough memory for event selector list\n"); goto out; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index ea183922c4ef..b5a82969dab4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1362,19 +1362,17 @@ static int add_default_attributes(void) if (target__has_cpu(&target)) default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK; - if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0) + if (evlist__add_default_attrs(evsel_list, default_attrs0) < 0) return -1; if (pmu_have_event("cpu", "stalled-cycles-frontend")) { - if (perf_evlist__add_default_attrs(evsel_list, - frontend_attrs) < 0) + if (evlist__add_default_attrs(evsel_list, frontend_attrs) < 0) return -1; } if (pmu_have_event("cpu", "stalled-cycles-backend")) { - if (perf_evlist__add_default_attrs(evsel_list, - backend_attrs) < 0) + if (evlist__add_default_attrs(evsel_list, backend_attrs) < 0) return -1; } - if (perf_evlist__add_default_attrs(evsel_list, default_attrs1) < 0) + if (evlist__add_default_attrs(evsel_list, default_attrs1) < 0) return -1; } @@ -1384,21 +1382,21 @@ static int add_default_attributes(void) return 0; /* Append detailed run extra attributes: */ - if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) + if (evlist__add_default_attrs(evsel_list, detailed_attrs) < 0) return -1; if (detailed_run < 2) return 0; /* Append very detailed run extra attributes: */ - if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) + if (evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0) return -1; if (detailed_run < 3) return 0; /* Append very, very detailed run extra attributes: */ - return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); + return evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); } static const char * const stat_record_usage[] = { 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 8bd526b8c143..0fec22232daf 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1568,7 +1568,7 @@ int cmd_top(int argc, const char **argv) usage_with_options(top_usage, options); if (!top.evlist->core.nr_entries && - perf_evlist__add_default(top.evlist) < 0) { + evlist__add_default(top.evlist) < 0) { pr_err("Not enough memory for event selector list\n"); goto out_delete_evlist; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 6e2d13c5c950..1f40c683bba8 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3349,8 +3349,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) } if (trace->sched && - perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", - trace__sched_stat_runtime)) + evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime)) goto out_error_sched_stat_runtime; /* diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 92a94b52e51c..bda41564d406 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -79,7 +79,7 @@ struct evlist *perf_evlist__new_default(void) { struct evlist *evlist = evlist__new(); - if (evlist && perf_evlist__add_default(evlist)) { + if (evlist && evlist__add_default(evlist)) { evlist__delete(evlist); evlist = NULL; } @@ -91,7 +91,7 @@ struct evlist *perf_evlist__new_dummy(void) { struct evlist *evlist = evlist__new(); - if (evlist && perf_evlist__add_dummy(evlist)) { + if (evlist && evlist__add_dummy(evlist)) { evlist__delete(evlist); evlist = NULL; } @@ -211,7 +211,7 @@ void perf_evlist__set_leader(struct evlist *evlist) } } -int __perf_evlist__add_default(struct evlist *evlist, bool precise) +int __evlist__add_default(struct evlist *evlist, bool precise) { struct evsel *evsel = evsel__new_cycles(precise); @@ -222,7 +222,7 @@ int __perf_evlist__add_default(struct evlist *evlist, bool precise) return 0; } -int perf_evlist__add_dummy(struct evlist *evlist) +int evlist__add_dummy(struct evlist *evlist) { struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, @@ -238,8 +238,7 @@ int perf_evlist__add_dummy(struct evlist *evlist) return 0; } -static int evlist__add_attrs(struct evlist *evlist, - struct perf_event_attr *attrs, size_t nr_attrs) +static int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs) { struct evsel *evsel, *n; LIST_HEAD(head); @@ -262,8 +261,7 @@ static int evlist__add_attrs(struct evlist *evlist, return -1; } -int __perf_evlist__add_default_attrs(struct evlist *evlist, - struct perf_event_attr *attrs, size_t nr_attrs) +int __evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs) { size_t i; @@ -302,8 +300,7 @@ perf_evlist__find_tracepoint_by_name(struct evlist *evlist, return NULL; } -int perf_evlist__add_newtp(struct evlist *evlist, - const char *sys, const char *name, void *handler) +int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name, void *handler) { struct evsel *evsel = evsel__newtp(sys, name); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b791f6473097..e98644a65498 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -92,20 +92,20 @@ void evlist__delete(struct evlist *evlist); void evlist__add(struct evlist *evlist, struct evsel *entry); void evlist__remove(struct evlist *evlist, struct evsel *evsel); -int __perf_evlist__add_default(struct evlist *evlist, 
bool precise); +int __evlist__add_default(struct evlist *evlist, bool precise); -static inline int perf_evlist__add_default(struct evlist *evlist) +static inline int evlist__add_default(struct evlist *evlist) { - return __perf_evlist__add_default(evlist, true); + return __evlist__add_default(evlist, true); } -int __perf_evlist__add_default_attrs(struct evlist *evlist, +int __evlist__add_default_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs); -#define perf_evlist__add_default_attrs(evlist, array) \ - __perf_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) +#define evlist__add_default_attrs(evlist, array) \ + __evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) -int perf_evlist__add_dummy(struct evlist *evlist); +int evlist__add_dummy(struct evlist *evlist); int perf_evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr, @@ -116,8 +116,7 @@ int perf_evlist__start_sb_thread(struct evlist *evlist, struct target *target); void perf_evlist__stop_sb_thread(struct evlist *evlist); -int perf_evlist__add_newtp(struct evlist *evlist, - const char *sys, const char *name, void *handler); +int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name, void *handler); void __perf_evlist__set_sample_bit(struct evlist *evlist, enum perf_event_sample_format bit); -- Gitee From 38a35eb3c38b5287469f2c37f22e27fc5bd29b29 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 4 Dec 2020 19:10:09 +0800 Subject: [PATCH 174/257] perf pmu: Add pmu_id() commit 51d548471510843e56d9f427aa6473ca0981c4a4 upstream Add a function to read the PMU id sysfs entry. This is only done for uncore PMUs where this would possibly be relevant. Signed-off-by: John Garry Acked-by: Kajol Jain Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Joakim Zhang Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Shaokun Zhang Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: http://lore.kernel.org/lkml/1607080216-36968-4-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/pmu.c | 18 ++++++++++++++++++ tools/perf/util/pmu.h | 1 + 2 files changed, 19 insertions(+) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 5f1ece607471..95a4c824b0f3 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -593,6 +593,7 @@ static struct perf_cpu_map *__pmu_cpumask(const char *path) * Uncore PMUs have a "cpumask" file under sysfs. CPU PMUs (e.g. on arm/arm64) * may have a "cpus" file. */ +#define SYS_TEMPLATE_ID "./bus/event_source/devices/%s/identifier" #define CPUS_TEMPLATE_UNCORE "%s/bus/event_source/devices/%s/cpumask" #define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" @@ -634,6 +635,21 @@ static bool pmu_is_uncore(const char *name) return !!cpus; } +static char *pmu_id(const char *name) +{ + char path[PATH_MAX], *str; + size_t len; + + snprintf(path, PATH_MAX, SYS_TEMPLATE_ID, name); + + if (sysfs__read_str(path, &str, &len) < 0) + return NULL; + + str[len - 1] = 0; /* remove line feed */ + + return str; +} + /* * PMU CORE devices have different name other than cpu in sysfs on some * platforms. 
@@ -843,6 +859,8 @@ static struct perf_pmu *pmu_lookup(const char *name) pmu->name = strdup(name); pmu->type = type; pmu->is_uncore = pmu_is_uncore(name); + if (pmu->is_uncore) + pmu->id = pmu_id(name); pmu->max_precise = pmu_max_precise(name); pmu_add_cpu_aliases(&aliases, pmu); diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index d5e87aa9fd5b..9f843c762243 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -23,6 +23,7 @@ struct perf_event_attr; struct perf_pmu { char *name; + char *id; __u32 type; bool selectable; bool is_uncore; -- Gitee From 3ef8951f4d4ebdd4fba244e6c610908f2c0acc78 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:11 -0800 Subject: [PATCH 175/257] perf pmu: Use file system cache to optimize sysfs access commit d96645821e940bddff3fc5290656f83bf70d4c92 upstream pmu.c does a lot of redundant /sys accesses while parsing aliases and probing for PMUs. On large systems with a lot of PMUs this can get expensive (>2s): % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 27.25 1.227847 8 160888 16976 openat 26.42 1.190481 7 164224 164077 stat Add a cache to remember if specific file names exist or don't exist, which eliminates most of this overhead. Also optimize some stat() calls to be slightly cheaper access() calls. Resulting in: 0.18 0.004166 2 1851 305 open 0.08 0.001970 2 829 622 access Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191121001522.180827-2-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/Build | 1 + tools/perf/util/fncache.c | 63 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/fncache.h | 7 +++++ tools/perf/util/pmu.c | 34 +++++++-------------- tools/perf/util/srccode.c | 9 +----- 5 files changed, 83 insertions(+), 31 deletions(-) create mode 100644 tools/perf/util/fncache.c create mode 100644 tools/perf/util/fncache.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index cf6b6be27e86..3df3a5089d8e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -49,6 +49,7 @@ perf-y += header.o perf-y += callchain.o perf-y += values.o perf-y += debug.o +perf-y += fncache.o perf-y += machine.o perf-y += map.o perf-y += pstack.o diff --git a/tools/perf/util/fncache.c b/tools/perf/util/fncache.c new file mode 100644 index 000000000000..6225cbc52310 --- /dev/null +++ b/tools/perf/util/fncache.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Manage a cache of file names' existence */ +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <linux/list.h> +#include "fncache.h" + +struct fncache { + struct hlist_node nd; + bool res; + char name[]; +}; + +#define FNHSIZE 61 + +static struct hlist_head fncache_hash[FNHSIZE]; + +unsigned shash(const unsigned char *s) +{ + unsigned h = 0; + while (*s) + h = 65599 * h + *s++; + return h ^ (h >> 16); +} + +static bool lookup_fncache(const char *name, bool *res) +{ + int h = shash((const unsigned char *)name) % FNHSIZE; + struct fncache *n; + + hlist_for_each_entry(n, &fncache_hash[h], nd) { + if (!strcmp(n->name, name)) { + *res = n->res; + return true; + } + } + return false; +} + +static void update_fncache(const char *name, bool res) +{ + struct fncache *n = malloc(sizeof(struct fncache) + strlen(name) + 1); + int h = shash((const unsigned char *)name) % FNHSIZE; + + if (!n) + return; + strcpy(n->name, name); + n->res = res; + hlist_add_head(&n->nd, &fncache_hash[h]); +} + +/* No LRU, only use 
when bounded in some other way. */ +bool file_available(const char *name) +{ + bool res; + + if (lookup_fncache(name, &res)) + return res; + res = access(name, R_OK) == 0; + update_fncache(name, res); + return res; +} diff --git a/tools/perf/util/fncache.h b/tools/perf/util/fncache.h new file mode 100644 index 000000000000..fe020beaefb1 --- /dev/null +++ b/tools/perf/util/fncache.h @@ -0,0 +1,7 @@ +#ifndef _FCACHE_H +#define _FCACHE_H 1 + +unsigned shash(const unsigned char *s); +bool file_available(const char *name); + +#endif diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 95a4c824b0f3..594d8f788bb0 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -25,6 +25,7 @@ #include "pmu-events/pmu-events.h" #include "string2.h" #include "strbuf.h" +#include "fncache.h" struct perf_pmu_format { char *name; @@ -83,7 +84,6 @@ int perf_pmu__format_parse(char *dir, struct list_head *head) */ static int pmu_format(const char *name, struct list_head *format) { - struct stat st; char path[PATH_MAX]; const char *sysfs = sysfs__mountpoint(); @@ -93,8 +93,8 @@ static int pmu_format(const char *name, struct list_head *format) snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH "%s/format", sysfs, name); - if (stat(path, &st) < 0) - return 0; /* no error if format does not exist */ + if (!file_available(path)) + return 0; if (perf_pmu__format_parse(path, format)) return -1; @@ -471,7 +471,6 @@ static int pmu_aliases_parse(char *dir, struct list_head *head) */ static int pmu_aliases(const char *name, struct list_head *head) { - struct stat st; char path[PATH_MAX]; const char *sysfs = sysfs__mountpoint(); @@ -481,8 +480,8 @@ static int pmu_aliases(const char *name, struct list_head *head) snprintf(path, PATH_MAX, "%s/bus/event_source/devices/%s/events", sysfs, name); - if (stat(path, &st) < 0) - return 0; /* no error if 'events' does not exist */ + if (!file_available(path)) + return 0; if (pmu_aliases_parse(path, head)) return -1; @@ -521,7 +520,6 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias, */ static int pmu_type(const char *name, __u32 *type) { - struct stat st; char path[PATH_MAX]; FILE *file; int ret = 0; @@ -533,7 +531,7 @@ static int pmu_type(const char *name, __u32 *type) snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH "%s/type", sysfs, name); - if (stat(path, &st) < 0) + if (access(path, R_OK) < 0) return -1; file = fopen(path, "r"); @@ -625,14 +623,11 @@ static struct perf_cpu_map *pmu_cpumask(const char *name) static bool pmu_is_uncore(const char *name) { char path[PATH_MAX]; - struct perf_cpu_map *cpus; - const char *sysfs = sysfs__mountpoint(); + const char *sysfs; + sysfs = sysfs__mountpoint(); snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name); - cpus = __pmu_cpumask(path); - perf_cpu_map__put(cpus); - - return !!cpus; + return file_available(path); } static char *pmu_id(const char *name) @@ -657,7 +652,6 @@ static char *pmu_id(const char *name) */ static int is_arm_pmu_core(const char *name) { - struct stat st; char path[PATH_MAX]; const char *sysfs = sysfs__mountpoint(); @@ -667,10 +661,7 @@ static int is_arm_pmu_core(const char *name) /* Look for cpu sysfs (specific to arm) */ scnprintf(path, PATH_MAX, "%s/bus/event_source/devices/%s/cpus", sysfs, name); - if (stat(path, &st) == 0) - return 1; - - return 0; + return file_available(path); } static char *perf_pmu__getcpuid(struct perf_pmu *pmu) @@ -1584,7 +1575,6 @@ bool pmu_have_event(const char *pname, const char *name) static FILE *perf_pmu__open_file(struct perf_pmu *pmu, 
const char *name) { - struct stat st; char path[PATH_MAX]; const char *sysfs; @@ -1594,10 +1584,8 @@ static FILE *perf_pmu__open_file(struct perf_pmu *pmu, const char *name) snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH "%s/%s", sysfs, pmu->name, name); - - if (stat(path, &st) < 0) + if (!file_available(path)) return NULL; - return fopen(path, "r"); } diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c index d84ed8b6caaa..c29edaaca863 100644 --- a/tools/perf/util/srccode.c +++ b/tools/perf/util/srccode.c @@ -16,6 +16,7 @@ #include "srccode.h" #include "debug.h" #include <internal/lib.h> // page_size +#include "fncache.h" #define MAXSRCCACHE (32*1024*1024) #define MAXSRCFILES 64 @@ -36,14 +37,6 @@ static LIST_HEAD(srcfile_list); static long map_total_sz; static int num_srcfiles; -static unsigned shash(unsigned char *s) -{ - unsigned h = 0; - while (*s) - h = 65599 * h + *s++; - return h ^ (h >> 16); -} - static int countlines(char *map, int maplen) { int numl; -- Gitee From b92ab23032cd1e3787a6c0fe3a30d762a5d809d7 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:18 +0800 Subject: [PATCH 176/257] perf pmu: Save detected hybrid pmus to a global pmu list commit 444624307c4e06d35de12df1cfe08a4964ac086f upstream We identify the cpu_core pmu and cpu_atom pmu by explicitly checking the following files: For cpu_core, check: "/sys/bus/event_source/devices/cpu_core/cpus" For cpu_atom, check: "/sys/bus/event_source/devices/cpu_atom/cpus" If the 'cpus' file exists and has data, the pmu exists. But in order not to hardcode "cpu_core" and "cpu_atom", and to keep the code generic, if the path "/sys/bus/event_source/devices/cpu_xxx/cpus" exists, the hybrid pmu exists. All the detected hybrid pmus are linked to a global list 'perf_pmu__hybrid_pmus'; then we just need to iterate the list with perf_pmu__for_each_hybrid_pmu to get all hybrid pmus.
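As an illustration of how the new global list is meant to be consumed (a minimal usage sketch built only from the helpers added below; the printed names depend on the platform):

	struct perf_pmu *pmu;

	perf_pmu__for_each_hybrid_pmu(pmu)
		printf("hybrid pmu: %s\n", pmu->name);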
Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-6-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/Build | 1 + tools/perf/util/pmu-hybrid.c | 49 ++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu-hybrid.h | 18 +++++++++++++ tools/perf/util/pmu.c | 9 ++++++- tools/perf/util/pmu.h | 4 +++ 5 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/pmu-hybrid.c create mode 100644 tools/perf/util/pmu-hybrid.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 3df3a5089d8e..90c68a6aed1d 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -68,6 +68,7 @@ perf-y += parse-events-bison.o perf-y += pmu.o perf-y += pmu-flex.o perf-y += pmu-bison.o +perf-y += pmu-hybrid.o perf-y += trace-event-read.o perf-y += trace-event-info.o perf-y += trace-event-scripting.o diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c new file mode 100644 index 000000000000..8ed0e6e1776d --- /dev/null +++ b/tools/perf/util/pmu-hybrid.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/list.h> +#include <linux/compiler.h> +#include <linux/string.h> +#include <linux/zalloc.h> +#include <sys/types.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdio.h> +#include <stdbool.h> +#include <stdarg.h> +#include <locale.h> +#include <api/fs/fs.h> +#include "fncache.h" +#include "pmu-hybrid.h" + +LIST_HEAD(perf_pmu__hybrid_pmus); + +bool perf_pmu__hybrid_mounted(const char *name) +{ + char path[PATH_MAX]; + const char *sysfs; + FILE *file; + int n, cpu; + + if (strncmp(name, "cpu_", 4)) + return false; + + sysfs = sysfs__mountpoint(); + if (!sysfs) + return false; + + snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, name); + if (!file_available(path)) + return false; + + file = fopen(path, "r"); + if (!file) + return false; + + n = fscanf(file, "%u", &cpu); + fclose(file); + if (n <= 0) + return false; + + return true; +} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h new file mode 100644 index 000000000000..35bed3714438 --- /dev/null +++ b/tools/perf/util/pmu-hybrid.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PMU_HYBRID_H +#define __PMU_HYBRID_H + +#include <linux/perf_event.h> +#include <linux/compiler.h> +#include <linux/list.h> +#include <stdbool.h> +#include "pmu.h" + +extern struct list_head perf_pmu__hybrid_pmus; + +#define perf_pmu__for_each_hybrid_pmu(pmu) \ + list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list) + +bool perf_pmu__hybrid_mounted(const char *name); + +#endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 594d8f788bb0..d33c2cd6f17e 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -26,6 +26,7 @@ #include "string2.h" #include "strbuf.h" #include "fncache.h" +#include "pmu-hybrid.h" struct perf_pmu_format { char *name; @@ -593,7 +594,6 @@ static struct perf_cpu_map *__pmu_cpumask(const char *path) */ #define SYS_TEMPLATE_ID "./bus/event_source/devices/%s/identifier" #define CPUS_TEMPLATE_UNCORE "%s/bus/event_source/devices/%s/cpumask" -#define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" static struct perf_cpu_map *pmu_cpumask(const char *name) { @@ -625,6 +625,9 @@ static bool pmu_is_uncore(const char *name) char path[PATH_MAX]; const char *sysfs; + if (perf_pmu__hybrid_mounted(name)) + return false; + sysfs = sysfs__mountpoint(); snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name); return file_available(path); @@ -852,6 +855,7 @@ static struct perf_pmu 
*pmu_lookup(const char *name) pmu->name = strdup(name); pmu->type = type; pmu->is_uncore = pmu_is_uncore(name); if (pmu->is_uncore) pmu->id = pmu_id(name); + pmu->is_hybrid = perf_pmu__hybrid_mounted(name); pmu->max_precise = pmu_max_precise(name); pmu_add_cpu_aliases(&aliases, pmu); @@ -861,6 +865,9 @@ static struct perf_pmu *pmu_lookup(const char *name) list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); + if (pmu->is_hybrid) + list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus); + pmu->default_config = perf_pmu__get_default_config(pmu); return pmu; diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 9f843c762243..93a90033e43d 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -5,6 +5,7 @@ #include <linux/bitmap.h> #include <linux/compiler.h> #include <linux/perf_event.h> +#include <linux/list.h> #include <stdbool.h> #include "parse-events.h" @@ -18,6 +19,7 @@ enum { #define PERF_PMU_FORMAT_BITS 64 #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/" +#define CPUS_TEMPLATE_CPU "%s/bus/event_source/devices/%s/cpus" struct perf_event_attr; @@ -27,6 +29,7 @@ struct perf_pmu { char *name; char *id; __u32 type; bool selectable; bool is_uncore; + bool is_hybrid; bool auxtrace; int max_precise; struct perf_event_attr *default_config; @@ -34,6 +37,7 @@ struct perf_pmu { struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ struct list_head list; /* ELEM */ + struct list_head hybrid_list; }; struct perf_pmu_info { -- Gitee From 0b5bf928e7576ff598de3934c182d85fa0e26495 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:01 -0700 Subject: [PATCH 177/257] perf pmu: Add support for PMU capabilities commit 9fbc61f832ebf432326a90e28184dade05ee34a8 upstream The PMU capabilities information, which is located at /sys/bus/event_source/devices/<dev>/caps, is required by the perf tool. For example, the max LBR information is required to stitch LBR call stack. Add perf_pmu__caps_parse() to parse the PMU capabilities information. The information is stored in a list. The following patch will store the capabilities information in the perf header. Committer notes: Here's an example of such directories and their files in an i5 7th gen machine: [root@seventh ~]# ls -lad /sys/bus/event_source/devices/*/caps drwxr-xr-x. 2 root root 0 Apr 14 13:33 /sys/bus/event_source/devices/cpu/caps drwxr-xr-x. 2 root root 0 Apr 14 13:33 /sys/bus/event_source/devices/intel_pt/caps [root@seventh ~]# ls -la /sys/bus/event_source/devices/intel_pt/caps total 0 drwxr-xr-x. 2 root root 0 Apr 14 13:33 . drwxr-xr-x. 5 root root 0 Apr 14 13:12 .. -r--r--r--. 1 root root 4096 Apr 16 13:10 cr3_filtering -r--r--r--. 1 root root 4096 Apr 16 11:42 cycle_thresholds -r--r--r--. 1 root root 4096 Apr 16 13:10 ip_filtering -r--r--r--. 1 root root 4096 Apr 16 13:10 max_subleaf -r--r--r--. 1 root root 4096 Apr 14 13:33 mtc -r--r--r--. 1 root root 4096 Apr 14 13:33 mtc_periods -r--r--r--. 1 root root 4096 Apr 16 13:10 num_address_ranges -r--r--r--. 1 root root 4096 Apr 16 13:10 output_subsys -r--r--r--. 1 root root 4096 Apr 16 13:10 payloads_lip -r--r--r--. 1 root root 4096 Apr 16 13:10 power_event_trace -r--r--r--. 1 root root 4096 Apr 14 13:33 psb_cyc -r--r--r--. 1 root root 4096 Apr 14 13:33 psb_periods -r--r--r--. 1 root root 4096 Apr 16 13:10 ptwrite -r--r--r--. 1 root root 4096 Apr 16 13:10 single_range_output -r--r--r--. 1 root root 4096 Apr 16 12:03 topa_multiple_entries -r--r--r--. 
1 root root 4096 Apr 16 13:10 topa_output [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/topa_output 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/topa_multiple_entries 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/mtc 1 [root@seventh ~]# cat /sys/bus/event_source/devices/intel_pt/caps/power_event_trace 0 [root@seventh ~]# [root@seventh ~]# ls -la /sys/bus/event_source/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 14 13:33 . drwxr-xr-x. 6 root root 0 Apr 14 13:12 .. -r--r--r--. 1 root root 4096 Apr 16 13:10 branches -r--r--r--. 1 root root 4096 Apr 14 13:33 max_precise -r--r--r--. 1 root root 4096 Apr 16 13:10 pmu_name [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/max_precise 3 [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/branches 32 [root@seventh ~]# cat /sys/bus/event_source/devices/cpu/caps/pmu_name skylake [root@seventh ~]# Wow, first time I've heard about /sys/bus/event_source/devices/cpu/caps/max_precise, I think I'll use it! :-) Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/pmu.c | 82 +++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/pmu.h | 9 +++++ 2 files changed, 91 insertions(+) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d33c2cd6f17e..a68b79de5268 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -861,6 +861,7 @@ static struct perf_pmu *pmu_lookup(const char *name) INIT_LIST_HEAD(&pmu->format); INIT_LIST_HEAD(&pmu->aliases); + INIT_LIST_HEAD(&pmu->caps); list_splice(&format, &pmu->format); list_splice(&aliases, &pmu->aliases); list_add_tail(&pmu->list, &pmus); @@ -1612,3 +1613,84 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, va_end(args); return ret; } + +static int perf_pmu__new_caps(struct list_head *list, char *name, char *value) +{ + struct perf_pmu_caps *caps = zalloc(sizeof(*caps)); + + if (!caps) + return -ENOMEM; + + caps->name = strdup(name); + if (!caps->name) + goto free_caps; + caps->value = strndup(value, strlen(value) - 1); + if (!caps->value) + goto free_name; + list_add_tail(&caps->list, list); + return 0; + +free_name: + zfree(&caps->name); +free_caps: + free(caps); + + return -ENOMEM; +} + +/* + * Reading/parsing the given pmu capabilities, which should be located at: + * /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes. 
+ * Return the number of capabilities. + */ +int perf_pmu__caps_parse(struct perf_pmu *pmu) +{ + struct stat st; + char caps_path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + DIR *caps_dir; + struct dirent *evt_ent; + int nr_caps = 0; + + if (!sysfs) + return -1; + + snprintf(caps_path, PATH_MAX, + "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name); + + if (stat(caps_path, &st) < 0) + return 0; /* no error if caps does not exist */ + + caps_dir = opendir(caps_path); + if (!caps_dir) + return -EINVAL; + + while ((evt_ent = readdir(caps_dir)) != NULL) { + char path[PATH_MAX + NAME_MAX + 1]; + char *name = evt_ent->d_name; + char value[128]; + FILE *file; + + if (!strcmp(name, ".") || !strcmp(name, "..")) + continue; + + snprintf(path, sizeof(path), "%s/%s", caps_path, name); + + file = fopen(path, "r"); + if (!file) + continue; + + if (!fgets(value, sizeof(value), file) || + (perf_pmu__new_caps(&pmu->caps, name, value) < 0)) { + fclose(file); + continue; + } + + nr_caps++; + fclose(file); + } + + closedir(caps_dir); + + return nr_caps; +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 93a90033e43d..d13293836761 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -23,6 +23,12 @@ enum { struct perf_event_attr; +struct perf_pmu_caps { + char *name; + char *value; + struct list_head list; +}; + struct perf_pmu { char *name; char *id; @@ -36,6 +42,7 @@ struct perf_pmu { struct perf_cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ + struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ struct list_head hybrid_list; }; @@ -106,4 +113,6 @@ struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu); int perf_pmu__convert_scale(const char *scale, char **end, double *sval); +int perf_pmu__caps_parse(struct perf_pmu *pmu); + #endif /* __PMU_H */ -- Gitee From 6b2c0c38ab6a282b5ff331a7b0f5ac8a95ad7752 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Wed, 10 Mar 2021 13:11:38 +0800 Subject: [PATCH 178/257] perf pmu: Validate raw event with sysfs exported format bits commit e40647762fb5881360874e08e03e972d58d63c42 upstream A raw PMU event (eventsel+umask) in the form of rNNN is supported by perf but lacks checking of the validity of the raw encoding. For example, bit 16 and bit 17 are not valid on KBL, but perf doesn't report a warning when encoding with these bits. Before: # ./perf stat -e cpu/r031234/ -a -- sleep 1 Performance counter stats for 'system wide': 0 cpu/r031234/ 1.003798924 seconds time elapsed It may silently measure the wrong event! The kernel-supported bits have been exported through /sys/devices/<pmu>/format/. Perf collects the information to 'struct perf_pmu_format' and links it to the 'pmu->format' list. The 'struct perf_pmu_format' has a bitmap which records the valid bits for this format. For example, root@kbl-ppc:/sys/devices/cpu/format# cat umask config:8-15 The valid bits (bit8-bit15) are recorded in the bitmap of format 'umask'. We collect the total valid bits of all formats, save them to a local variable 'masks' and invert it. Now '~masks' represents the total invalid bits. bits = config & ~masks; The set bits in 'bits' indicate the invalid bits used in config. Finally we use bitmap_scnprintf to report the invalid bits. Some architectures may not export supported bits through sysfs, so if masks is 0, perf_pmu__warn_invalid_config returns immediately. 
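To make the arithmetic concrete (an illustrative walk-through of the KBL example above; the exact 'masks' value depends on which formats the kernel exports): config 0x31234 has bits 2, 4, 5, 9, 12, 16 and 17 set. Bits 16-17 fall outside every exported format bitmap, so bits = config & ~masks = 0x30000, and bitmap_scnprintf renders that as the "bits 16-17" shown in the warnings below.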
After: Single event without name: # ./perf stat -e cpu/r031234/ -a -- sleep 1 WARNING: event 'N/A' not valid (bits 16-17 of config '31234' not supported by kernel)! Performance counter stats for 'system wide': 0 cpu/r031234/ 1.001597373 seconds time elapsed Multiple events with names: # ./perf stat -e cpu/rf01234,name=aaa/,cpu/r031234,name=bbb/ -a -- sleep 1 WARNING: event 'aaa' not valid (bits 20,22 of config 'f01234' not supported by kernel)! WARNING: event 'bbb' not valid (bits 16-17 of config '31234' not supported by kernel)! Performance counter stats for 'system wide': 0 aaa 0 bbb 1.001573787 seconds time elapsed Warnings are reported for invalid bits. Co-developed-by: Jiri Olsa Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210310051138.12154-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/parse-events.c | 3 +++ tools/perf/util/pmu.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/pmu.h | 3 +++ 3 files changed, 39 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 378700519613..27fd5501e053 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -342,6 +342,9 @@ __add_event(struct list_head *list, int *idx, cpu_list ? perf_cpu_map__new(cpu_list) : NULL; event_attr_init(attr); + if (pmu && attr->type == PERF_TYPE_RAW) + perf_pmu__warn_invalid_config(pmu, attr->config, name); + evsel = evsel__new_idx(attr, *idx); if (!evsel) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index a68b79de5268..209c493bb325 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1694,3 +1694,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return nr_caps; } + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name) +{ + struct perf_pmu_format *format; + __u64 masks = 0, bits; + char buf[100]; + unsigned int i; + + list_for_each_entry(format, &pmu->format, list) { + if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) + masks |= 1ULL << i; + } + + /* + * Kernel doesn't export any valid format bits. + */ + if (masks == 0) + return; + + bits = config & ~masks; + if (bits == 0) + return; + + bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); + + pr_warning("WARNING: event '%s' not valid (bits %s of config " + "'%llx' not supported by kernel)!\n", + name ?: "N/A", buf, config); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index d13293836761..4866cddc6fcb 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -115,4 +115,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name); + #endif /* __PMU_H */ -- Gitee From e5e8c0a037dd7ec1f3bcfd9f38915bc9c1215b9b Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:19 +0800 Subject: [PATCH 179/257] perf pmu: Add hybrid helper functions commit c5a26ea490a16798d973e6fa352c6b8375646bc4 upstream The functions perf_pmu__is_hybrid and perf_pmu__find_hybrid_pmu can be used to identify the hybrid platform and return the found hybrid cpu pmu. All the detected hybrid pmus have been saved in 'perf_pmu__hybrid_pmus' list. 
So we just need to search this list. perf_pmu__hybrid_type_to_pmu converts a user-specified string to a hybrid pmu name. This is used to support the '--cputype' option in later patches. perf_pmu__has_hybrid checks for the existence of a hybrid pmu. Note that we have to define it in pmu.c (so that pmu-hybrid.c gains no more symbol dependencies), otherwise 'perf test python' would fail. Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-7-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- tools/perf/util/pmu-hybrid.c | 40 ++++++++++++++++++++++++++++++++ tools/perf/util/pmu-hybrid.h | 4 ++++ tools/perf/util/pmu.c | 11 ++++++++++ tools/perf/util/pmu.h | 2 ++ 4 files changed, 57 insertions(+) diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c index 8ed0e6e1776d..f51ccaac60ee 100644 --- a/tools/perf/util/pmu-hybrid.c +++ b/tools/perf/util/pmu-hybrid.c @@ -47,3 +47,43 @@ bool perf_pmu__hybrid_mounted(const char *name) return true; } + +struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name) +{ + struct perf_pmu *pmu; + + if (!name) + return NULL; + + perf_pmu__for_each_hybrid_pmu(pmu) { + if (!strcmp(name, pmu->name)) + return pmu; + } + + return NULL; +} + +bool perf_pmu__is_hybrid(const char *name) +{ + return perf_pmu__find_hybrid_pmu(name) != NULL; +} + +char *perf_pmu__hybrid_type_to_pmu(const char *type) +{ + char *pmu_name = NULL; + + if (asprintf(&pmu_name, "cpu_%s", type) < 0) + return NULL; + + if (perf_pmu__is_hybrid(pmu_name)) + return pmu_name; + + /* + * pmu may not be scanned yet, check the sysfs. + */ + if (perf_pmu__hybrid_mounted(pmu_name)) + return pmu_name; + + free(pmu_name); + return NULL; +} diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index 35bed3714438..d0fa7bc50a76 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -15,4 +15,8 @@ extern struct list_head perf_pmu__hybrid_pmus; bool perf_pmu__hybrid_mounted(const char *name); +struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); +bool perf_pmu__is_hybrid(const char *name); +char *perf_pmu__hybrid_type_to_pmu(const char *type); + #endif /* __PMU_HYBRID_H */ diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 209c493bb325..c0f8bdb65ccc 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -39,6 +39,7 @@ int perf_pmu_parse(struct list_head *list, char *name); extern FILE *perf_pmu_in; static LIST_HEAD(pmus); +static bool hybrid_scanned; /* * Parse & process all the sysfs attributes located under @@ -1727,3 +1728,13 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, "'%llx' not supported by kernel)!\n", name ?: "N/A", buf, config); } + +bool perf_pmu__has_hybrid(void) +{ + if (!hybrid_scanned) { + hybrid_scanned = true; + perf_pmu__scan(NULL); + } + + return !list_empty(&perf_pmu__hybrid_pmus); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 4866cddc6fcb..57c7bf476350 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -118,4 +118,6 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu); void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, char *name); +bool perf_pmu__has_hybrid(void); + #endif /* __PMU_H */ -- Gitee From b263b10c55aadff6635bd46d7140af3424078144 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:11 -0800 Subject: [PATCH 
180/257] perf/x86/intel: Support CPUID 10.ECX to disable fixed counters commit 32451614da2a9cf4296f90d3606ac77814fb519d upstream With Architectural Performance Monitoring Version 5, CPUID 10.ECX cpu leaf indicates the fixed counter enumeration. This extends the previous count to a bitmap which allows disabling even lower fixed counters. It could be used by a Hypervisor. The existing intel_ctrl variable is used to remember the bitmask of the counters. All code that reads all counters is fixed to check this extra bitmask. Suggested-by: Peter Zijlstra (Intel) Originally-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/events/core.c | 8 +++++++- arch/x86/events/intel/core.c | 34 ++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index aba34228231d..d36dc994a8e0 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -223,6 +223,8 @@ static bool check_hw_exists(void) if (ret) goto msr_fail; for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (fixed_counter_disabled(i)) + continue; if (val & (0x03 << i*4)) { bios_fail = 1; val_fail = val; @@ -1505,6 +1507,8 @@ void perf_event_print_debug(void) cpu, idx, prev_left); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1947,7 +1951,9 @@ static int __init init_hw_perf_events(void) pr_info("... generic registers: %d\n", x86_pmu.num_counters); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); /* diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 091420463569..7a0cb583ddbe 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2554,8 +2554,11 @@ static void intel_pmu_reset(void) wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -4907,7 +4910,7 @@ __init int intel_pmu_init(void) union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; - unsigned int unused; + unsigned int fixed_mask; struct extra_reg *er; bool pmem = false; int version, i; @@ -4929,7 +4932,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. 
*/ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -4953,12 +4956,15 @@ __init int intel_pmu_init(void) * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } + + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (version >= 4) x86_pmu.counter_freezing = !disable_counter_freezing; @@ -5504,8 +5510,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; if (x86_pmu.event_constraints) { /* @@ -5518,13 +5523,22 @@ __init int intel_pmu_init(void) * events to the generic counters. */ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) + c->idxmsk64 = 0; c->weight = hweight64(c->idxmsk64); continue; } - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= x86_pmu.intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9a377bd754f9..109fa8d11a5d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1038,6 +1038,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +static inline bool fixed_counter_disabled(int i) +{ + return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); -- Gitee From 9ecc2d5819a2385459663999ce216797ef392458 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 12 Apr 2021 07:30:55 -0700 Subject: [PATCH 181/257] perf/x86: Factor out x86_pmu_show_pmu_cap commit e11c1a7eb302ac8f6f47c18fa662546405a5fd83 upstream The PMU capabilities are different among hybrid PMUs. Perf should dump the PMU capabilities information for each hybrid PMU. Factor out x86_pmu_show_pmu_cap() which shows the PMU capabilities information. The function will be reused later when registering a dedicated hybrid PMU. 
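As an illustration of the fixed-counter line the helper prints, here is a
minimal sketch with hypothetical values: a CPU enumerating 4 fixed counters,
with counter 2 disabled via CPUID 10.ECX as in the previous patch.

    /* Hypothetical: counters 0, 1 and 3 enabled, counter 2 disabled. */
    int num_counters_fixed = 4;
    u64 intel_ctrl = 0xbULL << INTEL_PMC_IDX_FIXED;
    u64 fixed_mask = ((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;

    /* hweight64() counts the set bits: 3 usable fixed counters, not 4. */
    pr_info("... fixed-purpose events: %lu\n",
            hweight64(fixed_mask & intel_ctrl));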
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1618237865-33448-16-git-send-email-kan.liang@linux.intel.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- arch/x86/events/core.c | 25 ++++++++++++++++--------- arch/x86/events/perf_event.h | 3 +++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d36dc994a8e0..0388280b0004 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1890,6 +1890,20 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl) +{ + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & intel_ctrl)); + pr_info("... event mask: %016Lx\n", intel_ctrl); +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1946,15 +1960,8 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %lu\n", - hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) - << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed, + x86_pmu.intel_ctrl); /* * Install callbacks. Core will call them for each online diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 109fa8d11a5d..ba5d948e8979 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -997,6 +997,9 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl); + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; -- Gitee From 31839dae4e5782f56e479ce34a03adddb4add9f4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:21 -0700 Subject: [PATCH 182/257] perf/core: Factor out functions to allocate/free the task_ctx_data commit ff9ff926889dd8026b4ba55266a010c27f68604f upstream The method to allocate/free the task_ctx_data is going to be changed in the following patch. Currently, the task_ctx_data is allocated/freed in several different places. To avoid repeatedly modifying the same codes in several different places, alloc_task_ctx_data() and free_task_ctx_data() are factored out to allocate/free the task_ctx_data. The modification only needs to be applied once. 
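A sketch of the resulting call pattern (the helpers themselves are in the
diff below); every call site now funnels through the two wrappers, so a later
change to the allocation scheme lands in exactly one place:

    void *task_ctx_data = alloc_task_ctx_data(pmu);    /* was kzalloc() */

    if (!task_ctx_data)
            return -ENOMEM;
    /* ... use the PMU specific data ... */
    free_task_ctx_data(pmu, task_ctx_data);            /* was kfree() */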
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Rahul Kumar
Signed-off-by: mohanasv2
---
 kernel/events/core.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 418cbf41c7fd..634f56700e7f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1183,12 +1183,22 @@ static void get_ctx(struct perf_event_context *ctx)
 	refcount_inc(&ctx->refcount);
 }
 
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+	kfree(task_ctx_data);
+}
+
 static void free_ctx(struct rcu_head *head)
 {
 	struct perf_event_context *ctx;
 
 	ctx = container_of(head, struct perf_event_context, rcu_head);
-	kfree(ctx->task_ctx_data);
+	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
 	kfree(ctx);
 }
 
@@ -4316,7 +4326,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 		goto errout;
 
 	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+		task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!task_ctx_data) {
 			err = -ENOMEM;
 			goto errout;
@@ -4374,11 +4384,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 		}
 	}
 
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ctx;
 
 errout:
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ERR_PTR(err);
 }
 
@@ -12315,8 +12325,7 @@ inherit_event(struct perf_event *parent_event,
 	    !child_ctx->task_ctx_data) {
 		struct pmu *pmu = child_event->pmu;
 
-		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
-						   GFP_KERNEL);
+		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!child_ctx->task_ctx_data) {
 			free_event(child_event);
 			return ERR_PTR(-ENOMEM);
-- 
Gitee

From f84ac6e833b60b0c71c62aa39c9905b409a3780a Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 3 Jul 2020 05:49:22 -0700
Subject: [PATCH 183/257] perf/core: Use kmem_cache to allocate the PMU
 specific data

commit 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 upstream

Currently, the PMU specific data task_ctx_data is allocated by the
function kzalloc() in the perf generic code. When there is no specific
alignment requirement for the task_ctx_data, the method works well for
now. However, there will be a problem once a specific alignment
requirement is introduced in future features, e.g., the Architecture
LBR XSAVE feature requires 64-byte alignment. If the specific alignment
requirement is not fulfilled, the XSAVE family of instructions will
fail to save/restore the xstate to/from the task_ctx_data.

The function kzalloc() itself only guarantees a natural alignment. A
new method to allocate the task_ctx_data has to be introduced, which
has to meet the requirements as below:
- must be a generic method that can be used by different architectures,
  because the allocation of the task_ctx_data is implemented in the
  perf generic code;
- must be an alignment-guaranteeing method (the alignment requirement
  does not change after boot);
- must be able to allocate/free a buffer (smaller than a page size)
  dynamically;
- should not cause extra CPU overhead or space overhead.

Several options were considered as below:
- One option is to allocate a larger buffer for task_ctx_data. E.g.,
    ptr = kmalloc(size + alignment, GFP_KERNEL);
    ptr &= ~(alignment - 1);
  This option causes space overhead.
- Another option is to allocate the task_ctx_data in the PMU specific code. To do so, several function pointers have to be added. As a result, both the generic structure and the PMU specific structure will become bigger. Besides, extra function calls are added when allocating/freeing the buffer. This option will increase both the space overhead and CPU overhead. - The third option is to use a kmem_cache to allocate a buffer for the task_ctx_data. The kmem_cache can be created with a specific alignment requirement by the PMU at boot time. A new pointer for kmem_cache has to be added in the generic struct pmu, which would be used to dynamically allocate a buffer for the task_ctx_data at run time. Although the new pointer is added to the struct pmu, the existing variable task_ctx_size is not required anymore. The size of the generic structure is kept the same. The third option which meets all the aforementioned requirements is used to replace kzalloc() for the PMU specific data allocation. A later patch will remove the kzalloc() method and the related variables. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com Signed-off-by: Rahul Kumar Signed-off-by: mohanasv2 --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2aa48df4e89f..7f6f174f481d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -441,6 +441,11 @@ struct pmu { */ size_t task_ctx_size; + /* + * Kmem cache of PMU specific data + */ + struct kmem_cache *task_ctx_cache; + /* * Set up pmu-private data structures for an AUX area diff --git a/kernel/events/core.c b/kernel/events/core.c index 634f56700e7f..a3270243c917 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1185,12 +1185,18 @@ static void get_ctx(struct perf_event_context *ctx) static void *alloc_task_ctx_data(struct pmu *pmu) { + if (pmu->task_ctx_cache) + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); + return kzalloc(pmu->task_ctx_size, GFP_KERNEL); } static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) { - kfree(task_ctx_data); + if (pmu->task_ctx_cache && task_ctx_data) + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + else + kfree(task_ctx_data); } static void free_ctx(struct rcu_head *head) -- Gitee From 3e743e9fd402b84abee3f724d74c14c231925689 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 3 Jul 2020 05:49:23 -0700 Subject: [PATCH 184/257] perf/x86/intel/lbr: Create kmem_cache for the LBR context data commit 33cad284497cf40f55ad6029c06011de3538ebed upstream A new kmem_cache method is introduced to allocate the PMU specific data task_ctx_data, which requires the PMU specific code to create a kmem_cache. Currently, the task_ctx_data is only used by the Intel LBR call stack feature, which is introduced since Haswell. The kmem_cache should be only created for Haswell and later platforms. There is no alignment requirement for the existing platforms. 
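A minimal sketch of the mechanism (the create_lbr_kmem_cache() helper is in
the diff below; the alignment argument is 0 here since, as noted above,
existing platforms have no alignment requirement, and would become 64 for a
hypothetical Arch LBR XSAVE user):

    size_t size = sizeof(struct x86_perf_task_context);
    struct kmem_cache *cache;
    void *ctx;

    /* Created once at boot with the PMU's alignment requirement. */
    cache = kmem_cache_create("x86_lbr", size, 0 /* align */, 0, NULL);
    if (!cache)
            return;

    /* Used at run time wherever task_ctx_data is needed. */
    ctx = kmem_cache_zalloc(cache, GFP_KERNEL);
    /* ... */
    kmem_cache_free(cache, ctx);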
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.liang@linux.intel.com Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- arch/x86/events/intel/lbr.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index e01c48eaaf6b..aaf3852d26da 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1529,9 +1529,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1540,6 +1548,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1547,6 +1557,8 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1556,6 +1568,8 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. @@ -1629,6 +1643,7 @@ void __init intel_pmu_arch_lbr_init(void) union cpuid28_ebx ebx; union cpuid28_ecx ecx; unsigned int unused_edx; + size_t size; u64 lbr_nr; /* Arch LBR Capabilities */ @@ -1653,8 +1668,10 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; - x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + - lbr_nr * sizeof(struct lbr_entry); + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + x86_get_pmu()->task_ctx_size = size; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; -- Gitee From ec19b6a2726943904d1c2a407fd0d5fc8046144b Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:14 +0800 Subject: [PATCH 185/257] tools headers uapi: Update tools's copy of linux/perf_event.h commit 412736119116d0161688e9061485fbc3e25f78d5 upstream To get the changes in: Liang Kan's patch 55bcf6ef314ae8ba ("perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE") Kan's patch is in the tip/perf/core branch. So the next perf tool patches need this interface for hybrid support. 
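For example, encoding and decoding a hybrid 'cycles' event under this layout
might look as follows (pmu_type stands for a PMU type ID read from sysfs at
run time; the value is machine-specific):

    __u32 pmu_type = 0x8;    /* hypothetical PMU type ID */
    __u64 config = PERF_COUNT_HW_CPU_CYCLES |
                   ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT);

    /* And decoding it back: */
    __u32 type  = config >> PERF_PMU_TYPE_SHIFT;    /* PMU type ID */
    __u64 hw_id = config & PERF_HW_EVENT_MASK;      /* hardware event ID */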
Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-2-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/include/uapi/linux/perf_event.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index d4d99f945d13..0eeab4d13bad 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -37,6 +37,21 @@ enum perf_type_id { PERF_TYPE_MAX, /* non-ABI */ }; +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + /* * Generalized performance event event_id types, used by the * attr.event_id parameter of the sys_perf_event_open() -- Gitee From c260078f3d4dc69e9c7125dc963113c90fc36928 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 27 Apr 2021 15:01:26 +0800 Subject: [PATCH 186/257] perf record: Create two hybrid 'cycles' events by default commit b53a0755d5c2d19b13db897d6faf4969e03e45ae upstream When evlist is empty, for example no '-e' specified in perf record, one default 'cycles' event is added to evlist. While on hybrid platform, it needs to create two default 'cycles' events. One is for cpu_core, the other is for cpu_atom. This patch actually calls evsel__new_cycles() two times to create two 'cycles' events. # ./perf record -vv -a -- sleep 1 ... 
------------------------------------------------------------
perf_event_attr:
  size                             120
  config                           0x400000000
  { sample_period, sample_freq }   4000
  sample_type                      IP|TID|TIME|ID|CPU|PERIOD
  read_format                      ID
  disabled                         1
  inherit                          1
  freq                             1
  precise_ip                       3
  sample_id_all                    1
  exclude_guest                    1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 = 5
sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 = 6
sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 = 7
sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 = 9
sys_perf_event_open: pid -1 cpu 4 group_fd -1 flags 0x8 = 10
sys_perf_event_open: pid -1 cpu 5 group_fd -1 flags 0x8 = 11
sys_perf_event_open: pid -1 cpu 6 group_fd -1 flags 0x8 = 12
sys_perf_event_open: pid -1 cpu 7 group_fd -1 flags 0x8 = 13
sys_perf_event_open: pid -1 cpu 8 group_fd -1 flags 0x8 = 14
sys_perf_event_open: pid -1 cpu 9 group_fd -1 flags 0x8 = 15
sys_perf_event_open: pid -1 cpu 10 group_fd -1 flags 0x8 = 16
sys_perf_event_open: pid -1 cpu 11 group_fd -1 flags 0x8 = 17
sys_perf_event_open: pid -1 cpu 12 group_fd -1 flags 0x8 = 18
sys_perf_event_open: pid -1 cpu 13 group_fd -1 flags 0x8 = 19
sys_perf_event_open: pid -1 cpu 14 group_fd -1 flags 0x8 = 20
sys_perf_event_open: pid -1 cpu 15 group_fd -1 flags 0x8 = 21
------------------------------------------------------------
perf_event_attr:
  size                             120
  config                           0x800000000
  { sample_period, sample_freq }   4000
  sample_type                      IP|TID|TIME|ID|CPU|PERIOD
  read_format                      ID
  disabled                         1
  inherit                          1
  freq                             1
  precise_ip                       3
  sample_id_all                    1
  exclude_guest                    1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 16 group_fd -1 flags 0x8 = 22
sys_perf_event_open: pid -1 cpu 17 group_fd -1 flags 0x8 = 23
sys_perf_event_open: pid -1 cpu 18 group_fd -1 flags 0x8 = 24
sys_perf_event_open: pid -1 cpu 19 group_fd -1 flags 0x8 = 25
sys_perf_event_open: pid -1 cpu 20 group_fd -1 flags 0x8 = 26
sys_perf_event_open: pid -1 cpu 21 group_fd -1 flags 0x8 = 27
sys_perf_event_open: pid -1 cpu 22 group_fd -1 flags 0x8 = 28
sys_perf_event_open: pid -1 cpu 23 group_fd -1 flags 0x8 = 29
------------------------------------------------------------

We have to create a separate evlist-hybrid.c, otherwise 'perf test python'
would fail due to the symbol dependency.
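For reference, the two attr dumps above decode under the layout added earlier
in this series; the PMU type IDs 0x4 and 0x8 and the mapping to cpu_core
(CPUs 0-15) and cpu_atom (CPUs 16-23) are specific to this machine:

    /* config 0x400000000 = (0x4ULL << PERF_PMU_TYPE_SHIFT)
     *                      | PERF_COUNT_HW_CPU_CYCLES   -> CPUs 0-15
     * config 0x800000000 = (0x8ULL << PERF_PMU_TYPE_SHIFT)
     *                      | PERF_COUNT_HW_CPU_CYCLES   -> CPUs 16-23 */
    __u32 pmu_type = (__u32)(attr.config >> PERF_PMU_TYPE_SHIFT);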
Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20210427070139.25256-14-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 19 +++++++++++---- tools/perf/util/Build | 1 + tools/perf/util/evlist-hybrid.c | 41 +++++++++++++++++++++++++++++++++ tools/perf/util/evlist-hybrid.h | 12 ++++++++++ tools/perf/util/evlist.c | 5 +++- tools/perf/util/evsel.c | 6 ++--- tools/perf/util/evsel.h | 2 +- 7 files changed, 77 insertions(+), 9 deletions(-) create mode 100644 tools/perf/util/evlist-hybrid.c create mode 100644 tools/perf/util/evlist-hybrid.h diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 047b30ca33b4..1c73eb666f51 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -44,6 +44,8 @@ #include "util/time-utils.h" #include "util/units.h" #include "util/bpf-event.h" +#include "util/pmu-hybrid.h" +#include "util/evlist-hybrid.h" #include "asm/bug.h" #include "perf.h" @@ -2465,10 +2467,19 @@ int cmd_record(int argc, const char **argv) if (record.opts.overwrite) record.opts.tail_synthesize = true; - if (rec->evlist->core.nr_entries == 0 && - __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) { - pr_err("Not enough memory for event selector list\n"); - goto out; + if (rec->evlist->core.nr_entries == 0) { + if (perf_pmu__has_hybrid()) { + err = evlist__add_default_hybrid(rec->evlist, + !record.opts.no_samples); + } else { + err = __evlist__add_default(rec->evlist, + !record.opts.no_samples); + } + + if (err < 0) { + pr_err("Not enough memory for event selector list\n"); + goto out; + } } if (rec->opts.target.tid && !rec->opts.no_inherit_set) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 90c68a6aed1d..8ed7fc0b91a8 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -9,6 +9,7 @@ perf-y += db-export.o perf-y += env.o perf-y += event.o perf-y += evlist.o +perf-y += evlist-hybrid.o perf-y += sideband_evlist.o perf-y += evsel.o perf-y += evsel_fprintf.o diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c new file mode 100644 index 000000000000..e11998526f2e --- /dev/null +++ b/tools/perf/util/evlist-hybrid.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include "cpumap.h" +#include "evlist.h" +#include "evsel.h" +#include "../perf.h" +#include "util/pmu-hybrid.h" +#include "util/evlist-hybrid.h" +#include +#include +#include +#include +#include +#include +#include + +int evlist__add_default_hybrid(struct evlist *evlist, bool precise) +{ + struct evsel *evsel; + struct perf_pmu *pmu; + __u64 config; + struct perf_cpu_map *cpus; + + perf_pmu__for_each_hybrid_pmu(pmu) { + config = PERF_COUNT_HW_CPU_CYCLES | + ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT); + evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, + config); + if (!evsel) + return -ENOMEM; + + cpus = perf_cpu_map__get(pmu->cpus); + evsel->core.cpus = cpus; + evsel->core.own_cpus = perf_cpu_map__get(cpus); + evsel->pmu_name = strdup(pmu->name); + evlist__add(evlist, evsel); + } + + return 0; +} diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h new file mode 100644 index 000000000000..e25861649d8f --- /dev/null +++ b/tools/perf/util/evlist-hybrid.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_EVLIST_HYBRID_H +#define 
__PERF_EVLIST_HYBRID_H + +#include +#include +#include "evlist.h" +#include + +int evlist__add_default_hybrid(struct evlist *evlist, bool precise); + +#endif /* __PERF_EVLIST_HYBRID_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index bda41564d406..7443238d0ca7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -23,6 +23,7 @@ #include "bpf-event.h" #include "util/string2.h" #include "util/perf_api_probe.h" +#include "util/evlist-hybrid.h" #include #include #include @@ -213,8 +214,10 @@ void perf_evlist__set_leader(struct evlist *evlist) int __evlist__add_default(struct evlist *evlist, bool precise) { - struct evsel *evsel = evsel__new_cycles(precise); + struct evsel *evsel; + evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE, + PERF_COUNT_HW_CPU_CYCLES); if (evsel == NULL) return -ENOMEM; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cdd8738e9cdf..3bf71f2b2ba2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -292,11 +292,11 @@ static bool perf_event_can_profile_kernel(void) return perf_event_paranoid_check(1); } -struct evsel *evsel__new_cycles(bool precise) +struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config) { struct perf_event_attr attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, + .type = type, + .config = config, .exclude_kernel = !perf_event_can_profile_kernel(), }; struct evsel *evsel; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4aa78039cef7..fed1ef2a9fb2 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -177,7 +177,7 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name) return evsel__newtp_idx(sys, name, 0); } -struct evsel *evsel__new_cycles(bool precise); +struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config); struct tep_event *event_format__new(const char *sys, const char *name); -- Gitee From bb1aae9f863ae3c34656be27cef1286d87d5fec7 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 2 Nov 2021 11:01:12 +0530 Subject: [PATCH 187/257] perf evsel: Don't set exclude_guest by default commit eb39bf325631f9ae185abd16281079b7b9858737 upstream Perf tool sets exclude_guest by default while calling perf_event_open(). Because IBS does not have filtering capability, it always gets rejected by IBS PMU driver and thus perf falls back to non-precise sampling. Fix it by not setting exclude_guest by default on AMD. Before: $ sudo ./perf record -C 0 -vvv true |& grep precise precise_ip 3 decreasing precise_ip by one (2) precise_ip 2 decreasing precise_ip by one (1) precise_ip 1 decreasing precise_ip by one (0) After: $ sudo ./perf record -C 0 -vvv true |& grep precise precise_ip 3 decreasing precise_ip by one (2) precise_ip 2 Committer notes: Fixup init to zero for perf_env in older compilers: arch/x86/util/evsel.c:15:26: error: missing field 'os_release' initializer [-Werror,-Wmissing-field-initializers] struct perf_env env = {0}; ^ Committer notes: Namhyung remarked: It'd be nice if it can cover explicit "-e cycles:pp" as well. Ravi clarified: For explicit :pp modifier, evsel->precise_max does not get set and thus perf does not try with different attr->precise_ip values while exclude_guest set. 
So no issue with explicit :pp: $ sudo ./perf record -C 0 -e cycles:pp -vvv |& grep "precise_ip\|exclude_guest" precise_ip 2 exclude_guest 1 precise_ip 2 exclude_guest 1 switching off exclude_guest, exclude_host precise_ip 2 ^C Also, with :P modifier, evsel->precise_max gets set but exclude_guest does not and thus :P also works fine: $ sudo ./perf record -C 0 -e cycles:P -vvv |& grep "precise_ip\|exclude_guest" precise_ip 3 decreasing precise_ip by one (2) precise_ip 2 ^C Reported-by: Kim Phillips Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211103072112.32312-1-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/arch/x86/util/evsel.c | 23 +++++++++++++++++++++++ tools/perf/util/evsel.c | 12 +++++++----- tools/perf/util/evsel.h | 1 + 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index 2f733cdc8dbb..ac2899a25b7a 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -1,8 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "util/evsel.h" +#include "util/env.h" +#include "linux/string.h" void arch_evsel__set_sample_weight(struct evsel *evsel) { evsel__set_sample_bit(evsel, WEIGHT_STRUCT); } + +void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr) +{ + struct perf_env env = { .total_mem = 0, } ; + + if (!perf_env__cpuid(&env)) + return; + + /* + * On AMD, precise cycles event sampling internally uses IBS pmu. + * But IBS does not have filtering capabilities and perf by default + * sets exclude_guest = 1. This makes IBS pmu event init fail and + * thus perf ends up doing non-precise sampling. Avoid it by clearing + * exclude_guest. + */ + if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD")) + attr->exclude_guest = 0; + + free(env.cpuid); +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3bf71f2b2ba2..abdf91ca843d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -292,7 +292,7 @@ static bool perf_event_can_profile_kernel(void) return perf_event_paranoid_check(1); } -struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config) +struct evsel *evsel__new_cycles(bool precise __maybe_unused, __u32 type, __u64 config) { struct perf_event_attr attr = { .type = type, @@ -303,18 +303,16 @@ struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config) event_attr_init(&attr); - if (!precise) - goto new_event; - /* * Now let the usual logic to set up the perf_event_attr defaults * to kick in when we return and before perf_evsel__open() is called. 
*/ -new_event: evsel = evsel__new(&attr); if (evsel == NULL) goto out; + arch_evsel__fixup_new_cycles(&evsel->core.attr); + evsel->precise_max = true; /* use asprintf() because free(evsel) assumes name is allocated */ @@ -930,6 +928,10 @@ void __weak arch_evsel__set_sample_weight(struct evsel *evsel) evsel__set_sample_bit(evsel, WEIGHT); } +void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_unused) +{ +} + /* * The enable_on_exec/disabled value strategy: * diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index fed1ef2a9fb2..97848fd39b67 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -232,6 +232,7 @@ int perf_evsel__append_tp_filter(struct evsel *evsel, const char *filter); int perf_evsel__append_addr_filter(struct evsel *evsel, const char *filter); void arch_evsel__set_sample_weight(struct evsel *evsel); +void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr); int evsel__enable(struct evsel *evsel); int evsel__disable(struct evsel *evsel); -- Gitee From adf177282b703c7e90e51d8f30bac488ffceba58 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:18 -0800 Subject: [PATCH 188/257] perf stat: Factor out open error handling commit e0e6a6ca3ac211cc07486330a2b89f41ea31b4dd upstream Factor out the open error handling into a separate function. This is useful for followon patches who need to duplicate this. No behavior change intended. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191121001522.180827-9-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/builtin-stat.c | 100 +++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b5a82969dab4..60c3f47d569a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -420,6 +420,57 @@ static bool is_target_alive(struct target *_target, return false; } +enum counter_recovery { + COUNTER_SKIP, + COUNTER_RETRY, + COUNTER_FATAL, +}; + +static enum counter_recovery stat_handle_error(struct evsel *counter) +{ + char msg[BUFSIZ]; + /* + * PPC returns ENXIO for HW counters until 2.6.37 + * (behavior changed with commit b0a873e). + */ + if (errno == EINVAL || errno == ENOSYS || + errno == ENOENT || errno == EOPNOTSUPP || + errno == ENXIO) { + if (verbose > 0) + ui__warning("%s event is not supported by the kernel.\n", + perf_evsel__name(counter)); + counter->supported = false; + + if ((counter->leader != counter) || + !(counter->leader->core.nr_members > 1)) + return COUNTER_SKIP; + } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (verbose > 0) + ui__warning("%s\n", msg); + return COUNTER_RETRY; + } else if (target__has_per_thread(&target) && + evsel_list->core.threads && + evsel_list->core.threads->err_thread != -1) { + /* + * For global --per-thread case, skip current + * error thread. 
+ */ + if (!thread_map__remove(evsel_list->core.threads, + evsel_list->core.threads->err_thread)) { + evsel_list->core.threads->err_thread = -1; + return COUNTER_RETRY; + } + } + + perf_evsel__open_strerror(counter, &target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); + + if (child_pid != -1) + kill(child_pid, SIGTERM); + return COUNTER_FATAL; +} + static int __run_perf_stat(int argc, const char **argv, int run_idx) { int interval = stat_config.interval; @@ -469,47 +520,16 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) goto try_again; } - /* - * PPC returns ENXIO for HW counters until 2.6.37 - * (behavior changed with commit b0a873e). - */ - if (errno == EINVAL || errno == ENOSYS || - errno == ENOENT || errno == EOPNOTSUPP || - errno == ENXIO) { - if (verbose > 0) - ui__warning("%s event is not supported by the kernel.\n", - perf_evsel__name(counter)); - counter->supported = false; - - if ((counter->leader != counter) || - !(counter->leader->core.nr_members > 1)) - continue; - } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { - if (verbose > 0) - ui__warning("%s\n", msg); - goto try_again; - } else if (target__has_per_thread(&target) && - evsel_list->core.threads && - evsel_list->core.threads->err_thread != -1) { - /* - * For global --per-thread case, skip current - * error thread. - */ - if (!thread_map__remove(evsel_list->core.threads, - evsel_list->core.threads->err_thread)) { - evsel_list->core.threads->err_thread = -1; - goto try_again; - } + switch (stat_handle_error(counter)) { + case COUNTER_FATAL: + return -1; + case COUNTER_RETRY: + goto try_again; + case COUNTER_SKIP: + continue; + default: + break; } - - perf_evsel__open_strerror(counter, &target, - errno, msg, sizeof(msg)); - ui__error("%s\n", msg); - - if (child_pid != -1) - kill(child_pid, SIGTERM); - - return -1; } counter->supported = true; -- Gitee From feec51afdeee1c83614e82142cfd5b7e5e8f98cc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:12 -0800 Subject: [PATCH 189/257] perf affinity: Add infrastructure to save/restore affinity commit 267ed5d8593cedd6146eabe00d270629c9cff771 upstream The kernel perf subsystem has to IPI to the target CPU for many operations. On systems with many CPUs and when managing many events the overhead can be dominated by lots of IPIs. An alternative is to set up CPU affinity in the perf tool, then set up all the events for that CPU, and then move on to the next CPU. Add some affinity management infrastructure to enable such a model. Used in followon patches. Committer notes: Use zfree() in some places, add missing stdbool.h header, some minor coding style changes. 
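A sketch of the intended usage pattern (the perf stat patch later in this
series follows exactly this shape; the CPU iterator here is left abstract):

    struct affinity affinity;
    int cpu;

    if (affinity__setup(&affinity) < 0)
            return -1;
    for_each_cpu_of_interest(cpu) {         /* hypothetical iterator */
            affinity__set(&affinity, cpu);  /* migrate perf to 'cpu' */
            /* set up all events for 'cpu'; kernel IPIs become local calls */
    }
    affinity__cleanup(&affinity);           /* restore the original mask */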
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191121001522.180827-3-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/util/Build | 1 + tools/perf/util/affinity.c | 73 ++++++++++++++++++++++++++++++ tools/perf/util/affinity.h | 17 +++++++ tools/perf/util/python-ext-sources | 1 + 4 files changed, 92 insertions(+) create mode 100644 tools/perf/util/affinity.c create mode 100644 tools/perf/util/affinity.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 8ed7fc0b91a8..d9df8a2a83cd 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -79,6 +79,7 @@ perf-y += sort.o perf-y += hist.o perf-y += util.o perf-y += cpumap.o +perf-y += affinity.o perf-y += cputopo.o perf-y += cgroup.o perf-y += target.o diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c new file mode 100644 index 000000000000..a5e31f826828 --- /dev/null +++ b/tools/perf/util/affinity.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Manage affinity to optimize IPIs inside the kernel perf API. */ +#define _GNU_SOURCE 1 +#include +#include +#include +#include +#include "perf.h" +#include "cpumap.h" +#include "affinity.h" + +static int get_cpu_set_size(void) +{ + int sz = cpu__max_cpu() + 8 - 1; + /* + * sched_getaffinity doesn't like masks smaller than the kernel. + * Hopefully that's big enough. + */ + if (sz < 4096) + sz = 4096; + return sz / 8; +} + +int affinity__setup(struct affinity *a) +{ + int cpu_set_size = get_cpu_set_size(); + + a->orig_cpus = bitmap_alloc(cpu_set_size * 8); + if (!a->orig_cpus) + return -1; + sched_getaffinity(0, cpu_set_size, (cpu_set_t *)a->orig_cpus); + a->sched_cpus = bitmap_alloc(cpu_set_size * 8); + if (!a->sched_cpus) { + zfree(&a->orig_cpus); + return -1; + } + bitmap_zero((unsigned long *)a->sched_cpus, cpu_set_size); + a->changed = false; + return 0; +} + +/* + * perf_event_open does an IPI internally to the target CPU. + * It is more efficient to change perf's affinity to the target + * CPU and then set up all events on that CPU, so we amortize + * CPU communication. + */ +void affinity__set(struct affinity *a, int cpu) +{ + int cpu_set_size = get_cpu_set_size(); + + if (cpu == -1) + return; + a->changed = true; + set_bit(cpu, a->sched_cpus); + /* + * We ignore errors because affinity is just an optimization. + * This could happen for example with isolated CPUs or cpusets. + * In this case the IPIs inside the kernel's perf API still work. 
+ */ + sched_setaffinity(0, cpu_set_size, (cpu_set_t *)a->sched_cpus); + clear_bit(cpu, a->sched_cpus); +} + +void affinity__cleanup(struct affinity *a) +{ + int cpu_set_size = get_cpu_set_size(); + + if (a->changed) + sched_setaffinity(0, cpu_set_size, (cpu_set_t *)a->orig_cpus); + zfree(&a->sched_cpus); + zfree(&a->orig_cpus); +} diff --git a/tools/perf/util/affinity.h b/tools/perf/util/affinity.h new file mode 100644 index 000000000000..0ad6a18ef20c --- /dev/null +++ b/tools/perf/util/affinity.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef PERF_AFFINITY_H +#define PERF_AFFINITY_H 1 + +#include + +struct affinity { + unsigned long *orig_cpus; + unsigned long *sched_cpus; + bool changed; +}; + +void affinity__cleanup(struct affinity *a); +void affinity__set(struct affinity *a, int cpu); +int affinity__setup(struct affinity *a); + +#endif // PERF_AFFINITY_H diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 9af183860fbd..e7279ea6043a 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -33,3 +33,4 @@ util/trace-event.c util/string.c util/symbol_fprintf.c util/units.c +util/affinity.c -- Gitee From fadabe01600349380961c1a87db9bf719992b818 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:15 -0800 Subject: [PATCH 190/257] perf evsel: Add iterator to iterate over events ordered by CPU commit a8cbe40fe9f4ba499cc60b8b6a6851c2c1963797 upstream Add some common code that is needed to iterate over all events in CPU order. Used in followon patches Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191121001522.180827-6-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/util/cpumap.h | 1 + tools/perf/util/evlist.c | 32 ++++++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 8 ++++++++ tools/perf/util/evsel.h | 1 + 4 files changed, 42 insertions(+) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 2553bef1279d..dbc1d7e949ed 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -60,4 +60,5 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, int cpu_map__cpu(struct perf_cpu_map *cpus, int idx); bool cpu_map__has(struct perf_cpu_map *cpus, int cpu); + #endif /* __PERF_CPUMAP_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 7443238d0ca7..d971c12b9653 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -324,6 +324,38 @@ static int perf_evlist__nr_threads(struct evlist *evlist, return perf_thread_map__nr(evlist->core.threads); } +void evlist__cpu_iter_start(struct evlist *evlist) +{ + struct evsel *pos; + + /* + * Reset the per evsel cpu_iter. This is needed because + * each evsel's cpumap may have a different index space, + * and some operations need the index to modify + * the FD xyarray (e.g. 
open, close) + */ + evlist__for_each_entry(evlist, pos) + pos->cpu_iter = 0; +} + +bool evsel__cpu_iter_skip_no_inc(struct evsel *ev, int cpu) +{ + if (ev->cpu_iter >= ev->core.cpus->nr) + return true; + if (cpu >= 0 && ev->core.cpus->map[ev->cpu_iter] != cpu) + return true; + return false; +} + +bool evsel__cpu_iter_skip(struct evsel *ev, int cpu) +{ + if (!evsel__cpu_iter_skip_no_inc(ev, cpu)) { + ev->cpu_iter++; + return false; + } + return true; +} + void evlist__disable(struct evlist *evlist) { struct evsel *pos; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index e98644a65498..a7094df7b816 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -320,9 +320,17 @@ void perf_evlist__to_front(struct evlist *evlist, #define evlist__for_each_entry_safe(evlist, tmp, evsel) \ __evlist__for_each_entry_safe(&(evlist)->core.entries, tmp, evsel) +#define evlist__for_each_cpu(evlist, index, cpu) \ + evlist__cpu_iter_start(evlist); \ + perf_cpu_map__for_each_cpu (cpu, index, (evlist)->core.all_cpus) + void perf_evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel); +void evlist__cpu_iter_start(struct evlist *evlist); +bool evsel__cpu_iter_skip(struct evsel *ev, int cpu); +bool evsel__cpu_iter_skip_no_inc(struct evsel *ev, int cpu); + struct evsel * perf_evlist__find_evsel_by_str(struct evlist *evlist, const char *str); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 97848fd39b67..ddb255bd837e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -95,6 +95,7 @@ struct evsel { bool collect_stat; bool weak_group; bool percore; + int cpu_iter; const char *pmu_name; struct { perf_evsel__sb_cb_t *cb; -- Gitee From 0d4cd43e5d3689a878dbb51824eee59d90becda8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:14 -0800 Subject: [PATCH 191/257] perf evlist: Maintain evlist->all_cpus commit a2408a70368ade9c99de27da78d49416313b8833 upstream Maintain a cpumap in the evlist that is the union of all the cpus of the events. This needs a cpumap merge operation, which is added together with tests. v2: Add tests for cpu map merge Fix handling of duplicates Rename _update to _merge Factor out sorting. Fix handling of NULL maps in merge v3: Add comments and empty lines to _merge Committer testing: # perf test "Merge cpu map" 52: Merge cpu map : Ok # Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Link: http://lore.kernel.org/lkml/20191121001522.180827-5-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/lib/cpumap.c | 57 ++++++++++++++++++++++++ tools/perf/lib/evlist.c | 1 + tools/perf/lib/include/internal/evlist.h | 1 + tools/perf/lib/include/perf/cpumap.h | 2 + tools/perf/tests/builtin-test.c | 5 +++ tools/perf/tests/cpumap.c | 16 +++++++ tools/perf/tests/tests.h | 1 + 7 files changed, 83 insertions(+) diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c index 2ca1fafa620d..5e1ae7a23591 100644 --- a/tools/perf/lib/cpumap.c +++ b/tools/perf/lib/cpumap.c @@ -272,3 +272,60 @@ int perf_cpu_map__max(struct perf_cpu_map *map) return max; } + +/* + * Merge two cpumaps + * + * orig either gets freed and replaced with a new map, or reused + * with no reference count change (similar to "realloc") + * other has its reference count increased. 
+ */ + +struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, + struct perf_cpu_map *other) +{ + int *tmp_cpus; + int tmp_len; + int i, j, k; + struct perf_cpu_map *merged; + + if (!orig && !other) + return NULL; + if (!orig) { + perf_cpu_map__get(other); + return other; + } + if (!other) + return orig; + if (orig->nr == other->nr && + !memcmp(orig->map, other->map, orig->nr * sizeof(int))) + return orig; + + tmp_len = orig->nr + other->nr; + tmp_cpus = malloc(tmp_len * sizeof(int)); + if (!tmp_cpus) + return NULL; + + /* Standard merge algorithm from wikipedia */ + i = j = k = 0; + while (i < orig->nr && j < other->nr) { + if (orig->map[i] <= other->map[j]) { + if (orig->map[i] == other->map[j]) + j++; + tmp_cpus[k++] = orig->map[i++]; + } else + tmp_cpus[k++] = other->map[j++]; + } + + while (i < orig->nr) + tmp_cpus[k++] = orig->map[i++]; + + while (j < other->nr) + tmp_cpus[k++] = other->map[j++]; + assert(k <= tmp_len); + + merged = cpu_map__trim_new(k, tmp_cpus); + free(tmp_cpus); + perf_cpu_map__put(orig); + return merged; +} diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c index d1496fee810c..b7e8b3b15f25 100644 --- a/tools/perf/lib/evlist.c +++ b/tools/perf/lib/evlist.c @@ -46,6 +46,7 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, perf_thread_map__put(evsel->threads); evsel->threads = perf_thread_map__get(evlist->threads); + evlist->all_cpus = perf_cpu_map__merge(evlist->all_cpus, evsel->cpus); } static void perf_evlist__propagate_maps(struct perf_evlist *evlist) diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h index 9f440ab12b76..096bf0b05e04 100644 --- a/tools/perf/lib/include/internal/evlist.h +++ b/tools/perf/lib/include/internal/evlist.h @@ -17,6 +17,7 @@ struct perf_evlist { int nr_entries; bool has_user_cpus; struct perf_cpu_map *cpus; + struct perf_cpu_map *all_cpus; struct perf_thread_map *threads; int nr_mmaps; size_t mmap_len; diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h index ac9aa497f84a..6a17ad730cbc 100644 --- a/tools/perf/lib/include/perf/cpumap.h +++ b/tools/perf/lib/include/perf/cpumap.h @@ -12,6 +12,8 @@ LIBPERF_API struct perf_cpu_map *perf_cpu_map__dummy_new(void); LIBPERF_API struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list); LIBPERF_API struct perf_cpu_map *perf_cpu_map__read(FILE *file); LIBPERF_API struct perf_cpu_map *perf_cpu_map__get(struct perf_cpu_map *map); +LIBPERF_API struct perf_cpu_map *perf_cpu_map__merge(struct perf_cpu_map *orig, + struct perf_cpu_map *other); LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map); LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx); LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus); diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 55774baffc2a..d111d2ed1c10 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -259,6 +259,11 @@ static struct test generic_tests[] = { .desc = "Print cpu map", .func = test__cpu_map_print, }, + { + .desc = "Merge cpu map", + .func = test__cpu_map_merge, + }, + { .desc = "Probe SDT events", .func = test__sdt_event, diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 8a0d236202b0..4ac56741ac5f 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -120,3 +120,19 @@ int test__cpu_map_print(struct test *test __maybe_unused, int subtest __maybe_un TEST_ASSERT_VAL("failed to 
convert map", cpu_map_print("1-10,12-20,22-30,32-40")); return 0; } + +int test__cpu_map_merge(struct test *test __maybe_unused, int subtest __maybe_unused) +{ + struct perf_cpu_map *a = perf_cpu_map__new("4,2,1"); + struct perf_cpu_map *b = perf_cpu_map__new("4,5,7"); + struct perf_cpu_map *c = perf_cpu_map__merge(a, b); + char buf[100]; + + TEST_ASSERT_VAL("failed to merge map: bad nr", c->nr == 5); + cpu_map__snprint(c, buf, sizeof(buf)); + TEST_ASSERT_VAL("failed to merge map: bad result", !strcmp(buf, "1-2,4-5,7")); + perf_cpu_map__put(a); + perf_cpu_map__put(b); + perf_cpu_map__put(c); + return 0; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 72912eb473cb..27e3c65c6955 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -98,6 +98,7 @@ int test__event_update(struct test *test, int subtest); int test__event_times(struct test *test, int subtest); int test__backward_ring_buffer(struct test *test, int subtest); int test__cpu_map_print(struct test *test, int subtest); +int test__cpu_map_merge(struct test *test, int subtest); int test__sdt_event(struct test *test, int subtest); int test__is_printable_array(struct test *test, int subtest); int test__bitmap_print(struct test *test, int subtest); -- Gitee From e02e2e3d639d7cc3bc967052985448e09303df8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 20 Oct 2019 10:51:55 -0700 Subject: [PATCH 192/257] perf evsel: Avoid close(-1) commit 2ccfb8bc2143ca347609d1d4434176d73a78d805 upstream In some weak fallback cases close can be called a lot with -1. Check for this case and avoid calling close then. This is mainly to shut up valgrind which complains about this case. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20191020175202.32456-3-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/lib/evsel.c | 3 ++- tools/perf/util/evsel.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/lib/evsel.c b/tools/perf/lib/evsel.c index a8cb582e2721..5a89857b0381 100644 --- a/tools/perf/lib/evsel.c +++ b/tools/perf/lib/evsel.c @@ -120,7 +120,8 @@ void perf_evsel__close_fd(struct perf_evsel *evsel) for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) for (thread = 0; thread < xyarray__max_y(evsel->fd); ++thread) { - close(FD(evsel, cpu, thread)); + if (FD(evsel, cpu, thread) >= 0) + close(FD(evsel, cpu, thread)); FD(evsel, cpu, thread) = -1; } } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index abdf91ca843d..1470f6c328f1 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1885,7 +1885,8 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, do { while (--thread >= 0) { - close(FD(evsel, cpu, thread)); + if (FD(evsel, cpu, thread) >= 0) + close(FD(evsel, cpu, thread)); FD(evsel, cpu, thread) = -1; } thread = nthreads; -- Gitee From b539010de10e2425685e8bd026c14ae22f73ca63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Nov 2019 16:15:16 -0800 Subject: [PATCH 193/257] perf evsel: Add functions to close evsel on a CPU commit 99d6141d677a8cd0b35390a29527c8def42538b1 upstream Refactor the existing all CPU function to use the per CPU close internally. Export APIs to close per CPU. 
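A sketch of how a caller can now tear down a single CPU's FDs without closing
the whole evsel (the perf stat rework at the end of this series uses it this
way; the reset_group, errored and cpu_iter fields are added by neighbouring
patches):

    evlist__for_each_entry(evsel_list, counter) {
            if (!counter->reset_group && !counter->errored)
                    continue;
            /* Close only this CPU's FD; the other CPUs stay open. */
            perf_evsel__close_cpu(&counter->core, counter->cpu_iter);
    }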
Signed-off-by: Andi Kleen
Acked-by: Jiri Olsa
Link: http://lore.kernel.org/lkml/20191121001522.180827-7-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo
Signed-off-by: priyanka-mani
Signed-off-by: mohanasv2
---
 tools/perf/lib/evsel.c              | 27 +++++++++++++++++++++------
 tools/perf/lib/include/perf/evsel.h |  1 +
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/tools/perf/lib/evsel.c b/tools/perf/lib/evsel.c
index 5a89857b0381..ea775dacbd2d 100644
--- a/tools/perf/lib/evsel.c
+++ b/tools/perf/lib/evsel.c
@@ -114,16 +114,23 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 	return err;
 }
 
+static void perf_evsel__close_fd_cpu(struct perf_evsel *evsel, int cpu)
+{
+	int thread;
+
+	for (thread = 0; thread < xyarray__max_y(evsel->fd); ++thread) {
+		if (FD(evsel, cpu, thread) >= 0)
+			close(FD(evsel, cpu, thread));
+		FD(evsel, cpu, thread) = -1;
+	}
+}
+
 void perf_evsel__close_fd(struct perf_evsel *evsel)
 {
-	int cpu, thread;
+	int cpu;
 
 	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++)
-		for (thread = 0; thread < xyarray__max_y(evsel->fd); ++thread) {
-			if (FD(evsel, cpu, thread) >= 0)
-				close(FD(evsel, cpu, thread));
-			FD(evsel, cpu, thread) = -1;
-		}
+		perf_evsel__close_fd_cpu(evsel, cpu);
 }
 
 void perf_evsel__free_fd(struct perf_evsel *evsel)
@@ -141,6 +148,14 @@ void perf_evsel__close(struct perf_evsel *evsel)
 	perf_evsel__free_fd(evsel);
 }
 
+void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu)
+{
+	if (evsel->fd == NULL)
+		return;
+
+	perf_evsel__close_fd_cpu(evsel, cpu);
+}
+
 int perf_evsel__read_size(struct perf_evsel *evsel)
 {
 	u64 read_format = evsel->attr.read_format;
diff --git a/tools/perf/lib/include/perf/evsel.h b/tools/perf/lib/include/perf/evsel.h
index 4388667f265c..ed10a914cd3f 100644
--- a/tools/perf/lib/include/perf/evsel.h
+++ b/tools/perf/lib/include/perf/evsel.h
@@ -28,6 +28,7 @@ LIBPERF_API void perf_evsel__delete(struct perf_evsel *evsel);
 LIBPERF_API int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 				 struct perf_thread_map *threads);
 LIBPERF_API void perf_evsel__close(struct perf_evsel *evsel);
+LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
 LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 				 struct perf_counts_values *count);
 LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
-- 
Gitee

From e6ff94e6cc6d0df8c26818218200feff2430ed4b Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 20 Nov 2019 16:15:19 -0800
Subject: [PATCH 194/257] perf stat: Use affinity for opening events

commit 4804e0111662d7d89edf4e767a64c6f7e4778bb1 upstream

Restructure the event opening in perf stat to cycle through the events
by CPU after setting affinity to that CPU. This eliminates IPI overhead
in the perf API.

We have to loop through the CPUs in the outer builtin-stat code instead
of leaving that to low-level functions.

This requires changing the weak group fallback strategy slightly. Since
we cannot easily undo the opens for other CPUs, the weak group retry is
moved to a separate loop.
Before with a large test case with 94 CPUs: % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 42.75 4.050910 67 60046 110 perf_event_open After: 26.86 0.944396 16 58069 110 perf_event_open (the number changes slightly because the weak group retries work differently and the test case relies on weak groups) Committer notes: Added one of the hunks in a patch provided by Andi after I noticed that the "event times" 'perf test' entry was segfaulting. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lore.kernel.org/lkml/20191121001522.180827-10-andi@firstfloor.org Link: http://lore.kernel.org/lkml/20191127232657.GL84886@tassilo.jf.intel.com # Fix Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-stat.c | 121 +++++++++++++++++++++++++++------ tools/perf/tests/event-times.c | 4 +- tools/perf/util/evlist.c | 10 ++- tools/perf/util/evlist.h | 3 +- tools/perf/util/evsel.c | 22 ++++-- tools/perf/util/evsel.h | 5 +- tools/perf/util/stat.c | 5 +- tools/perf/util/stat.h | 3 +- 9 files changed, 141 insertions(+), 34 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1c73eb666f51..4f30f7c1e079 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -796,7 +796,7 @@ static int record__open(struct record *rec) if ((errno == EINVAL || errno == EBADF) && pos->leader != pos && pos->weak_group) { - pos = perf_evlist__reset_weak_group(evlist, pos); + pos = perf_evlist__reset_weak_group(evlist, pos, true); goto try_again; } rc = -errno; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 60c3f47d569a..40c6c2d822e1 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/target.h" #include "util/time-utils.h" #include "util/top.h" +#include "util/affinity.h" #include "asm/bug.h" #include @@ -440,6 +441,11 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) ui__warning("%s event is not supported by the kernel.\n", perf_evsel__name(counter)); counter->supported = false; + /* + * errored is a sticky flag that means one of the counter's + * cpu event had a problem and needs to be reexamined. + */ + counter->errored = true; if ((counter->leader != counter) || !(counter->leader->core.nr_members > 1)) @@ -484,6 +490,9 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) int status = 0; const bool forks = (argc > 0); bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; + struct affinity affinity; + int i, cpu; + bool second_pass = false; if (interval) { ts.tv_sec = interval / USEC_PER_MSEC; @@ -508,30 +517,104 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (group) perf_evlist__set_leader(evsel_list); - evlist__for_each_entry(evsel_list, counter) { + if (affinity__setup(&affinity) < 0) + return -1; + + evlist__for_each_cpu (evsel_list, i, cpu) { + affinity__set(&affinity, cpu); + + evlist__for_each_entry(evsel_list, counter) { + if (evsel__cpu_iter_skip(counter, cpu)) + continue; + if (counter->reset_group || counter->errored) + continue; try_again: - if (create_perf_stat_counter(counter, &stat_config, &target) < 0) { - - /* Weak group failed. Reset the group. 
*/ - if ((errno == EINVAL || errno == EBADF) && - counter->leader != counter && - counter->weak_group) { - counter = perf_evlist__reset_weak_group(evsel_list, counter); - goto try_again; + if (create_perf_stat_counter(counter, &stat_config, &target, + counter->cpu_iter - 1) < 0) { + + /* + * Weak group failed. We cannot just undo this here + * because earlier CPUs might be in group mode, and the kernel + * doesn't support mixing group and non group reads. Defer + * it to later. + * Don't close here because we're in the wrong affinity. + */ + if ((errno == EINVAL || errno == EBADF) && + counter->leader != counter && + counter->weak_group) { + perf_evlist__reset_weak_group(evsel_list, counter, false); + assert(counter->reset_group); + second_pass = true; + continue; + } + + switch (stat_handle_error(counter)) { + case COUNTER_FATAL: + return -1; + case COUNTER_RETRY: + goto try_again; + case COUNTER_SKIP: + continue; + default: + break; + } + } + counter->supported = true; + } + } - switch (stat_handle_error(counter)) { - case COUNTER_FATAL: - return -1; - case COUNTER_RETRY: - goto try_again; - case COUNTER_SKIP: - continue; - default: - break; + if (second_pass) { + /* + * Now redo all the weak group after closing them, + * and also close errored counters. + */ + + evlist__for_each_cpu(evsel_list, i, cpu) { + affinity__set(&affinity, cpu); + /* First close errored or weak retry */ + evlist__for_each_entry(evsel_list, counter) { + if (!counter->reset_group && !counter->errored) + continue; + if (evsel__cpu_iter_skip_no_inc(counter, cpu)) + continue; + perf_evsel__close_cpu(&counter->core, counter->cpu_iter); + } + /* Now reopen weak */ + evlist__for_each_entry(evsel_list, counter) { + if (!counter->reset_group && !counter->errored) + continue; + if (evsel__cpu_iter_skip(counter, cpu)) + continue; + if (!counter->reset_group) + continue; +try_again_reset: + pr_debug2("reopening weak %s\n", perf_evsel__name(counter)); + if (create_perf_stat_counter(counter, &stat_config, &target, + counter->cpu_iter - 1) < 0) { + + switch (stat_handle_error(counter)) { + case COUNTER_FATAL: + return -1; + case COUNTER_RETRY: + goto try_again_reset; + case COUNTER_SKIP: + continue; + default: + break; + } + } + counter->supported = true; } } - counter->supported = true; + } + affinity__cleanup(&affinity); + + evlist__for_each_entry(evsel_list, counter) { + if (!counter->supported) { + perf_evsel__free_fd(&counter->core); + continue; + } l = strlen(counter->unit); if (l > stat_config.unit_width) diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c index 1ee8704e2284..1e8a9f5c356d 100644 --- a/tools/perf/tests/event-times.c +++ b/tools/perf/tests/event-times.c @@ -125,7 +125,7 @@ static int attach__cpu_disabled(struct evlist *evlist) evsel->core.attr.disabled = 1; - err = perf_evsel__open_per_cpu(evsel, cpus); + err = perf_evsel__open_per_cpu(evsel, cpus, -1); if (err) { if (err == -EACCES) return TEST_SKIP; @@ -152,7 +152,7 @@ static int attach__cpu_enabled(struct evlist *evlist) return -1; } - err = perf_evsel__open_per_cpu(evsel, cpus); + err = perf_evsel__open_per_cpu(evsel, cpus, -1); if (err == -EACCES) return TEST_SKIP; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d971c12b9653..84c252d28eb6 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1722,7 +1722,8 @@ void perf_evlist__force_leader(struct evlist *evlist) } struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, - struct evsel *evsel) + struct evsel *evsel, + bool 
close) { struct evsel *c2, *leader; bool is_open = true; @@ -1739,10 +1740,15 @@ struct evsel *perf_evlist__reset_weak_group(struct evlist *evsel_list, if (c2 == evsel) is_open = false; if (c2->leader == leader) { - if (is_open) + if (is_open && close) perf_evsel__close(&c2->core); c2->leader = c2; c2->core.nr_members = 0; + /* + * Set this for all former members of the group + * to indicate they get reopened. + */ + c2->reset_group = true; } } return leader; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a7094df7b816..29c550216a62 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -342,5 +342,6 @@ bool perf_evlist__exclude_kernel(struct evlist *evlist); void perf_evlist__force_leader(struct evlist *evlist); struct evsel *perf_evlist__reset_weak_group(struct evlist *evlist, - struct evsel *evsel); + struct evsel *evsel, + bool close); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1470f6c328f1..6b4e7bf75312 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1610,8 +1610,9 @@ static int perf_event_open(struct evsel *evsel, return fd; } -int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, - struct perf_thread_map *threads) +static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, + struct perf_thread_map *threads, + int start_cpu, int end_cpu) { int cpu, thread, nthreads; unsigned long flags = PERF_FLAG_FD_CLOEXEC; @@ -1694,7 +1695,7 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, display_attr(&evsel->core.attr); - for (cpu = 0; cpu < cpus->nr; cpu++) { + for (cpu = start_cpu; cpu < end_cpu; cpu++) { for (thread = 0; thread < nthreads; thread++) { int fd, group_fd; @@ -1894,6 +1895,12 @@ int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, return err; } +int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, + struct perf_thread_map *threads) +{ + return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1); +} + void evsel__close(struct evsel *evsel) { perf_evsel__close(&evsel->core); @@ -1901,9 +1908,14 @@ void evsel__close(struct evsel *evsel) } int perf_evsel__open_per_cpu(struct evsel *evsel, - struct perf_cpu_map *cpus) + struct perf_cpu_map *cpus, + int cpu) { - return evsel__open(evsel, cpus, NULL); + if (cpu == -1) + return evsel__open_cpu(evsel, cpus, NULL, 0, + cpus ? 
cpus->nr : 1); + + return evsel__open_cpu(evsel, cpus, NULL, cpu, cpu + 1); } int perf_evsel__open_per_thread(struct evsel *evsel, diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ddb255bd837e..af87690775a7 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -94,6 +94,8 @@ struct evsel { struct evsel *metric_leader; bool collect_stat; bool weak_group; + bool reset_group; + bool errored; bool percore; int cpu_iter; const char *pmu_name; @@ -239,7 +241,8 @@ int evsel__enable(struct evsel *evsel); int evsel__disable(struct evsel *evsel); int perf_evsel__open_per_cpu(struct evsel *evsel, - struct perf_cpu_map *cpus); + struct perf_cpu_map *cpus, + int cpu); int perf_evsel__open_per_thread(struct evsel *evsel, struct perf_thread_map *threads); int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 5156aa971fbb..93c95b4b13dd 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -465,7 +465,8 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp) int create_perf_stat_counter(struct evsel *evsel, struct perf_stat_config *config, - struct target *target) + struct target *target, + int cpu) { struct perf_event_attr *attr = &evsel->core.attr; struct evsel *leader = evsel->leader; @@ -509,7 +510,7 @@ int create_perf_stat_counter(struct evsel *evsel, } if (target__has_cpu(target) && !target__has_per_thread(target)) - return perf_evsel__open_per_cpu(evsel, evsel__cpus(evsel)); + return perf_evsel__open_per_cpu(evsel, evsel__cpus(evsel), cpu); return perf_evsel__open_per_thread(evsel, evsel->core.threads); } diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index edbeb2f63e8d..0773e4b7ec44 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -211,7 +211,8 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp); int create_perf_stat_counter(struct evsel *evsel, struct perf_stat_config *config, - struct target *target); + struct target *target, + int cpu); void perf_evlist__print_counters(struct evlist *evlist, struct perf_stat_config *config, -- Gitee From 0bea731989feb97c6b19064c0feee7c1e7bc7750 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 29 Apr 2020 15:50:10 -0300 Subject: [PATCH 195/257] perf evsel: Rename perf_evsel__find_pmu() to evsel__find_pmu() commit e76026bdd51bcd4a0b9793c655891cde45367f5f upstream As it is a 'struct evsel' method, not part of tools/lib/perf/, aka libperf, to whom the perf_ prefix belongs. 
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/util/auxtrace.c | 2 +- tools/perf/util/evsel.h | 2 +- tools/perf/util/pmu.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ea11526bef07..185136bae6f6 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -2379,7 +2379,7 @@ static int parse_addr_filter(struct evsel *evsel, const char *filter, static int perf_evsel__nr_addr_filter(struct evsel *evsel) { - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); + struct perf_pmu *pmu = evsel__find_pmu(evsel); int nr_addr_filters = 0; if (!pmu) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index af87690775a7..2b4c33e773de 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -160,7 +160,7 @@ int perf_evsel__object_config(size_t object_size, int (*init)(struct evsel *evsel), void (*fini)(struct evsel *evsel)); -struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel); +struct perf_pmu *evsel__find_pmu(struct evsel *evsel); bool perf_evsel__is_aux_event(struct evsel *evsel); struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index c0f8bdb65ccc..2677ea6b9344 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -901,7 +901,7 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu) return NULL; } -struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) +struct perf_pmu *evsel__find_pmu(struct evsel *evsel) { struct perf_pmu *pmu = NULL; @@ -915,7 +915,7 @@ struct perf_pmu *perf_evsel__find_pmu(struct evsel *evsel) bool perf_evsel__is_aux_event(struct evsel *evsel) { - struct perf_pmu *pmu = perf_evsel__find_pmu(evsel); + struct perf_pmu *pmu = evsel__find_pmu(evsel); return pmu && pmu->auxtrace; } -- Gitee From 68c0c20a7de63263554a331e50de0cd11908ae39 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 28 Feb 2020 08:30:00 -0800 Subject: [PATCH 196/257] perf tools: Add hw_idx in struct branch_stack commit 42bbabed09ce6208026648a71a45b4394c74585a upstream The low level index of raw branch records for the most recent branch can be recorded in a sample with PERF_SAMPLE_BRANCH_HW_INDEX branch_sample_type. Extend struct branch_stack to support it. However, if PERF_SAMPLE_BRANCH_HW_INDEX is not applied, only nr and entries[] will be output by the kernel. The pointer of entries[] could be wrong, since the output format is different from the new struct branch_stack. Add a variable no_hw_idx in struct perf_sample to indicate whether the hw_idx is output. Add get_branch_entry() to return the corresponding pointer of entries[0]. To make the dummy branch sample consistent with the new branch sample, add hw_idx in struct dummy_branch_stack for cs-etm and intel-pt. Apply the new struct branch_stack for synthetic events as well. Extend the sample-parsing test case to support the new struct branch_stack. Committer notes: Renamed get_branch_entries() to perf_sample__branch_entries() to have proper namespacing and pave the way for this to be moved to libperf, eventually. Add 'static' to that inline as it is in a header. Add 'hw_idx' to 'struct dummy_branch_stack' in cs-etm.c to fix the build on arm64.
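The practical rule for consumers follows from the notes above: never index br->entries[] directly, since the entries may or may not be preceded by hw_idx. A minimal access sketch (process_branch() is a hypothetical consumer, not part of this patch):

	struct branch_stack *br = sample->branch_stack;
	struct branch_entry *entries = perf_sample__branch_entries(sample);
	u64 i;

	for (i = 0; i < br->nr; i++)	/* nr is still read from the stack itself */
		process_branch(entries[i].from, entries[i].to,
			       entries[i].flags.mispred);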
Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexey Budankov Cc: Andi Kleen Cc: Jiri Olsa Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200228163011.19358-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/builtin-script.c | 70 ++++++++++--------- tools/perf/tests/sample-parsing.c | 7 +- tools/perf/util/branch.h | 22 ++++++ tools/perf/util/cs-etm.c | 2 + tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 5 ++ tools/perf/util/evsel.h | 5 ++ tools/perf/util/hist.c | 3 +- tools/perf/util/intel-pt.c | 2 + tools/perf/util/machine.c | 35 +++++----- .../scripting-engines/trace-event-python.c | 30 ++++---- tools/perf/util/session.c | 8 ++- tools/perf/util/synthetic-events.c | 6 +- 13 files changed, 125 insertions(+), 71 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index bbf1f2d3387e..bb64dbfe043a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -735,6 +735,7 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -743,8 +744,8 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, return 0; for (i = 0; i < br->nr; i++) { - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (PRINT_FIELD(DSO)) { memset(&alf, 0, sizeof(alf)); @@ -768,10 +769,10 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample, } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -782,6 +783,7 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -793,8 +795,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; thread__find_symbol_fb(thread, sample->cpumode, from, &alf); thread__find_symbol_fb(thread, sample->cpumode, to, &alt); @@ -813,10 +815,10 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str( br->entries + i), - br->entries[i].flags.in_tx? 'X' : '-', - br->entries[i].flags.abort? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 
'A' : '-', + entries[i].flags.cycles); } return printed; @@ -827,6 +829,7 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, struct perf_event_attr *attr, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct addr_location alf, alt; u64 i, from, to; int printed = 0; @@ -838,8 +841,8 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, memset(&alf, 0, sizeof(alf)); memset(&alt, 0, sizeof(alt)); - from = br->entries[i].from; - to = br->entries[i].to; + from = entries[i].from; + to = entries[i].to; if (thread__find_map_fb(thread, sample->cpumode, from, &alf) && !alf.map->dso->adjust_symbols) @@ -862,10 +865,10 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample, printed += fprintf(fp, ")"); } printed += fprintf(fp, "/%c/%c/%c/%d ", - mispred_str(br->entries + i), - br->entries[i].flags.in_tx ? 'X' : '-', - br->entries[i].flags.abort ? 'A' : '-', - br->entries[i].flags.cycles); + mispred_str(entries + i), + entries[i].flags.in_tx ? 'X' : '-', + entries[i].flags.abort ? 'A' : '-', + entries[i].flags.cycles); } return printed; @@ -1011,6 +1014,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, struct machine *machine, FILE *fp) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 start, end; int i, insn, len, nr, ilen, printed = 0; struct perf_insn x; @@ -1031,31 +1035,31 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += fprintf(fp, "%c", '\n'); /* Handle first from jump, of which we don't know the entry. */ - len = grab_bb(buffer, br->entries[nr-1].from, - br->entries[nr-1].from, + len = grab_bb(buffer, entries[nr-1].from, + entries[nr-1].from, machine, thread, &x.is64bit, &x.cpumode, false); if (len > 0) { - printed += ip__fprintf_sym(br->entries[nr - 1].from, thread, + printed += ip__fprintf_sym(entries[nr - 1].from, thread, x.cpumode, x.cpu, &lastsym, attr, fp); - printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1], + printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1], &x, buffer, len, 0, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) - printed += print_srccode(thread, x.cpumode, br->entries[nr - 1].from); + printed += print_srccode(thread, x.cpumode, entries[nr - 1].from); } /* Print all blocks */ for (i = nr - 2; i >= 0; i--) { - if (br->entries[i].from || br->entries[i].to) + if (entries[i].from || entries[i].to) pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i, - br->entries[i].from, - br->entries[i].to); - start = br->entries[i + 1].to; - end = br->entries[i].from; + entries[i].from, + entries[i].to); + start = entries[i + 1].to; + end = entries[i].from; len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); /* Patch up missing kernel transfers due to ring filters */ if (len == -ENXIO && i > 0) { - end = br->entries[--i].from; + end = entries[--i].from; pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end); len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false); } @@ -1068,7 +1072,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp); if (ip == end) { - printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, ++insn, fp, + printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - 
off, ++insn, fp, &total_cycles); if (PRINT_FIELD(SRCCODE)) printed += print_srccode(thread, x.cpumode, ip); @@ -1092,9 +1096,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * Hit the branch? In this case we are already done, and the target * has not been executed yet. */ - if (br->entries[0].from == sample->ip) + if (entries[0].from == sample->ip) goto out; - if (br->entries[0].flags.abort) + if (entries[0].flags.abort) goto out; /* @@ -1105,7 +1109,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample, * between final branch and sample. When this happens just * continue walking after the last TO until we hit a branch. */ - start = br->entries[0].to; + start = entries[0].to; end = sample->ip; if (end < start) { /* Missing jump. Scan 128 bytes for the next branch */ diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 12b8278a0e35..31cca805daff 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -99,6 +99,7 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_BRANCH_STACK) { COMP(branch_stack->nr); + COMP(branch_stack->hw_idx); for (i = 0; i < s1->branch_stack->nr; i++) MCOMP(branch_stack->entries[i]); } @@ -189,7 +190,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) u64 data[64]; } branch_stack = { /* 1 branch_entry */ - .data = {1, 211, 212, 213}, + .data = {1, -1ULL, 211, 212, 213}, }; u64 regs[64]; const u32 raw_data[] = {0x12345678, 0x0a0b0c0d, 0x11020304, 0x05060708, 0 }; @@ -211,6 +212,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .transaction = 112, .raw_data = (void *)raw_data, .callchain = &callchain.callchain, + .no_hw_idx = false, .branch_stack = &branch_stack.branch_stack, .user_regs = { .abi = PERF_SAMPLE_REGS_ABI_64, @@ -248,6 +250,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) if (sample_type & PERF_SAMPLE_REGS_INTR) evsel.core.attr.sample_regs_intr = sample_regs; + if (sample_type & PERF_SAMPLE_BRANCH_STACK) + evsel.core.attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX; + for (i = 0; i < sizeof(regs); i++) *(i + (u8 *)regs) = i & 0xfe; diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h index 88e00d268f6f..154a05cd03af 100644 --- a/tools/perf/util/branch.h +++ b/tools/perf/util/branch.h @@ -12,6 +12,7 @@ #include #include #include +#include "event.h" struct branch_flags { u64 mispred:1; @@ -39,9 +40,30 @@ struct branch_entry { struct branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries[0]; }; +/* + * The hw_idx is only available when PERF_SAMPLE_BRANCH_HW_INDEX is applied. + * Otherwise, the output format of a sample with branch stack is + * struct branch_stack { + * u64 nr; + * struct branch_entry entries[0]; + * } + * Check whether the hw_idx is available, + * and return the corresponding pointer of entries[0]. 
+ */ +static inline struct branch_entry *perf_sample__branch_entries(struct perf_sample *sample) +{ + u64 *entry = (u64 *)sample->branch_stack; + + entry++; + if (sample->no_hw_idx) + return (struct branch_entry *)entry; + return (struct branch_entry *)(++entry); +} + struct branch_type_stat { bool branch_to; u64 counts[PERF_BR_MAX]; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index f5a9cb408808..f9cc15f93c4a 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1192,6 +1192,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, union perf_event *event = tidq->event_buf; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; u64 ip; @@ -1222,6 +1223,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq, if (etm->synth_opts.last_branch) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index c8d31205a3dd..c2fd88a2c09e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -142,6 +142,7 @@ struct perf_sample { u16 insn_len; u8 cpumode; u16 misc; + bool no_hw_idx; /* No hw_idx collected in branch_stack */ char insn[MAX_INSN]; void *raw_data; struct ip_callchain *callchain; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6b4e7bf75312..272f77ce05c4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2207,7 +2207,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event, if (data->branch_stack->nr > max_branch_nr) return -EFAULT; + sz = data->branch_stack->nr * sizeof(struct branch_entry); + if (perf_evsel__has_branch_hw_idx(evsel)) + sz += sizeof(u64); + else + data->no_hw_idx = true; OVERFLOW_CHECK(array, sz, max_size); array = (void *)array + sz; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 2b4c33e773de..7a0eb975f8bf 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -402,6 +402,11 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel) return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } +static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel) +{ + return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + static inline bool evsel__has_callchain(const struct evsel *evsel) { /* diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 7b6eaf5e0bda..151b9e43c88f 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2572,9 +2572,10 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode) { struct branch_info *bi; + struct branch_entry *entries = perf_sample__branch_entries(sample); /* If we have branch cycles always annotate them. 
*/ - if (bs && bs->nr && bs->entries[0].flags.cycles) { + if (bs && bs->nr && entries[0].flags.cycles) { int i; bi = sample__resolve_bstack(sample, al); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index bdb4b2586a7d..5e72294bc030 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1279,6 +1279,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) struct perf_sample sample = { .ip = 0, }; struct dummy_branch_stack { u64 nr; + u64 hw_idx; struct branch_entry entries; } dummy_bs; @@ -1300,6 +1301,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) { dummy_bs = (struct dummy_branch_stack){ .nr = 1, + .hw_idx = -1ULL, .entries = { .from = sample.ip, .to = sample.addr, diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c02ec8fcce55..7e3957602885 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2094,15 +2094,16 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, { unsigned int i; const struct branch_stack *bs = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info)); if (!bi) return NULL; for (i = 0; i < bs->nr; i++) { - ip__resolve_ams(al->thread, &bi[i].to, bs->entries[i].to); - ip__resolve_ams(al->thread, &bi[i].from, bs->entries[i].from); - bi[i].flags = bs->entries[i].flags; + ip__resolve_ams(al->thread, &bi[i].to, entries[i].to); + ip__resolve_ams(al->thread, &bi[i].from, entries[i].from); + bi[i].flags = entries[i].flags; } return bi; } @@ -2198,6 +2199,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, /* LBR only affects the user callchain */ if (i != chain_nr) { struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); int lbr_nr = lbr_stack->nr, j, k; bool branch; struct branch_flags *flags; @@ -2223,31 +2225,29 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = chain->ips[j]; else if (j > i + 1) { k = j - i - 2; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; + flags = &entries[k].flags; } else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } else { if (j < lbr_nr) { k = lbr_nr - j - 1; - ip = lbr_stack->entries[k].from; + ip = entries[k].from; branch = true; - flags = &lbr_stack->entries[k].flags; + flags = &entries[k].flags; } else if (j > lbr_nr) ip = chain->ips[i + 1 - (j - lbr_nr)]; else { - ip = lbr_stack->entries[0].to; + ip = entries[0].to; branch = true; - flags = &lbr_stack->entries[0].flags; - branch_from = - lbr_stack->entries[0].from; + flags = &entries[0].flags; + branch_from = entries[0].from; } } @@ -2294,6 +2294,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, int max_stack) { struct branch_stack *branch = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); struct ip_callchain *chain = sample->callchain; int chain_nr = 0; u8 cpumode = PERF_RECORD_MISC_USER; @@ -2341,7 +2342,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, for (i = 0; i < nr; i++) { if (callchain_param.order == ORDER_CALLEE) { - be[i] = 
branch->entries[i]; + be[i] = entries[i]; if (chain == NULL) continue; @@ -2360,7 +2361,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, be[i].from >= chain->ips[first_call] - 8) first_call++; } else - be[i] = branch->entries[branch->nr - i - 1]; + be[i] = entries[branch->nr - i - 1]; } memset(iter, 0, sizeof(struct iterations) * nr); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 3b02c3f1b289..2bdd10c4c246 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -464,6 +464,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; @@ -484,28 +485,28 @@ static PyObject *python_process_brstack(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); pydict_set_item_string_decref(pyelem, "from", - PyLong_FromUnsignedLongLong(br->entries[i].from)); + PyLong_FromUnsignedLongLong(entries[i].from)); pydict_set_item_string_decref(pyelem, "to", - PyLong_FromUnsignedLongLong(br->entries[i].to)); + PyLong_FromUnsignedLongLong(entries[i].to)); pydict_set_item_string_decref(pyelem, "mispred", - PyBool_FromLong(br->entries[i].flags.mispred)); + PyBool_FromLong(entries[i].flags.mispred)); pydict_set_item_string_decref(pyelem, "predicted", - PyBool_FromLong(br->entries[i].flags.predicted)); + PyBool_FromLong(entries[i].flags.predicted)); pydict_set_item_string_decref(pyelem, "in_tx", - PyBool_FromLong(br->entries[i].flags.in_tx)); + PyBool_FromLong(entries[i].flags.in_tx)); pydict_set_item_string_decref(pyelem, "abort", - PyBool_FromLong(br->entries[i].flags.abort)); + PyBool_FromLong(entries[i].flags.abort)); pydict_set_item_string_decref(pyelem, "cycles", - PyLong_FromUnsignedLongLong(br->entries[i].flags.cycles)); + PyLong_FromUnsignedLongLong(entries[i].flags.cycles)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "from_dsoname", _PyUnicode_FromString(dsoname)); thread__find_map_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); dsoname = get_dsoname(al.map); pydict_set_item_string_decref(pyelem, "to_dsoname", _PyUnicode_FromString(dsoname)); @@ -561,6 +562,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, struct thread *thread) { struct branch_stack *br = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); PyObject *pylist; u64 i; char bf[512]; @@ -581,22 +583,22 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, Py_FatalError("couldn't create Python dictionary"); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].from, &al); + entries[i].from, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "from", _PyUnicode_FromString(bf)); thread__find_symbol_fb(thread, sample->cpumode, - br->entries[i].to, &al); + entries[i].to, &al); get_symoff(al.sym, &al, true, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "to", _PyUnicode_FromString(bf)); - get_br_mspred(&br->entries[i].flags, bf, sizeof(bf)); + get_br_mspred(&entries[i].flags, bf, sizeof(bf)); pydict_set_item_string_decref(pyelem, "pred", _PyUnicode_FromString(bf)); - if 
(br->entries[i].flags.in_tx) { + if (entries[i].flags.in_tx) { pydict_set_item_string_decref(pyelem, "in_tx", _PyUnicode_FromString("X")); } else { @@ -604,7 +606,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample, _PyUnicode_FromString("-")); } - if (br->entries[i].flags.abort) { + if (entries[i].flags.abort) { pydict_set_item_string_decref(pyelem, "abort", _PyUnicode_FromString("A")); } else { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index e8a4516070b3..7710481ab01a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1006,6 +1006,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) { struct ip_callchain *callchain = sample->callchain; struct branch_stack *lbr_stack = sample->branch_stack; + struct branch_entry *entries = perf_sample__branch_entries(sample); u64 kernel_callchain_nr = callchain->nr; unsigned int i; @@ -1042,10 +1043,10 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample) i, callchain->ips[i]); printf("..... %2d: %016" PRIx64 "\n", - (int)(kernel_callchain_nr), lbr_stack->entries[0].to); + (int)(kernel_callchain_nr), entries[0].to); for (i = 0; i < lbr_stack->nr; i++) printf("..... %2d: %016" PRIx64 "\n", - (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from); + (int)(i + kernel_callchain_nr + 1), entries[i].from); } } @@ -1067,6 +1068,7 @@ static void callchain__printf(struct evsel *evsel, static void branch_stack__printf(struct perf_sample *sample, bool callstack) { + struct branch_entry *entries = perf_sample__branch_entries(sample); uint64_t i; printf("%s: nr:%" PRIu64 "\n", @@ -1074,7 +1076,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack) sample->branch_stack->nr); for (i = 0; i < sample->branch_stack->nr; i++) { - struct branch_entry *e = &sample->branch_stack->entries[i]; + struct branch_entry *e = &entries[i]; if (!callstack) { printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n", diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 2c0f7ec3f8b7..b7d0f6614daf 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1183,7 +1183,8 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, if (type & PERF_SAMPLE_BRANCH_STACK) { sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); + /* nr, hw_idx */ + sz += 2 * sizeof(u64); result += sz; } @@ -1353,7 +1354,8 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo if (type & PERF_SAMPLE_BRANCH_STACK) { sz = sample->branch_stack->nr * sizeof(struct branch_entry); - sz += sizeof(u64); + /* nr, hw_idx */ + sz += 2 * sizeof(u64); memcpy(array, sample->branch_stack, sz); array = (void *)array + sz; } -- Gitee From 1bd16b453487354ef9669f622ea692ff7fac011d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:12 +0530 Subject: [PATCH 197/257] perf record ibs: Warn about sampling period skew commit 9ab95b0b15a092abb7c3309f3580f572a041f9ab upstream Samples without an L3 miss are discarded and counter is reset with random value (between 1-15 for fetch PMU and 1-127 for op PMU) when IBS L3 miss filtering is enabled. This causes a sampling period skew but there is no way to reconstruct aggregated sampling period. So print a warning at perf record if user sets l3missonly=1. 
Ex: # perf record -c 10000 -C 0 -e ibs_op/l3missonly=1/ WARNING: Hw internally resets sampling period when L3 Miss Filtering is enabled and tagged operation does not cause L3 Miss. This causes sampling period skew. [Backport changes] 1. In the tools/perf/arch/x86/util/evsel.c file, the #include "util/pmu.h" directive was added manually instead of backporting upstream commit d98079c05b5a54, in order to avoid introducing over 20 dependencies. 2. In tools/perf/arch/x86/util/evsel.c, inside arch__post_evsel_config(), the function name evsel__find_pmu() was manually changed back to perf_evsel__find_pmu() to match the current kernel. The upstream commit e76026bdd51bcd only renames the function without modifying its logic, but backporting it would require over 20 extra patches. So the change was made manually to avoid that overhead. Signed-off-by: Ravi Bangoria Acked-by: Ian Rogers Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: http://lore.kernel.org/lkml/20220604044519.594-2-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: priyanka-mani Signed-off-by: mohanasv2 --- tools/perf/arch/x86/util/evsel.c | 52 ++++++++++++++++++++++++++++++++ tools/perf/util/evsel.c | 7 +++++ tools/perf/util/evsel.h | 1 + 3 files changed, 60 insertions(+) diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index ac2899a25b7a..6222bbb26c65 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -3,7 +3,12 @@ #include #include "util/evsel.h" #include "util/env.h" +#include "util/pmu.h" #include "linux/string.h" +#include "util/debug.h" + +#define IBS_FETCH_L3MISSONLY (1ULL << 59) +#define IBS_OP_L3MISSONLY (1ULL << 16) void arch_evsel__set_sample_weight(struct evsel *evsel) { @@ -29,3 +34,50 @@ void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr) free(env.cpuid); } +static void ibs_l3miss_warn(void) +{ + pr_warning( +"WARNING: Hw internally resets sampling period when L3 Miss Filtering is enabled\n" +"and tagged operation does not cause L3 Miss. 
This causes sampling period skew.\n"); +} + +void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr) +{ + struct perf_pmu *evsel_pmu, *ibs_fetch_pmu, *ibs_op_pmu; + static int warned_once; + /* 0: Uninitialized, 1: Yes, -1: No */ + static int is_amd; + + if (warned_once || is_amd == -1) + return; + + if (!is_amd) { + struct perf_env *env = perf_evsel__env(evsel); + + if (!perf_env__cpuid(env) || !env->cpuid || + !strstarts(env->cpuid, "AuthenticAMD")) { + is_amd = -1; + return; + } + is_amd = 1; + } + + evsel_pmu = evsel__find_pmu(evsel); + if (!evsel_pmu) + return; + + ibs_fetch_pmu = perf_pmu__find("ibs_fetch"); + ibs_op_pmu = perf_pmu__find("ibs_op"); + + if (ibs_fetch_pmu && ibs_fetch_pmu->type == evsel_pmu->type) { + if (attr->config & IBS_FETCH_L3MISSONLY) { + ibs_l3miss_warn(); + warned_once = 1; + } + } else if (ibs_op_pmu && ibs_op_pmu->type == evsel_pmu->type) { + if (attr->config & IBS_OP_L3MISSONLY) { + ibs_l3miss_warn(); + warned_once = 1; + } + } +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 272f77ce05c4..147855005d31 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -932,6 +932,11 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un { } +void __weak arch__post_evsel_config(struct evsel *evsel __maybe_unused, + struct perf_event_attr *attr __maybe_unused) +{ +} + /* * The enable_on_exec/disabled value strategy: * @@ -1201,6 +1206,8 @@ void perf_evsel__config(struct evsel *evsel, struct record_opts *opts, */ if (opts->initial_delay && is_dummy_event(evsel)) evsel__reset_sample_bit(evsel, BRANCH_STACK); + + arch__post_evsel_config(evsel, attr); } int perf_evsel__set_filter(struct evsel *evsel, const char *filter) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 7a0eb975f8bf..a7e7a25af2be 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -236,6 +236,7 @@ int perf_evsel__append_addr_filter(struct evsel *evsel, const char *filter); void arch_evsel__set_sample_weight(struct evsel *evsel); void arch_evsel__fixup_new_cycles(struct perf_event_attr *attr); +void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr); int evsel__enable(struct evsel *evsel); int evsel__disable(struct evsel *evsel); -- Gitee From 17bef68c8588c81535e5a8a35ae8d1f05a635c5c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:13 +0530 Subject: [PATCH 198/257] perf pmu: Parse pmu caps sysfs only once commit 3339ec44be7f9963c82d5d21e163f16f96e5ca58 upstream In addition to returning nr_caps, cache it locally in struct perf_pmu. Similarly, cache status of whether caps sysfs has already been parsed or not. These will help to avoid parsing sysfs every time the function gets called. 
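For callers this amounts to memoization; a minimal usage sketch (assuming pmu points to a valid struct perf_pmu):

	int nr_caps = perf_pmu__caps_parse(pmu);	/* first call scans sysfs */

	/* Subsequent calls short-circuit on caps_initialized and
	 * return the cached pmu->nr_caps. */
	nr_caps = perf_pmu__caps_parse(pmu);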
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-3-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- tools/perf/util/pmu.c | 15 +++++++++++---- tools/perf/util/pmu.h | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 2677ea6b9344..708cbd322d61 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1651,7 +1651,11 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) const char *sysfs = sysfs__mountpoint(); DIR *caps_dir; struct dirent *evt_ent; - int nr_caps = 0; + + if (pmu->caps_initialized) + return pmu->nr_caps; + + pmu->nr_caps = 0; if (!sysfs) return -1; @@ -1659,8 +1663,10 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) snprintf(caps_path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH "%s/caps", sysfs, pmu->name); - if (stat(caps_path, &st) < 0) + if (stat(caps_path, &st) < 0) { + pmu->caps_initialized = true; return 0; /* no error if caps does not exist */ + } caps_dir = opendir(caps_path); if (!caps_dir) @@ -1687,13 +1693,14 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) continue; } - nr_caps++; + pmu->nr_caps++; fclose(file); } closedir(caps_dir); - return nr_caps; + pmu->caps_initialized = true; + return pmu->nr_caps; } void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 57c7bf476350..e0cf540acac2 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -42,6 +42,8 @@ struct perf_pmu { struct perf_cpu_map *cpus; struct list_head format; /* HEAD struct perf_pmu_format -> list */ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */ + bool caps_initialized; + u32 nr_caps; struct list_head caps; /* HEAD struct perf_pmu_caps -> list */ struct list_head list; /* ELEM */ struct list_head hybrid_list; -- Gitee From 36497088cfe275ea172ff7decb84231e4ec2d54f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 19 Mar 2020 13:25:02 -0700 Subject: [PATCH 199/257] perf header: Support CPU PMU capabilities commit 6f91ea283a1ed23e4a548ddd62db6deb2c707f82 upstream To stitch LBR call stack, the max LBR information is required. So the CPU PMU capabilities information has to be stored in perf header. Add a new feature HEADER_CPU_PMU_CAPS for CPU PMU capabilities. Retrieve all CPU PMU capabilities, not just max LBR information. Add variable max_branches to facilitate future usage. Committer testing: # ls -la /sys/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 17 10:53 . drwxr-xr-x. 6 root root 0 Apr 17 07:02 .. -r--r--r--. 1 root root 4096 Apr 17 10:53 max_precise # # cat /sys/devices/cpu/caps/max_precise 0 # perf record sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.033 MB perf.data (7 samples) ] # # perf report --header-only | egrep 'cpu(desc|.*capabilities)' # cpudesc : AMD Ryzen 5 3600X 6-Core Processor # cpu pmu capabilities: max_precise=0 # And then on an Intel machine: $ ls -la /sys/devices/cpu/caps/ total 0 drwxr-xr-x. 2 root root 0 Apr 17 10:51 . drwxr-xr-x. 6 root root 0 Apr 17 10:04 .. 
-r--r--r--. 1 root root 4096 Apr 17 11:37 branches -r--r--r--. 1 root root 4096 Apr 17 10:51 max_precise -r--r--r--. 1 root root 4096 Apr 17 11:37 pmu_name $ cat /sys/devices/cpu/caps/max_precise 3 $ cat /sys/devices/cpu/caps/branches 32 $ cat /sys/devices/cpu/caps/pmu_name skylake $ perf record sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (8 samples) ] $ perf report --header-only | egrep 'cpu(desc|.*capabilities)' # cpudesc : Intel(R) Core(TM) i5-7500 CPU @ 3.40GHz # cpu pmu capabilities: branches=32, max_precise=3, pmu_name=skylake $ Signed-off-by: Kan Liang Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexey Budankov Cc: Mathieu Poirier Cc: Michael Ellerman Cc: Namhyung Kim Cc: Pavel Gerasimov Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Stephane Eranian Cc: Vitaly Slobodskoy Link: http://lore.kernel.org/lkml/20200319202517.23423-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- .../Documentation/perf.data-file-format.txt | 16 +++ tools/perf/util/env.h | 3 + tools/perf/util/header.c | 108 ++++++++++++++++++ tools/perf/util/header.h | 1 + 4 files changed, 128 insertions(+) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index b0152e1095c5..b6472e463284 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -373,6 +373,22 @@ struct { Indicates that trace contains records of PERF_RECORD_COMPRESSED type that have perf_events records in compressed form. + HEADER_CPU_PMU_CAPS = 28, + + A list of cpu PMU capabilities. The format of data is as below. 
+ +struct { + u32 nr_cpu_pmu_caps; + { + char name[]; + char value[]; + } [nr_cpu_pmu_caps] +}; + + +Example: + cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 0b862b4a01e3..fd2b29aabc42 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -48,6 +48,7 @@ struct perf_env { char *cpuid; unsigned long long total_mem; unsigned int msr_pmu_type; + unsigned int max_branches; int nr_cmdline; int nr_sibling_cores; @@ -57,12 +58,14 @@ struct perf_env { int nr_memory_nodes; int nr_pmu_mappings; int nr_groups; + int nr_cpu_pmu_caps; char *cmdline; const char **cmdline_argv; char *sibling_cores; char *sibling_dies; char *sibling_threads; char *pmu_mappings; + char *cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; int caches_cnt; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d3412f2c0d18..f6d6dc793e90 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1395,6 +1395,38 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } +static int write_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + struct perf_pmu_caps *caps = NULL; + int nr_caps; + int ret; + + if (!cpu_pmu) + return -ENOENT; + + nr_caps = perf_pmu__caps_parse(cpu_pmu); + if (nr_caps < 0) + return nr_caps; + + ret = do_write(ff, &nr_caps, sizeof(nr_caps)); + if (ret < 0) + return ret; + + list_for_each_entry(caps, &cpu_pmu->caps, list) { + ret = do_write_string(ff, caps->name); + if (ret < 0) + return ret; + + ret = do_write_string(ff, caps->value); + if (ret < 0) + return ret; + } + + return ret; +} + static void print_hostname(struct feat_fd *ff, FILE *fp) { fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname); @@ -1772,6 +1804,27 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } +static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +{ + const char *delimiter = "# cpu pmu capabilities: "; + u32 nr_caps = ff->ph->env.nr_cpu_pmu_caps; + char *str; + + if (!nr_caps) { + fprintf(fp, "# cpu pmu capabilities: not available\n"); + return; + } + + str = ff->ph->env.cpu_pmu_caps; + while (nr_caps--) { + fprintf(fp, "%s%s", delimiter, str); + delimiter = ", "; + str += strlen(str) + 1; + } + + fprintf(fp, "\n"); +} + static void print_pmu_mappings(struct feat_fd *ff, FILE *fp) { const char *delimiter = "# pmu mappings: "; @@ -2809,6 +2862,60 @@ static int process_compressed(struct feat_fd *ff, return 0; } +static int process_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + char *name, *value; + struct strbuf sb; + u32 nr_caps; + + if (do_read_u32(ff, &nr_caps)) + return -1; + + if (!nr_caps) { + pr_debug("cpu pmu capabilities not available\n"); + return 0; + } + + ff->ph->env.nr_cpu_pmu_caps = nr_caps; + + if (strbuf_init(&sb, 128) < 0) + return -1; + + while (nr_caps--) { + name = do_read_string(ff); + if (!name) + goto error; + + value = do_read_string(ff); + if (!value) + goto free_name; + + if (strbuf_addf(&sb, "%s=%s", name, value) < 0) + goto free_value; + + /* include a NULL character at the end */ + if (strbuf_add(&sb, "", 1) < 0) + goto free_value; + + if (!strcmp(name, "branches")) + ff->ph->env.max_branches = atoi(value); + + 
free(value); + free(name); + } + ff->ph->env.cpu_pmu_caps = strbuf_detach(&sb, NULL); + return 0; + +free_value: + free(value); +free_name: + free(name); +error: + strbuf_release(&sb); + return -1; +} + #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -2866,6 +2973,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(BPF_PROG_INFO, bpf_prog_info, false), FEAT_OPR(BPF_BTF, bpf_btf, false), FEAT_OPR(COMPRESSED, compressed, false), + FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index ca53a929e9fd..ae8a7108f52b 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -43,6 +43,7 @@ enum { HEADER_BPF_PROG_INFO, HEADER_BPF_BTF, HEADER_COMPRESSED, + HEADER_CPU_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- Gitee From 87885cbe4ec298ee6da2a15721047bc2bc527711 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 5 Aug 2020 11:34:38 +0200 Subject: [PATCH 200/257] perf clockid: Move parse_clockid() to new clockid object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6953beb4ddab87ad00029d8a11c1f985df09981e upstream Move parse_clockid and all needed clockid related stuff into the clockid object. We are going to add the clockid_name function in a following change, so it's better placed in a separate object and not in builtin-record.c. No functional change is intended. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Geneviève Bastien Cc: Ian Rogers Cc: Jeremie Galarneau Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lore.kernel.org/lkml/20200805093444.314999-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- tools/perf/builtin-record.c | 98 +------------------------------- tools/perf/util/Build | 1 + tools/perf/util/clockid.c | 108 ++++++++++++++++++++++++++++++++++++ tools/perf/util/clockid.h | 9 +++ 4 files changed, 119 insertions(+), 97 deletions(-) create mode 100644 tools/perf/util/clockid.c create mode 100644 tools/perf/util/clockid.h diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4f30f7c1e079..f2d60eb9679e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -46,6 +46,7 @@ #include "util/bpf-event.h" #include "util/pmu-hybrid.h" #include "util/evlist-hybrid.h" +#include "util/clockid.h" #include "asm/bug.h" #include "perf.h" @@ -1872,103 +1873,6 @@ static int perf_record_config(const char *var, const char *value, void *cb) return 0; } -struct clockid_map { - const char *name; - int clockid; -}; - -#define CLOCKID_MAP(n, c) \ - { .name = n, .clockid = (c), } - -#define CLOCKID_END { .name = NULL, } - - -/* - * Add the missing ones, we need to build on many distros... 
- */ -#ifndef CLOCK_MONOTONIC_RAW -#define CLOCK_MONOTONIC_RAW 4 -#endif -#ifndef CLOCK_BOOTTIME -#define CLOCK_BOOTTIME 7 -#endif -#ifndef CLOCK_TAI -#define CLOCK_TAI 11 -#endif - -static const struct clockid_map clockids[] = { - /* available for all events, NMI safe */ - CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), - CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), - - /* available for some events */ - CLOCKID_MAP("realtime", CLOCK_REALTIME), - CLOCKID_MAP("boottime", CLOCK_BOOTTIME), - CLOCKID_MAP("tai", CLOCK_TAI), - - /* available for the lazy */ - CLOCKID_MAP("mono", CLOCK_MONOTONIC), - CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), - CLOCKID_MAP("real", CLOCK_REALTIME), - CLOCKID_MAP("boot", CLOCK_BOOTTIME), - - CLOCKID_END, -}; - -static int get_clockid_res(clockid_t clk_id, u64 *res_ns) -{ - struct timespec res; - - *res_ns = 0; - if (!clock_getres(clk_id, &res)) - *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; - else - pr_warning("WARNING: Failed to determine specified clock resolution.\n"); - - return 0; -} - -static int parse_clockid(const struct option *opt, const char *str, int unset) -{ - struct record_opts *opts = (struct record_opts *)opt->value; - const struct clockid_map *cm; - const char *ostr = str; - - if (unset) { - opts->use_clockid = 0; - return 0; - } - - /* no arg passed */ - if (!str) - return 0; - - /* no setting it twice */ - if (opts->use_clockid) - return -1; - - opts->use_clockid = true; - - /* if its a number, we're done */ - if (sscanf(str, "%d", &opts->clockid) == 1) - return get_clockid_res(opts->clockid, &opts->clockid_res_ns); - - /* allow a "CLOCK_" prefix to the name */ - if (!strncasecmp(str, "CLOCK_", 6)) - str += 6; - - for (cm = clockids; cm->name; cm++) { - if (!strcasecmp(str, cm->name)) { - opts->clockid = cm->clockid; - return get_clockid_res(opts->clockid, - &opts->clockid_res_ns); - } - } - - opts->use_clockid = false; - ui__warning("unknown clockid %s, check man page\n", ostr); - return -1; -} static int record__parse_affinity(const struct option *opt, const char *str, int unset) { diff --git a/tools/perf/util/Build b/tools/perf/util/Build index d9df8a2a83cd..4f8265ddda0e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -128,6 +128,7 @@ perf-y += time-utils.o perf-y += expr-bison.o perf-y += branch.o perf-y += mem2node.o +perf-y += clockid.o perf-$(CONFIG_LIBBPF) += bpf-loader.o perf-$(CONFIG_LIBBPF) += bpf_map.o diff --git a/tools/perf/util/clockid.c b/tools/perf/util/clockid.c new file mode 100644 index 000000000000..b7a08606dc3e --- /dev/null +++ b/tools/perf/util/clockid.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "debug.h" +#include "clockid.h" +#include "record.h" + +struct clockid_map { + const char *name; + int clockid; +}; + +#define CLOCKID_MAP(n, c) \ + { .name = n, .clockid = (c), } + +#define CLOCKID_END { .name = NULL, } + + +/* + * Add the missing ones, we need to build on many distros... 
+ */ +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif +#ifndef CLOCK_BOOTTIME +#define CLOCK_BOOTTIME 7 +#endif +#ifndef CLOCK_TAI +#define CLOCK_TAI 11 +#endif + +static const struct clockid_map clockids[] = { + /* available for all events, NMI safe */ + CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), + CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), + + /* available for some events */ + CLOCKID_MAP("realtime", CLOCK_REALTIME), + CLOCKID_MAP("boottime", CLOCK_BOOTTIME), + CLOCKID_MAP("tai", CLOCK_TAI), + + /* available for the lazy */ + CLOCKID_MAP("mono", CLOCK_MONOTONIC), + CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), + CLOCKID_MAP("real", CLOCK_REALTIME), + CLOCKID_MAP("boot", CLOCK_BOOTTIME), + + CLOCKID_END, +}; + +static int get_clockid_res(clockid_t clk_id, u64 *res_ns) +{ + struct timespec res; + + *res_ns = 0; + if (!clock_getres(clk_id, &res)) + *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC; + else + pr_warning("WARNING: Failed to determine specified clock resolution.\n"); + + return 0; +} + +int parse_clockid(const struct option *opt, const char *str, int unset) +{ + struct record_opts *opts = (struct record_opts *)opt->value; + const struct clockid_map *cm; + const char *ostr = str; + + if (unset) { + opts->use_clockid = 0; + return 0; + } + + /* no arg passed */ + if (!str) + return 0; + + /* no setting it twice */ + if (opts->use_clockid) + return -1; + + opts->use_clockid = true; + + /* if its a number, we're done */ + if (sscanf(str, "%d", &opts->clockid) == 1) + return get_clockid_res(opts->clockid, &opts->clockid_res_ns); + + /* allow a "CLOCK_" prefix to the name */ + if (!strncasecmp(str, "CLOCK_", 6)) + str += 6; + + for (cm = clockids; cm->name; cm++) { + if (!strcasecmp(str, cm->name)) { + opts->clockid = cm->clockid; + return get_clockid_res(opts->clockid, + &opts->clockid_res_ns); + } + } + + opts->use_clockid = false; + ui__warning("unknown clockid %s, check man page\n", ostr); + return -1; +} diff --git a/tools/perf/util/clockid.h b/tools/perf/util/clockid.h new file mode 100644 index 000000000000..8e567b3ebbbd --- /dev/null +++ b/tools/perf/util/clockid.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PERF_CLOCKID_H +#define __PERF_CLOCKID_H + +struct option; +int parse_clockid(const struct option *opt, const char *str, int unset); + +#endif -- Gitee From b8392dd729a00141a217eedac2e3f41d2f098d68 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 5 Aug 2020 11:34:39 +0200 Subject: [PATCH 201/257] perf tools: Add clockid_name function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit cc3365bbd07c26aa2e4c7435068292e03116d4e7 upstream Add the clockid_name() function to get the clock name based on its clockid. It will be used in the following changes. 
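A minimal usage sketch (hypothetical caller; the header code added later in this series prints a line of this form, e.g. "# clockid: monotonic (1)"):

	if (opts->use_clockid)
		fprintf(fp, "# clockid: %s (%d)\n",
			clockid_name(opts->clockid), opts->clockid);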
Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Geneviève Bastien Cc: Ian Rogers Cc: Jeremie Galarneau Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lore.kernel.org/lkml/20200805093444.314999-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- tools/perf/util/clockid.c | 11 +++++++++++ tools/perf/util/clockid.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/tools/perf/util/clockid.c b/tools/perf/util/clockid.c index b7a08606dc3e..74365a5d99c1 100644 --- a/tools/perf/util/clockid.c +++ b/tools/perf/util/clockid.c @@ -106,3 +106,14 @@ int parse_clockid(const struct option *opt, const char *str, int unset) ui__warning("unknown clockid %s, check man page\n", ostr); return -1; } + +const char *clockid_name(clockid_t clk_id) +{ + const struct clockid_map *cm; + + for (cm = clockids; cm->name; cm++) { + if (cm->clockid == clk_id) + return cm->name; + } + return "(not found)"; +} diff --git a/tools/perf/util/clockid.h b/tools/perf/util/clockid.h index 8e567b3ebbbd..9b49b4711c76 100644 --- a/tools/perf/util/clockid.h +++ b/tools/perf/util/clockid.h @@ -6,4 +6,6 @@ struct option; int parse_clockid(const struct option *opt, const char *str, int unset); +const char *clockid_name(clockid_t clk_id); + #endif -- Gitee From 12d2224a0b3c32c7a4548061309f30d0767399ad Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 5 Aug 2020 11:34:40 +0200 Subject: [PATCH 202/257] perf header: Store clock references for -k/--clockid option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d1e325cf40fec5fec9b18aa2b537de7e3680ef6c upstream Add a new CLOCK_DATA feature that stores reference times when the -k/--clockid option is specified. It contains the clock id and its reference time together with wall clock time taken at the 'same time'; both values are in nanoseconds. The format of data is as below: struct { u32 version; /* version = 1 */ u32 clockid; u64 wall_clock_ns; u64 clockid_time_ns; }; These clock reference times will be used in following changes to display wall clock time for perf events. It's available only for recording with clockid specified, because it's the only case where we can get reference time to wallclock time. We can't do that with the perf clock yet.
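As an illustrative sketch (not part of this patch), a consumer of this feature can turn a sample timestamp taken on the recorded clockid into wall clock time using the two stored references:

	/* hypothetical helper; assumes env->clock was read from the header */
	static u64 sample_time_to_tod_ns(struct perf_env *env, u64 sample_ns)
	{
		/* wall clock = TOD reference + (sample - clockid reference) */
		return env->clock.tod_ns + sample_ns - env->clock.clockid_ns;
	}
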
Committer testing: $ perf record -h -k Usage: perf record [] [] or: perf record [] -- [] -k, --clockid clockid to use for events, see clock_gettime() $ perf record -k monotonic sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.017 MB perf.data (8 samples) ] $ perf report --header-only | grep clockid -A1 # event : name = cycles:u, , id = { 88815, 88816, 88817, 88818, 88819, 88820, 88821, 88822 }, size = 120, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|PERIOD, read_format = ID, disabled = 1, inherit = 1, exclude_kernel = 1, mmap = 1, comm = 1, freq = 1, enable_on_exec = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1, use_clockid = 1, ksymbol = 1, bpf_event = 1, clockid = 1 # CPU_TOPOLOGY info available, use -I to display -- # clockid frequency: 1000 MHz # cpu pmu capabilities: branches=32, max_precise=3, pmu_name=skylake # clockid: monotonic (1) # reference time: 2020-08-06 09:40:21.619290 = 1596717621.619290 (TOD) = 21931.077673635 (monotonic) $ Original-patch-by: David Ahern Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Geneviève Bastien Cc: Ian Rogers Cc: Jeremie Galarneau Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lore.kernel.org/lkml/20200805093444.314999-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- .../Documentation/perf.data-file-format.txt | 13 ++ tools/perf/builtin-record.c | 41 +++++++ tools/perf/util/env.h | 12 ++ tools/perf/util/header.c | 113 ++++++++++++++++++ tools/perf/util/header.h | 1 + 5 files changed, 180 insertions(+) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index b6472e463284..9ee96640744e 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -389,6 +389,19 @@ struct { Example: cpu pmu capabilities: branches=32, max_precise=3, pmu_name=icelake + HEADER_CLOCK_DATA = 29, + + Contains clock id and its reference time together with wall clock + time taken at the 'same time', both values are in nanoseconds. + The format of data is as below. 
+ +struct { + u32 version; /* version = 1 */ + u32 clockid; + u64 wall_clock_ns; + u64 clockid_time_ns; +}; + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f2d60eb9679e..6280f6430050 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -63,6 +63,7 @@ #include #include #include +#include struct switch_output { bool enabled; @@ -1075,6 +1076,9 @@ static void record__init_features(struct record *rec) if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns)) perf_header__clear_feat(&session->header, HEADER_CLOCKID); + if (!rec->opts.use_clockid) + perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA); + perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT); if (!record__comp_enabled(rec)) perf_header__clear_feat(&session->header, HEADER_COMPRESSED); @@ -1363,6 +1367,40 @@ static int record__synthesize(struct record *rec, bool tail) return err; } +static int record__init_clock(struct record *rec) +{ + struct perf_session *session = rec->session; + struct timespec ref_clockid; + struct timeval ref_tod; + u64 ref; + + if (!rec->opts.use_clockid) + return 0; + + session->header.env.clock.clockid = rec->opts.clockid; + + if (gettimeofday(&ref_tod, NULL) != 0) { + pr_err("gettimeofday failed, cannot set reference time.\n"); + return -1; + } + + if (clock_gettime(rec->opts.clockid, &ref_clockid)) { + pr_err("clock_gettime failed, cannot set reference time.\n"); + return -1; + } + + ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC + + (u64) ref_tod.tv_usec * NSEC_PER_USEC; + + session->header.env.clock.tod_ns = ref; + + ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC + + (u64) ref_clockid.tv_nsec; + + session->header.env.clock.clockid_ns = ref; + return 0; +} + static int __cmd_record(struct record *rec, int argc, const char **argv) { int err; @@ -1422,6 +1460,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) session->header.env.comp_type = PERF_COMP_ZSTD; session->header.env.comp_level = rec->opts.comp_level; + if (record__init_clock(rec)) + return -1; + record__init_features(rec); if (rec->opts.use_clockid && rec->opts.clockid_res_ns) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index fd2b29aabc42..48dff0ae9844 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -94,6 +94,18 @@ struct perf_env { /* For fast cpu to numa node lookup via perf_env__numa_node */ int *numa_map; int nr_numa_map; + + /* For real clock time reference. 
*/ + struct { + u64 tod_ns; + u64 clockid_ns; + int clockid; + /* + * enabled is valid for report mode, and is true if above + * values are set, it's set in process_clock_data + */ + bool enabled; + } clock; }; enum perf_compress_type { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index f6d6dc793e90..db00e721dc46 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -46,6 +46,7 @@ #include "util/util.h" // perf_exe() #include "cputopo.h" #include "bpf-event.h" +#include "clockid.h" #include #include @@ -896,6 +897,40 @@ static int write_clockid(struct feat_fd *ff, sizeof(ff->ph->env.clockid_res_ns)); } +static int write_clock_data(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + u64 *data64; + u32 data32; + int ret; + + /* version */ + data32 = 1; + + ret = do_write(ff, &data32, sizeof(data32)); + if (ret < 0) + return ret; + + /* clockid */ + data32 = ff->ph->env.clock.clockid; + + ret = do_write(ff, &data32, sizeof(data32)); + if (ret < 0) + return ret; + + /* TOD ref time */ + data64 = &ff->ph->env.clock.tod_ns; + + ret = do_write(ff, data64, sizeof(*data64)); + if (ret < 0) + return ret; + + /* clockid ref time */ + data64 = &ff->ph->env.clock.clockid_ns; + + return do_write(ff, data64, sizeof(*data64)); +} + static int write_dir_format(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { @@ -1550,6 +1585,49 @@ static void print_clockid(struct feat_fd *ff, FILE *fp) ff->ph->env.clockid_res_ns * 1000); } +static void print_clock_data(struct feat_fd *ff, FILE *fp) +{ + struct timespec clockid_ns; + char tstr[64], date[64]; + struct timeval tod_ns; + clockid_t clockid; + struct tm ltime; + u64 ref; + + if (!ff->ph->env.clock.enabled) { + fprintf(fp, "# reference time disabled\n"); + return; + } + + /* Compute TOD time. */ + ref = ff->ph->env.clock.tod_ns; + tod_ns.tv_sec = ref / NSEC_PER_SEC; + ref -= tod_ns.tv_sec * NSEC_PER_SEC; + tod_ns.tv_usec = ref / NSEC_PER_USEC; + + /* Compute clockid time. 
*/ + ref = ff->ph->env.clock.clockid_ns; + clockid_ns.tv_sec = ref / NSEC_PER_SEC; + ref -= clockid_ns.tv_sec * NSEC_PER_SEC; + clockid_ns.tv_nsec = ref; + + clockid = ff->ph->env.clock.clockid; + + if (localtime_r(&tod_ns.tv_sec, &ltime) == NULL) + snprintf(tstr, sizeof(tstr), ""); + else { + strftime(date, sizeof(date), "%F %T", &ltime); + scnprintf(tstr, sizeof(tstr), "%s.%06d", + date, (int) tod_ns.tv_usec); + } + + fprintf(fp, "# clockid: %s (%u)\n", clockid_name(clockid), clockid); + fprintf(fp, "# reference time: %s = %ld.%06d (TOD) = %ld.%09ld (%s)\n", + tstr, tod_ns.tv_sec, (int) tod_ns.tv_usec, + clockid_ns.tv_sec, clockid_ns.tv_nsec, + clockid_name(clockid)); +} + static void print_dir_format(struct feat_fd *ff, FILE *fp) { struct perf_session *session; @@ -2704,6 +2782,40 @@ static int process_clockid(struct feat_fd *ff, return 0; } +static int process_clock_data(struct feat_fd *ff, + void *_data __maybe_unused) +{ + u32 data32; + u64 data64; + + /* version */ + if (do_read_u32(ff, &data32)) + return -1; + + if (data32 != 1) + return -1; + + /* clockid */ + if (do_read_u32(ff, &data32)) + return -1; + + ff->ph->env.clock.clockid = data32; + + /* TOD ref time */ + if (do_read_u64(ff, &data64)) + return -1; + + ff->ph->env.clock.tod_ns = data64; + + /* clockid ref time */ + if (do_read_u64(ff, &data64)) + return -1; + + ff->ph->env.clock.clockid_ns = data64; + ff->ph->env.clock.enabled = true; + return 0; +} + static int process_dir_format(struct feat_fd *ff, void *_data __maybe_unused) { @@ -2974,6 +3086,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(BPF_BTF, bpf_btf, false), FEAT_OPR(COMPRESSED, compressed, false), FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), + FEAT_OPR(CLOCK_DATA, clock_data, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index ae8a7108f52b..6d4fcb7377b7 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -44,6 +44,7 @@ enum { HEADER_BPF_BTF, HEADER_COMPRESSED, HEADER_CPU_PMU_CAPS, + HEADER_CLOCK_DATA, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- Gitee From 458828041a65ba8900a60a37ab22493aa9e7363f Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Fri, 14 May 2021 20:29:47 +0800 Subject: [PATCH 203/257] perf header: Support HYBRID_TOPOLOGY feature commit f7d74ce32fc1b9b3cbf58c015009d1f616e60c23 upstream It is useful to let the user know about the hybrid topology. Add the HYBRID_TOPOLOGY feature in the header to indicate the core CPUs and the atom CPUs. With this patch a perf.data generated on a hybrid platform reports the hybrid CPU list: root@otcpl-adl-s-2:~# perf report --header-only -I ... # hybrid cpu system: # cpu_core cpu list : 0-15 # cpu_atom cpu list : 16-23 For a perf.data generated on a non-hybrid platform, it reports a message that HYBRID_TOPOLOGY is missing: root@kbl-ppc:~# perf report --header-only -I ... 
# missing features: TRACING_DATA BRANCH_STACK GROUP_DESC AUXTRACE STAT CLOCKID DIR_FORMAT COMPRESSED CLOCK_DATA HYBRID_TOPOLOGY Signed-off-by: Jin Yao Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210514122948.9472-2-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- .../Documentation/perf.data-file-format.txt | 17 ++++ tools/perf/util/cputopo.c | 80 +++++++++++++++++ tools/perf/util/cputopo.h | 13 +++ tools/perf/util/env.c | 6 ++ tools/perf/util/env.h | 7 ++ tools/perf/util/header.c | 87 +++++++++++++++++++ tools/perf/util/header.h | 1 + tools/perf/util/pmu-hybrid.h | 11 +++ 8 files changed, 222 insertions(+) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index 9ee96640744e..fbee9e580ee4 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -402,6 +402,23 @@ struct { u64 clockid_time_ns; }; + HEADER_HYBRID_TOPOLOGY = 30, + +Indicate the hybrid CPUs. The format of data is as below. + +struct { + u32 nr; + struct { + char pmu_name[]; + char cpus[]; + } [nr]; /* Variable length records */ +}; + +Example: + hybrid cpu system: + cpu_core cpu list : 0-15 + cpu_atom cpu list : 16-23 + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index 1b52402a8923..ec77e2a7b3ca 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -12,6 +12,7 @@ #include "cpumap.h" #include "debug.h" #include "env.h" +#include "pmu-hybrid.h" #define CORE_SIB_FMT \ "%s/devices/system/cpu/cpu%d/topology/core_siblings_list" @@ -351,3 +352,82 @@ void numa_topology__delete(struct numa_topology *tp) free(tp); } + +static int load_hybrid_node(struct hybrid_topology_node *node, + struct perf_pmu *pmu) +{ + const char *sysfs; + char path[PATH_MAX]; + char *buf = NULL, *p; + FILE *fp; + size_t len = 0; + + node->pmu_name = strdup(pmu->name); + if (!node->pmu_name) + return -1; + + sysfs = sysfs__mountpoint(); + if (!sysfs) + goto err; + + snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, pmu->name); + fp = fopen(path, "r"); + if (!fp) + goto err; + + if (getline(&buf, &len, fp) <= 0) { + fclose(fp); + goto err; + } + + p = strchr(buf, '\n'); + if (p) + *p = '\0'; + + fclose(fp); + node->cpus = buf; + return 0; + +err: + zfree(&node->pmu_name); + free(buf); + return -1; +} + +struct hybrid_topology *hybrid_topology__new(void) +{ + struct perf_pmu *pmu; + struct hybrid_topology *tp = NULL; + u32 nr, i = 0; + + nr = perf_pmu__hybrid_pmu_num(); + if (nr == 0) + return NULL; + + tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0]) * nr); + if (!tp) + return NULL; + + tp->nr = nr; + perf_pmu__for_each_hybrid_pmu(pmu) { + if (load_hybrid_node(&tp->nodes[i], pmu)) { + hybrid_topology__delete(tp); + return NULL; + } + i++; + } + + return tp; +} + +void hybrid_topology__delete(struct hybrid_topology *tp) +{ + u32 i; + + for (i = 0; i < tp->nr; i++) { + zfree(&tp->nodes[i].pmu_name); + zfree(&tp->nodes[i].cpus); + } + + free(tp); +} diff --git a/tools/perf/util/cputopo.h b/tools/perf/util/cputopo.h index 7bf6b811f715..528b747152f3 100644 --- a/tools/perf/util/cputopo.h +++ b/tools/perf/util/cputopo.h @@ -25,10 +25,23 @@ struct numa_topology { struct numa_topology_node nodes[0]; }; +struct hybrid_topology_node { + char *pmu_name; + char 
*cpus; +}; + +struct hybrid_topology { + u32 nr; + struct hybrid_topology_node nodes[]; +}; + struct cpu_topology *cpu_topology__new(void); void cpu_topology__delete(struct cpu_topology *tp); struct numa_topology *numa_topology__new(void); void numa_topology__delete(struct numa_topology *tp); +struct hybrid_topology *hybrid_topology__new(void); +void hybrid_topology__delete(struct hybrid_topology *tp); + #endif /* __PERF_CPUTOPO_H */ diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 6b8252dd42c1..acf8a7164c3a 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -198,6 +198,12 @@ void perf_env__exit(struct perf_env *env) for (i = 0; i < env->nr_memory_nodes; i++) zfree(&env->memory_nodes[i].set); zfree(&env->memory_nodes); + + for (i = 0; i < env->nr_hybrid_nodes; i++) { + zfree(&env->hybrid_nodes[i].pmu_name); + zfree(&env->hybrid_nodes[i].cpus); + } + zfree(&env->hybrid_nodes); } void perf_env__init(struct perf_env *env) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 48dff0ae9844..a9633e8327b1 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -37,6 +37,11 @@ struct memory_node { unsigned long *set; }; +struct hybrid_node { + char *pmu_name; + char *cpus; +}; + struct perf_env { char *hostname; char *os_release; @@ -59,6 +64,7 @@ struct perf_env { int nr_pmu_mappings; int nr_groups; int nr_cpu_pmu_caps; + int nr_hybrid_nodes; char *cmdline; const char **cmdline_argv; char *sibling_cores; @@ -79,6 +85,7 @@ struct perf_env { unsigned long long memory_bsize; u64 clockid_res_ns; + struct hybrid_node *hybrid_nodes; /* * bpf_info_lock protects bpf rbtrees. This is needed because the * trees are accessed by different threads in perf-top diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index db00e721dc46..185e136d7eb7 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -931,6 +931,40 @@ static int write_clock_data(struct feat_fd *ff, return do_write(ff, data64, sizeof(*data64)); } +static int write_hybrid_topology(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct hybrid_topology *tp; + int ret; + u32 i; + + tp = hybrid_topology__new(); + if (!tp) + return -ENOENT; + + ret = do_write(ff, &tp->nr, sizeof(u32)); + if (ret < 0) + goto err; + + for (i = 0; i < tp->nr; i++) { + struct hybrid_topology_node *n = &tp->nodes[i]; + + ret = do_write_string(ff, n->pmu_name); + if (ret < 0) + goto err; + + ret = do_write_string(ff, n->cpus); + if (ret < 0) + goto err; + } + + ret = 0; + +err: + hybrid_topology__delete(tp); + return ret; +} + static int write_dir_format(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { @@ -1628,6 +1662,18 @@ static void print_clock_data(struct feat_fd *ff, FILE *fp) clockid_name(clockid)); } +static void print_hybrid_topology(struct feat_fd *ff, FILE *fp) +{ + int i; + struct hybrid_node *n; + + fprintf(fp, "# hybrid cpu system:\n"); + for (i = 0; i < ff->ph->env.nr_hybrid_nodes; i++) { + n = &ff->ph->env.hybrid_nodes[i]; + fprintf(fp, "# %s cpu list : %s\n", n->pmu_name, n->cpus); + } +} + static void print_dir_format(struct feat_fd *ff, FILE *fp) { struct perf_session *session; @@ -2816,6 +2862,46 @@ static int process_clock_data(struct feat_fd *ff, return 0; } +static int process_hybrid_topology(struct feat_fd *ff, + void *data __maybe_unused) +{ + struct hybrid_node *nodes, *n; + u32 nr, i; + + /* nr nodes */ + if (do_read_u32(ff, &nr)) + return -1; + + nodes = zalloc(sizeof(*nodes) * nr); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nr; 
i++) { + n = &nodes[i]; + + n->pmu_name = do_read_string(ff); + if (!n->pmu_name) + goto error; + + n->cpus = do_read_string(ff); + if (!n->cpus) + goto error; + } + + ff->ph->env.nr_hybrid_nodes = nr; + ff->ph->env.hybrid_nodes = nodes; + return 0; + +error: + for (i = 0; i < nr; i++) { + free(nodes[i].pmu_name); + free(nodes[i].cpus); + } + + free(nodes); + return -1; +} + static int process_dir_format(struct feat_fd *ff, void *_data __maybe_unused) { @@ -3087,6 +3173,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(COMPRESSED, compressed, false), FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), FEAT_OPR(CLOCK_DATA, clock_data, false), + FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 6d4fcb7377b7..2cd5349e7e04 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -45,6 +45,7 @@ enum { HEADER_COMPRESSED, HEADER_CPU_PMU_CAPS, HEADER_CLOCK_DATA, + HEADER_HYBRID_TOPOLOGY, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h index d0fa7bc50a76..2b186c26a43e 100644 --- a/tools/perf/util/pmu-hybrid.h +++ b/tools/perf/util/pmu-hybrid.h @@ -19,4 +19,15 @@ struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name); bool perf_pmu__is_hybrid(const char *name); char *perf_pmu__hybrid_type_to_pmu(const char *type); +static inline int perf_pmu__hybrid_pmu_num(void) +{ + struct perf_pmu *pmu; + int num = 0; + + perf_pmu__for_each_hybrid_pmu(pmu) + num++; + + return num; +} + #endif /* __PMU_HYBRID_H */ -- Gitee From c0889147d45bf0284aca8eb402e3c8e476425a8f Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Fri, 14 May 2021 20:29:48 +0800 Subject: [PATCH 204/257] perf header: Support HYBRID_CPU_PMU_CAPS feature commit e119083bab80c2550065f6c0f10ba225a894595e upstream Perf has supported the CPU_PMU_CAPS feature to display a list of CPU PMU capabilities. But on a hybrid platform, it may have several CPU PMUs (such as "cpu_core" and "cpu_atom"). The CPU_PMU_CAPS feature is hard to extend to support multiple CPU PMUs well if it needs to be compatible for the case of old perf data file + new perf tool. So for better compatibility we now create a new feature HYBRID_CPU_PMU_CAPS in the header. 
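To make the encoding concrete, here is a hedged sketch (the helper is hypothetical, not part of this patch; hybrid_cpc_node is the structure this patch adds) of looking up one capability in the per-PMU "name=value\0name=value\0..." string this feature carries:

	/* hypothetical lookup over one hybrid PMU's capability string */
	static const char *hybrid_cap(struct hybrid_cpc_node *n, const char *name)
	{
		char *str = n->cpu_pmu_caps;
		size_t len = strlen(name);
		int i;

		for (i = 0; i < n->nr_cpu_pmu_caps; i++) {
			if (!strncmp(str, name, len) && str[len] == '=')
				return str + len + 1;
			str += strlen(str) + 1; /* skip past the NUL separator */
		}
		return NULL;
	}
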
For the perf.data generated on hybrid platform, root@otcpl-adl-s-2:~# perf report --header-only -I # cpu_core pmu capabilities: branches=32, max_precise=3, pmu_name=alderlake_hybrid # cpu_atom pmu capabilities: branches=32, max_precise=3, pmu_name=alderlake_hybrid # missing features: TRACING_DATA BRANCH_STACK GROUP_DESC AUXTRACE STAT CLOCKID DIR_FORMAT COMPRESSED CPU_PMU_CAPS CLOCK_DATA For the perf.data generated on non-hybrid platform root@kbl-ppc:~# perf report --header-only -I # cpu pmu capabilities: branches=32, max_precise=3, pmu_name=skylake # missing features: TRACING_DATA BRANCH_STACK GROUP_DESC AUXTRACE STAT CLOCKID DIR_FORMAT COMPRESSED CLOCK_DATA HYBRID_TOPOLOGY HYBRID_CPU_PMU_CAPS Signed-off-by: Jin Yao Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210514122948.9472-3-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- .../Documentation/perf.data-file-format.txt | 16 ++ tools/perf/util/env.c | 6 + tools/perf/util/env.h | 9 + tools/perf/util/header.c | 162 ++++++++++++++++-- tools/perf/util/header.h | 1 + 5 files changed, 175 insertions(+), 19 deletions(-) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index fbee9e580ee4..e6ff8c898ada 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -419,6 +419,22 @@ Example: cpu_core cpu list : 0-15 cpu_atom cpu list : 16-23 + HEADER_HYBRID_CPU_PMU_CAPS = 31, + + A list of hybrid CPU PMU capabilities. + +struct { + u32 nr_pmu; + struct { + u32 nr_cpu_pmu_caps; + { + char name[]; + char value[]; + } [nr_cpu_pmu_caps]; + char pmu_name[]; + } [nr_pmu]; +}; + other bits are reserved and should ignored for now HEADER_FEAT_BITS = 256, diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index acf8a7164c3a..aaa7c24d6d29 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -204,6 +204,12 @@ void perf_env__exit(struct perf_env *env) zfree(&env->hybrid_nodes[i].cpus); } zfree(&env->hybrid_nodes); + + for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { + zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); + zfree(&env->hybrid_cpc_nodes[i].pmu_name); + } + zfree(&env->hybrid_cpc_nodes); } void perf_env__init(struct perf_env *env) diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index a9633e8327b1..25a0edabd74c 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -42,6 +42,13 @@ struct hybrid_node { char *cpus; }; +struct hybrid_cpc_node { + int nr_cpu_pmu_caps; + unsigned int max_branches; + char *cpu_pmu_caps; + char *pmu_name; +}; + struct perf_env { char *hostname; char *os_release; @@ -65,6 +72,7 @@ struct perf_env { int nr_groups; int nr_cpu_pmu_caps; int nr_hybrid_nodes; + int nr_hybrid_cpc_nodes; char *cmdline; const char **cmdline_argv; char *sibling_cores; @@ -86,6 +94,7 @@ struct perf_env { u64 clockid_res_ns; struct hybrid_node *hybrid_nodes; + struct hybrid_cpc_node *hybrid_cpc_nodes; /* * bpf_info_lock protects bpf rbtrees. 
This is needed because the * trees are accessed by different threads in perf-top diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 185e136d7eb7..faaea8e4f195 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -47,6 +47,7 @@ #include "cputopo.h" #include "bpf-event.h" #include "clockid.h" +#include "pmu-hybrid.h" #include #include @@ -1464,18 +1465,14 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } -static int write_cpu_pmu_caps(struct feat_fd *ff, - struct evlist *evlist __maybe_unused) +static int write_per_cpu_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, + bool write_pmu) { - struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); struct perf_pmu_caps *caps = NULL; int nr_caps; int ret; - if (!cpu_pmu) - return -ENOENT; - - nr_caps = perf_pmu__caps_parse(cpu_pmu); + nr_caps = perf_pmu__caps_parse(pmu); if (nr_caps < 0) return nr_caps; @@ -1483,7 +1480,7 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, if (ret < 0) return ret; - list_for_each_entry(caps, &cpu_pmu->caps, list) { + list_for_each_entry(caps, &pmu->caps, list) { ret = do_write_string(ff, caps->name); if (ret < 0) return ret; @@ -1493,9 +1490,49 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, return ret; } + if (write_pmu) { + ret = do_write_string(ff, pmu->name); + if (ret < 0) + return ret; + } + return ret; } +static int write_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + + if (!cpu_pmu) + return -ENOENT; + + return write_per_cpu_pmu_caps(ff, cpu_pmu, false); +} + +static int write_hybrid_cpu_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) +{ + struct perf_pmu *pmu; + u32 nr_pmu = perf_pmu__hybrid_pmu_num(); + int ret; + + if (nr_pmu == 0) + return -ENOENT; + + ret = do_write(ff, &nr_pmu, sizeof(nr_pmu)); + if (ret < 0) + return ret; + + perf_pmu__for_each_hybrid_pmu(pmu) { + ret = write_per_cpu_pmu_caps(ff, pmu, true); + if (ret < 0) + return ret; + } + + return 0; +} + static void print_hostname(struct feat_fd *ff, FILE *fp) { fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname); @@ -1928,18 +1965,28 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, + char *pmu_name) { - const char *delimiter = "# cpu pmu capabilities: "; - u32 nr_caps = ff->ph->env.nr_cpu_pmu_caps; - char *str; + const char *delimiter; + char *str, buf[128]; if (!nr_caps) { - fprintf(fp, "# cpu pmu capabilities: not available\n"); + if (!pmu_name) + fprintf(fp, "# cpu pmu capabilities: not available\n"); + else + fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - str = ff->ph->env.cpu_pmu_caps; + if (!pmu_name) + scnprintf(buf, sizeof(buf), "# cpu pmu capabilities: "); + else + scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); + + delimiter = buf; + + str = cpu_pmu_caps; while (nr_caps--) { fprintf(fp, "%s%s", delimiter, str); delimiter = ", "; @@ -1949,6 +1996,24 @@ static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) fprintf(fp, "\n"); } +static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +{ + print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, + ff->ph->env.cpu_pmu_caps, NULL); +} + +static void print_hybrid_cpu_pmu_caps(struct 
feat_fd *ff, FILE *fp) +{ + struct hybrid_cpc_node *n; + + for (int i = 0; i < ff->ph->env.nr_hybrid_cpc_nodes; i++) { + n = &ff->ph->env.hybrid_cpc_nodes[i]; + print_per_cpu_pmu_caps(fp, n->nr_cpu_pmu_caps, + n->cpu_pmu_caps, + n->pmu_name); + } +} + static void print_pmu_mappings(struct feat_fd *ff, FILE *fp) { const char *delimiter = "# pmu mappings: "; @@ -3060,8 +3125,9 @@ static int process_compressed(struct feat_fd *ff, return 0; } -static int process_cpu_pmu_caps(struct feat_fd *ff, - void *data __maybe_unused) +static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, + char **cpu_pmu_caps, + unsigned int *max_branches) { char *name, *value; struct strbuf sb; @@ -3075,7 +3141,7 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, return 0; } - ff->ph->env.nr_cpu_pmu_caps = nr_caps; + *nr_cpu_pmu_caps = nr_caps; if (strbuf_init(&sb, 128) < 0) return -1; @@ -3097,12 +3163,12 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, goto free_value; if (!strcmp(name, "branches")) - ff->ph->env.max_branches = atoi(value); + *max_branches = atoi(value); free(value); free(name); } - ff->ph->env.cpu_pmu_caps = strbuf_detach(&sb, NULL); + *cpu_pmu_caps = strbuf_detach(&sb, NULL); return 0; free_value: @@ -3114,6 +3180,63 @@ static int process_cpu_pmu_caps(struct feat_fd *ff, return -1; } +static int process_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + return process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + &ff->ph->env.cpu_pmu_caps, + &ff->ph->env.max_branches); +} + +static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, + void *data __maybe_unused) +{ + struct hybrid_cpc_node *nodes; + u32 nr_pmu, i; + int ret; + + if (do_read_u32(ff, &nr_pmu)) + return -1; + + if (!nr_pmu) { + pr_debug("hybrid cpu pmu capabilities not available\n"); + return 0; + } + + nodes = zalloc(sizeof(*nodes) * nr_pmu); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nr_pmu; i++) { + struct hybrid_cpc_node *n = &nodes[i]; + + ret = process_per_cpu_pmu_caps(ff, &n->nr_cpu_pmu_caps, + &n->cpu_pmu_caps, + &n->max_branches); + if (ret) + goto err; + + n->pmu_name = do_read_string(ff); + if (!n->pmu_name) { + ret = -1; + goto err; + } + } + + ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; + ff->ph->env.hybrid_cpc_nodes = nodes; + return 0; + +err: + for (i = 0; i < nr_pmu; i++) { + free(nodes[i].cpu_pmu_caps); + free(nodes[i].pmu_name); + } + + free(nodes); + return ret; +} + #define FEAT_OPR(n, func, __full_only) \ [HEADER_##n] = { \ .name = __stringify(n), \ @@ -3174,6 +3297,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), FEAT_OPR(CLOCK_DATA, clock_data, false), FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), + FEAT_OPR(HYBRID_CPU_PMU_CAPS, hybrid_cpu_pmu_caps, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 2cd5349e7e04..19b2acfd8422 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -46,6 +46,7 @@ enum { HEADER_CPU_PMU_CAPS, HEADER_CLOCK_DATA, HEADER_HYBRID_TOPOLOGY, + HEADER_HYBRID_CPU_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- Gitee From 2cb194f034f9d36ca14a21f5e615051f89d561b8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:14 +0530 Subject: [PATCH 205/257] perf header: Pass "cpu" pmu name while printing caps commit 2a12bef413bb278db202ecb6adbd9c18dec4b260 upstream Avoid unnecessary conditional code to check if pmu name is NULL or not by passing "cpu" pmu name 
to the printing function. Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-4-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- tools/perf/util/header.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index faaea8e4f195..adef0b1d60f7 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1972,17 +1972,11 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, char *str, buf[128]; if (!nr_caps) { - if (!pmu_name) - fprintf(fp, "# cpu pmu capabilities: not available\n"); - else - fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); + fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - if (!pmu_name) - scnprintf(buf, sizeof(buf), "# cpu pmu capabilities: "); - else - scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); + scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); delimiter = buf; @@ -1999,7 +1993,7 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) { print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, - ff->ph->env.cpu_pmu_caps, NULL); + ff->ph->env.cpu_pmu_caps, (char *)"cpu"); } static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) -- Gitee From e12ab2ab5203ff9721115929322d5543971992e5 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:15 +0530 Subject: [PATCH 206/257] perf header: Store PMU caps in an array of strings commit ff34eaa820231dfaecca9c048637bd86ba294bc7 upstream Currently all capabilities are stored in a single string separated by NULL character. Instead, store them in an array which makes searching of capability easier. 
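After this change a lookup is a plain array scan; a minimal sketch (hypothetical code, assuming only the new char **cpu_pmu_caps layout):

	/* find "branches=<n>" among the parsed capability strings */
	for (i = 0; i < env->nr_cpu_pmu_caps; i++) {
		if (!strncmp(env->cpu_pmu_caps[i], "branches=", 9))
			max_branches = atoi(env->cpu_pmu_caps[i] + 9);
	}
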
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-5-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: chaithanyaLagisetty Signed-off-by: mohanasv2 --- tools/perf/util/env.c | 6 +++- tools/perf/util/env.h | 4 +-- tools/perf/util/header.c | 63 ++++++++++++++++++++++------------------ 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index aaa7c24d6d29..98f4531fe96b 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -169,7 +169,7 @@ static void perf_env__purge_bpf(struct perf_env *env) void perf_env__exit(struct perf_env *env) { - int i; + int i, j; perf_env__purge_bpf(env); zfree(&env->hostname); @@ -185,6 +185,8 @@ void perf_env__exit(struct perf_env *env) zfree(&env->sibling_threads); zfree(&env->pmu_mappings); zfree(&env->cpu); + for (i = 0; i < env->nr_cpu_pmu_caps; i++) + zfree(&env->cpu_pmu_caps[i]); zfree(&env->numa_map); for (i = 0; i < env->nr_numa_nodes; i++) @@ -206,6 +208,8 @@ void perf_env__exit(struct perf_env *env) zfree(&env->hybrid_nodes); for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { + for (j = 0; j < env->hybrid_cpc_nodes[i].nr_cpu_pmu_caps; j++) + zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps[j]); zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); zfree(&env->hybrid_cpc_nodes[i].pmu_name); } diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 25a0edabd74c..42e336e5a490 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -45,7 +45,7 @@ struct hybrid_node { struct hybrid_cpc_node { int nr_cpu_pmu_caps; unsigned int max_branches; - char *cpu_pmu_caps; + char **cpu_pmu_caps; char *pmu_name; }; @@ -79,7 +79,7 @@ struct perf_env { char *sibling_dies; char *sibling_threads; char *pmu_mappings; - char *cpu_pmu_caps; + char **cpu_pmu_caps; struct cpu_topology_map *cpu; struct cpu_cache_level *caches; int caches_cnt; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index adef0b1d60f7..43533c212240 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1965,26 +1965,21 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char *cpu_pmu_caps, +static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, char *pmu_name) { - const char *delimiter; - char *str, buf[128]; + const char *delimiter = ""; + int i; if (!nr_caps) { fprintf(fp, "# %s pmu capabilities: not available\n", pmu_name); return; } - scnprintf(buf, sizeof(buf), "# %s pmu capabilities: ", pmu_name); - - delimiter = buf; - - str = cpu_pmu_caps; - while (nr_caps--) { - fprintf(fp, "%s%s", delimiter, str); + fprintf(fp, "# %s pmu capabilities: ", pmu_name); + for (i = 0; i < nr_caps; i++) { + fprintf(fp, "%s%s", delimiter, cpu_pmu_caps[i]); delimiter = ", "; - str += strlen(str) + 1; } fprintf(fp, "\n"); @@ -3120,27 +3115,26 @@ static int process_compressed(struct feat_fd *ff, } static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, - char **cpu_pmu_caps, + char ***cpu_pmu_caps, unsigned int *max_branches) { - char 
*name, *value; - struct strbuf sb; - u32 nr_caps; + char *name, *value, *ptr; + u32 nr_caps, i; + + *nr_cpu_pmu_caps = 0; + *cpu_pmu_caps = NULL; if (do_read_u32(ff, &nr_caps)) return -1; - if (!nr_caps) { - pr_debug("cpu pmu capabilities not available\n"); + if (!nr_caps) return 0; - } - - *nr_cpu_pmu_caps = nr_caps; - if (strbuf_init(&sb, 128) < 0) + *cpu_pmu_caps = zalloc(sizeof(char *) * nr_caps); + if (!*cpu_pmu_caps) return -1; - while (nr_caps--) { + for (i = 0; i < nr_caps; i++) { name = do_read_string(ff); if (!name) goto error; @@ -3149,12 +3143,10 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, if (!value) goto free_name; - if (strbuf_addf(&sb, "%s=%s", name, value) < 0) + if (asprintf(&ptr, "%s=%s", name, value) < 0) goto free_value; - /* include a NULL character at the end */ - if (strbuf_add(&sb, "", 1) < 0) - goto free_value; + (*cpu_pmu_caps)[i] = ptr; if (!strcmp(name, "branches")) *max_branches = atoi(value); @@ -3162,7 +3154,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free(value); free(name); } - *cpu_pmu_caps = strbuf_detach(&sb, NULL); + *nr_cpu_pmu_caps = nr_caps; return 0; free_value: @@ -3170,16 +3162,24 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free_name: free(name); error: - strbuf_release(&sb); + for (; i > 0; i--) + free((*cpu_pmu_caps)[i - 1]); + free(*cpu_pmu_caps); + *cpu_pmu_caps = NULL; + *nr_cpu_pmu_caps = 0; return -1; } static int process_cpu_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - return process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + int ret = process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, &ff->ph->env.cpu_pmu_caps, &ff->ph->env.max_branches); + + if (!ret && !ff->ph->env.cpu_pmu_caps) + pr_debug("cpu pmu capabilities not available\n"); + return ret; } static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, @@ -3188,6 +3188,7 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, struct hybrid_cpc_node *nodes; u32 nr_pmu, i; int ret; + int j; if (do_read_u32(ff, &nr_pmu)) return -1; @@ -3215,6 +3216,8 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, ret = -1; goto err; } + if (!n->nr_cpu_pmu_caps) + pr_debug("%s pmu capabilities not available\n", n->pmu_name); } ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; @@ -3223,6 +3226,8 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, err: for (i = 0; i < nr_pmu; i++) { + for (j = 0; j < nodes[i].nr_cpu_pmu_caps; j++) + free(nodes[i].cpu_pmu_caps[j]); free(nodes[i].cpu_pmu_caps); free(nodes[i].pmu_name); } -- Gitee From d27488bf80d2774809f3f1c2699347cb33382df0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 30 Apr 2021 10:03:01 +0300 Subject: [PATCH 207/257] perf inject: Add facility to do in place update commit 2a525f6a5502bfd80568e6befb84053cf650ad25 upstream When there is a need to modify only timestamps, it is much simpler and quicker to do it to the existing file rather than re-write all the contents. In preparation for that, add the ability to modify the input file in place. In practice that just means making the file descriptor and mmaps writable. 
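Reduced to its essentials, the change amounts to the following sketch (illustrative only; the real code paths are in data.c and session.c below):

	/* open read-write instead of read-only ... */
	int fd = open(path, in_place_update ? O_RDWR : O_RDONLY);

	/* ... and map shared and writable so stores reach the file */
	int prot = PROT_READ | (in_place_update ? PROT_WRITE : 0);
	void *buf = mmap(NULL, size, prot, MAP_SHARED, fd, 0);
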
Signed-off-by: Adrian Hunter Reviewed-by: Andi Kleen Cc: Jiri Olsa Link: https://lore.kernel.org/r/20210430070309.17624-5-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 26 ++++++++++++++++++++++---- tools/perf/util/data.c | 3 ++- tools/perf/util/data.h | 1 + tools/perf/util/header.c | 5 +++++ tools/perf/util/session.c | 6 +++++- 5 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 6b6bb86d62d3..49f121762a8f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -39,6 +39,8 @@ struct perf_inject { bool have_auxtrace; bool strip; bool jit_mode; + bool in_place_update; + bool in_place_update_dry_run; const char *input_name; struct perf_data output; u64 bytes_written; @@ -635,7 +637,7 @@ static int __cmd_inject(struct perf_inject *inject) int ret = -EINVAL; struct perf_session *session = inject->session; struct perf_data *data_out = &inject->output; - int fd = perf_data__fd(data_out); + int fd = inject->in_place_update ? -1 : perf_data__fd(data_out); u64 output_data_offset; signal(SIGINT, sig_handler); @@ -690,14 +692,14 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!data_out->is_pipe) + if (!data_out->is_pipe && !inject->in_place_update) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); if (ret) return ret; - if (!data_out->is_pipe) { + if (!data_out->is_pipe && !inject->in_place_update) { if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); @@ -827,7 +829,23 @@ int cmd_inject(int argc, const char **argv) return -1; } - if (perf_data__open(&inject.output)) { + if (inject.in_place_update) { + if (!strcmp(inject.input_name, "-")) { + pr_err("Input file name required for in-place updating\n"); + return -1; + } + if (strcmp(inject.output.path, "-")) { + pr_err("Output file name must not be specified for in-place updating\n"); + return -1; + } + if (!data.force && !inject.in_place_update_dry_run) { + pr_err("The input file would be updated in place, " + "the --force option is required.\n"); + return -1; + } + if (!inject.in_place_update_dry_run) + data.in_place_update = true; + } else if (perf_data__open(&inject.output)) { perror("failed to create output file"); return -1; } diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 4da900bdb2f1..af061352430a 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -221,11 +221,12 @@ static bool is_dir(struct perf_data *data) static int open_file_read(struct perf_data *data) { + int flags = data->in_place_update ? 
O_RDWR : O_RDONLY; struct stat st; int fd; char sbuf[STRERR_BUFSIZE]; - fd = open(data->file.path, O_RDONLY); + fd = open(data->file.path, flags); if (fd < 0) { int err = errno; diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 252d99071249..06d6555c66db 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -22,6 +22,7 @@ struct perf_data { bool is_pipe; bool is_dir; bool force; + bool in_place_update; enum perf_data_mode mode; struct { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 43533c212240..6008fd97eb71 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3980,6 +3980,11 @@ int perf_session__read_header(struct perf_session *session) if (perf_file_header__read(&f_header, header, fd) < 0) return -EINVAL; + if (header->needs_swap && data->in_place_update) { + pr_err("In-place update not supported when byte-swapping is required\n"); + return -EINVAL; + } + /* * Sanity check that perf.data was written cleanly; data size is * initialized to 0 and updated only if the on_exit function is run. diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 7710481ab01a..c13eecf9b29e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2070,6 +2070,7 @@ struct reader { u64 data_size; u64 data_offset; reader_cb_t process; + bool in_place_update; }; static int @@ -2103,7 +2104,9 @@ reader__process_events(struct reader *rd, struct perf_session *session, mmap_prot = PROT_READ; mmap_flags = MAP_SHARED; - if (session->header.needs_swap) { + if (rd->in_place_update) { + mmap_prot |= PROT_WRITE; + } else if (session->header.needs_swap) { mmap_prot |= PROT_WRITE; mmap_flags = MAP_PRIVATE; } @@ -2189,6 +2192,7 @@ static int __perf_session__process_events(struct perf_session *session) .data_size = session->header.data_size, .data_offset = session->header.data_offset, .process = process_simple, + .in_place_update = session->data->in_place_update, }; struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; -- Gitee From ac808d74d27474ae7bdaa7cf51a0c5c3aba6e888 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 2 Apr 2020 08:43:53 -0700 Subject: [PATCH 208/257] perf bench: Add event synthesis benchmark commit 2a4b51666af8bf0b67ccc2e53120bad27351917c upstream Event synthesis may occur at the start or end (tail) of a perf command. In system-wide mode it can scan every process in /proc, which may add seconds of latency before event recording. Add a new benchmark that times how long event synthesis takes with and without data synthesis. 
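The measurement itself is a simple wall-clock average; in sketch form (illustrative, with synthesize_once() standing in for the real machine__synthesize_threads() call):

	gettimeofday(&start, NULL);
	for (i = 0; i < iterations; i++)
		synthesize_once();
	gettimeofday(&end, NULL);
	timersub(&end, &start, &diff);
	average_usec = (diff.tv_sec * USEC_PER_SEC + diff.tv_usec) /
		       (double)iterations;
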
An example execution looks like: $ perf bench internals synthesize # Running 'internals/synthesize' benchmark: Average synthesis took: 168.253800 usec Average data synthesis took: 208.104700 usec Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andrey Zhizhikin Cc: Kan Liang Cc: Kefeng Wang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20200402154357.107873-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-bench.txt | 8 ++ tools/perf/bench/Build | 2 +- tools/perf/bench/bench.h | 2 +- tools/perf/bench/synthesize.c | 101 ++++++++++++++++++++++++ tools/perf/builtin-bench.c | 6 ++ 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 tools/perf/bench/synthesize.c diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 0921a3c67381..bad16512c48d 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -61,6 +61,9 @@ SUBSYSTEM 'epoll':: Eventpoll (epoll) stressing benchmarks. +'internals':: + Benchmark internal perf functionality. + 'all':: All benchmark subsystems. @@ -214,6 +217,11 @@ Suite for evaluating concurrent epoll_wait calls. *ctl*:: Suite for evaluating multiple epoll_ctl calls. +SUITES FOR 'internals' +~~~~~~~~~~~~~~~~~~~~~~ +*synthesize*:: +Suite for evaluating perf's event synthesis performance. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index e4e321b6f883..042827385c87 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -6,9 +6,9 @@ perf-y += futex-wake.o perf-y += futex-wake-parallel.o perf-y += futex-requeue.o perf-y += futex-lock-pi.o - perf-y += epoll-wait.o perf-y += epoll-ctl.o +perf-y += synthesize.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index d9329ae84e17..9d797dd89cf1 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -29,9 +29,9 @@ int bench_futex_wake_parallel(int argc, const char **argv); int bench_futex_requeue(int argc, const char **argv); /* pi futexes */ int bench_futex_lock_pi(int argc, const char **argv); - int bench_epoll_wait(int argc, const char **argv); int bench_epoll_ctl(int argc, const char **argv); +int bench_synthesize(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c new file mode 100644 index 000000000000..6291257bc9c9 --- /dev/null +++ b/tools/perf/bench/synthesize.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark synthesis of perf events such as at the start of a 'perf + * record'. Synthesis is done on the current process and the 'dummy' event + * handlers are invoked that support dump_trace but otherwise do nothing. + * + * Copyright 2019 Google LLC. 
+ */ +#include +#include "bench.h" +#include "../util/debug.h" +#include "../util/session.h" +#include "../util/synthetic-events.h" +#include "../util/target.h" +#include "../util/thread_map.h" +#include "../util/tool.h" +#include +#include +#include + +static unsigned int iterations = 10000; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const usage[] = { + "perf bench internals synthesize ", + NULL +}; + + +static int do_synthesize(struct perf_session *session, + struct perf_thread_map *threads, + struct target *target, bool data_mmap) +{ + const unsigned int nr_threads_synthesize = 1; + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double average; + int err; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + err = machine__synthesize_threads(&session->machines.host, + target, threads, data_mmap, + nr_threads_synthesize); + if (err) + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + average = (double)runtime_us/(double)iterations; + printf("Average %ssynthesis took: %f usec\n", + data_mmap ? "data " : "", average); + return 0; +} + +int bench_synthesize(int argc, const char **argv) +{ + struct perf_tool tool; + struct perf_session *session; + struct target target = { + .pid = "self", + }; + struct perf_thread_map *threads; + int err; + + argc = parse_options(argc, argv, options, usage, 0); + + session = perf_session__new(NULL, false, NULL); + if (IS_ERR(session)) { + pr_err("Session creation failed.\n"); + return PTR_ERR(session); + } + threads = thread_map__new_by_pid(getpid()); + if (!threads) { + pr_err("Thread map creation failed.\n"); + err = -ENOMEM; + goto err_out; + } + perf_tool__fill_defaults(&tool); + + err = do_synthesize(session, threads, &target, false); + if (err) + goto err_out; + + err = do_synthesize(session, threads, &target, true); + +err_out: + if (threads) + perf_thread_map__put(threads); + + perf_session__delete(session); + return err; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index c06fe21c8613..11c79a8d85d6 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -76,6 +76,11 @@ static struct bench epoll_benchmarks[] = { }; #endif // HAVE_EVENTFD +static struct bench internals_benchmarks[] = { + { "synthesize", "Benchmark perf event synthesis", bench_synthesize }, + { NULL, NULL, NULL } +}; + struct collection { const char *name; const char *summary; @@ -92,6 +97,7 @@ static struct collection collections[] = { #ifdef HAVE_EVENTFD {"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, #endif + { "internals", "Perf-internals benchmarks", internals_benchmarks }, { "all", "All benchmarks", NULL }, { NULL, NULL, NULL } }; -- Gitee From d33f7f4e6dae21af50b2b702d0645937f43a6993 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 5 Aug 2020 11:34:42 +0200 Subject: [PATCH 209/257] perf data: Add support to store time of day in CTF data conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 88371c5898fc9e7c9faeb7f53ab9082b4e84b7a8 upstream Add support to convert and store time of day in CTF data conversion for the 'perf data convert' subcommand. The perf.data used for conversion needs to have clock data information - it must be recorded with the -k/--clockid option.
A new --tod option is added to the 'perf data convert' subcommand to emit data with timestamps converted to wall clock time. Record data with clockid set: # perf record -k CLOCK_MONOTONIC kill kill: not enough arguments [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.033 MB perf.data (8 samples) ] Convert data with TOD timestamps: # perf data convert --tod --to-ctf ./ctf [ perf data convert: Converted 'perf.data' into CTF data './ctf' ] [ perf data convert: Converted and wrote 0.000 MB (8 samples) ] Display data in perf script: # perf script -F+tod --ns perf 262150 2020-07-13 18:38:50.097678523 153633.958246159: 1 cycles: ... perf 262150 2020-07-13 18:38:50.097682941 153633.958250577: 1 cycles: ... perf 262150 2020-07-13 18:38:50.097684997 153633.958252633: 7 cycles: ... ... Display data in babeltrace: # babeltrace --clock-date ./ctf [2020-07-13 18:38:50.097678523] (+?.?????????) cycles: { cpu_id = 0 }, { perf_ip = 0xFFF ... [2020-07-13 18:38:50.097682941] (+0.000004418) cycles: { cpu_id = 0 }, { perf_ip = 0xFFF ... [2020-07-13 18:38:50.097684997] (+0.000002056) cycles: { cpu_id = 0 }, { perf_ip = 0xFFF ... ... It's available only for recording with clockid specified, because it's the only case where we can get reference time to wallclock time. We can't do that with the perf clock yet. An error is displayed if you try to use --tod on data without clockid specified: # perf data convert --tod --to-ctf ./ctf Can't provide --tod time, missing clock data. Please record with -k/--clockid option. Failed to setup CTF writer. Error during conversion setup. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Geneviève Bastien Cc: Ian Rogers Cc: Jeremie Galarneau Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lore.kernel.org/lkml/20200805093444.314999-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-data.txt | 3 ++ tools/perf/builtin-data.c | 1 + tools/perf/util/data-convert-bt.c | 57 +++++++++++++++++--------- tools/perf/util/data-convert.h | 1 + 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index c87180764829..726b9bc9e1a7 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -27,6 +27,9 @@ OPTIONS for 'convert' --to-ctf:: Triggers the CTF conversion, specify the path of CTF data directory. +--tod:: + Convert time to wall clock time. + -i:: Specify input perf data file path. 
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c index ca2fb44874e4..8d23b8d6ee8e 100644 --- a/tools/perf/builtin-data.c +++ b/tools/perf/builtin-data.c @@ -65,6 +65,7 @@ static int cmd_data_convert(int argc, const char **argv) OPT_STRING('i', "input", &input_name, "file", "input file name"), #ifdef HAVE_LIBBABELTRACE_SUPPORT OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"), + OPT_BOOLEAN(0, "tod", &opts.tod, "Convert time to wall clock time"), #endif OPT_BOOLEAN('f', "force", &opts.force, "don't complain, do it"), OPT_BOOLEAN(0, "all", &opts.all, "Convert all events"), diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index dbc772bfb04e..8893c65391d0 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -31,6 +31,9 @@ #include "config.h" #include #include +#include +#include "util.h" +#include "clockid.h" #define pr_N(n, fmt, ...) \ eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__) @@ -1381,11 +1384,26 @@ do { \ return 0; } -static int ctf_writer__setup_clock(struct ctf_writer *cw) +static int ctf_writer__setup_clock(struct ctf_writer *cw, + struct perf_session *session, + bool tod) { struct bt_ctf_clock *clock = cw->clock; + const char *desc = "perf clock"; + int64_t offset = 0; - bt_ctf_clock_set_description(clock, "perf clock"); + if (tod) { + struct perf_env *env = &session->header.env; + + if (!env->clock.enabled) { + pr_err("Can't provide --tod time, missing clock data. " + "Please record with -k/--clockid option.\n"); + return -1; + } + + desc = clockid_name(env->clock.clockid); + offset = env->clock.tod_ns - env->clock.clockid_ns; + } #define SET(__n, __v) \ do { \ @@ -1394,8 +1412,8 @@ do { \ } while (0) SET(frequency, 1000000000); - SET(offset_s, 0); - SET(offset, 0); + SET(offset, offset); + SET(description, desc); SET(precision, 10); SET(is_absolute, 0); @@ -1481,7 +1499,8 @@ static void ctf_writer__cleanup(struct ctf_writer *cw) memset(cw, 0, sizeof(*cw)); } -static int ctf_writer__init(struct ctf_writer *cw, const char *path) +static int ctf_writer__init(struct ctf_writer *cw, const char *path, + struct perf_session *session, bool tod) { struct bt_ctf_writer *writer; struct bt_ctf_stream_class *stream_class; @@ -1505,7 +1524,7 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path) cw->clock = clock; - if (ctf_writer__setup_clock(cw)) { + if (ctf_writer__setup_clock(cw, session, tod)) { pr("Failed to setup CTF clock.\n"); goto err_cleanup; } @@ -1613,17 +1632,15 @@ int bt_convert__perf2ctf(const char *input, const char *path, if (err) return err; - /* CTF writer */ - if (ctf_writer__init(cw, path)) - return -1; - err = -1; /* perf.data session */ session = perf_session__new(&data, 0, &c.tool); - if (IS_ERR(session)) { - err = PTR_ERR(session); - goto free_writer; - } + if (IS_ERR(session)) + return PTR_ERR(session); + + /* CTF writer */ + if (ctf_writer__init(cw, path, session, opts->tod)) + goto free_session; if (c.queue_size) { ordered_events__set_alloc_size(&session->ordered_events, @@ -1632,17 +1649,17 @@ int bt_convert__perf2ctf(const char *input, const char *path, /* CTF writer env/clock setup */ if (ctf_writer__setup_env(cw, session)) - goto free_session; + goto free_writer; /* CTF events setup */ if (setup_events(cw, session)) - goto free_session; + goto free_writer; if (opts->all && setup_non_sample_events(cw, session)) - goto free_session; + goto free_writer; if (setup_streams(cw, session)) - goto free_session; + goto free_writer; err = 
perf_session__process_events(session); if (!err) @@ -1670,10 +1687,10 @@ int bt_convert__perf2ctf(const char *input, const char *path, return err; -free_session: - perf_session__delete(session); free_writer: ctf_writer__cleanup(cw); +free_session: + perf_session__delete(session); pr_err("Error during conversion setup.\n"); return err; } diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h index af90b6076c06..feab5f114e37 100644 --- a/tools/perf/util/data-convert.h +++ b/tools/perf/util/data-convert.h @@ -5,6 +5,7 @@ struct perf_data_convert_opts { bool force; bool all; + bool tod; }; #endif /* __DATA_CONVERT_H */ -- Gitee From 1c292b21292b53e1ca2fe71066b64eb3b7157733 Mon Sep 17 00:00:00 2001 From: Nicholas Fraser Date: Mon, 26 Apr 2021 10:47:16 -0400 Subject: [PATCH 210/257] perf data: Add JSON export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d0713d4ca3e94827de77f8758e3e8045a0d85215 upstream This adds a feature to export perf data to JSON. The resolved symbols are exported into the JSON so that external tools don't need to load the dsos themselves (or even have access to them at all.) This makes it easy to load and analyze perf data with standalone tools where direct perf or libbabeltrace integration is impractical. The exporter uses a minimal inline JSON encoding without any external dependencies. Currently it only outputs some headers and sample metadata but it's easily extensible. Use it like this: $ perf data convert --to-json out.json Committer notes: Fixup a __printf() bug that broke the build: util/data-convert-json.c:103:11: error: expected ‘)’ before numeric constant 103 | __(printf, 5, 6) | ^~ | ) util/data-convert-json.c: In function ‘output_sample_callchain_entry’: util/data-convert-json.c:124:2: error: implicit declaration of function ‘output_json_key_format’; did you mean ‘output_json_format’? [-Werror=implicit-function-declaration] 124 | output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip); | ^~~~~~~~~~~~~~~~~~~~~~ | output_json_format Also had to add this patch to fix errors reported by various versions of clang: - if (al && al->sym && al->sym->name && strlen(al->sym->name) > 0) { + if (al && al->sym && al->sym->namelen) { al->sym->name is a zero sized array, to avoid one extra alloc in the symbol__new() constructor, sym->namelen carries its strlen. Committer testing: $ ls -la out.json ls: cannot access 'out.json': No such file or directory $ perf record sleep 0.1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.001 MB perf.data (8 samples) ] $ perf report --stats | grep -w SAMPLE SAMPLE events: 8 $ perf data convert --to-json out.json [ perf data convert: Converted 'perf.data' into JSON data 'out.json' ] [ perf data convert: Converted and wrote 0.002 MB (8 samples) ] $ ls -la out.json -rw-rw-r--. 
1 acme acme 2017 Apr 26 17:29 out.json $ cat out.json { "linux-perf-json-version": 1, "headers": { "header-version": 1, "captured-on": "2021-04-26T20:28:57Z", "data-offset": 432, "data-size": 1016, "feat-offset": 1448, "hostname": "five", "os-release": "5.11.14-200.fc33.x86_64", "arch": "x86_64", "cpu-desc": "AMD Ryzen 9 3900X 12-Core Processor", "cpuid": "AuthenticAMD,23,113,0", "nrcpus-online": 24, "nrcpus-avail": 24, "perf-version": "5.12.gee134f3189bd", "cmdline": [ "/home/acme/bin/perf", "record", "sleep", "0.1" ] }, "samples": [ { "timestamp": 170517539043684, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6268827" } ] }, { "timestamp": 170517539048443, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa661359d" } ] }, { "timestamp": 170517539051018, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6311e18" } ] }, { "timestamp": 170517539053652, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb77b4812b", "symbol": "_dl_start", "dso": "ld-2.32.so" } ] }, { "timestamp": 170517539055306, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa6269286" } ] }, { "timestamp": 170517539057590, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0xffffffffa62abd8b" } ] }, { "timestamp": 170517539067559, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb77b5e9e9", "symbol": "__GI___tunables_init", "dso": "ld-2.32.so" } ] }, { "timestamp": 170517539282452, "pid": 375844, "tid": 375844, "comm": "sleep", "callchain": [ { "ip": "0x7fdb779978d2", "symbol": "getenv", "dso": "libc-2.32.so" } ] } ] } $ Signed-off-by: Nicholas Fraser Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Changbin Du Cc: Ian Rogers Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tan Xiaojun Cc: Ulrich Czekalla Link: http://lore.kernel.org/lkml/3884969f-804d-2f53-c648-e2b0bd85edff@codeweavers.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-data.txt | 5 +- tools/perf/builtin-data.c | 26 +- tools/perf/util/Build | 1 + tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/data-convert-bt.h | 11 - tools/perf/util/data-convert-json.c | 384 +++++++++++++++++++++++++ tools/perf/util/data-convert.h | 10 + 7 files changed, 418 insertions(+), 21 deletions(-) delete mode 100644 tools/perf/util/data-convert-bt.h create mode 100644 tools/perf/util/data-convert-json.c diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index 726b9bc9e1a7..417bf17e265c 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -17,7 +17,7 @@ Data file related processing. COMMANDS -------- convert:: - Converts perf data file into another format (only CTF [1] format is support by now). + Converts perf data file into another format. It's possible to set data-convert debug variable to get debug messages from conversion, like: perf --debug data-convert data convert ... @@ -27,6 +27,9 @@ OPTIONS for 'convert' --to-ctf:: Triggers the CTF conversion, specify the path of CTF data directory. +--to-json:: + Triggers JSON conversion. Specify the JSON filename to output. + --tod:: Convert time to wall clock time. 
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c index 8d23b8d6ee8e..15ca23675ef0 100644 --- a/tools/perf/builtin-data.c +++ b/tools/perf/builtin-data.c @@ -7,7 +7,6 @@ #include "debug.h" #include #include "data-convert.h" -#include "data-convert-bt.h" typedef int (*data_cmd_fn_t)(int argc, const char **argv); @@ -55,7 +54,8 @@ static const char * const data_convert_usage[] = { static int cmd_data_convert(int argc, const char **argv) { - const char *to_ctf = NULL; + const char *to_json = NULL; + const char *to_ctf = NULL; struct perf_data_convert_opts opts = { .force = false, .all = false, @@ -63,6 +63,7 @@ static int cmd_data_convert(int argc, const char **argv) const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_STRING('i', "input", &input_name, "file", "input file name"), + OPT_STRING(0, "to-json", &to_json, NULL, "Convert to JSON format"), #ifdef HAVE_LIBBABELTRACE_SUPPORT OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"), OPT_BOOLEAN(0, "tod", &opts.tod, "Convert time to wall clock time"), @@ -72,11 +73,6 @@ static int cmd_data_convert(int argc, const char **argv) OPT_END() }; -#ifndef HAVE_LIBBABELTRACE_SUPPORT - pr_err("No conversion support compiled in. perf should be compiled with environment variables LIBBABELTRACE=1 and LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n"); - return -1; -#endif - argc = parse_options(argc, argv, options, data_convert_usage, 0); if (argc) { @@ -84,11 +80,25 @@ static int cmd_data_convert(int argc, const char **argv) return -1; } + if (to_json && to_ctf) { + pr_err("You cannot specify both --to-ctf and --to-json.\n"); + return -1; + } + if (!to_json && !to_ctf) { + pr_err("You must specify one of --to-ctf or --to-json.\n"); + return -1; + } + + if (to_json) + return bt_convert__perf2json(input_name, to_json, &opts); + if (to_ctf) { #ifdef HAVE_LIBBABELTRACE_SUPPORT return bt_convert__perf2ctf(input_name, to_ctf, &opts); #else - pr_err("The libbabeltrace support is not compiled in.\n"); + pr_err("The libbabeltrace support is not compiled in. 
perf should be " + "compiled with environment variables LIBBABELTRACE=1 and " + "LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n"); return -1; #endif } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 4f8265ddda0e..ae0dafa75a69 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -156,6 +156,7 @@ perf-$(CONFIG_LIBUNWIND_X86) += libunwind/x86_32.o perf-$(CONFIG_LIBUNWIND_AARCH64) += libunwind/arm64.o perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o +perf-y += data-convert-json.o perf-y += scripting-engines/ diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 8893c65391d0..5ee5c9ef7c58 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -21,7 +21,7 @@ #include #include #include "asm/bug.h" -#include "data-convert-bt.h" +#include "data-convert.h" #include "session.h" #include "debug.h" #include "tool.h" diff --git a/tools/perf/util/data-convert-bt.h b/tools/perf/util/data-convert-bt.h deleted file mode 100644 index 821674d63c4e..000000000000 --- a/tools/perf/util/data-convert-bt.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DATA_CONVERT_BT_H -#define __DATA_CONVERT_BT_H -#include "data-convert.h" -#ifdef HAVE_LIBBABELTRACE_SUPPORT - -int bt_convert__perf2ctf(const char *input_name, const char *to_ctf, - struct perf_data_convert_opts *opts); - -#endif /* HAVE_LIBBABELTRACE_SUPPORT */ -#endif /* __DATA_CONVERT_BT_H */ diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c new file mode 100644 index 000000000000..355cd1948bdf --- /dev/null +++ b/tools/perf/util/data-convert-json.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * JSON export. + * + * Copyright (C) 2021, CodeWeavers Inc. + */ + +#include "data-convert.h" + +#include +#include +#include +#include + +#include "linux/compiler.h" +#include "linux/err.h" +#include "util/auxtrace.h" +#include "util/debug.h" +#include "util/dso.h" +#include "util/event.h" +#include "util/evsel.h" +#include "util/evlist.h" +#include "util/header.h" +#include "util/map.h" +#include "util/session.h" +#include "util/symbol.h" +#include "util/thread.h" +#include "util/tool.h" + +struct convert_json { + struct perf_tool tool; + FILE *out; + bool first; + u64 events_count; +}; + +// Outputs a JSON-encoded string surrounded by quotes with characters escaped. +static void output_json_string(FILE *out, const char *s) +{ + fputc('"', out); + while (*s) { + switch (*s) { + + // required escapes with special forms as per RFC 8259 + case '"': fputs("\\\"", out); break; + case '\\': fputs("\\\\", out); break; + case '\b': fputs("\\b", out); break; + case '\f': fputs("\\f", out); break; + case '\n': fputs("\\n", out); break; + case '\r': fputs("\\r", out); break; + case '\t': fputs("\\t", out); break; + + default: + // all other control characters must be escaped by hex code + if (*s <= 0x1f) + fprintf(out, "\\u%04x", *s); + else + fputc(*s, out); + break; + } + + ++s; + } + fputc('"', out); +} + +// Outputs an optional comma, newline and indentation to delimit a new value +// from the previous one in a JSON object or array. +static void output_json_delimiters(FILE *out, bool comma, int depth) +{ + int i; + + if (comma) + fputc(',', out); + fputc('\n', out); + for (i = 0; i < depth; ++i) + fputc('\t', out); +} + +// Outputs a printf format string (with delimiter) as a JSON value. 
+__printf(4, 5) +static void output_json_format(FILE *out, bool comma, int depth, const char *format, ...) +{ + va_list args; + + output_json_delimiters(out, comma, depth); + va_start(args, format); + vfprintf(out, format, args); + va_end(args); +} + +// Outputs a JSON key-value pair where the value is a string. +static void output_json_key_string(FILE *out, bool comma, int depth, + const char *key, const char *value) +{ + output_json_delimiters(out, comma, depth); + output_json_string(out, key); + fputs(": ", out); + output_json_string(out, value); +} + +// Outputs a JSON key-value pair where the value is a printf format string. +__printf(5, 6) +static void output_json_key_format(FILE *out, bool comma, int depth, + const char *key, const char *format, ...) +{ + va_list args; + + output_json_delimiters(out, comma, depth); + output_json_string(out, key); + fputs(": ", out); + va_start(args, format); + vfprintf(out, format, args); + va_end(args); +} + +static void output_sample_callchain_entry(struct perf_tool *tool, + u64 ip, struct addr_location *al) +{ + struct convert_json *c = container_of(tool, struct convert_json, tool); + FILE *out = c->out; + + output_json_format(out, false, 4, "{"); + output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip); + + if (al && al->sym && al->sym->namelen) { + fputc(',', out); + output_json_key_string(out, false, 5, "symbol", al->sym->name); + + if (al->map && al->map->dso) { + const char *dso = al->map->dso->short_name; + + if (dso && strlen(dso) > 0) { + fputc(',', out); + output_json_key_string(out, false, 5, "dso", dso); + } + } + } + + output_json_format(out, false, 4, "}"); +} + +static int process_sample_event(struct perf_tool *tool, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + struct evsel *evsel __maybe_unused, + struct machine *machine) +{ + struct convert_json *c = container_of(tool, struct convert_json, tool); + FILE *out = c->out; + struct addr_location al, tal; + u8 cpumode = PERF_RECORD_MISC_USER; + + if (machine__resolve(machine, &al, sample) < 0) { + pr_err("Sample resolution failed!\n"); + return -1; + } + + ++c->events_count; + + if (c->first) + c->first = false; + else + fputc(',', out); + output_json_format(out, false, 2, "{"); + + output_json_key_format(out, false, 3, "timestamp", "%" PRIi64, sample->time); + output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_); + output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid); + + if (al.thread->cpu >= 0) + output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu); + + output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread)); + + output_json_key_format(out, true, 3, "callchain", "["); + if (sample->callchain) { + unsigned int i; + bool ok; + bool first_callchain = true; + + for (i = 0; i < sample->callchain->nr; ++i) { + u64 ip = sample->callchain->ips[i]; + + if (ip >= PERF_CONTEXT_MAX) { + switch (ip) { + case PERF_CONTEXT_HV: + cpumode = PERF_RECORD_MISC_HYPERVISOR; + break; + case PERF_CONTEXT_KERNEL: + cpumode = PERF_RECORD_MISC_KERNEL; + break; + case PERF_CONTEXT_USER: + cpumode = PERF_RECORD_MISC_USER; + break; + default: + pr_debug("invalid callchain context: %" + PRId64 "\n", (s64) ip); + break; + } + continue; + } + + if (first_callchain) + first_callchain = false; + else + fputc(',', out); + + ok = thread__find_symbol(al.thread, cpumode, ip, &tal); + output_sample_callchain_entry(tool, ip, ok ? 
&tal : NULL); + } + } else { + output_sample_callchain_entry(tool, sample->ip, &al); + } + output_json_format(out, false, 3, "]"); + + output_json_format(out, false, 2, "}"); + return 0; +} + +static void output_headers(struct perf_session *session, struct convert_json *c) +{ + struct stat st; + struct perf_header *header = &session->header; + int ret; + int fd = perf_data__fd(session->data); + int i; + FILE *out = c->out; + + output_json_key_format(out, false, 2, "header-version", "%u", header->version); + + ret = fstat(fd, &st); + if (ret >= 0) { + time_t stctime = st.st_mtime; + char buf[256]; + + strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&stctime)); + output_json_key_string(out, true, 2, "captured-on", buf); + } else { + pr_debug("Failed to get mtime of source file, not writing captured-on"); + } + + output_json_key_format(out, true, 2, "data-offset", "%" PRIu64, header->data_offset); + output_json_key_format(out, true, 2, "data-size", "%" PRIu64, header->data_size); + output_json_key_format(out, true, 2, "feat-offset", "%" PRIu64, header->feat_offset); + + output_json_key_string(out, true, 2, "hostname", header->env.hostname); + output_json_key_string(out, true, 2, "os-release", header->env.os_release); + output_json_key_string(out, true, 2, "arch", header->env.arch); + + output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc); + output_json_key_string(out, true, 2, "cpuid", header->env.cpuid); + output_json_key_format(out, true, 2, "nrcpus-online", "%u", header->env.nr_cpus_online); + output_json_key_format(out, true, 2, "nrcpus-avail", "%u", header->env.nr_cpus_avail); + + if (header->env.clock.enabled) { + output_json_key_format(out, true, 2, "clockid", + "%u", header->env.clock.clockid); + output_json_key_format(out, true, 2, "clock-time", + "%" PRIu64, header->env.clock.clockid_ns); + output_json_key_format(out, true, 2, "real-time", + "%" PRIu64, header->env.clock.tod_ns); + } + + output_json_key_string(out, true, 2, "perf-version", header->env.version); + + output_json_key_format(out, true, 2, "cmdline", "["); + for (i = 0; i < header->env.nr_cmdline; i++) { + output_json_delimiters(out, i != 0, 3); + output_json_string(c->out, header->env.cmdline_argv[i]); + } + output_json_format(out, false, 2, "]"); +} + +int bt_convert__perf2json(const char *input_name, const char *output_name, + struct perf_data_convert_opts *opts __maybe_unused) +{ + struct perf_session *session; + int fd; + int ret = -1; + + struct convert_json c = { + .tool = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .mmap2 = perf_event__process_mmap2, + .comm = perf_event__process_comm, + .namespaces = perf_event__process_namespaces, + .cgroup = perf_event__process_cgroup, + .exit = perf_event__process_exit, + .fork = perf_event__process_fork, + .lost = perf_event__process_lost, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, + .id_index = perf_event__process_id_index, + .auxtrace_info = perf_event__process_auxtrace_info, + .auxtrace = perf_event__process_auxtrace, + .event_update = perf_event__process_event_update, + .ordered_events = true, + .ordering_requires_timestamps = true, + }, + .first = true, + .events_count = 0, + }; + + struct perf_data data = { + .mode = PERF_DATA_MODE_READ, + .path = input_name, + .force = opts->force, + }; + + if (opts->all) { + pr_err("--all is currently unsupported for JSON output.\n"); + goto err; + } + if (opts->tod) { + pr_err("--tod is currently unsupported for JSON output.\n"); + goto 
err; + } + + fd = open(output_name, O_CREAT | O_WRONLY | (opts->force ? O_TRUNC : O_EXCL), 0666); + if (fd == -1) { + if (errno == EEXIST) + pr_err("Output file exists. Use --force to overwrite it.\n"); + else + pr_err("Error opening output file!\n"); + goto err; + } + + c.out = fdopen(fd, "w"); + if (!c.out) { + fprintf(stderr, "Error opening output file!\n"); + close(fd); + goto err; + } + + session = perf_session__new(&data, false, &c.tool); + if (IS_ERR(session)) { + fprintf(stderr, "Error creating perf session!\n"); + goto err_fclose; + } + + if (symbol__init(&session->header.env) < 0) { + fprintf(stderr, "Symbol init error!\n"); + goto err_session_delete; + } + + // The opening brace is printed manually because it isn't delimited from a + // previous value (i.e. we don't want a leading newline) + fputc('{', c.out); + + // Version number for future-proofing. Most additions should be able to be + // done in a backwards-compatible way so this should only need to be bumped + // if some major breaking change must be made. + output_json_format(c.out, false, 1, "\"linux-perf-json-version\": 1"); + + // Output headers + output_json_format(c.out, true, 1, "\"headers\": {"); + output_headers(session, &c); + output_json_format(c.out, false, 1, "}"); + + // Output samples + output_json_format(c.out, true, 1, "\"samples\": ["); + perf_session__process_events(session); + output_json_format(c.out, false, 1, "]"); + output_json_format(c.out, false, 0, "}"); + fputc('\n', c.out); + + fprintf(stderr, + "[ perf data convert: Converted '%s' into JSON data '%s' ]\n", + data.path, output_name); + + fprintf(stderr, + "[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n", + (ftell(c.out)) / 1024.0 / 1024.0, c.events_count); + + ret = 0; +err_session_delete: + perf_session__delete(session); +err_fclose: + fclose(c.out); +err: + return ret; +} diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h index feab5f114e37..1b4c5f598415 100644 --- a/tools/perf/util/data-convert.h +++ b/tools/perf/util/data-convert.h @@ -2,10 +2,20 @@ #ifndef __DATA_CONVERT_H #define __DATA_CONVERT_H +#include + struct perf_data_convert_opts { bool force; bool all; bool tod; }; +#ifdef HAVE_LIBBABELTRACE_SUPPORT +int bt_convert__perf2ctf(const char *input_name, const char *to_ctf, + struct perf_data_convert_opts *opts); +#endif /* HAVE_LIBBABELTRACE_SUPPORT */ + +int bt_convert__perf2json(const char *input_name, const char *to_ctf, + struct perf_data_convert_opts *opts); + #endif /* __DATA_CONVERT_H */ -- Gitee From 8eed5fffc924c7733ccc8da926e6108db1d2ccf5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 14 Apr 2020 22:40:48 -0700 Subject: [PATCH 211/257] perf bench: Add a multi-threaded synthesize benchmark commit 13edc237200c75425ab0e3fe4b4c75dafb468c2e upstream By default this isn't run as it reads /proc and may not have access. For consistency, modify the single threaded benchmark to compute an average time per event. 
Committer testing: $ grep -m1 "model name" /proc/cpuinfo model name : Intel(R) Core(TM) i7-8650U CPU @ 1.90GHz $ grep "model name" /proc/cpuinfo | wc -l 8 $ $ perf bench internals synthesize -h # Running 'internals/synthesize' benchmark: Usage: perf bench internals synthesize -I, --multi-iterations Number of iterations used to compute multi-threaded average -i, --single-iterations Number of iterations used to compute single-threaded average -M, --max-threads Maximum number of threads in multithreaded bench -m, --min-threads Minimum number of threads in multithreaded bench -s, --st Run single threaded benchmark -t, --mt Run multi-threaded benchmark $ $ perf bench internals synthesize -t # Running 'internals/synthesize' benchmark: Computing performance of multi threaded perf event synthesis by synthesizing events on CPU 0: Number of synthesis threads: 1 Average synthesis took: 65449.000 usec (+- 586.442 usec) Average num. events: 9405.400 (+- 0.306) Average time per event 6.959 usec Number of synthesis threads: 2 Average synthesis took: 37838.300 usec (+- 130.259 usec) Average num. events: 9501.800 (+- 20.469) Average time per event 3.982 usec Number of synthesis threads: 3 Average synthesis took: 48551.400 usec (+- 225.686 usec) Average num. events: 9544.000 (+- 0.000) Average time per event 5.087 usec Number of synthesis threads: 4 Average synthesis took: 29632.500 usec (+- 50.808 usec) Average num. events: 9544.000 (+- 0.000) Average time per event 3.105 usec Number of synthesis threads: 5 Average synthesis took: 33920.400 usec (+- 284.509 usec) Average num. events: 9544.000 (+- 0.000) Average time per event 3.554 usec Number of synthesis threads: 6 Average synthesis took: 27604.100 usec (+- 72.344 usec) Average num. events: 9548.000 (+- 0.000) Average time per event 2.891 usec Number of synthesis threads: 7 Average synthesis took: 25406.300 usec (+- 933.371 usec) Average num. events: 9545.500 (+- 0.167) Average time per event 2.662 usec Number of synthesis threads: 8 Average synthesis took: 24110.400 usec (+- 73.229 usec) Average num. 
events: 9551.000 (+- 0.000) Average time per event 2.524 usec $ Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Andrey Zhizhikin Cc: Kan Liang Cc: Kefeng Wang Cc: Mark Rutland Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lore.kernel.org/lkml/20200415054050.31645-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/bench/synthesize.c | 211 ++++++++++++++++++++++++++++++---- 1 file changed, 186 insertions(+), 25 deletions(-) diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index 6291257bc9c9..8d624aea1c5e 100644 --- a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -10,60 +10,105 @@ #include "bench.h" #include "../util/debug.h" #include "../util/session.h" +#include "../util/stat.h" #include "../util/synthetic-events.h" #include "../util/target.h" #include "../util/thread_map.h" #include "../util/tool.h" +#include "../util/util.h" +#include #include #include #include -static unsigned int iterations = 10000; +static unsigned int min_threads = 1; +static unsigned int max_threads = UINT_MAX; +static unsigned int single_iterations = 10000; +static unsigned int multi_iterations = 10; +static bool run_st; +static bool run_mt; static const struct option options[] = { - OPT_UINTEGER('i', "iterations", &iterations, - "Number of iterations used to compute average"), + OPT_BOOLEAN('s', "st", &run_st, "Run single threaded benchmark"), + OPT_BOOLEAN('t', "mt", &run_mt, "Run multi-threaded benchmark"), + OPT_UINTEGER('m', "min-threads", &min_threads, + "Minimum number of threads in multithreaded bench"), + OPT_UINTEGER('M', "max-threads", &max_threads, + "Maximum number of threads in multithreaded bench"), + OPT_UINTEGER('i', "single-iterations", &single_iterations, + "Number of iterations used to compute single-threaded average"), + OPT_UINTEGER('I', "multi-iterations", &multi_iterations, + "Number of iterations used to compute multi-threaded average"), OPT_END() }; -static const char *const usage[] = { +static const char *const bench_usage[] = { "perf bench internals synthesize ", NULL }; +static atomic_t event_count; -static int do_synthesize(struct perf_session *session, - struct perf_thread_map *threads, - struct target *target, bool data_mmap) +static int process_synthesized_event(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + atomic_inc(&event_count); + return 0; +} + +static int do_run_single_threaded(struct perf_session *session, + struct perf_thread_map *threads, + struct target *target, bool data_mmap) { const unsigned int nr_threads_synthesize = 1; struct timeval start, end, diff; u64 runtime_us; unsigned int i; - double average; + double time_average, time_stddev, event_average, event_stddev; int err; + struct stats time_stats, event_stats; - gettimeofday(&start, NULL); - for (i = 0; i < iterations; i++) { - err = machine__synthesize_threads(&session->machines.host, - target, threads, data_mmap, + init_stats(&time_stats); + init_stats(&event_stats); + + for (i = 0; i < single_iterations; i++) { + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, threads, + process_synthesized_event, + data_mmap, nr_threads_synthesize); if (err) return err; 
+ + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); } - gettimeofday(&end, NULL); - timersub(&end, &start, &diff); - runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; - average = (double)runtime_us/(double)iterations; - printf("Average %ssynthesis took: %f usec\n", - data_mmap ? "data " : "", average); + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average %ssynthesis took: %.3f usec (+- %.3f usec)\n", + data_mmap ? "data " : "", time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); return 0; } -int bench_synthesize(int argc, const char **argv) +static int run_single_threaded(void) { - struct perf_tool tool; struct perf_session *session; struct target target = { .pid = "self", @@ -71,8 +116,7 @@ int bench_synthesize(int argc, const char **argv) struct perf_thread_map *threads; int err; - argc = parse_options(argc, argv, options, usage, 0); - + perf_set_singlethreaded(); session = perf_session__new(NULL, false, NULL); if (IS_ERR(session)) { pr_err("Session creation failed.\n"); @@ -84,13 +128,16 @@ int bench_synthesize(int argc, const char **argv) err = -ENOMEM; goto err_out; } - perf_tool__fill_defaults(&tool); - err = do_synthesize(session, threads, &target, false); + puts( +"Computing performance of single threaded perf event synthesis by\n" +"synthesizing events on the perf process itself:"); + + err = do_run_single_threaded(session, threads, &target, false); if (err) goto err_out; - err = do_synthesize(session, threads, &target, true); + err = do_run_single_threaded(session, threads, &target, true); err_out: if (threads) @@ -99,3 +146,117 @@ int bench_synthesize(int argc, const char **argv) perf_session__delete(session); return err; } + +static int do_run_multi_threaded(struct target *target, + unsigned int nr_threads_synthesize) +{ + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev, event_average, event_stddev; + int err; + struct stats time_stats, event_stats; + struct perf_session *session; + + init_stats(&time_stats); + init_stats(&event_stats); + for (i = 0; i < multi_iterations; i++) { + session = perf_session__new(NULL, false, NULL); + if (!session) + return -ENOMEM; + + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, NULL, + process_synthesized_event, + false, + nr_threads_synthesize); + if (err) { + perf_session__delete(session); + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); + perf_session__delete(session); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average synthesis took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. 
events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); + return 0; +} + +static int run_multi_threaded(void) +{ + struct target target = { + .cpu_list = "0" + }; + unsigned int nr_threads_synthesize; + int err; + + if (max_threads == UINT_MAX) + max_threads = sysconf(_SC_NPROCESSORS_ONLN); + + puts( +"Computing performance of multi threaded perf event synthesis by\n" +"synthesizing events on CPU 0:"); + + for (nr_threads_synthesize = min_threads; + nr_threads_synthesize <= max_threads; + nr_threads_synthesize++) { + if (nr_threads_synthesize == 1) + perf_set_singlethreaded(); + else + perf_set_multithreaded(); + + printf(" Number of synthesis threads: %u\n", + nr_threads_synthesize); + + err = do_run_multi_threaded(&target, nr_threads_synthesize); + if (err) + return err; + } + perf_set_singlethreaded(); + return 0; +} + +int bench_synthesize(int argc, const char **argv) +{ + int err = 0; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + /* + * If neither single threaded or multi-threaded are specified, default + * to running just single threaded. + */ + if (!run_st && !run_mt) + run_st = true; + + if (run_st) + err = run_single_threaded(); + + if (!err && run_mt) + err = run_multi_threaded(); + + return err; +} -- Gitee From 6cedbabb22f4dfc2f21012d603ac9ce815fe2ab7 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 2 Sep 2020 22:05:26 +0800 Subject: [PATCH 212/257] perf bench: The do_run_multi_threaded() function must use IS_ERR(perf_session__new()) commit e4d71f79cf5c10fa8bc6f5d3bebea570c9c438f1 upstream In case of error, the function perf_session__new() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR() Committer notes: This wasn't compiling due to an extraneous '{' not matched by a '}', fix it. Fixes: 13edc237200c ("perf bench: Add a multi-threaded synthesize benchmark") Signed-off-by: YueHaibing Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200902140526.26916-1-yuehaibing@huawei.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/bench/synthesize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index 8d624aea1c5e..b2924e3181dc 100644 --- a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -162,8 +162,8 @@ static int do_run_multi_threaded(struct target *target, init_stats(&event_stats); for (i = 0; i < multi_iterations; i++) { session = perf_session__new(NULL, false, NULL); - if (!session) - return -ENOMEM; + if (IS_ERR(session)) + return PTR_ERR(session); atomic_set(&event_count, 0); gettimeofday(&start, NULL); -- Gitee From f1b0ac10a9786f663dc1d434dba69e6fedd82ea7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2020 15:13:02 +0900 Subject: [PATCH 213/257] perf evlist: Support pipe mode display commit 96aea4daa6cb893d339d80ce14727e6421991d8b upstream Likewise, perf evlist command should print event attributes by reading PERF_RECORD_HEADER_ATTR records. 
Before: $ perf record -o- true | ./perf evlist -i- (prints nothing) After: $ perf record -o- true | ./perf evlist -i- cycles:pppH Committer testing: Verbose mode also works as expected: $ perf record -o- true | perf evlist -i- cycles:uhH $ perf record -o- true | perf evlist -vi- cycles:uhH: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 $ Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20201210061302.88213-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-evlist.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 440501994931..4529b0a998b4 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -17,6 +17,14 @@ #include "util/data.h" #include "util/debug.h" #include +#include "util/tool.h" + +static int process_header_feature(struct perf_session *session __maybe_unused, + union perf_event *event __maybe_unused) +{ + session_done = 1; + return 0; +} static int __cmd_evlist(const char *file_name, struct perf_attr_details *details) { @@ -27,12 +35,20 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details .mode = PERF_DATA_MODE_READ, .force = details->force, }; + struct perf_tool tool = { + /* only needed for pipe mode */ + .attr = perf_event__process_attr, + .feature = process_header_feature, + }; bool has_tracepoint = false; - session = perf_session__new(&data, 0, NULL); + session = perf_session__new(&data, 0, &tool); if (IS_ERR(session)) return PTR_ERR(session); + if (data.is_pipe) + perf_session__process_events(session); + evlist__for_each_entry(session->evlist, pos) { perf_evsel__fprintf(pos, details, stdout); -- Gitee From 57c4800397b60f4201dd098bfe4beb400419e00b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 10 Oct 2019 11:36:45 -0700 Subject: [PATCH 214/257] perf annotate: Avoid reallocation in objdump parsing commit 353dcaa2f979a04f9397306ae3165ccf9fc731df upstream Objdump output is parsed using getline which allocates memory for the read. Getline will realloc if the memory is too small, but currently the line is always freed after the call. Simplify parse_objdump_line by performing the reading in symbol__disassemble. 
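
The buffer reuse relies on the standard POSIX getline() contract; a minimal sketch of the pattern (hypothetical helper, not perf code):

    #include <stdio.h>
    #include <stdlib.h>

    /* One buffer serves the whole loop: getline() reallocates it whenever a
     * line doesn't fit, and it is freed once after the loop, not per line. */
    static int count_lines(FILE *file)
    {
            char *line = NULL;
            size_t line_len = 0;
            int nline = 0;

            while (getline(&line, &line_len, file) >= 0)
                    nline++;

            free(line);
            return nline;
    }
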
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20191010183649.23768-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/annotate.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0344cb86fa95..ab4a9896c986 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1486,24 +1486,17 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start * means that it's not a disassembly line so should be treated differently. * The ops.raw part will be parsed further according to type of the instruction. */ -static int symbol__parse_objdump_line(struct symbol *sym, FILE *file, +static int symbol__parse_objdump_line(struct symbol *sym, struct annotate_args *args, - int *line_nr) + char *line, int *line_nr) { struct map *map = args->ms.map; struct annotation *notes = symbol__annotation(sym); struct disasm_line *dl; - char *line = NULL, *parsed_line, *tmp, *tmp2; - size_t line_len; + char *parsed_line, *tmp, *tmp2; s64 line_ip, offset = -1; regmatch_t match[2]; - if (getline(&line, &line_len, file) < 0) - return -1; - - if (!line) - return -1; - line_ip = -1; parsed_line = strim(line); @@ -1540,7 +1533,6 @@ static int symbol__parse_objdump_line(struct symbol *sym, FILE *file, args->ms.sym = sym; dl = disasm_line__new(args); - free(line); (*line_nr)++; if (dl == NULL) @@ -1873,6 +1865,8 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) int lineno = 0; int nline; pid_t pid; + char *line; + size_t line_len; int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename)); if (err) @@ -1961,18 +1955,26 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) goto out_free_command; } + /* Storage for getline. */ + line = NULL; + line_len = 0; + nline = 0; while (!feof(file)) { + if (getline(&line, &line_len, file) < 0 || !line) + break; + /* * The source code line number (lineno) needs to be kept in * across calls to symbol__parse_objdump_line(), so that it * can associate it with the instructions till the next one. * See disasm_line__new() and struct disasm_line::line_nr. */ - if (symbol__parse_objdump_line(sym, file, args, &lineno) < 0) + if (symbol__parse_objdump_line(sym, args, line, &lineno) < 0) break; nline++; } + free(line); if (nline == 0) pr_err("No output from %s\n", command); -- Gitee From f8e70257204ad3f4607cea8f9f32cc51221e6436 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 10 Oct 2019 11:36:47 -0700 Subject: [PATCH 215/257] perf annotate: Don't pipe objdump output through 'grep' command commit 7a675de428364a16038a8e6ed557daf0a009ce9c upstream Simplify the objdump command by not piping the output of objdump through grep. Instead, drop lines that match the grep pattern during the reading loop. 
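
A sketch of the in-loop replacement for the dropped grep -v "$1:" filter (hedged: the actual check in the diff below tests symfs_filename directly on each line returned by getline()):

    #include <stdbool.h>
    #include <string.h>

    /* True for objdump header lines of the form "<filename>: ...", which
     * the shell pipeline previously discarded with grep. */
    static bool is_filename_line(const char *line, const char *filename)
    {
            const char *match = strstr(line, filename);

            return match && match[strlen(filename)] == ':';
    }
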
Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20191010183649.23768-4-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/annotate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ab4a9896c986..1e8b1533363f 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1905,7 +1905,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 - " -l -d %s %s -C \"$1\" 2>/dev/null|grep -v \"$1:\"|expand", + " -l -d %s %s -C \"$1\" 2>/dev/null|expand", opts->objdump_path ?: "objdump", opts->disassembler_style ? "-M " : "", opts->disassembler_style ?: "", @@ -1961,9 +1961,16 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) nline = 0; while (!feof(file)) { + const char *match; + if (getline(&line, &line_len, file) < 0 || !line) break; + /* Skip lines containing "filename:" */ + match = strstr(line, symfs_filename); + if (match && match[strlen(symfs_filename)] == ':') + continue; + /* * The source code line number (lineno) needs to be kept in * across calls to symbol__parse_objdump_line(), so that it -- Gitee From 19cbe0fbcea81157bebd9a0e527bf0aaea4b6efd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 10 Oct 2019 11:36:48 -0700 Subject: [PATCH 216/257] perf annotate: Don't pipe objdump output through 'expand' command commit b34b45eef16d814a93a52f2e76db803bb38939a0 upstream Avoiding a pipe allows objdump command failures to surface. Move to the caller of symbol__parse_objdump_line the call to strim that removes leading and trailing tabs. Add a new expand_tabs function that if a tab is present allocate a new line in which tabs are expanded. In symbol__parse_objdump_line the line had no leading spaces, so simplify the line_ip processing. Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20191010183649.23768-5-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/annotate.c | 95 ++++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 19 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1e8b1533363f..668891c4252c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1488,35 +1488,24 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start */ static int symbol__parse_objdump_line(struct symbol *sym, struct annotate_args *args, - char *line, int *line_nr) + char *parsed_line, int *line_nr) { struct map *map = args->ms.map; struct annotation *notes = symbol__annotation(sym); struct disasm_line *dl; - char *parsed_line, *tmp, *tmp2; + char *tmp; s64 line_ip, offset = -1; regmatch_t match[2]; - line_ip = -1; - parsed_line = strim(line); - /* /filename:linenr ? Save line number and ignore. 
*/ if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) { *line_nr = atoi(parsed_line + match[1].rm_so); return 0; } - tmp = skip_spaces(parsed_line); - if (*tmp) { - /* - * Parse hexa addresses followed by ':' - */ - line_ip = strtoull(tmp, &tmp2, 16); - if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0') - line_ip = -1; - } - - if (line_ip != -1) { + /* Process hex address followed by ':'. */ + line_ip = strtoull(parsed_line, &tmp, 16); + if (parsed_line != tmp && tmp[0] == ':' && tmp[1] != '\0') { u64 start = map__rip_2objdump(map, sym->start), end = map__rip_2objdump(map, sym->end); @@ -1524,7 +1513,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, if ((u64)line_ip < start || (u64)line_ip >= end) offset = -1; else - parsed_line = tmp2 + 1; + parsed_line = tmp + 1; } args->offset = offset; @@ -1850,6 +1839,67 @@ static int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, } #endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) +/* + * Possibly create a new version of line with tabs expanded. Returns the + * existing or new line, storage is updated if a new line is allocated. If + * allocation fails then NULL is returned. + */ +static char *expand_tabs(char *line, char **storage, size_t *storage_len) +{ + size_t i, src, dst, len, new_storage_len, num_tabs; + char *new_line; + size_t line_len = strlen(line); + + for (num_tabs = 0, i = 0; i < line_len; i++) + if (line[i] == '\t') + num_tabs++; + + if (num_tabs == 0) + return line; + + /* + * Space for the line and '\0', less the leading and trailing + * spaces. Each tab may introduce 7 additional spaces. + */ + new_storage_len = line_len + 1 + (num_tabs * 7); + + new_line = malloc(new_storage_len); + if (new_line == NULL) { + pr_err("Failure allocating memory for tab expansion\n"); + return NULL; + } + + /* + * Copy regions starting at src and expand tabs. If there are two + * adjacent tabs then 'src == i', the memcpy is of size 0 and the spaces + * are inserted. + */ + for (i = 0, src = 0, dst = 0; i < line_len && num_tabs; i++) { + if (line[i] == '\t') { + len = i - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst++] = ' '; + while (dst % 8 != 0) + new_line[dst++] = ' '; + src = i + 1; + num_tabs--; + } + } + + /* Expand the last region. */ + len = line_len + 1 - src; + memcpy(&new_line[dst], &line[src], len); + dst += len; + new_line[dst] = '\0'; + + free(*storage); + *storage = new_line; + *storage_len = new_storage_len; + return new_line; + +} + static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) { struct annotation_options *opts = args->options; @@ -1905,7 +1955,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 - " -l -d %s %s -C \"$1\" 2>/dev/null|expand", + " -l -d %s %s -C \"$1\" 2>/dev/null", opts->objdump_path ?: "objdump", opts->disassembler_style ? 
"-M " : "", opts->disassembler_style ?: "", @@ -1962,6 +2012,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) nline = 0; while (!feof(file)) { const char *match; + char *expanded_line; if (getline(&line, &line_len, file) < 0 || !line) break; @@ -1971,13 +2022,19 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) if (match && match[strlen(symfs_filename)] == ':') continue; + expanded_line = strim(line); + expanded_line = expand_tabs(expanded_line, &line, &line_len); + if (!expanded_line) + break; + /* * The source code line number (lineno) needs to be kept in * across calls to symbol__parse_objdump_line(), so that it * can associate it with the instructions till the next one. * See disasm_line__new() and struct disasm_line::line_nr. */ - if (symbol__parse_objdump_line(sym, args, line, &lineno) < 0) + if (symbol__parse_objdump_line(sym, args, expanded_line, + &lineno) < 0) break; nline++; } -- Gitee From 3a05383198355d1887a2bf1d3b5ada24554e7d3b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 10 Oct 2019 11:36:49 -0700 Subject: [PATCH 217/257] perf annotate: Fix objdump --no-show-raw-insn flag commit c5baf9089246c1356705c9ba36d767ee8ce43dd2 upstream Remove redirection of objdump's stderr to /dev/null to help diagnose failures. Fix the '--no-show-raw' flag to be '--no-show-raw-insn' which binutils is permissive and allows, but fails with LLVM objdump. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20191010183649.23768-6-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/annotate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 668891c4252c..6483bab68da9 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1955,13 +1955,13 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 - " -l -d %s %s -C \"$1\" 2>/dev/null", + " -l -d %s %s -C \"$1\"", opts->objdump_path ?: "objdump", opts->disassembler_style ? "-M " : "", opts->disassembler_style ?: "", map__rip_2objdump(map, sym->start), map__rip_2objdump(map, sym->end), - opts->show_asm_raw ? "" : "--no-show-raw", + opts->show_asm_raw ? "" : "--no-show-raw-insn", opts->annotate_src ? "-S" : ""); if (err < 0) { -- Gitee From 15856a6afe8c8f4cfd9bba5566324565087c9e0a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 7 Jan 2020 13:04:44 -0800 Subject: [PATCH 218/257] perf tools: Support --prefix/--prefix-strip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3b0b16bf8cb92ae67968c1abb7b335032b899b33 upstream The objdump utility has useful --prefix / --prefix-strip options to allow changing source code file names hardcoded into executables' debug info. Add options to 'perf report', 'perf top' and 'perf annotate', which are then passed to objdump. 
$ mkdir foo $ echo 'main() { for (;;); }' > foo/foo.c $ gcc -g foo/foo.c foo/foo.c:1:1: warning: return type defaults to ‘int’ [-Wimplicit-int] 1 | main() { for (;;); } | ^~~~ $ perf record ./a.out ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.230 MB perf.data (5721 samples) ] $ mv foo bar $ perf annotate $ perf annotate --prefix=/home/ak/lsrc/git/bar --prefix-strip=5 Signed-off-by: Andi Kleen Tested-by: Jiri Olsa LPU-Reference: 20200107210444.214071-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-annotate.txt | 6 ++++++ tools/perf/Documentation/perf-report.txt | 6 ++++++ tools/perf/Documentation/perf-top.txt | 6 ++++++ tools/perf/builtin-annotate.c | 7 +++++++ tools/perf/builtin-report.c | 7 +++++++ tools/perf/builtin-top.c | 7 +++++++ tools/perf/util/annotate.c | 19 +++++++++++++++++-- tools/perf/util/annotate.h | 5 +++++ 8 files changed, 61 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index e8c972f89357..1b5042f134a8 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -112,6 +112,12 @@ OPTIONS --objdump=:: Path to objdump binary. +--prefix=PREFIX:: +--prefix-strip=N:: + Remove first N entries from source file path names in executables + and add PREFIX. This allows to display source code compiled on systems + with different file system layout. + --skip-missing:: Skip symbols that cannot be annotated. diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 7315f155803f..f627a962b80e 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -367,6 +367,12 @@ OPTIONS --objdump=:: Path to objdump binary. +--prefix=PREFIX:: +--prefix-strip=N:: + Remove first N entries from source file path names in executables + and add PREFIX. This allows to display source code compiled on systems + with different file system layout. + --group:: Show event group information together. It forces group output also if there are no groups defined in data file. diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 5596129a71cf..324b6b53c86b 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -158,6 +158,12 @@ Default is to monitor all CPUS. -M:: --disassembler-style=:: Set disassembler style for objdump. +--prefix=PREFIX:: +--prefix-strip=N:: + Remove first N entries from source file path names in executables + and add PREFIX. This allows to display source code compiled on systems + with different file system layout. + --source:: Interleave source code with assembly code. Enabled by default, disable with --no-source. diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 2cfb6e5da381..72543a1cb7fb 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -535,6 +535,10 @@ int cmd_annotate(int argc, const char **argv) "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &annotate.opts.disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), + OPT_STRING(0, "prefix", &annotate.opts.prefix, "prefix", + "Add prefix to source file path names in programs (with --prefix-strip)"), + OPT_STRING(0, "prefix-strip", &annotate.opts.prefix_strip, "N", + "Strip first N entries of source file path name in programs (with --prefix)"), OPT_STRING(0, "objdump", &annotate.opts.objdump_path, "path", "objdump binary to use for disassembly and annotations"), OPT_BOOLEAN(0, "group", &symbol_conf.event_group, @@ -574,6 +578,9 @@ int cmd_annotate(int argc, const char **argv) annotate.sym_hist_filter = argv[0]; } + if (annotate_check_args(&annotate.opts) < 0) + return -EINVAL; + if (symbol_conf.show_nr_samples && annotate.use_gtk) { pr_err("--show-nr-samples is not available in --gtk mode at this time\n"); return ret; diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f5c5e0cdfcb3..ab9914cc3f6f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1164,6 +1164,10 @@ int cmd_report(int argc, const char **argv) "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &report.annotation_opts.disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), + OPT_STRING(0, "prefix", &report.annotation_opts.prefix, "prefix", + "Add prefix to source file path names in programs (with --prefix-strip)"), + OPT_STRING(0, "prefix-strip", &report.annotation_opts.prefix_strip, "N", + "Strip first N entries of source file path name in programs (with --prefix)"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), OPT_BOOLEAN_SET(0, "group", &symbol_conf.event_group, &report.group_set, @@ -1241,6 +1245,9 @@ int cmd_report(int argc, const char **argv) report.symbol_filter_str = argv[0]; } + if (annotate_check_args(&report.annotation_opts) < 0) + return -EINVAL; + if (report.mmaps_mode) report.tasks_mode = true; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 0fec22232daf..1d69739040bd 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1513,6 +1513,10 @@ int cmd_top(int argc, const char **argv) "objdump binary to use for disassembly and annotations"), OPT_STRING('M', "disassembler-style", &top.annotation_opts.disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), + OPT_STRING(0, "prefix", &top.annotation_opts.prefix, "prefix", + "Add prefix to source file path names in programs (with --prefix-strip)"), + OPT_STRING(0, "prefix-strip", &top.annotation_opts.prefix_strip, "N", + "Strip first N entries of source file path name in programs (with --prefix)"), OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"), OPT_CALLBACK(0, "percent-limit", &top, "percent", "Don't show entries under that percent", parse_percent_limit), @@ -1567,6 +1571,9 @@ int cmd_top(int argc, const char **argv) if (argc) usage_with_options(top_usage, options); + if (annotate_check_args(&top.annotation_opts) < 0) + goto out_delete_evlist; + if (!top.evlist->core.nr_entries && evlist__add_default(top.evlist) < 0) { pr_err("Not enough memory for event selector list\n"); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 6483bab68da9..ac7a5de062cc 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1955,14 +1955,20 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args) err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 - " -l -d %s %s -C \"$1\"", + " -l -d %s %s %s %c%s%c %s%s -C \"$1\"", opts->objdump_path ?: "objdump", opts->disassembler_style ? "-M " : "", opts->disassembler_style ?: "", map__rip_2objdump(map, sym->start), map__rip_2objdump(map, sym->end), opts->show_asm_raw ? "" : "--no-show-raw-insn", - opts->annotate_src ? "-S" : ""); + opts->annotate_src ? "-S" : "", + opts->prefix ? "--prefix " : "", + opts->prefix ? '"' : ' ', + opts->prefix ?: "", + opts->prefix ? '"' : ' ', + opts->prefix_strip ? "--prefix-strip=" : "", + opts->prefix_strip ?: ""); if (err < 0) { pr_err("Failure allocating memory for the command to run\n"); @@ -3200,3 +3206,12 @@ int annotate_parse_percent_type(const struct option *opt, const char *_str, free(str1); return err; } + +int annotate_check_args(struct annotation_options *args) +{ + if (args->prefix_strip && !args->prefix) { + pr_err("--prefix-strip requires --prefix\n"); + return -1; + } + return 0; +} diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d76fd0e81f46..47f3d2fd20ce 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -93,6 +93,8 @@ struct annotation_options { int context; const char *objdump_path; const char *disassembler_style; + const char *prefix; + const char *prefix_strip; unsigned int percent_type; }; @@ -419,4 +421,7 @@ void annotation_config__init(void); int annotate_parse_percent_type(const struct option *opt, const char *_str, int unset); + +int annotate_check_args(struct annotation_options *args); + #endif /* __PERF_ANNOTATE_H */ -- Gitee From 3a78a35c1e4c00deda45a690fba96779707738f5 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 15 Jul 2021 18:07:14 +0200 Subject: [PATCH 219/257] perf report: Free generated help strings for sort option commit a37338aad8c4d8676173ead14e881d2ec308155c upstream ASan reports the memory leak of the strings allocated by sort_help() when running perf report. This patch changes the returned pointer to char* (instead of const char*), saves it in a temporary variable, and finally deallocates it at function exit. 
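
The fix follows the classic single-exit ownership pattern; a condensed sketch (cmd_report() has many more early returns, all rerouted through the same label in the diff below):

    #include <stdlib.h>

    char *sort_help(const char *prefix);    /* perf helper; returns a malloc'd string */

    int cmd_report_sketch(void)
    {
            char *sort_order_help = sort_help("sort by key(s):");
            char *field_order_help = sort_help("output field(s): overhead period sample ");
            int ret = -1;

            if (!sort_order_help || !field_order_help)
                    goto exit;

            /* ... option parsing and report generation; every failure path
             * jumps to the exit label instead of returning directly ... */
            ret = 0;
    exit:
            free(sort_order_help);   /* free(NULL) is a no-op, so this is always safe */
            free(field_order_help);
            return ret;
    }
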
Signed-off-by: Riccardo Mancini Fixes: 702fb9b415e7c99b ("perf report: Show all sort keys in help output") Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/a38b13f02812a8a6759200b9063c6191337f44d4.1626343282.git.rickyman7@gmail.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-report.c | 33 ++++++++++++++++++++++----------- tools/perf/util/sort.c | 2 +- tools/perf/util/sort.h | 2 +- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ab9914cc3f6f..142d99ffd624 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1071,6 +1071,8 @@ int cmd_report(int argc, const char **argv) .socket_filter = -1, .annotation_opts = annotation__default_options, }; + char *sort_order_help = sort_help("sort by key(s):"); + char *field_order_help = sort_help("output field(s): overhead period sample "); const struct option options[] = { OPT_STRING('i', "input", &input_name, "file", "input file name"), @@ -1105,9 +1107,9 @@ int cmd_report(int argc, const char **argv) OPT_BOOLEAN(0, "header-only", &report.header_only, "Show only data header."), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", - sort_help("sort by key(s):")), + sort_order_help), OPT_STRING('F', "fields", &field_order, "key[,keys...]", - sort_help("output field(s): overhead period sample ")), + field_order_help), OPT_BOOLEAN(0, "show-cpu-utilization", &symbol_conf.show_cpu_utilization, "Show sample percentage for different cpu modes"), OPT_BOOLEAN_FLAG(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, @@ -1227,11 +1229,11 @@ int cmd_report(int argc, const char **argv) char sort_tmp[128]; if (ret < 0) - return ret; + goto exit; ret = perf_config(report__config, &report); if (ret) - return ret; + goto exit; argc = parse_options(argc, argv, options, report_usage, 0); if (argc) { @@ -1245,8 +1247,10 @@ int cmd_report(int argc, const char **argv) report.symbol_filter_str = argv[0]; } - if (annotate_check_args(&report.annotation_opts) < 0) - return -EINVAL; + if (annotate_check_args(&report.annotation_opts) < 0) { + ret = -EINVAL; + goto exit; + } if (report.mmaps_mode) report.tasks_mode = true; @@ -1257,12 +1261,14 @@ int cmd_report(int argc, const char **argv) if (symbol_conf.vmlinux_name && access(symbol_conf.vmlinux_name, R_OK)) { pr_err("Invalid file: %s\n", symbol_conf.vmlinux_name); - return -EINVAL; + ret = -EINVAL; + goto exit; } if (symbol_conf.kallsyms_name && access(symbol_conf.kallsyms_name, R_OK)) { pr_err("Invalid file: %s\n", symbol_conf.kallsyms_name); - return -EINVAL; + ret = -EINVAL; + goto exit; } if (report.inverted_callchain) @@ -1286,12 +1292,14 @@ int cmd_report(int argc, const char **argv) repeat: session = perf_session__new(&data, false, &report.tool); - if (IS_ERR(session)) - return PTR_ERR(session); + if (IS_ERR(session)) { + ret = PTR_ERR(session); + goto exit; + } ret = evswitch__init(&report.evswitch, session->evlist, stderr); if (ret) - return ret; + goto exit; if (zstd_init(&(session->zstd_data), 0) < 0) pr_warning("Decompression initialization failed. 
Reported data may be incomplete.\n"); @@ -1498,5 +1506,8 @@ int cmd_report(int argc, const char **argv) } zstd_fini(&(session->zstd_data)); perf_session__delete(session); +exit: + free(sort_order_help); + free(field_order_help); return ret; } diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4027906fd3e3..b7cba38b0fa2 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3142,7 +3142,7 @@ static void add_hpp_sort_string(struct strbuf *sb, struct hpp_dimension *s, int add_key(sb, s[i].name, llen); } -const char *sort_help(const char *prefix) +char *sort_help(const char *prefix) { struct strbuf sb; char *s; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 7b93f34ac1f4..c047b4591b70 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -285,7 +285,7 @@ void reset_output_field(void); void sort__setup_elide(FILE *fp); void perf_hpp__set_elide(int idx, bool elide); -const char *sort_help(const char *prefix); +char *sort_help(const char *prefix); int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset); -- Gitee From 9032d4e5357a0d57d463ec59ae29650c0676a305 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Jul 2021 15:31:49 -0700 Subject: [PATCH 220/257] perf tools: Remove repipe argument from perf_session__new() commit 2681bd85a4b92788e265934d0d76bd56b5b08d16 upstream The repipe argument is only used by perf inject; all the other callers pass 'false'. Let's remove it from the function signature and add __perf_session__new() to be called from perf inject directly. This is in preparation for changing the pipe input/output. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210719223153.1618812-2-namhyung@kernel.org [ Fixed up some trivial conflicts as this patchset fell thru the cracks ;-( ] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/bench/synthesize.c | 4 ++-- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-buildid-cache.c | 2 +- tools/perf/builtin-buildid-list.c | 2 +- tools/perf/builtin-c2c.c | 2 +- tools/perf/builtin-diff.c | 4 ++-- tools/perf/builtin-evlist.c | 2 +- tools/perf/builtin-inject.c | 2 +- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-kvm.c | 4 ++-- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-mem.c | 3 +-- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-sched.c | 4 ++-- tools/perf/builtin-script.c | 4 ++-- tools/perf/builtin-stat.c | 4 ++-- tools/perf/builtin-timechart.c | 3 +-- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 2 +- tools/perf/tests/topology.c | 4 ++-- tools/perf/util/data-convert-bt.c | 2 +- tools/perf/util/data-convert-json.c | 2 +- tools/perf/util/session.c | 5 +++-- tools/perf/util/session.h | 12 ++++++++++-- 25 files changed, 43 insertions(+), 36 deletions(-) diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c index b2924e3181dc..05f7c923c745 100644 --- a/tools/perf/bench/synthesize.c +++ b/tools/perf/bench/synthesize.c @@ -117,7 +117,7 @@ static int run_single_threaded(void) int err; perf_set_singlethreaded(); - session = perf_session__new(NULL, false, NULL); + session = perf_session__new(NULL, NULL); if (IS_ERR(session)) { pr_err("Session creation failed.\n"); return PTR_ERR(session); @@ -161,7 +161,7 @@ static int do_run_multi_threaded(struct target *target, init_stats(&time_stats);
init_stats(&event_stats); for (i = 0; i < multi_iterations; i++) { - session = perf_session__new(NULL, false, NULL); + session = perf_session__new(NULL, NULL); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 72543a1cb7fb..e26125c7c106 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -591,7 +591,7 @@ int cmd_annotate(int argc, const char **argv) data.path = input_name; - annotate.session = perf_session__new(&data, false, &annotate.tool); + annotate.session = perf_session__new(&data, &annotate.tool); if (IS_ERR(annotate.session)) return PTR_ERR(annotate.session); diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 39efa51d7fb3..18821556b603 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -422,7 +422,7 @@ int cmd_buildid_cache(int argc, const char **argv) data.path = missing_filename; data.force = force; - session = perf_session__new(&data, false, NULL); + session = perf_session__new(&data, NULL); if (IS_ERR(session)) return PTR_ERR(session); } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index e3ef75583514..27b0ab5d03ef 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -65,7 +65,7 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (filename__fprintf_build_id(input_name, stdout) > 0) goto out; - session = perf_session__new(&data, false, &build_id__mark_dso_hit_ops); + session = perf_session__new(&data, &build_id__mark_dso_hit_ops); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 29d460c30176..cd25ba49195c 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2789,7 +2789,7 @@ static int perf_c2c__report(int argc, const char **argv) goto out; } - session = perf_session__new(&data, 0, &c2c.tool); + session = perf_session__new(&data, &c2c.tool); if (IS_ERR(session)) { err = PTR_ERR(session); pr_debug("Error creating perf session\n"); diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 48ce1f99c1ed..f679bf7c04b4 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1154,7 +1154,7 @@ static int check_file_brstack(void) int i; data__for_each_file(i, d) { - d->session = perf_session__new(&d->data, false, &pdiff.tool); + d->session = perf_session__new(&d->data, &pdiff.tool); if (IS_ERR(d->session)) { pr_err("Failed to open %s\n", d->data.path); return PTR_ERR(d->session); @@ -1186,7 +1186,7 @@ static int __cmd_diff(void) ret = -EINVAL; data__for_each_file(i, d) { - d->session = perf_session__new(&d->data, false, &pdiff.tool); + d->session = perf_session__new(&d->data, &pdiff.tool); if (IS_ERR(d->session)) { ret = PTR_ERR(d->session); pr_err("Failed to open %s\n", d->data.path); diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 4529b0a998b4..d1d7a8370c09 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -42,7 +42,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details }; bool has_tracepoint = false; - session = perf_session__new(&data, 0, &tool); + session = perf_session__new(&data, &tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 49f121762a8f..00311f8c79d8 100644 --- a/tools/perf/builtin-inject.c +++ 
b/tools/perf/builtin-inject.c @@ -853,7 +853,7 @@ int cmd_inject(int argc, const char **argv) inject.tool.ordered_events = inject.sched_stat; data.path = inject.input_name; - inject.session = perf_session__new(&data, inject.output.is_pipe, &inject.tool); + inject.session = __perf_session__new(&data, inject.output.is_pipe, &inject.tool); if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); goto out_close_output; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 9661671cc26e..afa36cbd34e3 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1957,7 +1957,7 @@ int cmd_kmem(int argc, const char **argv) data.path = input_name; - kmem_session = session = perf_session__new(&data, false, &perf_kmem); + kmem_session = session = perf_session__new(&data, &perf_kmem); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index a7ee11cd57c0..1b49ae6001d6 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1094,7 +1094,7 @@ static int read_events(struct perf_kvm_stat *kvm) }; kvm->tool = eops; - kvm->session = perf_session__new(&file, false, &kvm->tool); + kvm->session = perf_session__new(&file, &kvm->tool); if (IS_ERR(kvm->session)) { pr_err("Initializing perf session failed\n"); return PTR_ERR(kvm->session); @@ -1449,7 +1449,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, /* * perf session */ - kvm->session = perf_session__new(&data, false, &kvm->tool); + kvm->session = perf_session__new(&data, &kvm->tool); if (IS_ERR(kvm->session)) { err = PTR_ERR(kvm->session); goto out; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index d06665077dfe..0b29babb38bc 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -872,7 +872,7 @@ static int __cmd_report(bool display_info) .force = force, }; - session = perf_session__new(&data, false, &eops); + session = perf_session__new(&data, &eops); if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); return PTR_ERR(session); diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index a13f5817d6fc..a0d93d7e0f63 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -247,8 +247,7 @@ static int report_raw_events(struct perf_mem *mem) .force = mem->force, }; int ret; - struct perf_session *session = perf_session__new(&data, false, - &mem->tool); + struct perf_session *session = perf_session__new(&data, &mem->tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6280f6430050..75ad52c08bab 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1443,7 +1443,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGUSR2, SIG_IGN); } - session = perf_session__new(data, false, tool); + session = perf_session__new(data, tool); if (IS_ERR(session)) { pr_err("Perf session creation failed.\n"); return PTR_ERR(session); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 142d99ffd624..3a1953c39a26 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1291,7 +1291,7 @@ int cmd_report(int argc, const char **argv) data.force = symbol_conf.force; repeat: - session = perf_session__new(&data, false, &report.tool); + session = perf_session__new(&data, &report.tool); if (IS_ERR(session)) { ret = PTR_ERR(session); goto exit; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 
5cacc4f84c8d..817a3efd22b4 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1798,7 +1798,7 @@ static int perf_sched__read_events(struct perf_sched *sched) }; int rc = -1; - session = perf_session__new(&data, false, &sched->tool); + session = perf_session__new(&data, &sched->tool); if (IS_ERR(session)) { pr_debug("Error creating perf session"); return PTR_ERR(session); @@ -2990,7 +2990,7 @@ static int perf_sched__timehist(struct perf_sched *sched) symbol_conf.use_callchain = sched->show_callchain; - session = perf_session__new(&data, false, &sched->tool); + session = perf_session__new(&data, &sched->tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index bb64dbfe043a..e67a534836a7 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3100,7 +3100,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array, int num, char *temp; int i = 0; - session = perf_session__new(&data, false, NULL); + session = perf_session__new(&data, NULL); if (IS_ERR(session)) return PTR_ERR(session); @@ -3766,7 +3766,7 @@ int cmd_script(int argc, const char **argv) use_browser = 0; } - session = perf_session__new(&data, false, &script.tool); + session = perf_session__new(&data, &script.tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 40c6c2d822e1..748be8cf5a73 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1537,7 +1537,7 @@ static int __cmd_record(int argc, const char **argv) return -1; } - session = perf_session__new(data, false, NULL); + session = perf_session__new(data, NULL); if (IS_ERR(session)) { pr_err("Perf session creation failed\n"); return PTR_ERR(session); @@ -1736,7 +1736,7 @@ static int __cmd_report(int argc, const char **argv) perf_stat.data.path = input_name; perf_stat.data.mode = PERF_DATA_MODE_READ; - session = perf_session__new(&perf_stat.data, false, &perf_stat.tool); + session = perf_session__new(&perf_stat.data, &perf_stat.tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 9e84fae9b096..eda2892aa261 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1598,8 +1598,7 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) .force = tchart->force, }; - struct perf_session *session = perf_session__new(&data, false, - &tchart->tool); + struct perf_session *session = perf_session__new(&data, &tchart->tool); int ret = -EINVAL; if (IS_ERR(session)) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 1d69739040bd..3892e56e7248 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1683,7 +1683,7 @@ int cmd_top(int argc, const char **argv) signal(SIGWINCH, winch_sig); } - top.session = perf_session__new(NULL, false, NULL); + top.session = perf_session__new(NULL, NULL); if (IS_ERR(top.session)) { status = PTR_ERR(top.session); goto out_delete_evlist; diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 1f40c683bba8..06edb52090d4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3636,7 +3636,7 @@ static int trace__replay(struct trace *trace) /* add tid to output */ trace->multiple_threads = true; - session = perf_session__new(&data, false, &trace->tool); + session = perf_session__new(&data, &trace->tool); if (IS_ERR(session)) return 
PTR_ERR(session); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index f4a2c0df0954..c8af1310bd7c 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -37,7 +37,7 @@ static int session_write_header(char *path) .mode = PERF_DATA_MODE_WRITE, }; - session = perf_session__new(&data, false, NULL); + session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); session->evlist = perf_evlist__new_default(); @@ -67,7 +67,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) }; int i; - session = perf_session__new(&data, false, NULL); + session = perf_session__new(&data, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); /* On platforms with large numbers of CPUs process_cpu_topology() diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 5ee5c9ef7c58..c764c6da33a2 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1634,7 +1634,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, err = -1; /* perf.data session */ - session = perf_session__new(&data, 0, &c.tool); + session = perf_session__new(&data, &c.tool); if (IS_ERR(session)) return PTR_ERR(session); diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c index 355cd1948bdf..f1ab6edba446 100644 --- a/tools/perf/util/data-convert-json.c +++ b/tools/perf/util/data-convert-json.c @@ -334,7 +334,7 @@ int bt_convert__perf2json(const char *input_name, const char *output_name, goto err; } - session = perf_session__new(&data, false, &c.tool); + session = perf_session__new(&data, &c.tool); if (IS_ERR(session)) { fprintf(stderr, "Error creating perf session!\n"); goto err_fclose; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index c13eecf9b29e..6da10007c49e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -184,8 +184,9 @@ static int ordered_events__deliver_event(struct ordered_events *oe, session->tool, event->file_offset); } -struct perf_session *perf_session__new(struct perf_data *data, - bool repipe, struct perf_tool *tool) +struct perf_session *__perf_session__new(struct perf_data *data, + bool repipe, + struct perf_tool *tool) { int ret = -ENOMEM; struct perf_session *session = zalloc(sizeof(*session)); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index b4c9428c18f0..a0ff19189fdc 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -54,8 +54,16 @@ struct decomp { struct perf_tool; -struct perf_session *perf_session__new(struct perf_data *data, - bool repipe, struct perf_tool *tool); +struct perf_session *__perf_session__new(struct perf_data *data, + bool repipe, + struct perf_tool *tool); + +static inline struct perf_session *perf_session__new(struct perf_data *data, + struct perf_tool *tool) +{ + return __perf_session__new(data, false, tool); +} + void perf_session__delete(struct perf_session *session); void perf_event_header__bswap(struct perf_event_header *hdr); -- Gitee From 3880cb5149663e1226a3360280af76e062b3609d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 May 2020 11:50:22 +0200 Subject: [PATCH 221/257] perf session: Try to read pipe data from file commit 14d3d54052539a1e833b5b615add5bc9ac3ef76a upstream Ian came up with the idea of adding support for reading pipe data from a regular file as well.
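The detection strategy is simple enough to sketch in isolation: probe the start of the input for the pipe-style header, and only fall back to the regular file-header path when the probe fails. The fragment below is an illustrative approximation rather than the perf code; the header struct is reduced to the magic/size pair that the patch actually validates (f_header.size == sizeof(f_header)):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Simplified pipe-style header: a magic value plus its own size. */
struct pipe_hdr {
	char	 magic[8];	/* perf data uses "PERFILE2" */
	uint64_t size;
};

/*
 * Returns 1 if the stream starts with a pipe header (the caller then
 * switches to pipe mode), 0 if not (the caller rewinds and retries
 * the regular file header), and -1 on a short read.
 */
static int probe_pipe_header(FILE *fp)
{
	struct pipe_hdr hdr;

	if (fread(&hdr, sizeof(hdr), 1, fp) != 1)
		return -1;

	if (memcmp(hdr.magic, "PERFILE2", 8) != 0 || hdr.size != sizeof(hdr)) {
		rewind(fp);	/* not pipe data: fall back to file mode */
		return 0;
	}
	return 1;
}

The failing case that motivated the change, shown next, is exactly this situation: a pipe-format stream stored in a regular file.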
Currently pipe mode files fail like: $ perf record -o - sleep 1 > /tmp/perf.pipe.data $ perf report -i /tmp/perf.pipe.data incompatible file format (rerun with -v to learn more) This patch adds support for that by trying the pipe header first and, if it is successfully detected, switching the perf data to pipe mode. Committer testing: # ls # perf record -a -o - sleep 1 > /tmp/perf.pipe.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.000 MB - ] # ls # perf report -i /tmp/perf.pipe.data | head -25 # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 511 of event 'cycles' # Event count (approx.): 178447276 # # Overhead Command Shared Object Symbol # ........ ........ ................. ........................................................................................... # 65.49% swapper [kernel.kallsyms] [k] native_safe_halt 6.45% chromium libblink_core.so [.] blink::SelectorChecker::CheckOne 4.08% chromium libblink_core.so [.] blink::SelectorQuery::ExecuteForTraverseRoot 2.25% chromium libblink_core.so [.] blink::SelectorQuery::FindTraverseRootsAndExecute 2.11% chromium libblink_core.so [.] blink::SelectorChecker::MatchSelector 1.91% chromium libblink_core.so [.] blink::Node::OwnerShadowHost 1.31% chromium libblink_core.so [.] blink::Node::parentNode@plt 1.22% chromium libblink_core.so [.] blink::Node::parentNode 0.59% chromium libblink_core.so [.] blink::AnyAttributeMatches 0.58% chromium libv8.so [.] v8::internal::GlobalHandles::Create 0.58% chromium libblink_core.so [.] blink::NodeTraversal::NextAncestorSibling 0.55% chromium libv8.so [.] v8::internal::RegExpGlobalCache::RegExpGlobalCache 0.55% chromium libblink_core.so [.] blink::Node::ContainingShadowRoot 0.55% chromium libblink_core.so [.] blink::NodeTraversal::NextAncestorSibling@plt # Original-patch-by: Ian Rogers Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ian Rogers Cc: Michael Petlan Cc: Namhyung Kim Cc: Paul Khuong Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200507095024.2789147-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/header.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 6008fd97eb71..be4afe86c408 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3863,7 +3863,7 @@ static int perf_header__read_pipe(struct perf_session *session) return -EINVAL; } - return 0; + return f_header.size == sizeof(f_header) ? 0 : -1; } static int read_attr(int fd, struct perf_header *ph, @@ -3965,7 +3965,7 @@ int perf_session__read_header(struct perf_session *session) struct perf_file_header f_header; struct perf_file_attr f_attr; u64 f_id; - int nr_attrs, nr_ids, i, j; + int nr_attrs, nr_ids, i, j, err; int fd = perf_data__fd(data); session->evlist = evlist__new(); @@ -3974,8 +3974,16 @@ int perf_session__read_header(struct perf_session *session) session->evlist->env = &header->env; session->machines.host.env = &header->env; - if (perf_data__is_pipe(data)) - return perf_header__read_pipe(session); + + /* + * We can read 'pipe' data event from regular file, + * check for the pipe header regardless of source.
+ */ + err = perf_header__read_pipe(session); + if (!err || (err && perf_data__is_pipe(data))) { + data->is_pipe = true; + return err; + } if (perf_file_header__read(&f_header, header, fd) < 0) return -EINVAL; -- Gitee From 657934d70a020913822e5e544d68b6400aa834bd Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 5 Feb 2021 11:54:15 +0800 Subject: [PATCH 222/257] perf tools: Simplify the calculation of variables commit 52bcc6031c0b459baa1f2cacd1fd4adc78ae0127 upstream Fix the following coccicheck warning: ./tools/perf/util/header.c:3809:18-20: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/1612497255-87189-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/util/header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index be4afe86c408..4478dbc31576 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3980,7 +3980,7 @@ int perf_session__read_header(struct perf_session *session) * check for the pipe header regardless of source. */ err = perf_header__read_pipe(session); - if (!err || (err && perf_data__is_pipe(data))) { + if (!err || perf_data__is_pipe(data)) { data->is_pipe = true; return err; } -- Gitee From 06f323db5e205c30a25a812e9973e50d88657b5d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Oct 2020 16:02:09 +0900 Subject: [PATCH 223/257] perf bench: Add build-id injection benchmark commit 0bf02a0d80427f263195c1e5a4c8ada14bd5d261 upstream Sometimes I can see that 'perf record' piped with 'perf inject' takes a long time processing build-ids. So introduce an inject-build-id benchmark to the internals benchmark suite to measure its overhead regularly. It runs the 'perf inject' command internally and feeds the given number of synthesized events (basically MMAP2 + SAMPLE). Usage: perf bench internals inject-build-id -i, --iterations Number of iterations used to compute average (default: 100) -m, --nr-mmaps Number of mmap events for each iteration (default: 100) -n, --nr-samples Number of sample events per mmap event (default: 100) -v, --verbose be more verbose (show iteration count, DSO name, etc) By default, it measures average processing time of 100 MMAP2 events and 10000 SAMPLE events. Below is a result on my laptop.
$ perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 25.789 msec (+- 0.202 msec) Average time per event: 2.528 usec (+- 0.020 usec) Average memory usage: 8411 KB (+- 7 KB) Committer testing: $ perf bench Usage: perf bench [] [] # List of all available benchmark collections: sched: Scheduler and IPC benchmarks syscall: System call benchmarks mem: Memory access benchmarks numa: NUMA scheduling and MM benchmarks futex: Futex stressing benchmarks epoll: Epoll stressing benchmarks internals: Perf-internals benchmarks all: All benchmarks $ perf bench internals # List of available benchmarks for collection 'internals': synthesize: Benchmark perf event synthesis kallsyms-parse: Benchmark kallsyms parsing inject-build-id: Benchmark build-id injection $ perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.202 msec (+- 0.059 msec) Average time per event: 1.392 usec (+- 0.006 usec) Average memory usage: 12650 KB (+- 10 KB) Average build-id-all injection took: 12.831 msec (+- 0.071 msec) Average time per event: 1.258 usec (+- 0.007 usec) Average memory usage: 11895 KB (+- 10 KB) $ $ perf stat -r5 perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.380 msec (+- 0.056 msec) Average time per event: 1.410 usec (+- 0.006 usec) Average memory usage: 12608 KB (+- 11 KB) Average build-id-all injection took: 11.889 msec (+- 0.064 msec) Average time per event: 1.166 usec (+- 0.006 usec) Average memory usage: 11838 KB (+- 10 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.246 msec (+- 0.065 msec) Average time per event: 1.397 usec (+- 0.006 usec) Average memory usage: 12744 KB (+- 10 KB) Average build-id-all injection took: 12.019 msec (+- 0.066 msec) Average time per event: 1.178 usec (+- 0.006 usec) Average memory usage: 11963 KB (+- 10 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.321 msec (+- 0.067 msec) Average time per event: 1.404 usec (+- 0.007 usec) Average memory usage: 12690 KB (+- 10 KB) Average build-id-all injection took: 11.909 msec (+- 0.041 msec) Average time per event: 1.168 usec (+- 0.004 usec) Average memory usage: 11938 KB (+- 10 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.287 msec (+- 0.059 msec) Average time per event: 1.401 usec (+- 0.006 usec) Average memory usage: 12864 KB (+- 10 KB) Average build-id-all injection took: 11.862 msec (+- 0.058 msec) Average time per event: 1.163 usec (+- 0.006 usec) Average memory usage: 12103 KB (+- 10 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 14.402 msec (+- 0.053 msec) Average time per event: 1.412 usec (+- 0.005 usec) Average memory usage: 12876 KB (+- 10 KB) Average build-id-all injection took: 11.826 msec (+- 0.061 msec) Average time per event: 1.159 usec (+- 0.006 usec) Average memory usage: 12111 KB (+- 10 KB) Performance counter stats for 'perf bench internals inject-build-id' (5 runs): 4,267.48 msec task-clock:u # 1.502 CPUs utilized ( +- 0.14% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 102,092 page-faults:u # 0.024 M/sec ( +- 0.08% ) 3,894,589,578 cycles:u # 0.913 GHz ( +- 0.19% ) (83.49%) 140,078,421 stalled-cycles-frontend:u # 3.60% frontend cycles idle ( +- 0.77% ) (83.34%) 948,581,189 stalled-cycles-backend:u # 24.36% backend cycles idle ( +- 
0.46% ) (83.25%) 5,835,587,719 instructions:u # 1.50 insn per cycle # 0.16 stalled cycles per insn ( +- 0.21% ) (83.24%) 1,267,423,636 branches:u # 296.996 M/sec ( +- 0.22% ) (83.12%) 17,484,290 branch-misses:u # 1.38% of all branches ( +- 0.12% ) (83.55%) 2.84176 +- 0.00222 seconds time elapsed ( +- 0.08% ) $ Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20201012070214.2074921-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/bench/Build | 1 + tools/perf/bench/bench.h | 1 + tools/perf/bench/inject-buildid.c | 460 ++++++++++++++++++++++++++++++ tools/perf/builtin-bench.c | 1 + tools/perf/builtin-inject.c | 9 +- tools/perf/util/build-id.h | 4 + 6 files changed, 471 insertions(+), 5 deletions(-) create mode 100644 tools/perf/bench/inject-buildid.c diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 042827385c87..075132693375 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -9,6 +9,7 @@ perf-y += futex-lock-pi.o perf-y += epoll-wait.o perf-y += epoll-ctl.o perf-y += synthesize.o +perf-y += inject-buildid.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 9d797dd89cf1..02b93a7a6f4a 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -32,6 +32,7 @@ int bench_futex_lock_pi(int argc, const char **argv); int bench_epoll_wait(int argc, const char **argv); int bench_epoll_ctl(int argc, const char **argv); int bench_synthesize(int argc, const char **argv); +int bench_inject_build_id(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c new file mode 100644 index 000000000000..0fccf2a9e95b --- /dev/null +++ b/tools/perf/bench/inject-buildid.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bench.h" +#include "util/data.h" +#include "util/stat.h" +#include "util/debug.h" +#include "util/event.h" +#include "util/symbol.h" +#include "util/session.h" +#include "util/build-id.h" +#include "util/synthetic-events.h" + +#define MMAP_DEV_MAJOR 8 +#define DSO_MMAP_RATIO 4 + +static unsigned int iterations = 100; +static unsigned int nr_mmaps = 100; +static unsigned int nr_samples = 100; /* samples per mmap */ + +static u64 bench_sample_type; +static u16 bench_id_hdr_size; + +struct bench_data { + int pid; + int input_pipe[2]; + int output_pipe[2]; + pthread_t th; +}; + +struct bench_dso { + struct list_head list; + char *name; + int ino; +}; + +static int nr_dsos; +static struct bench_dso *dsos; + +extern int cmd_inject(int argc, const char *argv[]); + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average (default: 100)"), + OPT_UINTEGER('m', "nr-mmaps", &nr_mmaps, + "Number of mmap events for each iteration (default: 100)"), + OPT_UINTEGER('n', "nr-samples", &nr_samples, + "Number of sample events per mmap event (default: 100)"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show iteration count, DSO name, etc)"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals 
inject-build-id ", + NULL +}; + +/* + * Helper for collect_dso that adds the given file as a dso to dso_list + * if it contains a build-id. Stops after collecting 4 times more than + * we need (for MMAP2 events). + */ +static int add_dso(const char *fpath, const struct stat *sb __maybe_unused, + int typeflag, struct FTW *ftwbuf __maybe_unused) +{ + struct bench_dso *dso = &dsos[nr_dsos]; + unsigned char build_id[BUILD_ID_SIZE]; + + if (typeflag == FTW_D || typeflag == FTW_SL) + return 0; + + if (filename__read_build_id(fpath, build_id, BUILD_ID_SIZE) < 0) + return 0; + + dso->name = realpath(fpath, NULL); + if (dso->name == NULL) + return -1; + + dso->ino = nr_dsos++; + pr_debug2(" Adding DSO: %s\n", fpath); + + /* stop if we collected enough DSOs */ + if ((unsigned int)nr_dsos == DSO_MMAP_RATIO * nr_mmaps) + return 1; + + return 0; +} + +static void collect_dso(void) +{ + dsos = calloc(nr_mmaps * DSO_MMAP_RATIO, sizeof(*dsos)); + if (dsos == NULL) { + printf(" Memory allocation failed\n"); + exit(1); + } + + if (nftw("/usr/lib/", add_dso, 10, FTW_PHYS) < 0) + return; + + pr_debug(" Collected %d DSOs\n", nr_dsos); +} + +static void release_dso(void) +{ + int i; + + for (i = 0; i < nr_dsos; i++) { + struct bench_dso *dso = &dsos[i]; + + free(dso->name); + } + free(dsos); +} + +/* Fake address used by mmap and sample events */ +static u64 dso_map_addr(struct bench_dso *dso) +{ + return 0x400000ULL + dso->ino * 8192ULL; +} + +static u32 synthesize_attr(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.attr) + sizeof(u64)); + + event.header.type = PERF_RECORD_HEADER_ATTR; + event.header.size = sizeof(event.attr) + sizeof(u64); + + event.attr.attr.type = PERF_TYPE_SOFTWARE; + event.attr.attr.config = PERF_COUNT_SW_TASK_CLOCK; + event.attr.attr.exclude_kernel = 1; + event.attr.attr.sample_id_all = 1; + event.attr.attr.sample_type = bench_sample_type; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static u32 synthesize_fork(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.fork) + bench_id_hdr_size); + + event.header.type = PERF_RECORD_FORK; + event.header.misc = PERF_RECORD_MISC_FORK_EXEC; + event.header.size = sizeof(event.fork) + bench_id_hdr_size; + + event.fork.ppid = 1; + event.fork.ptid = 1; + event.fork.pid = data->pid; + event.fork.tid = data->pid; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static u32 synthesize_mmap(struct bench_data *data, struct bench_dso *dso, + u64 timestamp) +{ + union perf_event event; + size_t len = offsetof(struct perf_record_mmap2, filename); + u64 *id_hdr_ptr = (void *)&event; + int ts_idx; + + len += roundup(strlen(dso->name) + 1, 8) + bench_id_hdr_size; + + memset(&event, 0, min(len, sizeof(event.mmap2))); + + event.header.type = PERF_RECORD_MMAP2; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = len; + + event.mmap2.pid = data->pid; + event.mmap2.tid = data->pid; + event.mmap2.maj = MMAP_DEV_MAJOR; + event.mmap2.ino = dso->ino; + + strcpy(event.mmap2.filename, dso->name); + + event.mmap2.start = dso_map_addr(dso); + event.mmap2.len = 4096; + event.mmap2.prot = PROT_EXEC; + + if (len > sizeof(event.mmap2)) { + /* write mmap2 event first */ + writen(data->input_pipe[1], &event, len - bench_id_hdr_size); + /* zero-fill sample id header */ + memset(id_hdr_ptr, 0, bench_id_hdr_size); + /* put timestamp in the right position */ + ts_idx = (bench_id_hdr_size / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + 
writen(data->input_pipe[1], id_hdr_ptr, bench_id_hdr_size); + } else { + ts_idx = (len / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + writen(data->input_pipe[1], &event, len); + } + return len; +} + +static u32 synthesize_sample(struct bench_data *data, struct bench_dso *dso, + u64 timestamp) +{ + union perf_event event; + struct perf_sample sample = { + .tid = data->pid, + .pid = data->pid, + .ip = dso_map_addr(dso), + .time = timestamp, + }; + + event.header.type = PERF_RECORD_SAMPLE; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = perf_event__sample_event_size(&sample, bench_sample_type, 0); + + perf_event__synthesize_sample(&event, bench_sample_type, 0, &sample); + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static u32 synthesize_flush(struct bench_data *data) +{ + struct perf_event_header header = { + .size = sizeof(header), + .type = PERF_RECORD_FINISHED_ROUND, + }; + + return writen(data->input_pipe[1], &header, header.size); +} + +static void *data_reader(void *arg) +{ + struct bench_data *data = arg; + char buf[8192]; + int flag; + int n; + + flag = fcntl(data->output_pipe[0], F_GETFL); + fcntl(data->output_pipe[0], F_SETFL, flag | O_NONBLOCK); + + /* read out data from child */ + while (true) { + n = read(data->output_pipe[0], buf, sizeof(buf)); + if (n > 0) + continue; + if (n == 0) + break; + + if (errno != EINTR && errno != EAGAIN) + break; + + usleep(100); + } + + close(data->output_pipe[0]); + return NULL; +} + +static int setup_injection(struct bench_data *data) +{ + int ready_pipe[2]; + int dev_null_fd; + char buf; + + if (pipe(ready_pipe) < 0) + return -1; + + if (pipe(data->input_pipe) < 0) + return -1; + + if (pipe(data->output_pipe) < 0) + return -1; + + data->pid = fork(); + if (data->pid < 0) + return -1; + + if (data->pid == 0) { + const char **inject_argv; + + close(data->input_pipe[1]); + close(data->output_pipe[0]); + close(ready_pipe[0]); + + dup2(data->input_pipe[0], STDIN_FILENO); + close(data->input_pipe[0]); + dup2(data->output_pipe[1], STDOUT_FILENO); + close(data->output_pipe[1]); + + dev_null_fd = open("/dev/null", O_WRONLY); + if (dev_null_fd < 0) + exit(1); + + dup2(dev_null_fd, STDERR_FILENO); + + inject_argv = calloc(3, sizeof(*inject_argv)); + if (inject_argv == NULL) + exit(1); + + inject_argv[0] = strdup("inject"); + inject_argv[1] = strdup("-b"); + + /* signal that we're ready to go */ + close(ready_pipe[1]); + + cmd_inject(2, inject_argv); + + exit(0); + } + + pthread_create(&data->th, NULL, data_reader, data); + + close(ready_pipe[1]); + close(data->input_pipe[0]); + close(data->output_pipe[1]); + + /* wait for child ready */ + if (read(ready_pipe[0], &buf, 1) < 0) + return -1; + close(ready_pipe[0]); + + return 0; +} + +static int inject_build_id(struct bench_data *data, u64 *max_rss) +{ + int status; + unsigned int i, k; + struct rusage rusage; + u64 len = 0; + + /* this makes the child run */ + if (perf_header__write_pipe(data->input_pipe[1]) < 0) + return -1; + + len += synthesize_attr(data); + len += synthesize_fork(data); + + for (i = 0; i < nr_mmaps; i++) { + int idx = rand() % (nr_dsos - 1); + struct bench_dso *dso = &dsos[idx]; + u64 timestamp = rand() % 1000000; + + pr_debug2(" [%d] injecting: %s\n", i+1, dso->name); + len += synthesize_mmap(data, dso, timestamp); + + for (k = 0; k < nr_samples; k++) + len += synthesize_sample(data, dso, timestamp + k * 1000); + + if ((i + 1) % 10 == 0) + len += synthesize_flush(data); + } + + /* this makes the child finish */ +
close(data->input_pipe[1]); + + wait4(data->pid, &status, 0, &rusage); + *max_rss = rusage.ru_maxrss; + + pr_debug(" Child %d exited with %d\n", data->pid, status); + + return 0; +} + +static int do_inject_loop(struct bench_data *data) +{ + unsigned int i; + struct stats time_stats, mem_stats; + double time_average, time_stddev; + double mem_average, mem_stddev; + + srand(time(NULL)); + init_stats(&time_stats); + init_stats(&mem_stats); + symbol__init(NULL); + + bench_sample_type = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP; + bench_sample_type |= PERF_SAMPLE_TID | PERF_SAMPLE_TIME; + bench_id_hdr_size = 32; + + collect_dso(); + if (nr_dsos == 0) { + printf(" Cannot collect DSOs for injection\n"); + return -1; + } + + for (i = 0; i < iterations; i++) { + struct timeval start, end, diff; + u64 runtime_us, max_rss; + + pr_debug(" Iteration #%d\n", i+1); + + if (setup_injection(data) < 0) { + printf(" Build-id injection setup failed\n"); + break; + } + + gettimeofday(&start, NULL); + if (inject_build_id(data, &max_rss) < 0) { + printf(" Build-id injection failed\n"); + break; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&mem_stats, max_rss); + + pthread_join(data->th, NULL); + } + + time_average = avg_stats(&time_stats) / USEC_PER_MSEC; + time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC; + printf(" Average build-id injection took: %.3f msec (+- %.3f msec)\n", + time_average, time_stddev); + + /* each iteration, it processes MMAP2 + BUILD_ID + nr_samples * SAMPLE */ + time_average = avg_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + time_stddev = stddev_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + printf(" Average time per event: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + mem_average = avg_stats(&mem_stats); + mem_stddev = stddev_stats(&mem_stats); + printf(" Average memory usage: %.0f KB (+- %.0f KB)\n", + mem_average, mem_stddev); + + release_dso(); + return 0; +} + +int bench_inject_build_id(int argc, const char **argv) +{ + struct bench_data data; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + return do_inject_loop(&data); +} + diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 11c79a8d85d6..e78e8196ccac 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -78,6 +78,7 @@ static struct bench epoll_benchmarks[] = { static struct bench internals_benchmarks[] = { { "synthesize", "Benchmark perf event synthesis", bench_synthesize }, + { "inject-build-id", "Benchmark build-id injection", bench_inject_build_id }, { NULL, NULL, NULL } }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 00311f8c79d8..5045daafda56 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -425,11 +425,10 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, return 0; } -static int perf_event__inject_buildid(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample, - struct evsel *evsel __maybe_unused, - struct machine *machine) +int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, + struct evsel *evsel __maybe_unused, + struct machine *machine) { struct addr_location al; struct thread *thread; diff --git a/tools/perf/util/build-id.h 
b/tools/perf/util/build-id.h index aad419bb165c..949f7e54c9cb 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -29,6 +29,10 @@ int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, int dsos__hit_all(struct perf_session *session); +int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct evsel *evsel, + struct machine *machine); + bool perf_session__read_build_ids(struct perf_session *session, bool with_hits); int perf_session__write_buildid_table(struct perf_session *session, struct feat_fd *fd); -- Gitee From ec8e8a0b30ab391808400c417bb1b1e38986ba30 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 30 Oct 2020 14:47:42 +0900 Subject: [PATCH 224/257] perf data: Allow to use stdio functions for pipe mode commit 601366678c93618f37a685332c0ba07e5556798c upstream When perf data is in a pipe, it reads each event separately using the read(2) syscall. This is a huge performance bottleneck when processing large data, as in perf inject. Also perf inject needs to use the write(2) syscall for the output. So convert it to use buffered I/O functions from the stdio library for pipe data. This makes the inject-build-id bench time drop from 20ms to 8ms. $ perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.074 msec (+- 0.013 msec) Average time per event: 0.792 usec (+- 0.001 usec) Average memory usage: 8328 KB (+- 0 KB) Average build-id-all injection took: 5.490 msec (+- 0.008 msec) Average time per event: 0.538 usec (+- 0.001 usec) Average memory usage: 7563 KB (+- 0 KB) This patch enables it just for perf inject when used with a pipe (the default behavior). Maybe we could do it for perf record and/or report later.
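Stripped of the perf plumbing, the trick is just wrapping the pipe file descriptor with fdopen() so that stdio batches many small event reads into one read(2) per buffer refill. A reduced sketch of the two pieces (mirroring, not reproducing, the patch's check_pipe() and perf_data__read()):

#include <stdio.h>

/* Wrap a raw pipe fd in a buffered stdio stream; if fdopen() fails
 * the caller keeps using the plain fd, as the patch does. The mode
 * must match the direction the fd was opened with. */
static FILE *wrap_pipe(int fd, int reading)
{
	return fdopen(fd, reading ? "r" : "w");
}

/* Buffered read with the same contract as the patch's
 * perf_data__read(): size on success, 0 on EOF, -1 on error. */
static long buffered_read(FILE *fp, void *buf, size_t size)
{
	if (fread(buf, size, 1, fp) == 1)
		return (long)size;
	return feof(fp) ? 0 : -1;
}

With stdio's default buffer of a few kilobytes, a stream of small PERF_RECORD_* events costs one syscall per buffer instead of one per record, which is where the improvement from 20ms to 8ms comes from.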
Committer testing: Before: $ perf stat -r 5 perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 13.605 msec (+- 0.064 msec) Average time per event: 1.334 usec (+- 0.006 usec) Average memory usage: 12220 KB (+- 7 KB) Average build-id-all injection took: 11.458 msec (+- 0.058 msec) Average time per event: 1.123 usec (+- 0.006 usec) Average memory usage: 11546 KB (+- 8 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 13.673 msec (+- 0.057 msec) Average time per event: 1.341 usec (+- 0.006 usec) Average memory usage: 12508 KB (+- 8 KB) Average build-id-all injection took: 11.437 msec (+- 0.046 msec) Average time per event: 1.121 usec (+- 0.004 usec) Average memory usage: 11812 KB (+- 7 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 13.641 msec (+- 0.069 msec) Average time per event: 1.337 usec (+- 0.007 usec) Average memory usage: 12302 KB (+- 8 KB) Average build-id-all injection took: 10.820 msec (+- 0.106 msec) Average time per event: 1.061 usec (+- 0.010 usec) Average memory usage: 11616 KB (+- 7 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 13.379 msec (+- 0.074 msec) Average time per event: 1.312 usec (+- 0.007 usec) Average memory usage: 12334 KB (+- 8 KB) Average build-id-all injection took: 11.288 msec (+- 0.071 msec) Average time per event: 1.107 usec (+- 0.007 usec) Average memory usage: 11657 KB (+- 8 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 13.534 msec (+- 0.058 msec) Average time per event: 1.327 usec (+- 0.006 usec) Average memory usage: 12264 KB (+- 8 KB) Average build-id-all injection took: 11.557 msec (+- 0.076 msec) Average time per event: 1.133 usec (+- 0.007 usec) Average memory usage: 11593 KB (+- 8 KB) Performance counter stats for 'perf bench internals inject-build-id' (5 runs): 4,060.05 msec task-clock:u # 1.566 CPUs utilized ( +- 0.65% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 101,888 page-faults:u # 0.025 M/sec ( +- 0.12% ) 3,745,833,163 cycles:u # 0.923 GHz ( +- 0.10% ) (83.22%) 194,346,613 stalled-cycles-frontend:u # 5.19% frontend cycles idle ( +- 0.57% ) (83.30%) 708,495,034 stalled-cycles-backend:u # 18.91% backend cycles idle ( +- 0.48% ) (83.48%) 5,629,328,628 instructions:u # 1.50 insn per cycle # 0.13 stalled cycles per insn ( +- 0.21% ) (83.57%) 1,236,697,927 branches:u # 304.602 M/sec ( +- 0.16% ) (83.44%) 17,564,877 branch-misses:u # 1.42% of all branches ( +- 0.23% ) (82.99%) 2.5934 +- 0.0128 seconds time elapsed ( +- 0.49% ) $ After: $ perf stat -r 5 perf bench internals inject-build-id # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.560 msec (+- 0.125 msec) Average time per event: 0.839 usec (+- 0.012 usec) Average memory usage: 12520 KB (+- 8 KB) Average build-id-all injection took: 5.789 msec (+- 0.054 msec) Average time per event: 0.568 usec (+- 0.005 usec) Average memory usage: 11919 KB (+- 9 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.639 msec (+- 0.111 msec) Average time per event: 0.847 usec (+- 0.011 usec) Average memory usage: 12732 KB (+- 8 KB) Average build-id-all injection took: 5.647 msec (+- 0.069 msec) Average time per event: 0.554 usec (+- 0.007 usec) Average memory usage: 12093 KB (+- 7 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.551 msec (+- 0.096 msec) Average 
time per event: 0.838 usec (+- 0.009 usec) Average memory usage: 12739 KB (+- 8 KB) Average build-id-all injection took: 5.617 msec (+- 0.061 msec) Average time per event: 0.551 usec (+- 0.006 usec) Average memory usage: 12105 KB (+- 7 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.403 msec (+- 0.097 msec) Average time per event: 0.824 usec (+- 0.010 usec) Average memory usage: 12770 KB (+- 8 KB) Average build-id-all injection took: 5.611 msec (+- 0.085 msec) Average time per event: 0.550 usec (+- 0.008 usec) Average memory usage: 12134 KB (+- 8 KB) # Running 'internals/inject-build-id' benchmark: Average build-id injection took: 8.518 msec (+- 0.102 msec) Average time per event: 0.835 usec (+- 0.010 usec) Average memory usage: 12518 KB (+- 10 KB) Average build-id-all injection took: 5.503 msec (+- 0.073 msec) Average time per event: 0.540 usec (+- 0.007 usec) Average memory usage: 11882 KB (+- 8 KB) Performance counter stats for 'perf bench internals inject-build-id' (5 runs): 2,394.88 msec task-clock:u # 1.577 CPUs utilized ( +- 0.83% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 103,181 page-faults:u # 0.043 M/sec ( +- 0.11% ) 3,548,172,030 cycles:u # 1.482 GHz ( +- 0.30% ) (83.26%) 81,537,700 stalled-cycles-frontend:u # 2.30% frontend cycles idle ( +- 1.54% ) (83.24%) 876,631,544 stalled-cycles-backend:u # 24.71% backend cycles idle ( +- 1.14% ) (83.45%) 5,960,361,707 instructions:u # 1.68 insn per cycle # 0.15 stalled cycles per insn ( +- 0.27% ) (83.26%) 1,269,413,491 branches:u # 530.054 M/sec ( +- 0.10% ) (83.48%) 11,372,453 branch-misses:u # 0.90% of all branches ( +- 0.52% ) (83.31%) 1.51874 +- 0.00642 seconds time elapsed ( +- 0.42% ) $ Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20201030054742.87740-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 2 ++ tools/perf/util/data.c | 41 ++++++++++++++++++++++++++++++++++--- tools/perf/util/data.h | 11 +++++++++- tools/perf/util/header.c | 8 ++++---- tools/perf/util/session.c | 7 ++++--- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 5045daafda56..f7ea774b116f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -775,10 +775,12 @@ int cmd_inject(int argc, const char **argv) .output = { .path = "-", .mode = PERF_DATA_MODE_WRITE, + .use_stdio = true, }, }; struct perf_data data = { .mode = PERF_DATA_MODE_READ, + .use_stdio = true, }; int ret; diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index af061352430a..4f37475ad515 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -169,8 +169,21 @@ static bool check_pipe(struct perf_data *data) is_pipe = true; } - if (is_pipe) - data->file.fd = fd; + if (is_pipe) { + if (data->use_stdio) { + const char *mode; + + mode = perf_data__is_read(data) ? 
"r" : "w"; + data->file.fptr = fdopen(fd, mode); + + if (data->file.fptr == NULL) { + data->file.fd = fd; + data->use_stdio = false; + } + } else { + data->file.fd = fd; + } + } return data->is_pipe = is_pipe; } @@ -330,6 +343,9 @@ int perf_data__open(struct perf_data *data) if (check_pipe(data)) return 0; + /* currently it allows stdio for pipe only */ + data->use_stdio = false; + if (!data->path) data->path = "perf.data"; @@ -349,7 +365,21 @@ void perf_data__close(struct perf_data *data) perf_data__close_dir(data); zfree(&data->file.path); - close(data->file.fd); + + if (data->use_stdio) + fclose(data->file.fptr); + else + close(data->file.fd); +} + +ssize_t perf_data__read(struct perf_data *data, void *buf, size_t size) +{ + if (data->use_stdio) { + if (fread(buf, size, 1, data->file.fptr) == 1) + return size; + return feof(data->file.fptr) ? 0 : -1; + } + return readn(data->file.fd, buf, size); } ssize_t perf_data_file__write(struct perf_data_file *file, @@ -361,6 +391,11 @@ ssize_t perf_data_file__write(struct perf_data_file *file, ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size) { + if (data->use_stdio) { + if (fwrite(buf, size, 1, data->file.fptr) == 1) + return size; + return -1; + } return perf_data_file__write(&data->file, buf, size); } diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 06d6555c66db..10fbadebae81 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -2,6 +2,7 @@ #ifndef __PERF_DATA_H #define __PERF_DATA_H +#include #include #include @@ -12,7 +13,10 @@ enum perf_data_mode { struct perf_data_file { char *path; - int fd; + union { + int fd; + FILE *fptr; + }; unsigned long size; }; @@ -23,6 +27,7 @@ struct perf_data { bool is_dir; bool force; bool in_place_update; + bool use_stdio; enum perf_data_mode mode; struct { @@ -54,11 +59,15 @@ static inline bool perf_data__is_dir(struct perf_data *data) static inline int perf_data__fd(struct perf_data *data) { + if (data->use_stdio) + return fileno(data->file.fptr); + return data->file.fd; } int perf_data__open(struct perf_data *data); void perf_data__close(struct perf_data *data); +ssize_t perf_data__read(struct perf_data *data, void *buf, size_t size); ssize_t perf_data__write(struct perf_data *data, void *buf, size_t size); ssize_t perf_data_file__write(struct perf_data_file *file, diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4478dbc31576..930ad7b8addb 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3824,7 +3824,8 @@ static int perf_file_section__process(struct perf_file_section *section, } static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, - struct perf_header *ph, int fd, + struct perf_header *ph, + struct perf_data* data, bool repipe) { struct feat_fd ff = { @@ -3833,7 +3834,7 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, }; ssize_t ret; - ret = readn(fd, header, sizeof(*header)); + ret = perf_data__read(data, header, sizeof(*header)); if (ret <= 0) return -1; @@ -3856,8 +3857,7 @@ static int perf_header__read_pipe(struct perf_session *session) struct perf_header *header = &session->header; struct perf_pipe_file_header f_header; - if (perf_file_header__read_pipe(&f_header, header, - perf_data__fd(session->data), + if (perf_file_header__read_pipe(&f_header, header, session->data, session->repipe) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 6da10007c49e..5070b1954746 
100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1863,7 +1863,6 @@ static int __perf_session__process_pipe_events(struct perf_session *session) { struct ordered_events *oe = &session->ordered_events; struct perf_tool *tool = session->tool; - int fd = perf_data__fd(session->data); union perf_event *event; uint32_t size, cur_size = 0; void *buf = NULL; @@ -1883,7 +1882,8 @@ static int __perf_session__process_pipe_events(struct perf_session *session) ordered_events__set_copy_on_queue(oe, true); more: event = buf; - err = readn(fd, event, sizeof(struct perf_event_header)); + err = perf_data__read(session->data, event, + sizeof(struct perf_event_header)); if (err <= 0) { if (err == 0) goto done; @@ -1915,7 +1915,8 @@ static int __perf_session__process_pipe_events(struct perf_session *session) p += sizeof(struct perf_event_header); if (size - sizeof(struct perf_event_header)) { - err = readn(fd, p, size - sizeof(struct perf_event_header)); + err = perf_data__read(session->data, p, + size - sizeof(struct perf_event_header)); if (err <= 0) { if (err == 0) { pr_err("unexpected end of event stream\n"); -- Gitee From e50b394041a48f6213fcc38f5cc4d4d563986469 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Jul 2021 15:31:50 -0700 Subject: [PATCH 225/257] perf tools: Pass a fd to perf_file_header__read_pipe() commit 0ae03893623dd1ddf17c1210265e5d7f9e2f3ed6 upstream Currently it unconditionally writes to stdout for repipe. But perf inject can direct its output to a regular file. Then it needs to write the header to the file as well. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210719223153.1618812-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 3 ++- tools/perf/util/header.c | 12 ++++++------ tools/perf/util/header.h | 2 +- tools/perf/util/session.c | 8 ++++---- tools/perf/util/session.h | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index f7ea774b116f..ef840bb53a55 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -854,7 +854,8 @@ int cmd_inject(int argc, const char **argv) inject.tool.ordered_events = inject.sched_stat; data.path = inject.input_name; - inject.session = __perf_session__new(&data, inject.output.is_pipe, &inject.tool); + inject.session = __perf_session__new(&data, inject.output.is_pipe, + perf_data__fd(&inject.output), &inject.tool); if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); goto out_close_output; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 930ad7b8addb..744c4eea5f9d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3826,10 +3826,10 @@ static int perf_file_section__process(struct perf_file_section *section, static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, struct perf_header *ph, struct perf_data* data, - bool repipe) + bool repipe, int repipe_fd) { struct feat_fd ff = { - .fd = STDOUT_FILENO, + .fd = repipe_fd, .ph = ph, }; ssize_t ret; @@ -3852,13 +3852,13 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header, return 0; } -static int perf_header__read_pipe(struct perf_session *session) +static int perf_header__read_pipe(struct perf_session *session, int repipe_fd) { struct perf_header *header = &session->header; struct 
perf_pipe_file_header f_header; if (perf_file_header__read_pipe(&f_header, header, session->data, - session->repipe) < 0) { + session->repipe, repipe_fd) < 0) { pr_debug("incompatible file format\n"); return -EINVAL; } @@ -3958,7 +3958,7 @@ static int perf_evlist__prepare_tracepoint_events(struct evlist *evlist, return 0; } -int perf_session__read_header(struct perf_session *session) +int perf_session__read_header(struct perf_session *session, int repipe_fd) { struct perf_data *data = session->data; struct perf_header *header = &session->header; @@ -3979,7 +3979,7 @@ int perf_session__read_header(struct perf_session *session) * We can read 'pipe' data event from regular file, * check for the pipe header regardless of source. */ - err = perf_header__read_pipe(session); + err = perf_header__read_pipe(session, repipe_fd); if (!err || perf_data__is_pipe(data)) { data->is_pipe = true; return err; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 19b2acfd8422..b53bdaa10adc 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -119,7 +119,7 @@ struct perf_session; struct perf_tool; union perf_event; -int perf_session__read_header(struct perf_session *session); +int perf_session__read_header(struct perf_session *session, int repipe_fd); int perf_session__write_header(struct perf_session *session, struct evlist *evlist, int fd, bool at_exit); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5070b1954746..782896153efe 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -101,11 +101,11 @@ static int perf_session__deliver_event(struct perf_session *session, struct perf_tool *tool, u64 file_offset); -static int perf_session__open(struct perf_session *session) +static int perf_session__open(struct perf_session *session, int repipe_fd) { struct perf_data *data = session->data; - if (perf_session__read_header(session) < 0) { + if (perf_session__read_header(session, repipe_fd) < 0) { pr_err("incompatible file format (rerun with -v to learn more)\n"); return -1; } @@ -185,7 +185,7 @@ static int ordered_events__deliver_event(struct ordered_events *oe, } struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, + bool repipe, int repipe_fd, struct perf_tool *tool) { int ret = -ENOMEM; @@ -210,7 +210,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, session->data = data; if (perf_data__is_read(data)) { - ret = perf_session__open(session); + ret = perf_session__open(session, repipe_fd); if (ret < 0) goto out_delete; diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index a0ff19189fdc..f68c33562e73 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -55,13 +55,13 @@ struct decomp { struct perf_tool; struct perf_session *__perf_session__new(struct perf_data *data, - bool repipe, + bool repipe, int repipe_fd, struct perf_tool *tool); static inline struct perf_session *perf_session__new(struct perf_data *data, struct perf_tool *tool) { - return __perf_session__new(data, false, tool); + return __perf_session__new(data, false, -1, tool); } void perf_session__delete(struct perf_session *session); -- Gitee From 8e1d23ce894bd93b04aadbb24f0348cbe5533ed3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Jul 2021 15:31:51 -0700 Subject: [PATCH 226/257] perf inject: Fix output from a pipe to a file commit fea20d66f90cdbdc7dccf1fa001d40e084984e55 upstream Sometimes it needs to save the perf inject data to a file for debugging. 
But normally it assumes the same format for input and output, so the end result cannot be used due to a broken format. # perf record -a -o - sleep 1 | perf inject -b -o my.data # perf report -i my.data --stdio 0x208 [0]: failed to process type: 0 [Invalid argument] Error: failed to process sample # To display the perf.data header info, please use --header/--header-only options. # In this case, it thought the data had a regular file header since the output is not a pipe. But actually it doesn't have one and has a pipe file header. At the end of the session, it tries to rewrite the regular file header with updated features and it overwrites the data that immediately follows the pipe header. Fix it by checking whether either the input or the output is a pipe. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210719223153.1618812-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: malathi Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index ef840bb53a55..44d659c42d49 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -41,6 +41,7 @@ struct perf_inject { bool jit_mode; bool in_place_update; bool in_place_update_dry_run; + bool is_pipe; const char *input_name; struct perf_data output; u64 bytes_written; @@ -110,7 +111,7 @@ static int perf_event__repipe_attr(struct perf_tool *tool, if (ret) return ret; - if (!inject->output.is_pipe) + if (!inject->is_pipe) return 0; return perf_event__repipe_synth(tool, event); @@ -691,14 +692,14 @@ static int __cmd_inject(struct perf_inject *inject) if (!inject->itrace_synth_opts.set) auxtrace_index__free(&session->auxtrace_index); - if (!data_out->is_pipe && !inject->in_place_update) + if (!inject->is_pipe && !inject->in_place_update) lseek(fd, output_data_offset, SEEK_SET); ret = perf_session__process_events(session); if (ret) return ret; - if (!data_out->is_pipe && !inject->in_place_update) { + if (!inject->is_pipe && !inject->in_place_update) { if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); @@ -854,8 +855,12 @@ int cmd_inject(int argc, const char **argv) inject.tool.ordered_events = inject.sched_stat; data.path = inject.input_name; - inject.session = __perf_session__new(&data, inject.output.is_pipe, - perf_data__fd(&inject.output), &inject.tool); + if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) + inject.is_pipe = true; + + inject.session = __perf_session__new(&data, inject.is_pipe, + perf_data__fd(&inject.output), + &inject.tool); if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); goto out_close_output; -- Gitee From 15d4c0fe75bba3b70dff17a1b7d3f35594e0026a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 20 May 2022 16:24:00 +0300 Subject: [PATCH 227/257] perf header: Add ability to keep feature sections commit 237c96b8c1584d4c28d3593e5f52508fbec9ff06 upstream Many feature sections should not be re-written during perf inject. In preparation to support that, add callbacks that a tool can use to copy a feature section from elsewhere. perf inject will use this facility to copy feature sections from the input file. 
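[Editor's note] The hunks below only add the copy hooks on the header-writing side; the tool-side implementation arrives later in the series. As a rough illustration of how a tool can plug into this interface, here is a minimal sketch, not the series' actual code: it assumes the tool saved the input file's fd and a per-feature section table while reading the input header, and it reuses the preadn() helper added in patch 228.

struct inject_fc {
	struct feat_copier fc;
	int input_fd;					 /* assumed: fd of the input perf.data */
	struct perf_file_section secs[HEADER_FEAT_BITS]; /* assumed: offsets/sizes from the input header */
};

static int inject_fc__copy(struct feat_copier *fc, int feat,
			   struct feat_writer *fw)
{
	struct inject_fc *ifc = container_of(fc, struct inject_fc, fc);
	off_t offs = ifc->secs[feat].offset;
	size_t left = ifc->secs[feat].size;
	char buf[4096];

	if (!left)
		return 0;	/* 0: not copied, do_write_feat() falls back to ->write() */

	while (left) {
		size_t n = left < sizeof(buf) ? left : sizeof(buf);

		if (preadn(ifc->input_fd, buf, n, offs) != (ssize_t)n)
			return -1;	/* negative: treated as a write failure */
		if (fw->write(fw, buf, n) < 0)
			return -1;
		offs += n;
		left -= n;
	}
	return 1;	/* non-zero: copied, ->write() is skipped */
}
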
Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220520132404.25853-2-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/header.c | 54 ++++++++++++++++++++++++++++++++++------ tools/perf/util/header.h | 10 ++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 744c4eea5f9d..e8c6101ea29c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3377,9 +3377,22 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full) return 0; } +struct header_fw { + struct feat_writer fw; + struct feat_fd *ff; +}; + +static int feat_writer_cb(struct feat_writer *fw, void *buf, size_t sz) +{ + struct header_fw *h = container_of(fw, struct header_fw, fw); + + return do_write(h->ff, buf, sz); +} + static int do_write_feat(struct feat_fd *ff, int type, struct perf_file_section **p, - struct evlist *evlist) + struct evlist *evlist, + struct feat_copier *fc) { int err; int ret = 0; @@ -3393,7 +3406,23 @@ static int do_write_feat(struct feat_fd *ff, int type, (*p)->offset = lseek(ff->fd, 0, SEEK_CUR); - err = feat_ops[type].write(ff, evlist); + /* + * Hook to let perf inject copy features sections from the input + * file. + */ + if (fc && fc->copy) { + struct header_fw h = { + .fw.write = feat_writer_cb, + .ff = ff, + }; + + /* ->copy() returns 0 if the feature was not copied */ + err = fc->copy(fc, type, &h.fw); + } else { + err = 0; + } + if (!err) + err = feat_ops[type].write(ff, evlist); if (err < 0) { pr_debug("failed to write feature %s\n", feat_ops[type].name); @@ -3409,7 +3438,8 @@ static int do_write_feat(struct feat_fd *ff, int type, } static int perf_header__adds_write(struct perf_header *header, - struct evlist *evlist, int fd) + struct evlist *evlist, int fd, + struct feat_copier *fc) { int nr_sections; struct feat_fd ff; @@ -3438,7 +3468,7 @@ static int perf_header__adds_write(struct perf_header *header, lseek(fd, sec_start + sec_size, SEEK_SET); for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) { - if (do_write_feat(&ff, feat, &p, evlist)) + if (do_write_feat(&ff, feat, &p, evlist, fc)) perf_header__clear_feat(header, feat); } @@ -3476,9 +3506,10 @@ int perf_header__write_pipe(int fd) return 0; } -int perf_session__write_header(struct perf_session *session, - struct evlist *evlist, - int fd, bool at_exit) +static int perf_session__do_write_header(struct perf_session *session, + struct evlist *evlist, + int fd, bool at_exit, + struct feat_copier *fc) { struct perf_file_header f_header; struct perf_file_attr f_attr; @@ -3522,7 +3553,7 @@ int perf_session__write_header(struct perf_session *session, header->feat_offset = header->data_offset + header->data_size; if (at_exit) { - err = perf_header__adds_write(header, evlist, fd); + err = perf_header__adds_write(header, evlist, fd, fc); if (err < 0) return err; } @@ -3555,6 +3586,13 @@ int perf_session__write_header(struct perf_session *session, return 0; } +int perf_session__write_header(struct perf_session *session, + struct evlist *evlist, + int fd, bool at_exit) +{ + return perf_session__do_write_header(session, evlist, fd, at_exit, NULL); +} + static int perf_header__getbuffer64(struct perf_header *header, int fd, void *buf, size_t size) { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index b53bdaa10adc..1f49d5170cff 100644 --- a/tools/perf/util/header.h +++ 
b/tools/perf/util/header.h @@ -125,6 +125,16 @@ int perf_session__write_header(struct perf_session *session, int fd, bool at_exit); int perf_header__write_pipe(int fd); +/* feat_writer writes a feature section to output */ +struct feat_writer { + int (*write)(struct feat_writer *fw, void *buf, size_t sz); +}; + +/* feat_copier copies a feature section using feat_writer to output */ +struct feat_copier { + int (*copy)(struct feat_copier *fc, int feat, struct feat_writer *fw); +}; + void perf_header__set_feat(struct perf_header *header, int feat); void perf_header__clear_feat(struct perf_header *header, int feat); bool perf_header__has_feat(const struct perf_header *header, int feat); -- Gitee From 96f77f996f6bc577eec274242ca4685279c371d3 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 20 May 2022 18:56:04 +0300 Subject: [PATCH 228/257] libperf: Add preadn() commit 618ee7838e409513635320ca9c4c8d52c44f2dd0 upstream Add preadn() to provide pread() and readn() semantics. [Backport Changes] 1. In the current kernel, lib.h and lib.c are located under tools/perf/lib/*, whereas in the upstream kernel, they have been moved to tools/lib/perf/*. The patch responsible for relocating these files was skipped, and the relevant changes from the current patch were backported directly to the files under tools/perf/lib/*. Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/ab8918a4-7ac8-a37e-2e2c-28438c422d87@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/lib/include/internal/lib.h | 2 ++ tools/perf/lib/lib.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/tools/perf/lib/include/internal/lib.h b/tools/perf/lib/include/internal/lib.h index 5175d491b2d4..85471a4b900f 100644 --- a/tools/perf/lib/include/internal/lib.h +++ b/tools/perf/lib/include/internal/lib.h @@ -9,4 +9,6 @@ extern unsigned int page_size; ssize_t readn(int fd, void *buf, size_t n); ssize_t writen(int fd, const void *buf, size_t n); +ssize_t preadn(int fd, void *buf, size_t n, off_t offs); + #endif /* __LIBPERF_INTERNAL_CPUMAP_H */ diff --git a/tools/perf/lib/lib.c b/tools/perf/lib/lib.c index 18658931fc71..696fb0ea67c6 100644 --- a/tools/perf/lib/lib.c +++ b/tools/perf/lib/lib.c @@ -38,6 +38,26 @@ ssize_t readn(int fd, void *buf, size_t n) return ion(true, fd, buf, n); } +ssize_t preadn(int fd, void *buf, size_t n, off_t offs) +{ + size_t left = n; + + while (left) { + ssize_t ret = pread(fd, buf, left, offs); + + if (ret < 0 && errno == EINTR) + continue; + if (ret <= 0) + return ret; + + left -= ret; + buf += ret; + offs += ret; + } + + return n; +} + /* * Write exactly 'n' bytes or return an error. */ -- Gitee From 77475a316cea76b2706802dbff4d88f51f319cf0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Nov 2019 12:26:19 -0300 Subject: [PATCH 229/257] perf map: Move maj/min/ino/ino_generation to separate struct commit 99459a84d5870a88274b4f10bc85c3e39e1d642c upstream And this patch highlights where these fields are being used: in the sort order where it uses it to compare maps and classify samples taking into account not just the DSO, but those DSO id fields. I think these should be used to differentiate DSOs with the same name but different 'struct dso_id' fields, i.e. these fields should move to 'struct dso' and then be used as part of the key when doing lookups for DSOs, in addition to the DSO name. 
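[Editor's note] To make the scenario above concrete: one pathname can be backed by different files, for example a library upgraded in place between two mmaps, or the "same" path seen from two containers. Below is a standalone sketch of the (name, dso_id) lookup key this series moves towards; the struct, helper and values are illustrative stand-ins, not perf's own code, and the field-by-field comparison mirrors the dso_id__cmp() added in the next patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_dso_id {
	uint32_t maj, min;		/* device of the backing file */
	uint64_t ino, ino_generation;	/* inode of the backing file */
};

static int demo_key_cmp(const char *na, const struct demo_dso_id *a,
			const char *nb, const struct demo_dso_id *b)
{
	int rc = strcmp(na, nb);

	if (rc)
		return rc;
	if (a->maj != b->maj)
		return a->maj < b->maj ? -1 : 1;
	if (a->min != b->min)
		return a->min < b->min ? -1 : 1;
	if (a->ino != b->ino)
		return a->ino < b->ino ? -1 : 1;
	if (a->ino_generation != b->ino_generation)
		return a->ino_generation < b->ino_generation ? -1 : 1;
	return 0;	/* same name and same backing file: same DSO */
}

int main(void)
{
	/* same path, different inode: the library was replaced on disk */
	struct demo_dso_id before = { 8, 1, 1234, 1 };
	struct demo_dso_id after  = { 8, 1, 5678, 1 };

	printf("same DSO? %s\n",
	       demo_key_cmp("/usr/lib/libfoo.so", &before,
			    "/usr/lib/libfoo.so", &after) == 0 ? "yes" : "no");
	return 0;
}
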
Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-8v5isitqy0dup47nnwkpc80f@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-report.c | 2 +- tools/perf/util/map.c | 8 ++++---- tools/perf/util/map.h | 14 +++++++++++--- tools/perf/util/sort.c | 24 ++++++++++++------------ 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 3a1953c39a26..7839a64d08ba 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -740,7 +740,7 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp) map->prot & PROT_EXEC ? 'x' : '-', map->flags & MAP_SHARED ? 's' : 'p', map->pgoff, - map->ino, map->dso->name); + map->dso_id.ino, map->dso->name); } return printed; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 571e99c908a0..f5294a58e70c 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -163,10 +163,10 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); - map->maj = d_maj; - map->min = d_min; - map->ino = ino; - map->ino_generation = ino_gen; + map->dso_id.maj = d_maj; + map->dso_id.min = d_min; + map->dso_id.ino = ino; + map->dso_id.ino_generation = ino_gen; map->prot = prot; map->flags = flags; nsi = nsinfo__get(thread->nsinfo); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index c3614195ddc7..f10ddf7a6b8b 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -18,6 +18,16 @@ struct map_groups; struct machine; struct evsel; +/* + * Data about backing storage DSO, comes from PERF_RECORD_MMAP2 meta events + */ +struct dso_id { + u32 maj; + u32 min; + u64 ino; + u64 ino_generation; +}; + struct map { union { struct rb_node rb_node; @@ -32,9 +42,6 @@ struct map { u32 flags; u64 pgoff; u64 reloc; - u32 maj, min; /* only valid for MMAP2 record */ - u64 ino; /* only valid for MMAP2 record */ - u64 ino_generation;/* only valid for MMAP2 record */ /* ip -> dso rip */ u64 (*map_ip)(struct map *, u64); @@ -43,6 +50,7 @@ struct map { struct dso *dso; struct map_groups *groups; + struct dso_id dso_id; refcount_t refcnt; }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index b7cba38b0fa2..05669063b9e3 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1218,17 +1218,17 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if (!l_map) return -1; if (!r_map) return 1; - if (l_map->maj > r_map->maj) return -1; - if (l_map->maj < r_map->maj) return 1; + if (l_map->dso_id.maj > r_map->dso_id.maj) return -1; + if (l_map->dso_id.maj < r_map->dso_id.maj) return 1; - if (l_map->min > r_map->min) return -1; - if (l_map->min < r_map->min) return 1; + if (l_map->dso_id.min > r_map->dso_id.min) return -1; + if (l_map->dso_id.min < r_map->dso_id.min) return 1; - if (l_map->ino > r_map->ino) return -1; - if (l_map->ino < r_map->ino) return 1; + if (l_map->dso_id.ino > r_map->dso_id.ino) return -1; + if (l_map->dso_id.ino < r_map->dso_id.ino) return 1; - if (l_map->ino_generation > r_map->ino_generation) return -1; - if (l_map->ino_generation < r_map->ino_generation) return 1; + if (l_map->dso_id.ino_generation > r_map->dso_id.ino_generation) return -1; + if (l_map->dso_id.ino_generation < r_map->dso_id.ino_generation) return 1; /* * Addresses with no major/minor numbers are assumed to be @@ -1240,8 +1240,8 @@ 
sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if ((left->cpumode != PERF_RECORD_MISC_KERNEL) && (!(l_map->flags & MAP_SHARED)) && - !l_map->maj && !l_map->min && !l_map->ino && - !l_map->ino_generation) { + !l_map->dso_id.maj && !l_map->dso_id.min && + !l_map->dso_id.ino && !l_map->dso_id.ino_generation) { /* userspace anonymous */ if (left->thread->pid_ > right->thread->pid_) return -1; @@ -1277,8 +1277,8 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, if ((he->cpumode != PERF_RECORD_MISC_KERNEL) && map && !(map->prot & PROT_EXEC) && (map->flags & MAP_SHARED) && - (map->maj || map->min || map->ino || - map->ino_generation)) + (map->dso_id.maj || map->dso_id.min || + map->dso_id.ino || map->dso_id.ino_generation)) level = 's'; else if (!map) level = 'X'; -- Gitee From 80b6b262ab9ae8df9b36a282b8ec678ac15e0067 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Nov 2019 16:30:56 -0300 Subject: [PATCH 230/257] perf map: Move comparision of map's dso_id to a separate function commit 7b59a82493b49b715224bfe3b35fae52e48e5fa1 upstream We'll use it when doing DSO lookups using dso_ids. Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-u2nr1oq03o0i29w2ay9jx03s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/dsos.c | 25 +++++++++++++++++++++++++ tools/perf/util/map.h | 2 ++ tools/perf/util/sort.c | 16 ++++------------ 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index 3ea80d203587..ecf8d7346685 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -2,6 +2,7 @@ #include "debug.h" #include "dsos.h" #include "dso.h" +#include "map.h" #include "vdso.h" #include "namespaces.h" #include @@ -9,6 +10,30 @@ #include #include // filename__read_build_id +int dso_id__cmp(struct dso_id *a, struct dso_id *b) +{ + /* + * The second is always dso->id, so zeroes if not set, assume passing + * NULL for a means a zeroed id + */ + if (a == NULL) + return 0; + + if (a->maj > b->maj) return -1; + if (a->maj < b->maj) return 1; + + if (a->min > b->min) return -1; + if (a->min < b->min) return 1; + + if (a->ino > b->ino) return -1; + if (a->ino < b->ino) return 1; + + if (a->ino_generation > b->ino_generation) return -1; + if (a->ino_generation < b->ino_generation) return 1; + + return 0; +} + bool __dsos__read_build_ids(struct list_head *head, bool with_hits) { bool have_build_id = false; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index f10ddf7a6b8b..70a1b4de44c8 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -28,6 +28,8 @@ struct dso_id { u64 ino_generation; }; +int dso_id__cmp(struct dso_id *a, struct dso_id *b); + struct map { union { struct rb_node rb_node; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 05669063b9e3..23a2561bdb2d 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1200,6 +1200,7 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) { u64 l, r; struct map *l_map, *r_map; + int rc; if (!left->mem_info) return -1; if (!right->mem_info) return 1; @@ -1218,18 +1219,9 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if (!l_map) return -1; if (!r_map) return 1; - if (l_map->dso_id.maj > r_map->dso_id.maj) return -1; - if (l_map->dso_id.maj < r_map->dso_id.maj) return 1; - - if (l_map->dso_id.min > 
r_map->dso_id.min) return -1; - if (l_map->dso_id.min < r_map->dso_id.min) return 1; - - if (l_map->dso_id.ino > r_map->dso_id.ino) return -1; - if (l_map->dso_id.ino < r_map->dso_id.ino) return 1; - - if (l_map->dso_id.ino_generation > r_map->dso_id.ino_generation) return -1; - if (l_map->dso_id.ino_generation < r_map->dso_id.ino_generation) return 1; - + rc = dso_id__cmp(&l_map->dso_id, &r_map->dso_id); + if (rc) + return rc; /* * Addresses with no major/minor numbers are assumed to be * anonymous in userspace. Sort those on pid then address. -- Gitee From 3d42930ffe50e4c302a316bd18e9fdb20591f8be Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Nov 2019 12:40:29 -0300 Subject: [PATCH 231/257] perf map: Pass a dso_id to map__new() commit 4a7380a52ec90fbb1565dd638ee7f5b6e709f7fb upstream Instead of the 4 fields, a step in the direction of moving this to struct dso. Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-gp5s1xgxacurmih5d1l94ymy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/machine.c | 15 ++++++++------- tools/perf/util/map.c | 13 +++++++------ tools/perf/util/map.h | 3 +-- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 7e3957602885..145cd8304498 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1661,6 +1661,12 @@ int machine__process_mmap2_event(struct machine *machine, { struct thread *thread; struct map *map; + struct dso_id dso_id = { + .maj = event->mmap2.maj, + .min = event->mmap2.min, + .ino = event->mmap2.ino, + .ino_generation = event->mmap2.ino_generation, + }; int ret = 0; if (dump_trace) @@ -1681,10 +1687,7 @@ int machine__process_mmap2_event(struct machine *machine, map = map__new(machine, event->mmap2.start, event->mmap2.len, event->mmap2.pgoff, - event->mmap2.maj, - event->mmap2.min, event->mmap2.ino, - event->mmap2.ino_generation, - event->mmap2.prot, + &dso_id, event->mmap2.prot, event->mmap2.flags, event->mmap2.filename, thread); @@ -1737,9 +1740,7 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event map = map__new(machine, event->mmap.start, event->mmap.len, event->mmap.pgoff, - 0, 0, 0, 0, prot, 0, - event->mmap.filename, - thread); + NULL, prot, 0, event->mmap.filename, thread); if (map == NULL) goto out_problem_map; diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index f5294a58e70c..330dc4d0bcec 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -145,8 +145,8 @@ void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso) } struct map *map__new(struct machine *machine, u64 start, u64 len, - u64 pgoff, u32 d_maj, u32 d_min, u64 ino, - u64 ino_gen, u32 prot, u32 flags, char *filename, + u64 pgoff, struct dso_id *id, + u32 prot, u32 flags, char *filename, struct thread *thread) { struct map *map = malloc(sizeof(*map)); @@ -163,10 +163,11 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); - map->dso_id.maj = d_maj; - map->dso_id.min = d_min; - map->dso_id.ino = ino; - map->dso_id.ino_generation = ino_gen; + if (id) + map->dso_id = *id; + else + map->dso_id.min = map->dso_id.ino = map->dso_id.ino_generation = 0; + map->prot = prot; map->flags = flags; nsi = nsinfo__get(thread->nsinfo); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 
70a1b4de44c8..193c294c3695 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -121,8 +121,7 @@ struct thread; void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso); struct map *map__new(struct machine *machine, u64 start, u64 len, - u64 pgoff, u32 d_maj, u32 d_min, u64 ino, - u64 ino_gen, u32 prot, u32 flags, + u64 pgoff, struct dso_id *id, u32 prot, u32 flags, char *filename, struct thread *thread); struct map *map__new2(u64 start, struct dso *dso); void map__delete(struct map *map); -- Gitee From 875a15c61aa4cbc29020e30cfd200d813b876698 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 19 Nov 2019 18:44:22 -0300 Subject: [PATCH 232/257] perf dso: Move dso_id from 'struct map' to 'struct dso' commit 0e3149f86b99ddabde8c5029eea0a9267e34f1a0 upstream And take it into account when looking up DSOs when we have the dso_id fields obtained from somewhere, like from PERF_RECORD_MMAP2 records. Instances of struct map pointing to the same DSO pathname but with anything in dso_id different are in fact different DSOs, so better have different 'struct dso' instances to reflect that. At some point we may want to get copies of the contents of the different objects if we want to do correct annotation or other analysis. With this we get 'struct map' 24 bytes leaner: $ pahole -C map ~/bin/perf struct map { union { struct rb_node rb_node __attribute__((__aligned__(8))); /* 0 24 */ struct list_head node; /* 0 16 */ } __attribute__((__aligned__(8))); /* 0 24 */ u64 start; /* 24 8 */ u64 end; /* 32 8 */ _Bool erange_warned:1; /* 40: 0 1 */ _Bool priv:1; /* 40: 1 1 */ /* XXX 6 bits hole, try to pack */ /* XXX 3 bytes hole, try to pack */ u32 prot; /* 44 4 */ u64 pgoff; /* 48 8 */ u64 reloc; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ u64 (*map_ip)(struct map *, u64); /* 64 8 */ u64 (*unmap_ip)(struct map *, u64); /* 72 8 */ struct dso * dso; /* 80 8 */ refcount_t refcnt; /* 88 4 */ u32 flags; /* 92 4 */ /* size: 96, cachelines: 2, members: 13 */ /* sum members: 92, holes: 1, sum holes: 3 */ /* sum bitfield members: 2 bits, bit holes: 1, sum bit holes: 6 bits */ /* forced alignments: 1 */ /* last cacheline: 32 bytes */ } __attribute__((__aligned__(8))); $ Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-g4hxxmraplo7wfjmk384mfsb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-report.c | 2 +- tools/perf/util/dso.c | 24 +++++++--- tools/perf/util/dso.h | 13 ++++++ tools/perf/util/dsos.c | 87 +++++++++++++++++++++++++++---------- tools/perf/util/dsos.h | 13 +++--- tools/perf/util/machine.c | 7 ++- tools/perf/util/machine.h | 2 + tools/perf/util/map.c | 8 +--- tools/perf/util/map.h | 16 ++----- tools/perf/util/sort.c | 10 ++--- 10 files changed, 118 insertions(+), 64 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 7839a64d08ba..137a2476c682 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -740,7 +740,7 @@ static size_t maps__fprintf_task(struct maps *maps, int indent, FILE *fp) map->prot & PROT_EXEC ? 'x' : '-', map->flags & MAP_SHARED ? 
's' : 'p', map->pgoff, - map->dso_id.ino, map->dso->name); + map->dso->id.ino, map->dso->name); } return printed; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 7f07a5dc555f..d12921a903f9 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1114,7 +1114,7 @@ struct dso *machine__findnew_kernel(struct machine *machine, const char *name, return dso; } -void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) +static void dso__set_long_name_id(struct dso *dso, const char *name, struct dso_id *id, bool name_allocated) { struct rb_root *root = dso->root; @@ -1127,8 +1127,8 @@ void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) if (root) { rb_erase(&dso->rb_node, root); /* - * __dsos__findnew_link_by_longname() isn't guaranteed to add it - * back, so a clean removal is required here. + * __dsos__findnew_link_by_longname_id() isn't guaranteed to + * add it back, so a clean removal is required here. */ RB_CLEAR_NODE(&dso->rb_node); dso->root = NULL; @@ -1139,7 +1139,12 @@ void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) dso->long_name_allocated = name_allocated; if (root) - __dsos__findnew_link_by_longname(root, dso, NULL); + __dsos__findnew_link_by_longname_id(root, dso, NULL, id); +} + +void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated) +{ + dso__set_long_name_id(dso, name, NULL, name_allocated); } void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated) @@ -1180,13 +1185,15 @@ void dso__set_sorted_by_name(struct dso *dso) dso->sorted_by_name = true; } -struct dso *dso__new(const char *name) +struct dso *dso__new_id(const char *name, struct dso_id *id) { struct dso *dso = calloc(1, sizeof(*dso) + strlen(name) + 1); if (dso != NULL) { strcpy(dso->name, name); - dso__set_long_name(dso, dso->name, false); + if (id) + dso->id = *id; + dso__set_long_name_id(dso, dso->name, id, false); dso__set_short_name(dso, dso->name, false); dso->symbols = dso->symbol_names = RB_ROOT_CACHED; dso->data.cache = RB_ROOT; @@ -1217,6 +1224,11 @@ struct dso *dso__new(const char *name) return dso; } +struct dso *dso__new(const char *name) +{ + return dso__new_id(name, NULL); +} + void dso__delete(struct dso *dso) { if (!RB_EMPTY_NODE(&dso->rb_node)) diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 69bb77d19164..90843e2213a1 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -123,6 +123,16 @@ enum dso_load_errno { #define DSO__DATA_CACHE_SIZE 4096 #define DSO__DATA_CACHE_MASK ~(DSO__DATA_CACHE_SIZE - 1) +/* + * Data about backing storage DSO, comes from PERF_RECORD_MMAP2 meta events + */ +struct dso_id { + u32 maj; + u32 min; + u64 ino; + u64 ino_generation; +}; + struct dso_cache { struct rb_node rb_node; u64 offset; @@ -197,6 +207,7 @@ struct dso { u64 db_id; }; struct nsinfo *nsinfo; + struct dso_id id; refcount_t refcnt; char name[0]; }; @@ -215,9 +226,11 @@ static inline void dso__set_loaded(struct dso *dso) dso->loaded = true; } +struct dso *dso__new_id(const char *name, struct dso_id *id); struct dso *dso__new(const char *name); void dso__delete(struct dso *dso); +int dso__cmp_id(struct dso *a, struct dso *b); void dso__set_short_name(struct dso *dso, const char *name, bool name_allocated); void dso__set_long_name(struct dso *dso, const char *name, bool name_allocated); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index ecf8d7346685..f62e9100091b 100644 --- a/tools/perf/util/dsos.c +++ 
b/tools/perf/util/dsos.c @@ -2,7 +2,6 @@ #include "debug.h" #include "dsos.h" #include "dso.h" -#include "map.h" #include "vdso.h" #include "namespaces.h" #include @@ -10,15 +9,8 @@ #include #include // filename__read_build_id -int dso_id__cmp(struct dso_id *a, struct dso_id *b) +static int __dso_id__cmp(struct dso_id *a, struct dso_id *b) { - /* - * The second is always dso->id, so zeroes if not set, assume passing - * NULL for a means a zeroed id - */ - if (a == NULL) - return 0; - if (a->maj > b->maj) return -1; if (a->maj < b->maj) return 1; @@ -34,6 +26,23 @@ int dso_id__cmp(struct dso_id *a, struct dso_id *b) return 0; } +static int dso_id__cmp(struct dso_id *a, struct dso_id *b) +{ + /* + * The second is always dso->id, so zeroes if not set, assume passing + * NULL for a means a zeroed id + */ + if (a == NULL) + return 0; + + return __dso_id__cmp(a, b); +} + +int dso__cmp_id(struct dso *a, struct dso *b) +{ + return __dso_id__cmp(&a->id, &b->id); +} + bool __dsos__read_build_ids(struct list_head *head, bool with_hits) { bool have_build_id = false; @@ -59,12 +68,30 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits) return have_build_id; } +static int __dso__cmp_long_name(const char *long_name, struct dso_id *id, struct dso *b) +{ + int rc = strcmp(long_name, b->long_name); + return rc ?: dso_id__cmp(id, &b->id); +} + +static int __dso__cmp_short_name(const char *short_name, struct dso_id *id, struct dso *b) +{ + int rc = strcmp(short_name, b->short_name); + return rc ?: dso_id__cmp(id, &b->id); +} + +static int dso__cmp_short_name(struct dso *a, struct dso *b) +{ + return __dso__cmp_short_name(a->short_name, &a->id, b); +} + /* * Find a matching entry and/or link current entry to RB tree. * Either one of the dso or name parameter must be non-NULL or the * function will not work. */ -struct dso *__dsos__findnew_link_by_longname(struct rb_root *root, struct dso *dso, const char *name) +struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, + const char *name, struct dso_id *id) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -76,7 +103,7 @@ struct dso *__dsos__findnew_link_by_longname(struct rb_root *root, struct dso *d */ while (*p) { struct dso *this = rb_entry(*p, struct dso, rb_node); - int rc = strcmp(name, this->long_name); + int rc = __dso__cmp_long_name(name, id, this); parent = *p; if (rc == 0) { @@ -92,7 +119,7 @@ struct dso *__dsos__findnew_link_by_longname(struct rb_root *root, struct dso *d * In this case, the short name should be different. * Comparing the short names to differentiate the DSOs. 
*/ - rc = strcmp(dso->short_name, this->short_name); + rc = dso__cmp_short_name(dso, this); if (rc == 0) { pr_err("Duplicated dso name: %s\n", name); return NULL; @@ -115,7 +142,7 @@ struct dso *__dsos__findnew_link_by_longname(struct rb_root *root, struct dso *d void __dsos__add(struct dsos *dsos, struct dso *dso) { list_add_tail(&dso->node, &dsos->head); - __dsos__findnew_link_by_longname(&dsos->root, dso, NULL); + __dsos__findnew_link_by_longname_id(&dsos->root, dso, NULL, &dso->id); /* * It is now in the linked list, grab a reference, then garbage collect * this when needing memory, by looking at LRU dso instances in the @@ -146,17 +173,27 @@ void dsos__add(struct dsos *dsos, struct dso *dso) up_write(&dsos->lock); } -struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) +static struct dso *__dsos__findnew_by_longname_id(struct rb_root *root, const char *name, struct dso_id *id) +{ + return __dsos__findnew_link_by_longname_id(root, NULL, name, id); +} + +static struct dso *__dsos__find_id(struct dsos *dsos, const char *name, struct dso_id *id, bool cmp_short) { struct dso *pos; if (cmp_short) { list_for_each_entry(pos, &dsos->head, node) - if (strcmp(pos->short_name, name) == 0) + if (__dso__cmp_short_name(name, id, pos) == 0) return pos; return NULL; } - return __dsos__findnew_by_longname(&dsos->root, name); + return __dsos__findnew_by_longname_id(&dsos->root, name, id); +} + +struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short) +{ + return __dsos__find_id(dsos, name, NULL, cmp_short); } struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short) @@ -200,9 +237,9 @@ static void dso__set_basename(struct dso *dso) dso__set_short_name(dso, base, true); } -struct dso *__dsos__addnew(struct dsos *dsos, const char *name) +static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, struct dso_id *id) { - struct dso *dso = dso__new(name); + struct dso *dso = dso__new_id(name, id); if (dso != NULL) { __dsos__add(dsos, dso); @@ -213,18 +250,22 @@ struct dso *__dsos__addnew(struct dsos *dsos, const char *name) return dso; } -struct dso *__dsos__findnew(struct dsos *dsos, const char *name) +struct dso *__dsos__addnew(struct dsos *dsos, const char *name) { - struct dso *dso = __dsos__find(dsos, name, false); + return __dsos__addnew_id(dsos, name, NULL); +} - return dso ? dso : __dsos__addnew(dsos, name); +static struct dso *__dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) +{ + struct dso *dso = __dsos__find_id(dsos, name, id, false); + return dso ? 
dso : __dsos__addnew_id(dsos, name, id); } -struct dso *dsos__findnew(struct dsos *dsos, const char *name) +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id) { struct dso *dso; down_write(&dsos->lock); - dso = dso__get(__dsos__findnew(dsos, name)); + dso = dso__get(__dsos__findnew_id(dsos, name, id)); up_write(&dsos->lock); return dso; } diff --git a/tools/perf/util/dsos.h b/tools/perf/util/dsos.h index 32f1fbee0feb..a9b552af1d34 100644 --- a/tools/perf/util/dsos.h +++ b/tools/perf/util/dsos.h @@ -9,6 +9,7 @@ #include "rwsem.h" struct dso; +struct dso_id; /* * DSOs are put into both a list for fast iteration and rbtree for fast @@ -25,15 +26,11 @@ void dsos__add(struct dsos *dsos, struct dso *dso); struct dso *__dsos__addnew(struct dsos *dsos, const char *name); struct dso *__dsos__find(struct dsos *dsos, const char *name, bool cmp_short); struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short); -struct dso *__dsos__findnew(struct dsos *dsos, const char *name); -struct dso *dsos__findnew(struct dsos *dsos, const char *name); -struct dso *__dsos__findnew_link_by_longname(struct rb_root *root, struct dso *dso, const char *name); - -static inline struct dso *__dsos__findnew_by_longname(struct rb_root *root, const char *name) -{ - return __dsos__findnew_link_by_longname(root, NULL, name); -} +struct dso *dsos__findnew_id(struct dsos *dsos, const char *name, struct dso_id *id); + +struct dso *__dsos__findnew_link_by_longname_id(struct rb_root *root, struct dso *dso, + const char *name, struct dso_id *id); bool __dsos__read_build_ids(struct list_head *head, bool with_hits); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 145cd8304498..0b51c453bfda 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2715,9 +2715,14 @@ u8 machine__addr_cpumode(struct machine *machine, u8 cpumode, u64 addr) return addr_cpumode; } +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id) +{ + return dsos__findnew_id(&machine->dsos, filename, id); +} + struct dso *machine__findnew_dso(struct machine *machine, const char *filename) { - return dsos__findnew(&machine->dsos, filename); + return machine__findnew_dso_id(machine, filename, NULL); } char *machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 4f734337661a..fbb13c07f07e 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -11,6 +11,7 @@ struct addr_location; struct branch_stack; struct dso; +struct dso_id; struct evsel; struct perf_sample; struct symbol; @@ -205,6 +206,7 @@ int machine__nr_cpus_avail(struct machine *machine); struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid); struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid); +struct dso *machine__findnew_dso_id(struct machine *machine, const char *filename, struct dso_id *id); struct dso *machine__findnew_dso(struct machine *machine, const char *filename); size_t machine__fprintf(struct machine *machine, FILE *fp); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 330dc4d0bcec..5259740bbafd 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -162,12 +162,6 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, anon = is_anon_memory(filename, flags); vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); - - if (id) 
- map->dso_id = *id; - else - map->dso_id.min = map->dso_id.ino = map->dso_id.ino_generation = 0; - map->prot = prot; map->flags = flags; nsi = nsinfo__get(thread->nsinfo); @@ -197,7 +191,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, pgoff = 0; dso = machine__findnew_vdso(machine, thread); } else - dso = machine__findnew_dso(machine, filename); + dso = machine__findnew_dso_id(machine, filename, id); if (dso == NULL) goto out_delete; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 193c294c3695..3f75829ccacf 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -18,18 +18,6 @@ struct map_groups; struct machine; struct evsel; -/* - * Data about backing storage DSO, comes from PERF_RECORD_MMAP2 meta events - */ -struct dso_id { - u32 maj; - u32 min; - u64 ino; - u64 ino_generation; -}; - -int dso_id__cmp(struct dso_id *a, struct dso_id *b); - struct map { union { struct rb_node rb_node; @@ -52,7 +40,6 @@ struct map { struct dso *dso; struct map_groups *groups; - struct dso_id dso_id; refcount_t refcnt; }; @@ -120,6 +107,9 @@ struct thread; void map__init(struct map *map, u64 start, u64 end, u64 pgoff, struct dso *dso); + +struct dso_id; + struct map *map__new(struct machine *machine, u64 start, u64 len, u64 pgoff, struct dso_id *id, u32 prot, u32 flags, char *filename, struct thread *thread); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 23a2561bdb2d..90b0da006e7a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1219,7 +1219,7 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if (!l_map) return -1; if (!r_map) return 1; - rc = dso_id__cmp(&l_map->dso_id, &r_map->dso_id); + rc = dso__cmp_id(l_map->dso, r_map->dso); if (rc) return rc; /* @@ -1232,8 +1232,8 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) if ((left->cpumode != PERF_RECORD_MISC_KERNEL) && (!(l_map->flags & MAP_SHARED)) && - !l_map->dso_id.maj && !l_map->dso_id.min && - !l_map->dso_id.ino && !l_map->dso_id.ino_generation) { + !l_map->dso->id.maj && !l_map->dso->id.min && + !l_map->dso->id.ino && !l_map->dso->id.ino_generation) { /* userspace anonymous */ if (left->thread->pid_ > right->thread->pid_) return -1; @@ -1269,8 +1269,8 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf, if ((he->cpumode != PERF_RECORD_MISC_KERNEL) && map && !(map->prot & PROT_EXEC) && (map->flags & MAP_SHARED) && - (map->dso_id.maj || map->dso_id.min || - map->dso_id.ino || map->dso_id.ino_generation)) + (map->dso->id.maj || map->dso->id.min || + map->dso->id.ino || map->dso->id.ino_generation)) level = 's'; else if (!map) level = 'X'; -- Gitee From 80c1dc1d39375c4615a317d94384ecdcee4ae1f8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Oct 2020 16:02:11 +0900 Subject: [PATCH 233/257] perf inject: Enter namespace when reading build-id commit 336c95b297e8127705e590bbd25f5c627bfcb782 upstream It should be in a proper mnt namespace when accessing the file. I think this had no problem since the build-id was actually read from map__load() -> dso__load() already. But I'd like to change it in the following commit. 
Acked-by: Jiri Olsa Signed-off-by: Namhyung Kim Link: https://lore.kernel.org/r/20201012070214.2074921-4-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 44d659c42d49..688f99d3b79b 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -23,6 +23,7 @@ #include "util/symbol.h" #include "util/synthetic-events.h" #include "util/thread.h" +#include "util/namespaces.h" #include #include @@ -390,16 +391,19 @@ static int perf_event__repipe_id_index(struct perf_session *session, static int dso__read_build_id(struct dso *dso) { + struct nscookie nsc; + if (dso->has_build_id) return 0; + nsinfo__mountns_enter(dso->nsinfo, &nsc); if (filename__read_build_id(dso->long_name, dso->build_id, sizeof(dso->build_id)) > 0) { dso->has_build_id = true; - return 0; } + nsinfo__mountns_exit(&nsc); - return -1; + return dso->has_build_id ? 0 : -1; } static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, -- Gitee From e5008231cd37b0f0864d3811d4885c2920c9aa34 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Oct 2020 16:02:12 +0900 Subject: [PATCH 234/257] perf inject: Do not load map/dso when injecting build-id commit e7b60c5a0c4b9c02fa3b471f8d5edb4989ebdf60 upstream No need to load symbols in a DSO when injecting build-id. I guess the reason was to check the DSO is a special file like anon files. Use some helper functions in map.c to check them before reading build-id. Also pass sample event's cpumode to a new build-id event. It brought a speedup in the benchmark of 25 -> 21 msec on my laptop. Also the memory usage (Max RSS) went down by ~200 KB. 
# Running 'internals/inject-build-id' benchmark: Average build-id injection took: 21.389 msec (+- 0.138 msec) Average time per event: 2.097 usec (+- 0.014 usec) Average memory usage: 8225 KB (+- 0 KB) Committer notes: Before: $ perf stat -r5 perf bench internals inject-build-id > /dev/null Performance counter stats for 'perf bench internals inject-build-id' (5 runs): 4,020.56 msec task-clock:u # 1.271 CPUs utilized ( +- 0.74% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 123,354 page-faults:u # 0.031 M/sec ( +- 0.81% ) 7,119,951,568 cycles:u # 1.771 GHz ( +- 1.74% ) (83.27%) 230,086,969 stalled-cycles-frontend:u # 3.23% frontend cycles idle ( +- 1.97% ) (83.41%) 1,168,298,765 stalled-cycles-backend:u # 16.41% backend cycles idle ( +- 1.13% ) (83.44%) 11,173,083,669 instructions:u # 1.57 insn per cycle # 0.10 stalled cycles per insn ( +- 1.58% ) (83.31%) 2,413,908,936 branches:u # 600.392 M/sec ( +- 1.69% ) (83.26%) 46,576,289 branch-misses:u # 1.93% of all branches ( +- 2.20% ) (83.31%) 3.1638 +- 0.0309 seconds time elapsed ( +- 0.98% ) $ After: $ perf stat -r5 perf bench internals inject-build-id > /dev/null Performance counter stats for 'perf bench internals inject-build-id' (5 runs): 2,379.94 msec task-clock:u # 1.473 CPUs utilized ( +- 0.18% ) 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 62,584 page-faults:u # 0.026 M/sec ( +- 0.07% ) 2,372,389,668 cycles:u # 0.997 GHz ( +- 0.29% ) (83.14%) 106,937,862 stalled-cycles-frontend:u # 4.51% frontend cycles idle ( +- 4.89% ) (83.20%) 581,697,915 stalled-cycles-backend:u # 24.52% backend cycles idle ( +- 0.71% ) (83.47%) 3,659,692,199 instructions:u # 1.54 insn per cycle # 0.16 stalled cycles per insn ( +- 0.10% ) (83.63%) 791,372,961 branches:u # 332.518 M/sec ( +- 0.27% ) (83.39%) 10,648,083 branch-misses:u # 1.35% of all branches ( +- 0.22% ) (83.16%) 1.61570 +- 0.00172 seconds time elapsed ( +- 0.11% ) $ Signed-off-by: Namhyung Kim Original-patch-by: Stephane Eranian Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20201012070214.2074921-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 33 ++++++++++++--------------------- tools/perf/util/map.c | 17 +---------------- tools/perf/util/map.h | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 688f99d3b79b..88e09ecad45a 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -24,9 +24,10 @@ #include "util/synthetic-events.h" #include "util/thread.h" #include "util/namespaces.h" -#include +#include #include +#include /* To get things like MAP_HUGETLB even on older libc headers */ #include #include @@ -407,21 +408,22 @@ static int dso__read_build_id(struct dso *dso) } static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, - struct machine *machine) + struct machine *machine, u8 cpumode, u32 flags) { - u16 misc = PERF_RECORD_MISC_USER; int err; + if (is_anon_memory(dso->long_name) || flags & MAP_HUGETLB) + return 0; + if (is_no_dso_memory(dso->long_name)) + return 0; + if (dso__read_build_id(dso) < 0) { pr_debug("no build_id found for %s\n", dso->long_name); return -1; } - if (dso->kernel) - misc = PERF_RECORD_MISC_KERNEL; - - err = perf_event__synthesize_build_id(tool, dso, misc, perf_event__repipe, - machine); + err = perf_event__synthesize_build_id(tool, dso, cpumode, + 
perf_event__repipe, machine); if (err) { pr_err("Can't synthesize build_id event for %s\n", dso->long_name); return -1; @@ -448,19 +450,8 @@ int perf_event__inject_buildid(struct perf_tool *tool, union perf_event *event, if (thread__find_map(thread, sample->cpumode, sample->ip, &al)) { if (!al.map->dso->hit) { al.map->dso->hit = 1; - if (map__load(al.map) >= 0) { - dso__inject_build_id(al.map->dso, tool, machine); - /* - * If this fails, too bad, let the other side - * account this as unresolved. - */ - } else { -#ifdef HAVE_LIBELF_SUPPORT - pr_warning("no symbols found in %s, maybe " - "install a debug package?\n", - al.map->dso->long_name); -#endif - } + dso__inject_build_id(al.map->dso, tool, machine, + sample->cpumode, al.map->flags); } } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 5259740bbafd..2028922982ba 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -28,21 +28,6 @@ static void __maps__insert(struct maps *maps, struct map *map); static void __maps__insert_name(struct maps *maps, struct map *map); -static inline int is_anon_memory(const char *filename, u32 flags) -{ - return flags & MAP_HUGETLB || - !strcmp(filename, "//anon") || - !strncmp(filename, "/dev/zero", sizeof("/dev/zero") - 1) || - !strncmp(filename, "/anon_hugepage", sizeof("/anon_hugepage") - 1); -} - -static inline int is_no_dso_memory(const char *filename) -{ - return !strncmp(filename, "[stack", 6) || - !strncmp(filename, "/SYSV",5) || - !strcmp(filename, "[heap]"); -} - static inline int is_android_lib(const char *filename) { return !strncmp(filename, "/data/app-lib", 13) || @@ -159,7 +144,7 @@ struct map *map__new(struct machine *machine, u64 start, u64 len, int anon, no_dso, vdso, android; android = is_android_lib(filename); - anon = is_anon_memory(filename, flags); + anon = is_anon_memory(filename) || flags & MAP_HUGETLB; vdso = is_vdso_map(filename); no_dso = is_no_dso_memory(filename); map->prot = prot; diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 3f75829ccacf..0be5e970b9a8 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -175,4 +175,18 @@ static inline bool is_entry_trampoline(const char *name) return !strcmp(name, ENTRY_TRAMPOLINE_NAME); } + +static inline int is_anon_memory(const char *filename) +{ + return !strcmp(filename, "//anon") || + !strncmp(filename, "/dev/zero", sizeof("/dev/zero") - 1) || + !strncmp(filename, "/anon_hugepage", sizeof("/anon_hugepage") - 1); +} + +static inline int is_no_dso_memory(const char *filename) +{ + return !strncmp(filename, "[stack", 6) || + !strncmp(filename, "/SYSV", 5) || + !strcmp(filename, "[heap]"); +} #endif /* __PERF_MAP_H */ -- Gitee From a1fa74b0819b8449ace3391aa39093bbc18af760 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 12 Oct 2020 16:02:13 +0900 Subject: [PATCH 235/257] perf inject: Add --buildid-all option commit 27c9c3424fc217dab4077122069fe4b4413dbf18 upstream Like 'perf record', we can speed up build-id processing even more by just using all DSOs. Then we don't need to look at all the sample events anymore. The following patch will update 'perf bench' to show the result of the --buildid-all option too. 
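[Editor's note] A hypothetical invocation, by analogy with the -b example in patch 226 (file names are illustrative):

  # perf record -a -o perf.data -- sleep 1
  # perf inject -i perf.data --buildid-all -o my.data

As the diff below shows, --buildid-all synthesizes a build-id event from each MMAP/MMAP2 record via perf_event__repipe_buildid_mmap{,2}() instead of resolving every sample, which is also why ordered events no longer need to be enabled for this mode.
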
Signed-off-by: Namhyung Kim Original-patch-by: Stephane Eranian Acked-by: Ian Rogers Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20201012070214.2074921-6-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-inject.txt | 6 +- tools/perf/builtin-inject.c | 113 ++++++++++++++++++++++- 2 files changed, 113 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index a64d6588470e..fefe658701c5 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -24,8 +24,12 @@ information could make use of this facility. OPTIONS ------- -b:: ---build-ids=:: +--build-ids:: Inject build-ids into the output stream + +--buildid-all: + Inject build-ids of all DSOs into the output stream + -v:: --verbose:: Be more verbose. diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 88e09ecad45a..d6abcba54786 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -10,6 +10,7 @@ #include "util/color.h" #include "util/dso.h" +#include "util/vdso.h" #include "util/evlist.h" #include "util/evsel.h" #include "util/map.h" @@ -37,6 +38,7 @@ struct perf_inject { struct perf_tool tool; struct perf_session *session; bool build_ids; + bool build_id_all; bool sched_stat; bool have_auxtrace; bool strip; @@ -58,6 +60,9 @@ struct event_entry { union perf_event event[0]; }; +static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool, + struct machine *machine, u8 cpumode, u32 flags); + static int output_bytes(struct perf_inject *inject, void *buf, size_t sz) { ssize_t size; @@ -279,6 +284,68 @@ static int perf_event__jit_repipe_mmap(struct perf_tool *tool, } #endif +static struct dso *findnew_dso(int pid, int tid, const char *filename, + struct dso_id *id, struct machine *machine) +{ + struct thread *thread; + struct nsinfo *nsi = NULL; + struct nsinfo *nnsi; + struct dso *dso; + bool vdso; + + thread = machine__findnew_thread(machine, pid, tid); + if (thread == NULL) { + pr_err("cannot find or create a task %d/%d.\n", tid, pid); + return NULL; + } + + vdso = is_vdso_map(filename); + nsi = nsinfo__get(thread->nsinfo); + + if (vdso) { + /* The vdso maps are always on the host and not the + * container. Ensure that we don't use setns to look + * them up. 
+ */ + nnsi = nsinfo__copy(nsi); + if (nnsi) { + nsinfo__put(nsi); + nnsi->need_setns = false; + nsi = nnsi; + } + dso = machine__findnew_vdso(machine, thread); + } else { + dso = machine__findnew_dso_id(machine, filename, id); + } + + if (dso) + dso->nsinfo = nsi; + else + nsinfo__put(nsi); + + thread__put(thread); + return dso; +} + +static int perf_event__repipe_buildid_mmap(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct dso *dso; + + dso = findnew_dso(event->mmap.pid, event->mmap.tid, + event->mmap.filename, NULL, machine); + + if (dso && !dso->hit) { + dso->hit = 1; + dso__inject_build_id(dso, tool, machine, sample->cpumode, 0); + dso__put(dso); + } + + return perf_event__repipe(tool, event, sample, machine); +} + static int perf_event__repipe_mmap2(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -317,6 +384,34 @@ static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, } #endif +static int perf_event__repipe_buildid_mmap2(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct dso_id dso_id = { + .maj = event->mmap2.maj, + .min = event->mmap2.min, + .ino = event->mmap2.ino, + .ino_generation = event->mmap2.ino_generation, + }; + struct dso *dso; + + dso = findnew_dso(event->mmap2.pid, event->mmap2.tid, + event->mmap2.filename, &dso_id, machine); + + if (dso && !dso->hit) { + dso->hit = 1; + dso__inject_build_id(dso, tool, machine, sample->cpumode, + event->mmap2.flags); + dso__put(dso); + } + + perf_event__repipe(tool, event, sample, machine); + + return 0; +} + static int perf_event__repipe_fork(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -638,7 +733,7 @@ static int __cmd_inject(struct perf_inject *inject) signal(SIGINT, sig_handler); if (inject->build_ids || inject->sched_stat || - inject->itrace_synth_opts.set) { + inject->itrace_synth_opts.set || inject->build_id_all) { inject->tool.mmap = perf_event__repipe_mmap; inject->tool.mmap2 = perf_event__repipe_mmap2; inject->tool.fork = perf_event__repipe_fork; @@ -647,7 +742,10 @@ static int __cmd_inject(struct perf_inject *inject) output_data_offset = session->header.data_offset; - if (inject->build_ids) { + if (inject->build_id_all) { + inject->tool.mmap = perf_event__repipe_buildid_mmap; + inject->tool.mmap2 = perf_event__repipe_buildid_mmap2; + } else if (inject->build_ids) { inject->tool.sample = perf_event__inject_buildid; } else if (inject->sched_stat) { struct evsel *evsel; @@ -783,6 +881,8 @@ int cmd_inject(int argc, const char **argv) struct option options[] = { OPT_BOOLEAN('b', "build-ids", &inject.build_ids, "Inject build-ids into the output stream"), + OPT_BOOLEAN(0, "buildid-all", &inject.build_id_all, + "Inject build-ids of all DSOs into the output stream"), OPT_STRING('i', "input", &inject.input_name, "file", "input file name"), OPT_STRING('o', "output", &inject.output.path, "file", @@ -847,8 +947,6 @@ int cmd_inject(int argc, const char **argv) return -1; } - inject.tool.ordered_events = inject.sched_stat; - data.path = inject.input_name; if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) inject.is_pipe = true; @@ -864,7 +962,7 @@ int cmd_inject(int argc, const char **argv) if (zstd_init(&(inject.session->zstd_data), 0) < 0) pr_warning("Decompression initialization failed.\n"); - if (inject.build_ids) { + if (inject.build_ids && !inject.build_id_all) { /* * to make sure the mmap records are ordered 
correctly * and so that the correct especially due to jitted code @@ -874,6 +972,11 @@ int cmd_inject(int argc, const char **argv) inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; } + + if (inject.sched_stat) { + inject.tool.ordered_events = true; + } + #ifdef HAVE_JITDUMP if (inject.jit_mode) { inject.tool.mmap2 = perf_event__jit_repipe_mmap2; -- Gitee From 135017447b5f4548b41949a0b9659e8cbe9d6bb8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 19 Jul 2021 15:31:52 -0700 Subject: [PATCH 236/257] perf inject: Fix output from a file to a pipe commit c3a057dc3aa9979ce6dc350e05eb2e4c021432cd upstream When the input is a regular file but the output is a pipe, it should write a pipe header. But just repiping would write a portion of the existing header which is different in 'size' value. So we need to prevent it and write a new pipe header along with other information like event attributes and features. This can handle something like this: # perf record -a -B sleep 1 # perf inject -b -i perf.data | perf report -i - Factor out perf_event__synthesize_for_pipe() to be shared between perf record and inject. [Backport changes] 1.In tools/perf/util/synthetic-events.c, the upstream patch removes the #include "util/cgroup.h" directive. However, this line doesn't exist in the current kernel source, so no changes were needed for this part of the backport. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210719223153.1618812-5-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 28 ++++++++++++++-- tools/perf/builtin-record.c | 38 +++------------------- tools/perf/util/synthetic-events.c | 52 ++++++++++++++++++++++++++++++ tools/perf/util/synthetic-events.h | 6 ++++ 4 files changed, 88 insertions(+), 36 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d6abcba54786..cc7fe28dbd88 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -877,6 +877,7 @@ int cmd_inject(int argc, const char **argv) .use_stdio = true, }; int ret; + bool repipe = true; struct option options[] = { OPT_BOOLEAN('b', "build-ids", &inject.build_ids, @@ -948,10 +949,18 @@ int cmd_inject(int argc, const char **argv) } data.path = inject.input_name; - if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) + if (!strcmp(inject.input_name, "-") || inject.output.is_pipe) { inject.is_pipe = true; + /* + * Do not repipe header when input is a regular file + * since either it can rewrite the header at the end + * or write a new pipe header. 
+ */ + if (strcmp(inject.input_name, "-")) + repipe = false; + } - inject.session = __perf_session__new(&data, inject.is_pipe, + inject.session = __perf_session__new(&data, repipe, perf_data__fd(&inject.output), &inject.tool); if (IS_ERR(inject.session)) { @@ -962,6 +971,21 @@ int cmd_inject(int argc, const char **argv) if (zstd_init(&(inject.session->zstd_data), 0) < 0) pr_warning("Decompression initialization failed.\n"); + if (!data.is_pipe && inject.output.is_pipe) { + ret = perf_header__write_pipe(perf_data__fd(&inject.output)); + if (ret < 0) { + pr_err("Couldn't write a new pipe header.\n"); + goto out_delete; + } + + ret = perf_event__synthesize_for_pipe(&inject.tool, + inject.session, + &inject.output, + perf_event__repipe); + if (ret < 0) + goto out_delete; + } + if (inject.build_ids && !inject.build_id_all) { /* * to make sure the mmap records are ordered correctly diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 75ad52c08bab..2af9b8685817 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1249,48 +1249,18 @@ static int record__synthesize(struct record *rec, bool tail) struct perf_data *data = &rec->data; struct record_opts *opts = &rec->opts; struct perf_tool *tool = &rec->tool; - int fd = perf_data__fd(data); int err = 0; if (rec->opts.tail_synthesize != tail) return 0; if (data->is_pipe) { - /* - * We need to synthesize events first, because some - * features works on top of them (on report side). - */ - err = perf_event__synthesize_attrs(tool, rec->evlist, - process_synthesized_event); - if (err < 0) { - pr_err("Couldn't synthesize attrs.\n"); - goto out; - } - - err = perf_event__synthesize_features(tool, session, rec->evlist, + err = perf_event__synthesize_for_pipe(tool, session, data, process_synthesized_event); - if (err < 0) { - pr_err("Couldn't synthesize features.\n"); - return err; - } + if (err < 0) + goto out; - if (have_tracepoints(&rec->evlist->core.entries)) { - /* - * FIXME err <= 0 here actually means that - * there were no tracepoints so its not really - * an error, just that we don't need to - * synthesize anything. We really have to - * return this more properly and also - * propagate errors that now are calling die() - */ - err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist, - process_synthesized_event); - if (err <= 0) { - pr_err("Couldn't record tracing data.\n"); - goto out; - } - rec->bytes_written += err; - } + rec->bytes_written += err; } err = perf_event__synth_time_conv(record__pick_pc(rec), tool, diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b7d0f6614daf..96e2adca2719 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include "util/cgroup.h" +#include "util/data.h" #include "util/debug.h" #include "util/dso.h" #include "util/event.h" @@ -1922,3 +1924,53 @@ int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session free(ff.buf); return ret; } + +int perf_event__synthesize_for_pipe(struct perf_tool *tool, + struct perf_session *session, + struct perf_data *data, + perf_event__handler_t process) +{ + int err; + int ret = 0; + struct evlist *evlist = session->evlist; + + /* + * We need to synthesize events first, because some + * features works on top of them (on report side). 
+ */ + err = perf_event__synthesize_attrs(tool, evlist, process); + if (err < 0) { + pr_err("Couldn't synthesize attrs.\n"); + return err; + } + ret += err; + + err = perf_event__synthesize_features(tool, session, evlist, process); + if (err < 0) { + pr_err("Couldn't synthesize features.\n"); + return err; + } + ret += err; + + if (have_tracepoints(&evlist->core.entries)) { + int fd = perf_data__fd(data); + + /* + * FIXME err <= 0 here actually means that + * there were no tracepoints so its not really + * an error, just that we don't need to + * synthesize anything. We really have to + * return this more properly and also + * propagate errors that now are calling die() + */ + err = perf_event__synthesize_tracing_data(tool, fd, evlist, + process); + if (err <= 0) { + pr_err("Couldn't record tracing data.\n"); + return err; + } + ret += err; + } + + return ret; +} diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h index baead0cdc381..96e11c7eed9d 100644 --- a/tools/perf/util/synthetic-events.h +++ b/tools/perf/util/synthetic-events.h @@ -14,6 +14,7 @@ struct evsel; struct machine; struct perf_counts_values; struct perf_cpu_map; +struct perf_data; struct perf_event_attr; struct perf_event_mmap_page; struct perf_sample; @@ -100,4 +101,9 @@ static inline int perf_event__synthesize_bpf_events(struct perf_session *session } #endif // HAVE_LIBBPF_SUPPORT +int perf_event__synthesize_for_pipe(struct perf_tool *tool, + struct perf_session *session, + struct perf_data *data, + perf_event__handler_t process); + #endif // __PERF_SYNTHETIC_EVENTS_H -- Gitee From 0db534d4466f0b71f96a11be7507f3dfb0e881a4 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 30 Apr 2021 10:03:02 +0300 Subject: [PATCH 237/257] perf inject: Add --vm-time-correlation option commit 83d7f5f1ad0ec6072bb5b4ca32795f0c199424e2 upstream Intel PT timestamps are affected by virtualization. Add a new option that will allow the Intel PT decoder to correlate the timestamps and translate the virtual machine timestamps to host timestamps. The advantages of making this a separate step, rather than a part of normal decoding are that it is simpler to implement, and it needs to be done only once. This patch adds only the option. Later patches add Intel PT support. Signed-off-by: Adrian Hunter Reviewed-by: Andi Kleen Cc: Jiri Olsa Link: https://lore.kernel.org/r/20210430070309.17624-6-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/Documentation/perf-inject.txt | 10 ++++++ tools/perf/builtin-inject.c | 44 ++++++++++++++++++++++++ tools/perf/util/auxtrace.h | 6 ++++ 3 files changed, 60 insertions(+) diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index fefe658701c5..c34eb5b8c762 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -68,6 +68,16 @@ include::itrace.txt[] --force:: Don't complain, do it. +--vm-time-correlation[=OPTIONS]:: + Some architectures may capture AUX area data which contains timestamps + affected by virtualization. This option will update those timestamps + in place, to correlate with host timestamps. The in-place update means + that an output file is not specified, and instead the input file is + modified. The options are architecture specific, except that they may + start with "dry-run" which will cause the file to be processed but + without updating it. 
Currently this option is supported only by + Intel PT, refer linkperf:perf-intel-pt[1] + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1] diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index cc7fe28dbd88..8f8c36a79020 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -31,6 +31,7 @@ #include /* To get things like MAP_HUGETLB even on older libc headers */ #include +#include #include #include @@ -722,6 +723,36 @@ static void strip_fini(struct perf_inject *inject) } } +static int parse_vm_time_correlation(const struct option *opt, const char *str, int unset) +{ + struct perf_inject *inject = opt->value; + const char *args; + char *dry_run; + + if (unset) + return 0; + + inject->itrace_synth_opts.set = true; + inject->itrace_synth_opts.vm_time_correlation = true; + inject->in_place_update = true; + + if (!str) + return 0; + + dry_run = skip_spaces(str); + if (!strncmp(dry_run, "dry-run", strlen("dry-run"))) { + inject->itrace_synth_opts.vm_tm_corr_dry_run = true; + inject->in_place_update_dry_run = true; + args = dry_run + strlen("dry-run"); + } else { + args = str; + } + + inject->itrace_synth_opts.vm_tm_corr_args = strdup(args); + + return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM; +} + static int __cmd_inject(struct perf_inject *inject) { int ret = -EINVAL; @@ -763,6 +794,15 @@ static int __cmd_inject(struct perf_inject *inject) else if (!strncmp(name, "sched:sched_stat_", 17)) evsel->handler = perf_inject__sched_stat; } + } else if (inject->itrace_synth_opts.vm_time_correlation) { + session->itrace_synth_opts = &inject->itrace_synth_opts; + memset(&inject->tool, 0, sizeof(inject->tool)); + inject->tool.id_index = perf_event__process_id_index; + inject->tool.auxtrace_info = perf_event__process_auxtrace_info; + inject->tool.auxtrace = perf_event__process_auxtrace; + inject->tool.auxtrace_error = perf_event__process_auxtrace_error; + inject->tool.ordered_events = true; + inject->tool.ordering_requires_timestamps = true; } else if (inject->itrace_synth_opts.set) { session->itrace_synth_opts = &inject->itrace_synth_opts; inject->itrace_synth_opts.inject = true; @@ -905,6 +945,9 @@ int cmd_inject(int argc, const char **argv) itrace_parse_synth_opts), OPT_BOOLEAN(0, "strip", &inject.strip, "strip non-synthesized events (use with --itrace)"), + OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts", + "correlate time between VM guests and the host", + parse_vm_time_correlation), OPT_END() }; const char * const inject_usage[] = { @@ -1023,6 +1066,7 @@ int cmd_inject(int argc, const char **argv) out_delete: zstd_fini(&(inject.session->zstd_data)); perf_session__delete(inject.session); + free(inject.itrace_synth_opts.vm_tm_corr_args); out_close_output: perf_data__close(&inject.output); return ret; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 39ef333ddb6c..15d24882ce40 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -82,6 +82,9 @@ enum itrace_period_type { * @llc: whether to synthesize last level cache events * @tlb: whether to synthesize TLB events * @remote_access: whether to synthesize remote access events + * @vm_time_correlation: perform VM Time Correlation + * @vm_tm_corr_dry_run: VM Time Correlation dry-run + * @vm_tm_corr_args: VM Time Correlation implementation-specific arguments * @callchain_sz: maximum callchain size * @last_branch_sz: branch context size * @period: 'instructions' events period @@ -113,6 +116,9 @@ 
struct itrace_synth_opts { bool llc; bool tlb; bool remote_access; + bool vm_time_correlation; + bool vm_tm_corr_dry_run; + char *vm_tm_corr_args; unsigned int callchain_sz; unsigned int last_branch_sz; unsigned long long period; -- Gitee From 9bdbbe4224394324501bb9b9e7a9f529560af3c6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 13 Dec 2021 10:48:29 +0200 Subject: [PATCH 238/257] perf inject: Fix segfault due to perf_data__fd() without open commit c271a55b0c6029fed0cac909fa57999a11467132 upstream The fixed commit attempts to get the output file descriptor even if the file was never opened e.g. $ perf record uname Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.002 MB perf.data (7 samples) ] $ perf inject -i perf.data --vm-time-correlation=dry-run Segmentation fault (core dumped) $ gdb --quiet perf Reading symbols from perf... (gdb) r inject -i perf.data --vm-time-correlation=dry-run Starting program: /home/ahunter/bin/perf inject -i perf.data --vm-time-correlation=dry-run [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". Program received signal SIGSEGV, Segmentation fault. __GI___fileno (fp=0x0) at fileno.c:35 35 fileno.c: No such file or directory. (gdb) bt #0 __GI___fileno (fp=0x0) at fileno.c:35 #1 0x00005621e48dd987 in perf_data__fd (data=0x7fff4c68bd08) at util/data.h:72 #2 perf_data__fd (data=0x7fff4c68bd08) at util/data.h:69 #3 cmd_inject (argc=, argv=0x7fff4c69c1f0) at builtin-inject.c:1017 #4 0x00005621e4936783 in run_builtin (p=0x5621e4ee6878 , argc=4, argv=0x7fff4c69c1f0) at perf.c:313 #5 0x00005621e4897d5c in handle_internal_command (argv=, argc=) at perf.c:365 #6 run_argv (argcp=, argv=) at perf.c:409 #7 main (argc=4, argv=0x7fff4c69c1f0) at perf.c:539 (gdb) Fixes: 0ae03893623dd1dd ("perf tools: Pass a fd to perf_file_header__read_pipe()") Signed-off-by: Adrian Hunter Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Riccardo Mancini Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20211213084829.114772-3-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8f8c36a79020..0c557e387c04 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -753,12 +753,16 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str, return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM; } +static int output_fd(struct perf_inject *inject) +{ + return inject->in_place_update ? -1 : perf_data__fd(&inject->output); +} + static int __cmd_inject(struct perf_inject *inject) { int ret = -EINVAL; struct perf_session *session = inject->session; - struct perf_data *data_out = &inject->output; - int fd = inject->in_place_update ? 
-1 : perf_data__fd(data_out); + int fd = output_fd(inject); u64 output_data_offset; signal(SIGINT, sig_handler); @@ -1004,7 +1008,7 @@ int cmd_inject(int argc, const char **argv) } inject.session = __perf_session__new(&data, repipe, - perf_data__fd(&inject.output), + output_fd(&inject), &inject.tool); if (IS_ERR(inject.session)) { ret = PTR_ERR(inject.session); -- Gitee From 3c10b64897ed92f5225e3602c679e62590df2a9f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 20 May 2022 16:24:02 +0300 Subject: [PATCH 239/257] perf inject: Keep some features sections from input file commit 180b3d06263ce70dd0622681237cf5c0555d9ca0 upstream perf inject overwrites feature sections with information from the current machine. It makes more sense to keep original information that describes the machine or software when perf record was run. Example: perf.data from "Desktop" injected on "nuc11" Before: $ perf script --header-only -i perf.data-from-desktop | head -15 # ======== # captured on : Thu May 19 09:55:50 2022 # header version : 1 # data offset : 1208 # data size : 837480 # feat offset : 838688 # hostname : Desktop # os release : 5.13.0-41-generic # perf version : 5.18.rc5.gac837f7ca7ed # arch : x86_64 # nrcpus online : 28 # nrcpus avail : 28 # cpudesc : Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz # cpuid : GenuineIntel,6,85,4 # total memory : 65548656 kB $ perf inject -i perf.data-from-desktop -o injected-perf.data $ perf script --header-only -i injected-perf.data | head -15 # ======== # captured on : Fri May 20 15:06:55 2022 # header version : 1 # data offset : 1208 # data size : 837480 # feat offset : 838688 # hostname : nuc11 # os release : 5.17.5-local # perf version : 5.18.rc5.g0f828fdeb9af # arch : x86_64 # nrcpus online : 8 # nrcpus avail : 8 # cpudesc : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz # cpuid : GenuineIntel,6,140,1 # total memory : 16012124 kB After: $ perf inject -i perf.data-from-desktop -o injected-perf.data $ perf script --header-only -i injected-perf.data | head -15 # ======== # captured on : Fri May 20 15:08:54 2022 # header version : 1 # data offset : 1208 # data size : 837480 # feat offset : 838688 # hostname : Desktop # os release : 5.13.0-41-generic # perf version : 5.18.rc5.gac837f7ca7ed # arch : x86_64 # nrcpus online : 28 # nrcpus avail : 28 # cpudesc : Intel(R) Core(TM) i9-9940X CPU @ 3.30GHz # cpuid : GenuineIntel,6,85,4 # total memory : 65548656 kB Signed-off-by: Adrian Hunter Acked-by: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220520132404.25853-4-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/builtin-inject.c | 129 +++++++++++++++++++++++++++++++++++- tools/perf/util/header.c | 8 +++ tools/perf/util/header.h | 5 ++ 3 files changed, 141 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 0c557e387c04..f8930389d909 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -26,6 +26,8 @@ #include "util/thread.h" #include "util/namespaces.h" +#include + #include #include #include /* To get things like MAP_HUGETLB even on older libc headers */ @@ -53,6 +55,7 @@ struct perf_inject { u64 aux_id; struct list_head samples; struct itrace_synth_opts itrace_synth_opts; + struct perf_file_section secs[HEADER_FEAT_BITS]; }; struct event_entry { @@ -753,6 +756,120 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str, return inject->itrace_synth_opts.vm_tm_corr_args ? 
0 : -ENOMEM; } +static int save_section_info_cb(struct perf_file_section *section, + struct perf_header *ph __maybe_unused, + int feat, int fd __maybe_unused, void *data) +{ + struct perf_inject *inject = data; + + inject->secs[feat] = *section; + return 0; +} + +static int save_section_info(struct perf_inject *inject) +{ + struct perf_header *header = &inject->session->header; + int fd = perf_data__fd(inject->session->data); + + return perf_header__process_sections(header, fd, inject, save_section_info_cb); +} + +static bool keep_feat(int feat) +{ + switch (feat) { + /* Keep original information that describes the machine or software */ + case HEADER_TRACING_DATA: + case HEADER_HOSTNAME: + case HEADER_OSRELEASE: + case HEADER_VERSION: + case HEADER_ARCH: + case HEADER_NRCPUS: + case HEADER_CPUDESC: + case HEADER_CPUID: + case HEADER_TOTAL_MEM: + case HEADER_CPU_TOPOLOGY: + case HEADER_NUMA_TOPOLOGY: + case HEADER_PMU_MAPPINGS: + case HEADER_CACHE: + case HEADER_MEM_TOPOLOGY: + case HEADER_CLOCKID: + case HEADER_BPF_PROG_INFO: + case HEADER_BPF_BTF: + case HEADER_CPU_PMU_CAPS: + case HEADER_CLOCK_DATA: + case HEADER_HYBRID_TOPOLOGY: + case HEADER_HYBRID_CPU_PMU_CAPS: + return true; + /* Information that can be updated */ + case HEADER_BUILD_ID: + case HEADER_CMDLINE: + case HEADER_EVENT_DESC: + case HEADER_BRANCH_STACK: + case HEADER_GROUP_DESC: + case HEADER_AUXTRACE: + case HEADER_STAT: + case HEADER_SAMPLE_TIME: + case HEADER_DIR_FORMAT: + case HEADER_COMPRESSED: + default: + return false; + }; +} + +static int read_file(int fd, u64 offs, void *buf, size_t sz) +{ + ssize_t ret = preadn(fd, buf, sz, offs); + + if (ret < 0) + return -errno; + if ((size_t)ret != sz) + return -EINVAL; + return 0; +} + +static int feat_copy(struct perf_inject *inject, int feat, struct feat_writer *fw) +{ + int fd = perf_data__fd(inject->session->data); + u64 offs = inject->secs[feat].offset; + size_t sz = inject->secs[feat].size; + void *buf = malloc(sz); + int ret; + + if (!buf) + return -ENOMEM; + + ret = read_file(fd, offs, buf, sz); + if (ret) + goto out_free; + + ret = fw->write(fw, buf, sz); +out_free: + free(buf); + return ret; +} + +struct inject_fc { + struct feat_copier fc; + struct perf_inject *inject; +}; + +static int feat_copy_cb(struct feat_copier *fc, int feat, struct feat_writer *fw) +{ + struct inject_fc *inj_fc = container_of(fc, struct inject_fc, fc); + struct perf_inject *inject = inj_fc->inject; + int ret; + + if (!inject->secs[feat].offset || + !keep_feat(feat)) + return 0; + + ret = feat_copy(inject, feat, fw); + if (ret < 0) + return ret; + + return 1; /* Feature section copied */ +} + static int output_fd(struct perf_inject *inject) { return inject->in_place_update ? 
-1 : perf_data__fd(&inject->output); @@ -837,6 +954,11 @@ static int __cmd_inject(struct perf_inject *inject) return ret; if (!inject->is_pipe && !inject->in_place_update) { + struct inject_fc inj_fc = { + .fc.copy = feat_copy_cb, + .inject = inject, + }; + if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); @@ -873,7 +995,7 @@ } session->header.data_offset = output_data_offset; session->header.data_size = inject->bytes_written; - perf_session__write_header(session, session->evlist, fd, true); + perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc); } return ret; @@ -1018,6 +1140,11 @@ int cmd_inject(int argc, const char **argv) if (zstd_init(&(inject.session->zstd_data), 0) < 0) pr_warning("Decompression initialization failed.\n"); + /* Save original section info before feature bits change */ + ret = save_section_info(&inject); + if (ret) + goto out_delete; + if (!data.is_pipe && inject.output.is_pipe) { ret = perf_header__write_pipe(perf_data__fd(&inject.output)); if (ret < 0) { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index e8c6101ea29c..63ed69b480f1 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3593,6 +3593,14 @@ int perf_session__write_header(struct perf_session *session, return perf_session__do_write_header(session, evlist, fd, at_exit, NULL); } +int perf_session__inject_header(struct perf_session *session, + struct evlist *evlist, + int fd, + struct feat_copier *fc) +{ + return perf_session__do_write_header(session, evlist, fd, true, fc); +} + static int perf_header__getbuffer64(struct perf_header *header, int fd, void *buf, size_t size) { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 1f49d5170cff..f40a682c99a5 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -135,6 +135,11 @@ struct feat_copier { int (*copy)(struct feat_copier *fc, int feat, struct feat_writer *fw); }; +int perf_session__inject_header(struct perf_session *session, + struct evlist *evlist, + int fd, + struct feat_copier *fc); + void perf_header__set_feat(struct perf_header *header, int feat); void perf_header__clear_feat(struct perf_header *header, int feat); bool perf_header__has_feat(const struct perf_header *header, int feat); -- Gitee From 10a86b04f01fc8c8d0cf0a6512afc3928dd07f90 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:16 +0530 Subject: [PATCH 240/257] perf header: Record non-CPU PMU capabilities commit 2139f7424819818afc25fbfd0effda4babe235f6 upstream PMUs advertise their capabilities via sysfs attribute files but the perf tool currently parses only core (CPU) or hybrid core PMU capabilities. Add support for recording non-core PMU capabilities in the perf.data header. Note that the newly proposed HEADER_PMU_CAPS replaces the existing HEADER_HYBRID_CPU_PMU_CAPS. Special care is taken for hybrid core PMUs by writing their capabilities first in the perf.data header, to make sure that a new perf.data file read by an old perf tool does not break. 
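For illustration, a consumer can later query an individual capability recorded this way through the perf_env__find_pmu_cap() helper added below; a minimal, hypothetical caller (not part of this patch) might look like:

    #include <stdlib.h>      /* atoi() */
    #include "util/env.h"    /* perf_env__find_pmu_cap() */

    /*
     * Return the "branches" capability of the named PMU, or 0 if the
     * perf.data header did not record one.  The helper hands back a
     * pointer to the value part of the stored "name=value" string.
     */
    static int pmu_max_branches(struct perf_env *env, const char *pmu_name)
    {
    	char *val = perf_env__find_pmu_cap(env, pmu_name, "branches");

    	return val ? atoi(val) : 0;
    }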
Reviewed-by: Kan Liang Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-6-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- .../Documentation/perf.data-file-format.txt | 10 +- tools/perf/builtin-inject.c | 2 +- tools/perf/util/env.c | 60 ++++++- tools/perf/util/env.h | 12 +- tools/perf/util/header.c | 160 ++++++++++-------- tools/perf/util/header.h | 2 +- 6 files changed, 158 insertions(+), 88 deletions(-) diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index e6ff8c898ada..99866b5ea028 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -419,18 +419,20 @@ Example: cpu_core cpu list : 0-15 cpu_atom cpu list : 16-23 - HEADER_HYBRID_CPU_PMU_CAPS = 31, + HEADER_PMU_CAPS = 31, - A list of hybrid CPU PMU capabilities. + List of pmu capabilities (except cpu pmu which is already + covered by HEADER_CPU_PMU_CAPS). Note that hybrid cpu pmu + capabilities are also stored here. struct { u32 nr_pmu; struct { - u32 nr_cpu_pmu_caps; + u32 nr_caps; { char name[]; char value[]; - } [nr_cpu_pmu_caps]; + } [nr_caps]; char pmu_name[]; } [nr_pmu]; }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index f8930389d909..a45fa511d4e8 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -798,7 +798,7 @@ static bool keep_feat(int feat) case HEADER_CPU_PMU_CAPS: case HEADER_CLOCK_DATA: case HEADER_HYBRID_TOPOLOGY: - case HEADER_HYBRID_CPU_PMU_CAPS: + case HEADER_PMU_CAPS: return true; /* Information that can be updated */ case HEADER_BUILD_ID: diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 98f4531fe96b..9a2e9c8a1ea0 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -207,13 +207,13 @@ void perf_env__exit(struct perf_env *env) } zfree(&env->hybrid_nodes); - for (i = 0; i < env->nr_hybrid_cpc_nodes; i++) { - for (j = 0; j < env->hybrid_cpc_nodes[i].nr_cpu_pmu_caps; j++) - zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps[j]); - zfree(&env->hybrid_cpc_nodes[i].cpu_pmu_caps); - zfree(&env->hybrid_cpc_nodes[i].pmu_name); + for (i = 0; i < env->nr_pmus_with_caps; i++) { + for (j = 0; j < env->pmu_caps[i].nr_caps; j++) + zfree(&env->pmu_caps[i].caps[j]); + zfree(&env->pmu_caps[i].caps); + zfree(&env->pmu_caps[i].pmu_name); } - zfree(&env->hybrid_cpc_nodes); + zfree(&env->pmu_caps); } void perf_env__init(struct perf_env *env) @@ -492,3 +492,51 @@ int perf_env__numa_node(struct perf_env *env, int cpu) return cpu >= 0 && cpu < env->nr_numa_map ? 
env->numa_map[cpu] : -1; } + +char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, + const char *cap) +{ + char *cap_eq; + int cap_size; + char **ptr; + int i, j; + + if (!pmu_name || !cap) + return NULL; + + cap_size = strlen(cap); + cap_eq = zalloc(cap_size + 2); + if (!cap_eq) + return NULL; + + memcpy(cap_eq, cap, cap_size); + cap_eq[cap_size] = '='; + + if (!strcmp(pmu_name, "cpu")) { + for (i = 0; i < env->nr_cpu_pmu_caps; i++) { + if (!strncmp(env->cpu_pmu_caps[i], cap_eq, cap_size + 1)) { + free(cap_eq); + return &env->cpu_pmu_caps[i][cap_size + 1]; + } + } + goto out; + } + + for (i = 0; i < env->nr_pmus_with_caps; i++) { + if (strcmp(env->pmu_caps[i].pmu_name, pmu_name)) + continue; + + ptr = env->pmu_caps[i].caps; + + for (j = 0; j < env->pmu_caps[i].nr_caps; j++) { + if (!strncmp(ptr[j], cap_eq, cap_size + 1)) { + free(cap_eq); + return &ptr[j][cap_size + 1]; + } + } + } + +out: + free(cap_eq); + return NULL; +} diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 42e336e5a490..3bc7ee843df4 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -42,10 +42,10 @@ struct hybrid_node { char *cpus; }; -struct hybrid_cpc_node { - int nr_cpu_pmu_caps; +struct pmu_caps { + int nr_caps; unsigned int max_branches; - char **cpu_pmu_caps; + char **caps; char *pmu_name; }; @@ -72,7 +72,7 @@ struct perf_env { int nr_groups; int nr_cpu_pmu_caps; int nr_hybrid_nodes; - int nr_hybrid_cpc_nodes; + int nr_pmus_with_caps; char *cmdline; const char **cmdline_argv; char *sibling_cores; @@ -94,7 +94,7 @@ struct perf_env { u64 clockid_res_ns; struct hybrid_node *hybrid_nodes; - struct hybrid_cpc_node *hybrid_cpc_nodes; + struct pmu_caps *pmu_caps; /* * bpf_info_lock protects bpf rbtrees. This is needed because the * trees are accessed by different threads in perf-top @@ -162,4 +162,6 @@ bool perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node); struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id); int perf_env__numa_node(struct perf_env *env, int cpu); +char *perf_env__find_pmu_cap(struct perf_env *env, const char *pmu_name, + const char *cap); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 63ed69b480f1..3b04ddc2f124 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1465,18 +1465,13 @@ static int write_compressed(struct feat_fd *ff __maybe_unused, return do_write(ff, &(ff->ph->env.comp_mmap_len), sizeof(ff->ph->env.comp_mmap_len)); } -static int write_per_cpu_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, - bool write_pmu) +static int __write_pmu_caps(struct feat_fd *ff, struct perf_pmu *pmu, + bool write_pmu) { struct perf_pmu_caps *caps = NULL; - int nr_caps; int ret; - nr_caps = perf_pmu__caps_parse(pmu); - if (nr_caps < 0) - return nr_caps; - - ret = do_write(ff, &nr_caps, sizeof(nr_caps)); + ret = do_write(ff, &pmu->nr_caps, sizeof(pmu->nr_caps)); if (ret < 0) return ret; @@ -1503,33 +1498,60 @@ static int write_cpu_pmu_caps(struct feat_fd *ff, struct evlist *evlist __maybe_unused) { struct perf_pmu *cpu_pmu = perf_pmu__find("cpu"); + int ret; if (!cpu_pmu) return -ENOENT; - return write_per_cpu_pmu_caps(ff, cpu_pmu, false); + ret = perf_pmu__caps_parse(cpu_pmu); + if (ret < 0) + return ret; + + return __write_pmu_caps(ff, cpu_pmu, false); } -static int write_hybrid_cpu_pmu_caps(struct feat_fd *ff, - struct evlist *evlist __maybe_unused) +static int write_pmu_caps(struct feat_fd *ff, + struct evlist *evlist __maybe_unused) { - struct perf_pmu 
*pmu; - u32 nr_pmu = perf_pmu__hybrid_pmu_num(); + struct perf_pmu *pmu = NULL; + int nr_pmu = 0; int ret; - if (nr_pmu == 0) - return -ENOENT; + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name || !strcmp(pmu->name, "cpu") || + perf_pmu__caps_parse(pmu) <= 0) + continue; + nr_pmu++; + } ret = do_write(ff, &nr_pmu, sizeof(nr_pmu)); if (ret < 0) return ret; + if (!nr_pmu) + return 0; + + /* + * Write hybrid pmu caps first to maintain compatibility with + * older perf tool. + */ + pmu = NULL; perf_pmu__for_each_hybrid_pmu(pmu) { - ret = write_per_cpu_pmu_caps(ff, pmu, true); + ret = __write_pmu_caps(ff, pmu, true); if (ret < 0) return ret; } + pmu = NULL; + while ((pmu = perf_pmu__scan(pmu))) { + if (!pmu->name || !strcmp(pmu->name, "cpu") || + !pmu->nr_caps || perf_pmu__is_hybrid(pmu->name)) + continue; + + ret = __write_pmu_caps(ff, pmu, true); + if (ret < 0) + return ret; + } return 0; } @@ -1965,8 +1987,7 @@ static void print_compressed(struct feat_fd *ff, FILE *fp) ff->ph->env.comp_level, ff->ph->env.comp_ratio); } -static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, - char *pmu_name) +static void __print_pmu_caps(FILE *fp, int nr_caps, char **caps, char *pmu_name) { const char *delimiter = ""; int i; @@ -1978,7 +1999,7 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, fprintf(fp, "# %s pmu capabilities: ", pmu_name); for (i = 0; i < nr_caps; i++) { - fprintf(fp, "%s%s", delimiter, cpu_pmu_caps[i]); + fprintf(fp, "%s%s", delimiter, caps[i]); delimiter = ", "; } @@ -1987,19 +2008,18 @@ static void print_per_cpu_pmu_caps(FILE *fp, int nr_caps, char **cpu_pmu_caps, static void print_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) { - print_per_cpu_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, - ff->ph->env.cpu_pmu_caps, (char *)"cpu"); + __print_pmu_caps(fp, ff->ph->env.nr_cpu_pmu_caps, + ff->ph->env.cpu_pmu_caps, (char *)"cpu"); } -static void print_hybrid_cpu_pmu_caps(struct feat_fd *ff, FILE *fp) +static void print_pmu_caps(struct feat_fd *ff, FILE *fp) { - struct hybrid_cpc_node *n; + struct pmu_caps *pmu_caps; - for (int i = 0; i < ff->ph->env.nr_hybrid_cpc_nodes; i++) { - n = &ff->ph->env.hybrid_cpc_nodes[i]; - print_per_cpu_pmu_caps(fp, n->nr_cpu_pmu_caps, - n->cpu_pmu_caps, - n->pmu_name); + for (int i = 0; i < ff->ph->env.nr_pmus_with_caps; i++) { + pmu_caps = &ff->ph->env.pmu_caps[i]; + __print_pmu_caps(fp, pmu_caps->nr_caps, pmu_caps->caps, + pmu_caps->pmu_name); } } @@ -3114,27 +3134,26 @@ static int process_compressed(struct feat_fd *ff, return 0; } -static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, - char ***cpu_pmu_caps, - unsigned int *max_branches) +static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps, + char ***caps, unsigned int *max_branches) { char *name, *value, *ptr; - u32 nr_caps, i; + u32 nr_pmu_caps, i; - *nr_cpu_pmu_caps = 0; - *cpu_pmu_caps = NULL; + *nr_caps = 0; + *caps = NULL; - if (do_read_u32(ff, &nr_caps)) + if (do_read_u32(ff, &nr_pmu_caps)) return -1; - if (!nr_caps) + if (!nr_pmu_caps) return 0; - *cpu_pmu_caps = zalloc(sizeof(char *) * nr_caps); - if (!*cpu_pmu_caps) + *caps = zalloc(sizeof(char *) * nr_pmu_caps); + if (!*caps) return -1; - for (i = 0; i < nr_caps; i++) { + for (i = 0; i < nr_pmu_caps; i++) { name = do_read_string(ff); if (!name) goto error; @@ -3146,7 +3165,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, if (asprintf(&ptr, "%s=%s", name, value) < 0) goto free_value; - (*cpu_pmu_caps)[i] = ptr; + (*caps)[i] = 
ptr; if (!strcmp(name, "branches")) *max_branches = atoi(value); @@ -3154,7 +3173,7 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free(value); free(name); } - *nr_cpu_pmu_caps = nr_caps; + *nr_caps = nr_pmu_caps; return 0; free_value: @@ -3163,29 +3182,28 @@ static int process_per_cpu_pmu_caps(struct feat_fd *ff, int *nr_cpu_pmu_caps, free(name); error: for (; i > 0; i--) - free((*cpu_pmu_caps)[i - 1]); - free(*cpu_pmu_caps); - *cpu_pmu_caps = NULL; - *nr_cpu_pmu_caps = 0; + free((*caps)[i - 1]); + free(*caps); + *caps = NULL; + *nr_caps = 0; return -1; } static int process_cpu_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - int ret = process_per_cpu_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, - &ff->ph->env.cpu_pmu_caps, - &ff->ph->env.max_branches); + int ret = __process_pmu_caps(ff, &ff->ph->env.nr_cpu_pmu_caps, + &ff->ph->env.cpu_pmu_caps, + &ff->ph->env.max_branches); if (!ret && !ff->ph->env.cpu_pmu_caps) pr_debug("cpu pmu capabilities not available\n"); return ret; } -static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, - void *data __maybe_unused) +static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused) { - struct hybrid_cpc_node *nodes; + struct pmu_caps *pmu_caps; u32 nr_pmu, i; int ret; int j; @@ -3194,45 +3212,45 @@ static int process_hybrid_cpu_pmu_caps(struct feat_fd *ff, return -1; if (!nr_pmu) { - pr_debug("hybrid cpu pmu capabilities not available\n"); + pr_debug("pmu capabilities not available\n"); return 0; } - nodes = zalloc(sizeof(*nodes) * nr_pmu); - if (!nodes) + pmu_caps = zalloc(sizeof(*pmu_caps) * nr_pmu); + if (!pmu_caps) return -ENOMEM; for (i = 0; i < nr_pmu; i++) { - struct hybrid_cpc_node *n = &nodes[i]; - - ret = process_per_cpu_pmu_caps(ff, &n->nr_cpu_pmu_caps, - &n->cpu_pmu_caps, - &n->max_branches); + ret = __process_pmu_caps(ff, &pmu_caps[i].nr_caps, + &pmu_caps[i].caps, + &pmu_caps[i].max_branches); if (ret) goto err; - n->pmu_name = do_read_string(ff); - if (!n->pmu_name) { + pmu_caps[i].pmu_name = do_read_string(ff); + if (!pmu_caps[i].pmu_name) { ret = -1; goto err; } - if (!n->nr_cpu_pmu_caps) - pr_debug("%s pmu capabilities not available\n", n->pmu_name); + if (!pmu_caps[i].nr_caps) { + pr_debug("%s pmu capabilities not available\n", + pmu_caps[i].pmu_name); + } } - ff->ph->env.nr_hybrid_cpc_nodes = nr_pmu; - ff->ph->env.hybrid_cpc_nodes = nodes; + ff->ph->env.nr_pmus_with_caps = nr_pmu; + ff->ph->env.pmu_caps = pmu_caps; return 0; err: for (i = 0; i < nr_pmu; i++) { - for (j = 0; j < nodes[i].nr_cpu_pmu_caps; j++) - free(nodes[i].cpu_pmu_caps[j]); - free(nodes[i].cpu_pmu_caps); - free(nodes[i].pmu_name); + for (j = 0; j < pmu_caps[i].nr_caps; j++) + free(pmu_caps[i].caps[j]); + free(pmu_caps[i].caps); + free(pmu_caps[i].pmu_name); } - free(nodes); + free(pmu_caps); return ret; } @@ -3296,7 +3314,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPR(CPU_PMU_CAPS, cpu_pmu_caps, false), FEAT_OPR(CLOCK_DATA, clock_data, false), FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true), - FEAT_OPR(HYBRID_CPU_PMU_CAPS, hybrid_cpu_pmu_caps, false), + FEAT_OPR(PMU_CAPS, pmu_caps, false), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index f40a682c99a5..5db0f8fc074c 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -46,7 +46,7 @@ enum { HEADER_CPU_PMU_CAPS, HEADER_CLOCK_DATA, HEADER_HYBRID_TOPOLOGY, - HEADER_HYBRID_CPU_PMU_CAPS, + HEADER_PMU_CAPS, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; 
-- Gitee From 74eb12fb014cace9a3ad820596dc410208ef4aa9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 30 Nov 2020 15:11:10 -0300 Subject: [PATCH 241/257] perf evlist: Use the right prefix for 'struct evlist' raw samples methods commit 44d2a5573665ab5dfb72572e43184388d15d695e upstream perf_evlist__ is for 'struct perf_evlist' methods, in tools/lib/perf/, go on completing this split. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/s390-sample-raw.c | 3 +-- tools/perf/util/sample-raw.c | 4 ++-- tools/perf/util/sample-raw.h | 7 ++----- tools/perf/util/session.c | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c index 05b43ab4eeef..8626ae40d058 100644 --- a/tools/perf/util/s390-sample-raw.c +++ b/tools/perf/util/s390-sample-raw.c @@ -197,8 +197,7 @@ static void s390_cpumcfdg_dump(struct perf_sample *sample) * its raw data. * The function is only invoked when the dump flag -D is set. */ -void perf_evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, - struct perf_sample *sample) +void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, struct perf_sample *sample) { struct evsel *ev_bc000; diff --git a/tools/perf/util/sample-raw.c b/tools/perf/util/sample-raw.c index e84bbe0e441a..cde5cd3ce49b 100644 --- a/tools/perf/util/sample-raw.c +++ b/tools/perf/util/sample-raw.c @@ -9,10 +9,10 @@ * Check platform the perf data file was created on and perform platform * specific interpretation. */ -void perf_evlist__init_trace_event_sample_raw(struct evlist *evlist) +void evlist__init_trace_event_sample_raw(struct evlist *evlist) { const char *arch_pf = perf_env__arch(evlist->env); if (arch_pf && !strcmp("s390", arch_pf)) - evlist->trace_event_sample_raw = perf_evlist__s390_sample_raw; + evlist->trace_event_sample_raw = evlist__s390_sample_raw; } diff --git a/tools/perf/util/sample-raw.h b/tools/perf/util/sample-raw.h index afe1491a117e..4be84a5510cf 100644 --- a/tools/perf/util/sample-raw.h +++ b/tools/perf/util/sample-raw.h @@ -6,9 +6,6 @@ struct evlist; union perf_event; struct perf_sample; -void perf_evlist__s390_sample_raw(struct evlist *evlist, - union perf_event *event, - struct perf_sample *sample); - -void perf_evlist__init_trace_event_sample_raw(struct evlist *evlist); +void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, struct perf_sample *sample); +void evlist__init_trace_event_sample_raw(struct evlist *evlist); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 782896153efe..dbf1d7611bf8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -223,7 +223,7 @@ struct perf_session *__perf_session__new(struct perf_data *data, perf_session__set_comm_exec(session); } - perf_evlist__init_trace_event_sample_raw(session->evlist); + evlist__init_trace_event_sample_raw(session->evlist); /* Open the directory data. 
*/ if (data->is_dir) { -- Gitee From 17e2ffdf44750f429331898fedf539c49ef258e5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 26 Sep 2019 15:26:39 -0300 Subject: [PATCH 242/257] tools arch x86: Grab a copy of the file containing the MSR numbers commit 444e2ff34df8f631cd83ae73bb56ef13cfb84b34 upstream We'll use it to generate a table and then convert the msr:{read,write}_msr 'msr' option in things like perf trace, script, etc. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-y1f4s0y1s43d4drh7pd2huzn@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/arch/x86/include/asm/msr-index.h | 857 +++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 2 files changed, 858 insertions(+) create mode 100644 tools/arch/x86/include/asm/msr-index.h diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h new file mode 100644 index 000000000000..20ce682a2540 --- /dev/null +++ b/tools/arch/x86/include/asm/msr-index.h @@ -0,0 +1,857 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_MSR_INDEX_H +#define _ASM_X86_MSR_INDEX_H + +#include <linux/bits.h> + +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) +#define EFER_FFXSR (1<<_EFER_FFXSR) + +/* Intel MSRs. 
Some also available on other CPUs */ + +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ +#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ + +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ + +#define MSR_PPIN_CTL 0x0000004e +#define MSR_PPIN 0x0000004f + +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd +#define MSR_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 +#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) + +#define MSR_IA32_UMWAIT_CONTROL 0xe1 +#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) +#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) + +#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 +#define NHM_C3_AUTO_DEMOTE (1UL << 25) +#define NHM_C1_AUTO_DEMOTE (1UL << 26) +#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C1_AUTO_UNDEMOTE (1UL << 28) + +#define MSR_MTRRcap 0x000000fe + +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ +#define ARCH_CAP_SSB_NO BIT(4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ +#define ARCH_CAP_MDS_NO BIT(5) /* + * Not susceptible to + * Microarchitectural Data + * Sampling (MDS) vulnerabilities. + */ + +#define MSR_IA32_FLUSH_CMD 0x0000010b +#define L1D_FLUSH BIT(0) /* + * Writeback and invalidate the + * L1 data cache. + */ + +#define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 + +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af + +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 
0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_PEBS_DATA_CFG 0x000003f2 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 + +#define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_EN BIT(4) +#define RTIT_CTL_FUP_ON_PTW BIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_EN BIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVF BIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET) +#define MSR_IA32_RTIT_ADDR0_A 0x00000580 +#define MSR_IA32_RTIT_ADDR0_B 0x00000581 +#define MSR_IA32_RTIT_ADDR1_A 0x00000582 +#define MSR_IA32_RTIT_ADDR1_B 0x00000583 +#define MSR_IA32_RTIT_ADDR2_A 0x00000584 +#define MSR_IA32_RTIT_ADDR2_B 0x00000585 +#define MSR_IA32_RTIT_ADDR3_A 0x00000586 +#define MSR_IA32_RTIT_ADDR3_B 0x00000587 +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_CR_PAT 0x00000277 + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +/* DEBUGCTLMSR bits (others vary by model): */ +#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 +#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_TR (1UL << 6) +#define DEBUGCTLMSR_BTS (1UL << 7) +#define DEBUGCTLMSR_BTINT (1UL << 8) +#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) +#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) +#define 
DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) +#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 +#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) + +#define MSR_PEBS_FRONTEND 0x000003f7 + +#define MSR_IA32_POWER_CTL 0x000001fc + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff +#define MSR_PKG_C2_RESIDENCY 0x0000060d +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +/* Interrupt Response Limit */ +#define MSR_PKGC3_IRTL 0x0000060a +#define MSR_PKGC6_IRTL 0x0000060b +#define MSR_PKGC7_IRTL 0x0000060c +#define MSR_PKGC8_IRTL 0x00000633 +#define MSR_PKGC9_IRTL 0x00000634 +#define MSR_PKGC10_IRTL 0x00000635 + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + +#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D + +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + +#define MSR_CORE_C1_RES 0x00000660 +#define MSR_MODULE_C6_RES_MS 0x00000664 + +#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 +#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 + +#define MSR_ATOM_CORE_RATIOS 0x0000066a +#define MSR_ATOM_CORE_VIDS 0x0000066b +#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c +#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d + + +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + +/* Hardware P state interface */ +#define MSR_PPERF 0x0000064e +#define MSR_PERF_LIMIT_REASONS 0x0000064f +#define MSR_PM_ENABLE 0x00000770 +#define MSR_HWP_CAPABILITIES 0x00000771 +#define MSR_HWP_REQUEST_PKG 0x00000772 +#define MSR_HWP_INTERRUPT 0x00000773 +#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_STATUS 0x00000777 + +/* CPUID.6.EAX */ +#define HWP_BASE_BIT (1<<7) +#define HWP_NOTIFICATIONS_BIT (1<<8) +#define HWP_ACTIVITY_WINDOW_BIT (1<<9) +#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) +#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) + +/* IA32_HWP_CAPABILITIES */ +#define HWP_HIGHEST_PERF(x) (((x) 
>> 0) & 0xff) +#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff) +#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff) +#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) + +/* IA32_HWP_REQUEST */ +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_EPP_PERFORMANCE 0x00 +#define HWP_EPP_BALANCE_PERFORMANCE 0x80 +#define HWP_EPP_BALANCE_POWERSAVE 0xC0 +#define HWP_EPP_POWERSAVE 0xFF +#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) + +/* IA32_HWP_STATUS */ +#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) + +/* IA32_HWP_INTERRUPT */ +#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) + +#define MSR_AMD64_MC0_MASK 0xc0010044 + +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +#define MSR_KNC_PERFCTR0 0x00000020 +#define MSR_KNC_PERFCTR1 0x00000021 +#define MSR_KNC_EVNTSEL0 0x00000028 +#define MSR_KNC_EVNTSEL1 0x00000029 + +/* Alternative perfctr range with full access. */ +#define MSR_IA32_PMC0 0x000004c1 + +/* Auto-reload via MSR instead of DS area */ +#define MSR_RELOAD_PMC0 0x000014c1 +#define MSR_RELOAD_FIXED_CTR0 0x00001309 + +/* + * AMD64 MSRs. Not complete. See the architecture manual for a more + * complete list. + */ +#define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 +#define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_CPUID_FN_1 0xc0011004 +#define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD_PERF_CTL 0xc0010062 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_LS_CFG 0xc0011020 +#define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG2 0xc001102a +#define MSR_AMD64_IBSFETCHCTL 0xc0011030 +#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 +#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1) [remainder of msr-index.h and the check-headers.sh hunk not preserved in this copy] -- Gitee From [commit id missing] Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:15:09 -0500 Subject: [PATCH 243/257] perf report: Add support to print a textual representation of IBS raw sample data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 291dcb98d7ee5cd719f4c5991d977794b1829c16 upstream Perf records IBS (Instruction Based Sampling) extra sample data when 'perf record --raw-samples' is used with an IBS-compatible event, on a machine that supports IBS. IBS support is indicated in CPUID_Fn80000001_ECX bit #10. Up until now, users have been able to see the extra sample data solely in raw hex format using 'perf report --dump-raw-trace'. From there, users could decode the data either manually, or by using an external script. 
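That CPUID check can be done from user space along these lines (an illustrative sketch only, not code from this patch; the helper name is made up):

    #include <cpuid.h>
    #include <stdbool.h>

    /* Probe CPUID_Fn80000001_ECX[10], the IBS capability bit. */
    static bool cpu_has_amd_ibs(void)
    {
    	unsigned int eax, ebx, ecx, edx;

    	/* __get_cpuid() returns 0 if the extended leaf is unsupported. */
    	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
    		return false;

    	return ecx & (1U << 10);
    }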
Enable the built-in 'perf report --dump-raw-trace' to do the decoding
of the extra sample data bits, so manual or external script decoding
isn't necessary.

Example usage:

$ sudo perf record -c 10000001 -a --raw-samples -e ibs_fetch/rand_en=1/,ibs_op/cnt_ctl=1/ -C 0,1 taskset -c 0,1 7za b -mmt2 | perf report --dump-raw-trace

Stdout contains IBS Fetch samples, e.g.:

ibs_fetch_ctl:	02170007ffffffff MaxCnt 1048560 Cnt 1048560 Lat 7 En 1 Val 1 Comp 1 IcMiss 0 PhyAddrValid 1 L1TlbPgSz 4KB L1TlbMiss 0 L2TlbMiss 0 RandEn 1 L2Miss 0
IbsFetchLinAd:	000056016b2ead40
IbsFetchPhysAd:	000000115cedfd40
ic_ibs_ext_ctl:	0000000000000000 IbsItlbRefillLat 0

..and IBS Op samples, e.g.:

ibs_op_ctl:	0000009e009e8968 MaxCnt 10000000 En 1 Val 1 CntCtl 1=uOps CurCnt 158
IbsOpRip:	000056016b2ea73d
ibs_op_data:	00000000000b0002 CompToRetCtr 2 TagToRetCtr 11 BrnRet 0 RipInvalid 0 BrnFuse 0 Microcode 0
ibs_op_data2:	0000000000000002 CacheHitSt 0=M-state RmtNode 0 DataSrc 2=Local node cache
ibs_op_data3:	0000000000c60002 LdOp 0 StOp 1 DcL1TlbMiss 0 DcL2TlbMiss 0 DcL1TlbHit2M 0 DcL1TlbHit1G 0 DcL2TlbHit2M 0 DcMiss 0 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0 DcMissNoMabAlloc 0 DcLinAddrValid 1 DcPhyAddrValid 1 DcL2TlbHit1G 0 L2Miss 0 SwPf 0 OpMemWidth 4 bytes OpDcMissOpenMemReqs 0 DcMissLat 0 TlbRefillLat 0
IbsDCLinAd:	00007f133c319ce0
IbsDCPhysAd:	0000000270485ce0

Committer notes:

Fixed up this:

util/amd-sample-raw.c: In function ‘evlist__amd_sample_raw’:
util/amd-sample-raw.c:125:42: error: ‘ bytes’ directive output may be truncated writing 6 bytes into a region of size between 4 and 7 [-Werror=format-truncation=]
  125 | " OpMemWidth %2d bytes", 1 << (reg.op_mem_width - 1));
      |   ^~~~~~
In file included from /usr/include/stdio.h:866,
                 from util/amd-sample-raw.c:7:
/usr/include/bits/stdio2.h:71:10: note: ‘__builtin___snprintf_chk’ output between 21 and 24 bytes into a destination of size 21
   71 | return __builtin___snprintf_chk (__s, __n, __USE_FORTIFY_LEVEL - 1,
      |        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   72 | __glibc_objsize (__s), __fmt,
      | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   73 | __va_arg_pack ());
      | ~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors

As that %2d won't limit the number of chars to 2, just state that 2 is
the minimal width:

$ cat printf.c
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
	char bf[64];
	int len = snprintf(bf, sizeof(bf), "%2d", atoi(argv[1]));

	printf("strlen(%s): %u\n", bf, len);
	return 0;
}
$ ./printf 1
strlen( 1): 2
$ ./printf 12
strlen(12): 2
$ ./printf 123
strlen(123): 3
$ ./printf 1234
strlen(1234): 4
$ ./printf 12345
strlen(12345): 5
$ ./printf 123456
strlen(123456): 6
$

And since we probably don't want that output to be truncated, just
assume the worst case, as the compiler did, and add a few more chars
to that buffer.

Also use sizeof(var) instead of sizeof(dup-of-wanted-format-string) to
avoid bugs when changing one but not the other.

I also had to change this:

-#include <asm/amd-ibs.h>
+#include "../../arch/x86/include/asm/amd-ibs.h"

To make it build on other architectures, just like intel-pt does.

[Backport changes]

1. In tools/perf/util/amd-sample-raw.c, inside evlist__amd_sample_raw(),
the function name evlist__event2evsel() was manually changed back to
perf_evlist__event2evsel() to match the current kernel. The upstream
commit 3ccf8a7b66b6bf only renames the function without modifying its
logic, but backporting it would require over 20 extra patches. So the
change was made manually to avoid that overhead.
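As background for the CPUID_Fn80000001_ECX check mentioned above, IBS
availability can be probed from user space; a minimal sketch
(illustration only, not part of this patch; uses GCC's <cpuid.h>
helper):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID Fn8000_0001: ECX bit 10 advertises IBS on AMD parts */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("IBS %ssupported\n", (ecx & (1U << 10)) ? "" : "not ");
	return 0;
}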
Signed-off-by: Kim Phillips Cc: Alexander Shishkin Cc: Boris Ostrovsky Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Joao Martins Cc: Konrad Rzeszutek Wilk Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Link: https //lore.kernel.org/r/20210817221509.88391-4-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/Build | 1 + tools/perf/util/amd-sample-raw.c | 289 +++++++++++++++++++++++++++++++ tools/perf/util/sample-raw.c | 8 + tools/perf/util/sample-raw.h | 6 +- 4 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 tools/perf/util/amd-sample-raw.c diff --git a/tools/perf/util/Build b/tools/perf/util/Build index ae0dafa75a69..df4219f7615a 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -57,6 +57,7 @@ perf-y += pstack.o perf-y += session.o perf-y += sample-raw.o perf-y += s390-sample-raw.o +perf-y += amd-sample-raw.o perf-$(CONFIG_TRACE) += syscalltbl.o perf-y += ordered-events.o perf-y += namespaces.o diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c new file mode 100644 index 000000000000..2aa349b2e944 --- /dev/null +++ b/tools/perf/util/amd-sample-raw.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD specific. Provide textual annotation for IBS raw sample data. + */ + +#include +#include +#include +#include + +#include +#include "../../arch/x86/include/asm/amd-ibs.h" + +#include "debug.h" +#include "session.h" +#include "evlist.h" +#include "sample-raw.h" +#include "pmu-events/pmu-events.h" + +static u32 cpu_family, cpu_model, ibs_fetch_type, ibs_op_type; + +static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) +{ + const char * const ic_miss_strs[] = { + " IcMiss 0", + " IcMiss 1", + }; + const char * const l1tlb_pgsz_strs[] = { + " L1TlbPgSz 4KB", + " L1TlbPgSz 2MB", + " L1TlbPgSz 1GB", + " L1TlbPgSz RESERVED" + }; + const char * const l1tlb_pgsz_strs_erratum1347[] = { + " L1TlbPgSz 4KB", + " L1TlbPgSz 16KB", + " L1TlbPgSz 2MB", + " L1TlbPgSz 1GB" + }; + const char *ic_miss_str = NULL; + const char *l1tlb_pgsz_str = NULL; + + if (cpu_family == 0x19 && cpu_model < 0x10) { + /* + * Erratum #1238 workaround is to ignore MSRC001_1030[IbsIcMiss] + * Erratum #1347 workaround is to use table provided in erratum + */ + if (reg.phy_addr_valid) + l1tlb_pgsz_str = l1tlb_pgsz_strs_erratum1347[reg.l1tlb_pgsz]; + } else { + if (reg.phy_addr_valid) + l1tlb_pgsz_str = l1tlb_pgsz_strs[reg.l1tlb_pgsz]; + ic_miss_str = ic_miss_strs[reg.ic_miss]; + } + + printf("ibs_fetch_ctl:\t%016llx MaxCnt %7d Cnt %7d Lat %5d En %d Val %d Comp %d%s " + "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s\n", + reg.val, reg.fetch_maxcnt << 4, reg.fetch_cnt << 4, reg.fetch_lat, + reg.fetch_en, reg.fetch_val, reg.fetch_comp, ic_miss_str ? : "", + reg.phy_addr_valid, l1tlb_pgsz_str ? : "", reg.l1tlb_miss, reg.l2tlb_miss, + reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : ""); +} + +static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg) +{ + printf("ic_ibs_ext_ctl:\t%016llx IbsItlbRefillLat %3d\n", reg.val, reg.itlb_refill_lat); +} + +static void pr_ibs_op_ctl(union ibs_op_ctl reg) +{ + printf("ibs_op_ctl:\t%016llx MaxCnt %9d En %d Val %d CntCtl %d=%s CurCnt %9d\n", + reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, reg.op_en, reg.op_val, + reg.cnt_ctl, reg.cnt_ctl ? 
"uOps" : "cycles", reg.opcurcnt); +} + +static void pr_ibs_op_data(union ibs_op_data reg) +{ + printf("ibs_op_data:\t%016llx CompToRetCtr %5d TagToRetCtr %5d%s%s%s BrnRet %d " + " RipInvalid %d BrnFuse %d Microcode %d\n", + reg.val, reg.comp_to_ret_ctr, reg.tag_to_ret_ctr, + reg.op_brn_ret ? (reg.op_return ? " OpReturn 1" : " OpReturn 0") : "", + reg.op_brn_ret ? (reg.op_brn_taken ? " OpBrnTaken 1" : " OpBrnTaken 0") : "", + reg.op_brn_ret ? (reg.op_brn_misp ? " OpBrnMisp 1" : " OpBrnMisp 0") : "", + reg.op_brn_ret, reg.op_rip_invalid, reg.op_brn_fuse, reg.op_microcode); +} + +static void pr_ibs_op_data2(union ibs_op_data2 reg) +{ + static const char * const data_src_str[] = { + "", + " DataSrc 1=(reserved)", + " DataSrc 2=Local node cache", + " DataSrc 3=DRAM", + " DataSrc 4=Remote node cache", + " DataSrc 5=(reserved)", + " DataSrc 6=(reserved)", + " DataSrc 7=Other" + }; + + printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val, + reg.data_src == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State " + : "CacheHitSt 0=M-state ") : "", + reg.rmt_node, data_src_str[reg.data_src]); +} + +static void pr_ibs_op_data3(union ibs_op_data3 reg) +{ + char l2_miss_str[sizeof(" L2Miss _")] = ""; + char op_mem_width_str[sizeof(" OpMemWidth _____ bytes")] = ""; + char op_dc_miss_open_mem_reqs_str[sizeof(" OpDcMissOpenMemReqs __")] = ""; + + /* + * Erratum #1293 + * Ignore L2Miss and OpDcMissOpenMemReqs (and opdata2) if DcMissNoMabAlloc or SwPf set + */ + if (!(cpu_family == 0x19 && cpu_model < 0x10 && (reg.dc_miss_no_mab_alloc || reg.sw_pf))) { + snprintf(l2_miss_str, sizeof(l2_miss_str), " L2Miss %d", reg.l2_miss); + snprintf(op_dc_miss_open_mem_reqs_str, sizeof(op_dc_miss_open_mem_reqs_str), + " OpDcMissOpenMemReqs %2d", reg.op_dc_miss_open_mem_reqs); + } + + if (reg.op_mem_width) + snprintf(op_mem_width_str, sizeof(op_mem_width_str), + " OpMemWidth %2d bytes", 1 << (reg.op_mem_width - 1)); + + printf("ibs_op_data3:\t%016llx LdOp %d StOp %d DcL1TlbMiss %d DcL2TlbMiss %d " + "DcL1TlbHit2M %d DcL1TlbHit1G %d DcL2TlbHit2M %d DcMiss %d DcMisAcc %d " + "DcWcMemAcc %d DcUcMemAcc %d DcLockedOp %d DcMissNoMabAlloc %d DcLinAddrValid %d " + "DcPhyAddrValid %d DcL2TlbHit1G %d%s SwPf %d%s%s DcMissLat %5d TlbRefillLat %5d\n", + reg.val, reg.ld_op, reg.st_op, reg.dc_l1tlb_miss, reg.dc_l2tlb_miss, + reg.dc_l1tlb_hit_2m, reg.dc_l1tlb_hit_1g, reg.dc_l2tlb_hit_2m, reg.dc_miss, + reg.dc_mis_acc, reg.dc_wc_mem_acc, reg.dc_uc_mem_acc, reg.dc_locked_op, + reg.dc_miss_no_mab_alloc, reg.dc_lin_addr_valid, reg.dc_phy_addr_valid, + reg.dc_l2_tlb_hit_1g, l2_miss_str, reg.sw_pf, op_mem_width_str, + op_dc_miss_open_mem_reqs_str, reg.dc_miss_lat, reg.tlb_refill_lat); +} + +/* + * IBS Op/Execution MSRs always saved, in order, are: + * IBS_OP_CTL, IBS_OP_RIP, IBS_OP_DATA, IBS_OP_DATA2, + * IBS_OP_DATA3, IBS_DC_LINADDR, IBS_DC_PHYSADDR, BP_IBSTGT_RIP + */ +static void amd_dump_ibs_op(struct perf_sample *sample) +{ + struct perf_ibs_data *data = sample->raw_data; + union ibs_op_ctl *op_ctl = (union ibs_op_ctl *)data->data; + __u64 *rip = (__u64 *)op_ctl + 1; + union ibs_op_data *op_data = (union ibs_op_data *)(rip + 1); + union ibs_op_data3 *op_data3 = (union ibs_op_data3 *)(rip + 3); + + pr_ibs_op_ctl(*op_ctl); + if (!op_data->op_rip_invalid) + printf("IbsOpRip:\t%016llx\n", *rip); + pr_ibs_op_data(*op_data); + /* + * Erratum #1293: ignore op_data2 if DcMissNoMabAlloc or SwPf are set + */ + if (!(cpu_family == 0x19 && cpu_model < 0x10 && + (op_data3->dc_miss_no_mab_alloc || op_data3->sw_pf))) + pr_ibs_op_data2(*(union ibs_op_data2 
*)(rip + 2)); + pr_ibs_op_data3(*op_data3); + if (op_data3->dc_lin_addr_valid) + printf("IbsDCLinAd:\t%016llx\n", *(rip + 4)); + if (op_data3->dc_phy_addr_valid) + printf("IbsDCPhysAd:\t%016llx\n", *(rip + 5)); + if (op_data->op_brn_ret && *(rip + 6)) + printf("IbsBrTarget:\t%016llx\n", *(rip + 6)); +} + +/* + * IBS Fetch MSRs always saved, in order, are: + * IBS_FETCH_CTL, IBS_FETCH_LINADDR, IBS_FETCH_PHYSADDR, IC_IBS_EXTD_CTL + */ +static void amd_dump_ibs_fetch(struct perf_sample *sample) +{ + struct perf_ibs_data *data = sample->raw_data; + union ibs_fetch_ctl *fetch_ctl = (union ibs_fetch_ctl *)data->data; + __u64 *addr = (__u64 *)fetch_ctl + 1; + union ic_ibs_extd_ctl *extd_ctl = (union ic_ibs_extd_ctl *)addr + 2; + + pr_ibs_fetch_ctl(*fetch_ctl); + printf("IbsFetchLinAd:\t%016llx\n", *addr++); + if (fetch_ctl->phy_addr_valid) + printf("IbsFetchPhysAd:\t%016llx\n", *addr); + pr_ic_ibs_extd_ctl(*extd_ctl); +} + +/* + * Test for enable and valid bits in captured control MSRs. + */ +static bool is_valid_ibs_fetch_sample(struct perf_sample *sample) +{ + struct perf_ibs_data *data = sample->raw_data; + union ibs_fetch_ctl *fetch_ctl = (union ibs_fetch_ctl *)data->data; + + if (fetch_ctl->fetch_en && fetch_ctl->fetch_val) + return true; + + return false; +} + +static bool is_valid_ibs_op_sample(struct perf_sample *sample) +{ + struct perf_ibs_data *data = sample->raw_data; + union ibs_op_ctl *op_ctl = (union ibs_op_ctl *)data->data; + + if (op_ctl->op_en && op_ctl->op_val) + return true; + + return false; +} + +/* AMD vendor specific raw sample function. Check for PERF_RECORD_SAMPLE events + * and if the event was triggered by IBS, display its raw data with decoded text. + * The function is only invoked when the dump flag -D is set. + */ +void evlist__amd_sample_raw(struct evlist *evlist, union perf_event *event, + struct perf_sample *sample) +{ + struct evsel *evsel; + + if (event->header.type != PERF_RECORD_SAMPLE || !sample->raw_size) + return; + + evsel = perf_evlist__event2evsel(evlist, event); + if (!evsel) + return; + + if (evsel->core.attr.type == ibs_fetch_type) { + if (!is_valid_ibs_fetch_sample(sample)) { + pr_debug("Invalid raw IBS Fetch MSR data encountered\n"); + return; + } + amd_dump_ibs_fetch(sample); + } else if (evsel->core.attr.type == ibs_op_type) { + if (!is_valid_ibs_op_sample(sample)) { + pr_debug("Invalid raw IBS Op MSR data encountered\n"); + return; + } + amd_dump_ibs_op(sample); + } +} + +static void parse_cpuid(struct perf_env *env) +{ + const char *cpuid; + int ret; + + cpuid = perf_env__cpuid(env); + /* + * cpuid = "AuthenticAMD,family,model,stepping" + */ + ret = sscanf(cpuid, "%*[^,],%u,%u", &cpu_family, &cpu_model); + if (ret != 2) + pr_debug("problem parsing cpuid\n"); +} + +/* + * Find and assign the type number used for ibs_op or ibs_fetch samples. + * Device names can be large - we are only interested in the first 9 characters, + * to match "ibs_fetch". 
+ */ +bool evlist__has_amd_ibs(struct evlist *evlist) +{ + struct perf_env *env = evlist->env; + int ret, nr_pmu_mappings = perf_env__nr_pmu_mappings(env); + const char *pmu_mapping = perf_env__pmu_mappings(env); + char name[sizeof("ibs_fetch")]; + u32 type; + + while (nr_pmu_mappings--) { + ret = sscanf(pmu_mapping, "%u:%9s", &type, name); + if (ret == 2) { + if (strstarts(name, "ibs_op")) + ibs_op_type = type; + else if (strstarts(name, "ibs_fetch")) + ibs_fetch_type = type; + } + pmu_mapping += strlen(pmu_mapping) + 1 /* '\0' */; + } + + if (ibs_fetch_type || ibs_op_type) { + if (!cpu_family) + parse_cpuid(env); + return true; + } + + return false; +} diff --git a/tools/perf/util/sample-raw.c b/tools/perf/util/sample-raw.c index cde5cd3ce49b..f3f6bd9d290e 100644 --- a/tools/perf/util/sample-raw.c +++ b/tools/perf/util/sample-raw.c @@ -1,8 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #include "evlist.h" #include "env.h" +#include "header.h" #include "sample-raw.h" /* @@ -12,7 +14,13 @@ void evlist__init_trace_event_sample_raw(struct evlist *evlist) { const char *arch_pf = perf_env__arch(evlist->env); + const char *cpuid = perf_env__cpuid(evlist->env); if (arch_pf && !strcmp("s390", arch_pf)) evlist->trace_event_sample_raw = evlist__s390_sample_raw; + else if (arch_pf && !strcmp("x86", arch_pf) && + cpuid && strstarts(cpuid, "AuthenticAMD") && + evlist__has_amd_ibs(evlist)) { + evlist->trace_event_sample_raw = evlist__amd_sample_raw; + } } diff --git a/tools/perf/util/sample-raw.h b/tools/perf/util/sample-raw.h index 4be84a5510cf..ea01c5811503 100644 --- a/tools/perf/util/sample-raw.h +++ b/tools/perf/util/sample-raw.h @@ -6,6 +6,10 @@ struct evlist; union perf_event; struct perf_sample; -void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, struct perf_sample *sample); +void evlist__s390_sample_raw(struct evlist *evlist, union perf_event *event, + struct perf_sample *sample); +bool evlist__has_amd_ibs(struct evlist *evlist); +void evlist__amd_sample_raw(struct evlist *evlist, union perf_event *event, + struct perf_sample *sample); void evlist__init_trace_event_sample_raw(struct evlist *evlist); #endif /* __PERF_EVLIST_H */ -- Gitee From 97f467c2246f00f6029bcb1c441fbac7fd220169 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Dec 2019 11:40:57 -0300 Subject: [PATCH 244/257] perf bench: Update the copies of x86's mem{cpy,set}_64.S commit bd5c6b81dd6025bd4c6ca7800a580b217d9899b9 upstream And update linux/linkage.h, which requires in turn that we make these files switch from ENTRY()/ENDPROC() to SYM_FUNC_START()/SYM_FUNC_END(): tools/perf/arch/arm64/tests/regs_load.S tools/perf/arch/arm/tests/regs_load.S tools/perf/arch/powerpc/tests/regs_load.S tools/perf/arch/x86/tests/regs_load.S We also need to switch SYM_FUNC_START_LOCAL() to SYM_FUNC_START() for the functions used directly by 'perf bench', and update tools/perf/check_headers.sh to ignore those changes when checking if the kernel original files drifted from the copies we carry. 
This is to get the changes from: 6dcc5627f6ae ("x86/asm: Change all ENTRY+ENDPROC to SYM_FUNC_*") ef1e03152cb0 ("x86/asm: Make some functions local") e9b9d020c487 ("x86/asm: Annotate aliases") And address these tools/perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/lib/memcpy_64.S' differs from latest version at 'arch/x86/lib/memcpy_64.S' diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S Warning: Kernel ABI header at 'tools/arch/x86/lib/memset_64.S' differs from latest version at 'arch/x86/lib/memset_64.S' diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S Cc: Adrian Hunter Cc: Borislav Petkov Cc: Jiri Olsa Cc: Jiri Slaby Cc: Namhyung Kim Link: https://lkml.kernel.org/n/tip-tay3l8x8k11p7y3qcpqh9qh5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/arch/x86/lib/memcpy_64.S | 20 +++--- tools/arch/x86/lib/memset_64.S | 16 ++--- tools/perf/arch/arm/tests/regs_load.S | 4 +- tools/perf/arch/arm64/tests/regs_load.S | 4 +- tools/perf/arch/x86/tests/regs_load.S | 8 +-- tools/perf/check-headers.sh | 4 +- tools/perf/util/include/linux/linkage.h | 89 ++++++++++++++++++++++++- 7 files changed, 114 insertions(+), 31 deletions(-) diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S index 92748660ba51..df767afc690f 100644 --- a/tools/arch/x86/lib/memcpy_64.S +++ b/tools/arch/x86/lib/memcpy_64.S @@ -28,8 +28,8 @@ * Output: * rax original destination */ -ENTRY(__memcpy) -ENTRY(memcpy) +SYM_FUNC_START_ALIAS(__memcpy) +SYM_FUNC_START_LOCAL(memcpy) ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ "jmp memcpy_erms", X86_FEATURE_ERMS @@ -41,8 +41,8 @@ ENTRY(memcpy) movl %edx, %ecx rep movsb ret -ENDPROC(memcpy) -ENDPROC(__memcpy) +SYM_FUNC_END(memcpy) +SYM_FUNC_END_ALIAS(__memcpy) EXPORT_SYMBOL(memcpy) EXPORT_SYMBOL(__memcpy) @@ -50,14 +50,14 @@ EXPORT_SYMBOL(__memcpy) * memcpy_erms() - enhanced fast string memcpy. This is faster and * simpler than memcpy. Use memcpy_erms when possible. */ -ENTRY(memcpy_erms) +SYM_FUNC_START(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -ENDPROC(memcpy_erms) +SYM_FUNC_END(memcpy_erms) -ENTRY(memcpy_orig) +SYM_FUNC_START(memcpy_orig) movq %rdi, %rax cmpq $0x20, %rdx @@ -182,7 +182,7 @@ ENTRY(memcpy_orig) .Lend: retq -ENDPROC(memcpy_orig) +SYM_FUNC_END(memcpy_orig) #ifndef CONFIG_UML @@ -193,7 +193,7 @@ MCSAFE_TEST_CTL * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(__memcpy_mcsafe) +SYM_FUNC_START(__memcpy_mcsafe) cmpl $8, %edx /* Less than 8 bytes? Go to byte copy loop */ jb .L_no_whole_words @@ -260,7 +260,7 @@ ENTRY(__memcpy_mcsafe) xorl %eax, %eax .L_done: ret -ENDPROC(__memcpy_mcsafe) +SYM_FUNC_END(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) .section .fixup, "ax" diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S index f8f3dc0a6690..fd5d25a474b7 100644 --- a/tools/arch/x86/lib/memset_64.S +++ b/tools/arch/x86/lib/memset_64.S @@ -18,8 +18,8 @@ * * rax original destination */ -ENTRY(memset) -ENTRY(__memset) +SYM_FUNC_START_ALIAS(memset) +SYM_FUNC_START(__memset) /* * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended * to use it when possible. If not available, use fast string instructions. 
@@ -42,8 +42,8 @@ ENTRY(__memset) rep stosb movq %r9,%rax ret -ENDPROC(memset) -ENDPROC(__memset) +SYM_FUNC_END(__memset) +SYM_FUNC_END_ALIAS(memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -56,16 +56,16 @@ ENDPROC(__memset) * * rax original destination */ -ENTRY(memset_erms) +SYM_FUNC_START(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -ENDPROC(memset_erms) +SYM_FUNC_END(memset_erms) -ENTRY(memset_orig) +SYM_FUNC_START(memset_orig) movq %rdi,%r10 /* expand byte value */ @@ -136,4 +136,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: -ENDPROC(memset_orig) +SYM_FUNC_END(memset_orig) diff --git a/tools/perf/arch/arm/tests/regs_load.S b/tools/perf/arch/arm/tests/regs_load.S index 6e2495cc4517..4284307d7822 100644 --- a/tools/perf/arch/arm/tests/regs_load.S +++ b/tools/perf/arch/arm/tests/regs_load.S @@ -37,7 +37,7 @@ .text .type perf_regs_load,%function -ENTRY(perf_regs_load) +SYM_FUNC_START(perf_regs_load) str r0, [r0, #R0] str r1, [r0, #R1] str r2, [r0, #R2] @@ -56,4 +56,4 @@ ENTRY(perf_regs_load) str lr, [r0, #PC] // store pc as lr in order to skip the call // to this function mov pc, lr -ENDPROC(perf_regs_load) +SYM_FUNC_END(perf_regs_load) diff --git a/tools/perf/arch/arm64/tests/regs_load.S b/tools/perf/arch/arm64/tests/regs_load.S index 07042511dca9..d49de40b6818 100644 --- a/tools/perf/arch/arm64/tests/regs_load.S +++ b/tools/perf/arch/arm64/tests/regs_load.S @@ -7,7 +7,7 @@ #define LDR_REG(r) ldr x##r, [x0, 8 * r] #define SP (8 * 31) #define PC (8 * 32) -ENTRY(perf_regs_load) +SYM_FUNC_START(perf_regs_load) STR_REG(0) STR_REG(1) STR_REG(2) @@ -44,4 +44,4 @@ ENTRY(perf_regs_load) str x30, [x0, #PC] LDR_REG(1) ret -ENDPROC(perf_regs_load) +SYM_FUNC_END(perf_regs_load) diff --git a/tools/perf/arch/x86/tests/regs_load.S b/tools/perf/arch/x86/tests/regs_load.S index bbe5a0d16e51..80f14f52e3f6 100644 --- a/tools/perf/arch/x86/tests/regs_load.S +++ b/tools/perf/arch/x86/tests/regs_load.S @@ -28,7 +28,7 @@ .text #ifdef HAVE_ARCH_X86_64_SUPPORT -ENTRY(perf_regs_load) +SYM_FUNC_START(perf_regs_load) movq %rax, AX(%rdi) movq %rbx, BX(%rdi) movq %rcx, CX(%rdi) @@ -60,9 +60,9 @@ ENTRY(perf_regs_load) movq %r14, R14(%rdi) movq %r15, R15(%rdi) ret -ENDPROC(perf_regs_load) +SYM_FUNC_END(perf_regs_load) #else -ENTRY(perf_regs_load) +SYM_FUNC_START(perf_regs_load) push %edi movl 8(%esp), %edi movl %eax, AX(%edi) @@ -88,7 +88,7 @@ ENTRY(perf_regs_load) movl $0, FS(%edi) movl $0, GS(%edi) ret -ENDPROC(perf_regs_load) +SYM_FUNC_END(perf_regs_load) #endif /* diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 8bb186c6f16a..39e0aefc7ffa 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -109,8 +109,8 @@ for i in $FILES; do done # diff with extra ignore lines -check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include "' -check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include "' +check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))"' +check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"' check include/uapi/asm-generic/mman.h '-I "^#include <\(uapi/\)*asm-generic/mman-common\(-tools\)*.h>"' check include/uapi/linux/mman.h '-I "^#include <\(uapi/\)*asm/mman.h>"' check include/linux/ctype.h '-I "isdigit("' diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 
f01d48a8d707..b8a5159361b4 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -5,10 +5,93 @@ /* linkage.h ... for including arch/x86/lib/memcpy_64.S */ -#define ENTRY(name) \ - .globl name; \ +/* Some toolchains use other characters (e.g. '`') to mark new line in macro */ +#ifndef ASM_NL +#define ASM_NL ; +#endif + +#ifndef __ALIGN +#define __ALIGN .align 4,0x90 +#define __ALIGN_STR ".align 4,0x90" +#endif + +/* SYM_T_FUNC -- type used by assembler to mark functions */ +#ifndef SYM_T_FUNC +#define SYM_T_FUNC STT_FUNC +#endif + +/* SYM_A_* -- align the symbol? */ +#define SYM_A_ALIGN ALIGN + +/* SYM_L_* -- linkage of symbols */ +#define SYM_L_GLOBAL(name) .globl name +#define SYM_L_LOCAL(name) /* nothing */ + +#define ALIGN __ALIGN + +/* === generic annotations === */ + +/* SYM_ENTRY -- use only if you have to for non-paired symbols */ +#ifndef SYM_ENTRY +#define SYM_ENTRY(name, linkage, align...) \ + linkage(name) ASM_NL \ + align ASM_NL \ name: +#endif + +/* SYM_START -- use only if you have to */ +#ifndef SYM_START +#define SYM_START(name, linkage, align...) \ + SYM_ENTRY(name, linkage, align) +#endif + +/* SYM_END -- use only if you have to */ +#ifndef SYM_END +#define SYM_END(name, sym_type) \ + .type name sym_type ASM_NL \ + .size name, .-name +#endif + +/* + * SYM_FUNC_START_ALIAS -- use where there are two global names for one + * function + */ +#ifndef SYM_FUNC_START_ALIAS +#define SYM_FUNC_START_ALIAS(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) +#endif + +/* SYM_FUNC_START -- use for global functions */ +#ifndef SYM_FUNC_START +/* + * The same as SYM_FUNC_START_ALIAS, but we will need to distinguish these two + * later. + */ +#define SYM_FUNC_START(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) +#endif + +/* SYM_FUNC_START_LOCAL -- use for local functions */ +#ifndef SYM_FUNC_START_LOCAL +/* the same as SYM_FUNC_START_LOCAL_ALIAS, see comment near SYM_FUNC_START */ +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) +#endif + +/* SYM_FUNC_END_ALIAS -- the end of LOCAL_ALIASed or ALIASed function */ +#ifndef SYM_FUNC_END_ALIAS +#define SYM_FUNC_END_ALIAS(name) \ + SYM_END(name, SYM_T_FUNC) +#endif -#define ENDPROC(name) +/* + * SYM_FUNC_END -- the end of SYM_FUNC_START_LOCAL, SYM_FUNC_START, + * SYM_FUNC_START_WEAK, ... + */ +#ifndef SYM_FUNC_END +/* the same as SYM_FUNC_END_ALIAS, see comment near SYM_FUNC_START */ +#define SYM_FUNC_END(name) \ + SYM_END(name, SYM_T_FUNC) +#endif #endif /* PERF_LINUX_LINKAGE_H_ */ -- Gitee From 4b340c154d9ac788a082bf0066bde4669ab39396 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:15:08 -0500 Subject: [PATCH 245/257] perf report: Add tools/arch/x86/include/asm/amd-ibs.h commit dde994dd54fbf84f8fd14230de3477d552e42470 upstream This is a tools/-side patch for the patch that adds the original copy of the IBS header file, in arch/x86/include/asm/. We also add an entry to check-headers.sh, so future changes continue to be copied. Committer notes: Had to add this -#include +#include "msr-index.h" And change the check-headers.sh entry to ignore this line when diffing with the original kernel header. This is needed so that we can use 'perf report' on a perf.data with IBS data on a !x86 system, i.e. building on ARM fails without this as there is no asm/msr-index.h there. This was done on the next patch in this series and is done for things like Intel PT and ARM CoreSight. 
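To show how the bitfield unions added below are meant to be used, here
is a stand-alone sketch (illustration only; the union is a reduced copy
of ibs_fetch_ctl, and the raw value is the one from the example dump in
an earlier patch in this series):

#include <stdio.h>

union ibs_fetch_ctl_demo {	/* reduced copy, demo only */
	unsigned long long val;
	struct {
		unsigned long long fetch_maxcnt:16,	/* 0-15 */
				   fetch_cnt:16,	/* 16-31 */
				   fetch_lat:16,	/* 32-47 */
				   fetch_en:1,		/* 48 */
				   fetch_val:1,		/* 49 */
				   fetch_comp:1;	/* 50 */
	};
};

int main(void)
{
	union ibs_fetch_ctl_demo ctl = { .val = 0x02170007ffffffffULL };

	/* the overlay decodes the raw MSR image without manual shifting */
	printf("MaxCnt %u Lat %u En %u Val %u Comp %u\n",
	       (unsigned)ctl.fetch_maxcnt << 4, (unsigned)ctl.fetch_lat,
	       (unsigned)ctl.fetch_en, (unsigned)ctl.fetch_val,
	       (unsigned)ctl.fetch_comp);
	return 0;	/* prints: MaxCnt 1048560 Lat 7 En 1 Val 1 Comp 1 */
}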
Signed-off-by: Kim Phillips Cc: Alexander Shishkin Cc: Boris Ostrovsky Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Joao Martins Cc: Konrad Rzeszutek Wilk Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Link: https //lore.kernel.org/r/20210817221509.88391-3-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/arch/x86/include/asm/amd-ibs.h | 132 +++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 2 files changed, 133 insertions(+) create mode 100644 tools/arch/x86/include/asm/amd-ibs.h diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h new file mode 100644 index 000000000000..174e7d83fcbd --- /dev/null +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * From PPR Vol 1 for AMD Family 19h Model 01h B1 + * 55898 Rev 0.35 - Feb 5, 2021 + */ + +#include "msr-index.h" + +/* + * IBS Hardware MSRs + */ + +/* MSR 0xc0011030: IBS Fetch Control */ +union ibs_fetch_ctl { + __u64 val; + struct { + __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */ + fetch_cnt:16, /* 16-31: instruction fetch count */ + fetch_lat:16, /* 32-47: instruction fetch latency */ + fetch_en:1, /* 48: instruction fetch enable */ + fetch_val:1, /* 49: instruction fetch valid */ + fetch_comp:1, /* 50: instruction fetch complete */ + ic_miss:1, /* 51: i-cache miss */ + phy_addr_valid:1,/* 52: physical address valid */ + l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size + * (needs IbsPhyAddrValid) */ + l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */ + l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */ + rand_en:1, /* 57: random tagging enable */ + fetch_l2_miss:1,/* 58: L2 miss for sampled fetch + * (needs IbsFetchComp) */ + reserved:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011033: IBS Execution Control */ +union ibs_op_ctl { + __u64 val; + struct { + __u64 opmaxcnt:16, /* 0-15: periodic op max. 
count */ + reserved0:1, /* 16: reserved */ + op_en:1, /* 17: op sampling enable */ + op_val:1, /* 18: op sample valid */ + cnt_ctl:1, /* 19: periodic op counter control */ + opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ + reserved1:5, /* 27-31: reserved */ + opcurcnt:27, /* 32-58: periodic op counter current count */ + reserved2:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011035: IBS Op Data 2 */ +union ibs_op_data { + __u64 val; + struct { + __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */ + tag_to_ret_ctr:16, /* 15-31: op tag to retire count */ + reserved1:2, /* 32-33: reserved */ + op_return:1, /* 34: return op */ + op_brn_taken:1, /* 35: taken branch op */ + op_brn_misp:1, /* 36: mispredicted branch op */ + op_brn_ret:1, /* 37: branch op retired */ + op_rip_invalid:1, /* 38: RIP is invalid */ + op_brn_fuse:1, /* 39: fused branch op */ + op_microcode:1, /* 40: microcode op */ + reserved2:23; /* 41-63: reserved */ + }; +}; + +/* MSR 0xc0011036: IBS Op Data 2 */ +union ibs_op_data2 { + __u64 val; + struct { + __u64 data_src:3, /* 0-2: data source */ + reserved0:1, /* 3: reserved */ + rmt_node:1, /* 4: destination node */ + cache_hit_st:1, /* 5: cache hit state */ + reserved1:57; /* 5-63: reserved */ + }; +}; + +/* MSR 0xc0011037: IBS Op Data 3 */ +union ibs_op_data3 { + __u64 val; + struct { + __u64 ld_op:1, /* 0: load op */ + st_op:1, /* 1: store op */ + dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */ + dc_l2tlb_miss:1, /* 3: data cache L2TLB hit in 2M page */ + dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */ + dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */ + dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */ + dc_miss:1, /* 7: data cache miss */ + dc_mis_acc:1, /* 8: misaligned access */ + reserved:4, /* 9-12: reserved */ + dc_wc_mem_acc:1, /* 13: write combining memory access */ + dc_uc_mem_acc:1, /* 14: uncacheable memory access */ + dc_locked_op:1, /* 15: locked operation */ + dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */ + dc_lin_addr_valid:1, /* 17: data cache linear address valid */ + dc_phy_addr_valid:1, /* 18: data cache physical address valid */ + dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */ + l2_miss:1, /* 20: L2 cache miss */ + sw_pf:1, /* 21: software prefetch */ + op_mem_width:4, /* 22-25: load/store size in bytes */ + op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */ + dc_miss_lat:16, /* 32-47: data cache miss latency */ + tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */ + }; +}; + +/* MSR 0xc001103c: IBS Fetch Control Extended */ +union ic_ibs_extd_ctl { + __u64 val; + struct { + __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */ + reserved:48; /* 16-63: reserved */ + }; +}; + +/* + * IBS driver related + */ + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 39e0aefc7ffa..9cea76bf0efa 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -111,6 +111,7 @@ done # diff with extra ignore lines check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))"' check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"' +check arch/x86/include/asm/amd-ibs.h '-I "^#include 
[<\"]\(asm/\)*msr-index.h"' check include/uapi/asm-generic/mman.h '-I "^#include <\(uapi/\)*asm-generic/mman-common\(-tools\)*.h>"' check include/uapi/linux/mman.h '-I "^#include <\(uapi/\)*asm/mman.h>"' check include/linux/ctype.h '-I "isdigit("' -- Gitee From 204631218ac637e79b79744e9305d858610a6f52 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:18 +0530 Subject: [PATCH 246/257] perf tool ibs: Sync AMD IBS header file commit c1f4f92b7d5d3deeaae758a1a7ed263b1381dd1c upstream IBS support has been enhanced with two new features in upcoming uarch: 1. DataSrc extension 2. L3 miss filtering. Additional set of bits has been introduced in IBS registers to exploit these features. New bits are already defining in arch/x86/ header. Sync it with tools header file. Also rename existing ibs_op_data field 'data_src' to 'data_src_lo'. Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-8-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/arch/x86/include/asm/amd-ibs.h | 16 ++++++++++------ tools/perf/util/amd-sample-raw.c | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 174e7d83fcbd..21e01cf6162e 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -29,7 +29,10 @@ union ibs_fetch_ctl { rand_en:1, /* 57: random tagging enable */ fetch_l2_miss:1,/* 58: L2 miss for sampled fetch * (needs IbsFetchComp) */ - reserved:5; /* 59-63: reserved */ + l3_miss_only:1, /* 59: Collect L3 miss samples only */ + fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */ + fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */ + reserved:2; /* 62-63: reserved */ }; }; @@ -38,14 +41,14 @@ union ibs_op_ctl { __u64 val; struct { __u64 opmaxcnt:16, /* 0-15: periodic op max. 
count */ - reserved0:1, /* 16: reserved */ + l3_miss_only:1, /* 16: Collect L3 miss samples only */ op_en:1, /* 17: op sampling enable */ op_val:1, /* 18: op sample valid */ cnt_ctl:1, /* 19: periodic op counter control */ opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ - reserved1:5, /* 27-31: reserved */ + reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved2:5; /* 59-63: reserved */ + reserved1:5; /* 59-63: reserved */ }; }; @@ -71,11 +74,12 @@ union ibs_op_data { union ibs_op_data2 { __u64 val; struct { - __u64 data_src:3, /* 0-2: data source */ + __u64 data_src_lo:3, /* 0-2: data source low */ reserved0:1, /* 3: reserved */ rmt_node:1, /* 4: destination node */ cache_hit_st:1, /* 5: cache hit state */ - reserved1:57; /* 5-63: reserved */ + data_src_hi:2, /* 6-7: data source high */ + reserved1:56; /* 8-63: reserved */ }; }; diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c index 2aa349b2e944..8b53b2224031 100644 --- a/tools/perf/util/amd-sample-raw.c +++ b/tools/perf/util/amd-sample-raw.c @@ -98,9 +98,9 @@ static void pr_ibs_op_data2(union ibs_op_data2 reg) }; printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val, - reg.data_src == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State " + reg.data_src_lo == 2 ? (reg.cache_hit_st ? "CacheHitSt 1=O-State " : "CacheHitSt 0=M-state ") : "", - reg.rmt_node, data_src_str[reg.data_src]); + reg.rmt_node, data_src_str[reg.data_src_lo]); } static void pr_ibs_op_data3(union ibs_op_data3 reg) -- Gitee From a34701e41f992b90d704cb4a407518c647b78c56 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 4 Jun 2022 10:15:19 +0530 Subject: [PATCH 247/257] perf script ibs: Support new IBS bits in raw trace dump commit 0429796e45ec17eee26d7a59de92271c275d7666 upstream Interpret Additional set of IBS register bits while doing perf report/script raw dump. IBS op PMU ex: $ sudo ./perf record -c 130 -a -e ibs_op/l3missonly=1/ --raw-samples $ sudo ./perf report -D ... ibs_op_ctl: 0000004500070008 MaxCnt 128 L3MissOnly 1 En 1 Val 1 CntCtl 0=cycles CurCnt 69 ibs_op_data: 0000000000710002 CompToRetCtr 2 TagToRetCtr 113 BrnRet 0 RipInvalid 0 BrnFuse 0 Microcode 0 ibs_op_data2: 0000000000000002 CacheHitSt 0=M-state RmtNode 0 DataSrc 2=A peer cache in a near CCX ibs_op_data3: 000000681d1700a1 LdOp 1 StOp 0 DcL1TlbMiss 0 DcL2TlbMiss 0 DcL1TlbHit2M 0 DcL1TlbHit1G 1 DcL2TlbHit2M 0 DcMiss 1 DcMisAcc 0 DcWcMemAcc 0 DcUcMemAcc 0 DcLockedOp 0 DcMissNoMabAlloc 1 DcLinAddrValid 1 DcPhyAddrValid 1 DcL2TlbHit1G 0 L2Miss 1 SwPf 0 OpMemWidth 8 bytes OpDcMissOpenMemReqs 7 DcMissLat 104 TlbRefillLat 0 IBS Fetch PMU ex: $ sudo ./perf record -c 130 -a -e ibs_fetch/l3missonly=1/ --raw-samples $ sudo ./perf report -D ... ibs_fetch_ctl: 3c1f00c700080008 MaxCnt 128 Cnt 128 Lat 199 En 1 Val 1 Comp 1 IcMiss 1 PhyAddrValid 1 L1TlbPgSz 4KB L1TlbMiss 0 L2TlbMiss 0 RandEn 0 L2Miss 1 L3MissOnly 1 FetchOcMiss 1 FetchL3Miss 1 With the DataSrc extensions, the source of data can be decoded among: - Local L3 or other L1/L2 in CCX. - A peer cache in a near CCX. - Data returned from DRAM. - A peer cache in a far CCX. - DRAM address map with "long latency" bit set. - Data returned from MMIO/Config/PCI/APIC. - Extension Memory (S-Link, GenZ, etc - identified by the CS target and/or address map at DF's choice). - Peer Agent Memory. 
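The extended encoding described above can be summarized with a small
sketch (illustration only) of how the two DataSrc fields combine:

#include <stdio.h>

/* Illustration: with the Zen 4 IBS extensions, IBS_OP_DATA2 bits 6-7
 * (data_src_hi) extend the original 3-bit source field in bits 0-2
 * (data_src_lo) to a 5-bit value, 0..31. */
static unsigned int ibs_data_src(unsigned long long op_data2)
{
	unsigned int lo = op_data2 & 0x7;
	unsigned int hi = (op_data2 >> 6) & 0x3;

	return (hi << 3) | lo;
}

int main(void)
{
	/* e.g. 2 = "A peer cache in a near CCX" in the list above */
	printf("data_src = %u\n", ibs_data_src(0x2ULL));
	return 0;
}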
Signed-off-by: Ravi Bangoria Acked-by: Namhyung Kim Cc: Ananth Narayan Cc: Andi Kleen Cc: Borislav Petkov Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kim Phillips Cc: Leo Yan Cc: Mark Rutland Cc: Peter Zijlstra Cc: Robert Richter Cc: Sandipan Das Cc: Santosh Shukla Cc: Stephane Eranian Cc: Thomas Gleixner Cc: like.xu.linux@gmail.com Cc: x86@kernel.org Link: https://lore.kernel.org/r/20220604044519.594-9-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- tools/perf/util/amd-sample-raw.c | 64 +++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c index 8b53b2224031..38b4be42d0a9 100644 --- a/tools/perf/util/amd-sample-raw.c +++ b/tools/perf/util/amd-sample-raw.c @@ -18,6 +18,7 @@ #include "pmu-events/pmu-events.h" static u32 cpu_family, cpu_model, ibs_fetch_type, ibs_op_type; +static bool zen4_ibs_extensions; static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) { @@ -39,6 +40,7 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) }; const char *ic_miss_str = NULL; const char *l1tlb_pgsz_str = NULL; + char l3_miss_str[sizeof(" L3MissOnly _ FetchOcMiss _ FetchL3Miss _")] = ""; if (cpu_family == 0x19 && cpu_model < 0x10) { /* @@ -53,12 +55,19 @@ static void pr_ibs_fetch_ctl(union ibs_fetch_ctl reg) ic_miss_str = ic_miss_strs[reg.ic_miss]; } + if (zen4_ibs_extensions) { + snprintf(l3_miss_str, sizeof(l3_miss_str), + " L3MissOnly %d FetchOcMiss %d FetchL3Miss %d", + reg.l3_miss_only, reg.fetch_oc_miss, reg.fetch_l3_miss); + } + printf("ibs_fetch_ctl:\t%016llx MaxCnt %7d Cnt %7d Lat %5d En %d Val %d Comp %d%s " - "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s\n", + "PhyAddrValid %d%s L1TlbMiss %d L2TlbMiss %d RandEn %d%s%s\n", reg.val, reg.fetch_maxcnt << 4, reg.fetch_cnt << 4, reg.fetch_lat, reg.fetch_en, reg.fetch_val, reg.fetch_comp, ic_miss_str ? : "", reg.phy_addr_valid, l1tlb_pgsz_str ? : "", reg.l1tlb_miss, reg.l2tlb_miss, - reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : ""); + reg.rand_en, reg.fetch_comp ? (reg.fetch_l2_miss ? " L2Miss 1" : " L2Miss 0") : "", + l3_miss_str); } static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg) @@ -68,9 +77,15 @@ static void pr_ic_ibs_extd_ctl(union ic_ibs_extd_ctl reg) static void pr_ibs_op_ctl(union ibs_op_ctl reg) { - printf("ibs_op_ctl:\t%016llx MaxCnt %9d En %d Val %d CntCtl %d=%s CurCnt %9d\n", - reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, reg.op_en, reg.op_val, - reg.cnt_ctl, reg.cnt_ctl ? "uOps" : "cycles", reg.opcurcnt); + char l3_miss_only[sizeof(" L3MissOnly _")] = ""; + + if (zen4_ibs_extensions) + snprintf(l3_miss_only, sizeof(l3_miss_only), " L3MissOnly %d", reg.l3_miss_only); + + printf("ibs_op_ctl:\t%016llx MaxCnt %9d%s En %d Val %d CntCtl %d=%s CurCnt %9d\n", + reg.val, ((reg.opmaxcnt_ext << 16) | reg.opmaxcnt) << 4, l3_miss_only, + reg.op_en, reg.op_val, reg.cnt_ctl, + reg.cnt_ctl ? 
"uOps" : "cycles", reg.opcurcnt); } static void pr_ibs_op_data(union ibs_op_data reg) @@ -84,7 +99,34 @@ static void pr_ibs_op_data(union ibs_op_data reg) reg.op_brn_ret, reg.op_rip_invalid, reg.op_brn_fuse, reg.op_microcode); } -static void pr_ibs_op_data2(union ibs_op_data2 reg) +static void pr_ibs_op_data2_extended(union ibs_op_data2 reg) +{ + static const char * const data_src_str[] = { + "", + " DataSrc 1=Local L3 or other L1/L2 in CCX", + " DataSrc 2=A peer cache in a near CCX", + " DataSrc 3=Data returned from DRAM", + " DataSrc 4=(reserved)", + " DataSrc 5=A peer cache in a far CCX", + " DataSrc 6=DRAM address map with \"long latency\" bit set", + " DataSrc 7=Data returned from MMIO/Config/PCI/APIC", + " DataSrc 8=Extension Memory (S-Link, GenZ, etc)", + " DataSrc 9=(reserved)", + " DataSrc 10=(reserved)", + " DataSrc 11=(reserved)", + " DataSrc 12=Peer Agent Memory", + /* 13 to 31 are reserved. Avoid printing them. */ + }; + int data_src = (reg.data_src_hi << 3) | reg.data_src_lo; + + printf("ibs_op_data2:\t%016llx %sRmtNode %d%s\n", reg.val, + (data_src == 1 || data_src == 2 || data_src == 5) ? + (reg.cache_hit_st ? "CacheHitSt 1=O-State " : "CacheHitSt 0=M-state ") : "", + reg.rmt_node, + data_src < (int)ARRAY_SIZE(data_src_str) ? data_src_str[data_src] : ""); +} + +static void pr_ibs_op_data2_default(union ibs_op_data2 reg) { static const char * const data_src_str[] = { "", @@ -103,6 +145,13 @@ static void pr_ibs_op_data2(union ibs_op_data2 reg) reg.rmt_node, data_src_str[reg.data_src_lo]); } +static void pr_ibs_op_data2(union ibs_op_data2 reg) +{ + if (zen4_ibs_extensions) + return pr_ibs_op_data2_extended(reg); + pr_ibs_op_data2_default(reg); +} + static void pr_ibs_op_data3(union ibs_op_data3 reg) { char l2_miss_str[sizeof(" L2Miss _")] = ""; @@ -279,6 +328,9 @@ bool evlist__has_amd_ibs(struct evlist *evlist) pmu_mapping += strlen(pmu_mapping) + 1 /* '\0' */; } + if (perf_env__find_pmu_cap(env, "ibs_op", "zen4_ibs_extensions")) + zen4_ibs_extensions = 1; + if (ibs_fetch_type || ibs_op_type) { if (!cpu_family) parse_cpuid(env); -- Gitee From c0d5d718c774ce696791ba8e8be00cf4f5960a6a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Mar 2023 04:33:38 -0700 Subject: [PATCH 248/257] perf/x86/amd/core: Always clear status for idx commit 263f5ecaf7080513efc248ec739b6d9e00f4129f upstream The variable 'status' (which contains the unhandled overflow bits) is not being properly masked in some cases, displaying the following warning: WARNING: CPU: 156 PID: 475601 at arch/x86/events/amd/core.c:972 amd_pmu_v2_handle_irq+0x216/0x270 This seems to be happening because the loop is being continued before the status bit being unset, in case x86_perf_event_set_period() returns 0. This is also causing an inconsistency because the "handled" counter is incremented, but the status bit is not cleaned. Move the bit cleaning together above, together when the "handled" counter is incremented. 
Fixes: 7685665c390d ("perf/x86/amd/core: Add PerfMonV2 overflow handling") Signed-off-by: Breno Leitao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20230321113338.1669660-1-leitao@debian.org Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9b95649bf16a..d748fd4704f0 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -958,6 +958,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) /* Event overflow */ handled++; + status &= ~mask; perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) @@ -965,8 +966,6 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); - - status &= ~mask; } /* -- Gitee From f36739bea496522150f07d97148a0390a1e7a865 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 14 Sep 2023 19:36:04 +0530 Subject: [PATCH 249/257] perf/x86/amd/core: Fix overflow reset on hotplug commit 23d2626b841c2adccdeb477665313c02dff02dc3 upstream Kernels older than v5.19 do not support PerfMonV2 and the PMI handler does not clear the overflow bits of the PerfCntrGlobalStatus register. Because of this, loading a recent kernel using kexec from an older kernel can result in inconsistent register states on Zen 4 systems. The PMI handler of the new kernel gets confused and shows a warning when an overflow occurs because some of the overflow bits are set even if the corresponding counters are inactive. These are remnants from overflows that were handled by the older kernel. During CPU hotplug, the PerfCntrGlobalCtl and PerfCntrGlobalStatus registers should always be cleared for PerfMonV2-capable processors. However, a condition used for NB event constaints applicable only to older processors currently prevents this from happening. Move the reset sequence to an appropriate place and also clear the LBR Freeze bit. [Backport changes] 1.In arch/x86/events/amd/core.c, the upstream patch removes an empty line within the amd_pmu_cpu_starting() function. However, that line is not present in the current kernel source, so this part of the patch does not apply. Fixes: 21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support") Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/882a87511af40792ba69bb0e9026f19a2e71e8a3.1694696888.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d748fd4704f0..01362e83db51 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -589,8 +589,12 @@ static void amd_pmu_cpu_reset(int cpu) /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); + /* + * Clear freeze and overflow bits i.e. 
PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -617,6 +621,7 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -638,7 +643,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - amd_pmu_cpu_reset(cpu); amd_brs_reset(); } @@ -646,6 +650,7 @@ static void amd_pmu_cpu_starting(int cpu) static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -660,8 +665,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) -- Gitee From de6890805326268a28a6cd90901eb3bf148da948 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 29 Jan 2024 16:36:26 +0530 Subject: [PATCH 250/257] perf/x86/amd/core: Avoid register reset when CPU is dead commit ad8c91282c95f801c37812d59d2d9eba6899b384 upstream When bringing a CPU online, some of the PMC and LBR related registers are reset. The same is done when a CPU is taken offline although that is unnecessary. This currently happens in the "cpu_dead" callback which is also incorrect as the callback runs on a control CPU instead of the one that is being taken offline. This also affects hibernation and suspend to RAM on some platforms as reported in the link below. Fixes: 21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support") Reported-by: Mario Limonciello Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/550a026764342cf7e5812680e3e2b91fe662b5ac.1706526029.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 01362e83db51..d3c344fecaeb 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -650,7 +650,6 @@ static void amd_pmu_cpu_starting(int cpu) static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; - amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; -- Gitee From 22d8f1b35ecb7a150116814bbc699b1fd51a28df Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 25 Mar 2024 13:17:53 +0530 Subject: [PATCH 251/257] perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later commit c7b2edd8377be983442c1344cb940cd2ac21b601 upstream AMD processors based on Zen 2 and later microarchitectures do not support PMCx087 (instruction pipe stalls) which is used as the backing event for "stalled-cycles-frontend" and "stalled-cycles-backend". Use PMCx0A9 (cycles where micro-op queue is empty) instead to count frontend stalls and remove the entry for backend stalls since there is no direct replacement. 
Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Fixes: 3fe3331bb285 ("perf/x86/amd: Add event map for AMD Family 17h") Link: https://lore.kernel.org/r/03d7fc8fa2a28f9be732116009025bdec1b3ec97.1711352180.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d3c344fecaeb..1037489c8368 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -249,7 +249,7 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = /* * AMD Performance Monitor Family 17h and later: */ -static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -261,10 +261,24 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; +static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, +}; + static u64 amd_pmu_event_map(int hw_event) { - if (boot_cpu_data.x86 >= 0x17) - return amd_f17h_perfmon_event_map[hw_event]; + if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19) + return amd_zen2_perfmon_event_map[hw_event]; + + if (cpu_feature_enabled(X86_FEATURE_ZEN1)) + return amd_zen1_perfmon_event_map[hw_event]; return amd_perfmon_event_map[hw_event]; } -- Gitee From c4fc7a3bc02160c350be41ef0008d776f8bf4404 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 25 Mar 2024 13:17:54 +0530 Subject: [PATCH 252/257] perf/x86/amd/core: Define a proper ref-cycles event for Zen 4 and later commit 68cdf1e6e8f2ce78ed7d8f5d80844fd75a9c54ff upstream Add the "ref-cycles" event for AMD processors based on Zen 4 and later microarchitectures. The backing event is based on PMCx120 which counts cycles not in halt state in P0 frequency (same as MPERF). 
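With this entry in place, the generic event can be requested through
the usual perf interface; a minimal user-space sketch (illustration
only, not part of this patch):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	/* resolved via amd_pmu_event_map() to PMCx120 on Zen 4 and later */
	attr.config = PERF_COUNT_HW_REF_CPU_CYCLES;
	attr.disabled = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... measured workload would run here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	read(fd, &count, sizeof(count));
	printf("ref-cycles: %lld\n", count);
	close(fd);
	return 0;
}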
Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Link: https://lore.kernel.org/r/089155f19f7c7e65aeb1caa727a882e2ca9b8b04.1711352180.git.sandipan.das@amd.com Signed-off-by: Hemanth Selam Signed-off-by: mohanasv2 --- arch/x86/events/amd/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 1037489c8368..438ab9243409 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -272,8 +272,23 @@ static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, }; +static const u64 amd_zen4_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x100000120, +}; + static u64 amd_pmu_event_map(int hw_event) { + if (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1a) + return amd_zen4_perfmon_event_map[hw_event]; + if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19) return amd_zen2_perfmon_event_map[hw_event]; -- Gitee From 626cdcbb0c5281245424f94d3dd7fb9234aa64fe Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:33 -0700 Subject: [PATCH 253/257] x86/bugs: Don't fill RSB on VMEXIT with eIBRS+retpoline commit 18bae0dfec15b24ec14ca17dc18603372f5f254f upstream eIBRS protects against guest->host RSB underflow/poisoning attacks. Adding retpoline to the mix doesn't change that. Retpoline has a balanced CALL/RET anyway. So the current full RSB filling on VMEXIT with eIBRS+retpoline is overkill. Disable it or do the VMEXIT_LITE mitigation if needed. 
Suggested-by: Pawan Gupta
Signed-off-by: Josh Poimboeuf
Signed-off-by: Ingo Molnar
Reviewed-by: Pawan Gupta
Reviewed-by: Amit Shah
Reviewed-by: Nikolay Borisov
Cc: Paolo Bonzini
Cc: Vitaly Kuznetsov
Cc: Sean Christopherson
Cc: David Woodhouse
Link: https://lore.kernel.org/r/84a1226e5c9e2698eae1b5ade861f1b8bf3677dc.1744148254.git.jpoimboe@kernel.org
Signed-off-by: Mukesh-Ogare
Signed-off-by: mohanasv2
---
 arch/x86/kernel/cpu/bugs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 71be1bf011be..06d5186fe1cb 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1248,20 +1248,20 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
 	case SPECTRE_V2_NONE:
 		return;
 
-	case SPECTRE_V2_EIBRS_LFENCE:
 	case SPECTRE_V2_EIBRS:
+	case SPECTRE_V2_EIBRS_LFENCE:
+	case SPECTRE_V2_EIBRS_RETPOLINE:
 		if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
-			setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
 			pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+			setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
 		}
 		return;
 
-	case SPECTRE_V2_EIBRS_RETPOLINE:
 	case SPECTRE_V2_RETPOLINE:
 	case SPECTRE_V2_LFENCE:
 	case SPECTRE_V2_IBRS:
-		setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
 		pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
+		setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
 		return;
 	}
 
-- 
Gitee

From dbda533aa53f742b7cfe888aa9cc352ec5a590bc Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Sat, 20 Feb 2021 15:17:11 -0800
Subject: [PATCH 254/257] x86/mm/tlb: Remove unnecessary uses of the inline
 keyword

commit 1608e4cf31b88c8c448ce13aa1d77969dda6bdb7 upstream

The compiler is smart enough without these hints.

Suggested-by: Dave Hansen
Signed-off-by: Nadav Amit
Signed-off-by: Ingo Molnar
Reviewed-by: Dave Hansen
Link: https://lore.kernel.org/r/20210220231712.2475218-9-namit@vmware.com
Signed-off-by: Mukesh-Ogare
Signed-off-by: mohanasv2
---
 arch/x86/mm/tlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 851359b7edc5..085b96f5eb99 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -189,7 +189,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
 {
 	unsigned long next_tif = task_thread_info(next)->flags;
 	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
@@ -735,7 +735,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
 static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
 #endif
 
-static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 			unsigned long start, unsigned long end,
 			unsigned int stride_shift, bool freed_tables,
 			u64 new_tlb_gen)
@@ -761,7 +761,7 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
 	return info;
 }
 
-static inline void put_flush_tlb_info(void)
+static void put_flush_tlb_info(void)
 {
 #ifdef CONFIG_DEBUG_VM
 	/* Complete reentrency prevention checks */
-- 
Gitee

From 3b3e47f64ff4a028eddd5c4caa1c0a2ddf22aa99 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Fri, 8 Jan 2021 23:10:53 +1100
Subject: [PATCH 255/257] x86/mm: Refactor cond_ibpb() to support other use
 cases

commit 371b09c6fdc436f2c7bb67fc90df5eec8ce90f06 upstream

cond_ibpb() has the necessary bits required to track the previous mm in
switch_mm_irqs_off(). This can be reused for other use cases like L1D
flushing on context switch.

Suggested-by: Thomas Gleixner
Signed-off-by: Balbir Singh
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210108121056.21940-3-sblbir@amazon.com
Signed-off-by: Mukesh-Ogare
Signed-off-by: mohanasv2
---
 arch/x86/include/asm/tlbflush.h |  2 +-
 arch/x86/mm/tlb.c               | 53 ++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6f66d841262d..69e6ea20679c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,7 +172,7 @@ struct tlb_state {
 	/* Last user mm for optimizing IBPB */
 	union {
 		struct mm_struct	*last_user_mm;
-		unsigned long		last_user_mm_ibpb;
+		unsigned long		last_user_mm_spec;
 	};
 
 	u16 loaded_mm_asid;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 085b96f5eb99..b9c1b9d85bd8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -33,10 +33,14 @@
  */
 
 /*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_spec.
  */
 #define LAST_USER_MM_IBPB	0x1UL
+#define LAST_USER_MM_SPEC_MASK	(LAST_USER_MM_IBPB)
+
+/* Bits to set when tlbstate and flush is (re)initialized */
+#define LAST_USER_MM_INIT	LAST_USER_MM_IBPB
 
 /*
  * We get here when we do something requiring a TLB invalidation
@@ -189,20 +193,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
-static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
 {
 	unsigned long next_tif = task_thread_info(next)->flags;
-	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+	unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
 
-	return (unsigned long)next->mm | ibpb;
+	return (unsigned long)next->mm | spec_bits;
 }
 
-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigation(struct task_struct *next)
 {
+	unsigned long prev_mm, next_mm;
+
 	if (!next || !next->mm)
 		return;
 
+	next_mm = mm_mangle_tif_spec_bits(next);
+	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
 	/*
+	 * Avoid user/user BTB poisoning by flushing the branch predictor
+	 * when switching between processes. This stops one process from
+	 * doing Spectre-v2 attacks on another.
+	 *
 	 * Both, the conditional and the always IBPB mode use the mm
 	 * pointer to avoid the IBPB when switching between tasks of the
 	 * same process. Using the mm pointer instead of mm->context.ctx_id
@@ -212,8 +225,6 @@ static void cond_ibpb(struct task_struct *next)
 	 * exposed data is not really interesting.
 	 */
 	if (static_branch_likely(&switch_mm_cond_ibpb)) {
-		unsigned long prev_mm, next_mm;
-
 		/*
 		 * This is a bit more complex than the always mode because
 		 * it has to handle two cases:
@@ -243,20 +254,14 @@ static void cond_ibpb(struct task_struct *next)
 		 * Optimize this with reasonably small overhead for the
 		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
 		 * pointer of the incoming task which is stored in
-		 * cpu_tlbstate.last_user_mm_ibpb for comparison.
-		 */
-		next_mm = mm_mangle_tif_spec_ib(next);
-		prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
-		/*
+		 * cpu_tlbstate.last_user_mm_spec for comparison.
+		 *
 		 * Issue IBPB only if the mm's are different and one or
 		 * both have the IBPB bit set.
 		 */
 		if (next_mm != prev_mm &&
 		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
 			indirect_branch_prediction_barrier();
-
-		this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
 	}
 
 	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -265,11 +270,12 @@ static void cond_ibpb(struct task_struct *next)
 		 * Only flush when switching to a user space task with a
 		 * different context than the user space task which ran
 		 * last on this CPU.
		 */
-		if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+		if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+					(unsigned long)next->mm)
 			indirect_branch_prediction_barrier();
-			this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
-		}
 	}
+
+	this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -377,11 +383,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			need_flush = true;
 	} else {
 		/*
-		 * Avoid user/user BTB poisoning by flushing the branch
-		 * predictor when switching between processes. This stops
-		 * one process from doing Spectre-v2 attacks on another.
+		 * Apply process to process speculation vulnerability
+		 * mitigations if applicable.
 		 */
-		cond_ibpb(tsk);
+		cond_mitigation(tsk);
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -507,7 +512,7 @@ void initialize_tlbstate_and_flush(void)
 	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
-	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+	this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
-- 
Gitee

From 3a7816b5b4614dfbc1f6ce7c52376faa492558c4 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Tue, 8 Apr 2025 14:47:34 -0700
Subject: [PATCH 256/257] x86/bugs: Don't fill RSB on context switch with
 eIBRS

commit 27ce8299bc1ec6df8306073785ff82b30b3cc5ee upstream

User->user Spectre v2 attacks (including RSB) across context switches
are already mitigated by IBPB in cond_mitigation(), if enabled globally
or if either the prev or the next task has opted in to protection. RSB
filling without IBPB serves no purpose for protecting user space, as
indirect branches are still vulnerable.

User->kernel RSB attacks are mitigated by eIBRS. In which case the RSB
filling on context switch isn't needed, so remove it.
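For illustration only, the pointer-mangling scheme that cond_mitigation()
relies on can be modeled as a stand-alone user-space sketch (this is not
kernel code; LAST_USER_MM_IBPB mirrors the kernel define, everything
else is simplified):

  #include <stdio.h>

  #define LAST_USER_MM_IBPB      0x1UL
  #define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)

  /* Fold a task's "wants IBPB" bit into its mm pointer, as the kernel
     does with TIF_SPEC_IB, so a single compare covers both. */
  static unsigned long mangle(const void *mm, int wants_ibpb)
  {
          return (unsigned long)mm | (wants_ibpb ? LAST_USER_MM_IBPB : 0);
  }

  /* IBPB only if the mm's differ and at least one side opted in. */
  static int needs_cond_ibpb(unsigned long prev_mm, unsigned long next_mm)
  {
          return next_mm != prev_mm &&
                 ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
  }

  int main(void)
  {
          long a, b;      /* stand-ins for two distinct mm_structs */

          printf("same mm, opted in:  %d\n",
                 needs_cond_ibpb(mangle(&a, 1), mangle(&a, 1)));
          printf("diff mm, opted in:  %d\n",
                 needs_cond_ibpb(mangle(&a, 1), mangle(&b, 0)));
          printf("diff mm, no opt-in: %d\n",
                 needs_cond_ibpb(mangle(&a, 0), mangle(&b, 0)));
          return 0;
  }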
Suggested-by: Pawan Gupta
Signed-off-by: Josh Poimboeuf
Signed-off-by: Ingo Molnar
Reviewed-by: Pawan Gupta
Reviewed-by: Amit Shah
Reviewed-by: Nikolay Borisov
Link: https://lore.kernel.org/r/98cdefe42180358efebf78e3b80752850c7a3e1b.1744148254.git.jpoimboe@kernel.org
Signed-off-by: Mukesh-Ogare
Signed-off-by: mohanasv2
---
 arch/x86/kernel/cpu/bugs.c | 24 ++++++++++++------------
 arch/x86/mm/tlb.c          |  6 +++---
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 06d5186fe1cb..a4e07267e382 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1222,7 +1222,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
 	}
 }
 
-static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
 {
 	/*
 	 * Similar to context switches, there are two types of RSB attacks
@@ -1246,7 +1246,7 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
 	 */
 	switch (mode) {
 	case SPECTRE_V2_NONE:
-		return;
+		break;
 
 	case SPECTRE_V2_EIBRS:
 	case SPECTRE_V2_EIBRS_LFENCE:
@@ -1255,18 +1255,21 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
 			pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
 			setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
 		}
-		return;
+		break;
 
 	case SPECTRE_V2_RETPOLINE:
 	case SPECTRE_V2_LFENCE:
 	case SPECTRE_V2_IBRS:
-		pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
+		pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+		setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
 		setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
-		return;
-	}
+		break;
 
-	pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
-	dump_stack();
+	default:
+		pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+		dump_stack();
+		break;
+	}
 }
 
 static void __init spectre_v2_select_mitigation(void)
@@ -1415,10 +1418,7 @@ static void __init spectre_v2_select_mitigation(void)
 	 *
 	 * FIXME: Is this pointless for retbleed-affected AMD?
 	 */
-	setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
-	pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
-	spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+	spectre_v2_select_rsb_mitigation(mode);
 
 	/*
 	 * Retpoline protects the kernel, but doesn't protect firmware. IBRS
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index b9c1b9d85bd8..47a35e335da2 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -212,9 +212,9 @@ static void cond_mitigation(struct task_struct *next)
 	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
 
 	/*
-	 * Avoid user/user BTB poisoning by flushing the branch predictor
-	 * when switching between processes. This stops one process from
-	 * doing Spectre-v2 attacks on another.
+	 * Avoid user->user BTB/RSB poisoning by flushing them when switching
+	 * between processes. This stops one process from doing Spectre-v2
+	 * attacks on another.
 	 *
 	 * Both, the conditional and the always IBPB mode use the mm
 	 * pointer to avoid the IBPB when switching between tasks of the
-- 
Gitee

From 2966efee2a16d588edc3e25db3525cd818a56c18 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Tue, 8 Apr 2025 14:47:35 -0700
Subject: [PATCH 257/257] x86/bugs: Add RSB mitigation document

commit 83f6665a49c3d44ad0c08f837d352dd290f5d10b upstream

Create a document to summarize hard-earned knowledge about RSB-related
mitigations, with references, and replace the overly verbose yet
incomplete comments with a reference to the document.

Signed-off-by: Josh Poimboeuf
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/ab73f4659ba697a974759f07befd41ae605e33dd.1744148254.git.jpoimboe@kernel.org
Signed-off-by: Mukesh-Ogare
Signed-off-by: mohanasv2
---
 Documentation/admin-guide/hw-vuln/index.rst |   1 +
 Documentation/admin-guide/hw-vuln/rsb.rst   | 268 ++++++++++++++++++++
 arch/x86/kernel/cpu/bugs.c                  |  64 +----
 3 files changed, 282 insertions(+), 51 deletions(-)
 create mode 100644 Documentation/admin-guide/hw-vuln/rsb.rst

diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 2adec1e6520a..812dc0697890 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -16,3 +16,4 @@ are configurable at compile, boot or run time.
    multihit.rst
    special-register-buffer-data-sampling.rst
    processor_mmio_stale_data.rst
+   rsb
diff --git a/Documentation/admin-guide/hw-vuln/rsb.rst b/Documentation/admin-guide/hw-vuln/rsb.rst
new file mode 100644
index 000000000000..21dbf9cf25f8
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/rsb.rst
@@ -0,0 +1,268 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+RSB-related mitigations
+=======================
+
+.. warning::
+   Please keep this document up-to-date, otherwise you will be
+   volunteered to update it and convert it to a very long comment in
+   bugs.c!
+
+Since 2018 there have been many Spectre CVEs related to the Return Stack
+Buffer (RSB) (sometimes referred to as the Return Address Stack (RAS) or
+Return Address Predictor (RAP) on AMD).
+
+Information about these CVEs and how to mitigate them is scattered
+amongst a myriad of microarchitecture-specific documents.
+
+This document attempts to consolidate all the relevant information in
+one place and clarify the reasoning behind the current RSB-related
+mitigations. It's meant to be as concise as possible, focused only on
+the current kernel mitigations: what are the RSB-related attack vectors
+and how are they currently being mitigated?
+
+It's *not* meant to describe how the RSB mechanism operates or how the
+exploits work. More details about those can be found in the references
+below.
+
+Rather, this is basically a glorified comment, but too long to actually
+be one. So when the next CVE comes along, a kernel developer can
+quickly refer to this as a refresher to see what we're actually doing
+and why.
+
+At a high level, there are two classes of RSB attacks: RSB poisoning
+(Intel and AMD) and RSB underflow (Intel only). They must each be
+considered individually for each attack vector (and microarchitecture
+where applicable).
+
+----
+
+RSB poisoning (Intel and AMD)
+=============================
+
+SpectreRSB
+~~~~~~~~~~
+
+RSB poisoning is a technique used by SpectreRSB [#spectre-rsb]_ where
+an attacker poisons an RSB entry to cause a victim's return instruction
+to speculate to an attacker-controlled address. This can happen when
+there are unbalanced CALLs/RETs after a context switch or VMEXIT.
+
+* All attack vectors can potentially be mitigated by flushing out any
+  poisoned RSB entries using an RSB filling sequence
+  [#intel-rsb-filling]_ [#amd-rsb-filling]_ when transitioning between
+  untrusted and trusted domains. But this has a performance impact and
+  should be avoided whenever possible.
+
+  .. DANGER::
+     **FIXME**: Currently we're flushing 32 entries. However, some CPU
+     models have more than 32 entries. The loop count needs to be
+     increased for those. More detailed information is needed about RSB
+     sizes.
+
+* On context switch, the user->user mitigation requires ensuring the
+  RSB gets filled or cleared whenever IBPB gets written [#cond-ibpb]_
+  during a context switch:
+
+  * AMD:
+    On Zen 4+, IBPB (or SBPB [#amd-sbpb]_ if used) clears the RSB.
+    This is indicated by IBPB_RET in CPUID [#amd-ibpb-rsb]_.
+
+    On Zen < 4, the RSB filling sequence [#amd-rsb-filling]_ must
+    always be done in addition to IBPB [#amd-ibpb-no-rsb]_. This is
+    indicated by X86_BUG_IBPB_NO_RET.
+
+  * Intel:
+    IBPB always clears the RSB:
+
+    "Software that executed before the IBPB command cannot control
+    the predicted targets of indirect branches executed after the
+    command on the same logical processor. The term indirect branch
+    in this context includes near return instructions, so these
+    predicted targets may come from the RSB." [#intel-ibpb-rsb]_
+
+* On context switch, user->kernel attacks are prevented by SMEP. User
+  space can only insert user space addresses into the RSB. Even
+  non-canonical addresses can't be inserted due to the page gap at the
+  end of the user canonical address space reserved by TASK_SIZE_MAX.
+  A SMEP #PF at instruction fetch prevents the kernel from speculatively
+  executing user space.
+
+  * AMD:
+    "Finally, branches that are predicted as 'ret' instructions get
+    their predicted targets from the Return Address Predictor (RAP).
+    AMD recommends software use a RAP stuffing sequence (mitigation
+    V2-3 in [2]) and/or Supervisor Mode Execution Protection (SMEP)
+    to ensure that the addresses in the RAP are safe for
+    speculation. Collectively, we refer to these mitigations as "RAP
+    Protection"." [#amd-smep-rsb]_
+
+  * Intel:
+    "On processors with enhanced IBRS, an RSB overwrite sequence may
+    not suffice to prevent the predicted target of a near return
+    from using an RSB entry created in a less privileged predictor
+    mode. Software can prevent this by enabling SMEP (for
+    transitions from user mode to supervisor mode) and by having
+    IA32_SPEC_CTRL.IBRS set during VM exits." [#intel-smep-rsb]_
+
+* On VMEXIT, guest->host attacks are mitigated by eIBRS (and PBRSB
+  mitigation if needed):
+
+  * AMD:
+    "When Automatic IBRS is enabled, the internal return address
+    stack used for return address predictions is cleared on VMEXIT."
+    [#amd-eibrs-vmexit]_
+
+  * Intel:
+    "On processors with enhanced IBRS, an RSB overwrite sequence may
+    not suffice to prevent the predicted target of a near return
+    from using an RSB entry created in a less privileged predictor
+    mode. Software can prevent this by enabling SMEP (for
+    transitions from user mode to supervisor mode) and by having
+    IA32_SPEC_CTRL.IBRS set during VM exits. Processors with
+    enhanced IBRS still support the usage model where IBRS is set
+    only in the OS/VMM for OSes that enable SMEP. To do this, such
+    processors will ensure that guest behavior cannot control the
+    RSB after a VM exit once IBRS is set, even if IBRS was not set
+    at the time of the VM exit." [#intel-eibrs-vmexit]_
+
+  Note that some Intel CPUs are susceptible to Post-barrier Return
+  Stack Buffer Predictions (PBRSB) [#intel-pbrsb]_, where the last
+  CALL from the guest can be used to predict the first unbalanced RET.
+  In this case the PBRSB mitigation is needed in addition to eIBRS.
+
+AMD RETBleed / SRSO / Branch Type Confusion
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On AMD, poisoned RSB entries can also be created by the AMD RETBleed
+variant [#retbleed-paper]_ [#amd-btc]_ or by Speculative Return Stack
+Overflow [#amd-srso]_ (Inception [#inception-paper]_). The kernel
+protects itself by replacing every RET in the kernel with a branch to a
+single safe RET.
+
+----
+
+RSB underflow (Intel only)
+==========================
+
+RSB Alternate (RSBA) ("Intel Retbleed")
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some Intel Skylake-generation CPUs are susceptible to the Intel variant
+of RETBleed [#retbleed-paper]_ (Return Stack Buffer Underflow
+[#intel-rsbu]_). If a RET is executed when the RSB buffer is empty due
+to mismatched CALLs/RETs or returning from a deep call stack, the branch
+predictor can fall back to using the Branch Target Buffer (BTB). If a
+user forces a BTB collision then the RET can speculatively branch to a
+user-controlled address.
+
+* Note that RSB filling doesn't fully mitigate this issue. If there
+  are enough unbalanced RETs, the RSB may still underflow and fall back
+  to using a poisoned BTB entry.
+
+* On context switch, user->user underflow attacks are mitigated by the
+  conditional IBPB [#cond-ibpb]_ on context switch which effectively
+  clears the BTB:
+
+  * "The indirect branch predictor barrier (IBPB) is an indirect branch
+    control mechanism that establishes a barrier, preventing software
+    that executed before the barrier from controlling the predicted
+    targets of indirect branches executed after the barrier on the same
+    logical processor." [#intel-ibpb-btb]_
+
+* On context switch and VMEXIT, user->kernel and guest->host RSB
+  underflows are mitigated by IBRS or eIBRS:
+
+  * "Enabling IBRS (including enhanced IBRS) will mitigate the "RSBU"
+    attack demonstrated by the researchers. As previously documented,
+    Intel recommends the use of enhanced IBRS, where supported. This
+    includes any processor that enumerates RRSBA but not RRSBA_DIS_S."
+    [#intel-rsbu]_
+
+  However, note that eIBRS and IBRS do not mitigate intra-mode attacks.
+  Like RRSBA below, this is mitigated by clearing the BHB on kernel
+  entry.
+
+  As an alternative to classic IBRS, call depth tracking (combined with
+  retpolines) can be used to track kernel returns and fill the RSB when
+  it gets close to being empty.
+
+Restricted RSB Alternate (RRSBA)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some newer Intel CPUs have Restricted RSB Alternate (RRSBA) behavior,
+which, similar to RSBA described above, also falls back to using the BTB
+on RSB underflow. The only difference is that the predicted targets are
+restricted to the current domain when eIBRS is enabled:
+
+* "Restricted RSB Alternate (RRSBA) behavior allows alternate branch
+  predictors to be used by near RET instructions when the RSB is
+  empty. When eIBRS is enabled, the predicted targets of these
+  alternate predictors are restricted to those belonging to the
+  indirect branch predictor entries of the current prediction domain."
+  [#intel-eibrs-rrsba]_
+
+When a CPU with RRSBA is vulnerable to Branch History Injection
+[#bhi-paper]_ [#intel-bhi]_, an RSB underflow could be used for an
+intra-mode BTI attack. This is mitigated by clearing the BHB on
+kernel entry.
+
+However, if the kernel uses retpolines instead of eIBRS, it needs to
+disable RRSBA:
+
+* "Where software is using retpoline as a mitigation for BHI or
+  intra-mode BTI, and the processor both enumerates RRSBA and
+  enumerates RRSBA_DIS controls, it should disable this behavior."
+  [#intel-retpoline-rrsba]_
+
+----
+
+References
+==========
+
+.. [#spectre-rsb] `Spectre Returns! Speculation Attacks using the Return Stack Buffer `_
+
+.. [#intel-rsb-filling] "Empty RSB Mitigation on Skylake-generation" in `Retpoline: A Branch Target Injection Mitigation `_
+
+.. [#amd-rsb-filling] "Mitigation V2-3" in `Software Techniques for Managing Speculation `_
+
+.. [#cond-ibpb] Whether IBPB is written depends on whether the prev and/or next task is protected from Spectre attacks. It typically requires opting in per task or system-wide. For more details see the documentation for the ``spectre_v2_user`` cmdline option in Documentation/admin-guide/kernel-parameters.txt.
+
+.. [#amd-sbpb] IBPB without flushing of branch type predictions. Only exists for AMD.
+
+.. [#amd-ibpb-rsb] "Function 8000_0008h -- Processor Capacity Parameters and Extended Feature Identification" in `AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions `_. SBPB behaves the same way according to `this email `_.
+
+.. [#amd-ibpb-no-rsb] `Spectre Attacks: Exploiting Speculative Execution `_
+
+.. [#intel-ibpb-rsb] "Introduction" in `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_
+
+.. [#amd-smep-rsb] "Existing Mitigations" in `Technical Guidance for Mitigating Branch Type Confusion `_
+
+.. [#intel-smep-rsb] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_
+
+.. [#amd-eibrs-vmexit] "Extended Feature Enable Register (EFER)" in `AMD64 Architecture Programmer's Manual Volume 2: System Programming `_
+
+.. [#intel-eibrs-vmexit] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_
+
+.. [#intel-pbrsb] `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_
+
+.. [#retbleed-paper] `RETBleed: Arbitrary Speculative Code Execution with Return Instructions `_
+
+.. [#amd-btc] `Technical Guidance for Mitigating Branch Type Confusion `_
+
+.. [#amd-srso] `Technical Update Regarding Speculative Return Stack Overflow `_
+
+.. [#inception-paper] `Inception: Exposing New Attack Surfaces with Training in Transient Execution `_
+
+.. [#intel-rsbu] `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_
+
+.. [#intel-ibpb-btb] `Indirect Branch Predictor Barrier `_
+
+.. [#intel-eibrs-rrsba] "Guidance for RSBU" in `Return Stack Buffer Underflow / Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_
+
+.. [#bhi-paper] `Branch History Injection: On the Effectiveness of Hardware Mitigations Against Cross-Privilege Spectre-v2 Attacks `_
+
+.. [#intel-bhi] `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_
+
+.. [#intel-retpoline-rrsba] "Retpoline" in `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index a4e07267e382..38efbd122041 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1225,25 +1225,25 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
 static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
 {
 	/*
-	 * Similar to context switches, there are two types of RSB attacks
-	 * after VM exit:
+	 * WARNING! There are many subtleties to consider when changing *any*
+	 * code related to RSB-related mitigations. Before doing so, carefully
+	 * read the following document, and update if necessary:
 	 *
-	 * 1) RSB underflow
+	 * Documentation/admin-guide/hw-vuln/rsb.rst
 	 *
-	 * 2) Poisoned RSB entry
+	 * In an overly simplified nutshell:
 	 *
-	 * When retpoline is enabled, both are mitigated by filling/clearing
-	 * the RSB.
+	 *   - User->user RSB attacks are conditionally mitigated during
+	 *     context switches by cond_mitigation -> write_ibpb().
 	 *
-	 * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
-	 * prediction isolation protections, RSB still needs to be cleared
-	 * because of #2. Note that SMEP provides no protection here, unlike
-	 * user-space-poisoned RSB entries.
+	 *   - User->kernel and guest->host attacks are mitigated by eIBRS or
+	 *     RSB filling.
 	 *
-	 * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
-	 * bug is present then a LITE version of RSB protection is required,
-	 * just a single call needs to retire before a RET is executed.
+	 *   Though, depending on config, note that other alternative
+	 *   mitigations may end up getting used instead, e.g., IBPB on
+	 *   entry/vmexit, call depth tracking, or return thunks.
 	 */
+
 	switch (mode) {
 	case SPECTRE_V2_NONE:
 		break;
@@ -1380,44 +1380,6 @@ static void __init spectre_v2_select_mitigation(void)
 	spectre_v2_enabled = mode;
 	pr_info("%s\n", spectre_v2_strings[mode]);
 
-	/*
-	 * If Spectre v2 protection has been enabled, fill the RSB during a
-	 * context switch. In general there are two types of RSB attacks
-	 * across context switches, for which the CALLs/RETs may be unbalanced.
-	 *
-	 * 1) RSB underflow
-	 *
-	 * Some Intel parts have "bottomless RSB". When the RSB is empty,
-	 * speculated return targets may come from the branch predictor,
-	 * which could have a user-poisoned BTB or BHB entry.
-	 *
-	 * AMD has it even worse: *all* returns are speculated from the BTB,
-	 * regardless of the state of the RSB.
-	 *
-	 * When IBRS or eIBRS is enabled, the "user -> kernel" attack
-	 * scenario is mitigated by the IBRS branch prediction isolation
-	 * properties, so the RSB buffer filling wouldn't be necessary to
-	 * protect against this type of attack.
-	 *
-	 * The "user -> user" attack scenario is mitigated by RSB filling.
-	 *
-	 * 2) Poisoned RSB entry
-	 *
-	 * If the 'next' in-kernel return stack is shorter than 'prev',
-	 * 'next' could be tricked into speculating with a user-poisoned RSB
-	 * entry.
-	 *
-	 * The "user -> kernel" attack scenario is mitigated by SMEP and
-	 * eIBRS.
-	 *
-	 * The "user -> user" scenario, also known as SpectreBHB, requires
-	 * RSB clearing.
-	 *
-	 * So to mitigate all cases, unconditionally fill RSB on context
-	 * switches.
-	 *
-	 * FIXME: Is this pointless for retbleed-affected AMD?
-	 */
 	spectre_v2_select_rsb_mitigation(mode);
 
 	/*
 	 * Retpoline protects the kernel, but doesn't protect firmware. IBRS
-- 
Gitee
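To check which Spectre v2 mitigation a running kernel actually selected
after this series, the standard sysfs vulnerabilities interface can be
read from user space (a minimal sketch; the pr_info() strings added by
these patches also appear in dmesg):

  #include <stdio.h>

  int main(void)
  {
          /* The kernel reports the chosen Spectre v2 mitigation here. */
          const char *path =
                  "/sys/devices/system/cpu/vulnerabilities/spectre_v2";
          char buf[256];
          FILE *f = fopen(path, "r");

          if (!f) {
                  perror(path);
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  fputs(buf, stdout);
          fclose(f);
          return 0;
  }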