From 639da762499740f0dc8e70e77521c069c3b3e177 Mon Sep 17 00:00:00 2001 From: Xie Xiaodong <624338359@qq.com> Date: Thu, 16 Apr 2026 11:38:03 +0800 Subject: [PATCH 1/5] KVM: arm64: Fix FEAT_TLBIRANGE bugs Fix the local FEAT_TLBIRANGE integration issues in the 5.4 arm64 KVM backport: - use *flush and clear it after memslot range flushes - restore the 5.4 single-name full TLB flush API hook - complete deferred stage-2 unmap range invalidation per walker chunk instead of deferring it until the full hand-written walk completes Upstream arm64 KVM chunks large stage-2 unmaps through stage2_apply_range(), which invokes kvm_pgtable_stage2_unmap() on each chunk. When deferred range invalidation is enabled, each chunk completes its TLBI range before stage2_apply_range() can drop and reacquire mmu_lock via cond_resched_rwlock_write(). Align the 5.4 behaviour with upstream by completing the deferred range TLBI for each walker chunk immediately after unmap_stage2_puds() returns. This keeps FEAT_TLBIRANGE enabled for stage-2 unmap while ensuring stale leaf translations do not survive beyond a chunk that may reschedule. 
Fixes: 1b2905d3582f ("KVM: arm64: Add support for FEAT_TLBIRANGE") Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/Kconfig | 1 + include/linux/kvm_host.h | 6 +++--- virt/kvm/Kconfig | 3 +++ virt/kvm/arm/arm.c | 17 ++++++----------- virt/kvm/arm/mmu.c | 14 +++++++++----- virt/kvm/kvm_main.c | 11 +++++++---- 7 files changed, 30 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 850ef4a6d389..cbca2fc7fd31 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -693,7 +693,7 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); void kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 1d0a3791c017..ca5401afc1fb 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -24,6 +24,7 @@ config KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST select KVM_GENERIC_DIRTYLOG_READ_PROTECT diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9019db30b8a7..ac29d18293fb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -931,13 +931,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif -#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS -static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) +#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { return -ENOTSUPP; } #else -int kvm_arch_flush_remote_tlbs(struct kvm *kvm); +int kvm_arch_flush_remote_tlb(struct kvm *kvm); #endif #ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE diff 
--git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 6ec39b52214c..aad9284c043a 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -36,6 +36,9 @@ config HAVE_KVM_CPU_RELAX_INTERCEPT config KVM_VFIO bool +config HAVE_KVM_ARCH_TLB_FLUSH_ALL + bool + config HAVE_KVM_INVALID_WAKEUPS bool diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 254537f5ed00..34ef6ed11b20 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1274,11 +1274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, * Steps 1-4 below provide general overview of dirty page logging. See * kvm_get_dirty_log_protect() function description for additional details. * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. + * We call kvm_get_dirty_log_protect() to handle steps 1-4. The helper + * flushes the relevant memslot TLBs when needed, even if the subsequent + * copy_to_user() fails and the dirty bitmap may be corrupt. Regardless of + * previous outcome the KVM logging API does not preclude user space + * subsequent dirty log read. Flushing TLB ensures writes will be marked + * dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. 
@@ -1294,9 +1295,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (flush) - kvm_flush_remote_tlbs(kvm); - mutex_unlock(&kvm->slots_lock); return r; } @@ -1310,9 +1308,6 @@ int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *lo r = kvm_clear_dirty_log_protect(kvm, log, &flush); - if (flush) - kvm_flush_remote_tlbs(kvm); - mutex_unlock(&kvm->slots_lock); return r; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index abb5f76454b5..0d293f9f1b39 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -49,12 +49,12 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) } /** - * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * kvm_arch_flush_remote_tlb() - flush all VM TLB entries for v7/8 * @kvm: pointer to kvm structure. * * Interface to HYP function to flush all VM TLB entries */ -int kvm_arch_flush_remote_tlbs(struct kvm *kvm) +int kvm_arch_flush_remote_tlb(struct kvm *kvm) { kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); return 0; @@ -429,10 +429,14 @@ static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size, next = stage2_pgd_addr_end(kvm, addr, end); if (!stage2_pgd_none(kvm, *pgd)) unmap_stage2_puds(kvm, pgd, addr, next); - + if (stage2_unmap_defer_tlb_flush()) - /* Perform the deferred TLB invalidations */ - kvm_tlb_flush_vmid_range(kvm, addr, size); + /* + * Mirror the upstream chunked unmap semantics by + * completing deferred range invalidation for this + * walker chunk before the next iteration can resched. + */ + kvm_tlb_flush_vmid_range(kvm, addr, next - addr); /* * If the range is too large, release the kvm->mmu_lock diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ac74b31efb81..ceb70dc7e7f0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -341,7 +341,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) * kvm_make_all_cpus_request() reads vcpu->mode. 
We reuse that * barrier here. */ - if (!kvm_arch_flush_remote_tlbs(kvm) + if (!kvm_arch_flush_remote_tlb(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); @@ -1343,8 +1343,10 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, } spin_unlock(&kvm->mmu_lock); } - if (flush) + if (*flush) { kvm_flush_remote_tlbs_memslot(kvm, memslot); + *flush = false; + } if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; @@ -1420,9 +1422,10 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm, } } spin_unlock(&kvm->mmu_lock); - - if(flush) + if (*flush) { kvm_flush_remote_tlbs_memslot(kvm, memslot); + *flush = false; + } return 0; } -- Gitee From 88be31467ce78faed50f777fb8066942d11c00c9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:26 +0000 Subject: [PATCH 2/5] arm64/mm: Modify range-based tlbi to decrement scale commit e2768b798a197318736f00c506633cb78ff77012 upstream In preparation for adding support for LPA2 to the tlb invalidation routines, modify the algorithm used by range-based tlbi to start at the highest 'scale' and decrement instead of starting at the lowest 'scale' and incrementing. This new approach makes it possible to maintain 64K alignment as we work through the range, until the last op (at scale=0). This is required when LPA2 is enabled. (This part will be added in a subsequent commit). This change is separated into its own patch because it will also impact non-LPA2 systems, and I want to make it easy to bisect in case it leads to performance regression (see below for benchmarks that suggest this should not be a problem). The original commit (d1d3aa98 "arm64: tlb: Use the TLBI RANGE feature in arm64") stated this as the reason for _incrementing_ scale: However, in most scenarios, the pages = 1 when flush_tlb_range() is called. Start from scale = 3 or other proper value (such as scale =ilog2(pages)), will incur extra overhead. 
So increase 'scale' from 0 to maximum. But pages=1 is already special cased by the non-range invalidation path, which will take care of it the first time through the loop (both in the original commit and in my change), so I don't think switching to decrement scale should have any extra performance impact after all. Indeed benchmarking kernel compilation, a TLBI-heavy workload, suggests that this new approach actually _improves_ performance slightly (using a virtual machine on Apple M2): Table shows time to execute kernel compilation workload with 8 jobs, relative to baseline without this patch (more negative number is bigger speedup). Repeated 9 times across 3 system reboots: | counter | mean | stdev | |:----------|-----------:|----------:| | real-time | -0.6% | 0.0% | | kern-time | -1.6% | 0.5% | | user-time | -0.4% | 0.1% | Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-2-ryan.roberts@arm.com Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/tlbflush.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 11e9d8bd8b75..d1ea1732515f 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -298,14 +298,14 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, * entries one by one at the granularity of 'stride'. If the TLB * range ops are supported, then: * - * 1. If 'pages' is odd, flush the first page through non-range - * operations; + * 1. The minimum range granularity is decided by 'scale', so multiple range + * TLBI operations may be required. Start from scale = 3, flush the largest + * possible number of pages ((num+1)*2^(5*scale+1)) that fit into the + * requested range, then decrement scale and continue until one or zero pages + * are left. * - * 2. 
For remaining pages: the minimum range granularity is decided - * by 'scale', so multiple range TLBI operations may be required. - * Start from scale = 0, flush the corresponding number of pages - * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it - * until no pages left. + * 2. If there is 1 page remaining, flush it through non-range operations. Range + * operations can only span an even number of pages. * * Note that certain ranges can be represented by either num = 31 and * scale or num = 0 and scale + 1. The loop below favours the latter @@ -315,12 +315,12 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, asid, tlb_level, tlbi_user) \ do { \ int num = 0; \ - int scale = 0; \ + int scale = 3; \ unsigned long addr; \ \ while (pages > 0) { \ if (!system_supports_tlb_range() || \ - pages % 2 == 1) { \ + pages == 1) { \ addr = __TLBI_VADDR(start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ @@ -340,7 +340,7 @@ do { \ start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ pages -= __TLBI_RANGE_PAGES(num, scale); \ } \ - scale++; \ + scale--; \ } \ } while (0) -- Gitee From abeb0a387d8b9b9054dc161ae917dfb65277eff2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:50 +1000 Subject: [PATCH 3/5] arm64: tlb: Fix TLBI RANGE operand commit e3ba51ab24fddef79fc212f9840de54db8fd1685 upstream KVM/arm64 relies on TLBI RANGE feature to flush TLBs when the dirty pages are collected by VMM and the page table entries become write protected during live migration. Unfortunately, the operand passed to the TLBI RANGE instruction isn't correctly sorted out due to the commit 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()"). It leads to crash on the destination VM after live migration because TLBs aren't flushed completely and some of the dirty pages are missed. For example, I have a VM where 8GB memory is assigned, starting from 0x40000000 (1GB). Note that the host has 4KB as the base page size. 
In the middle of migration, kvm_tlb_flush_vmid_range() is executed to flush TLBs. It passes MAX_TLBI_RANGE_PAGES as the argument to __kvm_tlb_flush_vmid_range() and __flush_s2_tlb_range_op(). SCALE#3 and NUM#31, corresponding to MAX_TLBI_RANGE_PAGES, isn't supported by __TLBI_RANGE_NUM(). In this specific case, -1 has been returned from __TLBI_RANGE_NUM() for SCALE#3/2/1/0 and rejected by the loop in the __flush_tlb_range_op() until the variable @scale underflows and becomes -9, 0xffff708000040000 is set as the operand. The operand is wrong since it's sorted out by __TLBI_VADDR_RANGE() according to invalid @scale and @num. Fix it by extending __TLBI_RANGE_NUM() to support the combination of SCALE#3 and NUM#31. With the changes, [-1 31] instead of [-1 30] can be returned from the macro, meaning the TLBs for 0x200000 pages in the above example can be flushed in one shot with SCALE#3 and NUM#31. The macro TLBI_RANGE_MASK is dropped since no one uses it any more. The comments are also adjusted accordingly. 
Fixes: 117940aa6e5f ("KVM: arm64: Define kvm_tlb_flush_vmid_range()") Cc: stable@kernel.org # v6.6+ Reported-by: Yihuang Yu Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: Ryan Roberts Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-2-gshan@redhat.com Signed-off-by: Catalin Marinas Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/tlbflush.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index d1ea1732515f..e30854715311 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -157,12 +157,17 @@ static inline unsigned long get_trans_granule(void) #define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) /* - * Generate 'num' values from -1 to 30 with -1 rejected by the - * __flush_tlb_range() loop below. + * Generate 'num' values from -1 to 31 with -1 rejected by the + * __flush_tlb_range() loop below. Its return value is only significant + * for a maximum of MAX_TLBI_RANGE_PAGES pages. If 'pages' is more than + * that, you must iterate over the overall range. */ -#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) -#define __TLBI_RANGE_NUM(pages, scale) \ - ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) +#define __TLBI_RANGE_NUM(pages, scale) \ + ({ \ + unsigned long __pages = min_t(unsigned long, (pages), \ + __TLBI_RANGE_PAGES(31, (scale))); \ + (__pages >> (5 * (scale) + 1)) - 1; \ + }) /* * TLB Invalidation @@ -306,10 +311,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, * * 2. If there is 1 page remaining, flush it through non-range operations. Range * operations can only span an even number of pages. - * - * Note that certain ranges can be represented by either num = 31 and - * scale or num = 0 and scale + 1. 
The loop below favours the latter - * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ asid, tlb_level, tlbi_user) \ -- Gitee From 92380ad2e4ff5cdf6a464a56f0671035fa8578ac Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:51 +1000 Subject: [PATCH 4/5] arm64: tlb: Improve __TLBI_VADDR_RANGE() commit e07255d69702bc9131427fda8f9749355b10780f upstream The macro returns the operand of TLBI RANGE instruction. A mask needs to be applied to each individual field upon producing the operand, to avoid the adjacent fields can interfere with each other when invalid arguments have been provided. The code looks more tidy at least with a mask and FIELD_PREP(). Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-3-gshan@redhat.com Signed-off-by: Will Deacon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/tlbflush.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index e30854715311..b4e4267d4312 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -139,16 +139,24 @@ static inline unsigned long get_trans_granule(void) * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) * */ +#define TLBIR_ASID_MASK GENMASK_ULL(63, 48) +#define TLBIR_TG_MASK GENMASK_ULL(47, 46) +#define TLBIR_SCALE_MASK GENMASK_ULL(45, 44) +#define TLBIR_NUM_MASK GENMASK_ULL(43, 39) +#define TLBIR_TTL_MASK GENMASK_ULL(38, 37) +#define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) + #define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = (addr) >> PAGE_SHIFT; \ - __ta &= GENMASK_ULL(36, 0); \ - __ta |= (unsigned long)(ttl) << 37; \ - __ta |= (unsigned 
long)(num) << 39; \ - __ta |= (unsigned long)(scale) << 44; \ - __ta |= get_trans_granule() << 46; \ - __ta |= (unsigned long)(asid) << 48; \ - __ta; \ + ({ \ + unsigned long __ta = 0; \ + unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ + __ta |= FIELD_PREP(TLBIR_BADDR_MASK, (addr) >> PAGE_SHIFT); \ + __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ + __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ + __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ + __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ + __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ + __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ -- Gitee From 48f9b5ef8d43ce7ae866df6d6106a1c7a962ca20 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:52 +1000 Subject: [PATCH 5/5] arm64: tlb: Allow range operation for MAX_TLBI_RANGE_PAGES commit 73301e464a72a0d007d0d4e0f4d3dab5c58125bf upstream MAX_TLBI_RANGE_PAGES pages is covered by SCALE#3 and NUM#31 and it's supported now. Allow TLBI RANGE operation when the number of pages is equal to MAX_TLBI_RANGE_PAGES in __flush_tlb_range_nosync(). Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-4-gshan@redhat.com Signed-off-by: Will Deacon Signed-off-by: Xie Xiaodong <624338359@qq.com> --- arch/arm64/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index b4e4267d4312..5c5baa7cef53 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -371,11 +371,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, * When not uses TLB range ops, we can handle up to * (MAX_TLBI_OPS - 1) pages; * When uses TLB range ops, we can handle up to - * (MAX_TLBI_RANGE_PAGES - 1) pages. + * MAX_TLBI_RANGE_PAGES pages. 
*/ if ((!system_supports_tlb_range() && (end - start) >= (MAX_TLBI_OPS * stride)) || - pages >= MAX_TLBI_RANGE_PAGES) { + pages > MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; } -- Gitee