From 33175041b584b0ae460a2e5a20e2df5cfac0c782 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 23 Jun 2023 14:14:07 +0300 Subject: [PATCH 01/34] x86/elf: Make loading of 32bit processes depend on ia32_enabled() ANBZ: #33261 commit 5ae2702d7c482edbf002499e23a2e22ac4047af1 upstream. Major aspect of ia32 emulation is the ability to load 32bit processes. That's currently decided (among others) by compat_elf_check_arch(). Make the macro use ia32_enabled() to decide if IA32 compat is enabled before loading a 32bit process. Intel-SIG: commit 5ae2702d7c48 Make loading of 32bit processes depend on ia32_enabled(). Backport Intel FRED architecture for x86-64. Signed-off-by: Nikolay Borisov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230623111409.3047467-5-nik.borisov@suse.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/elf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 18fd06f7936a..a0234dfd1031 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -149,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, -- Gitee From 3e0aa84d32549a270c18ed16ac9c4ad6fda1120f Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:50 -0800 Subject: [PATCH 02/34] x86/cpufeatures,opcode,msr: Add the WRMSRNS instruction support ANBZ: #33261 commit a4cb5ece145828cae35503857debf3d49c9d1c5f upstream. WRMSRNS is an instruction that behaves exactly like WRMSR, with the only difference being that it is not a serializing instruction by default. Under certain conditions, WRMSRNS may replace WRMSR to improve performance. Add its CPU feature bit, opcode to the x86 opcode map, and an always inline API __wrmsrns() to embed WRMSRNS into the code. Intel-SIG: commit a4cb5ece1458 Add the WRMSRNS instruction support. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Acked-by: Masami Hiramatsu (Google) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231205105030.8698-2-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr.h | 18 ++++++++++++++++++ arch/x86/lib/x86-opcode-map.txt | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/lib/x86-opcode-map.txt | 2 +- 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2d8f9554ea3c..9bb4e370e8d0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -349,6 +349,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd28..c284ff9ebe67 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -97,6 +97,19 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } +/* + * WRMSRNS behaves exactly like WRMSR with the only difference being + * that it is not a serializing instruction by default. + */ +static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) +{ + /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ + asm volatile("1: .byte 0x0f,0x01,0xc6\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a"(low), "d" (high)); +} + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -297,6 +310,11 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + __wrmsrns(msr, val, val >> 32); +} + /* * 64-bit version of wrmsr_safe(): */ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index ed01bdce2b4e..db361ba13f3c 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1055,7 +1055,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index bada341b583e..08f2956710e0 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -318,6 +318,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index ed01bdce2b4e..db361ba13f3c 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -1055,7 +1055,7 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms -- Gitee From bbdff13fd41696be33fc6858f2fa7e194063b4fd Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:51 -0800 Subject: [PATCH 03/34] x86/entry: Remove idtentry_sysvec from entry_{32,64}.S ANBZ: #33261 commit 3167b37f82ea8f9da156ff4edf100756bbc9277e upstream. idtentry_sysvec is really just DECLARE_IDTENTRY defined in , no need to define it separately. Intel-SIG: commit 3167b37f82ea Remove idtentry_sysvec from entry_{32,64}.S. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-3-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_32.S | 4 ---- arch/x86/entry/entry_64.S | 8 -------- arch/x86/include/asm/idtentry.h | 2 +- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3894acc54b79..6dca2677b251 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -649,10 +649,6 @@ SYM_CODE_START_LOCAL(asm_\cfunc) SYM_CODE_END(asm_\cfunc) .endm -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /* * Include the defines which emit the idt entries which are shared * shared between 32 and 64 bit and emit the __irqentry_text_* markers diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f9e508ac075..99f2eef48fc8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -421,14 +421,6 @@ SYM_CODE_END(\asmsym) idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm -/* - * System vectors which invoke their handlers directly and are not - * going through the regular common device interrupt handling code. - */ -.macro idtentry_sysvec vector cfunc - idtentry \vector asm_\cfunc \cfunc has_error_code=0 -.endm - /** * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB * @vector: Vector number diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d3e07f07d7c4..07054bfdc066 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -485,7 +485,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ -- Gitee From 5c1447d6525c6fcfaa2f3ba064a0ec5d4fdcbfdd Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:52 -0800 Subject: [PATCH 04/34] x86/trapnr: Add event type macros to ANBZ: #33261 commit 8df719341e8556f1e2bfa0f78fc433db6eba110b upstream. Intel VT-x classifies events into eight different types, which is inherited by FRED for event identification. As such, event types becomes a common x86 concept, and should be defined in a common x86 header. Add event type macros to , and use them in . Intel-SIG: commit 8df719341e85 Add event type macros to . Backport Intel FRED architecture for x86-64. Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-4-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/trapnr.h | 12 ++++++++++++ arch/x86/include/asm/vmx.h | 17 +++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b7..8d1154cdf787 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 3a4f60f19de3..1fe36d0caa04 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include #include +#include #include #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -381,14 +382,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 -- Gitee From 1c59b1b1594299b7d68d5362e94c8aa55b033ddf Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:53 -0800 Subject: [PATCH 05/34] Documentation/x86/64: Add documentation for FRED ANBZ: #33261 commit 51383e741b41748ee80140233ab98ca6b56918b3 upstream. Briefly introduce FRED, and its advantages compared to IDT. Intel-SIG: commit 51383e741b41 Add documentation for FRED. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20231205105030.8698-5-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- Documentation/arch/x86/x86_64/fred.rst | 96 +++++++++++++++++++++++++ Documentation/arch/x86/x86_64/index.rst | 1 + 2 files changed, 97 insertions(+) create mode 100644 Documentation/arch/x86/x86_64/fred.rst diff --git a/Documentation/arch/x86/x86_64/fred.rst b/Documentation/arch/x86/x86_64/fred.rst new file mode 100644 index 000000000000..9f57e7b91f7e --- /dev/null +++ b/Documentation/arch/x86/x86_64/fred.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Flexible Return and Event Delivery (FRED) +========================================= + +Overview +======== + +The FRED architecture defines simple new transitions that change +privilege level (ring transitions). The FRED architecture was +designed with the following goals: + +1) Improve overall performance and response time by replacing event + delivery through the interrupt descriptor table (IDT event + delivery) and event return by the IRET instruction with lower + latency transitions. + +2) Improve software robustness by ensuring that event delivery + establishes the full supervisor context and that event return + establishes the full user context. + +The new transitions defined by the FRED architecture are FRED event +delivery and, for returning from events, two FRED return instructions. +FRED event delivery can effect a transition from ring 3 to ring 0, but +it is used also to deliver events incident to ring 0. One FRED +instruction (ERETU) effects a return from ring 0 to ring 3, while the +other (ERETS) returns while remaining in ring 0. Collectively, FRED +event delivery and the FRED return instructions are FRED transitions. + +In addition to these transitions, the FRED architecture defines a new +instruction (LKGS) for managing the state of the GS segment register. +The LKGS instruction can be used by 64-bit operating systems that do +not use the new FRED transitions. + +Furthermore, the FRED architecture is easy to extend for future CPU +architectures. + +Software based event dispatching +================================ + +FRED operates differently from IDT in terms of event handling. Instead +of directly dispatching an event to its handler based on the event +vector, FRED requires the software to dispatch an event to its handler +based on both the event's type and vector. Therefore, an event dispatch +framework must be implemented to facilitate the event-to-handler +dispatch process. The FRED event dispatch framework takes control +once an event is delivered, and employs a two-level dispatch. + +The first level dispatching is event type based, and the second level +dispatching is event vector based. + +Full supervisor/user context +============================ + +FRED event delivery atomically save and restore full supervisor/user +context upon event delivery and return. Thus it avoids the problem of +transient states due to %cr2 and/or %dr6, and it is no longer needed +to handle all the ugly corner cases caused by half baked entry states. + +FRED allows explicit unblock of NMI with new event return instructions +ERETS/ERETU, avoiding the mess caused by IRET which unconditionally +unblocks NMI, e.g., when an exception happens during NMI handling. + +FRED always restores the full value of %rsp, thus ESPFIX is no longer +needed when FRED is enabled. + +LKGS +==== + +LKGS behaves like the MOV to GS instruction except that it loads the +base address into the IA32_KERNEL_GS_BASE MSR instead of the GS +segment’s descriptor cache. With LKGS, it ends up with avoiding +mucking with kernel GS, i.e., an operating system can always operate +with its own GS base address. + +Because FRED event delivery from ring 3 and ERETU both swap the value +of the GS base address and that of the IA32_KERNEL_GS_BASE MSR, plus +the introduction of LKGS instruction, the SWAPGS instruction is no +longer needed when FRED is enabled, thus is disallowed (#UD). + +Stack levels +============ + +4 stack levels 0~3 are introduced to replace the nonreentrant IST for +event handling, and each stack level should be configured to use a +dedicated stack. + +The current stack level could be unchanged or go higher upon FRED +event delivery. If unchanged, the CPU keeps using the current event +stack. If higher, the CPU switches to a new event stack specified by +the MSR of the new stack level, i.e., MSR_IA32_FRED_RSP[123]. + +Only execution of a FRED return instruction ERET[US], could lower the +current stack level, causing the CPU to switch back to the stack it was +on before a previous event delivery that promoted the stack level. diff --git a/Documentation/arch/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst index a56070fc8e77..ad15e9bd623f 100644 --- a/Documentation/arch/x86/x86_64/index.rst +++ b/Documentation/arch/x86/x86_64/index.rst @@ -15,3 +15,4 @@ x86_64 Support cpu-hotplug-spec machinecheck fsgs + fred -- Gitee From 863663c0ea777d3d93d0d2d310ac8ed17c8e1d30 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:54 -0800 Subject: [PATCH 06/34] x86/fred: Add Kconfig option for FRED (CONFIG_X86_FRED) ANBZ: #33261 commit 2cce95918d635126098d784c040b59333c464b20 upstream. Add the configuration option CONFIG_X86_FRED to enable FRED. Intel-SIG: commit 2cce95918d63 Add Kconfig option for FRED (CONFIG_X86_FRED). Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-6-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d74eef96d693..a6c5ab3de252 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -515,6 +515,15 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_FRED + bool "Flexible Return and Event Delivery" + depends on X86_64 + help + When enabled, try to use Flexible Return and Event Delivery + instead of the legacy SYSCALL/SYSENTER/IDT architecture for + ring transitions and exception/interrupt handling if the + system supports. + if X86_32 config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" -- Gitee From 294853b863ec3208ee732fc95af0467ad2fbcf47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 08:50:20 +0800 Subject: [PATCH 07/34] x86/cpufeatures: Add the CPU feature bit for FRED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #33261 commit 51c158f7aaccc6f6423a61a1df4a0d4c0d9d22a9 upstream. Any FRED enabled CPU will always have the following features as its baseline: 1) LKGS, load attributes of the GS segment but the base address into the IA32_KERNEL_GS_BASE MSR instead of the GS segment’s descriptor cache. 2) WRMSRNS, non-serializing WRMSR for faster MSR writes. Intel-SIG: commit 51c158f7aacc Add the CPU feature bit for FRED MIME-Version: 1.0. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-7-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 2 ++ tools/arch/x86/include/asm/cpufeatures.h | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9bb4e370e8d0..e32ffbb289a9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -348,6 +348,7 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index cd35e820d664..6bec16d64c72 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -83,6 +83,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, { X86_FEATURE_CRC32C_LOW_PERF, X86_FEATURE_XMM4_2 }, {} }; diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 08f2956710e0..7aed70598932 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -317,6 +317,7 @@ #define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ #define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -- Gitee From ae011d69b977bf90f96478b6212eeccf70cb03dd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 08:54:24 +0800 Subject: [PATCH 08/34] x86/fred: Disable FRED support if CONFIG_X86_FRED is disabled ANBZ: #33261 commit e554a8ca49d6d6d782f546ae4d7f036946e7dd87 upstream. Add CONFIG_X86_FRED to to make cpu_feature_enabled() work correctly with FRED. Intel-SIG: commit e554a8ca49d6 Disable FRED support if CONFIG_X86_FRED is disabled. Backport Intel FRED architecture for x86-64. Originally-by: Megha Dey Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-8-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/disabled-features.h | 8 +++++++- tools/arch/x86/include/asm/disabled-features.h | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 6d2df02536bb..e06bad1576e3 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -125,6 +125,12 @@ #define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -141,7 +147,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 (DISABLE_INVLPGB) #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index be3fef5e80ba..abcb8bb459b4 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -105,6 +105,12 @@ # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) #endif +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -121,7 +127,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING) -#define DISABLED_MASK12 (DISABLE_LAM) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -- Gitee From 77d670599675a9a004a009d7008c63985a100873 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:49:57 -0800 Subject: [PATCH 09/34] x86/fred: Add a fred= cmdline param ANBZ: #33261 commit 3810da12710aaa05c6101418675c923642a80c0c upstream. Let command line option "fred" accept multiple options to make it easier to tweak its behavior. Currently, two options 'on' and 'off' are allowed, and the default behavior is to disable FRED. To enable FRED, append "fred=on" to the kernel command line. [ bp: Use cpu_feature_enabled(), touch ups. ] Intel-SIG: commit 3810da12710a Add a fred= cmdline param. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-9-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- .../admin-guide/kernel-parameters.txt | 6 +++++ arch/x86/kernel/traps.c | 26 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 506c7e7b8b89..65cda51e36b4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1538,6 +1538,12 @@ Warning: use of this parameter will taint the kernel and may cause unknown problems. + fred= [X86-64] + Enable/disable Flexible Return and Event Delivery. + Format: { on | off } + on: enable FRED when it's present. + off: disable FRED, the default setting. + ftrace=[tracer] [FTRACE] will set and start the specified tracer as early as possible in order to facilitate early diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 555a7b640437..3de80cf2d332 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1479,8 +1479,34 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif +/* Do not enable FRED by default yet. */ +static bool enable_fred __ro_after_init = false; + +#ifdef CONFIG_X86_FRED +static int __init fred_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + return 0; + + if (!strcmp(str, "on")) + enable_fred = true; + else if (!strcmp(str, "off")) + enable_fred = false; + else + pr_warn("invalid FRED option: 'fred=%s'\n", str); + return 0; +} +early_param("fred", fred_setup); +#endif + void __init trap_init(void) { + if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) + setup_clear_cpu_cap(X86_FEATURE_FRED); + /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); -- Gitee From 6c47ae1c06c6a736804845ef6ba29ec566e49d4a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:58 -0800 Subject: [PATCH 10/34] x86/opcode: Add ERET[US] instructions to the x86 opcode map ANBZ: #33261 commit 0115f8b1a26ef49338cdb0bd98ad374b8586d0fd upstream. ERETU returns from an event handler while making a transition to ring 3, and ERETS returns from an event handler while staying in ring 0. Add instruction opcodes used by ERET[US] to the x86 opcode map; opcode numbers are per FRED spec v5.0. Intel-SIG: commit 0115f8b1a26e Add ERET[US] instructions to the x86 opcode map. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20231205105030.8698-10-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/lib/x86-opcode-map.txt | 2 +- tools/arch/x86/lib/x86-opcode-map.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index db361ba13f3c..7cff5db832fd 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1056,7 +1056,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index db361ba13f3c..7cff5db832fd 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -1056,7 +1056,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv -- Gitee From 97a974b0a2dca1e3dec9f907775432efad65d876 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:49:59 -0800 Subject: [PATCH 11/34] x86/objtool: Teach objtool about ERET[US] ANBZ: #33261 commit cd19bab825bda5bb192ef1d22e67d069daf2efb8 upstream. Update the objtool decoder to know about the ERET[US] instructions (type INSN_CONTEXT_SWITCH). Intel-SIG: commit cd19bab825bd Teach objtool about ERET[US]. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-11-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- tools/objtool/arch/x86/decode.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index c0f25d00181e..6999f478c155 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -509,11 +509,20 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec if (op2 == 0x01) { - if (modrm == 0xca) - insn->type = INSN_CLAC; - else if (modrm == 0xcb) - insn->type = INSN_STAC; - + switch (insn_last_prefix_id(&ins)) { + case INAT_PFX_REPE: + case INAT_PFX_REPNE: + if (modrm == 0xca) + /* eretu/erets */ + insn->type = INSN_CONTEXT_SWITCH; + break; + default: + if (modrm == 0xca) + insn->type = INSN_CLAC; + else if (modrm == 0xcb) + insn->type = INSN_STAC; + break; + } } else if (op2 >= 0x80 && op2 <= 0x8f) { insn->type = INSN_JUMP_CONDITIONAL; -- Gitee From 12f05d43372dbb4e97b37e6145dbf9923acdaa09 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:00 -0800 Subject: [PATCH 12/34] x86/cpu: Add X86_CR4_FRED macro ANBZ: #33261 commit ff45746fbf005f96e42bea466698e3fdbf926013 upstream. Add X86_CR4_FRED macro for the FRED bit in %cr4. This bit must not be changed after initialization, so add it to the pinned CR4 bits. Intel-SIG: commit ff45746fbf00 Add X86_CR4_FRED macro. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-12-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/uapi/asm/processor-flags.h | 7 +++++++ arch/x86/kernel/cpu/common.c | 5 ++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index d898432947ff..f1a4adc78272 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -139,6 +139,13 @@ #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) +#ifdef __x86_64__ +#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */ +#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT) +#else +#define X86_CR4_FRED (0) +#endif + /* * x86-64 Task Priority Register, CR8 */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b27077af3bac..59664392d6be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -404,9 +404,8 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c) } /* These bits should not change their value after CPU init is finished. */ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; -- Gitee From 68590d1661fec56987a1901a9e52a1707d58a054 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 08:55:57 +0800 Subject: [PATCH 13/34] x86/cpu: Add MSR numbers for FRED configuration ANBZ: #33261 commit cd6df3f378f63f5d6dce0987169b182be1cb427c upstream. Add MSR numbers for the FRED configuration registers per FRED spec 5.0. Intel-SIG: commit cd6df3f378f6 Add MSR numbers for FRED configuration. Backport Intel FRED architecture for x86-64. Originally-by: Megha Dey Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-13-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/msr-index.h | 13 ++++++++++++- tools/arch/x86/include/asm/msr-index.h | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 2839a0287167..06ba89685bbd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -38,8 +38,19 @@ #define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index b08293060367..7eac0ae73b0f 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -38,8 +38,19 @@ #define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) -- Gitee From b839f9adf9b413f6dd52d54b7f6a2b142a8022db Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:04 -0800 Subject: [PATCH 14/34] x86/fred: Add a new header file for FRED definitions ANBZ: #33261 commit 32b09c230392ca4c03fcbade9e28b2053f11396b upstream. Add a header file for FRED prototypes and definitions. Intel-SIG: commit 32b09c230392 Add a new header file for FRED definitions. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-16-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/fred.h | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/fred.h diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 000000000000..f514fdb5a39f --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include + +#include + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include + +#include + +struct fred_info { + /* Event data: CR2, DR6, ... */ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ -- Gitee From 0de2c7eb18daae283d005a4145cda40c475e858b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:05 -0800 Subject: [PATCH 15/34] x86/fred: Reserve space for the FRED stack frame ANBZ: #33261 commit 65c9cc9e2c14602d98f1ca61c51ac954e9529303 upstream. When using FRED, reserve space at the top of the stack frame, just like i386 does. Intel-SIG: commit 65c9cc9e2c14 Reserve space for the FRED stack frame. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-17-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/thread_info.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1c62f52a6ae8..98f601a9a5f0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. */ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* -- Gitee From 74a0021ea8cfebcc9a81b512e6cc74871970765b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:06 -0800 Subject: [PATCH 16/34] x86/fred: Update MSR_IA32_FRED_RSP0 during task switch ANBZ: #33261 commit 9356c4b8886c4f7d3436c3f7fe31715bdcf1c79e upstream. MSR_IA32_FRED_RSP0 is used during ring 3 event delivery, and needs to be updated to point to the top of next task stack during task switch. Intel-SIG: commit 9356c4b8886c Update MSR_IA32_FRED_RSP0 during task switch. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-18-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/include/asm/switch_to.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b..c3bd0c0758c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,9 +70,13 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* WRMSRNS is a baseline feature for FRED. */ + wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); + } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); + } #endif } -- Gitee From d36db4c67a1e411082ef3da0bc61ac492fd414de Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:07 -0800 Subject: [PATCH 17/34] x86/fred: Disallow the swapgs instruction when FRED is enabled ANBZ: #33261 commit 09794f68936a017e5632774c3e4450bebbcca2cb upstream. SWAPGS is no longer needed thus NOT allowed with FRED because FRED transitions ensure that an operating system can _always_ operate with its own GS base address: - For events that occur in ring 3, FRED event delivery swaps the GS base address with the IA32_KERNEL_GS_BASE MSR. - ERETU (the FRED transition that returns to ring 3) also swaps the GS base address with the IA32_KERNEL_GS_BASE MSR. And the operating system can still setup the GS segment for a user thread without the need of loading a user thread GS with: - Using LKGS, available with FRED, to modify other attributes of the GS segment without compromising its ability always to operate with its own GS base address. - Accessing the GS segment base address for a user thread as before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE MSR instead of the GS segment's descriptor cache. As such, the operating system never changes its runtime GS base address. Intel-SIG: commit 09794f68936a Disallow the swapgs instruction when FRED is enabled. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-19-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/process_64.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fbadfa84a02d..0e4eb4485f26 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -168,7 +168,29 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* + * SWAPGS is no longer needed thus NOT allowed with FRED because + * FRED transitions ensure that an operating system can _always_ + * operate with its own GS base address: + * - For events that occur in ring 3, FRED event delivery swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * - ERETU (the FRED transition that returns to ring 3) also swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * + * And the operating system can still setup the GS segment for a + * user thread without the need of loading a user thread GS with: + * - Using LKGS, available with FRED, to modify other attributes + * of the GS segment without compromising its ability always to + * operate with its own GS base address. + * - Accessing the GS segment base address for a user thread as + * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. + * + * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE + * MSR instead of the GS segment’s descriptor cache. As such, the + * operating system never changes its runtime GS base address. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -193,7 +215,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); -- Gitee From 1d7c1e3c7a094498b31d733491820624991bae87 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:08 -0800 Subject: [PATCH 18/34] x86/fred: No ESPFIX needed when FRED is enabled ANBZ: #33261 commit df8838737b3612eea024fce5ffce0b23dafe5058 upstream. Because FRED always restores the full value of %rsp, ESPFIX is no longer needed when it's enabled. Intel-SIG: commit df8838737b36 No ESPFIX needed when FRED is enabled. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-20-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/espfix_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 16f9814c9be0..6726e0473d0b 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -106,6 +106,10 @@ void __init init_espfix_bsp(void) pgd_t *pgd; p4d_t *p4d; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* Install the espfix pud into the kernel page directory */ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); @@ -129,6 +133,10 @@ void init_espfix_ap(int cpu) void *stack_page; pteval_t ptemask; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* We only have to do this once... */ if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ -- Gitee From dd188dc7c619d9a6ebcf489358d3f6d1f95668f5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:09 -0800 Subject: [PATCH 19/34] x86/fred: Allow single-step trap and NMI when starting a new task ANBZ: #33261 commit ad41a14cc2d66229479d73e4a7dc1fda26827666 upstream. Entering a new task is logically speaking a return from a system call (exec, fork, clone, etc.). As such, if ptrace enables single stepping a single step exception should be allowed to trigger immediately upon entering user space. This is not optional. NMI should *never* be disabled in user space. As such, this is an optional, opportunistic way to catch errors. Allow single-step trap and NMI when starting a new task, thus once the new task enters user space, single-step trap and NMI are both enabled immediately. Intel-SIG: commit ad41a14cc2d6 Allow single-step trap and NMI when starting a new task. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-21-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/process_64.c | 38 ++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0e4eb4485f26..ead39141b8e5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,6 +56,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include @@ -530,7 +531,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, - unsigned int _cs, unsigned int _ss, unsigned int _ds) + u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); @@ -547,11 +548,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(ds, _ds); load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - regs->cs = _cs; - regs->ss = _ss; - regs->flags = X86_EFLAGS_IF; + regs->ip = new_ip; + regs->sp = new_sp; + regs->csx = _cs; + regs->ssx = _ss; + /* + * Allow single-step trap and NMI when starting a new task, thus + * once the new task enters user space, single-step trap and NMI + * are both enabled immediately. + * + * Entering a new task is logically speaking a return from a + * system call (exec, fork, clone, etc.). As such, if ptrace + * enables single stepping a single step exception should be + * allowed to trigger immediately upon entering user space. + * This is not optional. + * + * NMI should *never* be disabled in user space. As such, this + * is an optional, opportunistic way to catch errors. + * + * Paranoia: High-order 48 bits above the lowest 16 bit SS are + * discarded by the legacy IRET instruction on all Intel, AMD, + * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, + * even when FRED is not enabled. But we choose the safer side + * to use these bits only when FRED is enabled. + */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + regs->fred_ss.swevent = true; + regs->fred_ss.nmi = true; + } + + regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void -- Gitee From 310c80b59673307f1e119a14b5aed3c7c58121f8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:10 -0800 Subject: [PATCH 20/34] x86/fred: Make exc_page_fault() work for FRED ANBZ: #33261 commit 58c80cc55e079933205597ecf846583c5e6e4946 upstream. On a FRED system, the faulting address (CR2) is passed on the stack, to avoid the problem of transient state. Thus the page fault address is read from the FRED stack frame instead of CR2 when FRED is enabled. Intel-SIG: commit 58c80cc55e07 Make exc_page_fault() work for FRED. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-22-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7add6b93a7b4..5f0bc092baa0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -34,6 +34,7 @@ #include /* kvm_handle_async_pf */ #include /* fixup_vdso_exception() */ #include +#include #define CREATE_TRACE_POINTS #include @@ -1481,8 +1482,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); -- Gitee From 59f32896976f2a72e4d60d5bea405cb5ff40d69f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 12:06:18 +0800 Subject: [PATCH 21/34] x86/fred: Add a debug fault entry stub for FRED ANBZ: #33261 commit 99fcc968e7c4b117c91f7d03c302d860b74b947b upstream. When occurred on different ring level, i.e., from user or kernel context, stack, while kernel #DB on a dedicated stack. This is exactly how FRED event delivery invokes an exception handler: ring 3 event on level 0 stack, i.e., current task stack; ring 0 event on the #DB dedicated stack specified in the IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception entry stub doesn't do stack switch. On a FRED system, the debug trap status information (DR6) is passed on the stack, to avoid the problem of transient state. Furthermore, FRED transitions avoid a lot of ugly corner cases the handling of which can, and should be, skipped. The FRED debug trap status information saved on the stack differs from DR6 in both stickiness and polarity; it is exactly in the format which debug_read_clear_dr6() returns for the IDT entry points. Intel-SIG: commit 99fcc968e7c4 Add a debug fault entry stub for FRED. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-24-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/traps.c | 43 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3de80cf2d332..8b636bfe6d04 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -1045,8 +1046,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -1058,6 +1058,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -1087,7 +1092,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1119,8 +1125,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1204,6 +1209,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_reset_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_clear_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) -- Gitee From 0bbd620ca7c82dd684372fda417bda91f6edb25b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Fri, 15 Dec 2023 22:31:39 -0800 Subject: [PATCH 22/34] x86/fred: Add a NMI entry stub for FRED ANBZ: #33261 commit f8b8ee45f82b681606d288bcec89c9071b4079fc upstream. On a FRED system, NMIs nest both with themselves and faults, transient information is saved into the stack frame, and NMI unblocking only happens when the stack frame indicates that so should happen. Thus, the NMI entry stub for FRED is really quite small... Intel-SIG: commit f8b8ee45f82b Add a NMI entry stub for FRED. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231216063139.25567-1-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/nmi.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35fd5f1444fd..eb15d82327ca 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -35,6 +35,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -690,6 +691,47 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) #endif +#ifdef CONFIG_X86_FRED +/* + * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED + * event delivery, i.e., there is no problem of transient states. + * And NMI unblocking only happens when the stack frame indicates + * that so should happen. + * + * Thus, the NMI entry stub for FRED is really straightforward and + * as simple as most exception handlers. As such, #DB is allowed + * during NMI handling. + */ +DEFINE_FREDENTRY_NMI(exc_nmi) +{ + irqentry_state_t irq_state; + + if (arch_cpu_is_offline(smp_processor_id())) { + if (microcode_nmi_handler_enabled()) + microcode_offline_nmi_handler(); + return; + } + + /* + * Save CR2 for eventual restore to cover the case where the NMI + * hits the VMENTER/VMEXIT region where guest CR2 is life. This + * prevents guest state corruption in case that the NMI handler + * takes a page fault. + */ + this_cpu_write(nmi_cr2, read_cr2()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); +} +#endif + void stop_nmi(void) { ignore_nmis++; -- Gitee From ed990728fd1f55d0d9eab560ac1738dcc28e4b5e Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 16 Apr 2026 12:08:42 +0800 Subject: [PATCH 23/34] x86/fred: Add a machine check entry stub for FRED ANBZ: #33261 commit ffa4901f0e004e1a0a4e18a2452a1fcc27277cc0 upstream. Like #DB, when occurred on different ring level, i.e., from user or kernel context, #MCE needs to be handled on different stack: User #MCE on current task stack, while kernel #MCE on a dedicated stack. This is exactly how FRED event delivery invokes an exception handler: ring 3 event on level 0 stack, i.e., current task stack; ring 0 event on the the FRED machine check entry stub doesn't do stack switch. Intel-SIG: commit ffa4901f0e00 Add a machine check entry stub for FRED. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-26-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/cpu/mce/core.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4277293aa2a5..10c3210662b3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -2163,6 +2164,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) -- Gitee From 660ee0a9fe77bd8c45e8f0e96d4eb91aacc6663e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 16:03:52 +0800 Subject: [PATCH 24/34] x86/fred: FRED entry/exit and dispatch code ANBZ: #33261 commit 14619d912b658ecd9573fb88400d3830a29cadcb upstream. The code to actually handle kernel and event entry/exit using FRED. It is split up into two files thus: - entry_64_fred.S contains the actual entrypoints and exit code, and saves and restores registers. - entry_fred.c contains the two-level event dispatch code for FRED. The first-level dispatch is on the event type, and the second-level is on the event vector. [ bp: Fold in an allmodconfig clang build fix: https://lore.kernel.org/r/20240129064521.5168-1-xin3.li@intel.com and a CONFIG_IA32_EMULATION=n build fix: https://lore.kernel.org/r/20240127093728.1323-3-xin3.li@intel.com] Intel-SIG: commit 14619d912b65 FRED entry/exit and dispatch code. Backport Intel FRED architecture for x86-64. Suggested-by: Thomas Gleixner Originally-by: Megha Dey Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231209214214.2932-1-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/Makefile | 5 +- arch/x86/entry/entry_64_fred.S | 50 ++++++ arch/x86/entry/entry_fred.c | 245 ++++++++++++++++++++++++++ arch/x86/include/asm/asm-prototypes.h | 1 + arch/x86/include/asm/fred.h | 6 + arch/x86/include/asm/ia32.h | 4 +- 6 files changed, 308 insertions(+), 3 deletions(-) create mode 100644 arch/x86/entry/entry_64_fred.S create mode 100644 arch/x86/entry/entry_fred.c diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ca2fe186994b..c93e7f5c2a06 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -18,6 +18,9 @@ obj-y += vdso/ obj-y += vsyscall/ obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o +CFLAGS_entry_fred.o += -fno-stack-protector +CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) +obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o + obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o - diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S new file mode 100644 index 000000000000..c1ddaf6b068f --- /dev/null +++ b/arch/x86/entry/entry_64_fred.S @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The actual FRED entry points. + */ + +#include + +#include "calling.h" + + .code64 + .section .noinstr.text, "ax" + +.macro FRED_ENTER + UNWIND_HINT_END_OF_STACK + ENDBR + PUSH_AND_CLEAR_REGS + movq %rsp, %rdi /* %rdi -> pt_regs */ +.endm + +.macro FRED_EXIT + UNWIND_HINT_REGS + POP_REGS +.endm + +/* + * The new RIP value that FRED event delivery establishes is + * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3. + * Thus the FRED ring 3 entry point must be 4K page aligned. + */ + .align 4096 + +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) + FRED_ENTER + call fred_entry_from_user + FRED_EXIT + ERETU +SYM_CODE_END(asm_fred_entrypoint_user) + +/* + * The new RIP value that FRED event delivery establishes is + * (IA32_FRED_CONFIG & ~FFFH) + 256 for events that occur in + * ring 0, i.e., asm_fred_entrypoint_user + 256. + */ + .org asm_fred_entrypoint_user + 256, 0xcc +SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) + FRED_ENTER + call fred_entry_from_kernel + FRED_EXIT + ERETS +SYM_CODE_END(asm_fred_entrypoint_kernel) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c new file mode 100644 index 000000000000..125b62311b31 --- /dev/null +++ b/arch/x86/entry/entry_fred.c @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The FRED specific kernel/user entry functions which are invoked from + * assembly code and dispatch to the associated handlers. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* FRED EVENT_TYPE_OTHER vector numbers */ +#define FRED_SYSCALL 1 +#define FRED_SYSENTER 2 + +static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code) +{ + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + /* Panic on events from a high stack level */ + if (regs->fred_cs.sl > 0) { + pr_emerg("PANIC: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + die("invalid or fatal FRED event", regs, regs->orig_ax); + panic("invalid or fatal FRED event"); + } else { + unsigned long flags = oops_begin(); + int sig = SIGKILL; + + pr_alert("BUG: invalid or fatal FRED event; event type %u " + "vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n", + regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax, + fred_event_data(regs), regs->cs, regs->ip); + + if (__die("Invalid or fatal FRED event", regs, regs->orig_ax)) + sig = 0; + + oops_end(flags, regs, sig); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +static noinstr void fred_intx(struct pt_regs *regs) +{ + switch (regs->fred_ss.vector) { + /* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */ + case X86_TRAP_BP: + return exc_int3(regs); + + /* Opcode 0xcd, 0x4, NOT INTO (opcode 0xce) */ + case X86_TRAP_OF: + return exc_overflow(regs); + +#ifdef CONFIG_IA32_EMULATION + /* INT80 */ + case IA32_SYSCALL_VECTOR: + if (ia32_enabled()) + return int80_emulation(regs); + fallthrough; +#endif + + default: + return exc_general_protection(regs, 0); + } +} + +static __always_inline void fred_other(struct pt_regs *regs) +{ + /* The compiler can fold these conditions into a single test */ + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_syscall_64(regs, regs->orig_ax); + return; + } else if (ia32_enabled() && + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + regs->orig_ax = regs->ax; + regs->ax = -ENOSYS; + do_fast_syscall_32(regs); + return; + } else { + exc_invalid_op(regs); + return; + } +} + +#define SYSVEC(_vector, _function) [_vector - FIRST_SYSTEM_VECTOR] = fred_sysvec_##_function + +static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { + SYSVEC(ERROR_APIC_VECTOR, error_interrupt), + SYSVEC(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), + SYSVEC(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + SYSVEC(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + + SYSVEC(RESCHEDULE_VECTOR, reschedule_ipi), + SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, call_function_single), + SYSVEC(CALL_FUNCTION_VECTOR, call_function), + SYSVEC(REBOOT_VECTOR, reboot), + + SYSVEC(THRESHOLD_APIC_VECTOR, threshold), + SYSVEC(DEFERRED_ERROR_VECTOR, deferred_error), + SYSVEC(THERMAL_APIC_VECTOR, thermal), + + SYSVEC(IRQ_WORK_VECTOR, irq_work), + + SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +}; + +static noinstr void fred_extint(struct pt_regs *regs) +{ + unsigned int vector = regs->fred_ss.vector; + unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS); + + if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) + return; + + if (likely(vector >= FIRST_SYSTEM_VECTOR)) { + irqentry_state_t state = irqentry_enter(regs); + + instrumentation_begin(); + sysvec_table[index](regs); + instrumentation_end(); + irqentry_exit(regs, state); + } else { + common_interrupt(regs, vector); + } +} + +static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code) +{ + /* Optimize for #PF. That's the only exception which matters performance wise */ + if (likely(regs->fred_ss.vector == X86_TRAP_PF)) + return exc_page_fault(regs, error_code); + + switch (regs->fred_ss.vector) { + case X86_TRAP_DE: return exc_divide_error(regs); + case X86_TRAP_DB: return fred_exc_debug(regs); + case X86_TRAP_BR: return exc_bounds(regs); + case X86_TRAP_UD: return exc_invalid_op(regs); + case X86_TRAP_NM: return exc_device_not_available(regs); + case X86_TRAP_DF: return exc_double_fault(regs, error_code); + case X86_TRAP_TS: return exc_invalid_tss(regs, error_code); + case X86_TRAP_NP: return exc_segment_not_present(regs, error_code); + case X86_TRAP_SS: return exc_stack_segment(regs, error_code); + case X86_TRAP_GP: return exc_general_protection(regs, error_code); + case X86_TRAP_MF: return exc_coprocessor_error(regs); + case X86_TRAP_AC: return exc_alignment_check(regs, error_code); + case X86_TRAP_XF: return exc_simd_coprocessor_error(regs); + +#ifdef CONFIG_X86_MCE + case X86_TRAP_MC: return fred_exc_machine_check(regs); +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + case X86_TRAP_VE: return exc_virtualization_exception(regs); +#endif +#ifdef CONFIG_X86_CET + case X86_TRAP_CP: return exc_control_protection(regs, error_code); +#endif + default: return fred_bad_type(regs, error_code); + } + +} + +static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code) +{ + switch (regs->fred_ss.vector) { + case X86_TRAP_BP: return exc_int3(regs); + case X86_TRAP_OF: return exc_overflow(regs); + default: return fred_bad_type(regs, error_code); + } +} + +__visible noinstr void fred_entry_from_user(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_SWINT: + return fred_intx(regs); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + case EVENT_TYPE_OTHER: + return fred_other(regs); + default: break; + } + + return fred_bad_type(regs, error_code); +} + +__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) +{ + unsigned long error_code = regs->orig_ax; + + /* Invalidate orig_ax so that syscall_get_nr() works correctly */ + regs->orig_ax = -1; + + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + if (likely(regs->fred_ss.vector == X86_TRAP_NMI)) + return fred_exc_nmi(regs); + break; + case EVENT_TYPE_HWEXC: + return fred_hwexc(regs, error_code); + case EVENT_TYPE_PRIV_SWEXC: + if (likely(regs->fred_ss.vector == X86_TRAP_DB)) + return fred_exc_debug(regs); + break; + case EVENT_TYPE_SWEXC: + return fred_swexc(regs, error_code); + default: break; + } + + return fred_bad_type(regs, error_code); +} diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 768076e68668..3674006e3974 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index f514fdb5a39f..16a64ffecbf8 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -60,6 +60,12 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) return fred_info(regs)->edata; } +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); + #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } #endif /* CONFIG_X86_FRED */ diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 9805629479d9..b3b898881d95 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -70,7 +70,7 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); extern bool __ia32_enabled; -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } @@ -82,7 +82,7 @@ static inline void ia32_disable(void) #else /* !CONFIG_IA32_EMULATION */ -static inline bool ia32_enabled(void) +static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } -- Gitee From 17f4deb3d9fb208937aaab1d5c28943cf97b2335 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:16 -0800 Subject: [PATCH 25/34] x86/traps: Add sysvec_install() to install a system interrupt handler ANBZ: #33261 commit 8f4a29b0e8a40d865040800684d7ff4141c1394f upstream. Add sysvec_install() to install a system interrupt handler into the IDT or the FRED system interrupt handler table. Intel-SIG: commit 8f4a29b0e8a4 Add sysvec_install() to install a system interrupt handler. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-28-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_fred.c | 14 ++++++++++++++ arch/x86/include/asm/desc.h | 2 -- arch/x86/include/asm/idtentry.h | 15 +++++++++++++++ arch/x86/kernel/cpu/acrn.c | 4 ++-- arch/x86/kernel/cpu/mshyperv.c | 15 +++++++-------- arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/kvm.c | 2 +- drivers/xen/events/events_base.c | 2 +- 8 files changed, 42 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 125b62311b31..3be0269bc0d4 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -119,6 +119,20 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), }; +static bool fred_setup_done __initdata; + +void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) +{ + if (WARN_ON_ONCE(sysvec < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON_ONCE(fred_setup_done)) + return; + + if (!WARN_ON_ONCE(sysvec_table[sysvec - FIRST_SYSTEM_VECTOR])) + sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; +} + static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a..ec95fe44fa3a 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -402,8 +402,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 07054bfdc066..6df8cf13fd56 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -459,6 +459,21 @@ __visible noinstr void func(struct pt_regs *regs, \ #define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } +#endif + +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad63f..2c5b51aad91a 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e709070eed70..7d489b57d560 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -583,19 +583,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 07d7fe8a16a4..280c0b1834b2 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -333,7 +333,7 @@ void idt_invalidate(void) load_idt(&idt); } -void __init alloc_intr_gate(unsigned int n, const void *addr) +void __init idt_install_sysvec(unsigned int n, const void *function) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; @@ -342,5 +342,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr) return; if (!WARN_ON(test_and_set_bit(n, system_vectors))) - set_intr_gate(n, addr); + set_intr_gate(n, function); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1e261e16d875..68e9b90baef1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -831,7 +831,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } #ifdef CONFIG_SMP diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 9e3b5d21d098..75733eefe77c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2235,7 +2235,7 @@ static __init void xen_alloc_callback_vector(void) return; pr_info("Xen HVM callback vector for event delivery is enabled\n"); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_xen_hvm_callback); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); } #else void xen_setup_callback_vector(void) {} -- Gitee From 0841e16b47659cc26ea41cf120c96fc2754d648b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 5 Dec 2023 02:50:17 -0800 Subject: [PATCH 26/34] x86/fred: Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled ANBZ: #33261 commit 51ef2a4da7ec347e3315af69a426ac36fab98a6c upstream. Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled, otherwise the existing IDT code is chosen. Intel-SIG: commit 51ef2a4da7ec Let ret_from_fork_asm() jmp to asm_fred_exit_user when FRED is enabled. Backport Intel FRED architecture for x86-64. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-29-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_64.S | 6 ++++++ arch/x86/entry/entry_64_fred.S | 1 + 2 files changed, 7 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 99f2eef48fc8..8b42fb851eeb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,13 @@ SYM_CODE_START(ret_from_fork_asm) * and unwind should work normally. */ UNWIND_HINT_REGS + +#ifdef CONFIG_X86_FRED + ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \ + "jmp asm_fred_exit_user", X86_FEATURE_FRED +#else jmp swapgs_restore_regs_and_return_to_usermode +#endif SYM_CODE_END(ret_from_fork_asm) .popsection diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index c1ddaf6b068f..2271a1c690dc 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -32,6 +32,7 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) FRED_ENTER call fred_entry_from_user +SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) FRED_EXIT ERETU SYM_CODE_END(asm_fred_entrypoint_user) -- Gitee From 750f75902b69a88251f4bce0eb40d2475755877b Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:18 -0800 Subject: [PATCH 27/34] x86/fred: Fixup fault on ERETU by jumping to fred_entrypoint_user ANBZ: #33261 commit 5105e7687ad3dffde77f6e4393b5530e83d672dc upstream. If the stack frame contains an invalid user context (e.g. due to invalid SS, a non-canonical RIP, etc.) the ERETU instruction will trap (#SS or #GP). From a Linux point of view, this really should be considered a user space failure, so use the standard fault fixup mechanism to intercept the fault, fix up the exception frame, and redirect execution to fred_entrypoint_user. The end result is that it appears just as if the hardware had taken the exception immediately after completing the transition to user space. Intel-SIG: commit 5105e7687ad3 Fixup fault on ERETU by jumping to fred_entrypoint_user. Backport Intel FRED architecture for x86-64. Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-30-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_64_fred.S | 5 +- arch/x86/include/asm/extable_fixup_types.h | 4 +- arch/x86/mm/extable.c | 78 ++++++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 2271a1c690dc..7fe2722ad90c 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -3,6 +3,7 @@ * The actual FRED entry points. */ +#include #include #include "calling.h" @@ -34,7 +35,9 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_user) call fred_entry_from_user SYM_INNER_LABEL(asm_fred_exit_user, SYM_L_GLOBAL) FRED_EXIT - ERETU +1: ERETU + + _ASM_EXTABLE_TYPE(1b, asm_fred_entrypoint_user, EX_TYPE_ERETU) SYM_CODE_END(asm_fred_entrypoint_user) /* diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index afad9c0b07e0..3a6bf5c7fbb2 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 94195d750e65..52e22d3e1a82 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -215,6 +216,79 @@ static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } +#ifdef CONFIG_X86_FRED +static bool ex_handler_eretu(const struct exception_table_entry *fixup, + struct pt_regs *regs, unsigned long error_code) +{ + struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); + unsigned short ss = uregs->ss; + unsigned short cs = uregs->cs; + + /* + * Move the NMI bit from the invalid stack frame, which caused ERETU + * to fault, to the fault handler's stack frame, thus to unblock NMI + * with the fault handler's ERETS instruction ASAP if NMI is blocked. + */ + regs->fred_ss.nmi = uregs->fred_ss.nmi; + + /* + * Sync event information to uregs, i.e., the ERETU return frame, but + * is it safe to write to the ERETU return frame which is just above + * current event stack frame? + * + * The RSP used by FRED to push a stack frame is not the value in %rsp, + * it is calculated from %rsp with the following 2 steps: + * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes + * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line + * when an event delivery doesn't trigger a stack level change. + * + * Here is an example with N*64 (N=1) bytes reserved: + * + * 64-byte cache line ==> ______________ + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETU return frame + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * |______________| + * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) + * |___Reserved___| + * |__Event_data__| + * |_____SS_______| + * |_____RSP______| + * |_____FLAGS____| + * |_____CS_______| + * |_____IP_______| + * 64-byte cache line ==> |__Error_code__| <== ERETS return frame + * + * Thus a new FRED stack frame will always be pushed below a previous + * FRED stack frame ((N*64) bytes may be reserved between), and it is + * safe to write to a previous FRED stack frame as they never overlap. + */ + fred_info(uregs)->edata = fred_event_data(regs); + uregs->ssx = regs->ssx; + uregs->fred_ss.ss = ss; + /* The NMI bit was moved away above */ + uregs->fred_ss.nmi = 0; + uregs->csx = regs->csx; + uregs->fred_cs.sl = 0; + uregs->fred_cs.wfe = 0; + uregs->cs = cs; + uregs->orig_ax = error_code; + + return ex_handler_default(fixup, regs); +} +#endif + int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); @@ -290,6 +364,10 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); +#ifdef CONFIG_X86_FRED + case EX_TYPE_ERETU: + return ex_handler_eretu(e, regs, error_code); +#endif } BUG(); } -- Gitee From dd80297ef1010e3eaa0cd15c9a1f2eb8faa06656 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 5 Dec 2023 02:50:19 -0800 Subject: [PATCH 28/34] x86/entry/calling: Allow PUSH_AND_CLEAR_REGS being used beyond actual entry code ANBZ: #33261 commit 2333f3c473c1562633cd17ac2eb743c29c3b2d9d upstream. PUSH_AND_CLEAR_REGS could be used besides actual entry code; in that case %rbp shouldn't be cleared (otherwise the frame pointer is destroyed) and UNWIND_HINT shouldn't be added. Intel-SIG: commit 2333f3c473c1 Allow PUSH_AND_CLEAR_REGS being used beyond actual entry code. Backport Intel FRED architecture for x86-64. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-31-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/calling.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 01e9593e2bd9..b6d62ecdd750 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -65,7 +65,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 unwind_hint=1 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -89,14 +89,17 @@ For 32-bit we have the following conventions - kernel is built with pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + + .if \unwind_hint UNWIND_HINT_REGS + .endif .if \save_ret pushq %rsi /* return address on top of stack */ .endif .endm -.macro CLEAR_REGS +.macro CLEAR_REGS clear_bp=1 /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -111,7 +114,9 @@ For 32-bit we have the following conventions - kernel is built with xorl %r10d, %r10d /* nospec r10 */ xorl %r11d, %r11d /* nospec r11 */ xorl %ebx, %ebx /* nospec rbx */ + .if \clear_bp xorl %ebp, %ebp /* nospec rbp */ + .endif xorl %r12d, %r12d /* nospec r12 */ xorl %r13d, %r13d /* nospec r13 */ xorl %r14d, %r14d /* nospec r14 */ @@ -119,9 +124,9 @@ For 32-bit we have the following conventions - kernel is built with .endm -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 - PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret - CLEAR_REGS +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0 clear_bp=1 unwind_hint=1 + PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret unwind_hint=\unwind_hint + CLEAR_REGS clear_bp=\clear_bp .endm .macro POP_REGS pop_rdi=1 -- Gitee From 710f996fd5603ee19af4d23223bea2d5a2b64b6e Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:20 -0800 Subject: [PATCH 29/34] x86/entry: Add fred_entry_from_kvm() for VMX to handle IRQ/NMI ANBZ: #33261 commit 2e670358ec1829238c99fbff178e285d3eb43ef1 upstream. In IRQ/NMI induced VM exits, KVM VMX needs to execute the respective handlers, which requires the software to create a FRED stack frame, and use it to invoke the handlers. Add fred_irq_entry_from_kvm() for this job. Export fred_entry_from_kvm() because VMX can be compiled as a module. Intel-SIG: commit 2e670358ec18 Add fred_entry_from_kvm() for VMX to handle IRQ/NMI. Backport Intel FRED architecture for x86-64. Suggested-by: Sean Christopherson Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-32-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_64_fred.S | 77 ++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_fred.c | 14 +++++++ arch/x86/include/asm/fred.h | 18 ++++++++ 3 files changed, 109 insertions(+) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 7fe2722ad90c..a02bc6f3d2e6 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -3,8 +3,11 @@ * The actual FRED entry points. */ +#include + #include #include +#include #include "calling.h" @@ -52,3 +55,77 @@ SYM_CODE_START_NOALIGN(asm_fred_entrypoint_kernel) FRED_EXIT ERETS SYM_CODE_END(asm_fred_entrypoint_kernel) + +#if IS_ENABLED(CONFIG_KVM_INTEL) +SYM_FUNC_START(asm_fred_entry_from_kvm) + push %rbp + mov %rsp, %rbp + + UNWIND_HINT_SAVE + + /* + * Both IRQ and NMI from VMX can be handled on current task stack + * because there is no need to protect from reentrancy and the call + * stack leading to this helper is effectively constant and shallow + * (relatively speaking). Do the same when FRED is active, i.e., no + * need to check current stack level for a stack switch. + * + * Emulate the FRED-defined redzone and stack alignment. + */ + sub $(FRED_CONFIG_REDZONE_AMOUNT << 6), %rsp + and $FRED_STACK_FRAME_RSP_MASK, %rsp + + /* + * Start to push a FRED stack frame, which is always 64 bytes: + * + * +--------+-----------------+ + * | Bytes | Usage | + * +--------+-----------------+ + * | 63:56 | Reserved | + * | 55:48 | Event Data | + * | 47:40 | SS + Event Info | + * | 39:32 | RSP | + * | 31:24 | RFLAGS | + * | 23:16 | CS + Aux Info | + * | 15:8 | RIP | + * | 7:0 | Error Code | + * +--------+-----------------+ + */ + push $0 /* Reserved, must be 0 */ + push $0 /* Event data, 0 for IRQ/NMI */ + push %rdi /* fred_ss handed in by the caller */ + push %rbp + pushf + mov $__KERNEL_CS, %rax + push %rax + + /* + * Unlike the IDT event delivery, FRED _always_ pushes an error code + * after pushing the return RIP, thus the CALL instruction CANNOT be + * used here to push the return RIP, otherwise there is no chance to + * push an error code before invoking the IRQ/NMI handler. + * + * Use LEA to get the return RIP and push it, then push an error code. + */ + lea 1f(%rip), %rax + push %rax /* Return RIP */ + push $0 /* Error code, 0 for IRQ/NMI */ + + PUSH_AND_CLEAR_REGS clear_bp=0 unwind_hint=0 + movq %rsp, %rdi /* %rdi -> pt_regs */ + call __fred_entry_from_kvm /* Call the C entry point */ + POP_REGS + ERETS +1: + /* + * Objtool doesn't understand what ERETS does, this hint tells it that + * yes, we'll reach here and with what stack state. A save/restore pair + * isn't strictly needed, but it's the simplest form. + */ + UNWIND_HINT_RESTORE + pop %rbp + RET + +SYM_FUNC_END(asm_fred_entry_from_kvm) +EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 3be0269bc0d4..6ecc08b6d72a 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -257,3 +257,17 @@ __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs) return fred_bad_type(regs, error_code); } + +#if IS_ENABLED(CONFIG_KVM_INTEL) +__visible noinstr void __fred_entry_from_kvm(struct pt_regs *regs) +{ + switch (regs->fred_ss.type) { + case EVENT_TYPE_EXTINT: + return fred_extint(regs); + case EVENT_TYPE_NMI: + return fred_exc_nmi(regs); + default: + WARN_ON_ONCE(1); + } +} +#endif diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 16a64ffecbf8..2fa9f34e5c95 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -9,6 +9,7 @@ #include #include +#include /* * FRED event return instruction opcodes for ERET{S,U}; supported in @@ -62,12 +63,29 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) void asm_fred_entrypoint_user(void); void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); __visible void fred_entry_from_user(struct pt_regs *regs); __visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } #endif /* CONFIG_X86_FRED */ #endif /* !__ASSEMBLY__ */ -- Gitee From 6aab0791fabd9ace80c88ede7211ef4fd065f238 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:21 -0800 Subject: [PATCH 30/34] KVM: VMX: Call fred_entry_from_kvm() for IRQ/NMI handling ANBZ: #33261 commit 70d0fe5d0923abfb28c26e71171944f4801f9f38 upstream. When FRED is enabled, call fred_entry_from_kvm() to handle IRQ/NMI in IRQ/NMI induced VM exits. Intel-SIG: commit 70d0fe5d0923 VMX: Call fred_entry_from_kvm() for IRQ/NMI handling. Backport Intel FRED architecture for x86-64. Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Acked-by: Paolo Bonzini Link: https://lore.kernel.org/r/20231205105030.8698-33-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kvm/vmx/vmx.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 502ab830db3c..efa614544fcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -7037,14 +7038,16 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - vmx_do_interrupt_irqoff(gate_offset(desc)); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); + else + vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7341,7 +7344,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && is_nmi(vmx_get_intr_info(vcpu))) { kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - vmx_do_nmi_irqoff(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); kvm_after_interrupt(vcpu); } -- Gitee From 71c013edc152c69dcb720b9afccb9ab08def4f35 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 16 Apr 2026 12:24:00 +0800 Subject: [PATCH 31/34] x86/syscall: Split IDT syscall setup code into idt_syscall_init() ANBZ: #33261 commit 530dce278afffd8084af9a23493532912cdbe98a upstream. Because FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER and ERETU is the only legit instruction to return to ring 3, there is NO need to setup SYSCALL and SYSENTER MSRs for FRED, except the IA32_STAR MSR. Split IDT syscall setup code into idt_syscall_init() to make it easy to skip syscall setup code when FRED is enabled. Intel-SIG: commit 530dce278aff Split IDT syscall setup code into idt_syscall_init(). Backport Intel FRED architecture for x86-64. Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-34-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/cpu/common.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 59664392d6be..443266475a49 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2092,10 +2092,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION @@ -2129,6 +2127,15 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR -- Gitee From 4462c9e1ddbbe4d2e294d2fb6ef214f240cac759 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 20:24:29 +0800 Subject: [PATCH 32/34] x86/fred: Add FRED initialization functions ANBZ: #33261 commit cdd99dd873cb11c40adf1ef70693f72c622ac8f3 upstream. Add cpu_init_fred_exceptions() to: - Set FRED entrypoints for events happening in ring 0 and 3. - Specify the stack level for IRQs occurred ring 0. - Specify dedicated event stacks for #DB/NMI/#MCE/#DF. - Enable FRED and invalidtes IDT. - Force 32-bit system calls to use "int $0x80" only. Add fred_complete_exception_setup() to: - Initialize system_vectors as done for IDT systems. - Set unused sysvec_table entries to fred_handle_spurious_interrupt(). Intel-SIG: commit cdd99dd873cb Add FRED initialization functions. Backport Intel FRED architecture for x86-64. Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-35-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/entry/entry_fred.c | 21 +++++++++++++ arch/x86/include/asm/fred.h | 5 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/fred.c | 59 +++++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) create mode 100644 arch/x86/kernel/fred.c diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 6ecc08b6d72a..ac120cbdaaf2 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -133,6 +133,27 @@ void __init fred_install_sysvec(unsigned int sysvec, idtentry_t handler) sysvec_table[sysvec - FIRST_SYSTEM_VECTOR] = handler; } +static noinstr void fred_handle_spurious_interrupt(struct pt_regs *regs) +{ + spurious_interrupt(regs, regs->fred_ss.vector); +} + +void __init fred_complete_exception_setup(void) +{ + unsigned int vector; + + for (vector = 0; vector < FIRST_EXTERNAL_VECTOR; vector++) + set_bit(vector, system_vectors); + + for (vector = 0; vector < NR_SYSTEM_VECTORS; vector++) { + if (sysvec_table[vector]) + set_bit(vector + FIRST_SYSTEM_VECTOR, system_vectors); + else + sysvec_table[vector] = fred_handle_spurious_interrupt; + } + fred_setup_done = true; +} + static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 2fa9f34e5c95..e86c7ba32435 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -83,8 +83,13 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int asm_fred_entry_from_kvm(ss); } +void cpu_init_fred_exceptions(void); +void fred_complete_exception_setup(void); + #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void fred_complete_exception_setup(void) { } static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } #endif /* CONFIG_X86_FRED */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9ff1490bdd4a..89787d3c6fa3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -48,6 +48,7 @@ obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_X86_FRED) += fred.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c new file mode 100644 index 000000000000..4bcd8791ad96 --- /dev/null +++ b/arch/x86/kernel/fred.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +#include +#include +#include +#include + +/* #DB in the kernel would imply the use of a kernel debugger. */ +#define FRED_DB_STACK_LEVEL 1UL +#define FRED_NMI_STACK_LEVEL 2UL +#define FRED_MC_STACK_LEVEL 2UL +/* + * #DF is the highest level because a #DF means "something went wrong + * *while delivering an exception*." The number of cases for which that + * can happen with FRED is drastically reduced and basically amounts to + * "the stack you pointed me to is broken." Thus, always change stacks + * on #DF, which means it should be at the highest level. + */ +#define FRED_DF_STACK_LEVEL 3UL + +#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) + +void cpu_init_fred_exceptions(void) +{ + /* When FRED is enabled by default, remove this log message */ + pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + + wrmsrl(MSR_IA32_FRED_CONFIG, + /* Reserve for CALL emulation */ + FRED_CONFIG_REDZONE | + FRED_CONFIG_INT_STKLVL(0) | + FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. + */ + wrmsrl(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} -- Gitee From 2f73376e3953ca2115afdee3735f01ef692615bd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Thu, 16 Apr 2026 12:26:18 +0800 Subject: [PATCH 33/34] x86/fred: Invoke FRED initialization code to enable FRED ANBZ: #33261 commit 208d8c79fd0f155bce1b23d8d78926653f7603b7 upstream. Let cpu_init_exception_handling() call cpu_init_fred_exceptions() to initialize FRED. However if FRED is unavailable or disabled, it falls back to set up TSS IST and initialize IDT. Intel-SIG: commit 208d8c79fd0f Invoke FRED initialization code to enable FRED. Backport Intel FRED architecture for x86-64. Co-developed-by: Xin Li Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-36-xin3.li@intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li --- arch/x86/kernel/cpu/common.c | 22 +++++++++++++++++----- arch/x86/kernel/irqinit.c | 7 ++++++- arch/x86/kernel/traps.c | 5 ++++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 443266475a49..503de9baae0f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -2133,7 +2134,15 @@ void syscall_init(void) /* The default user and kernel segments */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - idt_syscall_init(); + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); } #else /* CONFIG_X86_64 */ @@ -2237,8 +2246,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2247,8 +2257,10 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + load_current_idt(); } /* diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c683666876f1..f79c5edc0b89 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -96,7 +97,11 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - idt_setup_apic_and_irq_gates(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_complete_exception_setup(); + else + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8b636bfe6d04..2dfe6246048d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1548,7 +1548,10 @@ void __init trap_init(void) /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } -- Gitee From 153b9d36dcda7016185c554af12aae0e04a67225 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Fri, 17 Apr 2026 08:21:37 +0800 Subject: [PATCH 34/34] anolis: configs: enable CONFIG_X86_FRED by default ANBZ: #33261 This enables CONFIG_X86_FRED by default and also enables Intel flexible return and event delivery (FRED) architecture for x86-64. Signed-off-by: Aubrey Li --- anolis/configs/L1-RECOMMEND/x86/CONFIG_X86_FRED | 1 + 1 file changed, 1 insertion(+) create mode 100644 anolis/configs/L1-RECOMMEND/x86/CONFIG_X86_FRED diff --git a/anolis/configs/L1-RECOMMEND/x86/CONFIG_X86_FRED b/anolis/configs/L1-RECOMMEND/x86/CONFIG_X86_FRED new file mode 100644 index 000000000000..420c30258aa4 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/x86/CONFIG_X86_FRED @@ -0,0 +1 @@ +CONFIG_X86_FRED=y -- Gitee