From daa520bc50b1725d0759aa1490d3b1bf2a7c9564 Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Wed, 11 Mar 2026 17:27:22 -0500 Subject: [PATCH 01/18] hypervisor: kvm: Add GUEST_MEMFD and KVM_SET_USER_MEMORY_REGION2 support Add support for guest_memfd (available in Linux kernel v6.8+), which enables private memory for confidential VMs. Key changes: - Introduce UserMemoryRegion abstraction with guest_memfd fields - Add From impls between kvm_userspace_memory_region2 and UserMemoryRegion - Convert all KVM memory region operations from kvm_userspace_memory_region to kvm_userspace_memory_region2, with automatic fallback to v1 when guest_memfd is not supported - Add set_user_memory_region() wrapper that dispatches to v1/v2 based on kvm_guest_memfd_supported capability - Create guest_memfd via KVM_CREATE_GUEST_MEMFD ioctl when supported - Extend KvmDirtyLogSlot to preserve region2 fields across dirty log start/stop cycles This is prerequisite infrastructure for KVM-based confidential computing (SEV-SNP, TDX) that requires private guest memory backed by guest_memfd. 
Co-authored-by: Alex Orozco Signed-off-by: Keith Adler Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 222 +++++++++++++++++++++++++++++++------ hypervisor/src/lib.rs | 26 +++++ vmm/src/seccomp_filters.rs | 19 ++++ 3 files changed, 236 insertions(+), 31 deletions(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 8b21002d1d..76b6b23da1 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -14,7 +14,9 @@ use std::any::Any; use std::collections::HashMap; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::mem::offset_of; -#[cfg(feature = "tdx")] +#[cfg(target_arch = "x86_64")] +use std::os::fd::{FromRawFd, OwnedFd}; +#[cfg(target_arch = "x86_64")] use std::os::unix::io::AsRawFd; #[cfg(feature = "tdx")] use std::os::unix::io::RawFd; @@ -26,9 +28,12 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, RwLock}; use anyhow::anyhow; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; #[cfg(target_arch = "x86_64")] use log::warn; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "aarch64")] @@ -50,8 +55,6 @@ pub use crate::riscv64::{ }; #[cfg(target_arch = "riscv64")] use crate::riscv64_reg_id; -use crate::vm::{self, InterruptSourceConfig, VmOps}; -use crate::{HypervisorType, HypervisorVmConfig, cpu, hypervisor}; // x86_64 dependencies #[cfg(target_arch = "x86_64")] pub mod x86_64; @@ -73,7 +76,12 @@ use crate::arch::x86::{ CpuIdEntry, FpuState, LapicState, MTRR_MSR_INDICES, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, }; -use crate::{CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters}; +use crate::{ + CpuState, HypervisorType, HypervisorVmConfig, InterruptSourceConfig, IoEventAddress, + IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_GUEST_MEMFD, + USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, + 
UserMemoryRegion, VmOps, cpu, hypervisor, vm, +}; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; @@ -83,6 +91,8 @@ pub mod riscv64; #[cfg(target_arch = "aarch64")] use std::mem; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_X86_DEFAULT_VM; /// /// Export generically-named wrappers of kvm-bindings for Unix-based platforms /// @@ -92,10 +102,11 @@ pub use kvm_bindings::kvm_vcpu_events as VcpuEvents; use kvm_bindings::nested::KvmNestedStateBuffer; pub use kvm_bindings::{ self, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, - KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, - kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, + KVM_IRQ_ROUTING_MSI, KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, + KVM_MSI_VALID_DEVID, kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region, + kvm_userspace_memory_region2, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ @@ -107,7 +118,7 @@ use kvm_bindings::{ #[cfg(target_arch = "riscv64")] use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core}; #[cfg(feature = "tdx")] -use kvm_bindings::{KVM_X86_DEFAULT_VM, KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; +use kvm_bindings::{KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1}; #[cfg(target_arch = "x86_64")] use kvm_bindings::{Xsave as xsave2, kvm_xsave2}; pub use kvm_ioctls::{self, Cap, Kvm, VcpuExit}; @@ -128,6 +139,10 @@ use crate::kvm::x86_64::XsaveStateError; #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); +// Maximum virtual address space. 
+#[cfg(target_arch = "x86_64")] +const VIRTUAL_ADDRESS_SIZE: u64 = 1 << 48; + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -238,6 +253,61 @@ pub struct KvmTdxExitVmcall { pub out_rdx: u64, } +impl From<kvm_userspace_memory_region2> for UserMemoryRegion { + fn from(region: kvm_userspace_memory_region2) -> Self { + let mut flags = USER_MEMORY_REGION_READ; + if region.flags & KVM_MEM_READONLY == 0 { + flags |= USER_MEMORY_REGION_WRITE; + } + if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 { + flags |= USER_MEMORY_REGION_LOG_DIRTY; + } + if region.flags & KVM_MEM_GUEST_MEMFD != 0 { + flags |= USER_MEMORY_REGION_GUEST_MEMFD; + } + + UserMemoryRegion { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: region.guest_memfd, + guest_memfd_offset: region.guest_memfd_offset, + } + } +} + +impl From<UserMemoryRegion> for kvm_userspace_memory_region2 { + fn from(region: UserMemoryRegion) -> Self { + assert!( + region.flags & USER_MEMORY_REGION_READ != 0, + "KVM mapped memory is always readable" + ); + + let mut flags = 0; + if region.flags & USER_MEMORY_REGION_WRITE == 0 { + flags |= KVM_MEM_READONLY; + } + if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 { + flags |= KVM_MEM_LOG_DIRTY_PAGES; + } + if region.flags & USER_MEMORY_REGION_GUEST_MEMFD != 0 { + flags |= KVM_MEM_GUEST_MEMFD; + } + + kvm_userspace_memory_region2 { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + memory_size: region.memory_size, + userspace_addr: region.userspace_addr, + flags, + guest_memfd: region.guest_memfd, + guest_memfd_offset: region.guest_memfd_offset, + ..Default::default() + } + } +} impl From<kvm_mp_state> for MpState { fn from(s: kvm_mp_state) -> Self { MpState::Kvm(s) @@ -424,6 +494,9 @@ struct KvmDirtyLogSlot { guest_phys_addr: u64, memory_size: u64, userspace_addr: u64, + // Following fields are used by kvm_userspace_memory_region2. 
+ guest_memfd_offset: u64, + guest_memfd: u32, } /// Wrapper over KVM VM ioctls. @@ -432,6 +505,9 @@ pub struct KvmVm { #[cfg(target_arch = "x86_64")] msrs: Vec<MsrEntry>, dirty_log_slots: RwLock<HashMap<u32, KvmDirtyLogSlot>>, + #[cfg(target_arch = "x86_64")] + memfd: Option<OwnedFd>, + kvm_guest_memfd_supported: bool, } impl KvmVm { @@ -494,6 +570,45 @@ impl KvmVm { fn translate_msi_ext_dest_id(address_lo: u32, address_hi: u32) -> (u32, u32) { (address_lo, address_hi) } + /// Set a user memory region, using `set_user_memory_region2` when guest_memfd is supported. + /// guest_memfd is available on host Linux kernel v6.8+. + /// + /// # Safety + /// + /// `region.userspace_addr` must point to `region.memory_size` bytes of + /// memory that will stay mapped until the slot is removed via + /// `remove_user_memory_region`. The memory region must + /// be uniquely owned by the caller, as mapping it into the guest + /// effectively creates a long-lived mutable reference. + unsafe fn set_user_memory_region( + &self, + region: kvm_userspace_memory_region2, + ) -> Result<(), errno::Error> { + if self.kvm_guest_memfd_supported { + // SAFETY: Safe as the caller guarantees that region is safe to map + // into the guest and is non-overlapping. + unsafe { self.fd.set_user_memory_region2(region) } + } else { + // SAFETY: Safe because guest regions are guaranteed not to overlap. + unsafe { + self.fd.set_user_memory_region(kvm_userspace_memory_region { + slot: region.slot, + guest_phys_addr: region.guest_phys_addr, + userspace_addr: region.userspace_addr, + flags: region.flags, + memory_size: region.memory_size, + }) + } + } + } + /// Get flag for kvm_userspace_memory_region based on memfd support. 
+ fn get_kvm_userspace_memory_region_flag(&self, flag: u32) -> u32 { + flag | if self.kvm_guest_memfd_supported { + KVM_MEM_GUEST_MEMFD + } else { + 0 + } + } } /// Implementation of Vm trait for KVM @@ -759,14 +874,27 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + // guest_memfd is only used on x86_64 (SEV-SNP, TDX) for now + #[cfg(target_arch = "x86_64")] + let guest_memfd = if let Some(memfd) = &self.memfd { + memfd.as_raw_fd() as u32 + } else { + 0 + }; + #[cfg(not(target_arch = "x86_64"))] + let guest_memfd = 0; + + let mut region = kvm_userspace_memory_region2 { slot, + flags: self.get_kvm_userspace_memory_region_flag(flags), guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, - flags, + #[cfg(not(target_arch = "riscv64"))] + guest_memfd, + guest_memfd_offset: guest_phys_addr, + ..Default::default() }; - if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { if (region.flags & KVM_MEM_READONLY) != 0 { return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( @@ -782,20 +910,22 @@ impl vm::Vm for KvmVm { guest_phys_addr: region.guest_phys_addr, memory_size: region.memory_size, userspace_addr: region.userspace_addr, + guest_memfd_offset: region.guest_memfd_offset, + guest_memfd: region.guest_memfd, }, ); // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. - region.flags = 0; + region.flags = self.get_kvm_userspace_memory_region_flag(0); } // SAFETY: Safe because caller promised this is safe. unsafe { - self.fd - .set_user_memory_region(region) - .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) + self.set_user_memory_region(region) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; } + Ok(()) } /// Removes a guest physical memory region. 
@@ -823,12 +953,13 @@ impl vm::Vm for KvmVm { const _: () = assert!(core::mem::size_of::() <= core::mem::size_of::()); - let mut region = kvm_userspace_memory_region { + let mut region = kvm_userspace_memory_region2 { slot, guest_phys_addr, memory_size: memory_size as u64, userspace_addr: userspace_addr as usize as u64, flags, + ..Default::default() }; // Remove the corresponding entry from "self.dirty_log_slots" if needed @@ -838,8 +969,7 @@ impl vm::Vm for KvmVm { region.memory_size = 0; // SAFETY: Safe because caller promised this is safe. unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) } } @@ -932,17 +1062,19 @@ impl vm::Vm for KvmVm { fn start_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: KVM_MEM_LOG_DIRTY_PAGES, + flags: self.get_kvm_userspace_memory_region_flag(KVM_MEM_LOG_DIRTY_PAGES), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. 
unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -956,17 +1088,19 @@ impl vm::Vm for KvmVm { fn stop_dirty_log(&self) -> vm::Result<()> { let dirty_log_slots = self.dirty_log_slots.read().unwrap(); for (_, s) in dirty_log_slots.iter() { - let region = kvm_userspace_memory_region { + let region = kvm_userspace_memory_region2 { slot: s.slot, guest_phys_addr: s.guest_phys_addr, memory_size: s.memory_size, userspace_addr: s.userspace_addr, - flags: 0, + flags: self.get_kvm_userspace_memory_region_flag(0), + guest_memfd: s.guest_memfd, + guest_memfd_offset: s.guest_memfd_offset, + ..Default::default() }; // SAFETY: Safe because guest regions are guaranteed not to overlap. unsafe { - self.fd - .set_user_memory_region(region) + self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; } } @@ -1228,11 +1362,17 @@ impl hypervisor::Hypervisor for KvmHypervisor { vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); } - #[cfg(feature = "tdx")] - if _config.tdx_enabled { - vm_type = KVM_X86_SW_PROTECTED_VM.into(); - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); + #[cfg(target_arch = "x86_64")] + cfg_if::cfg_if! 
{ + if #[cfg(feature = "tdx")] { + if _config.tdx_enabled { + vm_type = KVM_X86_SW_PROTECTED_VM.into(); + } else { + vm_type = KVM_X86_DEFAULT_VM.into(); + } + } else { + vm_type = KVM_X86_DEFAULT_VM.into(); + } } loop { @@ -1255,7 +1395,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { { let msr_list = self.get_msr_list()?; let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; - let mut msrs: Vec<MsrEntry> = vec![ + let mut msrs = vec![ MsrEntry { ..Default::default() }; @@ -1266,10 +1406,29 @@ impl hypervisor::Hypervisor for KvmHypervisor { msrs[pos].index = *index; } + let kvm_guest_memfd_supported = fd.check_extension(Cap::GuestMemfd); + let mut memfd = None; + if kvm_guest_memfd_supported { + // TODO: Refactor to create memfd when the memory region is created so + // that the size is appropriate. + // SAFETY: create_guest_memfd() succeeded, so the fd is open and owned only by us. + memfd = unsafe { + Some(OwnedFd::from_raw_fd( + fd.create_guest_memfd(kvm_create_guest_memfd { + size: VIRTUAL_ADDRESS_SIZE, + ..Default::default() + }) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?, + )) + }; + } + Ok(Arc::new(KvmVm { fd, msrs, dirty_log_slots: RwLock::new(HashMap::new()), + memfd, + kvm_guest_memfd_supported, })) } @@ -1278,6 +1437,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { Ok(Arc::new(KvmVm { fd, dirty_log_slots: RwLock::new(HashMap::new()), + kvm_guest_memfd_supported: false, })) } } diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 3d919e45ce..90f931f771 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -118,6 +118,32 @@ pub fn vec_with_array_field(count: usize) -> Vec { vec_with_size_in_bytes(vec_size_bytes) } +/// +/// User memory region structure +/// +#[derive(Debug, Default, Eq, PartialEq)] +pub struct UserMemoryRegion { + pub slot: u32, + pub guest_phys_addr: u64, + pub memory_size: u64, + pub userspace_addr: u64, + pub flags: u32, + #[cfg(feature = "kvm")] + pub guest_memfd: u32, + #[cfg(feature = "kvm")] + 
pub guest_memfd_offset: u64, +} + +/// +/// Flags for user memory region +/// +pub const USER_MEMORY_REGION_READ: u32 = 1; +pub const USER_MEMORY_REGION_WRITE: u32 = 1 << 1; +pub const USER_MEMORY_REGION_EXECUTE: u32 = 1 << 2; +pub const USER_MEMORY_REGION_LOG_DIRTY: u32 = 1 << 3; +pub const USER_MEMORY_REGION_ADJUSTABLE: u32 = 1 << 4; +pub const USER_MEMORY_REGION_GUEST_MEMFD: u32 = 1 << 5; + #[derive(Debug)] pub enum MpState { #[cfg(feature = "kvm")] diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 97f020e650..5c90138147 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -90,6 +90,9 @@ mod kvm { pub const KVM_HAS_DEVICE_ATTR: u64 = 0x4018_aee3; pub const KVM_SET_ONE_REG: u64 = 0x4010_aeac; pub const KVM_SET_USER_MEMORY_REGION: u64 = 0x4020_ae46; + pub const KVM_SET_USER_MEMORY_REGION2: u64 = 0x40a0_ae49; + pub const KVM_SET_MEMORY_ATTRIBUTES: u64 = 0x4020_aed2; + pub const KVM_CREATE_GUEST_MEMFD: u64 = 0xc040_aed4; pub const KVM_IRQFD: u64 = 0x4020_ae76; pub const KVM_IOEVENTFD: u64 = 0x4040_ae79; pub const KVM_SET_VCPU_EVENTS: u64 = 0x4040_aea0; @@ -240,6 +243,14 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_ONE_REG)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_REGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_VCPU_EVENTS,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], @@ -714,6 +725,14 @@ fn create_vcpu_ioctl_seccomp_rule_kvm() -> Result, BackendError and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_DEVICE_ATTR,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_GSI_ROUTING,)?], 
and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + KVM_SET_USER_MEMORY_REGION2, + )?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_CREATE_GUEST_MEMFD,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_MEMORY_ATTRIBUTES,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], From a9e65df587af384368623f21c97bab148147d2db Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 07:59:20 -0700 Subject: [PATCH 02/18] vmm: allow IGVM payload alongside a kernel Previously, the payload validation rejected an IGVM file combined with a kernel or firmware. Relax this constraint to allow an IGVM carrying a firmware (e.g Oak stage0) to be paired with a separate kernel image. This enables fw_cfg-style boot where stage0 loads a kernel provided through fw_cfg rather than embedded in the IGVM file itself. Signed-off-by: Ruben Hakobyan --- vmm/src/vm_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index 88f8af4acf..8d1378b640 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -857,7 +857,7 @@ impl PayloadConfig { #[cfg(feature = "igvm")] { if self.igvm.is_some() { - if self.firmware.is_some() || self.kernel.is_some() { + if self.firmware.is_some() { return Err(PayloadConfigError::IgvmPlusOtherPayloads); } return Ok(()); From 6b0c58dd911d02c608433fbf2680c457edff48ac Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 08:05:30 -0700 Subject: [PATCH 03/18] arch, hypervisor, vmm: skip vcpu setup when using igvm and kvm When we use igvm + kvm, we setup the regs and sregs using the cpuid page. We still need to setup the fpu in configure_vcpu. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- arch/src/x86_64/mod.rs | 5 ++++- hypervisor/src/lib.rs | 2 +- vmm/src/cpu.rs | 51 +++++++++++++++++++++++++++++++----------- vmm/src/vm.rs | 10 +++++++++ 4 files changed, 53 insertions(+), 15 deletions(-) diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index d35a878e61..9f2ddf6dc4 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -819,6 +819,7 @@ pub fn configure_vcpu( cpu_vendor: CpuVendor, topology: (u16, u16, u16, u16), nested: bool, + setup_registers: bool, ) -> super::Result<()> { let x2apic_id = get_x2apic_id(id, Some(topology)); @@ -891,7 +892,9 @@ pub fn configure_vcpu( } regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; - if let Some((kernel_entry_point, guest_memory)) = boot_setup { + if let Some((kernel_entry_point, guest_memory)) = boot_setup + && setup_registers + { regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 90f931f771..4be179e642 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -64,7 +64,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 4b15cffc31..40be270538 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -546,6 +546,7 @@ impl Vcpu { #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), #[cfg(target_arch = "x86_64")] nested: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result<()> { #[cfg(target_arch = "aarch64")] { @@ -558,17 +559,32 @@ impl Vcpu { .map_err(Error::VcpuConfiguration)?; info!("Configuring vCPU: cpu_id = {}", 
self.id); #[cfg(target_arch = "x86_64")] - arch::configure_vcpu( - self.vcpu.as_ref(), - self.id, - boot_setup, - cpuid, - kvm_hyperv, - self.vendor, - topology, - nested, - ) - .map_err(Error::VcpuConfiguration)?; + { + // When IGVM is enabled, skip standard register setup here — the IGVM + // loader populates vCPU registers from the VMSA via set_sev_control_register + // (currently KVM-specific; MSHV handles this through its own import path). + // igvm_enabled is kept as an explicit flag rather than derived from sev_snp + // state because IGVM could theoretically be used independently of SEV-SNP. + cfg_if::cfg_if! { + if #[cfg(feature = "igvm")] { + let setup_registers = !igvm_enabled; + } else { + let setup_registers = true; + } + } + arch::configure_vcpu( + self.vcpu.as_ref(), + self.id, + boot_setup, + cpuid, + kvm_hyperv, + self.vendor, + topology, + nested, + setup_registers, + ) + .map_err(Error::VcpuConfiguration)?; + } Ok(()) } @@ -697,6 +713,8 @@ pub struct CpuManager { sev_snp_enabled: bool, // State of the core scheduling group leader election (VM mode). 
core_scheduling_group_leader: Arc, + #[cfg(feature = "igvm")] + igvm_enabled: bool, } const CPU_ENABLE_FLAG: usize = 0; @@ -896,6 +914,7 @@ impl CpuManager { #[cfg(feature = "tdx")] tdx_enabled: bool, numa_nodes: &NumaNodes, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, + #[cfg(feature = "igvm")] igvm_enabled: bool, ) -> Result>> { if config.max_vcpus > hypervisor.get_max_vcpus() { return Err(Error::MaximumVcpusExceeded( @@ -972,6 +991,8 @@ impl CpuManager { core_scheduling_group_leader: Arc::new(AtomicI32::new( CoreSchedulingLeader::Initial as i32, )), + #[cfg(feature = "igvm")] + igvm_enabled, }))) } @@ -1050,8 +1071,10 @@ impl CpuManager { vcpu: &mut Vcpu, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, ) -> Result<()> { - #[cfg(feature = "sev_snp")] - if self.sev_snp_enabled { + #[cfg(all(feature = "sev_snp", feature = "mshv"))] + if self.sev_snp_enabled + && self.hypervisor.hypervisor_type() == hypervisor::HypervisorType::Mshv + { if let Some((kernel_entry_point, _)) = boot_setup { vcpu.set_sev_control_register( kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE, @@ -1092,6 +1115,8 @@ impl CpuManager { self.config.kvm_hyperv, topology, self.config.nested, + #[cfg(feature = "igvm")] + self.igvm_enabled, )?; #[cfg(target_arch = "aarch64")] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 04bd2d595a..3e0cb8c285 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -729,6 +729,14 @@ impl Vm { let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); #[cfg(feature = "sev_snp")] let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); + #[cfg(feature = "igvm")] + let igvm_enabled = config + .lock() + .unwrap() + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_some(); let cpus_config = config.lock().unwrap().cpus.clone(); let cpu_manager = cpu::CpuManager::new( @@ -746,6 +754,8 @@ impl Vm { numa_nodes, #[cfg(feature = "sev_snp")] sev_snp_enabled, + #[cfg(feature = "igvm")] + igvm_enabled, ) .map_err(Error::CpuManager)?; From 
597f5e777ce6207f8dadaef07def8bee85fdff12 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 08:09:58 -0700 Subject: [PATCH 04/18] hypervisor, vmm: pass SNP guest policy to sev_snp_init The SNP guest policy (AMD SEV-SNP ABI bits controlling SMT, migration, debug, etc.) was previously hardcoded inside the MSHV implementation. Widen Vm::sev_snp_init() to accept an SnpPolicy parameter so each hypervisor backend receives the policy at init time. Add get_default_sev_snp_guest_policy() in the VMM to construct the default policy. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/mshv/mod.rs | 4 ++-- hypervisor/src/vm.rs | 4 +++- vmm/Cargo.toml | 7 ++++++- vmm/src/vm.rs | 18 +++++++++++++++++- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 8623531c5b..1119691273 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -58,7 +58,7 @@ pub use aarch64::VcpuMshvState; #[cfg(target_arch = "aarch64")] use aarch64::gic::{BASE_SPI_IRQ, MshvGicV2M}; #[cfg(feature = "sev_snp")] -use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +use igvm_defs::{IGVM_VHS_SNP_ID_BLOCK, SnpPolicy}; #[cfg(feature = "sev_snp")] use snp_constants::*; use vmm_sys_util::eventfd::EventFd; @@ -2254,7 +2254,7 @@ impl vm::Vm for MshvVm { /// Initialize the SEV-SNP VM #[cfg(feature = "sev_snp")] - fn sev_snp_init(&self) -> vm::Result<()> { + fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> vm::Result<()> { self.fd .set_partition_property( hv_partition_property_code_HV_PARTITION_PROPERTY_ISOLATION_STATE, diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 9d7e60a8be..e6787d19ef 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -17,6 +17,8 @@ use std::sync::Mutex; #[cfg(feature = "sev_snp")] use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +#[cfg(feature = "sev_snp")] +use 
igvm_defs::SnpPolicy; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; @@ -392,7 +394,7 @@ pub trait Vm: Send + Sync + Any { fn get_dirty_log(&self, slot: u32, base_gpa: u64, memory_size: u64) -> Result>; #[cfg(feature = "sev_snp")] /// Initialize SEV-SNP on this VM - fn sev_snp_init(&self) -> Result<()> { + fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> Result<()> { unimplemented!() } #[cfg(feature = "tdx")] diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index ab0278e6d1..7088db3219 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -30,7 +30,12 @@ mshv = [ "vm-device/mshv", ] pvmemcontrol = ["devices/pvmemcontrol"] -sev_snp = ["arch/sev_snp", "hypervisor/sev_snp", "virtio-devices/sev_snp"] +sev_snp = [ + "arch/sev_snp", + "hypervisor/sev_snp", + "igvm_defs", + "virtio-devices/sev_snp", +] tdx = ["arch/tdx", "hypervisor/tdx"] tracing = ["tracer/tracing"] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 3e0cb8c285..14d993ad6d 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -47,6 +47,8 @@ use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; +#[cfg(feature = "sev_snp")] +use igvm_defs::SnpPolicy; use libc::{SIGWINCH, termios}; use linux_loader::cmdline::Cmdline; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -528,6 +530,19 @@ pub struct Vm { impl Vm { pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; + #[cfg(feature = "sev_snp")] + pub fn get_default_sev_snp_guest_policy() -> SnpPolicy { + SnpPolicy::new() + .with_abi_minor(0) + .with_abi_major(0) + // SMT permitted: allows the guest to run on an SMT-enabled host. + // This is the permissive default; future work can expose this as a + // configurable platform option. 
+ .with_smt(1) + .with_reserved_must_be_one(1) + .with_migrate_ma(0) + } + #[allow(clippy::needless_pass_by_value)] #[allow(clippy::too_many_arguments)] pub fn new_from_memory_manager( @@ -971,7 +986,8 @@ impl Vm { .map_err(Error::CpuManager)?; // Initialize SEV-SNP - transitions guest into secure state - vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; + vm.sev_snp_init(Self::get_default_sev_snp_guest_policy()) + .map_err(Error::InitializeSevSnpVm)?; // Load payload for SEV-SNP (IGVM parser needs cpu_manager for cpuid) let load_payload_handle = if snapshot.is_none() { From 29713b5c9f9ce0ded7e2ab2973f3a5630a990ec0 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:39:37 -0700 Subject: [PATCH 05/18] vmm: make RSDP address optional in configure_system Change configure_system to take an Option<GuestAddress>, since the RSDP address is wrapped in an Option inside the function anyway (configure_system is also used to set up the MP tables). Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/vm.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 14d993ad6d..df87487ffd 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1663,7 +1663,11 @@ impl Vm { } #[cfg(target_arch = "x86_64")] - fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { + fn configure_system( + &mut self, + rsdp_addr: Option<GuestAddress>, + entry_addr: EntryPoint, + ) -> Result<()> { trace_scoped!("configure_system"); info!("Configuring system"); let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); @@ -1674,7 +1678,6 @@ impl Vm { }; let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); - let rsdp_addr = Some(rsdp_addr); let serial_number = self .config @@ -1726,7 +1729,7 @@ impl Vm { #[cfg(target_arch = "aarch64")] fn configure_system( &mut self, - _rsdp_addr: GuestAddress, + _rsdp_addr: Option<GuestAddress>, 
_entry_addr: EntryPoint, ) -> Result<()> { let cmdline = Self::generate_cmdline( @@ -2763,16 +2766,11 @@ impl Vm { let rsdp_addr = self.create_acpi_tables(); #[cfg(not(target_arch = "riscv64"))] - { - #[cfg(not(any(feature = "sev_snp", feature = "tdx")))] - assert!(rsdp_addr.is_some()); - // Configure shared state based on loaded kernel - if let Some(rsdp_adr) = rsdp_addr { - entry_point - .map(|entry_point| self.configure_system(rsdp_adr, entry_point)) - .transpose()?; - } - } + // Configure shared state based on loaded kernel + entry_point + .map(|entry_point| self.configure_system(rsdp_addr, entry_point)) + .transpose()?; + #[cfg(target_arch = "riscv64")] self.configure_system().unwrap(); From d3eb5e8f436bd09de57c5a2fefb1ba2b832fc7b0 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:40:26 -0700 Subject: [PATCH 06/18] vmm: remove sev_snp_enabled parameter from payload loading The load_payload and load_payload_async functions previously received a sev_snp_enabled flag to decide whether to call load_igvm with or without the host_data parameter. Replace this with a single code path that always passes host_data behind a cfg(feature = "sev_snp") gate, removing the runtime branch and the extra parameter threaded through three call sites. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/vm.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index df87487ffd..f7ba17440e 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -931,8 +931,6 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - false, )? } else { None @@ -996,7 +994,6 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, - true, )? 
} else { None @@ -1570,19 +1567,19 @@ impl Vm { payload: &PayloadConfig, memory_manager: Arc>, #[cfg(feature = "igvm")] cpu_manager: Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, ) -> Result { trace_scoped!("load_payload"); #[cfg(feature = "igvm")] { if let Some(_igvm_file) = &payload.igvm { let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; - #[cfg(feature = "sev_snp")] - if sev_snp_enabled { - return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); - } - #[cfg(not(feature = "sev_snp"))] - return Self::load_igvm(igvm, memory_manager, cpu_manager); + return Self::load_igvm( + igvm, + memory_manager, + cpu_manager, + #[cfg(feature = "sev_snp")] + &payload.host_data, + ); } } match (&payload.firmware, &payload.kernel) { @@ -1626,7 +1623,6 @@ impl Vm { memory_manager: &Arc>, config: &Arc>, #[cfg(feature = "igvm")] cpu_manager: &Arc>, - #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, ) -> Result>>> { // Kernel with TDX is loaded in a different manner #[cfg(feature = "tdx")] @@ -1653,8 +1649,6 @@ impl Vm { memory_manager, #[cfg(feature = "igvm")] cpu_manager, - #[cfg(feature = "sev_snp")] - sev_snp_enabled, ) }) .map_err(Error::KernelLoadThreadSpawn) From 2bf9acb4b874b9936f2376952903558964c5ea30 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Thu, 9 Apr 2026 16:58:58 -0700 Subject: [PATCH 07/18] igvm, vmm: parse IGVM file early and thread it through VM setup Move IGVM file parsing from load_igvm() into a dedicated parse_igvm() helper in igvm/mod.rs, and parse the file upfront in Vm::new() so the resulting IgvmFile struct is available throughout VM initialization. This is a prerequisite for extracting VMSA SEV features from the parsed IGVM before issuing KVM_SEV_INIT2, which needs sev_features. 
Signed-off-by: Ruben Hakobyan --- vmm/src/igvm/igvm_loader.rs | 14 ++++------- vmm/src/igvm/mod.rs | 8 +++++++ vmm/src/lib.rs | 2 ++ vmm/src/vm.rs | 46 +++++++++++++++++++++++++++++-------- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 6e256c1ecb..7669675430 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -4,12 +4,11 @@ // use std::collections::HashMap; use std::ffi::CString; -use std::io::{Read, Seek, SeekFrom}; use std::mem::size_of; use std::sync::{Arc, Mutex}; use igvm::snp_defs::SevVmsa; -use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader, IsolationType}; +use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader}; #[cfg(feature = "sev_snp")] use igvm_defs::{IGVM_VHS_MEMORY_MAP_ENTRY, MemoryMapEntryType}; use igvm_defs::{ @@ -51,6 +50,8 @@ pub enum Error { FailedToDecodeHostData(#[source] hex::FromHexError), #[error("Error allocating address space")] MemoryManager(MemoryManagerError), + #[error("Error reading the IGVM file")] + MissingIgvm, } #[allow(dead_code)] @@ -135,7 +136,7 @@ fn import_parameter( /// any isolation. 
#[allow(clippy::needless_pass_by_value)] pub fn load_igvm( - mut file: &std::fs::File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, cmdline: &str, @@ -143,7 +144,6 @@ pub fn load_igvm( ) -> Result, Error> { let mut loaded_info: Box = Box::default(); let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; - let mut file_contents = Vec::new(); let memory = memory_manager.lock().as_ref().unwrap().guest_memory(); let mut gpas: Vec = Vec::new(); let proc_count = cpu_manager.lock().unwrap().vcpus().len() as u32; @@ -156,12 +156,6 @@ pub fn load_igvm( .map_err(Error::FailedToDecodeHostData)?; } - file.seek(SeekFrom::Start(0)).map_err(Error::Igvm)?; - file.read_to_end(&mut file_contents).map_err(Error::Igvm)?; - - let igvm_file = IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) - .map_err(Error::InvalidIgvmFile)?; - let mask = match &igvm_file.platforms()[0] { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert!(info.platform_type == IgvmPlatformType::SEV_SNP); diff --git a/vmm/src/igvm/mod.rs b/vmm/src/igvm/mod.rs index 62c32d4e89..9803cf21e2 100644 --- a/vmm/src/igvm/mod.rs +++ b/vmm/src/igvm/mod.rs @@ -28,9 +28,17 @@ pub mod igvm_loader; mod loader; use igvm::snp_defs::SevVmsa; +use igvm::{IgvmFile, IsolationType}; use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; +use std::path::Path; use zerocopy::FromZeros; +pub fn parse_igvm(igvm_path: &Path) -> Result { + let file_contents = std::fs::read(igvm_path).map_err(igvm_loader::Error::Igvm)?; + IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp)) + .map_err(igvm_loader::Error::InvalidIgvmFile) +} + #[derive(Debug, Clone)] pub struct IgvmLoadedInfo { pub gpas: Vec, diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 92bf4c6b70..eefe0707ec 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1088,6 +1088,8 @@ impl Vmm { self.console_resize_pipe.clone(), Arc::clone(&self.original_termios_opt), Some(&snapshot), + #[cfg(feature = "igvm")] + None, ) 
.map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating VM from snapshot: {e:?}")) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index f7ba17440e..387e30a260 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -109,6 +109,8 @@ use crate::{ CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, GuestMemoryMmap, MEMORY_MANAGER_SNAPSHOT_ID, PciDeviceInfo, cpu, }; +#[cfg(feature = "igvm")] +use igvm::IgvmFile; /// Errors associated with VM management #[derive(Debug, Error)] @@ -327,10 +329,6 @@ pub enum Error { #[error("Error coredumping VM")] Coredump(#[source] GuestDebuggableError), - #[cfg(feature = "igvm")] - #[error("Cannot open igvm file")] - IgvmFile(#[source] io::Error), - #[cfg(feature = "igvm")] #[error("Cannot load the igvm into memory")] IgvmLoad(#[source] igvm_loader::Error), @@ -560,6 +558,7 @@ impl Vm { console_resize_pipe: Option>, original_termios: Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("Vm::new_from_memory_manager"); @@ -641,6 +640,8 @@ impl Vm { console_resize_pipe.as_ref(), &original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, )?; // Load kernel and initramfs files @@ -868,6 +869,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { #[cfg(feature = "mshv")] let is_mshv = matches!( @@ -902,6 +904,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ); } @@ -931,6 +935,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, )? 
} else { None @@ -975,6 +981,7 @@ impl Vm { console_resize_pipe: Option<&Arc>, original_termios: &Arc>>, snapshot: Option<&Snapshot>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Create boot vCPUs before SEV-SNP initialization cpu_manager @@ -994,6 +1001,8 @@ impl Vm { config, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, )? } else { None @@ -1294,6 +1303,18 @@ impl Vm { vm_config.lock().unwrap().is_tdx_enabled() }; + #[cfg(feature = "igvm")] + let igvm_file = { + let config = vm_config.lock().unwrap(); + config + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .map(|igvm_path| crate::igvm::parse_igvm(igvm_path)) + .transpose() + .map_err(Error::IgvmLoad)? + }; + let vm = Self::create_hypervisor_vm( hypervisor.as_ref(), vm_config.as_ref().lock().unwrap().deref().into(), @@ -1353,6 +1374,8 @@ impl Vm { console_resize_pipe, original_termios, snapshot, + #[cfg(feature = "igvm")] + igvm_file, ) } @@ -1471,13 +1494,13 @@ impl Vm { #[cfg(feature = "igvm")] #[allow(clippy::needless_pass_by_value)] fn load_igvm( - igvm: File, + igvm_file: IgvmFile, memory_manager: Arc>, cpu_manager: Arc>, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result { let res = igvm_loader::load_igvm( - &igvm, + igvm_file, memory_manager, cpu_manager.clone(), "", @@ -1567,14 +1590,16 @@ impl Vm { payload: &PayloadConfig, memory_manager: Arc>, #[cfg(feature = "igvm")] cpu_manager: Arc>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result { trace_scoped!("load_payload"); #[cfg(feature = "igvm")] { - if let Some(_igvm_file) = &payload.igvm { - let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; + if payload.igvm.is_some() { + let igvm_file = + igvm_file.ok_or(Error::IgvmLoad(igvm_loader::Error::MissingIgvm))?; return Self::load_igvm( - igvm, + igvm_file, memory_manager, cpu_manager, #[cfg(feature = "sev_snp")] @@ -1623,6 +1648,7 @@ impl Vm { memory_manager: &Arc>, config: &Arc>, #[cfg(feature = "igvm")] cpu_manager: 
&Arc>, + #[cfg(feature = "igvm")] igvm_file: Option, ) -> Result>>> { // Kernel with TDX is loaded in a different manner #[cfg(feature = "tdx")] @@ -1649,6 +1675,8 @@ impl Vm { memory_manager, #[cfg(feature = "igvm")] cpu_manager, + #[cfg(feature = "igvm")] + igvm_file, ) }) .map_err(Error::KernelLoadThreadSpawn) From 01d4d47493dfbff6e75efbf82a06cb9626ddef31 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 14:23:13 -0700 Subject: [PATCH 08/18] hypervisor, vmm: Add KVM SEV_{INIT2, SNP_LAUNCH_START} support Introduce the SevFd abstraction that wraps /dev/sev and implements the KVM_SEV_INIT2 and KVM_SEV_SNP_LAUNCH_START ioctls for SEV-SNP VM initialization on KVM. Key changes: - Add sev.rs with KvmSevInit and KvmSevSnpLaunchStart ioctl structs matching the kernel layout (linux/arch/x86/include/uapi/asm/kvm.h) - Implement KVM_SEV_INIT2 and KVM_SEV_SNP_LAUNCH_START ioctls - Set KVM_MEMORY_ATTRIBUTE_PRIVATE on newly created memory regions when guest_memfd is supported - Widen SevSnpPageAccessProxy cfg gates from mshv-only to all sev_snp-enabled builds - Make sev_snp_init a required trait method (remove default impl) - Include KVM_SEV_SNP_LAUNCH_START in the seccomp allowlist - Parse VMSA SEV features from IGVM and include them in the KVM_SEV_INIT2 ioctl Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Co-developed-by: Rob Bradford Signed-off-by: Rob Bradford Signed-off-by: Ruben Hakobyan --- hypervisor/src/cpu.rs | 5 +- hypervisor/src/hypervisor.rs | 5 ++ hypervisor/src/kvm/mod.rs | 77 ++++++++++++++++---- hypervisor/src/kvm/x86_64/mod.rs | 3 + hypervisor/src/kvm/x86_64/sev.rs | 117 +++++++++++++++++++++++++++++++ hypervisor/src/lib.rs | 2 + hypervisor/src/vm.rs | 4 +- vmm/src/device_manager.rs | 8 +-- vmm/src/igvm/igvm_loader.rs | 15 ++++ vmm/src/lib.rs | 2 + vmm/src/seccomp_filters.rs | 2 + vmm/src/vm.rs | 14 ++-- 12 files changed, 229 insertions(+), 25 deletions(-) create mode 100644 
hypervisor/src/kvm/x86_64/sev.rs diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index 4bc348a98d..a4a029e989 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -587,10 +587,11 @@ pub trait Vcpu: Send + Sync { ) -> Result<[u32; 4]> { unimplemented!() } - #[cfg(feature = "mshv")] - fn set_sev_control_register(&self, _reg: u64) -> Result<()> { + #[cfg(feature = "sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> Result<()> { unimplemented!() } + /// /// Sets the value of GIC redistributor address /// diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index a25f8a9bf7..05852a230f 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -96,6 +96,11 @@ pub enum HypervisorError { #[cfg(target_arch = "x86_64")] #[error("Failed to enable AMX tile state components")] CouldNotEnableAmxStateComponents(#[source] crate::arch::x86::AmxGuestSupportError), + /// + /// Failed to retrieve SEV-SNP capabilities + /// + #[error("Failed to retrieve SEV-SNP capabilities:{0}")] + SevSnpCapabilities(#[source] anyhow::Error), } /// diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 76b6b23da1..442a75176c 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -143,6 +143,9 @@ ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); #[cfg(target_arch = "x86_64")] const VIRTUAL_ADDRESS_SIZE: u64 = 1 << 48; +#[cfg(feature = "sev_snp")] +use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -501,9 +504,11 @@ struct KvmDirtyLogSlot { /// Wrapper over KVM VM ioctls. 
pub struct KvmVm { - fd: VmFd, + fd: Arc, #[cfg(target_arch = "x86_64")] msrs: Vec, + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + sev_fd: Option, dirty_log_slots: RwLock>, #[cfg(target_arch = "x86_64")] memfd: Option, @@ -624,6 +629,15 @@ impl KvmVm { /// let vm = hypervisor.create_vm(HypervisorVmConfig::default()).expect("new VM fd creation failed"); /// ``` impl vm::Vm for KvmVm { + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn sev_snp_init(&self, guest_policy: igvm_defs::SnpPolicy) -> vm::Result<()> { + self.sev_fd + .as_ref() + .unwrap() + .launch_start(&self.fd, guest_policy) + .map_err(|e| vm::HypervisorVmError::InitializeSevSnp(e.into())) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. @@ -925,6 +939,18 @@ impl vm::Vm for KvmVm { self.set_user_memory_region(region) .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; } + + #[cfg(feature = "sev_snp")] + if self.kvm_guest_memfd_supported { + self.fd + .set_memory_attributes(kvm_memory_attributes { + address: region.guest_phys_addr, + size: region.memory_size, + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))?; + } Ok(()) } @@ -1363,15 +1389,17 @@ impl hypervisor::Hypervisor for KvmHypervisor { } #[cfg(target_arch = "x86_64")] - cfg_if::cfg_if! 
{ - if #[cfg(feature = "tdx")] { - if _config.tdx_enabled { - vm_type = KVM_X86_SW_PROTECTED_VM.into(); - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); - } - } else { - vm_type = KVM_X86_DEFAULT_VM.into(); + { + vm_type = KVM_X86_DEFAULT_VM.into(); + + #[cfg(feature = "sev_snp")] + if _config.sev_snp_enabled { + vm_type = KVM_X86_SNP_VM.into(); + } + + #[cfg(feature = "tdx")] + if _config.tdx_enabled { + vm_type = KVM_X86_SW_PROTECTED_VM.into(); } } @@ -1423,10 +1451,35 @@ impl hypervisor::Hypervisor for KvmHypervisor { }; } + #[cfg(feature = "sev_snp")] + let sev_fd = { + let sev_snp_enabled = vm_type == KVM_X86_SNP_VM as u64; + if sev_snp_enabled { + let mask = self.kvm.check_extension_int(crate::kvm::Cap::ExitHypercall); + let cap = kvm_bindings::kvm_enable_cap { + cap: kvm_bindings::KVM_CAP_EXIT_HYPERCALL, + args: [mask as _, 0, 0, 0], + ..Default::default() + }; + fd.enable_cap(&cap) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + let sev_dev = x86_64::sev::SevFd::new("/dev/sev") + .map_err(|e| hypervisor::HypervisorError::SevSnpCapabilities(e.into()))?; + sev_dev + .init2(&fd, _config.vmsa_features) + .map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; + Some(sev_dev) + } else { + None + } + }; + Ok(Arc::new(KvmVm { - fd, + fd: Arc::new(fd), msrs, dirty_log_slots: RwLock::new(HashMap::new()), + #[cfg(feature = "sev_snp")] + sev_fd, memfd, kvm_guest_memfd_supported, })) @@ -1435,7 +1488,7 @@ impl hypervisor::Hypervisor for KvmHypervisor { #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] { Ok(Arc::new(KvmVm { - fd, + fd: Arc::new(fd), dirty_log_slots: RwLock::new(HashMap::new()), kvm_guest_memfd_supported: false, })) diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index e338346c3f..62185fd84e 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -31,6 +31,9 @@ use crate::arch::x86::{ }; use crate::kvm::{Cap, Kvm, KvmError, KvmResult}; +#[cfg(feature 
= "sev_snp")] +pub(crate) mod sev; + /// /// Check KVM extension for Linux /// diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs new file mode 100644 index 0000000000..6f7c6fd5eb --- /dev/null +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -0,0 +1,117 @@ +// Copyright 2025 Google LLC. +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs::OpenOptions; +use std::os::fd::{AsRawFd, OwnedFd}; +use std::os::unix::fs::OpenOptionsExt; +use std::path::Path; + +use igvm_defs::SnpPolicy; +use kvm_bindings::kvm_sev_cmd; +use kvm_ioctls::VmFd; +use log::{error, info}; +use vmm_sys_util::errno; + +pub(crate) type Result = std::result::Result; + +// KVM SEV command IDs — linux/include/uapi/linux/kvm.h +const KVM_SEV_INIT2: u32 = 22; +const KVM_SEV_SNP_LAUNCH_START: u32 = 100; + +// SNP in VMSA - linux/arch/x86/include/asm/svm.h +const SVM_SEV_FEAT_SNP_ACTIVE: u64 = 1 << 0; + +fn sev_op(vm: &VmFd, sev_cmd: &mut kvm_sev_cmd, name: &str) -> Result<()> { + let ret = vm.encrypt_op_sev(sev_cmd); + if ret.is_err() { + error!("{name} op failed. error code: 0x{:x}", sev_cmd.error); + } + ret +} + +#[derive(Debug)] +pub struct SevFd { + pub fd: OwnedFd, +} + +// These ioctl structs must match the kernel layout exactly. 
+// Layouts from linux/arch/x86/include/uapi/asm/kvm.h + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevInit { + pub vmsa_features: u64, + pub flags: u32, + pub ghcb_version: u16, + pub pad1: u16, + pub pad2: [u32; 8], +} + +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchStart { + pub policy: u64, + pub gosvw: [u8; 16], + pub flags: u16, + pub pad0: [u8; 6], + pub pad1: [u64; 4], +} + +impl SevFd { + pub(crate) fn new(sev_path: impl AsRef) -> Result { + // give sev device rw and close on exec + let file_r = OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_CLOEXEC) + .open(sev_path.as_ref()); + if let Ok(file) = file_r { + Ok(SevFd { + fd: OwnedFd::from(file), + }) + } else { + Err(errno::Error::last()) + } + } + + pub(crate) fn init2(&self, vm: &VmFd, vmsa_features: u64) -> Result<()> { + // Clear the SNP bit, KVM sets it directly + let vmsa_features = vmsa_features & !SVM_SEV_FEAT_SNP_ACTIVE; + + // TODO: Query KVM for supported VMSA features before calling init2 + if vmsa_features != 0 { + info!("SEV-SNP: requesting vmsa_features: {vmsa_features:#x}"); + } + + let mut init = KvmSevInit { + vmsa_features, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_INIT2, + data: &mut init as *mut KvmSevInit as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_INIT2") + } + + pub(crate) fn launch_start(&self, vm: &VmFd, guest_policy: SnpPolicy) -> Result<()> { + // See AMD Spec Section 4.3 - Guest Policy + // Bit 17 is reserved and has to be one. 
+ // https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP + let mut start: KvmSevSnpLaunchStart = KvmSevSnpLaunchStart { + policy: guest_policy.into_bits(), + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_START, + data: &mut start as *mut KvmSevSnpLaunchStart as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_START") + } +} diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index 4be179e642..cb106a131c 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -196,6 +196,8 @@ pub struct HypervisorVmConfig { pub sev_snp_enabled: bool, #[cfg(feature = "sev_snp")] pub mem_size: u64, + #[cfg(feature = "sev_snp")] + pub vmsa_features: u64, pub nested: bool, pub smt_enabled: bool, } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index e6787d19ef..36aae27b08 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -394,9 +394,7 @@ pub trait Vm: Send + Sync + Any { fn get_dirty_log(&self, slot: u32, base_gpa: u64, memory_size: u64) -> Result>; #[cfg(feature = "sev_snp")] /// Initialize SEV-SNP on this VM - fn sev_snp_init(&self, _guest_policy: SnpPolicy) -> Result<()> { - unimplemented!() - } + fn sev_snp_init(&self, guest_policy: SnpPolicy) -> Result<()>; #[cfg(feature = "tdx")] /// Initialize TDX on this VM fn tdx_init(&self, _cpuid: &[CpuIdEntry], _max_vcpus: u32) -> Result<()> { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index edb8b76af0..d7bab3c1bd 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -927,26 +927,26 @@ pub struct AcpiPlatformAddresses { pub sleep_status_reg_address: Option, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] struct SevSnpPageAccessProxy { vm: Arc, } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl std::fmt::Debug for SevSnpPageAccessProxy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) 
-> std::fmt::Result { write!(f, "SNP Page access proxy") } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl SevSnpPageAccessProxy { fn new(vm: Arc) -> SevSnpPageAccessProxy { SevSnpPageAccessProxy { vm } } } -#[cfg(all(feature = "mshv", feature = "sev_snp"))] +#[cfg(feature = "sev_snp")] impl AccessPlatform for SevSnpPageAccessProxy { fn translate_gpa(&self, base: u64, _size: u64) -> std::result::Result { Ok(base) diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 7669675430..92ee364e6d 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -129,6 +129,21 @@ fn import_parameter( Ok(()) } +/// +/// Extract sev_features from the boot CPU (vp_index 0) VMSA. +/// +#[cfg(feature = "sev_snp")] +pub fn extract_sev_features(igvm_file: &IgvmFile) -> u64 { + for header in igvm_file.directives() { + if let IgvmDirectiveHeader::SnpVpContext { vp_index, vmsa, .. } = header + && *vp_index == 0 + { + return vmsa.sev_features.into(); + } + } + 0 +} + /// /// Load the given IGVM file to guest memory. /// Right now it only supports SNP based isolation. 
diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index eefe0707ec..6d559dac1d 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -230,6 +230,8 @@ impl From<&VmConfig> for hypervisor::HypervisorVmConfig { sev_snp_enabled: _value.is_sev_snp_enabled(), #[cfg(feature = "sev_snp")] mem_size: _value.memory.total_size(), + #[cfg(feature = "sev_snp")] + vmsa_features: 0, nested: _value.cpus.nested, smt_enabled: _value .cpus diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 5c90138147..0cedb76dac 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -111,6 +111,7 @@ mod kvm { pub const KVM_NMI: u64 = 0xae9a; pub const KVM_GET_NESTED_STATE: u64 = 3229658814; pub const KVM_SET_NESTED_STATE: u64 = 1082175167; + pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; } // Block device ioctls (not exported by libc) @@ -255,6 +256,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], ]) } diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 387e30a260..e4749892f8 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -1315,10 +1315,16 @@ impl Vm { .map_err(Error::IgvmLoad)? }; - let vm = Self::create_hypervisor_vm( - hypervisor.as_ref(), - vm_config.as_ref().lock().unwrap().deref().into(), - )?; + let vm = { + #[allow(unused_mut)] + let mut hv_config: hypervisor::HypervisorVmConfig = + vm_config.as_ref().lock().unwrap().deref().into(); + #[cfg(all(feature = "igvm", feature = "sev_snp"))] + if let Some(ref igvm) = igvm_file { + hv_config.vmsa_features = igvm_loader::extract_sev_features(igvm); + } + Self::create_hypervisor_vm(hypervisor.as_ref(), hv_config)? 
+ }; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] if vm_config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY { From ad585910f1ac502daea20432addaffa6ad30580e Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 16:43:32 -0700 Subject: [PATCH 09/18] hypervisor, vmm: Add support for KVM_SEV_SNP_LAUNCH_UPDATE Implement the KVM_SEV_SNP_LAUNCH_UPDATE ioctl. Extend Vm::import_isolated_pages() with a uaddrs parameter carrying host virtual addresses, which KVM needs, unlike MSHV. Compute uaddrs from guest memory mappings in the IGVM loader. Add KVM_SEV_SNP_LAUNCH_UPDATE to the seccomp allowlist. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 40 +++++++++++++++++++++++++++ hypervisor/src/kvm/x86_64/sev.rs | 47 ++++++++++++++++++++++++++++++++ hypervisor/src/mshv/mod.rs | 1 + hypervisor/src/vm.rs | 1 + vmm/src/igvm/igvm_loader.rs | 14 ++++++++++ vmm/src/seccomp_filters.rs | 2 ++ 6 files changed, 105 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 442a75176c..4d6d937ca2 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -145,6 +145,8 @@ const VIRTUAL_ADDRESS_SIZE: u64 = 1 << 48; #[cfg(feature = "sev_snp")] use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; +#[cfg(feature = "sev_snp")] +use x86_64::sev; #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; @@ -638,6 +640,44 @@ impl vm::Vm for KvmVm { .map_err(|e| vm::HypervisorVmError::InitializeSevSnp(e.into())) } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn import_isolated_pages( + &self, + page_type: u32, + page_size: u32, + // host page frame numbers + pfns: &[u64], + uaddrs: &[u64], + ) -> vm::Result<()> { + if pfns.is_empty() { + return Ok(()); + } + assert_eq!(pfns.len(), uaddrs.len()); + // VMSA pages are not supported by launch_update + // 
https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2377 + if page_type == sev::SNP_PAGE_TYPE_VMSA { + return Ok(()); + } + for i in 0..pfns.len() { + self.fd + .set_memory_attributes(kvm_memory_attributes { + address: pfns[i] << sev::GPA_METADATA_PADDING, + size: page_size as u64, + attributes: kvm_bindings::KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + // Flags must be zero o/w error (flags aren't being used here yet) + flags: 0, + }) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + self.sev_fd + .as_ref() + .unwrap() + .launch_update(&self.fd, uaddrs[i], page_size as u64, pfns[i], page_type) + .map_err(|e| vm::HypervisorVmError::ImportIsolatedPages(e.into()))?; + } + + Ok(()) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs index 6f7c6fd5eb..dc5fc39425 100644 --- a/hypervisor/src/kvm/x86_64/sev.rs +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -19,6 +19,14 @@ pub(crate) type Result = std::result::Result; // KVM SEV command IDs — linux/include/uapi/linux/kvm.h const KVM_SEV_INIT2: u32 = 22; const KVM_SEV_SNP_LAUNCH_START: u32 = 100; +const KVM_SEV_SNP_LAUNCH_UPDATE: u32 = 101; +// SNP_LAUNCH_UPDATE page types — linux/arch/x86/include/uapi/asm/sev-guest.h +pub const SNP_PAGE_TYPE_VMSA: u32 = 2; + +// See AMD Spec Section 8.17 — SNP_LAUNCH_UPDATE +// The last 12 bits are metadata about the guest context +// https://docs.amd.com/v/u/en-US/56860_PUB_1.58_SEV_SNP +pub const GPA_METADATA_PADDING: u32 = 12; // SNP in VMSA - linux/arch/x86/include/asm/svm.h const SVM_SEV_FEAT_SNP_ACTIVE: u64 = 1 << 0; @@ -59,6 +67,19 @@ pub(crate) struct KvmSevSnpLaunchStart { pub pad1: [u64; 4], } +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchUpdate { + pub gfn_start: u64, + pub uaddr: u64, + pub len: u64, + pub type_: u8, + pub pad0: u8, + pub flags: u16, + pub pad1: 
u32, + pub pad2: [u64; 4], +} + impl SevFd { pub(crate) fn new(sev_path: impl AsRef) -> Result { // give sev device rw and close on exec @@ -114,4 +135,30 @@ impl SevFd { }; sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_START") } + + pub(crate) fn launch_update( + &self, + vm: &VmFd, + // host virtual address + hva: u64, + size: u64, + // guest frame number + gfn_start: u64, + page_type: u32, + ) -> Result<()> { + let mut update = KvmSevSnpLaunchUpdate { + gfn_start, + uaddr: hva, + len: size, + type_: page_type as u8, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_UPDATE, + data: &mut update as *mut KvmSevSnpLaunchUpdate as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_UPDATE") + } } diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 1119691273..a61f2e44ef 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -2272,6 +2272,7 @@ impl vm::Vm for MshvVm { page_type: u32, page_size: u32, pages: &[u64], + _uaddrs: &[u64], ) -> vm::Result<()> { debug_assert!(page_size == hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB); if pages.is_empty() { diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 36aae27b08..6d3a4a4ae5 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -429,6 +429,7 @@ pub trait Vm: Send + Sync + Any { _page_type: u32, _page_size: u32, _pages: &[u64], + _uaddrs: &[u64], ) -> Result<()> { unimplemented!() } diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 92ee364e6d..b117425be1 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -21,6 +21,9 @@ use mshv_bindings::*; use thiserror::Error; use zerocopy::IntoBytes; +#[cfg(feature = "sev_snp")] +use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; + #[cfg(feature = "sev_snp")] use crate::GuestMemoryMmap; use crate::cpu::CpuManager; @@ -471,6 +474,16 @@ pub fn load_igvm( 
.iter() .map(|gpa| gpa.gpa >> HV_HYP_PAGE_SHIFT) .collect(); + let guest_memory = memory_manager.lock().unwrap().guest_memory().memory(); + let uaddrs: Vec<_> = group + .iter() + .map(|gpa| { + let guest_region_mmap = guest_memory.to_region_addr(GuestAddress(gpa.gpa)); + let uaddr_base = guest_region_mmap.unwrap().0.as_ptr() as u64; + let uaddr_offset: u64 = guest_region_mmap.unwrap().1.0; + uaddr_base + uaddr_offset + }) + .collect(); memory_manager .lock() .unwrap() @@ -479,6 +492,7 @@ pub fn load_igvm( group[0].page_type, hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, &pfns, + &uaddrs, ) .map_err(Error::ImportIsolatedPages)?; } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 0cedb76dac..4878ea8219 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -112,6 +112,7 @@ mod kvm { pub const KVM_GET_NESTED_STATE: u64 = 3229658814; pub const KVM_SET_NESTED_STATE: u64 = 1082175167; pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; + pub const KVM_SEV_SNP_LAUNCH_UPDATE: u64 = 0x8018_aeb5; } // Block device ioctls (not exported by libc) @@ -257,6 +258,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_UPDATE)?], ]) } From 2f73f6745764cf9b507ac1ea4053efccb427ed3a Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 16:57:42 -0700 Subject: [PATCH 10/18] hypervisor, vmm: Add support for KVM_SEV_SNP_LAUNCH_FINISH Add the KVM_SEV_SNP_LAUNCH_FINISH ioctl, which finalizes the SNP launch sequence and transitions the VM into a runnable encrypted state. Additionally, add KVM_SEV_SNP_LAUNCH_FINISH to the seccomp allowlist. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 19 +++++++++++++++ hypervisor/src/kvm/x86_64/sev.rs | 42 +++++++++++++++++++++++++++++++- vmm/src/seccomp_filters.rs | 2 ++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 4d6d937ca2..2a06271ee1 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -678,6 +678,25 @@ impl vm::Vm for KvmVm { Ok(()) } + #[cfg(all(feature = "sev_snp", target_arch = "x86_64"))] + fn complete_isolated_import( + &self, + snp_id_block: igvm_defs::IGVM_VHS_SNP_ID_BLOCK, + host_data: [u8; 32], + id_block_enabled: u8, + ) -> vm::Result<()> { + self.sev_fd + .as_ref() + .unwrap() + .launch_finish( + &self.fd, + host_data, + id_block_enabled, + snp_id_block.author_key_enabled, + ) + .map_err(|e| vm::HypervisorVmError::CompleteIsolatedImport(e.into())) + } + #[cfg(target_arch = "x86_64")] /// /// Sets the address of the one-page region in the VM's address space. 
diff --git a/hypervisor/src/kvm/x86_64/sev.rs b/hypervisor/src/kvm/x86_64/sev.rs index dc5fc39425..0ed4dabf74 100644 --- a/hypervisor/src/kvm/x86_64/sev.rs +++ b/hypervisor/src/kvm/x86_64/sev.rs @@ -11,7 +11,7 @@ use std::path::Path; use igvm_defs::SnpPolicy; use kvm_bindings::kvm_sev_cmd; use kvm_ioctls::VmFd; -use log::{error, info}; +use log::{debug, error, info}; use vmm_sys_util::errno; pub(crate) type Result = std::result::Result; @@ -20,6 +20,7 @@ pub(crate) type Result = std::result::Result; const KVM_SEV_INIT2: u32 = 22; const KVM_SEV_SNP_LAUNCH_START: u32 = 100; const KVM_SEV_SNP_LAUNCH_UPDATE: u32 = 101; +const KVM_SEV_SNP_LAUNCH_FINISH: u32 = 102; // SNP_LAUNCH_UPDATE page types — linux/arch/x86/include/uapi/asm/sev-guest.h pub const SNP_PAGE_TYPE_VMSA: u32 = 2; @@ -80,6 +81,21 @@ pub(crate) struct KvmSevSnpLaunchUpdate { pub pad2: [u64; 4], } +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub(crate) struct KvmSevSnpLaunchFinish { + pub id_block_uaddr: u64, + pub id_auth_uaddr: u64, + pub id_block_en: u8, + pub auth_key_en: u8, + pub vcek_disabled: u8, + pub host_data: [u8; 32], + pub pad0: [u8; 3], + // must be zero https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2506 + pub flags: u16, + pub pad1: [u64; 4], +} + impl SevFd { pub(crate) fn new(sev_path: impl AsRef) -> Result { // give sev device rw and close on exec @@ -161,4 +177,28 @@ impl SevFd { }; sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_UPDATE") } + + pub(crate) fn launch_finish( + &self, + vm: &VmFd, + host_data: [u8; 32], + id_block_en: u8, + auth_key_en: u8, + ) -> Result<()> { + let mut finish = KvmSevSnpLaunchFinish { + host_data, + id_block_en, + auth_key_en, + ..Default::default() + }; + let mut sev_cmd = kvm_sev_cmd { + id: KVM_SEV_SNP_LAUNCH_FINISH, + data: &mut finish as *mut KvmSevSnpLaunchFinish as _, + sev_fd: self.fd.as_raw_fd() as _, + ..Default::default() + }; + let flags = finish.flags; + debug!("Calling KVM_SEV_SNP_LAUNCH_FINISH, flags: 
{flags}"); + sev_op(vm, &mut sev_cmd, "KVM_SEV_SNP_LAUNCH_FINISH") + } } diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 4878ea8219..0db2450be5 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -113,6 +113,7 @@ mod kvm { pub const KVM_SET_NESTED_STATE: u64 = 1082175167; pub const KVM_SEV_SNP_LAUNCH_START: u64 = 0x4018_aeb4; pub const KVM_SEV_SNP_LAUNCH_UPDATE: u64 = 0x8018_aeb5; + pub const KVM_SEV_SNP_LAUNCH_FINISH: u64 = 0x4008_aeb7; } // Block device ioctls (not exported by libc) @@ -259,6 +260,7 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_START)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_UPDATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SEV_SNP_LAUNCH_FINISH)?], ]) } From 3069e84d62533c3cfc9573c239485b012f186933 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:04:58 -0700 Subject: [PATCH 11/18] hypervisor: Handle KVM_HC_MAP_GPA_RANGE hypercalls SEV-SNP guests will issue this hypercall to signal a change in the page encryption status to the hypervisor. Handle VcpuExit::Hypercall in the KVM vCPU run loop: decode the GPA, page count, and private/shared attribute from the hypercall arguments, then call KVM_SET_MEMORY_ATTRIBUTES to update the page state. 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 2a06271ee1..a53bdd88c7 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -31,6 +31,8 @@ use anyhow::anyhow; #[cfg(target_arch = "x86_64")] use kvm_bindings::kvm_create_guest_memfd; use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; +#[cfg(feature = "sev_snp")] +use log::info; #[cfg(target_arch = "x86_64")] use log::warn; use vmm_sys_util::errno; @@ -143,6 +145,8 @@ ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); #[cfg(target_arch = "x86_64")] const VIRTUAL_ADDRESS_SIZE: u64 = 1 << 48; +#[cfg(feature = "sev_snp")] +use igvm_defs::PAGE_SIZE_4K; #[cfg(feature = "sev_snp")] use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; #[cfg(feature = "sev_snp")] @@ -777,6 +781,8 @@ impl vm::Vm for KvmVm { hyperv_synic: AtomicBool::new(false), #[cfg(target_arch = "x86_64")] xsave_size, + #[cfg(feature = "sev_snp")] + vm_fd: self.fd.clone(), }; Ok(Box::new(vcpu)) } @@ -1634,6 +1640,8 @@ pub struct KvmVcpu { hyperv_synic: AtomicBool, #[cfg(target_arch = "x86_64")] xsave_size: i32, + #[cfg(feature = "sev_snp")] + vm_fd: Arc, } /// Implementation of Vcpu trait for KVM @@ -2293,6 +2301,49 @@ impl cpu::Vcpu for KvmVcpu { #[cfg(feature = "tdx")] VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), + #[cfg(feature = "sev_snp")] + VcpuExit::Hypercall(hypercall) => { + // https://docs.kernel.org/virt/kvm/x86/hypercalls.html#kvm-hc-map-gpa-range + info!("VcpuExit::Hypercall"); + const KVM_HC_MAP_GPA_RANGE: u64 = 12; + // 4th bit of attributes argument is encrypted page bit + match hypercall.nr { + KVM_HC_MAP_GPA_RANGE => { + info!("Handling KVM_HC_MAP_GPA_RANGE 
hypercall"); + // guest physical address of start page + let address = hypercall.args[0]; + // num pages to map from start address + let num_pages = hypercall.args[1]; + // bits[0-3] = page size encoding + // bits[4] = 1 if private, 0 if shared + // bits[5-63] = zero + let attributes = hypercall.args[2]; + // TODO: Add 2mb page support + let size = num_pages * PAGE_SIZE_4K; + // bit 4 = private attribute encoding + const PRIVATE_ENCODING_BITMASK: u64 = 0b10000; + info!("hypercall attributes: {attributes:#b}"); + let set_private_attr = if attributes & PRIVATE_ENCODING_BITMASK > 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + // the only attribute available is private, o/w 0 + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-memory-attributes + 0u64 + }; + let mem_attributes = kvm_memory_attributes { + address, + size, + attributes: set_private_attr, + ..Default::default() + }; + self.vm_fd + .set_memory_attributes(mem_attributes) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } + _ => Ok(cpu::VmExit::Ignore), + } + } r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {r:?}" From 973086936c2ec33e65dff6e05c30c7ff2f0ddf3a Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:21:15 -0700 Subject: [PATCH 12/18] hypervisor: handle VcpuExit::MemoryFault for AP boot page conversions During SNP boot all guest RAM is initially marked KVM_MEMORY_ATTRIBUTE_PRIVATE. Pages imported via SNP_LAUNCH_UPDATE are properly accepted by the guest, but generic RAM pages (e.g. the AP trampoline at GPA 0xD000) are not. When stage0 on the BSP starts secondary vCPUs via x2APIC, the APs try to execute from the trampoline page through the shared mapping while KVM still has it marked private, causing a KVM_EXIT_MEMORY_FAULT (flags=KVM_MEMORY_EXIT_FLAG_PRIVATE) that previously fell through to the catch-all error, killing the VM. 
Handle VcpuExit::MemoryFault by toggling the page's memory attribute between private and shared based on the exit flags, allowing the vCPU to retry the access. Signed-off-by: Ruben Hakobyan --- hypervisor/src/kvm/mod.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index a53bdd88c7..4b41713568 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -2345,6 +2345,32 @@ impl cpu::Vcpu for KvmVcpu { } } + #[cfg(feature = "sev_snp")] + VcpuExit::MemoryFault { flags, gpa, size } => { + info!("VcpuExit::MemoryFault: flags={flags:#x}, gpa={gpa:#x}, size={size:#x}"); + + const KVM_MEMORY_EXIT_FLAG_PRIVATE: u64 = + kvm_bindings::KVM_MEMORY_EXIT_FLAG_PRIVATE as u64; + + const KVM_MEMORY_ATTRIBUTE_SHARED: u64 = 0; + + let attributes = if flags & KVM_MEMORY_EXIT_FLAG_PRIVATE != 0 { + KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 + } else { + KVM_MEMORY_ATTRIBUTE_SHARED + }; + + self.vm_fd + .set_memory_attributes(kvm_memory_attributes { + address: gpa, + size, + attributes, + flags: 0, + }) + .map(|_| cpu::VmExit::Ignore) + .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())) + } + r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "Unexpected exit reason on vcpu run: {r:?}" ))), From bfb66710d77d55d9e846f86013658503f29d3ab0 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 17:30:24 -0700 Subject: [PATCH 13/18] vmm: add KVM SEV-SNP support to IGVM loader Adapt the IGVM loader to work with both MSHV and KVM backends, which differ in page type constants, CPUID page layout, and VMSA handling. Abstract page types into a PageTypeConfig struct populated at runtime from the detected hypervisor, replacing hardcoded mshv_bindings constants. Apply the VMSA register state to each vCPU via setup_sev_snp_regs(), translating SevSelector attributes to KVM segment format using a bitfield decoder. 
KVM's SNP launch path sanitizes certain CPUID bits that could lead to an insecure guest. If the VMM sets these bits, KVM rejects the CPUID page import on the first attempt, requiring a retry with the firmware-corrected values. Pre-clear the known problematic bits before import to avoid the reject-and-retry cycle: - Leaf 0x1, ECX bit 24: TSC_DEADLINE (filtered by KVM) - Leaf 0x7, EBX bit 1: TSC_ADJUST (filtered by KVM) - Leaf 0x7, EDX: clear entirely (contains speculative features) - Leaf 0x80000008, EBX bit 25: filtered by KVM - Leaf 0x80000021, ECX: clear entirely This keeps the CPUID page stable across launch updates and avoids noisy error logs from the retry path. Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Co-authored-by: Dylan Reid Signed-off-by: Dylan Reid Signed-off-by: Ruben Hakobyan --- Cargo.lock | 7 + hypervisor/Cargo.toml | 1 + hypervisor/src/cpu.rs | 4 + hypervisor/src/kvm/mod.rs | 112 ++++++++++++- vmm/src/config.rs | 28 +++- vmm/src/cpu.rs | 17 +- vmm/src/igvm/igvm_loader.rs | 321 ++++++++++++++++++++++++++++++------ 7 files changed, 429 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f60d47d317..e4e4e414bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -285,6 +285,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "bitfield" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5acf59e2452f0c4b968b15ce4b9468f57b45f7733b919d68b19fcc39264bfb8" + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -998,6 +1004,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "bitfield", "bitfield-struct 0.12.1", "byteorder", "cfg-if", diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index 1ffaa46b78..f0bc6e5fe5 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -16,6 +16,7 @@ tdx = [] [dependencies] anyhow = { workspace = true } arc-swap = "1.9.0" +bitfield = "0.16.1" bitfield-struct = 
"0.12.0" byteorder = { workspace = true } cfg-if = { workspace = true } diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index a4a029e989..044c81a2e8 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -591,6 +591,10 @@ pub trait Vcpu: Send + Sync { fn set_sev_control_register(&self, _vmsa_pfn: u64) -> Result<()> { unimplemented!() } + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, _vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { + unimplemented!() + } /// /// Sets the value of GIC redistributor address diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 4b41713568..c9e4c2fa85 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -148,10 +148,57 @@ const VIRTUAL_ADDRESS_SIZE: u64 = 1 << 48; #[cfg(feature = "sev_snp")] use igvm_defs::PAGE_SIZE_4K; #[cfg(feature = "sev_snp")] -use kvm_bindings::{KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes}; +use kvm_bindings::{ + KVM_MEMORY_ATTRIBUTE_PRIVATE, KVM_X86_SNP_VM, kvm_memory_attributes, kvm_segment as Segment, +}; +use vm_memory::GuestAddress; #[cfg(feature = "sev_snp")] use x86_64::sev; +// Hardcoded GPA of a bootloader and VMSA page for KVM +// TODO: Derive these from the IGVM file's PageData/SnpVpContext directives +// instead of using fixed constants, to support arbitrary bootloader layouts. +pub const BOOTLOADER_START: GuestAddress = GuestAddress(0xffc0_0000); +pub const BOOTLOADER_SIZE: usize = 0x40_0000; // 4 MiB +pub const KVM_VMSA_PAGE_ADDRESS: GuestAddress = GuestAddress(0xffff_ffff_f000); +pub const KVM_VMSA_PAGE_SIZE: usize = 0x1000; // 4 KiB + +#[cfg(feature = "sev_snp")] +bitfield::bitfield! 
{ + /// AMD VMCB segment attributes + /// linux/arch/x86/include/asm/svm.h + pub struct SegAccess(u32); + impl Debug; + pub seg_type, _ : 3, 0; + pub s_code_data, _ : 4; + pub priv_level, _ : 6, 5; + pub present, _ : 7; + pub available, _ : 8; + pub l_64bit, _ : 9; + pub db_size_32, _: 10; + pub granularity, _: 11; +} + +#[cfg(feature = "sev_snp")] +fn make_segment(sev_selector: igvm::snp_defs::SevSelector) -> Segment { + let flags = SegAccess(sev_selector.attrib.into()); + Segment { + base: sev_selector.base, + limit: sev_selector.limit, + selector: sev_selector.selector, + type_: flags.seg_type() as u8, + s: flags.s_code_data() as u8, + dpl: flags.priv_level() as u8, + present: flags.present() as u8, + avl: flags.available() as u8, + db: flags.db_size_32() as u8, + g: flags.granularity() as u8, + l: flags.l_64bit() as u8, + unusable: 0, + ..Default::default() + } +} + #[cfg(feature = "tdx")] const KVM_EXIT_TDX: u32 = 50; #[cfg(feature = "tdx")] @@ -3223,6 +3270,69 @@ impl cpu::Vcpu for KvmVcpu { Ok(_) => Ok(()), } } + + #[cfg(feature = "sev_snp")] + fn set_sev_control_register(&self, _vmsa_pfn: u64) -> cpu::Result<()> { + Ok(()) + } + + #[cfg(feature = "sev_snp")] + fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> cpu::Result<()> { + let mut sregs = self + .fd + .get_sregs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?; + sregs.cs = make_segment(vmsa.cs); + sregs.ds = make_segment(vmsa.ds); + sregs.es = make_segment(vmsa.es); + sregs.fs = make_segment(vmsa.fs); + sregs.gs = make_segment(vmsa.gs); + sregs.ss = make_segment(vmsa.ss); + sregs.tr = make_segment(vmsa.tr); + sregs.ldt = make_segment(vmsa.ldtr); + + sregs.cr0 = vmsa.cr0; + sregs.cr4 = vmsa.cr4; + sregs.cr3 = vmsa.cr3; + sregs.efer = vmsa.efer; + + sregs.idt.base = vmsa.idtr.base; + sregs.idt.limit = vmsa.idtr.limit.try_into().unwrap(); + sregs.gdt.base = vmsa.gdtr.base; + sregs.gdt.limit = vmsa.gdtr.limit.try_into().unwrap(); + self.fd + 
.set_sregs(&sregs) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))?; + + let mut regs = self + .fd + .get_regs() + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegister(e.into()))?; + regs.rip = vmsa.rip; + regs.rdx = vmsa.rdx; + regs.rflags = vmsa.rflags; + regs.rsp = vmsa.rsp; + regs.rax = vmsa.rax; + regs.rbx = vmsa.rbx; + regs.rcx = vmsa.rcx; + regs.rbp = vmsa.rbp; + regs.rsi = vmsa.rsi; + regs.rdi = vmsa.rdi; + regs.r8 = vmsa.r8; + regs.r9 = vmsa.r9; + regs.r10 = vmsa.r10; + regs.r11 = vmsa.r11; + regs.r12 = vmsa.r12; + regs.r13 = vmsa.r13; + regs.r14 = vmsa.r14; + regs.r15 = vmsa.r15; + + self.fd + .set_regs(&regs) + .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::SetRegister(e.into()))?; + + Ok(()) + } } impl KvmVcpu { diff --git a/vmm/src/config.rs b/vmm/src/config.rs index fc7eb1b8d1..5b72fe993d 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -350,6 +350,9 @@ pub enum ValidationError { #[cfg(feature = "sev_snp")] #[error("Invalid host data format")] InvalidHostData, + #[cfg(all(feature = "sev_snp", feature = "igvm"))] + #[error("SEV-SNP requires an IGVM payload (--payload igvm=)")] + SevSnpRequiresIgvm, /// Restore expects all net ids that have fds #[error("Net id {0} is associated with FDs and is required")] RestoreMissingRequiredNetId(String), @@ -2880,12 +2883,25 @@ impl VmConfig { #[cfg(feature = "sev_snp")] { - let host_data_opt = &self.payload.as_ref().unwrap().host_data; - - if let Some(host_data) = host_data_opt - && host_data.len() != 64 - { - return Err(ValidationError::InvalidHostData); + let sev_snp_enabled = self.platform.as_ref().is_some_and(|p| p.sev_snp); + if sev_snp_enabled { + let host_data_opt = &self.payload.as_ref().unwrap().host_data; + if let Some(host_data) = host_data_opt + && host_data.len() != 64 + { + return Err(ValidationError::InvalidHostData); + } + // KVM SEV-SNP requires an IGVM payload to initialise the VMSA. 
+ // Without IGVM the vCPU register state is undefined and VM entry fails. + #[cfg(feature = "igvm")] + if self + .payload + .as_ref() + .and_then(|p| p.igvm.as_ref()) + .is_none() + { + return Err(ValidationError::SevSnpRequiresIgvm); + } } } // The 'conflict' check is introduced in commit 24438e0390d3 diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index 40be270538..520f2a3128 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -212,6 +212,9 @@ pub enum Error { #[cfg(feature = "sev_snp")] #[error("Failed to set sev control register")] SetSevControlRegister(#[source] hypervisor::HypervisorCpuError), + #[cfg(feature = "sev_snp")] + #[error("Failed to set up SEV-SNP vCPU registers")] + SetupSevSnpRegs(#[source] hypervisor::HypervisorCpuError), #[cfg(target_arch = "x86_64")] #[error("Failed to inject NMI")] @@ -644,6 +647,13 @@ impl Vcpu { .map_err(Error::SetSevControlRegister) } + #[cfg(feature = "sev_snp")] + pub fn setup_sev_snp_regs(&self, vmsa: igvm::snp_defs::SevVmsa) -> Result<()> { + self.vcpu + .setup_sev_snp_regs(vmsa) + .map_err(Error::SetupSevSnpRegs) + } + /// /// Sets the vCPU's GIC redistributor base address. 
/// @@ -2272,7 +2282,7 @@ impl CpuManager { &self.vcpus_kill_signalled } - #[cfg(feature = "igvm")] + #[cfg(all(feature = "igvm", feature = "mshv"))] pub(crate) fn get_cpuid_leaf( &self, cpu_id: u8, @@ -2295,6 +2305,11 @@ impl CpuManager { self.sev_snp_enabled } + #[cfg(feature = "igvm")] + pub(crate) fn hypervisor_type(&self) -> hypervisor::HypervisorType { + self.hypervisor.hypervisor_type() + } + pub(crate) fn nmi(&mut self) -> Result<()> { self.vcpus_kick_signalled.store(true, Ordering::SeqCst); self.signal_vcpus()?; diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index b117425be1..99cc86dda0 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -7,6 +7,7 @@ use std::ffi::CString; use std::mem::size_of; use std::sync::{Arc, Mutex}; +use hypervisor::HypervisorType; use igvm::snp_defs::SevVmsa; use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader}; #[cfg(feature = "sev_snp")] @@ -15,14 +16,20 @@ use igvm_defs::{ IGVM_VHS_PARAMETER, IGVM_VHS_PARAMETER_INSERT, IgvmPageDataType, IgvmPlatformType, }; use log::debug; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use log::error; #[cfg(feature = "sev_snp")] use log::info; +#[cfg(feature = "mshv")] use mshv_bindings::*; use thiserror::Error; +#[cfg(feature = "sev_snp")] +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory}; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use vm_migration::Snapshottable; use zerocopy::IntoBytes; - #[cfg(feature = "sev_snp")] -use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; +use zerocopy::{FromBytes, FromZeros}; #[cfg(feature = "sev_snp")] use crate::GuestMemoryMmap; @@ -31,6 +38,36 @@ use crate::igvm::loader::Loader; use crate::igvm::{BootPageAcceptance, HV_PAGE_SIZE, IgvmLoadedInfo, StartupMemoryType}; use crate::memory_manager::{Error as MemoryManagerError, MemoryManager}; +#[cfg(feature = "sev_snp")] +const ISOLATED_PAGE_SHIFT: u32 = 12; +#[cfg(feature = "sev_snp")] +const SNP_CPUID_LIMIT: 
u32 = 64; +// see section 7.1 +// https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, PartialEq, Eq, IntoBytes, FromBytes)] +pub struct SnpCpuidFunc { + pub eax_in: u32, + pub ecx_in: u32, + pub xcr0_in: u64, + pub xss_in: u64, + pub eax: u32, + pub ebx: u32, + pub ecx: u32, + pub edx: u32, + pub reserved: u64, +} + +#[cfg(feature = "sev_snp")] +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes)] +pub struct SnpCpuidInfo { + pub count: u32, + pub _reserved1: u32, + pub _reserved2: u64, + pub entries: [SnpCpuidFunc; SNP_CPUID_LIMIT as usize], +} #[derive(Debug, Error)] pub enum Error { #[error("command line is not a valid C string")] @@ -55,6 +92,30 @@ pub enum Error { MemoryManager(MemoryManagerError), #[error("Error reading the IGVM file")] MissingIgvm, + #[error("Error applying VMSA to vCPU registers: {0}")] + SetVmsa(#[source] crate::cpu::Error), +} + +// KVM SNP page types — linux/arch/x86/include/uapi/asm/sev-guest.h +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_NORMAL: u32 = 1; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_VMSA: u32 = 2; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_UNMEASURED: u32 = 4; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_SECRETS: u32 = 5; +#[cfg(feature = "kvm")] +const KVM_SNP_PAGE_TYPE_CPUID: u32 = 6; + +// Consolidated page type/size configuration per hypervisor. +struct PageTypeConfig { + isolated_page_size_4kb: u32, + normal: u32, + unmeasured: u32, + cpuid: u32, + secrets: u32, + vmsa: u32, } #[allow(dead_code)] @@ -152,6 +213,10 @@ pub fn extract_sev_features(igvm_file: &IgvmFile) -> u64 { /// Right now it only supports SNP based isolation. /// We can boot legacy VM with an igvm file without /// any isolation. +/// +/// NOTE: KVM and MSHV have different page type values and CPUID/VMSA handling. +/// Hypervisor-specific code paths are gated by runtime type checks. 
A future +/// refactor could split these into separate KVM/MSHV loader implementations. #[allow(clippy::needless_pass_by_value)] pub fn load_igvm( igvm_file: IgvmFile, @@ -160,6 +225,28 @@ pub fn load_igvm( cmdline: &str, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result, Error> { + let hypervisor_type = cpu_manager.lock().unwrap().hypervisor_type(); + let page_types = match hypervisor_type { + #[cfg(feature = "mshv")] + HypervisorType::Mshv => PageTypeConfig { + isolated_page_size_4kb: mshv_bindings::hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + normal: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + unmeasured: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + cpuid: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, + secrets: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, + vmsa: mshv_bindings::hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, + }, + #[cfg(feature = "kvm")] + HypervisorType::Kvm => PageTypeConfig { + isolated_page_size_4kb: HV_PAGE_SIZE as u32, + normal: KVM_SNP_PAGE_TYPE_NORMAL, + unmeasured: KVM_SNP_PAGE_TYPE_UNMEASURED, + cpuid: KVM_SNP_PAGE_TYPE_CPUID, + secrets: KVM_SNP_PAGE_TYPE_SECRETS, + vmsa: KVM_SNP_PAGE_TYPE_VMSA, + }, + }; + let mut loaded_info: Box = Box::default(); let command_line = CString::new(cmdline).map_err(Error::InvalidCommandLine)?; let memory = memory_manager.lock().as_ref().unwrap().guest_memory(); @@ -174,6 +261,8 @@ pub fn load_igvm( .map_err(Error::FailedToDecodeHostData)?; } + #[cfg(feature = "sev_snp")] + let sev_snp_enabled = cpu_manager.lock().unwrap().sev_snp_enabled(); let mask = match &igvm_file.platforms()[0] { IgvmPlatformHeader::SupportedPlatform(info) => { debug_assert!(info.platform_type == IgvmPlatformType::SEV_SNP); @@ -206,15 +295,15 @@ pub fn load_igvm( if flags.unmeasured() { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: 
hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::ExclusiveUnmeasured } else { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_NORMAL, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.normal, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::Exclusive } @@ -222,43 +311,46 @@ pub fn load_igvm( IgvmPageDataType::SECRETS => { gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_SECRETS, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.secrets, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::SecretsPage } IgvmPageDataType::CPUID_DATA => { - // SAFETY: CPUID is readonly - unsafe { - let cpuid_page_p: *mut hv_psp_cpuid_page = - data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; - let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; - for i in 0..cpuid_page.count { - let leaf = cpuid_page.cpuid_leaf_info[i as usize]; - let mut in_leaf = cpu_manager - .lock() - .unwrap() - .get_cpuid_leaf( - 0, - leaf.eax_in, - leaf.ecx_in, - leaf.xfem_in, - leaf.xss_in, - ) - .unwrap(); - if leaf.eax_in == 1 { - in_leaf[2] &= 0x7FFFFFFF; + #[cfg(feature = "mshv")] + if hypervisor_type == HypervisorType::Mshv { + // SAFETY: CPUID is readonly + unsafe { + let cpuid_page_p: *mut hv_psp_cpuid_page = + data.as_ptr() as *mut hv_psp_cpuid_page; // as *mut hv_psp_cpuid_page; + let cpuid_page: &mut hv_psp_cpuid_page = &mut *cpuid_page_p; + for i in 0..cpuid_page.count { + let leaf = cpuid_page.cpuid_leaf_info[i as usize]; + let mut in_leaf = cpu_manager + .lock() + .unwrap() + .get_cpuid_leaf( + 0, + leaf.eax_in, + leaf.ecx_in, + leaf.xfem_in, + leaf.xss_in, + ) + .unwrap(); + if leaf.eax_in == 1 { + in_leaf[2] &= 0x7FFFFFFF; + } + cpuid_page.cpuid_leaf_info[i as 
usize].eax_out = in_leaf[0]; + cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; + cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; + cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } - cpuid_page.cpuid_leaf_info[i as usize].eax_out = in_leaf[0]; - cpuid_page.cpuid_leaf_info[i as usize].ebx_out = in_leaf[1]; - cpuid_page.cpuid_leaf_info[i as usize].ecx_out = in_leaf[2]; - cpuid_page.cpuid_leaf_info[i as usize].edx_out = in_leaf[3]; } } gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_CPUID, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.cpuid, + page_size: page_types.isolated_page_size_4kb, }); BootPageAcceptance::CpuidPage } @@ -266,9 +358,73 @@ pub fn load_igvm( _ => todo!("unsupported IgvmPageDataType"), }; - loader - .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) - .map_err(Error::Loader)?; + #[allow(unused_mut)] + let mut imported_page = false; + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm + && *data_type == IgvmPageDataType::CPUID_DATA + { + let mut new_cp = SnpCpuidInfo::new_zeroed(); + + let entries = cpu_manager.lock().unwrap().common_cpuid(); + // TODO: Filter cpuid rather than truncate + for (i, entry) in entries + .iter() + .enumerate() + .take(std::cmp::min(SNP_CPUID_LIMIT as usize, entries.len())) + { + new_cp.entries[i].eax_in = entry.function; + new_cp.entries[i].ecx_in = entry.index; + new_cp.entries[i].eax = entry.eax; + new_cp.entries[i].ebx = entry.ebx; + new_cp.entries[i].ecx = entry.ecx; + new_cp.entries[i].edx = entry.edx; + /* + * Guest kernels will calculate EBX themselves using the 0xD + * subfunctions corresponding to the individual XSAVE areas, so only + * encode the base XSAVE size in the initial leaves, corresponding + * to the initial XCR0=1 state. 
(https://tinyurl.com/qemu-cpuid) + */ + if new_cp.entries[i].eax_in == 0xd + && (new_cp.entries[i].ecx_in == 0x0 || new_cp.entries[i].ecx_in == 0x1) + { + new_cp.entries[i].ebx = 0x240; + new_cp.entries[i].xcr0_in = 1; + new_cp.entries[i].xss_in = 0; + } + + // KVM SNP launch may reject a CPUID page with bits it intends + // to sanitize internally. Pre-clearing the known unsafe bits keeps + // the CPUID page stable across launch updates. + match (new_cp.entries[i].eax_in, new_cp.entries[i].ecx_in) { + (0x1, 0x0) => { + new_cp.entries[i].ecx &= !(1 << 24); + } + (0x7, 0x0) => { + new_cp.entries[i].ebx &= !0x2; + new_cp.entries[i].edx = 0; + } + (0x80000008, 0x0) => { + new_cp.entries[i].ebx &= !0x0200_0000; + } + (0x80000021, 0x0) => { + new_cp.entries[i].ecx = 0; + } + _ => {} + } + } + new_cp.count = new_cp.entries.len() as u32; + info!("gpa: {:#x}", *gpa); + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, new_cp.as_mut_bytes()) + .map_err(Error::Loader)?; + imported_page = true; + } + if !imported_page { + loader + .import_pages(gpa / HV_PAGE_SIZE, 1, acceptance, data) + .map_err(Error::Loader)?; + } } IgvmDirectiveHeader::ParameterArea { number_of_bytes, @@ -300,16 +456,16 @@ pub fn load_igvm( IgvmDirectiveHeader::MmioRanges(_info) => { todo!("unsupported IgvmPageDataType"); } - IgvmDirectiveHeader::MemoryMap(_info) => { + IgvmDirectiveHeader::MemoryMap(_info) => + { #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { let guest_mem = memory_manager.lock().unwrap().boot_guest_memory(); let memory_map = generate_memory_map(&guest_mem)?; import_parameter(&mut parameter_areas, _info, memory_map.as_bytes())?; + } else { + todo!("Not implemented"); } - - #[cfg(not(feature = "sev_snp"))] - todo!("Not implemented"); } IgvmDirectiveHeader::CommandLine(info) => { import_parameter(&mut parameter_areas, info, command_line.as_bytes_with_nul())?; @@ -337,7 +493,7 @@ pub fn load_igvm( vmsa, } => { assert_eq!(gpa % HV_PAGE_SIZE, 0); - let mut data: [u8; 4096] = [0; 
4096]; + let mut data: [u8; HV_PAGE_SIZE as usize] = [0; HV_PAGE_SIZE as usize]; let len = size_of::(); loaded_info.vmsa_gpa = *gpa; loaded_info.vmsa = **vmsa; @@ -349,10 +505,28 @@ pub fn load_igvm( .map_err(Error::Loader)?; } + // Set vCPU initial register state from VMSA before SNP_LAUNCH_FINISH + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if hypervisor_type == HypervisorType::Kvm { + let vcpus = cpu_manager.lock().unwrap().vcpus(); + for vcpu in vcpus { + let vcpu_locked = vcpu.lock().unwrap(); + let vcpu_id: u16 = vcpu_locked.id().parse().unwrap(); + if vcpu_id == *vp_index { + vcpu_locked + .setup_sev_snp_regs(loaded_info.vmsa) + .map_err(Error::SetVmsa)?; + vcpu_locked + .set_sev_control_register(0) + .map_err(Error::SetVmsa)?; + } + } + } + gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_VMSA, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.vmsa, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::SnpIdBlock { @@ -420,8 +594,8 @@ pub fn load_igvm( *area = ParameterAreaState::Inserted; gpas.push(GpaPages { gpa: *gpa, - page_type: hv_isolated_page_type_HV_ISOLATED_PAGE_TYPE_UNMEASURED, - page_size: hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_type: page_types.unmeasured, + page_size: page_types.isolated_page_size_4kb, }); } IgvmDirectiveHeader::ErrorRange { .. 
} => { @@ -434,7 +608,7 @@ pub fn load_igvm( } #[cfg(feature = "sev_snp")] - { + if sev_snp_enabled { memory_manager .lock() .unwrap() @@ -472,7 +646,7 @@ pub fn load_igvm( // of PFN for importing the isolated pages let pfns: Vec = group .iter() - .map(|gpa| gpa.gpa >> HV_HYP_PAGE_SHIFT) + .map(|gpa| gpa.gpa >> ISOLATED_PAGE_SHIFT) .collect(); let guest_memory = memory_manager.lock().unwrap().guest_memory().memory(); let uaddrs: Vec<_> = group @@ -484,17 +658,48 @@ pub fn load_igvm( uaddr_base + uaddr_offset }) .collect(); - memory_manager + #[cfg(feature = "kvm")] + let page_type = group[0].page_type; + let mut new_cp = SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(new_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + let _import = memory_manager .lock() .unwrap() .vm .import_isolated_pages( group[0].page_type, - hv_isolated_page_size_HV_ISOLATED_PAGE_SIZE_4KB, + page_types.isolated_page_size_4kb, &pfns, &uaddrs, ) - .map_err(Error::ImportIsolatedPages)?; + .map_err(Error::ImportIsolatedPages); + #[cfg(feature = "kvm")] + if hypervisor_type == HypervisorType::Kvm + && _import.is_err() + && page_type == page_types.cpuid + { + // When we import the CPUID page, the firmware will change any cpuid fns that + // could lead to an insecure guest, we must then make sure to import the updated cpuid + // https://elixir.bootlin.com/linux/v6.11/source/arch/x86/kvm/svm/sev.c#L2322 + let mut updated_cp = SnpCpuidInfo::new_zeroed(); + let _ = guest_memory.read(updated_cp.as_mut_bytes(), GuestAddress(group[0].gpa)); + for (set, got) in std::iter::zip(new_cp.entries.iter(), updated_cp.entries.iter()) { + if set != got { + error!("Set cpuid fn: {set:#x?}, but firmware expects: {got:#x?}"); + } + } + memory_manager + .lock() + .unwrap() + .vm + .import_isolated_pages( + group[0].page_type, + page_types.isolated_page_size_4kb, + &pfns, + &uaddrs, + ) + .map_err(Error::ImportIsolatedPages)?; + } } info!( @@ -503,13 +708,23 @@ pub fn load_igvm( gpas.len() ); + let 
id_block_enabled = if hypervisor_type == HypervisorType::Mshv { + 1 + } else { + 0 + }; + now = Instant::now(); // Call Complete Isolated Import since we are done importing isolated pages memory_manager .lock() .unwrap() .vm - .complete_isolated_import(loaded_info.snp_id_block, host_data_contents, 1) + .complete_isolated_import( + loaded_info.snp_id_block, + host_data_contents, + id_block_enabled, + ) .map_err(Error::CompleteIsolatedImport)?; info!( From c452b02d79ce3ccc9d19d05c602368d87c2c2740 Mon Sep 17 00:00:00 2001 From: Ruben Hakobyan Date: Tue, 7 Apr 2026 18:14:43 -0700 Subject: [PATCH 14/18] vmm: reserve memory regions for stage0 and VMSA on KVM SEV-SNP A bootloader/firmware (e.g. stage0) and the VMSA page require dedicated memory regions at fixed GPAs. Add reserve_bootloader_regions() to allocate these regions before IGVM loading begins: - Stage0 at GPA 0xffc0_0000 (4 MiB) - VMSA page at GPA 0xffff_ffff_f000 (4 KiB) These reservations are KVM-only; MSHV handles stage0/VMSA placement through its own isolated import path. Also add fw_cfg device creation and SYS_statx to the vCPU seccomp allowlist (needed by stage0's file access pattern). 
Co-authored-by: Keith Adler Signed-off-by: Keith Adler Co-authored-by: Alex Orozco Signed-off-by: Alex Orozco Signed-off-by: Ruben Hakobyan --- vmm/src/seccomp_filters.rs | 1 + vmm/src/vm.rs | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 0db2450be5..c6c597261a 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -865,6 +865,7 @@ fn vcpu_thread_rules( (libc::SYS_sendto, vec![]), (libc::SYS_shutdown, vec![]), (libc::SYS_sigaltstack, vec![]), + (libc::SYS_statx, vec![]), (libc::SYS_tgkill, vec![]), (libc::SYS_tkill, vec![]), #[cfg(target_arch = "x86_64")] diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index e4749892f8..dc16b0fc89 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -46,6 +46,10 @@ use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; +#[cfg(all(feature = "kvm", feature = "sev_snp"))] +use hypervisor::kvm::{ + BOOTLOADER_SIZE, BOOTLOADER_START, KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE, +}; use hypervisor::{HypervisorVmConfig, HypervisorVmError, VmOps}; #[cfg(feature = "sev_snp")] use igvm_defs::SnpPolicy; @@ -1029,6 +1033,9 @@ impl Vm { ) .map_err(Error::DeviceManager)?; + #[cfg(feature = "fw_cfg")] + Self::create_fw_cfg_if_enabled(config, device_manager)?; + Ok(load_payload_handle) } @@ -1497,6 +1504,16 @@ impl Vm { Ok(EntryPoint { entry_addr }) } + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + fn reserve_bootloader_regions(memory_manager: &Arc>) -> Result<()> { + let mut mm = memory_manager.lock().unwrap(); + mm.add_ram_region(BOOTLOADER_START, BOOTLOADER_SIZE) + .map_err(Error::MemoryManager)?; + mm.add_ram_region(KVM_VMSA_PAGE_ADDRESS, KVM_VMSA_PAGE_SIZE) + .map_err(Error::MemoryManager)?; + Ok(()) + } + #[cfg(feature = "igvm")] #[allow(clippy::needless_pass_by_value)] fn load_igvm( @@ 
-1505,6 +1522,13 @@ impl Vm { cpu_manager: Arc>, #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result { + // Only reserve bootloader/VMSA regions for KVM + SEV-SNP; other hypervisors + // (e.g. MSHV) handle this through their own import path. + #[cfg(all(feature = "kvm", feature = "sev_snp"))] + if cpu_manager.lock().unwrap().sev_snp_enabled() { + Self::reserve_bootloader_regions(&memory_manager)?; + } + let res = igvm_loader::load_igvm( igvm_file, memory_manager, From fcd6a2dc1d986e6b405ac6dd248e1ad77b2b8e14 Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Fri, 3 Apr 2026 16:10:20 -0700 Subject: [PATCH 15/18] fw_cfg: export full setup-header area for x86_64 kernels The Linux x86 boot protocol defines the setup area as (setup_sects + 1) * 512 bytes. Previously we exported only the boot_params buffer (4096 bytes), which is wrong for kernels with setup_sects >= 8 where the actual setup area exceeds boot_params. Truncate or extend the existing buffer to the correct setup_sects-derived length, reading any extra bytes directly from the kernel file. This avoids an extra allocation in the common case (setup_sects <= 7) and matches QEMU's fw_cfg_add_kernel() behavior in hw/i386/x86-common.c. 
Signed-off-by: Dylan Reid --- devices/src/legacy/fw_cfg.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/devices/src/legacy/fw_cfg.rs b/devices/src/legacy/fw_cfg.rs index f33179d831..d19705609a 100644 --- a/devices/src/legacy/fw_cfg.rs +++ b/devices/src/legacy/fw_cfg.rs @@ -649,6 +649,16 @@ impl FwCfg { let kernel_start = bp.text_offset; #[cfg(target_arch = "x86_64")] let kernel_start = (bp.hdr.setup_sects as usize + 1) * 512; + if kernel_start <= buffer.len() { + buffer.truncate(kernel_start); + } else { + buffer.resize(kernel_start, 0); + file.read_exact_at( + &mut buffer[size_of::()..], + size_of::() as u64, + )?; + } + self.known_items[FW_CFG_SETUP_SIZE as usize] = FwCfgContent::U32(buffer.len() as u32); self.known_items[FW_CFG_SETUP_DATA as usize] = FwCfgContent::Bytes(buffer); self.known_items[FW_CFG_KERNEL_SIZE as usize] = From d9c0f3a4553f131f90929e862de52f37921cf71a Mon Sep 17 00:00:00 2001 From: Dylan Reid Date: Fri, 3 Apr 2026 16:11:22 -0700 Subject: [PATCH 16/18] vmm: use 64-bit BARs for hotplugged virtio block devices Boot-time block devices on PCI segment 0 use 32-bit BARs so early firmware can access them without additional identity mapping in the firmware page tables. However, hot-plugged block devices are only ever seen by the OS kernel which handles 64-bit BARs natively. Switch hot-plugged block devices to 64-bit BARs to avoid exhausting the scarce 32-bit MMIO window (typically 2-3 GB between RAM and 4 GB) when many devices are hot-plugged. Extract the BAR sizing decision into use_64bit_bar_for_virtio_device() and thread an is_hotplug flag through add_virtio_pci_device(). Add unit tests covering all relevant combinations. 
Signed-off-by: Dylan Reid --- vmm/src/device_manager.rs | 47 ++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index d7bab3c1bd..c75a080901 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -1155,6 +1155,14 @@ fn create_mmio_allocators( mmio_allocators } +fn use_64bit_bar_for_virtio_device( + device_type: u32, + pci_segment_id: u16, + is_hotplug: bool, +) -> bool { + pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32 || is_hotplug +} + impl DeviceManager { #[allow(clippy::too_many_arguments)] pub fn new( @@ -1656,6 +1664,7 @@ impl DeviceManager { &mapping, &handle.id, handle.pci_segment, + false, handle.dma_handler, )?; @@ -1688,7 +1697,8 @@ impl DeviceManager { } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, None)?; + let dev_id = + self.add_virtio_pci_device(iommu_device, &None, &iommu_id, 0, false, None)?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -4189,6 +4199,7 @@ impl DeviceManager { iommu_mapping: &Option>, virtio_device_id: &str, pci_segment_id: u16, + is_hotplug: bool, dma_handler: Option>, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -4287,11 +4298,10 @@ impl DeviceManager { self.activate_evt .try_clone() .map_err(DeviceManagerError::EventFd)?, - // All device types *except* virtio block devices should be allocated a 64-bit bar - // The block devices should be given a 32-bit BAR so that they are easily accessible - // to firmware without requiring excessive identity mapping. - // The exception being if not on the default PCI segment. - pci_segment_id > 0 || device_type != VirtioDeviceType::Block as u32, + // Boot-time block devices stay in 32-bit BAR space so early firmware can access + // them without additional identity mapping. 
Hot-plugged block devices do not have + // that constraint and should use 64-bit BARs like the rest of the virtio devices. + use_64bit_bar_for_virtio_device(device_type, pci_segment_id, is_hotplug), dma_handler, self.pending_activations.clone(), vm_migration::snapshot_from_id(self.snapshot.as_ref(), id.as_str()), @@ -4947,6 +4957,7 @@ impl DeviceManager { &mapping, &handle.id, handle.pci_segment, + true, handle.dma_handler, )?; @@ -5693,6 +5704,30 @@ impl Drop for DeviceManager { mod unit_tests { use super::*; + #[test] + fn test_hotplugged_block_devices_use_64bit_bars() { + assert!(!use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 0, + true, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Net as u32, + 0, + false, + )); + assert!(use_64bit_bar_for_virtio_device( + VirtioDeviceType::Block as u32, + 1, + false, + )); + } + #[test] fn test_create_mmio_allocators() { let res = create_mmio_allocators(0x100000, 0x400000, 1, &[1], 4 << 10); From 765d783ed7d438dee44bae078ae42fe2ac7d116d Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Tue, 31 Mar 2026 17:21:03 -0700 Subject: [PATCH 17/18] ci: Add CI jobs for KVM SEV-SNP Add build and clippy jobs for kvm+sev_snp+igvm+fw_cfg feature combination Signed-off-by: Keith Adler Signed-off-by: Ruben Hakobyan --- .github/workflows/build.yaml | 3 +++ .github/workflows/quality.yaml | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e3b1a9e7f7..062e404a56 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -64,6 +64,9 @@ jobs: - name: Build (sev_snp) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" + - name: Build (kvm + sev_snp) + run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "kvm,igvm,sev_snp,fw_cfg" + - 
name: Build (igvm) run: cargo build --locked --bin cloud-hypervisor --no-default-features --features "igvm" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index de6391186a..66681cc850 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -157,6 +157,26 @@ jobs: target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings + - name: Clippy (kvm + sev_snp + igvm + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm,sev_snp,igvm,fw_cfg" -- -D warnings + + - name: Clippy (default features + sev_snp + igvm + fw_cfg) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "sev_snp,igvm,fw_cfg" -- -D warnings + - name: Check build did not modify any files run: test -z "$(git status --porcelain)" From 3f7c4d60dec86a0dd2dfefa66a4f8c95acae2d5f Mon Sep 17 00:00:00 2001 From: Keith Adler Date: Tue, 14 Apr 2026 15:07:51 -0500 Subject: [PATCH 18/18] Address open review feedback on SEV-SNP PR - igvm_loader: add comment explaining the hypervisor match is exhaustive at compile time via #[cfg]-gated HypervisorType variants - igvm/mod.rs: add TODO noting IsolationType::Snp is hardcoded and should be parameterized when TDX IGVM support lands - quality.yaml: add mshv + kvm + igvm clippy target to cover the #[cfg(all(feature = "igvm", feature = "mshv"))] code paths Signed-off-by: Keith Adler --- 
.github/workflows/quality.yaml | 10 ++++++++++ vmm/src/igvm/igvm_loader.rs | 1 + vmm/src/igvm/mod.rs | 1 + 3 files changed, 12 insertions(+) diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 66681cc850..a7edfd12d7 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -167,6 +167,16 @@ jobs: target: ${{ matrix.target }} args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm,sev_snp,igvm,fw_cfg" -- -D warnings + - name: Clippy (mshv + kvm + igvm) + if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + uses: houseabsolute/actions-rust-cross@v1 + with: + command: clippy + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm,igvm" -- -D warnings + - name: Clippy (default features + sev_snp + igvm + fw_cfg) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} uses: houseabsolute/actions-rust-cross@v1 diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 99cc86dda0..2a15ee0461 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -226,6 +226,7 @@ pub fn load_igvm( #[cfg(feature = "sev_snp")] host_data: &Option, ) -> Result, Error> { let hypervisor_type = cpu_manager.lock().unwrap().hypervisor_type(); + // HypervisorType variants are all #[cfg]-gated, so this match is exhaustive at compile time. let page_types = match hypervisor_type { #[cfg(feature = "mshv")] HypervisorType::Mshv => PageTypeConfig { diff --git a/vmm/src/igvm/mod.rs b/vmm/src/igvm/mod.rs index 9803cf21e2..b41487cfef 100644 --- a/vmm/src/igvm/mod.rs +++ b/vmm/src/igvm/mod.rs @@ -33,6 +33,7 @@ use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; use std::path::Path; use zerocopy::FromZeros; +// TODO: IsolationType is hardcoded to Snp. Parameterize when TDX IGVM support lands. 
pub fn parse_igvm(igvm_path: &Path) -> Result { let file_contents = std::fs::read(igvm_path).map_err(igvm_loader::Error::Igvm)?; IgvmFile::new_from_binary(&file_contents, Some(IsolationType::Snp))