From d8059d4db3c9c9941b9c4eec74d571bc0f126b88 Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Fri, 30 Jan 2026 14:56:29 +0100 Subject: [PATCH 1/2] vm-allocator: use bitmap to record used GSIs and propagate errors The old implementation used a u32 to allocate new GSIs. The counter increased every time a new GSI was allocated, and freeing GSIs was not possible. Thus, Cloud Hypervisor could run out of GSIs and panic. This new implementation can also free GSIs. Please note that this commit only replaces the old mechanism, the next commit will introduce a mechanism to free used GSIs. While I was here I also propagated the errors that the allocator may throw where necessary. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- Cargo.lock | 1 + vm-allocator/Cargo.toml | 1 + vm-allocator/src/gsi.rs | 182 +++++++++++++++++++++++++++++-------- vm-allocator/src/lib.rs | 2 +- vm-allocator/src/system.rs | 22 ++--- vmm/src/device_manager.rs | 4 +- vmm/src/interrupt.rs | 2 +- vmm/src/pci_segment.rs | 2 +- 8 files changed, 160 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04bce118bb..982d1722c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2689,6 +2689,7 @@ version = "0.1.0" dependencies = [ "arch", "libc", + "thiserror 2.0.17", "vm-memory", ] diff --git a/vm-allocator/Cargo.toml b/vm-allocator/Cargo.toml index a4996d6dc3..512ad23fbb 100644 --- a/vm-allocator/Cargo.toml +++ b/vm-allocator/Cargo.toml @@ -10,6 +10,7 @@ kvm = ["arch/kvm"] [dependencies] libc = { workspace = true } +thiserror = { workspace = true } vm-memory = { workspace = true } [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] diff --git a/vm-allocator/src/gsi.rs b/vm-allocator/src/gsi.rs index d58670a43c..ae3f973eea 100644 --- a/vm-allocator/src/gsi.rs +++ b/vm-allocator/src/gsi.rs @@ -4,14 +4,10 @@ #[cfg(target_arch = "x86_64")] use std::collections::btree_map::BTreeMap; -use std::result; +use std::result::Result; +use std::sync::Mutex; -#[derive(Debug)] -pub enum Error { - Overflow, -} - -pub type Result = result::Result; +use thiserror::Error; /// GsiApic #[cfg(target_arch = "x86_64")] @@ -29,80 +25,186 @@ impl GsiApic { } } +#[derive(Debug)] +struct InterruptAllocator { + words: Vec, + size: u32, + offset: u32, +} + +/// Errors that may happen while allocating or freeing an interrupt. +#[derive(Error, Debug)] +pub enum InterruptAllocError { + /// Interrupt allocator is exhausted, i.e. out of interrupt vectors. + #[error("Interrupt allocator is exhausted")] + ExhaustedError(), + + /// Tried to free an interrupt that wasn't allocated. + #[error("Interrupt was not allocated: {0}")] + AlreadyFree(u32), + + /// Tried to free an interrupt that is not in range of the interrupt allocator. + #[error("Interrupt vector is out of range: {0}")] + OutOfRange(u32), +} + +// Maximum number of IRQ routes according to the kernel code. +const KVM_MAX_IRQ_ROUTES: u32 = 4096; + +impl InterruptAllocator { + fn new(size: u32, offset: u32) -> Self { + assert_ne!(size, 0); + let bits_per_word = usize::BITS; + let num_words = (size + bits_per_word - 1).div_ceil(bits_per_word); + + let mut words = vec![0usize; num_words as usize]; + words[(num_words - 1) as usize] = Self::last_word(size); + + Self { + words, + size, + offset, + } + } + + fn last_word(size: u32) -> usize { + let rem = size % usize::BITS; + if rem == 0 { + 0usize + } else { + !((1usize << rem) - 1) + } + } + + fn free(&mut self, vector: u32) -> Result<(), InterruptAllocError> { + // At first we make sure that the vector is not out of range. + if !(self.offset..self.offset + self.size).contains(&vector) { + return Err(InterruptAllocError::OutOfRange(vector)); + } + + let idx = vector.abs_diff(self.offset); + let (w, b) = Self::word_and_bit(idx); + + let mask = 1usize << b; + + // Let's first check whether the bit is set. + if self.words[w] & mask == 0usize { + return Err(InterruptAllocError::AlreadyFree(vector)); + } + // Clear the bit and we are done! + self.words[w] &= !mask; + Ok(()) + } + + fn word_and_bit(vector: u32) -> (usize, usize) { + let idx = vector as usize; + let bits = usize::BITS as usize; + (idx / bits, idx & bits) + } + + fn alloc(&mut self) -> Result { + let bits = usize::BITS as usize; + for idx in 0..self.words.len() { + let word = self.words[idx]; + + // If every bit is set, we can continue. + if word == !0usize { + continue; + } + + // Find lowest free bit. + let bit = (!word).trailing_zeros() as usize; + // Set the bit. + self.words[idx] |= 1usize << bit; + // Calculate index, add offset and return. + let vector = idx * bits + bit; + return Ok(vector as u32 + self.offset); + } + + Err(InterruptAllocError::ExhaustedError()) + } +} + /// GsiAllocator pub struct GsiAllocator { #[cfg(target_arch = "x86_64")] apics: BTreeMap, - next_irq: u32, - next_gsi: u32, + irqs: Mutex, + gsis: Mutex, } impl GsiAllocator { #[cfg(target_arch = "x86_64")] /// New GSI allocator pub fn new(apics: &[GsiApic]) -> Self { - let mut allocator = GsiAllocator { - apics: BTreeMap::new(), - next_irq: 0xffff_ffff, - next_gsi: 0, - }; + let mut allocator_apics = BTreeMap::new(); + let mut next_irq = 0xffff_ffffu32; + let mut next_gsi = 0u32; for apic in apics { - if apic.base < allocator.next_irq { - allocator.next_irq = apic.base; + if apic.base < next_irq { + next_irq = apic.base; } - if apic.base + apic.irqs > allocator.next_gsi { - allocator.next_gsi = apic.base + apic.irqs; + if apic.base + apic.irqs > next_gsi { + next_gsi = apic.base + apic.irqs; } - allocator.apics.insert(apic.base, apic.irqs); + allocator_apics.insert(apic.base, apic.irqs); } - allocator + Self { + apics: allocator_apics, + irqs: Mutex::new(InterruptAllocator::new( + KVM_MAX_IRQ_ROUTES - next_irq, + next_irq, + )), + gsis: Mutex::new(InterruptAllocator::new( + KVM_MAX_IRQ_ROUTES - next_gsi, + next_gsi, + )), + } } #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] /// New GSI allocator pub fn new() -> Self { GsiAllocator { - next_irq: arch::IRQ_BASE, - next_gsi: arch::IRQ_BASE, + irqs: Mutex::new(InterruptAllocator::new( + KVM_MAX_IRQ_ROUTES - arch::IRQ_BASE, + arch::IRQ_BASE, + )), + gsis: Mutex::new(InterruptAllocator::new( + KVM_MAX_IRQ_ROUTES - arch::IRQ_BASE, + arch::IRQ_BASE, + )), } } /// Allocate a GSI - pub fn allocate_gsi(&mut self) -> Result { - let gsi = self.next_gsi; - self.next_gsi = self.next_gsi.checked_add(1).ok_or(Error::Overflow)?; - Ok(gsi) + pub fn allocate_gsi(&mut self) -> Result { + self.gsis.lock().unwrap().alloc() } #[cfg(target_arch = "x86_64")] /// Allocate an IRQ - pub fn allocate_irq(&mut self) -> Result { - let mut irq: u32 = 0; + pub fn allocate_irq(&mut self) -> Result { + let next_irq = self.irqs.lock().unwrap().alloc()?; for (base, irqs) in self.apics.iter() { // HACKHACK - This only works with 1 single IOAPIC... - if self.next_irq >= *base && self.next_irq < *base + *irqs { - irq = self.next_irq; - self.next_irq += 1; + if next_irq >= *base && next_irq < *base + *irqs { + return Ok(next_irq); } } - if irq == 0 { - return Err(Error::Overflow); - } - - Ok(irq) + self.irqs.lock().unwrap().free(next_irq)?; + Err(InterruptAllocError::ExhaustedError()) } #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] /// Allocate an IRQ - pub fn allocate_irq(&mut self) -> Result { - let irq = self.next_irq; - self.next_irq = self.next_irq.checked_add(1).ok_or(Error::Overflow)?; - Ok(irq) + pub fn allocate_irq(&mut self) -> Result { + self.irqs.lock().unwrap().alloc() } } diff --git a/vm-allocator/src/lib.rs b/vm-allocator/src/lib.rs index d560482889..ecc59ec16b 100644 --- a/vm-allocator/src/lib.rs +++ b/vm-allocator/src/lib.rs @@ -17,9 +17,9 @@ pub mod page_size; mod system; pub use crate::address::AddressAllocator; -pub use crate::gsi::GsiAllocator; #[cfg(target_arch = "x86_64")] pub use crate::gsi::GsiApic; +pub use crate::gsi::{GsiAllocator, InterruptAllocError}; pub use crate::system::SystemAllocator; mod memory_slot; pub use memory_slot::MemorySlotAllocator; diff --git a/vm-allocator/src/system.rs b/vm-allocator/src/system.rs index 02ea86c6a1..0436345b8f 100644 --- a/vm-allocator/src/system.rs +++ b/vm-allocator/src/system.rs @@ -10,9 +10,9 @@ use vm_memory::{GuestAddress, GuestUsize}; use crate::address::AddressAllocator; -use crate::gsi::GsiAllocator; #[cfg(target_arch = "x86_64")] use crate::gsi::GsiApic; +use crate::gsi::{GsiAllocator, InterruptAllocError}; use crate::page_size::get_page_size; /// Manages allocating system resources such as address space and interrupt numbers. @@ -31,17 +31,17 @@ use crate::page_size::get_page_size; /// GuestAddress(0x10000000), 0x10000000, /// #[cfg(target_arch = "x86_64")] &[GsiApic::new(5, 19)]).unwrap(); /// #[cfg(target_arch = "x86_64")] -/// assert_eq!(allocator.allocate_irq(), Some(5)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 5); /// #[cfg(target_arch = "aarch64")] -/// assert_eq!(allocator.allocate_irq(), Some(32)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 32); /// #[cfg(target_arch = "riscv64")] -/// assert_eq!(allocator.allocate_irq(), Some(0)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 0); /// #[cfg(target_arch = "x86_64")] -/// assert_eq!(allocator.allocate_irq(), Some(6)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 6); /// #[cfg(target_arch = "aarch64")] -/// assert_eq!(allocator.allocate_irq(), Some(33)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 33); /// #[cfg(target_arch = "riscv64")] -/// assert_eq!(allocator.allocate_irq(), Some(1)); +/// assert_eq!(allocator.allocate_irq().unwrap(), 1); /// assert_eq!(allocator.allocate_platform_mmio_addresses(None, 0x1000, Some(0x1000)), Some(GuestAddress(0x1fff_f000))); /// /// ``` @@ -82,13 +82,13 @@ impl SystemAllocator { } /// Reserves the next available system irq number. - pub fn allocate_irq(&mut self) -> Option { - self.gsi_allocator.allocate_irq().ok() + pub fn allocate_irq(&mut self) -> Result { + self.gsi_allocator.allocate_irq() } /// Reserves the next available GSI. - pub fn allocate_gsi(&mut self) -> Option { - self.gsi_allocator.allocate_gsi().ok() + pub fn allocate_gsi(&mut self) -> Result { + self.gsi_allocator.allocate_gsi() } /// Reserves a section of `size` bytes of IO address space. diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index c536a67bb4..41421d3ed5 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -94,7 +94,7 @@ use virtio_devices::{ AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, PostMigrationAnnouncer, VdpaDmaMapping, VirtioMemMappingSource, }; -use vm_allocator::{AddressAllocator, SystemAllocator}; +use vm_allocator::{AddressAllocator, InterruptAllocError, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; use vm_device::interrupt::{ InterruptIndex, InterruptManager, LegacyIrqGroupConfig, MsiIrqGroupConfig, @@ -274,7 +274,7 @@ pub enum DeviceManagerError { /// Cannot allocate IRQ. #[error("Cannot allocate IRQ")] - AllocateIrq, + AllocateIrq(#[from] InterruptAllocError), /// Cannot configure the IRQ. #[error("Cannot configure the IRQ")] diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index e42ba2f76b..3b1f647403 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -31,7 +31,7 @@ impl InterruptRoute { let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?; let gsi = allocator .allocate_gsi() - .ok_or_else(|| io::Error::other("Failed allocating new GSI"))?; + .map_err(|e| io::Error::other(format!("Failed allocating new GSI: {e}")))?; Ok(InterruptRoute { gsi, diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 462d6b7111..9b0d88e141 100644 --- a/vmm/src/pci_segment.rs +++ b/vmm/src/pci_segment.rs @@ -198,7 +198,7 @@ impl PciSegment { .lock() .unwrap() .allocate_irq() - .ok_or(DeviceManagerError::AllocateIrq)? as u8, + .map_err(DeviceManagerError::AllocateIrq)? as u8, ); } From f05cdf50cb2cf6606118b09c893402a9f3a752df Mon Sep 17 00:00:00 2001 From: Sebastian Eydam Date: Mon, 2 Feb 2026 11:12:17 +0100 Subject: [PATCH 2/2] vm-allocator: free GSIs when they are no longer used These changes free GSIs when they are no longer used. That way we shouldn't exhaust the available GSIs anymore when attaching and detaching devices. On-behalf-of: SAP sebastian.eydam@sap.com Signed-off-by: Sebastian Eydam --- vm-allocator/src/gsi.rs | 5 +++++ vm-allocator/src/system.rs | 5 +++++ vmm/src/interrupt.rs | 15 ++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/vm-allocator/src/gsi.rs b/vm-allocator/src/gsi.rs index ae3f973eea..e20bd2acb7 100644 --- a/vm-allocator/src/gsi.rs +++ b/vm-allocator/src/gsi.rs @@ -186,6 +186,11 @@ impl GsiAllocator { self.gsis.lock().unwrap().alloc() } + /// Frees a GSI + pub fn free_gsi(&mut self, vector: u32) -> Result<(), InterruptAllocError> { + self.gsis.lock().unwrap().free(vector) + } + #[cfg(target_arch = "x86_64")] /// Allocate an IRQ pub fn allocate_irq(&mut self) -> Result { diff --git a/vm-allocator/src/system.rs b/vm-allocator/src/system.rs index 0436345b8f..22d261725d 100644 --- a/vm-allocator/src/system.rs +++ b/vm-allocator/src/system.rs @@ -91,6 +91,11 @@ impl SystemAllocator { self.gsi_allocator.allocate_gsi() } + /// Frees the given GSI. + pub fn free_gsi(&mut self, vector: u32) -> Result<(), InterruptAllocError> { + self.gsi_allocator.free_gsi(vector) + } + /// Reserves a section of `size` bytes of IO address space. pub fn allocate_io_addresses( &mut self, diff --git a/vmm/src/interrupt.rs b/vmm/src/interrupt.rs index 3b1f647403..9edf8d249c 100644 --- a/vmm/src/interrupt.rs +++ b/vmm/src/interrupt.rs @@ -24,12 +24,15 @@ struct InterruptRoute { gsi: u32, irq_fd: EventFd, registered: AtomicBool, + allocator: Arc>, } impl InterruptRoute { - pub fn new(allocator: &mut SystemAllocator) -> Result { + pub fn new(allocator: Arc>) -> Result { let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?; let gsi = allocator + .lock() + .unwrap() .allocate_gsi() .map_err(|e| io::Error::other(format!("Failed allocating new GSI: {e}")))?; @@ -37,6 +40,7 @@ impl InterruptRoute { gsi, irq_fd, registered: AtomicBool::new(false), + allocator, }) } @@ -77,6 +81,12 @@ impl InterruptRoute { } } +impl Drop for InterruptRoute { + fn drop(&mut self) { + let _ = self.allocator.lock().unwrap().free_gsi(self.gsi); + } +} + pub struct RoutingEntry { route: IrqRoutingEntry, masked: bool, @@ -292,11 +302,10 @@ impl InterruptManager for MsiInterruptManager { type GroupConfig = MsiIrqGroupConfig; fn create_group(&self, config: Self::GroupConfig) -> Result> { - let mut allocator = self.allocator.lock().unwrap(); let mut irq_routes: HashMap = HashMap::with_capacity(config.count as usize); for i in config.base..config.base + config.count { - irq_routes.insert(i, InterruptRoute::new(&mut allocator)?); + irq_routes.insert(i, InterruptRoute::new(self.allocator.clone())?); } Ok(Arc::new(MsiInterruptGroup::new(