From 6dd73c64b5912e59e613b3a90be7407cf18b2c94 Mon Sep 17 00:00:00 2001
From: "Hui, Chunyang"
Date: Mon, 27 Sep 2021 11:02:29 +0000
Subject: [PATCH] Improve userspace VM management

Occlum is a single-address-space library OS. Previously, userspace memory
was divided per process, and all of a process's memory was allocated when
the process was created, which led to a lot of wasted space and complicated
configuration.

In the current implementation, the whole userspace is managed as a memory
pool that consists of chunks. There are two kinds of chunks:
(1) Single VMA chunk: a chunk with only one VMA. Should be owned by exactly
one process.
(2) Multi VMA chunk: a chunk of the default chunk size that can hold many
VMAs. Can be used by different processes.

This design helps to achieve two main goals:
(1) Simplify the configuration: users don't need to configure
process.default_mmap_size anymore, and multiple processes running in the
same Occlum instance can use dramatically different amounts of memory.
(2) Gain better performance: two-level management (chunks & VMAs) reduces
the time spent finding, inserting, deleting, and iterating.
---
 src/Enclave.edl | 1 -
 src/libos/Cargo.lock | 42 +-
 src/libos/Cargo.toml | 3 +
 src/libos/src/entry.rs | 1 +
 src/libos/src/fs/procfs/meminfo.rs | 2 +-
 src/libos/src/lib.rs | 3 +
 src/libos/src/misc/sysinfo.rs | 2 +-
 src/libos/src/prelude.rs | 1 +
 src/libos/src/process/do_exit.rs | 6 +
 src/libos/src/syscall/mod.rs | 2 +-
 src/libos/src/vm/chunk.rs | 239 ++++
 src/libos/src/vm/free_space_manager.rs | 149 +++
 src/libos/src/vm/mod.rs | 63 +-
 src/libos/src/vm/process_vm.rs | 199 +--
 src/libos/src/vm/user_space_vm.rs | 100 +-
 src/libos/src/vm/vm_area.rs | 82 +-
 src/libos/src/vm/vm_chunk_manager.rs | 654 ++++++++++
 src/libos/src/vm/vm_layout.rs | 2 +-
 src/libos/src/vm/vm_manager.rs | 1574 ++++++++++--------------
 src/libos/src/vm/vm_perms.rs | 20 +
 src/libos/src/vm/vm_range.rs | 4 +-
 src/libos/src/vm/vm_util.rs | 276 +++++
 src/pal/src/pal_api.c | 1 -
 test/mmap/main.c | 48 +
 24 files changed, 2372 insertions(+), 1102 deletions(-)
 create mode 100644 src/libos/src/vm/chunk.rs
 create mode 100644 src/libos/src/vm/free_space_manager.rs
 create mode 100644 src/libos/src/vm/vm_chunk_manager.rs
 create mode 100644 src/libos/src/vm/vm_util.rs

diff --git a/src/Enclave.edl b/src/Enclave.edl index 48f6d26d..17de3633 100644 --- a/src/Enclave.edl +++ b/src/Enclave.edl @@ -103,7 +103,6 @@ enclave { */ public int occlum_ecall_kill(int pid, int sig); - /* * Broadcast interrupts to LibOS threads.
* diff --git a/src/libos/Cargo.lock b/src/libos/Cargo.lock index 5b973dae..02570c94 100644 --- a/src/libos/Cargo.lock +++ b/src/libos/Cargo.lock @@ -8,11 +8,14 @@ dependencies = [ "atomic", "bitflags", "bitvec", + "ctor", "derive_builder", "goblin", + "intrusive-collections", + "itertools", "lazy_static", "log", - "memoffset", + "memoffset 0.6.1", "rcore-fs", "rcore-fs-devfs", "rcore-fs-mountfs", @@ -110,6 +113,16 @@ dependencies = [ "bitflags", ] +[[package]] +name = "ctor" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbaabec2c953050352311293be5c6aba8e141ba19d6811862b232d6fd020484" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "darling" version = "0.10.2" @@ -227,6 +240,24 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "intrusive-collections" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb4ed164b4cf1c6bd6e18c097490331a0e58fbb0f39e8f6b5ac7f168006511cd" +dependencies = [ + "memoffset 0.5.6", +] + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.5" @@ -258,6 +289,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "memoffset" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg 1.0.1", +] + [[package]] name = "memoffset" version = "0.6.1" diff --git a/src/libos/Cargo.toml b/src/libos/Cargo.toml index 92b4ae2b..b2c735e0 100644 --- a/src/libos/Cargo.toml +++ b/src/libos/Cargo.toml @@ -27,6 +27,8 @@ serde = { path = "../../deps/serde-sgx/serde", features = ["derive"] } serde_json = { path = "../../deps/serde-json-sgx" } memoffset = "0.6.1" scroll = { version = "0.10.2", default-features = false } +itertools = { version = "0.10.0", default-features = false, features = ["use_alloc"] } +ctor = "0.1" [patch.'https://github.com/apache/teaclave-sgx-sdk.git'] sgx_tstd = { path = "../../deps/rust-sgx-sdk/sgx_tstd" } @@ -48,3 +50,4 @@ sgx_tse = { path = "../../deps/rust-sgx-sdk/sgx_tse" } sgx_tcrypto = { path = "../../deps/rust-sgx-sdk/sgx_tcrypto" } sgx_cov = { path = "../../deps/rust-sgx-sdk/sgx_cov", optional = true } goblin = { version = "0.3.4", default-features = false, features = ["elf64", "elf32", "endian_fd"] } +intrusive-collections = "0.9" diff --git a/src/libos/src/entry.rs b/src/libos/src/entry.rs index 75eb6c0e..8a5ed4b0 100644 --- a/src/libos/src/entry.rs +++ b/src/libos/src/entry.rs @@ -15,6 +15,7 @@ use crate::util::log::LevelFilter; use crate::util::mem_util::from_untrusted::*; use crate::util::resolv_conf_util::{parse_resolv_conf, write_resolv_conf}; use crate::util::sgx::allow_debug as sgx_allow_debug; +use crate::vm::USER_SPACE_VM_MANAGER; use sgx_tse::*; pub static mut INSTANCE_DIR: String = String::new(); diff --git a/src/libos/src/fs/procfs/meminfo.rs b/src/libos/src/fs/procfs/meminfo.rs index f12a8a9c..69fce5f2 100644 --- a/src/libos/src/fs/procfs/meminfo.rs +++ b/src/libos/src/fs/procfs/meminfo.rs @@ -14,7 +14,7 @@ impl MemInfoINode { impl ProcINode for MemInfoINode { fn generate_data_in_bytes(&self) -> vfs::Result> { let total_ram = 
USER_SPACE_VM_MANAGER.get_total_size(); - let free_ram = USER_SPACE_VM_MANAGER.get_free_size(); + let free_ram = current!().vm().get_free_size(); Ok(format!( "MemTotal: {} kB\n\ MemFree: {} kB\n\ diff --git a/src/libos/src/lib.rs b/src/libos/src/lib.rs index 1b820b2a..e2799628 100644 --- a/src/libos/src/lib.rs +++ b/src/libos/src/lib.rs @@ -9,6 +9,7 @@ #![feature(alloc_layout_extra)] #![feature(concat_idents)] #![feature(trace_macros)] +#![feature(drain_filter)] // for !Send in rw_lock #![feature(negative_impls)] // for may_dangle in rw_lock @@ -54,6 +55,8 @@ extern crate serde; extern crate serde_json; #[macro_use] extern crate memoffset; +extern crate ctor; +extern crate intrusive_collections; extern crate resolv_conf; use sgx_trts::libc; diff --git a/src/libos/src/misc/sysinfo.rs b/src/libos/src/misc/sysinfo.rs index 00ed4950..1ed15273 100644 --- a/src/libos/src/misc/sysinfo.rs +++ b/src/libos/src/misc/sysinfo.rs @@ -26,7 +26,7 @@ pub fn do_sysinfo() -> Result { let info = sysinfo_t { uptime: time::up_time::get().unwrap().as_secs() as i64, // Duration can't be negative totalram: USER_SPACE_VM_MANAGER.get_total_size() as u64, - freeram: USER_SPACE_VM_MANAGER.get_free_size() as u64, + freeram: current!().vm().get_free_size() as u64, procs: table::get_all_processes().len() as u16, mem_unit: 1, ..Default::default() diff --git a/src/libos/src/prelude.rs b/src/libos/src/prelude.rs index 42f207f5..ee4044d5 100644 --- a/src/libos/src/prelude.rs +++ b/src/libos/src/prelude.rs @@ -2,6 +2,7 @@ pub use sgx_trts::libc; pub use sgx_trts::libc::off_t; pub use sgx_types::*; +pub use core::intrinsics::unreachable; use std; pub use std::cell::{Cell, RefCell}; pub use std::cmp::{max, min}; diff --git a/src/libos/src/process/do_exit.rs b/src/libos/src/process/do_exit.rs index ea8384eb..31a27c82 100644 --- a/src/libos/src/process/do_exit.rs +++ b/src/libos/src/process/do_exit.rs @@ -9,6 +9,7 @@ use super::{table, ProcessRef, TermStatus, ThreadRef, ThreadStatus}; use crate::prelude::*; use crate::signal::{KernelSignal, SigNum}; use crate::syscall::CpuContext; +use crate::vm::USER_SPACE_VM_MANAGER; pub fn do_exit_group(status: i32, curr_user_ctxt: &mut CpuContext) -> Result { if is_vforked_child_process() { @@ -103,6 +104,8 @@ fn exit_process(thread: &ThreadRef, term_status: TermStatus) { }; // Lock the current process let mut process_inner = process.inner(); + // Clean used VM + USER_SPACE_VM_MANAGER.free_chunks_when_exit(thread); // The parent is the idle process if parent_inner.is_none() { @@ -201,6 +204,9 @@ fn exit_process_for_execve( // Lock the current process let mut process_inner = process.inner(); + // Clean used VM + USER_SPACE_VM_MANAGER.free_chunks_when_exit(thread); + let mut new_parent_inner = new_parent_ref.inner(); let pid = process.pid(); diff --git a/src/libos/src/syscall/mod.rs b/src/libos/src/syscall/mod.rs index 85bbf3f9..d477cc9a 100644 --- a/src/libos/src/syscall/mod.rs +++ b/src/libos/src/syscall/mod.rs @@ -710,7 +710,7 @@ fn do_syscall(user_context: &mut CpuContext) { retval } }; - trace!("Retval = {:?}", retval); + trace!("Retval = 0x{:x}", retval); // Put the return value into user_context.rax, except for syscalls that may // modify user_context directly. 
Currently, there are three such syscalls: diff --git a/src/libos/src/vm/chunk.rs b/src/libos/src/vm/chunk.rs new file mode 100644 index 00000000..603b6fc9 --- /dev/null +++ b/src/libos/src/vm/chunk.rs @@ -0,0 +1,239 @@ +use super::*; + +use super::vm_area::VMArea; +use super::vm_chunk_manager::ChunkManager; +use super::vm_perms::VMPerms; +use super::vm_util::*; +use crate::process::ProcessRef; +use crate::process::ThreadRef; +use std::cmp::Ordering; +use std::collections::HashSet; +use std::hash::{Hash, Hasher}; + +// For single VMA chunk, the vma struct doesn't need to update the pid field. Because all the chunks are recorded by the process VM already. +pub const DUMMY_CHUNK_PROCESS_ID: pid_t = 0; +// Default chunk size: 32MB +pub const CHUNK_DEFAULT_SIZE: usize = 32 * 1024 * 1024; + +pub type ChunkID = usize; +pub type ChunkRef = Arc; + +pub struct Chunk { + range: VMRange, + internal: ChunkType, +} + +impl Hash for Chunk { + fn hash(&self, state: &mut H) { + self.range.hash(state); + } +} + +impl Ord for Chunk { + fn cmp(&self, other: &Self) -> Ordering { + self.range.start().cmp(&other.range.start()) + } +} + +impl PartialOrd for Chunk { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for Chunk { + fn eq(&self, other: &Self) -> bool { + self.range == other.range + } +} + +impl Eq for Chunk {} + +impl Debug for Chunk { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.internal() { + ChunkType::SingleVMA(vma) => write!(f, "Single VMA chunk: {:?}", vma), + ChunkType::MultiVMA(internal_manager) => write!(f, "default chunk: {:?}", self.range()), + } + } +} + +impl Chunk { + pub fn range(&self) -> &VMRange { + &self.range + } + + pub fn internal(&self) -> &ChunkType { + &self.internal + } + + pub fn free_size(&self) -> usize { + match self.internal() { + ChunkType::SingleVMA(vma) => 0, // for single VMA chunk, there is no free space + ChunkType::MultiVMA(internal_manager) => internal_manager.lock().unwrap().free_size(), + } + } + + pub fn new_default_chunk(vm_range: VMRange) -> Result { + let internal_manager = ChunkInternal::new(vm_range)?; + Ok(Self { + range: vm_range, + internal: ChunkType::MultiVMA(SgxMutex::new(internal_manager)), + }) + } + + pub fn new_single_vma_chunk(vm_range: VMRange, options: &VMMapOptions) -> Self { + let writeback_file = options.writeback_file().clone(); + let vm_area = VMArea::new( + vm_range.clone(), + *options.perms(), + writeback_file, + DUMMY_CHUNK_PROCESS_ID, + ); + // Initialize the memory of the new range + unsafe { + let buf = vm_range.as_slice_mut(); + options.initializer().init_slice(buf); + } + // Set memory permissions + if !options.perms().is_default() { + VMPerms::apply_perms(&vm_area, vm_area.perms()); + } + Self::new_chunk_with_vma(vm_area) + } + + pub fn new_chunk_with_vma(vma: VMArea) -> Self { + Self { + range: vma.range().clone(), + internal: ChunkType::SingleVMA(SgxMutex::new(vma)), + } + } + + pub fn add_process(&self, current: &ThreadRef) { + match self.internal() { + ChunkType::SingleVMA(vma) => unreachable!(), + ChunkType::MultiVMA(internal_manager) => { + internal_manager + .lock() + .unwrap() + .add_process(current.process().pid()); + } + } + } + + pub fn mmap(&self, options: &VMMapOptions) -> Result { + debug_assert!(!self.is_single_vma()); + trace!("try allocate in chunk: {:?}", self); + let mut internal_manager = if let ChunkType::MultiVMA(internal_manager) = &self.internal { + internal_manager.lock().unwrap() + } else { + unreachable!(); + }; + if 
internal_manager.chunk_manager.free_size() < options.size() { + return_errno!(ENOMEM, "no enough size without trying. try other chunks"); + } + return internal_manager.chunk_manager.mmap(options); + } + + pub fn try_mmap(&self, options: &VMMapOptions) -> Result { + debug_assert!(!self.is_single_vma()); + // Try lock ChunkManager. If it fails, just return and will try other chunks. + let mut internal_manager = if let ChunkType::MultiVMA(internal_manager) = &self.internal { + internal_manager + .try_lock() + .map_err(|_| errno!(EAGAIN, "try other chunks"))? + } else { + unreachable!(); + }; + trace!("get lock, try mmap in chunk: {:?}", self); + if internal_manager.chunk_manager().free_size() < options.size() { + return_errno!(ENOMEM, "no enough size without trying. try other chunks"); + } + internal_manager.chunk_manager().mmap(options) + } + + pub fn is_single_vma(&self) -> bool { + if let ChunkType::SingleVMA(_) = self.internal { + true + } else { + false + } + } + + pub fn find_mmap_region(&self, addr: usize) -> Result { + let internal = &self.internal; + match self.internal() { + ChunkType::SingleVMA(vma) => { + let vma = vma.lock().unwrap(); + if vma.contains(addr) { + return Ok(vma.range().clone()); + } else { + return_errno!(ESRCH, "addr not found in this chunk") + } + } + ChunkType::MultiVMA(internal_manager) => { + return internal_manager + .lock() + .unwrap() + .chunk_manager + .find_mmap_region(addr); + } + } + } +} + +#[derive(Debug)] +pub enum ChunkType { + SingleVMA(SgxMutex), + MultiVMA(SgxMutex), +} + +#[derive(Debug)] +pub struct ChunkInternal { + chunk_manager: ChunkManager, + process_set: HashSet, +} + +const PROCESS_SET_INIT_SIZE: usize = 5; + +impl ChunkInternal { + pub fn new(vm_range: VMRange) -> Result { + let chunk_manager = ChunkManager::from(vm_range.start(), vm_range.size())?; + + let mut process_set = HashSet::with_capacity(PROCESS_SET_INIT_SIZE); + process_set.insert(current!().process().pid()); + Ok(Self { + chunk_manager, + process_set, + }) + } + + pub fn add_process(&mut self, pid: pid_t) { + self.process_set.insert(pid); + } + + pub fn chunk_manager(&mut self) -> &mut ChunkManager { + &mut self.chunk_manager + } + + pub fn is_owned_by_current_process(&self) -> bool { + let current_pid = current!().process().pid(); + self.process_set.contains(¤t_pid) && self.process_set.len() == 1 + } + + pub fn free_size(&self) -> usize { + *self.chunk_manager.free_size() + } + + // Clean vmas when munmap a MultiVMA chunk, return whether this chunk is cleaned + pub fn clean_multi_vmas(&mut self) -> bool { + let current_pid = current!().process().pid(); + self.chunk_manager.clean_vmas_with_pid(current_pid); + if self.chunk_manager.is_empty() { + self.process_set.remove(¤t_pid); + return true; + } else { + return false; + } + } +} diff --git a/src/libos/src/vm/free_space_manager.rs b/src/libos/src/vm/free_space_manager.rs new file mode 100644 index 00000000..2e19f32b --- /dev/null +++ b/src/libos/src/vm/free_space_manager.rs @@ -0,0 +1,149 @@ +// Implements free space management for memory. +// Currently only use simple vector as the base structure. +// +// Basically use address-ordered first fit to find free ranges. 
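//
// Illustration (hypothetical addresses): a minimal usage sketch of the manager defined
// below, using only the APIs in this file:
//
//     let whole = VMRange::new(0x10_0000, 0x50_0000)?;
//     let mut free = VMFreeSpaceManager::new(whole);
//     // Address-ordered first fit: with VMMapAddr::Any, the lowest free range that is
//     // big enough is chosen and the allocated part is carved out of it.
//     let r1 = free.find_free_range_internal(0x2000, PAGE_SIZE, VMMapAddr::Any)?;
//     // VMMapAddr::Hint(addr) first tries to place the range exactly at `addr`; if that
//     // is not possible, the smallest free range seen that still fits is used instead.
//     // Returning a range merges it back with any adjacent free ranges.
//     free.add_range_back_to_free_manager(&r1)?;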
+ +use super::vm_util::VMMapAddr; +use super::*; + +static INITIAL_SIZE: usize = 100; + +#[derive(Debug, Default)] +pub struct VMFreeSpaceManager { + free_manager: Vec, // Address-ordered first fit +} + +impl VMFreeSpaceManager { + pub fn new(initial_free_range: VMRange) -> Self { + let mut free_manager = Vec::with_capacity(INITIAL_SIZE); + free_manager.push(initial_free_range); + + VMFreeSpaceManager { + free_manager: free_manager, + } + } + + pub fn free_size(&self) -> usize { + self.free_manager + .iter() + .fold(0, |acc, free_range| acc + free_range.size()) + } + + // TODO: respect options.align when mmap + pub fn find_free_range_internal( + &mut self, + size: usize, + align: usize, + addr: VMMapAddr, + ) -> Result { + // Record the minimal free range that satisfies the contraints + let mut result_free_range: Option = None; + let mut result_idx: Option = None; + let mut free_list = &mut self.free_manager; + + trace!("find free range, free list = {:?}", free_list); + + for (idx, free_range) in free_list.iter().enumerate() { + let mut free_range = { + if free_range.size() < size { + continue; + } + unsafe { VMRange::from_unchecked(free_range.start(), free_range.end()) } + }; + + match addr { + // Want a minimal free_range + VMMapAddr::Any => {} + // Prefer to have free_range.start == addr + VMMapAddr::Hint(addr) => { + if addr % align == 0 + && free_range.contains(addr) + && free_range.end() - addr >= size + { + free_range.start = addr; + free_range.end = addr + size; + self.free_list_update_range(idx, free_range); + return Ok(free_range); + } else { + // Hint failure, record the result but keep iterating. + if result_free_range == None + || result_free_range.as_ref().unwrap().size() > free_range.size() + { + result_free_range = Some(free_range); + result_idx = Some(idx); + } + continue; + } + } + // Must have free_range.start == addr + VMMapAddr::Need(addr) | VMMapAddr::Force(addr) => { + if free_range.start() > addr { + return_errno!(ENOMEM, "not enough memory for fixed mmap"); + } + if !free_range.contains(addr) { + continue; + } + if free_range.end() - addr < size { + return_errno!(ENOMEM, "not enough memory for fixed mmap"); + } + free_range.start = addr; + free_range.end = addr + size; + } + } + + result_free_range = Some(free_range); + result_idx = Some(idx); + break; + } + + if result_free_range.is_none() { + return_errno!(ENOMEM, "not enough memory"); + } + + let index = result_idx.unwrap(); + let result_free_range = { + let free_range = result_free_range.unwrap(); + let start = align_up(free_range.start(), align); + let end = start + size; + VMRange { start, end } + }; + + self.free_list_update_range(index, result_free_range); + trace!("after find free range, free list = {:?}", self.free_manager); + return Ok(result_free_range); + } + + fn free_list_update_range(&mut self, index: usize, range: VMRange) { + let mut free_list = &mut self.free_manager; + let ranges_after_subtraction = free_list[index].subtract(&range); + debug_assert!(ranges_after_subtraction.len() <= 2); + if ranges_after_subtraction.len() == 0 { + free_list.remove(index); + return; + } + free_list[index] = ranges_after_subtraction[0]; + if ranges_after_subtraction.len() == 2 { + free_list.insert(index + 1, ranges_after_subtraction[1]); + } + } + + pub fn add_range_back_to_free_manager(&mut self, dirty_range: &VMRange) -> Result<()> { + let mut free_list = &mut self.free_manager; + free_list.push(*dirty_range); + // Sort and merge small ranges + free_list.sort_unstable_by(|range_a, range_b| 
range_a.start().cmp(&range_b.start())); + let mut idx = 0; + while (idx < free_list.len() - 1) { + let right_range = free_list[idx + 1]; + let mut left_range = &mut free_list[idx]; + if left_range.end() == right_range.start() { + left_range.set_end(right_range.end()); + free_list.remove(idx + 1); + continue; + } + idx += 1; + } + trace!("after add range back free list = {:?}", free_list); + return Ok(()); + } +} diff --git a/src/libos/src/vm/mod.rs b/src/libos/src/vm/mod.rs index 70696de4..a4119bf6 100644 --- a/src/libos/src/vm/mod.rs +++ b/src/libos/src/vm/mod.rs @@ -1,18 +1,79 @@ +/* +Occlum is a single-address-space library OS. Previously, userspace memory are divided for each process. +And all the memory are allocated when the process is created, which leads to a lot of wasted space and +complicated configuration. + +In the current implementation, the whole userspace is managed as a memory pool that consists of chunks. There +are two kinds of chunks: +(1) Single VMA chunk: a chunk with only one VMA. Should be owned by exactly one process. +(2) Multi VMA chunk: a chunk with default chunk size and there could be a lot of VMAs in this chunk. Can be used +by different processes. + +This design can help to achieve mainly two goals: +(1) Simplify the configuration: Users don't need to configure the process.default_mmap_size anymore. And multiple processes +running in the same Occlum instance can use dramatically different sizes of memory. +(2) Gain better performance: Two-level management(chunks & VMAs) reduces the time for finding, inserting, deleting, and iterating. + +***************** Chart for Occlum User Space Memory Management *************** + User Space VM Manager +┌──────────────────────────────────────────────────────────────┐ +│ VMManager │ +│ │ +│ Chunks (in use): B-Tree Set │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ Multi VMA Chunk │ │ +│ │ ┌───────────────────────────────┐ │ │ +│ │ Single VMA Chunk │ ChunkManager │ │ │ +│ │ ┌──────────────┐ │ │ │ │ +│ │ │ │ │ VMAs (in use): Red Black Tree│ │ │ +│ │ │ VMArea │ │ ┌─────────────────────────┐ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ └──────────────┘ │ │ ┌──────┐ ┌────┐ ┌────┐ │ │ │ │ +│ │ │ │ │ VMA │ │VMA │ │VMA │ │ │ │ │ +│ │ Single VMA Chunk │ │ └──────┘ └────┘ └────┘ │ │ │ │ +│ │ ┌──────────────┐ │ │ │ │ │ │ +│ │ │ │ │ └─────────────────────────┘ │ │ │ +│ │ │ VMArea │ │ │ │ │ +│ │ │ │ │ │ │ │ +│ │ └──────────────┘ │ Free Manager (free) │ │ │ +│ │ │ ┌────────────────────────┐ │ │ │ +│ │ Single VMA Chunk │ │ │ │ │ │ +│ │ ┌──────────────┐ │ │ VMFreeSpaceManager │ │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ VMArea │ │ └────────────────────────┘ │ │ │ +│ │ │ │ │ │ │ │ +│ │ └──────────────┘ └───────────────────────────────┘ │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ │ +│ Free Manager (free) │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ │ │ +│ │ VMFreeSpaceManager │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────┘ +*/ + use super::*; use fs::{File, FileDesc, FileRef}; use process::{Process, ProcessRef}; use std::fmt; +mod chunk; +mod free_space_manager; mod process_vm; mod user_space_vm; mod vm_area; +mod vm_chunk_manager; mod vm_layout; mod vm_manager; mod vm_perms; mod vm_range; +mod vm_util; use self::vm_layout::VMLayout; -use self::vm_manager::{VMManager, VMMapOptionsBuilder}; pub use self::process_vm::{MMapFlags, MRemapFlags, MSyncFlags, ProcessVM, 
ProcessVMBuilder}; pub use self::user_space_vm::USER_SPACE_VM_MANAGER; diff --git a/src/libos/src/vm/process_vm.rs b/src/libos/src/vm/process_vm.rs index b032cb1a..c8d4570e 100644 --- a/src/libos/src/vm/process_vm.rs +++ b/src/libos/src/vm/process_vm.rs @@ -1,12 +1,12 @@ use super::*; +use super::chunk::{Chunk, ChunkRef}; use super::config; use super::process::elf_file::{ElfFile, ProgramHeaderExt}; -use super::user_space_vm::{UserSpaceVMManager, UserSpaceVMRange, USER_SPACE_VM_MANAGER}; -use super::vm_manager::{ - VMInitializer, VMManager, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, VMRemapOptions, -}; +use super::user_space_vm::USER_SPACE_VM_MANAGER; use super::vm_perms::VMPerms; +use super::vm_util::{VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, VMRemapOptions}; +use std::collections::HashSet; use std::sync::atomic::{AtomicUsize, Ordering}; // Used for heap and stack start address randomization. @@ -69,9 +69,6 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { let stack_size = self .stack_size .unwrap_or(config::LIBOS_CONFIG.process.default_stack_size); - let mmap_size = self - .mmap_size - .unwrap_or(config::LIBOS_CONFIG.process.default_mmap_size); // Before allocating memory, let's first calcualte how much memory // we need in total by iterating the memory layouts required by @@ -92,11 +89,10 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { }) .collect(); - // TODO: Make heap and stack 16-byte aligned instead of page aligned. + // Make heap and stack 16-byte aligned let other_layouts = vec![ - VMLayout::new(heap_size, PAGE_SIZE)?, - VMLayout::new(stack_size, PAGE_SIZE)?, - VMLayout::new(mmap_size, PAGE_SIZE)?, + VMLayout::new(heap_size, 16)?, + VMLayout::new(stack_size, 16)?, ]; let process_layout = elf_layouts.iter().chain(other_layouts.iter()).fold( VMLayout::new_empty(), @@ -108,85 +104,61 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { // Now that we end up with the memory layout required by the process, // let's allocate the memory for the process - let process_range = { USER_SPACE_VM_MANAGER.alloc(process_layout)? }; - let process_base = process_range.range().start(); - // Use the vm_manager to manage the whole process VM (including mmap region) - let mut vm_manager = VMManager::from(process_base, process_range.range().size())?; - // Note: we do not need to fill zeros of the mmap region. - // VMManager will fill zeros (if necessary) on mmap. 
- - // Tracker to track the min_start for each part - let mut min_start = - process_base + Self::get_randomize_offset(process_range.range().size() >> 3); + let mut chunks = HashSet::new(); // Init the memory for ELFs in the process let mut elf_ranges = Vec::with_capacity(2); elf_layouts .iter() .zip(self.elfs.iter()) .map(|(elf_layout, elf_file)| { - let desired_range = VMRange::new_with_layout(elf_layout, min_start); let vm_option = VMMapOptionsBuilder::default() - .size(desired_range.size()) - .addr(VMMapAddr::Need(desired_range.start())) + .size(elf_layout.size()) + .align(elf_layout.align()) .perms(VMPerms::ALL) // set it to read | write | exec for simplicity .initializer(VMInitializer::DoNothing()) .build()?; - let elf_start = vm_manager.mmap(vm_option)?; - debug_assert!(desired_range.start == elf_start); - debug_assert!(elf_start % elf_layout.align() == 0); - debug_assert!(process_range.range().is_superset_of(&desired_range)); - Self::init_elf_memory(&desired_range, elf_file)?; - min_start = desired_range.end(); - elf_ranges.push(desired_range); - trace!("elf range = {:?}", desired_range); + let (elf_range, chunk_ref) = USER_SPACE_VM_MANAGER.alloc(&vm_option)?; + debug_assert!(elf_range.start() % elf_layout.align() == 0); + Self::init_elf_memory(&elf_range, elf_file)?; + trace!("elf range = {:?}", elf_range); + elf_ranges.push(elf_range); + chunks.insert(chunk_ref); Ok(()) }) .collect::>()?; // Init the heap memory in the process let heap_layout = &other_layouts[0]; - let heap_min_start = min_start + Self::get_randomize_offset(RANGE_FOR_RANDOMIZATION); - let heap_range = VMRange::new_with_layout(heap_layout, heap_min_start); let vm_option = VMMapOptionsBuilder::default() - .size(heap_range.size()) - .addr(VMMapAddr::Need(heap_range.start())) + .size(heap_layout.size()) + .align(heap_layout.align()) .perms(VMPerms::READ | VMPerms::WRITE) .build()?; - let heap_start = vm_manager.mmap(vm_option)?; - debug_assert!(heap_range.start == heap_start); + let (heap_range, chunk_ref) = USER_SPACE_VM_MANAGER.alloc(&vm_option)?; + debug_assert!(heap_range.start() % heap_layout.align() == 0); trace!("heap range = {:?}", heap_range); let brk = AtomicUsize::new(heap_range.start()); - min_start = heap_range.end(); + chunks.insert(chunk_ref); // Init the stack memory in the process let stack_layout = &other_layouts[1]; - let stack_min_start = min_start + Self::get_randomize_offset(RANGE_FOR_RANDOMIZATION); - let stack_range = VMRange::new_with_layout(stack_layout, stack_min_start); let vm_option = VMMapOptionsBuilder::default() - .size(stack_range.size()) - .addr(VMMapAddr::Need(stack_range.start())) + .size(stack_layout.size()) + .align(heap_layout.align()) .perms(VMPerms::READ | VMPerms::WRITE) .build()?; - let stack_start = vm_manager.mmap(vm_option)?; - debug_assert!(stack_range.start == stack_start); + let (stack_range, chunk_ref) = USER_SPACE_VM_MANAGER.alloc(&vm_option)?; + debug_assert!(stack_range.start() % stack_layout.align() == 0); + chunks.insert(chunk_ref); trace!("stack range = {:?}", stack_range); - min_start = stack_range.end(); - // Note: we do not need to fill zeros for stack - - debug_assert!(process_range.range().is_superset_of(&heap_range)); - debug_assert!(process_range.range().is_superset_of(&stack_range)); - - // Set mmap prefered start address - vm_manager.set_mmap_prefered_start_addr(min_start); - let vm_manager = SgxMutex::new(vm_manager); + let mem_chunks = Arc::new(RwLock::new(chunks)); Ok(ProcessVM { - process_range, elf_ranges, heap_range, stack_range, brk, - vm_manager, 
+ mem_chunks, }) } @@ -255,39 +227,83 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { } } +// MemChunks is the structure to track all the chunks which are used by this process. +type MemChunks = Arc>>; + /// The per-process virtual memory #[derive(Debug)] pub struct ProcessVM { - vm_manager: SgxMutex, // manage the whole process VM elf_ranges: Vec, heap_range: VMRange, stack_range: VMRange, brk: AtomicUsize, - // Memory safety notes: the process_range field must be the last one. + // Memory safety notes: the mem_chunks field must be the last one. // // Rust drops fields in the same order as they are declared. So by making - // process_range the last field, we ensure that when all other fields are + // mem_chunks the last field, we ensure that when all other fields are // dropped, their drop methods (if provided) can still access the memory - // region represented by the process_range field. - process_range: UserSpaceVMRange, + // region represented by the mem_chunks field. + mem_chunks: MemChunks, } impl Default for ProcessVM { fn default() -> ProcessVM { ProcessVM { - process_range: USER_SPACE_VM_MANAGER.alloc_dummy(), elf_ranges: Default::default(), heap_range: Default::default(), stack_range: Default::default(), brk: Default::default(), - vm_manager: Default::default(), + mem_chunks: Arc::new(RwLock::new(HashSet::new())), } } } +impl Drop for ProcessVM { + fn drop(&mut self) { + let mut mem_chunks = self.mem_chunks.write().unwrap(); + // There are two cases when this drop is called: + // (1) Process exits normally and in the end, drop process VM + // (2) During creating process stage, process VM is ready but there are some other errors when creating the process, e.g. spawn_attribute is set + // to a wrong value + // + // For the first case, the process VM is cleaned in the exit procedure and nothing is needed. For the second cases, mem_chunks is not empty and should + // be cleaned here. 
+ + // In the first case, the current is reset to idle thread + // In the second case, the current thread belongs to parent process + let current = current!(); + if current.tid() != 0 { + mem_chunks + .drain_filter(|chunk| chunk.is_single_vma()) + .for_each(|chunk| USER_SPACE_VM_MANAGER.free_chunk(&chunk)) + } + assert!(mem_chunks.len() == 0); + info!("Process VM dropped"); + } +} + impl ProcessVM { + pub fn mem_chunks(&self) -> &MemChunks { + &self.mem_chunks + } + + pub fn add_mem_chunk(&self, chunk: ChunkRef) { + let mut mem_chunks = self.mem_chunks.write().unwrap(); + mem_chunks.insert(chunk); + } + + pub fn remove_mem_chunk(&self, chunk: &ChunkRef) { + let mut mem_chunks = self.mem_chunks.write().unwrap(); + mem_chunks.remove(chunk); + } + + pub fn replace_mem_chunk(&self, old_chunk: &ChunkRef, new_chunk: ChunkRef) { + self.remove_mem_chunk(old_chunk); + self.add_mem_chunk(new_chunk) + } + pub fn get_process_range(&self) -> &VMRange { - self.process_range.range() + USER_SPACE_VM_MANAGER.range() } pub fn get_elf_ranges(&self) -> &[VMRange] { @@ -335,6 +351,18 @@ impl ProcessVM { Ok(new_brk) } + // Get a NON-accurate free size for current process + pub fn get_free_size(&self) -> usize { + let chunk_free_size = { + let process_chunks = self.mem_chunks.read().unwrap(); + process_chunks + .iter() + .fold(0, |acc, chunks| acc + chunks.free_size()) + }; + let free_size = chunk_free_size + USER_SPACE_VM_MANAGER.free_size(); + free_size + } + pub fn mmap( &self, addr: usize, @@ -346,9 +374,6 @@ impl ProcessVM { ) -> Result { let addr_option = { if flags.contains(MMapFlags::MAP_FIXED) { - if !self.process_range.range().contains(addr) { - return_errno!(EINVAL, "Beyond valid memory range"); - } VMMapAddr::Force(addr) } else { if addr == 0 { @@ -360,7 +385,8 @@ impl ProcessVM { }; let initializer = { if flags.contains(MMapFlags::MAP_ANONYMOUS) { - VMInitializer::FillZeros() + // There is no need to fill zeros in mmap. Cleaning is done after munmap. 
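// (The new ChunkManager zeroes pages before a range goes back to the free space
// manager -- both munmap_range() and clean_vmas_with_pid() do
// `buf.iter_mut().for_each(|b| *b = 0)` -- so memory handed out by a later anonymous
// mmap is expected to be zero-filled already and DoNothing is sufficient here.)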
+ VMInitializer::DoNothing() } else { let file_ref = current!().file(fd)?; VMInitializer::LoadFromFile { @@ -386,7 +412,7 @@ impl ProcessVM { .initializer(initializer) .writeback_file(writeback_file) .build()?; - let mmap_addr = self.vm_manager.lock().unwrap().mmap(mmap_options)?; + let mmap_addr = USER_SPACE_VM_MANAGER.mmap(&mmap_options)?; Ok(mmap_addr) } @@ -397,18 +423,12 @@ impl ProcessVM { new_size: usize, flags: MRemapFlags, ) -> Result { - if let Some(new_addr) = flags.new_addr() { - if !self.process_range.range().contains(new_addr) { - return_errno!(EINVAL, "new_addr is beyond valid memory range"); - } - } - let mremap_option = VMRemapOptions::new(old_addr, old_size, new_size, flags)?; - self.vm_manager.lock().unwrap().mremap(&mremap_option) + USER_SPACE_VM_MANAGER.mremap(&mremap_option) } pub fn munmap(&self, addr: usize, size: usize) -> Result<()> { - self.vm_manager.lock().unwrap().munmap(addr, size) + USER_SPACE_VM_MANAGER.munmap(addr, size) } pub fn mprotect(&self, addr: usize, size: usize, perms: VMPerms) -> Result<()> { @@ -419,38 +439,21 @@ impl ProcessVM { align_up(size, PAGE_SIZE) }; let protect_range = VMRange::new_with_size(addr, size)?; - if !self.process_range.range().is_superset_of(&protect_range) { - return_errno!(ENOMEM, "invalid range"); - } - let mut mmap_manager = self.vm_manager.lock().unwrap(); - // TODO: support mprotect vm regions in addition to mmap - if !mmap_manager.range().is_superset_of(&protect_range) { - warn!("Do not support mprotect memory outside the mmap region yet"); - return Ok(()); - } - - mmap_manager.mprotect(addr, size, perms) + return USER_SPACE_VM_MANAGER.mprotect(addr, size, perms); } pub fn msync(&self, addr: usize, size: usize) -> Result<()> { - let sync_range = VMRange::new_with_size(addr, size)?; - let mut mmap_manager = self.vm_manager.lock().unwrap(); - mmap_manager.msync_by_range(&sync_range) + return USER_SPACE_VM_MANAGER.msync(addr, size); } pub fn msync_by_file(&self, sync_file: &FileRef) { - let mut mmap_manager = self.vm_manager.lock().unwrap(); - mmap_manager.msync_by_file(sync_file); + return USER_SPACE_VM_MANAGER.msync_by_file(sync_file); } // Return: a copy of the found region pub fn find_mmap_region(&self, addr: usize) -> Result { - self.vm_manager - .lock() - .unwrap() - .find_mmap_region(addr) - .map(|range_ref| *range_ref) + USER_SPACE_VM_MANAGER.find_mmap_region(addr) } } diff --git a/src/libos/src/vm/user_space_vm.rs b/src/libos/src/vm/user_space_vm.rs index 88b3c4f4..00ebd58f 100644 --- a/src/libos/src/vm/user_space_vm.rs +++ b/src/libos/src/vm/user_space_vm.rs @@ -1,62 +1,69 @@ use super::*; +use crate::ctor::dtor; use config::LIBOS_CONFIG; +use std::ops::{Deref, DerefMut}; +use vm_manager::VMManager; /// The virtual memory manager for the entire user space -pub struct UserSpaceVMManager { - total_size: usize, - free_size: SgxMutex, -} +pub struct UserSpaceVMManager(VMManager); impl UserSpaceVMManager { - fn new() -> UserSpaceVMManager { + fn new() -> Result { let rsrv_mem_size = LIBOS_CONFIG.resource_limits.user_space_size; - UserSpaceVMManager { - total_size: rsrv_mem_size, - free_size: SgxMutex::new(rsrv_mem_size), - } - } - - pub fn alloc(&self, vm_layout: VMLayout) -> Result { - let size = align_up(vm_layout.size(), vm_layout.align()); let vm_range = unsafe { - let ptr = sgx_alloc_rsrv_mem(size); + // TODO: Current sgx_alloc_rsrv_mem implmentation will commit all the pages of the desired size, which will consume + // a lot of time. 
When EDMM is supported, there is no need to commit all the pages at the initialization stage. A function + // which reserves memory but not commit pages should be provided then. + let ptr = sgx_alloc_rsrv_mem(rsrv_mem_size); let perm = MemPerm::READ | MemPerm::WRITE; if ptr.is_null() { return_errno!(ENOMEM, "run out of reserved memory"); } // Change the page permission to RW (default) - assert!(sgx_tprotect_rsrv_mem(ptr, size, perm.bits()) == sgx_status_t::SGX_SUCCESS); + assert!( + sgx_tprotect_rsrv_mem(ptr, rsrv_mem_size, perm.bits()) == sgx_status_t::SGX_SUCCESS + ); let addr = ptr as usize; - debug!("allocated rsrv addr is 0x{:x}, len is 0x{:x}", addr, size); - VMRange::from_unchecked(addr, addr + size) + debug!( + "allocated rsrv addr is 0x{:x}, len is 0x{:x}", + addr, rsrv_mem_size + ); + VMRange::from_unchecked(addr, addr + rsrv_mem_size) }; - *self.free_size.lock().unwrap() -= size; - Ok(UserSpaceVMRange::new(vm_range)) - } + let vm_manager = VMManager::init(vm_range)?; - fn add_free_size(&self, user_space_vmrange: &UserSpaceVMRange) { - *self.free_size.lock().unwrap() += user_space_vmrange.range().size(); - } - - // The empty range is not added to sub_range - pub fn alloc_dummy(&self) -> UserSpaceVMRange { - let empty_user_vm_range = unsafe { VMRange::from_unchecked(0, 0) }; - UserSpaceVMRange::new(empty_user_vm_range) + Ok(UserSpaceVMManager(vm_manager)) } pub fn get_total_size(&self) -> usize { - self.total_size + self.range().size() } +} - pub fn get_free_size(&self) -> usize { - *self.free_size.lock().unwrap() +// This provides module teardown function attribute similar with `__attribute__((destructor))` in C/C++ and will +// be called after the main function. Static variables are still safe to visit at this time. +#[dtor] +fn free_user_space() { + let range = USER_SPACE_VM_MANAGER.range(); + assert!(USER_SPACE_VM_MANAGER.verified_clean_when_exit()); + let addr = range.start() as *const c_void; + let size = range.size(); + info!("free user space VM: {:?}", range); + assert!(unsafe { sgx_free_rsrv_mem(addr, size) == 0 }); +} + +impl Deref for UserSpaceVMManager { + type Target = VMManager; + + fn deref(&self) -> &Self::Target { + &self.0 } } lazy_static! { - pub static ref USER_SPACE_VM_MANAGER: UserSpaceVMManager = UserSpaceVMManager::new(); + pub static ref USER_SPACE_VM_MANAGER: UserSpaceVMManager = UserSpaceVMManager::new().unwrap(); } bitflags! 
{ @@ -96,32 +103,3 @@ extern "C" { // fn sgx_tprotect_rsrv_mem(addr: *const c_void, length: usize, prot: i32) -> sgx_status_t; } - -#[derive(Debug)] -pub struct UserSpaceVMRange { - vm_range: VMRange, -} - -impl UserSpaceVMRange { - fn new(vm_range: VMRange) -> UserSpaceVMRange { - UserSpaceVMRange { vm_range } - } - - pub fn range(&self) -> &VMRange { - &self.vm_range - } -} - -impl Drop for UserSpaceVMRange { - fn drop(&mut self) { - let addr = self.vm_range.start() as *const c_void; - let size = self.vm_range.size(); - if size == 0 { - return; - } - - USER_SPACE_VM_MANAGER.add_free_size(self); - info!("user space vm free: {:?}", self.vm_range); - assert!(unsafe { sgx_free_rsrv_mem(addr, size) == 0 }); - } -} diff --git a/src/libos/src/vm/vm_area.rs b/src/libos/src/vm/vm_area.rs index cf496af8..5a349eea 100644 --- a/src/libos/src/vm/vm_area.rs +++ b/src/libos/src/vm/vm_area.rs @@ -4,25 +4,40 @@ use super::vm_perms::VMPerms; use super::vm_range::VMRange; use super::*; +use intrusive_collections::rbtree::{Link, RBTree}; +use intrusive_collections::{intrusive_adapter, KeyAdapter}; + #[derive(Clone, Debug, Default)] pub struct VMArea { range: VMRange, perms: VMPerms, writeback_file: Option<(FileRef, usize)>, + pid: pid_t, } impl VMArea { - pub fn new(range: VMRange, perms: VMPerms, writeback_file: Option<(FileRef, usize)>) -> Self { + pub fn new( + range: VMRange, + perms: VMPerms, + writeback_file: Option<(FileRef, usize)>, + pid: pid_t, + ) -> Self { Self { range, perms, writeback_file, + pid, } } /// Create a new VMArea object that inherits the write-back file (if any), but has /// a new range and permissions. - pub fn inherits_file_from(vma: &VMArea, new_range: VMRange, new_perms: VMPerms) -> Self { + pub fn inherits_file_from( + vma: &VMArea, + new_range: VMRange, + new_perms: VMPerms, + pid: pid_t, + ) -> Self { let new_writeback_file = vma.writeback_file.as_ref().map(|(file, file_offset)| { let new_file = file.clone(); @@ -36,7 +51,7 @@ impl VMArea { }; (new_file, new_file_offset) }); - Self::new(new_range, new_perms, new_writeback_file) + Self::new(new_range, new_perms, new_writeback_file, pid) } pub fn perms(&self) -> VMPerms { @@ -47,6 +62,10 @@ impl VMArea { &self.range } + pub fn pid(&self) -> pid_t { + self.pid + } + pub fn writeback_file(&self) -> &Option<(FileRef, usize)> { &self.writeback_file } @@ -59,7 +78,7 @@ impl VMArea { self.deref() .subtract(other) .into_iter() - .map(|range| Self::inherits_file_from(self, range, self.perms())) + .map(|range| Self::inherits_file_from(self, range, self.perms(), self.pid())) .collect() } @@ -72,7 +91,7 @@ impl VMArea { } new_range.unwrap() }; - let new_vma = VMArea::inherits_file_from(self, new_range, self.perms()); + let new_vma = VMArea::inherits_file_from(self, new_range, self.perms(), self.pid()); Some(new_vma) } @@ -109,3 +128,56 @@ impl Deref for VMArea { &self.range } } + +#[derive(Clone)] +pub struct VMAObj { + link: Link, + vma: VMArea, +} + +impl fmt::Debug for VMAObj { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.vma) + } +} + +// key adapter for RBTree which is sorted by the start of vma ranges +intrusive_adapter!(pub VMAAdapter = Box: VMAObj { link : Link }); +impl<'a> KeyAdapter<'a> for VMAAdapter { + type Key = usize; + fn get_key(&self, vma_obj: &'a VMAObj) -> usize { + vma_obj.vma.range().start() + } +} + +impl VMAObj { + pub fn new_vma_obj(vma: VMArea) -> Box { + Box::new(Self { + link: Link::new(), + vma, + }) + } + + pub fn vma(&self) -> &VMArea { + &self.vma + } +} + +impl 
VMArea { + pub fn new_obj( + range: VMRange, + perms: VMPerms, + writeback_file: Option<(FileRef, usize)>, + pid: pid_t, + ) -> Box { + Box::new(VMAObj { + link: Link::new(), + vma: VMArea { + range, + perms, + writeback_file, + pid, + }, + }) + } +} diff --git a/src/libos/src/vm/vm_chunk_manager.rs b/src/libos/src/vm/vm_chunk_manager.rs new file mode 100644 index 00000000..51003153 --- /dev/null +++ b/src/libos/src/vm/vm_chunk_manager.rs @@ -0,0 +1,654 @@ +use super::*; + +use super::free_space_manager::VMFreeSpaceManager as FreeRangeManager; +use super::vm_area::*; +use super::vm_perms::VMPerms; +use super::vm_util::*; +use std::collections::BTreeSet; + +use intrusive_collections::rbtree::{Link, RBTree}; +use intrusive_collections::Bound; +use intrusive_collections::RBTreeLink; +use intrusive_collections::{intrusive_adapter, KeyAdapter}; + +/// Memory chunk manager. +/// +/// Chunk is the memory unit for Occlum. For chunks with `default` size, every chunk is managed by a ChunkManager which provides +/// usedful memory management APIs such as mmap, munmap, mremap, mprotect, etc. +/// ChunkManager is implemented basically with two data structures: a red-black tree to track vmas in use and a FreeRangeManager to track +/// ranges which are free. +/// For vmas-in-use, there are two sentry vmas with zero length at the front and end of the red-black tree. +#[derive(Debug, Default)] +pub struct ChunkManager { + range: VMRange, + free_size: usize, + vmas: RBTree, + free_manager: FreeRangeManager, +} + +impl ChunkManager { + pub fn from(addr: usize, size: usize) -> Result { + let range = VMRange::new(addr, addr + size)?; + let vmas = { + let start = range.start(); + let end = range.end(); + let start_sentry = { + let range = VMRange::new_empty(start)?; + let perms = VMPerms::empty(); + // sentry vma shouldn't belong to any process + VMAObj::new_vma_obj(VMArea::new(range, perms, None, 0)) + }; + let end_sentry = { + let range = VMRange::new_empty(end)?; + let perms = VMPerms::empty(); + VMAObj::new_vma_obj(VMArea::new(range, perms, None, 0)) + }; + let mut new_tree = RBTree::new(VMAAdapter::new()); + new_tree.insert(start_sentry); + new_tree.insert(end_sentry); + new_tree + }; + Ok(ChunkManager { + range, + free_size: range.size(), + vmas, + free_manager: FreeRangeManager::new(range.clone()), + }) + } + + pub fn range(&self) -> &VMRange { + &self.range + } + + pub fn vmas(&self) -> &RBTree { + &self.vmas + } + + pub fn free_size(&self) -> &usize { + &self.free_size + } + + pub fn is_empty(&self) -> bool { + self.vmas.iter().count() == 2 // only sentry vmas + } + + pub fn clean_vmas_with_pid(&mut self, pid: pid_t) { + let mut vmas_cursor = self.vmas.cursor_mut(); + vmas_cursor.move_next(); // move to the first element of the tree + while !vmas_cursor.is_null() { + let vma = vmas_cursor.get().unwrap().vma(); + if vma.pid() != pid || vma.size() == 0 { + // Skip vmas which doesn't belong to this process + vmas_cursor.move_next(); + continue; + } + + Self::flush_file_vma(vma); + + if !vma.perms().is_default() { + VMPerms::apply_perms(vma, VMPerms::default()); + } + + unsafe { + let buf = vma.as_slice_mut(); + buf.iter_mut().for_each(|b| *b = 0) + } + + self.free_manager.add_range_back_to_free_manager(vma); + self.free_size += vma.size(); + + // Remove this vma from vmas list + vmas_cursor.remove(); + } + } + + pub fn mmap(&mut self, options: &VMMapOptions) -> Result { + let addr = *options.addr(); + let size = *options.size(); + let align = *options.align(); + + if let VMMapAddr::Force(addr) = addr { + 
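// VMMapAddr::Force is what process_vm.rs passes for mmap() with MAP_FIXED: any existing
// mappings of the calling process that overlap [addr, addr + size) are unmapped first,
// mirroring Linux MAP_FIXED semantics, before the requested range is carved out below.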
self.munmap(addr, size)?; + } + + // Find and allocate a new range for this mmap request + let new_range = self + .free_manager + .find_free_range_internal(size, align, addr)?; + let new_addr = new_range.start(); + let writeback_file = options.writeback_file().clone(); + let current_pid = current!().process().pid(); + let new_vma = VMArea::new(new_range, *options.perms(), writeback_file, current_pid); + + // Initialize the memory of the new range + unsafe { + let buf = new_vma.as_slice_mut(); + options.initializer().init_slice(buf)?; + } + // Set memory permissions + if !options.perms().is_default() { + VMPerms::apply_perms(&new_vma, new_vma.perms()); + } + self.free_size -= new_vma.size(); + // After initializing, we can safely insert the new VMA + self.vmas.insert(VMAObj::new_vma_obj(new_vma)); + Ok(new_addr) + } + + pub fn munmap_range(&mut self, range: VMRange) -> Result<()> { + let bound = range.start(); + let current_pid = current!().process().pid(); + + // The cursor to iterate vmas that might intersect with munmap_range. + // Upper bound returns the vma whose start address is below and nearest to the munmap range. Start from this range. + let mut vmas_cursor = self.vmas.upper_bound_mut(Bound::Included(&bound)); + while !vmas_cursor.is_null() && vmas_cursor.get().unwrap().vma().start() <= range.end() { + let vma = &vmas_cursor.get().unwrap().vma(); + warn!("munmap related vma = {:?}", vma); + if vma.size() == 0 || current_pid != vma.pid() { + vmas_cursor.move_next(); + continue; + } + let intersection_vma = match vma.intersect(&range) { + None => { + vmas_cursor.move_next(); + continue; + } + Some(intersection_vma) => intersection_vma, + }; + + // File-backed VMA needs to be flushed upon munmap + Self::flush_file_vma(&intersection_vma); + if !&intersection_vma.perms().is_default() { + VMPerms::apply_perms(&intersection_vma, VMPerms::default()); + } + + if vma.range() == intersection_vma.range() { + // Exact match. Just remove. + vmas_cursor.remove(); + } else { + // The intersection_vma is a subset of current vma + let mut remain_vmas = vma.subtract(&intersection_vma); + if remain_vmas.len() == 1 { + let new_obj = VMAObj::new_vma_obj(remain_vmas.pop().unwrap()); + vmas_cursor.replace_with(new_obj); + vmas_cursor.move_next(); + } else { + debug_assert!(remain_vmas.len() == 2); + let vma_left_part = VMAObj::new_vma_obj(remain_vmas.swap_remove(0)); + vmas_cursor.replace_with(vma_left_part); + let vma_right_part = VMAObj::new_vma_obj(remain_vmas.pop().unwrap()); + // The new element will be inserted at the correct position in the tree based on its key automatically. 
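// Example (hypothetical addresses): unmapping [0x3000, 0x4000) from a VMA covering
// [0x1000, 0x6000) leaves [0x1000, 0x3000), which replaced the current node above, and
// [0x4000, 0x6000), which is inserted as a new node here.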
+ vmas_cursor.insert(vma_right_part); + } + } + + // Reset zero + unsafe { + warn!("intersection vma = {:?}", intersection_vma); + let buf = intersection_vma.as_slice_mut(); + buf.iter_mut().for_each(|b| *b = 0) + } + + self.free_manager + .add_range_back_to_free_manager(intersection_vma.range()); + self.free_size += intersection_vma.size(); + } + Ok(()) + } + + pub fn munmap(&mut self, addr: usize, size: usize) -> Result<()> { + let size = { + if size == 0 { + return_errno!(EINVAL, "size of munmap must not be zero"); + } + align_up(size, PAGE_SIZE) + }; + let munmap_range = { + let munmap_range = VMRange::new(addr, addr + size)?; + + let effective_munmap_range_opt = munmap_range.intersect(&self.range); + if effective_munmap_range_opt.is_none() { + return Ok(()); + } + + let effective_munmap_range = effective_munmap_range_opt.unwrap(); + if effective_munmap_range.empty() { + return Ok(()); + } + effective_munmap_range + }; + + self.munmap_range(munmap_range) + } + + pub fn mremap(&mut self, options: &VMRemapOptions) -> Result { + let old_addr = options.old_addr(); + let old_size = options.old_size(); + let old_range = VMRange::new_with_size(old_addr, old_size)?; + let new_size = options.new_size(); + let flags = options.flags(); + let size_type = SizeType::new(&old_size, &new_size); + + return_errno!(ENOSYS, "Under development"); + + // Old dead code. Could be used for future development. + #[cfg(dev)] + { + // The old range must be contained in one VMA + let idx = self + .find_containing_vma_idx(&old_range) + .ok_or_else(|| errno!(EFAULT, "invalid range"))?; + let containing_vma = &self.vmas[idx]; + // Get the memory permissions of the old range + let perms = containing_vma.perms(); + // Get the write back file of the old range if there is one. + let writeback_file = containing_vma.writeback_file(); + + // FIXME: Current implementation for file-backed memory mremap has limitation that if a SUBRANGE of the previous + // file-backed mmap with MAP_SHARED is then mremap-ed with MREMAP_MAYMOVE, there will be two vmas that have the same backed file. + // For Linux, writing to either memory vma or the file will update the other two equally. But we won't be able to support this before + // we really have paging. Thus, if the old_range is not equal to a recorded vma, we will just return with error. + if writeback_file.is_some() && &old_range != containing_vma.range() { + return_errno!(EINVAL, "Known limition") + } + + // Implement mremap as one optional mmap followed by one optional munmap. + // + // The exact arguments for the mmap and munmap are determined by the values of MRemapFlags, + // SizeType and writeback_file. There is a total of 18 combinations among MRemapFlags and + // SizeType and writeback_file. As some combinations result in the same mmap and munmap operations, + // the following code only needs to match below patterns of (MRemapFlags, SizeType, writeback_file) + // and treat each case accordingly. + + // Determine whether need to do mmap. 
And when possible, determine the returned address + let (need_mmap, mut ret_addr) = match (flags, size_type, writeback_file) { + (MRemapFlags::None, SizeType::Growing, None) => { + let vm_initializer_for_new_range = VMInitializer::FillZeros(); + let mmap_opts = VMMapOptionsBuilder::default() + .size(new_size - old_size) + .addr(VMMapAddr::Need(old_range.end())) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .build()?; + let ret_addr = Some(old_addr); + (Some(mmap_opts), ret_addr) + } + (MRemapFlags::None, SizeType::Growing, Some((backed_file, offset))) => { + // Update writeback file offset + let new_writeback_file = + Some((backed_file.clone(), offset + containing_vma.size())); + let vm_initializer_for_new_range = VMInitializer::LoadFromFile { + file: backed_file.clone(), + offset: offset + containing_vma.size(), // file-backed mremap should start from the end of previous mmap/mremap file + }; + let mmap_opts = VMMapOptionsBuilder::default() + .size(new_size - old_size) + .addr(VMMapAddr::Need(old_range.end())) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .writeback_file(new_writeback_file) + .build()?; + let ret_addr = Some(old_addr); + (Some(mmap_opts), ret_addr) + } + (MRemapFlags::MayMove, SizeType::Growing, None) => { + let prefered_new_range = + VMRange::new_with_size(old_addr + old_size, new_size - old_size)?; + if self.is_free_range(&prefered_new_range) { + // Don't need to move the old range + let vm_initializer_for_new_range = VMInitializer::FillZeros(); + let mmap_ops = VMMapOptionsBuilder::default() + .size(prefered_new_range.size()) + .addr(VMMapAddr::Need(prefered_new_range.start())) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .build()?; + (Some(mmap_ops), Some(old_addr)) + } else { + // Need to move old range to a new range and init the new range + let vm_initializer_for_new_range = + VMInitializer::CopyFrom { range: old_range }; + let mmap_ops = VMMapOptionsBuilder::default() + .size(new_size) + .addr(VMMapAddr::Any) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .build()?; + // Cannot determine the returned address for now, which can only be obtained after calling mmap + let ret_addr = None; + (Some(mmap_ops), ret_addr) + } + } + (MRemapFlags::MayMove, SizeType::Growing, Some((backed_file, offset))) => { + let prefered_new_range = + VMRange::new_with_size(old_addr + old_size, new_size - old_size)?; + if self.is_free_range(&prefered_new_range) { + // Don't need to move the old range + let vm_initializer_for_new_range = VMInitializer::LoadFromFile { + file: backed_file.clone(), + offset: offset + containing_vma.size(), // file-backed mremap should start from the end of previous mmap/mremap file + }; + // Write back file should start from new offset + let new_writeback_file = + Some((backed_file.clone(), offset + containing_vma.size())); + let mmap_ops = VMMapOptionsBuilder::default() + .size(prefered_new_range.size()) + .addr(VMMapAddr::Need(prefered_new_range.start())) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .writeback_file(new_writeback_file) + .build()?; + (Some(mmap_ops), Some(old_addr)) + } else { + // Need to move old range to a new range and init the new range + let vm_initializer_for_new_range = { + let copy_end = containing_vma.end(); + let copy_range = VMRange::new(old_range.start(), copy_end)?; + let reread_file_start_offset = copy_end - containing_vma.start(); + VMInitializer::CopyOldAndReadNew { + old_range: copy_range, + file: backed_file.clone(), + offset: 
reread_file_start_offset, + } + }; + let new_writeback_file = Some((backed_file.clone(), *offset)); + let mmap_ops = VMMapOptionsBuilder::default() + .size(new_size) + .addr(VMMapAddr::Any) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .writeback_file(new_writeback_file) + .build()?; + // Cannot determine the returned address for now, which can only be obtained after calling mmap + let ret_addr = None; + (Some(mmap_ops), ret_addr) + } + } + (MRemapFlags::FixedAddr(new_addr), _, None) => { + let vm_initializer_for_new_range = + { VMInitializer::CopyFrom { range: old_range } }; + let mmap_opts = VMMapOptionsBuilder::default() + .size(new_size) + .addr(VMMapAddr::Force(new_addr)) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .build()?; + let ret_addr = Some(new_addr); + (Some(mmap_opts), ret_addr) + } + (MRemapFlags::FixedAddr(new_addr), _, Some((backed_file, offset))) => { + let vm_initializer_for_new_range = { + let copy_end = containing_vma.end(); + let copy_range = VMRange::new(old_range.start(), copy_end)?; + let reread_file_start_offset = copy_end - containing_vma.start(); + VMInitializer::CopyOldAndReadNew { + old_range: copy_range, + file: backed_file.clone(), + offset: reread_file_start_offset, + } + }; + let new_writeback_file = Some((backed_file.clone(), *offset)); + let mmap_opts = VMMapOptionsBuilder::default() + .size(new_size) + .addr(VMMapAddr::Force(new_addr)) + .perms(perms) + .initializer(vm_initializer_for_new_range) + .writeback_file(new_writeback_file) + .build()?; + let ret_addr = Some(new_addr); + (Some(mmap_opts), ret_addr) + } + _ => (None, Some(old_addr)), + }; + + let need_munmap = match (flags, size_type) { + (MRemapFlags::None, SizeType::Shrinking) + | (MRemapFlags::MayMove, SizeType::Shrinking) => { + let unmap_addr = old_addr + new_size; + let unmap_size = old_size - new_size; + Some((unmap_addr, unmap_size)) + } + (MRemapFlags::MayMove, SizeType::Growing) => { + if ret_addr.is_none() { + // We must need to do mmap. Thus unmap the old range + Some((old_addr, old_size)) + } else { + // We must choose to reuse the old range. Thus, no need to unmap + None + } + } + (MRemapFlags::FixedAddr(new_addr), _) => { + let new_range = VMRange::new_with_size(new_addr, new_size)?; + if new_range.overlap_with(&old_range) { + return_errno!(EINVAL, "new range cannot overlap with the old one"); + } + Some((old_addr, old_size)) + } + _ => None, + }; + + // Perform mmap and munmap if needed + if let Some(mmap_options) = need_mmap { + let mmap_addr = self.mmap(&mmap_options)?; + + if ret_addr.is_none() { + ret_addr = Some(mmap_addr); + } + } + if let Some((addr, size)) = need_munmap { + self.munmap(addr, size).expect("never fail"); + } + + debug_assert!(ret_addr.is_some()); + Ok(ret_addr.unwrap()) + } + } + + pub fn mprotect(&mut self, addr: usize, size: usize, new_perms: VMPerms) -> Result<()> { + let protect_range = VMRange::new_with_size(addr, size)?; + let bound = protect_range.start(); + let mut containing_vmas = self.vmas.upper_bound_mut(Bound::Included(&bound)); + if containing_vmas.is_null() { + return_errno!(ENOMEM, "invalid range"); + } + let current_pid = current!().process().pid(); + + // If a mprotect range is not a subrange of one vma, it must be subrange of multiple connecting vmas. 
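// Illustration (hypothetical addresses): mprotect-ing [0x2000, 0x3000) inside a single
// VMA [0x1000, 0x5000) splits it into three VMAs (the `2 =>` arm below):
// [0x1000, 0x2000) keeps the old permissions, [0x2000, 0x3000) gets the new ones, and
// [0x3000, 0x5000) keeps the old permissions. When the range spans several adjacent
// VMAs, the loop below visits each of them and changes only the intersecting part.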
+ while !containing_vmas.is_null() + && containing_vmas.get().unwrap().vma().start() <= protect_range.end() + { + let mut containing_vma = containing_vmas.get().unwrap().vma().clone(); + if containing_vma.pid() != current_pid { + containing_vmas.move_next(); + continue; + } + + let old_perms = containing_vma.perms(); + if new_perms == old_perms { + containing_vmas.move_next(); + continue; + } + + let intersection_vma = match containing_vma.intersect(&protect_range) { + None => { + containing_vmas.move_next(); + continue; + } + Some(intersection_vma) => intersection_vma, + }; + + if intersection_vma.range() == containing_vma.range() { + // The whole containing_vma is mprotected + containing_vma.set_perms(new_perms); + VMPerms::apply_perms(&containing_vma, containing_vma.perms()); + warn!("containing_vma = {:?}", containing_vma); + containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma)); + containing_vmas.move_next(); + continue; + } else { + // A subrange of containing_vma is mprotected + debug_assert!(containing_vma.is_superset_of(&intersection_vma)); + let mut remain_vmas = containing_vma.subtract(&intersection_vma); + match remain_vmas.len() { + 2 => { + // The containing VMA is divided into three VMAs: + // Shrinked old VMA: [containing_vma.start, protect_range.start) + // New VMA: [protect_range.start, protect_range.end) + // Another new vma: [protect_range.end, containing_vma.end) + let old_end = containing_vma.end(); + let protect_end = protect_range.end(); + + // Shrinked old VMA + containing_vma.set_end(protect_range.start()); + + // New VMA + let new_vma = VMArea::inherits_file_from( + &containing_vma, + protect_range, + new_perms, + current_pid, + ); + VMPerms::apply_perms(&new_vma, new_vma.perms()); + let new_vma = VMAObj::new_vma_obj(new_vma); + + // Another new VMA + let new_vma2 = { + let range = VMRange::new(protect_end, old_end).unwrap(); + let new_vma = VMArea::inherits_file_from( + &containing_vma, + range, + old_perms, + current_pid, + ); + VMAObj::new_vma_obj(new_vma) + }; + + containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma)); + containing_vmas.insert(new_vma); + containing_vmas.insert(new_vma2); + // In this case, there is no need to check other vmas. + break; + } + 1 => { + let remain_vma = remain_vmas.pop().unwrap(); + if remain_vma.start() == containing_vma.start() { + // mprotect right side of the vma + containing_vma.set_end(remain_vma.end()); + } else { + // mprotect left side of the vma + debug_assert!(remain_vma.end() == containing_vma.end()); + containing_vma.set_start(remain_vma.start()); + } + let new_vma = VMArea::inherits_file_from( + &containing_vma, + intersection_vma.range().clone(), + new_perms, + current_pid, + ); + VMPerms::apply_perms(&new_vma, new_vma.perms()); + + containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma)); + containing_vmas.insert(VMAObj::new_vma_obj(new_vma)); + containing_vmas.move_next(); + continue; + } + _ => unreachable!(), + } + } + } + + Ok(()) + } + + /// Sync all shared, file-backed memory mappings in the given range by flushing the + /// memory content to its underlying file. 
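+    /// Anonymous mappings within the range are left untouched.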
+ pub fn msync_by_range(&mut self, sync_range: &VMRange) -> Result<()> { + if !self.range().is_superset_of(sync_range) { + return_errno!(ENOMEM, "invalid range"); + } + + // ?FIXME: check if sync_range covers unmapped memory + for vma_obj in &self.vmas { + let vma = match vma_obj.vma().intersect(sync_range) { + None => continue, + Some(vma) => vma, + }; + Self::flush_file_vma(&vma); + } + Ok(()) + } + + /// Sync all shared, file-backed memory mappings of the given file by flushing + /// the memory content to the file. + pub fn msync_by_file(&mut self, sync_file: &FileRef) { + for vma_obj in &self.vmas { + let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) }; + Self::flush_file_vma_with_cond(&vma_obj.vma(), is_same_file); + } + } + + /// Flush a file-backed VMA to its file. This has no effect on anonymous VMA. + pub fn flush_file_vma(vma: &VMArea) { + Self::flush_file_vma_with_cond(vma, |_| true) + } + + /// Same as flush_vma, except that an extra condition on the file needs to satisfy. + pub fn flush_file_vma_with_cond bool>(vma: &VMArea, cond_fn: F) { + let (file, file_offset) = match vma.writeback_file().as_ref() { + None => return, + Some((file_and_offset)) => file_and_offset, + }; + let file_writable = file + .access_mode() + .map(|ac| ac.writable()) + .unwrap_or_default(); + if !file_writable { + return; + } + if !cond_fn(file) { + return; + } + file.write_at(*file_offset, unsafe { vma.as_slice() }); + } + + pub fn find_mmap_region(&self, addr: usize) -> Result { + let vma = self.vmas.upper_bound(Bound::Included(&addr)); + if vma.is_null() { + return_errno!(ESRCH, "no mmap regions that contains the address"); + } + let vma = vma.get().unwrap().vma(); + if vma.pid() != current!().process().pid() || !vma.contains(addr) { + return_errno!(ESRCH, "no mmap regions that contains the address"); + } + + return Ok(vma.range().clone()); + } + + pub fn usage_percentage(&self) -> f32 { + let totol_size = self.range.size(); + let mut used_size = 0; + self.vmas + .iter() + .for_each(|vma_obj| used_size += vma_obj.vma().size()); + + return used_size as f32 / totol_size as f32; + } + + // Returns whether the requested range is free + fn is_free_range(&self, request_range: &VMRange) -> bool { + self.range.is_superset_of(request_range) + && self + .vmas + .iter() + .any(|vma_obj| vma_obj.vma().range().is_superset_of(request_range) == true) + } +} + +impl Drop for ChunkManager { + fn drop(&mut self) { + assert!(self.is_empty()); + assert!(self.free_size == self.range.size()); + assert!(self.free_manager.free_size() == self.range.size()); + } +} diff --git a/src/libos/src/vm/vm_layout.rs b/src/libos/src/vm/vm_layout.rs index 578fa5c1..163c03d0 100644 --- a/src/libos/src/vm/vm_layout.rs +++ b/src/libos/src/vm/vm_layout.rs @@ -8,7 +8,7 @@ pub struct VMLayout { impl VMLayout { pub fn new(size: usize, align: usize) -> Result { - if !align.is_power_of_two() || align % PAGE_SIZE != 0 { + if !align.is_power_of_two() { return_errno!(EINVAL, "invalid layout"); } Ok(VMLayout { size, align }) diff --git a/src/libos/src/vm/vm_manager.rs b/src/libos/src/vm/vm_manager.rs index 6f8a2065..3ede121d 100644 --- a/src/libos/src/vm/vm_manager.rs +++ b/src/libos/src/vm/vm_manager.rs @@ -1,315 +1,36 @@ use super::*; +use super::chunk::{ + Chunk, ChunkID, ChunkRef, ChunkType, CHUNK_DEFAULT_SIZE, DUMMY_CHUNK_PROCESS_ID, +}; +use super::free_space_manager::VMFreeSpaceManager; use super::vm_area::VMArea; +use super::vm_chunk_manager::ChunkManager; use super::vm_perms::VMPerms; +use super::vm_util::*; +use 
crate::process::{ThreadRef, ThreadStatus}; +use std::ops::Bound::{Excluded, Included}; -#[derive(Clone, Debug)] -pub enum VMInitializer { - DoNothing(), - FillZeros(), - CopyFrom { - range: VMRange, - }, - LoadFromFile { - file: FileRef, - offset: usize, - }, - // For file-backed mremap which may move from old range to new range and read extra bytes from file - CopyOldAndReadNew { - old_range: VMRange, - file: FileRef, - offset: usize, // read file from this offset - }, -} +use crate::util::sync::rw_lock; +use std::collections::{BTreeSet, HashSet}; -impl Default for VMInitializer { - fn default() -> VMInitializer { - VMInitializer::DoNothing() - } -} - -impl VMInitializer { - pub fn init_slice(&self, buf: &mut [u8]) -> Result<()> { - match self { - VMInitializer::DoNothing() => { - // Do nothing - } - VMInitializer::FillZeros() => { - for b in buf { - *b = 0; - } - } - VMInitializer::CopyFrom { range } => { - let src_slice = unsafe { range.as_slice() }; - let copy_len = min(buf.len(), src_slice.len()); - buf[..copy_len].copy_from_slice(&src_slice[..copy_len]); - for b in &mut buf[copy_len..] { - *b = 0; - } - } - VMInitializer::LoadFromFile { file, offset } => { - // TODO: make sure that read_at does not move file cursor - let len = file - .read_at(*offset, buf) - .cause_err(|_| errno!(EIO, "failed to init memory from file"))?; - for b in &mut buf[len..] { - *b = 0; - } - } - VMInitializer::CopyOldAndReadNew { - old_range, - file, - offset, - } => { - // TODO: Handle old_range with non-readable subrange - let src_slice = unsafe { old_range.as_slice() }; - let copy_len = src_slice.len(); - debug_assert!(copy_len <= buf.len()); - let read_len = buf.len() - copy_len; - buf[..copy_len].copy_from_slice(&src_slice[..copy_len]); - let len = file - .read_at(*offset, &mut buf[copy_len..]) - .cause_err(|_| errno!(EIO, "failed to init memory from file"))?; - for b in &mut buf[(copy_len + len)..] 
{ - *b = 0; - } - } - } - Ok(()) - } -} - -#[derive(Clone, Copy, Debug, PartialEq)] -pub enum VMMapAddr { - Any, // Free to choose any address - Hint(usize), // Prefer the address, but can use other address - Need(usize), // Need to use the address, otherwise report error - Force(usize), // Force using the address by munmap first -} - -impl Default for VMMapAddr { - fn default() -> VMMapAddr { - VMMapAddr::Any - } -} - -#[derive(Builder, Debug)] -#[builder(pattern = "owned", build_fn(skip), no_std)] -pub struct VMMapOptions { - size: usize, - align: usize, - perms: VMPerms, - addr: VMMapAddr, - initializer: VMInitializer, - // The content of the VMA can be written back to a given file at a given offset - writeback_file: Option<(FileRef, usize)>, -} - -// VMMapOptionsBuilder is generated automatically, except the build function -impl VMMapOptionsBuilder { - pub fn build(mut self) -> Result { - let size = { - let size = self - .size - .ok_or_else(|| errno!(EINVAL, "invalid size for mmap"))?; - if size == 0 { - return_errno!(EINVAL, "invalid size for mmap"); - } - align_up(size, PAGE_SIZE) - }; - let align = { - let align = self.align.unwrap_or(PAGE_SIZE); - if align == 0 || align % PAGE_SIZE != 0 { - return_errno!(EINVAL, "invalid size for mmap"); - } - align - }; - let perms = self - .perms - .ok_or_else(|| errno!(EINVAL, "perms must be given"))?; - let addr = { - let addr = self.addr.unwrap_or_default(); - match addr { - // TODO: check addr + size overflow - VMMapAddr::Any => VMMapAddr::Any, - VMMapAddr::Hint(addr) => { - let addr = align_down(addr, PAGE_SIZE); - VMMapAddr::Hint(addr) - } - VMMapAddr::Need(addr_) | VMMapAddr::Force(addr_) => { - if addr_ % align != 0 { - return_errno!(EINVAL, "unaligned addr for fixed mmap"); - } - addr - } - } - }; - let initializer = match self.initializer.as_ref() { - Some(initializer) => initializer.clone(), - None => VMInitializer::default(), - }; - let writeback_file = self.writeback_file.take().unwrap_or_default(); - Ok(VMMapOptions { - size, - align, - perms, - addr, - initializer, - writeback_file, - }) - } -} - -impl VMMapOptions { - pub fn size(&self) -> &usize { - &self.size - } - - pub fn addr(&self) -> &VMMapAddr { - &self.addr - } - - pub fn perms(&self) -> &VMPerms { - &self.perms - } - - pub fn initializer(&self) -> &VMInitializer { - &self.initializer - } - - pub fn writeback_file(&self) -> &Option<(FileRef, usize)> { - &self.writeback_file - } -} +// Incorrect order of locks could cause deadlock easily. +// Don't hold a low-order lock and then try to get a high-order lock. 
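+// (For example, never acquire VMManager.internal while already holding a chunk's internal lock.)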
+// High order -> Low order: +// VMManager.internal > ProcessVM.mem_chunks > locks in chunks #[derive(Debug)] -pub struct VMRemapOptions { - old_addr: usize, - old_size: usize, - new_size: usize, - flags: MRemapFlags, -} - -impl VMRemapOptions { - pub fn new( - old_addr: usize, - old_size: usize, - new_size: usize, - flags: MRemapFlags, - ) -> Result { - let old_addr = if old_addr % PAGE_SIZE != 0 { - return_errno!(EINVAL, "unaligned old address"); - } else { - old_addr - }; - let old_size = if old_size == 0 { - // TODO: support old_size is zero for shareable mapping - warn!("do not support old_size is zero"); - return_errno!(EINVAL, "invalid old size"); - } else { - align_up(old_size, PAGE_SIZE) - }; - if let Some(new_addr) = flags.new_addr() { - if new_addr % PAGE_SIZE != 0 { - return_errno!(EINVAL, "unaligned new address"); - } - } - let new_size = if new_size == 0 { - return_errno!(EINVAL, "invalid new size"); - } else { - align_up(new_size, PAGE_SIZE) - }; - Ok(Self { - old_addr, - old_size, - new_size, - flags, - }) - } - - pub fn old_addr(&self) -> usize { - self.old_addr - } - - pub fn old_size(&self) -> usize { - self.old_size - } - - pub fn new_size(&self) -> usize { - self.new_size - } - - pub fn flags(&self) -> MRemapFlags { - self.flags - } -} - -/// Memory manager. -/// -/// VMManager provides useful memory management APIs such as mmap, munmap, mremap, etc. It also manages the whole -/// process VM including mmap, stack, heap, elf ranges. -/// -/// # Invariants -/// -/// Behind the scene, VMManager maintains a list of VMArea that have been allocated. -/// (denoted as `self.vmas`). To reason about the correctness of VMManager, we give -/// the set of invariants hold by VMManager. -/// -/// 1. The rule of sentry: -/// ``` -/// self.range.begin() == self.vmas[0].start() == self.vmas[0].end() -/// ``` -/// and -/// ``` -/// self.range.end() == self.vmas[N-1].start() == self.vmas[N-1].end() -/// ``` -/// where `N = self.vmas.len()`. -/// -/// 2. The rule of non-emptyness: -/// ``` -/// self.vmas[i].size() > 0, for 1 <= i < self.vmas.len() - 1 -/// ``` -/// -/// 3. The rule of ordering: -/// ``` -/// self.vmas[i].end() <= self.vmas[i+1].start() for 0 <= i < self.vmas.len() - 1 -/// ``` -/// -/// 4. 
The rule of non-mergablility: -/// ``` -/// self.vmas[i].end() != self.vmas[i+1].start() || self.vmas[i].perms() != self.vmas[i+1].perms() -/// for 1 <= i < self.vmas.len() - 2 -/// ``` -/// -#[derive(Debug, Default)] pub struct VMManager { range: VMRange, - vmas: Vec, - mmap_prefered_start_addr: usize, // Prefer to alloc mmap range starting this address + internal: SgxMutex, } impl VMManager { - pub fn from(addr: usize, size: usize) -> Result { - let range = VMRange::new(addr, addr + size)?; - let vmas = { - let start = range.start(); - let end = range.end(); - let start_sentry = { - let range = VMRange::new_empty(start)?; - let perms = VMPerms::empty(); - VMArea::new(range, perms, None) - }; - let end_sentry = { - let range = VMRange::new_empty(end)?; - let perms = VMPerms::empty(); - VMArea::new(range, perms, None) - }; - vec![start_sentry, end_sentry] - }; + pub fn init(vm_range: VMRange) -> Result { + let internal = InternalVMManager::init(vm_range.clone()); Ok(VMManager { - range, - vmas, - mmap_prefered_start_addr: addr, // make it the start of VMManger range by default + range: vm_range, + internal: SgxMutex::new(internal), }) } @@ -317,680 +38,677 @@ impl VMManager { &self.range } - pub fn vmas(&self) -> &Vec { - &self.vmas + fn internal(&self) -> SgxMutexGuard { + self.internal.lock().unwrap() } - // This is used to set the mmap prefered start address for VMManager - pub fn set_mmap_prefered_start_addr(&mut self, addr: usize) { - self.mmap_prefered_start_addr = addr + pub fn free_size(&self) -> usize { + self.internal().free_manager.free_size() } - pub fn mmap(&mut self, mut options: VMMapOptions) -> Result { - // TODO: respect options.align when mmap + pub fn verified_clean_when_exit(&self) -> bool { + let internal = self.internal(); + internal.chunks.len() == 0 && internal.free_manager.free_size() == self.range.size() + } + + pub fn free_chunk(&self, chunk: &ChunkRef) { + let mut internal = self.internal(); + internal.free_chunk(chunk); + } + + // Allocate single VMA chunk for new process whose process VM is not ready yet + pub fn alloc(&self, options: &VMMapOptions) -> Result<(VMRange, ChunkRef)> { + let addr = *options.addr(); + let size = *options.size(); + if let Ok(new_chunk) = self.internal().mmap_chunk(options) { + return Ok((new_chunk.range().clone(), new_chunk)); + } + return_errno!(ENOMEM, "can't allocate free chunks"); + } + + pub fn mmap(&self, options: &VMMapOptions) -> Result { + let addr = *options.addr(); + let size = *options.size(); + let align = *options.align(); + + match addr { + VMMapAddr::Any => {} + VMMapAddr::Hint(addr) => { + let target_range = unsafe { VMRange::from_unchecked(addr, addr + size) }; + let ret = self.mmap_with_addr(target_range, options); + if ret.is_ok() { + return ret; + } + } + VMMapAddr::Need(addr) | VMMapAddr::Force(addr) => { + let target_range = unsafe { VMRange::from_unchecked(addr, addr + size) }; + return self.mmap_with_addr(target_range, options); + } + } + + if size > CHUNK_DEFAULT_SIZE { + if let Ok(new_chunk) = self.internal().mmap_chunk(options) { + let start = new_chunk.range().start(); + current!().vm().add_mem_chunk(new_chunk); + return Ok(start); + } else { + return_errno!(ENOMEM, "can't allocate free chunks"); + } + } + + // Allocate in default chunk + let current = current!(); + { + // Fast path: Try to go to assigned chunks to do mmap + // There is no lock on VMManager in this path. 
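+            // Only the multi-VMA (default) chunks already owned by this process are
+            // tried here; single-VMA chunks are skipped by the filter below.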
+ let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + for chunk in process_mem_chunks + .iter() + .filter(|&chunk| !chunk.is_single_vma()) + { + let result_start = chunk.try_mmap(options); + if result_start.is_ok() { + return result_start; + } + } + } + + // Process' chunks are all busy or can't allocate from process_mem_chunks list. + // Allocate a new chunk with chunk default size. + // Lock on ChunkManager. + if let Ok(new_chunk) = self.internal().mmap_chunk_default(addr) { + // Allocate in the new chunk + let start = new_chunk.mmap(options); + debug_assert!(start.is_ok()); // We just allocate a chunk for you. You must succeed. + // Add this new chunk to process' chunk list + new_chunk.add_process(¤t); + current.vm().add_mem_chunk(new_chunk); + return start; + } + + // Slow path: Sadly, there is no free chunk, iterate every chunk to find a range + { + // Release lock after this block + let mut result_start = Ok(0); + let chunks = &self.internal().chunks; + let chunk = chunks + .iter() + .filter(|&chunk| !chunk.is_single_vma()) + .find(|&chunk| { + result_start = chunk.mmap(options); + result_start.is_ok() + }); + if let Some(chunk) = chunk { + chunk.add_process(¤t); + current.vm().add_mem_chunk(chunk.clone()); + return result_start; + } + } + + // Can't find a range in default chunks. Maybe there is still free range in the global free list. + if let Ok(new_chunk) = self.internal().mmap_chunk(options) { + let start = new_chunk.range().start(); + current!().vm().add_mem_chunk(new_chunk); + return Ok(start); + } + + // No free range + return_errno!(ENOMEM, "Can't find a free chunk for this allocation"); + } + + // If addr is specified, use single VMA chunk to record this + fn mmap_with_addr(&self, range: VMRange, options: &VMMapOptions) -> Result { let addr = *options.addr(); let size = *options.size(); - if let VMMapAddr::Force(addr) = addr { - self.munmap(addr, size)?; + let current = current!(); + + let chunk = { + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + process_mem_chunks + .iter() + .find(|&chunk| chunk.range().intersect(&range).is_some()) + .cloned() + }; + + if let Some(chunk) = chunk { + // This range is currently in a allocated chunk + match chunk.internal() { + ChunkType::MultiVMA(chunk_internal) => { + // If the chunk only intersect, but not a superset, we can't handle this. + if !chunk.range().is_superset_of(&range) { + return_errno!(EINVAL, "mmap with specified addr spans over two chunks"); + } + trace!( + "mmap with addr in existing default chunk: {:?}", + chunk.range() + ); + return chunk_internal.lock().unwrap().chunk_manager().mmap(options); + } + ChunkType::SingleVMA(_) => { + match addr { + VMMapAddr::Hint(addr) => { + return_errno!(ENOMEM, "Single VMA is currently in use. Hint failure"); + } + VMMapAddr::Need(addr) => { + return_errno!(ENOMEM, "Single VMA is currently in use. Need failure"); + } + VMMapAddr::Force(addr) => { + // Munmap the corresponding single vma chunk + // If the chunk only intersect, but not a superset, we can't handle this. 
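+                        // MAP_FIXED semantics: the overlapping part of the old
+                        // single-VMA chunk is unmapped here, and the new mapping
+                        // is allocated from the global free list further below.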
+ if !chunk.range().is_superset_of(&range) { + trace!( + "chunk range = {:?}, target range = {:?}", + chunk.range(), + range + ); + return_errno!(EINVAL, "mmap with specified addr spans two chunks"); + } + let mut internal_manager = self.internal(); + internal_manager.munmap_chunk(&chunk, Some(&range))?; + } + VMMapAddr::Any => unreachable!(), + } + } + } } - // Allocate a new range for this mmap request - let (insert_idx, free_range) = self.find_free_range(size, addr)?; - let new_range = self.alloc_range_from(size, addr, &free_range); - let new_addr = new_range.start(); - let writeback_file = options.writeback_file.take(); - let new_vma = VMArea::new(new_range, *options.perms(), writeback_file); - - // Initialize the memory of the new range - unsafe { - let buf = new_vma.as_slice_mut(); - options.initializer.init_slice(buf)?; + // This range is not currently using, allocate one in global list + if let Ok(new_chunk) = self.internal().mmap_chunk(options) { + let start = new_chunk.range().start(); + debug_assert!({ + match addr { + VMMapAddr::Force(addr) | VMMapAddr::Need(addr) => start == range.start(), + _ => true, + } + }); + current.vm().add_mem_chunk(new_chunk); + return Ok(start); + } else { + return_errno!(ENOMEM, "can't allocate a chunk in global list") } - // Set memory permissions - if !options.perms.is_default() { - Self::apply_perms(&new_vma, new_vma.perms()); - } - - // After initializing, we can safely insert the new VMA - self.insert_new_vma(insert_idx, new_vma); - Ok(new_addr) } - pub fn munmap(&mut self, addr: usize, size: usize) -> Result<()> { + pub fn munmap(&self, addr: usize, size: usize) -> Result<()> { + // Go to every process chunk to see if it contains the range. let size = { if size == 0 { return_errno!(EINVAL, "size of munmap must not be zero"); } align_up(size, PAGE_SIZE) }; - let munmap_range = { - let munmap_range = VMRange::new(addr, addr + size)?; - - let effective_munmap_range_opt = munmap_range.intersect(&self.range); - if effective_munmap_range_opt.is_none() { + let munmap_range = { VMRange::new(addr, addr + size) }?; + let chunk = { + let current = current!(); + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + let chunk = process_mem_chunks + .iter() + .find(|&chunk| chunk.range().intersect(&munmap_range).is_some()); + if chunk.is_none() { + // Note: + // The man page of munmap states that "it is not an error if the indicated + // range does not contain any mapped pages". This is not considered as + // an error! 
+ trace!("the munmap range is not mapped"); return Ok(()); } - - let effective_munmap_range = effective_munmap_range_opt.unwrap(); - if effective_munmap_range.empty() { - return Ok(()); - } - effective_munmap_range + chunk.unwrap().clone() }; - let old_vmas = { - let mut old_vmas = Vec::new(); - std::mem::swap(&mut self.vmas, &mut old_vmas); - old_vmas - }; - let new_vmas = old_vmas - .into_iter() - .flat_map(|vma| { - // Keep the two sentry VMA intact - if vma.size() == 0 { - return vec![vma]; + if !chunk.range().is_superset_of(&munmap_range) { + // munmap range spans multiple chunks + let munmap_single_vma_chunks = { + let current = current!(); + let mut process_mem_chunks = current.vm().mem_chunks().write().unwrap(); + let munmap_single_vma_chunks = process_mem_chunks + .drain_filter(|p_chunk| { + p_chunk.is_single_vma() && p_chunk.range().overlap_with(&munmap_range) + }) + .collect::>(); + if munmap_single_vma_chunks + .iter() + .find(|chunk| !munmap_range.is_superset_of(chunk.range())) + .is_some() + { + // TODO: Support munmap multiple single VMA chunk with remaining ranges. + return_errno!( + EINVAL, + "munmap multiple chunks with remaining ranges is not supported" + ); } - let intersection_vma = match vma.intersect(&munmap_range) { - None => return vec![vma], - Some(intersection_vma) => intersection_vma, - }; - - // File-backed VMA needs to be flushed upon munmap - Self::flush_file_vma(&intersection_vma); - - // Reset memory permissions - if !&intersection_vma.perms().is_default() { - Self::apply_perms(&intersection_vma, VMPerms::default()); + // TODO: Support munmap a part of default chunks + // Check munmap default chunks + if process_mem_chunks + .iter() + .find(|p_chunk| p_chunk.range().overlap_with(&munmap_range)) + .is_some() + { + return_errno!( + EINVAL, + "munmap range overlap with default chunks is not supported" + ); } + munmap_single_vma_chunks + }; - vma.subtract(&intersection_vma) - }) - .collect(); - self.vmas = new_vmas; - Ok(()) - } - - pub fn mremap(&mut self, options: &VMRemapOptions) -> Result { - let old_addr = options.old_addr(); - let old_size = options.old_size(); - let old_range = VMRange::new_with_size(old_addr, old_size)?; - let new_size = options.new_size(); - let flags = options.flags(); - - #[derive(Clone, Copy, PartialEq)] - enum SizeType { - Same, - Shrinking, - Growing, - }; - let size_type = if new_size == old_size { - SizeType::Same - } else if new_size < old_size { - SizeType::Shrinking - } else { - SizeType::Growing - }; - // The old range must be contained in one VMA - let idx = self - .find_containing_vma_idx(&old_range) - .ok_or_else(|| errno!(EFAULT, "invalid range"))?; - let containing_vma = &self.vmas[idx]; - // Get the memory permissions of the old range - let perms = containing_vma.perms(); - // Get the write back file of the old range if there is one. - let writeback_file = containing_vma.writeback_file(); - - // FIXME: Current implementation for file-backed memory mremap has limitation that if a SUBRANGE of the previous - // file-backed mmap with MAP_SHARED is then mremap-ed with MREMAP_MAYMOVE, there will be two vmas that have the same backed file. - // For Linux, writing to either memory vma or the file will update the other two equally. But we won't be able to support this before - // we really have paging. Thus, if the old_range is not equal to a recorded vma, we will just return with error. 
- if writeback_file.is_some() && &old_range != containing_vma.range() { - return_errno!(EINVAL, "Known limition") - } - - // Implement mremap as one optional mmap followed by one optional munmap. - // - // The exact arguments for the mmap and munmap are determined by the values of MRemapFlags, - // SizeType and writeback_file. There is a total of 18 combinations among MRemapFlags and - // SizeType and writeback_file. As some combinations result in the same mmap and munmap operations, - // the following code only needs to match below patterns of (MRemapFlags, SizeType, writeback_file) - // and treat each case accordingly. - - // Determine whether need to do mmap. And when possible, determine the returned address - let (need_mmap, mut ret_addr) = match (flags, size_type, writeback_file) { - (MRemapFlags::None, SizeType::Growing, None) => { - let vm_initializer_for_new_range = VMInitializer::FillZeros(); - let mmap_opts = VMMapOptionsBuilder::default() - .size(new_size - old_size) - .addr(VMMapAddr::Need(old_range.end())) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .build()?; - let ret_addr = Some(old_addr); - (Some(mmap_opts), ret_addr) - } - (MRemapFlags::None, SizeType::Growing, Some((backed_file, offset))) => { - // Update writeback file offset - let new_writeback_file = - Some((backed_file.clone(), offset + containing_vma.size())); - let vm_initializer_for_new_range = VMInitializer::LoadFromFile { - file: backed_file.clone(), - offset: offset + containing_vma.size(), // file-backed mremap should start from the end of previous mmap/mremap file - }; - let mmap_opts = VMMapOptionsBuilder::default() - .size(new_size - old_size) - .addr(VMMapAddr::Need(old_range.end())) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .writeback_file(new_writeback_file) - .build()?; - let ret_addr = Some(old_addr); - (Some(mmap_opts), ret_addr) - } - (MRemapFlags::MayMove, SizeType::Growing, None) => { - let prefered_new_range = - VMRange::new_with_size(old_addr + old_size, new_size - old_size)?; - if self.is_free_range(&prefered_new_range) { - // Don't need to move the old range - let vm_initializer_for_new_range = VMInitializer::FillZeros(); - let mmap_ops = VMMapOptionsBuilder::default() - .size(prefered_new_range.size()) - .addr(VMMapAddr::Need(prefered_new_range.start())) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .build()?; - (Some(mmap_ops), Some(old_addr)) - } else { - // Need to move old range to a new range and init the new range - let vm_initializer_for_new_range = VMInitializer::CopyFrom { range: old_range }; - let mmap_ops = VMMapOptionsBuilder::default() - .size(new_size) - .addr(VMMapAddr::Any) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .build()?; - // Cannot determine the returned address for now, which can only be obtained after calling mmap - let ret_addr = None; - (Some(mmap_ops), ret_addr) - } - } - (MRemapFlags::MayMove, SizeType::Growing, Some((backed_file, offset))) => { - let prefered_new_range = - VMRange::new_with_size(old_addr + old_size, new_size - old_size)?; - if self.is_free_range(&prefered_new_range) { - // Don't need to move the old range - let vm_initializer_for_new_range = VMInitializer::LoadFromFile { - file: backed_file.clone(), - offset: offset + containing_vma.size(), // file-backed mremap should start from the end of previous mmap/mremap file - }; - // Write back file should start from new offset - let new_writeback_file = - Some((backed_file.clone(), offset + containing_vma.size())); - let 
mmap_ops = VMMapOptionsBuilder::default() - .size(prefered_new_range.size()) - .addr(VMMapAddr::Need(prefered_new_range.start())) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .writeback_file(new_writeback_file) - .build()?; - (Some(mmap_ops), Some(old_addr)) - } else { - // Need to move old range to a new range and init the new range - let vm_initializer_for_new_range = { - let copy_end = containing_vma.end(); - let copy_range = VMRange::new(old_range.start(), copy_end)?; - let reread_file_start_offset = copy_end - containing_vma.start(); - VMInitializer::CopyOldAndReadNew { - old_range: copy_range, - file: backed_file.clone(), - offset: reread_file_start_offset, - } - }; - let new_writeback_file = Some((backed_file.clone(), *offset)); - let mmap_ops = VMMapOptionsBuilder::default() - .size(new_size) - .addr(VMMapAddr::Any) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .writeback_file(new_writeback_file) - .build()?; - // Cannot determine the returned address for now, which can only be obtained after calling mmap - let ret_addr = None; - (Some(mmap_ops), ret_addr) - } - } - (MRemapFlags::FixedAddr(new_addr), _, None) => { - let vm_initializer_for_new_range = { VMInitializer::CopyFrom { range: old_range } }; - let mmap_opts = VMMapOptionsBuilder::default() - .size(new_size) - .addr(VMMapAddr::Force(new_addr)) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .build()?; - let ret_addr = Some(new_addr); - (Some(mmap_opts), ret_addr) - } - (MRemapFlags::FixedAddr(new_addr), _, Some((backed_file, offset))) => { - let vm_initializer_for_new_range = { - let copy_end = containing_vma.end(); - let copy_range = VMRange::new(old_range.start(), copy_end)?; - let reread_file_start_offset = copy_end - containing_vma.start(); - VMInitializer::CopyOldAndReadNew { - old_range: copy_range, - file: backed_file.clone(), - offset: reread_file_start_offset, - } - }; - let new_writeback_file = Some((backed_file.clone(), *offset)); - let mmap_opts = VMMapOptionsBuilder::default() - .size(new_size) - .addr(VMMapAddr::Force(new_addr)) - .perms(perms) - .initializer(vm_initializer_for_new_range) - .writeback_file(new_writeback_file) - .build()?; - let ret_addr = Some(new_addr); - (Some(mmap_opts), ret_addr) - } - _ => (None, Some(old_addr)), - }; - - let need_munmap = match (flags, size_type) { - (MRemapFlags::None, SizeType::Shrinking) - | (MRemapFlags::MayMove, SizeType::Shrinking) => { - let unmap_addr = old_addr + new_size; - let unmap_size = old_size - new_size; - Some((unmap_addr, unmap_size)) - } - (MRemapFlags::MayMove, SizeType::Growing) => { - if ret_addr.is_none() { - // We must need to do mmap. Thus unmap the old range - Some((old_addr, old_size)) - } else { - // We must choose to reuse the old range. 
Thus, no need to unmap - None - } - } - (MRemapFlags::FixedAddr(new_addr), _) => { - let new_range = VMRange::new_with_size(new_addr, new_size)?; - if new_range.overlap_with(&old_range) { - return_errno!(EINVAL, "new range cannot overlap with the old one"); - } - Some((old_addr, old_size)) - } - _ => None, - }; - - // Perform mmap and munmap if needed - if let Some(mmap_options) = need_mmap { - let mmap_addr = self.mmap(mmap_options)?; - - if ret_addr.is_none() { - ret_addr = Some(mmap_addr); - } - } - if let Some((addr, size)) = need_munmap { - self.munmap(addr, size).expect("never fail"); - } - - debug_assert!(ret_addr.is_some()); - Ok(ret_addr.unwrap()) - } - - pub fn mprotect(&mut self, addr: usize, size: usize, new_perms: VMPerms) -> Result<()> { - let protect_range = VMRange::new_with_size(addr, size)?; - - // FIXME: the current implementation requires the target range to be - // contained in exact one VMA. - let containing_idx = self - .find_containing_vma_idx(&protect_range) - .ok_or_else(|| errno!(ENOMEM, "invalid range"))?; - let containing_vma = &self.vmas[containing_idx]; - - let old_perms = containing_vma.perms(); - if new_perms == old_perms { + let mut internl_manager = self.internal(); + munmap_single_vma_chunks.iter().for_each(|p_chunk| { + internl_manager.munmap_chunk(p_chunk, None); + }); return Ok(()); } - let same_start = protect_range.start() == containing_vma.start(); - let same_end = protect_range.end() == containing_vma.end(); - let containing_vma = &mut self.vmas[containing_idx]; - match (same_start, same_end) { - (true, true) => { - containing_vma.set_perms(new_perms); - - Self::apply_perms(containing_vma, containing_vma.perms()); + match chunk.internal() { + ChunkType::MultiVMA(manager) => { + return manager + .lock() + .unwrap() + .chunk_manager() + .munmap_range(munmap_range); } - (false, true) => { - containing_vma.set_end(protect_range.start()); - - let new_vma = VMArea::inherits_file_from(containing_vma, protect_range, new_perms); - Self::apply_perms(&new_vma, new_vma.perms()); - self.insert_new_vma(containing_idx + 1, new_vma); - } - (true, false) => { - containing_vma.set_start(protect_range.end()); - - let new_vma = VMArea::inherits_file_from(containing_vma, protect_range, new_perms); - Self::apply_perms(&new_vma, new_vma.perms()); - self.insert_new_vma(containing_idx, new_vma); - } - (false, false) => { - // The containing VMA is divided into three VMAs: - // Shrinked old VMA: [containing_vma.start, protect_range.start) - // New VMA: [protect_range.start, protect_range.end) - // Another new vma: [protect_range.end, containing_vma.end) - - let old_end = containing_vma.end(); - let protect_end = protect_range.end(); - - // Shrinked old VMA - containing_vma.set_end(protect_range.start()); - - // New VMA - let new_vma = VMArea::inherits_file_from(containing_vma, protect_range, new_perms); - Self::apply_perms(&new_vma, new_vma.perms()); - - // Another new VMA - let new_vma2 = { - let range = VMRange::new(protect_end, old_end).unwrap(); - VMArea::inherits_file_from(containing_vma, range, old_perms) - }; - - drop(containing_vma); - self.insert_new_vma(containing_idx + 1, new_vma); - self.insert_new_vma(containing_idx + 2, new_vma2); + ChunkType::SingleVMA(_) => { + let mut internal_manager = self.internal(); + return internal_manager.munmap_chunk(&chunk, Some(&munmap_range)); } } - - Ok(()) } - /// Sync all shared, file-backed memory mappings in the given range by flushing the - /// memory content to its underlying file. 
- pub fn msync_by_range(&mut self, sync_range: &VMRange) -> Result<()> { - if !self.range().is_superset_of(&sync_range) { - return_errno!(ENOMEM, "invalid range"); - } - - // FIXME: check if sync_range covers unmapped memory - for vma in &self.vmas { - let vma = match vma.intersect(sync_range) { - None => continue, - Some(vma) => vma, - }; - Self::flush_file_vma(&vma); - } - Ok(()) + pub fn find_mmap_region(&self, addr: usize) -> Result { + let current = current!(); + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + let mut vm_range = Ok(Default::default()); + process_mem_chunks.iter().find(|&chunk| { + vm_range = chunk.find_mmap_region(addr); + vm_range.is_ok() + }); + return vm_range; } - /// Sync all shared, file-backed memory mappings of the given file by flushing - /// the memory content to the file. - pub fn msync_by_file(&mut self, sync_file: &FileRef) { - for vma in &self.vmas { - let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) }; - Self::flush_file_vma_with_cond(vma, is_same_file); - } - } - - /// Flush a file-backed VMA to its file. This has no effect on anonymous VMA. - fn flush_file_vma(vma: &VMArea) { - Self::flush_file_vma_with_cond(vma, |_| true) - } - - /// Same as flush_vma, except that an extra condition on the file needs to satisfy. - fn flush_file_vma_with_cond bool>(vma: &VMArea, cond_fn: F) { - let (file, file_offset) = match vma.writeback_file().as_ref() { - None => return, - Some((file_and_offset)) => file_and_offset, - }; - let file_writable = file - .access_mode() - .map(|ac| ac.writable()) - .unwrap_or_default(); - if !file_writable { - return; - } - if !cond_fn(file) { - return; - } - file.write_at(*file_offset, unsafe { vma.as_slice() }); - } - - pub fn find_mmap_region(&self, addr: usize) -> Result<&VMRange> { - self.vmas - .iter() - .map(|vma| vma.range()) - .find(|vma| vma.contains(addr)) - .ok_or_else(|| errno!(ESRCH, "no mmap regions that contains the address")) - } - - pub fn usage_percentage(&self) -> f32 { - let totol_size = self.range.size(); - let mut used_size = 0; - self.vmas.iter().for_each(|vma| used_size += vma.size()); - - return used_size as f32 / totol_size as f32; - } - - // Find a VMA that contains the given range, returning the VMA's index - fn find_containing_vma_idx(&self, target_range: &VMRange) -> Option { - self.vmas - .iter() - .position(|vma| vma.is_superset_of(target_range)) - } - - // Returns whether the requested range is free - fn is_free_range(&self, request_range: &VMRange) -> bool { - self.range.is_superset_of(request_range) - && self - .vmas + pub fn mprotect(&self, addr: usize, size: usize, perms: VMPerms) -> Result<()> { + let protect_range = VMRange::new_with_size(addr, size)?; + let chunk = { + let current = current!(); + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + let chunk = process_mem_chunks .iter() - .all(|range| range.overlap_with(request_range) == false) - } - - // Find the free range that satisfies the constraints of size and address - fn find_free_range(&self, size: usize, addr: VMMapAddr) -> Result<(usize, VMRange)> { - // TODO: reduce the complexity from O(N) to O(log(N)), where N is - // the number of existing VMAs. 
- - let mmap_prefered_start_addr = self.mmap_prefered_start_addr; - // Record the minimal free range that satisfies the contraints8 - let mut result_free_range: Option = None; - let mut result_idx: Option = None; - - for (idx, range_pair) in self.vmas.windows(2).enumerate() { - // Since we have two sentry vmas at both ends, we can be sure that the free - // space only appears between two consecutive vmas. - let pre_range = &range_pair[0]; - let next_range = &range_pair[1]; - - let mut free_range = { - let free_range_start = pre_range.end(); - let free_range_end = next_range.start(); - - let free_range_size = free_range_end - free_range_start; - if free_range_size < size { - continue; - } - - unsafe { VMRange::from_unchecked(free_range_start, free_range_end) } - }; - - match addr { - // Want a minimal free_range - VMMapAddr::Any => {} - // Prefer to have free_range.start == addr - VMMapAddr::Hint(addr) => { - if free_range.contains(addr) { - if free_range.end() - addr >= size { - free_range.start = addr; - let insert_idx = idx + 1; - return Ok((insert_idx, free_range)); - } - } - } - // Must have free_range.start == addr - VMMapAddr::Need(addr) | VMMapAddr::Force(addr) => { - if free_range.start() > addr { - return_errno!(ENOMEM, "not enough memory for fixed mmap"); - } - if !free_range.contains(addr) { - continue; - } - if free_range.end() - addr < size { - return_errno!(ENOMEM, "not enough memory for fixed mmap"); - } - free_range.start = addr; - let insert_idx = idx + 1; - return Ok((insert_idx, free_range)); - } + .find(|&chunk| chunk.range().intersect(&protect_range).is_some()); + if chunk.is_none() { + return_errno!(ENOMEM, "invalid range"); } - - if result_free_range == None - || result_free_range.as_ref().unwrap().size() > free_range.size() - // Preferentially alloc range above mmap_prefered_start_addr - || (result_free_range.as_ref().unwrap().end() < mmap_prefered_start_addr - && mmap_prefered_start_addr <= free_range.start()) - { - result_free_range = Some(free_range); - result_idx = Some(idx); - } - } - - if result_free_range.is_none() { - let usage = self.usage_percentage(); - debug!( - "Not enough memory to allocate {} bytes. Current memory usage is {}%", - size, - usage * 100 as f32 - ); - return_errno!(ENOMEM, "not enough memory"); - } - - let free_range = result_free_range.unwrap(); - let insert_idx = result_idx.unwrap() + 1; - Ok((insert_idx, free_range)) - } - - fn alloc_range_from(&self, size: usize, addr: VMMapAddr, free_range: &VMRange) -> VMRange { - debug_assert!(free_range.size() >= size); - - let mut new_range = *free_range; - - if let VMMapAddr::Need(addr) = addr { - debug_assert!(addr == new_range.start()); - } - if let VMMapAddr::Force(addr) = addr { - debug_assert!(addr == new_range.start()); - } - - new_range.resize(size); - new_range - } - - // Insert a new VMA, and when possible, merge it with its neighbors. 
- fn insert_new_vma(&mut self, insert_idx: usize, new_vma: VMArea) { - // New VMA can only be inserted between the two sentry VMAs - debug_assert!(0 < insert_idx && insert_idx < self.vmas.len()); - - let left_idx = insert_idx - 1; - let right_idx = insert_idx; - - let left_vma = &self.vmas[left_idx]; - let right_vma = &self.vmas[right_idx]; - - // Double check the order - debug_assert!(left_vma.end() <= new_vma.start()); - debug_assert!(new_vma.end() <= right_vma.start()); - - let left_mergable = Self::can_merge_vmas(left_vma, &new_vma); - let right_mergable = Self::can_merge_vmas(&new_vma, right_vma); - - drop(left_vma); - drop(right_vma); - - match (left_mergable, right_mergable) { - (false, false) => { - self.vmas.insert(insert_idx, new_vma); - } - (true, false) => { - self.vmas[left_idx].set_end(new_vma.end); - } - (false, true) => { - self.vmas[right_idx].set_start(new_vma.start); - } - (true, true) => { - let left_new_end = self.vmas[right_idx].end(); - self.vmas[left_idx].set_end(left_new_end); - self.vmas.remove(right_idx); - } - } - } - - fn can_merge_vmas(left: &VMArea, right: &VMArea) -> bool { - debug_assert!(left.end() <= right.start()); - - // Both of the two VMAs must not be sentry (whose size == 0) - if left.size() == 0 || right.size() == 0 { - return false; - } - // The two VMAs must border with each other - if left.end() != right.start() { - return false; - } - // The two VMAs must have the same memory permissions - if left.perms() != right.perms() { - return false; - } - - // If the two VMAs have write-back files, the files must be the same and - // the two file regions must be continuous. - let left_writeback_file = left.writeback_file(); - let right_writeback_file = right.writeback_file(); - match (left_writeback_file, right_writeback_file) { - (None, None) => true, - (Some(_), None) => false, - (None, Some(_)) => false, - (Some((left_file, left_offset)), Some((right_file, right_offset))) => { - Arc::ptr_eq(&left_file, &right_file) - && right_offset > left_offset - && right_offset - left_offset == left.size() - } - } - } - - fn apply_perms(protect_range: &VMRange, perms: VMPerms) { - extern "C" { - pub fn occlum_ocall_mprotect( - retval: *mut i32, - addr: *const c_void, - len: usize, - prot: i32, - ) -> sgx_status_t; + chunk.unwrap().clone() }; - unsafe { - let mut retval = 0; - let addr = protect_range.start() as *const c_void; - let len = protect_range.size(); - let prot = perms.bits() as i32; - let sgx_status = occlum_ocall_mprotect(&mut retval, addr, len, prot); - assert!(sgx_status == sgx_status_t::SGX_SUCCESS && retval == 0); + // TODO: Support mprotect range spans multiple chunks + if !chunk.range().is_superset_of(&protect_range) { + return_errno!(EINVAL, "mprotect range is not in a single chunk"); } + + match chunk.internal() { + ChunkType::MultiVMA(manager) => { + trace!("mprotect default chunk: {:?}", chunk.range()); + return manager + .lock() + .unwrap() + .chunk_manager() + .mprotect(addr, size, perms); + } + ChunkType::SingleVMA(_) => { + let mut internal_manager = self.internal(); + return internal_manager.mprotect_single_vma_chunk(&chunk, protect_range, perms); + } + } + } + + pub fn msync(&self, addr: usize, size: usize) -> Result<()> { + let sync_range = VMRange::new_with_size(addr, size)?; + let chunk = { + let current = current!(); + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + let chunk = process_mem_chunks + .iter() + .find(|&chunk| chunk.range().is_superset_of(&sync_range)); + if chunk.is_none() { + return_errno!(ENOMEM, 
"invalid range"); + } + chunk.unwrap().clone() + }; + + match chunk.internal() { + ChunkType::MultiVMA(manager) => { + trace!("msync default chunk: {:?}", chunk.range()); + return manager + .lock() + .unwrap() + .chunk_manager() + .msync_by_range(&sync_range); + } + ChunkType::SingleVMA(vma) => { + let vma = vma.lock().unwrap(); + ChunkManager::flush_file_vma(&vma); + } + } + Ok(()) + } + + pub fn msync_by_file(&self, sync_file: &FileRef) { + let current = current!(); + let process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) }; + process_mem_chunks + .iter() + .for_each(|chunk| match chunk.internal() { + ChunkType::MultiVMA(manager) => { + manager + .lock() + .unwrap() + .chunk_manager() + .msync_by_file(sync_file); + } + ChunkType::SingleVMA(vma) => { + ChunkManager::flush_file_vma_with_cond(&vma.lock().unwrap(), is_same_file); + } + }); + } + + pub fn mremap(&self, options: &VMRemapOptions) -> Result { + return_errno!(ENOSYS, "Under development"); + } + + // When process is exiting, free all owned chunks + pub fn free_chunks_when_exit(&self, thread: &ThreadRef) { + let mut internal_manager = self.internal(); + let mut mem_chunks = thread.vm().mem_chunks().write().unwrap(); + + mem_chunks.iter().for_each(|chunk| { + internal_manager.munmap_chunk(&chunk, None); + }); + mem_chunks.clear(); + + debug_assert!(mem_chunks.len() == 0); } } -impl Drop for VMManager { - fn drop(&mut self) { - // Ensure that memory permissions are recovered - for vma in &self.vmas { - if vma.size() == 0 || vma.perms() == VMPerms::default() { - continue; - } - Self::apply_perms(vma, VMPerms::default()); +// Modification on this structure must aquire the global lock. +// TODO: Enable fast_default_chunks for faster chunk allocation +#[derive(Debug)] +pub struct InternalVMManager { + chunks: BTreeSet, // track in-use chunks, use B-Tree for better performance and simplicity (compared with red-black tree) + fast_default_chunks: Vec, // empty default chunks + free_manager: VMFreeSpaceManager, +} + +impl InternalVMManager { + pub fn init(vm_range: VMRange) -> Self { + let chunks = BTreeSet::new(); + let fast_default_chunks = Vec::new(); + let free_manager = VMFreeSpaceManager::new(vm_range); + Self { + chunks, + fast_default_chunks, + free_manager, } } + + // Allocate a new chunk with default size + pub fn mmap_chunk_default(&mut self, addr: VMMapAddr) -> Result { + // Find a free range from free_manager + let free_range = self.find_free_gaps(CHUNK_DEFAULT_SIZE, PAGE_SIZE, addr)?; + + // Add this range to chunks + let chunk = Arc::new(Chunk::new_default_chunk(free_range)?); + trace!("allocate a default chunk = {:?}", chunk); + self.chunks.insert(chunk.clone()); + Ok(chunk) + } + + // Allocate a chunk with single vma + pub fn mmap_chunk(&mut self, options: &VMMapOptions) -> Result { + let addr = *options.addr(); + let size = *options.size(); + let align = *options.align(); + let free_range = self.find_free_gaps(size, align, addr)?; + let chunk = Arc::new(Chunk::new_single_vma_chunk(free_range, options)); + trace!("allocate a new single vma chunk: {:?}", chunk); + self.chunks.insert(chunk.clone()); + Ok(chunk) + } + + // Munmap a chunk + // For Single VMA chunk, a part of the chunk could be munmapped if munmap_range is specified. 
+ pub fn munmap_chunk(&mut self, chunk: &ChunkRef, munmap_range: Option<&VMRange>) -> Result<()> { + trace!( + "munmap_chunk range = {:?}, munmap_range = {:?}", + chunk.range(), + munmap_range + ); + let vma = match chunk.internal() { + ChunkType::MultiVMA(manager) => { + let mut manager = manager.lock().unwrap(); + let is_cleaned = manager.clean_multi_vmas(); + // If the manager is cleaned, there is only one process using this chunk. Thus it can be freed safely. + // If the manager is not cleaned, there is at least another process which is using this chunk. Don't free it here. + if is_cleaned { + self.free_chunk(chunk)?; + } + return Ok(()); + } + ChunkType::SingleVMA(vma) => vma, + }; + + let munmap_range = { + if munmap_range.is_none() { + chunk.range() + } else { + munmap_range.unwrap() + } + }; + debug_assert!(chunk.range().is_superset_of(munmap_range)); + + let mut vma = vma.lock().unwrap(); + debug_assert!(chunk.range() == vma.range()); + let intersection_vma = match vma.intersect(munmap_range) { + Some(intersection_vma) => intersection_vma, + _ => unreachable!(), + }; + + // File-backed VMA needs to be flushed upon munmap + ChunkManager::flush_file_vma(&intersection_vma); + + // Reset memory permissions + if !&intersection_vma.perms().is_default() { + VMPerms::apply_perms(&intersection_vma, VMPerms::default()); + } + + // Reset to zero + unsafe { + let buf = intersection_vma.as_slice_mut(); + buf.iter_mut().for_each(|b| *b = 0) + } + + let mut new_vmas = vma.subtract(&intersection_vma); + let current = current!(); + // Release lock in chunk before getting lock for process mem_chunks to avoid deadlock + drop(vma); + + match new_vmas.len() { + 0 => { + // Exact size + self.free_chunk(&chunk); + if current.status() != ThreadStatus::Exited { + // If the current thread is exiting, there is no need to remove the chunk from process' mem_list. + // It will be drained. 
+ current.vm().remove_mem_chunk(&chunk); + } + } + 1 => { + // Update the current vma to the new vma + let updated_vma = new_vmas.pop().unwrap(); + self.update_single_vma_chunk(¤t, &chunk, updated_vma); + + // Return the intersection range to free list + self.free_manager + .add_range_back_to_free_manager(intersection_vma.range()); + } + 2 => { + // single vma => (updated_vma, munmapped_vma, new_vma) + self.free_manager + .add_range_back_to_free_manager(intersection_vma.range()); + + let new_vma = new_vmas.pop().unwrap(); + let new_vma_chunk = Arc::new(Chunk::new_chunk_with_vma(new_vma)); + self.chunks.insert(new_vma_chunk.clone()); + current.vm().add_mem_chunk(new_vma_chunk); + + let updated_vma = new_vmas.pop().unwrap(); + self.update_single_vma_chunk(¤t, &chunk, updated_vma); + } + _ => unreachable!(), + } + Ok(()) + } + + fn update_single_vma_chunk( + &mut self, + current_thread: &ThreadRef, + old_chunk: &ChunkRef, + new_vma: VMArea, + ) { + let new_chunk = Arc::new(Chunk::new_chunk_with_vma(new_vma)); + current_thread + .vm() + .replace_mem_chunk(old_chunk, new_chunk.clone()); + self.chunks.remove(old_chunk); + self.chunks.insert(new_chunk); + } + + pub fn mprotect_single_vma_chunk( + &mut self, + chunk: &ChunkRef, + protect_range: VMRange, + new_perms: VMPerms, + ) -> Result<()> { + let vma = match chunk.internal() { + ChunkType::MultiVMA(_) => { + unreachable!(); + } + ChunkType::SingleVMA(vma) => vma, + }; + + let mut updated_vmas = { + let mut containing_vma = vma.lock().unwrap(); + trace!( + "mprotect_single_vma_chunk range = {:?}, mprotect_range = {:?}", + chunk.range(), + protect_range + ); + debug_assert!(chunk.range() == containing_vma.range()); + + if containing_vma.perms() == new_perms { + return Ok(()); + } + + let same_start = protect_range.start() == containing_vma.start(); + let same_end = protect_range.end() == containing_vma.end(); + match (same_start, same_end) { + (true, true) => { + // Exact the same vma + containing_vma.set_perms(new_perms); + VMPerms::apply_perms(&containing_vma, containing_vma.perms()); + return Ok(()); + } + (false, false) => { + // The containing VMA is divided into three VMAs: + // Shrinked old VMA: [containing_vma.start, protect_range.start) + // New VMA: [protect_range.start, protect_range.end) + // remaining old VMA: [protect_range.end, containing_vma.end) + + let old_end = containing_vma.end(); + let old_perms = containing_vma.perms(); + + containing_vma.set_end(protect_range.start()); + + let new_vma = VMArea::inherits_file_from( + &containing_vma, + protect_range, + new_perms, + DUMMY_CHUNK_PROCESS_ID, + ); + VMPerms::apply_perms(&new_vma, new_vma.perms()); + + let remaining_old_vma = { + let range = VMRange::new(protect_range.end(), old_end).unwrap(); + VMArea::inherits_file_from( + &containing_vma, + range, + old_perms, + DUMMY_CHUNK_PROCESS_ID, + ) + }; + + let updated_vmas = vec![containing_vma.clone(), new_vma, remaining_old_vma]; + updated_vmas + } + _ => { + if same_start { + // Protect range is at left side of the cotaining vma + containing_vma.set_start(protect_range.end()); + } else { + // Protect range is at right side of the cotaining vma + containing_vma.set_end(protect_range.start()); + } + + let new_vma = VMArea::inherits_file_from( + &containing_vma, + protect_range, + new_perms, + DUMMY_CHUNK_PROCESS_ID, + ); + VMPerms::apply_perms(&new_vma, new_vma.perms()); + + let updated_vmas = vec![containing_vma.clone(), new_vma]; + updated_vmas + } + } + }; + + let current = current!(); + while updated_vmas.len() > 1 { + let vma 
= updated_vmas.pop().unwrap(); + self.add_new_chunk(¤t, vma); + } + + debug_assert!(updated_vmas.len() == 1); + let vma = updated_vmas.pop().unwrap(); + self.update_single_vma_chunk(¤t, &chunk, vma); + + Ok(()) + } + + fn add_new_chunk(&mut self, current_thread: &ThreadRef, new_vma: VMArea) { + let new_vma_chunk = Arc::new(Chunk::new_chunk_with_vma(new_vma)); + self.chunks.insert(new_vma_chunk.clone()); + current_thread.vm().add_mem_chunk(new_vma_chunk); + } + + pub fn free_chunk(&mut self, chunk: &ChunkRef) -> Result<()> { + let range = chunk.range(); + // Remove from chunks + self.chunks.remove(chunk); + + // Add range back to freespace manager + self.free_manager.add_range_back_to_free_manager(range); + Ok(()) + } + + pub fn find_free_gaps( + &mut self, + size: usize, + align: usize, + addr: VMMapAddr, + ) -> Result { + return self + .free_manager + .find_free_range_internal(size, align, addr); + } } diff --git a/src/libos/src/vm/vm_perms.rs b/src/libos/src/vm/vm_perms.rs index 0c538841..933f3203 100644 --- a/src/libos/src/vm/vm_perms.rs +++ b/src/libos/src/vm/vm_perms.rs @@ -31,6 +31,26 @@ impl VMPerms { pub fn is_default(&self) -> bool { self.bits == Self::DEFAULT.bits } + + pub fn apply_perms(protect_range: &VMRange, perms: VMPerms) { + extern "C" { + pub fn occlum_ocall_mprotect( + retval: *mut i32, + addr: *const c_void, + len: usize, + prot: i32, + ) -> sgx_status_t; + }; + + unsafe { + let mut retval = 0; + let addr = protect_range.start() as *const c_void; + let len = protect_range.size(); + let prot = perms.bits() as i32; + let sgx_status = occlum_ocall_mprotect(&mut retval, addr, len, prot); + assert!(sgx_status == sgx_status_t::SGX_SUCCESS && retval == 0); + } + } } impl Default for VMPerms { diff --git a/src/libos/src/vm/vm_range.rs b/src/libos/src/vm/vm_range.rs index f9bf50c6..3e0876ff 100644 --- a/src/libos/src/vm/vm_range.rs +++ b/src/libos/src/vm/vm_range.rs @@ -1,6 +1,6 @@ use super::*; -#[derive(Clone, Copy, Default, PartialEq)] +#[derive(Clone, Copy, Default, Eq, PartialEq, Hash)] pub struct VMRange { pub(super) start: usize, pub(super) end: usize, @@ -130,7 +130,7 @@ impl VMRange { pub fn intersect(&self, other: &VMRange) -> Option { let intersection_start = self.start().max(other.start()); let intersection_end = self.end().min(other.end()); - if intersection_start > intersection_end { + if intersection_start >= intersection_end { return None; } unsafe { diff --git a/src/libos/src/vm/vm_util.rs b/src/libos/src/vm/vm_util.rs new file mode 100644 index 00000000..13caa87b --- /dev/null +++ b/src/libos/src/vm/vm_util.rs @@ -0,0 +1,276 @@ +use super::*; + +// use super::vm_area::VMArea; +// use super::free_space_manager::VMFreeSpaceManager; +use super::vm_area::*; +use super::vm_perms::VMPerms; +use std::collections::BTreeSet; + +use intrusive_collections::rbtree::{Link, RBTree}; +use intrusive_collections::Bound; +use intrusive_collections::RBTreeLink; +use intrusive_collections::{intrusive_adapter, KeyAdapter}; + +#[derive(Clone, Debug)] +pub enum VMInitializer { + DoNothing(), + FillZeros(), + CopyFrom { + range: VMRange, + }, + LoadFromFile { + file: FileRef, + offset: usize, + }, + // For file-backed mremap which may move from old range to new range and read extra bytes from file + CopyOldAndReadNew { + old_range: VMRange, + file: FileRef, + offset: usize, // read file from this offset + }, +} + +impl Default for VMInitializer { + fn default() -> VMInitializer { + VMInitializer::DoNothing() + } +} + +impl VMInitializer { + pub fn init_slice(&self, buf: &mut [u8]) -> 
Result<()> {
+        match self {
+            VMInitializer::DoNothing() => {
+                // Do nothing
+            }
+            VMInitializer::FillZeros() => {
+                for b in buf {
+                    *b = 0;
+                }
+            }
+            VMInitializer::CopyFrom { range } => {
+                let src_slice = unsafe { range.as_slice() };
+                let copy_len = min(buf.len(), src_slice.len());
+                buf[..copy_len].copy_from_slice(&src_slice[..copy_len]);
+                for b in &mut buf[copy_len..] {
+                    *b = 0;
+                }
+            }
+            VMInitializer::LoadFromFile { file, offset } => {
+                // TODO: make sure that read_at does not move file cursor
+                let len = file
+                    .read_at(*offset, buf)
+                    .cause_err(|_| errno!(EIO, "failed to init memory from file"))?;
+                for b in &mut buf[len..] {
+                    *b = 0;
+                }
+            }
+            VMInitializer::CopyOldAndReadNew {
+                old_range,
+                file,
+                offset,
+            } => {
+                // TODO: Handle old_range with non-readable subrange
+                let src_slice = unsafe { old_range.as_slice() };
+                let copy_len = src_slice.len();
+                debug_assert!(copy_len <= buf.len());
+                let read_len = buf.len() - copy_len;
+                buf[..copy_len].copy_from_slice(&src_slice[..copy_len]);
+                let len = file
+                    .read_at(*offset, &mut buf[copy_len..])
+                    .cause_err(|_| errno!(EIO, "failed to init memory from file"))?;
+                for b in &mut buf[(copy_len + len)..] {
+                    *b = 0;
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum VMMapAddr {
+    Any,          // Free to choose any address
+    Hint(usize),  // Prefer the address, but other addresses are acceptable
+    Need(usize),  // Must use the address, otherwise report an error
+    Force(usize), // Force using the address by munmapping any overlapping mapping first
+}
+
+impl Default for VMMapAddr {
+    fn default() -> VMMapAddr {
+        VMMapAddr::Any
+    }
+}
+
+#[derive(Builder, Debug)]
+#[builder(pattern = "owned", build_fn(skip), no_std)]
+pub struct VMMapOptions {
+    size: usize,
+    align: usize,
+    perms: VMPerms,
+    addr: VMMapAddr,
+    initializer: VMInitializer,
+    // The content of the VMA can be written back to a given file at a given offset
+    writeback_file: Option<(FileRef, usize)>,
+}
+
+// VMMapOptionsBuilder is generated automatically by derive_builder, except for the build function
+impl VMMapOptionsBuilder {
+    pub fn build(mut self) -> Result<VMMapOptions> {
+        let size = {
+            let size = self
+                .size
+                .ok_or_else(|| errno!(EINVAL, "invalid size for mmap"))?;
+            if size == 0 {
+                return_errno!(EINVAL, "invalid size for mmap");
+            }
+            align_up(size, PAGE_SIZE)
+        };
+        let align = {
+            let align = self.align.unwrap_or(PAGE_SIZE);
+            if align == 0 || !align.is_power_of_two() {
+                return_errno!(EINVAL, "invalid align for mmap");
+            }
+            align
+        };
+        let perms = self
+            .perms
+            .ok_or_else(|| errno!(EINVAL, "perms must be given"))?;
+        let addr = {
+            let addr = self.addr.unwrap_or_default();
+            match addr {
+                // TODO: check addr + size overflow
+                VMMapAddr::Any => VMMapAddr::Any,
+                VMMapAddr::Hint(addr) => {
+                    let addr = align_down(addr, PAGE_SIZE);
+                    VMMapAddr::Hint(addr)
+                }
+                VMMapAddr::Need(addr_) | VMMapAddr::Force(addr_) => {
+                    if addr_ % align != 0 {
+                        return_errno!(EINVAL, "unaligned addr for fixed mmap");
+                    }
+                    addr
+                }
+            }
+        };
+        let initializer = match self.initializer.as_ref() {
+            Some(initializer) => initializer.clone(),
+            None => VMInitializer::default(),
+        };
+        let writeback_file = self.writeback_file.take().unwrap_or_default();
+        Ok(VMMapOptions {
+            size,
+            align,
+            perms,
+            addr,
+            initializer,
+            writeback_file,
+        })
+    }
+}
+
+impl VMMapOptions {
+    pub fn size(&self) -> &usize {
+        &self.size
+    }
+
+    pub fn addr(&self) -> &VMMapAddr {
+        &self.addr
+    }
+
+    pub fn perms(&self) -> &VMPerms {
+        &self.perms
+    }
+
+    pub fn align(&self) -> &usize {
+        &self.align
+    }
+
+    pub fn initializer(&self) -> 
&VMInitializer {
+        &self.initializer
+    }
+
+    pub fn writeback_file(&self) -> &Option<(FileRef, usize)> {
+        &self.writeback_file
+    }
+}
+
+#[derive(Clone, Copy, PartialEq)]
+pub enum SizeType {
+    Same,
+    Shrinking,
+    Growing,
+}
+
+impl SizeType {
+    pub fn new(old_size: &usize, new_size: &usize) -> Self {
+        if new_size == old_size {
+            SizeType::Same
+        } else if new_size < old_size {
+            SizeType::Shrinking
+        } else {
+            SizeType::Growing
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct VMRemapOptions {
+    old_addr: usize,
+    old_size: usize,
+    new_size: usize,
+    flags: MRemapFlags,
+}
+
+impl VMRemapOptions {
+    pub fn new(
+        old_addr: usize,
+        old_size: usize,
+        new_size: usize,
+        flags: MRemapFlags,
+    ) -> Result<Self> {
+        let old_addr = if old_addr % PAGE_SIZE != 0 {
+            return_errno!(EINVAL, "unaligned old address");
+        } else {
+            old_addr
+        };
+        let old_size = if old_size == 0 {
+            // TODO: support old_size == 0 for shareable mappings
+            warn!("mremap with old_size == 0 is not supported");
+            return_errno!(EINVAL, "invalid old size");
+        } else {
+            align_up(old_size, PAGE_SIZE)
+        };
+        if let Some(new_addr) = flags.new_addr() {
+            if new_addr % PAGE_SIZE != 0 {
+                return_errno!(EINVAL, "unaligned new address");
+            }
+        }
+        let new_size = if new_size == 0 {
+            return_errno!(EINVAL, "invalid new size");
+        } else {
+            align_up(new_size, PAGE_SIZE)
+        };
+        Ok(Self {
+            old_addr,
+            old_size,
+            new_size,
+            flags,
+        })
+    }
+
+    pub fn old_addr(&self) -> usize {
+        self.old_addr
+    }
+
+    pub fn old_size(&self) -> usize {
+        self.old_size
+    }
+
+    pub fn new_size(&self) -> usize {
+        self.new_size
+    }
+
+    pub fn flags(&self) -> MRemapFlags {
+        self.flags
+    }
+}
diff --git a/src/pal/src/pal_api.c b/src/pal/src/pal_api.c
index 3ce43548..82e0cd69 100644
--- a/src/pal/src/pal_api.c
+++ b/src/pal/src/pal_api.c
@@ -239,7 +239,6 @@ int occlum_pal_destroy(void) {
     }
 
     int ret = 0;
-
     if (pal_interrupt_thread_stop() < 0) {
         ret = -1;
         PAL_WARN("Cannot stop the interrupt thread: %s", errno2str(errno));
diff --git a/test/mmap/main.c b/test/mmap/main.c
index a5d47bb7..7d5ff74f 100644
--- a/test/mmap/main.c
+++ b/test/mmap/main.c
@@ -47,6 +47,10 @@ static int get_a_valid_range_of_hints(size_t *hint_begin, size_t *hint_end) {
     if (big_buf == MAP_FAILED) {
        THROW_ERROR("mmap failed");
    }
+
+    // Dirty the buffer so that later tests can verify that munmap cleans the range
+    memset(big_buf, 0xff, big_buf_len);
+
    int ret = munmap(big_buf, big_buf_len);
    if (ret < 0) {
        THROW_ERROR("munmap failed");
@@ -1038,6 +1042,47 @@ int test_mprotect_with_non_page_aligned_size() {
     *(char *)buf = 1;
     *(char *)(buf + PAGE_SIZE) = 1;
+
+    ret = munmap(buf, PAGE_SIZE * 2);
+    if (ret < 0) {
+        THROW_ERROR("munmap failed");
+    }
+    return 0;
+}
+
+int test_mprotect_multiple_vmas() {
+    // Create multiple VMAs with PROT_NONE
+    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+    void *buf_a = mmap((void *)HINT_BEGIN, PAGE_SIZE * 2, PROT_NONE, flags, -1, 0);
+    if (buf_a == MAP_FAILED || buf_a != (void *)HINT_BEGIN) {
+        THROW_ERROR("mmap failed");
+    }
+    void *buf_b = mmap((void *)(HINT_BEGIN + 2 * PAGE_SIZE), PAGE_SIZE, PROT_NONE, flags, -1,
+                       0);
+    if (buf_b == MAP_FAILED || buf_b != (void *)(HINT_BEGIN + 2 * PAGE_SIZE)) {
+        THROW_ERROR("mmap failed");
+    }
+    void *buf_c = mmap((void *)(HINT_BEGIN + 3 * PAGE_SIZE), PAGE_SIZE * 2, PROT_NONE, flags,
+                       -1, 0);
+    if (buf_c == MAP_FAILED || buf_c != (void *)(HINT_BEGIN + 3 * PAGE_SIZE)) {
+        THROW_ERROR("mmap failed");
+    }
+
+    // Set a part of the ranges to read-write
+    int ret = mprotect(buf_a + PAGE_SIZE, 3 * PAGE_SIZE, PROT_READ | PROT_WRITE);
+    if (ret < 0) {
+        THROW_ERROR("mprotect multiple vmas 
failed"); + } + + // Check if these ranges are writable + *(char *)(buf_a + PAGE_SIZE) = 1; + *(char *)(buf_b) = 1; + *(char *)(buf_c) = 1; + + ret = munmap(buf_a, PAGE_SIZE * 5); + if (ret < 0) { + THROW_ERROR("munmap multiple vmas failed"); + } + return 0; } @@ -1231,11 +1276,13 @@ static test_case_t test_cases[] = { TEST_CASE(test_munmap_with_null_addr), TEST_CASE(test_munmap_with_zero_len), TEST_CASE(test_munmap_with_non_page_aligned_len), +#ifdef MREMAP_SUPPORTED TEST_CASE(test_mremap), TEST_CASE(test_mremap_subrange), TEST_CASE(test_mremap_with_fixed_addr), TEST_CASE(test_file_backed_mremap), TEST_CASE(test_file_backed_mremap_mem_may_move), +#endif TEST_CASE(test_mprotect_once), TEST_CASE(test_mprotect_twice), TEST_CASE(test_mprotect_triple), @@ -1243,6 +1290,7 @@ static test_case_t test_cases[] = { TEST_CASE(test_mprotect_with_invalid_addr), TEST_CASE(test_mprotect_with_invalid_prot), TEST_CASE(test_mprotect_with_non_page_aligned_size), + TEST_CASE(test_mprotect_multiple_vmas), }; int main() {
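
Note on the new VMMapOptions API added in vm_util.rs above: options are assembled through the derive_builder-generated VMMapOptionsBuilder, while the hand-written build() performs page alignment and validation. The sketch below shows how a caller inside the vm module might put together such an option set. It is illustrative only and is not part of this patch; the hint_addr value and the VMPerms::READ | VMPerms::WRITE flag names are assumptions based on the surrounding code.

    // Hypothetical caller-side sketch (not part of this patch): build options for a
    // 16-page, read-write, zero-filled anonymous mapping near a hinted address.
    let hint_addr: usize = 0x1000_0000; // placeholder address for illustration
    let options = VMMapOptionsBuilder::default()
        .size(16 * PAGE_SIZE)                  // build() aligns the size up to PAGE_SIZE
        .align(PAGE_SIZE)                      // must be a non-zero power of two
        .perms(VMPerms::READ | VMPerms::WRITE) // assumed bitflags names
        .addr(VMMapAddr::Hint(hint_addr))      // the hint is aligned down by build()
        .initializer(VMInitializer::FillZeros())
        .build()?;                             // returns Result<VMMapOptions>
    assert_eq!(*options.size(), 16 * PAGE_SIZE);

Leaving writeback_file unset yields an anonymous mapping, since build() falls back to None for that field via unwrap_or_default().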