diff --git a/src/libos/Cargo.lock b/src/libos/Cargo.lock index d370c738..d9ea06ab 100644 --- a/src/libos/Cargo.lock +++ b/src/libos/Cargo.lock @@ -9,7 +9,7 @@ dependencies = [ "aligned", "atomic", "bitflags", - "bitvec", + "bitvec 1.0.1", "ctor", "derive_builder", "goblin", @@ -18,6 +18,7 @@ dependencies = [ "lazy_static", "log", "memoffset 0.6.5", + "modular-bitfield", "rcore-fs", "rcore-fs-devfs", "rcore-fs-mountfs", @@ -94,7 +95,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41262f11d771fd4a61aa3ce019fca363b4b6c282fca9da2a31186d3965a47a5c" dependencies = [ "either", - "radium", + "radium 0.3.0", +] + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium 0.7.0", + "tap", + "wyz", ] [[package]] @@ -206,6 +219,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "goblin" version = "0.5.4" @@ -294,6 +313,27 @@ dependencies = [ "autocfg 1.1.0", ] +[[package]] +name = "modular-bitfield" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a53d79ba8304ac1c4f9eb3b9d281f21f7be9d4626f72ce7df4ad8fbde4f38a74" +dependencies = [ + "modular-bitfield-impl", + "static_assertions 1.1.0", +] + +[[package]] +name = "modular-bitfield-impl" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a7d5f7076603ebc68de2dc6a650ec331a062a13abaa346975be747bbfa4b789" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "plain" version = "0.2.3" @@ -334,6 +374,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "def50a86306165861203e7f84ecffbbdfdea79f0e51039b33de1e952358c47ac" +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "rand" version = "0.6.5" @@ -479,11 +525,11 @@ dependencies = [ name = "rcore-fs-sefs" version = "0.1.0" dependencies = [ - "bitvec", + "bitvec 0.17.4", "log", "rcore-fs", "spin 0.5.2", - "static_assertions", + "static_assertions 0.3.4", "uuid", ] @@ -719,6 +765,12 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.9.3" @@ -736,6 +788,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "unicode-ident" version = "1.0.3" @@ -772,3 +830,12 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] diff --git a/src/libos/Cargo.toml b/src/libos/Cargo.toml index dda96e52..bd378161 100644 --- a/src/libos/Cargo.toml +++ b/src/libos/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["staticlib"] [dependencies] atomic = "0.5" bitflags = "1.0" -bitvec = { version = "0.17", default-features = false, features = ["alloc"] } +bitvec = { version = "1", default-features = false, features = ["alloc"] } log = "0.4" aligned = "0.4.1" lazy_static = { version = "1.1.0", features = ["spin_no_std"] } # Implies nightly @@ -33,6 +33,7 @@ regex = { git = "https://github.com/mesalock-linux/regex-sgx", default-features goblin = { version = "0.5.4", default-features = false, features = ["elf64", "elf32", "endian_fd"] } intrusive-collections = "0.9" spin = "0.7" +modular-bitfield = "0.11.2" [patch.'https://github.com/apache/teaclave-sgx-sdk.git'] sgx_tstd = { path = "../../deps/rust-sgx-sdk/sgx_tstd" } diff --git a/src/libos/src/exception/mod.rs b/src/libos/src/exception/mod.rs index bb193141..a76441ae 100644 --- a/src/libos/src/exception/mod.rs +++ b/src/libos/src/exception/mod.rs @@ -6,10 +6,14 @@ use self::syscall::{handle_syscall_exception, SYSCALL_OPCODE}; use super::*; use crate::signal::{FaultSignal, SigSet}; use crate::syscall::exception_interrupt_syscall_c_abi; -use crate::syscall::{CpuContext, FpRegs, SyscallNum}; -use aligned::{Aligned, A16}; -use core::arch::x86_64::_fxsave; +use crate::syscall::{CpuContext, ExtraContext, SyscallNum}; +use crate::vm::{enclave_page_fault_handler, USER_SPACE_VM_MANAGER}; use sgx_types::*; +use sgx_types::{sgx_exception_type_t, sgx_exception_vector_t}; + +const ENCLU: u32 = 0xd7010f; +const EACCEPT: u32 = 0x5; +const EACCEPTCOPY: u32 = 0x7; // Modules for instruction simulation mod cpuid; @@ -25,14 +29,63 @@ pub fn register_exception_handlers() { } } +fn try_handle_kernel_exception(info: &sgx_exception_info_t) -> i32 { + if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF { + let pf_addr = info.exinfo.faulting_address as usize; + // The PF address must be in the user space. Otherwise, keep searching for the exception handler + if !USER_SPACE_VM_MANAGER.range().contains(pf_addr) { + SGX_MM_EXCEPTION_CONTINUE_SEARCH + } else { + let rip = info.cpu_context.rip as *const u32; + let rax = info.cpu_context.rax as u32; + // This can happen when two threads both try to EAUG a new page. Thread 1 EAUG because it first + // touches the memory and triggers #PF. Thread 2 EAUG because it uses sgx_mm_commit to commit a + // new page with EACCEPT and triggers #PF. If Thread 1 first acquires the lock to do EAUG, when Thread 2 + // acquires the lock, it can't do EAUG again and will fail. The failure will raise a signal. + // This signal will eventually be handled here. And the instruction that triggers this exception is EACCEPT/EACCEPTCOPY. + // In this case, since the new page is EAUG-ed already, just need to excecute the EACCEPT again. 
Thus here + // just return SGX_MM_EXCEPTION_CONTINUE_EXECUTION + if ENCLU == (unsafe { *rip } as u32) & 0xffffff + && (EACCEPT == rax || EACCEPTCOPY == rax) + { + return SGX_MM_EXCEPTION_CONTINUE_EXECUTION; + } + + // If the triggered code is not user's code and the #PF address is in the userspace, then it is a + // kernel-triggered #PF that we can handle. This can happen e.g. when read syscall triggers user buffer #PF + info!("kernel code triggers #PF"); + let kernel_triggers = true; + enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, kernel_triggers) + .expect("handle PF failure"); + SGX_MM_EXCEPTION_CONTINUE_EXECUTION + } + } else { + // Otherwise, we can't handle. Keep searching for the exception handler + error!( + "We can't handle this exception: {:?}", + info.exception_vector + ); + SGX_MM_EXCEPTION_CONTINUE_SEARCH + } +} + #[no_mangle] extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 { - let mut fpregs = FpRegs::save(); + let info = unsafe { &mut *info }; + + // Try handle kernel-trigged #PF + if !USER_SPACE_VM_MANAGER + .range() + .contains(info.cpu_context.rip as usize) + { + return try_handle_kernel_exception(&info); + } + + // User-space-triggered exception unsafe { exception_interrupt_syscall_c_abi( SyscallNum::HandleException as u32, - info as *mut _, - &mut fpregs as *mut FpRegs, + info as *mut sgx_exception_info_t as *mut _, ) }; unreachable!(); @@ -41,20 +94,22 @@ extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 { /// Exceptions are handled as a special kind of system calls. pub fn do_handle_exception( info: *mut sgx_exception_info_t, - fpregs: *mut FpRegs, user_context: *mut CpuContext, ) -> Result { let info = unsafe { &mut *info }; check_exception_type(info.exception_type)?; + info!("do handle exception: {:?}", info.exception_vector); let user_context = unsafe { &mut *user_context }; *user_context = CpuContext::from_sgx(&info.cpu_context); - user_context.fpregs = fpregs; + let xsave_area = info.xsave_area.as_mut_ptr(); + user_context.extra_context = ExtraContext::Xsave; + user_context.extra_context_ptr = xsave_area; // Try to do instruction emulation first if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_UD { // Assume the length of opcode is 2 bytes - let ip_opcode = unsafe { *(user_context.rip as *const u16) }; + let ip_opcode: u16 = unsafe { *(user_context.rip as *const u16) }; if ip_opcode == RDTSC_OPCODE { return handle_rdtsc_exception(user_context); } else if ip_opcode == SYSCALL_OPCODE { @@ -64,6 +119,23 @@ pub fn do_handle_exception( } } + // Normally, We should only handled PF exception with SGX bit set which is due to uncommitted EPC. + // However, it happens that when committing a no-read-write page (e.g. RWX), there is a short gap + // after EACCEPTCOPY and before the mprotect ocall. And if the user touches memory during this short + // gap, the SGX bit will not be set. Thus, here we don't check the SGX bit. + if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF { + info!("Userspace #PF caught, try handle"); + if enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, false).is_ok() { + info!("#PF handling is done successfully"); + return Ok(0); + } + + warn!( + "#PF not handled. Turn to signal. user context = {:?}", + user_context + ); + } + // Then, it must be a "real" exception. Convert it to signal and force delivering it. // The generated signal is SIGBUS, SIGFPE, SIGILL, or SIGSEGV. 
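    // For example, an unhandled #PF here ends up as SIGSEGV, with SEGV_ACCERR when the
    // present bit of the page-fault error code is set and SEGV_MAPERR otherwise; see the
    // FaultSignal change in signal/signals/fault.rs later in this patch.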
// @@ -108,3 +180,21 @@ fn check_exception_type(type_: sgx_exception_type_t) -> Result<()> { } Ok(()) } + +// Based on Page-Fault Error Code of Intel Mannul +const PF_EXCEPTION_SGX_BIT: u32 = 0x1; +const PF_EXCEPTION_RW_BIT: u32 = 0x2; + +// Return value: +// True - SGX bit is set +// False - SGX bit is not set +pub fn check_sgx_bit(exception_error_code: u32) -> bool { + exception_error_code & PF_EXCEPTION_SGX_BIT == PF_EXCEPTION_SGX_BIT +} + +// Return value: +// True - write bit is set, #PF caused by write +// False - read bit is set, #PF caused by read +pub fn check_rw_bit(exception_error_code: u32) -> bool { + exception_error_code & PF_EXCEPTION_RW_BIT == PF_EXCEPTION_RW_BIT +} diff --git a/src/libos/src/fs/procfs/pid/maps.rs b/src/libos/src/fs/procfs/pid/maps.rs index bf788399..003dd1e3 100644 --- a/src/libos/src/fs/procfs/pid/maps.rs +++ b/src/libos/src/fs/procfs/pid/maps.rs @@ -91,7 +91,7 @@ fn get_output_for_vma(vma: &VMArea, heap_or_stack: Option<&str>) -> String { let perms = vma.perms(); let (file_path, offset, device_id, inode_num) = { - if let Some((file, offset)) = vma.init_file() { + if let Some((file, offset)) = vma.backed_file() { let inode_file = file.as_inode_file().unwrap(); let file_path = inode_file.abs_path(); let inode_num = inode_file.inode().metadata().unwrap().inode; diff --git a/src/libos/src/interrupt/mod.rs b/src/libos/src/interrupt/mod.rs index 936a743a..5f4fd7d1 100644 --- a/src/libos/src/interrupt/mod.rs +++ b/src/libos/src/interrupt/mod.rs @@ -2,9 +2,7 @@ pub use self::sgx::sgx_interrupt_info_t; use crate::prelude::*; use crate::process::ThreadRef; use crate::syscall::exception_interrupt_syscall_c_abi; -use crate::syscall::{CpuContext, FpRegs, SyscallNum}; -use aligned::{Aligned, A16}; -use core::arch::x86_64::_fxsave; +use crate::syscall::{CpuContext, ExtraContext, SyscallNum}; mod sgx; @@ -16,28 +14,23 @@ pub fn init() { } extern "C" fn handle_interrupt(info: *mut sgx_interrupt_info_t) -> i32 { - let mut fpregs = FpRegs::save(); unsafe { - exception_interrupt_syscall_c_abi( - SyscallNum::HandleInterrupt as u32, - info as *mut _, - &mut fpregs as *mut FpRegs, - ) + exception_interrupt_syscall_c_abi(SyscallNum::HandleInterrupt as u32, info as *mut _) }; unreachable!(); } pub fn do_handle_interrupt( info: *mut sgx_interrupt_info_t, - fpregs: *mut FpRegs, cpu_context: *mut CpuContext, ) -> Result { - let info = unsafe { &*info }; + let info = unsafe { &mut *info }; let context = unsafe { &mut *cpu_context }; // The cpu context is overriden so that it is as if the syscall is called from where the // interrupt happened *context = CpuContext::from_sgx(&info.cpu_context); - context.fpregs = fpregs; + context.extra_context = ExtraContext::Xsave; + context.extra_context_ptr = info.xsave_area.as_mut_ptr(); Ok(0) } diff --git a/src/libos/src/interrupt/sgx.rs b/src/libos/src/interrupt/sgx.rs index f84993d8..def1a4d3 100644 --- a/src/libos/src/interrupt/sgx.rs +++ b/src/libos/src/interrupt/sgx.rs @@ -1,10 +1,15 @@ use crate::prelude::*; -#[repr(C)] +#[repr(C, align(64))] #[derive(Default, Clone, Copy)] #[allow(non_camel_case_types)] pub struct sgx_interrupt_info_t { pub cpu_context: sgx_cpu_context_t, + pub interrupt_valid: uint32_t, + reserved: uint32_t, + pub xsave_size: uint64_t, + pub reserved1: [uint64_t; 4], + pub xsave_area: [uint8_t; 0], } #[allow(non_camel_case_types)] diff --git a/src/libos/src/lib.rs b/src/libos/src/lib.rs index 06b994ae..83566a53 100644 --- a/src/libos/src/lib.rs +++ b/src/libos/src/lib.rs @@ -21,8 +21,11 @@ #![feature(test)] 
#![feature(atomic_from_mut)] #![feature(btree_drain_filter)] -#![feature(bench_black_box)] #![feature(arbitrary_enum_discriminant)] +// for core::ptr::non_null::NonNull addr() method +#![feature(strict_provenance)] +// for VMArea::can_merge_vmas +#![feature(is_some_and)] #[macro_use] extern crate alloc; @@ -59,6 +62,7 @@ extern crate memoffset; extern crate ctor; extern crate intrusive_collections; extern crate itertools; +extern crate modular_bitfield; extern crate resolv_conf; use sgx_trts::libc; diff --git a/src/libos/src/process/do_exit.rs b/src/libos/src/process/do_exit.rs index fa23a458..43f12827 100644 --- a/src/libos/src/process/do_exit.rs +++ b/src/libos/src/process/do_exit.rs @@ -1,6 +1,6 @@ use crate::process::do_vfork::reap_zombie_child_created_with_vfork; use crate::signal::constants::*; -use std::intrinsics::atomic_store; +use std::intrinsics::atomic_store_seqcst; use super::do_futex::futex_wake; use super::do_vfork::{is_vforked_child_process, vfork_return_to_parent}; @@ -61,7 +61,7 @@ fn exit_thread(term_status: TermStatus) { // Notify a thread, if any, that waits on ctid. See set_tid_address(2) for more info. if let Some(ctid_ptr) = thread.clear_ctid() { unsafe { - atomic_store(ctid_ptr.as_ptr(), 0); + atomic_store_seqcst(ctid_ptr.as_ptr(), 0); } futex_wake(ctid_ptr.as_ptr() as *const i32, 1); } diff --git a/src/libos/src/process/do_futex.rs b/src/libos/src/process/do_futex.rs index 23b71dbf..d3bb64dc 100644 --- a/src/libos/src/process/do_futex.rs +++ b/src/libos/src/process/do_futex.rs @@ -1,6 +1,6 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; -use std::intrinsics::atomic_load; +use std::intrinsics::atomic_load_seqcst; use std::sync::atomic::{AtomicBool, Ordering}; use crate::prelude::*; @@ -258,7 +258,7 @@ impl FutexKey { } pub fn load_val(&self) -> i32 { - unsafe { atomic_load(self.0 as *const i32) } + unsafe { atomic_load_seqcst(self.0 as *const i32) } } pub fn addr(&self) -> usize { diff --git a/src/libos/src/sched/cpu_set.rs b/src/libos/src/sched/cpu_set.rs index f0175940..bc184151 100644 --- a/src/libos/src/sched/cpu_set.rs +++ b/src/libos/src/sched/cpu_set.rs @@ -8,6 +8,7 @@ //! * If `cpu_set[i] == true`, then the i-th CPU core belongs to the set; //! * Otherwise, the i-th CPU core is not in the set. +use bitvec::order::LocalBits as Local; use bitvec::prelude::*; use std::ops::Index; @@ -15,7 +16,7 @@ use crate::prelude::*; #[derive(Debug, Clone, PartialEq)] pub struct CpuSet { - bits: BitBox, + bits: BitBox, } impl CpuSet { @@ -33,14 +34,14 @@ impl CpuSet { /// Create a CpuSet that consists of all of the CPU cores. pub fn new_full() -> Self { - let mut bits = bitbox![Local, u8; 1; Self::len() * 8]; + let mut bits = bitbox![u8, Local; 1; Self::len() * 8]; Self::clear_unused(&mut bits); Self { bits } } /// Create a CpuSet that consists of none of the CPU cores. pub fn new_empty() -> Self { - let bits = bitbox![Local, u8; 0; Self::len() * 8]; + let bits = bitbox![u8, Local; 0; Self::len() * 8]; Self { bits } } @@ -61,7 +62,7 @@ impl CpuSet { /// Returns the first index of CPUs in set. 
pub fn first_cpu_idx(&self) -> Option { - self.iter().position(|&b| b == true) + self.iter().position(|b| b == true) } // Returns if the CpuSet is a subset of available cpu set @@ -75,7 +76,7 @@ impl CpuSet { return_errno!(EINVAL, "slice is not long enough"); } let slice = &slice[..Self::len()]; - let mut bits = BitBox::from_slice(slice); + let mut bits = BitBox::from_bitslice(&BitSlice::from_slice(slice)); Self::clear_unused(&mut bits); Ok(Self { bits }) @@ -85,11 +86,11 @@ impl CpuSet { /// /// The last, unused bits in the byte slice are guaranteed to be zero. pub fn as_slice(&self) -> &[u8] { - self.bits.as_slice() + self.bits.as_raw_slice() } pub fn as_mut_slice(&mut self) -> &mut [u8] { - self.bits.as_mut_slice() + self.bits.as_raw_mut_slice() } /// Returns an iterator that allows accessing the underlying bits. @@ -102,7 +103,7 @@ impl CpuSet { self.bits.iter_mut() } - fn clear_unused(bits: &mut BitSlice) { + fn clear_unused(bits: &mut BitSlice) { let unused_bits = &mut bits[Self::ncores()..(Self::len() * 8)]; for mut bit in unused_bits { *bit = false; @@ -110,8 +111,8 @@ impl CpuSet { } } -pub type Iter<'a> = bitvec::slice::Iter<'a, Local, u8>; -pub type IterMut<'a> = bitvec::slice::IterMut<'a, Local, u8>; +pub type Iter<'a> = bitvec::slice::Iter<'a, u8, Local>; +pub type IterMut<'a> = bitvec::slice::IterMut<'a, u8, Local>; impl Index for CpuSet { type Output = bool; diff --git a/src/libos/src/signal/c_types.rs b/src/libos/src/signal/c_types.rs index 9d20a972..09937a35 100644 --- a/src/libos/src/signal/c_types.rs +++ b/src/libos/src/signal/c_types.rs @@ -199,7 +199,7 @@ impl siginfo_t { } } -#[derive(Clone, Copy)] +#[derive(Clone)] #[repr(C)] pub struct ucontext_t { pub uc_flags: u64, @@ -225,7 +225,8 @@ pub type stack_t = sigaltstack_t; pub struct mcontext_t { pub inner: CpuContext, // TODO: the fields should be csgsfs, err, trapno, oldmask, and cr2 - _unused0: [u64; 5], + // The number should be 5 but we use extra 2 spaces to store something else in the CpuContext. Thus make it 3. + _unused0: [u64; 3], // TODO: this field should be `fpregs: fpregset_t,` _unused1: usize, _reserved: [u64; 8], diff --git a/src/libos/src/signal/do_sigreturn.rs b/src/libos/src/signal/do_sigreturn.rs index 823ce7c9..389de0cd 100644 --- a/src/libos/src/signal/do_sigreturn.rs +++ b/src/libos/src/signal/do_sigreturn.rs @@ -5,9 +5,8 @@ use super::{SigAction, SigActionFlags, SigDefaultAction, SigSet, Signal}; use crate::lazy_static::__Deref; use crate::prelude::*; use crate::process::{ProcessRef, TermStatus, ThreadRef}; -use crate::syscall::{CpuContext, FpRegs}; +use crate::syscall::{CpuContext, ExtraContext, FpRegs, XsaveArea}; use aligned::{Aligned, A16}; -use core::arch::x86_64::{_fxrstor, _fxsave}; use std::{ptr, slice}; pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> { @@ -34,11 +33,27 @@ pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> { *curr_user_ctxt = last_ucontext.uc_mcontext.inner; // Restore the floating point registers to a temp area - // The floating point registers would be recoved just - // before return to user's code - let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) }); - curr_user_ctxt.fpregs = Box::into_raw(fpregs); - curr_user_ctxt.fpregs_on_heap = 1; // indicates the fpregs is on heap + // The floating point registers would be recoved just before return to user's code + match curr_user_ctxt.extra_context { + ExtraContext::Fpregs => { + // Signal raised by direct syscall + // fpregs should be stored on the heap. 
Because the ucontext_t will be freed when this function returns. And curr_user_ctxt only stores the pointer + let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) }); + curr_user_ctxt.extra_context_ptr = Box::into_raw(fpregs) as *mut u8; + } + ExtraContext::Xsave => { + // Signal raised by exception + // The xsave_area is stored at a special area reserved on kernel's stack. We can just overwrite this area with the latest user context + // Note: Currently, we only restore the fpregs instead of restoring the whole xsave area for sigreturn. Because during the + // handle path, we don't touch other advanced registers. However, in the future, if we have to touch those registers, + // we should restore the whole xsave area when sigreturn. + let latest_fpregs = unsafe { FpRegs::from_slice(&last_ucontext.fpregs) }; + let xsave_area = + unsafe { (&mut *(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) }; + xsave_area.set_fpregs_area(latest_fpregs); + } + } + Ok(()) } @@ -261,16 +276,24 @@ fn handle_signals_by_user( // Save the old sigmask ucontext.uc_sigmask = old_sigmask.to_c(); // Save the user context - ucontext.uc_mcontext.inner = *curr_user_ctxt; + ucontext.uc_mcontext.inner = curr_user_ctxt.clone(); // Save the floating point registers - if curr_user_ctxt.fpregs != ptr::null_mut() { - ucontext - .fpregs - .copy_from_slice(unsafe { curr_user_ctxt.fpregs.as_ref().unwrap().as_slice() }); - // Clear the floating point registers, since we do not need to recover is when this syscall return - curr_user_ctxt.fpregs = ptr::null_mut(); + if curr_user_ctxt.extra_context_ptr != ptr::null_mut() { + // Signal from exception handling + debug_assert!(matches!(curr_user_ctxt.extra_context, ExtraContext::Xsave)); + let fpregs_area = + unsafe { (&*(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) }.get_fpregs(); + ucontext.fpregs.copy_from_slice(fpregs_area.as_slice()); + // Clear the floating point registers, since we do not need to recover this when this syscall return + curr_user_ctxt.extra_context_ptr = ptr::null_mut(); } else { + // Raise the signal with direct syscall + debug_assert!( + matches!(curr_user_ctxt.extra_context, ExtraContext::Fpregs) + && curr_user_ctxt.extra_context_ptr == ptr::null_mut() + ); + // We need a correct fxsave structure in the buffer, // because the app may modify part of it to update the // floating point after the signal handler finished. diff --git a/src/libos/src/signal/signals/fault.rs b/src/libos/src/signal/signals/fault.rs index 85d9a0c8..8e882596 100644 --- a/src/libos/src/signal/signals/fault.rs +++ b/src/libos/src/signal/signals/fault.rs @@ -36,12 +36,12 @@ impl FaultSignal { // Page fault exception SGX_EXCEPTION_VECTOR_PF => { const PF_ERR_FLAG_PRESENT : u32 = 1u32 << 0; - let code = if info.exinfo.errcd & PF_ERR_FLAG_PRESENT != 0 { + let code = if info.exinfo.error_code & PF_ERR_FLAG_PRESENT != 0 { SEGV_ACCERR } else { SEGV_MAPERR }; - let addr = Some(info.exinfo.maddr); + let addr = Some(info.exinfo.faulting_address ); (SIGSEGV, code, addr) }, // General protection exception diff --git a/src/libos/src/syscall/mod.rs b/src/libos/src/syscall/mod.rs index 69e17df0..2cf6d593 100644 --- a/src/libos/src/syscall/mod.rs +++ b/src/libos/src/syscall/mod.rs @@ -7,7 +7,7 @@ //! 3. Preprocess the system call and then call `dispatch_syscall` (in this file) //! 4. 
Call `do_*` to process the system call (in other modules) -use aligned::{Aligned, A16}; +use aligned::{Aligned, A16, A64}; use core::arch::x86_64::{_fxrstor, _fxsave}; use std::any::Any; use std::convert::TryFrom; @@ -60,7 +60,7 @@ use crate::signal::{ do_rt_sigtimedwait, do_sigaltstack, do_tgkill, do_tkill, sigaction_t, siginfo_t, sigset_t, stack_t, }; -use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, VMPerms}; +use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, VMPerms}; use crate::{fs, process, std, vm}; use super::*; @@ -122,7 +122,7 @@ macro_rules! process_syscall_table_with_callback { (Mremap = 25) => do_mremap(old_addr: usize, old_size: usize, new_size: usize, flags: i32, new_addr: usize), (Msync = 26) => do_msync(addr: usize, size: usize, flags: u32), (Mincore = 27) => handle_unsupported(), - (Madvise = 28) => handle_unsupported(), + (Madvise = 28) => do_madvice(addr: usize, length: usize, advice: i32), (Shmget = 29) => do_shmget(key: key_t, size: size_t, shmflg: i32), (Shmat = 30) => do_shmat(shmid: i32, shmaddr: usize, shmflg: i32), (Shmctl = 31) => do_shmctl(shmid: i32, cmd: i32, buf: *mut shmids_t), @@ -424,8 +424,8 @@ macro_rules! process_syscall_table_with_callback { // Occlum-specific system calls (SpawnGlibc = 359) => do_spawn_for_glibc(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fa: *const SpawnFileActions, attribute_list: *const posix_spawnattr_t), (SpawnMusl = 360) => do_spawn_for_musl(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fdop_list: *const FdOp, attribute_list: *const posix_spawnattr_t), - (HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, fpregs: *mut FpRegs, context: *mut CpuContext), - (HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, fpregs: *mut FpRegs, context: *mut CpuContext), + (HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, context: *mut CpuContext), + (HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, context: *mut CpuContext), (MountRootFS = 363) => do_mount_rootfs(key_ptr: *const sgx_key_128bit_t, rootfs_config_ptr: *const user_rootfs_config), } }; @@ -649,12 +649,10 @@ fn do_syscall(user_context: &mut CpuContext) { syscall.args[1] = user_context as *mut _ as isize; } else if syscall_num == SyscallNum::HandleException { // syscall.args[0] == info - // syscall.args[1] == fpregs - syscall.args[2] = user_context as *mut _ as isize; + syscall.args[1] = user_context as *mut _ as isize; } else if syscall.num == SyscallNum::HandleInterrupt { // syscall.args[0] == info - // syscall.args[1] == fpregs - syscall.args[2] = user_context as *mut _ as isize; + syscall.args[1] = user_context as *mut _ as isize; } else if syscall.num == SyscallNum::Sigaltstack { // syscall.args[0] == new_ss // syscall.args[1] == old_ss @@ -751,21 +749,27 @@ fn do_sysret(user_context: &mut CpuContext) -> ! { fn do_exit_task() -> !; } if current!().status() != ThreadStatus::Exited { - // Restore the floating point registers - // Todo: Is it correct to do fxstor in kernel? 
- let fpregs = user_context.fpregs; - if (fpregs != ptr::null_mut()) { - if user_context.fpregs_on_heap == 1 { - let fpregs = unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) }; - fpregs.restore(); - } else { - unsafe { fpregs.as_ref().unwrap().restore() }; + if user_context.extra_context_ptr != ptr::null_mut() { + match user_context.extra_context { + ExtraContext::Fpregs => { + let fpregs = user_context.extra_context_ptr as *mut FpRegs; + unsafe { fpregs.as_ref().unwrap().restore() }; + // The fpregs must be allocated on heap + drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) }); + } + ExtraContext::Xsave => { + let xsave_area = user_context.extra_context_ptr; + unsafe { (&*(xsave_area as *mut XsaveArea)).restore() }; + } } + user_context.extra_context_ptr = ptr::null_mut(); } unsafe { __occlum_sysret(user_context) } // jump to user space } else { - if user_context.fpregs != ptr::null_mut() && user_context.fpregs_on_heap == 1 { - drop(unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) }); + if user_context.extra_context_ptr != ptr::null_mut() + && matches!(user_context.extra_context, ExtraContext::Fpregs) + { + drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) }); } unsafe { do_exit_task() } // exit enclave } @@ -828,6 +832,12 @@ fn do_msync(addr: usize, size: usize, flags: u32) -> Result { Ok(0) } +fn do_madvice(addr: usize, length: usize, advice: i32) -> Result { + let flags = MadviceFlags::from_i32(advice)?; + vm::do_madvice(addr, length, flags)?; + Ok(0) +} + fn do_sysinfo(info: *mut sysinfo_t) -> Result { check_mut_ptr(info)?; let info = unsafe { &mut *info }; @@ -977,7 +987,6 @@ fn handle_unsupported() -> Result { /// Floating point registers /// /// Note. The area is used to save fxsave result -//#[derive(Clone, Copy)] #[repr(C)] pub struct FpRegs { inner: Aligned, @@ -1017,6 +1026,41 @@ impl FpRegs { } } +#[derive(Debug)] +#[repr(C)] +pub struct XsaveArea { + inner: Aligned, +} + +impl XsaveArea { + // The first 512 bytes of xsave area is used for FP registers + const FXSAVE_AREA_LEN: usize = 512; + + /// Save the current CPU floating pointer states to an instance of FpRegs + pub fn save() -> Self { + let mut xsave_area = MaybeUninit::::uninit(); + unsafe { + save_xregs(xsave_area.as_mut_ptr() as *mut u8); + xsave_area.assume_init() + } + } + + /// Restore the current CPU floating pointer states from this FpRegs instance + pub fn restore(&self) { + unsafe { + restore_xregs(self.inner.as_ptr()); + } + } + + pub fn get_fpregs(&self) -> FpRegs { + unsafe { FpRegs::from_slice(&self.inner[..Self::FXSAVE_AREA_LEN]) } + } + + pub fn set_fpregs_area(&mut self, fpregs: FpRegs) { + self.inner[..Self::FXSAVE_AREA_LEN].copy_from_slice(fpregs.as_slice()) + } +} + /// Cpu context. /// /// Note. 
The definition of this struct must be kept in sync with the assembly @@ -1042,8 +1086,21 @@ pub struct CpuContext { pub rsp: u64, pub rip: u64, pub rflags: u64, - pub fpregs_on_heap: u64, - pub fpregs: *mut FpRegs, + pub extra_context: ExtraContext, + pub extra_context_ptr: *mut u8, +} + +#[repr(u64)] +#[derive(Clone, Copy, Debug)] +pub enum ExtraContext { + Fpregs = 0, + Xsave = 1, +} + +impl Default for ExtraContext { + fn default() -> Self { + Self::Fpregs + } } impl CpuContext { @@ -1067,8 +1124,8 @@ impl CpuContext { rsp: src.rsp, rip: src.rip, rflags: src.rflags, - fpregs_on_heap: 0, - fpregs: ptr::null_mut(), + extra_context: Default::default(), + extra_context_ptr: ptr::null_mut(), } } } @@ -1082,14 +1139,15 @@ impl CpuContext { // pointer that is not safe to use by external modules. In our case, the // FpRegs pointer will not be used actually. So the Rust warning is a // false alarm. We suppress it here. -pub unsafe fn exception_interrupt_syscall_c_abi( - num: u32, - info: *mut c_void, - fpregs: *mut FpRegs, -) -> u32 { +pub unsafe fn exception_interrupt_syscall_c_abi(num: u32, info: *mut c_void) -> u32 { #[allow(improper_ctypes)] extern "C" { - pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void, fpregs: *mut FpRegs) -> u32; + pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void) -> u32; } - __occlum_syscall_c_abi(num, info, fpregs) + __occlum_syscall_c_abi(num, info) +} + +extern "C" { + pub fn save_xregs(save_area: *mut u8); + pub fn restore_xregs(save_area: *const u8); } diff --git a/src/libos/src/syscall/syscall_entry_x86-64.S b/src/libos/src/syscall/syscall_entry_x86-64.S index e0e09720..a3534fdc 100644 --- a/src/libos/src/syscall/syscall_entry_x86-64.S +++ b/src/libos/src/syscall/syscall_entry_x86-64.S @@ -52,8 +52,8 @@ __occlum_syscall_linux_abi: // Save the target CPU state when `call __occlum_syscall` is returned in // a CpuContext struct. The registers are saved in the reverse order of // the fields in CpuContext. 
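    // For example, extra_context_ptr (the last field of CpuContext) is pushed first,
    // then extra_context, and only then rflags, rip and rsp.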
- pushq $0 // default fpregs is NULL - pushq $0 // default fpregs is allocated on stack + pushq $0 // default extra_context_ptr is NULL + pushq $0 // default extra_context is floating point registers pushfq push %rcx // save %rip push %r11 // save %rsp diff --git a/src/libos/src/vm/chunk.rs b/src/libos/src/vm/chunk.rs index 6e666f12..de5daea5 100644 --- a/src/libos/src/vm/chunk.rs +++ b/src/libos/src/vm/chunk.rs @@ -100,16 +100,9 @@ impl Chunk { *options.perms(), options.initializer().backed_file(), current!().process().pid(), - ); - // Initialize the memory of the new range - unsafe { - let buf = vm_range.as_slice_mut(); - options.initializer().init_slice(buf)?; - } - // Set memory permissions - if !options.perms().is_default() { - VMPerms::apply_perms(&vm_area, vm_area.perms()); - } + ) + .init_memory(options)?; + Ok(Self::new_chunk_with_vma(vm_area)) } @@ -238,6 +231,30 @@ impl Chunk { } } + pub fn handle_page_fault( + &self, + rip: usize, + pf_addr: usize, + errcd: u32, + kernel_triggers: bool, + ) -> Result<()> { + let internal = &self.internal; + match self.internal() { + ChunkType::SingleVMA(vma) => { + let mut vma = vma.lock().unwrap(); + debug_assert!(vma.contains(pf_addr)); + return vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers); + } + ChunkType::MultiVMA(internal_manager) => { + return internal_manager + .lock() + .unwrap() + .chunk_manager + .handle_page_fault(rip, pf_addr, errcd, kernel_triggers); + } + } + } + pub fn is_free_range(&self, request_range: &VMRange) -> bool { match self.internal() { ChunkType::SingleVMA(_) => false, // single-vma chunk can't be free diff --git a/src/libos/src/vm/mod.rs b/src/libos/src/vm/mod.rs index 523f5414..98a40edc 100644 --- a/src/libos/src/vm/mod.rs +++ b/src/libos/src/vm/mod.rs @@ -63,11 +63,13 @@ use std::fmt; mod chunk; mod free_space_manager; +mod page_tracker; mod process_vm; mod shm_manager; mod user_space_vm; mod vm_area; mod vm_chunk_manager; +mod vm_epc; mod vm_layout; mod vm_manager; mod vm_perms; @@ -77,9 +79,12 @@ mod vm_util; use self::vm_layout::VMLayout; pub use self::chunk::{ChunkRef, ChunkType}; -pub use self::process_vm::{MMapFlags, MRemapFlags, MSyncFlags, ProcessVM, ProcessVMBuilder}; +pub use self::process_vm::{ + MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, ProcessVM, ProcessVMBuilder, +}; pub use self::user_space_vm::USER_SPACE_VM_MANAGER; pub use self::vm_area::VMArea; +pub use self::vm_epc::enclave_page_fault_handler; pub use self::vm_manager::MunmapChunkFlag; pub use self::vm_perms::VMPerms; pub use self::vm_range::VMRange; @@ -154,4 +159,9 @@ pub fn do_msync(addr: usize, size: usize, flags: MSyncFlags) -> Result<()> { current!().vm().msync(addr, size) } +pub fn do_madvice(addr: usize, length: usize, advice: MadviceFlags) -> Result<()> { + warn!("madvice is not supported. madvice flags:{:?}", advice); + Ok(()) +} + pub const PAGE_SIZE: usize = 4096; diff --git a/src/libos/src/vm/page_tracker.rs b/src/libos/src/vm/page_tracker.rs new file mode 100644 index 00000000..2d8b4f25 --- /dev/null +++ b/src/libos/src/vm/page_tracker.rs @@ -0,0 +1,488 @@ +use super::*; + +use super::user_space_vm::USER_SPACE_VM_MANAGER; +use super::vm_util::{GB, KB, MB}; +use bitvec::vec::BitVec; +use util::sync::RwLock; +use vm_epc::EPCMemType; + +// In SGX v2, there is no upper limit for the size of EPC. If the user configure 1 TB memory, +// and we only use one bit to track if the page is committed, that's 1 TB / 4 kB / 8 bit = 32 MB of memory. 
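+// (1 TB / 4 KB is 268,435,456 pages, one bit each, which is where the 32 MB figure comes from.)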
+// And the memory footprint will keep the same size during the whole libOS life cycle. +// In order to track the commit status of a huge number of pages, use two level tracking. +// In the first level, global level, we use `PAGE_CHUNK_UNIT` as the unit size for a page chunk. +// In the second level, we just use the page size as the unit size, and use one bit to represent if the page is committed. +// For example, if the user configure 64 TB memory, when a page is committed, the second level tracker will mark the correponding bit as 1. +// And when all the pages of a whole global page chunk are fully committed, the global level tracker will mark the page chunk as fully committed. +// And the corresponding tracker can be freed. In this way, we can use just several bytes to represent the commit status of a big chunk of memory. +// In a worse case, let's say there are several discrete global page chunks which are not not fully committed at the same time. +// And each of them will take some space in the memory. Within a memory-intensive case, we can +// commit the page by hand and make the global page chunk fully committed and free the page tracker. + +// There are mainly three types of data structure to track the page status, from the top to the bottom: +// 1. PageChunkManager - Create for the whole user space. This sructure is used to manage the global paging status. +// 2. GlobalPageChunk - Denotes a chunk of pages. The actual unit of the PageChunkManager. It holds the paging status of a memory range. Stored only +// in the PageChunkManager. A newly created VMA should ask the corresponding GlobalPageChunk for the paging status. When all the pages recoreded by +// GlobalPageChunk are all committed, it will mark itself as "fully committed" and free the inner structure tracking the paging status. All the GlobalPageChunk +// records the VM ranges with the SAME size. +// 3. PageTracker - The real tracker of the paging status. Under the hood, it is a bitvec that tracks every page with a bit. There are mainly two types +// PageTracker: +// * GlobalTracker - Used by GlobalPageChunk to track the paging status. All records the VM range with the same size. +// * VMATracker - Used by VMA to track its paging status. Records different range size according to the VMA. +// Since the VM operations are mostly performed by VMA, the VMA tracker will update itself accordingly. And also update the corresponding GlobalTracker. + +lazy_static! { + pub static ref USER_SPACE_PAGE_CHUNK_MANAGER: RwLock = + RwLock::new(PageChunkManager::new(USER_SPACE_VM_MANAGER.range())); +} + +const PAGE_CHUNK_UNIT: usize = 4 * MB; +const PAGE_CHUNK_PAGE_NUM: usize = PAGE_CHUNK_UNIT / PAGE_SIZE; + +pub struct PageChunkManager { + // The total range that the manager manages. + range: VMRange, + // The page chunks + inner: HashMap, // K: Page chunk start address, V: Global page chunk +} + +impl PageChunkManager { + fn new(range: &VMRange) -> Self { + Self { + range: range.clone(), + inner: HashMap::new(), + } + } +} + +#[derive(Debug)] +// A chunk of pages. Memory space is precious. Don't put anything unnecessary. +struct GlobalPageChunk { + fully_committed: bool, + tracker: Option>>, // if this page chunk is fully committed, the tracker will be set to None. 
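+    // Note: with PAGE_CHUNK_UNIT = 4 MB, a live tracker only needs 4 MB / 4 KB = 1024 bits
+    // (128 bytes) of bitmap, and even that is dropped once the chunk is fully committed.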
+} + +impl GlobalPageChunk { + fn new(tracker: PageTracker) -> Self { + Self { + fully_committed: false, + tracker: Some(Arc::new(RwLock::new(tracker))), + } + } +} + +#[derive(PartialEq, Clone, Debug)] +enum TrackerType { + GlobalTracker, // PAGE_CHUNK_UNIT size for global management to track the global paging status + VMATracker, // various size for different vma to track its own paging status +} + +// Used for tracking the paging status of global tracker or VMA tracker +#[derive(Clone)] +pub struct PageTracker { + type_: TrackerType, + range: VMRange, + inner: BitVec, + fully_committed: bool, +} + +impl Debug for PageTracker { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("PageTracker") + .field("type", &self.type_) + .field("range", &self.range) + .field("fully committed", &self.fully_committed) + .finish() + } +} + +impl PageTracker { + // Create a new page tracker for GlobalPageChunk. + // When a new global tracker is needed, none of the pages are committed. + fn new_global_tracker(start_addr: usize) -> Result { + let range = VMRange::new_with_size(start_addr, PAGE_CHUNK_UNIT)?; + + let inner = bitvec![0; PAGE_CHUNK_PAGE_NUM]; + Ok(Self { + type_: TrackerType::GlobalTracker, + range, + inner, + fully_committed: false, + }) + } + + pub fn new_vma_tracker(vm_range: &VMRange, epc_type: &EPCMemType) -> Result { + trace!("new vma tracker, range = {:?}", vm_range); + let page_num = vm_range.size() / PAGE_SIZE; + let new_vma_tracker = match epc_type { + EPCMemType::UserRegion => { + let mut new_vma_tracker = Self { + type_: TrackerType::VMATracker, + range: vm_range.clone(), + inner: bitvec![0; page_num], + fully_committed: false, + }; + + // Skip sentry + if page_num != 0 { + new_vma_tracker.get_committed_pages_from_global_tracker()?; + } + new_vma_tracker + } + EPCMemType::Reserved => { + // For reserved memory, there is no need to udpate global page tracker. + // And there is no GLobalPageChunk for reserved memory. + Self { + type_: TrackerType::VMATracker, + range: vm_range.clone(), + inner: bitvec![1; page_num], + fully_committed: true, + } + } + _ => unreachable!(), + }; + + Ok(new_vma_tracker) + } + + pub fn range(&self) -> &VMRange { + &self.range + } + + pub fn is_fully_committed(&self) -> bool { + self.fully_committed + } + + pub fn is_reserved_only(&self) -> bool { + !self.fully_committed && self.inner.not_any() + } + + pub fn is_partially_committed(&self) -> bool { + !self.fully_committed && self.inner.any() + } + + // Get all committed or uncommitted ranges of consecutive page. + // If committed is true, get all committed ranges + // If committed is false, get all uncommitted ranges + pub fn get_ranges(&self, committed: bool) -> Vec { + if self.is_fully_committed() { + if committed { + return vec![self.range.clone()]; + } else { + return Vec::new(); + } + } + if self.is_reserved_only() { + if committed { + return Vec::new(); + } else { + return vec![self.range.clone()]; + } + } + + let tracker_start_addr = self.range.start(); + let mut ret = Vec::new(); + let mut start = None; + let mut end = None; + + for i in 0..self.inner.len() { + if self.inner[i] == committed { + match (start, end) { + // Meet committed page for the first time. Update both the start and end marker. + (None, None) => { + start = Some(i); + end = Some(i); + // Reach the end of the tracker. 
Only one page + if i == self.inner.len() - 1 { + let committed_range = VMRange::new_with_size( + tracker_start_addr + i * PAGE_SIZE, + PAGE_SIZE, + ) + .unwrap(); + ret.push(committed_range); + } + } + // Previous pages are committed. Update the end marker. + (Some(s), Some(e)) => { + end = Some(i); + // Reach the end of the tracker. + if i == self.inner.len() - 1 { + let committed_range = VMRange::new_with_size( + tracker_start_addr + s * PAGE_SIZE, + PAGE_SIZE * (i - s + 1), + ) + .unwrap(); + ret.push(committed_range); + } + } + _ => unreachable!(), + } + } else { + match (start, end) { + (None, None) => { + // No committed pages. + } + (Some(s), Some(e)) => { + // Meet the first uncommitted pages after recording all the previous committed pages. + let committed_range = VMRange::new_with_size( + tracker_start_addr + s * PAGE_SIZE, + PAGE_SIZE * (e - s + 1), + ) + .unwrap(); + ret.push(committed_range); + // Reset markers + start = None; + end = None; + } + _ => { + unreachable!() + } + } + } + } + + let total_size = ret.iter().fold(0, |a, b| a + b.size()); + if committed { + trace!("get committed ranges = {:?}", ret); + debug_assert!(total_size == self.inner.count_ones() * PAGE_SIZE); + } else { + trace!("get uncommitted ranges = {:?}", ret); + debug_assert!(total_size == self.inner.count_zeros() * PAGE_SIZE); + } + + ret + } + + pub fn split_for_new_range(&mut self, new_range: &VMRange) { + debug_assert!(self.range.is_superset_of(new_range)); + + let new_start = new_range.start(); + let page_num = new_range.size() / PAGE_SIZE; + + let split_idx = (new_start - self.range.start()) / PAGE_SIZE; + let mut new_inner = self.inner.split_off(split_idx); + new_inner.truncate(page_num); + + trace!( + "old range= {:?}, new_start = {:x}, idx = {:?}", + self.range, + new_start, + split_idx + ); + + self.inner = new_inner; + if self.inner.all() { + self.fully_committed = true; + } + + self.range = *new_range; + } + + // Commit memory for the whole current VMA (VMATracker) + pub fn commit_whole(&mut self, perms: VMPerms) -> Result<()> { + debug_assert!(self.type_ == TrackerType::VMATracker); + + if self.is_fully_committed() { + return Ok(()); + } + + // Commit EPC + if self.is_reserved_only() { + vm_epc::commit_memory(self.range().start(), self.range().size(), Some(perms)).unwrap(); + } else { + debug_assert!(self.is_partially_committed()); + let uncommitted_ranges = self.get_ranges(false); + for range in uncommitted_ranges { + vm_epc::commit_memory(range.start(), range.size(), Some(perms)).unwrap(); + } + } + + // Update the tracker + self.inner.fill(true); + self.fully_committed = true; + + self.set_committed_pages_for_global_tracker(self.range().start(), self.range().size()); + + Ok(()) + } + + // Commit memory of a specific range for the current VMA (VMATracker). The range should be verified by caller. 
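+    // An illustrative (hypothetical) caller in the #PF path could commit one COMMIT_SIZE_UNIT
+    // around the faulting address, e.g.:
+    //     let start = align_down(pf_addr, PAGE_SIZE);
+    //     let to_commit = VMRange::new_with_size(start, COMMIT_SIZE_UNIT)?;
+    //     vma_tracker.commit_range(&to_commit, Some(vma_perms))?;
+    // The real logic lives in VMArea::handle_page_fault; this is only a sketch of the intended use.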
+ pub fn commit_range(&mut self, range: &VMRange, new_perms: Option) -> Result<()> { + debug_assert!(self.type_ == TrackerType::VMATracker); + debug_assert!(self.range().is_superset_of(range)); + + vm_epc::commit_memory(range.start(), range.size(), new_perms)?; + + self.commit_pages_common(range.start(), range.size()); + self.set_committed_pages_for_global_tracker(range.start(), range.size()); + + Ok(()) + } + + pub fn commit_memory_and_init_with_file( + &mut self, + range: &VMRange, + file: &FileRef, + file_offset: usize, + new_perms: VMPerms, + ) -> Result<()> { + debug_assert!(self.type_ == TrackerType::VMATracker); + debug_assert!(self.range().is_superset_of(range)); + + vm_epc::commit_memory_and_init_with_file( + range.start(), + range.size(), + file, + file_offset, + new_perms, + )?; + + self.commit_pages_common(range.start(), range.size()); + self.set_committed_pages_for_global_tracker(range.start(), range.size()); + + Ok(()) + } + + // VMATracker get page commit status from global tracker and update itself + // This should be called when the VMATracker inits + fn get_committed_pages_from_global_tracker(&mut self) -> Result<()> { + debug_assert!(self.type_ == TrackerType::VMATracker); + let mut vma_tracker = self; + let mut page_chunk_start = get_page_chunk_start_addr(vma_tracker.range().start()); + + let range_end = vma_tracker.range().end(); + for page_chunk_addr in (page_chunk_start..range_end).step_by(PAGE_CHUNK_UNIT) { + let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap(); + if let Some(page_chunk) = manager.inner.get(&page_chunk_addr) { + if page_chunk.fully_committed { + // global page chunk fully committed. commit pages for vma page chunk + vma_tracker.commit_pages_common(page_chunk_addr, PAGE_CHUNK_UNIT); + } else { + debug_assert!(page_chunk.tracker.is_some()); + let global_tracker = page_chunk.tracker.as_ref().unwrap().read().unwrap(); + global_tracker.set_committed_pages_for_vma_tracker(vma_tracker); + } + drop(manager); + } else { + // Not tracking this page chunk. Release read lock and acquire write lock for an update. + drop(manager); + // This page chunk is not tracked by global tracker. Thus none of the pages are committed. + let page_chunk = { + let global_page_tracker = PageTracker::new_global_tracker(page_chunk_addr)?; + GlobalPageChunk::new(global_page_tracker) + }; + + // There could be data race here. But it's fine, because the ultimate state is the same. + USER_SPACE_PAGE_CHUNK_MANAGER + .write() + .unwrap() + .inner + .insert(page_chunk_addr, page_chunk); + } + } + + Ok(()) + } + + // VMAtracker helps to update global tracker based on the paging status of itself. + // This should be called whenever the VMATracker updates and needs to sync with the GlobalTracker. 
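+    // Locking note: the body first updates the per-chunk GlobalTracker under the manager's
+    // read lock, and re-acquires the write lock only when a chunk becomes fully committed so
+    // its tracker can be dropped; the window between the two locks is a benign race since
+    // every writer converges to the same final state.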
+ fn set_committed_pages_for_global_tracker(&self, commit_start_addr: usize, commit_size: usize) { + debug_assert!(self.type_ == TrackerType::VMATracker); + + let commit_end_addr = commit_start_addr + commit_size; + let page_chunk_start_addr = get_page_chunk_start_addr(commit_start_addr); + for page_chunk_addr in (page_chunk_start_addr..commit_end_addr).step_by(PAGE_CHUNK_UNIT) { + let is_global_tracker_fully_committed = { + // Find the correponding page chunk + let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap(); + let page_chunk = manager + .inner + .get(&page_chunk_addr) + .expect("this page chunk must exist"); + + // Update the global page tracker + if let Some(global_page_tracker) = &page_chunk.tracker { + let mut global_tracker = global_page_tracker.write().unwrap(); + global_tracker.commit_pages_common(commit_start_addr, commit_size); + global_tracker.fully_committed + } else { + // page_tracker is none, the page chunk is fully committed. Go to next chunk. + debug_assert!(page_chunk.fully_committed); + continue; + } + }; + + // Free the global page tracker if fully committed + if is_global_tracker_fully_committed { + // Update the global page chunk manager. Need to acquire the write lock this time. There can be data race because the lock + // could be dropped for a while before acquire again. But its fine, because the ultimate state is the same. + let mut manager = USER_SPACE_PAGE_CHUNK_MANAGER.write().unwrap(); + if let Some(mut page_chunk) = manager.inner.get_mut(&page_chunk_addr) { + page_chunk.fully_committed = true; + page_chunk.tracker = None; + } else { + warn!( + "the global page chunk with start addr: 0x{:x} has been freed already", + page_chunk_addr + ); + unreachable!(); + } + } + } + } + + // GlobalTracker helps to update VMATracker based on the paging status of itself. + // This should be called when the VMATracker inits. + fn set_committed_pages_for_vma_tracker(&self, vma_tracker: &mut PageTracker) { + debug_assert!(self.type_ == TrackerType::GlobalTracker); + debug_assert!(vma_tracker.type_ == TrackerType::VMATracker); + + let global_tracker = self; + + if let Some(intersection_range) = global_tracker.range().intersect(vma_tracker.range()) { + let vma_tracker_page_id = + (intersection_range.start() - vma_tracker.range().start()) / PAGE_SIZE; + let global_tracker_page_id = + (intersection_range.start() - global_tracker.range().start()) / PAGE_SIZE; + let page_num = intersection_range.size() / PAGE_SIZE; + + vma_tracker.inner[vma_tracker_page_id..vma_tracker_page_id + page_num] + .copy_from_bitslice( + &global_tracker.inner + [global_tracker_page_id..global_tracker_page_id + page_num], + ); + if vma_tracker.inner.all() { + vma_tracker.fully_committed = true; + } + } else { + // No intersection range, why calling this? Wierd. + unreachable!(); + } + } + + // Commit pages for page tracker itself. This is a common method for both VMATracker and GlobalTracker. 
+ fn commit_pages_common(&mut self, start_addr: usize, size: usize) { + debug_assert!(!self.fully_committed); + + if let Some(intersection_range) = { + let range = VMRange::new_with_size(start_addr, size).unwrap(); + self.range.intersect(&range) + } { + trace!("commit for page tracker: {:?}", self); + let page_start_id = (intersection_range.start() - self.range().start()) / PAGE_SIZE; + let page_num = intersection_range.size() / PAGE_SIZE; + self.inner[page_start_id..page_start_id + page_num].fill(true); + if self.inner.all() { + self.fully_committed = true; + } + } else { + // No intersect range, wierd + unreachable!(); + } + } +} + +#[inline(always)] +fn get_page_chunk_start_addr(addr: usize) -> usize { + align_down(addr, PAGE_CHUNK_UNIT) +} diff --git a/src/libos/src/vm/process_vm.rs b/src/libos/src/vm/process_vm.rs index 8823ad62..b7d32974 100644 --- a/src/libos/src/vm/process_vm.rs +++ b/src/libos/src/vm/process_vm.rs @@ -6,7 +6,8 @@ use super::vm_area::VMArea; use super::vm_manager::MunmapChunkFlag; use super::vm_perms::VMPerms; use super::vm_util::{ - FileBacked, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, VMRemapOptions, + FileBacked, PagePolicy, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, + VMRemapOptions, }; use crate::config; use crate::ipc::SHM_MANAGER; @@ -124,6 +125,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { .initializer(VMInitializer::ElfSpecific { elf_file: elf_file.file_ref().clone(), }) + // We only load loadable segments, just commit the memory when allocating. + .page_policy(PagePolicy::CommitNow) .build() .map_err(|e| { &self.handle_error_when_init(&chunks); @@ -152,6 +155,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { .size(heap_layout.size()) .align(heap_layout.align()) .perms(VMPerms::READ | VMPerms::WRITE) + .page_policy(PagePolicy::CommitOnDemand) + // .page_policy(PagePolicy::CommitNow) .build() .map_err(|e| { &self.handle_error_when_init(&chunks); @@ -171,8 +176,10 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> { let stack_layout = &other_layouts[1]; let vm_option = VMMapOptionsBuilder::default() .size(stack_layout.size()) - .align(heap_layout.align()) + .align(stack_layout.align()) .perms(VMPerms::READ | VMPerms::WRITE) + // There are cases that we can't handle when the #PF happens at user's stack. Commit the stack memory now. + .page_policy(PagePolicy::CommitNow) .build() .map_err(|e| { &self.handle_error_when_init(&chunks); @@ -537,11 +544,26 @@ impl ProcessVM { } } }; + + let page_policy = { + if flags.contains(MMapFlags::MAP_STACK) { + // With MAP_STACK, the mmaped memory will be used as user's stack. If not committed, the #PF can occurs + // when switching to user space and can't be handled correctly by us. + PagePolicy::CommitNow + } else if !flags.contains(MMapFlags::MAP_ANONYMOUS) { + // Use commit-now policy for file-backed mmap. We tried the commit-on-demand policy, but didn't get any performance gain at all. + // However, the path for file-backed mmap with commit-on-demand policy is ready. We can enable this whenever needed. 
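+            // In other words, only anonymous mappings without MAP_STACK (e.g. a plain
+            // mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0))
+            // take the CommitOnDemand branch below.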
+ PagePolicy::CommitNow + } else { + PagePolicy::CommitOnDemand + } + }; let mmap_options = VMMapOptionsBuilder::default() .size(size) .addr(addr_option) .perms(perms) .initializer(initializer) + .page_policy(page_policy) .build()?; let mmap_addr = USER_SPACE_VM_MANAGER.mmap(&mmap_options)?; Ok(mmap_addr) @@ -674,3 +696,33 @@ impl MSyncFlags { Ok(flags) } } + +#[allow(non_camel_case_types)] +#[repr(i32)] +#[derive(Debug)] +pub enum MadviceFlags { + MADV_NORMAL = 0, + MADV_RANDOM = 1, + MADV_SEQUENTIAL = 2, + MADV_WILLNEED = 3, + MADV_DONTNEED = 4, +} + +impl MadviceFlags { + pub fn from_i32(raw: i32) -> Result { + const MADV_NORMAL: i32 = 0; + const MADV_RANDOM: i32 = 1; + const MADV_SEQUENTIAL: i32 = 2; + const MADV_WILLNEED: i32 = 3; + const MADV_DONTNEED: i32 = 4; + + match raw { + MADV_NORMAL => Ok(MadviceFlags::MADV_NORMAL), + MADV_RANDOM => Ok(MadviceFlags::MADV_RANDOM), + MADV_SEQUENTIAL => Ok(MadviceFlags::MADV_SEQUENTIAL), + MADV_WILLNEED => Ok(MadviceFlags::MADV_WILLNEED), + MADV_DONTNEED => Ok(MadviceFlags::MADV_DONTNEED), + _ => return_errno!(ENOSYS, "unknown madvice flags"), + } + } +} diff --git a/src/libos/src/vm/shm_manager.rs b/src/libos/src/vm/shm_manager.rs index 6b63f8f5..5603c7b3 100644 --- a/src/libos/src/vm/shm_manager.rs +++ b/src/libos/src/vm/shm_manager.rs @@ -206,8 +206,8 @@ impl ShmManager { let old_perms = old_vma.perms(); if new_perms != old_perms { let perms = new_perms | old_perms; - VMPerms::apply_perms(new_vma.range(), perms); new_vma.set_perms(perms); + new_vma.modify_permissions_for_committed_pages(perms); } let inode_id = Self::inode_id_of(&new_vma); @@ -279,7 +279,7 @@ impl ShmManager { if perms == old_perms { return; } - VMPerms::apply_perms(vma.range(), perms); vma.set_perms(perms); + vma.modify_permissions_for_committed_pages(perms); } } diff --git a/src/libos/src/vm/user_space_vm.rs b/src/libos/src/vm/user_space_vm.rs index ac078340..1f15cfca 100644 --- a/src/libos/src/vm/user_space_vm.rs +++ b/src/libos/src/vm/user_space_vm.rs @@ -1,46 +1,50 @@ use super::*; -use super::vm_manager::VMManager; use crate::config::LIBOS_CONFIG; use crate::ctor::dtor; -use crate::ipc::SHM_MANAGER; +use crate::ipc::SYSTEM_V_SHM_MANAGER; use crate::util::pku_util; use std::ops::{Deref, DerefMut}; +use vm_epc::SGXPlatform; +use vm_manager::VMManager; +use vm_perms::VMPerms; -const RSRV_MEM_PERM: MemPerm = - MemPerm::from_bits_truncate(MemPerm::READ.bits() | MemPerm::WRITE.bits()); +const USER_SPACE_DEFAULT_MEM_PERM: VMPerms = VMPerms::DEFAULT; /// The virtual memory manager for the entire user space -pub struct UserSpaceVMManager(VMManager); +pub struct UserSpaceVMManager { + inner: VMManager, + sgx_platform: SGXPlatform, +} impl UserSpaceVMManager { fn new() -> Result { - let rsrv_mem_size = LIBOS_CONFIG.resource_limits.user_space_size; - let vm_range = unsafe { - // TODO: Current sgx_alloc_rsrv_mem implementation will commit all the pages of the desired size, which will consume - // a lot of time. When EDMM is supported, there is no need to commit all the pages at the initialization stage. A function - // which reserves memory but not commit pages should be provided then. 
- let ptr = sgx_alloc_rsrv_mem(rsrv_mem_size); - if ptr.is_null() { - return_errno!(ENOMEM, "run out of reserved memory"); - } + let sgx_platform = SGXPlatform::new(); + let init_size = LIBOS_CONFIG.resource_limits.user_space_init_size; + let max_size = LIBOS_CONFIG.resource_limits.user_space_max_size; - // Without EDMM support and the ReservedMemExecutable is set to 1, the reserved memory will be RWX. And we can't change the reserved memory permission. - // With EDMM support, the reserved memory permission is RW by default. And we can change the permissions when needed. + let (userspace_vm_range, gap_range) = sgx_platform.alloc_user_space(init_size, max_size)?; - let addr = ptr as usize; - debug!( - "allocated rsrv addr is 0x{:x}, len is 0x{:x}", - addr, rsrv_mem_size - ); - pku_util::pkey_mprotect_userspace_mem(addr, rsrv_mem_size, RSRV_MEM_PERM.bits()); - VMRange::new(addr, addr + rsrv_mem_size)? - }; + info!( + "user space allocated, range = {:?}, gap_range = {:?}", + userspace_vm_range, gap_range + ); - let vm_manager = VMManager::init(vm_range)?; + // Use pkey_mprotect to set the whole userspace to R/W permissions. If user specifies a new + // permission, the mprotect ocall will update the permission. + pku_util::pkey_mprotect_userspace_mem( + &userspace_vm_range, + gap_range.as_ref(), + USER_SPACE_DEFAULT_MEM_PERM, + ); - Ok(UserSpaceVMManager(vm_manager)) + let vm_manager = VMManager::init(userspace_vm_range, gap_range)?; + + Ok(Self { + inner: vm_manager, + sgx_platform, + }) } pub fn get_total_size(&self) -> usize { @@ -52,51 +56,34 @@ impl UserSpaceVMManager { // be called after the main function. Static variables are still safe to visit at this time. #[dtor] fn free_user_space() { - SHM_MANAGER.clean_when_libos_exit(); - let range = USER_SPACE_VM_MANAGER.range(); + info!("free user space at the end"); + SYSTEM_V_SHM_MANAGER.clean_when_libos_exit(); + let total_user_space_range = USER_SPACE_VM_MANAGER.range(); + let gap_range = USER_SPACE_VM_MANAGER.gap_range(); assert!(USER_SPACE_VM_MANAGER.verified_clean_when_exit()); - let addr = range.start(); - let size = range.size(); - info!("free user space VM: {:?}", range); - pku_util::clear_pku_when_libos_exit(addr, size, RSRV_MEM_PERM.bits()); - assert!(unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) == 0 }); + let addr = total_user_space_range.start(); + let size = total_user_space_range.size(); + info!("free user space VM: {:?}", total_user_space_range); + + pku_util::clear_pku_when_libos_exit( + total_user_space_range, + gap_range.as_ref(), + USER_SPACE_DEFAULT_MEM_PERM, + ); + + USER_SPACE_VM_MANAGER + .sgx_platform + .free_user_space(total_user_space_range, gap_range.as_ref()); } impl Deref for UserSpaceVMManager { type Target = VMManager; fn deref(&self) -> &Self::Target { - &self.0 + &self.inner } } lazy_static! { pub static ref USER_SPACE_VM_MANAGER: UserSpaceVMManager = UserSpaceVMManager::new().unwrap(); } - -bitflags! { - struct MemPerm: i32 { - const READ = 1; - const WRITE = 2; - const EXEC = 4; - } -} - -extern "C" { - // Allocate a range of EPC memory from the reserved memory area with RW permission - // - // Parameters: - // Inputs: length [in]: Size of region to be allocated in bytes. Page aligned - // Return: Starting address of the new allocated memory area on success; otherwise NULL - // - fn sgx_alloc_rsrv_mem(length: usize) -> *const c_void; - - // Free a range of EPC memory from the reserved memory area - // - // Parameters: - // Inputs: addr[in]: Starting address of region to be freed. Page aligned. 
- // length[in]: The length of the memory to be freed in bytes. Page aligned - // Return: 0 on success; otherwise -1 - // - fn sgx_free_rsrv_mem(addr: *const c_void, length: usize) -> i32; -} diff --git a/src/libos/src/vm/vm_area.rs b/src/libos/src/vm/vm_area.rs index f91ab033..98435d6e 100644 --- a/src/libos/src/vm/vm_area.rs +++ b/src/libos/src/vm/vm_area.rs @@ -1,19 +1,28 @@ use super::*; +use super::page_tracker::PageTracker; +use super::vm_epc::EPCMemType; use super::vm_perms::VMPerms; use super::vm_range::VMRange; -use super::vm_util::FileBacked; - +use super::vm_util::{FileBacked, PagePolicy, VMInitializer, VMMapOptions, GB, KB, MB}; use intrusive_collections::rbtree::{Link, RBTree}; use intrusive_collections::{intrusive_adapter, KeyAdapter}; use std::ops::{Deref, DerefMut}; -#[derive(Clone, Debug, Default)] +// Commit memory size unit when the #PF occurs. +const COMMIT_SIZE_UNIT: usize = 4 * KB; +// Commit the whole VMA when this threshold reaches. +const PF_NUM_THRESHOLD: u64 = 3; + +#[derive(Clone, Debug)] pub struct VMArea { range: VMRange, perms: VMPerms, file_backed: Option, access: VMAccess, + pages: Option, // Track the paging status of this VMA + epc_type: EPCMemType, // Track the type of the EPC to use specific APIs + pf_count: u64, } #[derive(Clone, Debug, Eq, PartialEq)] @@ -32,11 +41,47 @@ impl VMArea { file_backed: Option, pid: pid_t, ) -> Self { - Self { + let epc_type = EPCMemType::new(&range); + let pages = { + match epc_type { + EPCMemType::Reserved => None, + EPCMemType::UserRegion => { + let pages = + PageTracker::new_vma_tracker(&range, &EPCMemType::UserRegion).unwrap(); + (!pages.is_fully_committed()).then_some(pages) + } + } + }; + + let new_vma = Self { range, perms, file_backed, access: VMAccess::Private(pid), + pages, + epc_type, + pf_count: 0, + }; + trace!("new vma = {:?}", new_vma); + new_vma + } + + fn new_with_page_tracker( + range: VMRange, + perms: VMPerms, + file_backed: Option, + access: VMAccess, + pages: Option, + ) -> VMArea { + let epc_type = EPCMemType::new(&range); + Self { + range, + perms, + file_backed, + access, + pages, + epc_type, + pf_count: 0, } } @@ -49,30 +94,41 @@ impl VMArea { access: VMAccess, ) -> Self { debug_assert!(vma.is_superset_of(&new_range)); - let new_backed_file = vma.file_backed.as_ref().map(|file| { + + let new_backed_file = if let Some(file) = &vma.file_backed { let mut new_file = file.clone(); let file_offset = file.offset(); - let new_file_offset = if vma.start() < new_range.start() { - let vma_offset = new_range.start() - vma.start(); - file_offset + vma_offset - } else { - let vma_offset = vma.start() - new_range.start(); - debug_assert!(file_offset >= vma_offset); - file_offset - vma_offset - }; + debug_assert!(vma.start() <= new_range.start()); + let new_start_offset = new_range.start() - vma.start(); + let new_file_offset = file_offset + new_start_offset; new_file.set_offset(new_file_offset); + Some(new_file) + } else { + None + }; - new_file - }); + let new_pages = { + let mut new_pages = vma.pages.clone(); - Self { - range: new_range, - perms: new_perms, - file_backed: new_backed_file, - access, - } + if let Some(pages) = &mut new_pages { + pages.split_for_new_range(&new_range); + if pages.is_fully_committed() { + None + } else { + new_pages + } + } else { + None + } + }; + + let new_vma = + Self::new_with_page_tracker(new_range, new_perms, new_backed_file, access, new_pages); + + trace!("inherits vma: {:?}, create new vma: {:?}", vma, new_vma); + new_vma } pub fn perms(&self) -> VMPerms { @@ -87,6 +143,13 
@@ impl VMArea { &self.access } + pub fn get_private_pid(&self) -> Option { + match &self.access { + VMAccess::Private(pid) => Some(*pid), + VMAccess::Shared(_) => None, + } + } + pub fn belong_to(&self, target_pid: pid_t) -> bool { match &self.access { VMAccess::Private(pid) => *pid == target_pid, @@ -105,9 +168,199 @@ impl VMArea { } } - pub fn init_file(&self) -> Option<(&FileRef, usize)> { + fn pages(&self) -> &PageTracker { + debug_assert!(!self.is_fully_committed()); + self.pages.as_ref().unwrap() + } + + fn pages_mut(&mut self) -> &mut PageTracker { + debug_assert!(!self.is_fully_committed()); + self.pages.as_mut().unwrap() + } + + // Get pid for private VMA + pub fn pid(&self) -> pid_t { + match self.access { + VMAccess::Private(pid) => pid, + VMAccess::Shared(_) => unreachable!(), + } + } + + pub fn is_reserved_only(&self) -> bool { + if let Some(pages) = &self.pages { + return pages.is_reserved_only(); + } else { + false + } + } + + pub fn is_fully_committed(&self) -> bool { + self.pages.is_none() + } + + pub fn is_partially_committed(&self) -> bool { + if let Some(pages) = &self.pages { + return pages.is_partially_committed(); + } else { + false + } + } + + pub fn init_memory(mut self, options: &VMMapOptions) -> Result { + let mut vm_area = self; + let page_policy = options.page_policy(); + + // Commit pages if needed + if !vm_area.is_fully_committed() && page_policy == &PagePolicy::CommitNow { + vm_area.pages_mut().commit_whole(VMPerms::DEFAULT)?; + vm_area.pages = None; + } + + // Initialize committed memory + if vm_area.is_partially_committed() { + let committed = true; + for range in vm_area.pages().get_ranges(committed) { + vm_area.init_memory_internal(&range, Some(options.initializer()))?; + } + } else if vm_area.is_fully_committed() { + // Initialize the memory of the new range + unsafe { + let buf = vm_area.range().as_slice_mut(); + options.initializer().init_slice(buf)?; + } + + // Set memory permissions + if !options.perms().is_default() { + vm_area.modify_protection_force(None, vm_area.perms()); + } + } + // Do nothing if this vma has no committed memory + + Ok(vm_area) + } + + pub fn flush_and_clean_memory(&self) -> Result<()> { + let (need_flush, file, file_offset) = match self.writeback_file() { + None => (false, None, None), + Some((file_handle, offset)) => { + if !file_handle.access_mode().unwrap().writable() { + (false, None, None) + } else { + (true, Some(file_handle), Some(offset)) + } + } + }; + + if self.is_fully_committed() { + self.flush_and_clean_internal(self.range(), need_flush, file, file_offset); + } else { + let committed = true; + for range in self.pages().get_ranges(committed) { + self.flush_and_clean_internal(&range, need_flush, file, file_offset); + } + } + + Ok(()) + } + + fn flush_and_clean_internal( + &self, + target_range: &VMRange, + need_flush: bool, + file: Option<&FileRef>, + file_offset: Option, + ) { + trace!("flush and clean committed range: {:?}", target_range); + debug_assert!(self.range().is_superset_of(target_range)); + let buf = unsafe { target_range.as_slice_mut() }; + if !self.perms().is_default() { + self.modify_protection_force(Some(&target_range), VMPerms::default()); + } + + if need_flush { + let file_offset = file_offset.unwrap() + (target_range.start() - self.range.start()); + file.unwrap().write_at(file_offset, buf); + } + + // reset zeros + unsafe { + buf.iter_mut().for_each(|b| *b = 0); + } + } + + pub fn modify_permissions_for_committed_pages(&self, new_perms: VMPerms) { + if self.is_fully_committed() { + 
self.modify_protection_force(None, new_perms); + } else if self.is_partially_committed() { + let committed = true; + for range in self.pages().get_ranges(committed) { + self.modify_protection_force(Some(&range), new_perms); + } + } + } + + pub fn handle_page_fault( + &mut self, + rip: usize, + pf_addr: usize, + errcd: u32, + kernel_triggers: bool, + ) -> Result<()> { + trace!("PF vma = {:?}", self); + if (self.perms() == VMPerms::NONE) + || (crate::exception::check_rw_bit(errcd) == false + && !self.perms().contains(VMPerms::READ)) + { + return_errno!( + EACCES, + "Page is set to None permission. This is user-intended" + ); + } + + if crate::exception::check_rw_bit(errcd) && !self.perms().contains(VMPerms::WRITE) { + return_errno!( + EACCES, "Page is set to not contain WRITE permission but this PF is triggered by write. This is user-intended" + ) + } + + if rip == pf_addr && !self.perms().contains(VMPerms::EXEC) { + return_errno!( + EACCES, "Page is set to not contain EXEC permission but this PF is triggered by execution. This is user-intended" + ) + } + + if self.is_fully_committed() { + // This vma has been commited by other threads already. Just return. + info!("This vma has been committed by other threads already."); + return Ok(()); + } + + if matches!(self.epc_type, EPCMemType::Reserved) { + return_errno!(EINVAL, "reserved memory shouldn't trigger PF"); + } + + if kernel_triggers || self.pf_count >= PF_NUM_THRESHOLD { + return self.commit_current_vma_whole(); + } + + self.pf_count += 1; + // The return commit_size can be 0 when other threads already commit the PF-containing range but the vma is not fully committed yet. + let commit_size = self.commit_once_for_page_fault(pf_addr).unwrap(); + + trace!("page fault commit memory size = {:?}", commit_size); + + if commit_size == 0 { + warn!("This PF has been handled by other threads already."); + } + + info!("page fault handle success"); + + Ok(()) + } + + pub fn backed_file(&self) -> Option<(&FileRef, usize)> { if let Some(file) = &self.file_backed { - Some(file.init_file()) + Some(file.backed_file()) } else { None } @@ -147,36 +400,51 @@ impl VMArea { Some(new_vma) } - pub fn resize(&mut self, new_size: usize) { - self.range.resize(new_size) - } - pub fn set_start(&mut self, new_start: usize) { let old_start = self.start(); + if new_start == old_start { + return; + } + self.range.set_start(new_start); - if let Some(file) = self.file_backed.as_mut() { - if !file.need_write_back() { - return; + if new_start < old_start { + // Extend this VMA + let pages = { + let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap(); + (!pages.is_fully_committed()).then_some(pages) + }; + self.pages = pages; + } else { + // Split this VMA + debug_assert!(new_start > old_start); + if let Some(pages) = &mut self.pages { + pages.split_for_new_range(&self.range); + if pages.is_fully_committed() { + self.pages = None; + } } + } + + if let Some(file) = self.file_backed.as_mut() { // If the updates to the VMA needs to write back to a file, then the // file offset must be adjusted according to the new start address. 
- let offset = file.offset(); - if old_start < new_start { - file.set_offset(offset + (new_start - old_start)); - } else { - // The caller must guarantee that the new start makes sense - debug_assert!(offset >= old_start - new_start); - file.set_offset(offset - (old_start - new_start)); - } + Self::set_file_offset(file, new_start, old_start); + } + } + + fn set_file_offset(file: &mut FileBacked, new_start_offset: usize, old_start_offset: usize) { + let offset = file.offset(); + if old_start_offset < new_start_offset { + file.set_offset(offset + (new_start_offset - old_start_offset)); + } else { + // The caller must guarantee that the new start makes sense + debug_assert!(offset >= old_start_offset - new_start_offset); + file.set_offset(offset - (old_start_offset - new_start_offset)); } } pub fn is_the_same_to(&self, other: &VMArea) -> bool { - if self.access() != other.access() { - return false; - } - if self.range() != other.range() { return false; } @@ -185,6 +453,10 @@ impl VMArea { return false; } + if self.access() != other.access() { + return false; + } + let self_writeback_file = self.writeback_file(); let other_writeback_file = other.writeback_file(); match (self_writeback_file, other_writeback_file) { @@ -199,6 +471,13 @@ impl VMArea { pub fn set_end(&mut self, new_end: usize) { self.range.set_end(new_end); + let pages = if self.range.size() > 0 { + let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap(); + (!pages.is_fully_committed()).then_some(pages) + } else { + None + }; + self.pages = pages; } pub fn can_merge_vmas(left: &VMArea, right: &VMArea) -> bool { @@ -208,10 +487,6 @@ impl VMArea { if left.size() == 0 || right.size() == 0 { return false; } - // The two VMAs must be owned by the same process - if left.access() != right.access() { - return false; - } // The two VMAs must border with each other if left.end() != right.start() { return false; @@ -220,6 +495,15 @@ impl VMArea { if left.perms() != right.perms() { return false; } + // The two VMAs must be owned by the same process privately + // Return false if (either is none) or (both are some but two private pids are different) + let private_access = left.get_private_pid().zip(right.get_private_pid()); + if private_access.is_none() { + return false; + } + if private_access.is_some_and(|(left_pid, right_pid)| left_pid != right_pid) { + return false; + } // If the two VMAs have write-back files, the files must be the same and // the two file regions must be continuous. @@ -238,12 +522,12 @@ impl VMArea { } /// Flush a file-backed VMA to its file. This has no effect on anonymous VMA. - pub fn flush_backed_file(&self) { - self.flush_backed_file_with_cond(|_| true) + pub fn flush_committed_backed_file(&self) { + self.flush_committed_backed_file_with_cond(|_| true) } - /// Same as `flush_backed_file()`, except that an extra condition on the file needs to satisfy. - pub fn flush_backed_file_with_cond bool>(&self, cond_fn: F) { + /// Same as `flush_committed_backed_file()`, except that an extra condition on the file needs to satisfy. 
+ pub fn flush_committed_backed_file_with_cond bool>(&self, cond_fn: F) { let (file, file_offset) = match self.writeback_file() { None => return, Some((file_and_offset)) => file_and_offset, @@ -258,7 +542,16 @@ impl VMArea { if !cond_fn(file) { return; } - file.write_at(file_offset, unsafe { self.as_slice() }); + if self.is_fully_committed() { + file.write_at(file_offset, unsafe { self.as_slice() }); + } else { + let committed = true; + let vm_range_start = self.range().start(); + for range in self.pages().get_ranges(committed) { + let file_offset = file_offset + (range.start() - vm_range_start); + file.write_at(file_offset, unsafe { range.as_slice() }); + } + } } pub fn is_shared(&self) -> bool { @@ -310,6 +603,198 @@ impl VMArea { pub fn inherits_access_from(&mut self, vma: &VMArea) { self.access = vma.access().clone() } + + // Current implementation with "unwrap()" can help us find the error quickly by panicing directly. Also, restoring VM state + // when this function fails will require some work and is not that simple. + // TODO: Return with Result instead of "unwrap()"" in this function. + fn modify_protection_force(&self, protect_range: Option<&VMRange>, new_perms: VMPerms) { + let protect_range = protect_range.unwrap_or_else(|| self.range()); + + self.epc_type + .modify_protection(protect_range.start(), protect_range.size(), new_perms) + .unwrap() + } + + // With initializer, the memory should be committed already. + // Without initializer, the memory need to be committed and initialized. + fn init_memory_internal( + &mut self, + target_range: &VMRange, + initializer: Option<&VMInitializer>, + ) -> Result<()> { + debug_assert!(self.range().is_superset_of(target_range)); + trace!("init range = {:?}", target_range); + let perms = self.perms(); + if let Some(initializer) = initializer { + match initializer { + VMInitializer::FileBacked { file } => { + let (file, offset) = file.backed_file(); + let vma_range_start = self.range.start(); + + let init_file_offset = offset + (target_range.start() - vma_range_start); + + self.init_file_backed_mem(target_range, &file, init_file_offset, perms)?; + } + VMInitializer::DoNothing() => { + if !self.perms().is_default() { + self.modify_protection_force(Some(target_range), perms); + } + } + VMInitializer::FillZeros() => { + unsafe { + let buf = target_range.as_slice_mut(); + buf.iter_mut().for_each(|b| *b = 0); + } + if !perms.is_default() { + self.modify_protection_force(Some(target_range), perms); + } + } + _ => todo!(), + } + } else { + // No initializer, #PF triggered. 
+ let init_file = self + .backed_file() + .map(|(file, offset)| (file.clone(), offset)); + if let Some((file, offset)) = init_file { + let vma_range_start = self.range.start(); + + let init_file_offset = offset + (target_range.start() - vma_range_start); + + self.pages + .as_mut() + .unwrap() + .commit_memory_and_init_with_file( + target_range, + &file, + init_file_offset, + perms, + )?; + } else { + // PF triggered, no file-backed memory, just modify protection + self.pages + .as_mut() + .unwrap() + .commit_range(target_range, Some(perms))?; + } + } + + Ok(()) + } + + fn init_file_backed_mem( + &mut self, + target_range: &VMRange, + file: &FileRef, + file_offset: usize, + new_perm: VMPerms, + ) -> Result<()> { + if !file.access_mode().unwrap().readable() { + return_errno!(EBADF, "file is not readable"); + } + + let buf = unsafe { target_range.as_slice_mut() }; + let file_size = file.metadata().unwrap().size; + + let len = file + .read_at(file_offset, buf) + .map_err(|_| errno!(EACCES, "failed to init memory from file"))?; + + if !new_perm.is_default() { + self.modify_protection_force(Some(target_range), new_perm); + } + + Ok(()) + } + + fn get_commit_once_size(&self) -> usize { + COMMIT_SIZE_UNIT + } + + fn commit_once_for_page_fault(&mut self, pf_addr: usize) -> Result { + debug_assert!(!self.is_fully_committed()); + let mut early_return = false; + let mut total_commit_size = 0; + let vma_range_start = self.range.start(); + let permission = self.perms(); + let committed = false; + let mut uncommitted_ranges = self.pages().get_ranges(committed); + let commit_once_size = self.get_commit_once_size(); + + for range in uncommitted_ranges + .iter_mut() + .skip_while(|range| !range.contains(pf_addr)) + { + // Skip until first reach the range which contains the pf_addr + if total_commit_size == 0 { + debug_assert!(range.contains(pf_addr)); + range.set_start(align_down(pf_addr, PAGE_SIZE)); + range.resize(std::cmp::min(range.size(), commit_once_size)); + } else if range.size() + total_commit_size > commit_once_size { + // This is not first time commit. Try to commit until reaching the commit_once_size + range.resize(commit_once_size - total_commit_size); + } + + // We don't take care the file-backed memory here + debug_assert!(self.backed_file().is_none()); + self.init_memory_internal(&range, None)?; + + total_commit_size += range.size(); + if total_commit_size >= commit_once_size { + break; + } + } + + if self.pages().is_fully_committed() { + trace!("vma is fully committed"); + self.pages = None; + } + + Ok(total_commit_size) + } + + // Only used to handle PF triggered by the kernel + fn commit_current_vma_whole(&mut self) -> Result<()> { + debug_assert!(!self.is_fully_committed()); + debug_assert!(self.backed_file().is_none()); + + let mut uncommitted_ranges = self.pages.as_ref().unwrap().get_ranges(false); + for range in uncommitted_ranges { + self.init_memory_internal(&range, None).unwrap(); + } + self.pages = None; + + Ok(()) + } + + // TODO: We can re-enable this when we support lazy extend permissions. + #[allow(dead_code)] + fn page_fault_handler_extend_permission(&mut self, pf_addr: usize) -> Result<()> { + let permission = self.perms(); + + // This is intended by the application. 
+ if permission == VMPerms::NONE { + return_errno!(EPERM, "trying to access PROT_NONE memory"); + } + + if self.is_fully_committed() { + self.modify_protection_force(None, permission); + return Ok(()); + } + + let committed = true; + let committed_ranges = self.pages().get_ranges(committed); + for range in committed_ranges.iter() { + if !range.contains(pf_addr) { + continue; + } + + self.epc_type + .modify_protection(range.start(), range.size(), permission)?; + } + + Ok(()) + } } impl Deref for VMArea { diff --git a/src/libos/src/vm/vm_chunk_manager.rs b/src/libos/src/vm/vm_chunk_manager.rs index 00d8356d..a6a8db60 100644 --- a/src/libos/src/vm/vm_chunk_manager.rs +++ b/src/libos/src/vm/vm_chunk_manager.rs @@ -83,16 +83,7 @@ impl ChunkManager { continue; } - vma.flush_backed_file(); - - if !vma.perms().is_default() { - VMPerms::apply_perms(vma, VMPerms::default()); - } - - unsafe { - let buf = vma.as_slice_mut(); - buf.iter_mut().for_each(|b| *b = 0) - } + vma.flush_and_clean_memory().unwrap(); self.free_manager.add_range_back_to_free_manager(vma); self.free_size += vma.size(); @@ -110,6 +101,7 @@ impl ChunkManager { if let VMMapAddr::Force(addr) = addr { self.munmap(addr, size)?; } + trace!("mmap options = {:?}", options); // Find and allocate a new range for this mmap request let new_range = self @@ -117,27 +109,29 @@ impl ChunkManager { .find_free_range_internal(size, align, addr)?; let new_addr = new_range.start(); let current_pid = current!().process().pid(); - let new_vma = VMArea::new( - new_range, - *options.perms(), - options.initializer().backed_file(), - current_pid, - ); + let new_vma = { + let new_vma = VMArea::new( + new_range, + *options.perms(), + options.initializer().backed_file(), + current_pid, + ) + .init_memory(options); - // Initialize the memory of the new range - let buf = unsafe { new_vma.as_slice_mut() }; - let ret = options.initializer().init_slice(buf); - if let Err(e) = ret { - // Return the free range before return with error - self.free_manager - .add_range_back_to_free_manager(new_vma.range()); - return_errno!(e.errno(), "failed to mmap"); - } + if new_vma.is_err() { + let error = new_vma.err().unwrap(); + error!("init memory failure: {}", error.backtrace()); + let range = VMRange::new_with_size(new_addr, size).unwrap(); + self.free_manager + .add_range_back_to_free_manager(&range) + .unwrap(); + return Err(error); + } + + new_vma.unwrap() + }; + trace!("new vma is ready"); - // Set memory permissions - if !options.perms().is_default() { - VMPerms::apply_perms(&new_vma, new_vma.perms()); - } self.free_size -= new_vma.size(); // After initializing, we can safely insert the new VMA self.vmas.insert(VMAObj::new_vma_obj(new_vma)); @@ -168,11 +162,7 @@ impl ChunkManager { Some(intersection_vma) => intersection_vma, }; - // File-backed VMA needs to be flushed upon munmap - intersection_vma.flush_backed_file(); - if !&intersection_vma.perms().is_default() { - VMPerms::apply_perms(&intersection_vma, VMPerms::default()); - } + intersection_vma.flush_and_clean_memory()?; if vma.range() == intersection_vma.range() { // Exact match. Just remove. 
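// [Editor's aside: illustrative sketch, not part of this patch] The page-fault escalation in
// VMArea::handle_page_fault above reduces to a small decision rule: kernel-triggered faults and
// VMAs that have already faulted PF_NUM_THRESHOLD times are committed in full; otherwise only a
// COMMIT_SIZE_UNIT-sized window around the faulting address is committed. A distilled version of
// that rule (the names below are hypothetical):
enum CommitPlan {
    WholeVma,                        // corresponds to commit_current_vma_whole()
    WindowAround { pf_addr: usize }, // corresponds to commit_once_for_page_fault(pf_addr)
}

fn plan_commit(kernel_triggered: bool, pf_count: u64, pf_addr: usize) -> CommitPlan {
    const PF_NUM_THRESHOLD: u64 = 3; // same threshold as in vm_area.rs
    if kernel_triggered || pf_count >= PF_NUM_THRESHOLD {
        CommitPlan::WholeVma
    } else {
        CommitPlan::WindowAround { pf_addr }
    }
}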
@@ -194,13 +184,6 @@ impl ChunkManager { } } - // Reset zero - unsafe { - trace!("intersection vma = {:?}", intersection_vma); - let buf = intersection_vma.as_slice_mut(); - buf.iter_mut().for_each(|b| *b = 0) - } - self.free_manager .add_range_back_to_free_manager(intersection_vma.range()); self.free_size += intersection_vma.size(); @@ -306,8 +289,7 @@ impl ChunkManager { if intersection_vma.range() == containing_vma.range() { // The whole containing_vma is mprotected containing_vma.set_perms(new_perms); - VMPerms::apply_perms(&containing_vma, containing_vma.perms()); - trace!("containing_vma = {:?}", containing_vma); + containing_vma.modify_permissions_for_committed_pages(containing_vma.perms()); containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma)); containing_vmas.move_next(); continue; @@ -325,13 +307,13 @@ impl ChunkManager { let protect_end = protect_range.end(); // New VMA - let new_vma = VMArea::inherits_file_from( + let mut new_vma = VMArea::inherits_file_from( &containing_vma, protect_range, new_perms, VMAccess::Private(current_pid), ); - VMPerms::apply_perms(&new_vma, new_vma.perms()); + new_vma.modify_permissions_for_committed_pages(new_vma.perms()); let new_vma = VMAObj::new_vma_obj(new_vma); // Another new VMA @@ -356,15 +338,16 @@ impl ChunkManager { break; } 1 => { - let remain_vma = remain_vmas.pop().unwrap(); + let mut remain_vma = remain_vmas.pop().unwrap(); - let new_vma = VMArea::inherits_file_from( + let mut new_vma = VMArea::inherits_file_from( &containing_vma, intersection_vma.range().clone(), new_perms, VMAccess::Private(current_pid), ); - VMPerms::apply_perms(&new_vma, new_vma.perms()); + + new_vma.modify_permissions_for_committed_pages(new_vma.perms()); if remain_vma.start() == containing_vma.start() { // mprotect right side of the vma @@ -374,6 +357,7 @@ impl ChunkManager { debug_assert!(remain_vma.end() == containing_vma.end()); containing_vma.set_start(remain_vma.start()); } + debug_assert!(containing_vma.range() == remain_vma.range()); containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma)); containing_vmas.insert(VMAObj::new_vma_obj(new_vma)); @@ -401,7 +385,7 @@ impl ChunkManager { None => continue, Some(vma) => vma, }; - vma.flush_backed_file(); + vma.flush_committed_backed_file(); } Ok(()) } @@ -409,9 +393,11 @@ impl ChunkManager { /// Sync all shared, file-backed memory mappings of the given file by flushing /// the memory content to the file. 
pub fn msync_by_file(&mut self, sync_file: &FileRef) { + let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) }; for vma_obj in &self.vmas { - let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) }; - vma_obj.vma().flush_backed_file_with_cond(is_same_file); + vma_obj + .vma() + .flush_committed_backed_file_with_cond(is_same_file); } } @@ -428,6 +414,34 @@ impl ChunkManager { return Ok(vma.range().clone()); } + pub fn handle_page_fault( + &mut self, + rip: usize, + pf_addr: usize, + errcd: u32, + kernel_triggers: bool, + ) -> Result<()> { + trace!( + "handle_page_fault chunk manager range = {:?}, free_size = {:?}", + self.range, + self.free_size + ); + let mut vma_cursor = self.vmas.upper_bound_mut(Bound::Included(&pf_addr)); + if vma_cursor.is_null() { + return_errno!(ENOMEM, "no mmap regions that contains the address"); + } + let vma = vma_cursor.get().unwrap().vma(); + if vma.pid() != current!().process().pid() || !vma.contains(pf_addr) { + return_errno!(ENOMEM, "no mmap regions that contains the address"); + } + + let mut vma = vma.clone(); + vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers)?; + vma_cursor.replace_with(VMAObj::new_vma_obj(vma)); + + Ok(()) + } + pub fn usage_percentage(&self) -> f32 { let total_size = self.range.size(); let mut used_size = 0; @@ -487,6 +501,7 @@ impl VMRemapParser for ChunkManager { impl Drop for ChunkManager { fn drop(&mut self) { + info!("drop chunk manager = {:?}", self); assert!(self.is_empty()); assert!(self.free_size == self.range.size()); assert!(self.free_manager.free_size() == self.range.size()); diff --git a/src/libos/src/vm/vm_epc.rs b/src/libos/src/vm/vm_epc.rs new file mode 100644 index 00000000..54f23e99 --- /dev/null +++ b/src/libos/src/vm/vm_epc.rs @@ -0,0 +1,405 @@ +// This file contains EPC related APIs and definitions. +use super::*; +use sgx_trts::emm::{ + AllocAddr, AllocFlags, AllocOptions, EmmAlloc, HandleResult, PageFaultHandler, Perm, +}; +use sgx_trts::enclave::rsgx_is_supported_EDMM; +use std::ptr::NonNull; + +// Memory Layout for Platforms with EDMM support +// +// Addr low -> high +// |---------------------------------------------||---------------------||--------------------------------------| +// Reserved Memory Gap Range User Region Memory +// (commit memory when loading the enclave) (used by SDK) (commit on demand when PF occurs) +// +// For platforms without EDMM support, we only use reserved memory. + +pub enum SGXPlatform { + WithEDMM, + NoEDMM, +} + +#[derive(Clone)] +pub enum EPCMemType { + Reserved, + UserRegion, +} + +pub struct ReservedMem; +pub struct UserRegionMem; + +#[repr(C, align(4096))] +#[derive(Clone)] +struct ZeroPage([u8; PAGE_SIZE]); + +impl ZeroPage { + fn new() -> Self { + Self([0; PAGE_SIZE]) + } + + fn new_page_aligned_vec(size: usize) -> Vec { + debug_assert!(size % PAGE_SIZE == 0); + let page_num = size / PAGE_SIZE; + let mut page_vec = vec![Self::new(); page_num]; + + let ptr = page_vec.as_mut_ptr(); + + let size = page_num * std::mem::size_of::(); + std::mem::forget(page_vec); + + unsafe { Vec::from_raw_parts(ptr as *mut u8, size, size) } + } +} + +lazy_static! 
{
+    static ref ZERO_PAGE: Vec<u8> = ZeroPage::new_page_aligned_vec(PAGE_SIZE);
+}
+
+pub trait EPCAllocator {
+    fn alloc(size: usize) -> Result<usize> {
+        return_errno!(ENOSYS, "operation not supported");
+    }
+
+    fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
+        return_errno!(ENOSYS, "operation not supported");
+    }
+
+    fn free(addr: usize, size: usize) -> Result<()> {
+        return_errno!(ENOSYS, "operation not supported");
+    }
+
+    fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
+        return_errno!(ENOSYS, "operation not supported");
+    }
+
+    fn mem_type() -> EPCMemType;
+}
+
+impl EPCAllocator for ReservedMem {
+    fn alloc(size: usize) -> Result<usize> {
+        let ptr = unsafe { sgx_alloc_rsrv_mem(size) };
+        if ptr.is_null() {
+            return_errno!(ENOMEM, "run out of reserved memory");
+        }
+        Ok(ptr as usize)
+    }
+
+    fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
+        let ptr = unsafe { sgx_alloc_rsrv_mem_ex(addr as *const c_void, size) };
+        if ptr.is_null() {
+            return_errno!(ENOMEM, "can't allocate reserved memory at desired address");
+        }
+        Ok(ptr as usize)
+    }
+
+    fn free(addr: usize, size: usize) -> Result<()> {
+        let ret = unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) };
+        assert!(ret == 0);
+        Ok(())
+    }
+
+    fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
+        let mut ret_val = 0;
+        let ret = if rsgx_is_supported_EDMM() {
+            unsafe {
+                sgx_tprotect_rsrv_mem(addr as *const c_void, length, protection.bits() as i32)
+            }
+        } else {
+            // For platforms without EDMM, sgx_tprotect_rsrv_mem is actually useless.
+            // However, at least we can set pages to desired protections in the host kernel page table.
+            unsafe {
+                occlum_ocall_mprotect(
+                    &mut ret_val as *mut i32,
+                    addr as *const c_void,
+                    length,
+                    protection.bits() as i32,
+                )
+            }
+        };
+
+        if ret != sgx_status_t::SGX_SUCCESS || ret_val != 0 {
+            return_errno!(ENOMEM, "reserved memory modify protection failure");
+        }
+
+        Ok(())
+    }
+
+    fn mem_type() -> EPCMemType {
+        EPCMemType::Reserved
+    }
+}
+
+impl EPCAllocator for UserRegionMem {
+    fn alloc(size: usize) -> Result<usize> {
+        let alloc_options = AllocOptions::new()
+            .set_flags(AllocFlags::COMMIT_ON_DEMAND)
+            .set_handler(enclave_page_fault_handler_dummy, 0);
+        let ptr = unsafe { EmmAlloc.alloc(AllocAddr::Any, size, alloc_options) }
+            .map_err(|e| errno!(Errno::from(e as u32)))?;
+
+        Ok(ptr.addr().get())
+    }
+
+    fn free(addr: usize, size: usize) -> Result<()> {
+        let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
+        unsafe { EmmAlloc.dealloc(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
+        Ok(())
+    }
+
+    fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
+        trace!(
+            "user region modify protection, protection = {:?}, range = {:?}",
+            protection,
+            VMRange::new_with_size(addr, length).unwrap()
+        );
+        let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
+        unsafe {
+            EmmAlloc.modify_permissions(ptr, length, Perm::from_bits(protection.bits()).unwrap())
+        }
+        .map_err(|e| errno!(Errno::from(e as u32)))?;
+
+        Ok(())
+    }
+
+    fn mem_type() -> EPCMemType {
+        EPCMemType::UserRegion
+    }
+}
+
+impl UserRegionMem {
+    fn commit_memory(start_addr: usize, size: usize) -> Result<()> {
+        let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
+        unsafe { EmmAlloc.commit(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
+        Ok(())
+    }
+
+    fn commit_memory_with_new_permission(
+        start_addr: usize,
+        size: usize,
+        new_perms: VMPerms,
+    ) -> Result<()> {
+        let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
+        let perm = Perm::from_bits(new_perms.bits()).unwrap();
+        if size == PAGE_SIZE {
+            unsafe { EmmAlloc::commit_with_data(ptr, ZERO_PAGE.as_slice(), perm) }
+                .map_err(|e| errno!(Errno::from(e as u32)))?;
+        } else {
+            let data = ZeroPage::new_page_aligned_vec(size);
+            unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
+                .map_err(|e| errno!(Errno::from(e as u32)))?;
+        }
+        Ok(())
+    }
+
+    fn commit_memory_and_init_with_file(
+        start_addr: usize,
+        size: usize,
+        file: &FileRef,
+        file_offset: usize,
+        new_perms: VMPerms,
+    ) -> Result<()> {
+        let mut data = ZeroPage::new_page_aligned_vec(size);
+        let len = file
+            .read_at(file_offset, data.as_mut_slice())
+            .map_err(|_| errno!(EACCES, "failed to init memory from file"))?;
+
+        let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
+        let perm = Perm::from_bits(new_perms.bits()).unwrap();
+
+        unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
+            .map_err(|e| errno!(Errno::from(e as u32)))?;
+        Ok(())
+    }
+}
+
+impl SGXPlatform {
+    pub fn new() -> Self {
+        if rsgx_is_supported_EDMM() {
+            SGXPlatform::WithEDMM
+        } else {
+            SGXPlatform::NoEDMM // including SGX simulation mode
+        }
+    }
+
+    pub fn alloc_user_space(
+        &self,
+        init_size: usize,
+        max_size: usize,
+    ) -> Result<(VMRange, Option<VMRange>)> {
+        debug!(
+            "alloc user space init size = {:?}, max size = {:?}",
+            init_size, max_size
+        );
+        if matches!(self, SGXPlatform::WithEDMM) && max_size > init_size {
+            let user_region_size = max_size - init_size;
+
+            let reserved_mem_start_addr = ReservedMem::alloc(init_size)?;
+
+            let user_region_start_addr = UserRegionMem::alloc(user_region_size)?;
+
+            let total_user_space_range = VMRange::new(
+                reserved_mem_start_addr,
+                user_region_start_addr + user_region_size,
+            )?;
+            let gap_range =
+                VMRange::new(reserved_mem_start_addr + init_size, user_region_start_addr)?;
+
+            info!(
+                "allocated user space range is {:?}, gap range is {:?}. reserved_mem range is {:?}, user region range is {:?}",
+                total_user_space_range, gap_range, VMRange::new_with_size(reserved_mem_start_addr, init_size),
+                VMRange::new_with_size(user_region_start_addr, user_region_size)
+            );
+
+            Ok((total_user_space_range, Some(gap_range)))
+        } else {
+            // For platforms without EDMM support, or when max_size equals init_size, use reserved memory for the whole user space
+            let reserved_mem_start_addr = ReservedMem::alloc(max_size)?;
+            let total_user_space_range =
+                VMRange::new(reserved_mem_start_addr, reserved_mem_start_addr + max_size)?;
+
+            info!(
+                "allocated user space range is {:?}, gap range is None",
+                total_user_space_range
+            );
+
+            Ok((total_user_space_range, None))
+        }
+    }
+
+    pub fn free_user_space(&self, user_space_range: &VMRange, gap_range: Option<&VMRange>) {
+        let user_space_ranges = if let Some(gap_range) = gap_range {
+            user_space_range.subtract(gap_range)
+        } else {
+            vec![*user_space_range]
+        };
+
+        if user_space_ranges.len() == 2 {
+            debug_assert!(matches!(self, SGXPlatform::WithEDMM));
+            let reserved_mem = user_space_ranges[0];
+            let user_region_mem = user_space_ranges[1];
+            ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap();
+            UserRegionMem::free(user_region_mem.start(), user_region_mem.size()).unwrap();
+        } else {
+            // For platforms with EDMM where max_size equals init_size, or platforms without EDMM, there is no gap range.
+ debug_assert!(user_space_ranges.len() == 1); + let reserved_mem = user_space_ranges[0]; + ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap(); + } + } +} + +impl Debug for EPCMemType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let output_str = match self { + EPCMemType::Reserved => "reserved memory region", + EPCMemType::UserRegion => "user region memory", + }; + write!(f, "{}", output_str) + } +} + +impl EPCMemType { + pub fn new(range: &VMRange) -> Self { + trace!("EPC new range = {:?}", range); + if rsgx_is_supported_EDMM() { + if let Some(gap_range) = USER_SPACE_VM_MANAGER.gap_range() { + debug_assert!({ + if range.size() > 0 { + !gap_range.overlap_with(range) + } else { + // Ignore for sentry VMA + true + } + }); + if range.end() <= gap_range.start() { + EPCMemType::Reserved + } else { + debug_assert!(gap_range.end() <= range.start()); + EPCMemType::UserRegion + } + } else { + // There is no gap, this indicates that there is no user region memory + EPCMemType::Reserved + } + } else { + // Only reserved memory + EPCMemType::Reserved + } + } + + pub fn modify_protection(&self, addr: usize, length: usize, protection: VMPerms) -> Result<()> { + // PT_GROWSDOWN should only be applied to stack segment or a segment mapped with the MAP_GROWSDOWN flag set. + // Since the memory are managed by our own, mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown. + let mut prot = protection.clone(); + prot.remove(VMPerms::GROWSDOWN); + + match self { + EPCMemType::Reserved => ReservedMem::modify_protection(addr, length, prot), + EPCMemType::UserRegion => UserRegionMem::modify_protection(addr, length, prot), + } + } +} + +pub fn commit_memory(start_addr: usize, size: usize, new_perms: Option) -> Result<()> { + trace!( + "commit epc: {:?}, new permission: {:?}", + VMRange::new_with_size(start_addr, size).unwrap(), + new_perms + ); + + // We should make memory commit and permission change atomic to prevent data races. Thus, if the new perms + // are not the default permission (RW), we implement a different function by calling EACCEPTCOPY + match new_perms { + Some(perms) if perms != VMPerms::DEFAULT => { + UserRegionMem::commit_memory_with_new_permission(start_addr, size, perms) + } + _ => UserRegionMem::commit_memory(start_addr, size), + } +} + +pub fn commit_memory_and_init_with_file( + start_addr: usize, + size: usize, + file: &FileRef, + file_offset: usize, + new_perms: VMPerms, +) -> Result<()> { + UserRegionMem::commit_memory_and_init_with_file(start_addr, size, file, file_offset, new_perms) +} + +// This is a dummy function for sgx_mm_alloc. The real handler is "enclave_page_fault_handler" shown below. +extern "C" fn enclave_page_fault_handler_dummy( + pfinfo: &sgx_pfinfo, + private: usize, +) -> HandleResult { + // Don't do anything here. Modification of registers can cause the PF handling error. 
+ return HandleResult::Search; +} + +pub fn enclave_page_fault_handler( + rip: usize, + exception_info: sgx_misc_exinfo_t, + kernel_triggers: bool, +) -> Result<()> { + let pf_addr = exception_info.faulting_address as usize; + let pf_errcd = exception_info.error_code; + trace!( + "enclave page fault caught, pf_addr = 0x{:x}, error code = {:?}", + pf_addr, + pf_errcd + ); + + USER_SPACE_VM_MANAGER.handle_page_fault(rip, pf_addr, pf_errcd, kernel_triggers)?; + + Ok(()) +} + +extern "C" { + fn occlum_ocall_mprotect( + retval: *mut i32, + addr: *const c_void, + len: usize, + prot: i32, + ) -> sgx_status_t; +} diff --git a/src/libos/src/vm/vm_manager.rs b/src/libos/src/vm/vm_manager.rs index b45dd1e8..e1a644e0 100644 --- a/src/libos/src/vm/vm_manager.rs +++ b/src/libos/src/vm/vm_manager.rs @@ -22,14 +22,16 @@ use std::ops::Bound::{Excluded, Included}; #[derive(Debug)] pub struct VMManager { range: VMRange, + gap_range: Option, internal: SgxMutex, } impl VMManager { - pub fn init(vm_range: VMRange) -> Result { - let internal = InternalVMManager::init(vm_range.clone()); + pub fn init(vm_range: VMRange, gap_range: Option) -> Result { + let mut internal = InternalVMManager::init(vm_range.clone(), &gap_range); Ok(VMManager { range: vm_range, + gap_range: gap_range, internal: SgxMutex::new(internal), }) } @@ -38,6 +40,10 @@ impl VMManager { &self.range } + pub fn gap_range(&self) -> &Option { + &self.gap_range + } + pub fn internal(&self) -> SgxMutexGuard { self.internal.lock().unwrap() } @@ -56,8 +62,15 @@ impl VMManager { } pub fn verified_clean_when_exit(&self) -> bool { + let gap_size = if let Some(gap) = self.gap_range() { + gap.size() + } else { + 0 + }; + let internal = self.internal(); - internal.chunks.len() == 0 && internal.free_manager.free_size() == self.range.size() + internal.chunks.len() == 0 + && internal.free_manager.free_size() + gap_size == self.range.size() } pub fn free_chunk(&self, chunk: &ChunkRef) { @@ -358,22 +371,19 @@ impl VMManager { intersect_chunks.iter().for_each(|chunk| { if let ChunkType::SingleVMA(vma) = chunk.internal() { - if let Some(intersection_range) = chunk.range().intersect(&reset_range) { - let mut internal_manager = self.internal(); - internal_manager.mprotect_single_vma_chunk( - &chunk, - intersection_range, - VMPerms::DEFAULT, - ); - - unsafe { - let buf = intersection_range.as_slice_mut(); - buf.iter_mut().for_each(|b| *b = 0) - } + let mut vma = vma.lock().unwrap(); + if let Some(intersection_vma) = vma.intersect(&reset_range) { + intersection_vma.flush_and_clean_memory().unwrap(); } + // clear permission for SingleVMA chunk + if vma.perms() != VMPerms::DEFAULT { + vma.set_perms(VMPerms::default()); + } + } else { + // Currently only used for heap de-allocation. Thus must be SingleVMA chunk. + unreachable!() } }); - Ok(()) } @@ -394,11 +404,11 @@ impl VMManager { match chunk.internal() { ChunkType::MultiVMA(manager) => { trace!("msync default chunk: {:?}", chunk.range()); - return manager + manager .lock() .unwrap() .chunk_manager_mut() - .msync_by_range(&sync_range); + .msync_by_range(&sync_range)?; } ChunkType::SingleVMA(vma) => { // Note: There are rare cases that mutliple threads do mprotect or munmap for the same single-vma chunk @@ -406,7 +416,7 @@ impl VMManager { // It is fine here because this function doesn't modify the global chunk list and only operates on the vma // which is updated realtimely. 
let vma = vma.lock().unwrap(); - vma.flush_backed_file(); + vma.flush_committed_backed_file(); } } Ok(()) @@ -429,7 +439,7 @@ impl VMManager { ChunkType::SingleVMA(vma) => { vma.lock() .unwrap() - .flush_backed_file_with_cond(is_same_file); + .flush_committed_backed_file_with_cond(is_same_file); } }); } @@ -539,6 +549,41 @@ impl VMManager { assert!(mem_chunks.len() == 0); } + + pub fn handle_page_fault( + &self, + rip: usize, + pf_addr: usize, + errcd: u32, + kernel_triggers: bool, + ) -> Result<()> { + let current = current!(); + let page_fault_chunk = { + let current_process_mem_chunks = current.vm().mem_chunks().read().unwrap(); + if let Some(page_fault_chunk) = current_process_mem_chunks + .iter() + .find(|chunk| chunk.range().contains(pf_addr)) + { + Some(page_fault_chunk.clone()) + } else { + None + } + }; + + if let Some(page_fault_chunk) = page_fault_chunk { + return page_fault_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers); + } + + // System V SHM segments are not tracked by the process VM. Try find the chunk here. + if let Some(page_fault_shm_chunk) = + SYSTEM_V_SHM_MANAGER.get_shm_chunk_containing_addr(pf_addr, current.process().pid()) + { + return page_fault_shm_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers); + } + + // This can happen for example, when the user intends to trigger the SIGSEGV handler by visit nullptr. + return_errno!(ENOMEM, "can't find the chunk containing the address"); + } } // Modification on this structure must acquire the global lock. @@ -552,11 +597,21 @@ pub struct InternalVMManager { } impl InternalVMManager { - pub fn init(vm_range: VMRange) -> Self { + pub fn init(vm_range: VMRange, gap_range: &Option) -> Self { let chunks = BTreeSet::new(); let fast_default_chunks = Vec::new(); - let free_manager = VMFreeSpaceManager::new(vm_range); + let mut free_manager = VMFreeSpaceManager::new(vm_range); let shm_manager = ShmManager::new(); + if let Some(gap_range) = gap_range { + debug_assert!(vm_range.is_superset_of(&gap_range)); + free_manager + .find_free_range_internal( + gap_range.size(), + PAGE_SIZE, + VMMapAddr::Force(gap_range.start()), + ) + .unwrap(); + } Self { chunks, fast_default_chunks, @@ -657,19 +712,7 @@ impl InternalVMManager { _ => unreachable!(), }; - // File-backed VMA needs to be flushed upon munmap - intersection_vma.flush_backed_file(); - - // Reset memory permissions - if !&intersection_vma.perms().is_default() { - VMPerms::apply_perms(&intersection_vma, VMPerms::default()); - } - - // Reset to zero - unsafe { - let buf = intersection_vma.as_slice_mut(); - buf.iter_mut().for_each(|b| *b = 0) - } + intersection_vma.flush_and_clean_memory()?; let mut new_vmas = vma.subtract(&intersection_vma); let current = current!(); @@ -724,10 +767,10 @@ impl InternalVMManager { self.shm_manager .create_shared_chunk(options, new_chunk.clone()) .map_err(|e| { - let vma = new_chunk.get_vma_for_single_vma_chunk(); + let mut vma = new_chunk.get_vma_for_single_vma_chunk(); // Reset memory permissions if !vma.perms().is_default() { - VMPerms::apply_perms(&vma, VMPerms::default()); + vma.modify_permissions_for_committed_pages(VMPerms::default()) } // Reset memory contents unsafe { @@ -778,19 +821,11 @@ impl InternalVMManager { .munmap_shared_chunk(chunk, munmap_range, flag)? 
== MunmapSharedResult::Freeable { - let vma = chunk.get_vma_for_single_vma_chunk(); - // Flush memory contents to backed file - vma.flush_backed_file(); - // Reset memory permissions - if !vma.perms().is_default() { - VMPerms::apply_perms(&vma, VMPerms::default()); + // Flush memory contents to backed file and reset memory contents + { + let vma = chunk.get_vma_for_single_vma_chunk(); + vma.flush_and_clean_memory()?; } - // Reset memory contents - unsafe { - let buf = vma.as_slice_mut(); - buf.iter_mut().for_each(|b| *b = 0) - } - drop(vma); self.free_chunk(chunk); let current = current!(); @@ -855,7 +890,6 @@ impl InternalVMManager { } ChunkType::SingleVMA(vma) => vma, }; - let mut updated_vmas = { let mut containing_vma = vma.lock().unwrap(); trace!( @@ -865,7 +899,8 @@ impl InternalVMManager { ); debug_assert!(chunk.range() == containing_vma.range()); - if containing_vma.perms() == new_perms { + let old_perms = containing_vma.perms(); + if old_perms == new_perms { return Ok(()); } @@ -876,7 +911,7 @@ impl InternalVMManager { (true, true) => { // Exact the same vma containing_vma.set_perms(new_perms); - VMPerms::apply_perms(&containing_vma, containing_vma.perms()); + containing_vma.modify_permissions_for_committed_pages(new_perms); return Ok(()); } (false, false) => { @@ -886,15 +921,13 @@ impl InternalVMManager { // remaining old VMA: [protect_range.end, containing_vma.end) let old_end = containing_vma.end(); - let old_perms = containing_vma.perms(); - - let new_vma = VMArea::inherits_file_from( + let mut new_vma = VMArea::inherits_file_from( &containing_vma, protect_range, new_perms, VMAccess::Private(current_pid), ); - VMPerms::apply_perms(&new_vma, new_vma.perms()); + new_vma.modify_permissions_for_committed_pages(new_perms); let remaining_old_vma = { let range = VMRange::new(protect_range.end(), old_end).unwrap(); @@ -905,7 +938,6 @@ impl InternalVMManager { VMAccess::Private(current_pid), ) }; - containing_vma.set_end(protect_range.start()); // Put containing_vma at last to be updated first. 
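// [Editor's aside: illustrative sketch, not part of this patch] In the (false, false) branch above,
// an mprotect range that falls strictly inside a containing VMA splits it three ways. The range
// arithmetic, stripped of the VMA bookkeeping, is just:
fn split_for_mprotect(
    vma: (usize, usize),     // (start, end) of the containing VMA
    protect: (usize, usize), // (start, end) of the mprotect range, strictly inside `vma`
) -> [(usize, usize); 3] {
    let (vma_start, vma_end) = vma;
    let (p_start, p_end) = protect;
    debug_assert!(vma_start < p_start && p_end < vma_end);
    [
        (vma_start, p_start), // shrunk containing VMA, keeps the old permissions
        (p_start, p_end),     // new VMA that receives the new permissions
        (p_end, vma_end),     // remaining VMA, keeps the old permissions
    ]
}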
@@ -913,19 +945,19 @@ impl InternalVMManager { updated_vmas } _ => { - let new_vma = VMArea::inherits_file_from( + let mut new_vma = VMArea::inherits_file_from( &containing_vma, protect_range, new_perms, VMAccess::Private(current_pid), ); - VMPerms::apply_perms(&new_vma, new_vma.perms()); + new_vma.modify_permissions_for_committed_pages(new_perms); if same_start { - // Protect range is at left side of the cotaining vma + // Protect range is at left side of the containing vma containing_vma.set_start(protect_range.end()); } else { - // Protect range is at right side of the cotaining vma + // Protect range is at right side of the containing vma containing_vma.set_end(protect_range.start()); } @@ -935,19 +967,16 @@ impl InternalVMManager { } } }; - let current = current!(); // First update current vma chunk if updated_vmas.len() > 1 { let update_vma = updated_vmas.pop().unwrap(); self.update_single_vma_chunk(¤t, &chunk, update_vma); } - // Then add new chunks if any updated_vmas.into_iter().for_each(|vma| { self.add_new_chunk(¤t, vma); }); - Ok(()) } @@ -964,9 +993,6 @@ impl InternalVMManager { // Remove from chunks self.chunks.remove(chunk); - // Mprotect the whole chunk to reduce the usage of vma count of host - VMPerms::apply_perms(range, VMPerms::DEFAULT); - // Add range back to freespace manager self.free_manager.add_range_back_to_free_manager(range); Ok(()) @@ -1131,6 +1157,7 @@ impl InternalVMManager { let perms = options.perms().clone(); let align = options.align().clone(); let initializer = options.initializer(); + let page_policy = options.page_policy(); target_contained_ranges .iter() .map(|range| { @@ -1146,6 +1173,7 @@ impl InternalVMManager { .initializer(initializer.clone()) .addr(addr) .size(size) + .page_policy(*page_policy) .build() .unwrap() }) diff --git a/src/libos/src/vm/vm_perms.rs b/src/libos/src/vm/vm_perms.rs index 41da2d54..86ee0e31 100644 --- a/src/libos/src/vm/vm_perms.rs +++ b/src/libos/src/vm/vm_perms.rs @@ -39,37 +39,6 @@ impl VMPerms { self.bits == Self::DEFAULT.bits } - pub fn apply_perms(protect_range: &VMRange, perms: VMPerms) { - use sgx_trts::enclave::rsgx_is_supported_EDMM; - - unsafe { - let mut retval = 0; - let addr = protect_range.start() as *const c_void; - let len = protect_range.size(); - // PT_GROWSDOWN should only be applied to stack segment or a segment mapped with the MAP_GROWSDOWN flag set. - // Since the memory are managed by our own, mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown. - let mut prot = perms.clone(); - prot.remove(VMPerms::GROWSDOWN); - - if rsgx_is_supported_EDMM() { - // With EDMM support, reserved memory permission should be updated. - let sgx_status = sgx_tprotect_rsrv_mem(addr, len, prot.bits() as i32); - if sgx_status != sgx_status_t::SGX_SUCCESS { - panic!("sgx_tprotect_rsrv_mem status {}", sgx_status); - } - } else { - // Without EDMM support, reserved memory permission is statically RWX and we only need to do mprotect ocall. 
- let sgx_status = occlum_ocall_mprotect(&mut retval, addr, len, prot.bits() as i32); - if sgx_status != sgx_status_t::SGX_SUCCESS || retval != 0 { - panic!( - "occlum_ocall_mprotect status {}, retval {}", - sgx_status, retval - ); - } - } - } - } - pub fn display(&self) -> String { let mut str = String::new(); if self.can_read() { @@ -96,23 +65,3 @@ impl Default for VMPerms { VMPerms::DEFAULT } } - -extern "C" { - // Modify the access permissions of the pages in the reserved memory area - // - // Parameters: - // Inputs: addr[in]: Starting address of region which needs to change access - // permission. Page aligned. - // length[in]: The length of the memory to be manipulated in bytes. Page aligned. - // prot[in]: The target memory protection. - // Return: sgx_status_t - // - fn sgx_tprotect_rsrv_mem(addr: *const c_void, length: usize, prot: i32) -> sgx_status_t; - - fn occlum_ocall_mprotect( - retval: *mut i32, - addr: *const c_void, - len: usize, - prot: i32, - ) -> sgx_status_t; -} diff --git a/src/libos/src/vm/vm_util.rs b/src/libos/src/vm/vm_util.rs index 0a1e3892..6651d919 100644 --- a/src/libos/src/vm/vm_util.rs +++ b/src/libos/src/vm/vm_util.rs @@ -10,6 +10,11 @@ use intrusive_collections::RBTreeLink; use intrusive_collections::{intrusive_adapter, KeyAdapter}; use rcore_fs::vfs::Metadata; +pub const GB: usize = 1 << 30; +pub const TB: usize = 1 << 40; +pub const MB: usize = 1 << 20; +pub const KB: usize = 1 << 10; + #[derive(Clone, Debug)] pub enum VMInitializer { DoNothing(), @@ -139,7 +144,7 @@ impl FileBacked { self.write_back } - pub fn init_file(&self) -> (&FileRef, usize) { + pub fn backed_file(&self) -> (&FileRef, usize) { (&self.file, self.offset) } @@ -179,6 +184,19 @@ impl VMMapAddr { } } +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum PagePolicy { + ReserveOnly = 0x1, // Only reserve + CommitNow = 0x2, // Commit all pages when mmap. + CommitOnDemand = 0x4, // Reserve space when mmap, commit in the PF handler. This is the default policy. +} + +impl Default for PagePolicy { + fn default() -> PagePolicy { + PagePolicy::CommitOnDemand + } +} + #[derive(Builder, Debug)] #[builder(pattern = "owned", build_fn(skip), no_std)] pub struct VMMapOptions { @@ -187,6 +205,7 @@ pub struct VMMapOptions { perms: VMPerms, addr: VMMapAddr, initializer: VMInitializer, + page_policy: PagePolicy, } // VMMapOptionsBuilder is generated automatically, except the build function @@ -232,12 +251,21 @@ impl VMMapOptionsBuilder { Some(initializer) => initializer.clone(), None => VMInitializer::default(), }; + let page_policy = { + match &initializer { + VMInitializer::CopyFrom { .. } => PagePolicy::CommitNow, + VMInitializer::CopyOldAndReadNew { .. } => PagePolicy::CommitNow, + _ => self.page_policy.unwrap_or_default(), + } + }; + Ok(VMMapOptions { size, align, perms, addr, initializer, + page_policy, }) } } @@ -269,6 +297,10 @@ impl VMMapOptions { } false } + + pub fn page_policy(&self) -> &PagePolicy { + &self.page_policy + } } #[derive(Clone, Copy, PartialEq)] diff --git a/tools/toolchains/dcap_lib/Cargo.lock b/tools/toolchains/dcap_lib/Cargo.lock index 35a2c5f4..fbebbf90 100644 --- a/tools/toolchains/dcap_lib/Cargo.lock +++ b/tools/toolchains/dcap_lib/Cargo.lock @@ -25,4 +25,4 @@ dependencies = [ [[package]] name = "sgx_types" -version = "1.1.5" +version = "1.1.6"
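A closing note on the PagePolicy plumbing in vm_util.rs: the builder quietly upgrades CopyFrom and CopyOldAndReadNew initializers to CommitNow, presumably because those initializers write the whole range at mmap time and therefore cannot rely on commit-on-demand. A minimal sketch of that selection rule, using only the types introduced in this patch (the helper name is hypothetical):

fn effective_page_policy(initializer: &VMInitializer, requested: Option<PagePolicy>) -> PagePolicy {
    match initializer {
        // Initializers that copy data into the mapping immediately need all pages present now.
        VMInitializer::CopyFrom { .. } | VMInitializer::CopyOldAndReadNew { .. } => {
            PagePolicy::CommitNow
        }
        // Everything else honors the caller's request, defaulting to commit-on-demand.
        _ => requested.unwrap_or_default(),
    }
}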