Add EDMM support for Legacy Occlum

Hui, Chunyang 2023-09-18 11:36:17 +00:00 committed by volcano
parent 28c29c8896
commit d49b3af0aa
28 changed files with 2104 additions and 393 deletions

src/libos/Cargo.lock (generated, 75 changed lines)

@ -9,7 +9,7 @@ dependencies = [
"aligned",
"atomic",
"bitflags",
"bitvec",
"bitvec 1.0.1",
"ctor",
"derive_builder",
"goblin",
@ -18,6 +18,7 @@ dependencies = [
"lazy_static",
"log",
"memoffset 0.6.5",
"modular-bitfield",
"rcore-fs",
"rcore-fs-devfs",
"rcore-fs-mountfs",
@ -94,7 +95,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41262f11d771fd4a61aa3ce019fca363b4b6c282fca9da2a31186d3965a47a5c"
dependencies = [
"either",
"radium",
"radium 0.3.0",
]
[[package]]
name = "bitvec"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
dependencies = [
"funty",
"radium 0.7.0",
"tap",
"wyz",
]
[[package]]
@ -206,6 +219,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
[[package]]
name = "funty"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "goblin"
version = "0.5.4"
@ -294,6 +313,27 @@ dependencies = [
"autocfg 1.1.0",
]
[[package]]
name = "modular-bitfield"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a53d79ba8304ac1c4f9eb3b9d281f21f7be9d4626f72ce7df4ad8fbde4f38a74"
dependencies = [
"modular-bitfield-impl",
"static_assertions 1.1.0",
]
[[package]]
name = "modular-bitfield-impl"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a7d5f7076603ebc68de2dc6a650ec331a062a13abaa346975be747bbfa4b789"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "plain"
version = "0.2.3"
@ -334,6 +374,12 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "def50a86306165861203e7f84ecffbbdfdea79f0e51039b33de1e952358c47ac"
[[package]]
name = "radium"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
[[package]]
name = "rand"
version = "0.6.5"
@ -479,11 +525,11 @@ dependencies = [
name = "rcore-fs-sefs"
version = "0.1.0"
dependencies = [
"bitvec",
"bitvec 0.17.4",
"log",
"rcore-fs",
"spin 0.5.2",
"static_assertions",
"static_assertions 0.3.4",
"uuid",
]
@ -719,6 +765,12 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.9.3"
@ -736,6 +788,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tap"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "unicode-ident"
version = "1.0.3"
@ -772,3 +830,12 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "wyz"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
dependencies = [
"tap",
]

@ -10,7 +10,7 @@ crate-type = ["staticlib"]
[dependencies]
atomic = "0.5"
bitflags = "1.0"
bitvec = { version = "0.17", default-features = false, features = ["alloc"] }
bitvec = { version = "1", default-features = false, features = ["alloc"] }
log = "0.4"
aligned = "0.4.1"
lazy_static = { version = "1.1.0", features = ["spin_no_std"] } # Implies nightly
@ -33,6 +33,7 @@ regex = { git = "https://github.com/mesalock-linux/regex-sgx", default-features
goblin = { version = "0.5.4", default-features = false, features = ["elf64", "elf32", "endian_fd"] }
intrusive-collections = "0.9"
spin = "0.7"
modular-bitfield = "0.11.2"
[patch.'https://github.com/apache/teaclave-sgx-sdk.git']
sgx_tstd = { path = "../../deps/rust-sgx-sdk/sgx_tstd" }

@ -6,10 +6,14 @@ use self::syscall::{handle_syscall_exception, SYSCALL_OPCODE};
use super::*;
use crate::signal::{FaultSignal, SigSet};
use crate::syscall::exception_interrupt_syscall_c_abi;
use crate::syscall::{CpuContext, FpRegs, SyscallNum};
use aligned::{Aligned, A16};
use core::arch::x86_64::_fxsave;
use crate::syscall::{CpuContext, ExtraContext, SyscallNum};
use crate::vm::{enclave_page_fault_handler, USER_SPACE_VM_MANAGER};
use sgx_types::*;
use sgx_types::{sgx_exception_type_t, sgx_exception_vector_t};
const ENCLU: u32 = 0xd7010f;
const EACCEPT: u32 = 0x5;
const EACCEPTCOPY: u32 = 0x7;
// Modules for instruction simulation
mod cpuid;
@ -25,14 +29,63 @@ pub fn register_exception_handlers() {
}
}
fn try_handle_kernel_exception(info: &sgx_exception_info_t) -> i32 {
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF {
let pf_addr = info.exinfo.faulting_address as usize;
// The PF address must be in the user space. Otherwise, keep searching for the exception handler
if !USER_SPACE_VM_MANAGER.range().contains(pf_addr) {
SGX_MM_EXCEPTION_CONTINUE_SEARCH
} else {
let rip = info.cpu_context.rip as *const u32;
let rax = info.cpu_context.rax as u32;
// This can happen when two threads both try to EAUG the same new page. Thread 1 EAUGs because it
// touches the memory first and triggers a #PF. Thread 2 EAUGs because it uses sgx_mm_commit to commit
// the new page with EACCEPT and triggers a #PF. If Thread 1 acquires the lock and does the EAUG first,
// Thread 2 can't EAUG the page again once it acquires the lock, so it fails. The failure raises a signal,
// which is eventually handled here, and the instruction that triggered this exception is EACCEPT/EACCEPTCOPY.
// In this case, since the new page has already been EAUG-ed, we just need to execute the EACCEPT again. Thus here
// we simply return SGX_MM_EXCEPTION_CONTINUE_EXECUTION.
if ENCLU == (unsafe { *rip } as u32) & 0xffffff
&& (EACCEPT == rax || EACCEPTCOPY == rax)
{
return SGX_MM_EXCEPTION_CONTINUE_EXECUTION;
}
// If the faulting code is not user code but the #PF address is in the user space, then it is a
// kernel-triggered #PF that we can handle. This can happen e.g. when a read syscall triggers a #PF on a user buffer.
info!("kernel code triggers #PF");
let kernel_triggers = true;
enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, kernel_triggers)
.expect("handle PF failure");
SGX_MM_EXCEPTION_CONTINUE_EXECUTION
}
} else {
// Otherwise, we can't handle. Keep searching for the exception handler
error!(
"We can't handle this exception: {:?}",
info.exception_vector
);
SGX_MM_EXCEPTION_CONTINUE_SEARCH
}
}
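For clarity, the EACCEPT check above works on raw instruction bytes: ENCLU encodes as 0F 01 D7, so reading a u32 at the faulting RIP and masking the low three bytes yields 0xd7010f, while RAX selects the leaf (5 for EACCEPT, 7 for EACCEPTCOPY). A minimal restatement of that check, a sketch using the constants defined at the top of this file:

// Sketch only: mirrors the opcode test in try_handle_kernel_exception.
fn faulted_on_eaccept(rip: *const u32, rax: u32) -> bool {
    // Only the low three bytes encode the ENCLU instruction (0F 01 D7 read little-endian).
    let opcode = unsafe { *rip } & 0xffffff;
    opcode == ENCLU && (rax == EACCEPT || rax == EACCEPTCOPY)
}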
#[no_mangle]
extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 {
let mut fpregs = FpRegs::save();
let info = unsafe { &mut *info };
// Try to handle a kernel-triggered #PF
if !USER_SPACE_VM_MANAGER
.range()
.contains(info.cpu_context.rip as usize)
{
return try_handle_kernel_exception(&info);
}
// User-space-triggered exception
unsafe {
exception_interrupt_syscall_c_abi(
SyscallNum::HandleException as u32,
info as *mut _,
&mut fpregs as *mut FpRegs,
info as *mut sgx_exception_info_t as *mut _,
)
};
unreachable!();
@ -41,20 +94,22 @@ extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 {
/// Exceptions are handled as a special kind of system call.
pub fn do_handle_exception(
info: *mut sgx_exception_info_t,
fpregs: *mut FpRegs,
user_context: *mut CpuContext,
) -> Result<isize> {
let info = unsafe { &mut *info };
check_exception_type(info.exception_type)?;
info!("do handle exception: {:?}", info.exception_vector);
let user_context = unsafe { &mut *user_context };
*user_context = CpuContext::from_sgx(&info.cpu_context);
user_context.fpregs = fpregs;
let xsave_area = info.xsave_area.as_mut_ptr();
user_context.extra_context = ExtraContext::Xsave;
user_context.extra_context_ptr = xsave_area;
// Try to do instruction emulation first
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_UD {
// Assume the length of opcode is 2 bytes
let ip_opcode = unsafe { *(user_context.rip as *const u16) };
let ip_opcode: u16 = unsafe { *(user_context.rip as *const u16) };
if ip_opcode == RDTSC_OPCODE {
return handle_rdtsc_exception(user_context);
} else if ip_opcode == SYSCALL_OPCODE {
@ -64,6 +119,23 @@ pub fn do_handle_exception(
}
}
// Normally, we should only handle a #PF exception with the SGX bit set, which indicates uncommitted EPC.
// However, when committing a page whose permissions are not the default read/write (e.g. RWX), there is a short gap
// after EACCEPTCOPY and before the mprotect ocall. If the user touches the memory during this short
// gap, the SGX bit will not be set. Thus, we don't check the SGX bit here.
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF {
info!("Userspace #PF caught, try handle");
if enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, false).is_ok() {
info!("#PF handling is done successfully");
return Ok(0);
}
warn!(
"#PF not handled. Turn to signal. user context = {:?}",
user_context
);
}
// Then, it must be a "real" exception. Convert it to a signal and force its delivery.
// The generated signal is SIGBUS, SIGFPE, SIGILL, or SIGSEGV.
//
@ -108,3 +180,21 @@ fn check_exception_type(type_: sgx_exception_type_t) -> Result<()> {
}
Ok(())
}
// Based on the Page-Fault Error Code defined in the Intel manual
const PF_EXCEPTION_SGX_BIT: u32 = 0x1;
const PF_EXCEPTION_RW_BIT: u32 = 0x2;
// Return value:
// True - SGX bit is set
// False - SGX bit is not set
pub fn check_sgx_bit(exception_error_code: u32) -> bool {
exception_error_code & PF_EXCEPTION_SGX_BIT == PF_EXCEPTION_SGX_BIT
}
// Return value:
// True - write bit is set, #PF caused by write
// False - read bit is set, #PF caused by read
pub fn check_rw_bit(exception_error_code: u32) -> bool {
exception_error_code & PF_EXCEPTION_RW_BIT == PF_EXCEPTION_RW_BIT
}
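As a hedged illustration (a hypothetical helper, not part of this commit), the two bit tests above can be combined to classify a #PF the way the handlers in this file reason about it:

// Hypothetical usage of check_sgx_bit/check_rw_bit; the real policy lives in
// enclave_page_fault_handler and do_handle_exception.
fn classify_pf(error_code: u32) -> &'static str {
    match (check_sgx_bit(error_code), check_rw_bit(error_code)) {
        (true, true) => "write to an uncommitted EPC page",
        (true, false) => "read from an uncommitted EPC page",
        (false, true) => "write fault without the SGX bit (e.g. the EACCEPTCOPY-to-mprotect gap)",
        (false, false) => "read fault without the SGX bit",
    }
}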

@ -91,7 +91,7 @@ fn get_output_for_vma(vma: &VMArea, heap_or_stack: Option<&str>) -> String {
let perms = vma.perms();
let (file_path, offset, device_id, inode_num) = {
if let Some((file, offset)) = vma.init_file() {
if let Some((file, offset)) = vma.backed_file() {
let inode_file = file.as_inode_file().unwrap();
let file_path = inode_file.abs_path();
let inode_num = inode_file.inode().metadata().unwrap().inode;

@ -2,9 +2,7 @@ pub use self::sgx::sgx_interrupt_info_t;
use crate::prelude::*;
use crate::process::ThreadRef;
use crate::syscall::exception_interrupt_syscall_c_abi;
use crate::syscall::{CpuContext, FpRegs, SyscallNum};
use aligned::{Aligned, A16};
use core::arch::x86_64::_fxsave;
use crate::syscall::{CpuContext, ExtraContext, SyscallNum};
mod sgx;
@ -16,28 +14,23 @@ pub fn init() {
}
extern "C" fn handle_interrupt(info: *mut sgx_interrupt_info_t) -> i32 {
let mut fpregs = FpRegs::save();
unsafe {
exception_interrupt_syscall_c_abi(
SyscallNum::HandleInterrupt as u32,
info as *mut _,
&mut fpregs as *mut FpRegs,
)
exception_interrupt_syscall_c_abi(SyscallNum::HandleInterrupt as u32, info as *mut _)
};
unreachable!();
}
pub fn do_handle_interrupt(
info: *mut sgx_interrupt_info_t,
fpregs: *mut FpRegs,
cpu_context: *mut CpuContext,
) -> Result<isize> {
let info = unsafe { &*info };
let info = unsafe { &mut *info };
let context = unsafe { &mut *cpu_context };
// The CPU context is overridden so that it looks as if the syscall were called from where the
// interrupt happened
*context = CpuContext::from_sgx(&info.cpu_context);
context.fpregs = fpregs;
context.extra_context = ExtraContext::Xsave;
context.extra_context_ptr = info.xsave_area.as_mut_ptr();
Ok(0)
}

@ -1,10 +1,15 @@
use crate::prelude::*;
#[repr(C)]
#[repr(C, align(64))]
#[derive(Default, Clone, Copy)]
#[allow(non_camel_case_types)]
pub struct sgx_interrupt_info_t {
pub cpu_context: sgx_cpu_context_t,
pub interrupt_valid: uint32_t,
reserved: uint32_t,
pub xsave_size: uint64_t,
pub reserved1: [uint64_t; 4],
pub xsave_area: [uint8_t; 0],
}
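The two new fields turn the struct into a fixed header followed by a variable-length buffer: xsave_area is a zero-length trailing array, so the actual xsave buffer of xsave_size bytes sits directly after the header (the 64-byte alignment matches the XSAVE hardware requirement). A sketch of how handler code can view that buffer, assuming the SGX runtime reserves xsave_size bytes there:

// Illustrative only: the xsave buffer begins right after the fixed header.
unsafe fn xsave_buffer(info: &mut sgx_interrupt_info_t) -> &mut [u8] {
    core::slice::from_raw_parts_mut(info.xsave_area.as_mut_ptr(), info.xsave_size as usize)
}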
#[allow(non_camel_case_types)]

@ -21,8 +21,11 @@
#![feature(test)]
#![feature(atomic_from_mut)]
#![feature(btree_drain_filter)]
#![feature(bench_black_box)]
#![feature(arbitrary_enum_discriminant)]
// for core::ptr::non_null::NonNull addr() method
#![feature(strict_provenance)]
// for VMArea::can_merge_vmas
#![feature(is_some_and)]
#[macro_use]
extern crate alloc;
@ -59,6 +62,7 @@ extern crate memoffset;
extern crate ctor;
extern crate intrusive_collections;
extern crate itertools;
extern crate modular_bitfield;
extern crate resolv_conf;
use sgx_trts::libc;

@ -1,6 +1,6 @@
use crate::process::do_vfork::reap_zombie_child_created_with_vfork;
use crate::signal::constants::*;
use std::intrinsics::atomic_store;
use std::intrinsics::atomic_store_seqcst;
use super::do_futex::futex_wake;
use super::do_vfork::{is_vforked_child_process, vfork_return_to_parent};
@ -61,7 +61,7 @@ fn exit_thread(term_status: TermStatus) {
// Notify a thread, if any, that waits on ctid. See set_tid_address(2) for more info.
if let Some(ctid_ptr) = thread.clear_ctid() {
unsafe {
atomic_store(ctid_ptr.as_ptr(), 0);
atomic_store_seqcst(ctid_ptr.as_ptr(), 0);
}
futex_wake(ctid_ptr.as_ptr() as *const i32, 1);
}

@ -1,6 +1,6 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::intrinsics::atomic_load;
use std::intrinsics::atomic_load_seqcst;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::prelude::*;
@ -258,7 +258,7 @@ impl FutexKey {
}
pub fn load_val(&self) -> i32 {
unsafe { atomic_load(self.0 as *const i32) }
unsafe { atomic_load_seqcst(self.0 as *const i32) }
}
pub fn addr(&self) -> usize {

@ -8,6 +8,7 @@
//! * If `cpu_set[i] == true`, then the i-th CPU core belongs to the set;
//! * Otherwise, the i-th CPU core is not in the set.
use bitvec::order::LocalBits as Local;
use bitvec::prelude::*;
use std::ops::Index;
@ -15,7 +16,7 @@ use crate::prelude::*;
#[derive(Debug, Clone, PartialEq)]
pub struct CpuSet {
bits: BitBox<Local, u8>,
bits: BitBox<u8, Local>,
}
impl CpuSet {
@ -33,14 +34,14 @@ impl CpuSet {
/// Create a CpuSet that consists of all of the CPU cores.
pub fn new_full() -> Self {
let mut bits = bitbox![Local, u8; 1; Self::len() * 8];
let mut bits = bitbox![u8, Local; 1; Self::len() * 8];
Self::clear_unused(&mut bits);
Self { bits }
}
/// Create a CpuSet that consists of none of the CPU cores.
pub fn new_empty() -> Self {
let bits = bitbox![Local, u8; 0; Self::len() * 8];
let bits = bitbox![u8, Local; 0; Self::len() * 8];
Self { bits }
}
@ -61,7 +62,7 @@ impl CpuSet {
/// Returns the first index of CPUs in set.
pub fn first_cpu_idx(&self) -> Option<usize> {
self.iter().position(|&b| b == true)
self.iter().position(|b| b == true)
}
// Returns if the CpuSet is a subset of available cpu set
@ -75,7 +76,7 @@ impl CpuSet {
return_errno!(EINVAL, "slice is not long enough");
}
let slice = &slice[..Self::len()];
let mut bits = BitBox::from_slice(slice);
let mut bits = BitBox::from_bitslice(&BitSlice::from_slice(slice));
Self::clear_unused(&mut bits);
Ok(Self { bits })
@ -85,11 +86,11 @@ impl CpuSet {
///
/// The last, unused bits in the byte slice are guaranteed to be zero.
pub fn as_slice(&self) -> &[u8] {
self.bits.as_slice()
self.bits.as_raw_slice()
}
pub fn as_mut_slice(&mut self) -> &mut [u8] {
self.bits.as_mut_slice()
self.bits.as_raw_mut_slice()
}
/// Returns an iterator that allows accessing the underlying bits.
@ -102,7 +103,7 @@ impl CpuSet {
self.bits.iter_mut()
}
fn clear_unused(bits: &mut BitSlice<Local, u8>) {
fn clear_unused(bits: &mut BitSlice<u8, Local>) {
let unused_bits = &mut bits[Self::ncores()..(Self::len() * 8)];
for mut bit in unused_bits {
*bit = false;
@ -110,8 +111,8 @@ impl CpuSet {
}
}
pub type Iter<'a> = bitvec::slice::Iter<'a, Local, u8>;
pub type IterMut<'a> = bitvec::slice::IterMut<'a, Local, u8>;
pub type Iter<'a> = bitvec::slice::Iter<'a, u8, Local>;
pub type IterMut<'a> = bitvec::slice::IterMut<'a, u8, Local>;
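These changes track the bitvec 0.17 -> 1.0 migration: the generic parameters swap places (storage type first, bit ordering second), the macros take their arguments in the same new order, and the raw-byte accessors gain a `raw` infix. A minimal sketch of the new shapes, assuming bitvec 1.0 and the same `Local` alias used by this module:

use bitvec::order::LocalBits as Local;
use bitvec::prelude::*;

fn demo() {
    // bitvec 0.17: bitbox![Local, u8; 0; 16] and BitBox<Local, u8>
    // bitvec 1.0: the storage type comes first, the ordering second.
    let mut bits: BitBox<u8, Local> = bitbox![u8, Local; 0; 16];
    bits.set(3, true);
    // as_slice()/as_mut_slice() became as_raw_slice()/as_raw_mut_slice().
    let _raw: &[u8] = bits.as_raw_slice();
}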
impl Index<usize> for CpuSet {
type Output = bool;

@ -199,7 +199,7 @@ impl siginfo_t {
}
}
#[derive(Clone, Copy)]
#[derive(Clone)]
#[repr(C)]
pub struct ucontext_t {
pub uc_flags: u64,
@ -225,7 +225,8 @@ pub type stack_t = sigaltstack_t;
pub struct mcontext_t {
pub inner: CpuContext,
// TODO: the fields should be csgsfs, err, trapno, oldmask, and cr2
_unused0: [u64; 5],
// The number should be 5, but 2 of the slots are taken by the extra context fields in CpuContext. Thus make it 3.
_unused0: [u64; 3],
// TODO: this field should be `fpregs: fpregset_t,`
_unused1: usize,
_reserved: [u64; 8],

@ -5,9 +5,8 @@ use super::{SigAction, SigActionFlags, SigDefaultAction, SigSet, Signal};
use crate::lazy_static::__Deref;
use crate::prelude::*;
use crate::process::{ProcessRef, TermStatus, ThreadRef};
use crate::syscall::{CpuContext, FpRegs};
use crate::syscall::{CpuContext, ExtraContext, FpRegs, XsaveArea};
use aligned::{Aligned, A16};
use core::arch::x86_64::{_fxrstor, _fxsave};
use std::{ptr, slice};
pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> {
@ -34,11 +33,27 @@ pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> {
*curr_user_ctxt = last_ucontext.uc_mcontext.inner;
// Restore the floating point registers to a temp area
// The floating point registers would be recoved just
// before return to user's code
let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) });
curr_user_ctxt.fpregs = Box::into_raw(fpregs);
curr_user_ctxt.fpregs_on_heap = 1; // indicates the fpregs is on heap
// The floating point registers will be recovered just before returning to user code
match curr_user_ctxt.extra_context {
ExtraContext::Fpregs => {
// Signal raised by a direct syscall.
// The fpregs must be stored on the heap, because the ucontext_t will be freed when this function returns and curr_user_ctxt only stores the pointer.
let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) });
curr_user_ctxt.extra_context_ptr = Box::into_raw(fpregs) as *mut u8;
}
ExtraContext::Xsave => {
// Signal raised by an exception.
// The xsave_area is stored in a special area reserved on the kernel stack. We can just overwrite this area with the latest user context.
// Note: currently we only restore the fpregs, not the whole xsave area, on sigreturn, because the
// handling path does not touch the other advanced registers. If we ever need to touch those registers,
// the whole xsave area should be restored on sigreturn.
let latest_fpregs = unsafe { FpRegs::from_slice(&last_ucontext.fpregs) };
let xsave_area =
unsafe { (&mut *(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) };
xsave_area.set_fpregs_area(latest_fpregs);
}
}
Ok(())
}
@ -261,16 +276,24 @@ fn handle_signals_by_user(
// Save the old sigmask
ucontext.uc_sigmask = old_sigmask.to_c();
// Save the user context
ucontext.uc_mcontext.inner = *curr_user_ctxt;
ucontext.uc_mcontext.inner = curr_user_ctxt.clone();
// Save the floating point registers
if curr_user_ctxt.fpregs != ptr::null_mut() {
ucontext
.fpregs
.copy_from_slice(unsafe { curr_user_ctxt.fpregs.as_ref().unwrap().as_slice() });
// Clear the floating point registers, since we do not need to recover is when this syscall return
curr_user_ctxt.fpregs = ptr::null_mut();
if curr_user_ctxt.extra_context_ptr != ptr::null_mut() {
// Signal from exception handling
debug_assert!(matches!(curr_user_ctxt.extra_context, ExtraContext::Xsave));
let fpregs_area =
unsafe { (&*(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) }.get_fpregs();
ucontext.fpregs.copy_from_slice(fpregs_area.as_slice());
// Clear the extra context pointer, since we do not need to restore this state when this syscall returns
curr_user_ctxt.extra_context_ptr = ptr::null_mut();
} else {
// Raise the signal with direct syscall
debug_assert!(
matches!(curr_user_ctxt.extra_context, ExtraContext::Fpregs)
&& curr_user_ctxt.extra_context_ptr == ptr::null_mut()
);
// We need a correct fxsave structure in the buffer,
// because the app may modify part of it to update the
// floating point after the signal handler finished.

@ -36,12 +36,12 @@ impl FaultSignal {
// Page fault exception
SGX_EXCEPTION_VECTOR_PF => {
const PF_ERR_FLAG_PRESENT : u32 = 1u32 << 0;
let code = if info.exinfo.errcd & PF_ERR_FLAG_PRESENT != 0 {
let code = if info.exinfo.error_code & PF_ERR_FLAG_PRESENT != 0 {
SEGV_ACCERR
} else {
SEGV_MAPERR
};
let addr = Some(info.exinfo.maddr);
let addr = Some(info.exinfo.faulting_address );
(SIGSEGV, code, addr)
},
// General protection exception

@ -7,7 +7,7 @@
//! 3. Preprocess the system call and then call `dispatch_syscall` (in this file)
//! 4. Call `do_*` to process the system call (in other modules)
use aligned::{Aligned, A16};
use aligned::{Aligned, A16, A64};
use core::arch::x86_64::{_fxrstor, _fxsave};
use std::any::Any;
use std::convert::TryFrom;
@ -60,7 +60,7 @@ use crate::signal::{
do_rt_sigtimedwait, do_sigaltstack, do_tgkill, do_tkill, sigaction_t, siginfo_t, sigset_t,
stack_t,
};
use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, VMPerms};
use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, VMPerms};
use crate::{fs, process, std, vm};
use super::*;
@ -122,7 +122,7 @@ macro_rules! process_syscall_table_with_callback {
(Mremap = 25) => do_mremap(old_addr: usize, old_size: usize, new_size: usize, flags: i32, new_addr: usize),
(Msync = 26) => do_msync(addr: usize, size: usize, flags: u32),
(Mincore = 27) => handle_unsupported(),
(Madvise = 28) => handle_unsupported(),
(Madvise = 28) => do_madvice(addr: usize, length: usize, advice: i32),
(Shmget = 29) => do_shmget(key: key_t, size: size_t, shmflg: i32),
(Shmat = 30) => do_shmat(shmid: i32, shmaddr: usize, shmflg: i32),
(Shmctl = 31) => do_shmctl(shmid: i32, cmd: i32, buf: *mut shmids_t),
@ -424,8 +424,8 @@ macro_rules! process_syscall_table_with_callback {
// Occlum-specific system calls
(SpawnGlibc = 359) => do_spawn_for_glibc(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fa: *const SpawnFileActions, attribute_list: *const posix_spawnattr_t),
(SpawnMusl = 360) => do_spawn_for_musl(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fdop_list: *const FdOp, attribute_list: *const posix_spawnattr_t),
(HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, fpregs: *mut FpRegs, context: *mut CpuContext),
(HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, fpregs: *mut FpRegs, context: *mut CpuContext),
(HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, context: *mut CpuContext),
(HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, context: *mut CpuContext),
(MountRootFS = 363) => do_mount_rootfs(key_ptr: *const sgx_key_128bit_t, rootfs_config_ptr: *const user_rootfs_config),
}
};
@ -649,12 +649,10 @@ fn do_syscall(user_context: &mut CpuContext) {
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall_num == SyscallNum::HandleException {
// syscall.args[0] == info
// syscall.args[1] == fpregs
syscall.args[2] = user_context as *mut _ as isize;
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall.num == SyscallNum::HandleInterrupt {
// syscall.args[0] == info
// syscall.args[1] == fpregs
syscall.args[2] = user_context as *mut _ as isize;
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall.num == SyscallNum::Sigaltstack {
// syscall.args[0] == new_ss
// syscall.args[1] == old_ss
@ -751,21 +749,27 @@ fn do_sysret(user_context: &mut CpuContext) -> ! {
fn do_exit_task() -> !;
}
if current!().status() != ThreadStatus::Exited {
// Restore the floating point registers
// Todo: Is it correct to do fxstor in kernel?
let fpregs = user_context.fpregs;
if (fpregs != ptr::null_mut()) {
if user_context.fpregs_on_heap == 1 {
let fpregs = unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) };
fpregs.restore();
} else {
unsafe { fpregs.as_ref().unwrap().restore() };
if user_context.extra_context_ptr != ptr::null_mut() {
match user_context.extra_context {
ExtraContext::Fpregs => {
let fpregs = user_context.extra_context_ptr as *mut FpRegs;
unsafe { fpregs.as_ref().unwrap().restore() };
// The fpregs must be allocated on heap
drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) });
}
ExtraContext::Xsave => {
let xsave_area = user_context.extra_context_ptr;
unsafe { (&*(xsave_area as *mut XsaveArea)).restore() };
}
}
user_context.extra_context_ptr = ptr::null_mut();
}
unsafe { __occlum_sysret(user_context) } // jump to user space
} else {
if user_context.fpregs != ptr::null_mut() && user_context.fpregs_on_heap == 1 {
drop(unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) });
if user_context.extra_context_ptr != ptr::null_mut()
&& matches!(user_context.extra_context, ExtraContext::Fpregs)
{
drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) });
}
unsafe { do_exit_task() } // exit enclave
}
@ -828,6 +832,12 @@ fn do_msync(addr: usize, size: usize, flags: u32) -> Result<isize> {
Ok(0)
}
fn do_madvice(addr: usize, length: usize, advice: i32) -> Result<isize> {
let flags = MadviceFlags::from_i32(advice)?;
vm::do_madvice(addr, length, flags)?;
Ok(0)
}
fn do_sysinfo(info: *mut sysinfo_t) -> Result<isize> {
check_mut_ptr(info)?;
let info = unsafe { &mut *info };
@ -977,7 +987,6 @@ fn handle_unsupported() -> Result<isize> {
/// Floating point registers
///
/// Note. The area is used to save fxsave result
//#[derive(Clone, Copy)]
#[repr(C)]
pub struct FpRegs {
inner: Aligned<A16, [u8; 512]>,
@ -1017,6 +1026,41 @@ impl FpRegs {
}
}
#[derive(Debug)]
#[repr(C)]
pub struct XsaveArea {
inner: Aligned<A64, [u8; 4096]>,
}
impl XsaveArea {
// The first 512 bytes of the xsave area are the legacy FXSAVE region holding the FP registers
const FXSAVE_AREA_LEN: usize = 512;
/// Save the current CPU extended states (via XSAVE) into an XsaveArea instance
pub fn save() -> Self {
let mut xsave_area = MaybeUninit::<Self>::uninit();
unsafe {
save_xregs(xsave_area.as_mut_ptr() as *mut u8);
xsave_area.assume_init()
}
}
/// Restore the CPU extended states from this XsaveArea instance
pub fn restore(&self) {
unsafe {
restore_xregs(self.inner.as_ptr());
}
}
pub fn get_fpregs(&self) -> FpRegs {
unsafe { FpRegs::from_slice(&self.inner[..Self::FXSAVE_AREA_LEN]) }
}
pub fn set_fpregs_area(&mut self, fpregs: FpRegs) {
self.inner[..Self::FXSAVE_AREA_LEN].copy_from_slice(fpregs.as_slice())
}
}
/// Cpu context.
///
/// Note. The definition of this struct must be kept in sync with the assembly
@ -1042,8 +1086,21 @@ pub struct CpuContext {
pub rsp: u64,
pub rip: u64,
pub rflags: u64,
pub fpregs_on_heap: u64,
pub fpregs: *mut FpRegs,
pub extra_context: ExtraContext,
pub extra_context_ptr: *mut u8,
}
#[repr(u64)]
#[derive(Clone, Copy, Debug)]
pub enum ExtraContext {
Fpregs = 0,
Xsave = 1,
}
impl Default for ExtraContext {
fn default() -> Self {
Self::Fpregs
}
}
impl CpuContext {
@ -1067,8 +1124,8 @@ impl CpuContext {
rsp: src.rsp,
rip: src.rip,
rflags: src.rflags,
fpregs_on_heap: 0,
fpregs: ptr::null_mut(),
extra_context: Default::default(),
extra_context_ptr: ptr::null_mut(),
}
}
}
@ -1082,14 +1139,15 @@ impl CpuContext {
// pointer that is not safe to use by external modules. In our case, the
// FpRegs pointer will not be used actually. So the Rust warning is a
// false alarm. We suppress it here.
pub unsafe fn exception_interrupt_syscall_c_abi(
num: u32,
info: *mut c_void,
fpregs: *mut FpRegs,
) -> u32 {
pub unsafe fn exception_interrupt_syscall_c_abi(num: u32, info: *mut c_void) -> u32 {
#[allow(improper_ctypes)]
extern "C" {
pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void, fpregs: *mut FpRegs) -> u32;
pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void) -> u32;
}
__occlum_syscall_c_abi(num, info, fpregs)
__occlum_syscall_c_abi(num, info)
}
extern "C" {
pub fn save_xregs(save_area: *mut u8);
pub fn restore_xregs(save_area: *const u8);
}

@ -52,8 +52,8 @@ __occlum_syscall_linux_abi:
// Save the target CPU state when `call __occlum_syscall` is returned in
// a CpuContext struct. The registers are saved in the reverse order of
// the fields in CpuContext.
pushq $0 // default fpregs is NULL
pushq $0 // default fpregs is allocated on stack
pushq $0 // default extra_context_ptr is NULL
pushq $0 // default extra_context is floating point registers
pushfq
push %rcx // save %rip
push %r11 // save %rsp

@ -100,16 +100,9 @@ impl Chunk {
*options.perms(),
options.initializer().backed_file(),
current!().process().pid(),
);
// Initialize the memory of the new range
unsafe {
let buf = vm_range.as_slice_mut();
options.initializer().init_slice(buf)?;
}
// Set memory permissions
if !options.perms().is_default() {
VMPerms::apply_perms(&vm_area, vm_area.perms());
}
)
.init_memory(options)?;
Ok(Self::new_chunk_with_vma(vm_area))
}
@ -238,6 +231,30 @@ impl Chunk {
}
}
pub fn handle_page_fault(
&self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
let internal = &self.internal;
match self.internal() {
ChunkType::SingleVMA(vma) => {
let mut vma = vma.lock().unwrap();
debug_assert!(vma.contains(pf_addr));
return vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
ChunkType::MultiVMA(internal_manager) => {
return internal_manager
.lock()
.unwrap()
.chunk_manager
.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
}
}
pub fn is_free_range(&self, request_range: &VMRange) -> bool {
match self.internal() {
ChunkType::SingleVMA(_) => false, // single-vma chunk can't be free

@ -63,11 +63,13 @@ use std::fmt;
mod chunk;
mod free_space_manager;
mod page_tracker;
mod process_vm;
mod shm_manager;
mod user_space_vm;
mod vm_area;
mod vm_chunk_manager;
mod vm_epc;
mod vm_layout;
mod vm_manager;
mod vm_perms;
@ -77,9 +79,12 @@ mod vm_util;
use self::vm_layout::VMLayout;
pub use self::chunk::{ChunkRef, ChunkType};
pub use self::process_vm::{MMapFlags, MRemapFlags, MSyncFlags, ProcessVM, ProcessVMBuilder};
pub use self::process_vm::{
MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, ProcessVM, ProcessVMBuilder,
};
pub use self::user_space_vm::USER_SPACE_VM_MANAGER;
pub use self::vm_area::VMArea;
pub use self::vm_epc::enclave_page_fault_handler;
pub use self::vm_manager::MunmapChunkFlag;
pub use self::vm_perms::VMPerms;
pub use self::vm_range::VMRange;
@ -154,4 +159,9 @@ pub fn do_msync(addr: usize, size: usize, flags: MSyncFlags) -> Result<()> {
current!().vm().msync(addr, size)
}
pub fn do_madvice(addr: usize, length: usize, advice: MadviceFlags) -> Result<()> {
warn!("madvice is not supported. madvice flags:{:?}", advice);
Ok(())
}
pub const PAGE_SIZE: usize = 4096;

@ -0,0 +1,488 @@
use super::*;
use super::user_space_vm::USER_SPACE_VM_MANAGER;
use super::vm_util::{GB, KB, MB};
use bitvec::vec::BitVec;
use util::sync::RwLock;
use vm_epc::EPCMemType;
// In SGX v2, there is no upper limit on the EPC size. If the user configures 1 TB of memory
// and we use one bit to track whether each page is committed, that is 1 TB / 4 KB / 8 bits = 32 MB of memory,
// and this footprint stays the same for the whole libOS life cycle.
// In order to track the commit status of such a huge number of pages, we use two-level tracking.
// At the first (global) level, we use `PAGE_CHUNK_UNIT` as the unit size of a page chunk.
// At the second level, we use the page size as the unit and one bit to represent whether a page is committed.
// For example, if the user configures 64 TB of memory, when a page is committed, the second-level tracker marks the corresponding bit as 1.
// When all the pages of a whole global page chunk are committed, the global-level tracker marks the page chunk as fully committed
// and the corresponding tracker can be freed. In this way, just a few bytes can represent the commit status of a big chunk of memory.
// In the worst case, several discrete global page chunks are not fully committed at the same time,
// and each of them takes some space in memory. In a memory-intensive case, we can
// commit the pages by hand to make the global page chunks fully committed and free their page trackers.
// There are three types of data structures tracking the page status, from top to bottom:
// 1. PageChunkManager - Created for the whole user space. This structure manages the global paging status.
// 2. GlobalPageChunk - Denotes a chunk of pages; the actual unit of the PageChunkManager. It holds the paging status of a memory range and is stored only
//    in the PageChunkManager. A newly created VMA asks the corresponding GlobalPageChunk for the paging status. When all the pages recorded by a
//    GlobalPageChunk are committed, it marks itself as "fully committed" and frees the inner structure tracking the paging status. All GlobalPageChunks
//    record VM ranges of the SAME size.
// 3. PageTracker - The real tracker of the paging status. Under the hood, it is a bitvec that tracks every page with one bit. There are two kinds of
//    PageTracker:
//    * GlobalTracker - Used by GlobalPageChunk to track the paging status. All global trackers record VM ranges of the same size.
//    * VMATracker - Used by a VMA to track its own paging status. The range size varies with the VMA.
//    Since VM operations are mostly performed through VMAs, the VMA tracker updates itself accordingly and also updates the corresponding GlobalTracker.
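The sizes quoted in the comment above can be checked directly (an illustration, not code from this commit): with 4 KB pages and one bit per page, a flat bitmap for 1 TB of user space is (1 << 40) / 4096 / 8 = 32 MB and stays resident forever, while a per-chunk tracker for a 4 MB PAGE_CHUNK_UNIT only needs 4 MB / 4 KB = 1024 bits = 128 bytes and is freed as soon as its chunk is fully committed:

// Back-of-the-envelope bookkeeping costs behind the two-level design.
const FLAT_BITMAP_BYTES: usize = (1 << 40) / 4096 / 8; // 32 MB for 1 TB of user space
const CHUNK_TRACKER_BYTES: usize = (4 * 1024 * 1024) / 4096 / 8; // 128 bytes per 4 MB chunk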
lazy_static! {
pub static ref USER_SPACE_PAGE_CHUNK_MANAGER: RwLock<PageChunkManager> =
RwLock::new(PageChunkManager::new(USER_SPACE_VM_MANAGER.range()));
}
const PAGE_CHUNK_UNIT: usize = 4 * MB;
const PAGE_CHUNK_PAGE_NUM: usize = PAGE_CHUNK_UNIT / PAGE_SIZE;
pub struct PageChunkManager {
// The total range that the manager manages.
range: VMRange,
// The page chunks
inner: HashMap<usize, GlobalPageChunk>, // K: Page chunk start address, V: Global page chunk
}
impl PageChunkManager {
fn new(range: &VMRange) -> Self {
Self {
range: range.clone(),
inner: HashMap::new(),
}
}
}
#[derive(Debug)]
// A chunk of pages. Memory space is precious. Don't put anything unnecessary.
struct GlobalPageChunk {
fully_committed: bool,
tracker: Option<Arc<RwLock<PageTracker>>>, // if this page chunk is fully committed, the tracker will be set to None.
}
impl GlobalPageChunk {
fn new(tracker: PageTracker) -> Self {
Self {
fully_committed: false,
tracker: Some(Arc::new(RwLock::new(tracker))),
}
}
}
#[derive(PartialEq, Clone, Debug)]
enum TrackerType {
GlobalTracker, // PAGE_CHUNK_UNIT size for global management to track the global paging status
VMATracker, // various size for different vma to track its own paging status
}
// Used for tracking the paging status of global tracker or VMA tracker
#[derive(Clone)]
pub struct PageTracker {
type_: TrackerType,
range: VMRange,
inner: BitVec,
fully_committed: bool,
}
impl Debug for PageTracker {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("PageTracker")
.field("type", &self.type_)
.field("range", &self.range)
.field("fully committed", &self.fully_committed)
.finish()
}
}
impl PageTracker {
// Create a new page tracker for GlobalPageChunk.
// When a new global tracker is needed, none of the pages are committed.
fn new_global_tracker(start_addr: usize) -> Result<Self> {
let range = VMRange::new_with_size(start_addr, PAGE_CHUNK_UNIT)?;
let inner = bitvec![0; PAGE_CHUNK_PAGE_NUM];
Ok(Self {
type_: TrackerType::GlobalTracker,
range,
inner,
fully_committed: false,
})
}
pub fn new_vma_tracker(vm_range: &VMRange, epc_type: &EPCMemType) -> Result<Self> {
trace!("new vma tracker, range = {:?}", vm_range);
let page_num = vm_range.size() / PAGE_SIZE;
let new_vma_tracker = match epc_type {
EPCMemType::UserRegion => {
let mut new_vma_tracker = Self {
type_: TrackerType::VMATracker,
range: vm_range.clone(),
inner: bitvec![0; page_num],
fully_committed: false,
};
// Skip sentry
if page_num != 0 {
new_vma_tracker.get_committed_pages_from_global_tracker()?;
}
new_vma_tracker
}
EPCMemType::Reserved => {
// For reserved memory, there is no need to update the global page tracker,
// and there is no GlobalPageChunk for reserved memory.
Self {
type_: TrackerType::VMATracker,
range: vm_range.clone(),
inner: bitvec![1; page_num],
fully_committed: true,
}
}
_ => unreachable!(),
};
Ok(new_vma_tracker)
}
pub fn range(&self) -> &VMRange {
&self.range
}
pub fn is_fully_committed(&self) -> bool {
self.fully_committed
}
pub fn is_reserved_only(&self) -> bool {
!self.fully_committed && self.inner.not_any()
}
pub fn is_partially_committed(&self) -> bool {
!self.fully_committed && self.inner.any()
}
// Get all committed or uncommitted ranges of consecutive pages.
// If committed is true, get all committed ranges
// If committed is false, get all uncommitted ranges
pub fn get_ranges(&self, committed: bool) -> Vec<VMRange> {
if self.is_fully_committed() {
if committed {
return vec![self.range.clone()];
} else {
return Vec::new();
}
}
if self.is_reserved_only() {
if committed {
return Vec::new();
} else {
return vec![self.range.clone()];
}
}
let tracker_start_addr = self.range.start();
let mut ret = Vec::new();
let mut start = None;
let mut end = None;
for i in 0..self.inner.len() {
if self.inner[i] == committed {
match (start, end) {
// Meet committed page for the first time. Update both the start and end marker.
(None, None) => {
start = Some(i);
end = Some(i);
// Reach the end of the tracker. Only one page
if i == self.inner.len() - 1 {
let committed_range = VMRange::new_with_size(
tracker_start_addr + i * PAGE_SIZE,
PAGE_SIZE,
)
.unwrap();
ret.push(committed_range);
}
}
// Previous pages are committed. Update the end marker.
(Some(s), Some(e)) => {
end = Some(i);
// Reach the end of the tracker.
if i == self.inner.len() - 1 {
let committed_range = VMRange::new_with_size(
tracker_start_addr + s * PAGE_SIZE,
PAGE_SIZE * (i - s + 1),
)
.unwrap();
ret.push(committed_range);
}
}
_ => unreachable!(),
}
} else {
match (start, end) {
(None, None) => {
// No committed pages.
}
(Some(s), Some(e)) => {
// Meet the first uncommitted pages after recording all the previous committed pages.
let committed_range = VMRange::new_with_size(
tracker_start_addr + s * PAGE_SIZE,
PAGE_SIZE * (e - s + 1),
)
.unwrap();
ret.push(committed_range);
// Reset markers
start = None;
end = None;
}
_ => {
unreachable!()
}
}
}
}
let total_size = ret.iter().fold(0, |a, b| a + b.size());
if committed {
trace!("get committed ranges = {:?}", ret);
debug_assert!(total_size == self.inner.count_ones() * PAGE_SIZE);
} else {
trace!("get uncommitted ranges = {:?}", ret);
debug_assert!(total_size == self.inner.count_zeros() * PAGE_SIZE);
}
ret
}
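As an illustration of the scan above (hypothetical values, not a test in this commit): for a tracker whose range starts at 0x10000 and whose bits are [1, 1, 0, 1] over 4 KB pages,

// get_ranges(true)  -> [0x10000, 0x12000), [0x13000, 0x14000)   (committed pages)
// get_ranges(false) -> [0x12000, 0x13000)                       (uncommitted page)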
pub fn split_for_new_range(&mut self, new_range: &VMRange) {
debug_assert!(self.range.is_superset_of(new_range));
let new_start = new_range.start();
let page_num = new_range.size() / PAGE_SIZE;
let split_idx = (new_start - self.range.start()) / PAGE_SIZE;
let mut new_inner = self.inner.split_off(split_idx);
new_inner.truncate(page_num);
trace!(
"old range= {:?}, new_start = {:x}, idx = {:?}",
self.range,
new_start,
split_idx
);
self.inner = new_inner;
if self.inner.all() {
self.fully_committed = true;
}
self.range = *new_range;
}
// Commit memory for the whole current VMA (VMATracker)
pub fn commit_whole(&mut self, perms: VMPerms) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
if self.is_fully_committed() {
return Ok(());
}
// Commit EPC
if self.is_reserved_only() {
vm_epc::commit_memory(self.range().start(), self.range().size(), Some(perms)).unwrap();
} else {
debug_assert!(self.is_partially_committed());
let uncommitted_ranges = self.get_ranges(false);
for range in uncommitted_ranges {
vm_epc::commit_memory(range.start(), range.size(), Some(perms)).unwrap();
}
}
// Update the tracker
self.inner.fill(true);
self.fully_committed = true;
self.set_committed_pages_for_global_tracker(self.range().start(), self.range().size());
Ok(())
}
// Commit memory of a specific range for the current VMA (VMATracker). The range should be verified by caller.
pub fn commit_range(&mut self, range: &VMRange, new_perms: Option<VMPerms>) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
debug_assert!(self.range().is_superset_of(range));
vm_epc::commit_memory(range.start(), range.size(), new_perms)?;
self.commit_pages_common(range.start(), range.size());
self.set_committed_pages_for_global_tracker(range.start(), range.size());
Ok(())
}
pub fn commit_memory_and_init_with_file(
&mut self,
range: &VMRange,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
debug_assert!(self.range().is_superset_of(range));
vm_epc::commit_memory_and_init_with_file(
range.start(),
range.size(),
file,
file_offset,
new_perms,
)?;
self.commit_pages_common(range.start(), range.size());
self.set_committed_pages_for_global_tracker(range.start(), range.size());
Ok(())
}
// The VMATracker gets the page commit status from the global tracker and updates itself.
// This should be called when the VMATracker is initialized.
fn get_committed_pages_from_global_tracker(&mut self) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
let mut vma_tracker = self;
let mut page_chunk_start = get_page_chunk_start_addr(vma_tracker.range().start());
let range_end = vma_tracker.range().end();
for page_chunk_addr in (page_chunk_start..range_end).step_by(PAGE_CHUNK_UNIT) {
let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap();
if let Some(page_chunk) = manager.inner.get(&page_chunk_addr) {
if page_chunk.fully_committed {
// The global page chunk is fully committed. Commit the corresponding pages in the VMA tracker.
vma_tracker.commit_pages_common(page_chunk_addr, PAGE_CHUNK_UNIT);
} else {
debug_assert!(page_chunk.tracker.is_some());
let global_tracker = page_chunk.tracker.as_ref().unwrap().read().unwrap();
global_tracker.set_committed_pages_for_vma_tracker(vma_tracker);
}
drop(manager);
} else {
// Not tracking this page chunk. Release read lock and acquire write lock for an update.
drop(manager);
// This page chunk is not tracked by global tracker. Thus none of the pages are committed.
let page_chunk = {
let global_page_tracker = PageTracker::new_global_tracker(page_chunk_addr)?;
GlobalPageChunk::new(global_page_tracker)
};
// There could be data race here. But it's fine, because the ultimate state is the same.
USER_SPACE_PAGE_CHUNK_MANAGER
.write()
.unwrap()
.inner
.insert(page_chunk_addr, page_chunk);
}
}
Ok(())
}
// The VMATracker updates the global tracker based on its own paging status.
// This should be called whenever the VMATracker changes and needs to sync with the GlobalTracker.
fn set_committed_pages_for_global_tracker(&self, commit_start_addr: usize, commit_size: usize) {
debug_assert!(self.type_ == TrackerType::VMATracker);
let commit_end_addr = commit_start_addr + commit_size;
let page_chunk_start_addr = get_page_chunk_start_addr(commit_start_addr);
for page_chunk_addr in (page_chunk_start_addr..commit_end_addr).step_by(PAGE_CHUNK_UNIT) {
let is_global_tracker_fully_committed = {
// Find the corresponding page chunk
let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap();
let page_chunk = manager
.inner
.get(&page_chunk_addr)
.expect("this page chunk must exist");
// Update the global page tracker
if let Some(global_page_tracker) = &page_chunk.tracker {
let mut global_tracker = global_page_tracker.write().unwrap();
global_tracker.commit_pages_common(commit_start_addr, commit_size);
global_tracker.fully_committed
} else {
// page_tracker is none, the page chunk is fully committed. Go to next chunk.
debug_assert!(page_chunk.fully_committed);
continue;
}
};
// Free the global page tracker if fully committed
if is_global_tracker_fully_committed {
// Update the global page chunk manager. We need to acquire the write lock this time. There can be a data race because the lock
// is dropped for a while before being acquired again. But it's fine, because the final state is the same.
let mut manager = USER_SPACE_PAGE_CHUNK_MANAGER.write().unwrap();
if let Some(mut page_chunk) = manager.inner.get_mut(&page_chunk_addr) {
page_chunk.fully_committed = true;
page_chunk.tracker = None;
} else {
warn!(
"the global page chunk with start addr: 0x{:x} has been freed already",
page_chunk_addr
);
unreachable!();
}
}
}
}
// The GlobalTracker updates the VMATracker based on its own paging status.
// This should be called when the VMATracker is initialized.
fn set_committed_pages_for_vma_tracker(&self, vma_tracker: &mut PageTracker) {
debug_assert!(self.type_ == TrackerType::GlobalTracker);
debug_assert!(vma_tracker.type_ == TrackerType::VMATracker);
let global_tracker = self;
if let Some(intersection_range) = global_tracker.range().intersect(vma_tracker.range()) {
let vma_tracker_page_id =
(intersection_range.start() - vma_tracker.range().start()) / PAGE_SIZE;
let global_tracker_page_id =
(intersection_range.start() - global_tracker.range().start()) / PAGE_SIZE;
let page_num = intersection_range.size() / PAGE_SIZE;
vma_tracker.inner[vma_tracker_page_id..vma_tracker_page_id + page_num]
.copy_from_bitslice(
&global_tracker.inner
[global_tracker_page_id..global_tracker_page_id + page_num],
);
if vma_tracker.inner.all() {
vma_tracker.fully_committed = true;
}
} else {
// No intersecting range, so why was this called? Weird.
unreachable!();
}
}
// Commit pages for page tracker itself. This is a common method for both VMATracker and GlobalTracker.
fn commit_pages_common(&mut self, start_addr: usize, size: usize) {
debug_assert!(!self.fully_committed);
if let Some(intersection_range) = {
let range = VMRange::new_with_size(start_addr, size).unwrap();
self.range.intersect(&range)
} {
trace!("commit for page tracker: {:?}", self);
let page_start_id = (intersection_range.start() - self.range().start()) / PAGE_SIZE;
let page_num = intersection_range.size() / PAGE_SIZE;
self.inner[page_start_id..page_start_id + page_num].fill(true);
if self.inner.all() {
self.fully_committed = true;
}
} else {
// No intersecting range, weird
unreachable!();
}
}
}
#[inline(always)]
fn get_page_chunk_start_addr(addr: usize) -> usize {
align_down(addr, PAGE_CHUNK_UNIT)
}

@ -6,7 +6,8 @@ use super::vm_area::VMArea;
use super::vm_manager::MunmapChunkFlag;
use super::vm_perms::VMPerms;
use super::vm_util::{
FileBacked, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, VMRemapOptions,
FileBacked, PagePolicy, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder,
VMRemapOptions,
};
use crate::config;
use crate::ipc::SHM_MANAGER;
@ -124,6 +125,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
.initializer(VMInitializer::ElfSpecific {
elf_file: elf_file.file_ref().clone(),
})
// We only load loadable segments, just commit the memory when allocating.
.page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -152,6 +155,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
.size(heap_layout.size())
.align(heap_layout.align())
.perms(VMPerms::READ | VMPerms::WRITE)
.page_policy(PagePolicy::CommitOnDemand)
// .page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -171,8 +176,10 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
let stack_layout = &other_layouts[1];
let vm_option = VMMapOptionsBuilder::default()
.size(stack_layout.size())
.align(heap_layout.align())
.align(stack_layout.align())
.perms(VMPerms::READ | VMPerms::WRITE)
// There are cases we can't handle when a #PF happens on the user's stack. Commit the stack memory now.
.page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -537,11 +544,26 @@ impl ProcessVM {
}
}
};
let page_policy = {
if flags.contains(MMapFlags::MAP_STACK) {
// With MAP_STACK, the mmapped memory will be used as the user's stack. If it is not committed, a #PF can occur
// when switching to user space, and we can't handle it correctly.
PagePolicy::CommitNow
} else if !flags.contains(MMapFlags::MAP_ANONYMOUS) {
// Use commit-now policy for file-backed mmap. We tried the commit-on-demand policy, but didn't get any performance gain at all.
// However, the path for file-backed mmap with commit-on-demand policy is ready. We can enable this whenever needed.
PagePolicy::CommitNow
} else {
PagePolicy::CommitOnDemand
}
};
let mmap_options = VMMapOptionsBuilder::default()
.size(size)
.addr(addr_option)
.perms(perms)
.initializer(initializer)
.page_policy(page_policy)
.build()?;
let mmap_addr = USER_SPACE_VM_MANAGER.mmap(&mmap_options)?;
Ok(mmap_addr)
@ -674,3 +696,33 @@ impl MSyncFlags {
Ok(flags)
}
}
#[allow(non_camel_case_types)]
#[repr(i32)]
#[derive(Debug)]
pub enum MadviceFlags {
MADV_NORMAL = 0,
MADV_RANDOM = 1,
MADV_SEQUENTIAL = 2,
MADV_WILLNEED = 3,
MADV_DONTNEED = 4,
}
impl MadviceFlags {
pub fn from_i32(raw: i32) -> Result<Self> {
const MADV_NORMAL: i32 = 0;
const MADV_RANDOM: i32 = 1;
const MADV_SEQUENTIAL: i32 = 2;
const MADV_WILLNEED: i32 = 3;
const MADV_DONTNEED: i32 = 4;
match raw {
MADV_NORMAL => Ok(MadviceFlags::MADV_NORMAL),
MADV_RANDOM => Ok(MadviceFlags::MADV_RANDOM),
MADV_SEQUENTIAL => Ok(MadviceFlags::MADV_SEQUENTIAL),
MADV_WILLNEED => Ok(MadviceFlags::MADV_WILLNEED),
MADV_DONTNEED => Ok(MadviceFlags::MADV_DONTNEED),
_ => return_errno!(ENOSYS, "unknown madvice flags"),
}
}
}

@ -206,8 +206,8 @@ impl ShmManager {
let old_perms = old_vma.perms();
if new_perms != old_perms {
let perms = new_perms | old_perms;
VMPerms::apply_perms(new_vma.range(), perms);
new_vma.set_perms(perms);
new_vma.modify_permissions_for_committed_pages(perms);
}
let inode_id = Self::inode_id_of(&new_vma);
@ -279,7 +279,7 @@ impl ShmManager {
if perms == old_perms {
return;
}
VMPerms::apply_perms(vma.range(), perms);
vma.set_perms(perms);
vma.modify_permissions_for_committed_pages(perms);
}
}

@ -1,46 +1,50 @@
use super::*;
use super::vm_manager::VMManager;
use crate::config::LIBOS_CONFIG;
use crate::ctor::dtor;
use crate::ipc::SHM_MANAGER;
use crate::ipc::SYSTEM_V_SHM_MANAGER;
use crate::util::pku_util;
use std::ops::{Deref, DerefMut};
use vm_epc::SGXPlatform;
use vm_manager::VMManager;
use vm_perms::VMPerms;
const RSRV_MEM_PERM: MemPerm =
MemPerm::from_bits_truncate(MemPerm::READ.bits() | MemPerm::WRITE.bits());
const USER_SPACE_DEFAULT_MEM_PERM: VMPerms = VMPerms::DEFAULT;
/// The virtual memory manager for the entire user space
pub struct UserSpaceVMManager(VMManager);
pub struct UserSpaceVMManager {
inner: VMManager,
sgx_platform: SGXPlatform,
}
impl UserSpaceVMManager {
fn new() -> Result<UserSpaceVMManager> {
let rsrv_mem_size = LIBOS_CONFIG.resource_limits.user_space_size;
let vm_range = unsafe {
// TODO: Current sgx_alloc_rsrv_mem implementation will commit all the pages of the desired size, which will consume
// a lot of time. When EDMM is supported, there is no need to commit all the pages at the initialization stage. A function
// which reserves memory but not commit pages should be provided then.
let ptr = sgx_alloc_rsrv_mem(rsrv_mem_size);
if ptr.is_null() {
return_errno!(ENOMEM, "run out of reserved memory");
}
let sgx_platform = SGXPlatform::new();
let init_size = LIBOS_CONFIG.resource_limits.user_space_init_size;
let max_size = LIBOS_CONFIG.resource_limits.user_space_max_size;
// Without EDMM support, if ReservedMemExecutable is set to 1, the reserved memory will be RWX, and we can't change the reserved memory permission.
// With EDMM support, the reserved memory permission is RW by default, and we can change the permissions when needed.
let (userspace_vm_range, gap_range) = sgx_platform.alloc_user_space(init_size, max_size)?;
let addr = ptr as usize;
debug!(
"allocated rsrv addr is 0x{:x}, len is 0x{:x}",
addr, rsrv_mem_size
);
pku_util::pkey_mprotect_userspace_mem(addr, rsrv_mem_size, RSRV_MEM_PERM.bits());
VMRange::new(addr, addr + rsrv_mem_size)?
};
info!(
"user space allocated, range = {:?}, gap_range = {:?}",
userspace_vm_range, gap_range
);
let vm_manager = VMManager::init(vm_range)?;
// Use pkey_mprotect to set the whole user space to R/W permissions. If the user specifies new
// permissions, the mprotect ocall will update them.
pku_util::pkey_mprotect_userspace_mem(
&userspace_vm_range,
gap_range.as_ref(),
USER_SPACE_DEFAULT_MEM_PERM,
);
Ok(UserSpaceVMManager(vm_manager))
let vm_manager = VMManager::init(userspace_vm_range, gap_range)?;
Ok(Self {
inner: vm_manager,
sgx_platform,
})
}
pub fn get_total_size(&self) -> usize {
@ -52,51 +56,34 @@ impl UserSpaceVMManager {
// be called after the main function. Static variables are still safe to visit at this time.
#[dtor]
fn free_user_space() {
SHM_MANAGER.clean_when_libos_exit();
let range = USER_SPACE_VM_MANAGER.range();
info!("free user space at the end");
SYSTEM_V_SHM_MANAGER.clean_when_libos_exit();
let total_user_space_range = USER_SPACE_VM_MANAGER.range();
let gap_range = USER_SPACE_VM_MANAGER.gap_range();
assert!(USER_SPACE_VM_MANAGER.verified_clean_when_exit());
let addr = range.start();
let size = range.size();
info!("free user space VM: {:?}", range);
pku_util::clear_pku_when_libos_exit(addr, size, RSRV_MEM_PERM.bits());
assert!(unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) == 0 });
let addr = total_user_space_range.start();
let size = total_user_space_range.size();
info!("free user space VM: {:?}", total_user_space_range);
pku_util::clear_pku_when_libos_exit(
total_user_space_range,
gap_range.as_ref(),
USER_SPACE_DEFAULT_MEM_PERM,
);
USER_SPACE_VM_MANAGER
.sgx_platform
.free_user_space(total_user_space_range, gap_range.as_ref());
}
impl Deref for UserSpaceVMManager {
type Target = VMManager;
fn deref(&self) -> &Self::Target {
&self.0
&self.inner
}
}
lazy_static! {
pub static ref USER_SPACE_VM_MANAGER: UserSpaceVMManager = UserSpaceVMManager::new().unwrap();
}
bitflags! {
struct MemPerm: i32 {
const READ = 1;
const WRITE = 2;
const EXEC = 4;
}
}
extern "C" {
// Allocate a range of EPC memory from the reserved memory area with RW permission
//
// Parameters:
// Inputs: length [in]: Size of region to be allocated in bytes. Page aligned
// Return: Starting address of the new allocated memory area on success; otherwise NULL
//
fn sgx_alloc_rsrv_mem(length: usize) -> *const c_void;
// Free a range of EPC memory from the reserved memory area
//
// Parameters:
// Inputs: addr[in]: Starting address of region to be freed. Page aligned.
// length[in]: The length of the memory to be freed in bytes. Page aligned
// Return: 0 on success; otherwise -1
//
fn sgx_free_rsrv_mem(addr: *const c_void, length: usize) -> i32;
}

@ -1,19 +1,28 @@
use super::*;
use super::page_tracker::PageTracker;
use super::vm_epc::EPCMemType;
use super::vm_perms::VMPerms;
use super::vm_range::VMRange;
use super::vm_util::FileBacked;
use super::vm_util::{FileBacked, PagePolicy, VMInitializer, VMMapOptions, GB, KB, MB};
use intrusive_collections::rbtree::{Link, RBTree};
use intrusive_collections::{intrusive_adapter, KeyAdapter};
use std::ops::{Deref, DerefMut};
#[derive(Clone, Debug, Default)]
// Unit size of memory to commit when a #PF occurs.
const COMMIT_SIZE_UNIT: usize = 4 * KB;
// Commit the whole VMA when this threshold is reached.
const PF_NUM_THRESHOLD: u64 = 3;
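A hedged sketch of the escalation policy these constants suggest (a hypothetical helper for illustration; the actual page-fault handling lives in the VMA methods added by this commit):

// Hypothetical: commit a small unit per #PF, and the whole remaining VMA once
// the per-VMA fault counter reaches PF_NUM_THRESHOLD.
fn commit_size_for_fault(pf_count: u64, vma_size: usize) -> usize {
    if pf_count >= PF_NUM_THRESHOLD {
        vma_size
    } else {
        COMMIT_SIZE_UNIT.min(vma_size)
    }
}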
#[derive(Clone, Debug)]
pub struct VMArea {
range: VMRange,
perms: VMPerms,
file_backed: Option<FileBacked>,
access: VMAccess,
pages: Option<PageTracker>, // Track the paging status of this VMA
epc_type: EPCMemType, // Track the type of the EPC to use specific APIs
pf_count: u64,
}
#[derive(Clone, Debug, Eq, PartialEq)]
@ -32,11 +41,47 @@ impl VMArea {
file_backed: Option<FileBacked>,
pid: pid_t,
) -> Self {
Self {
let epc_type = EPCMemType::new(&range);
let pages = {
match epc_type {
EPCMemType::Reserved => None,
EPCMemType::UserRegion => {
let pages =
PageTracker::new_vma_tracker(&range, &EPCMemType::UserRegion).unwrap();
(!pages.is_fully_committed()).then_some(pages)
}
}
};
let new_vma = Self {
range,
perms,
file_backed,
access: VMAccess::Private(pid),
pages,
epc_type,
pf_count: 0,
};
trace!("new vma = {:?}", new_vma);
new_vma
}
fn new_with_page_tracker(
range: VMRange,
perms: VMPerms,
file_backed: Option<FileBacked>,
access: VMAccess,
pages: Option<PageTracker>,
) -> VMArea {
let epc_type = EPCMemType::new(&range);
Self {
range,
perms,
file_backed,
access,
pages,
epc_type,
pf_count: 0,
}
}
@ -49,30 +94,41 @@ impl VMArea {
access: VMAccess,
) -> Self {
debug_assert!(vma.is_superset_of(&new_range));
let new_backed_file = vma.file_backed.as_ref().map(|file| {
let new_backed_file = if let Some(file) = &vma.file_backed {
let mut new_file = file.clone();
let file_offset = file.offset();
let new_file_offset = if vma.start() < new_range.start() {
let vma_offset = new_range.start() - vma.start();
file_offset + vma_offset
} else {
let vma_offset = vma.start() - new_range.start();
debug_assert!(file_offset >= vma_offset);
file_offset - vma_offset
};
debug_assert!(vma.start() <= new_range.start());
let new_start_offset = new_range.start() - vma.start();
let new_file_offset = file_offset + new_start_offset;
new_file.set_offset(new_file_offset);
Some(new_file)
} else {
None
};
new_file
});
let new_pages = {
let mut new_pages = vma.pages.clone();
Self {
range: new_range,
perms: new_perms,
file_backed: new_backed_file,
access,
}
if let Some(pages) = &mut new_pages {
pages.split_for_new_range(&new_range);
if pages.is_fully_committed() {
None
} else {
new_pages
}
} else {
None
}
};
let new_vma =
Self::new_with_page_tracker(new_range, new_perms, new_backed_file, access, new_pages);
trace!("inherits vma: {:?}, create new vma: {:?}", vma, new_vma);
new_vma
}
pub fn perms(&self) -> VMPerms {
@ -87,6 +143,13 @@ impl VMArea {
&self.access
}
pub fn get_private_pid(&self) -> Option<pid_t> {
match &self.access {
VMAccess::Private(pid) => Some(*pid),
VMAccess::Shared(_) => None,
}
}
pub fn belong_to(&self, target_pid: pid_t) -> bool {
match &self.access {
VMAccess::Private(pid) => *pid == target_pid,
@ -105,9 +168,199 @@ impl VMArea {
}
}
pub fn init_file(&self) -> Option<(&FileRef, usize)> {
fn pages(&self) -> &PageTracker {
debug_assert!(!self.is_fully_committed());
self.pages.as_ref().unwrap()
}
fn pages_mut(&mut self) -> &mut PageTracker {
debug_assert!(!self.is_fully_committed());
self.pages.as_mut().unwrap()
}
// Get pid for private VMA
pub fn pid(&self) -> pid_t {
match self.access {
VMAccess::Private(pid) => pid,
VMAccess::Shared(_) => unreachable!(),
}
}
pub fn is_reserved_only(&self) -> bool {
if let Some(pages) = &self.pages {
return pages.is_reserved_only();
} else {
false
}
}
pub fn is_fully_committed(&self) -> bool {
self.pages.is_none()
}
pub fn is_partially_committed(&self) -> bool {
if let Some(pages) = &self.pages {
return pages.is_partially_committed();
} else {
false
}
}
pub fn init_memory(mut self, options: &VMMapOptions) -> Result<Self> {
let mut vm_area = self;
let page_policy = options.page_policy();
// Commit pages if needed
if !vm_area.is_fully_committed() && page_policy == &PagePolicy::CommitNow {
vm_area.pages_mut().commit_whole(VMPerms::DEFAULT)?;
vm_area.pages = None;
}
// Initialize committed memory
if vm_area.is_partially_committed() {
let committed = true;
for range in vm_area.pages().get_ranges(committed) {
vm_area.init_memory_internal(&range, Some(options.initializer()))?;
}
} else if vm_area.is_fully_committed() {
// Initialize the memory of the new range
unsafe {
let buf = vm_area.range().as_slice_mut();
options.initializer().init_slice(buf)?;
}
// Set memory permissions
if !options.perms().is_default() {
vm_area.modify_protection_force(None, vm_area.perms());
}
}
// Do nothing if this vma has no committed memory
Ok(vm_area)
}
pub fn flush_and_clean_memory(&self) -> Result<()> {
let (need_flush, file, file_offset) = match self.writeback_file() {
None => (false, None, None),
Some((file_handle, offset)) => {
if !file_handle.access_mode().unwrap().writable() {
(false, None, None)
} else {
(true, Some(file_handle), Some(offset))
}
}
};
if self.is_fully_committed() {
self.flush_and_clean_internal(self.range(), need_flush, file, file_offset);
} else {
let committed = true;
for range in self.pages().get_ranges(committed) {
self.flush_and_clean_internal(&range, need_flush, file, file_offset);
}
}
Ok(())
}
fn flush_and_clean_internal(
&self,
target_range: &VMRange,
need_flush: bool,
file: Option<&FileRef>,
file_offset: Option<usize>,
) {
trace!("flush and clean committed range: {:?}", target_range);
debug_assert!(self.range().is_superset_of(target_range));
let buf = unsafe { target_range.as_slice_mut() };
if !self.perms().is_default() {
self.modify_protection_force(Some(&target_range), VMPerms::default());
}
if need_flush {
let file_offset = file_offset.unwrap() + (target_range.start() - self.range.start());
file.unwrap().write_at(file_offset, buf);
}
// reset zeros
unsafe {
buf.iter_mut().for_each(|b| *b = 0);
}
}
pub fn modify_permissions_for_committed_pages(&self, new_perms: VMPerms) {
if self.is_fully_committed() {
self.modify_protection_force(None, new_perms);
} else if self.is_partially_committed() {
let committed = true;
for range in self.pages().get_ranges(committed) {
self.modify_protection_force(Some(&range), new_perms);
}
}
}
pub fn handle_page_fault(
&mut self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
trace!("PF vma = {:?}", self);
if (self.perms() == VMPerms::NONE)
|| (!crate::exception::check_rw_bit(errcd)
&& !self.perms().contains(VMPerms::READ))
{
return_errno!(
EACCES,
"Page permission is set to PROT_NONE. This is user-intended"
);
}
if crate::exception::check_rw_bit(errcd) && !self.perms().contains(VMPerms::WRITE) {
return_errno!(
EACCES, "Page permission doesn't include WRITE but this #PF is triggered by a write. This is user-intended"
)
}
if rip == pf_addr && !self.perms().contains(VMPerms::EXEC) {
return_errno!(
EACCES, "Page permission doesn't include EXEC but this #PF is triggered by instruction fetch. This is user-intended"
)
}
if self.is_fully_committed() {
// This vma has been committed by other threads already. Just return.
info!("This vma has been committed by other threads already.");
return Ok(());
}
if matches!(self.epc_type, EPCMemType::Reserved) {
return_errno!(EINVAL, "reserved memory shouldn't trigger PF");
}
if kernel_triggers || self.pf_count >= PF_NUM_THRESHOLD {
return self.commit_current_vma_whole();
}
self.pf_count += 1;
// The returned commit_size can be 0 when other threads have already committed the PF-containing range but the vma is not fully committed yet.
let commit_size = self.commit_once_for_page_fault(pf_addr).unwrap();
trace!("page fault commit memory size = {:?}", commit_size);
if commit_size == 0 {
warn!("This PF has been handled by other threads already.");
}
info!("page fault handle success");
Ok(())
}
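// A worked example of the commit-on-demand policy above: with COMMIT_SIZE_UNIT =
// 4 KB and PF_NUM_THRESHOLD = 3, the first three #PFs on this VMA each commit
// 4 KB starting from the faulting page via commit_once_for_page_fault(); once
// pf_count reaches the threshold (or the fault is triggered by the kernel), the
// remaining uncommitted ranges are committed in one shot by
// commit_current_vma_whole().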
pub fn backed_file(&self) -> Option<(&FileRef, usize)> {
if let Some(file) = &self.file_backed {
Some(file.init_file())
Some(file.backed_file())
} else {
None
}
@ -147,36 +400,51 @@ impl VMArea {
Some(new_vma)
}
pub fn resize(&mut self, new_size: usize) {
self.range.resize(new_size)
}
pub fn set_start(&mut self, new_start: usize) {
let old_start = self.start();
if new_start == old_start {
return;
}
self.range.set_start(new_start);
if let Some(file) = self.file_backed.as_mut() {
if !file.need_write_back() {
return;
if new_start < old_start {
// Extend this VMA
let pages = {
let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap();
(!pages.is_fully_committed()).then_some(pages)
};
self.pages = pages;
} else {
// Split this VMA
debug_assert!(new_start > old_start);
if let Some(pages) = &mut self.pages {
pages.split_for_new_range(&self.range);
if pages.is_fully_committed() {
self.pages = None;
}
}
}
if let Some(file) = self.file_backed.as_mut() {
// If the updates to the VMA needs to write back to a file, then the
// file offset must be adjusted according to the new start address.
let offset = file.offset();
if old_start < new_start {
file.set_offset(offset + (new_start - old_start));
} else {
// The caller must guarantee that the new start makes sense
debug_assert!(offset >= old_start - new_start);
file.set_offset(offset - (old_start - new_start));
}
Self::set_file_offset(file, new_start, old_start);
}
}
fn set_file_offset(file: &mut FileBacked, new_start_offset: usize, old_start_offset: usize) {
let offset = file.offset();
if old_start_offset < new_start_offset {
file.set_offset(offset + (new_start_offset - old_start_offset));
} else {
// The caller must guarantee that the new start makes sense
debug_assert!(offset >= old_start_offset - new_start_offset);
file.set_offset(offset - (old_start_offset - new_start_offset));
}
}
pub fn is_the_same_to(&self, other: &VMArea) -> bool {
if self.access() != other.access() {
return false;
}
if self.range() != other.range() {
return false;
}
@ -185,6 +453,10 @@ impl VMArea {
return false;
}
if self.access() != other.access() {
return false;
}
let self_writeback_file = self.writeback_file();
let other_writeback_file = other.writeback_file();
match (self_writeback_file, other_writeback_file) {
@ -199,6 +471,13 @@ impl VMArea {
pub fn set_end(&mut self, new_end: usize) {
self.range.set_end(new_end);
let pages = if self.range.size() > 0 {
let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap();
(!pages.is_fully_committed()).then_some(pages)
} else {
None
};
self.pages = pages;
}
pub fn can_merge_vmas(left: &VMArea, right: &VMArea) -> bool {
@ -208,10 +487,6 @@ impl VMArea {
if left.size() == 0 || right.size() == 0 {
return false;
}
// The two VMAs must be owned by the same process
if left.access() != right.access() {
return false;
}
// The two VMAs must border with each other
if left.end() != right.start() {
return false;
@ -220,6 +495,15 @@ impl VMArea {
if left.perms() != right.perms() {
return false;
}
// The two VMAs must be owned by the same process privately
// Return false if (either is none) or (both are some but two private pids are different)
let private_access = left.get_private_pid().zip(right.get_private_pid());
if private_access.is_none() {
return false;
}
if private_access.is_some_and(|(left_pid, right_pid)| left_pid != right_pid) {
return false;
}
// If the two VMAs have write-back files, the files must be the same and
// the two file regions must be continuous.
@ -238,12 +522,12 @@ impl VMArea {
}
/// Flush a file-backed VMA to its file. This has no effect on anonymous VMA.
pub fn flush_backed_file(&self) {
self.flush_backed_file_with_cond(|_| true)
pub fn flush_committed_backed_file(&self) {
self.flush_committed_backed_file_with_cond(|_| true)
}
/// Same as `flush_backed_file()`, except that an extra condition on the file needs to satisfy.
pub fn flush_backed_file_with_cond<F: Fn(&FileRef) -> bool>(&self, cond_fn: F) {
/// Same as `flush_committed_backed_file()`, except that an extra condition on the file needs to satisfy.
pub fn flush_committed_backed_file_with_cond<F: Fn(&FileRef) -> bool>(&self, cond_fn: F) {
let (file, file_offset) = match self.writeback_file() {
None => return,
Some((file_and_offset)) => file_and_offset,
@ -258,7 +542,16 @@ impl VMArea {
if !cond_fn(file) {
return;
}
file.write_at(file_offset, unsafe { self.as_slice() });
if self.is_fully_committed() {
file.write_at(file_offset, unsafe { self.as_slice() });
} else {
let committed = true;
let vm_range_start = self.range().start();
for range in self.pages().get_ranges(committed) {
let file_offset = file_offset + (range.start() - vm_range_start);
file.write_at(file_offset, unsafe { range.as_slice() });
}
}
}
pub fn is_shared(&self) -> bool {
@ -310,6 +603,198 @@ impl VMArea {
pub fn inherits_access_from(&mut self, vma: &VMArea) {
self.access = vma.access().clone()
}
// The current implementation uses "unwrap()" to help us find errors quickly by panicking directly. Also, restoring the VM state
// when this function fails would require some work and is not that simple.
// TODO: Return a Result instead of using "unwrap()" in this function.
fn modify_protection_force(&self, protect_range: Option<&VMRange>, new_perms: VMPerms) {
let protect_range = protect_range.unwrap_or_else(|| self.range());
self.epc_type
.modify_protection(protect_range.start(), protect_range.size(), new_perms)
.unwrap()
}
// With an initializer, the memory should already be committed.
// Without an initializer, the memory needs to be committed and then initialized.
fn init_memory_internal(
&mut self,
target_range: &VMRange,
initializer: Option<&VMInitializer>,
) -> Result<()> {
debug_assert!(self.range().is_superset_of(target_range));
trace!("init range = {:?}", target_range);
let perms = self.perms();
if let Some(initializer) = initializer {
match initializer {
VMInitializer::FileBacked { file } => {
let (file, offset) = file.backed_file();
let vma_range_start = self.range.start();
let init_file_offset = offset + (target_range.start() - vma_range_start);
self.init_file_backed_mem(target_range, &file, init_file_offset, perms)?;
}
VMInitializer::DoNothing() => {
if !self.perms().is_default() {
self.modify_protection_force(Some(target_range), perms);
}
}
VMInitializer::FillZeros() => {
unsafe {
let buf = target_range.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0);
}
if !perms.is_default() {
self.modify_protection_force(Some(target_range), perms);
}
}
_ => todo!(),
}
} else {
// No initializer, #PF triggered.
let init_file = self
.backed_file()
.map(|(file, offset)| (file.clone(), offset));
if let Some((file, offset)) = init_file {
let vma_range_start = self.range.start();
let init_file_offset = offset + (target_range.start() - vma_range_start);
self.pages
.as_mut()
.unwrap()
.commit_memory_and_init_with_file(
target_range,
&file,
init_file_offset,
perms,
)?;
} else {
// PF triggered, no file-backed memory, just modify protection
self.pages
.as_mut()
.unwrap()
.commit_range(target_range, Some(perms))?;
}
}
Ok(())
}
fn init_file_backed_mem(
&mut self,
target_range: &VMRange,
file: &FileRef,
file_offset: usize,
new_perm: VMPerms,
) -> Result<()> {
if !file.access_mode().unwrap().readable() {
return_errno!(EBADF, "file is not readable");
}
let buf = unsafe { target_range.as_slice_mut() };
let file_size = file.metadata().unwrap().size;
let len = file
.read_at(file_offset, buf)
.map_err(|_| errno!(EACCES, "failed to init memory from file"))?;
if !new_perm.is_default() {
self.modify_protection_force(Some(target_range), new_perm);
}
Ok(())
}
fn get_commit_once_size(&self) -> usize {
COMMIT_SIZE_UNIT
}
fn commit_once_for_page_fault(&mut self, pf_addr: usize) -> Result<usize> {
debug_assert!(!self.is_fully_committed());
let mut early_return = false;
let mut total_commit_size = 0;
let vma_range_start = self.range.start();
let permission = self.perms();
let committed = false;
let mut uncommitted_ranges = self.pages().get_ranges(committed);
let commit_once_size = self.get_commit_once_size();
for range in uncommitted_ranges
.iter_mut()
.skip_while(|range| !range.contains(pf_addr))
{
// Ranges before the one containing pf_addr have been skipped by skip_while above
if total_commit_size == 0 {
debug_assert!(range.contains(pf_addr));
range.set_start(align_down(pf_addr, PAGE_SIZE));
range.resize(std::cmp::min(range.size(), commit_once_size));
} else if range.size() + total_commit_size > commit_once_size {
// This is not the first commit. Keep committing until reaching commit_once_size
range.resize(commit_once_size - total_commit_size);
}
// We don't handle file-backed memory here
debug_assert!(self.backed_file().is_none());
self.init_memory_internal(&range, None)?;
total_commit_size += range.size();
if total_commit_size >= commit_once_size {
break;
}
}
if self.pages().is_fully_committed() {
trace!("vma is fully committed");
self.pages = None;
}
Ok(total_commit_size)
}
// Only used to handle PF triggered by the kernel
fn commit_current_vma_whole(&mut self) -> Result<()> {
debug_assert!(!self.is_fully_committed());
debug_assert!(self.backed_file().is_none());
let mut uncommitted_ranges = self.pages.as_ref().unwrap().get_ranges(false);
for range in uncommitted_ranges {
self.init_memory_internal(&range, None).unwrap();
}
self.pages = None;
Ok(())
}
// TODO: We can re-enable this when we support lazily extending permissions.
#[allow(dead_code)]
fn page_fault_handler_extend_permission(&mut self, pf_addr: usize) -> Result<()> {
let permission = self.perms();
// This is intended by the application.
if permission == VMPerms::NONE {
return_errno!(EPERM, "trying to access PROT_NONE memory");
}
if self.is_fully_committed() {
self.modify_protection_force(None, permission);
return Ok(());
}
let committed = true;
let committed_ranges = self.pages().get_ranges(committed);
for range in committed_ranges.iter() {
if !range.contains(pf_addr) {
continue;
}
self.epc_type
.modify_protection(range.start(), range.size(), permission)?;
}
Ok(())
}
}
impl Deref for VMArea {

@ -83,16 +83,7 @@ impl ChunkManager {
continue;
}
vma.flush_backed_file();
if !vma.perms().is_default() {
VMPerms::apply_perms(vma, VMPerms::default());
}
unsafe {
let buf = vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
vma.flush_and_clean_memory().unwrap();
self.free_manager.add_range_back_to_free_manager(vma);
self.free_size += vma.size();
@ -110,6 +101,7 @@ impl ChunkManager {
if let VMMapAddr::Force(addr) = addr {
self.munmap(addr, size)?;
}
trace!("mmap options = {:?}", options);
// Find and allocate a new range for this mmap request
let new_range = self
@ -117,27 +109,29 @@ impl ChunkManager {
.find_free_range_internal(size, align, addr)?;
let new_addr = new_range.start();
let current_pid = current!().process().pid();
let new_vma = VMArea::new(
new_range,
*options.perms(),
options.initializer().backed_file(),
current_pid,
);
let new_vma = {
let new_vma = VMArea::new(
new_range,
*options.perms(),
options.initializer().backed_file(),
current_pid,
)
.init_memory(options);
// Initialize the memory of the new range
let buf = unsafe { new_vma.as_slice_mut() };
let ret = options.initializer().init_slice(buf);
if let Err(e) = ret {
// Return the free range before return with error
self.free_manager
.add_range_back_to_free_manager(new_vma.range());
return_errno!(e.errno(), "failed to mmap");
}
if new_vma.is_err() {
let error = new_vma.err().unwrap();
error!("init memory failure: {}", error.backtrace());
let range = VMRange::new_with_size(new_addr, size).unwrap();
self.free_manager
.add_range_back_to_free_manager(&range)
.unwrap();
return Err(error);
}
new_vma.unwrap()
};
trace!("new vma is ready");
// Set memory permissions
if !options.perms().is_default() {
VMPerms::apply_perms(&new_vma, new_vma.perms());
}
self.free_size -= new_vma.size();
// After initializing, we can safely insert the new VMA
self.vmas.insert(VMAObj::new_vma_obj(new_vma));
@ -168,11 +162,7 @@ impl ChunkManager {
Some(intersection_vma) => intersection_vma,
};
// File-backed VMA needs to be flushed upon munmap
intersection_vma.flush_backed_file();
if !&intersection_vma.perms().is_default() {
VMPerms::apply_perms(&intersection_vma, VMPerms::default());
}
intersection_vma.flush_and_clean_memory()?;
if vma.range() == intersection_vma.range() {
// Exact match. Just remove.
@ -194,13 +184,6 @@ impl ChunkManager {
}
}
// Reset zero
unsafe {
trace!("intersection vma = {:?}", intersection_vma);
let buf = intersection_vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
self.free_manager
.add_range_back_to_free_manager(intersection_vma.range());
self.free_size += intersection_vma.size();
@ -306,8 +289,7 @@ impl ChunkManager {
if intersection_vma.range() == containing_vma.range() {
// The whole containing_vma is mprotected
containing_vma.set_perms(new_perms);
VMPerms::apply_perms(&containing_vma, containing_vma.perms());
trace!("containing_vma = {:?}", containing_vma);
containing_vma.modify_permissions_for_committed_pages(containing_vma.perms());
containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma));
containing_vmas.move_next();
continue;
@ -325,13 +307,13 @@ impl ChunkManager {
let protect_end = protect_range.end();
// New VMA
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_vma.perms());
let new_vma = VMAObj::new_vma_obj(new_vma);
// Another new VMA
@ -356,15 +338,16 @@ impl ChunkManager {
break;
}
1 => {
let remain_vma = remain_vmas.pop().unwrap();
let mut remain_vma = remain_vmas.pop().unwrap();
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
intersection_vma.range().clone(),
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_vma.perms());
if remain_vma.start() == containing_vma.start() {
// mprotect right side of the vma
@ -374,6 +357,7 @@ impl ChunkManager {
debug_assert!(remain_vma.end() == containing_vma.end());
containing_vma.set_start(remain_vma.start());
}
debug_assert!(containing_vma.range() == remain_vma.range());
containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma));
containing_vmas.insert(VMAObj::new_vma_obj(new_vma));
@ -401,7 +385,7 @@ impl ChunkManager {
None => continue,
Some(vma) => vma,
};
vma.flush_backed_file();
vma.flush_committed_backed_file();
}
Ok(())
}
@ -409,9 +393,11 @@ impl ChunkManager {
/// Sync all shared, file-backed memory mappings of the given file by flushing
/// the memory content to the file.
pub fn msync_by_file(&mut self, sync_file: &FileRef) {
let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) };
for vma_obj in &self.vmas {
let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) };
vma_obj.vma().flush_backed_file_with_cond(is_same_file);
vma_obj
.vma()
.flush_committed_backed_file_with_cond(is_same_file);
}
}
@ -428,6 +414,34 @@ impl ChunkManager {
return Ok(vma.range().clone());
}
pub fn handle_page_fault(
&mut self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
trace!(
"handle_page_fault chunk manager range = {:?}, free_size = {:?}",
self.range,
self.free_size
);
let mut vma_cursor = self.vmas.upper_bound_mut(Bound::Included(&pf_addr));
if vma_cursor.is_null() {
return_errno!(ENOMEM, "no mmap regions that contains the address");
}
let vma = vma_cursor.get().unwrap().vma();
if vma.pid() != current!().process().pid() || !vma.contains(pf_addr) {
return_errno!(ENOMEM, "no mmap regions that contains the address");
}
let mut vma = vma.clone();
vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers)?;
vma_cursor.replace_with(VMAObj::new_vma_obj(vma));
Ok(())
}
pub fn usage_percentage(&self) -> f32 {
let total_size = self.range.size();
let mut used_size = 0;
@ -487,6 +501,7 @@ impl VMRemapParser for ChunkManager {
impl Drop for ChunkManager {
fn drop(&mut self) {
info!("drop chunk manager = {:?}", self);
assert!(self.is_empty());
assert!(self.free_size == self.range.size());
assert!(self.free_manager.free_size() == self.range.size());

405
src/libos/src/vm/vm_epc.rs Normal file

@ -0,0 +1,405 @@
// This file contains EPC related APIs and definitions.
use super::*;
use sgx_trts::emm::{
AllocAddr, AllocFlags, AllocOptions, EmmAlloc, HandleResult, PageFaultHandler, Perm,
};
use sgx_trts::enclave::rsgx_is_supported_EDMM;
use std::ptr::NonNull;
// Memory Layout for Platforms with EDMM support
//
// Addr low -> high
// |---------------------------------------------||---------------------||--------------------------------------|
//                Reserved Memory                        Gap Range                  User Region Memory
//    (commit memory when loading the enclave)          (used by SDK)        (commit on demand when PF occurs)
//
// For platforms without EDMM support, we only use reserved memory.
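// For example (hypothetical sizes, not taken from any configuration): with
// init_size = 1 GB and max_size = 32 GB on an EDMM platform, alloc_user_space()
// below ends up with
//   reserved memory    = [reserved_start, reserved_start + 1 GB)        committed at load time
//   gap range          = [reserved_start + 1 GB, user_region_start)     owned by the SDK
//   user region memory = [user_region_start, user_region_start + 31 GB) committed on #PF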
pub enum SGXPlatform {
WithEDMM,
NoEDMM,
}
#[derive(Clone)]
pub enum EPCMemType {
Reserved,
UserRegion,
}
pub struct ReservedMem;
pub struct UserRegionMem;
#[repr(C, align(4096))]
#[derive(Clone)]
struct ZeroPage([u8; PAGE_SIZE]);
impl ZeroPage {
fn new() -> Self {
Self([0; PAGE_SIZE])
}
fn new_page_aligned_vec(size: usize) -> Vec<u8> {
debug_assert!(size % PAGE_SIZE == 0);
let page_num = size / PAGE_SIZE;
let mut page_vec = vec![Self::new(); page_num];
let ptr = page_vec.as_mut_ptr();
let size = page_num * std::mem::size_of::<Self>();
std::mem::forget(page_vec);
unsafe { Vec::from_raw_parts(ptr as *mut u8, size, size) }
}
}
lazy_static! {
static ref ZERO_PAGE: Vec<u8> = ZeroPage::new_page_aligned_vec(PAGE_SIZE);
}
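// A small sanity sketch of what new_page_aligned_vec() provides: a zero-filled
// Vec<u8> of the requested size whose backing buffer is page aligned, which is
// what EmmAlloc::commit_with_data (i.e. EACCEPTCOPY) expects as source data.
#[allow(dead_code)]
fn check_zero_page_vec() {
    let buf = ZeroPage::new_page_aligned_vec(4 * PAGE_SIZE);
    assert_eq!(buf.len(), 4 * PAGE_SIZE);
    assert_eq!(buf.as_ptr() as usize % PAGE_SIZE, 0);
    assert!(buf.iter().all(|&b| b == 0));
}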
pub trait EPCAllocator {
fn alloc(size: usize) -> Result<usize> {
return_errno!(ENOSYS, "operation not supported");
}
fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
return_errno!(ENOSYS, "operation not supported");
}
fn free(addr: usize, size: usize) -> Result<()> {
return_errno!(ENOSYS, "operation not supported");
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
return_errno!(ENOSYS, "operation not supported");
}
fn mem_type() -> EPCMemType;
}
impl EPCAllocator for ReservedMem {
fn alloc(size: usize) -> Result<usize> {
let ptr = unsafe { sgx_alloc_rsrv_mem(size) };
if ptr.is_null() {
return_errno!(ENOMEM, "run out of reserved memory");
}
Ok(ptr as usize)
}
fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
let ptr = unsafe { sgx_alloc_rsrv_mem_ex(addr as *const c_void, size) };
if ptr.is_null() {
return_errno!(ENOMEM, "can't allocate reserved memory at desired address");
}
Ok(ptr as usize)
}
fn free(addr: usize, size: usize) -> Result<()> {
let ret = unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) };
assert!(ret == 0);
Ok(())
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
let mut ret_val = 0;
let ret = if rsgx_is_supported_EDMM() {
unsafe {
sgx_tprotect_rsrv_mem(addr as *const c_void, length, protection.bits() as i32)
}
} else {
// For platforms without EDMM, sgx_tprotect_rsrv_mem is actually useless.
// However, at least we can set pages to desired protections in the host kernel page table.
unsafe {
occlum_ocall_mprotect(
&mut ret_val as *mut i32,
addr as *const c_void,
length,
protection.bits() as i32,
)
}
};
if ret != sgx_status_t::SGX_SUCCESS || ret_val != 0 {
return_errno!(ENOMEM, "reserved memory modify protection failure");
}
Ok(())
}
fn mem_type() -> EPCMemType {
EPCMemType::Reserved
}
}
impl EPCAllocator for UserRegionMem {
fn alloc(size: usize) -> Result<usize> {
let alloc_options = AllocOptions::new()
.set_flags(AllocFlags::COMMIT_ON_DEMAND)
.set_handler(enclave_page_fault_handler_dummy, 0);
let ptr = unsafe { EmmAlloc.alloc(AllocAddr::Any, size, alloc_options) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(ptr.addr().get())
}
fn free(addr: usize, size: usize) -> Result<()> {
let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
unsafe { EmmAlloc.dealloc(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
trace!(
"user region modify protection, protection = {:?}, range = {:?}",
protection,
VMRange::new_with_size(addr, length).unwrap()
);
let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
unsafe {
EmmAlloc.modify_permissions(ptr, length, Perm::from_bits(protection.bits()).unwrap())
}
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn mem_type() -> EPCMemType {
EPCMemType::UserRegion
}
}
impl UserRegionMem {
fn commit_memory(start_addr: usize, size: usize) -> Result<()> {
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
unsafe { EmmAlloc.commit(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn commit_memory_with_new_permission(
start_addr: usize,
size: usize,
new_perms: VMPerms,
) -> Result<()> {
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
let perm = Perm::from_bits(new_perms.bits()).unwrap();
if size == PAGE_SIZE {
unsafe { EmmAlloc::commit_with_data(ptr, ZERO_PAGE.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
} else {
let data = ZeroPage::new_page_aligned_vec(size);
unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
}
Ok(())
}
fn commit_memory_and_init_with_file(
start_addr: usize,
size: usize,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
let mut data = ZeroPage::new_page_aligned_vec(size);
let len = file
.read_at(file_offset, data.as_mut_slice())
.map_err(|_| errno!(EACCES, "failed to init memory from file"))?;
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
let perm = Perm::from_bits(new_perms.bits()).unwrap();
unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
}
impl SGXPlatform {
pub fn new() -> Self {
if rsgx_is_supported_EDMM() {
SGXPlatform::WithEDMM
} else {
SGXPlatform::NoEDMM // including SGX simulation mode
}
}
pub fn alloc_user_space(
&self,
init_size: usize,
max_size: usize,
) -> Result<(VMRange, Option<VMRange>)> {
debug!(
"alloc user space init size = {:?}, max size = {:?}",
init_size, max_size
);
if matches!(self, SGXPlatform::WithEDMM) && max_size > init_size {
let user_region_size = max_size - init_size;
let reserved_mem_start_addr = ReservedMem::alloc(init_size)?;
let user_region_start_addr = UserRegionMem::alloc(user_region_size)?;
let total_user_space_range = VMRange::new(
reserved_mem_start_addr,
user_region_start_addr + user_region_size,
)?;
let gap_range =
VMRange::new(reserved_mem_start_addr + init_size, user_region_start_addr)?;
info!(
"allocated user space range is {:?}, gap range is {:?}. reserved_mem range is {:?}, user region range is {:?}",
total_user_space_range, gap_range, VMRange::new_with_size(reserved_mem_start_addr, init_size),
VMRange::new_with_size(user_region_start_addr, user_region_size)
);
Ok((total_user_space_range, Some(gap_range)))
} else {
// For platforms without EDMM support, or when max_size equals init_size, use reserved memory for the whole user space
let reserved_mem_start_addr = ReservedMem::alloc(max_size)?;
let total_user_space_range =
VMRange::new(reserved_mem_start_addr, reserved_mem_start_addr + max_size)?;
info!(
"allocated user space range is {:?}, gap range is None",
total_user_space_range
);
Ok((total_user_space_range, None))
}
}
pub fn free_user_space(&self, user_space_range: &VMRange, gap_range: Option<&VMRange>) {
let user_space_ranges = if let Some(gap_range) = gap_range {
user_space_range.subtract(gap_range)
} else {
vec![*user_space_range]
};
if user_space_ranges.len() == 2 {
debug_assert!(matches!(self, SGXPlatform::WithEDMM));
let reserved_mem = user_space_ranges[0];
let user_region_mem = user_space_ranges[1];
ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap();
UserRegionMem::free(user_region_mem.start(), user_region_mem.size()).unwrap();
} else {
// For platforms with EDMM where max_size equals init_size, or platforms without EDMM, there is no gap range.
debug_assert!(user_space_ranges.len() == 1);
let reserved_mem = user_space_ranges[0];
ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap();
}
}
}
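// A brief usage sketch of the platform abstraction, mirroring what the
// user-space VM manager does at init and exit; init_size and max_size are
// placeholders supplied by the caller.
#[allow(dead_code)]
fn platform_roundtrip(init_size: usize, max_size: usize) -> Result<()> {
    let platform = SGXPlatform::new();
    let (total_range, gap_range) = platform.alloc_user_space(init_size, max_size)?;
    // ... normally total_range and gap_range are handed to VMManager::init() ...
    platform.free_user_space(&total_range, gap_range.as_ref());
    Ok(())
}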
impl Debug for EPCMemType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let output_str = match self {
EPCMemType::Reserved => "reserved memory region",
EPCMemType::UserRegion => "user region memory",
};
write!(f, "{}", output_str)
}
}
impl EPCMemType {
pub fn new(range: &VMRange) -> Self {
trace!("EPC new range = {:?}", range);
if rsgx_is_supported_EDMM() {
if let Some(gap_range) = USER_SPACE_VM_MANAGER.gap_range() {
debug_assert!({
if range.size() > 0 {
!gap_range.overlap_with(range)
} else {
// Ignore for sentry VMA
true
}
});
if range.end() <= gap_range.start() {
EPCMemType::Reserved
} else {
debug_assert!(gap_range.end() <= range.start());
EPCMemType::UserRegion
}
} else {
// There is no gap, which indicates that there is no user region memory
EPCMemType::Reserved
}
} else {
// Only reserved memory
EPCMemType::Reserved
}
}
pub fn modify_protection(&self, addr: usize, length: usize, protection: VMPerms) -> Result<()> {
// PT_GROWSDOWN should only be applied to a stack segment or a segment mapped with the MAP_GROWSDOWN flag set.
// Since the memory is managed by the LibOS itself, the mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown.
let mut prot = protection.clone();
prot.remove(VMPerms::GROWSDOWN);
match self {
EPCMemType::Reserved => ReservedMem::modify_protection(addr, length, prot),
EPCMemType::UserRegion => UserRegionMem::modify_protection(addr, length, prot),
}
}
}
pub fn commit_memory(start_addr: usize, size: usize, new_perms: Option<VMPerms>) -> Result<()> {
trace!(
"commit epc: {:?}, new permission: {:?}",
VMRange::new_with_size(start_addr, size).unwrap(),
new_perms
);
// Memory commit and permission change should be atomic to prevent data races. Thus, if the new perms
// are not the default permission (RW), we take a different path that commits with EACCEPTCOPY.
match new_perms {
Some(perms) if perms != VMPerms::DEFAULT => {
UserRegionMem::commit_memory_with_new_permission(start_addr, size, perms)
}
_ => UserRegionMem::commit_memory(start_addr, size),
}
}
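// A minimal sketch of how a caller might use commit_memory() when handling a
// #PF in the user region; the addresses are placeholders and must be page
// aligned and lie inside a COMMIT_ON_DEMAND allocation.
#[allow(dead_code)]
fn commit_example(fault_page: usize) -> Result<()> {
    // Default RW permission: plain EACCEPT path
    commit_memory(fault_page, PAGE_SIZE, None)?;
    // Non-default permission: committed and protected atomically via EACCEPTCOPY
    commit_memory(fault_page + PAGE_SIZE, PAGE_SIZE, Some(VMPerms::READ))?;
    Ok(())
}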
pub fn commit_memory_and_init_with_file(
start_addr: usize,
size: usize,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
UserRegionMem::commit_memory_and_init_with_file(start_addr, size, file, file_offset, new_perms)
}
// This is a dummy function for sgx_mm_alloc. The real handler is "enclave_page_fault_handler" shown below.
extern "C" fn enclave_page_fault_handler_dummy(
pfinfo: &sgx_pfinfo,
private: usize,
) -> HandleResult {
// Don't do anything here. Modifying registers here can cause errors in PF handling.
return HandleResult::Search;
}
pub fn enclave_page_fault_handler(
rip: usize,
exception_info: sgx_misc_exinfo_t,
kernel_triggers: bool,
) -> Result<()> {
let pf_addr = exception_info.faulting_address as usize;
let pf_errcd = exception_info.error_code;
trace!(
"enclave page fault caught, pf_addr = 0x{:x}, error code = {:?}",
pf_addr,
pf_errcd
);
USER_SPACE_VM_MANAGER.handle_page_fault(rip, pf_addr, pf_errcd, kernel_triggers)?;
Ok(())
}
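// A hedged sketch of how an exception handler could route a #PF here; the
// surrounding exception dispatch is not part of this file, and falling back to
// the signal path on error is only one possible choice.
#[allow(dead_code)]
fn route_page_fault(rip: usize, exinfo: sgx_misc_exinfo_t) {
    if let Err(e) = enclave_page_fault_handler(rip, exinfo, false) {
        // e.g. let the normal SIGSEGV delivery take over
        warn!("page fault not handled by the VM manager: {}", e);
    }
}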
extern "C" {
fn occlum_ocall_mprotect(
retval: *mut i32,
addr: *const c_void,
len: usize,
prot: i32,
) -> sgx_status_t;
}

@ -22,14 +22,16 @@ use std::ops::Bound::{Excluded, Included};
#[derive(Debug)]
pub struct VMManager {
range: VMRange,
gap_range: Option<VMRange>,
internal: SgxMutex<InternalVMManager>,
}
impl VMManager {
pub fn init(vm_range: VMRange) -> Result<Self> {
let internal = InternalVMManager::init(vm_range.clone());
pub fn init(vm_range: VMRange, gap_range: Option<VMRange>) -> Result<Self> {
let mut internal = InternalVMManager::init(vm_range.clone(), &gap_range);
Ok(VMManager {
range: vm_range,
gap_range: gap_range,
internal: SgxMutex::new(internal),
})
}
@ -38,6 +40,10 @@ impl VMManager {
&self.range
}
pub fn gap_range(&self) -> &Option<VMRange> {
&self.gap_range
}
pub fn internal(&self) -> SgxMutexGuard<InternalVMManager> {
self.internal.lock().unwrap()
}
@ -56,8 +62,15 @@ impl VMManager {
}
pub fn verified_clean_when_exit(&self) -> bool {
let gap_size = if let Some(gap) = self.gap_range() {
gap.size()
} else {
0
};
let internal = self.internal();
internal.chunks.len() == 0 && internal.free_manager.free_size() == self.range.size()
internal.chunks.len() == 0
&& internal.free_manager.free_size() + gap_size == self.range.size()
}
pub fn free_chunk(&self, chunk: &ChunkRef) {
@ -358,22 +371,19 @@ impl VMManager {
intersect_chunks.iter().for_each(|chunk| {
if let ChunkType::SingleVMA(vma) = chunk.internal() {
if let Some(intersection_range) = chunk.range().intersect(&reset_range) {
let mut internal_manager = self.internal();
internal_manager.mprotect_single_vma_chunk(
&chunk,
intersection_range,
VMPerms::DEFAULT,
);
unsafe {
let buf = intersection_range.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
let mut vma = vma.lock().unwrap();
if let Some(intersection_vma) = vma.intersect(&reset_range) {
intersection_vma.flush_and_clean_memory().unwrap();
}
// clear permission for SingleVMA chunk
if vma.perms() != VMPerms::DEFAULT {
vma.set_perms(VMPerms::default());
}
} else {
// Currently this is only used for heap de-allocation, so the chunk must be a SingleVMA chunk.
unreachable!()
}
});
Ok(())
}
@ -394,11 +404,11 @@ impl VMManager {
match chunk.internal() {
ChunkType::MultiVMA(manager) => {
trace!("msync default chunk: {:?}", chunk.range());
return manager
manager
.lock()
.unwrap()
.chunk_manager_mut()
.msync_by_range(&sync_range);
.msync_by_range(&sync_range)?;
}
ChunkType::SingleVMA(vma) => {
// Note: There are rare cases where multiple threads do mprotect or munmap on the same single-vma chunk
@ -406,7 +416,7 @@ impl VMManager {
// It is fine here because this function doesn't modify the global chunk list and only operates on the vma
// which is updated in real time.
let vma = vma.lock().unwrap();
vma.flush_backed_file();
vma.flush_committed_backed_file();
}
}
Ok(())
@ -429,7 +439,7 @@ impl VMManager {
ChunkType::SingleVMA(vma) => {
vma.lock()
.unwrap()
.flush_backed_file_with_cond(is_same_file);
.flush_committed_backed_file_with_cond(is_same_file);
}
});
}
@ -539,6 +549,41 @@ impl VMManager {
assert!(mem_chunks.len() == 0);
}
pub fn handle_page_fault(
&self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
let current = current!();
let page_fault_chunk = {
let current_process_mem_chunks = current.vm().mem_chunks().read().unwrap();
if let Some(page_fault_chunk) = current_process_mem_chunks
.iter()
.find(|chunk| chunk.range().contains(pf_addr))
{
Some(page_fault_chunk.clone())
} else {
None
}
};
if let Some(page_fault_chunk) = page_fault_chunk {
return page_fault_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
// System V SHM segments are not tracked by the process VM. Try to find the chunk here.
if let Some(page_fault_shm_chunk) =
SYSTEM_V_SHM_MANAGER.get_shm_chunk_containing_addr(pf_addr, current.process().pid())
{
return page_fault_shm_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
// This can happen, for example, when the user intends to trigger the SIGSEGV handler by visiting a null pointer.
return_errno!(ENOMEM, "can't find the chunk containing the address");
}
}
// Modifications to this structure must be made while holding the global lock.
@ -552,11 +597,21 @@ pub struct InternalVMManager {
}
impl InternalVMManager {
pub fn init(vm_range: VMRange) -> Self {
pub fn init(vm_range: VMRange, gap_range: &Option<VMRange>) -> Self {
let chunks = BTreeSet::new();
let fast_default_chunks = Vec::new();
let free_manager = VMFreeSpaceManager::new(vm_range);
let mut free_manager = VMFreeSpaceManager::new(vm_range);
let shm_manager = ShmManager::new();
if let Some(gap_range) = gap_range {
debug_assert!(vm_range.is_superset_of(&gap_range));
free_manager
.find_free_range_internal(
gap_range.size(),
PAGE_SIZE,
VMMapAddr::Force(gap_range.start()),
)
.unwrap();
}
Self {
chunks,
fast_default_chunks,
@ -657,19 +712,7 @@ impl InternalVMManager {
_ => unreachable!(),
};
// File-backed VMA needs to be flushed upon munmap
intersection_vma.flush_backed_file();
// Reset memory permissions
if !&intersection_vma.perms().is_default() {
VMPerms::apply_perms(&intersection_vma, VMPerms::default());
}
// Reset to zero
unsafe {
let buf = intersection_vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
intersection_vma.flush_and_clean_memory()?;
let mut new_vmas = vma.subtract(&intersection_vma);
let current = current!();
@ -724,10 +767,10 @@ impl InternalVMManager {
self.shm_manager
.create_shared_chunk(options, new_chunk.clone())
.map_err(|e| {
let vma = new_chunk.get_vma_for_single_vma_chunk();
let mut vma = new_chunk.get_vma_for_single_vma_chunk();
// Reset memory permissions
if !vma.perms().is_default() {
VMPerms::apply_perms(&vma, VMPerms::default());
vma.modify_permissions_for_committed_pages(VMPerms::default())
}
// Reset memory contents
unsafe {
@ -778,19 +821,11 @@ impl InternalVMManager {
.munmap_shared_chunk(chunk, munmap_range, flag)?
== MunmapSharedResult::Freeable
{
let vma = chunk.get_vma_for_single_vma_chunk();
// Flush memory contents to backed file
vma.flush_backed_file();
// Reset memory permissions
if !vma.perms().is_default() {
VMPerms::apply_perms(&vma, VMPerms::default());
// Flush memory contents to backed file and reset memory contents
{
let vma = chunk.get_vma_for_single_vma_chunk();
vma.flush_and_clean_memory()?;
}
// Reset memory contents
unsafe {
let buf = vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
drop(vma);
self.free_chunk(chunk);
let current = current!();
@ -855,7 +890,6 @@ impl InternalVMManager {
}
ChunkType::SingleVMA(vma) => vma,
};
let mut updated_vmas = {
let mut containing_vma = vma.lock().unwrap();
trace!(
@ -865,7 +899,8 @@ impl InternalVMManager {
);
debug_assert!(chunk.range() == containing_vma.range());
if containing_vma.perms() == new_perms {
let old_perms = containing_vma.perms();
if old_perms == new_perms {
return Ok(());
}
@ -876,7 +911,7 @@ impl InternalVMManager {
(true, true) => {
// Exact the same vma
containing_vma.set_perms(new_perms);
VMPerms::apply_perms(&containing_vma, containing_vma.perms());
containing_vma.modify_permissions_for_committed_pages(new_perms);
return Ok(());
}
(false, false) => {
@ -886,15 +921,13 @@ impl InternalVMManager {
// remaining old VMA: [protect_range.end, containing_vma.end)
let old_end = containing_vma.end();
let old_perms = containing_vma.perms();
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_perms);
let remaining_old_vma = {
let range = VMRange::new(protect_range.end(), old_end).unwrap();
@ -905,7 +938,6 @@ impl InternalVMManager {
VMAccess::Private(current_pid),
)
};
containing_vma.set_end(protect_range.start());
// Put containing_vma at last to be updated first.
@ -913,19 +945,19 @@ impl InternalVMManager {
updated_vmas
}
_ => {
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_perms);
if same_start {
// Protect range is at left side of the cotaining vma
// Protect range is at left side of the containing vma
containing_vma.set_start(protect_range.end());
} else {
// Protect range is at right side of the cotaining vma
// Protect range is at right side of the containing vma
containing_vma.set_end(protect_range.start());
}
@ -935,19 +967,16 @@ impl InternalVMManager {
}
}
};
let current = current!();
// First update current vma chunk
if updated_vmas.len() > 1 {
let update_vma = updated_vmas.pop().unwrap();
self.update_single_vma_chunk(&current, &chunk, update_vma);
}
// Then add new chunks if any
updated_vmas.into_iter().for_each(|vma| {
self.add_new_chunk(&current, vma);
});
Ok(())
}
@ -964,9 +993,6 @@ impl InternalVMManager {
// Remove from chunks
self.chunks.remove(chunk);
// Mprotect the whole chunk to reduce the usage of vma count of host
VMPerms::apply_perms(range, VMPerms::DEFAULT);
// Add range back to freespace manager
self.free_manager.add_range_back_to_free_manager(range);
Ok(())
@ -1131,6 +1157,7 @@ impl InternalVMManager {
let perms = options.perms().clone();
let align = options.align().clone();
let initializer = options.initializer();
let page_policy = options.page_policy();
target_contained_ranges
.iter()
.map(|range| {
@ -1146,6 +1173,7 @@ impl InternalVMManager {
.initializer(initializer.clone())
.addr(addr)
.size(size)
.page_policy(*page_policy)
.build()
.unwrap()
})

@ -39,37 +39,6 @@ impl VMPerms {
self.bits == Self::DEFAULT.bits
}
pub fn apply_perms(protect_range: &VMRange, perms: VMPerms) {
use sgx_trts::enclave::rsgx_is_supported_EDMM;
unsafe {
let mut retval = 0;
let addr = protect_range.start() as *const c_void;
let len = protect_range.size();
// PT_GROWSDOWN should only be applied to stack segment or a segment mapped with the MAP_GROWSDOWN flag set.
// Since the memory are managed by our own, mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown.
let mut prot = perms.clone();
prot.remove(VMPerms::GROWSDOWN);
if rsgx_is_supported_EDMM() {
// With EDMM support, reserved memory permission should be updated.
let sgx_status = sgx_tprotect_rsrv_mem(addr, len, prot.bits() as i32);
if sgx_status != sgx_status_t::SGX_SUCCESS {
panic!("sgx_tprotect_rsrv_mem status {}", sgx_status);
}
} else {
// Without EDMM support, reserved memory permission is statically RWX and we only need to do mprotect ocall.
let sgx_status = occlum_ocall_mprotect(&mut retval, addr, len, prot.bits() as i32);
if sgx_status != sgx_status_t::SGX_SUCCESS || retval != 0 {
panic!(
"occlum_ocall_mprotect status {}, retval {}",
sgx_status, retval
);
}
}
}
}
pub fn display(&self) -> String {
let mut str = String::new();
if self.can_read() {
@ -96,23 +65,3 @@ impl Default for VMPerms {
VMPerms::DEFAULT
}
}
extern "C" {
// Modify the access permissions of the pages in the reserved memory area
//
// Parameters:
// Inputs: addr[in]: Starting address of region which needs to change access
// permission. Page aligned.
// length[in]: The length of the memory to be manipulated in bytes. Page aligned.
// prot[in]: The target memory protection.
// Return: sgx_status_t
//
fn sgx_tprotect_rsrv_mem(addr: *const c_void, length: usize, prot: i32) -> sgx_status_t;
fn occlum_ocall_mprotect(
retval: *mut i32,
addr: *const c_void,
len: usize,
prot: i32,
) -> sgx_status_t;
}

@ -10,6 +10,11 @@ use intrusive_collections::RBTreeLink;
use intrusive_collections::{intrusive_adapter, KeyAdapter};
use rcore_fs::vfs::Metadata;
pub const GB: usize = 1 << 30;
pub const TB: usize = 1 << 40;
pub const MB: usize = 1 << 20;
pub const KB: usize = 1 << 10;
#[derive(Clone, Debug)]
pub enum VMInitializer {
DoNothing(),
@ -139,7 +144,7 @@ impl FileBacked {
self.write_back
}
pub fn init_file(&self) -> (&FileRef, usize) {
pub fn backed_file(&self) -> (&FileRef, usize) {
(&self.file, self.offset)
}
@ -179,6 +184,19 @@ impl VMMapAddr {
}
}
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum PagePolicy {
ReserveOnly = 0x1, // Only reserve the address range, commit nothing.
CommitNow = 0x2, // Commit all pages at mmap time.
CommitOnDemand = 0x4, // Reserve the range at mmap time, commit pages in the #PF handler. This is the default policy.
}
impl Default for PagePolicy {
fn default() -> PagePolicy {
PagePolicy::CommitOnDemand
}
}
#[derive(Builder, Debug)]
#[builder(pattern = "owned", build_fn(skip), no_std)]
pub struct VMMapOptions {
@ -187,6 +205,7 @@ pub struct VMMapOptions {
perms: VMPerms,
addr: VMMapAddr,
initializer: VMInitializer,
page_policy: PagePolicy,
}
// VMMapOptionsBuilder is generated automatically, except the build function
@ -232,12 +251,21 @@ impl VMMapOptionsBuilder {
Some(initializer) => initializer.clone(),
None => VMInitializer::default(),
};
let page_policy = {
match &initializer {
VMInitializer::CopyFrom { .. } => PagePolicy::CommitNow,
VMInitializer::CopyOldAndReadNew { .. } => PagePolicy::CommitNow,
_ => self.page_policy.unwrap_or_default(),
}
};
Ok(VMMapOptions {
size,
align,
perms,
addr,
initializer,
page_policy,
})
}
}
@ -269,6 +297,10 @@ impl VMMapOptions {
}
false
}
pub fn page_policy(&self) -> &PagePolicy {
&self.page_policy
}
}
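// A minimal usage sketch of the builder with the new page_policy field,
// mirroring the mmap call sites; the address, size and permissions below are
// placeholders.
#[allow(dead_code)]
fn build_on_demand_options(addr: usize, size: usize) -> Result<VMMapOptions> {
    VMMapOptionsBuilder::default()
        .size(size)
        .addr(VMMapAddr::Force(addr))
        .perms(VMPerms::DEFAULT)
        .initializer(VMInitializer::DoNothing())
        .page_policy(PagePolicy::CommitOnDemand)
        .build()
}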
#[derive(Clone, Copy, PartialEq)]

@ -25,4 +25,4 @@ dependencies = [
[[package]]
name = "sgx_types"
version = "1.1.5"
version = "1.1.6"