Add EDMM support for Legacy Occlum

Hui, Chunyang 2023-09-18 11:36:17 +00:00 committed by volcano
parent 28c29c8896
commit d49b3af0aa
28 changed files with 2104 additions and 393 deletions

src/libos/Cargo.lock (generated, 75 changed lines)

@ -9,7 +9,7 @@ dependencies = [
"aligned",
"atomic",
"bitflags",
"bitvec",
"bitvec 1.0.1",
"ctor",
"derive_builder",
"goblin",
@ -18,6 +18,7 @@ dependencies = [
"lazy_static",
"log",
"memoffset 0.6.5",
"modular-bitfield",
"rcore-fs",
"rcore-fs-devfs",
"rcore-fs-mountfs",
@ -94,7 +95,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41262f11d771fd4a61aa3ce019fca363b4b6c282fca9da2a31186d3965a47a5c"
dependencies = [
"either",
"radium",
"radium 0.3.0",
]
[[package]]
name = "bitvec"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
dependencies = [
"funty",
"radium 0.7.0",
"tap",
"wyz",
]
[[package]]
@ -206,6 +219,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
[[package]]
name = "funty"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "goblin"
version = "0.5.4"
@ -294,6 +313,27 @@ dependencies = [
"autocfg 1.1.0",
]
[[package]]
name = "modular-bitfield"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a53d79ba8304ac1c4f9eb3b9d281f21f7be9d4626f72ce7df4ad8fbde4f38a74"
dependencies = [
"modular-bitfield-impl",
"static_assertions 1.1.0",
]
[[package]]
name = "modular-bitfield-impl"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a7d5f7076603ebc68de2dc6a650ec331a062a13abaa346975be747bbfa4b789"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "plain"
version = "0.2.3"
@ -334,6 +374,12 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "def50a86306165861203e7f84ecffbbdfdea79f0e51039b33de1e952358c47ac"
[[package]]
name = "radium"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
[[package]]
name = "rand"
version = "0.6.5"
@ -479,11 +525,11 @@ dependencies = [
name = "rcore-fs-sefs"
version = "0.1.0"
dependencies = [
"bitvec",
"bitvec 0.17.4",
"log",
"rcore-fs",
"spin 0.5.2",
"static_assertions",
"static_assertions 0.3.4",
"uuid",
]
@ -719,6 +765,12 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f3eb36b47e512f8f1c9e3d10c2c1965bc992bd9cdb024fa581e2194501c83d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.9.3"
@ -736,6 +788,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tap"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "unicode-ident"
version = "1.0.3"
@ -772,3 +830,12 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "wyz"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
dependencies = [
"tap",
]

@ -10,7 +10,7 @@ crate-type = ["staticlib"]
[dependencies]
atomic = "0.5"
bitflags = "1.0"
bitvec = { version = "0.17", default-features = false, features = ["alloc"] }
bitvec = { version = "1", default-features = false, features = ["alloc"] }
log = "0.4"
aligned = "0.4.1"
lazy_static = { version = "1.1.0", features = ["spin_no_std"] } # Implies nightly
@ -33,6 +33,7 @@ regex = { git = "https://github.com/mesalock-linux/regex-sgx", default-features
goblin = { version = "0.5.4", default-features = false, features = ["elf64", "elf32", "endian_fd"] }
intrusive-collections = "0.9"
spin = "0.7"
modular-bitfield = "0.11.2"
[patch.'https://github.com/apache/teaclave-sgx-sdk.git']
sgx_tstd = { path = "../../deps/rust-sgx-sdk/sgx_tstd" }

@ -6,10 +6,14 @@ use self::syscall::{handle_syscall_exception, SYSCALL_OPCODE};
use super::*;
use crate::signal::{FaultSignal, SigSet};
use crate::syscall::exception_interrupt_syscall_c_abi;
use crate::syscall::{CpuContext, FpRegs, SyscallNum};
use aligned::{Aligned, A16};
use core::arch::x86_64::_fxsave;
use crate::syscall::{CpuContext, ExtraContext, SyscallNum};
use crate::vm::{enclave_page_fault_handler, USER_SPACE_VM_MANAGER};
use sgx_types::*;
use sgx_types::{sgx_exception_type_t, sgx_exception_vector_t};
const ENCLU: u32 = 0xd7010f;
const EACCEPT: u32 = 0x5;
const EACCEPTCOPY: u32 = 0x7;
// Modules for instruction simulation
mod cpuid;
@ -25,14 +29,63 @@ pub fn register_exception_handlers() {
}
}
fn try_handle_kernel_exception(info: &sgx_exception_info_t) -> i32 {
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF {
let pf_addr = info.exinfo.faulting_address as usize;
// The PF address must be in the user space. Otherwise, keep searching for the exception handler
if !USER_SPACE_VM_MANAGER.range().contains(pf_addr) {
SGX_MM_EXCEPTION_CONTINUE_SEARCH
} else {
let rip = info.cpu_context.rip as *const u32;
let rax = info.cpu_context.rax as u32;
// This can happen when two threads both try to EAUG the same new page. Thread 1 EAUGs because it
// touches the memory first and triggers a #PF. Thread 2 EAUGs because it uses sgx_mm_commit to commit
// the new page with EACCEPT and triggers a #PF. If Thread 1 acquires the lock and does the EAUG first,
// Thread 2 can't EAUG the page again once it acquires the lock, so it fails. The failure raises a signal,
// which is eventually handled here, and the instruction that triggered this exception is EACCEPT/EACCEPTCOPY.
// In this case, since the new page has already been EAUG-ed, we just need to execute the EACCEPT again. Thus here
// we simply return SGX_MM_EXCEPTION_CONTINUE_EXECUTION.
if ENCLU == (unsafe { *rip } as u32) & 0xffffff
&& (EACCEPT == rax || EACCEPTCOPY == rax)
{
return SGX_MM_EXCEPTION_CONTINUE_EXECUTION;
}
// If the faulting code is not user code but the #PF address is in the user space, then it is a
// kernel-triggered #PF that we can handle. This can happen e.g. when a read syscall triggers a #PF on a user buffer.
info!("kernel code triggers #PF");
let kernel_triggers = true;
enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, kernel_triggers)
.expect("handle PF failure");
SGX_MM_EXCEPTION_CONTINUE_EXECUTION
}
} else {
// Otherwise, we can't handle. Keep searching for the exception handler
error!(
"We can't handle this exception: {:?}",
info.exception_vector
);
SGX_MM_EXCEPTION_CONTINUE_SEARCH
}
}
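For clarity, the EACCEPT check above works on raw instruction bytes: ENCLU encodes as 0F 01 D7, so reading a u32 at the faulting RIP and masking the low three bytes yields 0xd7010f, while RAX selects the leaf (5 for EACCEPT, 7 for EACCEPTCOPY). A minimal restatement of that check, a sketch using the constants defined at the top of this file:

// Sketch only: mirrors the opcode test in try_handle_kernel_exception.
fn faulted_on_eaccept(rip: *const u32, rax: u32) -> bool {
    // Only the low three bytes encode the ENCLU instruction (0F 01 D7 read little-endian).
    let opcode = unsafe { *rip } & 0xffffff;
    opcode == ENCLU && (rax == EACCEPT || rax == EACCEPTCOPY)
}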
#[no_mangle]
extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 {
let mut fpregs = FpRegs::save();
let info = unsafe { &mut *info };
// Try to handle a kernel-triggered #PF
if !USER_SPACE_VM_MANAGER
.range()
.contains(info.cpu_context.rip as usize)
{
return try_handle_kernel_exception(&info);
}
// User-space-triggered exception
unsafe {
exception_interrupt_syscall_c_abi(
SyscallNum::HandleException as u32,
info as *mut _,
&mut fpregs as *mut FpRegs,
info as *mut sgx_exception_info_t as *mut _,
)
};
unreachable!();
@ -41,20 +94,22 @@ extern "C" fn handle_exception(info: *mut sgx_exception_info_t) -> i32 {
/// Exceptions are handled as a special kind of system call.
pub fn do_handle_exception(
info: *mut sgx_exception_info_t,
fpregs: *mut FpRegs,
user_context: *mut CpuContext,
) -> Result<isize> {
let info = unsafe { &mut *info };
check_exception_type(info.exception_type)?;
info!("do handle exception: {:?}", info.exception_vector);
let user_context = unsafe { &mut *user_context };
*user_context = CpuContext::from_sgx(&info.cpu_context);
user_context.fpregs = fpregs;
let xsave_area = info.xsave_area.as_mut_ptr();
user_context.extra_context = ExtraContext::Xsave;
user_context.extra_context_ptr = xsave_area;
// Try to do instruction emulation first
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_UD {
// Assume the length of opcode is 2 bytes
let ip_opcode = unsafe { *(user_context.rip as *const u16) };
let ip_opcode: u16 = unsafe { *(user_context.rip as *const u16) };
if ip_opcode == RDTSC_OPCODE {
return handle_rdtsc_exception(user_context);
} else if ip_opcode == SYSCALL_OPCODE {
@ -64,6 +119,23 @@ pub fn do_handle_exception(
}
}
// Normally, we should only handle a #PF exception with the SGX bit set, which indicates uncommitted EPC.
// However, when committing a page whose permissions are not the default read/write (e.g. RWX), there is a short gap
// after EACCEPTCOPY and before the mprotect ocall. If the user touches the memory during this short
// gap, the SGX bit will not be set. Thus, we don't check the SGX bit here.
if info.exception_vector == sgx_exception_vector_t::SGX_EXCEPTION_VECTOR_PF {
info!("Userspace #PF caught, try handle");
if enclave_page_fault_handler(info.cpu_context.rip as usize, info.exinfo, false).is_ok() {
info!("#PF handling is done successfully");
return Ok(0);
}
warn!(
"#PF not handled. Turn to signal. user context = {:?}",
user_context
);
}
// Then, it must be a "real" exception. Convert it to a signal and force its delivery.
// The generated signal is SIGBUS, SIGFPE, SIGILL, or SIGSEGV.
//
@ -108,3 +180,21 @@ fn check_exception_type(type_: sgx_exception_type_t) -> Result<()> {
}
Ok(())
}
// Based on the Page-Fault Error Code defined in the Intel manual
const PF_EXCEPTION_SGX_BIT: u32 = 0x1;
const PF_EXCEPTION_RW_BIT: u32 = 0x2;
// Return value:
// True - SGX bit is set
// False - SGX bit is not set
pub fn check_sgx_bit(exception_error_code: u32) -> bool {
exception_error_code & PF_EXCEPTION_SGX_BIT == PF_EXCEPTION_SGX_BIT
}
// Return value:
// True - write bit is set, #PF caused by write
// False - read bit is set, #PF caused by read
pub fn check_rw_bit(exception_error_code: u32) -> bool {
exception_error_code & PF_EXCEPTION_RW_BIT == PF_EXCEPTION_RW_BIT
}
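As a hedged illustration (a hypothetical helper, not part of this commit), the two bit tests above can be combined to classify a #PF the way the handlers in this file reason about it:

// Hypothetical usage of check_sgx_bit/check_rw_bit; the real policy lives in
// enclave_page_fault_handler and do_handle_exception.
fn classify_pf(error_code: u32) -> &'static str {
    match (check_sgx_bit(error_code), check_rw_bit(error_code)) {
        (true, true) => "write to an uncommitted EPC page",
        (true, false) => "read from an uncommitted EPC page",
        (false, true) => "write fault without the SGX bit (e.g. the EACCEPTCOPY-to-mprotect gap)",
        (false, false) => "read fault without the SGX bit",
    }
}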

@ -91,7 +91,7 @@ fn get_output_for_vma(vma: &VMArea, heap_or_stack: Option<&str>) -> String {
let perms = vma.perms();
let (file_path, offset, device_id, inode_num) = {
if let Some((file, offset)) = vma.init_file() {
if let Some((file, offset)) = vma.backed_file() {
let inode_file = file.as_inode_file().unwrap();
let file_path = inode_file.abs_path();
let inode_num = inode_file.inode().metadata().unwrap().inode;

@ -2,9 +2,7 @@ pub use self::sgx::sgx_interrupt_info_t;
use crate::prelude::*;
use crate::process::ThreadRef;
use crate::syscall::exception_interrupt_syscall_c_abi;
use crate::syscall::{CpuContext, FpRegs, SyscallNum};
use aligned::{Aligned, A16};
use core::arch::x86_64::_fxsave;
use crate::syscall::{CpuContext, ExtraContext, SyscallNum};
mod sgx;
@ -16,28 +14,23 @@ pub fn init() {
}
extern "C" fn handle_interrupt(info: *mut sgx_interrupt_info_t) -> i32 {
let mut fpregs = FpRegs::save();
unsafe {
exception_interrupt_syscall_c_abi(
SyscallNum::HandleInterrupt as u32,
info as *mut _,
&mut fpregs as *mut FpRegs,
)
exception_interrupt_syscall_c_abi(SyscallNum::HandleInterrupt as u32, info as *mut _)
};
unreachable!();
}
pub fn do_handle_interrupt(
info: *mut sgx_interrupt_info_t,
fpregs: *mut FpRegs,
cpu_context: *mut CpuContext,
) -> Result<isize> {
let info = unsafe { &*info };
let info = unsafe { &mut *info };
let context = unsafe { &mut *cpu_context };
// The CPU context is overridden so that it looks as if the syscall were called from where the
// interrupt happened
*context = CpuContext::from_sgx(&info.cpu_context);
context.fpregs = fpregs;
context.extra_context = ExtraContext::Xsave;
context.extra_context_ptr = info.xsave_area.as_mut_ptr();
Ok(0)
}

@ -1,10 +1,15 @@
use crate::prelude::*;
#[repr(C)]
#[repr(C, align(64))]
#[derive(Default, Clone, Copy)]
#[allow(non_camel_case_types)]
pub struct sgx_interrupt_info_t {
pub cpu_context: sgx_cpu_context_t,
pub interrupt_valid: uint32_t,
reserved: uint32_t,
pub xsave_size: uint64_t,
pub reserved1: [uint64_t; 4],
pub xsave_area: [uint8_t; 0],
}
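The two new fields turn the struct into a fixed header followed by a variable-length buffer: xsave_area is a zero-length trailing array, so the actual xsave buffer of xsave_size bytes sits directly after the header (the 64-byte alignment matches the XSAVE hardware requirement). A sketch of how handler code can view that buffer, assuming the SGX runtime reserves xsave_size bytes there:

// Illustrative only: the xsave buffer begins right after the fixed header.
unsafe fn xsave_buffer(info: &mut sgx_interrupt_info_t) -> &mut [u8] {
    core::slice::from_raw_parts_mut(info.xsave_area.as_mut_ptr(), info.xsave_size as usize)
}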
#[allow(non_camel_case_types)]

@ -21,8 +21,11 @@
#![feature(test)]
#![feature(atomic_from_mut)]
#![feature(btree_drain_filter)]
#![feature(bench_black_box)]
#![feature(arbitrary_enum_discriminant)]
// for core::ptr::non_null::NonNull addr() method
#![feature(strict_provenance)]
// for VMArea::can_merge_vmas
#![feature(is_some_and)]
#[macro_use]
extern crate alloc;
@ -59,6 +62,7 @@ extern crate memoffset;
extern crate ctor;
extern crate intrusive_collections;
extern crate itertools;
extern crate modular_bitfield;
extern crate resolv_conf;
use sgx_trts::libc;

@ -1,6 +1,6 @@
use crate::process::do_vfork::reap_zombie_child_created_with_vfork;
use crate::signal::constants::*;
use std::intrinsics::atomic_store;
use std::intrinsics::atomic_store_seqcst;
use super::do_futex::futex_wake;
use super::do_vfork::{is_vforked_child_process, vfork_return_to_parent};
@ -61,7 +61,7 @@ fn exit_thread(term_status: TermStatus) {
// Notify a thread, if any, that waits on ctid. See set_tid_address(2) for more info.
if let Some(ctid_ptr) = thread.clear_ctid() {
unsafe {
atomic_store(ctid_ptr.as_ptr(), 0);
atomic_store_seqcst(ctid_ptr.as_ptr(), 0);
}
futex_wake(ctid_ptr.as_ptr() as *const i32, 1);
}

@ -1,6 +1,6 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::intrinsics::atomic_load;
use std::intrinsics::atomic_load_seqcst;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::prelude::*;
@ -258,7 +258,7 @@ impl FutexKey {
}
pub fn load_val(&self) -> i32 {
unsafe { atomic_load(self.0 as *const i32) }
unsafe { atomic_load_seqcst(self.0 as *const i32) }
}
pub fn addr(&self) -> usize {

@ -8,6 +8,7 @@
//! * If `cpu_set[i] == true`, then the i-th CPU core belongs to the set;
//! * Otherwise, the i-th CPU core is not in the set.
use bitvec::order::LocalBits as Local;
use bitvec::prelude::*;
use std::ops::Index;
@ -15,7 +16,7 @@ use crate::prelude::*;
#[derive(Debug, Clone, PartialEq)]
pub struct CpuSet {
bits: BitBox<Local, u8>,
bits: BitBox<u8, Local>,
}
impl CpuSet {
@ -33,14 +34,14 @@ impl CpuSet {
/// Create a CpuSet that consists of all of the CPU cores.
pub fn new_full() -> Self {
let mut bits = bitbox![Local, u8; 1; Self::len() * 8];
let mut bits = bitbox![u8, Local; 1; Self::len() * 8];
Self::clear_unused(&mut bits);
Self { bits }
}
/// Create a CpuSet that consists of none of the CPU cores.
pub fn new_empty() -> Self {
let bits = bitbox![Local, u8; 0; Self::len() * 8];
let bits = bitbox![u8, Local; 0; Self::len() * 8];
Self { bits }
}
@ -61,7 +62,7 @@ impl CpuSet {
/// Returns the first index of CPUs in set.
pub fn first_cpu_idx(&self) -> Option<usize> {
self.iter().position(|&b| b == true)
self.iter().position(|b| b == true)
}
// Returns if the CpuSet is a subset of available cpu set
@ -75,7 +76,7 @@ impl CpuSet {
return_errno!(EINVAL, "slice is not long enough");
}
let slice = &slice[..Self::len()];
let mut bits = BitBox::from_slice(slice);
let mut bits = BitBox::from_bitslice(&BitSlice::from_slice(slice));
Self::clear_unused(&mut bits);
Ok(Self { bits })
@ -85,11 +86,11 @@ impl CpuSet {
///
/// The last, unused bits in the byte slice are guaranteed to be zero.
pub fn as_slice(&self) -> &[u8] {
self.bits.as_slice()
self.bits.as_raw_slice()
}
pub fn as_mut_slice(&mut self) -> &mut [u8] {
self.bits.as_mut_slice()
self.bits.as_raw_mut_slice()
}
/// Returns an iterator that allows accessing the underlying bits.
@ -102,7 +103,7 @@ impl CpuSet {
self.bits.iter_mut()
}
fn clear_unused(bits: &mut BitSlice<Local, u8>) {
fn clear_unused(bits: &mut BitSlice<u8, Local>) {
let unused_bits = &mut bits[Self::ncores()..(Self::len() * 8)];
for mut bit in unused_bits {
*bit = false;
@ -110,8 +111,8 @@ impl CpuSet {
}
}
pub type Iter<'a> = bitvec::slice::Iter<'a, Local, u8>;
pub type IterMut<'a> = bitvec::slice::IterMut<'a, Local, u8>;
pub type Iter<'a> = bitvec::slice::Iter<'a, u8, Local>;
pub type IterMut<'a> = bitvec::slice::IterMut<'a, u8, Local>;
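These changes track the bitvec 0.17 -> 1.0 migration: the generic parameters swap places (storage type first, bit ordering second), the macros take their arguments in the same new order, and the raw-byte accessors gain a `raw` infix. A minimal sketch of the new shapes, assuming bitvec 1.0 and the same `Local` alias used by this module:

use bitvec::order::LocalBits as Local;
use bitvec::prelude::*;

fn demo() {
    // bitvec 0.17: bitbox![Local, u8; 0; 16] and BitBox<Local, u8>
    // bitvec 1.0: the storage type comes first, the ordering second.
    let mut bits: BitBox<u8, Local> = bitbox![u8, Local; 0; 16];
    bits.set(3, true);
    // as_slice()/as_mut_slice() became as_raw_slice()/as_raw_mut_slice().
    let _raw: &[u8] = bits.as_raw_slice();
}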
impl Index<usize> for CpuSet {
type Output = bool;

@ -199,7 +199,7 @@ impl siginfo_t {
}
}
#[derive(Clone, Copy)]
#[derive(Clone)]
#[repr(C)]
pub struct ucontext_t {
pub uc_flags: u64,
@ -225,7 +225,8 @@ pub type stack_t = sigaltstack_t;
pub struct mcontext_t {
pub inner: CpuContext,
// TODO: the fields should be csgsfs, err, trapno, oldmask, and cr2
_unused0: [u64; 5],
// The number should be 5, but 2 of the slots are taken by the extra context fields in CpuContext. Thus make it 3.
_unused0: [u64; 3],
// TODO: this field should be `fpregs: fpregset_t,`
_unused1: usize,
_reserved: [u64; 8],

@ -5,9 +5,8 @@ use super::{SigAction, SigActionFlags, SigDefaultAction, SigSet, Signal};
use crate::lazy_static::__Deref;
use crate::prelude::*;
use crate::process::{ProcessRef, TermStatus, ThreadRef};
use crate::syscall::{CpuContext, FpRegs};
use crate::syscall::{CpuContext, ExtraContext, FpRegs, XsaveArea};
use aligned::{Aligned, A16};
use core::arch::x86_64::{_fxrstor, _fxsave};
use std::{ptr, slice};
pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> {
@ -34,11 +33,27 @@ pub fn do_rt_sigreturn(curr_user_ctxt: &mut CpuContext) -> Result<()> {
*curr_user_ctxt = last_ucontext.uc_mcontext.inner;
// Restore the floating point registers to a temp area
// The floating point registers would be recoved just
// before return to user's code
let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) });
curr_user_ctxt.fpregs = Box::into_raw(fpregs);
curr_user_ctxt.fpregs_on_heap = 1; // indicates the fpregs is on heap
// The floating point registers will be recovered just before returning to user code
match curr_user_ctxt.extra_context {
ExtraContext::Fpregs => {
// Signal raised by a direct syscall.
// The fpregs must be stored on the heap, because the ucontext_t will be freed when this function returns and curr_user_ctxt only stores the pointer.
let mut fpregs = Box::new(unsafe { FpRegs::from_slice(&last_ucontext.fpregs) });
curr_user_ctxt.extra_context_ptr = Box::into_raw(fpregs) as *mut u8;
}
ExtraContext::Xsave => {
// Signal raised by an exception.
// The xsave_area is stored in a special area reserved on the kernel stack. We can just overwrite this area with the latest user context.
// Note: currently we only restore the fpregs, not the whole xsave area, on sigreturn, because the
// handling path does not touch the other advanced registers. If we ever need to touch those registers,
// the whole xsave area should be restored on sigreturn.
let latest_fpregs = unsafe { FpRegs::from_slice(&last_ucontext.fpregs) };
let xsave_area =
unsafe { (&mut *(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) };
xsave_area.set_fpregs_area(latest_fpregs);
}
}
Ok(())
}
@ -261,16 +276,24 @@ fn handle_signals_by_user(
// Save the old sigmask
ucontext.uc_sigmask = old_sigmask.to_c();
// Save the user context
ucontext.uc_mcontext.inner = *curr_user_ctxt;
ucontext.uc_mcontext.inner = curr_user_ctxt.clone();
// Save the floating point registers
if curr_user_ctxt.fpregs != ptr::null_mut() {
ucontext
.fpregs
.copy_from_slice(unsafe { curr_user_ctxt.fpregs.as_ref().unwrap().as_slice() });
// Clear the floating point registers, since we do not need to recover is when this syscall return
curr_user_ctxt.fpregs = ptr::null_mut();
if curr_user_ctxt.extra_context_ptr != ptr::null_mut() {
// Signal from exception handling
debug_assert!(matches!(curr_user_ctxt.extra_context, ExtraContext::Xsave));
let fpregs_area =
unsafe { (&*(curr_user_ctxt.extra_context_ptr as *mut XsaveArea)) }.get_fpregs();
ucontext.fpregs.copy_from_slice(fpregs_area.as_slice());
// Clear the extra context pointer, since we do not need to restore this state when this syscall returns
curr_user_ctxt.extra_context_ptr = ptr::null_mut();
} else {
// Raise the signal with direct syscall
debug_assert!(
matches!(curr_user_ctxt.extra_context, ExtraContext::Fpregs)
&& curr_user_ctxt.extra_context_ptr == ptr::null_mut()
);
// We need a correct fxsave structure in the buffer,
// because the app may modify part of it to update the
// floating point after the signal handler finished.

@ -36,12 +36,12 @@ impl FaultSignal {
// Page fault exception
SGX_EXCEPTION_VECTOR_PF => {
const PF_ERR_FLAG_PRESENT : u32 = 1u32 << 0;
let code = if info.exinfo.errcd & PF_ERR_FLAG_PRESENT != 0 {
let code = if info.exinfo.error_code & PF_ERR_FLAG_PRESENT != 0 {
SEGV_ACCERR
} else {
SEGV_MAPERR
};
let addr = Some(info.exinfo.maddr);
let addr = Some(info.exinfo.faulting_address );
(SIGSEGV, code, addr)
},
// General protection exception

@ -7,7 +7,7 @@
//! 3. Preprocess the system call and then call `dispatch_syscall` (in this file)
//! 4. Call `do_*` to process the system call (in other modules)
use aligned::{Aligned, A16};
use aligned::{Aligned, A16, A64};
use core::arch::x86_64::{_fxrstor, _fxsave};
use std::any::Any;
use std::convert::TryFrom;
@ -60,7 +60,7 @@ use crate::signal::{
do_rt_sigtimedwait, do_sigaltstack, do_tgkill, do_tkill, sigaction_t, siginfo_t, sigset_t,
stack_t,
};
use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, VMPerms};
use crate::vm::{MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, VMPerms};
use crate::{fs, process, std, vm};
use super::*;
@ -122,7 +122,7 @@ macro_rules! process_syscall_table_with_callback {
(Mremap = 25) => do_mremap(old_addr: usize, old_size: usize, new_size: usize, flags: i32, new_addr: usize),
(Msync = 26) => do_msync(addr: usize, size: usize, flags: u32),
(Mincore = 27) => handle_unsupported(),
(Madvise = 28) => handle_unsupported(),
(Madvise = 28) => do_madvice(addr: usize, length: usize, advice: i32),
(Shmget = 29) => do_shmget(key: key_t, size: size_t, shmflg: i32),
(Shmat = 30) => do_shmat(shmid: i32, shmaddr: usize, shmflg: i32),
(Shmctl = 31) => do_shmctl(shmid: i32, cmd: i32, buf: *mut shmids_t),
@ -424,8 +424,8 @@ macro_rules! process_syscall_table_with_callback {
// Occlum-specific system calls
(SpawnGlibc = 359) => do_spawn_for_glibc(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fa: *const SpawnFileActions, attribute_list: *const posix_spawnattr_t),
(SpawnMusl = 360) => do_spawn_for_musl(child_pid_ptr: *mut u32, path: *const i8, argv: *const *const i8, envp: *const *const i8, fdop_list: *const FdOp, attribute_list: *const posix_spawnattr_t),
(HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, fpregs: *mut FpRegs, context: *mut CpuContext),
(HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, fpregs: *mut FpRegs, context: *mut CpuContext),
(HandleException = 361) => do_handle_exception(info: *mut sgx_exception_info_t, context: *mut CpuContext),
(HandleInterrupt = 362) => do_handle_interrupt(info: *mut sgx_interrupt_info_t, context: *mut CpuContext),
(MountRootFS = 363) => do_mount_rootfs(key_ptr: *const sgx_key_128bit_t, rootfs_config_ptr: *const user_rootfs_config),
}
};
@ -649,12 +649,10 @@ fn do_syscall(user_context: &mut CpuContext) {
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall_num == SyscallNum::HandleException {
// syscall.args[0] == info
// syscall.args[1] == fpregs
syscall.args[2] = user_context as *mut _ as isize;
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall.num == SyscallNum::HandleInterrupt {
// syscall.args[0] == info
// syscall.args[1] == fpregs
syscall.args[2] = user_context as *mut _ as isize;
syscall.args[1] = user_context as *mut _ as isize;
} else if syscall.num == SyscallNum::Sigaltstack {
// syscall.args[0] == new_ss
// syscall.args[1] == old_ss
@ -751,21 +749,27 @@ fn do_sysret(user_context: &mut CpuContext) -> ! {
fn do_exit_task() -> !;
}
if current!().status() != ThreadStatus::Exited {
// Restore the floating point registers
// Todo: Is it correct to do fxstor in kernel?
let fpregs = user_context.fpregs;
if (fpregs != ptr::null_mut()) {
if user_context.fpregs_on_heap == 1 {
let fpregs = unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) };
fpregs.restore();
} else {
unsafe { fpregs.as_ref().unwrap().restore() };
if user_context.extra_context_ptr != ptr::null_mut() {
match user_context.extra_context {
ExtraContext::Fpregs => {
let fpregs = user_context.extra_context_ptr as *mut FpRegs;
unsafe { fpregs.as_ref().unwrap().restore() };
// The fpregs must be allocated on heap
drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) });
}
ExtraContext::Xsave => {
let xsave_area = user_context.extra_context_ptr;
unsafe { (&*(xsave_area as *mut XsaveArea)).restore() };
}
}
user_context.extra_context_ptr = ptr::null_mut();
}
unsafe { __occlum_sysret(user_context) } // jump to user space
} else {
if user_context.fpregs != ptr::null_mut() && user_context.fpregs_on_heap == 1 {
drop(unsafe { Box::from_raw(user_context.fpregs as *mut FpRegs) });
if user_context.extra_context_ptr != ptr::null_mut()
&& matches!(user_context.extra_context, ExtraContext::Fpregs)
{
drop(unsafe { Box::from_raw(user_context.extra_context_ptr as *mut FpRegs) });
}
unsafe { do_exit_task() } // exit enclave
}
@ -828,6 +832,12 @@ fn do_msync(addr: usize, size: usize, flags: u32) -> Result<isize> {
Ok(0)
}
fn do_madvice(addr: usize, length: usize, advice: i32) -> Result<isize> {
let flags = MadviceFlags::from_i32(advice)?;
vm::do_madvice(addr, length, flags)?;
Ok(0)
}
fn do_sysinfo(info: *mut sysinfo_t) -> Result<isize> {
check_mut_ptr(info)?;
let info = unsafe { &mut *info };
@ -977,7 +987,6 @@ fn handle_unsupported() -> Result<isize> {
/// Floating point registers
///
/// Note. The area is used to save fxsave result
//#[derive(Clone, Copy)]
#[repr(C)]
pub struct FpRegs {
inner: Aligned<A16, [u8; 512]>,
@ -1017,6 +1026,41 @@ impl FpRegs {
}
}
#[derive(Debug)]
#[repr(C)]
pub struct XsaveArea {
inner: Aligned<A64, [u8; 4096]>,
}
impl XsaveArea {
// The first 512 bytes of the xsave area are the legacy FXSAVE region holding the FP registers
const FXSAVE_AREA_LEN: usize = 512;
/// Save the current CPU extended states (via XSAVE) into an XsaveArea instance
pub fn save() -> Self {
let mut xsave_area = MaybeUninit::<Self>::uninit();
unsafe {
save_xregs(xsave_area.as_mut_ptr() as *mut u8);
xsave_area.assume_init()
}
}
/// Restore the CPU extended states from this XsaveArea instance
pub fn restore(&self) {
unsafe {
restore_xregs(self.inner.as_ptr());
}
}
pub fn get_fpregs(&self) -> FpRegs {
unsafe { FpRegs::from_slice(&self.inner[..Self::FXSAVE_AREA_LEN]) }
}
pub fn set_fpregs_area(&mut self, fpregs: FpRegs) {
self.inner[..Self::FXSAVE_AREA_LEN].copy_from_slice(fpregs.as_slice())
}
}
/// Cpu context.
///
/// Note. The definition of this struct must be kept in sync with the assembly
@ -1042,8 +1086,21 @@ pub struct CpuContext {
pub rsp: u64,
pub rip: u64,
pub rflags: u64,
pub fpregs_on_heap: u64,
pub fpregs: *mut FpRegs,
pub extra_context: ExtraContext,
pub extra_context_ptr: *mut u8,
}
#[repr(u64)]
#[derive(Clone, Copy, Debug)]
pub enum ExtraContext {
Fpregs = 0,
Xsave = 1,
}
impl Default for ExtraContext {
fn default() -> Self {
Self::Fpregs
}
}
impl CpuContext {
@ -1067,8 +1124,8 @@ impl CpuContext {
rsp: src.rsp,
rip: src.rip,
rflags: src.rflags,
fpregs_on_heap: 0,
fpregs: ptr::null_mut(),
extra_context: Default::default(),
extra_context_ptr: ptr::null_mut(),
}
}
}
@ -1082,14 +1139,15 @@ impl CpuContext {
// pointer that is not safe to use by external modules. In our case, the
// FpRegs pointer will not be used actually. So the Rust warning is a
// false alarm. We suppress it here.
pub unsafe fn exception_interrupt_syscall_c_abi(
num: u32,
info: *mut c_void,
fpregs: *mut FpRegs,
) -> u32 {
pub unsafe fn exception_interrupt_syscall_c_abi(num: u32, info: *mut c_void) -> u32 {
#[allow(improper_ctypes)]
extern "C" {
pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void, fpregs: *mut FpRegs) -> u32;
pub fn __occlum_syscall_c_abi(num: u32, info: *mut c_void) -> u32;
}
__occlum_syscall_c_abi(num, info, fpregs)
__occlum_syscall_c_abi(num, info)
}
extern "C" {
pub fn save_xregs(save_area: *mut u8);
pub fn restore_xregs(save_area: *const u8);
}

@ -52,8 +52,8 @@ __occlum_syscall_linux_abi:
// Save the target CPU state when `call __occlum_syscall` is returned in
// a CpuContext struct. The registers are saved in the reverse order of
// the fields in CpuContext.
pushq $0 // default fpregs is NULL
pushq $0 // default fpregs is allocated on stack
pushq $0 // default extra_context_ptr is NULL
pushq $0 // default extra_context is floating point registers
pushfq
push %rcx // save %rip
push %r11 // save %rsp

@ -100,16 +100,9 @@ impl Chunk {
*options.perms(),
options.initializer().backed_file(),
current!().process().pid(),
);
// Initialize the memory of the new range
unsafe {
let buf = vm_range.as_slice_mut();
options.initializer().init_slice(buf)?;
}
// Set memory permissions
if !options.perms().is_default() {
VMPerms::apply_perms(&vm_area, vm_area.perms());
}
)
.init_memory(options)?;
Ok(Self::new_chunk_with_vma(vm_area))
}
@ -238,6 +231,30 @@ impl Chunk {
}
}
pub fn handle_page_fault(
&self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
let internal = &self.internal;
match self.internal() {
ChunkType::SingleVMA(vma) => {
let mut vma = vma.lock().unwrap();
debug_assert!(vma.contains(pf_addr));
return vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
ChunkType::MultiVMA(internal_manager) => {
return internal_manager
.lock()
.unwrap()
.chunk_manager
.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
}
}
pub fn is_free_range(&self, request_range: &VMRange) -> bool {
match self.internal() {
ChunkType::SingleVMA(_) => false, // single-vma chunk can't be free

@ -63,11 +63,13 @@ use std::fmt;
mod chunk;
mod free_space_manager;
mod page_tracker;
mod process_vm;
mod shm_manager;
mod user_space_vm;
mod vm_area;
mod vm_chunk_manager;
mod vm_epc;
mod vm_layout;
mod vm_manager;
mod vm_perms;
@ -77,9 +79,12 @@ mod vm_util;
use self::vm_layout::VMLayout;
pub use self::chunk::{ChunkRef, ChunkType};
pub use self::process_vm::{MMapFlags, MRemapFlags, MSyncFlags, ProcessVM, ProcessVMBuilder};
pub use self::process_vm::{
MMapFlags, MRemapFlags, MSyncFlags, MadviceFlags, ProcessVM, ProcessVMBuilder,
};
pub use self::user_space_vm::USER_SPACE_VM_MANAGER;
pub use self::vm_area::VMArea;
pub use self::vm_epc::enclave_page_fault_handler;
pub use self::vm_manager::MunmapChunkFlag;
pub use self::vm_perms::VMPerms;
pub use self::vm_range::VMRange;
@ -154,4 +159,9 @@ pub fn do_msync(addr: usize, size: usize, flags: MSyncFlags) -> Result<()> {
current!().vm().msync(addr, size)
}
pub fn do_madvice(addr: usize, length: usize, advice: MadviceFlags) -> Result<()> {
warn!("madvice is not supported. madvice flags:{:?}", advice);
Ok(())
}
pub const PAGE_SIZE: usize = 4096;

@ -0,0 +1,488 @@
use super::*;
use super::user_space_vm::USER_SPACE_VM_MANAGER;
use super::vm_util::{GB, KB, MB};
use bitvec::vec::BitVec;
use util::sync::RwLock;
use vm_epc::EPCMemType;
// In SGX v2, there is no upper limit on the EPC size. If the user configures 1 TB of memory
// and we use one bit to track whether each page is committed, that is 1 TB / 4 KB / 8 bits = 32 MB of memory,
// and this footprint stays the same for the whole libOS life cycle.
// In order to track the commit status of such a huge number of pages, we use two-level tracking.
// At the first (global) level, we use `PAGE_CHUNK_UNIT` as the unit size of a page chunk.
// At the second level, we use the page size as the unit and one bit to represent whether a page is committed.
// For example, if the user configures 64 TB of memory, when a page is committed, the second-level tracker marks the corresponding bit as 1.
// When all the pages of a whole global page chunk are committed, the global-level tracker marks the page chunk as fully committed
// and the corresponding tracker can be freed. In this way, just a few bytes can represent the commit status of a big chunk of memory.
// In the worst case, several discrete global page chunks are not fully committed at the same time,
// and each of them takes some space in memory. In a memory-intensive case, we can
// commit the pages by hand to make the global page chunks fully committed and free their page trackers.
// There are three types of data structures tracking the page status, from top to bottom:
// 1. PageChunkManager - Created for the whole user space. This structure manages the global paging status.
// 2. GlobalPageChunk - Denotes a chunk of pages; the actual unit of the PageChunkManager. It holds the paging status of a memory range and is stored only
//    in the PageChunkManager. A newly created VMA asks the corresponding GlobalPageChunk for the paging status. When all the pages recorded by a
//    GlobalPageChunk are committed, it marks itself as "fully committed" and frees the inner structure tracking the paging status. All GlobalPageChunks
//    record VM ranges of the SAME size.
// 3. PageTracker - The real tracker of the paging status. Under the hood, it is a bitvec that tracks every page with one bit. There are two kinds of
//    PageTracker:
//    * GlobalTracker - Used by GlobalPageChunk to track the paging status. All global trackers record VM ranges of the same size.
//    * VMATracker - Used by a VMA to track its own paging status. The range size varies with the VMA.
//    Since VM operations are mostly performed through VMAs, the VMA tracker updates itself accordingly and also updates the corresponding GlobalTracker.
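The sizes quoted in the comment above can be checked directly (an illustration, not code from this commit): with 4 KB pages and one bit per page, a flat bitmap for 1 TB of user space is (1 << 40) / 4096 / 8 = 32 MB and stays resident forever, while a per-chunk tracker for a 4 MB PAGE_CHUNK_UNIT only needs 4 MB / 4 KB = 1024 bits = 128 bytes and is freed as soon as its chunk is fully committed:

// Back-of-the-envelope bookkeeping costs behind the two-level design.
const FLAT_BITMAP_BYTES: usize = (1 << 40) / 4096 / 8; // 32 MB for 1 TB of user space
const CHUNK_TRACKER_BYTES: usize = (4 * 1024 * 1024) / 4096 / 8; // 128 bytes per 4 MB chunk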
lazy_static! {
pub static ref USER_SPACE_PAGE_CHUNK_MANAGER: RwLock<PageChunkManager> =
RwLock::new(PageChunkManager::new(USER_SPACE_VM_MANAGER.range()));
}
const PAGE_CHUNK_UNIT: usize = 4 * MB;
const PAGE_CHUNK_PAGE_NUM: usize = PAGE_CHUNK_UNIT / PAGE_SIZE;
pub struct PageChunkManager {
// The total range that the manager manages.
range: VMRange,
// The page chunks
inner: HashMap<usize, GlobalPageChunk>, // K: Page chunk start address, V: Global page chunk
}
impl PageChunkManager {
fn new(range: &VMRange) -> Self {
Self {
range: range.clone(),
inner: HashMap::new(),
}
}
}
#[derive(Debug)]
// A chunk of pages. Memory space is precious. Don't put anything unnecessary.
struct GlobalPageChunk {
fully_committed: bool,
tracker: Option<Arc<RwLock<PageTracker>>>, // if this page chunk is fully committed, the tracker will be set to None.
}
impl GlobalPageChunk {
fn new(tracker: PageTracker) -> Self {
Self {
fully_committed: false,
tracker: Some(Arc::new(RwLock::new(tracker))),
}
}
}
#[derive(PartialEq, Clone, Debug)]
enum TrackerType {
GlobalTracker, // PAGE_CHUNK_UNIT size for global management to track the global paging status
VMATracker, // various size for different vma to track its own paging status
}
// Used for tracking the paging status of global tracker or VMA tracker
#[derive(Clone)]
pub struct PageTracker {
type_: TrackerType,
range: VMRange,
inner: BitVec,
fully_committed: bool,
}
impl Debug for PageTracker {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("PageTracker")
.field("type", &self.type_)
.field("range", &self.range)
.field("fully committed", &self.fully_committed)
.finish()
}
}
impl PageTracker {
// Create a new page tracker for GlobalPageChunk.
// When a new global tracker is needed, none of the pages are committed.
fn new_global_tracker(start_addr: usize) -> Result<Self> {
let range = VMRange::new_with_size(start_addr, PAGE_CHUNK_UNIT)?;
let inner = bitvec![0; PAGE_CHUNK_PAGE_NUM];
Ok(Self {
type_: TrackerType::GlobalTracker,
range,
inner,
fully_committed: false,
})
}
pub fn new_vma_tracker(vm_range: &VMRange, epc_type: &EPCMemType) -> Result<Self> {
trace!("new vma tracker, range = {:?}", vm_range);
let page_num = vm_range.size() / PAGE_SIZE;
let new_vma_tracker = match epc_type {
EPCMemType::UserRegion => {
let mut new_vma_tracker = Self {
type_: TrackerType::VMATracker,
range: vm_range.clone(),
inner: bitvec![0; page_num],
fully_committed: false,
};
// Skip sentry
if page_num != 0 {
new_vma_tracker.get_committed_pages_from_global_tracker()?;
}
new_vma_tracker
}
EPCMemType::Reserved => {
// For reserved memory, there is no need to update the global page tracker,
// and there is no GlobalPageChunk for reserved memory.
Self {
type_: TrackerType::VMATracker,
range: vm_range.clone(),
inner: bitvec![1; page_num],
fully_committed: true,
}
}
_ => unreachable!(),
};
Ok(new_vma_tracker)
}
pub fn range(&self) -> &VMRange {
&self.range
}
pub fn is_fully_committed(&self) -> bool {
self.fully_committed
}
pub fn is_reserved_only(&self) -> bool {
!self.fully_committed && self.inner.not_any()
}
pub fn is_partially_committed(&self) -> bool {
!self.fully_committed && self.inner.any()
}
// Get all committed or uncommitted ranges of consecutive pages.
// If committed is true, get all committed ranges
// If committed is false, get all uncommitted ranges
pub fn get_ranges(&self, committed: bool) -> Vec<VMRange> {
if self.is_fully_committed() {
if committed {
return vec![self.range.clone()];
} else {
return Vec::new();
}
}
if self.is_reserved_only() {
if committed {
return Vec::new();
} else {
return vec![self.range.clone()];
}
}
let tracker_start_addr = self.range.start();
let mut ret = Vec::new();
let mut start = None;
let mut end = None;
for i in 0..self.inner.len() {
if self.inner[i] == committed {
match (start, end) {
// Meet committed page for the first time. Update both the start and end marker.
(None, None) => {
start = Some(i);
end = Some(i);
// Reach the end of the tracker. Only one page
if i == self.inner.len() - 1 {
let committed_range = VMRange::new_with_size(
tracker_start_addr + i * PAGE_SIZE,
PAGE_SIZE,
)
.unwrap();
ret.push(committed_range);
}
}
// Previous pages are committed. Update the end marker.
(Some(s), Some(e)) => {
end = Some(i);
// Reach the end of the tracker.
if i == self.inner.len() - 1 {
let committed_range = VMRange::new_with_size(
tracker_start_addr + s * PAGE_SIZE,
PAGE_SIZE * (i - s + 1),
)
.unwrap();
ret.push(committed_range);
}
}
_ => unreachable!(),
}
} else {
match (start, end) {
(None, None) => {
// No committed pages.
}
(Some(s), Some(e)) => {
// Meet the first uncommitted pages after recording all the previous committed pages.
let committed_range = VMRange::new_with_size(
tracker_start_addr + s * PAGE_SIZE,
PAGE_SIZE * (e - s + 1),
)
.unwrap();
ret.push(committed_range);
// Reset markers
start = None;
end = None;
}
_ => {
unreachable!()
}
}
}
}
let total_size = ret.iter().fold(0, |a, b| a + b.size());
if committed {
trace!("get committed ranges = {:?}", ret);
debug_assert!(total_size == self.inner.count_ones() * PAGE_SIZE);
} else {
trace!("get uncommitted ranges = {:?}", ret);
debug_assert!(total_size == self.inner.count_zeros() * PAGE_SIZE);
}
ret
}
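As an illustration of the scan above (hypothetical values, not a test in this commit): for a tracker whose range starts at 0x10000 and whose bits are [1, 1, 0, 1] over 4 KB pages,

// get_ranges(true)  -> [0x10000, 0x12000), [0x13000, 0x14000)   (committed pages)
// get_ranges(false) -> [0x12000, 0x13000)                       (uncommitted page)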
pub fn split_for_new_range(&mut self, new_range: &VMRange) {
debug_assert!(self.range.is_superset_of(new_range));
let new_start = new_range.start();
let page_num = new_range.size() / PAGE_SIZE;
let split_idx = (new_start - self.range.start()) / PAGE_SIZE;
let mut new_inner = self.inner.split_off(split_idx);
new_inner.truncate(page_num);
trace!(
"old range= {:?}, new_start = {:x}, idx = {:?}",
self.range,
new_start,
split_idx
);
self.inner = new_inner;
if self.inner.all() {
self.fully_committed = true;
}
self.range = *new_range;
}
// Commit memory for the whole current VMA (VMATracker)
pub fn commit_whole(&mut self, perms: VMPerms) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
if self.is_fully_committed() {
return Ok(());
}
// Commit EPC
if self.is_reserved_only() {
vm_epc::commit_memory(self.range().start(), self.range().size(), Some(perms)).unwrap();
} else {
debug_assert!(self.is_partially_committed());
let uncommitted_ranges = self.get_ranges(false);
for range in uncommitted_ranges {
vm_epc::commit_memory(range.start(), range.size(), Some(perms)).unwrap();
}
}
// Update the tracker
self.inner.fill(true);
self.fully_committed = true;
self.set_committed_pages_for_global_tracker(self.range().start(), self.range().size());
Ok(())
}
// Commit memory of a specific range for the current VMA (VMATracker). The range should be verified by caller.
pub fn commit_range(&mut self, range: &VMRange, new_perms: Option<VMPerms>) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
debug_assert!(self.range().is_superset_of(range));
vm_epc::commit_memory(range.start(), range.size(), new_perms)?;
self.commit_pages_common(range.start(), range.size());
self.set_committed_pages_for_global_tracker(range.start(), range.size());
Ok(())
}
pub fn commit_memory_and_init_with_file(
&mut self,
range: &VMRange,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
debug_assert!(self.range().is_superset_of(range));
vm_epc::commit_memory_and_init_with_file(
range.start(),
range.size(),
file,
file_offset,
new_perms,
)?;
self.commit_pages_common(range.start(), range.size());
self.set_committed_pages_for_global_tracker(range.start(), range.size());
Ok(())
}
// The VMATracker gets the page commit status from the global tracker and updates itself.
// This should be called when the VMATracker is initialized.
fn get_committed_pages_from_global_tracker(&mut self) -> Result<()> {
debug_assert!(self.type_ == TrackerType::VMATracker);
let mut vma_tracker = self;
let mut page_chunk_start = get_page_chunk_start_addr(vma_tracker.range().start());
let range_end = vma_tracker.range().end();
for page_chunk_addr in (page_chunk_start..range_end).step_by(PAGE_CHUNK_UNIT) {
let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap();
if let Some(page_chunk) = manager.inner.get(&page_chunk_addr) {
if page_chunk.fully_committed {
// The global page chunk is fully committed. Commit the corresponding pages in the VMA tracker.
vma_tracker.commit_pages_common(page_chunk_addr, PAGE_CHUNK_UNIT);
} else {
debug_assert!(page_chunk.tracker.is_some());
let global_tracker = page_chunk.tracker.as_ref().unwrap().read().unwrap();
global_tracker.set_committed_pages_for_vma_tracker(vma_tracker);
}
drop(manager);
} else {
// Not tracking this page chunk. Release read lock and acquire write lock for an update.
drop(manager);
// This page chunk is not tracked by global tracker. Thus none of the pages are committed.
let page_chunk = {
let global_page_tracker = PageTracker::new_global_tracker(page_chunk_addr)?;
GlobalPageChunk::new(global_page_tracker)
};
// There could be data race here. But it's fine, because the ultimate state is the same.
USER_SPACE_PAGE_CHUNK_MANAGER
.write()
.unwrap()
.inner
.insert(page_chunk_addr, page_chunk);
}
}
Ok(())
}
// The VMATracker updates the global tracker based on its own paging status.
// This should be called whenever the VMATracker changes and needs to sync with the GlobalTracker.
fn set_committed_pages_for_global_tracker(&self, commit_start_addr: usize, commit_size: usize) {
debug_assert!(self.type_ == TrackerType::VMATracker);
let commit_end_addr = commit_start_addr + commit_size;
let page_chunk_start_addr = get_page_chunk_start_addr(commit_start_addr);
for page_chunk_addr in (page_chunk_start_addr..commit_end_addr).step_by(PAGE_CHUNK_UNIT) {
let is_global_tracker_fully_committed = {
// Find the corresponding page chunk
let manager = USER_SPACE_PAGE_CHUNK_MANAGER.read().unwrap();
let page_chunk = manager
.inner
.get(&page_chunk_addr)
.expect("this page chunk must exist");
// Update the global page tracker
if let Some(global_page_tracker) = &page_chunk.tracker {
let mut global_tracker = global_page_tracker.write().unwrap();
global_tracker.commit_pages_common(commit_start_addr, commit_size);
global_tracker.fully_committed
} else {
// page_tracker is none, the page chunk is fully committed. Go to next chunk.
debug_assert!(page_chunk.fully_committed);
continue;
}
};
// Free the global page tracker if fully committed
if is_global_tracker_fully_committed {
// Update the global page chunk manager. We need to acquire the write lock this time. There can be a data race because the lock
// is dropped for a while before being acquired again. But it's fine, because the final state is the same.
let mut manager = USER_SPACE_PAGE_CHUNK_MANAGER.write().unwrap();
if let Some(mut page_chunk) = manager.inner.get_mut(&page_chunk_addr) {
page_chunk.fully_committed = true;
page_chunk.tracker = None;
} else {
warn!(
"the global page chunk with start addr: 0x{:x} has been freed already",
page_chunk_addr
);
unreachable!();
}
}
}
}
// The GlobalTracker updates the VMATracker based on its own paging status.
// This should be called when the VMATracker is initialized.
fn set_committed_pages_for_vma_tracker(&self, vma_tracker: &mut PageTracker) {
debug_assert!(self.type_ == TrackerType::GlobalTracker);
debug_assert!(vma_tracker.type_ == TrackerType::VMATracker);
let global_tracker = self;
if let Some(intersection_range) = global_tracker.range().intersect(vma_tracker.range()) {
let vma_tracker_page_id =
(intersection_range.start() - vma_tracker.range().start()) / PAGE_SIZE;
let global_tracker_page_id =
(intersection_range.start() - global_tracker.range().start()) / PAGE_SIZE;
let page_num = intersection_range.size() / PAGE_SIZE;
vma_tracker.inner[vma_tracker_page_id..vma_tracker_page_id + page_num]
.copy_from_bitslice(
&global_tracker.inner
[global_tracker_page_id..global_tracker_page_id + page_num],
);
if vma_tracker.inner.all() {
vma_tracker.fully_committed = true;
}
} else {
// No intersecting range, so why was this called? Weird.
unreachable!();
}
}
// Commit pages for page tracker itself. This is a common method for both VMATracker and GlobalTracker.
fn commit_pages_common(&mut self, start_addr: usize, size: usize) {
debug_assert!(!self.fully_committed);
if let Some(intersection_range) = {
let range = VMRange::new_with_size(start_addr, size).unwrap();
self.range.intersect(&range)
} {
trace!("commit for page tracker: {:?}", self);
let page_start_id = (intersection_range.start() - self.range().start()) / PAGE_SIZE;
let page_num = intersection_range.size() / PAGE_SIZE;
self.inner[page_start_id..page_start_id + page_num].fill(true);
if self.inner.all() {
self.fully_committed = true;
}
} else {
// No intersecting range, weird
unreachable!();
}
}
}
#[inline(always)]
fn get_page_chunk_start_addr(addr: usize) -> usize {
align_down(addr, PAGE_CHUNK_UNIT)
}

@ -6,7 +6,8 @@ use super::vm_area::VMArea;
use super::vm_manager::MunmapChunkFlag;
use super::vm_perms::VMPerms;
use super::vm_util::{
FileBacked, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder, VMRemapOptions,
FileBacked, PagePolicy, VMInitializer, VMMapAddr, VMMapOptions, VMMapOptionsBuilder,
VMRemapOptions,
};
use crate::config;
use crate::ipc::SHM_MANAGER;
@ -124,6 +125,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
.initializer(VMInitializer::ElfSpecific {
elf_file: elf_file.file_ref().clone(),
})
// We only load loadable segments, just commit the memory when allocating.
.page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -152,6 +155,8 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
.size(heap_layout.size())
.align(heap_layout.align())
.perms(VMPerms::READ | VMPerms::WRITE)
.page_policy(PagePolicy::CommitOnDemand)
// .page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -171,8 +176,10 @@ impl<'a, 'b> ProcessVMBuilder<'a, 'b> {
let stack_layout = &other_layouts[1];
let vm_option = VMMapOptionsBuilder::default()
.size(stack_layout.size())
.align(heap_layout.align())
.align(stack_layout.align())
.perms(VMPerms::READ | VMPerms::WRITE)
// There are cases we can't handle when a #PF happens on the user's stack. Commit the stack memory now.
.page_policy(PagePolicy::CommitNow)
.build()
.map_err(|e| {
&self.handle_error_when_init(&chunks);
@ -537,11 +544,26 @@ impl ProcessVM {
}
}
};
let page_policy = {
if flags.contains(MMapFlags::MAP_STACK) {
// With MAP_STACK, the mmapped memory will be used as the user's stack. If it is not committed, a #PF can occur
// when switching to user space, and we can't handle it correctly.
PagePolicy::CommitNow
} else if !flags.contains(MMapFlags::MAP_ANONYMOUS) {
// Use commit-now policy for file-backed mmap. We tried the commit-on-demand policy, but didn't get any performance gain at all.
// However, the path for file-backed mmap with commit-on-demand policy is ready. We can enable this whenever needed.
PagePolicy::CommitNow
} else {
PagePolicy::CommitOnDemand
}
};
let mmap_options = VMMapOptionsBuilder::default()
.size(size)
.addr(addr_option)
.perms(perms)
.initializer(initializer)
.page_policy(page_policy)
.build()?;
let mmap_addr = USER_SPACE_VM_MANAGER.mmap(&mmap_options)?;
Ok(mmap_addr)
@ -674,3 +696,33 @@ impl MSyncFlags {
Ok(flags)
}
}
#[allow(non_camel_case_types)]
#[repr(i32)]
#[derive(Debug)]
pub enum MadviceFlags {
MADV_NORMAL = 0,
MADV_RANDOM = 1,
MADV_SEQUENTIAL = 2,
MADV_WILLNEED = 3,
MADV_DONTNEED = 4,
}
impl MadviceFlags {
pub fn from_i32(raw: i32) -> Result<Self> {
const MADV_NORMAL: i32 = 0;
const MADV_RANDOM: i32 = 1;
const MADV_SEQUENTIAL: i32 = 2;
const MADV_WILLNEED: i32 = 3;
const MADV_DONTNEED: i32 = 4;
match raw {
MADV_NORMAL => Ok(MadviceFlags::MADV_NORMAL),
MADV_RANDOM => Ok(MadviceFlags::MADV_RANDOM),
MADV_SEQUENTIAL => Ok(MadviceFlags::MADV_SEQUENTIAL),
MADV_WILLNEED => Ok(MadviceFlags::MADV_WILLNEED),
MADV_DONTNEED => Ok(MadviceFlags::MADV_DONTNEED),
_ => return_errno!(ENOSYS, "unknown madvice flags"),
}
}
}

@ -206,8 +206,8 @@ impl ShmManager {
let old_perms = old_vma.perms();
if new_perms != old_perms {
let perms = new_perms | old_perms;
VMPerms::apply_perms(new_vma.range(), perms);
new_vma.set_perms(perms);
new_vma.modify_permissions_for_committed_pages(perms);
}
let inode_id = Self::inode_id_of(&new_vma);
@ -279,7 +279,7 @@ impl ShmManager {
if perms == old_perms {
return;
}
VMPerms::apply_perms(vma.range(), perms);
vma.set_perms(perms);
vma.modify_permissions_for_committed_pages(perms);
}
}

@ -1,46 +1,50 @@
use super::*;
use super::vm_manager::VMManager;
use crate::config::LIBOS_CONFIG;
use crate::ctor::dtor;
use crate::ipc::SHM_MANAGER;
use crate::ipc::SYSTEM_V_SHM_MANAGER;
use crate::util::pku_util;
use std::ops::{Deref, DerefMut};
use vm_epc::SGXPlatform;
use vm_manager::VMManager;
use vm_perms::VMPerms;
const RSRV_MEM_PERM: MemPerm =
MemPerm::from_bits_truncate(MemPerm::READ.bits() | MemPerm::WRITE.bits());
const USER_SPACE_DEFAULT_MEM_PERM: VMPerms = VMPerms::DEFAULT;
/// The virtual memory manager for the entire user space
pub struct UserSpaceVMManager(VMManager);
pub struct UserSpaceVMManager {
inner: VMManager,
sgx_platform: SGXPlatform,
}
impl UserSpaceVMManager {
fn new() -> Result<UserSpaceVMManager> {
let rsrv_mem_size = LIBOS_CONFIG.resource_limits.user_space_size;
let vm_range = unsafe {
// TODO: Current sgx_alloc_rsrv_mem implementation will commit all the pages of the desired size, which will consume
// a lot of time. When EDMM is supported, there is no need to commit all the pages at the initialization stage. A function
// which reserves memory but not commit pages should be provided then.
let ptr = sgx_alloc_rsrv_mem(rsrv_mem_size);
if ptr.is_null() {
return_errno!(ENOMEM, "run out of reserved memory");
}
let sgx_platform = SGXPlatform::new();
let init_size = LIBOS_CONFIG.resource_limits.user_space_init_size;
let max_size = LIBOS_CONFIG.resource_limits.user_space_max_size;
// Without EDMM support, if ReservedMemExecutable is set to 1, the reserved memory will be RWX, and we can't change the reserved memory permission.
// With EDMM support, the reserved memory permission is RW by default, and we can change the permissions when needed.
let (userspace_vm_range, gap_range) = sgx_platform.alloc_user_space(init_size, max_size)?;
let addr = ptr as usize;
debug!(
"allocated rsrv addr is 0x{:x}, len is 0x{:x}",
addr, rsrv_mem_size
);
pku_util::pkey_mprotect_userspace_mem(addr, rsrv_mem_size, RSRV_MEM_PERM.bits());
VMRange::new(addr, addr + rsrv_mem_size)?
};
info!(
"user space allocated, range = {:?}, gap_range = {:?}",
userspace_vm_range, gap_range
);
let vm_manager = VMManager::init(vm_range)?;
// Use pkey_mprotect to set the whole user space to R/W permissions. If the user specifies new
// permissions, the mprotect ocall will update them.
pku_util::pkey_mprotect_userspace_mem(
&userspace_vm_range,
gap_range.as_ref(),
USER_SPACE_DEFAULT_MEM_PERM,
);
Ok(UserSpaceVMManager(vm_manager))
let vm_manager = VMManager::init(userspace_vm_range, gap_range)?;
Ok(Self {
inner: vm_manager,
sgx_platform,
})
}
pub fn get_total_size(&self) -> usize {
@ -52,51 +56,34 @@ impl UserSpaceVMManager {
// be called after the main function. Static variables are still safe to visit at this time.
#[dtor]
fn free_user_space() {
SHM_MANAGER.clean_when_libos_exit();
let range = USER_SPACE_VM_MANAGER.range();
info!("free user space at the end");
SYSTEM_V_SHM_MANAGER.clean_when_libos_exit();
let total_user_space_range = USER_SPACE_VM_MANAGER.range();
let gap_range = USER_SPACE_VM_MANAGER.gap_range();
assert!(USER_SPACE_VM_MANAGER.verified_clean_when_exit());
let addr = range.start();
let size = range.size();
info!("free user space VM: {:?}", range);
pku_util::clear_pku_when_libos_exit(addr, size, RSRV_MEM_PERM.bits());
assert!(unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) == 0 });
let addr = total_user_space_range.start();
let size = total_user_space_range.size();
info!("free user space VM: {:?}", total_user_space_range);
pku_util::clear_pku_when_libos_exit(
total_user_space_range,
gap_range.as_ref(),
USER_SPACE_DEFAULT_MEM_PERM,
);
USER_SPACE_VM_MANAGER
.sgx_platform
.free_user_space(total_user_space_range, gap_range.as_ref());
}
impl Deref for UserSpaceVMManager {
type Target = VMManager;
fn deref(&self) -> &Self::Target {
&self.0
&self.inner
}
}
lazy_static! {
pub static ref USER_SPACE_VM_MANAGER: UserSpaceVMManager = UserSpaceVMManager::new().unwrap();
}
bitflags! {
struct MemPerm: i32 {
const READ = 1;
const WRITE = 2;
const EXEC = 4;
}
}
extern "C" {
// Allocate a range of EPC memory from the reserved memory area with RW permission
//
// Parameters:
// Inputs: length [in]: Size of region to be allocated in bytes. Page aligned
// Return: Starting address of the new allocated memory area on success; otherwise NULL
//
fn sgx_alloc_rsrv_mem(length: usize) -> *const c_void;
// Free a range of EPC memory from the reserved memory area
//
// Parameters:
// Inputs: addr[in]: Starting address of region to be freed. Page aligned.
// length[in]: The length of the memory to be freed in bytes. Page aligned
// Return: 0 on success; otherwise -1
//
fn sgx_free_rsrv_mem(addr: *const c_void, length: usize) -> i32;
}

@ -1,19 +1,28 @@
use super::*;
use super::page_tracker::PageTracker;
use super::vm_epc::EPCMemType;
use super::vm_perms::VMPerms;
use super::vm_range::VMRange;
use super::vm_util::FileBacked;
use super::vm_util::{FileBacked, PagePolicy, VMInitializer, VMMapOptions, GB, KB, MB};
use intrusive_collections::rbtree::{Link, RBTree};
use intrusive_collections::{intrusive_adapter, KeyAdapter};
use std::ops::{Deref, DerefMut};
#[derive(Clone, Debug, Default)]
// Unit size of memory to commit when a #PF occurs.
const COMMIT_SIZE_UNIT: usize = 4 * KB;
// Commit the whole VMA when this threshold is reached.
const PF_NUM_THRESHOLD: u64 = 3;
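A hedged sketch of the escalation policy these constants suggest (a hypothetical helper for illustration; the actual page-fault handling lives in the VMA methods added by this commit):

// Hypothetical: commit a small unit per #PF, and the whole remaining VMA once
// the per-VMA fault counter reaches PF_NUM_THRESHOLD.
fn commit_size_for_fault(pf_count: u64, vma_size: usize) -> usize {
    if pf_count >= PF_NUM_THRESHOLD {
        vma_size
    } else {
        COMMIT_SIZE_UNIT.min(vma_size)
    }
}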
#[derive(Clone, Debug)]
pub struct VMArea {
range: VMRange,
perms: VMPerms,
file_backed: Option<FileBacked>,
access: VMAccess,
pages: Option<PageTracker>, // Track the paging status of this VMA
epc_type: EPCMemType, // Track the type of the EPC to use specific APIs
pf_count: u64,
}
#[derive(Clone, Debug, Eq, PartialEq)]
@ -32,11 +41,47 @@ impl VMArea {
file_backed: Option<FileBacked>,
pid: pid_t,
) -> Self {
Self {
let epc_type = EPCMemType::new(&range);
let pages = {
match epc_type {
EPCMemType::Reserved => None,
EPCMemType::UserRegion => {
let pages =
PageTracker::new_vma_tracker(&range, &EPCMemType::UserRegion).unwrap();
(!pages.is_fully_committed()).then_some(pages)
}
}
};
let new_vma = Self {
range,
perms,
file_backed,
access: VMAccess::Private(pid),
pages,
epc_type,
pf_count: 0,
};
trace!("new vma = {:?}", new_vma);
new_vma
}
fn new_with_page_tracker(
range: VMRange,
perms: VMPerms,
file_backed: Option<FileBacked>,
access: VMAccess,
pages: Option<PageTracker>,
) -> VMArea {
let epc_type = EPCMemType::new(&range);
Self {
range,
perms,
file_backed,
access,
pages,
epc_type,
pf_count: 0,
}
}
@ -49,30 +94,41 @@ impl VMArea {
access: VMAccess,
) -> Self {
debug_assert!(vma.is_superset_of(&new_range));
let new_backed_file = vma.file_backed.as_ref().map(|file| {
let new_backed_file = if let Some(file) = &vma.file_backed {
let mut new_file = file.clone();
let file_offset = file.offset();
let new_file_offset = if vma.start() < new_range.start() {
let vma_offset = new_range.start() - vma.start();
file_offset + vma_offset
} else {
let vma_offset = vma.start() - new_range.start();
debug_assert!(file_offset >= vma_offset);
file_offset - vma_offset
};
debug_assert!(vma.start() <= new_range.start());
let new_start_offset = new_range.start() - vma.start();
let new_file_offset = file_offset + new_start_offset;
new_file.set_offset(new_file_offset);
Some(new_file)
} else {
None
};
new_file
});
let new_pages = {
let mut new_pages = vma.pages.clone();
Self {
range: new_range,
perms: new_perms,
file_backed: new_backed_file,
access,
}
if let Some(pages) = &mut new_pages {
pages.split_for_new_range(&new_range);
if pages.is_fully_committed() {
None
} else {
new_pages
}
} else {
None
}
};
let new_vma =
Self::new_with_page_tracker(new_range, new_perms, new_backed_file, access, new_pages);
trace!("inherits vma: {:?}, create new vma: {:?}", vma, new_vma);
new_vma
}
pub fn perms(&self) -> VMPerms {
@ -87,6 +143,13 @@ impl VMArea {
&self.access
}
pub fn get_private_pid(&self) -> Option<pid_t> {
match &self.access {
VMAccess::Private(pid) => Some(*pid),
VMAccess::Shared(_) => None,
}
}
pub fn belong_to(&self, target_pid: pid_t) -> bool {
match &self.access {
VMAccess::Private(pid) => *pid == target_pid,
@ -105,9 +168,199 @@ impl VMArea {
}
}
pub fn init_file(&self) -> Option<(&FileRef, usize)> {
fn pages(&self) -> &PageTracker {
debug_assert!(!self.is_fully_committed());
self.pages.as_ref().unwrap()
}
fn pages_mut(&mut self) -> &mut PageTracker {
debug_assert!(!self.is_fully_committed());
self.pages.as_mut().unwrap()
}
// Get pid for private VMA
pub fn pid(&self) -> pid_t {
match self.access {
VMAccess::Private(pid) => pid,
VMAccess::Shared(_) => unreachable!(),
}
}
pub fn is_reserved_only(&self) -> bool {
if let Some(pages) = &self.pages {
return pages.is_reserved_only();
} else {
false
}
}
pub fn is_fully_committed(&self) -> bool {
self.pages.is_none()
}
pub fn is_partially_committed(&self) -> bool {
if let Some(pages) = &self.pages {
return pages.is_partially_committed();
} else {
false
}
}
pub fn init_memory(mut self, options: &VMMapOptions) -> Result<Self> {
let mut vm_area = self;
let page_policy = options.page_policy();
// Commit pages if needed
if !vm_area.is_fully_committed() && page_policy == &PagePolicy::CommitNow {
vm_area.pages_mut().commit_whole(VMPerms::DEFAULT)?;
vm_area.pages = None;
}
// Initialize committed memory
if vm_area.is_partially_committed() {
let committed = true;
for range in vm_area.pages().get_ranges(committed) {
vm_area.init_memory_internal(&range, Some(options.initializer()))?;
}
} else if vm_area.is_fully_committed() {
// Initialize the memory of the new range
unsafe {
let buf = vm_area.range().as_slice_mut();
options.initializer().init_slice(buf)?;
}
// Set memory permissions
if !options.perms().is_default() {
vm_area.modify_protection_force(None, vm_area.perms());
}
}
// Do nothing if this vma has no committed memory
Ok(vm_area)
}
pub fn flush_and_clean_memory(&self) -> Result<()> {
let (need_flush, file, file_offset) = match self.writeback_file() {
None => (false, None, None),
Some((file_handle, offset)) => {
if !file_handle.access_mode().unwrap().writable() {
(false, None, None)
} else {
(true, Some(file_handle), Some(offset))
}
}
};
if self.is_fully_committed() {
self.flush_and_clean_internal(self.range(), need_flush, file, file_offset);
} else {
let committed = true;
for range in self.pages().get_ranges(committed) {
self.flush_and_clean_internal(&range, need_flush, file, file_offset);
}
}
Ok(())
}
fn flush_and_clean_internal(
&self,
target_range: &VMRange,
need_flush: bool,
file: Option<&FileRef>,
file_offset: Option<usize>,
) {
trace!("flush and clean committed range: {:?}", target_range);
debug_assert!(self.range().is_superset_of(target_range));
let buf = unsafe { target_range.as_slice_mut() };
if !self.perms().is_default() {
self.modify_protection_force(Some(&target_range), VMPerms::default());
}
if need_flush {
let file_offset = file_offset.unwrap() + (target_range.start() - self.range.start());
file.unwrap().write_at(file_offset, buf);
}
// reset zeros
unsafe {
buf.iter_mut().for_each(|b| *b = 0);
}
}
pub fn modify_permissions_for_committed_pages(&self, new_perms: VMPerms) {
if self.is_fully_committed() {
self.modify_protection_force(None, new_perms);
} else if self.is_partially_committed() {
let committed = true;
for range in self.pages().get_ranges(committed) {
self.modify_protection_force(Some(&range), new_perms);
}
}
}
pub fn handle_page_fault(
&mut self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
trace!("PF vma = {:?}", self);
if (self.perms() == VMPerms::NONE)
|| (!crate::exception::check_rw_bit(errcd)
&& !self.perms().contains(VMPerms::READ))
{
return_errno!(
EACCES,
"Page permission is set to PROT_NONE. This is user-intended"
);
}
if crate::exception::check_rw_bit(errcd) && !self.perms().contains(VMPerms::WRITE) {
return_errno!(
EACCES, "Page permission doesn't include WRITE but this #PF is triggered by a write. This is user-intended"
)
}
if rip == pf_addr && !self.perms().contains(VMPerms::EXEC) {
return_errno!(
EACCES, "Page permission doesn't include EXEC but this #PF is triggered by instruction fetch. This is user-intended"
)
}
if self.is_fully_committed() {
// This vma has been committed by other threads already. Just return.
info!("This vma has been committed by other threads already.");
return Ok(());
}
if matches!(self.epc_type, EPCMemType::Reserved) {
return_errno!(EINVAL, "reserved memory shouldn't trigger PF");
}
if kernel_triggers || self.pf_count >= PF_NUM_THRESHOLD {
return self.commit_current_vma_whole();
}
self.pf_count += 1;
// The returned commit_size can be 0 when other threads have already committed the PF-containing range but the vma is not fully committed yet.
let commit_size = self.commit_once_for_page_fault(pf_addr).unwrap();
trace!("page fault commit memory size = {:?}", commit_size);
if commit_size == 0 {
warn!("This PF has been handled by other threads already.");
}
info!("page fault handle success");
Ok(())
}
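// A worked example of the commit-on-demand policy above: with COMMIT_SIZE_UNIT =
// 4 KB and PF_NUM_THRESHOLD = 3, the first three #PFs on this VMA each commit
// 4 KB starting from the faulting page via commit_once_for_page_fault(); once
// pf_count reaches the threshold (or the fault is triggered by the kernel), the
// remaining uncommitted ranges are committed in one shot by
// commit_current_vma_whole().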
pub fn backed_file(&self) -> Option<(&FileRef, usize)> {
if let Some(file) = &self.file_backed {
Some(file.init_file())
Some(file.backed_file())
} else {
None
}
@ -147,36 +400,51 @@ impl VMArea {
Some(new_vma)
}
pub fn resize(&mut self, new_size: usize) {
self.range.resize(new_size)
}
pub fn set_start(&mut self, new_start: usize) {
let old_start = self.start();
if new_start == old_start {
return;
}
self.range.set_start(new_start);
if let Some(file) = self.file_backed.as_mut() {
if !file.need_write_back() {
return;
if new_start < old_start {
// Extend this VMA
let pages = {
let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap();
(!pages.is_fully_committed()).then_some(pages)
};
self.pages = pages;
} else {
// Split this VMA
debug_assert!(new_start > old_start);
if let Some(pages) = &mut self.pages {
pages.split_for_new_range(&self.range);
if pages.is_fully_committed() {
self.pages = None;
}
}
}
if let Some(file) = self.file_backed.as_mut() {
// If the updates to the VMA needs to write back to a file, then the
// file offset must be adjusted according to the new start address.
let offset = file.offset();
if old_start < new_start {
file.set_offset(offset + (new_start - old_start));
} else {
// The caller must guarantee that the new start makes sense
debug_assert!(offset >= old_start - new_start);
file.set_offset(offset - (old_start - new_start));
}
Self::set_file_offset(file, new_start, old_start);
}
}
fn set_file_offset(file: &mut FileBacked, new_start_offset: usize, old_start_offset: usize) {
let offset = file.offset();
if old_start_offset < new_start_offset {
file.set_offset(offset + (new_start_offset - old_start_offset));
} else {
// The caller must guarantee that the new start makes sense
debug_assert!(offset >= old_start_offset - new_start_offset);
file.set_offset(offset - (old_start_offset - new_start_offset));
}
}
pub fn is_the_same_to(&self, other: &VMArea) -> bool {
if self.access() != other.access() {
return false;
}
if self.range() != other.range() {
return false;
}
@ -185,6 +453,10 @@ impl VMArea {
return false;
}
if self.access() != other.access() {
return false;
}
let self_writeback_file = self.writeback_file();
let other_writeback_file = other.writeback_file();
match (self_writeback_file, other_writeback_file) {
@ -199,6 +471,13 @@ impl VMArea {
pub fn set_end(&mut self, new_end: usize) {
self.range.set_end(new_end);
let pages = if self.range.size() > 0 {
let pages = PageTracker::new_vma_tracker(&self.range, &self.epc_type).unwrap();
(!pages.is_fully_committed()).then_some(pages)
} else {
None
};
self.pages = pages;
}
pub fn can_merge_vmas(left: &VMArea, right: &VMArea) -> bool {
@ -208,10 +487,6 @@ impl VMArea {
if left.size() == 0 || right.size() == 0 {
return false;
}
// The two VMAs must be owned by the same process
if left.access() != right.access() {
return false;
}
// The two VMAs must border with each other
if left.end() != right.start() {
return false;
@ -220,6 +495,15 @@ impl VMArea {
if left.perms() != right.perms() {
return false;
}
// The two VMAs must be owned by the same process privately
// Return false if (either is none) or (both are some but two private pids are different)
let private_access = left.get_private_pid().zip(right.get_private_pid());
if private_access.is_none() {
return false;
}
if private_access.is_some_and(|(left_pid, right_pid)| left_pid != right_pid) {
return false;
}
// If the two VMAs have write-back files, the files must be the same and
// the two file regions must be continuous.
@ -238,12 +522,12 @@ impl VMArea {
}
/// Flush a file-backed VMA to its file. This has no effect on anonymous VMA.
pub fn flush_backed_file(&self) {
self.flush_backed_file_with_cond(|_| true)
pub fn flush_committed_backed_file(&self) {
self.flush_committed_backed_file_with_cond(|_| true)
}
/// Same as `flush_backed_file()`, except that an extra condition on the file needs to satisfy.
pub fn flush_backed_file_with_cond<F: Fn(&FileRef) -> bool>(&self, cond_fn: F) {
/// Same as `flush_committed_backed_file()`, except that an extra condition on the file needs to satisfy.
pub fn flush_committed_backed_file_with_cond<F: Fn(&FileRef) -> bool>(&self, cond_fn: F) {
let (file, file_offset) = match self.writeback_file() {
None => return,
Some((file_and_offset)) => file_and_offset,
@ -258,7 +542,16 @@ impl VMArea {
if !cond_fn(file) {
return;
}
file.write_at(file_offset, unsafe { self.as_slice() });
if self.is_fully_committed() {
file.write_at(file_offset, unsafe { self.as_slice() });
} else {
let committed = true;
let vm_range_start = self.range().start();
for range in self.pages().get_ranges(committed) {
let file_offset = file_offset + (range.start() - vm_range_start);
file.write_at(file_offset, unsafe { range.as_slice() });
}
}
}
pub fn is_shared(&self) -> bool {
@ -310,6 +603,198 @@ impl VMArea {
pub fn inherits_access_from(&mut self, vma: &VMArea) {
self.access = vma.access().clone()
}
// The current implementation uses "unwrap()" to help us find errors quickly by panicking directly. Also, restoring the VM state
// when this function fails would require some work and is not that simple.
// TODO: Return a Result instead of using "unwrap()" in this function.
fn modify_protection_force(&self, protect_range: Option<&VMRange>, new_perms: VMPerms) {
let protect_range = protect_range.unwrap_or_else(|| self.range());
self.epc_type
.modify_protection(protect_range.start(), protect_range.size(), new_perms)
.unwrap()
}
// With an initializer, the memory should already be committed.
// Without an initializer, the memory needs to be committed and then initialized.
fn init_memory_internal(
&mut self,
target_range: &VMRange,
initializer: Option<&VMInitializer>,
) -> Result<()> {
debug_assert!(self.range().is_superset_of(target_range));
trace!("init range = {:?}", target_range);
let perms = self.perms();
if let Some(initializer) = initializer {
match initializer {
VMInitializer::FileBacked { file } => {
let (file, offset) = file.backed_file();
let vma_range_start = self.range.start();
let init_file_offset = offset + (target_range.start() - vma_range_start);
self.init_file_backed_mem(target_range, &file, init_file_offset, perms)?;
}
VMInitializer::DoNothing() => {
if !self.perms().is_default() {
self.modify_protection_force(Some(target_range), perms);
}
}
VMInitializer::FillZeros() => {
unsafe {
let buf = target_range.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0);
}
if !perms.is_default() {
self.modify_protection_force(Some(target_range), perms);
}
}
_ => todo!(),
}
} else {
// No initializer, #PF triggered.
let init_file = self
.backed_file()
.map(|(file, offset)| (file.clone(), offset));
if let Some((file, offset)) = init_file {
let vma_range_start = self.range.start();
let init_file_offset = offset + (target_range.start() - vma_range_start);
self.pages
.as_mut()
.unwrap()
.commit_memory_and_init_with_file(
target_range,
&file,
init_file_offset,
perms,
)?;
} else {
// PF triggered, no file-backed memory, just modify protection
self.pages
.as_mut()
.unwrap()
.commit_range(target_range, Some(perms))?;
}
}
Ok(())
}
fn init_file_backed_mem(
&mut self,
target_range: &VMRange,
file: &FileRef,
file_offset: usize,
new_perm: VMPerms,
) -> Result<()> {
if !file.access_mode().unwrap().readable() {
return_errno!(EBADF, "file is not readable");
}
let buf = unsafe { target_range.as_slice_mut() };
let file_size = file.metadata().unwrap().size;
let len = file
.read_at(file_offset, buf)
.map_err(|_| errno!(EACCES, "failed to init memory from file"))?;
if !new_perm.is_default() {
self.modify_protection_force(Some(target_range), new_perm);
}
Ok(())
}
fn get_commit_once_size(&self) -> usize {
COMMIT_SIZE_UNIT
}
fn commit_once_for_page_fault(&mut self, pf_addr: usize) -> Result<usize> {
debug_assert!(!self.is_fully_committed());
let mut early_return = false;
let mut total_commit_size = 0;
let vma_range_start = self.range.start();
let permission = self.perms();
let committed = false;
let mut uncommitted_ranges = self.pages().get_ranges(committed);
let commit_once_size = self.get_commit_once_size();
for range in uncommitted_ranges
.iter_mut()
.skip_while(|range| !range.contains(pf_addr))
{
// Ranges before the one containing pf_addr have been skipped by skip_while above
if total_commit_size == 0 {
debug_assert!(range.contains(pf_addr));
range.set_start(align_down(pf_addr, PAGE_SIZE));
range.resize(std::cmp::min(range.size(), commit_once_size));
} else if range.size() + total_commit_size > commit_once_size {
// This is not the first commit. Keep committing until reaching commit_once_size
range.resize(commit_once_size - total_commit_size);
}
// We don't handle file-backed memory here
debug_assert!(self.backed_file().is_none());
self.init_memory_internal(&range, None)?;
total_commit_size += range.size();
if total_commit_size >= commit_once_size {
break;
}
}
if self.pages().is_fully_committed() {
trace!("vma is fully committed");
self.pages = None;
}
Ok(total_commit_size)
}
// Only used to handle PF triggered by the kernel
fn commit_current_vma_whole(&mut self) -> Result<()> {
debug_assert!(!self.is_fully_committed());
debug_assert!(self.backed_file().is_none());
let mut uncommitted_ranges = self.pages.as_ref().unwrap().get_ranges(false);
for range in uncommitted_ranges {
self.init_memory_internal(&range, None).unwrap();
}
self.pages = None;
Ok(())
}
// TODO: We can re-enable this when we support lazily extending permissions.
#[allow(dead_code)]
fn page_fault_handler_extend_permission(&mut self, pf_addr: usize) -> Result<()> {
let permission = self.perms();
// This is intended by the application.
if permission == VMPerms::NONE {
return_errno!(EPERM, "trying to access PROT_NONE memory");
}
if self.is_fully_committed() {
self.modify_protection_force(None, permission);
return Ok(());
}
let committed = true;
let committed_ranges = self.pages().get_ranges(committed);
for range in committed_ranges.iter() {
if !range.contains(pf_addr) {
continue;
}
self.epc_type
.modify_protection(range.start(), range.size(), permission)?;
}
Ok(())
}
}
impl Deref for VMArea {

@ -83,16 +83,7 @@ impl ChunkManager {
continue;
}
vma.flush_backed_file();
if !vma.perms().is_default() {
VMPerms::apply_perms(vma, VMPerms::default());
}
unsafe {
let buf = vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
vma.flush_and_clean_memory().unwrap();
self.free_manager.add_range_back_to_free_manager(vma);
self.free_size += vma.size();
@ -110,6 +101,7 @@ impl ChunkManager {
if let VMMapAddr::Force(addr) = addr {
self.munmap(addr, size)?;
}
trace!("mmap options = {:?}", options);
// Find and allocate a new range for this mmap request
let new_range = self
@ -117,27 +109,29 @@ impl ChunkManager {
.find_free_range_internal(size, align, addr)?;
let new_addr = new_range.start();
let current_pid = current!().process().pid();
let new_vma = VMArea::new(
new_range,
*options.perms(),
options.initializer().backed_file(),
current_pid,
);
let new_vma = {
let new_vma = VMArea::new(
new_range,
*options.perms(),
options.initializer().backed_file(),
current_pid,
)
.init_memory(options);
// Initialize the memory of the new range
let buf = unsafe { new_vma.as_slice_mut() };
let ret = options.initializer().init_slice(buf);
if let Err(e) = ret {
// Return the free range before return with error
self.free_manager
.add_range_back_to_free_manager(new_vma.range());
return_errno!(e.errno(), "failed to mmap");
}
if new_vma.is_err() {
let error = new_vma.err().unwrap();
error!("init memory failure: {}", error.backtrace());
let range = VMRange::new_with_size(new_addr, size).unwrap();
self.free_manager
.add_range_back_to_free_manager(&range)
.unwrap();
return Err(error);
}
new_vma.unwrap()
};
trace!("new vma is ready");
// Set memory permissions
if !options.perms().is_default() {
VMPerms::apply_perms(&new_vma, new_vma.perms());
}
self.free_size -= new_vma.size();
// After initializing, we can safely insert the new VMA
self.vmas.insert(VMAObj::new_vma_obj(new_vma));
@ -168,11 +162,7 @@ impl ChunkManager {
Some(intersection_vma) => intersection_vma,
};
// File-backed VMA needs to be flushed upon munmap
intersection_vma.flush_backed_file();
if !&intersection_vma.perms().is_default() {
VMPerms::apply_perms(&intersection_vma, VMPerms::default());
}
intersection_vma.flush_and_clean_memory()?;
if vma.range() == intersection_vma.range() {
// Exact match. Just remove.
@ -194,13 +184,6 @@ impl ChunkManager {
}
}
// Reset zero
unsafe {
trace!("intersection vma = {:?}", intersection_vma);
let buf = intersection_vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
self.free_manager
.add_range_back_to_free_manager(intersection_vma.range());
self.free_size += intersection_vma.size();
@ -306,8 +289,7 @@ impl ChunkManager {
if intersection_vma.range() == containing_vma.range() {
// The whole containing_vma is mprotected
containing_vma.set_perms(new_perms);
VMPerms::apply_perms(&containing_vma, containing_vma.perms());
trace!("containing_vma = {:?}", containing_vma);
containing_vma.modify_permissions_for_committed_pages(containing_vma.perms());
containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma));
containing_vmas.move_next();
continue;
@ -325,13 +307,13 @@ impl ChunkManager {
let protect_end = protect_range.end();
// New VMA
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_vma.perms());
let new_vma = VMAObj::new_vma_obj(new_vma);
// Another new VMA
@ -356,15 +338,16 @@ impl ChunkManager {
break;
}
1 => {
let remain_vma = remain_vmas.pop().unwrap();
let mut remain_vma = remain_vmas.pop().unwrap();
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
intersection_vma.range().clone(),
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_vma.perms());
if remain_vma.start() == containing_vma.start() {
// mprotect right side of the vma
@ -374,6 +357,7 @@ impl ChunkManager {
debug_assert!(remain_vma.end() == containing_vma.end());
containing_vma.set_start(remain_vma.start());
}
debug_assert!(containing_vma.range() == remain_vma.range());
containing_vmas.replace_with(VMAObj::new_vma_obj(containing_vma));
containing_vmas.insert(VMAObj::new_vma_obj(new_vma));
@ -401,7 +385,7 @@ impl ChunkManager {
None => continue,
Some(vma) => vma,
};
vma.flush_backed_file();
vma.flush_committed_backed_file();
}
Ok(())
}
@ -409,9 +393,11 @@ impl ChunkManager {
/// Sync all shared, file-backed memory mappings of the given file by flushing
/// the memory content to the file.
pub fn msync_by_file(&mut self, sync_file: &FileRef) {
let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) };
for vma_obj in &self.vmas {
let is_same_file = |file: &FileRef| -> bool { Arc::ptr_eq(&file, &sync_file) };
vma_obj.vma().flush_backed_file_with_cond(is_same_file);
vma_obj
.vma()
.flush_committed_backed_file_with_cond(is_same_file);
}
}
@ -428,6 +414,34 @@ impl ChunkManager {
return Ok(vma.range().clone());
}
pub fn handle_page_fault(
&mut self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
trace!(
"handle_page_fault chunk manager range = {:?}, free_size = {:?}",
self.range,
self.free_size
);
let mut vma_cursor = self.vmas.upper_bound_mut(Bound::Included(&pf_addr));
if vma_cursor.is_null() {
return_errno!(ENOMEM, "no mmap regions that contains the address");
}
let vma = vma_cursor.get().unwrap().vma();
if vma.pid() != current!().process().pid() || !vma.contains(pf_addr) {
return_errno!(ENOMEM, "no mmap regions that contains the address");
}
let mut vma = vma.clone();
vma.handle_page_fault(rip, pf_addr, errcd, kernel_triggers)?;
vma_cursor.replace_with(VMAObj::new_vma_obj(vma));
Ok(())
}
pub fn usage_percentage(&self) -> f32 {
let total_size = self.range.size();
let mut used_size = 0;
@ -487,6 +501,7 @@ impl VMRemapParser for ChunkManager {
impl Drop for ChunkManager {
fn drop(&mut self) {
info!("drop chunk manager = {:?}", self);
assert!(self.is_empty());
assert!(self.free_size == self.range.size());
assert!(self.free_manager.free_size() == self.range.size());

405
src/libos/src/vm/vm_epc.rs Normal file

@ -0,0 +1,405 @@
// This file contains EPC related APIs and definitions.
use super::*;
use sgx_trts::emm::{
AllocAddr, AllocFlags, AllocOptions, EmmAlloc, HandleResult, PageFaultHandler, Perm,
};
use sgx_trts::enclave::rsgx_is_supported_EDMM;
use std::ptr::NonNull;
// Memory Layout for Platforms with EDMM support
//
// Addr low -> high
// |---------------------------------------------||---------------------||--------------------------------------|
//                Reserved Memory                        Gap Range                  User Region Memory
//    (commit memory when loading the enclave)          (used by SDK)        (commit on demand when PF occurs)
//
// For platforms without EDMM support, we only use reserved memory.
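// For example (hypothetical sizes, not taken from any configuration): with
// init_size = 1 GB and max_size = 32 GB on an EDMM platform, alloc_user_space()
// below ends up with
//   reserved memory    = [reserved_start, reserved_start + 1 GB)        committed at load time
//   gap range          = [reserved_start + 1 GB, user_region_start)     owned by the SDK
//   user region memory = [user_region_start, user_region_start + 31 GB) committed on #PF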
pub enum SGXPlatform {
WithEDMM,
NoEDMM,
}
#[derive(Clone)]
pub enum EPCMemType {
Reserved,
UserRegion,
}
pub struct ReservedMem;
pub struct UserRegionMem;
#[repr(C, align(4096))]
#[derive(Clone)]
struct ZeroPage([u8; PAGE_SIZE]);
impl ZeroPage {
fn new() -> Self {
Self([0; PAGE_SIZE])
}
fn new_page_aligned_vec(size: usize) -> Vec<u8> {
debug_assert!(size % PAGE_SIZE == 0);
let page_num = size / PAGE_SIZE;
let mut page_vec = vec![Self::new(); page_num];
let ptr = page_vec.as_mut_ptr();
let size = page_num * std::mem::size_of::<Self>();
std::mem::forget(page_vec);
unsafe { Vec::from_raw_parts(ptr as *mut u8, size, size) }
}
}
lazy_static! {
static ref ZERO_PAGE: Vec<u8> = ZeroPage::new_page_aligned_vec(PAGE_SIZE);
}
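// A small sanity sketch of what new_page_aligned_vec() provides: a zero-filled
// Vec<u8> of the requested size whose backing buffer is page aligned, which is
// what EmmAlloc::commit_with_data (i.e. EACCEPTCOPY) expects as source data.
#[allow(dead_code)]
fn check_zero_page_vec() {
    let buf = ZeroPage::new_page_aligned_vec(4 * PAGE_SIZE);
    assert_eq!(buf.len(), 4 * PAGE_SIZE);
    assert_eq!(buf.as_ptr() as usize % PAGE_SIZE, 0);
    assert!(buf.iter().all(|&b| b == 0));
}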
pub trait EPCAllocator {
fn alloc(size: usize) -> Result<usize> {
return_errno!(ENOSYS, "operation not supported");
}
fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
return_errno!(ENOSYS, "operation not supported");
}
fn free(addr: usize, size: usize) -> Result<()> {
return_errno!(ENOSYS, "operation not supported");
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
return_errno!(ENOSYS, "operation not supported");
}
fn mem_type() -> EPCMemType;
}
impl EPCAllocator for ReservedMem {
fn alloc(size: usize) -> Result<usize> {
let ptr = unsafe { sgx_alloc_rsrv_mem(size) };
if ptr.is_null() {
return_errno!(ENOMEM, "run out of reserved memory");
}
Ok(ptr as usize)
}
fn alloc_with_addr(addr: usize, size: usize) -> Result<usize> {
let ptr = unsafe { sgx_alloc_rsrv_mem_ex(addr as *const c_void, size) };
if ptr.is_null() {
return_errno!(ENOMEM, "can't allocate reserved memory at desired address");
}
Ok(ptr as usize)
}
fn free(addr: usize, size: usize) -> Result<()> {
let ret = unsafe { sgx_free_rsrv_mem(addr as *const c_void, size) };
assert!(ret == 0);
Ok(())
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
let mut ret_val = 0;
let ret = if rsgx_is_supported_EDMM() {
unsafe {
sgx_tprotect_rsrv_mem(addr as *const c_void, length, protection.bits() as i32)
}
} else {
// For platforms without EDMM, sgx_tprotect_rsrv_mem is actually useless.
// However, at least we can set pages to desired protections in the host kernel page table.
unsafe {
occlum_ocall_mprotect(
&mut ret_val as *mut i32,
addr as *const c_void,
length,
protection.bits() as i32,
)
}
};
if ret != sgx_status_t::SGX_SUCCESS || ret_val != 0 {
return_errno!(ENOMEM, "reserved memory modify protection failure");
}
Ok(())
}
fn mem_type() -> EPCMemType {
EPCMemType::Reserved
}
}
impl EPCAllocator for UserRegionMem {
fn alloc(size: usize) -> Result<usize> {
let alloc_options = AllocOptions::new()
.set_flags(AllocFlags::COMMIT_ON_DEMAND)
.set_handler(enclave_page_fault_handler_dummy, 0);
let ptr = unsafe { EmmAlloc.alloc(AllocAddr::Any, size, alloc_options) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(ptr.addr().get())
}
fn free(addr: usize, size: usize) -> Result<()> {
let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
unsafe { EmmAlloc.dealloc(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn modify_protection(addr: usize, length: usize, protection: VMPerms) -> Result<()> {
trace!(
"user region modify protection, protection = {:?}, range = {:?}",
protection,
VMRange::new_with_size(addr, length).unwrap()
);
let ptr = NonNull::<u8>::new(addr as *mut u8).unwrap();
unsafe {
EmmAlloc.modify_permissions(ptr, length, Perm::from_bits(protection.bits()).unwrap())
}
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn mem_type() -> EPCMemType {
EPCMemType::UserRegion
}
}
impl UserRegionMem {
fn commit_memory(start_addr: usize, size: usize) -> Result<()> {
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
unsafe { EmmAlloc.commit(ptr, size) }.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
fn commit_memory_with_new_permission(
start_addr: usize,
size: usize,
new_perms: VMPerms,
) -> Result<()> {
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
let perm = Perm::from_bits(new_perms.bits()).unwrap();
if size == PAGE_SIZE {
unsafe { EmmAlloc::commit_with_data(ptr, ZERO_PAGE.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
} else {
let data = ZeroPage::new_page_aligned_vec(size);
unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
}
Ok(())
}
fn commit_memory_and_init_with_file(
start_addr: usize,
size: usize,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
let mut data = ZeroPage::new_page_aligned_vec(size);
let len = file
.read_at(file_offset, data.as_mut_slice())
.map_err(|_| errno!(EACCES, "failed to init memory from file"))?;
let ptr = NonNull::<u8>::new(start_addr as *mut u8).unwrap();
let perm = Perm::from_bits(new_perms.bits()).unwrap();
unsafe { EmmAlloc::commit_with_data(ptr, data.as_slice(), perm) }
.map_err(|e| errno!(Errno::from(e as u32)))?;
Ok(())
}
}
impl SGXPlatform {
pub fn new() -> Self {
if rsgx_is_supported_EDMM() {
SGXPlatform::WithEDMM
} else {
SGXPlatform::NoEDMM // including SGX simulation mode
}
}
pub fn alloc_user_space(
&self,
init_size: usize,
max_size: usize,
) -> Result<(VMRange, Option<VMRange>)> {
debug!(
"alloc user space init size = {:?}, max size = {:?}",
init_size, max_size
);
if matches!(self, SGXPlatform::WithEDMM) && max_size > init_size {
let user_region_size = max_size - init_size;
let reserved_mem_start_addr = ReservedMem::alloc(init_size)?;
let user_region_start_addr = UserRegionMem::alloc(user_region_size)?;
let total_user_space_range = VMRange::new(
reserved_mem_start_addr,
user_region_start_addr + user_region_size,
)?;
let gap_range =
VMRange::new(reserved_mem_start_addr + init_size, user_region_start_addr)?;
info!(
"allocated user space range is {:?}, gap range is {:?}. reserved_mem range is {:?}, user region range is {:?}",
total_user_space_range, gap_range, VMRange::new_with_size(reserved_mem_start_addr, init_size),
VMRange::new_with_size(user_region_start_addr, user_region_size)
);
Ok((total_user_space_range, Some(gap_range)))
} else {
// For platforms without EDMM support, or when max_size equals init_size, use reserved memory for the whole user space
let reserved_mem_start_addr = ReservedMem::alloc(max_size)?;
let total_user_space_range =
VMRange::new(reserved_mem_start_addr, reserved_mem_start_addr + max_size)?;
info!(
"allocated user space range is {:?}, gap range is None",
total_user_space_range
);
Ok((total_user_space_range, None))
}
}
pub fn free_user_space(&self, user_space_range: &VMRange, gap_range: Option<&VMRange>) {
let user_space_ranges = if let Some(gap_range) = gap_range {
user_space_range.subtract(gap_range)
} else {
vec![*user_space_range]
};
if user_space_ranges.len() == 2 {
debug_assert!(matches!(self, SGXPlatform::WithEDMM));
let reserved_mem = user_space_ranges[0];
let user_region_mem = user_space_ranges[1];
ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap();
UserRegionMem::free(user_region_mem.start(), user_region_mem.size()).unwrap();
} else {
// For platforms with EDMM where max_size equals init_size, or platforms without EDMM, there is no gap range.
debug_assert!(user_space_ranges.len() == 1);
let reserved_mem = user_space_ranges[0];
ReservedMem::free(reserved_mem.start(), reserved_mem.size()).unwrap();
}
}
}
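// A brief usage sketch of the platform abstraction, mirroring what the
// user-space VM manager does at init and exit; init_size and max_size are
// placeholders supplied by the caller.
#[allow(dead_code)]
fn platform_roundtrip(init_size: usize, max_size: usize) -> Result<()> {
    let platform = SGXPlatform::new();
    let (total_range, gap_range) = platform.alloc_user_space(init_size, max_size)?;
    // ... normally total_range and gap_range are handed to VMManager::init() ...
    platform.free_user_space(&total_range, gap_range.as_ref());
    Ok(())
}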
impl Debug for EPCMemType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let output_str = match self {
EPCMemType::Reserved => "reserved memory region",
EPCMemType::UserRegion => "user region memory",
};
write!(f, "{}", output_str)
}
}
impl EPCMemType {
pub fn new(range: &VMRange) -> Self {
trace!("EPC new range = {:?}", range);
if rsgx_is_supported_EDMM() {
if let Some(gap_range) = USER_SPACE_VM_MANAGER.gap_range() {
debug_assert!({
if range.size() > 0 {
!gap_range.overlap_with(range)
} else {
// Ignore for sentry VMA
true
}
});
if range.end() <= gap_range.start() {
EPCMemType::Reserved
} else {
debug_assert!(gap_range.end() <= range.start());
EPCMemType::UserRegion
}
} else {
// There is no gap, which indicates that there is no user region memory
EPCMemType::Reserved
}
} else {
// Only reserved memory
EPCMemType::Reserved
}
}
pub fn modify_protection(&self, addr: usize, length: usize, protection: VMPerms) -> Result<()> {
// PT_GROWSDOWN should only be applied to a stack segment or a segment mapped with the MAP_GROWSDOWN flag set.
// Since the memory is managed by the LibOS itself, the mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown.
let mut prot = protection.clone();
prot.remove(VMPerms::GROWSDOWN);
match self {
EPCMemType::Reserved => ReservedMem::modify_protection(addr, length, prot),
EPCMemType::UserRegion => UserRegionMem::modify_protection(addr, length, prot),
}
}
}
pub fn commit_memory(start_addr: usize, size: usize, new_perms: Option<VMPerms>) -> Result<()> {
trace!(
"commit epc: {:?}, new permission: {:?}",
VMRange::new_with_size(start_addr, size).unwrap(),
new_perms
);
// Memory commit and permission change should be atomic to prevent data races. Thus, if the new perms
// are not the default permission (RW), we take a different path that commits with EACCEPTCOPY.
match new_perms {
Some(perms) if perms != VMPerms::DEFAULT => {
UserRegionMem::commit_memory_with_new_permission(start_addr, size, perms)
}
_ => UserRegionMem::commit_memory(start_addr, size),
}
}
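// A minimal sketch of how a caller might use commit_memory() when handling a
// #PF in the user region; the addresses are placeholders and must be page
// aligned and lie inside a COMMIT_ON_DEMAND allocation.
#[allow(dead_code)]
fn commit_example(fault_page: usize) -> Result<()> {
    // Default RW permission: plain EACCEPT path
    commit_memory(fault_page, PAGE_SIZE, None)?;
    // Non-default permission: committed and protected atomically via EACCEPTCOPY
    commit_memory(fault_page + PAGE_SIZE, PAGE_SIZE, Some(VMPerms::READ))?;
    Ok(())
}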
pub fn commit_memory_and_init_with_file(
start_addr: usize,
size: usize,
file: &FileRef,
file_offset: usize,
new_perms: VMPerms,
) -> Result<()> {
UserRegionMem::commit_memory_and_init_with_file(start_addr, size, file, file_offset, new_perms)
}
// This is a dummy function for sgx_mm_alloc. The real handler is "enclave_page_fault_handler" shown below.
extern "C" fn enclave_page_fault_handler_dummy(
pfinfo: &sgx_pfinfo,
private: usize,
) -> HandleResult {
// Don't do anything here. Modifying registers here can cause errors in PF handling.
return HandleResult::Search;
}
pub fn enclave_page_fault_handler(
rip: usize,
exception_info: sgx_misc_exinfo_t,
kernel_triggers: bool,
) -> Result<()> {
let pf_addr = exception_info.faulting_address as usize;
let pf_errcd = exception_info.error_code;
trace!(
"enclave page fault caught, pf_addr = 0x{:x}, error code = {:?}",
pf_addr,
pf_errcd
);
USER_SPACE_VM_MANAGER.handle_page_fault(rip, pf_addr, pf_errcd, kernel_triggers)?;
Ok(())
}
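// A hedged sketch of how an exception handler could route a #PF here; the
// surrounding exception dispatch is not part of this file, and falling back to
// the signal path on error is only one possible choice.
#[allow(dead_code)]
fn route_page_fault(rip: usize, exinfo: sgx_misc_exinfo_t) {
    if let Err(e) = enclave_page_fault_handler(rip, exinfo, false) {
        // e.g. let the normal SIGSEGV delivery take over
        warn!("page fault not handled by the VM manager: {}", e);
    }
}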
extern "C" {
fn occlum_ocall_mprotect(
retval: *mut i32,
addr: *const c_void,
len: usize,
prot: i32,
) -> sgx_status_t;
}

@ -22,14 +22,16 @@ use std::ops::Bound::{Excluded, Included};
#[derive(Debug)]
pub struct VMManager {
range: VMRange,
gap_range: Option<VMRange>,
internal: SgxMutex<InternalVMManager>,
}
impl VMManager {
pub fn init(vm_range: VMRange) -> Result<Self> {
let internal = InternalVMManager::init(vm_range.clone());
pub fn init(vm_range: VMRange, gap_range: Option<VMRange>) -> Result<Self> {
let mut internal = InternalVMManager::init(vm_range.clone(), &gap_range);
Ok(VMManager {
range: vm_range,
gap_range: gap_range,
internal: SgxMutex::new(internal),
})
}
@ -38,6 +40,10 @@ impl VMManager {
&self.range
}
pub fn gap_range(&self) -> &Option<VMRange> {
&self.gap_range
}
pub fn internal(&self) -> SgxMutexGuard<InternalVMManager> {
self.internal.lock().unwrap()
}
@ -56,8 +62,15 @@ impl VMManager {
}
pub fn verified_clean_when_exit(&self) -> bool {
let gap_size = if let Some(gap) = self.gap_range() {
gap.size()
} else {
0
};
let internal = self.internal();
internal.chunks.len() == 0 && internal.free_manager.free_size() == self.range.size()
internal.chunks.len() == 0
&& internal.free_manager.free_size() + gap_size == self.range.size()
}
pub fn free_chunk(&self, chunk: &ChunkRef) {
@ -358,22 +371,19 @@ impl VMManager {
intersect_chunks.iter().for_each(|chunk| {
if let ChunkType::SingleVMA(vma) = chunk.internal() {
if let Some(intersection_range) = chunk.range().intersect(&reset_range) {
let mut internal_manager = self.internal();
internal_manager.mprotect_single_vma_chunk(
&chunk,
intersection_range,
VMPerms::DEFAULT,
);
unsafe {
let buf = intersection_range.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
let mut vma = vma.lock().unwrap();
if let Some(intersection_vma) = vma.intersect(&reset_range) {
intersection_vma.flush_and_clean_memory().unwrap();
}
// clear permission for SingleVMA chunk
if vma.perms() != VMPerms::DEFAULT {
vma.set_perms(VMPerms::default());
}
} else {
// Currently this is only used for heap de-allocation, so the chunk must be a SingleVMA chunk.
unreachable!()
}
});
Ok(())
}
@ -394,11 +404,11 @@ impl VMManager {
match chunk.internal() {
ChunkType::MultiVMA(manager) => {
trace!("msync default chunk: {:?}", chunk.range());
return manager
manager
.lock()
.unwrap()
.chunk_manager_mut()
.msync_by_range(&sync_range);
.msync_by_range(&sync_range)?;
}
ChunkType::SingleVMA(vma) => {
// Note: There are rare cases where multiple threads do mprotect or munmap on the same single-vma chunk
@ -406,7 +416,7 @@ impl VMManager {
// It is fine here because this function doesn't modify the global chunk list and only operates on the vma
// which is updated in real time.
let vma = vma.lock().unwrap();
vma.flush_backed_file();
vma.flush_committed_backed_file();
}
}
Ok(())
@ -429,7 +439,7 @@ impl VMManager {
ChunkType::SingleVMA(vma) => {
vma.lock()
.unwrap()
.flush_backed_file_with_cond(is_same_file);
.flush_committed_backed_file_with_cond(is_same_file);
}
});
}
@ -539,6 +549,41 @@ impl VMManager {
assert!(mem_chunks.len() == 0);
}
pub fn handle_page_fault(
&self,
rip: usize,
pf_addr: usize,
errcd: u32,
kernel_triggers: bool,
) -> Result<()> {
let current = current!();
let page_fault_chunk = {
let current_process_mem_chunks = current.vm().mem_chunks().read().unwrap();
if let Some(page_fault_chunk) = current_process_mem_chunks
.iter()
.find(|chunk| chunk.range().contains(pf_addr))
{
Some(page_fault_chunk.clone())
} else {
None
}
};
if let Some(page_fault_chunk) = page_fault_chunk {
return page_fault_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
// System V SHM segments are not tracked by the process VM. Try to find the chunk here.
if let Some(page_fault_shm_chunk) =
SYSTEM_V_SHM_MANAGER.get_shm_chunk_containing_addr(pf_addr, current.process().pid())
{
return page_fault_shm_chunk.handle_page_fault(rip, pf_addr, errcd, kernel_triggers);
}
// This can happen, for example, when the user intends to trigger the SIGSEGV handler by visiting a null pointer.
return_errno!(ENOMEM, "can't find the chunk containing the address");
}
}
// Modifications to this structure must be made while holding the global lock.
@ -552,11 +597,21 @@ pub struct InternalVMManager {
}
impl InternalVMManager {
pub fn init(vm_range: VMRange) -> Self {
pub fn init(vm_range: VMRange, gap_range: &Option<VMRange>) -> Self {
let chunks = BTreeSet::new();
let fast_default_chunks = Vec::new();
let free_manager = VMFreeSpaceManager::new(vm_range);
let mut free_manager = VMFreeSpaceManager::new(vm_range);
let shm_manager = ShmManager::new();
if let Some(gap_range) = gap_range {
debug_assert!(vm_range.is_superset_of(&gap_range));
free_manager
.find_free_range_internal(
gap_range.size(),
PAGE_SIZE,
VMMapAddr::Force(gap_range.start()),
)
.unwrap();
}
Self {
chunks,
fast_default_chunks,
@ -657,19 +712,7 @@ impl InternalVMManager {
_ => unreachable!(),
};
// File-backed VMA needs to be flushed upon munmap
intersection_vma.flush_backed_file();
// Reset memory permissions
if !&intersection_vma.perms().is_default() {
VMPerms::apply_perms(&intersection_vma, VMPerms::default());
}
// Reset to zero
unsafe {
let buf = intersection_vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
intersection_vma.flush_and_clean_memory()?;
let mut new_vmas = vma.subtract(&intersection_vma);
let current = current!();
@ -724,10 +767,10 @@ impl InternalVMManager {
self.shm_manager
.create_shared_chunk(options, new_chunk.clone())
.map_err(|e| {
let vma = new_chunk.get_vma_for_single_vma_chunk();
let mut vma = new_chunk.get_vma_for_single_vma_chunk();
// Reset memory permissions
if !vma.perms().is_default() {
VMPerms::apply_perms(&vma, VMPerms::default());
vma.modify_permissions_for_committed_pages(VMPerms::default())
}
// Reset memory contents
unsafe {
@ -778,19 +821,11 @@ impl InternalVMManager {
.munmap_shared_chunk(chunk, munmap_range, flag)?
== MunmapSharedResult::Freeable
{
let vma = chunk.get_vma_for_single_vma_chunk();
// Flush memory contents to backed file
vma.flush_backed_file();
// Reset memory permissions
if !vma.perms().is_default() {
VMPerms::apply_perms(&vma, VMPerms::default());
// Flush memory contents to backed file and reset memory contents
{
let vma = chunk.get_vma_for_single_vma_chunk();
vma.flush_and_clean_memory()?;
}
// Reset memory contents
unsafe {
let buf = vma.as_slice_mut();
buf.iter_mut().for_each(|b| *b = 0)
}
drop(vma);
self.free_chunk(chunk);
let current = current!();
@ -855,7 +890,6 @@ impl InternalVMManager {
}
ChunkType::SingleVMA(vma) => vma,
};
let mut updated_vmas = {
let mut containing_vma = vma.lock().unwrap();
trace!(
@ -865,7 +899,8 @@ impl InternalVMManager {
);
debug_assert!(chunk.range() == containing_vma.range());
if containing_vma.perms() == new_perms {
let old_perms = containing_vma.perms();
if old_perms == new_perms {
return Ok(());
}
@ -876,7 +911,7 @@ impl InternalVMManager {
(true, true) => {
// Exact the same vma
containing_vma.set_perms(new_perms);
VMPerms::apply_perms(&containing_vma, containing_vma.perms());
containing_vma.modify_permissions_for_committed_pages(new_perms);
return Ok(());
}
(false, false) => {
@ -886,15 +921,13 @@ impl InternalVMManager {
// remaining old VMA: [protect_range.end, containing_vma.end)
let old_end = containing_vma.end();
let old_perms = containing_vma.perms();
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_perms);
let remaining_old_vma = {
let range = VMRange::new(protect_range.end(), old_end).unwrap();
@ -905,7 +938,6 @@ impl InternalVMManager {
VMAccess::Private(current_pid),
)
};
containing_vma.set_end(protect_range.start());
// Put containing_vma at last to be updated first.
@ -913,19 +945,19 @@ impl InternalVMManager {
updated_vmas
}
_ => {
let new_vma = VMArea::inherits_file_from(
let mut new_vma = VMArea::inherits_file_from(
&containing_vma,
protect_range,
new_perms,
VMAccess::Private(current_pid),
);
VMPerms::apply_perms(&new_vma, new_vma.perms());
new_vma.modify_permissions_for_committed_pages(new_perms);
if same_start {
// Protect range is at left side of the cotaining vma
// Protect range is at left side of the containing vma
containing_vma.set_start(protect_range.end());
} else {
// Protect range is at right side of the cotaining vma
// Protect range is at right side of the containing vma
containing_vma.set_end(protect_range.start());
}
@ -935,19 +967,16 @@ impl InternalVMManager {
}
}
};
let current = current!();
// First update current vma chunk
if updated_vmas.len() > 1 {
let update_vma = updated_vmas.pop().unwrap();
self.update_single_vma_chunk(&current, &chunk, update_vma);
}
// Then add new chunks if any
updated_vmas.into_iter().for_each(|vma| {
self.add_new_chunk(&current, vma);
});
Ok(())
}
@ -964,9 +993,6 @@ impl InternalVMManager {
// Remove from chunks
self.chunks.remove(chunk);
// Mprotect the whole chunk to reduce the usage of vma count of host
VMPerms::apply_perms(range, VMPerms::DEFAULT);
// Add range back to freespace manager
self.free_manager.add_range_back_to_free_manager(range);
Ok(())
@ -1131,6 +1157,7 @@ impl InternalVMManager {
let perms = options.perms().clone();
let align = options.align().clone();
let initializer = options.initializer();
let page_policy = options.page_policy();
target_contained_ranges
.iter()
.map(|range| {
@ -1146,6 +1173,7 @@ impl InternalVMManager {
.initializer(initializer.clone())
.addr(addr)
.size(size)
.page_policy(*page_policy)
.build()
.unwrap()
})

@ -39,37 +39,6 @@ impl VMPerms {
self.bits == Self::DEFAULT.bits
}
pub fn apply_perms(protect_range: &VMRange, perms: VMPerms) {
use sgx_trts::enclave::rsgx_is_supported_EDMM;
unsafe {
let mut retval = 0;
let addr = protect_range.start() as *const c_void;
let len = protect_range.size();
// PT_GROWSDOWN should only be applied to stack segment or a segment mapped with the MAP_GROWSDOWN flag set.
// Since the memory are managed by our own, mprotect ocall shouldn't use this flag. Otherwise, EINVAL will be thrown.
let mut prot = perms.clone();
prot.remove(VMPerms::GROWSDOWN);
if rsgx_is_supported_EDMM() {
// With EDMM support, reserved memory permission should be updated.
let sgx_status = sgx_tprotect_rsrv_mem(addr, len, prot.bits() as i32);
if sgx_status != sgx_status_t::SGX_SUCCESS {
panic!("sgx_tprotect_rsrv_mem status {}", sgx_status);
}
} else {
// Without EDMM support, reserved memory permission is statically RWX and we only need to do mprotect ocall.
let sgx_status = occlum_ocall_mprotect(&mut retval, addr, len, prot.bits() as i32);
if sgx_status != sgx_status_t::SGX_SUCCESS || retval != 0 {
panic!(
"occlum_ocall_mprotect status {}, retval {}",
sgx_status, retval
);
}
}
}
}
pub fn display(&self) -> String {
let mut str = String::new();
if self.can_read() {
@ -96,23 +65,3 @@ impl Default for VMPerms {
VMPerms::DEFAULT
}
}
extern "C" {
// Modify the access permissions of the pages in the reserved memory area
//
// Parameters:
// Inputs: addr[in]: Starting address of region which needs to change access
// permission. Page aligned.
// length[in]: The length of the memory to be manipulated in bytes. Page aligned.
// prot[in]: The target memory protection.
// Return: sgx_status_t
//
fn sgx_tprotect_rsrv_mem(addr: *const c_void, length: usize, prot: i32) -> sgx_status_t;
fn occlum_ocall_mprotect(
retval: *mut i32,
addr: *const c_void,
len: usize,
prot: i32,
) -> sgx_status_t;
}

@ -10,6 +10,11 @@ use intrusive_collections::RBTreeLink;
use intrusive_collections::{intrusive_adapter, KeyAdapter};
use rcore_fs::vfs::Metadata;
pub const GB: usize = 1 << 30;
pub const TB: usize = 1 << 40;
pub const MB: usize = 1 << 20;
pub const KB: usize = 1 << 10;
#[derive(Clone, Debug)]
pub enum VMInitializer {
DoNothing(),
@ -139,7 +144,7 @@ impl FileBacked {
self.write_back
}
pub fn init_file(&self) -> (&FileRef, usize) {
pub fn backed_file(&self) -> (&FileRef, usize) {
(&self.file, self.offset)
}
@ -179,6 +184,19 @@ impl VMMapAddr {
}
}
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum PagePolicy {
ReserveOnly = 0x1, // Only reserve the address range, commit nothing.
CommitNow = 0x2, // Commit all pages at mmap time.
CommitOnDemand = 0x4, // Reserve the range at mmap time, commit pages in the #PF handler. This is the default policy.
}
impl Default for PagePolicy {
fn default() -> PagePolicy {
PagePolicy::CommitOnDemand
}
}
#[derive(Builder, Debug)]
#[builder(pattern = "owned", build_fn(skip), no_std)]
pub struct VMMapOptions {
@ -187,6 +205,7 @@ pub struct VMMapOptions {
perms: VMPerms,
addr: VMMapAddr,
initializer: VMInitializer,
page_policy: PagePolicy,
}
// VMMapOptionsBuilder is generated automatically, except the build function
@ -232,12 +251,21 @@ impl VMMapOptionsBuilder {
Some(initializer) => initializer.clone(),
None => VMInitializer::default(),
};
let page_policy = {
match &initializer {
VMInitializer::CopyFrom { .. } => PagePolicy::CommitNow,
VMInitializer::CopyOldAndReadNew { .. } => PagePolicy::CommitNow,
_ => self.page_policy.unwrap_or_default(),
}
};
Ok(VMMapOptions {
size,
align,
perms,
addr,
initializer,
page_policy,
})
}
}
@ -269,6 +297,10 @@ impl VMMapOptions {
}
false
}
pub fn page_policy(&self) -> &PagePolicy {
&self.page_policy
}
}
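// A minimal usage sketch of the builder with the new page_policy field,
// mirroring the mmap call sites; the address, size and permissions below are
// placeholders.
#[allow(dead_code)]
fn build_on_demand_options(addr: usize, size: usize) -> Result<VMMapOptions> {
    VMMapOptionsBuilder::default()
        .size(size)
        .addr(VMMapAddr::Force(addr))
        .perms(VMPerms::DEFAULT)
        .initializer(VMInitializer::DoNothing())
        .page_policy(PagePolicy::CommitOnDemand)
        .build()
}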
#[derive(Clone, Copy, PartialEq)]

@ -25,4 +25,4 @@ dependencies = [
[[package]]
name = "sgx_types"
version = "1.1.5"
version = "1.1.6"