Make vfork stop parent child threads

When vfork is called and the current process has other running child threads,
for Linux, the other threads remain running. For Occlum, this behavior is
different. All the other threads will be frozen until the vfork returns
or execve is called in the child process.

The reason is that since Occlum doesn't support fork, many applications will
use vfork to replace fork. For multi-threaded applications, if vfork doesn't
stop other child threads, the application will be more likely to fail because
the child process directly uses the VM and the file table of the parent process.
This commit is contained in:
Hui, Chunyang 2022-09-27 11:33:19 +00:00 committed by volcano
parent f71e940cfd
commit 4c3ca79134
11 changed files with 231 additions and 54 deletions

@ -46,6 +46,7 @@ pub fn broadcast_interrupts() -> Result<usize> {
let should_interrupt_thread = |thread: &&ThreadRef| -> bool {
// TODO: check Thread::sig_mask to reduce false positives
thread.process().is_forced_to_exit()
|| thread.is_forced_to_stop()
|| !thread.sig_queues().read().unwrap().empty()
|| !thread.process().sig_queues().read().unwrap().empty()
};

@ -1,5 +1,7 @@
use super::untrusted_event::{set_event, wait_event};
use super::{ProcessRef, ThreadId, ThreadRef};
use crate::fs::FileTable;
use crate::interrupt::broadcast_interrupts;
use crate::prelude::*;
use crate::syscall::CpuContext;
use std::collections::HashMap;
@ -12,10 +14,15 @@ use std::mem;
// Thus in this implementation, the main idea is to let child use parent's task until exit or execve.
//
// Limitation:
// The child process will not have a complete process structure before execve. Thus during the time from vfork
// 1. The child process will not have a complete process structure before execve. Thus during the time from vfork
// to new child process execve or exit, the child process just reuse the parent process's everything, including
// task, pid and etc. And also the log of child process will not start from the point that vfork returns but the
// point that execve returns.
// 2. When vfork is called and the current process has other running child threads, for Linux, the other threads remain
// running. For Occlum, this behavior is different. All the other threads will be frozen until the vfork returns or
// execve is called in the child process. The reason is that since Occlum doesn't support fork, many applications will
// use vfork to replace fork. For multi-threaded applications, if vfork doesn't stop other child threads, the application
// will be more likely to fail because the child process directly uses the VM and the file table of the parent process.
lazy_static! {
// Store the file tables of all parents that have called vfork. Each one is restored when the child exits or gets its own task.
@ -38,6 +45,16 @@ pub fn do_vfork(mut context: *mut CpuContext) -> Result<isize> {
new_tid.as_u32() as pid_t
};
// stop all other child threads
let child_threads = current.process().threads();
child_threads.iter().for_each(|thread| {
if thread.tid() != current.tid() {
thread.force_stop();
}
});
// Don't hesitate. Interrupt all threads right now to stop child threads.
broadcast_interrupts();
// Save parent's context in TLS
VFORK_CONTEXT.with(|cell| {
let mut ctx = cell.borrow_mut();
@ -78,7 +95,21 @@ pub fn vfork_return_to_parent(
mut context: *mut CpuContext,
current_ref: &ThreadRef,
) -> Result<isize> {
return restore_parent_process(context, current_ref);
let child_pid = restore_parent_process(context, current_ref)?;
// Wake parent's child thread which are all sleeping
let current = current!();
let child_threads = current.process().threads();
child_threads.iter().for_each(|thread| {
thread.resume();
let thread_ptr = thread.raw_ptr();
if current.raw_ptr() != thread_ptr {
set_event(thread_ptr as *const c_void);
info!("Thread 0x{:x} is waken", thread_ptr);
}
});
Ok(child_pid)
}
fn restore_parent_process(mut context: *mut CpuContext, current_ref: &ThreadRef) -> Result<isize> {
@ -156,3 +187,17 @@ fn close_files_opened_by_child(current: &ThreadRef, parent_file_table: &FileTabl
.for_each(|&fd| current.close_file(fd).expect("close child file error"));
Ok(())
}
/// Parks the calling thread for as long as it is marked as forced to stop.
///
/// Returns immediately when the current thread is not in the stopped state.
/// Otherwise the thread waits on its own untrusted event and re-checks its
/// status after every wake-up, so it only continues once it is no longer
/// marked as stopped.
pub fn handle_force_stop() {
    let this_thread = current!();
    if !this_thread.is_forced_to_stop() {
        return;
    }

    let host_thread_ptr = this_thread.raw_ptr();
    info!("Thread 0x{:x} is forced to stop ...", host_thread_ptr);

    // Being woken up is not enough to leave the loop: the stopped mark must
    // have been cleared as well.
    while this_thread.is_forced_to_stop() {
        wait_event(host_thread_ptr as *const c_void);
    }
}

@ -25,7 +25,7 @@ pub use self::do_exit::handle_force_exit;
pub use self::do_futex::{futex_wait, futex_wake};
pub use self::do_robust_list::RobustListHead;
pub use self::do_spawn::do_spawn_without_exec;
pub use self::do_vfork::do_vfork;
pub use self::do_vfork::{do_vfork, handle_force_stop};
pub use self::do_wait4::idle_reap_zombie_children;
pub use self::process::{Process, ProcessFilter, ProcessStatus, IDLE};
pub use self::spawn_attribute::posix_spawnattr_t;
@ -53,6 +53,7 @@ mod spawn_attribute;
mod syscalls;
mod term_status;
mod thread;
mod untrusted_event;
mod wait;
pub mod current;

@ -134,6 +134,7 @@ impl ThreadBuilder {
SgxMutex::new(None)
};
let host_eventfd = Arc::new(HostEventFd::new()?);
let raw_ptr = RwLock::new(0);
let new_thread = Arc::new(Thread {
task,
@ -154,6 +155,7 @@ impl ThreadBuilder {
sig_stack,
profiler,
host_eventfd,
raw_ptr,
});
let mut inner = new_thread.process().inner();

@ -48,6 +48,7 @@ pub struct Thread {
profiler: SgxMutex<Option<ThreadProfiler>>,
// Misc
host_eventfd: Arc<HostEventFd>,
raw_ptr: RwLock<usize>,
}
#[derive(Debug, PartialEq, Clone, Copy)]
@ -55,6 +56,7 @@ pub enum ThreadStatus {
Init,
Running,
Exited,
Stopped,
}
impl Thread {
@ -114,6 +116,11 @@ impl Thread {
&self.profiler
}
/// Returns the raw pointer of the host thread backing this LibOS thread, as an integer.
///
/// The value is read under the `raw_ptr` read lock and copied out.
pub fn raw_ptr(&self) -> usize {
    let host_thread_ptr = self.raw_ptr.read().unwrap();
    *host_thread_ptr
}
/// Get a file from the file table.
pub fn file(&self, fd: FileDesc) -> Result<FileRef> {
self.files().lock().unwrap().get(fd)
@ -206,7 +213,15 @@ impl Thread {
pub(super) fn start(&self, host_tid: pid_t) {
self.sched().lock().unwrap().attach(host_tid);
let mut raw_ptr = self.raw_ptr.write().unwrap();
*raw_ptr = (unsafe { sgx_thread_get_self() } as usize);
// Before the thread starts, this thread could be stopped by other threads
if self.is_forced_to_stop() {
info!("thread is forced to stopped before this thread starts");
} else {
self.inner().start();
}
let eventfd = EventFile::new(
0,
@ -266,6 +281,20 @@ impl Thread {
/// Locks the mutex protecting this thread's `ThreadInner` state and returns the guard.
///
/// Panics if the lock cannot be acquired, because the result of `lock()` is unwrapped.
pub(super) fn inner(&self) -> SgxMutexGuard<ThreadInner> {
self.inner.lock().unwrap()
}
/// Marks this thread as stopped by switching its inner state under the state lock.
pub fn force_stop(&self) {
    self.inner().stop();
}
/// Reports whether this thread's current status is `ThreadStatus::Stopped`.
pub fn is_forced_to_stop(&self) -> bool {
    let current_status = self.inner().status();
    current_status == ThreadStatus::Stopped
}
/// Asks the inner state to resume, under the state lock.
///
/// This only updates the recorded state; waking a thread that is blocked
/// waiting for the change is left to the caller.
pub fn resume(&self) {
    self.inner().resume();
}
}
impl PartialEq for Thread {
@ -302,6 +331,7 @@ pub enum ThreadInner {
Init,
Running,
Exited { term_status: TermStatus },
Stopped,
}
impl ThreadInner {
@ -314,6 +344,7 @@ impl ThreadInner {
Self::Init { .. } => ThreadStatus::Init,
Self::Running { .. } => ThreadStatus::Running,
Self::Exited { .. } => ThreadStatus::Exited,
Self::Stopped { .. } => ThreadStatus::Stopped,
}
}
@ -325,7 +356,14 @@ impl ThreadInner {
}
/// Switches this state to `Running`.
pub fn start(&mut self) {
// In debug builds, check that the thread is still in its initial state when it is started.
debug_assert!(self.status() == ThreadStatus::Init);
*self = Self::Running;
}
/// Switches this state to `Stopped`, unless the thread has already exited.
///
/// An `Exited` state is left untouched so that its `term_status` is not
/// lost when a stop request arrives after the thread has terminated.
/// Every other state (including `Init`, i.e. a thread that has not
/// started yet) becomes `Stopped`.
pub fn stop(&mut self) {
    if let Self::Exited { .. } = *self {
        return;
    }
    *self = Self::Stopped;
}
/// Switches a `Stopped` state back to `Running`.
///
/// Any other state is left as it is: resuming a thread that was never
/// stopped must not turn an `Init` or `Exited` thread into a running one
/// (the latter would also discard its `term_status`).
pub fn resume(&mut self) {
    if let Self::Stopped = *self {
        *self = Self::Running;
    }
}
@ -334,3 +372,7 @@ impl ThreadInner {
*self = Self::Exited { term_status };
}
}
// Foreign function used to identify the calling host thread.
// NOTE(review): presumably provided by the SGX SDK thread library — confirm where it is linked from.
extern "C" {
// Returns an opaque pointer for the calling host thread; `Thread::start` stores it as the thread's `raw_ptr`.
pub(crate) fn sgx_thread_get_self() -> *const c_void;
}

@ -0,0 +1,51 @@
use crate::prelude::*;
/// Makes an OCall that waits on the untrusted event of `thread`.
///
/// # Panics
///
/// Panics if either the OCall itself or the untrusted side reports a
/// non-zero status.
pub(crate) fn wait_event(thread: *const c_void) {
    let mut ocall_ret: c_int = 0;
    // SAFETY: `ocall_ret` is a live local for the whole call and `thread` is
    // passed through unchanged.
    // NOTE(review): callers are assumed to pass a pointer obtained from
    // `sgx_thread_get_self` — confirm.
    let sgx_status = unsafe { sgx_thread_wait_untrusted_event_ocall(&mut ocall_ret, thread) };
    if sgx_status != 0 || ocall_ret != 0 {
        panic!("ERROR: OCall failed!");
    }
}
/// Makes an OCall that sets the untrusted event of `thread`, waking it if it
/// is waiting on that event.
///
/// # Panics
///
/// Panics if either the OCall itself or the untrusted side reports a
/// non-zero status.
pub(crate) fn set_event(thread: *const c_void) {
    let mut ocall_ret: c_int = 0;
    // SAFETY: `ocall_ret` is a live local for the whole call and `thread` is
    // passed through unchanged.
    // NOTE(review): callers are assumed to pass a pointer obtained from
    // `sgx_thread_get_self` — confirm.
    let sgx_status = unsafe { sgx_thread_set_untrusted_event_ocall(&mut ocall_ret, thread) };
    if sgx_status != 0 || ocall_ret != 0 {
        panic!("ERROR: OCall failed!");
    }
}
// OCall declarations for the per-thread untrusted events.
// In every function, `ret` receives the status reported by the untrusted side, while the
// returned `c_int` is the status of the OCall itself; `wait_event` and `set_event` treat
// a non-zero value in either as a failure.
extern "C" {
/* Leave the enclave and wait on the caller's own untrusted event (`self_thread`) */
pub(crate) fn sgx_thread_wait_untrusted_event_ocall(
ret: *mut c_int,
self_thread: *const c_void,
) -> c_int;
/* Wake `waiter_thread`, which is waiting on its untrusted event */
pub(crate) fn sgx_thread_set_untrusted_event_ocall(
ret: *mut c_int,
waiter_thread: *const c_void,
) -> c_int;
/* Wake `waiter_thread`, which is waiting on its untrusted event, then wait on the caller's own untrusted event (`self_thread`) */
pub(crate) fn sgx_thread_setwait_untrusted_events_ocall(
ret: *mut c_int,
waiter_thread: *const c_void,
self_thread: *const c_void,
) -> c_int;
/* Wake the `total` threads listed in `waiter_threads`, each waiting on its untrusted event */
pub(crate) fn sgx_thread_set_multiple_untrusted_events_ocall(
ret: *mut c_int,
waiter_threads: *const *const c_void,
total: size_t,
) -> c_int;
}

@ -1,3 +1,5 @@
use super::thread::sgx_thread_get_self;
use super::untrusted_event::{set_event, wait_event};
/// A wait/wakeup mechanism that connects wait4 and exit system calls.
use crate::prelude::*;
@ -113,50 +115,3 @@ where
1
}
}
fn wait_event(thread: *const c_void) {
let mut ret: c_int = 0;
let mut sgx_ret: c_int = 0;
unsafe {
sgx_ret = sgx_thread_wait_untrusted_event_ocall(&mut ret as *mut c_int, thread);
}
if ret != 0 || sgx_ret != 0 {
panic!("ERROR: OCall failed!");
}
}
fn set_event(thread: *const c_void) {
let mut ret: c_int = 0;
let mut sgx_ret: c_int = 0;
unsafe {
sgx_ret = sgx_thread_set_untrusted_event_ocall(&mut ret as *mut c_int, thread);
}
if ret != 0 || sgx_ret != 0 {
panic!("ERROR: OCall failed!");
}
}
extern "C" {
fn sgx_thread_get_self() -> *const c_void;
/* Go outside and wait on my untrusted event */
fn sgx_thread_wait_untrusted_event_ocall(ret: *mut c_int, self_thread: *const c_void) -> c_int;
/* Wake a thread waiting on its untrusted event */
fn sgx_thread_set_untrusted_event_ocall(ret: *mut c_int, waiter_thread: *const c_void)
-> c_int;
/* Wake a thread waiting on its untrusted event, and wait on my untrusted event */
fn sgx_thread_setwait_untrusted_events_ocall(
ret: *mut c_int,
waiter_thread: *const c_void,
self_thread: *const c_void,
) -> c_int;
/* Wake multiple threads waiting on their untrusted events */
fn sgx_thread_set_multiple_untrusted_events_ocall(
ret: *mut c_int,
waiter_threads: *const *const c_void,
total: size_t,
) -> c_int;
}

@ -67,7 +67,7 @@ pub fn deliver_signal(cpu_context: &mut CpuContext) {
let thread = current!();
let process = thread.process();
if !process.is_forced_to_exit() {
if !process.is_forced_to_exit() && !thread.is_forced_to_stop() {
do_deliver_signal(&thread, &process, cpu_context);
}

@ -730,6 +730,8 @@ fn do_syscall(user_context: &mut CpuContext) {
crate::signal::deliver_signal(user_context);
crate::process::handle_force_stop();
crate::process::handle_force_exit();
}

@ -1,5 +1,5 @@
include ../test_common.mk
EXTRA_C_FLAGS := -g
EXTRA_LINK_FLAGS :=
EXTRA_LINK_FLAGS := -lpthread
BIN_ARGS :=

@ -3,6 +3,7 @@
#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <pthread.h>
#include "test.h"
// Note: This test intends to test the case that child process directly calls _exit()
@ -89,10 +90,87 @@ parent_exit:
exit(1);
}
// Set to 1 by the child thread once it has taken its start timestamp.
// The main thread spins on it before calling vfork.
volatile static int test_stop_child_flag = 0;

// Body of the extra thread used by test_vfork_stop_child_thread.
//
// The thread measures how long a one-second sleep takes. The main thread calls
// vfork as soon as the flag is raised and the vfork child waits a few seconds
// before execve, so this thread is expected to be frozen in between and to
// observe clearly more than one second.
//
// Returns NULL on success and (void *) -1 if reading the clock fails. Exits
// the whole process with a failure status if the thread was not stopped.
static void *child_thread_routine(void *_arg) {
    struct timespec t1, t2;

    printf("Child thread starts\n");

    // Take the start timestamp BEFORE raising the flag. Otherwise the main
    // thread may vfork (and freeze this thread) before t1 is recorded, so the
    // frozen period would not be part of the measured interval.
    int t1_failed = clock_gettime(CLOCK_REALTIME, &t1);
    // Always raise the flag, even on failure, so the main thread never spins forever.
    test_stop_child_flag = 1;
    if (t1_failed) {
        return (void *) -1;
    }

    sleep(1);

    if (clock_gettime(CLOCK_REALTIME, &t2)) {
        return (void *) -1;
    }

    // Compare with nanosecond resolution: looking at tv_sec alone could report
    // a difference of 2 after little more than one second when the interval
    // happens to cross two second boundaries.
    long long elapsed_ns = (long long)(t2.tv_sec - t1.tv_sec) * 1000000000LL
                           + (long long)(t2.tv_nsec - t1.tv_nsec);

    // Parent thread vfork and will stop this thread for several seconds
    if (elapsed_ns < 2LL * 1000000000LL) {
        printf("the thread is not stopped\n");
        exit(-1);
    }

    printf("child thread exits\n");
    return NULL;
}
// Test that when vfork is called, the other threads of the parent process are forced to stop.
//
// This test case has different behaviors for Linux and Occlum
// This limitation is recorded in src/libos/src/process/do_vfork.rs
//
// Returns 0 on success. Failures in the parent are reported through THROW_ERROR;
// failures in the vfork child end the child with _exit(1).
int test_vfork_stop_child_thread() {
    pthread_t child_thread;
    pid_t child_pid;
    struct timespec ts;
    ts.tv_sec = 3;
    ts.tv_nsec = 0;

    // pthread_create reports failure with a positive error number, never a negative value
    if (pthread_create(&child_thread, NULL, child_thread_routine, NULL) != 0) {
        THROW_ERROR("pthread_create failed\n");
    }

    // Wait for child thread to start
    while (test_stop_child_flag == 0);

    child_pid = vfork();
    if (child_pid < 0) {
        THROW_ERROR("vfork failed");
    }

    if (child_pid == 0) {
        // The argument vector lives on the stack: nothing is allocated in the
        // vfork child, which still shares the parent's memory at this point.
        char *child_argv[] = { "getpid", NULL };

        printf("child process created\n");

        // Wait for a few seconds
        while (1) {
            int ret = nanosleep(&ts, &ts);
            if (ret == 0) {
                break;
            }
            if (ret < 0 && errno != EINTR) {
                // The vfork child must not return from the function that called vfork
                printf("nanosleep failed\n");
                _exit(1);
            }
        }

        printf("child process exec\n");
        execve("/bin/getpid", child_argv, NULL);
        // execve only returns on failure
        printf("child process execve error\n");
        _exit(1);
    } else {
        void *thread_ret = NULL;

        printf("return to parent\n");
        if (pthread_join(child_thread, &thread_ret) != 0) {
            THROW_ERROR("pthread_join failed");
        }
        // The thread returns a non-NULL value when it could not read the clock
        if (thread_ret != NULL) {
            THROW_ERROR("child thread failed");
        }
    }

    return 0;
}
// Table of the vfork test cases defined in this file.
// NOTE(review): presumably iterated by main() through the helpers in test.h — confirm.
static test_case_t test_cases[] = {
TEST_CASE(test_vfork_exit),
TEST_CASE(test_multiple_vfork_execve),
TEST_CASE(test_vfork_isolate_file_table),
TEST_CASE(test_vfork_stop_child_thread),
};
int main() {