Add analytics zoo cluster serving demo

This commit is contained in:
CharleneHu94 2021-06-09 10:08:49 +08:00 committed by Zongmin.Gu
parent eaf47d1662
commit b82072a9b6
16 changed files with 438 additions and 0 deletions

@ -509,6 +509,31 @@ jobs:
        sleep ${{ env.nap_time }};
        docker exec flink_test bash -c "cd /root/occlum/demos/flink && SGX_MODE=SIM ./run_flink_on_occlum_glibc.sh task"

  Cluster_serving_test:
    runs-on: ubuntu-18.04
    steps:
    - uses: actions/checkout@v1
      with:
        submodules: true
    - name: Get occlum version
      run: echo "OCCLUM_VERSION=$(grep "Version =" src/pal/include/occlum_version.h | awk '{print $4}')" >> $GITHUB_ENV
    - name: Create container
      run: docker run -itd --name=cluster_serving_test -v $GITHUB_WORKSPACE:/root/occlum occlum/occlum:${{ env.OCCLUM_VERSION }}-ubuntu18.04
    - name: Build dependencies
      run: docker exec cluster_serving_test bash -c "cd /root/occlum; make submodule"
    - name: Make install
      run: docker exec cluster_serving_test bash -c "source /opt/intel/sgxsdk/environment; cd /root/occlum; OCCLUM_RELEASE_BUILD=1 make install"
    - name: Set up environment
      run: docker exec cluster_serving_test bash -c "cd /root/occlum/demos/cluster_serving; source ./environment.sh; ./install-dependencies.sh"
    - name: Run cluster serving test
      run: docker exec cluster_serving_test bash -c "cd /root/occlum/demos/cluster_serving; source ./environment.sh; SGX_MODE=SIM ./start-all.sh; ./push-image.sh"

  enclave_ra_tls_test:
    runs-on: ubuntu-18.04
    steps:

@ -14,6 +14,7 @@ This set of demos shows how the Occlum toolchain can be used with different buil
This set of demos shows how real-world apps can be easily run inside SGX enclaves with Occlum.
* [cluster_serving](cluster_serving/): A demo of the [Analytics Zoo Cluster Serving](https://analytics-zoo.github.io/master/#ClusterServingGuide/ProgrammingGuide/) inference solution.
* [fish](fish/): A demo of [FISH](https://fishshell.com) shell script.
* [flink](flink/): A demo of [Apache Flink](https://flink.apache.org).
* [font](font/font_support_for_java): A demo of supporting font with Java.

@ -0,0 +1,57 @@
# Analytics Zoo Cluster Serving Inference in SGX with Occlum #
This example demonstrates how to use Analytics Zoo Cluster Serving for real-time inference in SGX.

[Analytics Zoo](https://github.com/intel-analytics/analytics-zoo) is an open source Big Data AI platform, and [Cluster Serving](https://www.usenix.org/conference/opml20/presentation/song) is a real-time serving solution that enables automatic model inference on a Flink cluster.

Note that in this example all components run on a single machine within one container. For running Cluster Serving with SGX on multiple nodes, please refer to the [distributed mode guide](https://github.com/intel-analytics/analytics-zoo/tree/master/ppml/trusted-realtime-ml/scala/docker-occlum#distributed-mode-multi-containersmulti-nodes) from Analytics Zoo.

Besides following the steps in this demo, users can also directly use the Docker image provided by Analytics Zoo for Cluster Serving with Occlum, which has all dependencies pre-installed. For a detailed guide on using the Docker image, please refer to the [Analytics Zoo guide](https://analytics-zoo.readthedocs.io/en/latest/doc/PPML/Overview/ppml.html#trusted-realtime-compute-and-ml).
## Set up environment ##
Set the environment variables and install the dependencies (Redis, Flink, Analytics Zoo, and models):

`source ./environment.sh`

`./install-dependencies.sh`
## Start Cluster Serving ##
Start Redis, Flink, and the Cluster Serving job:

`./start-all.sh`

Or start the components separately:
1. **Start Redis Server**
`./start-redis.sh &`
2. **Start Flink**
Start the Flink JobManager on the host:
`./start-flink-jobmanager.sh`
Initialize and start the Flink TaskManager with Occlum:
```
./init-occlum-taskmanager.sh
./start-flink-taskmanager.sh
```
3. **Start Cluster Serving job**
Start the HTTP frontend:
`./start-http-frontend.sh &`
Start the Cluster Serving job:
`./start-cluster-serving-job.sh`
## Push inference image ##
Push an image into the queue via the RESTful API for inference. Users can modify the script to use the base64 encoding of their own image (note that the image size must match the model input size, e.g. 224x224 for the ResNet-50 model used in this demo). Users can also use the Python API to push an image file directly; see the [guide](https://analytics-zoo.github.io/master/#ClusterServingGuide/ProgrammingGuide/#4-model-inference) for details.

`./push-image.sh`
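For reference, a manual push might look roughly like the sketch below. The frontend port (10020) and the JSON payload layout are assumptions based on the Analytics Zoo documentation, not taken from this demo, so consult the linked guide for the exact schema of your version; `push-image.sh` sends an equivalent request with the base64 string already embedded.

```
# Hypothetical sketch: push one image to the Cluster Serving HTTP frontend.
# The port and JSON field names below are assumptions; check the Analytics Zoo
# guide for the exact request format.
IMAGE_BASE64=$(base64 -w 0 ./my-image.jpg)   # the image must match the model input size, e.g. 224x224
curl -s -X POST "http://127.0.0.1:10020/predict" \
     -H "Content-Type: application/json" \
     -d "{\"instances\": [{\"image\": \"${IMAGE_BASE64}\"}]}"
```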
## Stop Cluster Serving ##
Stop the Cluster Serving job and all components:

`./stop-all.sh`

@ -0,0 +1,59 @@
## Analytics-zoo Cluster Serving Config
# model path must be provided
modelPath: resnet50
# name, default is serving_stream, you need to specify if running multiple servings
# jobName:
# default, None
# postProcessing:
# default, false, if input is already batched NdArray, set true
# inputAlreadyBatched:
######## Performance Configuration
# default, number of cores per machine
# if you set coreNumberPerMachine, Cluster Serving will auto config modelParallelism and threadPerModel
# if you need to set modelParallelism and threadPerModel manually, do not set coreNumberPerMachine
coreNumberPerMachine: 1
# default: number of models used in serving
# modelParallelism:
# default: number of cores (or threads) used per model in serving
# threadPerModel:
# default: OFF
# performance_mode:
######## Specific Configuration
# default: localhost:8081
# flinkRestUrl:
# default: localhost:6379
# redisUrl:
# default: 8g
# redisMaxmemory:
# default: 5000
# redisTimeout:
######## Secure Configuration
# default: false
# secureEnabled:
# default:
# secureTrustStorePath: /opt/keys/keystore.jks
# default:
# secureStructStorePassword:
# default: false
# modelEncrypted:
# default: false
# recordEncrypted:

@ -0,0 +1,12 @@
#!/bin/bash
# Environment variables
export CORE_NUM=`nproc`
export REDIS_VERSION=6.0.6
export REDIS_HOME=redis-${REDIS_VERSION}
export FLINK_VERSION=1.10.1
export FLINK_HOME=flink-${FLINK_VERSION}
export ANALYTICS_ZOO_VERSION=0.10.0
export BIGDL_VERSION=0.12.2
export SPARK_VERSION=2.4.3

demos/cluster_serving/hosts (new executable file, 4 additions)

@ -0,0 +1,4 @@
127.0.0.1 occlum-node
127.0.0.1 localhost
::1 occlum-node
::1 localhost

@ -0,0 +1,42 @@
#!/bin/bash
# set -x
FLINK_VERSION=$FLINK_VERSION
occlum_glibc=/opt/occlum/glibc/lib/
init_instance() {
# Remove older instance
rm -rf flink && mkdir flink
cd flink
# Init Occlum instance
occlum init
new_json="$(jq '.resource_limits.user_space_size = "7000MB" |
.resource_limits.kernel_space_heap_size="64MB" |
.resource_limits.max_num_of_threads = 72 |
.process.default_heap_size = "128MB" |
.process.default_mmap_size = "6600MB" |
.entry_points = [ "/usr/lib/jvm/java-11-openjdk-amd64/bin" ] |
.env.default = [ "LD_LIBRARY_PATH=/usr/lib/jvm/java-11-openjdk-amd64/lib/server:/usr/lib/jvm/java-11-openjdk-amd64/lib:/usr/lib/jvm/java-11-openjdk-amd64/../lib:/lib:/opt/occlum/glibc/lib/", "OMP_NUM_THREADS=1", "KMP_AFFINITY=verbose,granularity=fine,compact,1,0", "KMP_BLOCKTIME=20" ]' Occlum.json)" && \
echo "${new_json}" > Occlum.json
}
build_flink() {
# Copy JVM and class file into Occlum instance and build
mkdir -p image/usr/lib/jvm
cp -r /usr/lib/jvm/java-11-openjdk-amd64 image/usr/lib/jvm
cp /lib/x86_64-linux-gnu/libz.so.1 image/lib
unzip -j ../analytics-zoo-bigdl_${BIGDL_VERSION}-spark_${SPARK_VERSION}-${ANALYTICS_ZOO_VERSION}-serving.jar linux-x86_64/openvino/* -d image/lib
cp $occlum_glibc/libdl.so.2 image/$occlum_glibc
cp $occlum_glibc/librt.so.1 image/$occlum_glibc
cp $occlum_glibc/libm.so.6 image/$occlum_glibc
cp $occlum_glibc/libnss_files.so.2 image/$occlum_glibc
cp -rf ../flink-${FLINK_VERSION}/* image/bin/
cp -rf ../flink-${FLINK_VERSION}/conf image/opt/
cp -rf /etc/java-11-openjdk image/etc/
cp -rf ../hosts image/etc/
# build occlum
occlum build
}
# Build the Flink Occlum instance
init_instance
build_flink

@ -0,0 +1,30 @@
#!/bin/bash
apt-get update
apt-get install -y openjdk-11-jdk
apt-get install -y netcat
# Redis
wget http://download.redis.io/releases/redis-${REDIS_VERSION}.tar.gz && \
tar -zxvf redis-${REDIS_VERSION}.tar.gz
rm redis-${REDIS_VERSION}.tar.gz
cd redis-${REDIS_VERSION}
make
cd ../
# Flink
wget https://archive.apache.org/dist/flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-scala_2.11.tgz
tar -zxvf flink-${FLINK_VERSION}-bin-scala_2.11.tgz
rm flink-${FLINK_VERSION}-bin-scala_2.11.tgz
# Analytics Zoo
wget https://repo1.maven.org/maven2/com/intel/analytics/zoo/analytics-zoo-bigdl_$BIGDL_VERSION-spark_$SPARK_VERSION/$ANALYTICS_ZOO_VERSION/analytics-zoo-bigdl_$BIGDL_VERSION-spark_$SPARK_VERSION-$ANALYTICS_ZOO_VERSION-serving.jar
wget https://repo1.maven.org/maven2/com/intel/analytics/zoo/analytics-zoo-bigdl_$BIGDL_VERSION-spark_$SPARK_VERSION/$ANALYTICS_ZOO_VERSION/analytics-zoo-bigdl_$BIGDL_VERSION-spark_$SPARK_VERSION-$ANALYTICS_ZOO_VERSION-http.jar
# models
mkdir resnet50 && \
cd resnet50 && \
wget -c "https://sourceforge.net/projects/analytics-zoo/files/analytics-zoo-models/openvino/2018_R5/resnet_v1_50.bin/download" -O resnet_v1_50.bin && \
wget -c "https://sourceforge.net/projects/analytics-zoo/files/analytics-zoo-models/openvino/2018_R5/resnet_v1_50.xml/download" -O resnet_v1_50.xml

File diff suppressed because one or more lines are too long

@ -0,0 +1,35 @@
#!/bin/bash
set -x
REDIS_HOST=127.0.0.1
REDIS_PORT=6379
FLINK_TASK_MANAGER_IP=127.0.0.1
FLINK_JOB_MANAGER_REST_PORT=8081
FLINK_TASK_MANAGER_DATA_PORT=6124
./start-redis.sh &
echo "redis started"
./start-flink-jobmanager.sh &
echo "flink-jobmanager started"
./init-occlum-taskmanager.sh
echo "occlum flink taskmanager image built"
while ! nc -z $FLINK_TASK_MANAGER_IP $FLINK_JOB_MANAGER_REST_PORT; do
sleep 1
done
./start-flink-taskmanager.sh &
echo "flink-taskmanager started"
while ! nc -z $REDIS_HOST $REDIS_PORT; do
sleep 1
done
./start-http-frontend.sh &
echo "http-frontend started"
while ! nc -z $FLINK_TASK_MANAGER_IP $FLINK_TASK_MANAGER_DATA_PORT; do
sleep 1
done
./start-cluster-serving-job.sh &
echo "cluster-serving-job started"

@ -0,0 +1,15 @@
#!/bin/bash
set -x
flink_home=$FLINK_HOME
flink_job_manager_ip=127.0.0.1
flink_job_manager_port=8081
echo "### Launching Cluster Serving Job ###"
export FLINK_CONF_DIR=${flink_home}/conf && \
${flink_home}/bin/flink run \
-c com.intel.analytics.zoo.serving.ClusterServing -p 1 -m $flink_job_manager_ip:$flink_job_manager_port \
analytics-zoo-bigdl_${BIGDL_VERSION}-spark_${SPARK_VERSION}-${ANALYTICS_ZOO_VERSION}-serving.jar | tee ./cluster-serving-job-sgx.log

@ -0,0 +1,32 @@
#!/bin/bash
set -x
core_num=$CORE_NUM
job_manager_host=127.0.0.1
job_manager_rest_port=8081
job_manager_rpc_port=6123
flink_home=$FLINK_HOME
flink_version=$FLINK_VERSION
echo "### Launching Flink Jobmanager ###"
java \
-Xms5g \
-Xmx10g \
-XX:ActiveProcessorCount=${core_num} \
-Dorg.apache.flink.shaded.netty4.io.netty.tryReflectionSetAccessible=true \
-Dorg.apache.flink.shaded.netty4.io.netty.eventLoopThreads=${core_num} \
-Dcom.intel.analytics.zoo.shaded.io.netty.tryReflectionSetAccessible=true \
-Dlog.file=${flink_home}/log/flink-sgx-standalonesession-1-sgx-ICX-LCC.log \
-Dlog4j.configuration=file:${flink_home}/conf/log4j.properties \
-Dlogback.configurationFile=file:${flink_home}/conf/logback.xml \
-classpath ${flink_home}/lib/flink-table_2.11-${flink_version}.jar:${flink_home}/lib/flink-table-blink_2.11-${flink_version}.jar:${flink_home}/lib/log4j-1.2.17.jar:${flink_home}/lib/slf4j-log4j12-1.7.15.jar:${flink_home}/lib/flink-dist_2.11-${flink_version}.jar::: org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint \
--configDir ${flink_home}/conf \
-D rest.bind-address=${job_manager_host} \
-D rest.bind-port=${job_manager_rest_port} \
-D jobmanager.rpc.address=${job_manager_host} \
-D jobmanager.rpc.port=${job_manager_rpc_port} \
-D jobmanager.heap.size=5g \
--executionMode cluster | tee ./flink-jobmanager-sgx.log

@ -0,0 +1,71 @@
#!/bin/bash
set -e
BLUE='\033[1;34m'
NC='\033[0m'
conf_dir=conf
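# Note: $pid and $postfix are expected from the caller's environment; if unset,
# id falls back to "0" and the log file names contain empty segments.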
id=$([ -f "$pid" ] && echo $(wc -l < "$pid") || echo "0")
FLINK_LOG_PREFIX="/host/flink--$postfix-${id}"
log="${FLINK_LOG_PREFIX}.log"
out="./flink--$postfix-${id}.out"
core_num=1
job_manager_host=127.0.0.1
job_manager_rest_port=8081
job_manager_rpc_port=6123
task_manager_host=127.0.0.1
task_manager_data_port=6124
task_manager_rpc_port=6125
task_manager_taskslots_num=1
flink_home=$FLINK_HOME
flink_version=$FLINK_VERSION
run_taskmanager() {
# enter occlum image
cd flink
#if conf_dir exists, use the new configurations.
if [[ -d $conf_dir && "$(ls -A $conf_dir)" ]]; then
cp -r $conf_dir/* image/opt/conf/
occlum build
fi
echo -e "${BLUE}occlum run JVM taskmanager${NC}"
echo -e "${BLUE}logfile=$log${NC}"
# start task manager in occlum
occlum run /usr/lib/jvm/java-11-openjdk-amd64/bin/java \
-XX:+UseG1GC -Xmx1152m -Xms1152m -XX:MaxDirectMemorySize=512m -XX:MaxMetaspaceSize=256m \
-Dos.name=Linux \
-XX:ActiveProcessorCount=${core_num} \
-Dlog.file=$log \
-Dlog4j.configuration=file:/opt/conf/log4j.properties \
-Dlogback.configurationFile=file:/opt/conf/logback.xml \
-classpath /bin/lib/flink-table-blink_2.11-1.10.1.jar:/bin/lib/flink-table_2.11-1.10.1.jar:/bin/lib/log4j-1.2.17.jar:/bin/lib/slf4j-log4j12-1.7.15.jar:/bin/lib/flink-dist_2.11-1.10.1.jar org.apache.flink.runtime.taskexecutor.TaskManagerRunner \
-Dorg.apache.flink.shaded.netty4.io.netty.tryReflectionSetAccessible=true \
-Dorg.apache.flink.shaded.netty4.io.netty.eventLoopThreads=${core_num} \
-Dcom.intel.analytics.zoo.shaded.io.netty.tryReflectionSetAccessible=true \
--configDir /opt/conf \
-D rest.bind-address=${job_manager_host} \
-D rest.bind-port=${job_manager_rest_port} \
-D jobmanager.rpc.address=${job_manager_host} \
-D jobmanager.rpc.port=${job_manager_rpc_port} \
-D jobmanager.heap.size=5g \
-D taskmanager.host=${task_manager_host} \
-D taskmanager.data.port=${task_manager_data_port} \
-D taskmanager.rpc.port=${task_manager_rpc_port} \
-D taskmanager.numberOfTaskSlots=${task_manager_taskslots_num} \
-D taskmanager.cpu.cores=${core_num} \
-D taskmanager.memory.framework.off-heap.size=256mb \
-D taskmanager.memory.network.max=256mb \
-D taskmanager.memory.network.min=256mb \
-D taskmanager.memory.framework.heap.size=128mb \
-D taskmanager.memory.managed.size=800mb \
-D taskmanager.cpu.cores=1.0 \
-D taskmanager.memory.task.heap.size=1024mb \
-D taskmanager.memory.task.off-heap.size=0mb 2>&1 | tee $out &
}
run_taskmanager

@ -0,0 +1,24 @@
#!/bin/bash
set -x
echo "### Launching HTTP Frontend ###"
redis_host=127.0.0.1
core_num=$CORE_NUM
java \
-Xms2g \
-Xmx8g \
-XX:ActiveProcessorCount=${core_num} \
-Dcom.intel.analytics.zoo.shaded.io.netty.tryReflectionSetAccessible=true \
-Dakka.http.host-connection-pool.max-connections=100 \
-Dakka.http.host-connection-pool.max-open-requests=128 \
-Dakka.actor.default-dispatcher.fork-join-executor.parallelism-min=100 \
-Dakka.actor.default-dispatcher.fork-join-executor.parallelism-max=120 \
-Dakka.actor.default-dispatcher.fork-join-executor.parallelism-factor=1 \
-jar analytics-zoo-bigdl_${BIGDL_VERSION}-spark_${SPARK_VERSION}-${ANALYTICS_ZOO_VERSION}-http.jar \
--redisHost "${redis_host}" \
--tokensPerSecond 30 \
--tokenBucketEnabled true \
--parallelism ${core_num} | tee ./http-frontend-sgx.log

@ -0,0 +1,8 @@
#!/bin/bash
# set -x
echo "### Launching Redis ###"
REDIS_PORT=6379
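# Protected mode is disabled and no password is set; this is only suitable for the local demo setup.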
$REDIS_HOME/src/redis-server --port $REDIS_PORT \
--protected-mode no --maxmemory 10g | tee ./redis-sgx.log

@ -0,0 +1,12 @@
#!/bin/bash
#set -x
# Stop cluster serving
${FLINK_HOME}/bin/flink list | grep RUNNING | awk '{print $4}' | xargs ${FLINK_HOME}/bin/flink cancel
ps -ef | grep http.jar | grep -v grep | awk '{print $2}' | xargs kill -9
# Stop Flink
ps -ef | grep -e TaskManagerRunner -e StandaloneSessionClusterEntrypoint | grep -v grep | awk '{print $2}' | xargs kill -9
# Stop Redis
${REDIS_HOME}/src/redis-cli shutdown