[example] Add k8s deployment reference
parent a871984e5b
commit a98ff93b0c
@ -239,8 +239,14 @@ Alternatively, to use Occlum without Docker, one can install Occlum on popular L
There are many projects that demonstrate how Occlum can be used to build and run user applications; they can be found in [`demos`](./demos/).
### Deployment by Docker
There is also a whole-flow confidential inference service [`example`](./example/) that demonstrates how to convert a real application directly from a Docker image to an Occlum image, how to integrate the Occlum [`Init-RA`](./demos/remote_attestation/init_ra_flow/) solution for whole-flow sensitive data protection, and how to generate and run Docker-container-based Occlum instances.
### Deployment by Kubernetes
Moreover, an example of deploying the Occlum Confidential Inference Service on Kubernetes can be found [`here`](./example/kubernetes/).
## How to Build?
To build Occlum from the latest source code, follow the steps below in an Occlum Docker container (which can be prepared as shown in the previous section):
@ -13,6 +13,8 @@ This example introduces the development and deployment of a whole-flow confident
* How to build a minimum-size Docker container image based on the Occlum package.
* How to deploy the Occlum Confidential Inference Service on Kubernetes; please refer to [`kubernetes`](./kubernetes/).
## Overview

example/client/benchmark.py (new file, 115 lines)
@ -0,0 +1,115 @@
import grpc
import tensorflow as tf
import argparse, time, grpc, asyncio

from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc


class benchmark_engine(object):
    def __init__(self, server, image, certificate, concurrent_num=64, response_time=10):
        self.server = server
        self.response_time = response_time
        self.concurrent_num = concurrent_num
        self.image = image
        self.certificate = certificate
        self.request_signatures = []
        self.request_stubs = []
        self.request_response_list = {}
        self.__prepare__()
        pass

    def __prepare__(self):
        for idx in range(self.concurrent_num):
            # get image array
            with open(self.image, 'rb') as f:
                input_name = 'images'
                input_shape = [1]
                input_data = f.read()

            # create request
            request = predict_pb2.PredictRequest()
            request.model_spec.name = 'INCEPTION'
            request.model_spec.signature_name = 'predict_images'
            request.inputs[input_name].CopyFrom(
                tf.make_tensor_proto(input_data, shape=input_shape))

            self.request_signatures.append(request)
        return None

    async def __connection__(self, task_idx, loop_num):
        request_signatures = self.request_signatures[task_idx]
        response_list = []

        # create channel
        creds = grpc.ssl_channel_credentials(root_certificates=open(self.certificate, 'rb').read())
        async with grpc.aio.secure_channel(self.server, creds) as channel:
            stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
            format_string = 'query: {} channel, task {}, loop_idx {}, latency(ms) {:.1f}, tps: {:.1f}'
            for loop_idx in range(loop_num):
                start_time = time.time()
                response = await stub.Predict(request_signatures)
                stop_time = time.time()
                latency = stop_time - start_time
                tps = 1 / latency
                response_list.append([response, latency])
                print(format_string.format('secure', task_idx, loop_idx, 1000*latency, tps))
        return response_list

    def run(self, loop_num):
        start_time = time.time()

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        connections = []
        self.request_response_list.clear()
        for idx in range(self.concurrent_num):
            connections.append(asyncio.ensure_future(self.__connection__(idx, loop_num)))

        loop.run_until_complete(asyncio.wait(connections))
        loop.close()

        stop_time = time.time()

        response_list = [connections[idx].result() for idx in range(self.concurrent_num)]

        request_time = 0
        for c_idx in range(self.concurrent_num):
            if loop_num != 0:
                for l_idx in range(loop_num):
                    request_time += response_list[c_idx][l_idx][1]

        if loop_num != 0:
            e2e_time = stop_time - start_time
            request_num = self.concurrent_num * loop_num
            latency = request_time / request_num
            tps = request_num * 1 / e2e_time
            format_string = 'summary: cnum {}, e2e time(s) {}, average latency(ms) {}, tps: {}'
            print(format_string.format(self.concurrent_num, e2e_time, 1000*latency, tps))
        pass

def main():
    benchmark_app = benchmark_engine(args.server, args.image, args.crt, args.cnum)
    if args.loop == 0:
        print("loop parameter needs to be bigger than 0")
        return

    # warm up
    benchmark_app.run(5)
    # start loop
    benchmark_app.run(args.loop)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--server', default='localhost:9000',
                        help='Tensorflow Model Server Address')
    parser.add_argument('--crt', default=None, type=str, help='TLS certificate file path')
    parser.add_argument('--image', default='Siberian_Husky_bi-eyed_Flickr.jpg',
                        help='Path to the image')
    parser.add_argument('--cnum', default=8, type=int, help='Concurrent connection num')
    parser.add_argument('--loop', default=100, type=int, help='Requests loop num, should > 0')
    args = parser.parse_args()

    main()
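For reference, a typical direct invocation of the benchmark above (a sketch only; it assumes the packages from `requirements.txt` are installed, a TLS-enabled serving endpoint is reachable at the given address, and `server.crt` plus a test image are present in the working directory) looks like:
```
$ python3 benchmark.py --server localhost:9001 --crt server.crt --image cat.jpg --cnum 4 --loop 10
```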
@ -1,5 +1,3 @@
from __future__ import print_function

import grpc
import tensorflow as tf
import argparse
@ -1,3 +1,4 @@
grpcio>=1.34.0
aiohttp>=3.7.0
tensorflow>=2.3.0
tensorflow-serving-api>=2.3.0
example/container/Dockerfile_client (new file, 13 lines)
@ -0,0 +1,13 @@
FROM python:3.8
LABEL maintainer="Qi Zheng <huaiqing.zq@antgroup.com>"

ARG pip_mirror

COPY ./client /app

# RUN apt update && apt install -y python3-opencv

RUN pip install ${pip_mirror} -r /app/requirements.txt

WORKDIR /app
CMD ["bash"]
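For a standalone build of just this client image (a sketch of what the `build.sh` script below automates; it assumes it is run from the `example/` directory, and the `pip_mirror` build argument is optional), one could run:
```
$ docker build --build-arg pip_mirror="-i https://pypi.douban.com/simple" -f container/Dockerfile_client -t tf_demo_client:latest .
```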
example/kubernetes/README.md (new file, 148 lines)
@ -0,0 +1,148 @@
# Deploy Occlum Confidential Inference Service on Kubernetes

Besides traditional Docker deployment, [`Kubernetes`](https://kubernetes.io/docs/concepts/overview/) is the most popular platform for managing containerized workloads and services. This example introduces a solution to deploy a scalable Occlum Confidential Inference Service on a single Kubernetes cluster.

## Overview



For the technical background of the GRPC-RATLS based remote attestation and the details of the secrets protection, please refer to [`README.md`](../README.md). A brief introduction to the Kubernetes components follows.

### Key broker service

The GRPC-RATLS server pod works as a key broker service, dispatching secrets per request over the GRPC-RATLS connections from the tensorflow serving pods.

### TF Service

In this case, multiple tensorflow serving pods are exposed as a TF service of NodePort type.
The number of tensorflow serving pods can be scaled according to the actual EPC resources.

### Client

Clients can send inference requests with the server certificate (`server.crt`) to the TF service.

## Environment set up

### Prerequisites

* Aliyun confidential-computing-enhanced ECS nodes (g7t) with 32GB EPC

* Ubuntu 20.04 LTS with an SGX-supported kernel

The kernel can be installed with the command below.
```
$ sudo apt install --install-recommends linux-generic-hwe-20.04
```
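After installing the kernel and rebooting, the running kernel version can be confirmed with:
```
$ uname -r
```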

* A single Kubernetes cluster with at least one worker node

In this case, there is one master node and one worker node, both running on Aliyun g7t ECS with 32GB EPC.

* Intel SGX device plugin installed

Please refer to [`INTEL SGX Plugin Installation`](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/sgx_plugin/README.md#installation) for details.
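A quick way to verify that the plugin is advertising SGX EPC resources on a worker node (a minimal check; `<node-name>` is a placeholder, and the resource name matches the `sgx.intel.com/epc` limits used in the yaml files below):
```
$ kubectl describe node <node-name> | grep sgx.intel.com/epc
```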

## How to build

A script, [`./build.sh`](build.sh), is provided to build the container images.
```
Build Occlum TF examples container images for k8s deployment.
usage: build.sh [OPTION]...
    -r <container image registry> the container image registry
    -g <tag> container image tag
    -d <grpc_server_domain> GRPC RA server domain
    -p <grpc_server_port> GRPC RA server port
```

For example, the command below generates three container images.
```
# ./build.sh -r demo -g 0.28.0 -d init-ra-server-svc -p 5000
```

* **`demo/init_ra_server:0.28.0`** acts as the key broker pod.
* **`demo/tf_demo:0.28.0`** acts as the tensorflow serving pod.
* **`demo/tf_demo_client:0.28.0`** acts as the client.

## How to test

### Modify the template yaml files

* Modify the container image names and tags according to your build command (one way to do this is sketched after this list).
* Modify the port according to your build command.
* Modify the `PCCS_URL` environment value according to your environment.

* Update the resource limits if necessary.
```
resources:
  limits:
    sgx.intel.com/epc: "8000Mi"
    cpu: "1000m"
```
In this case, an SGX EPC memory size of "8000Mi" is used for inference because the Occlum `user_space_size` is set to "7000MB" at build time. The cpu limit of "1000m" restricts the CPU usage of each Occlum inference pod, so that the performance gain from scaling out can be shown in the `benchmark`.

* The args `"taskset -c 2,3,4,5"` are necessary until the Occlum v1.0 release. They limit the CPU cores used by tensorflow serving so that the number of SGX threads does not exceed the `max_num_of_threads` defined at build time.
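As a concrete illustration of the first item above (a sketch only; it assumes the images were built with the example command `./build.sh -r demo -g 0.28.0 -d init-ra-server-svc -p 5000`), the image placeholders in the templates could be filled in like this:
```
$ sed -i 's|<registry>/init_ra_server:<tag>|demo/init_ra_server:0.28.0|' occlum_init_ra_server.yaml
$ sed -i 's|<registry>/tf_demo:<tag>|demo/tf_demo:0.28.0|' occlum_tf_demo.yaml
```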

### Start the key broker service

```
$ kubectl apply -f occlum_init_ra_server.yaml
```

Wait a while; if it starts successfully, the pod log shows the line below.
```
Server listening on 0.0.0.0:5000
```
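One way to check the pod log (a sketch; the deployment name comes from `occlum_init_ra_server.yaml`):
```
$ kubectl logs deployment/init-ra-server-deployment
```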

### Start the tensorflow serving pods

```
$ kubectl apply -f occlum_tf_demo.yaml
```

Wait a while; if it starts successfully, the pod log shows the line below.
```
Running gRPC ModelServer at 0.0.0.0:9001 ...
```

By default, there is only one replica of the tensorflow serving pod.
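The running pods can be listed with (a sketch; the label comes from `occlum_tf_demo.yaml`):
```
$ kubectl get pods -l app=tf-demo
```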

### Try the inference request

```
$ docker run --rm --network host sevenzheng/tf_demo_client:0.28.0 python3 inception_client.py --server=localhost:31001 --crt server.crt --image cat.jpg
```

If successful, it prints the classification results.
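Port `31001` is the NodePort exposed by the `tf-demo-svc` service defined in `occlum_tf_demo.yaml`; when the client runs on a different machine, replace `localhost` with a worker node IP. The mapping can be inspected with:
```
$ kubectl get svc tf-demo-svc
```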

### Benchmark

The command below runs a benchmark test against the tensorflow serving service running in Occlum.

```
$ docker run --rm --network host sevenzheng/tf_demo_client:0.28.0 python3 benchmark.py --server localhost:31001 --crt server.crt --cnum 4 --loop 10 --image cat.jpg
```

Scaling up the number of tensorflow serving pods yields a better `tps`.
For example, scale up to 3 replicas.
```
$ kubectl scale deploy tf-demo-deployment --replicas 3
```

## Alternatives

### Running environment

Cloud Service Providers (CSPs), such as Azure and Aliyun, provide confidential computing infrastructure (SGX TEE). This example assumes a self-created Kubernetes cluster running on Aliyun, but theoretically it can run on any SGX VM or bare-metal based Kubernetes cluster.

### Attestation

This example uses a GRPC-RATLS based remote attestation solution. With minor changes, it can accommodate the other attestation solutions listed below. The key point is to do the remote attestation in the Occlum init process, leaving the application unmodified as far as possible.

#### [Microsoft Azure Attestation](https://docs.microsoft.com/en-us/azure/attestation/overview)

Put the RA operation in the Occlum init process, just like the demo [`maa_init`](../../demos/remote_attestation/azure_attestation/maa_init/). Users can then use Azure Key Vault as the key broker service.

#### [KubeTEE AECS](https://github.com/SOFAEnclave/enclave-configuration-service)

The AECS server acts as the key broker service; the AECS client is put in the Occlum init process of the tensorflow serving pod.
example/kubernetes/build.sh (new executable file, 72 lines)
@ -0,0 +1,72 @@
#!/bin/bash
set -e

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
top_dir=$(dirname "${script_dir}")

# pip mirror is used to accelerate the speed of python pip
pip_mirror="-i https://pypi.douban.com/simple"

registry="demo"
tag="latest"
grpc_server_domain="init-ra-server-svc"
grpc_server_port="5000"

function usage {
    cat << EOM
Build Occlum TF examples container images for k8s deployment.
usage: $(basename "$0") [OPTION]...
    -r <container image registry> the container image registry
    -g <tag> container image tag
    -d <grpc_server_domain> GRPC RA server domain
    -p <grpc_server_port> GRPC RA server port
    -h <usage> usage help
EOM
    exit 0
}

function process_args {
    while getopts ":r:g:d:p:h" option; do
        case "${option}" in
            r) registry=${OPTARG};;
            g) tag=${OPTARG};;
            d) grpc_server_domain=${OPTARG};;
            p) grpc_server_port=${OPTARG};;
            h) usage;;
        esac
    done
}

process_args "$@"

echo ""
echo "############################"
echo "Build Occlum TF examples container images for k8s deployment"
echo "    Container images registry: ${registry}"
echo "    Container images tag: ${tag}"
echo "    GRPC RA server domain: ${grpc_server_domain}"
echo "    GRPC RA server port: ${grpc_server_port}"
echo ""

pushd ${top_dir}
echo "Build Occlum instances first ..."
./build_content.sh ${grpc_server_domain} ${grpc_server_port}

echo ""
echo "Build Occlum container images ..."
./build_container_images.sh ${registry} ${tag}

echo ""
echo "Build demo client container image ..."
cp ./ssl_configure/server.crt ./client/
docker build \
    --network host \
    --build-arg http_proxy=$http_proxy \
    --build-arg https_proxy=$https_proxy \
    --build-arg pip_mirror="${pip_mirror}" \
    -f container/Dockerfile_client \
    -t ${registry}/tf_demo_client:${tag} .

echo "Build is done"

popd
example/kubernetes/occlum_init_ra_server.yaml (new file, 55 lines)
@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: init-ra-server-deployment
spec:
  selector:
    matchLabels:
      app: init-ra-server
  replicas: 1
  template:
    metadata:
      labels:
        app: init-ra-server
    spec:
      containers:
      - name: init-ra-server-1
        image: <registry>/init_ra_server:<tag>
        # imagePullPolicy: Always
        imagePullPolicy: IfNotPresent
        args: ["occlum","run", "/bin/server", "0.0.0.0:5000"]
        ports:
        - containerPort: 5000
        volumeMounts:
        - name: sgx-enclave
          mountPath: /dev/sgx/enclave
        - name: sgx-provision
          mountPath: /dev/sgx/provision
        env:
        - name: PCCS_URL
          value: https://sgx-dcap-server.cn-shanghai.aliyuncs.com/sgx/certification/v3/
        securityContext:
          privileged: true
        resources:
          limits:
            sgx.intel.com/epc: "1000Mi"
      volumes:
      - name: sgx-enclave
        hostPath:
          path: /dev/sgx_enclave
      - name: sgx-provision
        hostPath:
          path: /dev/sgx_provision
---
apiVersion: v1
kind: Service
metadata:
  name: init-ra-server-svc
spec:
  # type: NodePort
  ports:
  - port: 5000
    targetPort: 5000
    # nodePort: 31002
  selector:
    app: init-ra-server
example/kubernetes/occlum_tf_demo.yaml (new file, 68 lines)
@ -0,0 +1,68 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tf-demo-deployment
spec:
  selector:
    matchLabels:
      app: tf-demo
  replicas: 1
  template:
    metadata:
      labels:
        app: tf-demo
    spec:
      containers:
      - name: tf-demo-1
        image: <registry>/tf_demo:<tag>
        # imagePullPolicy: Always
        imagePullPolicy: IfNotPresent
        env:
        - name: PCCS_URL
          value: https://sgx-dcap-server.cn-shanghai.aliyuncs.com/sgx/certification/v3/
        args:
        - taskset
        - -c
        - 2,3,4,5
        - occlum
        - run
        - /bin/tensorflow_model_server
        - --model_name=INCEPTION
        - --model_base_path=/model/INCEPTION/INCEPTION
        - --port=9001
        - --ssl_config_file=/etc/tf_ssl.cfg
        ports:
        - name: grpc
          containerPort: 9001
        volumeMounts:
        - name: sgx-enclave
          mountPath: /dev/sgx/enclave
        - name: sgx-provision
          mountPath: /dev/sgx/provision
        securityContext:
          privileged: true
        resources:
          limits:
            sgx.intel.com/epc: "8000Mi"
            cpu: "1000m"
      volumes:
      - name: sgx-enclave
        hostPath:
          path: /dev/sgx_enclave
      - name: sgx-provision
        hostPath:
          path: /dev/sgx_provision
---
apiVersion: v1
kind: Service
metadata:
  name: tf-demo-svc
spec:
  type: NodePort
  ports:
  - name: grpc
    port: 9001
    targetPort: 9001
    nodePort: 31001
  selector:
    app: tf-demo
BIN example/kubernetes/overview.png (new binary file, 158 KiB; not shown)