Update XGBoost to the latest version

Intel has optimized XGBoost in the latest version, and benchmarks show a
clear speedup. In SVM training with the Yahoo LTR data, the latest version
is 7.8X faster on the host and 2X faster in Occlum.

Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
yuanwu 2018-02-06 19:01:29 +00:00 committed by Zongmin.Gu
parent 4048686c3b
commit 66b64f8276
5 changed files with 46 additions and 35 deletions

@ -9,11 +9,14 @@ apt-get update
apt-get install -y python3-pip python3-setuptools
pip3 install kubernetes
#install the cmake
./install_cmake.sh
# Download and build XGBoost
rm -rf xgboost_src && mkdir xgboost_src
pushd xgboost_src
git clone https://github.com/dmlc/xgboost .
git checkout 6d5b34d82486cd1d0480c548f5d1953834659bd6
git checkout 9e955fb9b06cac32a06c92c4715f749d9d87e932
git submodule init
git submodule update
git apply ../patch/xgboost-01.diff

8
demos/xgboost/install_cmake.sh Executable file

@ -0,0 +1,8 @@
#!/bin/bash
# Download, build, and install CMake from source into the default prefix.
#
# The source tarball is fetched into the invoking user's home directory,
# unpacked there, bootstrapped, compiled, and installed.
set -e

# Single place to bump the CMake release that gets built.
CMAKE_VERSION=3.15.5

cd ~

# Keep download and extraction as separate commands: under `set -e`, a
# failure on the left-hand side of `&&` does NOT abort the script, so a
# failed download would otherwise go unnoticed until a later step.
wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz"
tar xf "cmake-${CMAKE_VERSION}.tar.gz"

cd "cmake-${CMAKE_VERSION}"
./bootstrap
make -j4

# When already running as root (e.g. inside a container) `sudo` may not be
# installed; only escalate when it is actually needed.
if [ "$(id -u)" -eq 0 ]; then
    make install
else
    sudo make install
fi

@ -1,18 +1,18 @@
diff --git a/tracker/dmlc_tracker/local.py b/tracker/dmlc_tracker/local.py
index dff7c17..fca0f9d 100644
index 6e4af12..09df15e 100644
--- a/tracker/dmlc_tracker/local.py
+++ b/tracker/dmlc_tracker/local.py
@@ -24,6 +24,13 @@ def exec_cmd(cmd, role, taskid, pass_env):
num_retry = env.get('DMLC_NUM_ATTEMPT', 0)
@@ -26,6 +26,13 @@ def exec_cmd(cmd, num_attempt, role, taskid, pass_env):
num_retry = env.get('DMLC_NUM_ATTEMPT', num_attempt)
num_trial = 0
+ cmd_str = ''
+ for k, v in env.items():
+ if str(k)[0:4] == 'DMLC':
+ strenv = str(k) + '=' + str(v);
+ cmd_str = cmd_str + ' ' + strenv
+ cmd = cmd + cmd_str
+ cmdline = cmdline + ' ' + cmd_str
+
logging.debug('num of retry %d',num_retry)
while True:
if os.name == 'nt':
ret = subprocess.call(cmd, shell=True, env=env)

@ -1,36 +1,36 @@
diff --git a/src/allreduce_base.cc b/src/allreduce_base.cc
index 143db6e..a6daf20 100644
--- a/src/allreduce_base.cc
+++ b/src/allreduce_base.cc
@@ -486,12 +486,13 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc
index d1959eaa..68cd377a 100644
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -551,12 +551,13 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_,
// select must return
watcher.Poll();
// exception handling
- for (int i = 0; i < nlink; ++i) {
+ //FIXME:workaround for Occlum
+ /*for (int i = 0; i < nlink; ++i) {
+ // FIXME:workaround for Occlum
+ /* for (int i = 0; i < nlink; ++i) {
// recive OOB message from some link
if (watcher.CheckExcept(links[i].sock)) {
return ReportError(&links[i], kGetExcept);
}
- }
+ }*/
+ } */
// read data from childs
for (int i = 0; i < nlink; ++i) {
if (i != parent_index && watcher.CheckRead(links[i].sock)) {
@@ -641,12 +642,13 @@ AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
@@ -731,12 +732,13 @@ AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) {
// select
watcher.Poll();
// exception handling
- for (int i = 0; i < nlink; ++i) {
+ //FIXME:workaround for Occlum
+ /*for (int i = 0; i < nlink; ++i) {
+ // FIXME:workaround for Occlum
+ /* for (int i = 0; i < nlink; ++i) {
// recive OOB message from some link
if (watcher.CheckExcept(links[i].sock)) {
return ReportError(&links[i], kGetExcept);
}
- }
+ }*/
+ } */
if (in_link == -2) {
// probe in-link
for (int i = 0; i < nlink; ++i) {

@ -1,21 +1,21 @@
diff --git a/src/cli_main.cc b/src/cli_main.cc
index faa93ad..a201615 100644
index 5c602f37..d18a77c2 100644
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -339,6 +339,16 @@ int CLIRunTask(int argc, char *argv[]) {
printf("Usage: <config>\n");
return 0;
}
@@ -444,6 +444,16 @@ class CLI {
this->PrintHelp();
exit(1);
}
+
+ //FIXME:workaroud for local distributed machine learning on Occlum
+ for (int i = 2; i < argc; ++i) {
+ char name[128], val[128];
+ if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
+ //LOG(CONSOLE) << "setenv: " << std::string(name) << " = " << std::string(val);
+ setenv(std::string(name).c_str(), std::string(val).c_str(), 1);
+ // FIXME:workaroud for local distributed machine learning on Occlum
+ for (int i = 2; i < argc; ++i) {
+ char name[128], val[128];
+ if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
+ //LOG(CONSOLE) << "setenv: " << std::string(name) << " = " << std::string(val);
+ setenv(std::string(name).c_str(), std::string(val).c_str(), 1);
+ }
+ }
+ }
+
rabit::Init(argc, argv);
std::vector<std::pair<std::string, std::string> > cfg;
for (int i = 0; i < argc; ++i) {
std::string str {argv[i]};
if (str == "-h" || str == "--help") {