Skip to content

Commit

Permalink
[opt](hive docker)Parallel put hive data (#46571) (#46682)
Browse files Browse the repository at this point in the history
Problem Summary:
Upload the `tpch1.db`, `paimon1`, and `tvf_data` Hive data in parallel. This
reduces the time cost from 22m to 16m on a 16-core (16C) machine.

Change-Id: Ib75c57d397ce1f96d5108d4b570bcb215f31d421
  • Loading branch information
Thearas authored Jan 9, 2025
1 parent 521e653 commit eddea8b
Showing 1 changed file with 32 additions and 17 deletions.
49 changes: 32 additions & 17 deletions docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

# Abort on the first failing command (-e) and trace each command as it runs (-x).
set -e -x

# Number of processors currently online; passed to `xargs -P` further down
# as the number of parallel jobs.
parallel=$(getconf _NPROCESSORS_ONLN)

# Start the Hive metastore service in the background; nohup keeps it running
# if the launching shell receives a hangup.
nohup /opt/hive/bin/hive --service metastore &

# wait metastore start
Expand All @@ -37,7 +39,7 @@ done
touch "${lockfile1}"

DATA_DIR="/mnt/scripts/data/"
find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P 10 -I {} sh -c '
find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${parallel}" -I {} sh -c '
START_TIME=$(date +%s)
chmod +x "{}" && "{}"
END_TIME=$(date +%s)
Expand Down Expand Up @@ -92,45 +94,58 @@ fi
rm -f "${lockfile2}"

# put data file
# NOTE(review): this span comes from a rendered diff whose +/- markers were
# stripped. It appears to interleave the earlier synchronous `hadoop fs -put`
# steps with the newer backgrounded `hadoop fs -copyFromLocal -f` steps —
# confirm against the real script before relying on the exact sequence.
# PIDs of the backgrounded uploads, collected for the `wait` further down.
hadoop_put_pids=()
hadoop fs -mkdir -p /user/doris/


## put tpch1
# Stop early when the local directory is missing or empty (`ls` prints nothing).
if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then
echo "tpch1.db does not exist"
exit 1
fi
hadoop fs -mkdir -p /user/doris/
# Synchronous upload followed by an immediate check that the target lists.
hadoop fs -put /mnt/scripts/tpch1.db /user/doris/
if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
echo "tpch1.db put failed"
exit 1
fi
# Backgrounded upload; -f overwrites the destination if it already exists.
# $! is the PID of the job just started, remembered for the later `wait`.
hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ &
hadoop_put_pids+=($!)

## put paimon1
# Stop early when the local directory is missing or empty.
if [[ -z "$(ls /mnt/scripts/paimon1)" ]]; then
echo "paimon1 does not exist"
exit 1
fi
# Synchronous upload followed by an immediate check that the target lists.
hadoop fs -put /mnt/scripts/paimon1 /user/doris/
if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
echo "paimon1 put failed"
exit 1
fi
# Backgrounded upload; PID recorded for the later `wait`.
hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ &
hadoop_put_pids+=($!)

## put tvf_data
# Stop early when the local directory is missing or empty.
if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then
echo "tvf_data does not exist"
exit 1
fi
# Synchronous upload (no listing check follows this one directly).
hadoop fs -put /mnt/scripts/tvf_data /user/doris/
# Backgrounded upload; PID recorded for the later `wait`.
hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ &
hadoop_put_pids+=($!)

## put other preinstalled data
# Backgrounded upload; unlike the three data sets above, no local existence
# check precedes it and no listing check follows the `wait`.
hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ &
hadoop_put_pids+=($!)


# wait put finish
# errexit is switched off around `wait` so that a non-zero status from it does
# not abort the script at this point; the listing checks below decide whether
# each upload succeeded.
set +e
wait "${hadoop_put_pids[@]}"
set -e
# An empty listing for a destination path is treated as a failed upload.
if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
echo "paimon1 put failed"
exit 1
fi
if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
echo "tpch1.db put failed"
exit 1
fi
if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then
echo "tvf_data put failed"
exit 1
fi

## put other preinstalled data
# Synchronous upload of the preinstalled data.
hadoop fs -put /mnt/scripts/preinstalled_data /user/doris/

# create tables
ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P 10 -I {} bash -c '
ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${parallel}" -I {} bash -c '
START_TIME=$(date +%s)
hive -f {}
END_TIME=$(date +%s)
Expand Down

0 comments on commit eddea8b

Please sign in to comment.