From 7bf9345af2e4958696e6f874aeca2b1a99220b65 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Tue, 9 Feb 2021 19:14:16 -0600 Subject: [PATCH] debug and improve auto_stop for triton server --- HeterogeneousCore/SonicTriton/README.md | 1 + .../SonicTriton/scripts/cmsTriton | 20 +++++++++++++++---- .../SonicTriton/src/TritonService.cc | 3 ++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/SonicTriton/README.md b/HeterogeneousCore/SonicTriton/README.md index 63a52d46d1e57..fa26df21434cf 100644 --- a/HeterogeneousCore/SonicTriton/README.md +++ b/HeterogeneousCore/SonicTriton/README.md @@ -94,6 +94,7 @@ The script has two operations (`start` and `stop`) and the following options: * `-m [dir]`: specific model directory (can be given more than one) * `-n [name]`: name of container instance, also used for hidden temporary dir (default: triton_server_instance) * `-P [port]`: base port number for services (-1: automatically find an unused port range) (default: 8000) +* `-p [pid]`: automatically shut down server when process w/ specified PID ends (-1: use parent process PID) * `-p`: automatically shut down server when parent process ends * `-r [num]`: number of retries when starting container (default: 3) * `-s [dir]`: Singularity sandbox directory (default: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/fastml/triton-torchgeo:20.09-py3-geometric) diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index 3e79f226e2b15..d9a82a1be9c49 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -36,7 +36,7 @@ usage() { $ECHO "-m [dir] \t specific model directory (can be given more than one)" $ECHO "-n [name] \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})" $ECHO "-P [port] \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})" - $ECHO 
"-p \t automatically shut down server when parent process ends" + $ECHO "-p [pid] \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)" $ECHO "-r [num] \t number of retries when starting container (default: ${RETRIES})" $ECHO "-s [dir] \t Singularity sandbox directory (default: ${SANDBOX})" $ECHO "-t [dir] \t non-default hidden temporary dir" @@ -56,7 +56,7 @@ if [ -e /run/shm ]; then SHM=/run/shm fi -while getopts "cDdfgi:M:m:n:P:pr:s:t:vw:h" opt; do +while getopts "cDdfgi:M:m:n:P:p:r:s:t:vw:h" opt; do case "$opt" in c) CLEANUP="" ;; @@ -78,7 +78,7 @@ while getopts "cDdfgi:M:m:n:P:pr:s:t:vw:h" opt; do ;; P) if [ "$OPTARG" -eq -1 ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi ;; - p) PARENTPID="$PPID" + p) if [ "$OPTARG" -eq -1 ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi ;; r) RETRIES="$OPTARG" ;; @@ -297,13 +297,25 @@ auto_stop(){ PARENTPID="$2" if [ -n "$PARENTPID" ]; then + if [ -n "$VERBOSE" ]; then + echo "watching PID $PARENTPID" + ps + fi PCOUNTER=0 PMAX=5 while [ "$PCOUNTER" -le "$PMAX" ]; do if ! 
kill -0 $PARENTPID >& /dev/null; then PCOUNTER=$((PCOUNTER+1)) + if [ -n "$VERBOSE" ]; then + echo "trigger $PCOUNTER:" + ps + fi else - # must get 5 in a row, otherwise reset + # must get N in a row, otherwise reset + if [ "$PCOUNTER" -gt 0 ] && [ -n "$VERBOSE" ]; then + echo "reset:" + ps + fi PCOUNTER=0 fi sleep 1 diff --git a/HeterogeneousCore/SonicTriton/src/TritonService.cc b/HeterogeneousCore/SonicTriton/src/TritonService.cc index d3421b0e60b8d..7ad47b4eded3f 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonService.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonService.cc @@ -18,6 +18,7 @@ #include #include #include +#include <unistd.h> namespace ni = nvidia::inferenceserver; namespace nic = ni::client; @@ -207,7 +208,7 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: } //assemble server start command - std::string command("cmsTriton -p -P -1"); + std::string command("cmsTriton -P -1 -p " + std::to_string(::getpid())); if (fallbackOpts_.debug) command += " -c"; if (fallbackOpts_.verbose)