Skip to content
Snippets Groups Projects
run.sh 4.33 KiB
#!/bin/bash
# Strict mode: stop on command failure (errexit), on use of unset variables
# (nounset) and on failing pipeline stages (pipefail); noglob switches
# pathname expansion off.
set -o errexit -o nounset -o noglob -o pipefail
# Command substitutions inherit errexit as well.
shopt -s inherit_errexit

# Default and upper bound for the Slurm time limit of submitted CI jobs.
MAX_CI_JOB_RUNTIME=120

# Escape sequences for coloured terminal output (expanded by `echo -e`).
TXT_RED="\e[31m"
TXT_BLUE="\e[94m"
TXT_GREEN="\e[32m"
TXT_CLEAR="\e[0m"
TXT_BOLD="\e[1m"

# Abort early (via errexit) when a required external tool cannot be found.
for TOOL in awk chown diff getent id runuser salloc sed srun ssh-keygen sinfo; do
    hash "$TOOL"
done
unset TOOL

# Print an error message in bold red on stderr and return a failure status.
# Arguments: $1 - message text
#            $2 - optional status to return
# Returns:   $2 when given, otherwise $BUILD_FAILURE_EXIT_CODE, otherwise 1.
# The script runs under `set -e`, so the non-zero return terminates it when
# the call is the last command of a list.
function error {
    # Local on purpose: a global would make the first status used stick for
    # every later call and could be pre-seeded from the environment.
    local rv=${2:-${BUILD_FAILURE_EXIT_CODE:-1}}
    echo -e "${TXT_RED}${TXT_BOLD}$1${TXT_CLEAR}" >&2
    return "$rv"
}

## User authentication

# ${VAR:-} keeps `set -u` from aborting with a bare "unbound variable"
# message before the explanatory error can be printed.
[ -n "${CUSTOM_ENV_AUTH_USER:-}" ] || error "AUTH_USER CI/CD variable has not been set."
[ -n "${CUSTOM_ENV_AUTH_KEY:-}" ] || error "AUTH_KEY secret CI/CD variable has not been set."

AUTH_USER=$CUSTOM_ENV_AUTH_USER

## Check if the user exists and the validity of its ID
AUTH_UID=$(id -u -- "$AUTH_USER" 2>/dev/null) || error "User $AUTH_USER does not exist"
(( AUTH_UID >= 1000 )) || error "User $AUTH_USER ID within system reserved range."

## Use a key pair to authenticate the user (private key has to be set as a GitLab CI/CD variable)
AUTH_KEY=$CUSTOM_ENV_AUTH_KEY
AUTH_PUB=/etc/gitlab-runner/authorized_keys

# Convert the supplied key once. A key that cannot be parsed produces no
# output and must never be compared: two empty conversions (bad key vs. a
# blank or comment line in $AUTH_PUB) would otherwise count as a match.
AUTH_KEY_EXPORT=$(ssh-keygen -y -e -f /dev/stdin <<< "$AUTH_KEY") || AUTH_KEY_EXPORT=""

AUTHENTICATED=0
if [ -n "$AUTH_KEY_EXPORT" ] && [ -r "$AUTH_PUB" ]; then
    # `|| [ -n "$PUB" ]` also handles a final line without trailing newline.
    while read -r PUB || [ -n "$PUB" ]
    do
        # Lines that are not valid keys are skipped instead of compared.
        PUB_EXPORT=$(ssh-keygen -y -e -f /dev/stdin <<< "$PUB") || continue
        if [ -n "$PUB_EXPORT" ] && [ "$PUB_EXPORT" = "$AUTH_KEY_EXPORT" ]; then
            AUTHENTICATED=1
            break
        fi
    done < "$AUTH_PUB"
fi
[ "$AUTHENTICATED" = 1 ] || error "Authentication for user $AUTH_USER failed."

### Env setup

# Look the passwd entry up once; fields 6 and 7 are the home directory and
# the login shell. A failing lookup aborts the script through errexit.
PASSWD_ENTRY=$(getent passwd "$AUTH_USER")
IFS=: read -r _ _ _ _ _ AUTH_USER_HOME AUTH_USER_SHELL <<< "$PASSWD_ENTRY"

# $WORK is taken from the user's own login environment.
AUTH_USER_WORK=$(runuser "$AUTH_USER" --login --command "echo \$WORK")

[ -n "$AUTH_USER_WORK" ] || error "Could not determine $AUTH_USER's \$WORK directory."

# Hand the runner's temporary directory to the job user. `:?` refuses an
# empty TMPDIR and `--` stops option parsing before the operands.
chown -R -- "$AUTH_USER" "${TMPDIR:?}"

BASE_DIR=$AUTH_USER_WORK/gitlab-runner/builds/${CUSTOM_ENV_CI_CONCURRENT_PROJECT_ID:?}/${CUSTOM_ENV_CI_PROJECT_NAMESPACE:?}/${CUSTOM_ENV_CI_PROJECT_NAME:?}

# The path is quoted inside the command string so that whitespace in it
# cannot split the mkdir argument in the user's shell.
runuser "$AUTH_USER" --login --command "mkdir -p \"$BASE_DIR\""

# Convert a Slurm time specification into whole minutes (seconds round up).
# Accepted forms: "minutes", "minutes:seconds", "hours:minutes:seconds",
# "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds".
# Arguments: $1 - time specification
# Outputs:   the number of minutes on stdout
# Returns:   0 on success, 1 when the specification is not recognised
function slurm_time_to_minutes {
    local spec=${1:-}
    local days=0 hours=0 minutes=0 seconds=0
    local re_days='^([0-9]+)-([0-9]+)(:([0-9]+)(:([0-9]+))?)?$'
    local re_hms='^([0-9]+):([0-9]+):([0-9]+)$'
    local re_ms='^([0-9]+)(:([0-9]+))?$'

    if [[ $spec =~ $re_days ]]; then
        days=${BASH_REMATCH[1]}
        hours=${BASH_REMATCH[2]}
        minutes=${BASH_REMATCH[4]:-0}
        seconds=${BASH_REMATCH[6]:-0}
    elif [[ $spec =~ $re_hms ]]; then
        hours=${BASH_REMATCH[1]}
        minutes=${BASH_REMATCH[2]}
        seconds=${BASH_REMATCH[3]}
    elif [[ $spec =~ $re_ms ]]; then
        minutes=${BASH_REMATCH[1]}
        seconds=${BASH_REMATCH[3]:-0}
    else
        return 1
    fi

    # 10# forces base 10 so that values such as "08" are not read as octal.
    echo $(( (10#$days * 24 + 10#$hours) * 60 + 10#$minutes + (10#$seconds + 59) / 60 ))
}

if [[ ("$2" == "step_script" || "$2" == "build_script") && "${CUSTOM_ENV_NO_SLURM_SUBMIT:-}" != 1 ]]; then
    ## The script is the one specified in the gitlab-ci.yml script directive and
    ## Slurm submission has not been disabled through the NO_SLURM_SUBMIT variable

    # Re-export every CUSTOM_ENV_SLURM_* variable as SLURM_*. Expanding the
    # variable names (instead of parsing `env` output) keeps values that
    # contain whitespace or '=' intact.
    for NAME in "${!CUSTOM_ENV_SLURM_@}"
    do
        export "${NAME#CUSTOM_ENV_}=${!NAME}"
    done

    : "${SLURM_JOB_NAME:="gitlab-ci-${CUSTOM_ENV_CI_PROJECT_NAME:?}-${CUSTOM_ENV_CI_PIPELINE_ID:?}-${CUSTOM_ENV_CI_JOB_ID:?}"}"
    : "${SLURM_TIMELIMIT:=$MAX_CI_JOB_RUNTIME}"
    : "${SLURM_TIME:=$SLURM_TIMELIMIT}"
    : "${SLURM_NODELIST:="phinally"}" # default node: phinally

    # sinfo prints nothing (or only padding) for a node it does not know.
    KNOWN_NODES=$(sinfo -n "$SLURM_NODELIST" -h -O NodeList) || KNOWN_NODES=""
    if [[ -z "${KNOWN_NODES//[[:space:]]/}" ]]; then
        echo -e "${TXT_RED}${TXT_BOLD}Unknown node \"$SLURM_NODELIST\" specified. Available nodes: ${TXT_CLEAR}" >&2
        # Best effort: a failing listing must not pre-empt the error below.
        sinfo -N -o '%N %c %m' >&2 || true
        error "Exiting..."
    fi

    SLURM_PARTITION=work
    export SLURM_PARTITION

    SLURM_NODES=1 # currently only individual nodes can be used
    export SLURM_NODES

    export SLURM_JOB_NAME
    export SLURM_TIME
    unset SLURM_TIMELIMIT
    export SLURM_NODELIST

    # limit max job run time
    # The comparison is numeric and in minutes (a bare integer passed to
    # --time is a number of minutes). Values that cannot be parsed, and 0
    # (no limit), are capped as well.
    REQUESTED_MINUTES=$(slurm_time_to_minutes "$SLURM_TIME") || REQUESTED_MINUTES=0
    if (( REQUESTED_MINUTES <= 0 || REQUESTED_MINUTES > MAX_CI_JOB_RUNTIME )); then
        echo "SLURM_TIMELIMIT or SLURM_TIME ($SLURM_TIME) is invalid, unlimited or larger than $MAX_CI_JOB_RUNTIME minutes, limiting to $MAX_CI_JOB_RUNTIME." 1>&2
        SLURM_TIME=$MAX_CI_JOB_RUNTIME
        export SLURM_TIME
    fi

    # Generate salloc arguments from SLURM_* environment variables:
    # SLURM_JOB_NAME=x becomes --job-name=x.
    SALLOC_OPTIONS=()
    for NAME in "${!SLURM_@}"
    do
        OPTION=${NAME#SLURM_}
        OPTION=${OPTION//_/-}
        SALLOC_OPTIONS+=("--${OPTION,,}=${!NAME}")
    done

    JOB_SCRIPT="$BASE_DIR.tmp/${CUSTOM_ENV_CI_JOB_ID:?}.sh"
    runuser --login "$AUTH_USER" --command "cp \"$1\" \"$JOB_SCRIPT\""

    # Write the submission wrapper. Every word is shell-quoted with %q so a
    # variable value containing spaces or metacharacters stays one argument
    # and cannot smuggle in extra salloc options.
    SALLOC_COMMAND=(
        salloc --quiet --chdir "$BASE_DIR" "${SALLOC_OPTIONS[@]}"
        srun --cpu-bind none --wait 0 --kill-on-bad-exit=1
        "$AUTH_USER_SHELL" --login "$JOB_SCRIPT"
    )
    SALLOC_SCRIPT="${TMPDIR:?}/salloc.sh"
    {
        printf '%s\n' '#!/bin/bash -l'
        printf '%q ' "${SALLOC_COMMAND[@]}"
        printf '\n'
    } > "$SALLOC_SCRIPT"
    chmod +x "$SALLOC_SCRIPT"

    echo -e "${TXT_GREEN}${TXT_BOLD}Submitting job to node $SLURM_NODELIST...${TXT_CLEAR}"

    exec runuser --login "$AUTH_USER" --command "\"$SALLOC_SCRIPT\""
else
    # Every other stage runs directly as the user; `&&` keeps the script from
    # running in the wrong directory when the cd fails.
    runuser --login "$AUTH_USER" --command "cd \"$BASE_DIR\" && bash -l \"$1\""
fi