Just a heads up: we don't have a huge amount of space on this machine, ~750 GB for the git repos. We can include some data in the projects, but really big datasets will need to remain elsewhere. For anyone new to Git, this is a fairly good place to start: http://gitref.org/index.html. Documentation for GitLab is available here: http://doc.gitlab.com/

Commit 0415ceff authored by Khalid Kunji

queue style memory management

parent 0bf685c4
......@@ -13,12 +13,13 @@ command -v uptime >/dev/null 2>&1 || { echo >&2 "uptime is recommended but not i
# Prerequisite checks: probe each external tool with `command -v` and print a
# message on stderr when it is missing. Checks whose handler ends in `exit 1`
# abort the script; the others only warn and let the script continue.
command -v lscpu >/dev/null 2>&1 || { echo >&2 "lscpu is required but not installed or not in the path. Aborting."; exit 1; }
command -v grep >/dev/null 2>&1 || { echo >&2 "grep is required but not installed or not in the path. Aborting."; exit 1; }
command -v tail >/dev/null 2>&1 || { echo >&2 "tail is required but not installed or not in the path. Aborting."; exit 1; }
# NOTE(review): the `free` check appears twice with slightly different wording,
# and its message says "required" although the script continues — confirm
# whether one of the two lines is a leftover.
command -v free >/dev/null 2>&1 || { echo >&2 "free is required but not installed or not in the path. Continuing, though some cosmetic information may be missing."; }
command -v free >/dev/null 2>&1 || { echo >&2 "free is required but not installed or not in the path. Continuing, though some cosmetic information may be missing"; }
command -v rev >/dev/null 2>&1 || { echo >&2 "rev is required but not installed or not in the path. Aborting."; exit 1; }
command -v seq >/dev/null 2>&1 || { echo >&2 "seq is required but not installed or not in the path. Aborting."; exit 1; }
command -v tr >/dev/null 2>&1 || { echo >&2 "tr is required but not installed or not in the path. Aborting."; exit 1; }
command -v sort >/dev/null 2>&1 || { echo >&2 "sort is recommended but not installed or not in the path. Continuing, but behavior is undetermined"; }
# NOTE(review): the `cgcreate` check is also present twice ("will not" vs
# "may not" be imposed) — confirm which wording is intended.
command -v cgcreate >/dev/null 2>&1 || { echo >&2 "cgcreate from package cgroup-tools/libcgroups is recommended but not installed or not in the path. Continuing, but memory limits will not be imposed. See the README for more info"; }
command -v cgcreate >/dev/null 2>&1 || { echo >&2 "cgcreate from package cgroup-tools/libcgroups is recommended but not installed or not in the path. Continuing, but memory limits may not be imposed. See the README for more info"; }
echo
if [ ${debug_parsing} ]
......@@ -147,8 +148,25 @@ then
echo "long is true"
export long="true"
;;
q )
  # -q [QUEUE LENGTH]: take the next positional argument as the number of
  # GIGI processes to run at once and export it as queue_length.
  queue_length="${@:1:1}"
  shift
  # A value starting with "-" means another flag was given where the queue
  # length was expected.
  if [ "${queue_length:0:1}" == "-" ]
  then
    echo "Flags shouldn't follow each other separately (e.g. -o -n ./out 4), but instead should be joined (e.g. -on ./out 4) or totally separate (e.g. -o ./out -n 4)"
    exit 1
  fi
  if [ -z "${queue_length}" ]
  then
    echo "-q flag given with no queue length, aborting."
    exit 1
  fi
  # queue_length is used in shell arithmetic, so it must be a plain positive
  # decimal integer. Leading zeros are rejected because bash arithmetic would
  # read them as octal; zero is rejected because nothing could ever run.
  if ! [[ "${queue_length}" =~ ^[1-9][0-9]*$ ]]
  then
    echo "-q flag requires a positive whole number of processes, got: ${queue_length}, aborting."
    exit 1
  fi
  echo "Queue set successfully: " "${queue_length}" "processes"
  export queue_length
  ;;
v )
# Lowercase -v: print the version banner plus a hint that verbose output
# moved to '-V', then exit successfully without running anything else.
# NOTE(review): two version lines are printed here (0.95 then 0.99);
# presumably only one is intended — confirm.
echo "GIGI-Quick version: 0.95"
echo "GIGI-Quick version: 0.99"
echo "If you intended to activate verbose output, that flag is now capitalized: '-V'"
exit 0
;;
......@@ -158,11 +176,12 @@ then
;;
h )
# Usage summary followed by one line per option.
echo "Help: "
echo "run_GIGI parameter_file -o [OUTPUT FOLDER] -n [RUN NAME] -t [THREADS] -m [MEMORY IN MB] [-l] [-v] [-V] [-h]"
echo "run_GIGI parameter_file -o [OUTPUT FOLDER] -n [RUN NAME] -t [THREADS] -m [MEMORY IN MB] -q [QUEUE LENGTH] [-l] [-v] [-V] [-h]"
echo "-o [OUTPUT FOLDER] : This is the path to use for the outputs from the run_GIGI scripts, including temporary files."
echo "-n [RUN NAME] : This is a path relative to the [OUTPUT FOLDER] to use to keep the outputs from more than one run of run_GIGI separated."
echo "-t [THREADS] : The number of threads to use for run_GIGI, and also the number of chunks to split the input into."
echo "-m [MEMORY IN MB] : The amount of RAM that run_GIGI will restrict its use to, please set up the cgroup first"
echo "-q [QUEUE LENGTH] : How many copies of GIGI to run concurrently, always use -m when using -q"
echo "-l : Specifies that the input is in the long format."
echo "-v : Display the version of GIGI-Quick"
# The verbose flag is the capital -V; lowercase -v only prints the version.
echo "-V : Verbose mode, output from run_GIGI is much quieter now, you can see much more of what it is doing and what variables are set to at various stages with -V. "
......
#!/bin/bash
### GIGI Config Outputs ###
......@@ -84,13 +85,15 @@ then
fi
# Warn about missing cgroup support whenever $kern is anything other than the
# literal string "2.6.24".
# NOTE(review): $kern is computed above this view; presumably it holds the
# lower of the running kernel version and 2.6.24, so an inequality means the
# running kernel is older — confirm.
# NOTE(review): the warning is echoed twice ("will not" vs "may not" have any
# effect) — confirm which wording is intended.
if [ "$kern" != "2.6.24" ]
then
echo "Your kernel is too old to support cgroups, if you do not update your kernel to a version >=2.6.24 then memory limits will not have any effect."
echo "Your kernel is too old to support cgroups, if you do not update your kernel to a version >=2.6.24 then memory limits may not have any effect."
fi
echo
echo
#Handle memory
if [ "${memory}" ]
mem_bytes=$((1024*1024*memory))
mem_kbytes=$((1024*memory))
if [[ "${memory}" && ! "${queue_length}" ]]
then
#Create user cgroup for memory limits, and/or cpu share limits
#Here we depend on cgroup-tools
......@@ -101,7 +104,7 @@ then
then
echo "memory limits not enforced, could not create cgroup, error: ${cgstatus}"
else
echo $((1024*1024*memory))
echo "${mem_bytes}"
#Limit to ~2 MB
#echo 2000000 > /sys/fs/cgroup/memory/user_cgroup/gigi/memory.limit_in_bytes
#Limit to 200 KB
......@@ -110,3 +113,82 @@ then
#echo 100 > /sys/fs/cgroup/cpu/test/test_limits/cpu.shares
fi
fi
# Print the digits of one /proc/meminfo field.
# Arguments: $1 - exact field name without the trailing colon (e.g. MemFree)
#            $2 - file to read; defaults to /proc/meminfo
# Outputs:   the digits of the matching line on stdout, without a trailing
#            newline; nothing when the field is absent.
# The name is anchored at line start and followed by ':' so that "Cached"
# cannot also pick up "SwapCached".
meminfo_kb() {
  grep "^${1}:" "${2:-/proc/meminfo}" | tr -cd '[:digit:]'
}

# Queue mode only: work out the memory envelope (mem_size) and export it.
# mem_kbytes and memory are set earlier in the script.
#if [[ "${memory}" && "${queue_length}" ]]
if [[ "${queue_length}" ]]
then
  #get available memory in Kb, added in kernel 3.14
  mem_avail=$(meminfo_kb MemAvailable)
  echo "Memory Available: " "${mem_avail}"
  if [[ ! "${mem_avail}" ]]
  then
    # Fallback estimate: MemFree + Cached. The previous unanchored
    # grep "Cached" matched both the Cached and SwapCached lines and glued
    # their digits together into one meaningless number.
    mem_free=$(meminfo_kb MemFree)
    mem_cached=$(meminfo_kb Cached)
    mem_avail=$((mem_free+mem_cached))
    echo "Echo, MemAvailable not given by kernel, probably kernel is older than version 3.14, falling back on an outdated estimate, tends to be high, see here https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34e431b0ae398fc54ea69ff85ec700722c9da773"
    echo "This outdated estimate can be very far off, e.g. on this test system MemAvailable is 12743064 while this old estimate gives 30148160, a vast overestimate."
    echo "Memory Available: " "${mem_avail}"
  fi
  # Refuse to continue when the user asked for more than the machine has.
  if [[ "${mem_kbytes}" -gt "${mem_avail}" ]]
  then
    echo "Memory requested is greater than memory available."
    exit 1
  fi
  # export mem_size=$((mem_avail > mem_kbytes ? mem_avail : mem_kbytes))
  # Without -m the envelope is everything available; with -m it is the
  # requested amount.
  if [[ ! "${memory}" ]]
  then
    export mem_size="${mem_avail}"
  else
    export mem_size="${mem_kbytes}"
  fi
  echo "Size of memory envelope in Kb: " "${mem_size}"
fi
# Print the line count of file $1 (the first number reported by wc).
count_lines() {
  wc "${1}" | grep -o '^[[:space:]]*[[:digit:]]*' | tr -d '[:blank:]'
}

# Print how many "optional blanks + optional digits" tokens grep finds on the
# first line of file $1, i.e. the number of numeric columns on that line.
count_first_line_numbers() {
  head -1 "${1}" | grep -o '[[:space:]]*[[:digit:]]*' | wc | grep -o '^[[:space:]]*[[:digit:]]*' | tr -d '[:blank:]'
}

# Estimate GIGI's memory need from N, S and M and derive num_chunks from it.
# File names are taken from the parameter file, resolved inside input_folder.
#N = Size of Pedigree
cd "${input_folder}"
# Line 1 of the parameter file names a file containing an
# "input pedigree size" line; N is the digits on that line.
N=$(head -1 "${param_file##*/}")
echo "${N}"
N=$(grep "input pedigree size" "${N}" | tr -cd '[:digit:]')
echo "Pedigree Size: " "${N}"
echo "Long is: " "${long}"
#S = Number of sequenced subjects
# S and M are both read from the file named on line 6 of the parameter file;
# the long flag decides which of the two counts comes from rows and which
# from the columns of the first line.
S=$(head -6 "${param_file##*/}" | tail -n 1)
echo "${S}"
if [[ "${long}" == "true" ]]
then
  #S=$(wc -L "${S}" | grep -o "^[[:space:]]*[[:digit:]]*" | tr -d [:blank:])
  S=$(count_first_line_numbers "${S}")
else
  S=$(count_lines "${S}")
fi
echo "S: " "${S}"
#M = Number of markers on a dense panel (aka SNPs)
M=$(head -6 "${param_file##*/}" | tail -n 1)
echo "${M}"
if [[ "${long}" == "true" ]]
then
  M=$(count_lines "${M}")
else
  #M=$(wc -L "${M}" | grep -o "^[[:space:]]*[[:digit:]]*" | tr -d [:blank:])
  M=$(count_first_line_numbers "${M}")
fi
# One is subtracted from the raw count.
# NOTE(review): presumably this discounts a header row / id column — confirm.
M=$((M-1))
echo "M: " "${M}"
#estimated_memory_needed=$((-20490+0.05799*N*M+6164*S))
# The formula has a fractional coefficient, so it is evaluated with bc rather
# than shell arithmetic.
echo "-20490+0.05799*${N}*${M}+6164*${S}"
export estimated_memory_needed=$(echo "-20490+0.05799*${N}*${M}+6164*${S}" | bc)
echo "Estimated Memory Needed: " "${estimated_memory_needed}"
cd "$parent_path"
# NOTE(review): mem_size is only assigned when queue_length is set, so outside
# queue mode the bc expression below is malformed and num_chunks ends up 0.
# The commented-out guard suggests this section was meant to be queue-only —
# confirm before relying on num_chunks in the non-queue path.
#if [[ "$queue_length" ]]
#then
# num_chunks = $((mem_size/estimated_memory_needed/
# Ceiling of estimated_memory_needed / mem_size.
num_chunks=$(echo "if ( ${estimated_memory_needed}%${mem_size} ) ${estimated_memory_needed}/${mem_size}+1 else ${estimated_memory_needed}/${mem_size}" | bc)
echo "Calced Num_Chunks: " "${num_chunks}"
export num_chunks=$((queue_length*num_chunks))
# The original message contained "${queue_lenght cores at a time}", which is
# not a valid parameter expansion and makes bash fail with "bad substitution".
echo "Num_Chunks to use (${queue_length} at a time): " "${num_chunks}"
#fi
#estimate/n = mem_avail(size)
#n = estimate/mem_size(ceil)
......@@ -22,6 +22,7 @@ else
then
echo "Long is: " "${long}"
fi
echo "SPLIT STEP, NUM CHUNKS: " "${num_chunks}"
$timecmd -o "${output_folder}/${run_name}/STATS/time${i}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi_split}" "${param_file##*/}" "${num_chunks}" "${gigi_split_prefix}" "${long}" > "${output_folder}/${run_name}/LOGS/split.log"
echo "Split exit status: " "$?"
fi
......
......@@ -2,7 +2,79 @@
cd "${input_folder}"
# Exit handler: block until every process named GIGI has been killed, so that
# no worker outlives this script.
finish() {
  # kill straggling threads
  killall --wait GIGI
}
trap finish EXIT
pids=()
if [[ ${queue_length} ]]
then
if [[ -n "${verbose}" ]]
then
  echo "memory: " "${memory}"
  echo "queue_length: " "${queue_length}"
fi
############### Queue style run here
# Build a list of every chunk parameter file, then run GIGI on each entry with
# at most queue_length instances in flight at any time.
echo "RUNNNING QUEUE STYLE"
run_list="${gigi_split_chunks_folder}/run_list"
# Start from an empty list: a run that aborted earlier may have left a stale
# run_list behind, and appending to it would schedule chunks twice.
: > "${run_list}"
for file in "${gigi_split_chunks_folder}"/*.param
do
  filename="${file##*/}"
  if [[ -n "${verbose}" ]]
  then
    echo "FILENAME: " "$filename"
    echo "FILE: " "$file"
  fi
  # Every chunk file name must contain a numeric chunk id.
  if [[ $filename =~ [0-9]+ ]] ; then
    if [[ -n "${verbose}" ]]
    then
      echo "Adding to run list, chunk: " "$BASH_REMATCH"
    fi
  else
    echo "Failed to find chunk id int in filename"
    exit 7
  fi
  echo "${file}" >> "${run_list}"
done
cat "${run_list}"
#Run GIGI
i=0   # number of GIGI jobs launched so far
j=0   # index of the current run_list entry; names its output directory
while IFS= read -r line; do
  out_dir="${output_folder}/${run_name}/gigi_output/${j}"
  if [ -e "${out_dir}/impute.geno" ]
  then
    echo "file ${out_dir}/impute.geno already exists."
  else
    # Throttle only on jobs actually launched; skipped chunks must not use
    # up queue slots.
    if (( i++ >= queue_length )); then
      wait -n # wait for any job to complete. New in 4.3
    fi
    mkdir -p "${out_dir}"
    if [[ -n "${verbose}" ]]
    then
      pwd
      echo "LONG IS: " "${long}"
    fi
    # Run the entry just read from run_list. The earlier code passed
    # "${file}", which after the for loop above always holds the last
    # *.param path, so every job processed the same chunk.
    gigi_args=("${line}" -outD="${out_dir}")
    if [[ "$long" == "true" ]]
    then
      gigi_args+=(-long)
    fi
    # $timecmd is left unquoted on purpose so it may carry its own arguments.
    $timecmd -o "${output_folder}/${run_name}/STATS/time${j}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi}" "${gigi_args[@]}" > "${output_folder}/${run_name}/LOGS/${j}.gigi.log" & pids+=("$!")
    last_pid="$!"
    echo "gigi pid: " "$last_pid" " for part: " "${j}"
  fi
  # Advance for every entry, launched or skipped; otherwise one existing
  # output would make all later entries test the same directory and be skipped.
  j=$((j+1))
done < "${run_list}"
wait # wait for the remaining processes
rm "${run_list}"
else
for file in "${gigi_split_chunks_folder}"/*.param
do
filename="${file##*/}"
......@@ -33,9 +105,9 @@ do
fi
if [[ "$long" == "true" ]]
then
echo "memory: " "${memory}"
if [ ${memory} ]
then
echo "memory: " "${memory}"
command -v cgexec >/dev/null 2>&1
cgexecpresent=$?
if [ ${verbose} ]
......@@ -89,4 +161,6 @@ do
fi
done
fi
cd "$parent_path"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment