Just a heads up: we don't have a huge amount of space on this machine (~750 GB for the Git repos). We can include some data in the projects, but really big datasets will need to remain elsewhere. For anyone new to Git, this is a fairly good place to start: http://gitref.org/index.html. Documentation for GitLab is available here: http://doc.gitlab.com/

Commit 8d4976a0 authored by Khalid Kunji

Added support for long format, fixed file order bug

parent 22b232e1
example/gl_auto_example/ped52.MORGAN.ped
example/framework-gl_auto_v3_2.IVs
gl_auto_example/ped52.MORGAN.ped
framework-gl_auto_v3_2.IVs
100
example/framework.map
example/dense.map
example/dense.genotypes
example/dense.afreq
framework.map
dense.map
dense.genotypes
dense.afreq
2 0.8 0.9
example/gl_auto_example/ped52.MORGAN.ped
example/framework-gl_auto_v3_2.IVs
./gl_auto_example/ped52.MORGAN.ped
./framework-gl_auto_v3_2.IVs
100
example/framework.map
example/dense.map
example/dense.genotypes.long
example/dense.afreq
./framework.map
./dense.map
./dense.genotypes.long
./dense.afreq
2 0.8 0.9
......@@ -37,7 +37,7 @@ echo
#DEFAULTS
output_folder="./"
export long="false"
cd "${base_path}"
if [ $# -ge 1 -a -f "${@:1:1}" ]
......@@ -121,6 +121,9 @@ shift
exit 1
fi
;;
l )
export long="true"
;;
* )
echo "unknown flag: " "${arg:${j}:1}"
exit 1
......@@ -179,6 +182,8 @@ echo "Input Folder: " "$input_folder"
echo "Output Folder: " "$output_folder"
echo "GIGI Split Chunks Folder: " "$gigi_split_chunks_folder"
echo "Long input: " "${long}"
cd "$parent_path"
. ./GIGI_1_setup.sh
. ./GIGI_2_split.sh
......
#!/bin/bash
export timecmd="$(which time)"
echo "INPUT FOLDER: " "${input_folder}"
cd ${input_folder}
find "${gigi_split_prefix%/*}" -name "chunk_0*"
if [[ -n $(find "${gigi_split_prefix%/*}" -name "chunk_0*") ]]
......@@ -9,7 +9,8 @@ then
echo "chunk_0.geno exists, file is most likely already split, if it is not, then remove the existing chunks from ${gigi_split_prefix%/*} and try again"
else
mkdir -p "${output_folder}/${run_name}/STATS"
$timecmd -o "${output_folder}/${run_name}/STATS/time${i}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi_split}" "${param_file##*/}" "${num_chunks}" "${gigi_split_prefix}"
echo "Long is: " "${long}"
$timecmd -o "${output_folder}/${run_name}/STATS/time${i}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi_split}" "${param_file##*/}" "${num_chunks}" "${gigi_split_prefix}" "${long}"
echo "Split exit status: " "$?"
fi
cd "$parent_path"
......
......@@ -3,25 +3,38 @@
cd "${input_folder}"
#cd "${gigi_split_chunks_folder}"
i=0
#i=0
pids=()
for file in "${gigi_split_chunks_folder}"/*.param
do
#echo "$file"
echo "iteration: " "$i"
#echo "iteration: " "$i"
filename="${file##*/}"
echo "FILENAME: " "$filename"
echo "FILE: " "$file"
if [ -e "${output_folder}/${run_name}/gigi_output/${i}/impute.geno" ]
if [[ $filename =~ [0-9]+ ]] ; then
echo "Starting run on chunk: " "$BASH_REMATCH"
else
echo "Failed to find chunk id int in filename"
exit 7
fi
if [ -e "${output_folder}/${run_name}/gigi_output/${BASH_REMATCH}/impute.geno" ]
then
echo "file ${output_folder}/${run_name}/gigi_output/${i}/impute.geno already exists."
echo "file ${output_folder}/${run_name}/gigi_output/${BASH_REMATCH}/impute.geno already exists."
else
mkdir -p "${output_folder}/${run_name}/gigi_output/${i}"
mkdir -p "${output_folder}/${run_name}/gigi_output/${BASH_REMATCH}"
echo $(pwd)
$timecmd -o "${output_folder}/${run_name}/STATS/time${i}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi}" "${file}" -outD="${output_folder}/${run_name}/gigi_output/${i}" & pids+=("$!")
echo "LONG IS: " "${long}"
if [[ "$long" == "true" ]]
then
$timecmd -o "${output_folder}/${run_name}/STATS/time${BASH_REMATCH}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi}" "${file}" -outD="${output_folder}/${run_name}/gigi_output/${BASH_REMATCH}" -long & pids+=("$!")
else
$timecmd -o "${output_folder}/${run_name}/STATS/time${BASH_REMATCH}.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi}" "${file}" -outD="${output_folder}/${run_name}/gigi_output/${BASH_REMATCH}" & pids+=("$!")
fi
echo "$!"
fi
i=$(($i+1))
#i=$(($i+1))
done
#Check exit status
......
......@@ -6,9 +6,9 @@
echo "GIGI Merge Location: " "${gigi_merge}"
echo "Directory: " "${output_folder}/${run_name}/gigi_output"
echo "Num Chunks: " "${num_chunks}"
echo "Long is: " "${long}"
$timecmd -o "${output_folder}/${run_name}/STATS/time_merge.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi_merge}" "${output_folder}/${run_name}/gigi_output" "${num_chunks}"
#"${gigi_merge}" "${output_folder}/${run_name}/gigi_output" "${num_chunks}"
$timecmd -o "${output_folder}/${run_name}/STATS/time_merge.log" -f'memory in kilobytes %M real %e user %U sys %S command %C' "${gigi_merge}" "${output_folder}/${run_name}/gigi_output" "${num_chunks}" "${long}"
merge_status="$?"
echo "Merge exit status: " "$merge_status"
......
......@@ -23,6 +23,10 @@
//File names handed to the merge routines in main
#define FILE_GENO "impute.geno"
#define FILE_PROB "impute.prob"
//Positions of the command line arguments within argv
#define ARG_DIRECTORY 1
#define ARG_PARTITION_COUNT 2
#define ARG_LONG_FORMAT 3
//Value argc must equal; main prints the usage text and exits otherwise
#define ARG_COUNT 4
//Creates a file name based on the prefix, numeric id and file extension
......@@ -74,7 +78,7 @@ bool mergeRows(const char* dir, int partitionCount, const char* file) {
return success;
}
//Creates an array of input streams.
//Creates an array of input streams
static std::ifstream* createInputStreams(const char* dir, int partitionCount, const char* file) {
std::ifstream* fin = new std::ifstream[partitionCount];
......@@ -97,7 +101,7 @@ static std::ifstream* createInputStreams(const char* dir, int partitionCount, co
return fin;
}
//Closes input streams.
//Closes input streams
static void closeInputStreams(std::ifstream* fin, int size) {
for (int i = 0; i < size; i++) {
......@@ -163,41 +167,72 @@ bool mergeColumns(const char* dir, int partitionCount, const char* file) {
return success;
}
//Selects the merge routine that matches the input format:
//long format is merged with mergeRows, otherwise mergeColumns is used.
//All other arguments are forwarded unchanged and the routine's result is returned.
bool mergeFormat(const char* dir, int partitionCount, const char* file, bool longFormat) {
    return longFormat ? mergeRows(dir, partitionCount, file)
                      : mergeColumns(dir, partitionCount, file);
}
//Main entry point of the program
int main(int argc, char** argv) {
if (argc != 3) {
if (argc != ARG_COUNT) {
std::cout << "Usage:" << std::endl;
std::cout << " GIGIMerge <directory> <partition count>" << std::endl;
std::cout << " GIGIMerge <directory> <partition count> <long format (true/false)>" << std::endl;
std::cout << std::endl << "Exit codes: " << std::endl;
std::cout << " 0 Success" << std::endl;
std::cout << " " << ERROR_INVALID_ARGUMENTS << " Invalid arguments" << std::endl;
std::cout << " " << ERROR_INVALID_PARTITION_COUNT << " Invalid partition count (must be positive)" << std::endl;
std::cout << " " << ERROR_MERGE_CONSISTENT_IV_FILE << " Problem merging consistentIV files" << std::endl;
std::cout << " " << ERROR_MERGE_DOSAGE_FILE << " Problem merging dosage files" << std::endl;
std::cout << " " << ERROR_MERGE_GENO_FILE << " Problem merging geno files" << std::endl;
std::cout << " " << ERROR_MERGE_PROB_FILE << " Problem merging prob files" << std::endl;
std::cout << " " << ERROR_MERGE_GENO_FILE << " Problem merging genotype files" << std::endl;
std::cout << " " << ERROR_MERGE_PROB_FILE << " Problem merging probability files" << std::endl;
return ERROR_INVALID_ARGUMENTS;
}
//Parsing long format argument
bool longFormat = false;
if (!strcmp(argv[ARG_LONG_FORMAT], "true")) {
longFormat = true;
} else if (strcmp(argv[ARG_LONG_FORMAT], "false")) {
std::cout << "Invalid long format argument. It should be true or false." << std::endl;
return ERROR_INVALID_ARGUMENTS;
}
int partitionCount = atoi(argv[2]);
//Parsing number of partitions
int partitionCount = atoi(argv[ARG_PARTITION_COUNT]);
if (partitionCount <= 0) {
std::cout << "Invalid partition count (must be positive)" << std::endl;
return ERROR_INVALID_PARTITION_COUNT;
}
if (!mergeRows(argv[1], partitionCount, FILE_CONSISTENT_IV)) {
//Merging IV files
if (!mergeRows(argv[ARG_DIRECTORY], partitionCount, FILE_CONSISTENT_IV)) {
std::cout << "Problem merging consistentIV files" << std::endl;
return ERROR_MERGE_CONSISTENT_IV_FILE;
}
if (!mergeColumns(argv[1], partitionCount, FILE_DOSAGE)) {
//Merging dosage files
if (!mergeFormat(argv[ARG_DIRECTORY], partitionCount, FILE_DOSAGE, longFormat)) {
std::cout << "Problem merging dosage files" << std::endl;
return ERROR_MERGE_DOSAGE_FILE;
}
if (!mergeColumns(argv[1], partitionCount, FILE_GENO)) {
std::cout << "Problem merging geno files" << std::endl;
//Merging genotype files
if (!mergeFormat(argv[ARG_DIRECTORY], partitionCount, FILE_GENO, longFormat)) {
std::cout << "Problem merging genotype files" << std::endl;
return ERROR_MERGE_GENO_FILE;
}
if (!mergeColumns(argv[1], partitionCount, FILE_PROB)) {
std::cout << "Problem merging prob files" << std::endl;
//Merging probability files
if (!mergeFormat(argv[ARG_DIRECTORY], partitionCount, FILE_PROB, longFormat)) {
std::cout << "Problem merging probability files" << std::endl;
return ERROR_MERGE_PROB_FILE;
}
return 0;
......
No preview for this file type
This diff is collapsed.
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment