Commit 49bf5aa9 authored by Khalid Kunji's avatar Khalid Kunji
Browse files

Replaced GIGI with GIGI2 and fixed SPLIT, need to fix scripts still

parent 9e77307a
No preview for this file type
#ifndef DATATYPES_H
#define DATATYPES_H
//Primitive data type for storing marker position
typedef double MarkerPosition;
//Primitive data type for storing a single meiosis marker
typedef unsigned char MeiosisMarker;
//Primitive data type for storing founder genotype labels
//It must be singed and must be able to store 2 * number of founders
typedef int16_t FGL;
//Maximum number of alleles in the genotype
#define MAX_ALLELE_TYPES 2
//Primitive data type for storing genotype
//It must be singed and have at least MAX_ALLELE_TYPES + 1 bits
typedef int8_t Genotype;
//Primitive data type for storing genotype
typedef float AFreq;
//Primitive data type for storing meiosis probability
typedef double MeiosisProbability;
//Primitive data type for storing graph likelihood
typedef float Likelihood;
#endif /* DATATYPES_H */
#ifndef DENSEMARKERS_H
#define DENSEMARKERS_H
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include "Datatypes.h"
#include "Logger.h"
#include "Matrix.h"
#include "MapFile.h"
#include "Pedigree.h"
/**
* Macro to test if the genotype is homozygote
*/
#define isHomozygous(x) ((x != 0) && (x == (x & -x)))
/**
* Macro to test if the genotype is heterozygote
*/
#define isHeterozygous(x) (x != (x & -x))
/**
* This class represents all the dense markers. The class encapsulates number,
* marker positions, allele frequencies and genotypes of dense markers.
*/
class DenseMarkers {
private:
/**
* Size of the buffer
*/
unsigned int bufferSize;
/**
* Number of dense markers
*/
unsigned int markerCount;
/**
* Number of markers to be skipped due to range specification
*/
unsigned int markerSkipCount;
/**
* Positions of markers
*/
MarkerPosition* markerPositions;
/**
* Pointer to allele frequency file input stream
*/
std::ifstream* alleleFreqStream;
/**
* An array of allele count
*/
unsigned int* alleleCount;
/**
* Allele frequencies for markers
*/
Matrix<AFreq>* alleleFreq;
/**
* Pointer to genotype file input stream
*/
std::ifstream* genotypeStream;
/**
* Pointer to genotype matrix
*/
Matrix<Genotype>* genotype;
/**
* List of fgl indices for individuals having genotype
*/
unsigned int* genoFGLIndices;
/**
* Counts the number of individuals from the header of genotype file
* @param header Header of the genotype file
* @return Number of individuals
*/
unsigned int countIndividuals(std::string& header) {
std::stringstream ss(header);
std::string id;
unsigned int individualCount = 0;
while (ss >> id) {
individualCount++;
}
return individualCount;
}
public:
/**
* Creates dense marker buffer with given dense marker map file and range of
* dense marker positions
* @param mapFile Dense marker map file
* @param min Lower limit of dense marker positions
* @param max Upper limit of dense marker positions
*/
explicit DenseMarkers(std::string mapFile, double min, double max) {
bufferSize = 0;
markerPositions = loadMap(mapFile, markerCount);
//Count the number of markers above the upper limit
unsigned int filteredMarkers = 0;
while ((filteredMarkers < markerCount) && (markerPositions[markerCount - 1 - filteredMarkers] > max)) {
filteredMarkers++;
}
markerCount -= filteredMarkers;
//Count the number of markers below the lower limit
markerSkipCount = 0;
while ((markerSkipCount < markerCount) && (markerPositions[markerSkipCount] < min)) {
markerSkipCount++;
}
filteredMarkers += markerSkipCount;
markerCount -= markerSkipCount;
if (filteredMarkers > 0) {
messagePrintln(" Filtered " + std::to_string(filteredMarkers) + " markers leaving " + std::to_string(markerCount) + " markers (" + std::to_string(min) + "~" + std::to_string(max) + ")");
}
alleleFreqStream = NULL;
alleleCount = NULL;
alleleFreq = NULL;
genotypeStream = NULL;
genotype = NULL;
genoFGLIndices = NULL;
if (markerCount == 0) {
errorPrintln("*** Error: No dense markers are found. ***");
exit(1);
}
}
/**
* Destructor
*/
virtual ~DenseMarkers() {
delete[] markerPositions;
if (alleleFreqStream) {
delete alleleFreqStream;
}
if (alleleCount) {
delete[] alleleCount;
}
if (alleleFreq) {
delete alleleFreq;
}
if (genotypeStream) {
delete genotypeStream;
}
if (genotype) {
delete genotype;
}
if (genoFGLIndices) {
delete[] genoFGLIndices;
}
}
/**
* Returns marker positions
* @return Marker positions
*/
inline MarkerPosition* getMarkerPositions() {
return markerPositions + markerSkipCount;
}
/**
* Returns the number of markers.
* @return Number of markers
*/
inline unsigned int getMarkerCount() {
return markerCount;
}
/**
* Returns the buffer size.
* @return Buffer size
*/
inline unsigned int getBufferSize() {
return genotype->getRowCount();
}
/**
* Returns the number of individuals with genotype.
* @return Number of individuals with genotype
*/
inline unsigned int getGenotypeCount() {
return genotype->getColumnCount();
}
/**
* Returns a pointer to the genotype at the given buffer index.
* @param index Index of the pattern
* @return Pointer to the genotype at the given index
*/
inline Genotype* getGenotype(unsigned int index) {
return genotype->getRowPtr(index);
}
/**
* Returns list of FGL indices for individuals having genotype
* @return List of FGL indices for individuals having genotype
*/
inline unsigned int* getGenotypeFGLIndices() {
return genoFGLIndices;
}
/**
* Returns number of alleles of marker at the given buffer index.
* @param index Index of the buffer entry
* @return Number of alleles of marker at the given buffer index
*/
inline unsigned int getAlleleCount(int index) {
return alleleCount[index];
}
/**
* Returns allele frequencies of marker at the given buffer index.
* @param index Index of the buffer entry
* @return Allele frequencies of marker at the given buffer index
*/
inline AFreq* getAlleleFrequencies(int index) {
return alleleFreq->getRowPtr(index);
}
/**
* Initializes the datastructures for dense markers and opens allele frequency
* file and genotype file for future reading.
*
* @param afreqFile Allele frequency file
* @param genoFile Genotype file
* @param pedigree Pedigree for genotype file
* @param iterations Number of iterations
* @param size Buffere size
*/
void initBuffer(std::string afreqFile, std::string genoFile, Pedigree& pedigree, unsigned int iterations, unsigned int size) {
bufferSize = size;
//Opening allele frequency file for future reading
messagePrintln("Opening allele frequency file " + afreqFile);
alleleFreqStream = new std::ifstream(afreqFile.c_str());
if (!alleleFreqStream->is_open()) {
errorPrintln("*** Error opening the allele frequency file " + afreqFile + " ***");
exit(1);
}
//Initializing allele frequency datastructures
alleleCount = new unsigned int [bufferSize];
alleleFreq = new Matrix<AFreq>(bufferSize, MAX_ALLELE_TYPES);
//Opening genotype file
messagePrintln("Opening genotype file " + genoFile);
genotypeStream = new std::ifstream(genoFile.c_str());
if (!genotypeStream->is_open()) {
errorPrintln("*** Error opening the genotype file " + genoFile + " ***");
exit(1);
}
//Processing header to define mapping of individual indices
std::string header;
if (!std::getline(*genotypeStream, header)) {
errorPrintln("*** Error: No header in the genotype file ***");
exit(1);
}
//Identifying number of individuals in the genotype file
unsigned int individualCount = countIndividuals(header);
if (individualCount == 0) {
errorPrintln("*** Error: No genotypes found ***");
exit(1);
}
messagePrintln(" Found genotypes for " + std::to_string(individualCount) + " individuals");
//Mapping individuals indices in genotype file
genoFGLIndices = new unsigned int[individualCount];
std::stringstream ss(header);
std::string id;
for (unsigned int i = 0; i < individualCount; i++) {
ss >> id;
genoFGLIndices[i] = pedigree.getIndividualIndex(id) << 1;
if (genoFGLIndices[i] < 0) {
errorPrintln("*** Individual \"" + id + "\" specified in the genotype file is not found in the pedigree ***");
exit(1);
}
}
genotype = new Matrix<Genotype>(bufferSize, individualCount);
//Skipping the filtered markers
if (markerSkipCount > 0) {
messagePrintln(" Skipping " + std::to_string(markerSkipCount) + " markers");
std::string line;
for (unsigned int i = 0; i < markerSkipCount; i++) {
if (!std::getline(*alleleFreqStream, line)) {
errorPrintln("*** Error reading the allele frequency file ***");
exit(1);
}
if (!std::getline(*genotypeStream, line)) {
errorPrintln("*** Error reading the genotype file ***");
exit(1);
}
}
}
}
/**
* Loads next batch of allele frequencies and genotypes.
* @param count Number of markers for which data is to be loaded
*/
void loadMarkers(unsigned int count) {
std::string line;
//Read allele frequencies
for (unsigned int i = 0; i < count; i++) {
if (!std::getline(*alleleFreqStream, line)) {
errorPrintln("\n*** Error reading the allele frequency file ***");
exit(1);
}
std::stringstream ss(line);
AFreq* afreq = alleleFreq->getRowPtr(i);
double f;
for (alleleCount[i] = 0; ss >> f; alleleCount[i]++, afreq++) {
if (alleleCount[i] == MAX_ALLELE_TYPES) {
errorPrintln("\n*** Error: Number of alleles is greater than " + std::to_string(MAX_ALLELE_TYPES) + " ***");
exit(1);
}
*afreq = f;
}
if (alleleCount[i] == 0) {
errorPrintln("\n*** Error: Number of alleles is zero ***");
exit(1);
}
}
//Read genotypes
for (unsigned int i = 0; i < count; i++) {
if (!std::getline(*genotypeStream, line)) {
errorPrintln("\n*** Error reading the genotype file ***");
exit(1);
}
std::stringstream ss(line);
Genotype* geno = genotype->getRowPtr(i);
int g1, g2;
for (unsigned int j = 0; j < genotype->getColumnCount(); j++, geno++) {
if (!(ss >> g1 >> g2)) {
errorPrintln("\n*** Error: Missing genotypes ***");
exit(1);
}
if (g1 <= 0 || g2 <= 0) {
*geno = 0;
} else {
*geno = (1 << (g1 - 1)) | (1 << (g2 - 1));
}
}
}
}
};
#endif /* DENSEMARKERS_H */
#include <thread>
#include <unistd.h>
#include "DenseMarkers.h"
#include "InheritanceVector.h"
#include "Logger.h"
#include "MapFile.h"
#include "Parameters.h"
#include "Pedigree.h"
#include "SparseMarkers.h"
/**
* Flag for terminating launched threads
*/
static volatile bool threadExit;
/**
* Flags corresponding to launched threads indicating if the thread has finished
* processing the data
*/
static bool* volatile threadDone;
/**
* Number of markers in the current batch
*/
static volatile unsigned int markerCount;
/**
* Number of markers imputed so far
*/
static volatile unsigned int imputedMarkerCount;
/**
* Index of sparse flanking marker
*/
static volatile unsigned int sparseMarkerIndex;
/**
* Pointer to inheritance vector buffer
*/
static InheritanceVector* meiosisPtr;
/**
* This function is executed by launched threads for generation of inheritance
* vectors and imputing genotypes
* @param threadId Thread id of the calling thread
* @param threadCount Number of threads
*/
void impute(unsigned int threadId) {
//While threads are not exited
while (!threadExit) {
if (threadDone[threadId]) {//If thread is done then wait
usleep(10000);
} else {//Otherwise generate inheritance vectors and impute genotypes
meiosisPtr->imputeMarkers(sparseMarkerIndex, imputedMarkerCount, markerCount, threadId);
threadDone[threadId] = true; //Thread is done so go to waiting state
}
}
}
/**
* Main entry point of the program
* @param argc Number of command-line arguments
* @param argv Command-line arguments
*/
int main(int argc, char** argv) {
clock_t startTime = clock();
if (argc != 2) {
std::cerr << "Usage:\n " << argv[0] << " <parameter file>" << std::endl << std::endl;
printParameterFormat();
return 1;
}
Parameters parameters(argv[1]);
try {
setLogFile(parameters.getOutputFile());
//Load pedigree
Pedigree pedigree(parameters.getPedigreeFile());
//Load dense marker information
DenseMarkers dense(parameters.getDenseMapFile(), parameters.getDenseMarkerLowerLimit(), parameters.getDenseMarkerUpperLimit());
//Generate sparse flanking markers
SparseFlankingMarkers sparse(pedigree, dense, parameters.getSparseMapFile(), parameters.getMeiosisFile(), parameters.getMeiosisIterations());
//Suggest optimal buffer size based on the maximum dense marker batch size
parameters.suggestBufferSize(sparse.getMaximumDenseMarkerBatchSize());
//Initialize dense marker buffer
dense.initBuffer(parameters.getAlleleFrequencyFile(), parameters.getGenotypeFile(), pedigree, sparse.getIterationCount(), parameters.getBufferSize());
//Initialize inheritance vector buffer
InheritanceVector meiosis(pedigree, sparse, dense, parameters.getSeed(), parameters.getThreadCount(), parameters.getOutputFile(), parameters.getGenotypeCallingMethod(), parameters.getGenotypeCallingThreshold1(), parameters.getGenotypeCallingThreshold2());
messagePrintln("Using thread count " + std::to_string(parameters.getThreadCount()) + ", random seed " + std::to_string(parameters.getSeed()) + " and marker buffer size " + std::to_string(parameters.getBufferSize()));
if (parameters.getGenotypeCallingMethod() == 1) {
messagePrintln("Using maximum probability genotype calling method");
} else if (parameters.getGenotypeCallingMethod() == 2) {
messagePrintln("Using confidence threshold genotype calling method with thresholds " + std::to_string(parameters.getGenotypeCallingThreshold1()) + " and " + std::to_string(parameters.getGenotypeCallingThreshold2()));
}
//Reset tread exit flag
threadExit = false;
//Initialize threads
threadDone = new bool[parameters.getThreadCount()];
std::thread** threads = new std::thread*[parameters.getThreadCount()];
for (unsigned int i = 1; i < parameters.getThreadCount(); i++) {
threadDone[i] = true;
threads[i] = new std::thread(impute, i);
}
//Initialize pointer for launched threads
meiosisPtr = &meiosis;
//Number of remaining markers between current set of flanking markers
unsigned int remainingMarkers;
imputedMarkerCount = 0;
sparseMarkerIndex = 0;
//Iterate through sparse flanking markers
for (int previousSparseMarkerIndex = -1; sparseMarkerIndex <= sparse.getMarkerCount(); sparseMarkerIndex++) {
//Get number of remaining markers between current set of flanking markers
remainingMarkers = sparse.getDenseMarkerCount(sparseMarkerIndex);
if (remainingMarkers > 0) {
messagePrint("Processing " + std::to_string(remainingMarkers) + " markers ");
if (previousSparseMarkerIndex == -1) {
messagePrintln("on left of the sparse marker " + std::to_string(sparse.getMarkerPosition(sparseMarkerIndex)), false);
} else if (sparseMarkerIndex == sparse.getMarkerCount()) {
messagePrintln("on right of the sparse marker " + std::to_string(sparse.getMarkerPosition(previousSparseMarkerIndex)), false);
} else {
messagePrintln("between sparse markers " + std::to_string(sparse.getMarkerPosition(previousSparseMarkerIndex)) + " and " + std::to_string(sparse.getMarkerPosition(sparseMarkerIndex)), false);
}
//Process markers in batches equal to the buffer size
while (remainingMarkers > 0) {
//Number of makers in the buffer
markerCount = (remainingMarkers < dense.getBufferSize()) ? remainingMarkers : dense.getBufferSize();
messagePrint(" " + std::to_string(markerCount) + " markers: loading data");
//Load dense markers
dense.loadMarkers(markerCount);
//Start launched threads
for (unsigned int i = 1; i < parameters.getThreadCount(); i++) {
threadDone[i] = false;
}
messagePrint(", imputing genotypes", false);
//Generate inheritance vectors
meiosis.imputeMarkers(sparseMarkerIndex, imputedMarkerCount, markerCount, 0);
//Check launched threads and wait until all the threads are done
for (unsigned int i = 1; i < parameters.getThreadCount(); i++) {
if (!threadDone[i]) {
i--;
usleep(1000);
}
}
//Exit launched threads if last sparse marker is processed
threadExit = (sparseMarkerIndex == sparse.getMarkerCount()) && (remainingMarkers == markerCount);
messagePrint(", saving", false);
//Save imputation results
meiosis.getGenotypeImputation()->saveResults(markerCount, imputedMarkerCount);
//Update marker counts
imputedMarkerCount += markerCount;
remainingMarkers -= markerCount;
//Print progress
messagePrintln(", " + std::to_string((imputedMarkerCount * 100) / dense.getMarkerCount()) + "% done", false);
}
}
//Update sparse marker index
previousSparseMarkerIndex = sparseMarkerIndex;
}
} catch (std::bad_alloc& e) {
errorPrintln("Unable to allocate memory. Try reducing the buffer size (current buffer size=" + std::to_string(parameters.getBufferSize()) + ")");
return 1;
}
clock_t endTime = clock();
messagePrintln("Computation time for all threads " + std::to_string((endTime - startTime) / CLOCKS_PER_SEC) + " sec");
delete[] threadDone;
return 0;
}
\ No newline at end of file
Author: Charles Y K Cheung
new changes in v1.06.1:
Because MORGAN's gl_auto version 3.2 uses a new output Inheritance Vectors file format, we have made changes so GIGI is now compatible with this file format.
The default behavior of GIGI v1.06 is to use this new Inheritance Vectors file format.
In addition, because this new file format no longer requires us to provide GIGI with the meiosis indexes that we used to need from the
console output of gl_auto, GIGI can now directly use the MORGAN pedigree file instead. Therefore, for convenience, users
can either use the pedigree file or the pedigree meiosis file that users have to parse from the console output of gl_auto.
That said, we are aware of the importance of backward compatibility. If you intend to use gl_auto's output from the pre v3.2
and the pedigree meiosis file, you may continue to do so. GIGI will check which version of IV file you are using.
Note: User is required to use the pedigree meiosis file instead of the pedigree file if the old IV format is used.
new change in v1.05:
In version 1.05, I fixed the bug to account for the condition associated with inbred pedigree: if the IV
infers that an inbred individual gets a pair of the same FGL and if the
observed genotype for this individual is heterozygous, this IV
must be inconsistent with the observed data.
Thanks to Dr. Jae-Hoon Sul for identifying this bug!
new changes in v1.04:
Important new function:
-Now GIGI can read dense markers in long format (rows are markers and columns are individuals, similar to the BEAGLE's genotype file format - except there is no "I" column here.) (See the documentation of the specification).
This change allows GIGI to handle very, very dense files in memory efficient manner.
see: example/param_longFormat.txt and "dense.genotypes.t"
- I converted the original example dense marker file from the old format (rows are individuals) to the long format using the script in the utilities diretory: convertGenotypesfromWideToLongFormat.R
- to tell GIGI that the dense genotype file is in the long format, use the -long flag : see documentation.
New changes in v1.03:
Bug fix:
- max ped size was limited to 160... now the number is changed to 5000.
- in the check that that the provided allele frequencies of each marker sums up to 1, if(sumAF==1) is replaced by if( (sumAF-1) > 0.0000001)
- if a line in the allele frequency only has 1 allelic type (monomorphic marker), added a dummy allelic type with frequency 0 to prevent the program from breaking.
- in main(), close the input streams before deallocating some of the variables to ensures output files get written first.
Other change:
- the call method in the example folder "param.txt" is now set to confidence-based calling (t1=0.8, t2=0.9) instead of the most likely genotype. See manuscript.
Rationale: This change is to remind users that calls based on the most likely genotype may not be accurate.
For example, if a parent has a rare allele, GIGI will correctly assign a 50% chance that the child has the rare allele IN THE SITUATION when we cannot figure out which chromosome is transmitted.
If we use the most likely genotype call method, it will make a call for each genotype despite potential high uncertainty in genotype configuration.
Since calls made using the most likely genotypes may be dangerous to use, we change the default call method to confidence-based calling.
Analyses that account for the uncertainty in the imputed results may be more appropriate.
eg. use the imputed probabilities directly or use a summary of imputed probabilities such as dosage.
- A dosage file is generated if all markers to be imputed are di-allelic markers.
Here, dosage is defined as the expected percent of 1 alleles in a genotype:
dosage of a genotype = 1*P(genotype is 1/1) + 0.5*P(genotype is 1/2)
- a binary GIGI file is included in the main uncompressed directory.
New changes in v1.02:
- warn user in the case when the Inheritance Vector file is empty.
e.g. in trios
rationale: Since we cannot infer recombination in trios, gl_auto generates an empty inheritance output.
This is normal and is correctly stated in the pedigree meiosis file. GIGI will still run, but
GIGI will impute only based on the pedigree structure and minor allele frequencies.
Hence, Linkage Disequilibrium-based method can potentially be more powerful than GIGI for Trios.
- include the perl script extractPedMeiosis.pl in the program to extract the pedigree meiosis file from gl_auto's output.
- expand the FAQ section in the documentation file
New changes in v1.01