diff --git a/README b/README index 69e5f060891e6785d68e04a582e3c5323f3d3c2b..1d7c8e5249b8a945276308d1184fe480f36b2129 100644 --- a/README +++ b/README @@ -1,3 +1,21 @@ -1.) Make sure permissions are correct (aka user running the script has access to $project_folder directory) -2.) Make a $project_folder and place the input fastq.gz files in it, or symlinks to them -3.) Run ./Run_Pipeline +1.) Make sure Oracle Java 1.7 is installed, directions here for Ubuntu 14.04 http://askubuntu.com/questions/521145/how-to-install-oracle-java-on-ubuntu-14-04 + This is needed for Mutect (Stage 11) +2.) Make sure bwa and samtools are installed and in your path (or change the location to where they are in the Stage 1 file, on Ubuntu 14.04 they may be + installed with apt-get: + sudo apt-get install bwa + sudo apt-get install samtools +3.) Install the multitude of tools referred to in the Stage 1 file and replace those paths with the paths to those tools and files for things like + reference genomes etc... +4.) Make sure permissions are correct (aka user running the script has access to $project_folder directory) +5.) Make a $project_folder and place the input fastq.gz files in it, or symlinks to them +6.) Run ./Run_Pipeline + +Installing VEP: + +Full API Installation: http://asia.ensembl.org/info/docs/api/api_installation.html +Some additional Perl modules may need to be installed for Bioperl, install anything it complains about missing using CPAN +If you use screen, note that the Perl environment variables set as described to not persist into the screen session without a little more configuration. +Create a file in your home directory called "screenrc" with the conetents "shell -$SHELL" without the quotes. Now screen will load your shell environment +when it starts. +In our case we don't want to have to read out online every time we run VEP, so we need to download some database/cache files as described here: +http://asia.ensembl.org/info/docs/tools/vep/script/vep_cache.html#cache diff --git a/R_Code_to_generate_Gene_List.R b/R_Code_to_generate_Gene_List.R new file mode 100755 index 0000000000000000000000000000000000000000..1ea504f6d28665d7516743e281182a9902562f5c --- /dev/null +++ b/R_Code_to_generate_Gene_List.R @@ -0,0 +1,35 @@ +#GOAL: Get a gene list for use when calculating DepthOfCoverage from GATK +#The gene list is described here: http://gatkforums.broadinstitute.org/gatk/discussion/40/performing-sequence-coverage-analysis +#They describe how you can get a genelist here: http://gatkforums.broadinstitute.org/gatk/discussion/1329/where-can-i-get-a-gene-list-in-refseq-format +#When using the gene list particularly for the DepthOfCoverage it does not need the special format described there (-[arg]:REFSEQ /path/to/refSeq), use it as shown on the 1st page linked +#Make sure to leave the format as "all fields from selected table" +#Unfortunately the processing isn't done, they say: To run with the GATK, contigs other than the standard 1-22,X,Y,MT must be removed, and the file sorted in karyotypic order. +#That is what this R script does. GATK used to provide a Perl script for this but it has not been maintained and is in a broken state. +refseq<- read.delim("/media/Storage/Work/Michelle/Exome_Pipeline/FullTableDumpRefSeq.refSeq",sep="\t",as.is=TRUE,header = FALSE) +dim(refseq) +head(refseq) +refseq[,3] <- factor(refseq[,3], levels = c("chrM", "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY")) +refseq<- refseq[order(refseq[,3],as.numeric(refseq[,5])),] +unique(refseq[,3]) +paste0("chr",c(1:22,"X","Y","M")) +refseq_f <- refseq[refseq[,3] %in% paste0("chr",c(1:22,"X","Y","M")),] +unique(refseq_f[,3]) +#This file inludes genes that have no coding region (cdsStart == cdsEnd) and gives the warning below +write.table(refseq_f,file="/media/Storage/Work/Michelle/Exome_Pipeline/RefSeq_hg19_Proper_Format.GeneList",sep="\t",quote=FALSE,col.names = FALSE,row.names=FALSE) + +#These are the genes that cause the warning: +#WARN 09:02:59,808 Utils - ******************************************************************************** +#WARN 09:02:59,809 Utils - * WARNING: +#WARN 09:02:59,810 Utils - * +#WARN 09:02:59,810 Utils - * RefSeq file contains transcripts with zero coding length. Such +#WARN 09:02:59,810 Utils - * transcripts will be ignored (this warning is printed only once) +#WARN 09:02:59,811 Utils - ******************************************************************************** +#They appear to be non-coding genes? +#https://www.biostars.org/p/132742/ +#https://github.com/broadgsa/gatk/blob/master/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java +#equalTest <- (abs(as.numeric(refseq_f[,7])-as.numeric(refseq_f[,8])) < 0.001) +equalTest <- (abs(as.numeric(refseq_f[,7])-as.numeric(refseq_f[,8])) < 0.001) +write.table(refseq_f[which(equalTest == TRUE),], file = "/media/Storage/Work/Michelle/Exome_Pipeline/RefSeq_hg19_transcripts_with_zero_coding_length.GeneList",sep="\t",quote=FALSE,col.names = FALSE,row.names=FALSE) + +#This file excludes the non-coding genes, since we are looking at exomes we use this +write.table(refseq_f[which(equalTest == FALSE),], file = "/media/Storage/Work/Michelle/Exome_Pipeline/RefSeq_hg19_Proper_Format_Filtered.GeneList",sep="\t",quote=FALSE,col.names = FALSE,row.names=FALSE)