#!/usr/bin/perl use Cwd; use File::Spec::Functions qw(rel2abs); use File::Basename; use Scalar::Util qw(looks_like_number); use File::Copy; use List::Util 'shuffle'; use List::Util qw(sum); # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # VEGAS $version = "2.01.17"; # VEGAS2 version 2 and Month and Year of last major modification ###--- BEGIN FRONT MATTER ---### print "\n##############################################################\n"; print "# #\n"; print "# VEGAS2: Versatile Gene-based Association Study 2 $version #\n"; print "# #\n"; print "##############################################################\n\n"; print "This version was developed by Dr. Aniket Mishra and Asso. Prof. Stuart Macgregor.\n"; print 'If you have any issues/queries or suggestions please send an email to aniket.mishra@qimrbeghofer.edu.au'; my $USAGE = <<'END_USAGE'; Usage: perl vegas2v2 -G [-snpandp] [-custom] [-glist] [-upper] 0 [-lower] 0 [-chr] 1..23 [-out] [-genelist] [-max] 10000000 [-bestsnp/-top 1..100] perl vegas2v2 -P [-geneandp] [-geneandpath] [-maxsample] 1000000 Options: -G/-P : The first argument should be -G or -P to perform gene-based or pathway-based analysis respectively. -snpandp : Immediately after -G argument and space provide -snpandp option and the name of text file containing snp id and p value. -custom : Provide the plink binary format genotype files to compute pairwise LD between variants -glist : Provide human genome annotated gene location file. You can download it from NCBI, UCSC databases. We also provide hg19 gene annotation file on VEGAS2 webpage. -upper : Provide the 5' UTR end gene-boundary. -lower : Provide the 3' UTR end gene-boundary. -chr : Provide the chromosome number to perform analysis on, by default it work on all autosomes. -out : Provide outfile name. -genelist : Provide a file with list of gene Symbols to perform test on. -max : Provide the maximum number of simulation to perform. Must be above 1e6. -bestsnp : To perform best SNP test. -top : To perform top percentage test. Imediately after space provide a number between 1 to 100 to perfom top percentage test. -geneandp : Provide the text file containing gene id and p-value respectively. -geneandpath : Provide gene-pathway annotation file. -maxsample : Provide the maximum number of resamples to perform. Must be above 1e5. END_USAGE $dir = dirname(rel2abs($0)); # By defualt VEGAS performs -top 100 test $percentage = "1"; $outfile = "gene-basedoutput.out"; # Options for($i = 1; $i <= $#ARGV; ++$i){ $field = $ARGV[$i]; if(length($field)>1){ if($field eq "-snpandp"){ # read input for gene-based analysis $snpandp = "$ARGV[$i+1]"; } #For pathway analysis if($field eq "-geneandp"){ $geneandp = "$ARGV[$i+1]"; } if($field eq "-geneandpath"){ $geneandpath = "$ARGV[$i+1]"; } if($field eq "-maxsample"){ # Specify max number of sims (default is 1000000) $maxsample = "$ARGV[$i+1]"; if($maxsample < 1000000){ print "$USAGE\n"; die "Error: maximum number of -maxsample must be greater than 100000\n"; } } #For pathway analysis finished #Options for gene-based analysis if($field eq "-top"){ # Top X test $topten = "$field"; $perc = $ARGV[$i+1]; if($perc > 100 || $perc < 0){ print "$USAGE\n"; die "Error: Percentage must be between 0 and 100\n"; } $percentage = $perc/100; } if($field eq "-topsnp"){ #Do TopSNP test $topsnp = 1; } if($field eq "-chr"){ # Do test for specific chromosome $dochr = "$ARGV[$i+1]"; if ($dochr > 24 || $dochr < 1){ print "$USAGE\n"; die "Error: -chr must be a number between 1 and 22\n"; } } if($field eq "-out"){ # Specify outfile $outfile = "$ARGV[$i+1]"; } if($field eq "-genelist"){ # Do test for list of genes $list = "$ARGV[$i+1]"; if(!-e "$list"){ print "$USAGE\n"; die "Error: $list not found\n"; } open(LIST, "$list"); @genelist = ; close LIST; } if($field eq "-max"){ # Specify max number of sims (default is 1000000) $max = "$ARGV[$i+1]"; if($max < 1000000){ print "$USAGE\n"; die "Error: maximum number of sims must be greater than 1000000\n"; } } if($field eq "-keeptimestamp"){ # Do not delete time_stamp folder $keeptimestamp = 1; } if($field eq "-upper"){ # Upper boundary (use with -custom) $upper = "$ARGV[$i+1]"; } if($field eq "-lower"){ # Lower boundary (use with -custom) $lower = "$ARGV[$i+1]"; } if($field eq "-custom"){ # Use custom individual genotypes - do not confuse with previous -custom (replaced with -pop) $custom = "$ARGV[$i+1]"; } if($field eq "-glist"){ #gene location file hg19/hg18 etc $glist = "$ARGV[$i+1]"; } # For now just keep it here if($field eq "-prune"){ #gene location file hg19/hg18 etc $prunersquare = $ARGV[$i+1]; if($prunersquare > 0.9999 || $prunersquare < 0.001){ die "Error: Prune r2 parameter must be more than 0.001 and less than 0.9999\n"; } } } } # Make working directory my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time) ; $time_stamp = $DayOfYear.$Hour.$Minute.$Second.$outfile.$dochr; mkdir "$time_stamp", 0777 or die "Error: cannot make working directory. Check that vegas has correct permission settings: $!"; print "\n\nCreating working directory: $time_stamp\n"; # If -G check all required arguments are available if ( $ARGV[0] eq "-G" ) { print "Performing gene-based analysis\n"; print "Options in effect: \n"; # Check if snpandp file exist and copy to working directory if( !-e "$snpandp" ) { print "$USAGE\n"; die "Error: snpandp file not found\n"; } # copy( "$snpandp", "$time_stamp/snpandp" ) or die "Copy failed: $!"; system("sort -k1,1 $snpandp >$time_stamp/snpandp"); print "\t-snpandp $snpandp\n"; # Check if glist file exist and copy to working directory if( !-e "$glist" ) { print "$USAGE\n"; die "Error: glist file not found. Please provide gene annotation file\n"; } copy( "$glist", "$time_stamp/tempglist" ) or die "Copy failed: $!"; print "\t-glist $glist\n"; # 3 Check if genotype file exist if( defined( $custom ) ) { if( !-e "$custom.bed" ) { print "$USAGE\n"; die "Error: $custom.bed not found\n"; } if( !-e "$custom.bim" ) { print "$USAGE\n"; die "Error: $custom.bim not found\n"; } if( !-e "$custom.fam" ) { print "$USAGE\n"; die "Error: $custom.fam not found\n"; } # copy("$custom.bed","$time_stamp/custom.bed") or die "Copy failed: $!"; # copy("$custom.bim","$time_stamp/custom.bim") or die "Copy failed: $!"; # copy("$custom.fam","$time_stamp/custom.fam") or die "Copy failed: $!"; } else { print "$USAGE\n"; die "Error: please provide reference genotype files to perform gene-based test\n"; } print "\t-custom $custom\n"; if( defined( $lower ) ) { print "\t-lower $lower\n"; } else { $lower = "0"; print "\t-lower $lower\n"; } if( defined( $upper ) ) { print "\t-lower $upper\n"; } else { $upper = "0"; print "\t-upper $upper\n"; } if( defined($prunersquare) ) { print "\t-prune $prunersquare\n"; } if( defined($topsnp) ) { print "\t-topsnp\n"; } else { $perc = $percentage * 100; print "\t-top $perc\n"; } if( defined($dochr) ) { print "\t-chr $dochr\n"; } if( defined($list) ) { print "\t-genelist $list\n"; if(!-e "$list"){ print "$USAGE\n"; die "Error: genelist file not found\n"; } copy("$list","$time_stamp/customgenelist") or die "Copy failed: $!"; } if( defined($max)) { print "\t-max $max\n"; } if( defined($keeptimestamp) ) { print "\t-keeptimestamp\n"; } if( defined($outfile) ) { print "\t-out $outfile"; } # Other validations if(defined($topten) and defined($topsnp)){ print "$USAGE\n"; die "Error: Cannont do -topten and -topsnp test concurrently\n"; } if (defined($list) and defined($dochr)){ print "$USAGE\n"; die "Error: Cannot do -genelist and -chr test concurrently\n"; } } elsif( $ARGV[0] eq "-P" ) { print "Performing pathway-based analysis\n"; print "Options in effect: \n"; # Check if geneandp file exist and copy to working directory if( !-e "$geneandp" ) { print "$USAGE\n"; die "Error: -geneandp file not found\n"; } # copy( "$geneandp", "$time_stamp/geneandp" ) or die "Copy failed: $!"; system("sort -k1,1 $geneandp >$time_stamp/geneandp"); # first column should be a gene id print "\t-geneandp $geneandp\n"; # Check if glist file exist and copy to working directory if( !-e "$glist" ) { print "$USAGE\n"; die "Error: -glist file not found. Please provide gene annotation file\n"; } # copy( "$glist", "$time_stamp/tempglist" ) or die "Copy failed: $!"; system("sort -k4,4 $glist >$time_stamp/tempglist"); print "\t-glist $glist\n"; # Check if geneandpath file exist and copy to working directory if( !-e "$geneandpath" ) { print "$USAGE\n"; die "Error: -geneandpath file not found\n"; } # copy( "$geneandpath", "$time_stamp/geneandpath" ) or die "Copy failed: $!"; system("sort -k1,1 $geneandpath >$time_stamp/geneandpath"); # Make sure the first column is gene and second is pathway print "\t-geneandpath $geneandpath\n"; if( defined($outfile) ) { print "\t-out $outfile"; } if( defined($maxsample)) { print "\t-maxsample $maxsample\n"; } } else { print "$USAGE\n"; die "Error: First argument must be either -G or -P to perform gene-based test or pathway-based test respectively\n"; } ###--- END FRONT MATTER ---### ###--- BEGIN FILE VALIDATION and ANALYSIS ---### #Chnging to working directory print "\nChanging working directory..."; chdir "$time_stamp" or die "Error: cannot change to working directory: $!"; print "done\n"; # Check that R and plink exists &checkplinkandr; # Check that mvtnorm and corpcor exists &checkpackages; #Validating input files and running analysis if( $ARGV[0] eq "-G" ) { &testingsnpandp; &testingglist; #Now mapping input variants to genotype file system("awk '{print $1}' snpandp >snpIPlist"); $input_variant_count = `wc -l /dev/null"); $input_variant_merged_with_genofile_count = `wc -l /dev/null"); # Now make allgene files in geneset directory print "\nWritting allgene files..."; mkdir "geneset", 0777 or die "Error: cannot make working directory. Check that vegas has correct permission settings: $!"; for my $chromosome (1..24) { open(ALLGENES, ">geneset/allgene$chromosome"); foreach ( @hglist ) { chomp($_); @each_gene = split(/\s/,$_); $given_chromosome = @each_gene[0]; $given_symbol = @each_gene[3]; if($given_chromosome == $chromosome) { print ALLGENES "$given_symbol\n"; } } close ALLGENES; } # Identify genes with more than 0 variants in it print "done.\n"; # Do test # Make one more file for genes with p-value less than 2e-6 reporting all clumps at r2 0.20 if(defined($topten)){ system("echo \"Chr Gene nSNPs nSims Start Stop Test Pvalue Top-$percentage-pvalue Best-SNP SNP-pvalue\" > gene-basedoutput.out"); } if(defined($topsnp)){ system("echo \"Chr Gene nSNPs nSims Start Stop Test Pvalue Top-SNP-pvalue Best-SNP SNP-pvalue\" > gene-basedoutput.out"); } unless(defined($topsnp) || defined($topten)){ # Define vanilla test system("echo \"Chr Gene nSNPs nSims Start Stop Test Pvalue Best-SNP SNP-pvalue\" > gene-basedoutput.out"); } if(defined($dochr)){ &docustomchr; } if(defined($list)){ &docustomlist; } if(!defined($list) and !defined($dochr)){ &customtest; } if(defined($clustertest)){ &clustertest; } if(defined($outfile)){ system("cp gene-basedoutput.out ../$outfile.out"); } else{ system("cp gene-basedoutput.out ../gene-basedoutput.out"); } } elsif( $ARGV[0] eq "-P" ) { &testinggeneandp; &testingglist; print "Merging geneandp, geneandpath and glist files..."; system("join -1 1 -2 1 geneandp geneandpath >genePandpath"); system("join -1 1 -2 4 genePandpath tempglist >genePpathandlocTEMP"); system("awk '{print \$0,\$4 * 1e9 + \$5, \$4 * 1e9 + \$6}' genePpathandlocTEMP >genePpathandloc"); print "done\n"; # Report statistics for compititive pathway analysis print "Following are the statistics for compititive pathway analysis:\n"; system("awk '{print \$3}' genePpathandloc | sort -u >pathwaylist"); $pathway_count = `wc -l gandpforresampling"); $gene_count_forsampling = `wc -l pathway-basedoutput.out"); &defaultpathwaytest; if(defined($outfile)){ system("cp pathway-basedoutput.out ../$outfile.out"); } else{ system("cp pathway-basedoutput.out ../pathway-basedoutput.out"); } # if( defined( pathanalysisforomicsdata ) ) { # Develop method for pathway analysis of omics data analysis # } # else { # be defualt run pathway analysis for GWAS data # system("echo \"Pathway nGenesMapped nGenesUsed nSamples ObservedP empiricalP Genes\" > pathway-basedoutput.out"); # } # &testinggeneandpath; # Make these two sub routines } # Clean up files chdir "../" or die "Error: Cannot change directory: $!"; unless(defined($keeptimestamp)){ system("rm -R $time_stamp"); } #rmdir $time_stamp or warn "cannot remove $time_stamp\n"; print "\nThank you for using VEGAS2 version 2!\n"; ###--- END GENEBASED TEST ---### ###--- START SUBROUTINES ---### # Now make a file for random sampling sub makepforsampling { $vanillatest=1; system("R --vanilla --slave <; close(ALLPATHWAYS); foreach $pathway (@allpathways) { chomp($pathway); system("grep -w \"$pathway\" genePpathandloc > genesinPath.position"); # Test only those pathways with length more than 5 genes in it $pathway_length = `wc -l genesinPath.position`; if($pathway_length < 5){next;} system("sort -gk7,7 genesinPath.position >genesinPath.positionsorted"); # Drpping all but one gene from cluster of neiboughring genes system(" R --vanilla --slave < 1){ ObservedX2 <- sum(qchisq(PATHWAY_DATA3[,2],1,lower.tail=F)) ObservedP <- pchisq(ObservedX2, Neighbour_Filter_Length, lower.tail=F) ## VARIABLE 5 Gene_List <- paste(PATHWAY_DATA3[,1],collapse='_') ## VARIABLE 7 Temp_Pathway <- data.frame(Pathway_Name,Original_Length,Neighbour_Filter_Length,ObservedX2,ObservedP,Gene_List) write.table(Temp_Pathway,'temppathway',row.names=F,col.names=F,quote=F) } EOF"); # Now compute empirical p-value open(WORKINGPATHWAY, "temppathway") or die "Error: Cannot find temppathway\n"; @workingpathway = ; close(WORKINGPATHWAY); open(X2FORSAMPLING, "x2forsampling") or die "Error: Cannot find P_for_Sampling\n"; @x2forsampling = ; close(X2FORSAMPLING); foreach ( @workingpathway ) { chomp($_); @each_pathway_data = split(/\s/,$_); $pathway_name = @each_pathway_data[0]; $pathway_original_length = @each_pathway_data[1]; $nogenes_to_resample = @each_pathway_data[2]; $observedX2 = @each_pathway_data[3]; $pathway_observedP = @each_pathway_data[4]; $pathway_listofgenes = @each_pathway_data[5]; print "$pathway_name $pathway_original_length $nogenes_to_resample "; die "Too few elements (".scalar(@x2forsampling).") to select $nogenes_to_resample from\n" unless $nogenes_to_resample < @x2forsampling; # Shuffled list of indexes into @deck # Using empP formula 1+r/1+n $Number_of_reps_above_observed = 1; $number_of_shuffle = 1000; for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); if($empiricalP < 0.01){ $number_of_shuffle = 10000; for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); } if($empiricalP < 0.001){ $number_of_shuffle = 50000; for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); } if($empiricalP < 0.0005){ $number_of_shuffle = 100000; for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); } if($empiricalP < 0.0001){ $number_of_shuffle = 500000; for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); } if($empiricalP < 0.00002){ $number_of_shuffle = 1000000; if(defined($maxsample)){ $number_of_shuffle = $maxsample; } for my $shuffleCOUNT (1 .. $number_of_shuffle) { @shuffled_indexes = shuffle(0..$#x2forsampling); # Get just N of them. @pick_indexes = @shuffled_indexes[ 0 .. $nogenes_to_resample - 1 ]; # Pick cards from array @picks = @x2forsampling[ @pick_indexes ]; if(sum(@picks) >= $observedX2 ){ $Number_of_reps_above_observed = $Number_of_reps_above_observed + 1; } } $empiricalP = $Number_of_reps_above_observed / (1 + $number_of_shuffle); } # Now print results #Majority of variables defined at start of foreach loop open(RESULTSPATHWAY, ">>pathway-basedoutput.out"); print RESULTSPATHWAY "$pathway_name $pathway_original_length $nogenes_to_resample $number_of_shuffle $pathway_observedP $empiricalP $pathway_listofgenes\n"; print "$number_of_shuffle $pathway_observedP $empiricalP $pathway_listofgenes\n"; } # Foreach @workingpathway } # foreach $pathway print "\ndone\n"; } # sub defaultpathwaytest sub customtest{ # VEGAS using custom set of SNPs - default print "\nReading custom genotypes...done\n\n"; unless(defined($topsnp) || defined($topten)){ # Define vanilla test $vanillatest=1; } for ($chr = 1; $chr <= 22; $chr++) { #for ($chr=22){ # system("plink --bfile ../$custom --chr $chr --make-bed --out custom$chr --noweb --silent > /dev/null"); open(ALLGENES, "geneset/allgene$chr") or die "Error: Cannot find geneset/allgene$chr\n"; @allgenes = ; close(ALLGENES); foreach $gene (@allgenes){ if(-e "tempgene.snp"){system("rm tempgene.snp");} if(-e "tempgene.pvalue"){system("rm tempgene.pvalue");} if(-e "gene.position"){system("rm gene.position");} chomp($gene); system("grep -w \"$gene\" tempglist|head -1 > gene.position"); @glistpos = grep(/\b$gene\b/, @hglist); chomp(@glistpos); @glistpos = split(/ /,@glistpos[0]); if(!defined($lower)){ $start = @glistpos[1]-50000; } else{ $start = @glistpos[1]-$lower; } if(!defined($upper)){ $stop = @glistpos[2]+50000; } else{ $stop = @glistpos[2]+$upper; } # system("plink --bfile custom$chr --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); # It would be super if you could do these three lines with one PLINK command system("plink --bfile $custom --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); if(-e "plink.snplist"){ system("mv plink.snplist tempgene.snp"); } open(SNPLIST,"tempgene.snp"); @snplist = ; close(SNPLIST); if(scalar(@snplist == 0)){next;} # foreach $snp(@snplist){ # chomp($snp); # system("grep -w \"$snp\" snpandp >> tempgene.pvalue"); # } system("sort -k1,1 tempgene.snp >tempgene.snp2"); system("join -1 1 -2 1 tempgene.snp2 snpandp >tempgene.pvalue"); # Send to R to do make into tempgene format, then send to &hapsmims? - files prepared: tempgene.pvalue plink.ld gene.position &maketempgener; &hapsims; # system("rm custom$chr.bed custom$chr.bim custom$chr.fam"); } } } sub docustomchr { # VEGAS using custom set of SNPs - chromosome print "\nReading custom genotypes...done\n\n"; unless(defined($topsnp) || defined($topten)){ # Define vanilla test $vanillatest=1; } #for ($chr = 1; $chr <= 22; $chr++) { for ($chr=$dochr){ # system("plink --bfile ../$custom --chr $chr --make-bed --out custom$chr --noweb --silent > /dev/null"); open(ALLGENES, "geneset/allgene$chr"); @allgenes = ; close(ALLGENES); foreach $gene (@allgenes){ if(-e "tempgene.snp"){system("rm tempgene.snp");} if(-e "tempgene.pvalue"){system("rm tempgene.pvalue");} if(-e "gene.position"){system("rm gene.position");} chomp($gene); system("grep -w \"$gene\" tempglist|head -1 > gene.position"); @glistpos = grep(/\b$gene\b/, @hglist); chomp(@glistpos); @glistpos = split(/ /,@glistpos[0]); if(!defined($lower)){ $start = @glistpos[1]-50000; } else{ $start = @glistpos[1]-$lower; } if(!defined($upper)){ $stop = @glistpos[2]+50000; } else{ $stop = @glistpos[2]+$upper; } # system("plink --bfile custom$chr --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); # It would be super if you could do these three lines with one PLINK command system("plink --bfile $custom --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); if(-e "plink.snplist"){system("mv plink.snplist tempgene.snp");} open(SNPLIST,"tempgene.snp"); @snplist = ; close(SNPLIST); if(scalar(@snplist == 0)){next;} # foreach $snp(@snplist){ # chomp($snp); # system("grep -w \"$snp\" snpandp >> tempgene.pvalue"); # } system("sort -k1,1 tempgene.snp >tempgene.snp2"); system("join -1 1 -2 1 tempgene.snp2 snpandp >tempgene.pvalue"); # Send to R to do make into tempgene format, then send to &hapsmims? - files prepared: tempgene.pvalue plink.ld gene.position &maketempgener; &hapsims; } } } sub docustomlist { # VEGAS using custom set of SNPs - genelist print "\nReading custom genotypes...done\n\n"; unless(defined($topsnp) || defined($topten)){ # Define vanilla test $vanillatest=1; } open(HG18, "tempglist"); @hg18 = ; close HG18; open(GENELIST, "customgenelist"); @genelist = ; close GENELIST; # Read in genelist and match to chromosomes foreach $genelist (@genelist){ chomp($genelist); $genelist =~ s/^\s+//; $genelist =~ s/\s+$//; foreach $hg18 (@hg18){ chomp($hg18); $hg18 =~ s/^\s+//; $hg18 =~ s/\s+$//; @hglist = split(/ /,$hg18); if($genelist eq @hglist[3]){ push(@dochroms,"@hglist[0]\n"); push(@dogenes,"@hglist[3]\n"); open(WRITECHR, ">>allgene@hglist[0]"); print WRITECHR "$genelist\n"; close WRITECHR; } } } # Extract unique chromosomes %seen = (); foreach $item (@dochroms){ push(@uniqchr, $item) unless $seen{$item}++; } foreach $chr(@uniqchr) { #for ($chr=22){ chomp($chr); system("plink --bfile $custom --chr $chr --make-bed --out custom$chr --noweb --silent > /dev/null"); open(ALLGENES, "allgene$chr"); @allgenes = ; close(ALLGENES); foreach $gene (@allgenes){ if(-e "tempgene.snp"){system("rm tempgene.snp");} if(-e "tempgene.pvalue"){system("rm tempgene.pvalue");} if(-e "gene.position"){system("rm gene.position");} chomp($gene); system("grep -w \"$gene\" tempglist|head -1 > gene.position"); @glistpos = grep(/\b$gene\b/, @hg18); chomp(@glistpos); @glistpos = split(/ /,@glistpos[0]); if(!defined($lower)){ $start = @glistpos[1]-50000; } else{ $start = @glistpos[1]-$lower; } if(!defined($upper)){ $stop = @glistpos[2]+50000; } else{ $stop = @glistpos[2]+$upper; } system("plink --bfile custom$chr --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); # system("plink --bfile custom --chr $chr --from-bp $start --to-bp $stop --write-snplist --noweb --silent > /dev/null"); if(-e "plink.snplist"){system("mv plink.snplist tempgene.snp");} # Check if extra variants are provided add the same to docustomchr and customtest open(SNPLIST,"tempgene.snp"); @snplist = ; close(SNPLIST); if(scalar(@snplist == 0)){next;} system("sort -k1,1 tempgene.snp >tempgene.snp2"); system("join -1 1 -2 1 tempgene.snp2 snpandp >tempgene.pvalue"); # Send to R to do make into tempgene format, then send to &hapsmims? - files prepared: tempgene.pvalue plink.ld gene.position &maketempgener; # Check the line count after removing duplicate SNPs $line_count = `wc -l 1 ) { &hapsims; } else{ next; } } } } sub maketempgener{ unless(-z "tempgene.pvalue"){ if(defined($prunersquare)){ $line_count = `wc -l 1 ) { # system("awk '{print \$1}' tempgene.pvalue >prune.input"); # system("awk 'BEGIN{print \"SNP P\"}1' tempgene.pvalue >clump.input"); system("echo 'SNP P' >clump.header"); system("cat clump.header tempgene.pvalue >clump.input"); # system("plink --bfile custom$chr --extract prune.input --indep-pairwise 100 5 $prunersquare --noweb --silent"); # Clump would work better than the prune sice it will choose the variant based on the p-values insted of randomly # system("plink --bfile custom$chr --clump clump.input --clump-p1 1 --clump-p2 1 --clump-r2 0.99 --clump-kb 1000 --noweb --silent"); # system("plink --bfile ../$custom --clump clump.input --clump-p1 1 --clump-p2 1 --clump-r2 0.99 --clump-kb 1000 --noweb --silent > /dev/null"); system("plink --bfile custom --clump clump.input --clump-p1 1 --clump-p2 1 --clump-r2 0.99 --clump-kb 1000 --noweb --silent > /dev/null"); # system("sort -k1,1 plink.prune.in >plink.prune.in2"); # system("sort -k1,1 tempgene.pvalue >tempgene.pvalue2"); # system("join -1 1 -2 1 plink.prune.in2 tempgene.pvalue2 >tempgene.pvalue"); system("awk 'NR>1{print \$3,\$5}' plink.clumped >tempgene.pvalue"); } } system(" R --vanilla --slave < tempgene.snp"); } # if provided summary concatanate tempgene files } sub hapsims { if (-e "needmoresims"){system("rm needmoresims");} if (-e "correctedp.txt"){system("rm correctedp.txt");} if (-e "plink.ld"){system("rm plink.ld");} if (-e "do100000sims"){system("rm do100000sims");} if (-e "alwayswritep"){system("rm alwayswritep");} &plink; # 1000 sims if(defined($vanillatest)){ &mvsimsr(1000); } if(defined($topten)){ &mvsimstoptenr(1000); } if(defined($topsnp)){ &mvsimstopsnpr(1000); } if (-e "correctedp.txt"){ # If 1000 sims is enough open(CORRECTEDP, "correctedp.txt"); $correctedp = ; open(PRINTCORRECTEDP, ">>gene-basedoutput.out"); print PRINTCORRECTEDP "$correctedp"; print "$correctedp"; } if (-e "do100000sims"){ # Do 100000 sims if(defined($vanillatest)){ &mvsimsr(100000); } if(defined($topten)){ &mvsimstoptenr(100000); } if(defined($topsnp)){ &mvsimstopsnpr(100000); } if(-e "correctedp.txt"){ # If p>0.1 (not likely) write output open(CORRECTEDP, "correctedp.txt"); $correctedp = ; open(PRINTCORRECTEDP, ">>gene-basedoutput.out"); print PRINTCORRECTEDP "$correctedp"; print "$correctedp"; } if(-e "do100000sims"){ # If 0.001 < p < 0.1, write output open(CORRECTEDP, "do100000sims"); $correctedp = ; open(PRINTCORRECTEDP, ">>gene-basedoutput.out"); print PRINTCORRECTEDP "$correctedp"; print "$correctedp"; } if(-e "needmoresims"){ #Do max sims, then write output if(defined($max)){ if(defined($vanillatest)){ &mvsimsr($max); } if(defined($topten)){ &mvsimstoptenr($max); } if(defined($topsnp)){ &mvsimstopsnpr($max); } } else{ if(defined($vanillatest)){ &mvsimsr(1000000); } if(defined($topten)){ &mvsimstoptenr(1000000); } if(defined($topsnp)){ &mvsimstopsnpr(1000000); } } open(CORRECTEDP, "alwayswritep"); ## Run clumping for genes with p-value less than 2e-6 reporting all clumps at r2 0.20 $correctedp = ; open(PRINTCORRECTEDP, ">>gene-basedoutput.out"); print PRINTCORRECTEDP "$correctedp"; print "$correctedp"; } } if (-e "needmoresims"){ # Do max sims if(defined($max)){ if(defined($vanillatest)){ &mvsimsr($max); } if(defined($topten)){ &mvsimstoptenr($max); } if(defined($topsnp)){ &mvsimstopsnpr($max); } } else{ if(defined($vanillatest)){ &mvsimsr(1000000); } if(defined($topten)){ &mvsimstoptenr(1000000); } if(defined($topsnp)){ &mvsimstopsnpr(1000000); } } open(CORRECTEDP, "alwayswritep"); $correctedp = ; open(PRINTCORRECTEDP, ">>gene-basedoutput.out"); print PRINTCORRECTEDP "$correctedp"; print "$correctedp"; } } sub mvsimsr{ if (-e "needmoresims"){system("rm needmoresims");} if (-e "correctedp.txt"){system("rm correctedp.txt");} if (-e "do100000sims"){system("rm do100000sims");} if (-e "alwayswritep"){system("rm alwayswritep");} system(" R --vanilla --slave < co #co <- sqrt(co) #head(co) #check that co is positive definite. Make diagonals 1.0001. library(corpcor) reps <- $_[0] if(is.positive.definite(co)==F){ co <- make.positive.definite(co) } if(is.positive.definite(co)==F){ matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co for(i in 1:numsnps){ co[i,i] <- 1.0001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.01 } } library(mvtnorm) rmvnorm(reps,mean=rep(0,numsnps),sigma=co) -> rd rowSums(rd ^2) -> summedsq #write.table(data.frame(summedsq,summedsqtopten),file='geneXsummedsq.txt',row.names=F,quote=F,col.names=F) read.table('tempgene',header=F) -> gene ass <- gene[,6] #ass sum(ass,na.rm=T) -> sumass sum(!is.na(ass)) -> len # get number of snps with non-missing test stat length(ass) -> asslen #scan('geneXsummedsq.txt') -> simvals # read in simulation replicates #head(simvals) (1 + sum(ifelse(sumass > summedsq, 0, 1)))/(1 + length(summedsq)) -> empp #empp snpp <- pchisq(max(ass),1,lower.tail=F) snpname <- gene[which(ass==max(ass)),1][1] if(empp>0.1){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,snpname,snpp),'correctedp.txt',row.names=F,col.names=F) } if(empp<=0.1 & empp>0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,snpname,snpp),'do100000sims',row.names=F,col.names=F) } if(empp<=0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,snpname,snpp),'needmoresims',row.names=F,col.names=F) } write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,snpname,snpp),'alwayswritep',row.names=F,col.names=F) EOF "); } sub mvsimstoptenr{ if (-e "needmoresims"){system("rm needmoresims");} if (-e "correctedp.txt"){system("rm correctedp.txt");} if (-e "do100000sims"){system("rm do100000sims");} if (-e "alwayswritep"){system("rm alwayswritep");} if(-e "tempempp"){system("rm tempempp")} if(-e "temprd2"){system("rm temprd2")} if(-e "temprd-sorted"){system("rm temprd-sorted")} if($_[0] == 1000){ system("R --vanilla --slave < numsnps matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co #co <- sqrt(co) #head(co) #check that co is positive definite. Make diagonals 1.0001. library(corpcor) reps <- $_[0] if(is.positive.definite(co)==F){ co <- make.positive.definite(co) } if(is.positive.definite(co)==F){ matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co for(i in 1:numsnps){ co[i,i] <- 1.0001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.01 } } library(mvtnorm) rmvnorm(reps,mean=rep(0,numsnps),sigma=co) -> rd rowSums(rd ^2) -> summedsq percentage <- $percentage #Old bug in all previous version highlited by Julian Hecker et al #topten <- function(x){ # sum(sort(x,decreasing=T)[1:(1+length(diag(co))*percentage)]^2) #} # Update in version 3 topten <- function(x){ sum(sort(x^2,decreasing=T)[1:(1+length(diag(co))*percentage)]) } apply(rd,1,topten) -> summedsqtopten read.table('tempgene',header=F) -> gene ass <- gene[,6] #ass sum(ass,na.rm=T) -> sumass sum(!is.na(ass)) -> len # get number of snps with non-missing test stat length(ass) -> asslen asstopten <- sort(ass,decreasing=T)[1:(1+length(diag(co))*percentage)] sumasstopten <- sum(asstopten) (1 + sum(ifelse(sumasstopten > summedsqtopten, 0, 1)))/(1 + length(summedsqtopten)) -> empptopten #empptopten (1 + sum(ifelse(sumass > summedsq, 0, 1)))/(1 + length(summedsq)) -> empp #empp snpp <- pchisq(max(ass),1,lower.tail=F) snpname <- gene[which(ass==max(ass)),1][1] if(empptopten >= 0.1){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'correctedp.txt',row.names=F,col.names=F) } #if(empp < 0.1 & empp > 0.001){ #write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) #} if(empptopten < 0.1 & empptopten > 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) } if(empptopten <= 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'needmoresims',row.names=F,col.names=F) } write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'alwayswritep',row.names=F,col.names=F) EOF"); } if($_[0] > 1000){ # Do 1e5 simulations and topten sort etc, save to disk, then repeat until reach 1e6 (or $max) my $reps = int($_[0]/100000); for($i = 1; $i <= $reps; $i++){ system("R --vanilla --slave < co library(corpcor) if(is.positive.definite(co)==F){ co <- make.positive.definite(co) } if(is.positive.definite(co)==F){ matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co for(i in 1:numsnps){ co[i,i] <- 1.0001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.01 } } library(mvtnorm) rmvnorm(reps,mean=rep(0,numsnps),sigma=co) -> rd rowSums(rd ^2) -> summedsq #Old #topten <- function(x){ # sum(sort(x,decreasing=T)[1:(1+length(diag(co))*percentage)]^2) #} #Update in version 3 topten <- function(x){ sum(sort(x^2,decreasing=T)[1:(1+length(diag(co))*percentage)]) } apply(rd,1,topten) -> summedsqtopten write.table(summedsq,'vanilla',append=T,col.names=F,row.names=F,quote=F) write.table(summedsqtopten,'topten',append=T,col.names=F,row.names=F,quote=F) EOF") } system("R --vanilla --slave < gene summedsqtopten <- scan('topten') summedsq <- scan('vanilla') percentage <- $percentage reps <- $_[0] ass <- gene[,6] #ass sum(ass,na.rm=T) -> sumass sum(!is.na(ass)) -> len # get number of snps with non-missing test stat length(ass) -> asslen asstopten <- sort(ass,decreasing=T)[1:(1+length(gene[,1])*percentage)] sumasstopten <- sum(asstopten) (1 + sum(ifelse(sumasstopten > summedsqtopten, 0, 1)))/(1 + length(summedsqtopten)) -> empptopten #empptopten (1 + sum(ifelse(sumass > summedsq, 0, 1)))/(1 + length(summedsq)) -> empp #empp snpp <- pchisq(max(ass),1,lower.tail=F) snpname <- gene[which(ass==max(ass)),1][1] if(empptopten >= 0.1){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'correctedp.txt',row.names=F,col.names=F) } #if(empp < 0.1 & empp > 0){ #write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) #} if(empptopten < 0.1 & empptopten > 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) } if(empptopten < 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'needmoresims',row.names=F,col.names=F) } write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'alwayswritep',row.names=F,col.names=F) EOF"); system("rm vanilla topten"); } } sub mvsimstopsnpr{ if (-e "needmoresims"){system("rm needmoresims");} if (-e "correctedp.txt"){system("rm correctedp.txt");} if (-e "do100000sims"){system("rm do100000sims");} if (-e "alwayswritep"){system("rm alwayswritep");} if(-e "tempempp"){system("rm tempempp")} if(-e "temprd2"){system("rm temprd2")} if(-e "temprd-sorted"){system("rm temprd-sorted")} system("R --vanilla --slave < numsnps matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co #check that co is positive definite. Make diagonals 1.0001. library(corpcor) reps <- $_[0] if(is.positive.definite(co)==F){ co <- make.positive.definite(co) } if(is.positive.definite(co)==F){ matrix(scan('plink.ld',quiet=T),nc=numsnps) -> co for(i in 1:numsnps){ co[i,i] <- 1.0001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.001 } } if(is.positive.definite(co)==F){ for(i in 1:numsnps){ co[i,i] <- 1.01 } } library(mvtnorm) rmvnorm(reps,mean=rep(0,numsnps),sigma=co) -> rd rowSums(rd ^2) -> summedsq percentage <- $percentage topsnps <- function(x){ max(x^2) -> topten } apply(rd,1,topsnps) -> summedsqtopten read.table('tempgene',header=F) -> gene ass <- gene[,6] sum(ass,na.rm=T) -> sumass sum(!is.na(ass)) -> len # get number of snps with non-missing test stat length(ass) -> asslen sumasstopten <- max(ass) (1 + sum(ifelse(sumasstopten > summedsqtopten, 0, 1)))/(1 + length(summedsqtopten)) -> empptopten (1 + sum(ifelse(sumass > summedsq, 0, 1)))/(1 + length(summedsq)) -> empp snpp <- pchisq(max(ass),1,lower.tail=F) snpname <- gene[which(ass==max(ass)),1][1] if(empptopten >= 0.1){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'correctedp.txt',row.names=F,col.names=F) } # if(empp < 0.1 & empp > 0.001){ # write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) # } if(empptopten < 0.1 & empptopten > 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'do100000sims',row.names=F,col.names=F) } if(empptopten <= 0.001){ write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'needmoresims',row.names=F,col.names=F) } write.table(data.frame('$chr','$gene',asslen,reps,unique(gene[4]),unique(gene[5]),sumass,empp,empptopten,snpname,snpp),'alwayswritep',row.names=F,col.names=F) EOF"); } sub sort_by_number{ $b <=> $a } sub plink{ # Use plink to generate ld matrix, then check for errors # system("plink --bfile custom$chr --extract tempgene.snp --matrix --r --noweb --silent > /dev/null"); # system("plink --bfile ../$custom --extract tempgene.snp --matrix --r --noweb --silent > /dev/null"); system("plink --bfile custom --extract tempgene.snp --matrix --r --noweb --silent > /dev/null"); open(PLINKLD, "plink.ld"); @plinkld = ; foreach $plinkld(@plinkld){ $plinkld =~ s/nan/0/g; #replace nan's with 0 in ld matrix } if(scalar(@plinkld == 1)){ @plinkld = 1; } open(PRINTPLINKLD, ">plink.ld"); print PRINTPLINKLD "@plinkld"; $numsnps = scalar(@plinkld); close PRINTPLINKLD; close PLINKLD; } sub checkplinkandr { print "\nChecking if plink and R exist..."; system("echo -n \$PATH > binarypath"); open(BINPATH, "binarypath"); $binpath = ; @binpath = split(/:/, $binpath); $Rtimes=0; $plinktimes=0; foreach $i(@binpath){ #print "$i\n"; if(-e "$i/R"){ $Rtimes=$Rtimes+1; } if(-e "$i/r"){ $Rtimes=$Rtimes+1; } if(-e "$i/plink"){ $plinktimes=$plinktimes+1; } } if($Rtimes==0){ chdir "../" or die "cannot change directory: $!"; system("rm -R $time_stamp"); die "Error: Cannot find R - make sure it can be accessed through \$PATH \n$!\n"; } if($plinktimes==0){ chdir "../" or die "cannot change directory: $!"; system("rm -R $time_stamp"); die "Error: Cannot find plink - make sure it can be accessed through \$PATH \n$!\n"; } close PATH; system("rm binarypath"); print "done\n"; } sub checkpackages { print "\nChecking if mvtnorm and corpcor R packages exist..."; system(" R --vanilla --slave <> testpackagelist"); system("grep -w \"corpcor\" R-packagelist >> testpackagelist"); open(TESTPACKAGELIST,"testpackagelist"); my @testpackagelist = ; if(scalar(@testpackagelist ne 2)){ die "Error: Missing R library: VEGAS requires corpcor and mvtnorm to run\n"; } close TESTPACKAGELIST; print "done\n"; } sub testinggeneandp { # Validations # Check pvalues in file are valid - in decimal or standard scientific notation (0.0034, 1e-6 etc...) # Check if p-values are between 0 and 1 print "\nValidating input file..."; open(PVALS, "geneandp"); # Location of GWAS results file @pvals = ; foreach ( @pvals ) { chomp($_); @each_line = split(/\s/,$_); if ( looks_like_number(@each_line[1] ) ) { $checked_linenumber = 0; if ( @each_line[1] < 0 || @each_line[1] > 1 ) { print WRITELOG "Error: Invalid p-values in $user_filename detected.\n P-values should be in between 0 to 1"; print WRITELOG "Please check the column 2 (p-value) for @each_line\n"; print "\nPlease check column 2 (p-value) for @each_line\n"; die "Error: Invalid p-values in $user_filename detected.\n"; } } else { print WRITELOG "Error: Invalid p-values in $user_filename detected.\n"; print WRITELOG "Please check the column 2 (p-value) for @each_line\n"; print "\nPlease check column 2 (p-value) for @each_line\n"; die "Error: Invalid p-values in $user_filename detected.\n"; } } close PVALS; print "done.\n"; } sub testingsnpandp { # Validations # Check pvalues in file are valid - in decimal or standard scientific notation (0.0034, 1e-6 etc...) # Check if p-values are between 0 and 1 print "\nValidating input file..."; open(PVALS, "snpandp"); # Location of GWAS results file @pvals = ; foreach ( @pvals ) { chomp($_); @each_line = split(/\s/,$_); if ( looks_like_number(@each_line[1] ) ) { $checked_linenumber = 0; if ( @each_line[1] < 0 || @each_line[1] > 1 ) { print WRITELOG "Error: Invalid p-values in $user_filename detected.\n P-values should be in between 0 to 1"; print WRITELOG "Please check the column 2 (p-value) for @each_line\n"; print "\nPlease check column 2 (p-value) for @each_line\n"; die "Error: Invalid p-values in $user_filename detected.\n"; } } else { print WRITELOG "Error: Invalid p-values in $user_filename detected.\n"; print WRITELOG "Please check the column 2 (p-value) for @each_line\n"; print "\nPlease check column 2 (p-value) for @each_line\n"; die "Error: Invalid p-values in $user_filename detected.\n"; } } close PVALS; print "done.\n"; } sub testingglist { # Check the glist file print "\nValidating glist file..."; open(HGLIST, "tempglist"); # Location of glist file file @hglist = ; foreach ( @hglist ) { chomp($_); @each_gene = split(/\s/,$_); $no_of_columns = scalar @each_gene; if ($no_of_columns ne 4 ){ die "Error: glist file must contain 4 columns\n"; } $given_chromosome = @each_gene[0]; $given_start = @each_gene[1]; $given_stop = @each_gene[2]; $given_symbol = @each_gene[3]; $given_genesize = $given_stop - $given_start; # Make sure column 1 is a valid chromosome column if ( looks_like_number( $given_chromosome ) ) { $checked_genenumber = 0; if ( $given_chromosome =~ /^\d+$/ ) { #chomosome is a whole number if ( $given_chromosome < 1 || $given_chromosome > 24 ) { print WRITELOG "Error: Invalid chromosomes in $glist detected.\n Column 1 Chromosomes should be in between 1 to 24.\n"; print WRITELOG "Please check the column 1 (chromosome number) for @each_gene\n"; print "\nPlease check column 1 (chromosome number) for @each_gene\n"; die "Error: Invalid chromosome in $glist detected. Column 1 should be a chromsome column with number between 1 to 24.\n"; } } else { print WRITELOG "Error: Invalid chromsome in $glist detected.\n Chromosome should be a whole number.\n"; print WRITELOG "Please check the column 1 (chromosome number) for @each_gene\n"; print "\nPlease check column 1 (chromosome number) for @each_gene\n"; die "Error: Invalid chromosomes in $glist detected.\n Chromosome should be a whole number.\n"; } } else { print WRITELOG "Error: Invalid chromsome in $glist detected.\n Please convert X and Y to 23 and 24 respectively if any.\n"; print WRITELOG "Please check the column 1 (chromosome number) for @each_gene\n"; print "\nPlease check column 1 (chromosome number) for @each_gene\n"; die "Error: Invalid chromosomes in $glist detected.\n Please convert X and Y to 23 and 24 respectively if any.\n"; } # Make sure column 2 and 3 are valid location columns if ( looks_like_number( $given_genesize ) ) { $checked_genenumber = 0; if ( $given_genesize =~ /^\d+$/ ) { #chomosome is a whole number if ( $given_genesize < 1 || $given_genesize > 30000000 ) { print WRITELOG "Error: Invalid location in $glist detected.\n Difference between column 3 (stop) and column 2 (start) should be in between 1 base to 3 Mb.\n"; print WRITELOG "Please check the column 3 (stop) and column 3 (start) for @each_gene\n"; print "\nPlease check column 3 (stop) and column 2 (start) for @each_gene\n"; die "Error: Invalid location in $glist detected. Difference between column 3 (stop) and column 2 (start) should be in between 1 base to 3 Mb.\n"; } } else { print WRITELOG "Error: Invalid location in $glist detected.\n Start and stop values should be a whole number.\n"; print WRITELOG "Please check the column 2 (start) and column 3 (stop) for @each_gene\n"; print "\nPlease check column 2 (start) and column 3 (stop) for @each_gene\n"; die "Error: Invalid location in $glist detected.\n Start and stop values should be a whole number.\n"; } } else { print WRITELOG "Error: Invalid location in $glist detected.\n Make sure locations (columns 2 and 3) are numeric.\n"; print WRITELOG "Please check column 2 (start) and column 3 (stop) for @each_gene\n"; print "\nPlease check column 2 (start) and column 3 (stop) for @each_gene\n"; die "Error: Invalid location in $glist detected.\n Make sure locations (columns 2 and 3) are numeric.\n"; } #Now make a new genelist with upadated upper and lower limit $new_start = $given_start - $lower; if($new_start < 0){$new_start = 1;} $new_stop = $given_stop + $upper; open(PRINTRESULTS, ">>NEWglist"); print PRINTRESULTS "$given_chromosome $new_start $new_stop $given_symbol\n"; } close PRINTRESULTS; close HGLIST; print "done.\n"; } ###--- End subroutines ---###