#!/usr/bin/perl -w
use strict;
use Bio::Perl;
use Bio::SeqIO;
use Bio::Seq;
use Data::Dumper;
use Bio::DB::Fasta;
use Statistics::Standard_Normal qw/z_to_pct pct_to_z/;
use Getopt::Long;
use List::Util;
use FindBin qw($Bin);
use Cwd;
use DateTime;
#------------------------------------------------------------------------------
#DEPENDENCIES
#perl-modules (see above)
#if mode = fasta: blastall, formatdb, clustalw;
#if mode = alignment: getORF, hmmer (hmmbuild, hmmsearch);
#for all modi: exonerate, proSplign, bl2seq, fastacmd, scipio;
#linux command line: grep, awk, sed;
# -----------------------------------------------------------------------------
# GLOBALS
#################################################################################################
#Set this variables to your path once and save the script! #
#
#Path to ExonMatchSolver.jar #
my $Bin = ""; #
#
#Path to sensitive HMMER (alignment-mode only): #
my $hmmer_path = ""; #
#
#Path to a scratch with sufficient file space for a translated genome (alignment-mode only) #
my $space = ""; #
#
#################################################################################################
use vars qw ($help $genome_target $genome_query $input $outfolder);
my $core = 2; #Numbers of cores available for the pipeline
my $distance_first = 10000; #Can be defined by the user in dependence of the intron size expected for the first (few) exons.
my $distance_last = 10000; #Can be defined by the user in dependence of the intron size expected for the last (few) exon.
my $intervall = ""; #Option handed over to ExonMatchSolver
my $max_solution = ""; #Option handed over to ExonMatchSolver
my $WGD = "no";
#Set this as default for cross-species search (default in Webscipio)
#my $scipio_options = "--min_identity=60 --max_move_exon=6 --blat_score=15 --blat_identity=54 --region_size=10000 --exhaust_align_size=15000 --blat_params=\"-oneOff=1\"";
my $scipio_options = "--min_identity=60 --max_move_exon=6 --blat_score=15 --blat_identity=54";
my $input2 = "";
my $mode = "fasta";
my $exonerate = "no";
my $length_cutoff = undef; #Minimal amino acid length of TCEs to be included in the homology assignment.
my $models = "";
my $z_cutoff = 3;
my $nonDuplicated = "";
# -----------------------------------------------------------------------------
# OPTIONS
GetOptions (
"i=s" => \$input,
"user=s" => \$input2,
"m=s" => \$models,
"mode=s" => \$mode,
"o=s" => \$outfolder,
"WGD=s" => \$WGD,
"noWGD=s" => \$nonDuplicated,
"query=s" => \$genome_query,
"target=s" => \$genome_target,
"dFirst=i" => \$distance_first,
"dLast=i" => \$distance_last,
"scipio_opt=s" => \$scipio_options,
"l=i" => \$length_cutoff,
"z=f" => \$z_cutoff,
"c=i" => \$core,
"s=s" => \$exonerate,
"int=i" => \$intervall,
"max=i" => \$max_solution,
"help" => \$help,
"h" => \$help);
usage() if ($help || !$input || !$outfolder|| !$genome_target|| !$mode);
# -----------------------------------------------------------------------------
# MAIN
my @loci;
my $strand;
my $start;
my $end;
my $genome_query = $genome_query;
my $genome_target = $genome_target;
my $paralog;
my $exon;
my %missing_scores;
my @numle;
my $e = 1;
my %single_exons;
my $myInput = $input;
my $myInput2 = $input2;
my $myOutFolder = $outfolder;
my @duplicates = ("a", "b");
my @exons;
my @sorted_exons;
#Input: protein sequence or nucleotide sequence from Swiss-Prot/ RefSeq in fasta-format
#Test existence of target and query genome, build blast database if that does not exist
#Create folder with name of choice (-o), copy input file (-i) in this folder
my @nonDuplicated = split (",", $nonDuplicated);
my %NonDuplicated;
foreach my $duplicate(@nonDuplicated){
$NonDuplicated{$duplicate} = 1;
}
die "The target genome \"$genome_target\" does not exist. $!\n"
unless -e $genome_target;
if ($mode eq "fasta"|| $mode eq "user"){
my @buildtarget = "formatdb -o -i $genome_target -p F";
unless (-e "${genome_target}.nhr" && -e "${genome_target}.nin" && -e "${genome_target}.nsd" && -e "${genome_target}.nsi" && -e "${genome_target}.nsq"){
system (@buildtarget) == 0 or die "Formatdb with target genome failed. $!\n";
};
};
die "The folder \"$myOutFolder\" already exists. Please change the name of the outputfolder (-o).\n"
if -e $myOutFolder;
printHeader();
mkdir "$myOutFolder/";
my $paralog_num = `grep '>' $myInput| wc -l`;
my @numbers = (1..$paralog_num);
chdir "$myOutFolder/";
my $dir = getcwd();
mkdir "SearchQuery/";
mkdir "HomologousExons/";
mkdir "SingleExons/";
if ($mode eq "fasta"|| $mode eq "user"){
my $in = new Bio::SeqIO(-file => "../$myInput");
my $count1 = 0;
while (my $seq = $in->next_seq) {
$count1++;
my $out = new Bio::SeqIO(-file => "> SearchQuery/Paralog${count1}.fa", -format=>'fasta');
$out->write_seq($seq);
}
}elsif($mode eq "alignment"){
open INPUT, "< ../$myInput";
while(){
if ($_ =~ /^>/){
chomp;
s/>//g;
my $header = $_;
open OUT, "> SearchQuery/${header}.fa";
print OUT ">$_\n";
next;
}
print OUT "$_\n";
close OUT;
}
}
if ($mode eq "fasta"){
die "The query genome \"$genome_query\" does not exist. $!\n"
unless -e $genome_query;
my @buildquery = "formatdb -i $genome_query -p F -o";
unless (-e "${genome_query}.nhr" && -e "${genome_query}.nin" && -e "${genome_query}.nsd" && -e "${genome_query}.nsi" && -e "${genome_query}.nsq"){
system (@buildquery) == 0 or die "Formatdb with query genome failed. $!\n";
}
`cp ../$myInput $dir/.`;
#Blast the query sequences against the query genome
foreach my $count(@numbers){
my @call = "blastall -p tblastn -d $genome_query -i SearchQuery/Paralog${count}.fa -a $core -m 8 -G 11 -E 1 -o SearchQuery/Paralog${count}.blastout";
system (@call) == 0 or die "blastall against query genome failed. $!\n";
};
print "Blast of query sequences against query genome done:\n";
#Extract the corresponding locus from the blast output
foreach my $count(@numbers){
my $locus = `sort -k11,11g SearchQuery/Paralog${count}.blastout| awk '{if (\$3>90) print \$2}'| head -n1`;
chomp $locus;
print "Paralog ${count} is located on locus $locus.\n";
$loci[$count] = $locus;
}
BLOCK: foreach my $count(@numbers){
my $strand_last ="";
my $first = `grep -P '$loci[$count]\t' SearchQuery/Paralog${count}.blastout|sort -n -k 7,8| grep 'e-'|head -n 1`;
my @first = split "\t", $first;
if ($first[8]<$first[9]){
$strand = "+";
}elsif ($first[8] > $first[9]){
$strand = "-";
};
my $last = `grep -P '$loci[$count]\t' SearchQuery/Paralog${count}.blastout|sort -n -k 7,8| grep 'e-'|tail -n 1`;
my @last = split "\t", $last;
if ($last[8] < $last[9]){
$strand_last = "+";
}elsif($last[8] > $last[9]){
$strand_last = "-";
}
if ($strand ne $strand_last){
print "WARNING: The paralog $count has high scoring blast/hmm-hits on different strands in the query genome. Check this region with the help of the SearchQuery/Paralog${count}.blastout. This might be a second paralog on the same contig or an assembly mistake. In the first case, split this contig in two parts so that each paralog is situated on a differently named contig and restart the pipeline. In the second case consider taking a different genome version/assembly.\n";
}elsif ($strand eq $strand_last){
if ($strand eq "+"){
$start = $first[8] - $distance_first;
$end = $last[9]+ $distance_last;
print "fastacmd -d $genome_query -s $loci[$count] -L $start,$end > SearchQuery/Paralog${count}_locus.fa\n";
`fastacmd -d $genome_query -s $loci[$count] -L $start,$end > SearchQuery/Paralog${count}_locus.fa`;
}elsif($strand eq "-"){
$end = $first[9] + $distance_first;
$start = $last[8] - $distance_last;
print "fastacmd -d $genome_query -S 2 -s $loci[$count] -L $start,$end > SearchQuery/Paralog${count}_locus.fa\n";
`fastacmd -d $genome_query -S 2 -s $loci[$count] -L $start,$end > SearchQuery/Paralog${count}_locus.fa`;
};
if ($exonerate eq "no"){
if (-e "error"){
`rm error`;
};
my @prosplign = "prosplign -nfa SearchQuery/Paralog${count}_locus.fa -pfa SearchQuery/Paralog${count}.fa -full -out SearchQuery/Paralog${count}.out -inf SearchQuery/Paralog${count}.inf 2> error";
system (@prosplign);
print "@prosplign\n";
`rm SearchQuery/Paralog${count}_locus.fa`;
open (ERROR, "< error");
while (){
if ($_ =~ /std::bad_alloc/){
$exonerate = "yes";
}
}
if ($exonerate eq "yes"){
`rm SearchQuery/Paralog${count}.inf`;
`rm SearchQuery/Paralog${count}.out`;
goto BLOCK;
}
}elsif ($exonerate eq "yes"){
my @exonerate = "exonerate --model protein2genome --showalignment no --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff yes -q SearchQuery/Paralog${count}.fa -t SearchQuery/Paralog${count}_locus.fa > SearchQuery/Paralog${count}.gff";
`exonerate --model protein2genome --showalignment yes --useaatla false --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff no -q SearchQuery/Paralog${count}.fa -t SearchQuery/Paralog${count}_locus.fa| grep -A1 \'|\' | grep -v \'|\'| grep -v \'^\\-\\-\'| sed \'s/\\s//g\'| grep -v \'Model\'|sed \'s/{[A-Z]\*//g\'| sed \'s/}//g\'| sed \':a;N;\$!ba;s/\\n//g\' | sed \'s/[+|-]\\{2,4\\}/\\n/g\'| sed \'s/#//g\'|sed \'s/+//\'|sed \'s/-//\' > SearchQuery/Paralog${count}.aln`;
print "@exonerate\n";
system (@exonerate) == 0 or die "exonerate against query genome failed. $!\n";
`grep -v '#' SearchQuery/Paralog${count}.gff| grep -v 'similarity'| grep -v '\\-\\-'| grep -v 'Hostname'| grep -P '\texon\t'| sort -g -k 4,5 > SearchQuery/Paralog${count}_sorted.gff`;
`rm SearchQuery/Paralog${count}_locus.fa`;
}
}
}
#Retrieve the intron-exon structure for each paralog ($count), the accumulated TCE-length are stored in single arrays within a hash ($hash{$count}) (one array per paralog). This is done by the sub get_exons.
#Lengths are extracted from the .inf-file if prosplign is used and summed up
#They can be used as start and stopp to extract the subsequence of the original amino acid sequence
{
my %hash;
foreach my $count(@numbers){
if ($exonerate eq "no"){
open (INF, "< SearchQuery/Paralog${count}.inf");
$hash{$count} = [get_exons($count)];
close INF;
}elsif ($exonerate eq "yes"){
open (GFF, "< SearchQuery/Paralog${count}_sorted.gff");
$hash{$count} = [get_exons($count)];
close GFF;
}
}
sub get_exons {
my $length_total = 0;
my $length = 0;
my @array;
push @array, 0;
if ($exonerate eq "no"){
while(){
chomp;
if ($_ =~ m/id:/g){
my @coordinates = split "\t", $_;
if ($coordinates[2] - $coordinates[3] > 0){
my $length = $coordinates[2] - $coordinates[3] + 1;
$length_total = $length + $length_total;
push @array, $length_total;
}elsif ($coordinates[3] - $coordinates[2] > 0){
my $length = $coordinates[3] - $coordinates[2] + 1;
$length_total = $length + $length_total;
push @array, $length_total;
}
}
}
}elsif ($exonerate eq "yes"){
while(){
chomp;
if ($_ =~ m/\texon\t/g){
my @coordinates = split "\t", $_;
if ($coordinates[3] - $coordinates[4] > 0){
my $length = $coordinates[3] - $coordinates[4] + 1;
$length_total = $length + $length_total;
push @array, $length_total;
}elsif ($coordinates[4] - $coordinates[3] > 0){
my $length = $coordinates[4] - $coordinates[3] + 1;
$length_total = $length + $length_total;
push @array, $length_total;
}
}
}
}
return (@array);
}
#print Dumper (\%hash);
open (OUT, "> HomologousExons/paralog_exon_alignment.fa");
if ($exonerate eq "no"){
open (OUT2, "> HomologousExons/paralogs.fa");
foreach my $count(sort{$a <=> $b}keys %hash){
my $result = `grep -A1 "[0-9]" SearchQuery/Paralog${count}.out| grep -v "[0-9]"| sed 's/ //g'| sed 's/-//g'| tr -d '\n'`;
$_ = $result;
s/([a-z])[a-z][a-z]/$1/g;
$result = $_;
print OUT2 ">Paralog${count}\n$result\n";
for (my $i=0; $i < @{$hash{$count}}; $i++) {
#print "$count\n";
#print "@{$hash{$count}}\n";
my $string = Bio::Seq->new(-seq => $result, -alphabet => 'protein');
my $start = int(($hash{$count}[$i] / 3) + 1);
my $stopp;
if (!defined($hash{$count}[$i+1])){
$stopp = $string->length;
}else{
$stopp = int(($hash{$count}[$i+1] / 3))
};
if ($start > $stopp){
next;
}else{
#print "$i, $start, $stopp\n";
my $substr = $string->subseq($start,$stopp);
$_ = $substr;
s/[a-z]//g;
print OUT ">Paralog${count}_exon$i\n$_\n";
}
}
};
}elsif ($exonerate eq "yes"){
foreach my $count(sort{$a <=> $b}keys %hash){
open (OUT2, "> HomologousExons/paralogs.fa");
my $result = `sed ':a;N;\$!ba;s/\\n//g' SearchQuery/Paralog${count}.aln`;
print OUT2 ">Paralog${count}\n$result\n";
open (PROT, "< SearchQuery/Paralog${count}.aln");
my $i =-1;
while (){
chomp;
if ($_ ne ""){
$i++;
print OUT ">Paralog${count}_exon$i\n";
print OUT "$_\n";
};
};
};
};
};
close OUT;
#Pairwise scores are calculated for every TCE pair using Clustalw
`clustalw -INFILE=HomologousExons/paralog_exon_alignment.fa > HomologousExons/pairwisescores.clw`;
my $switch = 0;
my @scores;
my @array;
my $value;
my @new_array;
my %hash;
my $x;
my $y;
my @length;
my $length;
my $length2;
my @length2;
my $switch3 = 0;
my @id;
my %length;
my %scores;
my @otherscores;
#Parse the clustalw-input file
#Store the sequence-id as hash key and the aa-length[0] and full sequence[1] name as array in the hash %length
open (IN, "< HomologousExons/pairwisescores.clw");
while (my $line = ){
chomp $line;
if ($line =~ /Sequence\s(\d)+:/){
my @length = split (/\s+/,$line);
$_ = $length[1];
s/://g;
$length[1] = $_;
#print "$length[1]\t$length[2]\t$length[3]\n";
$length{$length[1]} = [int($length[3]), $length[2]];
};
if ($line =~ /Aligned\.\sScore/){
@scores = split ":", $line;
$_ = $scores[0];
s/Sequences\s\(//g;
$scores[0] = $_;
$_ = $scores[1];
s/\)\sAligned.\sScore//g;
$scores[1] = $_;
$_ = $scores[2];
s/(\s)+(\d+)/$2/g;
$scores[2] =$_;
foreach my $num(@numbers){
if ($length{$scores[0]}[1] =~ /Paralog$num/ && $length{$scores[1]}[1] =~ /Paralog$num/){
push @{$scores{$num}}, [$scores[0], $scores[1], $scores[2]];
};
push @otherscores, [$scores[0], $scores[1], $scores[2]];
}
}
}
close IN;
#print "%length:\n";
#print Dumper (\%length);
my @maximum;
for my $seq(keys %length){
push @maximum, $length{$seq}[0];
}
my @sortedmax = sort {$a <=> $b} @maximum;
my $maxLength = $sortedmax[-1];
unless (defined $length_cutoff){
$length_cutoff = ($maxLength / 10);
print "Length-cutoff: $length_cutoff\n";
};
#print Dumper (\%scores);
#print Dumper (\@otherscores);
#print "@array:\n";
#print Dumper (\@array);
#Store the first id as a key in the hash %zscores, in this hash, store the second id as key with the score as value
my %zscores;
for my $paralog(keys %scores){
foreach my $element(0..$#{$scores{$paralog}}){
foreach my $number1($scores{$paralog}[$element][0]){
foreach my $number2($scores{$paralog}[$element][1]){
foreach my $all(0..$#{$scores{$paralog}}){
if ($scores{$paralog}[$all][0] == $number1){
$zscores{$paralog}{$number1}{$scores{$paralog}[$all][1]}=[$scores{$paralog}[$all][2],];
};
if ($scores{$paralog}[$all][1] == $number1){
$zscores{$paralog}{$number1}{$scores{$paralog}[$all][0]}=[$scores{$paralog}[$all][2],];
}
if ($scores{$paralog}[$all][0] == $number2){
$zscores{$paralog}{$number2}{$scores{$paralog}[$all][1]}=[$scores{$paralog}[$all][2],];
};
if ($scores{$paralog}[$all][1] == $number2){
$zscores{$paralog}{$number2}{$scores{$paralog}[$all][0]}=[$scores{$paralog}[$all][2],];
}
}
}
}
}
};
my %otherzscores;
foreach my $element(0..$#otherscores){
foreach my $number($otherscores[$element][0]){
foreach my $all(0..$#otherscores){
if ($otherscores[$all][0] == $number){
$otherzscores{$number}{$otherscores[$all][1]}=[$otherscores[$all][2],];
};
if ($otherscores[$all][1] == $number){
$otherzscores{$number}{$otherscores[$all][0]}=[$otherscores[$all][2],];
}
}
}
}
#Parse %zscores
#Store the first id as a key of the hash %statistics and calculate mean and standard deviation as values of %statistics
my %statistics;
my @average;
my @number;
for my $para(keys %zscores){
for my $number(keys %{$zscores{$para}}){
#print "Paralog $para: $number\n";
for my $pair(keys %{$zscores{$para}{$number}}){
#print "$pair: $zscores{$para}{$number}{$pair}[0]\n";
push @average, $zscores{$para}{$number}{$pair}[0];
#print "$number:$pair $zscores{$number}{$pair}[0]\n";
};
$statistics{$number}=[average(@average),stdev(@average)];
}
}
#print Dumper (\%statistics);
#print Dumper (\%zscores);
#Calculate the zscores of every id-pair with the mean and average parsed from %statistics, add as value to %otherzscores
my @array2;
my @array3;
for my $seq1(keys %otherzscores){
#print "number: $seq1\n";
for my $seq2(keys %{$otherzscores{$seq1}}){
#print "$otherzscores{$seq1}{$seq2}[0]\t$statistics{$seq1}[0]\t$statistics{$seq1}[1]\n";
$otherzscores{$seq1}{$seq2}[1]= (($otherzscores{$seq1}{$seq2}[0] - $statistics{$seq1}[0])/$statistics{$seq1}[1]);
if ($otherzscores{$seq1}{$seq2}[1] > $z_cutoff){
#print "$number\t$score\t$zscores{$number}{$score}[0]\n";
if ($length{$seq1}[0] > $length_cutoff && $length{$seq2}[0] > $length_cutoff){
push @array2, [$seq1, $seq2, $otherzscores{$seq1}{$seq2}[0]];
}
}
}
}
#print "%zscores:\n";
#print Dumper (\%otherzscores);
#print Dumper (\@array2);
#Filter for only the pairs that are reciprocally included in the set (from both z-score filterings)
for my $i (0..$#array2){
for my $j (0..$#array2){
if ($array2[$i][0] == $array2[$j][1] && $array2[$i][1] == $array2[$j][0]){
push @array3, [$array2[$i][0], $array2[$i][1], $array2[$i][2]];
}
}
}
#print Dumper (\@array3);
my @sorted = sort{$a->[0] <=> $b->[0]}@array3;
#print Dumper (\@sorted);
#Assign the homologous TCEs into groups (hash in hash with groups as keys in %hash, in the groups are the ids keys with occurences as values)
#Groups have letters as names
my $switch2 = 0;
for $x (0..$#sorted){
$switch2 = 0;
foreach my $exon(keys %hash){
if (exists $hash{$exon}{$sorted[$x][0]}){
$switch2 = 1;
$hash{$exon}{$sorted[$x][0]}++;
$hash{$exon}{$sorted[$x][1]}++;
}elsif (exists $hash{$exon}{$sorted[$x][1]}){
$switch2 = 1;
$hash{$exon}{$sorted[$x][0]}++;
$hash{$exon}{$sorted[$x][1]}++;
}elsif(!exists $hash{$exon}{$sorted[$x][0]} && !exists $hash{$exon}{$sorted[$x][1]}){
next;
};
};
if ($switch2 == 0){
#print "$switch2\n";
$hash{$e}{$sorted[$x][0]}++;
$hash{$e}{$sorted[$x][1]}++;
$e++;
};
};
my $string;
my %newhash;
for my $exon(sort keys %hash){
#print "$exon\n";
$string = "";
for my $seq (sort {$a <=> $b} keys %{$hash{$exon}}){
$string .= ":".$seq;
}
unless (exists $newhash{$string}){
$newhash{$string} = $exon;
}
}
my %finalhash;
my $counter = 1;
for my $seq (sort {$newhash{$a} <=> $newhash{$b}} keys %newhash){
$finalhash{$counter} = $seq;
$counter ++;
}
#print Dumper (\%newhash);
#print Dumper (\%finalhash);
open (EXONS, "< HomologousExons/paralog_exon_alignment.fa");
open (OUT, "> HomologousExons/paralog_exon_alignment_final.fa");
#print "%hash:\n";
#print Dumper (\%hash);
my @exons;
for my $groups(sort keys %finalhash){
#print "$groups\n";
push @exons, $groups;
my @sequences = split ":", $finalhash{$groups};
if ($#sequences-1 > $paralog_num){
print "WARNING: Number of paralogs assigned to one homologous TCE group is greater than number of paralogs. Check assignment of homologous TCEs. Increase length_cutoff (-l).\n";
};
foreach my $i (1..$#sequences){
#print "$sequences[1]\t$sequences[2]\n";
while(){
chomp;
#print "$_\n";
#print "$length{$id}[1]\n";
if ($switch3 == 1){
print OUT "$_\n";
$switch3 = 0;
last;
};
if ($_ =~ /^>$length{$sequences[$i]}[1]$/){
#print "$length{$id}[1]\n";
s/_exon\d+/_exon$groups/g;
print OUT "$_\n";
$switch3=1;
}else{
$switch3=0;
}
}
seek EXONS, 0, 0;
}
};
close OUT;
close EXONS;
@sorted_exons = sort{$a <=> $b}@exons;
#print "@sorted_exons\n";
sub average{
my( $mean ); # the mean of the values, calculated by this subroutine
my( $n ); # the number of values
my( $sum ); # the sum of the values
my( $x ); # set to each value
$n = 0;
$sum = 0;
foreach $x ( @_ ){
$sum += $x;
$n++;
}
$mean = $sum / $n;
return $mean;
}
sub stdev{
my( $n ); # the number of values
my( $stddev ); # the standard deviation, calculated by this
# subroutine, from the variance
my( $sum ); # the sum of the values
my( $sumOfSquares ); # the sum of the squares of the values
my( $variance ); # the variance, calculated by this subroutine
my( $x ); # set to each value
$n = 0;
$sum = 0;
$sumOfSquares = 0;
foreach $x ( @_ ){
$sum += $x;
$n++;
$sumOfSquares += $x * $x;
}
$variance = ( $sumOfSquares - ( ( $sum * $sum ) / $n ) ) / ( $n - 1 );
$stddev = sqrt( $variance );
return $stddev;
}
print "Homologous TCEs between paralogs identified.\n";
}elsif ($mode eq "user"|| $mode eq "alignment"){
`cp ../$myInput2 $dir/HomologousExons/paralog_exon_alignment_final.fa`;
open (IN, "< HomologousExons/paralog_exon_alignment_final.fa");
while (){
if ($_ =~ /Paralog(\d)_exon(\d+)/){
push @exons, $2;
};
};
my @sort_exons = sort{$a <=> $b}@exons;
#print "@sort_exons\n";
@sorted_exons = uniq (@sort_exons);
}
if ($mode eq "alignment"){
# translate the genomic sequence into 6 ORFs, stored in ${space}${myOutFolder}_ORF.aas (amino-acid sequence) with getorf
chdir "$space" or die "Couldn't access $space\n";
system "awk '{print \$1}' $genome_target > tmp1";
unless (-e "${space}${myOutFolder}_ORF.aas" && -e $genome_target) {
my $ret = system "getorf -minsize 9 -reverse Y -stdout Y -sequence tmp1 -outseq stdout|sed 's/ - /-/g'| awk '{print \$1, \$3, \$2}' > tmp2";
if ($ret != 0){
die;
};
open (TMP, "< tmp2") or die "Couldn't open tmp2, $!";
open (DEST, ">> ${space}${myOutFolder}_ORF.aas") or die "Couldn't open ${space}${myOutFolder}_ORF.aas, $!";
while (){
chomp;
s/\(//;
s/_(\d+)\s\s\[/ $1\(FORWARD\)\[/;
s/\sREVERSE/\(REVERSE\)/;
s/_(\d+)\(REVERSE\) / $1\(REVERSE\)/;
print DEST "$_\n";
}
close TMP;
close DEST;
};
chdir $dir;
mkdir "hmmModels/";
my @models = glob "${models}*.stk";
foreach my $alignments(@models){
$alignments =~ s/${models}//g;
$alignments =~ s/\.stk//g;
`hmmbuild --amino ${dir}/hmmModels/${alignments}.model ${models}${alignments}.stk`;
}
}
if ($mode eq "fasta" || $mode eq "user"){
#Run blast with the single TCEs
#Reformat the blast-output to fit the ExonMatchSolver's needs, Blast-Hits outputted to Target_vs_query.blastout
mkdir "SearchTarget/";
open (HITS, "> ExonMatchSolver1.input");
my @call = "blastall -p tblastn -d $genome_target -i HomologousExons/paralog_exon_alignment_final.fa -a $core -m8 -G 11 -E 1| awk -v OFS='\t' '{print \$2, \$9\"_\"\$10, \$1, \$11, \$12, \$3}' > SearchTarget/Target_vs_query.blastout";
system (@call) == 0 or die "blastall against target genome failed. $!\n";
open (IN, "< SearchTarget/Target_vs_query.blastout") or die "Couldn't open SearchTarget/Target_vs_query.blastout, $!";
print HITS "#TargetName\tStrandinformation\tModel\tE-Value\tBitscore\tBias\tParalogID\tExonID\n";
while (){
chomp;
if ($_ =~ /Paralog(\d+)_exon(\d+)/g){
$paralog = $1;
$exon = $2;
}
if ($WGD eq "no"){
print HITS "$_\t${paralog}\t$exon\n";
}elsif ($WGD eq "yes"){
print HITS "$_\t${paralog}a\t$exon\n";
unless (exists $NonDuplicated{$paralog}){
print HITS "$_\t${paralog}b\t$exon\n";
}
}
}
close HITS;
close IN;
print "Blast of query sequences against target genome done.\n";
}elsif ($mode eq "alignment"){
mkdir "SearchTarget/";
my @files = glob "hmmModels/*.model";
foreach my $file(@files){
$_ = $file;
s/hmmModels\///;
s/\.model//;
$file = $_;
print "$file\n";
system "hmmsearch --cpu $core --tblout SearchTarget/${file}_tableout -o SearchTarget/${file}_out hmmModels/${file}.model ${space}${myOutFolder}_ORF.aas >> SearchTarget/Target_vs_query.hmmsearchout";
};
print "Hmmsearch of query models against target genome done\n";
## cat all output hits to one file all_exons_out_$spec, format this file to be a list with the hits only, substitute with spaces and - with tabs, add columns with paralogID and exonID, save a copy in the file data in ExonMatchSolver/
system "cat SearchTarget/*_tableout|grep -v '#'|awk -v OFS='\t' '{print \$1, \$19, \$3, \$5, \$6, \$7}' > SearchTarget/hmmsearch_tableout";
system ("cat SearchTarget/*_out > SearchTarget/hmmsearch.out");
open (HITS, "> tmp") or die "Couldn't open tmp, $!";
open (IN, "< SearchTarget/hmmsearch_tableout") or die "Couldn't open SearchTarget/hmmsearch_tableout, $!";
while (){
chomp;
if ($_ =~ /Paralog(\d+)_exon(\d+)/g){
$paralog = $1;
$exon = $2;
#push @exons, $exon;
}
if ($WGD eq "no"){
print HITS "$_\t${paralog}\t$exon\n";
}elsif ($WGD eq "yes"){
print HITS "$_\t${paralog}a\t$exon\n";
unless (exists $NonDuplicated{$paralog}){
print HITS "$_\t${paralog}b\t$exon\n";
}
}
}
open (INPUT, "> ExonMatchSolver1.input") or die "Couldn't open ExonMatchSolver1.input, $!";
print INPUT "#TargetName\tStrandinformation\tModel\tE-Value\tBitscore\tBias\tParalogID\tExonID\n";
system "sort -r tmp| uniq >> ExonMatchSolver1.input";
system "sleep 3";
}
#Run ExonMatchSolver
#Invoke ExonMatchSolver a first time with parameters given, print output and invokation options to ExonMatchSolver.out
{
chdir "$Bin/" or die "Couldn't access $Bin/, $!";
my @exonSolver = "java -jar ExonMatchSolver.jar $dir/ExonMatchSolver1.input $intervall $max_solution >> $dir/ExonMatchSolver1.out";
if (-e "$dir/ExonMatchSolver1.out"){
unlink "$dir/ExonMatchSolver1.out";
};
open (SOLUTION, ">> $dir/ExonMatchSolver1.out") or die "Can't open ExonMatchSolver1.out, $!";
print SOLUTION "ExonMatchSolver invoked with @exonSolver\n";
system (@exonSolver) == 0 or die "@exonSolver failed: $!";
}
print "ExonMatchSolver run for the first time.\n";
chdir $dir;
#Read first solution from ExonMatchSolver; this delivers a list of unscored TCEs (score is too low to be found with standard blast options), read this list only, put contig names as keys into a hash and unscored paralog-TCE names into an array/ reference this array
open (SOLUTION, "< ExonMatchSolver1.out") or die "Can't open ExonMatchSolver1.out, $!";
while (){
chomp;
if ($_ =~ /\#Solution/){
last;
}else{
unless ($_ =~ /\bjava\b/ or $_ =~ /\#Unscored/ or $_ =~ /^$/){
my @exon_order = split "\t", $_;
$missing_scores{$exon_order[0]} = [@exon_order[1..$#exon_order]];
}
}
}
close SOLUTION;
if ($mode eq "user" || $mode eq "fasta"){
mkdir "TargetSequences/";
#Extract genomic sequences of the contigs, stored as keys in %missing_scores
foreach my $keys (keys %missing_scores){
`fastacmd -d $genome_target -s $keys| sed \'s/>lcl|/>/g\' > TargetSequences/${keys}.fa`;
}
#Extract the single exon sequences in single files that are unscored until now
#Run blast a second time to find the missing scores
if (-e "SearchTarget/Target_vs_query_missing_unsorted.blastout"){
unlink "SearchTarget/Target_vs_query_missing_unsorted.blastout";
};
foreach my $contig (keys %missing_scores){
my @exons_1 = @{$missing_scores{$contig}};
foreach my $element (@exons_1){
@numle = split "-", $element;
$numle[0] =~ s/(\d)\w/$1/;
`grep -A1 \'>Paralog${numle[0]}_exon${numle[1]}\\s\*\$\' HomologousExons/paralog_exon_alignment_final.fa > SingleExons/Paralog${numle[0]}_exon${numle[1]}.fa`;
if (-s "SingleExons/Paralog${numle[0]}_exon${numle[1]}.fa"){
`bl2seq -p tblastn -i SingleExons/Paralog${numle[0]}_exon${numle[1]}.fa -j TargetSequences/${contig}.fa -G 11 -E 1 -D 1 -e 100| grep -v \'#\'|awk -v OFS='\t' '{print \$2, \$9"_"\$10, \$1, \$11, \$12, \$3}' >> SearchTarget/Target_vs_query_missing_unsorted.blastout`;
}else{unlink "SingleExons/Paralog${numle[0]}_exon${numle[1]}.fa"
}
};
};
`sort -k1,2 SearchTarget/Target_vs_query_missing_unsorted.blastout |uniq > SearchTarget/Target_vs_query_missing.blastout`;
open (HITSTOTAL, "> ExonMatchSolver_missing.input");
open (TOTAL, "< SearchTarget/Target_vs_query_missing.blastout") or die "Couldn't open SearchTarget/Target_vs_query_total.blastout, $!";
}elsif ($mode eq "alignment"){
{
# the translated sequences (fasta) are stored in a file so that one sequence including the header is printed on one line for further processing
unless (-e "${space}${myOutFolder}_ORF_grep"){
my $seq = "";
open (GENOME, "< ${space}${myOutFolder}_ORF.aas") or die "Couldn't open ${space}${myOutFolder}_ORF.aas, $!";
open (AAS, "> ${space}${myOutFolder}_ORF_grep") or die "Couldn't open ${space}${myOutFolder}_ORF_grep, $!";
while (){
chomp;
if (/^>/){
print AAS "$seq\n";
print AAS "$_ ";
$seq = ""; next;
}
chomp;
$seq .= $_;
}
print AAS "$seq\n";
}
};
##Extract AAS sequences of the contigs, stored as keys in %missing_scores
my %FILE;
mkdir "TargetSequences/";
foreach my $keys (keys %missing_scores){
open ($FILE{$keys}, ">> TargetSequences/${keys}.aas") or die;
};
open (AAS, "< ${space}${myOutFolder}_ORF_grep") or die "Couldn't open ${space}${myOutFolder}_ORF_grep, $!";
while (){
chomp;
if ($_ =~ /^>(\S+)\s/){
if (exists $missing_scores{$1}){
#print "\$1 starts: $1\n";
my $name = $1;
s/]\s/]\n/g;
print {$FILE{$name}} "$_\n";
}
}
}
foreach my $keys (keys %missing_scores){
close ($FILE{$keys})
}
if (-e "SearchTarget/Target_vs_query_missing.hmmout"){
unlink "SearchTarget/Target_vs_query_missing.hmmout";
};
foreach my $contig (keys %missing_scores){
my @exons_1 = @{$missing_scores{$contig}};
foreach my $element (@exons_1){
@numle = split "-", $element;
$numle[0] =~ s/(\d)\w/$1/;
system "${hmmer_path}hmmsearch --cpu $core -E 1 --domE 1 --tblout SearchTarget/Paralog${numle[0]}_exon${numle[1]}_${contig}_tableout hmmModels/Paralog${numle[0]}_exon${numle[1]}.model TargetSequences/${contig}.aas >> SearchTarget/Target_vs_query_missing.hmmout";
}
}
system "cat SearchTarget/*_*_*_tableout|grep -v '#'|awk -v OFS='\t' '{print \$1, \$19, \$3, \$5, \$6, \$7}'| cat - SearchTarget/hmmsearch_tableout > SearchTarget/hmmsearch_tableout_missingscores";
open (HITSTOTAL, "> ExonMatchSolver_missing.input") or die "Couldn't open ExonMatchSolver_missing.input, $!";
open (TOTAL, "< SearchTarget/hmmsearch_tableout_missingscores") or die "Couldn't open SearchTarget/hmmsearch_tableout_missingscores, $!";
};
while (){
chomp;
if ($_ =~ /Paralog(\d+)_exon(\d+)/g){
$paralog = $1;
$exon = $2;
};
if ($WGD eq "no"){
print HITSTOTAL "$_\t${paralog}\t$exon\n";
}elsif ($WGD eq "yes"){
if (!exists $NonDuplicated{$paralog}){
foreach my $letter(@duplicates){
print HITSTOTAL "$_\t${paralog}${letter}\t$exon\n";
}
}else{
print HITSTOTAL "$_\t${paralog}a\t$exon\n";
}
}
}
close HITSTOTAL;
close TOTAL;
if ($mode eq "alignment"){
open (INPUT, "> ExonMatchSolver2.input") or die "Couldn't open , ExonMatchSolver2.input $!";
print INPUT "#TargetName\tStrandinformation\tModel\tE-Value\tBitscore\tBias\tParalogID\tExonID\n";
system "sort -r ExonMatchSolver_missing.input |uniq >> ExonMatchSolver2.input";
}elsif($mode eq "user"|| $mode eq "fasta"){
`cat ExonMatchSolver1.input ExonMatchSolver_missing.input > ExonMatchSolver2.input`;
}
# invoke ExonMatchSolver a second time with parameters given, print output and invokation options in ExonMatchSolver2.out
{
chdir "$Bin/" or die "Couldn't access dir $Bin, $!";
my @exonSolver = "java -jar ExonMatchSolver.jar $dir/ExonMatchSolver2.input $intervall $max_solution >> $dir/ExonMatchSolver2.out";
if (-e "$dir/ExonMatchSolver2.out"){
unlink "$dir/ExonMatchSolver2.out"
};
open (SOLUTIONTOTAL, ">> $dir/ExonMatchSolver2.out") or die "Can't open ExonMatchSolver2.out, $!";
print SOLUTIONTOTAL "ExonMatchSolver invoked with @exonSolver\n";
system (@exonSolver) == 0 or die "system @exonSolver failed: $!";
}
print "ExonMatchSolver run for the second time.\n";
chdir $dir;
my @lost_exons;
my %complete_contigs;
# filter output from ExonmatchSolver for contigs that might have the whole paralog annotated in solution 1 (first filter for contigs with more than one exon, sort alphabetically, filter for contigs that have TCEs 1, 2, 3, 4 or 5 and at the same time one of the last three TCEs, names of these contigs are given in @complete_contigs)
#print "@sorted_exons\n";
open (SOLUTIONTOTAL, "< ExonMatchSolver2.out") or die "Can't open ExonMatchSolver2.out, $!";
while (){
chomp;
if ($_ =~ /\#Solution 2/){
last;
} elsif ($_ =~ /\s\d+\s$/){
my @exon_order = split "\t", $_;
if (@exon_order > 3) {
my @sorted_array = (@exon_order[0,1], sort {$a <=> $b} @exon_order[2..$#exon_order]);
if ($sorted_array[2] eq $sorted_exons[0] || $sorted_array[2] eq $sorted_exons[1] || $sorted_array[2] eq $sorted_exons[2]|| $sorted_array[2] eq $sorted_exons[3]|| $sorted_array[2] eq $sorted_exons[4]){
if ($sorted_array[-1] eq $sorted_exons[-3] || $sorted_array[-1] eq $sorted_exons[-2] || $sorted_array[-1] eq $sorted_exons[-1]){
$complete_contigs{$sorted_array[0]} = [$sorted_array[1], $sorted_array[2]];
foreach my $letter(@sorted_exons){
my $switch = 0;
foreach $exon(@sorted_array){
if ($letter eq $exon){
$switch = 1;
}
}
if ($switch == 0){
push @lost_exons, [$sorted_array[0],$sorted_array[1],$letter];
}
}
}
}
}
}
}
close SOLUTIONTOTAL;
print "Genomic fragments for which annotation of TCEs is (nearly) complete for one paralog are:\nGenomic fragment => [Paralog, first TCE detected]\n";
#Extract missing contigs
foreach my $contig(keys %complete_contigs){
unless (-s "TargetSequences/${contig}.fa"){
`fastacmd -d $genome_target -s $contig| sed \'s/>lcl|/>/g\' > TargetSequences/${contig}.fa`;
}
}
print Dumper (\%complete_contigs);
if ($WGD eq "yes"){
foreach my $paralog(@numbers){
if (!exists $NonDuplicated{$paralog}){
foreach my $duplicates (@duplicates){
`cp SearchQuery/Paralog${paralog}.fa SearchQuery/Paralog${paralog}${duplicates}.fa`;
}
`rm SearchQuery/Paralog${paralog}.fa`;
}else{
`cp SearchQuery/Paralog${paralog}.fa SearchQuery/Paralog${paralog}a.fa`;
`rm SearchQuery/Paralog${paralog}.fa`;
}
}
}
#Run exonerate/prosplign-annotation for all paralogs in target species for which sequence should be complete
if ($exonerate eq "no"){
print "Prosplign-annotation of nearly complete hits starts.\n";
}elsif($exonerate eq "yes"){
print "Exonerate-annotation of nearly complete hits starts.\n";
}
foreach my $contig(keys %complete_contigs){
if ($mode eq "fasta"|| $mode eq "user"){
my $strand_last ="";
my $first = `grep -P '$contig\t' ExonMatchSolver2.input|grep 'e-'|sort -nk 8,8| head -n 1| awk '{print \$2}'`;
my @first = split "_", $first;
if ($first[0]<$first[1]){
$strand = "+";
}elsif ($first[0] > $first[1]){
$strand = "-";
};
my $last = `grep -P '$contig\t' ExonMatchSolver2.input|grep 'e-'|sort -nk 8,8| tail -n 1| awk '{print \$2}'`;
my @last = split "_", $last;
if ($last[0] < $last[1]){
$strand_last = "+";
}elsif($last[0] > $last[1]){
$strand_last = "-";
}
if ($strand ne $strand_last){
print "WARNING: The paralog on contig $contig has high scoring blast/hmm-hits on different strands in the target genome. Check this region with the help of the ExonMatchSolver2.input. This might be a second paralog on the same contig or an assembly mistake. In the first case, split this contig in two parts so that each paralog is situated on a differently named contig and restart the pipeline. In the second case consider taking a different genome version/assembly.\n";
}
if ($strand eq "+"){
${complete_contigs{$contig}[2]}= $first[0] - $distance_first;
${complete_contigs{$contig}[3]} = $last[1]+ $distance_last;
}elsif($strand eq "-"){
${complete_contigs{$contig}[3]} = $first[1] + $distance_first;
${complete_contigs{$contig}[2]} = $last[0] - $distance_last;
};
}elsif($mode eq "alignment"){
my $strand_last ="";
my $length = qx(grep -v \'>\' TargetSequences/${contig}.fa| sed ':a;N;\$!ba;s/\\n//g'| wc -m);
my $first = `grep -P '$contig\t' ExonMatchSolver2.input|grep 'e-'|sort -nk 8,8| head -n 1| awk '{print \$2}'`;
my @first = split "-", $first;
$first[0] =~ s/^.+\[//g;
$first[1] =~ s/]//g;
if ($first[0]<$first[1]){
$strand = "+";
}elsif ($first[0] > $first[1]){
$strand = "-";
};
my $last = `grep -P '$contig\t' ExonMatchSolver2.input|grep 'e-'|sort -nk 8,8| tail -n 1| awk '{print \$2}'`;
my @last = split "-", $last;
$last[0] =~ s/^.+\[//g;
$last[1] =~ s/]//g;
if ($last[0] < $last[1]){
$strand_last = "+";
}elsif($last[0] > $last[1]){
$strand_last = "-";
}
if ($strand ne $strand_last){
print "WARNING: The paralog on contig $contig has high scoring blast/hmm-hits on different strands in the target genome. Check this region with the help of the ExonMatchSolver2.input. This might be a second paralog on the same contig or an assembly mistake. In the first case, split this contig in two parts so that each paralog is situated on a differently named contig and restart the pipeline. In the second case consider taking a different genome version/assembly.\n";
};
if ($strand eq "+"){
${complete_contigs{$contig}[2]}= $first[0] - $distance_first;
${complete_contigs{$contig}[3]} = $last[1]+ $distance_last;
}elsif($strand eq "-"){
${complete_contigs{$contig}[3]} = $first[1] + $distance_first;
${complete_contigs{$contig}[2]} = $last[0] - $distance_last;
};
};
if ($exonerate eq "no"){
if (-e "error"){
`rm error`;
};
`prosplign -nfa TargetSequences/${contig}.fa -pfa SearchQuery/Paralog${complete_contigs{$contig}[0]}.fa -strand $strand -nuc_from ${complete_contigs{$contig}[2]} -nuc_to ${complete_contigs{$contig}[3]} -full -out SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.out -inf SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.inf 2> error`;
open (ERROR, "< error");
while (){
if ($_ =~ /std::bad_alloc/){
$exonerate = "yes";
}
}
if ($exonerate eq "yes"){
redo;
`rm SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.inf`;
`rm SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.out`;
}else{
print "$contig: prosplign -nfa TargetSequences/${contig}.fa -pfa SearchQuery/Paralog${complete_contigs{$contig}[0]}.fa -strand $strand -nuc_from ${complete_contigs{$contig}[2]} -nuc_to ${complete_contigs{$contig}[3]} -full -out SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.out -inf SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.inf\n";
open INF, "< SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.inf";
while (){
chomp;
if ($_=~ m/id:/g){
my @array = split "\t", $_;
$_ = $array[7];
s/\%//g;
$array[7] = $_;
if ($array[7] <= 80){
#print "WARNING: protein sequence has low identity with locus: $array[7]\n";
}
}
}
}
}elsif($exonerate eq "yes"){
open (OUT, "> Exonerate_help");
print OUT "Paralog${complete_contigs{$contig}[0]}.fa\t$strand\t${complete_contigs{$contig}[2]}\t${complete_contigs{$contig}[3]}";
close OUT;
#my @exonerate = "exonerate --model protein2genome --showalignment no --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff yes --score 60 --annotation Exonerate_help -q SearchQuery/Paralog${complete_contigs{$contig}[0]}.fa -t TargetSequences/${contig}.fa > SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.gff";
my @exonerate = "exonerate --model protein2genome --showalignment yes --useaatla false --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff no --score 60 --annotation Exonerate_help -q SearchQuery/Paralog${complete_contigs{$contig}[0]}.fa -t TargetSequences/${contig}.fa > SearchTarget/Paralog${complete_contigs{$contig}[0]}_target_complete.aln";
print "@exonerate\n";
system (@exonerate) == 0 or die "exonerate against query genome failed. $!\n";
`grep -A1 \'|\' SearchTarget/Paralog${complete_contigs{$contig}[0]}_target_complete.aln| grep -v \'|\'| grep -v \'^\\-\\-\'| sed \'s/\\s//g\'| grep -v \'Model\'|sed \'s/{[A-Z]\*//g\'| sed \'s/}//g\'| sed \':a;N;\$!ba;s/\\n//g\' | sed \'s/[+|-]\\{2,4\\}/\\n/g\'| sed \'s/#//g\'|sed \'s/+//\'|sed \'s/-//\' > SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.aln`;
open (OUT2, "> SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.fa");
my $result = `sed ':a;N;\$!ba;s/\\n//g' SearchTarget/Paralog${complete_contigs{$contig}[0]}_target.aln`;
print OUT2 ">Paralog${complete_contigs{$contig}[0]}_initial\n$result\n";
close OUT2;
};
};
if ($exonerate eq "no"){
# filter prosplign.out file to extract the protein sequences and print on CL
my @prosplign_out = glob "SearchTarget/*_target.out";
foreach my $files(@prosplign_out){
#print "FILES: $files\n";
my $result = `grep -A1 "[0-9]" $files | grep -v "[0-9]"| sed 's/ //g'| sed 's/-//g'| tr -d '\n'`;
$_ = $result;
s/([a-z])[a-z][a-z]/$1/g;
$result = $_;
$_ = $files;
s/\_target\.out//g;
$files = $_;
#print "$files\n";
open (OUTTARGET, "> ${files}_target.fa");
print OUTTARGET ">${files}_initial\n$result\n";
}
}
chdir $dir;
if (-e "SearchTarget/Exonerate_additional_hits.blastout"){
unlink "SearchTarget/Exonerate_additional_hits.blastout";
}
#Identify TCEs that are additionally found by the exonerate annotation
#How? bl2seq of TCEs that are missing in the putative complete annotations against the Exonerate-protein annotation
#Filter via e-value (lower than 0.01) -> Make this more flexible?
print "The following TCEs are missing in the (nearly) complete genomic fragments as proposed by the second run of ExonMatchSolver. Its amino acid sequences are blasted against the Exonerate annotation of the respective paralog on the respective genomic fragments:\n";
foreach my $lines(0..$#lost_exons){
my $full_name = ${lost_exons[$lines][1]};
${lost_exons[$lines][1]} =~ s/(\d)\w/$1/;
`grep -A1 '>Paralog${lost_exons[$lines][1]}_exon${lost_exons[$lines][2]}\\s\*\$' HomologousExons/paralog_exon_alignment_final.fa > SingleExons/Paralog${lost_exons[$lines][1]}_exon${lost_exons[$lines][2]}.fa`;
if (-s "SingleExons/Paralog${lost_exons[$lines][1]}_exon${lost_exons[$lines][2]}.fa"){
print "Paralog $lost_exons[$lines][1]\texon $lost_exons[$lines][2]\n";
my @bl2seq = "bl2seq -p blastp -i SingleExons/Paralog${lost_exons[$lines][1]}_exon${lost_exons[$lines][2]}.fa -j SearchTarget/Paralog${full_name}_target.fa -G 11 -E 1 -D 1 -e 0.01| grep -v '#' >> SearchTarget/Exonerate_additional_hits.blastout";
print "@bl2seq\n";
#system (@bl2seq) == 0 or warn "Bl2seq against Target-Annotation failed. $!\n";
}else{
print "The amino acid sequence of paralog ${lost_exons[$lines][1]}, exon${lost_exons[$lines][2]} does not exist. This TCE might be missing in this paralog (deleted) or it is too derived to be recognized as a homologous exon.\n";
}
}
#Add these Hits to the ExonMatchSolver input, set score to 0 and add the hits for every paralog
if (-s "SearchTarget/Exonerate_additional_hits.blastout"){
open (PRO, "> $dir/SearchTarget/Exonerate_additional_hits_ExonMatchSolver.input") or warn "Couldn't open Exonerate_additional_hits_ExonMatchSolver.input, $!";
open (ADD, "< SearchTarget/Exonerate_additional_hits.blastout") or warn "Couldn't open Exonerate_additional_hits_ExonMatchSolver.blastout, $!";
while (){
foreach my $lines(0..$#lost_exons){
#${lost_exons[$lines][1]} =~ s/(\d)\w/$1/;
if ($_ =~ /Paralog${lost_exons[$lines][1]}_exon${lost_exons[$lines][2]}/){
my $additional_hits = `grep '${lost_exons[$lines][0]}\t' ExonMatchSolver2.input| sort -nk 8,8| head -n 1`;
$_ = $additional_hits;
foreach my $para(@numbers){
if ($WGD eq "no"){
s/\t(\d)+_(\d)+\tParalog(\d)+_exon(\d+)/\t0_0\tParalog${para}_exon${lost_exons[$lines][2]}/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\s\d.\de-\d+\s/\t0\t/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\t(\d)+\t(\d)+$/\t${para}\t${lost_exons[$lines][2]}/g;
print PRO "$_";
}elsif ($WGD eq "yes"){
if (!exists $NonDuplicated{$para}){
foreach my $letter (@duplicates){
s/\t(\d)+_(\d)+\tParalog(\d)+(\w)_exon(\d+)/\t0_0\tParalog${para}${letter}_exon${lost_exons[$lines][2]}/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\s\d.\de-\d+\s/\t0\t/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\t(\d)+\t(\d)+$/\t${para}${letter}\t${lost_exons[$lines][2]}/g;
print PRO "$_";
}
}else{
s/\t(\d)+_(\d)+\tParalog(\d)+(\w)_exon(\d+)/\t0_0\tParalog${para}a_exon${lost_exons[$lines][2]}/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\s\d.\de-\d+\s/\t0\t/g;
s/\s-*\d+\.\d+\s/\t0\t/g;
s/\t(\d)+\t(\d)+$/\t${para}a\t${lost_exons[$lines][2]}/g;
print PRO "$_";
}
}
}
};
};
};
close PRO;
close ADD;
}
#Check if a third run of EMS is necessary
if (-s "$dir/SearchTarget/Exonerate_additional_hits_ExonMatchSolver.input"){
print "Additional TCEs found by Exonerate.\n";
system ("cat ExonMatchSolver2.input SearchTarget/Exonerate_additional_hits_ExonMatchSolver.input|sort -r > ExonMatchSolver3.input") == 0 or warn "Concatination of input from first ExonMatchSolver-run and additional hits did not work.\n";
`rm SearchTarget/Exonerate_additional_hits_ExonMatchSolver.input`;
#invoke ExonMatchSolver a third time with parameters given (hit list contains hits from two blast runs and additional Exonerate hits), print output and invokation options to $solution
chdir "$Bin/" or die "Couldn't access dir $Bin, $!";
my @exonSolver = "java -jar ExonMatchSolver.jar $dir/ExonMatchSolver3.input $intervall $max_solution >> $dir/ExonMatchSolver3.out";
if (-e "$dir/ExonMatchSolver3.out"){
unlink "$dir/ExonMatchSolver3.out";
};
open (SOLUTIONCOMPLETE, ">> $dir/ExonMatchSolver3.out") or die "Can't open $dir/ExonMatchSolver3.out, $!";
print SOLUTIONCOMPLETE "ExonMatchSolver invoked with @exonSolver\n";
system (@exonSolver) == 0 or warn "system @exonSolver failed: $!";
print "ExonMatchSolver run for the third time.\n";
chdir $dir;
}else{
print "No additional TCEs found by Exonerate. ExonMatchSolver is not run a third time.\n";
`cp ExonMatchSolver2.out ExonMatchSolver3.out`;
`cp ExonMatchSolver2.input ExonMatchSolver3.input`;
};
close SOLUTIONCOMPLETE;
#check for two paralogs on the same scaffold/contig/chromosome, search for Hits that were found by the same TCE twice on the same contig and had an e-value < 0, discrimination with e-value is necessary, otherwise all low scoring hits are shown as well which are probably not real TCEs
{
my $check = `cat ExonMatchSolver3.input| sort -g -k4,4| grep \'e-\'| awk \'{print \$1, \$3, \$7}\'| sort -k1,1|uniq -d| sort -n -k1,1| wc -l`;
chomp $check;
if ($check == 0){
print "Check for two paralogs on one unit: negative.\n";
}elsif ($check > 0){
print "WARNING: Check for two paralogs on one unit: positive. Check this before continuing! More then one blast-hit on the same contig with the same TCE were observed for:\n";
system "cat ExonMatchSolver3.input| sort -g -k4,4| grep \'e-\'| awk \'{print \$1, \$3, \$7}\'| sort -k1,1|uniq -d| sort -n -k1,1";
#print "This could also suggest a split exon.\n";
};
};
#check statistics of $solution: How many different optimal solution were obtained?
open (SOLUTION, "< ExonMatchSolver3.out") or die "Can't open ExonMatchSolver3.out, $!";
open (TMP, "> tmp2");
while (){
if ($_ =~ /\s(\d)+\s$/){
chomp;
s/\t(\d)\w\t/\t$1\t/g;
print TMP "$_\n";
}
}
close SOLUTION;
close TMP;
my $validation = `grep -v '#' tmp2 | grep -v '^\$'| grep -v 'ExonMatchSolver'| grep -v '-'|sort| uniq -c| awk '{print \$1}'| uniq| wc -l`;
chomp $validation;
if ($validation == 1){
print "One optimal solution was obtained by the last run of ExonMatchSolver.\n";
}else {
print "WARNING: Several different optimal solutions were obtained by the last run of ExonMatchSolver, check $dir/ExonMatchSolver3.out.\n";
my $attention = `grep -v '#' tmp2 | grep -v '^\$'| grep -v 'ExonMatchSolver'| grep -v '-'|sort| uniq -c`;
print "$attention";
}
#}
#length normalization of single TCEs on contigs: Are they reliable?
{
# filter output from ExonmatchSolver for contigs that have only one TCE matched, extract bitscore for these hits
open (SOLUTION, "< ExonMatchSolver3.out") or die "Can't open $dir/ExonMatchSolver3.out, $!";
while (){
chomp;
if ($_ =~ /\s(\d)+\s$/){
unless ($_ =~ /ExonMatchsolver/ || $_ =~ /-/ || $_ =~ /#/ || $_ =~ /^$/){
my @exon_order = split "\t", $_;
if (@exon_order < 4) {
$single_exons{$exon_order[0]} = [$exon_order[1], $exon_order[2]];
#print "$exon_order[0]\t$exon_order[1]\t$exon_order[2]\n";
}
}
}
}
close SOLUTION;
if (-e "Single_exons"){
unlink "Single_exons";
};
if (-e "tmp"){
unlink "tmp";
};
open (TMP, ">> tmp");
print TMP "Reliable exons, normalized Bitscore > 1.5:\n";
open (RESULT, ">> Single_exons");
print RESULT "Single exons on contigs:\n#Contig\tParalog\tExon\tBitScore\tNormalizedBitScore\n";
foreach my $contig(keys %single_exons){
my @data = @{$single_exons{$contig}};
my $letter = $data[1];
my $para = $data[0];
my $length = ();
my $score = qx(grep -P '$contig\t' $dir/ExonMatchSolver3.input| grep -v '#'| grep 'Paralog${para}_'| sort -n -k 5,5| tail -n 1|awk '{print \$5}');
chomp $score;
if ($mode eq "user"|| $mode eq "fasta"){
$length = qx(grep -A1 'Paralog${para}_exon${letter}\$' $dir/HomologousExons/paralog_exon_alignment_final.fa| tail -n1| sed ':a;N;\$!ba;s/\\n//g'|wc -c);
chomp $length;
}elsif($mode eq "alignment"){
$length = qx(grep 'LENG' hmmModels/Paralog${para}_exon${letter}.model| awk '{print \$2}');
};
if ($score =~ /\d/){
my $normalized_score = $score/$length;
print RESULT "$contig\t$para\t$letter\t$score\t$normalized_score\n";
if ($normalized_score > 1.5){
print TMP "$contig\t$para\t$letter\t$score\t$normalized_score\n";
}
}
}
close TMP;
close RESULT;
my $reliable_exons = `cat tmp| grep -v '^\$'| grep -v '#'|grep -v 'exons'|sort -k 2,3| awk '{print \$2"\t"\$3}'| uniq -d| grep -v -f - tmp| grep -v 'exons'`;
my $non_explicit_exons = `cat tmp| grep -v '^\$'| grep -v '#'|grep -v 'exons'|sort -k 2,3| awk '{print \$2"\t"\$3}'| uniq -d| grep -f - tmp`;
open (RESULT_2, "> Reliable_exons");
print RESULT_2 "#Contig\tParalog\tExon\tBitScore\tNormalizedBitScore\n";
print RESULT_2 "Reliable, explicit matching exons:\n$reliable_exons";
print RESULT_2 "Reliable, non-explicit matching exons:\n$non_explicit_exons";
{
my @reliable;
open (RESULT_2, "< Reliable_exons");
while (){
chomp;
unless ($_ =~ /exons/){
my @array = split "\t", $_;
push @reliable, [$array[0], $array[1], $array[2]];
}
}
#print "@reliable\n";
print "Reliability check of single TCEs completed.\n";
`rm tmp`;
close RESULT_2;
#Assemble the contigs with more than one hit toegther with the reliable single hits for the optimal solution 1, they are stored in the hash %complete (inside is another hash with TCEs as keys, contigs as values)
#Unreliable single TCEs are not further considered in the automated pipeline
my %complete;
open (EMSOUT, "< ExonMatchSolver3.out");
while (){
chomp;
if ($_ =~ /\#Solution 2/){
last;
} elsif ($_ =~ /\s(\d)+\s$/){
my @exon_order = split "\t", $_;
if (@exon_order > 3) {
my @sorted_array = (@exon_order[0,1], sort {$a <=> $b} @exon_order[2..$#exon_order]);
foreach my $i(0..$#reliable){
if ($sorted_array[1] =~ $reliable[$i][1]){
$complete{$exon_order[1]}{$reliable[$i][2]}=$reliable[$i][0];
}
foreach my $j(@sorted_array[2..$#sorted_array]){
$complete{$exon_order[1]}{$j}=$sorted_array[0];
}
}
}
}
};
seek EMSOUT, 0, 0;
print "Solution of the last run of ExonMatchSolver including contigs with single, reliable TCEs\n";
print Dumper (\%complete);
close EMSOUT;
# Check for every paralog if the previous contig/scaffold is the same as the current one. If not, keep these entries in %merge (array in this array keeps paralog, exon, contig)
my %merge;
my $i=0;
my $switch = 0;
for my $paralog (keys %complete){
my $previous_paralog="";
my $previous_exon ="";
for my $exon (sort {$a <=> $b} keys %{$complete{$paralog}}){
if ($complete{$paralog}{$exon} ne $previous_exon) {
if ($paralog ne '' && $previous_paralog ne '' && $previous_exon ne ''){
if (exists $merge{$paralog}{$previous_exon}){
for my $i (0..$#{$merge{$paralog}{$previous_exon}}){
if ($previous_paralog eq $merge{$paralog}{$previous_exon}[$i]){
$switch =1;
}
}
if ($switch != 1){
push (@{$merge{$paralog}{$previous_exon}}, $previous_paralog);
}
$switch = 0;
}elsif(!exists $merge{$paralog}{$previous_exon}){
$merge{$paralog}{$previous_exon} = [$previous_paralog,];
}
};
if ($paralog ne '' && $exon ne '' && $complete{$paralog}{$exon} ne ''){
if (exists $merge{$paralog}{$complete{$paralog}{$exon}}){
for my $j (0..$#{$merge{$paralog}{$complete{$paralog}{$exon}}}){
if ($exon eq $merge{$paralog}{$complete{$paralog}{$exon}}[$j]){
$switch =1;
}
}
if ($switch != 1){
push (@{$merge{$paralog}{$complete{$paralog}{$exon}}},$exon);
}
$switch = 0;
}elsif(!exists $merge{$paralog}{$complete{$paralog}{$exon}}){
$merge{$paralog}{$complete{$paralog}{$exon}} = [$exon,];
}
};
$previous_exon= $complete{$paralog}{$exon};
$previous_paralog= $exon;
}else{
$previous_exon=$complete{$paralog}{$exon};
$previous_paralog= $exon};
}
}
#print Dumper (\%merge);
my $exonCounter =0;
my %split;
my %cat;
my %test;
foreach my $para(keys %merge){
foreach my $contig(keys %{$merge{$para}}){
unless (-s "TargetSequences/${contig}.fa"){
`fastacmd -d $genome_target -s $contig| sed \'s/>lcl|/>/g\' > TargetSequences/${contig}.fa`;
}
}
}
for my $scaff(keys %merge){
my @moreExons;
#simplest case: only one scaffold
if (scalar (keys %{$merge{$scaff}}) != 1){
#for the cases with more than one scaffold
my @twoElements = ();
my @keys =();
my @exons1 = ();
my @sorted_exons1 = ();
my $switch = 0;
#for my $element (sort {$merge{$scaff}{$a} <=> $merge{$scaff}{$b}} keys %{$merge{$scaff}}){
#push @twoElements, $scaff;
#};
#my @uniqTwoElements = uniq (@twoElements);
#for my $scaffold(@uniqTwoElements){
for my $element (sort {$merge{$scaff}{$a}[0] <=> $merge{$scaff}{$b}[0]} keys %{$merge{$scaff}}){
foreach my $j(0..$#{$merge{$scaff}{$element}}){
push @exons1, $merge{$scaff}{$element}[$j];
};
};
#print "@exons1\n";
@sorted_exons1 = sort {$a <=> $b} @exons1;
#print "SORTED_EXONS: @sorted_exons1\t@exons1\n";
foreach my $i(0..$#exons1){
if($sorted_exons1[$i] != $exons1[$i]){
$switch = 1;
}
}
if ($switch == 1){
for my $element (keys %{$merge{$scaff}}){
foreach my $i (0..$#{$merge{$scaff}{$element}}){
$split{$scaff}{$element}{$merge{$scaff}{$element}[$i]} = [],;
};
}
}elsif ($switch == 0){
for my $element (keys %{$merge{$scaff}}){
foreach my $i (0..$#{$merge{$scaff}{$element}}){
$cat{$scaff}{${merge{$scaff}{$element}[0]}} = [$element,];
}
}
}
#}
delete $merge{$scaff};
}elsif(scalar (keys %{$merge{$scaff}}) == 1){
for my $element (keys %{$merge{$scaff}}){
foreach my $contig(keys %complete_contigs){
#print "TEST: $contig\t$element\t$scaff\n";
if ($element eq $contig){
delete $merge{$scaff};
}
}
}
}
}
#print Dumper (\%merge);
foreach my $para(keys %merge){
open (OUTPROSPLIGN, "> SearchTarget/Paralog${para}_target.fa");
foreach my $contig(keys %{$merge{$para}}){
if ($mode eq "fasta"|| $mode eq "user"){
my $strand_last ="";
my $first = `grep -P '$contig\t' ExonMatchSolver3.input|sort -nk 8,8| head -n 1| awk '{print \$2}'`;
my @first = split "_", $first;
if ($first[0]<$first[1]){
$strand = "+";
}elsif ($first[0] > $first[1]){
$strand = "-";
};
my $last = `grep -P '$contig\t' ExonMatchSolver3.input|sort -nk 8,8| tail -n 1| awk '{print \$2}'`;
my @last = split "_", $last;
if ($last[0] < $last[1]){
$strand_last = "+";
}elsif($last[0] > $last[1]){
$strand_last = "-";
}
if ($strand ne $strand_last){
print "WARNING: The paralog on contig $contig has high scoring blast/hmm-hits on different strands in the target genome. Check this region with the help of the ExonMatchSolver3.input. This might be a second paralog on the same contig or an assembly mistake. In the first case, split this contig in two parts so that each paralog is situated on a differently named contig and restart the pipeline. In the second case consider taking a different genome version/assembly.\n";
}
if ($strand eq "+"){
${merge{$para}{$contig}[1]}= $first[0] - $distance_first;
${merge{$para}{$contig}[2]} = $last[1]+ $distance_last;
}elsif($strand eq "-"){
${merge{$para}{$contig}[2]} = $first[1] + $distance_first;
${merge{$para}{$contig}[1]} = $last[0] - $distance_last;
};
}elsif($mode eq "alignment"){
my $strand_last ="";
my $length = qx(grep -v \'>\' TargetSequences/${contig}.fa| sed ':a;N;\$!ba;s/\\n//g'| wc -m);
my $first = `grep -P '$contig\t' ExonMatchSolver3.input|sort -nk 8,8| head -n 1| awk '{print \$2}'`;
my @first = split (/-/, $first);
$first[0] =~ s/^.+\[//g;
$first[1] =~ s/]//g;
if ($first[0]<$first[1]){
$strand = "+";
}elsif ($first[0] > $first[1]){
$strand = "-";
};
my $last = `grep -P '$contig\t' ExonMatchSolver3.input|sort -nk 8,8| tail -n 1| awk '{print \$2}'`;
my @last = split (/-/, $last);
$last[0] =~ s/^.+\[//g;
$last[1] =~ s/]//g;
if ($last[0] < $last[1]){
$strand_last = "+";
}elsif($last[0] > $last[1]){
$strand_last = "-";
}
if ($strand ne $strand_last){
print "WARNING: The paralog on contig $contig has high scoring blast/hmm-hits on different strands in the target genome. Check this region with the help of the ExonMatchSolver3.input. This might be a second paralog on the same contig or an assembly mistake. In the first case, split this contig in two parts so that each paralog is situated on a differently named contig and restart the pipeline. In the second case consider taking a different genome version/assembly.\n";
}
if ($strand eq "+"){
${complete_contigs{$contig}[2]}= $first[0] - $distance_first;
${complete_contigs{$contig}[3]} = $last[1]+ $distance_last;
}elsif($strand eq "-"){
${complete_contigs{$contig}[3]} = $first[1] + $distance_first;
${complete_contigs{$contig}[2]} = $last[0] - $distance_last;
};
};
if ($exonerate eq "no"){
if (-e "error"){
`rm error`;
};
`prosplign -nfa TargetSequences/${contig}.fa -pfa SearchQuery/Paralog${para}.fa -strand $strand -full -out SearchTarget/Paralog${para}_target.out -inf SearchTarget/Paralog${para}_target.inf 2> error`;
open (ERROR, "< error");
while (){
if ($_ =~ /std::bad_alloc/){
$exonerate = "yes";
}
}
if ($exonerate eq "yes"){
redo;
`rm SearchTarget/Paralog${para}_target.out`;
`rm SearchTarget/Paralog${para}_target.inf`;
}else{
my $result = `grep -A1 "[0-9]" SearchTarget/Paralog${para}_target.out | grep -v "[0-9]"| sed 's/ //g'| sed 's/-//g'| tr -d '\n'`;
$_ = $result;
s/([a-z])[a-z][a-z]/$1/g;
$result = $_;
chomp $result;
print OUTPROSPLIGN ">Paralog${para}\n$result\n";
close OUTPROSPLIGN;
};
}elsif ($exonerate eq "yes"){
open (OUT, "> Exonerate_help");
print OUT "Paralog${para}.fa\t$strand\t${merge{$para}{$contig}[1]}\t${merge{$para}{$contig}[2]}";
close OUT;
my @exonerate = "exonerate --model protein2genome --showalignment yes --useaatla false --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff no --score 60 --annotation Exonerate_help -q SearchQuery/Paralog${para}.fa -t TargetSequences/${contig}.fa > SearchTarget/Paralog${para}_target_complete.aln";
print "@exonerate\n";
system (@exonerate) == 0 or die "exonerate against query genome failed. $!\n";
`grep -A1 \'|\' SearchTarget/Paralog${para}_target_complete.aln | grep -v \'|\'| grep -v \'^\\-\\-\'| sed \'s/\\s//g\'| grep -v \'Model\'|sed \'s/{[A-Z]\*//g\'| sed \'s/}//g\'| sed \':a;N;\$!ba;s/\\n//g\' | sed \'s/[+|-]\\{2,4\\}/\\n/g\'| sed \'s/#//g\'| sed \'s/+//\'|sed \'s/-//\' > SearchTarget/Paralog${para}_target.aln`;
my $result = `sed ':a;N;\$!ba;s/\\n//g' SearchTarget/Paralog${para}_target.aln`;
chomp $result;
print OUTPROSPLIGN ">Paralog${para}_final\n$result\n";
}
};
}
#print "Merge\n";
#print Dumper (\%merge);
#print "Cat\n";
#print Dumper (\%cat);
#print "Split\n";
#print Dumper (\%split);
#For the case %cat
#Fill %cat with the following information: paralog => TCE => [contig, start, stop, strand, seq], start and stop are extracted from the first TCE on this contig
#seq is always extracted so that it is on the positive strand
#sort TCEs and cat sequences of the corresponding contigs to eachother
{
for my $para(keys %cat){
my $para_original = $para;
$para =~ s/(\d)\w/$1/;
open (FINALCAT, "> TargetSequences/Paralog${para_original}_merged.fa");
open (OUTCAT, "> SearchTarget/Paralog${para_original}_target_final.fa");
my $fastacat = "";
for my $exon (sort {$a <=> $b} keys %{$cat{$para_original}}){
my $seq = "";
$para_original = $para;
$para =~ s/(\d)\w/$1/;
my $loci = `grep -P '$cat{$para_original}{$exon}[0].*Paralog.*_exon${exon}\t' ExonMatchSolver3.input| sort -g -k4,4| head -n1| awk '{print \$2}'`;
chomp $loci;
if ($mode eq "fasta"|| $mode eq "user"){
my @temp = split (/_/, $loci);
foreach my $i (0..$#temp){
push @{$cat{$para_original}{$exon}}, $temp[$i];
};
}elsif ($mode eq "alignment"){
my @temp = split (/-/, $loci);
$temp[0] =~ s/^.+\[//g;
$temp[1] =~ s/]//g;
foreach my $i (0..$#temp){
push @{$cat{$para_original}{$exon}}, $temp[$i];
};
}
if ($cat{$para_original}{$exon}[1] > $cat{$para_original}{$exon}[2]){
$cat{$para_original}{$exon}[3] = "-";
}elsif($cat{$para_original}{$exon}[1] < $cat{$para_original}{$exon}[2]){
$cat{$para_original}{$exon}[3] = "+";
};
#}
if ($cat{$para_original}{$exon}[3] eq "-"){
$seq = `fastacmd -d $genome_target -s $cat{$para_original}{$exon}[0] -S 2 | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}elsif($cat{$para_original}{$exon}[3] eq "+"){
$seq = `fastacmd -d $genome_target -s $cat{$para_original}{$exon}[0]| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
};
chomp $seq;
#print "$seq\n";
$cat{$para_original}{$exon}[4] = $seq;
$fastacat .= $cat{$para_original}{$exon}[4];
};
$_ = $fastacat;
s/.{32000}/$&\n/g;
$fastacat = $_;
print FINALCAT ">Paralog${para_original}\n$fastacat\n";
close FINALCAT;
if ($exonerate eq "no"){
if (-e "error"){
`rm error`;
};
`prosplign -nfa TargetSequences/Paralog${para_original}_merged.fa -pfa SearchQuery/Paralog${para_original}.fa -strand + -full -out SearchTarget/Paralog${para_original}_target_final.out -inf SearchTarget/Paralog${para_original}_target_final.inf 2> error`;
open (ERROR, "< error");
while (){
if ($_ =~ /std::bad_alloc/){
$exonerate = "yes";
}
}
if ($exonerate eq "yes"){
redo;
`rm SearchTarget/Paralog${para_original}_target_final.inf`;
`rm SearchTarget/Paralog${para_original}_target_final.out`;
}else{
my $result = `grep -A1 "[0-9]" SearchTarget/Paralog${para_original}_target_final.out | grep -v "[0-9]"| sed 's/ //g'| sed 's/-//g'| tr -d '\n'`;
$_ = $result;
s/([a-z])[a-z][a-z]/$1/g;
$result = $_;
chomp $result;
print OUTCAT ">Paralog${para_original}_final\n$result\n";
close OUTCAT;
};
}elsif ($exonerate eq "yes"){
my @exonerate = "exonerate --model protein2genome --showalignment yes --useaatla false --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff no --score 60 -q SearchQuery/Paralog${para_original}.fa -t TargetSequences/Paralog${para_original}_merged.fa > SearchTarget/Paralog${para_original}_target_final_complete.aln";
print "@exonerate\n";
system (@exonerate) == 0 or die "exonerate against query genome failed. $!\n";
`grep -A1 \'|\' SearchTarget/Paralog${para_original}_target_final_complete.aln | grep -v \'|\'| grep -v \'^\\-\\-\'| sed \'s/\\s//g\'| grep -v \'Model\'|sed \'s/{[A-Z]\*//g\'| sed \'s/}//g\'| sed \':a;N;\$!ba;s/\\n//g\' | sed \'s/[+|-]\\{2,4\\}/\\n/g\' |sed \'s/#//g\'| sed \'s/+//\'|sed \'s/-//\' > SearchTarget/Paralog${para_original}_target_final.aln`;
my $result = `sed ':a;N;\$!ba;s/\\n//g' SearchTarget/Paralog${para_original}_target_final.aln`;
chomp $result;
print OUTCAT ">Paralog${para_original}_final\n$result\n";
close OUTCAT;
}
}
}
#For the case %split:
my $strand_overall_positive;
my $strand_overall_negative;
for my $para(keys %split){
my $para_original = $para;
$para =~ s/(\d)\w/$1/;
my $fastasplit = "";
my $difference = 0;
my $fastacat = "";
open (FINAL, "> TargetSequences/Paralog${para_original}_merged.fa");
#if ($exonerate eq "no"){
open (OUT, "> SearchTarget/Paralog${para_original}_target_final.fa");
#};
$strand_overall_positive= 0;
$strand_overall_negative= 0;
my @exons = ();
my @sorted_exons = ();
#Extract maxcontig, the contig with the highest numbers of exons. In case two contigs have the same number of exons, the first one is kept. The strand of this contig is used as reference for the other contigs of this paralog. Maxcontig is also used to extract the locus where the single contigs should be inserted in maxcontig (triples).
my $maxsize =0;
my $maxcontig =0;
my $switchMinus =0;
my $switchPlus =0;
my $seq = ();
my $majorstrand = 0;
my $scrambled = 0;
my @counter = ();
my %maxcontig;
my @maxsorted_exons =();
my %exons_minor;
for my $contig2(keys %{$complete{$para_original}}){
$maxcontig{$complete{$para_original}{$contig2}}++;
}
#print Dumper (\%maxcontig);
for my $contig0(keys %{$split{$para_original}}){
#print "$contig2\t$maxcontig{$contig2}\n";
push @counter, $maxcontig{$contig0};
}
my @sorted_counter = sort {$a <=> $b}@counter;
#print "$sorted_counter[-1]\n";
my %rmaxcontig = reverse %maxcontig;
$maxcontig = $rmaxcontig{$sorted_counter[-1]};
#print "MAX: $maxcontig\n";
for my $pa(keys %split){
for my $sca (keys %{$split{$pa}}){
if ($sca ne $maxcontig) {
for my $parax(keys %complete){
for my $exon(keys %{$complete{$parax}}){
if ($complete{$parax}{$exon} eq $sca) {
if (!exists $split{$parax}{$sca}{$exon}) {
$split{$parax}{$sca}{$exon} = [],;
}
}
}
}
}
}
}
#Extract the strand information and loci of blast hits for the exons in %split, at the same time count the strand for all exons to infer whether they are all on the negative strand ($strand_overall_positive = 0) or on the postive strand ($strand_overall_negative = 0)
for my $contig (keys %{$split{$para_original}}){
for my $exon (keys %{$split{$para_original}{$contig}}){
push @exons, $exon;
@sorted_exons = sort {$a <=> $b} @exons;
#print "@sorted_exons\n";
#print "$para_original\t$para\n";
my $loci = `grep -P '$contig.*Paralog.*_exon$exon\t' ExonMatchSolver3.input| sort -g -k4,4| head -n1| awk '{print \$2}'`;
chomp $loci;
#print "$loci\n";
if ($mode eq "user"|| $mode eq "fasta"){
my @temp = split (/_/, $loci);
foreach my $i (0..$#temp){
$split{$para_original}{$contig}{$exon}[$i] = $temp[$i];
};
}elsif ($mode eq "alignment"){
my @temp = split (/-/, $loci);
$temp[0] =~ s/^.+\[//g;
$temp[1] =~ s/]//g;
foreach my $i (0..$#temp){
$split{$para_original}{$contig}{$exon}[$i] = $temp[$i];
};
}
if ($split{$para_original}{$contig}{$exon}[0] > $split{$para_original}{$contig}{$exon}[1]){
$split{$para_original}{$contig}{$exon}[2] = "-";
$strand_overall_negative ++;
}elsif($split{$para_original}{$contig}{$exon}[0] < $split{$para_original}{$contig}{$exon}[1]){
$split{$para_original}{$contig}{$exon}[2] = "+";
$strand_overall_positive ++;
};
}
}
#print "$para: $strand_overall_positive\n";
#print "$para: $strand_overall_negative\n";
#For maxcontig, put all its exons in an array and sort it, extract the strand-information for maxcontig
#For the other contigs, extract the genomic sequence of the blast hit + 10 nt upstream and downstream, store it in %split in the strand orientation of maxcontig, if $strand_overall_positive or negative are 0, don't change the strands at all
for my $contig3(keys %{$split{$para_original}}){
#if($contig3 eq $maxcontig){
@exons = ();
for my $exon3(sort {$a <=> $b} keys %{$split{$para_original}{$maxcontig}}){
push @exons, $exon3;
};
@maxsorted_exons = sort {$a <=> $b} @exons;
#print "2: @maxsorted_exons\n";
for my $j (0..$#maxsorted_exons){
if (exists $split{$para_original}{$maxcontig}{$maxsorted_exons[$j]}[2]){
if ($split{$para_original}{$maxcontig}{$maxsorted_exons[$j]}[2] eq "-"){
$switchMinus ++;
}elsif ($split{$para_original}{$maxcontig}{$maxsorted_exons[$j]}[2] eq "+"){
$switchPlus ++;
}
}
};
if ($switchMinus >= 1 && $switchPlus >= 1){
print "WARNING: There are hits on the positive and negative strand of the same contig! This could be a scrambled locus. Please check: $maxcontig\n";
};
if ($switchMinus > $switchPlus){
$majorstrand = "-";
}elsif ($switchMinus < $switchPlus){
$majorstrand = "+";
};
#print "MAJORSTRAND: $majorstrand of $para_original\n";
$switchMinus = 0;
$switchPlus = 0;
for my $j (0..$#maxsorted_exons-1){
if ($majorstrand eq "+"){
if ($split{$para_original}{$maxcontig}{$maxsorted_exons[$j]}[0] > $split{$para_original}{$maxcontig}{$maxsorted_exons[$j+1]}[0]){
$scrambled = 1;
};
}elsif ($majorstrand eq "-"){
if ($split{$para_original}{$maxcontig}{$maxsorted_exons[$j]}[0] < $split{$para_original}{$maxcontig}{$maxsorted_exons[$j+1]}[0]){
$scrambled = 1;
};
};
};
if ($scrambled == 1){
print "WARNING: There are hits in a scrambled order on the same contig! Please check: $maxcontig\n";
$scrambled = 0;
};
#}
if ($contig3 ne $maxcontig){
#print "CONTROL: $contig3\n";
for my $exon3(sort {$a <=> $b} keys %{$split{$para_original}{$contig3}}){
$exons_minor{$exon3} = $contig3;
my $start = 0;
my $end = 0;
#print "$para\t$contig3\t$exon3\n";
#my $length = qx(grep -v \'>\' TargetSequences/${contig3}.fa |sed ':a;N;\$!ba;s/\\n//g'|wc -m);
if ($strand_overall_positive == 0 || $strand_overall_negative == 0){
if ($split{$para_original}{$contig3}{$exon3}[2] eq "-"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
}elsif ($mode eq "alignment"){
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
};
print "fastacmd -d $genome_target -s $contig3 -L $start,$end| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'\n";
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0 | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
}elsif ($split{$para_original}{$contig3}{$exon3}[2] eq "+"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}elsif ($mode eq "alignment"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}
print "fastacmd -d $genome_target -s $contig3 -L $start,$end | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'\n";
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
};
}elsif($majorstrand eq "+" && $strand_overall_negative != 0){
if ($split{$para_original}{$contig3}{$exon3}[2] eq "-"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
}elsif ($mode eq "alignment"){
#print "TEST:$split{$para_original}{$contig3}{$exon3}[1]\n";
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
}
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end -S2 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
print "fastacmd -d $genome_target -s $contig3 -L $start,$end -S2| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'\n";
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0 -S2| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
}elsif ($split{$para_original}{$contig3}{$exon3}[2] eq "+"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}elsif ($mode eq "alignment"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}
print "fastacmd -d $genome_target -s $contig3 -L $start,$end| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'";
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0 | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
};
}elsif($majorstrand eq "-" && $strand_overall_positive != 0){
if ($split{$para_original}{$contig3}{$exon3}[2] eq "-"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
}elsif ($mode eq "alignment"){
$start = $split{$para_original}{$contig3}{$exon3}[1] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[0] + 30;
}
print "fastacmd -d $genome_target -s $contig3 -L $start,$end| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'\n";
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0 | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
}elsif ($split{$para_original}{$contig3}{$exon3}[2] eq "+"){
if ($mode eq "fasta"|| $mode eq "user"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}elsif ($mode eq "alignment"){
$start = $split{$para_original}{$contig3}{$exon3}[0] - 30;
$end = $split{$para_original}{$contig3}{$exon3}[1] + 30;
}
print "fastacmd -d $genome_target -s $contig3 -L $start,$end -S2| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'\n";
$seq = `fastacmd -d $genome_target -s $contig3 -L $start,$end -S2 2> error| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
if ($seq eq ''){
$seq = `fastacmd -d $genome_target -s $contig3 -L 0,0 -S2 | grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
}
chomp $seq;
}
};
$split{$para_original}{$contig3}{$exon3}[3] = $seq;
};
};
};
#print Dumper (\%split);
#For the other contigs, find the next exon upstream and downstream on maxcontig
#Extract the sequence between these two exons on maxcontig, search for more than three consecutive Ns and substitute the N-region with the putative new exon (stored in %split)
#If the N-region is longer, fill up with Ns, if there are no Ns in the region, insert the sequence 10 nucleotides after the previous exon hit together with a N before and after
#Run exonerate on this merged sequence
#print "SORTED: @maxsorted_exons\n";
#print "$majorstrand\n$maxcontig\n";
if ($majorstrand eq "+"){
for my $contigMinorPlus(sort{$a <=> $b} keys %exons_minor){
my $contig4 = $exons_minor{$contigMinorPlus};
my $exon4 = $contigMinorPlus;
#for my $contig4(sort {${$split{$para_original}}{$a} <=> ${$split{$para_original}}{$b}} keys %{$split{$para_original}}){
#print "$contig4\t$contigMinorPlus\n";
if ($contig4 ne $maxcontig){
my $find = "";
my $addition;
my $replace;
#print "$contig4\t$maxcontig\n";
#for my $exon4(sort {$a <=> $b} keys %{$split{$para_original}{$contig4}}){
my @greater = ();
my @smaller= ();
my $after = ();
my $before = ();
my @greater_sorted =();
my @smaller_sorted=();
#print "$exon4\n";
for my $i (0..$#maxsorted_exons){
if ($exon4 > $maxsorted_exons[$i]){
push @greater, $maxsorted_exons[$i];
}elsif($exon4 < $maxsorted_exons[$i]){
push @smaller, $maxsorted_exons[$i];
}
};
#print "SMALLER/GREATER: @smaller\t@greater\n";
if ($fastasplit eq ""){
$fastasplit = `fastacmd -d $genome_target -s $maxcontig| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
chomp $fastasplit;
};
if (scalar(@smaller) == 0){
#$fastacat = `fastacmd -d $genome_target -s ${contig4}| grep -v '>'| sed ':a;N;\$!ba;s/\\n//g'`;
#chomp $fastacat;
#if ($majorstrand eq "+"){
my $fastasplit_old = $fastasplit;
$fastasplit = $fastasplit_old . $split{$para_original}{$contig4}{$exon4}[3];
#print "CONTROL: $fastasplit\n";
#my $difference_old = $difference;
#$difference = length ($fastasplit) - length ($fastasplit_old);
#print "DIFFERENCE: $difference\n";
}elsif (scalar(@greater) == 0){
#$fastacat = `fastacmd -d $genome_target -s ${contig4}| grep -v '>'| sed ':a;N;\$!ba;s/\\n//g'`;
#chomp $fastacat;
#if ($majorstrand eq "+"){
my $fastasplit_old = $fastasplit;
$fastasplit = $split{$para_original}{$contig4}{$exon4}[3] . $fastasplit_old;
#my $difference_old = $difference;
$difference = (length ($fastasplit) - length ($fastasplit_old));
#print "DIFFERENCE: $difference\n";
#print "CONTROL: $fastasplit\n";
}elsif(scalar(@greater) != 0 && scalar(@smaller) != 0){
@greater_sorted = sort {$a <=> $b} @greater;
@smaller_sorted = sort {$a <=> $b} @smaller;
#print "GREATER: @greater_sorted\n";
#print "SMALLER: @smaller_sorted\n";
$before = $greater_sorted[-1];
$after = $smaller_sorted[0];
#print "$before\t$after\n";
#print "$para_original\t$maxcontig\t$smaller_sorted[0]\n";
#print "$difference\n";
if ($split{$para_original}{$maxcontig}{$smaller_sorted[0]}[2] eq "+"){
#print "$maxcontig, $majorstrand\n";
print "$split{$para_original}{$maxcontig}{$before}[1], $split{$para_original}{$maxcontig}{$after}[0]\t$difference\n"; $find = substr ($fastasplit, $split{$para_original}{$maxcontig}{$before}[1] - 1 + $difference, ($split{$para_original}{$maxcontig}{$after}[0] - $split{$para_original}{$maxcontig}{$before}[1] + 1 + $difference));
}elsif ($split{$para_original}{$maxcontig}{$smaller_sorted[0]}[2] eq "-"){
#print "$split{$para_original}{$maxcontig}{$after}[0], $split{$para_original}{$maxcontig}{$before}[1]\t$difference\n";
$find = substr ($fastasplit, $split{$para_original}{$maxcontig}{$after}[0] - 1 + $difference, ($split{$para_original}{$maxcontig}{$before}[1] - $split{$para_original}{$maxcontig}{$after}[0] + 1 + $difference));
}
#print "$find\n";
my $seq = $split{$para_original}{$contig4}{$exon4}[3];
my $switch = 0;
#if ($majorstrand eq "+"){
if ($find =~ /((N){3,})/){
if (length $1 > length ($seq)){
$addition = "N" x (length ($1) - length ($seq) - 2);
}else {
$addition = "N";
};
$_ = $find;
s/((N){3,})/NN${seq}${addition}/;
$replace = $_;
}else{
$replace = substr ($find, 0, length($find)-10). "N" . $seq . "N" . substr ($find, length($find)-10, 10);
};
#print "REPLACE\n$replace\n";
#print "FIND\n$find\n";
my $fastasplit_old = $fastasplit;
my $pos = index($fastasplit,$find);
substr ($fastasplit, $pos, length($find), $replace);
$difference = length ($fastasplit) - length ($fastasplit_old);
#print "Diff: $difference\n";
#};
};
}
};
}elsif($majorstrand eq "-"){
for my $contigMinorMinus(sort{$b <=> $a} keys %exons_minor){
my $contig4 = $exons_minor{$contigMinorMinus};
my $exon4 = $contigMinorMinus;
#print "$contig4\t$contigMinorMinus\n";
if ($contig4 ne $maxcontig){
my $find = "";
my $addition;
my $replace;
#print "$contig4\t$maxcontig\n";
my @greater = ();
my @smaller= ();
my $after = ();
my $before = ();
my @greater_sorted =();
my @smaller_sorted=();
#print "$exon4\n";
for my $i (0..$#maxsorted_exons){
if ($exon4 > $maxsorted_exons[$i]){
push @greater, $maxsorted_exons[$i];
}elsif($exon4 < $maxsorted_exons[$i]){
push @smaller, $maxsorted_exons[$i];
}
};
#print "SMALLER/GREATER: @smaller\t@greater\n";
if ($fastasplit eq ""){
$fastasplit = `fastacmd -d $genome_target -s $maxcontig| grep -v '>'|sed ':a;N;\$!ba;s/\\n//g'`;
chomp $fastasplit;
};
if (scalar(@smaller) == 0){
my $fastasplit_old = $fastasplit;
$fastasplit = $split{$para_original}{$contig4}{$exon4}[3] . $fastasplit_old;
#my $difference_old = $difference;
$difference = length ($fastasplit) - length ($fastasplit_old);
}elsif (scalar(@greater) == 0){
my $fastasplit_old = $fastasplit;
$fastasplit = $fastasplit_old . $split{$para_original}{$contig4}{$exon4}[3];
}elsif(scalar(@greater) != 0 && scalar(@smaller) != 0){
@greater_sorted = sort {$a <=> $b} @greater;
@smaller_sorted = sort {$a <=> $b} @smaller;
#print "GREATER: @greater_sorted\n";
#print "SMALLER: @smaller_sorted\n";
$before = $greater_sorted[-1];
$after = $smaller_sorted[0];
#print "$before\t$after\n";
#print "$para_original\t$maxcontig\t$smaller_sorted[0]\n";
#print "$difference\n";
if ($split{$para_original}{$maxcontig}{$smaller_sorted[0]}[2] eq "+"){
#print "$maxcontig, $majorstrand\n";
#print "$split{$para_original}{$maxcontig}{$before}[1], $split{$para_original}{$maxcontig}{$after}[0]\t$difference\n";
$find = substr ($fastasplit, $split{$para_original}{$maxcontig}{$before}[1] - 1 + $difference, ($split{$para_original}{$maxcontig}{$after}[0] - $split{$para_original}{$maxcontig}{$before}[1] + 1 + $difference));
}elsif ($split{$para_original}{$maxcontig}{$smaller_sorted[0]}[2] eq "-"){
#print "$split{$para_original}{$maxcontig}{$after}[0], $split{$para_original}{$maxcontig}{$before}[1]\t$difference\n";
$find = substr ($fastasplit, $split{$para_original}{$maxcontig}{$after}[0] - 1 + $difference, ($split{$para_original}{$maxcontig}{$before}[1] - $split{$para_original}{$maxcontig}{$after}[0] + 1 + $difference));
}
#print "$find\n";
my $seq = $split{$para_original}{$contig4}{$exon4}[3];
my $switch = 0;
if ($find =~ /((N){3,})/){
if (length $1 > length ($seq)){
$addition = "N" x (length ($1) - length ($seq) - 2);
}else {
$addition = "N";
};
$_ = $find;
if ($strand eq "+"){
s/((N){3,})/$addition${seq}NN/;
$replace = $_;
}elsif($strand eq "-"){
s/((N){3,})/NN${seq}${addition}/;
$replace = $_;
}
}else{
$replace = substr ($find, 0, 10). "N" . $seq . "N" . substr ($find, 10, length($find));
};
#print "REPLACE\n$replace\n";
#print "FIND\n$find\n";
my $fastasplit_old = $fastasplit;
my $pos = index($fastasplit,$find);
substr ($fastasplit, $pos, length($find), $replace);
$difference = length ($fastasplit) - length ($fastasplit_old);
#print "Diff: $difference\n";
};
}
}
}
#print "Sorted exons: $sorted_exons[0]\t $sorted_exons[1]\n";
$_ = $fastasplit;
s/.{32000}/$&\n/g;
$fastasplit = $_;
print FINAL ">Paralog${para_original}_merged\n$fastasplit\n";
close FINAL;
if ($exonerate eq "no"){
if (-e "error"){
`rm error`;
};
if ($strand_overall_positive == 0){ #All exons of this paralog are on the negative strand
`prosplign -nfa TargetSequences/Paralog${para_original}_merged.fa -pfa SearchQuery/Paralog${para_original}.fa -strand - -full -out SearchTarget/Paralog${para_original}_target_final.out -inf SearchTarget/Paralog${para_original}_target_final.inf 2> error`;
}elsif ($strand_overall_negative == 0){ #All exons of this paralog are on the positive strand
`prosplign -nfa TargetSequences/Paralog${para_original}_merged.fa -pfa SearchQuery/Paralog${para_original}.fa -strand + -full -out SearchTarget/Paralog${para_original}_target_final.out -inf SearchTarget/Paralog${para_original}_target_final.inf 2> error`;
}elsif ($majorstrand eq "-"){ #The exons on the major contig are on the negative strand, but at least one exon is on the positive strand
`prosplign -nfa TargetSequences/Paralog${para_original}_merged.fa -pfa SearchQuery/Paralog${para_original}.fa -strand - -full -out SearchTarget/Paralog${para_original}_target_final.out -inf SearchTarget/Paralog${para_original}_target_final.inf 2> error`;
}elsif ($majorstrand eq "+"){ #The exons on the major contig are on the positive strand, but at least one exon is on the negative strand
`prosplign -nfa TargetSequences/Paralog${para_original}_merged.fa -pfa SearchQuery/Paralog${para_original}.fa -strand + -full -out SearchTarget/Paralog${para_original}_target_final.out -inf SearchTarget/Paralog${para_original}_target_final.inf 2> error`;
}
open (ERROR, "< error");
while (){
if ($_ =~ /std::bad_alloc/){
$exonerate = "yes";
}
}
if ($exonerate eq "yes"){
redo;
`rm SearchTarget/Paralog${para_original}_target_final.out`;
`rm SearchTarget/Paralog${para_original}_target_final.inf`;
}else{
my $result = `grep -A1 "[0-9]" SearchTarget/Paralog${para}_target_final.out | grep -v "[0-9]"| sed 's/ //g'| sed 's/-//g'| tr -d '\n'`;
$_ = $result;
s/([a-z])[a-z][a-z]/$1/g;
$result = $_;
chomp $result;
print OUT ">Paralog${para_original}_final\n$result\n";
close OUT;
};
}elsif ($exonerate eq "yes"){
`exonerate --model protein2genome --showalignment yes --useaatla false --showvulgar no --showcigar no --showsugar no --showquerygff no --showtargetgff no --score 60 -q SearchQuery/Paralog${para_original}.fa -t TargetSequences/Paralog${para_original}_merged.fa > SearchTarget/Paralog${para_original}_target_final_complete.aln`;
`grep -A1 \'|\' SearchTarget/Paralog${para_original}_target_final_complete.aln | grep -v \'|\'| grep -v \'^\\-\\-\'| sed \'s/\\s//g\'| grep -v \'Model\'|sed \'s/{[A-Z]\*//g\'| sed \'s/}//g\'| sed \':a;N;\$!ba;s/\\n//g\' | sed \'s/[+|-]\\{2,4\\}/\\n/g\'|sed \'s/#//g\'|sed \'s/+//\'|sed \'s/-//\' > SearchTarget/Paralog${para_original}_target_final.aln`;
my $result = `sed ':a;N;\$!ba;s/\\n//g' SearchTarget/Paralog${para_original}_target_final.aln`;
chomp $result;
print OUT ">Paralog${para_original}_final\n$result\n";
close OUT;
};
}
if (-e "Summary.fa"){
unlink "Summary.fa";
};
my @proteinfinal;
if ($WGD eq "yes"){
foreach my $paralog(@numbers){
if (!exists $NonDuplicated{$paralog}){
foreach my $duplicates(@duplicates){
if (-e "SearchTarget/Paralog${paralog}${duplicates}_target_final.fa" && -e "SearchQuery/Paralog${paralog}${duplicates}.fa"){
`cat SearchTarget/Paralog${paralog}${duplicates}_target_final.fa >> Summary.fa`;
`cat SearchQuery/Paralog${paralog}${duplicates}.fa >> Summary.fa`;
}elsif (-e "SearchQuery/Paralog${paralog}${duplicates}.fa" && -e "SearchTarget/Paralog${paralog}${duplicates}_target.fa"){
`cat SearchQuery/Paralog${paralog}${duplicates}.fa >> Summary.fa`;
`cat SearchTarget/Paralog${paralog}${duplicates}_target.fa >> Summary.fa`;
}
}
}else{
if (-e "SearchTarget/Paralog${paralog}a_target_final.fa" && -e "SearchQuery/Paralog${paralog}a.fa"){
`cat SearchTarget/Paralog${paralog}a_target_final.fa >> Summary.fa`;
`cat SearchQuery/Paralog${paralog}a.fa >> Summary.fa`;
}elsif (-e "SearchQuery/Paralog${paralog}a.fa" && -e "SearchTarget/Paralog${paralog}a_target.fa"){
`cat SearchQuery/Paralog${paralog}a.fa >> Summary.fa`;
`cat SearchTarget/Paralog${paralog}a_target.fa >> Summary.fa`;
}
};
};
}elsif($WGD eq "no"){
foreach my $paralog(@numbers){
if (-e "SearchTarget/Paralog${paralog}_target_final.fa" && -e "SearchQuery/Paralog${paralog}.fa"){
`cat SearchTarget/Paralog${paralog}_target_final.fa >> Summary.fa`;
`cat SearchQuery/Paralog${paralog}.fa >> Summary.fa`;
}elsif (-e "SearchQuery/Paralog${paralog}.fa" && -e "SearchTarget/Paralog${paralog}_target.fa"){
`cat SearchQuery/Paralog${paralog}.fa >> Summary.fa`;
`cat SearchTarget/Paralog${paralog}_target.fa >> Summary.fa`;
}
};
}
print "Scipio starts\n";
my @scipio = "scipio.1.4.1.pl $scipio_options $genome_target Summary.fa > Summary_scipio.yaml";
system (@scipio);# == 0 or warn "Scipio failed. $!\n";
my @gff = "yaml2gff.1.4.pl Summary_scipio.yaml > Summary_scipio.gff";
system (@gff) == 0 or warn "Conversion of yaml to gff failed. $!\n";
print "ExonMatchSolver pipeline completed\n";
}
}
my @ExonerateFiles = glob ("SearchTarget/*_target.aln");
foreach my $file(@ExonerateFiles){
`rm $file`;
};
my @ExonerateFiles2 = glob ("SearchTarget/*_target_final.aln");
foreach my $file(@ExonerateFiles2){
`rm $file`;
};
#unless ($mode eq "alignment"){
#`rm SearchTarget/*.blastout`;
#}
`rm -r SingleExons`;
`rm tmp*`;
if (-e "error"){
`rm error`;
}
if (-e "error.log"){
`rm error.log`;
}
if (-e "Exonerate_help"){
`rm Exonerate_help`;
}
#------------------------------------------------------------------------------------------------------
#FUNCTIONS
sub usage {
print STDERR "\nExonMatchSolver\nA pipeline assisting curation of gene annotations\n";
print STDERR "usage: ExonMatchSolver.pl -i -o -mode -target -[OPTIONS]\n";
print STDERR "\n";
print STDERR "[INPUT]\n";
print STDERR " -mode mode to be run: alignment (HMM are built), fasta or user\n";
print STDERR " -i input file (protein sequences, one per paralog of the protein family in the query species)\n";
print STDERR " -o output directory\n";
print STDERR " -target absolute path to the target genome\n";
print STDERR " -WGD WGD expected? Default: no\n";
print STDERR " -noWGD specify paralogs for which no WGD is expected separated by commas only (no space), if WGD is set to yes i.e. the specified paralogs are unduplicated\n";
print STDERR " -s spliced alignmnet programm used: exonerate or ProSplign, Default: no (ProSplign)\n";
print STDERR " -query absolute path to the query genome, required in fasta-mode\n";
print STDERR " -user input file with paralog-specific and TCE-individual protein sequences (.fa), required in user- and alignment-mode\n";
print STDERR " -m input folder with paralog-specific and TCE-individual proetin alignments (.stk, one alignment per paralog- and exon), required in alignment-mode\n";
print STDERR " -l length cutoff for the preparational step in fasta-mode\n";
print STDERR " -z Z-score cutoff applied during preparational step in fasta-mode. Default: 3 (3 standard deviations).\n";
print STDERR "[OPTIONS for BLAST]\n";
print STDERR " -dFirst nucleotide distance considered upstream of the first blast hit found\n";
print STDERR " -dLast nucleotide distance considered downstream of the last blast hit found\n";
print STDERR "[OPTIONS for the ILP]\n";
print STDERR " -int integer: round bitscore to a number dividable by this number; fraction: return all optimal solutions within this fraction\n";
print STDERR " -max maximal number of solution to be returned\n";
print STDERR "[OPTIONS for SCIPIO]\n";
print STDERR "-scipio_opt see scipio documentation for options\n";
print STDERR " -c number of cores available to use\n";
print STDERR " -h this (useful) help message\n";
print STDERR "[VERSION]\n";
print STDERR " 06-10-2015\n";
print STDERR "[BUGS]\n";
print STDERR " Please report bugs to henrike\@bioinf.uni-leipzig.de\n";
print STDERR "\n";
exit(-1);
}
sub prettyTime{
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my @weekDays = qw(Sun Mon Tue Wed Thu Fri Sat Sun);
my ($second, $minute, $hour, $dayOfMonth, $month,
$yearOffset, $dayOfWeek, $dayOfYear, $daylightSavings) = localtime();
my $year = 1900 + $yearOffset;
return "$hour:$minute:$second, $weekDays[$dayOfWeek] $months[$month] $dayOfMonth, $year";
}
sub printHeader{
print "\n";
print " ============================================================\n";
print " = ExonMatchSolver =\n";
print " = A pipeline assisting curation of gene annotations =\n";
print " ============================================================\n";
print " startet at ".prettyTime()."\n";
print " -> input file: $input\n";
print " -> output folder: $outfolder\n";
print "\n";
}
sub uniq {
my %seen;
return grep { !$seen{$_}++ } @_;
}