#!/usr/bin/perl -w # -*-Perl-*- # Last changed Time-stamp: <2008-12-22 12:24:14 jana> # # @motif = ( [i,j, {"AU" => score, "GC" => score,... }], ... ); # use strict; use Getopt::Long; use constant { INF => -99999}; use Data::Dumper; use vars qw/$opt_debug/; my $usage= << "JUS"; USAGE: perl $0 -i sequence-file [-a [-f][-s]] -m Motif options: -i file: file containing the sequence(s) -m motif: either 'KTurn' or 'CLoop' (not implemented yet) [REQUIRED] note: The input file is a Fasta file that contains the name of the sequence in the first line, the sequence itself in the second, and the last line must contain the structure information. results: Program $0 searches a single sequence for 3 dimensional motifs. If the motif could be found, the positions relative to the alignment are returned. example: perl $0 -i 18S.structures.fasta JUS my ($in_file, $is_aln, $is_fasta, $s_file, $motif, $ok); $is_aln = ''; $is_fasta = ''; $s_file = ""; #a=s a must be String #b=i b must be Integer #d d = 1 => if 'd' is defined, esle d = undef usage() unless GetOptions("d|debug" => \$opt_debug, "i=s" => \$in_file, "a" => \$is_aln, "f" => \$is_fasta, "s=s" => \$s_file, # external structure file "m=s" => \$motif, "k" => \$ok ); if($opt_debug || !$in_file || !$motif) { usage(); exit(1); } $ok = 1; # Watson-Crick+GU my %WCGU = ("AU" => 100/6, "UA" => 100/6, "CG" => 100/6, "GC" => 100/6, "GU" => 100/6, "UG" => 100/6); # Any base pair my %ANY; for my $x ('A','C','G','U') { for my $y ('A','C','G','U') { $ANY{$x.$y} = 100/16; } } # select motif my (@Minsert, @Mlog, @Motif); if($motif eq "CLoop") { @Minsert = ([[2,0,1]],[[1,0,2],[0,0,1]]); @Motif = ( [0,3, \%WCGU], [1,2, {"GC" => 92.7, "CG" => 2.7, "AU" => 2.7, "UA" => 0.3}], [2,1, {"CA" => 94.7}], [4,2, {"AC" => 0.3, "CC" => 0.1, "CG" => 22.0, "GA" => 69.3}], [5,1, {"UA" => 96.8, "GC" => 2.4, "CG" => 0.4, "CA" => 0.1}], [6,0, \%WCGU], [3,-1, {"A" => 100 }] ); } elsif($motif eq "KTurn") { @Minsert = ([[3,0,2]],[[4,0,1]]); @Motif = ( [0,8, \%WCGU], [1,7, \%ANY], [2,6, {"GA" => 88.3, "AA" => 3.8, "AC" => 6}], [3,5, {"AG" => 66.3, "AA" => 27.7, "AC" => 3.1, "AU" => 2.5}], [4,1, {"GC" => 57.2, "AU" => 41.1}], [3,2, {"AG" => 23.4, "AU" => 67.3, "AC" => 2.9, "AA" => 6.1}], [4,6, {"GA" => 57.7, "AA" => 40.9}], [5,0, \%WCGU]); } @Mlog = logify(\@Motif); #print Dumper(\@Mlog); #printf "first\n"; #printf "scoreM: %s\n",score_motif("CGCAAUG", "UACG", \@Mlog); #printf "23S C-50:%s\n",score_motif("AGCACUG", "CACU", \@Mlog); #printf "23S C-38:%s\n",score_motif("AGCACCG", "CACU", \@Mlog); #printf "16S C-15:%s\n",score_motif("CGCAAUG", "UACG", \@Mlog); #printf "23S C-96:%s\n",score_motif("GCCACUG", "CAGC", \@Mlog); #print score_motif("GUAACGC", "GCAGU", \@Mlog), "\n"; #print score_motif("GGGAGC","GCGAAGAAC", \@Mlog), "\n"; #print score_motif("GGGAGG","CCGAAGAAC", \@Mlog), "\n"; #printf "\n"; #my $nope=0; #for (1..100000) { # my ($s1,$s2); # for my $p (1..7) { # $s1 .= ('A','C','G','U')[rand(4)]; # } # for my $p (1..4) { # $s2 .= ('A','C','G','U')[rand(4)]; # } # my $s = score_motif($s1, $s2, \@Mlog); # # if ($s>INF) { # printf "%f %s %s\n",$s,$s1,$s2; # } else { # $nope++; # } #} #print $nope/10000,"\n"; #die; my (@input, @output, @o, @gapLess, %gap2seq); # RNAfold file is input file if($ok) { my (%data, $key); open(F, $in_file) or die $!; my @s = ; close(F); if(substr($s[0],0,1) ne ">") { printf "\nERROR: no RNAfold output (1)\n\n"; exit(0); } foreach my $s (@s) { chomp($s); if($s =~ /^>/) { $key = $s; } elsif($s =~ /^([AUGCNTaugcnt]+)/) { ${$data{$key}}[0] = $1; } elsif($s =~ /^([$\.$]*)\s+\S+/) { ${$data{$key}}[1] = $1; } else { printf "*%s*\n",$s; printf "\nERROR: no RNAfold output (2)\n\n"; exit(0); } } foreach my $key (keys %data) { my @output = searchSeq(${$data{$key}}[0],${$data{$key}}[1]); if(scalar @output != 0) { printf "%s\n",$key; print @output; } } } else { if($is_aln) { if($is_fasta) { # read fasta format # printf "\nAlignment search (FASTA):\n"; @input = readFastaAln($in_file,$s_file); } else { # read clustal format @input = readClwAln($in_file,$s_file); } @output = search(@input); if(scalar @output > 0 && $output[0] ne "") { printf ">%s\n",$in_file; print @output; printf "\n"; } } else { # do single sequence search # printf "\nSingle sequence search:\n\n"; @input = readSingleSeq($in_file,$s_file); @output = searchSeq($input[0], $input[1]); print @input; die; if(scalar @output > 0) { print @output; } else { printf "Motif not found\n"; } printf "\n"; } } # --- start FUNCTIONS --- ### # --- input reading functions # # 1) input is FASTA formatted alignment sub readFastaAln { my ($file, $str_f) = @_; my (@ret, $is_struc, $struc); my $i = -1; my $length = 0; open(F, $file) or die $!; my @lines = ; close(F); foreach my $l (@lines) { if($l =~ /^>structure/) { $is_struc = 1; $struc = ""; } elsif($l =~ /^>\S+/) { $is_struc = 0; $i++; $ret[$i] = ""; } elsif($l =~ /^([augctnAUGCTN-]+)$/ && $is_struc == 0) { $ret[$i] .= $1; $ret[$i] =~ s/T/U/g; } elsif($l =~ /^([\.]+)$/ && $is_struc == 1) { $struc .= $1; } } if($is_struc == 0 && $str_f eq "") { printf STDERR "\nERROR: Your fasta file does not contains a structure \n"; printf STDERR " and no external structure file is given\n\n"; usage(); exit(0); } elsif($is_struc == 0 && $str_f ne "") { my @x = split /\//,$file; open(S, $str_f) or die $!; my @murx = ; close(S); for(my $m=0; $m$x[scalar @x -1]/) { my @k = split /\s+/,$murx[$m+2]; $struc = $k[0]; last; } } close(S); } push @ret, $struc; $length = length $ret[0]; foreach my $s (@ret) { if(length $s != $length) { printf STDERR "\nERROR: Unequal sequence/structure length in alignment file\n\n"; print $usage; exit(0); } } return @ret; } # --------------------------------------------------------------------------------------- # 2) input is CLUSTAL W formatted alignment sub readClwAln { my ($file,$str_f) = @_; my $length = 0; my (@ret,%hash,$struc); if(length $str_f > 0) { my @x = split /\//,$file; # my $grep = `grep -A2 '$x[scalar @x -1]' $str_f`; my $bla = $file; $bla =~ s/\//\\\//g; my $grep = `grep -A2 '$bla' $str_f | awk '{if(NF>1) print \$1;}'`; # print $grep; $struc = $grep; $struc =~ s/\n$//; open(F, $file) or die $!; my @lines = ; close(F); my ($key,$seq); foreach my $l (@lines) { next if($l =~ /CLUSTAL/ || $l =~ /^\s*$/ || $l =~ /^[\s\*]*$/); if($l =~ /^(\S+)\s+(\S+)$/) { if(exists($hash{$1})) { $hash{$1} .= $2; $hash{$1} =~ s/T/U/g; } else { $hash{$1} = $2; $hash{$1} =~ s/T/U/g; } } } } else { open(F, $file) or die $!; my @lines = ; close(F); my ($key,$seq, $struc); $struc = ""; foreach my $l (@lines) { next if($l =~ /CLUSTAL/ || $l =~ /^\s*$/ || $l =~ /^[\s\*]*$/); next if($l=~ /^$/); if($l =~ /^structure\s+(\S+)/) { $struc .= $1; } elsif($l =~ /^(\S+)\s+(\S+)$/) { if(exists($hash{$1})) { $hash{$1} .= $2; $hash{$1} =~ s/T/U/g; } else { $hash{$1} = $2; $hash{$1} =~ s/T/U/g; } } } if(length $struc == 0) { printf STDERR "\nERROR: Your alignment file does not contain secondary\n"; printf STDERR "structrue information and no external structure file is given\n\n"; print $usage; exit(0); } } @ret = values %hash; push @ret,$struc; $length = length $ret[0]; printf "%d\n",$length; #foreach my $s (@ret) { # if(length($s) != $length) { # printf "%d\n",length($s); # printf STDERR "ERROR: Unequal sequence/structure length in alignment file\n\n"; # exit(0); # } # else {printf "%s\n",$s;} #} return @ret; } # --------------------------------------------------------------------------------------- # 2) input is single sequence sub readSingleSeq { my ($file,$str_f) = @_; my ($key,@ret); open(F, $file) or die $!; my @lines = ; close(F); if($lines[0] =~ /^>(.*)$/) { $ret[0] = ""; } else { print STDERR "\nInput file not a single sequence FASTA file\n\n"; } for(my $i=1; $istructure/) { $ret[1] = ""; } elsif($lines[$i] =~ /^([\.]+)$/) { $ret[1] .= $1; } } if(!exists($ret[1]) && $str_f eq "") { printf STDERR "\nERROR: Your fasta file contains no structure \n"; printf STDERR " and no external structure file is given\n\n"; print $usage; exit(0); } elsif(!exists($ret[1]) && $str_f ne "") { my @x = split /\//,$file; open(S, $str_f) or die $!; my @murx = ; close(S); for(my $m=0; $m$x[scalar @x -1]/) { my @k = split /\s+/,$murx[$m+2]; $ret[1] = $k[0]; last; } } close(S); } return @ret; } # --------------------------------------------------------------------------------------- ### # --- search functions # # 1) search Single sequence for Kink-turn motif sub searchSeq { # searches a single sequence for the Motif motif # args: seqence, structure # return: uniq @list of positions were motif was found my ($seq, $struc) = @_; # printf "%s\n%s\n",$seq,$struc; # printf "%-51s%s"," ",substr($seq,51); # chomp $seq; # $_ = $struc; # $struc = (split)[0]; # print $struc; die; my @pt = make_pair_table($struc); my ($minl, $maxl, $minr, $maxr, $nl, $nr) = (0,0,0,0,0,0); foreach my $i (@{$Minsert[0]}) { $minl += $i->[1]; $maxl += $i->[2]; } foreach my $i (@{$Minsert[1]}) { $minr += $i->[1]; $maxr += $i->[2]; } foreach my $p (@Motif) { $nl = $p->[0] if $p->[0]>$nl; $nr = $p->[1] if $p->[1]>$nr; } my ($l_ins,$r_ins) = explode_inserts(@Minsert); my $res = ""; my @ret; for my $i (0..length($seq)-1) { my $j = $pt[$i]; next if $j<0; for my $il ($minl .. $maxl) { my $k = $i+$nl+$il; next if $k>=$j; my $l = $pt[$k]; next if $l<$k; my $ir = $j-$l-$nr; next if (($ir<$minr) || ($ir>$maxr)); foreach my $lins (@{$l_ins->[$il]}) { foreach my $rins (@{$r_ins->[$ir]}) { if($i< (length $seq) - ($nl+$il+1) && $j<(length $seq) - ($nr+$ir+1)) { my ($s1,$s2) = splice_ins(substr($seq,$i,$nl+$il+1), substr($seq,$l,$nr+$ir+1), $lins, $rins); my $score = score_motif($s1,$s2,\@Mlog); # scores fuer mehr motife adden + verarbeiten $res = "$i,$j,$k,$l, $s1, $s2, $score\n" if $score>-10; # i: start motif 5' # j: end motif 3' # k: end motif 5' # l: start motif 3' # s1: 5' sequence # s2: 3' sequence # score: score # print $res; push @ret, $res; } } } } } return removeDbl(@ret); } # --------------------------------------------------------------------------------------- # 2) search aligned sequence for Kink-turn motif ### # 1. remove structure information on all positions were gaps occur in the aligned sequences # 2. search motif in all sequences (remove double entries) # 3. parse back to alignment coords, create cluster for each sequence # 4. check if hits overlap with respect to the alignment # if yes: retain alignment as positive # args: array of aligned sequences, such that consensus structure is last entry # sub search { my @arr = @_; # 1. remove structure information on all positions were gaps occur in the aligned sequences my $cons_struc = $arr[scalar @arr-1]; my (@singles, $tmpSeq, $tmpStr, @gapLess, @cluster, @cl_all); # print join "\n",@arr; splice @arr,scalar @arr - 1,1; foreach my $seq (@arr) { my @seq = split //,$seq; my @struc = split //,$cons_struc; my %gap2seq; for(my $i=0; $i0) { $p--; $l++ if($struc[$p] eq "("); $r++ if($struc[$p] eq ")"); } $struc[$p] = "."; $struc[$s] = "y"; $s = $p; } elsif($struc[$s] eq "(") { $p = $s; $l = 1; $r = 0; while(($l-$r)>0) { $p++; $l++ if($struc[$p] eq "("); $r++ if($struc[$p] eq ")"); } $struc[$p] = "."; $struc[$s] = "y"; } } $s++; }while((join "",@seq) =~ /\-/); my $sequence = join "",@seq; my $structure = join "",@struc; $sequence =~ s/x//g; $structure =~ s/y//g; # printf "%s\n%s\n%s\n%s\n",$seq,$cons_struc,$sequence,$structure; # 2. search motif in current sequence (remove double entries) my @t = searchSeq($sequence,$structure); if(scalar @t == 0) { return ""; } # print @t; # print "\n"; # 3a. parse back to alignment coords, create cluster for each sequence my @parsed; foreach my $part (@t) { next if(length $part == 0); my @coords = split /\,/,$part; # before 5' stem => change all values my $add = 0; for(my $p=0; $p<=$coords[0]; $p++) { if($gap2seq{$p} == 1) { $add++; } } if($add > 0) { for(my $c=0; $c<4; $c++) { $coords[$c] += $add; } } # in 5' stem => change only last 3 values $add = 0; for(my $p=$coords[0]; $p<=$coords[2]; $p++) { if($gap2seq{$p} == 1) { $add++; } } if($add > 0) { for(my $c=1; $c<4; $c++) { $coords[$c] += $add; } } # inbetween => change only second and fourth value $add = 0; for(my $p=$coords[2]; $p<=$coords[3]; $p++) { if($gap2seq{$p} == 1) { $add++; } } if($add > 0) { $coords[1] += $add; $coords[3] += $add; } # in 3' stem => change only second value $add = 0; for(my $p=$coords[3]; $p<=$coords[1]; $p++) { if($gap2seq{$p} == 1) { $add++; } } if($add > 0) { $coords[1] += $add; } push @parsed,join " ",@coords; } if(scalar @parsed == 0) { return ""; } # 3b. create cluster for each sequence my ($t); my @coords = split /\s+/,$parsed[0]; my $s5 = scalar $coords[0]; my $e5 = scalar $coords[2]; my $s3 = scalar $coords[3]; my $e3 = scalar $coords[1]; my $sum_score = $coords[6]; for(my $p=1; $p scalar $b) { return $a; } else { return $b; } } sub removeDbl { my @arr = @_; my %map; foreach my $a (@arr) { $map{$a} = 1; } return keys %map; } # PURPOSE: Find out if region s2-e2 overlaps to x% with region s1-e1 # x% in (0,1) # RETURNS: 1 in case they ovelap, otherwise 0. sub overlap { my($s1, $e1, $s2, $e2) = @_; my $overlap = 0; my $type = ''; if($e1<$s2 || $e2<$s1){ # no overlapping $type = "no ol"; #return (0, $type, 0, 0); $overlap = 0; } elsif($s1<=$s2 && $e1>=$e2){ # complete overlapping $overlap = $e2-$s2+1; $type = 'compl'; } elsif($s2<=$s1 && $e2>=$e1){ # complete overlapping $overlap = $e1-$s1+1; $type = 'compl'; } elsif($s1>=$s2 && $s1<=$e2){ # partly overlapping $overlap = $e2-$s1+1; $type = 'part'; } elsif($s2>=$s1 && $s2<=$e1){ # partly overlapping $overlap = $e1-$s2+1; $type = 'part'; } else{ die "ERROR: One case of overlapping not detected (sub 'overlapping')!\n\n"; } if($overlap == 0) { return 0;} else { return 1; } } # --------------------------------------------------------------------------------------- ### # --- motif functions # # 1) score found motif sub score_motif { # score the core motif without insertions # (i.e. splice out insertions before calling score_motif()) # $s1 = 5' stem sequence # $s2 = 3' stem sequence my ($s1, $s2, $motif) = @_; my $score = 0; foreach my $p (@$motif) { my ($i,$j,$hr) = @$p; # if($j< (length $s2) - 2 && $j<(length $s1) -2) { if($j< (length $s2) && $i<(length $s1)) { if($i == -1) { return INF unless exists($hr->{substr($s2,$j,1)}); $score += $hr->{substr($s2,$j,1)}; next; } if($j == -1) { return INF unless exists($hr->{substr($s1,$i,1)}); $score += $hr->{substr($s1,$i,1)}; next; } return INF unless exists($hr->{substr($s1,$i,1) . substr($s2,$j,1)}); $score += $hr->{substr($s1,$i,1) . substr($s2,$j,1)}; # printf "score +: %f %f %s %d %d\n",$score,$hr->{substr($s1,$i,1) . substr($s2,$j,1)},substr($s1,$i,1) . substr($s2,$j,1), $i, $j; # print Dumper($hr); } } return $score; } # --------------------------------------------------------------------------------------- # 2) convert probs to log odd scores, does a deep copy on the way sub logify { my $motif = shift; my @new; foreach my $p (@$motif) { my ($i,$j,$hr) = @$p; my %h; my $factor = 16; if($i == -1 || $j == -1) {$factor = 4;} foreach my $k (keys %$hr) { $h{$k} = log($factor*$hr->{$k}/100); } push @new, [$i,$j,\%h]; } return @new; } # --------------------------------------------------------------------------------------- # 3) produce the list of all possible insertions (left and right) sub explode_inserts { my ($lins, $rins) = @_; my @left = explode_once(@$lins); my @right = explode_once(@$rins); return (\@left, \@right); } # --------------------------------------------------------------------------------------- # 4) get all possible insertions sub explode_once { my @ins = @_; my @list = (); my $tot = 1; # compute total number of combinations foreach my $in (@ins) { my ($pos,$min,$max) = @$in; $tot *= $max-$min+1; } for my $i (0..$tot-1) { # produce combination number $index my $index = $i; my @stack = (); my $sum = 0; foreach my $in (@ins) { use integer; my ($pos,$min,$max) = @$in; # number of insertions at this position my $k = ($index % ($max-$min+1)) + $min; push @stack, [$pos, $k]; $sum += $k; $index /= $max-$min+1; } # add this combination to list indexed by total length of insertion push @{$list[$sum]}, [@stack]; } return @list; } # --------------------------------------------------------------------------------------- # 5) splice insertions sub splice_ins { my ($s1, $s2 ,$lins, $rins) = @_; my @lins = sort {$a->[0] <=> $b->[0]} @{$lins}; my @rins = sort {$a->[0] <=> $b->[0]} @{$rins}; foreach my $i (@lins) { substr($s1,$i->[0],$i->[1],''); } foreach my $i (@rins) { substr($s2,-$i->[0],$i->[1],''); } return ($s1,$s2); } # --------------------------------------------------------------------------------------- # 6) base pair table sub make_pair_table { my($structure) = shift; #indices start at 0 in this version! my($j, @olist, @table); # print $structure; die; my ($hx,$i) = (0,0); my @st_ar = split //,$structure; foreach my $c (@st_ar) { if ($c eq '.') { $table[$i]= -1; } elsif ($c eq '(') { $olist[$hx++]=$i; } elsif ($c eq ')') { $j = $olist[--$hx]; if($hx<0) { printf STDERR "ERROR:\nUnbalanced brackets in make_pair_table\n"; exit(0); } $table[$i]=$j; $table[$j]=$i; } $i++; } if($hx!= 0) { printf STDERR "ERROR:\nToo few closed brackets in make_pair_table\n"; exit(0); } return @table; } # --- end FUNCTIONS --- # --- end PROGRAM ---