Raw content of Bio::EnsEMBL::Pipeline::Alignment::InformativeSites package Bio::EnsEMBL::Pipeline::Alignment::InformativeSites; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment; use Bio::EnsEMBL::Utils::Exception qw(throw warning info); use Bio::EnsEMBL::Utils::Argument qw(rearrange); # Object preamble - inherits from Bio::Root::Object; @ISA = qw(Bio::EnsEMBL::Root); sub new { my ($class, @args) = @_; my $self = bless {},$class; my ($translatable_seqs, $alignments ) = rearrange([qw(TRANSLATABLE_SEQS ALIGNMENTS )],@args); throw("No sequences to align.") unless $translatable_seqs; $self->_seqs($translatable_seqs); throw("No alignments to reconcile.") unless (defined $alignments); $self->_alignments($alignments); throw("Ids of translatable sequences do not fully match with " . "alignment ids. This sometimes happens because one of " . "the clustered genes has zero EST matches and hence no " . "alignment.") unless $self->_check_ids; return $self } sub informative_sites { my $self = shift; my $gene_informative_sites = $self->_informative_site_strings; my %inform_site_aligns; foreach my $seq (@{$self->_seqs}) { # Avoid making sequences where no informative sites exist # ie. identical sequences. next unless $gene_informative_sites->{$seq->display_id} =~ /\*/; $inform_site_aligns{$seq->display_id} = $self->_derive_informative_site_alignment( $seq, $gene_informative_sites->{$seq->display_id}); } return \%inform_site_aligns } sub _seqs { my $self = shift; if (@_) { $self->{_seqs} = shift; throw("Was expecting the translatable sequences to be Bio::Seq objects.") unless (scalar @{$self->{_seqs}} > 0 && defined $self->{_seqs}->[0] && $self->{_seqs}->[0]->isa("Bio::Seq")); foreach my $seq (@{$self->{_seqs}}){ throw("Gaps found in translatable sequence. These should be gap-free.") if $seq->seq =~ /-/; } } return $self->{_seqs} } sub _alignments { my $self = shift; if (@_) { $self->{_aligns} = shift; throw("Was expecting alignments to be " . "Bio::EnsEMBL::Pipeline::Alignment objects.") unless (scalar @{$self->{_aligns}} > 0 && defined $self->{_aligns}->[0] && $self->{_aligns}->[0]->isa("Bio::EnsEMBL::Pipeline::Alignment")); $self->{_align_vs_id} = undef; } return $self->{_aligns} } sub _retrieve_align_by_id { my ($self, $id) = @_; unless (defined $self->{_align_vs_id}) { foreach my $align (@{$self->_alignments}){ $self->{_align_vs_id}->{$align->alignment_name} = $align } } warning ("Alignment with identifier [$id] does not exist.") unless $self->{_align_vs_id}->{$id}; return $self->{_align_vs_id}->{$id} } sub _check_ids { my $self = shift; my %alignment_names; my $verdict = 1; foreach my $align (@{$self->_alignments}){ $alignment_names{$align->alignment_name}++; } foreach my $seq (@{$self->_seqs}){ $verdict = 0 unless $alignment_names{$seq->display_id} } return $verdict } sub _informative_site_strings { my $self = shift; # Do a codon based alignment on our translatable sequences my $cba = Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment->new( -genetic_code => 1); $cba->sequences($self->_seqs); my $aligned_seqs = $cba->run_alignment; # Work through the length of this alignment marking sequence # variations/informative sites. Informative site information # is stored as a long string, the same length as each UNALIGNED # sequence. Informative sites are marked as '*' and uninformative # sites are marked as '-' my @seq_arrays; my @seq_name; foreach my $realigned_seq (@$aligned_seqs) { my @seq_array = split //, $realigned_seq->seq; push @seq_arrays, \@seq_array; push @seq_name, $realigned_seq->display_id; } my $alignment_length = length($aligned_seqs->[0]->seq); my %informative_site_string; for (my $i = 0; $i < $alignment_length; $i++) { my $informative_site = 0; my $reference_base = $seq_arrays[0]->[$i]; for (my $j = 1; $j < scalar @seq_arrays; $j++) { if ($seq_arrays[$j]->[$i] ne $reference_base) { $informative_site = 1; last } } for (my $j = 0; $j < scalar @seq_arrays; $j++) { if ($informative_site && $seq_arrays[$j]->[$i] ne '-') { $informative_site_string{$seq_name[$j]} .= '*'; } elsif ($seq_arrays[$j]->[$i] ne '-') { $informative_site_string{$seq_name[$j]} .= '-'; } } } @seq_arrays = undef; return \%informative_site_string } sub _derive_informative_site_alignment { my ($self, $seq, $informative_sites_string) = @_; # Put gene sequence somewhere handy my $gene_string = $seq->seq; my $gene_length = length($gene_string); #print STDERR ">gene seq\n" . $gene_string . "\n"; # Take our existing external EvidenceAlignment and # put the evidence sequences somewhere accessible. my $ext_align = $self->_retrieve_align_by_id($seq->display_id); my %evidence_strings; foreach my $align_seq (@{$ext_align->fetch_AlignmentSeqs}){ $evidence_strings{$align_seq->name} = $align_seq->seq; } my $align_length = length($evidence_strings{'genomic_sequence'}); # Place the sequence for gene onto the identical sequence in the # external alignment. Start by finding the 5 prime end of the # gene and slide along the length of the alignment, matching identical # bases and hopping gaps in the sequence and intron. Where # an informative site is apparently, splice out these bases from all # evidence sequences to create strings of informative sites. my %aligned_informative_site_strings; my $exon_string = $evidence_strings{'exon_sequence'}; $exon_string =~ s/intron-truncated/----------------/g; #print STDERR ">exon seq\n$exon_string\n"; my $align_coord = 0; my $lost = 1; for (my $gene_coord = 0; $gene_coord < $gene_length; $gene_coord++) { #print STDERR "Gene coord : " . $gene_coord . "\n"; # This prone-to-breakage code loop shuffles along the alignment # and tries and place the unaligned gene. As the gene sequence should be # base-for-base identical to the exonic sequence in the alignment (barring # missing UTR sequence), the code tries to place 10bp substrs of the gene # against the aligned exonic sequence. if ($lost) { my $gene_seed_string = substr($gene_string, $gene_coord, 10); #print STDERR " Gene seed : " . $gene_seed_string . "\n"; LOST: while (1){ my $exon_seed_string = substr($exon_string, $align_coord, 10); # Move along if we have starting gaps if ($exon_seed_string =~ /^(-+)/){ $align_coord += length($1); next LOST; } # Make sure we havent run off the end of the sequence throw("Have run off the end of the alignment without matching the gene seed. Bugger.") if ($align_coord >= $align_length); # Get on with extending the exon seed, without gaps. $exon_seed_string =~ s/-//g; #print STDERR " Align coord : " . $align_coord . " Seed : $exon_seed_string\n"; my $increment = 0; EXTENTION: while (length ($exon_seed_string) < 8){ $increment++; $exon_seed_string = substr($exon_string, $align_coord, 10 + $increment); if ($exon_seed_string =~ /-$/){ $increment++; $exon_seed_string =~ s/-//g; next EXTENTION } $exon_seed_string =~ s/-//g; throw("Have run off the end of the alignment without matching the gene seed. Bugger.") if ($align_coord + 10 + $increment >= $align_length); #print STDERR " Extending exon seed : " . $exon_seed_string . "\n"; } if ($gene_seed_string =~ /^$exon_seed_string/) { #print STDERR "Sequence FOUND\n"; $lost = 0; last LOST } $align_coord++; } } # Sanity check #print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n"; throw("Gene and aligned exon sequence have unexpectedly become unaligned.") unless substr($gene_string, $gene_coord, 1) eq substr($exon_string, $align_coord, 1); # Do our work while we are still aligned #print STDERR "Trying to get some work done.\n"; if (substr($informative_sites_string, $gene_coord, 1) eq '*'){ #print STDERR " Found an informative site.\n"; foreach my $evidence_id (keys %evidence_strings) { # next if (($evidence_id eq 'genomic_sequence') || # ($evidence_id eq 'exon_sequence')); $aligned_informative_site_strings{$evidence_id} .= substr($evidence_strings{$evidence_id}, $align_coord, 1) } } else { #print STDERR "Not an informative site, not interested and moving right along.\n"; } # Do a quick check that the next base of the gene will # align to the alignment. If there is gap, try hopping # over it, before assuming that we are lost again. my $hop = 1; if (($gene_coord + 1 <= $gene_length)&& (substr($exon_string, $align_coord + $hop, 1) ne '-')) { #print STDERR "Next base is not a gap, hence not trying to hop it.\n"; # Gene coord is about to be incremented in the next iteration # of this loop, so had better do this to $align_coord as well. $align_coord++; next } else { #print STDERR "Next base is a gap. Trying to hop.\n"; while (substr($exon_string, $align_coord + $hop, 1) eq '-'){ $hop++; #print STDERR " Little hop.\n"; while (substr($exon_string, $align_coord + $hop, 10) eq '-' x 10) { $hop += 10; #print STDERR " Big hop.\n"; while (substr($exon_string, $align_coord + $hop, 100) eq '-' x 100) { $hop += 100; #print STDERR " Really big hop.\n"; } } } $align_coord += $hop; #print STDERR "Align coord after hop is $align_coord.\n"; #print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord + 1, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n"; my $exon_neighbourhood = substr($exon_string, $align_coord, 10); $exon_neighbourhood =~ s/-//g; if ((! substr($gene_string, $gene_coord + 1, 10) =~ /^$exon_neighbourhood/) && ($gene_coord < (length($gene_string) - 1))) { warning("Erk. Have managed to become lost."); $lost = 1; } } } #print STDERR "DONE.\n"; # Finally, quickly make an alignment object out of the # informative site strings. my $informative_sites_alignment = Bio::EnsEMBL::Pipeline::Alignment->new( -name => $seq->display_id); foreach my $evidence_id (keys %aligned_informative_site_strings) { my $new_is_seq = Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq->new( -name => $evidence_id, -seq => $aligned_informative_site_strings{$evidence_id}); $informative_sites_alignment->add_sequence($new_is_seq); } return $informative_sites_alignment; }