Bio::EnsEMBL::Pipeline::Alignment InformativeSites
Included librariesPackage variablesGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning info )
Inherit
Bio::EnsEMBL::Root
Synopsis
No synopsis!
Description
No description!
Methods
_alignments
No description
Code
_check_ids
No description
Code
_derive_informative_site_alignment
No description
Code
_informative_site_strings
No description
Code
_retrieve_align_by_id
No description
Code
_seqs
No description
Code
informative_sites
No description
Code
new
No description
Code
Methods description
None available.
Methods code
_alignmentsdescriptionprevnextTop
sub _alignments {
  my $self = shift;

  if (@_) {
    $self->{_aligns} = shift;

    throw("Was expecting alignments to be " .
	  "Bio::EnsEMBL::Pipeline::Alignment objects.")
      unless (scalar @{$self->{_aligns}} > 0 &&
	      defined $self->{_aligns}->[0] &&
	      $self->{_aligns}->[0]->isa("Bio::EnsEMBL::Pipeline::Alignment"));

    $self->{_align_vs_id} = undef;
  }

  return $self->{_aligns}
}
_check_idsdescriptionprevnextTop
sub _check_ids {
  my $self = shift;

  my %alignment_names;
  my $verdict = 1;

  foreach my $align (@{$self->_alignments}){
    $alignment_names{$align->alignment_name}++;
  }

  foreach my $seq (@{$self->_seqs}){
    $verdict = 0 
      unless $alignment_names{$seq->display_id}
  }

  return $verdict
}
_derive_informative_site_alignmentdescriptionprevnextTop
sub _derive_informative_site_alignment {
  my ($self, $seq, $informative_sites_string) = @_;

  # Put gene sequence somewhere handy
my $gene_string = $seq->seq; my $gene_length = length($gene_string); #print STDERR ">gene seq\n" . $gene_string . "\n";
# Take our existing external EvidenceAlignment and
# put the evidence sequences somewhere accessible.
my $ext_align = $self->_retrieve_align_by_id($seq->display_id); my %evidence_strings; foreach my $align_seq (@{$ext_align->fetch_AlignmentSeqs}){ $evidence_strings{$align_seq->name} = $align_seq->seq; } my $align_length = length($evidence_strings{'genomic_sequence'}); # Place the sequence for gene onto the identical sequence in the
# external alignment. Start by finding the 5 prime end of the
# gene and slide along the length of the alignment, matching identical
# bases and hopping gaps in the sequence and intron. Where
# an informative site is apparently, splice out these bases from all
# evidence sequences to create strings of informative sites.
my %aligned_informative_site_strings; my $exon_string = $evidence_strings{'exon_sequence'}; $exon_string =~ s/intron-truncated/----------------/g; #print STDERR ">exon seq\n$exon_string\n";
my $align_coord = 0; my $lost = 1; for (my $gene_coord = 0; $gene_coord < $gene_length; $gene_coord++) { #print STDERR "Gene coord : " . $gene_coord . "\n";
# This prone-to-breakage code loop shuffles along the alignment
# and tries and place the unaligned gene. As the gene sequence should be
# base-for-base identical to the exonic sequence in the alignment (barring
# missing UTR sequence), the code tries to place 10bp substrs of the gene
# against the aligned exonic sequence.
if ($lost) { my $gene_seed_string = substr($gene_string, $gene_coord, 10); #print STDERR " Gene seed : " . $gene_seed_string . "\n";
LOST: while (1){ my $exon_seed_string = substr($exon_string, $align_coord, 10); # Move along if we have starting gaps
if ($exon_seed_string =~ /^(-+)/){ $align_coord += length($1); next LOST; } # Make sure we havent run off the end of the sequence
throw("Have run off the end of the alignment without matching the gene seed. Bugger.") if ($align_coord >= $align_length); # Get on with extending the exon seed, without gaps.
$exon_seed_string =~ s/-//g; #print STDERR " Align coord : " . $align_coord . " Seed : $exon_seed_string\n";
my $increment = 0; EXTENTION: while (length ($exon_seed_string) < 8){ $increment++; $exon_seed_string = substr($exon_string, $align_coord, 10 + $increment); if ($exon_seed_string =~ /-$/){ $increment++; $exon_seed_string =~ s/-//g; next EXTENTION } $exon_seed_string =~ s/-//g; throw("Have run off the end of the alignment without matching the gene seed. Bugger.") if ($align_coord + 10 + $increment >= $align_length); #print STDERR " Extending exon seed : " . $exon_seed_string . "\n";
} if ($gene_seed_string =~ /^$exon_seed_string/) { #print STDERR "Sequence FOUND\n";
$lost = 0; last LOST } $align_coord++; } } # Sanity check
#print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n";
throw("Gene and aligned exon sequence have unexpectedly become unaligned.") unless substr($gene_string, $gene_coord, 1) eq substr($exon_string, $align_coord, 1); # Do our work while we are still aligned
#print STDERR "Trying to get some work done.\n";
if (substr($informative_sites_string, $gene_coord, 1) eq '*'){ #print STDERR " Found an informative site.\n";
foreach my $evidence_id (keys %evidence_strings) { # next if (($evidence_id eq 'genomic_sequence') ||
# ($evidence_id eq 'exon_sequence'));
$aligned_informative_site_strings{$evidence_id} .= substr($evidence_strings{$evidence_id}, $align_coord, 1) } } else { #print STDERR "Not an informative site, not interested and moving right along.\n";
} # Do a quick check that the next base of the gene will
# align to the alignment. If there is gap, try hopping
# over it, before assuming that we are lost again.
my $hop = 1; if (($gene_coord + 1 <= $gene_length)&& (substr($exon_string, $align_coord + $hop, 1) ne '-')) { #print STDERR "Next base is not a gap, hence not trying to hop it.\n";
# Gene coord is about to be incremented in the next iteration
# of this loop, so had better do this to $align_coord as well.
$align_coord++; next } else { #print STDERR "Next base is a gap. Trying to hop.\n";
while (substr($exon_string, $align_coord + $hop, 1) eq '-'){ $hop++; #print STDERR " Little hop.\n";
while (substr($exon_string, $align_coord + $hop, 10) eq '-' x 10) { $hop += 10; #print STDERR " Big hop.\n";
while (substr($exon_string, $align_coord + $hop, 100) eq '-' x 100) { $hop += 100; #print STDERR " Really big hop.\n";
} } } $align_coord += $hop; #print STDERR "Align coord after hop is $align_coord.\n";
#print STDERR "Gene neighbourhood : " . substr($gene_string, $gene_coord + 1, 10) . " Exon neighbourhood : " . substr($exon_string, $align_coord, 10) . "\n";
my $exon_neighbourhood = substr($exon_string, $align_coord, 10); $exon_neighbourhood =~ s/-//g; if ((! substr($gene_string, $gene_coord + 1, 10) =~ /^$exon_neighbourhood/) && ($gene_coord < (length($gene_string) - 1))) { warning("Erk. Have managed to become lost."); $lost = 1; } } } #print STDERR "DONE.\n";
# Finally, quickly make an alignment object out of the
# informative site strings.
my $informative_sites_alignment = Bio::EnsEMBL::Pipeline::Alignment->new( -name => $seq->display_id); foreach my $evidence_id (keys %aligned_informative_site_strings) { my $new_is_seq = Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq->new( -name => $evidence_id, -seq => $aligned_informative_site_strings{$evidence_id}); $informative_sites_alignment->add_sequence($new_is_seq); } return $informative_sites_alignment;
}
_informative_site_stringsdescriptionprevnextTop
sub _informative_site_strings {
  my $self = shift;

  # Do a codon based alignment on our translatable sequences
my $cba = Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment->new( -genetic_code => 1); $cba->sequences($self->_seqs); my $aligned_seqs = $cba->run_alignment; # Work through the length of this alignment marking sequence
# variations/informative sites. Informative site information
# is stored as a long string, the same length as each UNALIGNED
# sequence. Informative sites are marked as '*' and uninformative
# sites are marked as '-'
my @seq_arrays; my @seq_name; foreach my $realigned_seq (@$aligned_seqs) { my @seq_array = split //, $realigned_seq->seq; push @seq_arrays,\@ seq_array; push @seq_name, $realigned_seq->display_id; } my $alignment_length = length($aligned_seqs->[0]->seq); my %informative_site_string; for (my $i = 0; $i < $alignment_length; $i++) { my $informative_site = 0; my $reference_base = $seq_arrays[0]->[$i]; for (my $j = 1; $j < scalar @seq_arrays; $j++) { if ($seq_arrays[$j]->[$i] ne $reference_base) { $informative_site = 1; last } } for (my $j = 0; $j < scalar @seq_arrays; $j++) { if ($informative_site && $seq_arrays[$j]->[$i] ne '-') { $informative_site_string{$seq_name[$j]} .= '*'; } elsif ($seq_arrays[$j]->[$i] ne '-') { $informative_site_string{$seq_name[$j]} .= '-'; } } } @seq_arrays = undef; return\% informative_site_string
}
_retrieve_align_by_iddescriptionprevnextTop
sub _retrieve_align_by_id {
  my ($self, $id) = @_;

  unless (defined $self->{_align_vs_id}) {
    foreach my $align (@{$self->_alignments}){
      $self->{_align_vs_id}->{$align->alignment_name} = $align
    }
  }

  warning ("Alignment with identifier [$id] does not exist.")
    unless $self->{_align_vs_id}->{$id};

  return $self->{_align_vs_id}->{$id}
}
_seqsdescriptionprevnextTop
sub _seqs {
  my $self = shift;

  if (@_) {
    $self->{_seqs} = shift;

    throw("Was expecting the translatable sequences to be Bio::Seq objects.")
      unless (scalar @{$self->{_seqs}} > 0 &&
	      defined $self->{_seqs}->[0] &&
	      $self->{_seqs}->[0]->isa("Bio::Seq")); 

    foreach my $seq (@{$self->{_seqs}}){
      throw("Gaps found in translatable sequence.  These should be gap-free.")
	if $seq->seq =~ /-/;
    }
  }

  return $self->{_seqs}
}
informative_sitesdescriptionprevnextTop
sub informative_sites {
  my $self = shift;

  my $gene_informative_sites = $self->_informative_site_strings;

  my %inform_site_aligns;

  foreach my $seq (@{$self->_seqs}) {
    # Avoid making sequences where no informative sites exist
# ie. identical sequences.
next unless $gene_informative_sites->{$seq->display_id} =~ /\*/; $inform_site_aligns{$seq->display_id} = $self->_derive_informative_site_alignment( $seq, $gene_informative_sites->{$seq->display_id}); } return\% inform_site_aligns
}
newdescriptionprevnextTop
sub new {
  my ($class, @args) = @_;

  my $self = bless {},$class;

  my ($translatable_seqs,
      $alignments
     ) = rearrange([qw(TRANSLATABLE_SEQS
		       ALIGNMENTS
                      )],@args);

  throw("No sequences to align.")
    unless $translatable_seqs;

  $self->_seqs($translatable_seqs);

  throw("No alignments to reconcile.")
    unless (defined $alignments);

  $self->_alignments($alignments);

  throw("Ids of translatable sequences do not fully match with " . 
	"alignment ids.  This sometimes happens because one of " . 
	"the clustered genes has zero EST matches and hence no " . 
	"alignment.")
    unless $self->_check_ids;

  return $self
}
General documentation
No general documentation available.