Raw content of Bio::EnsEMBL::Compara::AlignSlice::Exon # # Ensembl module for Bio::EnsEMBL::Compara::AlignSlice::Exon # # Original author: Javier Herrero <jherrero@ebi.ac.uk> # # Copyright EnsEMBL Team # # You may distribute this module under the same terms as perl itself # pod documentation - main docs before the code =head1 NAME Bio::EnsEMBL::Compara::AlignSlice::Exon - Description =head1 INHERITANCE This module inherits attributes and methods from Bio::EnsEMBL::Exon module =head1 SYNOPSIS use Bio::EnsEMBL::Compara::AlignSlice::Exon; my $exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( ); SET VALUES GET VALUES =head1 OBJECT ATTRIBUTES =over =item exon original Bio::EnsEMBL::Exon object =item slice Bio::EnsEMBL::Slice object on which this Bio::EnsEMBL::Compara::AlignSlice::Exon is defined =item cigar_line A string describing the mapping of this exon on the slice =item phase This exon results from the mapping of a real exon. It may suffer indels and duplications during the process which makes this mapped exon unreadable by a translation machinery. The phase is set to -1 by default. =item end_phase This exon results from the mapping of a real exon. It may suffer indels and duplications during the process which makes this mapped exon unreadable by a translation machinery. The end_phase is set to -1 by default. =back =head1 AUTHORS Javier Herrero (jherrero@ebi.ac.uk) =head1 COPYRIGHT Copyright (c) 2004. EnsEMBL Team You may distribute this module under the same terms as perl itself =head1 CONTACT This modules is part of the EnsEMBL project () Questions can be posted to the ensembl-dev mailing list: ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Compara::AlignSlice::Exon; use strict; use Bio::EnsEMBL::Exon; use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Exception qw(throw warning info); our @ISA = qw(Bio::EnsEMBL::Exon); =head2 new (CONSTRUCTOR) Arg[1] : a reference to a hash where keys can be: -exon -adaptor -reference_slice Example : my $align_slice = new Bio::EnsEMBL::Compara::AlignSlice( -exon => $original_exon -reference_slice => $reference_slice, ); Description: Creates a new Bio::EnsEMBL::AlignSlice::Exon object Returntype : Bio::EnsEMBL::Compara::AlignSlice::Exon object Exceptions : return an object with no start, end nor strand if the exon cannot be mapped on the reference Slice. =cut sub new { my ($class, @args) = @_; my $self = {}; bless($self, $class); my ($exon, $align_slice, $from_mapper, $to_mapper, $original_rank) = rearrange([qw( EXON ALIGN_SLICE FROM_MAPPER TO_MAPPER ORIGINAL_RANK )], @args); $self->exon($exon) if (defined ($exon)); # $self->genomic_align($genomic_align) if (defined($genomic_align)); # $self->from_genomic_align_id($from_genomic_align_id) if (defined($from_genomic_align_id)); # $self->to_genomic_align_id($to_genomic_align_id) if (defined($to_genomic_align_id)); $self->slice($align_slice) if (defined($align_slice)); $self->original_rank($original_rank) if (defined($original_rank)); $self->phase(-1); $self->end_phase(-1); return $self->map_Exon_on_Slice($from_mapper, $to_mapper); } =head2 copy (CONSTRUCTOR) Arg[1] : none Example : my $new_align_slice = $old_align_slice->copy() Description: Creates a new Bio::EnsEMBL::AlignSlice::Exon object which is an exact copy of the calling object Returntype : Bio::EnsEMBL::Compara::AlignSlice::Exon object Exceptions : Caller : $obeject->methodname =cut sub copy { my ($self) = @_; my $copy; while (my ($key, $value) = each %$self) { $copy->{$key} = $value; } bless($copy, ref($self)); return $copy; } =head2 slice Arg[1] : (optional) Bio::EnsEMBL::Slice $reference_slice Example : $align_exon->slice($reference_slice); Example : $reference_slice = $align_exon->slice(); Description: Get/set the attribute slice. This method is overloaded in order to map original coordinates onto the reference Slice. Returntype : Bio::EnsEMBL::Slice object Exceptions : =cut sub slice { my ($self, $slice) = @_; if (defined($slice)) { $self->{'slice'} = $slice; } return $self->{'slice'}; } =head2 original_rank Arg[1] : (optional) integer $original_rank Example : $align_exon->original_rank(5); Example : $original_rank = $align_exon->original_rank(); Description: Get/set the attribute original_rank. The orignal_rank is the position of the orginal Exon in the original Transcript Returntype : integer Exceptions : =cut sub original_rank { my ($self, $original_rank) = @_; if (defined($original_rank)) { $self->{'original_rank'} = $original_rank; } return $self->{'original_rank'}; } =head2 map_Exon_on_Slice Arg[1] : Bio::EnsEMBL::Mapper $from_mapper Arg[2] : Bio::EnsEMBL::Mapper $to_mapper Example : $align_exon->map_Exon_on_Slice($from_mapper, $to_mapper); Description: This function takes the original exon and maps it on the slice using the mappers. Returntype : Bio::EnsEMBL::Compara::AlignSlice::Exon object Exceptions : returns undef if not enough information is provided Exceptions : returns undef if no piece of the original exon can be mapped. Caller : new =cut sub map_Exon_on_Slice { my ($self, $from_mapper, $to_mapper) = @_; my $original_exon = $self->exon; my $slice = $self->slice; if (!defined($slice) or !defined($original_exon) or !defined($from_mapper)) { return $self; } my @alignment_coords = $from_mapper->map_coordinates( "sequence", # $self->genomic_align->dbID, $original_exon->slice->start + $original_exon->start - 1, $original_exon->slice->start + $original_exon->end - 1, $original_exon->strand, "sequence" # $from_mapper->from ); my $aligned_start; my $aligned_end; my $aligned_strand = 0; my $aligned_sequence = ""; my $aligned_cigar = ""; my $global_alignment_coord_start; my $global_alignment_coord_end; my $last_alignment_coord_end; my $last_alignment_coord_start; foreach my $alignment_coord (@alignment_coords) { ## $alignment_coord refer to genomic_align_block: (1 to genomic_align_block->length) [+] if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { if ($alignment_coord->strand == 1) { if ($last_alignment_coord_end) { # Consider gap between this piece and the previous as a deletion my $length = $alignment_coord->start - $last_alignment_coord_end - 1; $aligned_cigar .= $length if ($length>1); $aligned_cigar .= "D" if ($length); } $last_alignment_coord_end = $alignment_coord->end; $global_alignment_coord_start = $alignment_coord->start if (!$global_alignment_coord_start); $global_alignment_coord_end = $alignment_coord->end; } else { if ($last_alignment_coord_start) { # Consider gap between this piece and the previous as a deletion my $length = $last_alignment_coord_start - $alignment_coord->end - 1; $aligned_cigar .= $length if ($length>1); $aligned_cigar .= "D" if ($length); } $last_alignment_coord_start = $alignment_coord->start; $global_alignment_coord_end = $alignment_coord->end if (!$global_alignment_coord_end); $global_alignment_coord_start = $alignment_coord->start; } } else { # This piece is outside of the alignment -> consider as an insertion my $length = $alignment_coord->length; $aligned_cigar .= $length if ($length>1); $aligned_cigar .= "I" if ($length); next; } if (!defined($to_mapper)) { ## Mapping on the alignment (expanded mode) if ($alignment_coord->strand == 1) { $aligned_strand = 1; $aligned_start = $alignment_coord->start if (!$aligned_start); $aligned_end = $alignment_coord->end; } else { $aligned_strand = -1; $aligned_start = $alignment_coord->start; $aligned_end = $alignment_coord->end if (!$aligned_end); } my $num = $alignment_coord->end - $alignment_coord->start + 1; $aligned_cigar .= $num if ($num > 1); $aligned_cigar .= "M" if ($num); } else { ## Mapping on the reference_Slice (collapsed mode) my @mapped_coords = $to_mapper->map_coordinates( "alignment", # $self->genomic_align->genomic_align_block->dbID, $alignment_coord->start, $alignment_coord->end, $alignment_coord->strand, "alignment" # $to_mapper->to ); foreach my $mapped_coord (@mapped_coords) { ## $mapped_coord refer to reference_slice if ($mapped_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { if ($alignment_coord->strand == 1) { $aligned_strand = 1; $aligned_start = $mapped_coord->start if (!$aligned_start); $aligned_end = $mapped_coord->end; } else { $aligned_strand = -1; $aligned_start = $mapped_coord->start; $aligned_end = $mapped_coord->end if (!$aligned_end); } my $num = $mapped_coord->end - $mapped_coord->start + 1; $aligned_cigar .= $num if ($num > 1); $aligned_cigar .= "M" if ($num); } else { my $num = $mapped_coord->end - $mapped_coord->start + 1; $aligned_cigar .= $num if ($num > 1); $aligned_cigar .= "I" if ($num); } } } } if ($aligned_strand == 0) { ## the whole sequence maps on a gap $self->{start} = undef; $self->{end} = undef; $self->{strand} = undef; return $self; } ## Set coordinates on "slice" coordinates $self->start($aligned_start - $slice->start + 1); $self->end($aligned_end - $slice->start + 1); $self->strand($aligned_strand); $self->cigar_line($aligned_cigar); if ($self->start > $slice->length or $self->end < 1) { $self->{start} = undef; $self->{end} = undef; $self->{strand} = undef; return $self; } return $self; } =head2 exon Arg[1] : (optional) Bio::EnsEMBL::Exon $original_exon Example : $align_exon->exon($original_exon); Example : $start = $align_exon->start(); Description: Get/set the attribute start. This method is overloaded in order to return the starting postion on the AlignSlice instead of the original one. Original starting position may be retrieved using the SUPER::start() method or the orginal_start() method Returntype : Bio::EnsEMBL::Exon object Exceptions : =cut sub exon { my ($self, $exon) = @_; if (defined($exon)) { $self->{'exon'} = $exon; $self->stable_id($exon->stable_id) if (defined($exon->stable_id)); } return $self->{'exon'}; } =head2 cigar_line Arg[1] : (optional) string $cigar_line Example : $align_exon->cigar_line($cigar_line); Example : $cigar_line = $align_exon->cigar_line(); Description: Get/set the attribute cigar_line. Returntype : string Exceptions : none Caller : object->methodname =cut sub cigar_line { my ($self, $cigar_line) = @_; if (defined($cigar_line)) { $self->{'cigar_line'} = $cigar_line; } return $self->{'cigar_line'}; } sub get_aligned_start { my ($self) = @_; my $cigar_line = $self->cigar_line; if (defined($cigar_line)) { my @cig = ( $cigar_line =~ /(\d*[GMDI])/g ); my $cigType = substr( $cig[0], -1, 1 ); my $cigCount = substr( $cig[0], 0 ,-1 ); $cigCount = 1 unless ($cigCount =~ /^\d+$/); next if ($cigCount == 0); if ($cigType eq "I") { return (1 + $cigCount); } else { return 1; } } return undef; } sub get_aligned_end { my ($self) = @_; my $cigar_line = $self->cigar_line; if (defined($cigar_line)) { my @cig = ( $cigar_line =~ /(\d*[GMDI])/g ); my $cigType = substr( $cig[-1], -1, 1 ); my $cigCount = substr( $cig[-1], 0 ,-1 ); $cigCount = 1 unless ($cigCount =~ /^\d+$/); next if ($cigCount == 0); if ($cigType eq "I") { return ($self->exon->end - $self->exon->start + 1 - $cigCount); } else { return ($self->exon->end - $self->exon->start + 1); } } return undef; } =head2 seq Arg [1] : none Example : my $seq_str = $exon->seq->seq; Description: Retrieves the dna sequence of this Exon. Returned in a Bio::Seq object. Note that the sequence may include UTRs (or even be entirely UTR). Returntype : Bio::Seq or undef Exceptions : warning if argument passed, warning if exon does not have attatched slice warning if exon strand is not defined (or 0) Caller : general =cut sub seq { my ($self, $seq) = @_; if (defined($seq)) { $self->{'_seq_cache'} = $seq->seq(); } ## Use _template_seq if defined. It is a concatenation of several original ## exon sequences and is produced during the merging of align_exons. if(!defined($self->{'_seq_cache'})) { my $seq = &_get_aligned_sequence_from_original_sequence_and_cigar_line( ($self->{'_template_seq'} or $self->exon->seq->seq), $self->cigar_line, "ref" ); $self->{'_seq_cache'} = $seq; } return Bio::Seq->new( -seq => $self->{'_seq_cache'}, -id => $self->stable_id, -moltype => 'dna', -alphabet => 'dna', ); } =head2 append_Exon Arg [1] : Example : Description: Returntype : Exceptions : Caller : =cut sub append_Exon { my ($self, $exon, $gap_length) = @_; $self->seq(new Bio::Seq(-seq => $self->seq->seq.("-"x$gap_length).$exon->seq->seq)); ## As it is possible to merge two partially repeated parts of an Exon, ## the merging is done by concatenating both cigar_lines with the right ## number of gaps in the middle. The underlaying sequence must be lengthen ## accordingly. This is stored in the _template_seq private attribute if (defined($self->{'_template_seq'})) { $self->{'_template_seq'} .= $self->exon->seq->seq } else { $self->{'_template_seq'} = $self->exon->seq->seq x 2; } if ($gap_length) { $gap_length = "" if ($gap_length == 1); $self->cigar_line( $self->cigar_line. $gap_length."D". $exon->cigar_line); } else { $self->cigar_line( $self->cigar_line. $exon->cigar_line); } return $self; } =head2 prepend_Exon Arg [1] : Example : Description: Returntype : Exceptions : Caller : =cut sub prepend_Exon { my ($self, $exon, $gap_length) = @_; $self->seq(new Bio::Seq(-seq => $exon->seq->seq.("-"x$gap_length).$self->seq->seq)); ## As it is possible to merge two partially repeated parts of an Exon, ## the merging is done by concatenating both cigar_lines with the right ## number of gaps in the middle. The underlaying sequence must be lengthen ## accordingly. This is stored in the _template_seq private attribute if (defined($self->{'_template_seq'})) { $self->{'_template_seq'} .= $self->exon->seq->seq } else { $self->{'_template_seq'} = $self->exon->seq->seq x 2; } if ($gap_length) { $gap_length = "" if ($gap_length == 1); $self->cigar_line( $exon->cigar_line. $gap_length."D". $self->cigar_line); } else { $self->cigar_line( $exon->cigar_line. $self->cigar_line); } return $self; } =head2 _get_aligned_sequence_from_original_sequence_and_cigar_line Arg [1] : string $original_sequence Arg [1] : string $cigar_line Example : $aligned_sequence = _get_aligned_sequence_from_original_sequence_and_cigar_line( "CGTAACTGATGTTA", "3MD8M2D3M") Description: get gapped sequence from original one and cigar line Returntype : string $aligned_sequence Exceptions : thrown if cigar_line does not match sequence length Caller : methodname =cut sub _get_aligned_sequence_from_original_sequence_and_cigar_line { my ($original_sequence, $cigar_line, $mode) = @_; my $aligned_sequence = ""; $mode ||= ""; return undef if (!$original_sequence or !$cigar_line); my $seq_pos = 0; my @cig = ( $cigar_line =~ /(\d*[GMDI])/g ); for my $cigElem ( @cig ) { my $cigType = substr( $cigElem, -1, 1 ); my $cigCount = substr( $cigElem, 0 ,-1 ); $cigCount = 1 unless ($cigCount =~ /^\d+$/); if( $cigType eq "M" ) { $aligned_sequence .= substr($original_sequence, $seq_pos, $cigCount); $seq_pos += $cigCount; } elsif( $cigType eq "G" || $cigType eq "D") { $aligned_sequence .= "-" x $cigCount; } elsif( $cigType eq "I") { $aligned_sequence .= "-" x $cigCount if ($mode ne "ref"); $seq_pos += $cigCount; } } throw("Cigar line ($seq_pos) does not match sequence lenght (".length($original_sequence).")") if ($seq_pos != length($original_sequence)); return $aligned_sequence; } 1;