Raw content of Bio::EnsEMBL::Compara::AlignSlice::Slice # # Ensembl module for Bio::EnsEMBL::Compara::AlignSlice::Slice # # Original author: Javier Herrero <jherrero@ebi.ac.uk> # # Copyright EnsEMBL Team # # You may distribute this module under the same terms as perl itself # pod documentation - main docs before the code =head1 NAME Bio::EnsEMBL::Compara::AlignSlice::Slice - These objects contain all the information needed for mapping features through genomic alignments =head1 INHERITING This module inherits methods and attributes from Bio::EnsEMBL::Slice module. =head1 SYNOPSIS SET VALUES GET VALUES =head1 OBJECT ATTRIBUTES =over =item attribute Description =back =head1 AUTHORS Javier Herrero (jherrero@ebi.ac.uk) =head1 COPYRIGHT Copyright (c) 2004. EnsEMBL Team You may distribute this module under the same terms as perl itself =head1 CONTACT This modules is part of the EnsEMBL project () Questions can be posted to the ensembl-dev mailing list: ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Compara::AlignSlice::Slice; use strict; use Bio::EnsEMBL::Slice; use Bio::EnsEMBL::CoordSystem; use Bio::EnsEMBL::Compara::AlignSlice::Translation; use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Exception qw(throw warning info verbose); use Scalar::Util qw(weaken); our @ISA = qw(Bio::EnsEMBL::Slice); ## Creates a new coordinate system for creating gap Slices. my $gap_coord_system = new Bio::EnsEMBL::CoordSystem( -NAME => 'gap', -VERSION => "none", -TOP_LEVEL => 0, -SEQUENCE_LEVEL => 1, -RANK => 1, ); =head2 new (CONSTRUCTOR) Arg[1] : Example : Description: Returntype : Exceptions : Caller : Bio::EnsEMBL::Compara::AlignSlice->_create_underlying_Slices() =cut sub new { my ($class, @args) = @_; my $self = {}; bless $self,$class; my ($length, $requesting_slice, $align_slice, $method_link_species_set, $genome_db, $expanded) = rearrange([qw( LENGTH REQUESTING_SLICE ALIGN_SLICE METHOD_LINK_SPECIES_SET GENOME_DB EXPANDED )], @args); my $version = ""; if ($requesting_slice and ref($requesting_slice) and $requesting_slice->isa("Bio::EnsEMBL::Slice")) { $self->{'requesting_slice'} = $requesting_slice; my $name = $requesting_slice->name; $name =~ s/\:/_/g; $version .= $name; } weaken($self->{_align_slice} = $align_slice) if ($align_slice); if ($method_link_species_set and ref($method_link_species_set) and $method_link_species_set->isa("Bio::EnsEMBL::Compara::MethodLinkSpeciesSet")) { $self->{_method_link_species_set} = $method_link_species_set; $version .= "+".$method_link_species_set->method_link_type; my $species_set = $method_link_species_set->species_set(); if ($species_set) { $species_set = [sort {$a->name cmp $b->name} @{$species_set}]; $version .= "(\"".join("\"+\"", map {$_->name} @$species_set)."\")"; } if ($expanded) { $version .= "+expanded"; } else { $version .= "+condensed"; } } my $coord_system = new Bio::EnsEMBL::CoordSystem( -NAME => 'align_slice', -VERSION => $version, -TOP_LEVEL => 0, -SEQUENCE_LEVEL => 1, -RANK => 1, ); $self->{start} = 1; $self->{end} = $length; $self->{strand} = 1; $self->{adaptor} = undef; $self->{coord_system} = $coord_system; $self->genome_db($genome_db) if (defined($genome_db)); $self->{seq_region_name} = (eval{$genome_db->name} or "FakeAlignSlice"); $self->{display_Slice_name} = $self->{seq_region_name}; $self->{display_Slice_name} =~ s/ /_/g; $self->{seq_region_length} = $length; # $self->{located_slices} = []; if (!$self->genome_db) { throw("You must specify a Bio::EnsEMBL::Compara::GenomeDB when\n". "creating a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); } return $self; } =head2 genome_db Arg[1] : Bio::EnsEMBL::Compara::GenomeDB $genome_db Example : $slice->genome_db($human_gdb); Description: getter/setter for the attribute genome_db Returntype : Bio::EnsEMBL::Compara::GenomeDB object Exceptions : This attribute should never be unset. Several methods rely on this attribute. Caller : $object->methodname =cut sub genome_db { my ($self, $genome_db) = @_; if (defined($genome_db)) { throw("[$genome_db] must bu a Bio::EnsEMBL::Compara::GenomeDB object") unless ($genome_db and ref($genome_db) and $genome_db->isa("Bio::EnsEMBL::Compara::GenomeDB")); $self->{genome_db} = $genome_db; } return $self->{genome_db}; } =head2 display_Slice_name Arg[1] : string $name Example : $slice->display_Slice_name("Homo_sapiens"); Description: getter/setter for the attribute display_Slice_name Returntype : string Caller : $object->methodname =cut sub display_Slice_name { my ($self, $display_slice_name) = @_; if (defined($display_slice_name)) { $self->{display_Slice_name} = $display_slice_name; } return $self->{display_Slice_name}; } =head2 add_Slice_Mapper_pair Arg[1] : Bio::EnsEMBL::Slice $slice Arg[2] : Bio::EnsEMBL::Mapper $mapper Arg[3] : integer $start Arg[4] : integer $end Arg[5] : integer $strand Example : $slice->add_Slice_Mapper_pair($slice, $mapper, 124, 542, -1); Description: Attaches a pair of Slice and the corresponding Mapper to this Bio::EnsEMBL::Compara::AlginSlice::Slice object. The Mapper contains the information for mapping coordinates from (to) the Slice to (from) the Bio::EnsEMBL::Compara::AlignSlice. Start, end and strand locates the $slice in the AlignSlice::Slice. $start and $end refer to the Coordinate System. If you are using a sub_Slice, this method will be using the coordinates of the original Bio::EnsEMBL::Compara::AlignSlice::Slice object! Returntype : Exceptions : throws if $slice is not a Bio::EnsEMBL::Slice object Exceptions : throws if $mapper is not a Bio::EnsEMBL::Mapper object =cut sub add_Slice_Mapper_pair { my ($self, $slice ,$mapper, $start, $end, $strand) = @_; if (!$slice or !ref($slice) or !$slice->isa("Bio::EnsEMBL::Slice")) { throw("[$slice] must be a Bio::EnsEMBL::Slice object"); } if (!$mapper or !ref($mapper) or !$mapper->isa("Bio::EnsEMBL::Mapper")) { throw("[$slice] must be a Bio::EnsEMBL::Mapper object"); } push(@{$self->{slice_mapper_pairs}}, { slice => $slice, mapper => $mapper, start => $start, end => $end, strand => $strand, }); return $self->{slice_mapper_pairs}; } =head2 get_all_Slice_Mapper_pair Arg[1] : [optional] bool $get_gap_slices Example : $slice_mapper_pairs = $slice->get_all_Slice_Mapper_pairs(); Description: Returns all pairs of Slices and Mappers attached to this Bio::EnsEMBL::Compara::AlignSlice::Slice The $get_gap_slice flag is normally used internally to get the the gap slices. These are created when the alignment(s) underlying the AlignSlice correspond to gaps only in one or more species. These are used to tell the difference between gap due to lack of alignments and gaps due to the alignments. If you set this this flag to true you will get these gap slices back but it is your responsability to deal with these gap slices properly. Returntype : list ref of hashes which keys are "slice", "mapper", "start", "end" and "strand". Each hash corresponds to a pair of Slice and Mapper and the coordintes needed to locate the Slice in the AlignSlice::Slice. start and end refer to the Coordinate System. If you are using a sub_Slice, this method will be using the coordinates of the original Bio::EnsEMBL::Compara::AlignSlice::Slice object! Exceptions : return a ref to an empty list if no pairs have been attached so far. =cut sub get_all_Slice_Mapper_pairs { my ($self, $get_gap_slices) = @_; my $slice_mapper_pairs = ($self->{slice_mapper_pairs} or []); if (!$get_gap_slices) { $slice_mapper_pairs = [grep {$_->{slice}->coord_system_name ne "alignment"} @$slice_mapper_pairs]; } return $slice_mapper_pairs; } =head2 get_all_Genes_by_type Arg [1] : string $type The biotype of genes wanted. Arg [2] : (optional) string $logic_name Arg [3] : (optional) boolean $load_transcripts If set to true, transcripts will be loaded immediately rather than being lazy-loaded on request. This will result in a significant speed up if the Transcripts and Exons are going to be used (but a slow down if they are not). Example : @genes = @{$slice->get_all_Genes_by_type('protein_coding', 'ensembl')}; Description: Retrieves genes that overlap this slice of biotype $type. This is primarily used by the genebuilding code when several biotypes of genes are used. The logic name is the analysis of the genes that are retrieved. If not provided all genes will be retrieved instead. This methods overwrites the core one since it sends a warning message and return an empty array because this AlignSlice::Slice object has no adaptor. This implementation calls the get_all_Genes methdo elsewhere in this module to fulfil the query. Returntype : listref of Bio::EnsEMBL::Genes Exceptions : none Caller : genebuilder, general Status : Stable =cut sub get_all_Genes_by_type{ my ($self, $type, $logic_name, $load_transcripts) = @_; my @out = grep { $_->biotype eq $type } @{ $self->get_all_Genes($logic_name, undef, $load_transcripts)}; return \@out; } =head2 get_all_Genes Arg [1] : (optional) string $logic_name The name of the analysis used to generate the genes to retrieve Arg [2] : (optional) string $dbtype The dbtype of genes to obtain. This assumes that the db has been added to the DBAdaptor under this name (using the DBConnection::add_db_adaptor method). Arg [3] : (optional) boolean $load_transcripts This option is always disabled for AlingSlices. It only exists for compatibility with the Bio::EnsEMBL::Slice objects. Example : @genes = @{$slice->get_all_Genes}; Description: Retrieves all genes that overlap this slice. Returntype : listref of Bio::EnsEMBL::Genes Exceptions : none Caller : none =cut sub get_all_Genes { my ($self, $logic_name, $dbtype, $load_transcripts, @parameters) = @_; $logic_name ||= ""; $dbtype ||= "core"; my ($max_repetition_length, $strict_order_of_exon_pieces, $strict_order_of_exons, $return_unmapped_exons) = rearrange([qw( MAX_REPETITION_LENGTH STRICT_ORDER_OF_EXON_PIECES STRICT_ORDER_OF_EXONS RETURN_UNMAPPED_EXONS )], @parameters); my $max_gap_length = 0; my $max_intron_length = 0; $max_repetition_length = 100 if (!defined($max_repetition_length)); $strict_order_of_exon_pieces = 1 if (!defined($strict_order_of_exon_pieces)); $strict_order_of_exons = 0 if (!defined($strict_order_of_exons)); $return_unmapped_exons = 1 if (!defined($return_unmapped_exons)); my $key = 'key_'. $dbtype.":". $logic_name.":". $max_repetition_length.":". $strict_order_of_exon_pieces.":". $strict_order_of_exons.":". $return_unmapped_exons; if (!defined($self->{$key})) { my $all_genes = []; my $all_pairs = $self->get_all_Slice_Mapper_pairs; return [] if (!$all_pairs or !@$all_pairs); ## Create larger Slices in order to speed up fetching of genes my $all_slices_coordinates; my $this_slice_coordinates; my $this_slice_adaptor = $all_pairs->[0]->{slice}->adaptor; foreach my $pair (sort { $a->{slice}->seq_region_name cmp $b->{slice}->seq_region_name or $a->{slice}->start <=> $b->{slice}->start } @$all_pairs) { # print STDERR "Foreach pair ($pair)... $pair->{slice}->{seq_region_name} $pair->{slice}->{start}\n"; my $this_slice = $pair->{slice}; if ($this_slice_coordinates and ($this_slice_coordinates->{seq_region_name} eq $this_slice->seq_region_name) and (($this_slice->start - $this_slice_coordinates->{end}) < 10000000)) { ## lengthen current slice_coordinates $this_slice_coordinates->{end} = $this_slice->end; } else { ## save a deep copy (if needed) and reset current slice_coordinates if ($this_slice_coordinates->{seq_region_name}) { my $new_slice_coordinates = { "coord_system_name" => $this_slice_coordinates->{coord_system_name}, "seq_region_name" => $this_slice_coordinates->{seq_region_name}, "start" => $this_slice_coordinates->{start}, "end" => $this_slice_coordinates->{end}, "pairs" => $this_slice_coordinates->{pairs} }; push(@$all_slices_coordinates, $new_slice_coordinates); } $this_slice_coordinates->{coord_system_name} = $this_slice->coord_system_name; $this_slice_coordinates->{seq_region_name} = $this_slice->seq_region_name; $this_slice_coordinates->{start} = $this_slice->start; $this_slice_coordinates->{end} = $this_slice->end; $this_slice_coordinates->{pairs} = []; } ## Add this pair to the set of pairs of the current slice_coordinates push(@{$this_slice_coordinates->{pairs}}, $pair); } push(@$all_slices_coordinates, $this_slice_coordinates) if ($this_slice_coordinates); foreach $this_slice_coordinates (@$all_slices_coordinates) { # print STDERR "Foreach this_slice_coordinates...\n"; my $this_slice = $this_slice_adaptor->fetch_by_region( $this_slice_coordinates->{coord_system_name}, $this_slice_coordinates->{seq_region_name}, $this_slice_coordinates->{start}, $this_slice_coordinates->{end} ); ## Do not load transcripts immediately or the cache could produce ## some troubles in some special cases! Moreover, in this way we will have ## the genes, the transcripts and the exons in the same slice!! my $these_genes = $this_slice->get_all_Genes($logic_name, $dbtype); foreach my $pair (@{$this_slice_coordinates->{pairs}}) { # print STDERR "Foreach pair ($pair)...\n"; foreach my $this_gene (@$these_genes) { # print STDERR "1. GENE: $this_gene->{stable_id} ($this_gene->{start} - $this_gene->{end})\n"; my $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair, $return_unmapped_exons); # print STDERR "2. GENE: $mapped_gene->{stable_id} ($mapped_gene->{start} - $mapped_gene->{end})\n"; # $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair); if ($mapped_gene and @{$mapped_gene->get_all_Transcripts}) { push(@$all_genes, $mapped_gene); } } } } $all_genes = $self->_compile_mapped_Genes( $all_genes, $max_repetition_length, $max_gap_length, $max_intron_length, $strict_order_of_exon_pieces, $strict_order_of_exons ); $self->{$key} = $all_genes; } return $self->{$key}; } =head2 _get_mapped_Gene Arg[1] : Bio::EnsEMBL::Gene $original_gene Arg[2] : Bio::EnsEMBL::Compara::GenomicAlign $genomic_align Example : my $mapped_gene = $align_slice->get_mapped_Gene($orignal_gene, $genomic_align); Description: returns a new Bio::EnsEMBL::Gene object. Mapping is based on exons. The object returned contains Bio::EnsEMBL::Transcripts objects. Those mapped transcripts contain Bio::EnsEMBL::Compara::AlignSlice::Exon objects. If no exon can be mapped, the returned object will contain an empty array of transcripts. Since mapped objects are not stored in the DB, they have no dbID and no adaptor. Returntype : Bio::EnsEMBL::Gene object. (new object) Exceptions : returns undef if no part of $original_gene can be mapped using this $genomic_align. This method assumes that the slices of all the exons got from that gene have the same coordinates as the gene slice. It will throw if this is not true. Caller : get_all_Genes =cut sub _get_mapped_Gene { my ($self, $gene, $pair, $return_unmapped_exons) = @_; ## $range_start and $range_end are used to get rid of extra genes fetched because of the ## previous speed improvement where several slices are merged in order to access the DB ## a minimum number of times. $pair->{slice} correspond to the actual alignment while ## $gene->slice might be much larger and include several alignments. $range_start and ## $range_end are the coordinates (using $gene->slice as a ref.) where the alignment is. ## If the gene (or later on, the exon) falls outside of this range, it can be discarded ## as it will be impossible to map it with using $pair->mapper! my $range_start = $pair->{slice}->start - $gene->slice->start + 1; my $range_end = $pair->{slice}->end - $gene->slice->start + 1; return undef if (($gene->start > $range_end) or ($gene->end < $range_start)); my $from_mapper = $pair->{mapper}; my $to_mapper = $self->{to_mapper}; my $these_transcripts = []; foreach my $this_transcript (@{$gene->get_all_Transcripts}) { my $these_exons = []; my $all_exons = $this_transcript->get_all_Exons; for (my $i=0; $i<@$all_exons; $i++){ my $this_exon = $all_exons->[$i]; throw("Oops, this method assumes that all the exons are defined on the same slice as the". " gene they belong to") if ($this_exon->slice->start != $gene->slice->start); if ($this_exon->start <= $range_end and $this_exon->end >= $range_start) { my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( -EXON => $this_exon, -ALIGN_SLICE => $self, -FROM_MAPPER => $from_mapper, -TO_MAPPER => $to_mapper, -ORIGINAL_RANK => $i + 1 ); if ($this_align_exon) { push(@{$these_exons}, $this_align_exon); } elsif ($return_unmapped_exons) { $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( -EXON => $this_exon, -ORIGINAL_RANK => $i + 1 ); push(@{$these_exons}, $this_align_exon) if ($this_align_exon); } } elsif ($return_unmapped_exons) { my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( -EXON => $this_exon, -ORIGINAL_RANK => $i + 1 ); push(@{$these_exons}, $this_align_exon) if ($this_align_exon); } } if (grep {defined($_->start)} @$these_exons) { ## if any of the exons has been mapped my $new_transcript = $this_transcript->new( -dbID => $this_transcript->dbID, -adaptor => $this_transcript->adaptor, -stable_id => $this_transcript->stable_id, -version => $this_transcript->version, -external_db => $this_transcript->external_db, -external_name => $this_transcript->external_name, -external_status => $this_transcript->external_status, -display_xref => $this_transcript->display_xref, -analysis => $this_transcript->analysis, -status => $this_transcript->status, -biotype => $this_transcript->biotype, -exons => $these_exons, ); if ($this_transcript->translation) { $new_transcript->translation($this_transcript->translation); } push(@{$these_transcripts}, $new_transcript); } } if (!@$these_transcripts) { return undef; } ## Create a new gene object. This is needed in order to avoid ## troubles with cache. Attach mapped transcripts. my $mapped_gene = $gene->new( -ANALYSIS => $gene->analysis, -SEQNAME => $gene->seqname, -STABLE_ID => $gene->stable_id, -VERSION => $gene->version, -EXTERNAL_NAME => $gene->external_name, -TYPE => $gene->biotype, -BIOTYPE => $gene->biotype, -STATUS => $gene->status, -EXTERNAL_DB => $gene->external_db, -EXTERNAL_STATUS => $gene->external_status, -DISPLAY_XREF => $gene->display_xref, -DESCRIPTION => $gene->description, -TRANSCRIPTS => $these_transcripts ); return $mapped_gene; } =head2 _compile_mapped_Genes Arg[1] : listref of Bio::EnsEMBL::Gene $mapped_genes Arg[2] : int $max_repetition_length Arg[3] : int $max_gap_length Arg[4] : int $max_intron_length Arg[5] : int $strict_order_of_exon_pieces Arg[6] : int $strict_order_of_exons Example : my $compiled_genes = $align_slice->compile_mapped_Genes($mapped_genes, 100, 1000, 100000, 1, 0); Description: This method compiles all the pieces of gene mapped before into a list of Bio::EnsEMBL::Gene objects. It tries to merge pieces of the same exon, link them according to their original transcript and finally group the transcripts in Bio::EnsEMBL::Gene objects. Merging and linking of Exons is done according to some rules that can be changed with the parameters. Parameters: They are sent as is to _separate_in_incompatible_sets_of_Exons() and _merge_Exons() methods. Please refer to them elsewhere in this document. Returntype : listref of Bio::EnsEMBL::Gene objects. (overrides $mapped_genes) Exceptions : none Caller : $object->methodname =cut sub _compile_mapped_Genes { my ($self, $mapped_genes, $max_repetition_length, $max_gap_length, $max_intron_length, $strict_order_of_exon_pieces, $strict_order_of_exons) = @_; my $verbose = verbose(); verbose(0); # Avoid warnings when mapped transcripts are not in the same strand ## Compile genes: group transcripts by gene->stable_id my $gene_by_gene_stable_id; my $transcripts_by_gene_stable_id; foreach my $mapped_gene (@$mapped_genes) { # print STDERR "1. GENE:$mapped_gene->{stable_id} ", join(" - ", map {$_->stable_id} @{$mapped_gene->get_all_Transcripts}), "\n"; if (!$gene_by_gene_stable_id->{$mapped_gene->stable_id}) { $gene_by_gene_stable_id->{$mapped_gene->stable_id} = $mapped_gene; } ## Group all the transcripts by gene stable_id... push(@{$transcripts_by_gene_stable_id->{$mapped_gene->stable_id}}, @{$mapped_gene->get_all_Transcripts}); } # $mapped_genes = [values %$genes_by_stable_id]; ## Compile transcripts: group exons by transcript->stable_id while (my ($gene_stable_id, $set_of_transcripts) = each %$transcripts_by_gene_stable_id) { my $transcript_by_transcript_stable_id; my $exons_by_transcript_stable_id; foreach my $transcript (@{$set_of_transcripts}) { if (!$transcript_by_transcript_stable_id->{$transcript->stable_id}) { $transcript_by_transcript_stable_id->{$transcript->stable_id} = $transcript; } ## Group all the exons by the transcript stable_id... # print STDERR "0. TRANS:$transcript->{stable_id} ", join(" - ", map {$_->stable_id} @{$transcript->get_all_Exons}), "\n"; push(@{$exons_by_transcript_stable_id->{$transcript->stable_id}}, @{$transcript->get_all_Exons}); } ## Try to merge splitted exons whenever possible while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) { # print STDERR "1. ", join(" - ", map {$_->stable_id} @{$set_of_exons}), "\n"; $exons_by_transcript_stable_id->{$transcript_stable_id} = _merge_Exons( $set_of_exons, $max_repetition_length, $max_gap_length, $strict_order_of_exon_pieces ); # print STDERR "2. ", join(" - ", map {$_->stable_id} @{$exons_by_transcript_stable_id->{$transcript_stable_id}}), "\n"; } my $all_transcripts; while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) { # my $sets_of_compatible_exons = [$set_of_exons]; my $sets_of_compatible_exons = _separate_in_incompatible_sets_of_Exons( $set_of_exons, $max_repetition_length, $max_intron_length, $strict_order_of_exons ); my $old_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id}; # # Save first set of exons in the # my $first_set_of_compatible_exons = shift(@{$sets_of_compatible_exons}); # my $first_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id}; # $first_transcript->flush_Exons(); # foreach my $exon (@$first_set_of_compatible_exons) { # $first_transcript->add_Exon($exon); # } # push(@$all_transcripts, $first_transcript); foreach my $this_set_of_compatible_exons (@{$sets_of_compatible_exons}) { my $new_transcript = $old_transcript->new( -dbID => $old_transcript->dbID, -adaptor => $old_transcript->adaptor, -stable_id => $old_transcript->stable_id, -version => $old_transcript->version, -external_db => $old_transcript->external_db, -external_name => $old_transcript->external_name, -external_status => $old_transcript->external_status, -display_xref => $old_transcript->display_xref, -analysis => $old_transcript->analysis, -status => $old_transcript->status, -biotype => $old_transcript->biotype, -EXONS => $this_set_of_compatible_exons ); ## $old_transcript->translation is the original Bio::EnsEMBL::Translation! if ($old_transcript->translation) { my $start_exon; my $seq_start; my $end_exon; my $seq_end; my $all_start_codon_mappings; my $all_end_codon_mappings; my @sorted_exons; if ($new_transcript->strand == 1) { @sorted_exons = @{$new_transcript->get_all_Exons}; } else { @sorted_exons = reverse @{$new_transcript->get_all_Exons}; } my $coding = 0; foreach my $this_exon (@sorted_exons) { if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) { $coding = 1; } if ($coding and $this_exon->start) { if (!$start_exon) { if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) { if ($old_transcript->translation->start_Exon->strand == 1) { $all_start_codon_mappings = $self->map_original_Slice( $this_exon->exon->slice->sub_Slice( $old_transcript->coding_region_start, $old_transcript->coding_region_start + 2)); } else { $all_start_codon_mappings = $self->map_original_Slice( $this_exon->exon->slice->sub_Slice( $old_transcript->coding_region_end-2, $old_transcript->coding_region_end)); } $seq_start = _map_position_using_cigar_line($this_exon->cigar_line, $old_transcript->translation->start, $this_exon->exon->length, 1); if ($seq_start <= $this_exon->length) { $start_exon = $this_exon; } } else { $seq_start = 1; $start_exon = $this_exon; } } if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) { if ($old_transcript->translation->end_Exon->strand == 1) { $all_end_codon_mappings = $self->map_original_Slice( $this_exon->exon->slice->sub_Slice($old_transcript->coding_region_end - 2,$old_transcript->coding_region_end)); } else { $all_end_codon_mappings = $self->map_original_Slice( $this_exon->exon->slice->sub_Slice( $old_transcript->coding_region_start, $old_transcript->coding_region_start + 2)); } $seq_end = _map_position_using_cigar_line($this_exon->cigar_line, $old_transcript->translation->end, $this_exon->exon->length, 0); $seq_end = $this_exon->length if ($seq_end > $this_exon->length); if ($seq_end >= 1) { $end_exon = $this_exon; } else { ## Set $seq_end to previous value $seq_end = $end_exon->length if ($end_exon); } } else { $end_exon = $this_exon; $seq_end = $end_exon->length; } } if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) { $coding = 0; last; } } if ($start_exon and $end_exon) { $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation( -start_exon => $start_exon, -end_exon => $end_exon, -seq_start => $seq_start, -seq_end => $seq_end, -stable_id => $old_transcript->translation->stable_id, -version => $old_transcript->translation->version, -created_date => ($old_transcript->translation->created_date or undef), -modified_date =>$old_transcript->translation->modified_date, )); } else { ## Translation cannot be mapped. In this case we should return a translation outside ## of the coordinate system (negative coding_region_start and coding_region_end). $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation( -stable_id => $old_transcript->translation->stable_id, -version => $old_transcript->translation->version, -created_date => ($old_transcript->translation->created_date or undef), -modified_date =>$old_transcript->translation->modified_date, )); $new_transcript->coding_region_start(-10000000000); $new_transcript->coding_region_end(-10000000000); } $new_transcript->translation->all_start_codon_mappings($all_start_codon_mappings); $new_transcript->translation->all_end_codon_mappings($all_end_codon_mappings); } push(@$all_transcripts, $new_transcript); } } $gene_by_gene_stable_id->{$gene_stable_id}->{'_transcript_array'} = $all_transcripts; # adjust start, end, strand and slice $gene_by_gene_stable_id->{$gene_stable_id}->recalculate_coordinates; } verbose($verbose); return [values %$gene_by_gene_stable_id]; } =head2 _map_position_using_cigar_line Arg[1] : string $cigar_line Arg[2] : int $original_position Arg[3] : int $original_length Arg[4] : bool $start Example : my $mapped_start_position = _map_position_using_cigar_line( "16M4I", 10, 20, 1); Example : my $mapped_end_position = _map_position_using_cigar_line( "16M4I", 10, 20, 1); Description: This method is used to locate the start or the end position of a Translation on the Bio::EnsEMBL::Compara::AlignSlice::Exon object using the cigar_line of the object. As the Bio::EnsEMBL::Compara:: AlignSlice::Exon object may result from the fusion of the same Bio::EnsEMBL::Exon object several times, this method returns the best mapping among all the available possibilities. When the original position maps on an insertion in the Bio::EnsEMBL::Compara:: AlignSlice::Exon object, the resulting mapped start position is the next one to the latest mapped one. Returntype : int $mapped_position Exceptions : returns 0 when the end position cannot be mapped. Exceptions : returns a number larger than the length of the Bio::EnsEMBL::Compara:: AlignSlice::Exon when the start position cannot be mapped. =cut sub _map_position_using_cigar_line { my ($cigar_line, $original_pos, $original_length, $start) = @_; my $mapped_pos = 0; my @cigar = grep {$_} split(/(\d*[GIDM])/, $cigar_line); my $original_count = 0; my $mapped_count = 0; my $pending_count = 0; foreach my $cigar_piece (@cigar) { my ($num, $mode) = $cigar_piece =~ /(\d*)([GIDM])/; $num = 1 if ($num eq ""); if ($mode eq "D" or $mode eq "G") { $pending_count += $num; } elsif ($mode eq "I") { $original_count += $num; } elsif ($mode eq "M") { if ($pending_count) { $mapped_count += $pending_count; $pending_count = 0; } if ($original_count + $num < $original_pos) { $original_count += $num; $mapped_count += $num; } else { $mapped_count += ($original_pos - $original_count); $original_count += ($original_pos - $original_count); $mapped_pos = $mapped_count; last; } } if ($original_count >= $original_pos and $mode eq "I") { ## This position matches in an insertion. ## If we are mapping the end position we will prefer the first match in an insertion ## rather than the following ones (except if the following matches in a "M" region). ## On the other hand, if we are mapping the start position, we will prefer the last ## match in an insertion if ($start or !$mapped_pos) { if ($start and $pending_count) { ## $pending_count corresponds to deletion and needs to be added when mapping the ## start position (we will want the position AFTER the deletion) but not when ## mapping the end position (we will want the position BEFORE the deletion) $mapped_count += $pending_count; $pending_count = 0; } $mapped_pos = $mapped_count; ## When matching the start position, add 1 as the mapped start will be just after ## this position. If this position appears after the end of the mapped exon, this ## means we fail to map the start position $mapped_pos ++ if ($start); } ## Try to look further. A second match may happen if this exon is the result of a fusion process $original_pos += $original_length; } } return $mapped_pos; } =head2 _merge_Exons Arg[1] : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons Arg[2] : int $max_repetition_length Arg[3] : int $max_gap_length Arg[4] : int $strict_order_of_exon_pieces Example : my $merged_exons = _merge_Exons($exons_to_be_merged, 100, 1000, 1); Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon objects and tries to merge them according to exon stable_id and some rules that can be tunned using some optional parameters. This method can overwrite some of the exon in the $set_of_exons. Parameters: MAX_REPETITION_LENGTH. In principle you want to merge together pieces of an exon which do not overlap (the beginning and the end of the exon). With this flag you can to set up what amount of the original exon is allowed on two aligned exons to be merged. MAX_GAP_LENGTH. If the distance between two pieces of exons in the aligned slice is larger than this parameter, they will not be merged. Setting this parameter to -1 will avoid any merging event. STRICT_ORDER_OF_EXON_PIECES. This flag allows you to decide whether two pieces of an exon should be merged or not if they are not in the right order, for instance if the end of the original exon will appear before the start on the merged exon. Returntype : lisref of Bio::EnsEMBL::Compara::AlignSlice::Exon objects. Exceptions : none Caller : methodname =cut sub _merge_Exons { my ($set_of_exons, $max_repetition_length, $max_gap_length, $strict_order_of_exon_pieces) = @_; my $merged_exons = []; # returned value my $exon_by_stable_id; # Group exons by stable_id foreach my $exon (@$set_of_exons) { push(@{$exon_by_stable_id->{$exon->stable_id}}, $exon); } # Merge compatible pieces of exons foreach my $these_exons (values %$exon_by_stable_id) { if (!grep {defined($_->start)} @$these_exons) { push(@$merged_exons, $these_exons->[0]); next; } # Sort exons according to $these_exons= [sort {($a->start or 0) <=> ($b->start or 0)} @$these_exons]; while (my $first_exon = shift @$these_exons) { if (!defined($first_exon->start)) { # push(@$merged_exons, $first_exon); next; } for (my $count=0; $count<@$these_exons; $count++) { my $second_exon = $these_exons->[$count]; # Check strands next if ($first_exon->strand != $second_exon->strand); my $gap_between_pieces_of_exon = $second_exon->start - $first_exon->end - 1; # Check whether both mapped parts do not overlap next if ($gap_between_pieces_of_exon < 0); # Check maximum gap between both pieces of exon next if ($max_gap_length and $gap_between_pieces_of_exon > $max_gap_length); # Check whether both mapped parts are in the right order if ($strict_order_of_exon_pieces) { if ($first_exon->strand == 1) { next if ($first_exon->get_aligned_start > $second_exon->get_aligned_start); } else { next if ($first_exon->get_aligned_end < $second_exon->get_aligned_end); } } # Check maximum overlapping within original exon, i.e. how much of the # same exon can be mapped twice my $repetition_length; if ($first_exon->strand == 1) { $repetition_length = $first_exon->get_aligned_end - $second_exon->get_aligned_start + 1; } else { $repetition_length = $second_exon->get_aligned_end - $first_exon->get_aligned_start + 1; } next if ($repetition_length > $max_repetition_length); ## Merge exons!! $second_exon = splice(@$these_exons, $count, 1); # remove exon from the list $count-- if (@$these_exons); $first_exon->end($second_exon->end); if ($first_exon->strand == 1) { $first_exon->append_Exon($second_exon, $gap_between_pieces_of_exon); } else { $first_exon->prepend_Exon($second_exon, $gap_between_pieces_of_exon); } } push(@$merged_exons, $first_exon); } } return $merged_exons; } =head2 _separate_in_incompatible_sets_of_Exons Arg[1] : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons Arg[2] : int $max_repetition_length Arg[3] : int $max_intron_length Arg[4] : int $strict_order_of_exons Example : my $sets_of_exons = _separate_in_incompatible_sets_of_Exons( $set_of_exons, 100, 100000, 0); Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon and separate them in sets of comaptible exons. Compatibility is defined taking into account 5 parameters: - exons must be in the same strand - exons cannot overlap on the align_slice - distance between exons cannot be larger than MAX_INTRON_LENGTH - two exons with the same stable_id can belong to the same transcript only if they represent diferent parts of the original exon. Some overlapping is allowed (see MAX_REPETITION_LENGTH parameter). - exons must be in the same order as in the original transcript if the STRICT_ORDER_OF_EXONS parameter is true. Parameters: MAX_REPETITION_LENGTH. In principle you want to link together pieces of an exon which do not overlap (the beginning and the end of the exon). With this flag you can to set up what amount of the original exon is allowed on two aligned exons to be linked. MAX_INTRON_LENGTH. If the distance between two exons in the aligned slice is larger than this parameter, they will not be linked. STRICT_ORDER_OF_EXONS. This flag allows you to decide whether two exons should be linked or not if they are not in the original order. Returntype : listref of lisrefs of Bio::EnsEMBL::Compara::AlignSlice::Exon objects. Exceptions : none Caller : methodname =cut sub _separate_in_incompatible_sets_of_Exons { my ($set_of_exons, $max_repetition_length, $max_intron_length, $strict_order_of_exons) = @_; my $sets_of_exons = []; my $last_exon; my $this_set_of_exons = []; ################################################################### ## ## Return all the exons by strand. ## - A given exon can be mapped partially on both strands! ## if (grep {$_->strand and $_->strand == 1} @$set_of_exons and grep {$_->strand and $_->strand == -1} @$set_of_exons) { ## Keep track of the exons that have been mapped by strand my $indexes; foreach my $exon (@$set_of_exons) { if ($exon->strand) { $indexes->{$exon->strand}->{$exon->original_rank} = 1; } } my $forward_stranded_set_of_exons; # set of exons to be returned in the forward strand my $reverse_stranded_set_of_exons; # set of exons to be returned in the reverse strand foreach my $exon (@$set_of_exons) { my $new_exon = $exon->copy(); if ($exon->strand) { if ($exon->strand == 1) { if ($indexes->{"-1"}->{$exon->original_rank}) { ## This exon has been mapped on the reverse strand as well push(@$forward_stranded_set_of_exons, $exon); next; } else { ## Undef coordinates of the new exon before adding it to the set ## of reverse stranded exons $new_exon->start(undef); $new_exon->end(undef); $new_exon->strand(undef); } } else { if ($indexes->{"1"}->{$exon->original_rank}) { ## This exon has been mapped on the forward strand as well push(@$reverse_stranded_set_of_exons, $new_exon); next; } else { ## Undef coordinates of the new exon before adding it to the set ## of forward stranded exons $exon->start(undef); $exon->end(undef); $exon->strand(undef); } } } push(@$forward_stranded_set_of_exons, $exon); push(@$reverse_stranded_set_of_exons, $new_exon); } push(@$sets_of_exons, @{_separate_in_incompatible_sets_of_Exons($reverse_stranded_set_of_exons, $max_repetition_length, $max_intron_length, $strict_order_of_exons)}); $set_of_exons = $forward_stranded_set_of_exons; } ## ################################################################### my $transcript_strand = 1; if (grep {$_->strand and $_->strand == -1} @$set_of_exons) { $transcript_strand = -1; } foreach my $this_exon (_sort_Exons(@$set_of_exons)) { if (!defined($this_exon->start)) { if ($transcript_strand == -1) { ## Insert this exon in the right place my $inserted = 0; for (my $i=0; $i<@$this_set_of_exons; $i++) { if ($this_set_of_exons->[$i]->original_rank == $this_exon->original_rank - 1) { splice(@$this_set_of_exons, $i, 0, $this_exon); $inserted = 1; last; } } if (!$inserted) { push(@$this_set_of_exons, $this_exon); } } else { ## Append this exon push(@$this_set_of_exons, $this_exon); } next; } if ($last_exon) { # Calculate intron length my $intron_length = $this_exon->start - $last_exon->end - 1; # Calculate whether both mapped parts are in the right order my $order_is_ok = 1; if ($strict_order_of_exons) { if ($this_exon->strand == $this_exon->exon->strand) { $order_is_ok = 0 if ($this_exon->exon->start < $last_exon->exon->start); } else { $order_is_ok = 0 if ($this_exon->exon->start > $last_exon->exon->start); } } my $repetition_length = 0; if ($last_exon->stable_id eq $this_exon->stable_id) { if ($this_exon->strand == 1) { $repetition_length = $last_exon->get_aligned_end - $this_exon->get_aligned_start + 1; } else { $repetition_length = $this_exon->get_aligned_end - $last_exon->get_aligned_start + 1; } } if (($last_exon->strand != $this_exon->strand) or ($intron_length < 0) or ($max_intron_length and ($intron_length > $max_intron_length)) or (!$order_is_ok) or ($repetition_length > $max_repetition_length)) { # this_exon and last_exon should be in separate sets. Save current # set_of_exons and start a new set_of_exons push(@$sets_of_exons, $this_set_of_exons); $this_set_of_exons = []; } } push(@$this_set_of_exons, $this_exon); $last_exon = $this_exon; } push(@$sets_of_exons, $this_set_of_exons); return $sets_of_exons; } sub _sort_Exons { my @exons = @_; my @sorted_exons = (); my $transcript_strand = 1; if (grep {$_->strand and $_->strand == -1} @exons) { $transcript_strand = -1; } my @mapped_exons = grep {defined($_->start)} @exons; my @unmapped_exons = grep {!defined($_->start)} @exons; @mapped_exons = sort {$a->start <=> $b->start} @mapped_exons; my $sorted_exons_by_rank; foreach my $mapped_exon (@mapped_exons) { my $rank = $mapped_exon->original_rank; # the same exon can be mapped twice! push(@{$sorted_exons_by_rank->{$rank}}, $mapped_exon); } @unmapped_exons = sort {$a->original_rank <=> $b->original_rank} @unmapped_exons; foreach my $unmapped_exon (@unmapped_exons) { my $rank = $unmapped_exon->original_rank; do { if (defined($sorted_exons_by_rank->{$rank})) { push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon); $rank = 0; } elsif ($rank == 1) { if ($transcript_strand == 1) { push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon); } else { $rank++ while(!defined($sorted_exons_by_rank->{$rank})); splice(@{$sorted_exons_by_rank->{$rank}}, 1, 0, $unmapped_exon); } $rank = 0; } $rank--; } while ($rank > 0); } foreach my $exons (sort { if (defined($a->[0]->start) and defined($b->[0]->start)) { $a->[0]->start <=> $b->[0]->start; } else { $a->[0]->original_rank <=> $b->[0]->original_rank; } } values %$sorted_exons_by_rank) { push(@sorted_exons, @{$exons}); } return @sorted_exons; } =head2 OVERWRITEN METHODS FROM Bio::EnsEMBL::Slice module =head2 WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING All the methods that need acces to the database (the adaptor) and which are not listed here are not supported!! =head2 WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING =head2 invert (not supported) Maybe at some point... This method returns undef =cut sub invert { warning("Cannot invert a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); return undef; } =head2 sub_Slice Arg 1 : int $start Arg 2 : int $end Arge [3] : int $strand Example : none Description: Makes another Slice that covers only part of this slice If a slice is requested which lies outside of the boundaries of this function will return undef. This means that behaviour will be consistant whether or not the slice is attached to the database (i.e. if there is attached sequence to the slice). Alternatively the expand() method or the SliceAdaptor::fetch_by_region method can be used instead. Returntype : Bio::EnsEMBL::Slice or undef if arguments are wrong Exceptions : return undef if $start and $end define a region outside of actual Slice. Caller : general Status : Testing =cut sub sub_Slice { my ( $self, $start, $end, $strand ) = @_; if( $start < 1 || $start > $self->{'end'} ) { # throw( "start argument not valid" ); return undef; } if( $end < $start || $end > $self->{'end'} ) { # throw( "end argument not valid" ) return undef; } my ($new_start, $new_end, $new_strand); if (!defined($strand)) { $strand = 1; } if( $self->{'strand'} == 1 ) { $new_start = $self->{'start'} + $start - 1; $new_end = $self->{'start'} + $end - 1; $new_strand = $strand; } else { $new_start = $self->{'end'} - $end + 1;; $new_end = $self->{'end'} - $start + 1; $new_strand = -$strand; } #fastest way to copy a slice is to do a shallow hash copy my %new_slice = %$self; ## Delete cached genes foreach my $key (grep {/^key_/} keys %new_slice) { delete($new_slice{$key}); } $new_slice{'seq'} = undef; $new_slice{'start'} = int($new_start); $new_slice{'end'} = int($new_end); $new_slice{'strand'} = $new_strand; return bless \%new_slice, ref($self); } =head2 seq Arg [1] : none Example : print "SEQUENCE = ", $slice->seq(); Description: Returns the sequence of the region represented by this slice formatted as a string. This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. The resulting string might contain gaps and/or dots as padding characters. If several Slices map on the same positions, the last one will override the positions. Returntype : string Exceptions : none Caller : general =cut sub seq { my $self = shift; my $start = 1; my $end = $self->length; my $strand = 1; # strand is reversed in the subseq method if needed return $self->{seq} if (defined($self->{seq})); $self->{seq} = $self->subseq($start, $end, $strand); return $self->{seq}; } =head2 subseq Arg [1] : int $startBasePair relative to start of slice, which is 1. Arg [2] : int $endBasePair relative to start of slice. Arg [3] : (optional) int $strand The strand of the slice to obtain sequence from. Default value is 1. Description: returns string of dna sequence This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. The resulting string might contain gaps and/or dots as padding characters. If several Slices map on the same positions, the last one will override the positions. Returntype : txt Exceptions : end should be at least as big as start strand must be set Caller : general =cut sub subseq { my ($self, $start, $end, $strand) = @_; $start = 1 if (!defined($start)); $end ||= $self->length; $strand ||= 1; ## Fix coordinates (needed for sub_Slices) if ($self->strand == -1) { $strand = -$strand; my $aux = $start; $start = $self->start + ($self->length - $end); $end = $self->start + ($self->length - $aux); } else { $start += $self->start - 1; $end += $self->start - 1; } my $length = ($end - $start + 1); my $seq = "." x $length; foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs()}) { my $this_slice = $pair->{slice}; my $mapper = $pair->{mapper}; my $slice_start = $pair->{start}; my $slice_end = $pair->{end}; next if ($slice_start > $end or $slice_end < $start); my $this_slice_seq = $this_slice->seq(); # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them $slice_start -= $start; # $slice_start is now in subseq coordinates $slice_start = 0 if ($slice_start < 0); $slice_end -= $start; # $slice_end is now in subseq coordinates $slice_end = $length if ($slice_end > $length); # Invert start and end for the reverse strand if ($strand == -1) { my $aux = $slice_end; $slice_end = $length - $slice_start; $slice_start = $length - $aux; } my @sequence_coords = $mapper->map_coordinates( 'alignment', $start, $end, $strand, 'alignment' ); ##################### # $this_pos refers to the starting position of the subseq if requesting the forward strand # or the ending position of the subseq if the reverse strand has been requested: # # FORWARD STRAND (1) # $this_pos = 0 # | # ----------------------------------------------------------------------> # <---------------------------------------------------------------------- # # REVERSE STRAND (-1) # ----------------------------------------------------------------------> # <---------------------------------------------------------------------- # | # $this_pos = 0 # # All remaining coordinates work in the same way except the start and end position # of the gaps which correspond to the coordinates in the original Slice... # my $this_pos = 0; foreach my $sequence_coord (@sequence_coords) { ## $sequence_coord refer to genomic_align (a slice in the [+] strand) if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my $subseq; if ($this_slice->strand == 1) { $subseq = substr($this_slice_seq, $sequence_coord->start - $this_slice->start, $sequence_coord->length); if ($sequence_coord->strand * $pair->{strand} == -1) { $subseq = reverse($subseq); $subseq =~ tr/ACGTacgt/TGCAtgca/; } } else { $subseq = substr($this_slice_seq, $this_slice->end - $sequence_coord->end, $sequence_coord->length); if ($sequence_coord->strand * $pair->{strand} == -1) { $subseq = reverse($subseq); $subseq =~ tr/ACGTacgt/TGCAtgca/; } } substr($seq, $this_pos, $sequence_coord->length, $subseq); } else { ## Gap or sequence outside of any alignment ############ # Get the start and end positions of this gap in "subseq" coordinates my $this_original_start = $sequence_coord->start - $start; my $this_original_end = $sequence_coord->end - $start; if ($strand == -1) { my $aux = $this_original_end; $this_original_end = $length - $this_original_start; $this_original_start = $length - $aux; } if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) { ## This is a gap my $start_position_of_gap_seq = $this_pos; my $end_position_of_gap_seq = $this_pos + $sequence_coord->length; if ($start_position_of_gap_seq < $slice_start) { $start_position_of_gap_seq = $slice_start; } if ($end_position_of_gap_seq > $slice_end + 1) { $end_position_of_gap_seq = $slice_end + 1; } my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq; substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq) if ($length_of_gap_seq > 0); } } $this_pos += $sequence_coord->length; } } return $seq; } =head2 get_cigar_line Arg : -none- Description: returns a cigar_line describing the gaps in the mapped sequence This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. The resulting cigar line corresponds to the mapping of all the nucleotides that can be mapepd. If several Slices map on the same positions, the behaviour is undefined. The cigar_line includes 3 types of regions: M for matches/mismatches, D for alignment gaps (formerly known as deletions) and G for gaps between alignment blocks. Returntype : txt Exceptions : Caller : general =cut sub get_cigar_line { my ($self, $start, $end, $strand) = @_; $start = $self->start; $end = $self->end; $strand ||= 1; ## Fix coordinates (needed for sub_Slices) if ($self->strand == -1) { $strand = -$strand; my $aux = $start; $start = $self->start + ($self->length - $end); $end = $self->start + ($self->length - $aux); } else { $start += $self->start - 1; $end += $self->start - 1; } my $length = ($end - $start + 1); my $seq = "." x $length; foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs("get_gap_slices")}) { my $this_slice = $pair->{slice}; my $mapper = $pair->{mapper}; my $slice_start = $pair->{start}; my $slice_end = $pair->{end}; next if ($slice_start > $end or $slice_end < $start); # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them $slice_start -= $start; # $slice_start is now in subseq coordinates $slice_start = 0 if ($slice_start < 0); $slice_end -= $start; # $slice_end is now in subseq coordinates $slice_end = $length if ($slice_end > $length); # Invert start and end for the reverse strand if ($strand == -1) { my $aux = $slice_end; $slice_end = $length - $slice_start; $slice_start = $length - $aux; } my @sequence_coords = $mapper->map_coordinates( 'alignment', $start, $end, $strand, 'alignment' ); ##################### # $this_pos refers to the starting position of the subseq if requesting the forward strand # or the ending position of the subseq if the reverse strand has been requested: # # FORWARD STRAND (1) # $this_pos = 0 # | # ----------------------------------------------------------------------> # <---------------------------------------------------------------------- # # REVERSE STRAND (-1) # ----------------------------------------------------------------------> # <---------------------------------------------------------------------- # | # $this_pos = 0 # # All remaining coordinates work in the same way except the start and end position # of the gaps which correspond to the coordinates in the original Slice... # my $this_pos = 0; foreach my $sequence_coord (@sequence_coords) { ## $sequence_coord refer to genomic_align (a slice in the [+] strand) if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my $subseq = "N" x ($sequence_coord->length); substr($seq, $this_pos, $sequence_coord->length, $subseq); } else { ## Gap or sequence outside of any alignment ############ # Get the start and end positions of this gap in "subseq" coordinates my $this_original_start = $sequence_coord->start - $start; my $this_original_end = $sequence_coord->end - $start; if ($strand == -1) { my $aux = $this_original_end; $this_original_end = $length - $this_original_start; $this_original_start = $length - $aux; } if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) { ## This is a gap my $start_position_of_gap_seq = $this_pos; my $end_position_of_gap_seq = $this_pos + $sequence_coord->length; if ($start_position_of_gap_seq < $slice_start) { $start_position_of_gap_seq = $slice_start; } if ($end_position_of_gap_seq > $slice_end + 1) { $end_position_of_gap_seq = $slice_end + 1; } my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq; substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq) if ($length_of_gap_seq > 0); } } $this_pos += $sequence_coord->length; } } my $cigar_line = ""; my @pieces = split(/(\-+|\.+)/, $seq); foreach my $piece (@pieces) { my $mode; if ($piece =~ /\./) { $mode = "G"; # D for gaps (deletions) } elsif ($piece =~ /\-/) { $mode = "D"; # D for gaps (deletions) } else { $mode = "M"; # M for matches/mismatches } if (length($piece) == 1) { $cigar_line .= $mode; } elsif (length($piece) > 1) { #length can be 0 if the sequence starts with a gap $cigar_line .= length($piece).$mode; } } return $cigar_line; } =head2 get_all_underlying_Slices Arg [1] : int $startBasePair relative to start of slice, which is 1. Arg [2] : int $endBasePair relative to start of slice. Arg [3] : (optional) int $strand The strand of the slice to obtain sequence from. Default value is 1. Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. This method returns these Slices (or part of them) with the original coordinates and the gapped sequence attached to them. Additionally, extra Slices could be returned in order to fill in gaps between underlying Slices. Returntype : listref of Bio::EnsEMBL::Slice objects Exceptions : end should be at least as big as start Caller : general =cut sub get_all_underlying_Slices { my ($self, $start, $end, $strand) = @_; my $underlying_slices = []; $start = 1 if (!defined($start)); $end ||= $self->length; $strand ||= 1; ## Fix coordinates (needed for sub_Slices) if ($self->strand == -1) { $strand = -$strand; my $aux = $start; $start = $self->start + ($self->length - $end); $end = $self->start + ($self->length - $aux); } else { $start += $self->start - 1; $end += $self->start - 1; } my $current_position; # if ($strand == 1) { $current_position = $start; # } else { # $current_position = $end; # } foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs}) { my $this_slice = $pair->{slice}; my $mapper = $pair->{mapper}; my $slice_start = $pair->{start}; my $slice_end = $pair->{end}; next if ($slice_start > $end or $slice_end < $start); my @sequence_coords = $mapper->map_coordinates( 'alignment', $start, $end, $strand, 'alignment' ); my $this_subseq_start; my $this_subseq_end; my $this_subseq_strand; foreach my $sequence_coord (@sequence_coords) { ## $sequence_coord refer to genomic_align (a slice in the [+] strand) if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { $this_subseq_start = $sequence_coord->start if (!defined($this_subseq_start) or $this_subseq_start > $sequence_coord->start); $this_subseq_end = $sequence_coord->end if (!defined($this_subseq_end) or $this_subseq_end < $sequence_coord->end); $this_subseq_strand = $sequence_coord->strand if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate") and !defined($this_subseq_strand)); } } # next if (!defined($this_subseq_start)); # if the whole requested region correspond to a gap my $start_position = ($start>$slice_start)?$start:$slice_start; # in AlignSlice coordinates my $end_position = ($end<$slice_end)?$end:$slice_end; # in AlignSlice coordinates # if ($strand == 1) { if ($start_position > $current_position) { my $this_underlying_slice = new Bio::EnsEMBL::Slice( -coord_system => $gap_coord_system, -seq_region_name => "GAP", -start => $current_position, -end => $start_position - 1, -strand => 0 ); $this_underlying_slice->{seq} = "." x ($start_position - $current_position); push(@$underlying_slices, $this_underlying_slice); } # } else { # if ($end_position < $current_position) { # my $this_underlying_slice = new Bio::EnsEMBL::Slice( # -seq_region_name => "GAP", # -start => $end_position + 1, # -end => $current_position, # -strand => 0 # ); # $this_underlying_slice->{seq} = "." x ($current_position - $end_position); # unshift(@$underlying_slices, $this_underlying_slice); # } # } $current_position = $end_position + 1; my $this_underlying_slice; if (!defined($this_subseq_start)) { $this_underlying_slice = new Bio::EnsEMBL::Slice( -coord_system => $gap_coord_system, -seq_region_name => "GAP", -start => $start_position, -end => $end_position, -strand => 0 ); $this_underlying_slice->{seq} = "-" x ($end_position - $start_position + 1); } else { $this_underlying_slice = new Bio::EnsEMBL::Slice( -coord_system => $this_slice->coord_system, -seq_region_name => $this_slice->seq_region_name, -start => $this_subseq_start, -end => $this_subseq_end, -strand => $this_subseq_strand ); $this_underlying_slice->{seq} = $self->subseq($start_position, $end_position, $strand); $this_underlying_slice->{_tree} = $this_slice->{_tree} if (defined($this_slice->{_tree})); } # if ($strand == 1) { push(@$underlying_slices, $this_underlying_slice); # } else { # unshift(@$underlying_slices, $this_underlying_slice); # } } if ($end >= $current_position) { my $this_underlying_slice = new Bio::EnsEMBL::Slice( -coord_system => $gap_coord_system, -seq_region_name => "GAP", -seq_region_length => ($end - $current_position + 1), -start => $current_position, -end => $end, -strand => 0 ); $this_underlying_slice->{seq} = "." x ($end - $current_position + 1); push(@$underlying_slices, $this_underlying_slice); } if ($strand == -1) { @$underlying_slices = reverse(@$underlying_slices); } return $underlying_slices; } =head2 get_original_seq_region_position Arg [1] : int $position relative to start of slice, which is 1. Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. This method returns the original seq_region_position in the original Slice of the requested position in AlignSlice coordinates Example : my ($slice, $seq_region_position) = $as_slice-> get_original_seq_region_position(100); Returntype : ($slice, $seq_region_position), an array where the first element is a Bio::EnsEMBL::Slice and the second one is the requested seq_region_position. Exceptions : if the position corresponds to a gap, the slice will be a fake GAP slice and the position will be the requested one (in AlignSlice coordinates) Caller : general =cut sub get_original_seq_region_position { my ($self, $position) = @_; my $underlying_slice = $self->get_all_underlying_Slices($position, $position, 1)->[0]; return ($underlying_slice, $underlying_slice->start); } =head2 get_all_constrained_elements Arg 1 : (opt) string $method_link_type (default = GERP_CONSTRAINED_ELEMENT) Arg 2 : (opt) listref Bio::EnsEMBL::Compara::GenomeDB $species_set (default, the set of species from the MethodLinkSpeciesSet used to build this AlignSlice) Example : my $constrained_elements = $align_slice->get_all_constrained_elements(); Description: Retrieve the corresponding constrained elements for these alignments. Objects will be mapped on this AlignSlice::Slice, i.e. the reference_slice, reference_slice_start, reference_slice_end and reference_slice_strand will refer to this AlignSlice::Slice object Returntype : ref. to an array of Bio::EnsEMBL::Compara::GenomicAlignBlock objects. Caller : object::methodname Status : At risk =cut sub get_all_constrained_elements { my ($self, $method_link_type, $species_set) = @_; my $all_constrained_elements = []; return [] if (!$self->{_align_slice}); my $all_original_constrained_elements = $self->{_align_slice}-> get_all_constrained_elements($method_link_type, $species_set); foreach my $this_constrained_element (@$all_original_constrained_elements) { foreach my $this_genomic_align (@{$this_constrained_element->get_all_GenomicAligns}) { if ($this_genomic_align->genome_db->name eq $self->genome_db->name) { my $constrained_slice = $this_genomic_align->get_Slice; my $slices = $self->map_original_Slice($constrained_slice); $this_constrained_element->reference_slice($self); $this_constrained_element->reference_slice_start($slices->[0]->start); $this_constrained_element->reference_slice_end($slices->[0]->end); $this_constrained_element->reference_slice_strand($slices->[0]->strand); push(@$all_constrained_elements, $this_constrained_element); } } } return $all_constrained_elements; } =head2 map_original_Slice Arg [1] : Bio::EnsEMBL::Slice $original_slice Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it with gaps inside and regions with no matching sequence. This method tries to map on this Slice the region(s) corresponding to the provided $original_slice NB: This method does not know how to project Slices onto other coordinate systems. It is your responsability to provide an original Slice on the same coordinate system as the underlying Bio::EnsEMBL::Slices Example : my $slices = $as_slice->map_original_Slice($orginal_slice); Returntype : listref of Bio::EnsEMBL::Compara::AlignSlice::Slice objects which are the sub_Slices of this Bio::EnsEMBL::Compara:: AlignSlice::Slice where the $original_slice maps Exceptions : =cut sub map_original_Slice { my ($self, $original_slice) = @_; my $mapped_slices = []; return $mapped_slices if (!defined($original_slice)); foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { my $this_slice = $pair->{slice}; my $mapper = $pair->{mapper}; my $slice_start = $pair->{start}; my $slice_end = $pair->{end}; next if (!$this_slice->coord_system->equals($original_slice->coord_system)); next if ($this_slice->seq_region_name ne $original_slice->seq_region_name); next if ($this_slice->start > $original_slice->end or $this_slice->end < $original_slice->start); my @sequence_coords = $mapper->map_coordinates( 'sequence', $original_slice->start, $original_slice->end, $original_slice->strand, 'sequence' ); my $mapped_start; my $mapped_end; my $mapped_strand; foreach my $sequence_coord (@sequence_coords) { if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { if (!defined($mapped_start) or ($mapped_start > $sequence_coord->start)) { $mapped_start = $sequence_coord->start; } if (!defined($mapped_end) or ($mapped_end < $sequence_coord->end)) { $mapped_end = $sequence_coord->end; } if (!defined($mapped_strand)) { $mapped_strand = $sequence_coord->strand; } elsif ($mapped_strand != $sequence_coord->strand) { warning("strand inversion within a Slice-Mapper pair!"); $mapped_start = undef; $mapped_end = undef; $mapped_strand = undef; last; } } } if (defined($mapped_start) and defined($mapped_end) and defined($mapped_strand)) { my $sub_slice = $self->sub_Slice($mapped_start, $mapped_end, $mapped_strand); push(@$mapped_slices, $sub_slice) if ($sub_slice); } } return $mapped_slices; } =head2 expand (not supported) Expanding a Bio::EnsEMBL::Compara::AlignSlice::Slice object is not supported at the moment. You could create a new Bio::EnsEMBL::Slice and fetch a new Bio::EnsEMBL::Compara::AlignSlice from it. On the hand, if you only want to extend the sequence, you could use the subseq method with a negatve start value or an end value larger than the end of this Slice. This method returns undef =cut sub expand { my $self = shift; warning("Cannot expand a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); return undef; } =head2 get_all_Attributes Arg [1] : optional string $attrib_code The code of the attribute type to retrieve values for. Example : ($htg_phase) = @{$slice->get_all_Attributes('htg_phase')}; @slice_attributes = @{$slice->get_all_Attributes()}; Description: Gets a list of Attributes of all teh underlying slice''s seq_region. Optionally just get Attributes for given code. This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, retrieves the data and return them in order. There will be one set of Attributes by underlying slice. Returntype : listref Bio::EnsEMBL::Attribute Exceptions : warning if slice does not have attached adaptor Caller : general =cut sub get_all_Attributes { my $self = shift; my $attributes = []; foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { my $this_slice = $pair->{slice}; push(@$attributes, @{$this_slice->get_all_Attributes(@_)}); } return $attributes; } =head2 get_all_VariationFeatures Args : none Description :returns all variation features on this slice. This function will only work correctly if the variation database has been attached to the core database. This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, retrieves the data and maps them on this Bio::EnsEMBL::Compara::AlignSlice::Slice object. ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature Exceptions : none Caller : contigview, snpview =cut sub get_all_VariationFeatures { my $self = shift; return $self->_method_returning_simple_features("get_all_VariationFeatures", @_) } =head2 get_all_genotyped_VariationFeatures Args : none Function : returns all variation features on this slice that have been genotyped. This function will only work correctly if the variation database has been attached to the core database. This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, retrieves the data and maps them on this Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end, strand and slice attributes. ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature Exceptions : none Caller : contigview, snpview =cut sub get_all_genotyped_VariationFeatures { my $self = shift; return $self->_method_returning_simple_features("get_all_genotyped_VariationFeatures", @_) } =head2 get_all_RepeatFeatures Arg [1] : (optional) string $logic_name The name of the analysis performed on the repeat features to obtain. Arg [2] : (optional) string $repeat_type Limits features returned to those of the specified repeat_type Example : @repeat_feats = @{$slice->get_all_RepeatFeatures(undef,'LTR')}; Description: Retrieves the RepeatFeatures which overlap with logic name $logic_name and with score above $score. If $logic_name is not defined features of all logic names are retrieved. This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, retrieves the data and maps them on this Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end, strand and slice attributes. Returntype : listref of Bio::EnsEMBL::RepeatFeatures Exceptions : warning if slice does not have attached adaptor Caller : general =cut sub get_all_RepeatFeatures { my $self = shift; return $self->_method_returning_simple_features("get_all_RepeatFeatures", @_) } =head2 project (testing) Arg [1] : string $name The name of the coordinate system to project this slice onto Arg [2] : string $version The version of the coordinate system (such as 'NCBI34') to project this slice onto Example : my $clone_projection = $slice->project('clone'); foreach my $seg (@$clone_projection) { my $clone = $segment->to_Slice(); print $slice->seq_region_name(), ':', $seg->from_start(), '-', $seg->from_end(), ' -> ', $clone->seq_region_name(), ':', $clone->start(), '-',$clone->end(), $clone->strand(), "\n"; } Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, porject them and maps the projections on this Bio::EnsEMBL::Compara::AlignSlice::Slice object. The original 'project' method returns the results of 'projecting' a slice onto another coordinate system. Projecting to a coordinate system that the slice is assembled from is analagous to retrieving a tiling path. The original method may also be used to 'project up' to a higher level coordinate system, however. This method returns a listref of triplets [start,end,slice] which represents the projection. The start and end defined the region of this slice which is made up of the third value of the triplet: a slice in the requested coordinate system. Because of the gaps in the mapping of the Bio::EnsEMBL::Slices the lenght of the slice returned in the tripet may be different than the distance defined by the start and end of the Bio::EnsEMBL::ProjectionSegment object. Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which can also be used as [$start,$end,$slice] triplets Exceptions : none Caller : general =cut sub project { my $self = shift; my $cs_name = shift; my $cs_version = shift; my $projections = []; throw('Coord_system name argument is required') if(!$cs_name); foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { my $this_slice = $pair->{slice}; my $this_mapper = $pair->{mapper}; my $this_projections = $this_slice->project($cs_name, $cs_version); foreach my $this_projection (@$this_projections) { my ($this_start, $this_end); if ($this_slice->strand == 1) { $this_start = $this_slice->start + $this_projection->from_start - 1; $this_end = $this_slice->start + $this_projection->from_end - 1; } else { $this_start = $this_slice->start + ($this_slice->length - $this_projection->from_start); $this_end = $this_slice->start + ($this_slice->length - $this_projection->from_end); } my $new_slice = $this_projection->to_Slice; my ($new_start, $new_end); my @alignment_coords = $this_mapper->map_coordinates( 'sequence', $this_start, $this_start, 1, 'sequence' ); foreach my $alignment_coord (@alignment_coords) { if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { $new_start = $alignment_coord->start; } } next if (!defined($new_start)); @alignment_coords = $this_mapper->map_coordinates( 'sequence', $this_end, $this_end, 1, 'sequence' ); foreach my $alignment_coord (@alignment_coords) { if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { $new_end = $alignment_coord->start; } } next if (!defined($new_end)); ## Truncate projection in order to fit into this AlignSlice if ($new_start > $self->end) { next; ## Maps outside of sub_AlignSlice } elsif ($new_start < $self->start) { if ($new_slice->strand == -1) { $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($self->start - $new_start)); } else { $new_slice = $new_slice->sub_Slice($self->start - $new_start + 1, $new_slice->length); } $new_start = 1; $new_end = $new_slice->length; } ## Truncate projection in order to fit into this AlignSlice if ($new_end < $self->start) { next; ## Maps outside of sub_AlignSlice } elsif ($new_end > $self->length) { # We have to truncate this projection if ($new_slice->strand == -1) { $new_slice = $new_slice->sub_Slice(1 + $new_end - $self->length, $new_slice->length); } else { $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($new_end - $self->length)); } $new_end = $self->length; } my $new_projection = bless([$new_start, $new_end, $new_slice], "Bio::EnsEMBL::ProjectionSegment"); push(@$projections, $new_projection); } } return $projections; } =head2 _method_returning_simple_features Args[1] : method_name Description : This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This method go through all of them, calls method_name and maps teh result on this Bio::EnsEMBL::Compara::AlignSlice::Slice object. ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature Exceptions : none Caller : contigview, snpview =cut sub _method_returning_simple_features { my $self = shift; my $method = shift; my $ret = []; foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { my $this_slice = $pair->{slice}; my $this_mapper = $pair->{mapper}; my $this_ret = $this_slice->$method(@_); foreach my $this_object (@$this_ret) { my ($this_start, $this_end); if ($this_slice->strand == 1) { $this_start = $this_object->slice->start + $this_object->start - 1; $this_end = $this_object->slice->start + $this_object->end - 1; } else { $this_start = $this_object->slice->start + ($this_object->slice->length - $this_object->end); $this_end = $this_object->slice->start + ($this_object->slice->length - $this_object->start); } my @alignment_coords = $this_mapper->map_coordinates( 'sequence', $this_start, $this_end, $this_object->strand, 'sequence' ); my ($start, $end, $strand); foreach my $alignment_coord (@alignment_coords) { if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { if (!defined($start) or $start > $alignment_coord->start) { $start = $alignment_coord->start; } if (!defined($end) or $end < $alignment_coord->end) { $end = $alignment_coord->end; } if (!defined($strand)) { $strand = $alignment_coord->strand * $this_slice->strand; } } } my $new_object; %$new_object = %$this_object; bless $new_object, ref($this_object); if (defined($start) and defined($end) and defined($strand)) { $new_object->{start} = $start - $self->start + 1; $new_object->{end} = $end - $self->start + 1; $new_object->{strand} = $strand; $new_object->{slice} = $self; # Skip this object if it maps outside of this AlignSlice next if ($new_object->{start} > $self->length or $new_object->{end} < 1); push(@$ret, $new_object); } } } return $ret; } 1;