Raw content of Bio::EnsEMBL::ExternalData::DAS::Coordinator =head1 NAME Bio::EnsEMBL::ExternalData::DAS::Coordinator =head1 SYNOPSIS # Instantiate with a list of Bio::EnsEMBL::ExternalData::DAS::Source objects: my $c = Bio::EnsEMBL::ExternalData::DAS::Coordinator->new(-sources => $list); # Fetch by slice my $struct = $c->fetch_Features( $slice ); for my $logic_name ( keys %{ $struct } ) { # Bio::EnsEMBL::ExternalData::DAS::Source object: my $source = $struct->{$logic_name}{'source'}{'object'}; my $error = $struct->{$logic_name}{'source'}{'error'}; # Bio::EnsEMBL::ExternalData::DAS::Stylesheet object: my $stylesheet = $struct->{$logic_name}{'stylesheet'}{'object'}; my $s_error = $struct->{$logic_name}{'stylesheet'}{'error'}; for my $segment ( keys %{ $struct->{$logic_name}{'features'} } ) { my $f_url = $struct->{$logic_name}{'features'}{$segment}{'url'}; my $f_error = $struct->{$logic_name}{'features'}{$segment}{'error'}; # arrayref of Bio::EnsEMBL::ExternalData::DAS::Feature objects: my $features = $struct->{$logic_name}{'features'}{$segment}{'objects'}; } } # Fetch by gene my $struct = $c->fetch_Features( $gene ); # Fetch by protein my $struct = $c->fetch_Features( $translation ); # Feature ID filtering my $struct = $c->fetch_Features( $slice, feature => 'xyz1234' ); # Type ID and Group ID filtering my $struct = $c->fetch_Features( $slice, group => 'xyz', type => 'foo' ); =head1 DESCRIPTION Given a set of DAS::Source objects and a target object such as a Slice or Translation, will simultaneously perform all DAS requests and map the features onto the target object. =cut package Bio::EnsEMBL::ExternalData::DAS::Coordinator; use strict; use warnings; no warnings 'uninitialized'; use POSIX qw(ceil); use Bio::EnsEMBL::Mapper; use Bio::Das::Lite; use Bio::EnsEMBL::ExternalData::DAS::CoordSystem; use Bio::EnsEMBL::ExternalData::DAS::GenomicMapper; use Bio::EnsEMBL::ExternalData::DAS::XrefPeptideMapper; use Bio::EnsEMBL::ExternalData::DAS::GenomicPeptideMapper; use Bio::EnsEMBL::ExternalData::DAS::Feature; use Bio::EnsEMBL::ExternalData::DAS::Stylesheet; use Bio::EnsEMBL::ExternalData::DAS::SourceParser qw(%GENE_COORDS %PROT_COORDS is_genomic); use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Exception qw(throw info warning); our %ORI_NUMERIC = ( 1 => 1, '+' => 1, -1 => -1, '-' => -1, 0 => 0, '.' => 0, ); # This variable determines the supported xref mapping paths. # The first level key is the xref coordinate system. # The first level value is a hashref, containing: # 'predicate': a code block to filter xref's of the relevant type # 'transformer': a code block to obtain the DAS segment ID # our %XREF_PEPTIDE_FILTERS = ( 'uniprot_peptide' => { 'predicate' => sub { $_[0]->dbname eq 'Uniprot/SPTREMBL' || $_[0]->dbname eq 'Uniprot/SWISSPROT' }, 'transformer' => sub { $_[0]->primary_id }, }, 'ipi_acc' => { 'predicate' => sub { $_[0]->dbname eq 'IPI' }, 'transformer' => sub { $_[0]->primary_id }, }, 'ipi' => { 'predicate' => sub { $_[0]->dbname eq 'IPI' }, 'transformer' => sub { $_[0]->display_id }, }, 'entrezgene_acc' => { 'predicate' => sub { $_[0]->dbname eq 'EntrezGene' }, 'transformer' => sub { $_[0]->primary_id }, }, ); our %XREF_GENE_FILTERS = ( 'hgnc' => { 'predicate' => sub { $_[0]->dbname eq 'HGNC' }, 'transformer' => sub { $_[0]->primary_id }, }, 'mgi_acc' => { 'predicate' => sub { $_[0]->dbname eq 'MGI' }, 'transformer' => sub { my $id = $_[0]->primary_id; $id =~ s/\://; $id; }, }, 'mgi' => { 'predicate' => sub { $_[0]->dbname eq 'MGI' }, 'transformer' => sub { $_[0]->display_id }, }, ); =head2 new Arg [..] : List of named arguments: -SOURCES - Arrayref of Bio::EnsEMBL::DAS::Source objects. -PROXY - A URL to use as an HTTP proxy server -NOPROXY - A list of domains/hosts to not use the proxy for -TIMEOUT - The request timeout, in seconds -GENE_COORDS - Override the coordinate system representing genes -PROT_COORDS - Override the coordinate system representing proteins Description: Constructor Returntype : Bio::EnsEMBL::DAS::Coordinator Exceptions : If unable to assign the gene and protein coordinate systems Caller : Status : =cut sub new { my $class = shift; my ($sources, $proxy, $no_proxy, $timeout, $gene_cs, $prot_cs) = rearrange(['SOURCES','PROXY', 'NOPROXY', 'TIMEOUT', 'GENE_COORDS', 'PROT_COORDS'], @_); $sources = [$sources] if ($sources && !ref $sources); my $das = Bio::Das::Lite->new(); $das->user_agent('Ensembl'); $das->timeout($timeout); $das->caching(0); $das->http_proxy($proxy); # Bio::Das::Lite support for no_proxy added around September 2008 if ($no_proxy) { if ($das->can('no_proxy')) { $das->no_proxy($no_proxy); } else { warning("Installed version of Bio::Das::Lite does not support use of 'no_proxy'"); } } $gene_cs ||= $GENE_COORDS{'ensembl_gene'} || throw('Unable to determine Gene coordinate system'); $prot_cs ||= $PROT_COORDS{'ensembl_peptide'} || throw('Unable to determine Peptide coordinate system'); my $self = { 'sources' => $sources, 'daslite' => $das, 'gene_cs' => $gene_cs, 'prot_cs' => $prot_cs, 'objects' => {}, }; bless $self, $class; return $self; } =head2 fetch_Features Arg [1] : Bio::EnsEMBL::Object $root_obj - the query object (e.g. Slice, Gene) Arg [2] : (optional) hash of filters: maxbins - the maximum available "rendering space" for features NOTE this is only passed to the server, it is not guaranteed to be honoured feature - the feature ID type - the type ID group - the group ID Description: Fetches DAS features for a given Slice, Gene or Translation Example : $hashref = $c->fetch_Features( $slice, type => 'mytype' ); Returntype : A hash reference containing Bio::...::DAS::Feature and Bio::...::DAS::Stylesheet objects: { $logic_name => { 'source' => { 'object' => $source_object, 'error' => 'Not applicable', }, 'features' => { 'X:1000,2000' => { 'error' => 'Error fetching...', 'url' => 'http://...', 'objects' => [ $feat1, $feat2 ], }, }, 'stylesheet' => { 'object' => $style1, 'error' => 'Error fetching...', }, } } Exceptions : Throws if the object is not supported Caller : Status : =cut sub fetch_Features { my ( $self, $target_obj ) = splice @_, 0, 2; my %filters = @_; # maxbins, feature, type, group # TODO: review this structure that is returned, would we prefer to split by # segment ID? We don't always know it before we parse the feature though (e.g. # when querying by feat ID). Also stylesheet errors aren't segment-specific my ( $target_cs, $target_segment, $slice, $gene, $prot ); if ( $target_obj->isa('Bio::EnsEMBL::Gene') ) { $slice = $target_obj->slice; $gene = $target_obj; $target_cs = $slice->coord_system; # actually want features relative to the slice $target_segment = $target_obj->stable_id; } elsif ( $target_obj->isa('Bio::EnsEMBL::Slice') ) { $slice = $target_obj; $target_cs = $target_obj->coord_system; $target_segment = sprintf '%s:%s,%s', $target_obj->seq_region_name, $target_obj->start, $target_obj->end; } elsif ( $target_obj->isa('Bio::EnsEMBL::Translation') ) { $prot = $target_obj; $target_cs = Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ensembl_peptide' ); $target_segment = $target_obj->stable_id; } else { throw('Unsupported object type: '.$target_obj); } my $target_species = $target_obj->adaptor->db->species; my %coords = (); my $final = {}; my %sources_with_data = (); #==========================================================# # First sort the sources into coordinate systems # #==========================================================# for my $source (@{ $self->{'sources'} }) { # Set up the data structure... $final->{$source->logic_name} = { 'source' => { 'object' => $source, }, 'features' => {}, 'stylesheet' => {}, }; my @coord_systems = @{ $self->_choose_coord_systems($target_cs, $target_obj, $source->coord_systems) }; if (! scalar @coord_systems ) { warning($source->logic_name.' has no coord systems'); $final->{$source->logic_name}{'source'}{'error'} = 'Bad source configuration'; next; } # Check the coordinate system is the correct species (if it has one) @coord_systems = grep { $_->matches_species( $target_species ) } @coord_systems; if (! scalar @coord_systems ) { $final->{$source->logic_name}{'source'}{'error'} = "Source not compatible with $target_species"; } # Query in all compatible coordinate systems for my $source_cs ( @coord_systems ) { # The coordinate system name doesn't need species in it because we have # just checked it is species-compatible - we treat them the same from now #Êon. That is, Ensembl,Gene_ID == Ensembl,Gene_ID,Homo sapiens. my $cs_key = $source_cs->name . ' ' . $source_cs->version; # Sort sources by coordinate system if ( !$coords{$cs_key} ) { # Do a lot of funky stuff to get the query segments, and build up # mappers at the same time my $segments = $self->_get_Segments( $source_cs, $target_cs, $slice, $gene, $prot); $coords{ $cs_key } = { 'sources' => {}, 'coord_system' => $source_cs, 'segments' => $segments }; } $coords{ $cs_key }{'sources'}{$source->full_url} ||= []; push @{ $coords{ $cs_key }{'sources'}{$source->full_url} }, $source; } } #==========================================================# # Parallelise the requests for each coordinate system # #==========================================================# my $daslite = $self->{'daslite'}; # Split the requests by coordinate system, i.e. parallelise # requests for segments that are from the same coordinate system while (my ($coord_key, $coord_data) = each %coords) { my @segments = @{ $coord_data->{'segments'} }; my @urls = keys %{ $coord_data->{'sources'} }; my $source_cs = $coord_data->{'coord_system'}; my $coord_name = $source_cs->name . ' ' . $source_cs->version; # Either the mapping isn't supported, or nothing maps to the region we're # interested in. if (! scalar @segments ) { info("No segments found for $coord_name"); for ( values %{ $coord_data->{'sources'} } ) { for my $source (@{ $_ }) { $final->{$source->logic_name}{'source'}{'error'} = 'Not applicable'; } } next; } info("Querying with @segments for $coord_name"); $daslite->dsn( \@urls ); my $response; my $statuses; my $maxbins = $source_cs->equals( $target_cs ) ? $filters{maxbins} : undef; #==========================================================# # Get features for all DAS sources # #==========================================================# =head ######## # If we are looking for a specific feature, try quering the server(s) for # it specifically first # if ( $filters{feature} ) { $response = $daslite->features( { 'feature_id' => $filters{feature} } ); # returns a hashref $statuses = $daslite->statuscodes(); # Find out if it worked (has to work for EVERY source) while (my ($url, $features) = each %{ $response }) { my $status = $statuses->{$url}; if ($status !~ m/^200/) { undef $response; last; } elsif (!defined $features || ref $features ne 'ARRAY' || !scalar @{ $features }) { undef $response; last; } } } =cut ######## # If this didn't work, or we are running a normal query, use the segments # if ( !$response ) { # Build a query array for each segment, with the optional filter # parameters. Note that not all DAS servers implement these filters. If # they do, great, but we still have to filter on the client side later. my @features_query = map { { 'segment' => $_, 'type' => $filters{type}, 'feature_id' => $filters{feature}, 'group_id' => $filters{group}, 'maxbins' => $maxbins, } } @segments; $response = $daslite->features( \@features_query ); # returns a hashref $statuses = $daslite->statuscodes(); } #========================================================# # Check and map the features # #========================================================# while (my ($raw_url, $features) = each %{ $response }) { # Now iterating over coordsys + url my $status = $statuses->{$raw_url}; # Parse the segment from the URL # Should be one URL for each source/query combination my ($url, $segment) = $raw_url =~ m|(.+)/features\?.*segment=([^;&]+)|; info("*** $url $segment ***"); my @sources = @{ $coord_data->{'sources'}{$url} }; for my $source ( @sources ) { $final->{$source->logic_name}{'features'}{$segment} = { 'url' => $raw_url, 'coord_system' => $source_cs, 'objects' => [], } } # DAS source generated an error if ($status !~ m/^200/) { for my $source ( @sources ) { $final->{$source->logic_name}{'features'}{$segment}{'error'} = "Error fetching features - $status"; } } # We got some features in the region of interest elsif ($features && ref $features eq 'ARRAY' && scalar @{ $features }) { ######## # Convert into the query coordinate system if applicable # $features = $self->map_Features($features, $source_cs, $target_cs, $slice, %filters); # We got something useful if (scalar @{ $features }) { for my $source ( @sources ) { # Store features: $final->{$source->logic_name}{'features'}{$segment}{'objects'} = $features; # For retrieving stylesheets: $sources_with_data{$url}{$source->logic_name} = $source; } } # Either we couldn't map the features, or nothing matched the filters else { for my $source ( @sources ) { $final->{$source->logic_name}{'features'}{$segment}{'error'} = 'No relevant features'; } } } } } #==========================================================# # Get stylesheets for the sources with data # #==========================================================# $daslite->dsn( [ keys %sources_with_data ] ); my $response = $daslite->stylesheet(); my $statuses = $daslite->statuscodes(); while (my ($url, $styledata) = each %{ $response }) { my $status = $statuses->{$url}; $url =~ s|/stylesheet\??$||; my @sources = values %{ $sources_with_data{$url} }; # DAS source generated an error if ( $status !~ m/^200/ ) { for my $source ( @sources ) { $final->{$source->logic_name}{'stylesheet'}{'error'} = "Error fetching stylesheet - $status"; } } # DAS source has stylesheet data elsif ($styledata && ref $styledata eq 'ARRAY' && scalar @{ $styledata }) { # Build stylesheet object: my $stylesheet = Bio::EnsEMBL::ExternalData::DAS::Stylesheet->new( $styledata->[0] ); for my $source ( @sources ) { $final->{$source->logic_name}{'stylesheet'}{'object'} = $stylesheet; } } } return $final; } # Returns: new arrayref with features sub map_Features { my ( $self, $features, $source_cs, $to_cs, $slice ) = splice @_, 0, 5; my %filters = @_; # feature, type, group # TODO: implement maxbins filter?? my $filter_f = $filters{feature}; my $filter_t = $filters{type}; my $filter_g = $filters{group}; # If filtering we're more likely to have a small region, so it's better to # make 4 tests when filtering and 1 when not than always make 3 tests. # The big question is, is it better to test each filter is enabled than to # just preprocess in an extra iteration? I suspect the former. my $nofilter = !$filter_f && !$filter_t && !$filter_g; # Code block to build a feature object from raw hash my $build_Feature = sub { my $f = shift; $f = Bio::EnsEMBL::ExternalData::DAS::Feature->new( $f ); # Where target coordsys is genomic, make a slice-relative feature if ($slice) { $f->slice($slice->seq_region_Slice); $f = $f->transfer($slice); } else { $f->seqname( $f->{'segment_id'} ); } return $f; }; # Code block to apply optional filters my $filter_Feature = sub { my $f = shift; # Test type first, because this is the more likely filter for large regions # where efficiency matters most if ( $filter_t ) { $f->{'type_id'} eq $filter_t || return 0; } if ( $filter_f ) { $f->{'feature_id'} eq $filter_f || return 0; } if ( $filter_g ) { return 0 unless grep { $_->{'group_id'} eq $filter_g } @{ $f->{'group'}||[] }; } return 1; }; my @new_features = (); # As part of the feature parsing we need to do some converting and filtering. # We could do this in a separate loop before doing any mapping, but this adds # an extra iteration step which is inefficient (especially for large numbers # of features). So we duplicate a bit of code. if ( $source_cs->equals( $to_cs ) || ( $slice && $source_cs->name eq 'toplevel' && $slice->is_toplevel ) ) { for my $f ( @{ $features } ) { if ( $nofilter || &$filter_Feature( $f ) ) { $f->{'strand'} = $ORI_NUMERIC{$f->{'orientation'} || '.'} || 0; # Convert to Ensembl-style (numeric) strand push @new_features, &$build_Feature( $f ); # Build object } } return \@new_features; } # May need multiple mapping steps to reach the target coordinate system # This loop works by undefining $source_cs, the redefining it when we know # which coordinate system the mapper is mapping to while ( $source_cs && !$source_cs->equals($to_cs) ) { info('Beginning mapping from '.$source_cs->name); my @this_features = @{ $features }; my $mappers = $self->{'mappers'}{$source_cs->name}{$source_cs->version||''}; my $passthrough = $self->{'passthrough'}{$source_cs->name}{$source_cs->version||''}; $features = []; my $this_cs = undef; my $positional_mapping_errors = 0; # Map the current set of features to the next coordinate system for my $f ( @this_features ) { $nofilter || &$filter_Feature( $f ) || next; my $strand = $f->{'strand'}; if (!defined $strand) { $strand = $f->{'strand'} = $ORI_NUMERIC{$f->{'orientation'} || '.'} || 0; } # It doesn't matter what coordinate system non-positional features come # from, they are always included and don't need mapping if (!$f->{'start'} && !$f->{'end'}) { push @new_features, &$build_Feature( $f ); next; } my $segid = $f->{'segment_id'}; # Check for passthrough mappings (i.e. no mapping is needed but the coord system needs to change) # This is used when mapping from toplevel if (my $pass_cs = $passthrough->{$segid}) { $this_cs = $pass_cs; # If this is the final step, convert to Ensembl Feature if ( $this_cs->equals( $to_cs ) ) { push @new_features, &$build_Feature( $f ); } else { push @{ $features }, $f; } next; } # Otherwise check there are mappings available for this segment my $mapper = $mappers->{$segid}; if (!$mapper) { $positional_mapping_errors++; next; } $this_cs = $mapper->{'to_cs'} || throw('Mapper maps to unknown coordinate system'); # Get new coordinates for this feature my @coords = $mapper->map_coordinates($segid, $f->{'start'}, $f->{'end'}, $strand, 'from'); # Create new features from the mapped coordinates for my $c ( @coords ) { $c->isa('Bio::EnsEMBL::Mapper::Coordinate') || next; my %new = %{ $f }; $new{'segment_id'} = $c->id; $new{'start' } = $c->start; $new{'end' } = $c->end; $new{'strand' } = $c->strand; # If this is the final step, convert to Ensembl Feature if ( $this_cs->equals( $to_cs ) ) { push @new_features, &$build_Feature( \%new ); } else { push @{ $features }, \%new; } } } if ($positional_mapping_errors) { warning(sprintf '%d positional features could not be mapped (%s -> %s)', $positional_mapping_errors, $source_cs ? $source_cs->name.' '.$source_cs->version : 'UNKNOWN', $this_cs ? $this_cs->name .' '.$this_cs->version : 'UNKNOWN' ); } $source_cs = $this_cs; } return \@new_features; } # Supports mappings: # location-based to location-based # location-based to protein-based # protein-based to location-based # protein-based to protein-based # gene-based to location-based # gene-based to protein-based # xref-based to location-based # xref-based to protein-based # xref-based to gene-based # # Coordinate system definitions: # location-based == chromosome|clone|contig|scaffold|supercontig etc # protein-based == $self->{prot_cs} (ensembl_peptide) # gene-based == $self->{gene_cs} (ensembl_gene) # xref-based == uniprot_peptide|entrez_gene... (see %XREF_PEPTIDE_FILTERS) sub _get_Segments { my $self = shift; my $from_cs = shift; # the "foreign" source coordinate system my $to_cs = shift; # the target coordsys that mapped objects will be converted to my ($slice, $gene, $prot) = @_; #warn sprintf "Getting mapper for %s -> %s", $from_cs->name, $to_cs->name; info(sprintf 'Building mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); my %mappers = (); my @segments = (); # There are several different Mapper implementations in the API to convert # between various coordinate systems: AssemblyMapper, TranscriptMapper, # IdentityXref. For DAS, we often need to convert across the different realms # these mappers serve, such as chromosome:NCBI35 -> peptide which requires an # intermediary NCBI35 -> NCBI36 step. Unfortunately, the different mappers all # work in different ways and have different interfaces and limitations. # # For example, AssemblyMapper and TranscriptMapper use custom methods rather # than the standard API 'map_coordinates', IdentityXref uses custom # 'external_id' and 'ensembl_id' identifiers for the regions it is mapping # between, and all name the coordinate systems differently. These differences # mean the different mappers cannot be strung together, so this module uses # wrappers in order to achieve this. # Mapping to slice-relative coordinates if ( is_genomic($to_cs) ) { # Sanity checks $slice || throw('Trying to convert to slice coordinates, but no Slice provided'); $slice->coord_system->equals($to_cs) || throw('Provided slice is not in target coordinate system'); # Mapping from a slice-based coordinate system if ( is_genomic($from_cs) || $from_cs->name eq 'toplevel' ) { # No mapping needed - the coords are identical if ( $from_cs->equals( $to_cs ) ) { info(sprintf 'No mappings needed for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); push @segments, sprintf '%s:%s,%s', $slice->seq_region_name, $slice->start, $slice->end; } # No mapping needed, but we need to indicate what the real coordsys is elsif ( $from_cs->name eq 'toplevel' && $slice->is_toplevel ) { info(sprintf 'No mappings needed for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); $self->{'passthrough'}{$from_cs->name}{$from_cs->version}{$slice->seq_region_name} = $to_cs; push @segments, sprintf '%s:%s,%s', $slice->seq_region_name, $slice->start, $slice->end; } # We can't map from toplevel, only detect when no mapping is required... elsif ( $from_cs->name eq 'toplevel' ) { warning(sprintf 'Mapping from toplevel to %s is only supported where the given %1$s slice is also toplevel', $to_cs->name); } else { # AssemblyMapperAdaptor doesn't like DAS::CoordSystem # And sometimes DAS coordinate systems have spurious versions... my $csa = $slice->adaptor->db->get_CoordSystemAdaptor; my $ama = $slice->adaptor->db->get_AssemblyMapperAdaptor; my $tmpfrom = $csa->fetch_by_name( $from_cs->name, $from_cs->version ) || $csa->fetch_by_name( $from_cs->name ); my $tmpto = $csa->fetch_by_name( $to_cs->name, $to_cs->version ) || $csa->fetch_by_name( $to_cs->name ); my $tmpmap = $tmpfrom && $tmpto ? $ama->fetch_by_CoordSystems($tmpfrom, $tmpto) : undef; # Ensembl might not support a specific genomic -> genomic mapping if ( $tmpmap ) { # Wrapper for AssemblyMapper: my $mapper = Bio::EnsEMBL::ExternalData::DAS::GenomicMapper->new( 'from', 'to', $tmpfrom, $tmpto, $tmpmap ); # Map backwards to get the query segments my @coords = $mapper->map_coordinates($slice->seq_region_name, $slice->start, $slice->end, $slice->strand, 'to'); info(sprintf 'Adding mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $c ( @coords ) { $c->isa('Bio::EnsEMBL::Mapper::Coordinate') || next; $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$c->id} ||= $mapper; push @segments, sprintf '%s:%s,%s', $c->id, $c->start, $c->end; } } else { warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name); } } } # Mapping from ensembl_gene to slice elsif ( $from_cs->equals( $self->{'gene_cs'} ) ) { my @genes = $gene ? ($gene) : @{ $slice->get_all_Genes }; info(sprintf 'Adding mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $g ( @genes ) { # Genes are already definitely relative to the target slice, so don't need to do any assembly mapping my $mapper = Bio::EnsEMBL::Mapper->new('from', 'to', $from_cs, $to_cs); $mapper->add_map_coordinates( $g->stable_id, 1, $g->length, $g->seq_region_strand, $slice->seq_region_name, $g->seq_region_start, $g->seq_region_end ); $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$g->stable_id} = $mapper; push @segments, $g->stable_id; } } # Mapping from ensembl_peptide to slice elsif ( $from_cs->equals( $self->{'prot_cs'} ) ) { my @transcripts = $gene ? @{ $gene->get_all_Transcripts } : @{ $slice->get_all_Transcripts }; info(sprintf 'Adding mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $tran ( @transcripts ) { my $p = $tran->translation || next; $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$p->stable_id} ||= Bio::EnsEMBL::ExternalData::DAS::GenomicPeptideMapper->new('from', 'to', $from_cs, $to_cs, $tran); push @segments, $p->stable_id; } } # Mapping from translation-mapped xref to slice elsif ( my $callback = $XREF_PEPTIDE_FILTERS{$from_cs->name} ) { # Mapping path is xref -> ensembl_peptide -> slice my $mid_cs = $self->{'prot_cs'}; my @transcripts = $gene ? @{ $gene->get_all_Transcripts } : @{ $slice->get_all_Transcripts }; for my $tran ( @transcripts ) { my $p = $tran->translation || next; # first stage mapper: xref to translation push @segments, @{ $self->_get_Segments($from_cs, $mid_cs, $slice, $gene, $p) }; } # If the first stage actually produced mappings, we'll need to map from # peptide to slice if ($self->{'mappers'}{$from_cs->name}{$from_cs->version}) { # second stage mapper: gene or translation to transcript's slice $self->_get_Segments($mid_cs, $to_cs, $slice, $gene, $prot); } } # Mapping from gene-mapped xref to slice elsif ( $callback = $XREF_GENE_FILTERS{$from_cs->name} ) { my @genes = $gene ? ($gene) : @{ $slice->get_all_Genes }; info(sprintf 'Adding (nonpositional) mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $g ( @genes ) { for my $xref (grep { $callback->{'predicate'}($_) } @{ $g->get_all_DBEntries() }) { my $segid = $callback->{'transformer'}( $xref ); push @segments, $segid; } # Gene-based xrefs don't have alignments and so don't generate mappings. # It is enough to simply collate the segment ID's; only non-positional # features will mapped. } } else { warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name); } } # end mapping to slice/gene # Mapping to peptide-relative coordinates elsif ( $to_cs->equals( $self->{'prot_cs'} ) ) { $prot || throw('Trying to convert to peptide coordinates, but no Translation provided'); # Mapping from protein to protein (the same) if ( $from_cs->equals( $to_cs ) ) { # no mapper needed info(sprintf 'No mappings needed for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); $self->{'passthrough'}{$from_cs->name}{$from_cs->version}{$prot->stable_id} = $to_cs; push @segments, $prot->stable_id; } # Mapping from slice. Note that from_cs isnt necessarily the same as the transcript's coord_system elsif ( is_genomic($from_cs) || $from_cs->name eq 'toplevel' ) { my $ta = $prot->adaptor->db->get_TranscriptAdaptor(); my $sa = $prot->adaptor->db->get_SliceAdaptor(); my $tran = $ta->fetch_by_translation_stable_id($prot->stable_id); $slice = $sa->fetch_by_transcript_stable_id($tran->stable_id); $tran = $tran->transfer($slice); my $tran_cs = $slice->coord_system; # second stage mapper: transcript's slice to protein my $mapper = Bio::EnsEMBL::ExternalData::DAS::GenomicPeptideMapper->new('from', 'to', $tran_cs, $to_cs, $tran); info(sprintf 'Adding mappings for %s %s -> %s %s', $tran_cs->name, $tran_cs->version, $to_cs->name, $to_cs->version); $self->{'mappers'}{$slice->coord_system->name}{$slice->coord_system->version||''}{$slice->seq_region_name} = $mapper; # first stage mapper: from_cs to transcript's slice push @segments, @{ $self->_get_Segments($from_cs, $tran->slice->coord_system, $slice) }; } # Mapping from gene on a slice with the same coordinate system elsif ( $from_cs->equals( $self->{'gene_cs'} ) ) { my $ga = $prot->adaptor->db->get_GeneAdaptor(); my $sa = $prot->adaptor->db->get_SliceAdaptor(); $gene = $ga->fetch_by_translation_stable_id($prot->stable_id); $slice = $sa->fetch_by_gene_stable_id($gene->stable_id); # Second stage mapper: slice to peptide $self->_get_Segments($slice->coord_system, $to_cs, $slice, $gene, $prot); # First stage mapper: gene to slice push @segments, @{ $self->_get_Segments($from_cs, $slice->coord_system, $slice, $gene, $prot) }; } # Mapping from xref to peptide elsif ( my $callback = $XREF_PEPTIDE_FILTERS{$from_cs->name} ) { info(sprintf 'Adding mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $xref (grep { $callback->{'predicate'}($_) } @{ $prot->get_all_DBEntries() }) { my $segid = $callback->{'transformer'}( $xref ); push @segments, $segid; # If xref has a cigar alignment, use it to build mappings to the # Ensembl translation (assume they all align to the translation). # If not, we still query with the segment because non-positional # features don't need a mapper. if ($xref->can('get_mapper')) { my $mapper = Bio::EnsEMBL::ExternalData::DAS::XrefPeptideMapper->new('from', 'to', $from_cs, $to_cs, $xref, $prot); $mapper->external_id($segid); $mapper->ensembl_id($prot->stable_id); $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$segid} = $mapper; } } } # Mapping from gene-mapped xref to peptide elsif ( $callback = $XREF_GENE_FILTERS{$from_cs->name} ) { my $ga = $prot->adaptor->db->get_GeneAdaptor(); $gene = $ga->fetch_by_translation_stable_id($prot->stable_id); info(sprintf 'Adding (nonpositional) mappings for %s %s -> %s %s', $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version); for my $xref (grep { $callback->{'predicate'}($_) } @{ $gene->get_all_DBEntries() }) { my $segid = $callback->{'transformer'}( $xref ); push @segments, $segid; # Gene-based xrefs don't have alignments and so don't generate mappings. # It is enough to simply collate the segment ID's; only non-positional # features will mapped. } } else { warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name); } } else { warning(sprintf 'Mapping to %s is not supported', $to_cs->name); } my %segments = map { $_ => 1 } @segments; return [ keys %segments ]; } sub _choose_coord_systems { my ( $self, $target_cs, $target_ob, $coord_systems ) = @_; my @best_genomic = (); my @best_gene = (); my @best_protein = (); my $csa = $target_ob->adaptor->db->get_CoordSystemAdaptor; my $ens_rank; my $ens_gene; my $ens_prot; for my $cs ( @{ $coord_systems } ) { if ( $cs->equals( $target_cs ) ) { return [ $cs ]; } if ( is_genomic($cs) || $cs->name eq 'toplevel' ) { my $tmp = $csa->fetch_by_name( $cs->name, $cs->version ) || $csa->fetch_by_name( $cs->name ) || next; if ( !defined $ens_rank || $tmp->rank < $ens_rank ) { $ens_rank = $tmp->rank; @best_genomic = ($cs); } elsif ( $tmp->rank == $ens_rank ) { push @best_genomic, $cs; } } elsif ( $GENE_COORDS{$cs->name} ) { if ( $cs->equals( $self->{'gene_cs'} ) ) { $ens_gene = 1; @best_gene = ($cs); } elsif ( !$ens_gene ) { push @best_gene, $cs; } } elsif ( $PROT_COORDS{$cs->name} ) { if ( $cs->equals( $self->{'prot_cs'} ) ) { $ens_prot = 1; @best_protein = ($cs); } elsif ( !$ens_prot ) { push @best_protein, $cs; } } } my $best = []; if ( is_genomic($target_cs) || $target_cs->name eq 'toplevel' ) { $best = @best_genomic ? \@best_genomic : @best_protein ? \@best_protein : \@best_gene; } elsif ( $GENE_COORDS{$target_cs->name} ) { $best = @best_gene ? \@best_gene : @best_genomic ? \@best_genomic : \@best_protein; } elsif ( $PROT_COORDS{$target_cs->name} ) { $best = @best_protein ? \@best_protein : @best_gene ? \@best_gene : \@best_genomic; } info('Choosen from '.scalar @{$coord_systems}.' coords: ' . join '; ', map { $_->name .' '. $_->version } @{$best}); return $best; } 1;