Coordinator documentation.

sub _get_Segments {

  my $self = shift;
  my $from_cs = shift; # the "foreign" source coordinate system 
  my $to_cs = shift;   # the target coordsys that mapped objects will be converted to
  my ($slice, $gene, $prot) = @_;
  #warn sprintf "Getting mapper for %s -> %s", $from_cs->name, $to_cs->name;
  
  info(sprintf 'Building mappings for %s %s -> %s %s',
               $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
  my %mappers = ();
  my @segments = ();
  
  # There are several different Mapper implementations in the API to convert
  # between various coordinate systems: AssemblyMapper, TranscriptMapper,
  # IdentityXref. For DAS, we often need to convert across the different realms
  # these mappers serve, such as chromosome:NCBI35 -> peptide which requires an
  # intermediary NCBI35 -> NCBI36 step. Unfortunately, the different mappers all
  # work in different ways and have different interfaces and limitations.
  #
  # For example, AssemblyMapper and TranscriptMapper use custom methods rather
  # than the standard API 'map_coordinates', IdentityXref uses custom
  # 'external_id' and 'ensembl_id' identifiers for the regions it is mapping
  # between, and all name the coordinate systems differently. These differences
  # mean the different mappers cannot be strung together, so this module uses
  # wrappers in order to achieve this.
  
  # Mapping to slice-relative coordinates
  if ( is_genomic($to_cs) ) {
    
    # Sanity checks
    $slice || throw('Trying to convert to slice coordinates, but no Slice provided');
    $slice->coord_system->equals($to_cs) || throw('Provided slice is not in target coordinate system');
    
    # Mapping from a slice-based coordinate system
    if ( is_genomic($from_cs) || $from_cs->name eq 'toplevel' ) {
      
      # No mapping needed - the coords are identical
      if ( $from_cs->equals( $to_cs ) ) {
        info(sprintf 'No mappings needed for %s %s -> %s %s',
          $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
        push @segments, sprintf '%s:%s,%s', $slice->seq_region_name, $slice->start, $slice->end;
      }
      # No mapping needed, but we need to indicate what the real coordsys is
      elsif ( $from_cs->name eq 'toplevel' && $slice->is_toplevel ) {
        info(sprintf 'No mappings needed for %s %s -> %s %s',
          $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
        $self->{'passthrough'}{$from_cs->name}{$from_cs->version}{$slice->seq_region_name} = $to_cs;
        push @segments, sprintf '%s:%s,%s', $slice->seq_region_name, $slice->start, $slice->end;
      }
      
      # We can't map from toplevel, only detect when no mapping is required...
      elsif ( $from_cs->name eq 'toplevel' ) {
        warning(sprintf 'Mapping from toplevel to %s is only supported where the given %1$s slice is also toplevel', $to_cs->name);
      }
      
      else {
        # AssemblyMapperAdaptor doesn't like DAS::CoordSystem
        # And sometimes DAS coordinate systems have spurious versions...
        my $csa = $slice->adaptor->db->get_CoordSystemAdaptor;
        my $ama = $slice->adaptor->db->get_AssemblyMapperAdaptor;
        my $tmpfrom = $csa->fetch_by_name( $from_cs->name, $from_cs->version ) || $csa->fetch_by_name( $from_cs->name );
        my $tmpto   = $csa->fetch_by_name( $to_cs->name,   $to_cs->version   ) || $csa->fetch_by_name( $to_cs->name   );
        my $tmpmap  = $tmpfrom && $tmpto ? $ama->fetch_by_CoordSystems($tmpfrom, $tmpto) : undef;
        
        # Ensembl might not support a specific genomic -> genomic mapping
        if ( $tmpmap ) {
          
          # Wrapper for AssemblyMapper:
          my $mapper = Bio::EnsEMBL::ExternalData::DAS::GenomicMapper->new(
            'from', 'to', $tmpfrom, $tmpto, $tmpmap
          );
          
          # Map backwards to get the query segments
          my @coords = $mapper->map_coordinates($slice->seq_region_name,
                                                $slice->start,
                                                $slice->end,
                                                $slice->strand,
                                                'to');
          
          info(sprintf 'Adding mappings for %s %s -> %s %s',
            $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
          for my $c ( @coords ) {
            $c->isa('Bio::EnsEMBL::Mapper::Coordinate') || next;
            $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$c->id} ||= $mapper;
            push @segments, sprintf '%s:%s,%s', $c->id, $c->start, $c->end;
          }
          
        } else {
          warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name);
        }
      }
    }
    
    # Mapping from ensembl_gene to slice
    elsif ( $from_cs->equals( $self->{'gene_cs'} ) ) {
      
      my @genes = $gene ? ($gene)
                        : @{ $slice->get_all_Genes };
      
      info(sprintf 'Adding mappings for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      for my $g ( @genes ) {
        # Genes are already definitely relative to the target slice, so don't need to do any assembly mapping
        my $mapper = Bio::EnsEMBL::Mapper->new('from', 'to', $from_cs, $to_cs);
        $mapper->add_map_coordinates(
          $g->stable_id,           1,                    $g->length, $g->seq_region_strand,
          $slice->seq_region_name, $g->seq_region_start, $g->seq_region_end
        );
        $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$g->stable_id} = $mapper;
        push @segments, $g->stable_id;
      }
    }
    
    # Mapping from ensembl_peptide to slice
    elsif ( $from_cs->equals( $self->{'prot_cs'} ) ) {
      
      my @transcripts = $gene ? @{ $gene->get_all_Transcripts }
                              : @{ $slice->get_all_Transcripts };
      
      info(sprintf 'Adding mappings for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      for my $tran ( @transcripts ) {
        my $p = $tran->translation || next;
        $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$p->stable_id} ||= Bio::EnsEMBL::ExternalData::DAS::GenomicPeptideMapper->new('from', 'to', $from_cs, $to_cs, $tran);
        push @segments, $p->stable_id;
      }
    }
    
    # Mapping from translation-mapped xref to slice
    elsif ( my $callback = $XREF_PEPTIDE_FILTERS{$from_cs->name} ) {
      # Mapping path is xref -> ensembl_peptide -> slice
      my $mid_cs = $self->{'prot_cs'};
      
      my @transcripts = $gene ? @{ $gene->get_all_Transcripts }
                              : @{ $slice->get_all_Transcripts };
      
      for my $tran ( @transcripts ) {
        my $p = $tran->translation || next;
        # first stage mapper: xref to translation
        push @segments, @{ $self->_get_Segments($from_cs, $mid_cs, $slice, $gene, $p) };
      }
      # If the first stage actually produced mappings, we'll need to map from
      # peptide to slice
      if ($self->{'mappers'}{$from_cs->name}{$from_cs->version}) {
        # second stage mapper: gene or translation to transcript's slice
        $self->_get_Segments($mid_cs, $to_cs, $slice, $gene, $prot);
      }
    }
    
    # Mapping from gene-mapped xref to slice
    elsif ( $callback = $XREF_GENE_FILTERS{$from_cs->name} ) {
      
      my @genes = $gene ? ($gene)
                        : @{ $slice->get_all_Genes };
      
      info(sprintf 'Adding (nonpositional) mappings for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      for my $g ( @genes ) {
        for my $xref (grep { $callback->{'predicate'}($_) } @{ $g->get_all_DBEntries() }) {
          my $segid = $callback->{'transformer'}( $xref );
          push @segments, $segid;
        }
        # Gene-based xrefs don't have alignments and so don't generate mappings.
        # It is enough to simply collate the segment ID's; only non-positional
        # features will mapped.
      }
    }
    
    else {
      warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name);
    }
  } # end mapping to slice/gene
  
  # Mapping to peptide-relative coordinates
  elsif ( $to_cs->equals( $self->{'prot_cs'} ) ) {
    
    $prot || throw('Trying to convert to peptide coordinates, but no Translation provided');
    
    # Mapping from protein to protein (the same)
    if ( $from_cs->equals( $to_cs ) ) {
      # no mapper needed
      info(sprintf 'No mappings needed for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      $self->{'passthrough'}{$from_cs->name}{$from_cs->version}{$prot->stable_id} = $to_cs;
      push @segments, $prot->stable_id;
    }
    
    # Mapping from slice. Note that from_cs isnt necessarily the same as the transcript's coord_system
    elsif ( is_genomic($from_cs) || $from_cs->name eq 'toplevel' ) {
      my $ta    = $prot->adaptor->db->get_TranscriptAdaptor();
      my $sa    = $prot->adaptor->db->get_SliceAdaptor();
      my $tran  = $ta->fetch_by_translation_stable_id($prot->stable_id);
      $slice = $sa->fetch_by_transcript_stable_id($tran->stable_id);
      $tran = $tran->transfer($slice);
      my $tran_cs = $slice->coord_system;
      # second stage mapper: transcript's slice to protein
      my $mapper = Bio::EnsEMBL::ExternalData::DAS::GenomicPeptideMapper->new('from', 'to', $tran_cs, $to_cs, $tran);
      
      info(sprintf 'Adding mappings for %s %s -> %s %s',
        $tran_cs->name, $tran_cs->version, $to_cs->name, $to_cs->version);
      $self->{'mappers'}{$slice->coord_system->name}{$slice->coord_system->version||''}{$slice->seq_region_name} = $mapper;
      # first stage mapper: from_cs to transcript's slice
      push @segments, @{ $self->_get_Segments($from_cs, $tran->slice->coord_system, $slice) };
    }
    
    # Mapping from gene on a slice with the same coordinate system
    elsif ( $from_cs->equals( $self->{'gene_cs'} ) ) {
      my $ga  = $prot->adaptor->db->get_GeneAdaptor();
      my $sa  = $prot->adaptor->db->get_SliceAdaptor();
      $gene   = $ga->fetch_by_translation_stable_id($prot->stable_id);
      $slice  = $sa->fetch_by_gene_stable_id($gene->stable_id);
      # Second stage mapper: slice to peptide
      $self->_get_Segments($slice->coord_system, $to_cs, $slice, $gene, $prot);
      # First stage mapper: gene to slice
      push @segments, @{ $self->_get_Segments($from_cs, $slice->coord_system, $slice, $gene, $prot) };
    }
    
    # Mapping from xref to peptide
    elsif ( my $callback = $XREF_PEPTIDE_FILTERS{$from_cs->name} ) {
      info(sprintf 'Adding mappings for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      for my $xref (grep { $callback->{'predicate'}($_) } @{ $prot->get_all_DBEntries() }) {
        my $segid = $callback->{'transformer'}( $xref );
        push @segments, $segid;
        # If xref has a cigar alignment, use it to build mappings to the
        # Ensembl translation (assume they all align to the translation).
        # If not, we still query with the segment because non-positional
        # features don't need a mapper.
        if ($xref->can('get_mapper')) {
          my $mapper = Bio::EnsEMBL::ExternalData::DAS::XrefPeptideMapper->new('from', 'to', $from_cs, $to_cs, $xref, $prot);
          $mapper->external_id($segid);
          $mapper->ensembl_id($prot->stable_id);
          $self->{'mappers'}{$from_cs->name}{$from_cs->version}{$segid} = $mapper;
        }
      }
    }
    
    # Mapping from gene-mapped xref to peptide
    elsif ( $callback = $XREF_GENE_FILTERS{$from_cs->name} ) {
      my $ga = $prot->adaptor->db->get_GeneAdaptor();
      $gene  = $ga->fetch_by_translation_stable_id($prot->stable_id);
      info(sprintf 'Adding (nonpositional) mappings for %s %s -> %s %s',
        $from_cs->name, $from_cs->version, $to_cs->name, $to_cs->version);
      for my $xref (grep { $callback->{'predicate'}($_) } @{ $gene->get_all_DBEntries() }) {
        my $segid = $callback->{'transformer'}( $xref );
        push @segments, $segid;
        # Gene-based xrefs don't have alignments and so don't generate mappings.
        # It is enough to simply collate the segment ID's; only non-positional
        # features will mapped.
      }
    }
    
    else {
      warning(sprintf 'Mapping from %s to %s is not supported', $from_cs->name, $to_cs->name);
    }
  }
  
  else {
    warning(sprintf 'Mapping to %s is not supported', $to_cs->name);
  }
  
  my %segments = map { $_ => 1 } @segments;
  return [ keys %segments ];

}

sub fetch_Features {

  my ( $self, $target_obj ) = splice @_, 0, 2;
  my %filters = @_; # maxbins, feature, type, group
  
  # TODO: review this structure that is returned, would we prefer to split by
  # segment ID? We don't always know it before we parse the feature though (e.g.
  # when querying by feat ID). Also stylesheet errors aren't segment-specific
  
  my ( $target_cs, $target_segment, $slice, $gene, $prot );
  if ( $target_obj->isa('Bio::EnsEMBL::Gene') ) {
    $slice = $target_obj->slice;
    $gene = $target_obj;
    $target_cs = $slice->coord_system; # actually want features relative to the slice
    $target_segment = $target_obj->stable_id;
  } elsif ( $target_obj->isa('Bio::EnsEMBL::Slice') ) {
    $slice = $target_obj;
    $target_cs = $target_obj->coord_system;
    $target_segment = sprintf '%s:%s,%s', $target_obj->seq_region_name, $target_obj->start, $target_obj->end;
  } elsif ( $target_obj->isa('Bio::EnsEMBL::Translation') ) {
    $prot = $target_obj;
    $target_cs = Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ensembl_peptide' );
    $target_segment = $target_obj->stable_id;
  } else {
    throw('Unsupported object type: '.$target_obj);
  }
  my $target_species = $target_obj->adaptor->db->species;
  
  my %coords = ();
  my $final  = {};
  my %sources_with_data = ();
  
  #==========================================================#
  #      First sort the sources into coordinate systems      #
  #==========================================================#
  
  for my $source (@{ $self->{'sources'} }) {
    
    # Set up the data structure...
    $final->{$source->logic_name} = {
                                     'source'     => {
                                                      'object' => $source,
                                                     },
                                     'features'   => {},
                                     'stylesheet' => {},
                                    };
    
    my @coord_systems = @{ $self->_choose_coord_systems($target_cs, $target_obj, $source->coord_systems) };
    
    if (! scalar @coord_systems ) {
      warning($source->logic_name.' has no coord systems');
      $final->{$source->logic_name}{'source'}{'error'}
        = 'Bad source configuration';
      next;
    }
    
    # Check the coordinate system is the correct species (if it has one)
    @coord_systems = grep {
      $_->matches_species( $target_species )
    } @coord_systems;
    
    if (! scalar @coord_systems ) {
      $final->{$source->logic_name}{'source'}{'error'}
        = "Source not compatible with $target_species";
    }
    
    # Query in all compatible coordinate systems
    for my $source_cs ( @coord_systems ) {
      
      # The coordinate system name doesn't need species in it because we have
      # just checked it is species-compatible - we treat them the same from now
      #�on. That is, Ensembl,Gene_ID == Ensembl,Gene_ID,Homo sapiens.
      my $cs_key = $source_cs->name . ' ' . $source_cs->version;
      
      # Sort sources by coordinate system
      if ( !$coords{$cs_key} ) {
        # Do a lot of funky stuff to get the query segments, and build up
        # mappers at the same time
        my $segments = $self->_get_Segments( $source_cs, $target_cs,
                                             $slice, $gene, $prot);
        
        $coords{ $cs_key } = { 'sources'      => {},
                               'coord_system' => $source_cs,
                               'segments'     => $segments   };
      }
      
      $coords{ $cs_key }{'sources'}{$source->full_url} ||= [];
      push @{ $coords{ $cs_key }{'sources'}{$source->full_url} }, $source;
    }
  }
  
  #==========================================================#
  #   Parallelise the requests for each coordinate system    #
  #==========================================================#
  
  my $daslite = $self->{'daslite'};
  
  # Split the requests by coordinate system, i.e. parallelise
  # requests for segments that are from the same coordinate system
  while (my ($coord_key, $coord_data) = each %coords) {
    my @segments   = @{ $coord_data->{'segments'} };
    my @urls       = keys %{ $coord_data->{'sources'} };
    my $source_cs  = $coord_data->{'coord_system'};
    my $coord_name = $source_cs->name . ' ' . $source_cs->version;
    
    # Either the mapping isn't supported, or nothing maps to the region we're
    # interested in.
    if (! scalar @segments ) {
      info("No segments found for $coord_name");
      for ( values %{ $coord_data->{'sources'} } ) {
        for my $source (@{ $_ }) {
          $final->{$source->logic_name}{'source'}{'error'} = 'Not applicable';
        }
      }
      next;
    }
    
    info("Querying with @segments for $coord_name");
    $daslite->dsn(\@ urls );
    
    my $response;
    my $statuses;
    
    my $maxbins = $source_cs->equals( $target_cs ) ? $filters{maxbins} : undef;
    
    #==========================================================#
    #             Get features for all DAS sources             #
    #==========================================================#

}

sub map_Features {

  my ( $self, $features, $source_cs, $to_cs, $slice ) = splice @_, 0, 5;
  my %filters = @_; # feature, type, group
  
  # TODO: implement maxbins filter??
  my $filter_f = $filters{feature};
  my $filter_t = $filters{type};
  my $filter_g = $filters{group};
  # If filtering we're more likely to have a small region, so it's better to
  # make 4 tests when filtering and 1 when not than always make 3 tests.
  # The big question is, is it better to test each filter is enabled than to
  # just preprocess in an extra iteration? I suspect the former.
  my $nofilter = !$filter_f && !$filter_t && !$filter_g;
  
  # Code block to build a feature object from raw hash
  my $build_Feature = sub {
    my $f = shift;
    $f = Bio::EnsEMBL::ExternalData::DAS::Feature->new( $f );
    # Where target coordsys is genomic, make a slice-relative feature
    if ($slice) {
      $f->slice($slice->seq_region_Slice);
      $f = $f->transfer($slice);
    } else {
      $f->seqname( $f->{'segment_id'} );
    }
    return $f;
  };
  
  # Code block to apply optional filters
  my $filter_Feature = sub {
    my $f = shift;
    # Test type first, because this is the more likely filter for large regions
    # where efficiency matters most
    if ( $filter_t ) {
      $f->{'type_id'} eq $filter_t || return 0;
    }
    if ( $filter_f ) {
      $f->{'feature_id'} eq $filter_f || return 0;
    }
    if ( $filter_g ) {
      return 0 unless grep { $_->{'group_id'} eq $filter_g } @{ $f->{'group'}||[] };
    }
    return 1;
  };
  
  my @new_features = ();
  
  # As part of the feature parsing we need to do some converting and filtering.
  # We could do this in a separate loop before doing any mapping, but this adds
  # an extra iteration step which is inefficient (especially for large numbers
  # of features). So we duplicate a bit of code.
  
  if ( $source_cs->equals( $to_cs ) || ( $slice && $source_cs->name eq 'toplevel' && $slice->is_toplevel ) ) {
    
    for my $f ( @{ $features } ) {
      if ( $nofilter || &$filter_Feature( $f ) ) {
        $f->{'strand'} = $ORI_NUMERIC{$f->{'orientation'} || '.'} || 0; # Convert to Ensembl-style (numeric) strand
        push @new_features, &$build_Feature( $f ); # Build object
      }
    }
    
    return\@ new_features;
  }
  
  # May need multiple mapping steps to reach the target coordinate system
  # This loop works by undefining $source_cs, the redefining it when we know
  # which coordinate system the mapper is mapping to
  while ( $source_cs && !$source_cs->equals($to_cs) ) {
    
    info('Beginning mapping from '.$source_cs->name);
    
    my @this_features = @{ $features };
    my $mappers = $self->{'mappers'}{$source_cs->name}{$source_cs->version||''};
    my $passthrough = $self->{'passthrough'}{$source_cs->name}{$source_cs->version||''};
    $features  = [];
    my $this_cs = undef;
    
    my $positional_mapping_errors = 0;
    
    # Map the current set of features to the next coordinate system
    for my $f ( @this_features ) {
      
      $nofilter || &$filter_Feature( $f ) || next;
      
      my $strand = $f->{'strand'};
      if (!defined $strand) {
        $strand = $f->{'strand'} = $ORI_NUMERIC{$f->{'orientation'} || '.'} || 0;
      }
      
      # It doesn't matter what coordinate system non-positional features come
      # from, they are always included and don't need mapping
      if (!$f->{'start'} && !$f->{'end'}) {
        push @new_features, &$build_Feature( $f );
        next;
      }
      
      my $segid  = $f->{'segment_id'};
      
      # Check for passthrough mappings (i.e. no mapping is needed but the coord system needs to change)
      # This is used when mapping from toplevel
      if (my $pass_cs = $passthrough->{$segid}) {
        $this_cs = $pass_cs;
        # If this is the final step, convert to Ensembl Feature
        if ( $this_cs->equals( $to_cs ) ) {
          push @new_features, &$build_Feature( $f );
        }
        else {
          push @{ $features }, $f;
        }
        next;
      }
      
      # Otherwise check there are mappings available for this segment
      my $mapper = $mappers->{$segid};
      if (!$mapper) {
        $positional_mapping_errors++;
        next;
      }
      $this_cs = $mapper->{'to_cs'} || throw('Mapper maps to unknown coordinate system');
      
      # Get new coordinates for this feature
      my @coords = $mapper->map_coordinates($segid,
                                            $f->{'start'},
                                            $f->{'end'},
                                            $strand,
                                            'from');
      
      # Create new features from the mapped coordinates
      for my $c ( @coords ) {
        $c->isa('Bio::EnsEMBL::Mapper::Coordinate') || next;
        my %new = %{ $f };
        $new{'segment_id'} = $c->id;
        $new{'start'     } = $c->start;
        $new{'end'       } = $c->end;
        $new{'strand'    } = $c->strand;
        
        # If this is the final step, convert to Ensembl Feature
        if ( $this_cs->equals( $to_cs ) ) {
          push @new_features, &$build_Feature(\% new );
        }
        else {
          push @{ $features },\% new;
        }
      }
      
    }
    
    if ($positional_mapping_errors) {
      warning(sprintf '%d positional features could not be mapped (%s -> %s)',
        $positional_mapping_errors,
        $source_cs ? $source_cs->name.' '.$source_cs->version : 'UNKNOWN',
        $this_cs   ? $this_cs->name  .' '.$this_cs->version   : 'UNKNOWN'
      );
    }
    
    $source_cs = $this_cs;
  }
  
  return\@ new_features;
}

# Supports mappings:
#   location-based to location-based
#   location-based to protein-based
#   protein-based to location-based
#   protein-based to protein-based
#   gene-based to location-based
#   gene-based to protein-based
#   xref-based to location-based
#   xref-based to protein-based
#   xref-based to gene-based
#
# Coordinate system definitions:
#   location-based  == chromosome|clone|contig|scaffold|supercontig etc
#   protein-based   == $self->{prot_cs} (ensembl_peptide)
#   gene-based      == $self->{gene_cs} (ensembl_gene)
#   xref-based      == uniprot_peptide|entrez_gene... (see %XREF_PEPTIDE_FILTERS)

}

_choose_coord_systems	No description	Code
_get_Segments	No description	Code
fetch_Features	Description	Code
map_Features	No description	Code
new	Description	Code