Bio::EnsEMBL::Analysis::RunnableDB ProbeAlign
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign;
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::ProbeAlign
Bio::EnsEMBL::Analysis::Runnable::ExonerateProbe
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::DBEntry
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor
Bio::EnsEMBL::UnmappedObject
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Inherit
Bio::EnsEMBL::Analysis::RunnableDB
Synopsis
  my $affy = Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign->new
(-db => $refdb,
-analysis => $analysis_obj,
-database => $EST_GENOMIC,
-query_seqs => \@sequences,
);
$affy->fetch_input(); $affy->run(); $affy->write_output(); #writes to DB
Description
This object maps probes to a genomic and transcript sequence, writing the results as
Bio::EnsEMBL::Funcgen::ProbeFeatures. UnmappedObjects are also written for those probes
which either do not map at all or exceed the maximum mapping threshold defined by HIT_SATURATION_LEVEL.
You must FIRST have created and imported all the necessary Arrays and Probe objects using the ImportArrays module.
Methods
DNADB
No description
Code
HIT_SATURATION_LEVEL
No description
Code
IIDREGEXP
No description
Code
MAX_MISMATCHES
No description
Code
OPTIONS
No description
Code
OUTDB
No description
Code
QUERYSEQS
No description
Code
QUERYTYPE
No description
Code
TARGETSEQS
No description
Code
arrays
No description
Code
features
No description
Code
fetch_input
No description
Code
filter_features
No description
Code
get_display_name_by_stable_idDescriptionCode
mapping_type
No description
Code
new
No description
Code
outdb
No description
Code
probes
No description
Code
query_file
No description
Code
read_and_check_config
No description
Code
run
No description
Code
set_probe_and_slice
No description
Code
unmapped_objects
No description
Code
write_output
No description
Code
Methods description
get_display_name_by_stable_idcode    nextTop
  Args [0]   : stable ID from core DB.
Args [1] : stable feature type e.g. gene, transcript, translation
Example : $self->validate_and_store_feature_types;
Description: Builds a cache of stable ID to display names.
Returntype : string - display name
Exceptions : Throws is type is not valid.
Caller : General
Status : At risk
Methods code
DNADBdescriptionprevnextTop
sub DNADB {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_DNADB'} = $value;
  }

  if ( exists( $self->{'_CONFIG_DNADB'} ) ) {
    return $self->{'_CONFIG_DNADB'};
  } else {
    return undef;
  }
}
HIT_SATURATION_LEVELdescriptionprevnextTop
sub HIT_SATURATION_LEVEL {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{'_CONFIG_HIT_SATURATION_LEVEL'} = $val;
  }

  if (exists $self->{'_CONFIG_HIT_SATURATION_LEVEL'}) {
    return $self->{'_CONFIG_HIT_SATURATION_LEVEL'};
  } else {
    # default to 100
# This is not used for ProbeTranscriptAlign
return 100; } } ###############################################
### end of config
###############################################
1;
}
IIDREGEXPdescriptionprevnextTop
sub IIDREGEXP {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_IIDREGEXP'} = $value;
  }

  if ( exists( $self->{'_CONFIG_IIDREGEXP'} ) ) {
    return $self->{'_CONFIG_IIDREGEXP'};
  } else {
    return undef;
  }
}
MAX_MISMATCHESdescriptionprevnextTop
sub MAX_MISMATCHES {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_MAX_MISMATCHES'} = $value;
  }

  if ( exists( $self->{'_CONFIG_MAX_MISMATCHES'} ) ) {
    return $self->{'_CONFIG_MAX_MISMATCHES'};
  } else {
    return undef;
  }
}
OPTIONSdescriptionprevnextTop
sub OPTIONS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_OPTIONS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_OPTIONS'} ) ) {
    return $self->{'_CONFIG_OPTIONS'};
  } else {
    return undef;
  }
}
OUTDBdescriptionprevnextTop
sub OUTDB {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_OUTDB'} = $value;
  }

  if ( exists( $self->{'_CONFIG_OUTDB'} ) ) {
    return $self->{'_CONFIG_OUTDB'};
  } else {
    return undef;
  }
}
QUERYSEQSdescriptionprevnextTop
sub QUERYSEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_QUERYSEQS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) {
    return $self->{'_CONFIG_QUERYSEQS'};
  } else {
    return undef;
  }
}
QUERYTYPEdescriptionprevnextTop
sub QUERYTYPE {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_QUERYTYPE'} = $value;
  }

  if ( exists( $self->{'_CONFIG_QUERYTYPE'} ) ) {
    return $self->{'_CONFIG_QUERYTYPE'};
  } else {
    return undef;
  }
}
TARGETSEQSdescriptionprevnextTop
sub TARGETSEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_TARGETSEQS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_TARGETSEQS'} ) ) {
    return $self->{'_CONFIG_TARGETSEQS'};
  } else {
    return undef;
  }
}
arraysdescriptionprevnextTop
sub arrays {
  my ( $self, $value ) = @_;

  $self->{'_arrays'} = $value if defined $value;
  
  return $self->{'_arrays'};
  
}

#############################################################
}
featuresdescriptionprevnextTop
sub features {
  my ( $self, $value ) = @_;

  $self->{'_features'} = $value  if defined $value;
  return $self->{'_features'};#do we need to test and return undef?
} #############################################################
# Declare and set up config variables
#############################################################
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my ($self) = @_;


  ##########################################
# set up the target (genome)
##########################################
my $target = $self->TARGETSEQS; if ( -e $target ){ if(-d $target ) { warn ("Target $target is a directory of files\n"); }elsif (-s $target){ warn ("Target $target is a whole-genome file\n"); }else{ throw("'$target' isn't a file or a directory?"); } } else { throw("'$target' could not be found"); } ##########################################
# set up the query probe seq
##########################################
my ($query_file, $chunk_number, $chunk_total); my $query = $self->QUERYSEQS; if ( -e $query and -s $query ) { # query seqs is a single file; input id will correspond to a chunk number
$query_file = $query; my $iid_regexp = $self->IIDREGEXP; if (not defined $iid_regexp){ throw("You must define IIDREGEXP in config to enable inference of chunk number and total from your single fasta file" ) } ( $chunk_number, $chunk_total ) = $self->input_id =~ /$iid_regexp/; if(!$chunk_number || !$chunk_total){ throw "I can't make sense of your input id using the IIDREGEXP in the config!\n"; } #store this for reference later
$self->query_file($query_file); } else { throw("'$query' must refer to a single fasta file with all probe sequences referenced by affy_probe_id\n"); } ##########################################
# setup the runnable
##########################################
my %parameters = %{ $self->parameters_hash }; #Parameter hash is picked up from the DB analysis table
#else the config -options value is used
if (not exists $parameters{-options}) { if ($self->OPTIONS) { $parameters{-options} = $self->OPTIONS; } else { throw('You have not options stored in the DB or accessible from the analysis config hash for '.$self->analysis->logic_name); #$parameters{-options} = "";
} } print STDERR "PROGRAM FILE: ".$self->analysis->program_file."\n"; my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateProbe->new ( -program => $self->analysis->program_file, -analysis => $self->analysis, -target_file => $target, -query_type => $self->QUERYTYPE,#dna
-query_file => $query_file, -query_chunk_number => $chunk_number, -query_chunk_total => $chunk_total, -max_mismatches => $self->MAX_MISMATCHES, -mapping_type => $self->mapping_type, #-filter_method => $self->FILTER_METHOD,
%parameters, ); $self->runnable($runnable); } ############################################################
#Overrides RunnableDB::run
}
filter_featuresdescriptionprevnextTop
sub filter_features {
  my ($self, $features) = @_;

  my (%hits_by_probe, @kept_hits);
  my $analysis     = $self->analysis;
  my $logic_name   = $analysis->logic_name;
  my $mapping_type = $self->mapping_type;
  my $max_hits     = $self->HIT_SATURATION_LEVEL;#default is 100
my $uo_adaptor = $self->outdb->get_UnmappedObjectAdaptor; #We don't really want one for every logic name,
#so let's simplify this to just ProbeAlign and ProbeTranscriptAlign
my $uo_type; ($uo_type = $logic_name) =~ s/^.*_//; #This is tricky
#We may map something >100 times to the genome but only a few times to transcripts.
#Do we need a clean up step which removes all transcript mapping if we have an
#UnmappedObject due to exceeding the genomic hit saturation???
#Or keep the transcript mapping and warn in the annotation
#Do we need a HIT_SATURATION_LEVEL explcitly for transcripts?
#I think yes and yes, yes we keep and warn and yes we need a more stringent saturation level,
#probably needs to be number of genes mapped rather than simply number of mappings as we will always
#get multiple mappings across alt trans
#In fact this is why we ignored it for transcript mapping.
#But we are throwing away non-gapped alignments for transcripts!
#So it is very unlikely that we will ever reach this level!
#We are also not throwing ungapped away before we do this test
#So must not filter here as we may throw away a probe which matches many times ungapped
#but only once gapped...but we're stil matching transcripts rather than the whole genome
#here so this is still valid.
foreach my $hit (@$features) { #we need to capture first an dlast transcript ID's to
#enable cleaning of potential duplicates
push @{$hits_by_probe{$hit->probe_id}}, $hit; } foreach my $probe_id (keys %hits_by_probe) { my @hits = @{$hits_by_probe{$probe_id}}; my $num_hits = scalar(@hits); my $all_hits = $num_hits; if ($num_hits > $max_hits) { @hits = grep { $_->mismatchcount == 0 } @hits; $num_hits = scalar(@hits); if ($num_hits <= $max_hits) { warn "Keeping only perfect matches($num_hits/$all_hits) to $probe_id. Do we need to keep this in an UnmappedObject?\n"; } else { warn "UnmappedObject:\tToo many hits($num_hits|$all_hits/$max_hits) to $probe_id so rejecting all hits\n"; #So we have issues with what we consider for transcript mapping.
#If it has failed genomic mapping then we currently do not consider if for xrefing.
#But it may only map to one transcript, but all over the genome
#We don't store the genomic mappings, but maybe we should store the transcript mapping which we can then warn against using the unmapped object info from the genomic mapping.
#Okay, so now we don't have interdepedancy of the mapping steps
#We need to grab the external_db_id here if there is only one
#Leave blank if this exists in multiple DBs e.g. affy 3' UTR designs
#This assumes that the same ID in two different DBs are the same entity
#This is true for affy but maybe not for other arrays
#The simple way to do this is to pull back the array names using the probe_id and direct sql
#This means yet another call for each unmapped probe
#push @{$self->unmapped_objects},
$uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new ( -type => $uo_type,#Currently get's set to NULL as can only have xref or probe2transcript
-analysis => $analysis, -ensembl_id => $probe_id, -ensembl_object_type => 'Probe', -external_db_id => $self->{'_external_db_id'}, -identifier => $mapping_type, -summary => 'Promiscuous probe', -full_desc => "Probe exceeded maximum allowed number of $mapping_type mappings(${num_hits}/${max_hits})" )); $num_hits = 0; } } elsif($num_hits == 0){#Both ProbeAlign and ProbeTranscriptAlign
#Very unlikely, and probably only ProbeTranscriptAlign
#push @{$self->unmapped_objects},
#We want to an external_db_id here for if this is Trancsript
#What about genomic mapping?
#Yes we need to reimplement species DB?
warn "UnmappedObject:\t$probe_id has no $mapping_type mappings\n"; $uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new ( -type => $uo_type,#Currently get's set to NULL as can only have xref or probe2transcript
-analysis => $analysis, -ensembl_id => $probe_id, -ensembl_object_type => 'Probe', -external_db_id => $self->{'_external_db_id'}, -identifier => $mapping_type, -summary => 'Unmapped probe', -full_desc => "Probe has no $mapping_type mappings", )); } if($num_hits) { push @kept_hits, @hits; } } return\@ kept_hits; } ############################################################
}
get_display_name_by_stable_iddescriptionprevnextTop
sub get_display_name_by_stable_id {
  my ($self, $stable_id, $type) = @_;

  $type = lc($type);

  if($type !~ /(gene|transcript|translation)/){
	throw("Cannot get display_name for stable_id $stable_id with type $type");
  }
  
  if(! exists $self->{'display_name_cache'}->{$stable_id}){
	#warn "Generating $type display_name cache\n";
($self->{'display_name_cache'}->{$stable_id}) = $self->outdb->dnadb->dbc->db_handle->selectrow_array("SELECT x.display_label FROM ${type}_stable_id s, $type t, xref x where t.display_xref_id=x.xref_id and s.${type}_id=t.gene_id and s.stable_id='${stable_id}'"); } return $self->{'display_name_cache'}->{$stable_id};
}
mapping_typedescriptionprevnextTop
sub mapping_type {
  return $_[0]->{'mapping_type'};
}
newdescriptionprevnextTop
sub new {
  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);
  
  

  #Because we dont know whether the sort of adapter (compara, hive, core)
#we'll be passed, it's better to just remake the core dbadaptor and
#keep it on ourselves as the dna db - this has to be the intent of the
#dataadaptor input through the rulemanager / beekeeper / whatever
$self->read_and_check_config; #Set up and check DBs before we do the mapping
$self->outdb->dbc->db_handle; $self->outdb->dnadb->dbc->db_handle; $self->arrays({}); $self->probes({}); #$self->features({});#??
#$self->{'_unmapped_objects'} = [];
my $logic = $self->analysis->logic_name; my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb); my $species = $self->outdb->species; throw('Must provide a -species to the OUTDB in Bio::EnsEMBL::Analysis::Config::ProbeAlign') if ! $species; my ($db_name, $display_name); if($logic =~ /Transcript/){ $self->{'mapping_type'} = 'transcript'; #We need to check for ensembl_Core_Transcript external db here
#This should also be linked to the release
#Should we store here? May over-write each other?
#No problem if it fails once as the pipeline will resubmit the job
#Will this cause problems with maintaining a master external_dbs file
#As this may will grow *3 for every release.
#But once we allow GENE, TRANSCRIPT, PROTEIN in xref.info_type
#Then this would only be one extra db per release
#Would be DBEntry::ADaptor->get_all_external_dbs_by_name
#This is tricky as we may be aligning against an old DB
#i.e. if there was a new array but not a new build
#Current DBEntry queries are naive to external_db version
$db_name = $species.'_core_Transcript'; $display_name = 'EnsemblTranscript'; } else{ $self->{'mapping_type'} = 'genomic'; $db_name = $species.'_core_Genome'; $display_name = 'EnsemblGenome'; #This is really just a place holder for UnmappedObjects against the entire genome
#Should we redo this as EnsemblSpecies and actually have xref entries for each species name?
} my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$schema_build'"; my ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); if(! $extdb_id){ warn 'No external_db found for '.$self->{'mapping_type'}." mapping, inserting $db_name $schema_build"; #status is dubious here as this should really be on object_xref
my $insert_sql = 'INSERT into external_db(db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type)'. " values('$db_name', '$schema_build', 'KNOWNXREF', 1, 5, '$display_name', 'MISC')"; $self->outdb->db_handle->do($insert_sql); ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); #Now test again just to make sure the db_name fits!
($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); if(! $extdb_id){ throw("Failed to store external_db properly:\t$db_name $schema_build"); } } $self->{'_external_db_id'} = $extdb_id; return $self;
}
outdbdescriptionprevnextTop
sub outdb {
  my ($self) = @_;

  my $outdb;
  my $dnadb;

  #Do we need to alter this???????????????????????????????????????????????????????????????????????????????????????????????????
#Look at ImportArrays
#There is method duplication here which we could move to a ProbeDB.pm?
if(! defined $self->{'efg_db'}){ #This DNADB testing is a work around to avoid having to edit Config::ProbeAlign
my $dnadb; #not defined as an empty env var will give the defined null string
if($self->DNADB->{-dbname}){ $dnadb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->DNADB }); } $self->{'efg_db'} = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new ( %{ $self->OUTDB }, -dnadb => $dnadb, ); if(! $self->DNADB->{-dbname}){ print "WARNING: Using default DNADB ". $self->{'efg_db'}->dnadb->dbname."\n"; } #We are now forcing use of OUTDB/DNADB config
#else {
# #Historical, but sensible?
# $self->{'efg_db'} = $self->SUPER::db;
#}
} return $self->{'efg_db'};
}
probesdescriptionprevnextTop
sub probes {
  my ( $self, $value ) = @_;

  $self->{'_probes'} = $value if  defined $value;
  return $self->{'_probes'};
}

#############################################################
}
query_filedescriptionprevnextTop
sub query_file {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_query_file'} = $value;
  }

  if ( exists( $self->{'_query_file'} ) ) {
    return $self->{'_query_file'};
  } else {
    return undef;
  }
}


#############################################################
}
read_and_check_configdescriptionprevnextTop
sub read_and_check_config {
  my $self = shift;

  $self->SUPER::read_and_check_config($PROBE_CONFIG);

  ##########
# CHECKS
##########
my $logic = $self->analysis->logic_name; #We need to set the QUERYSEQS here dependant on the config
#We are running all array formats from one cat'd file so we don't need to know any special format specific config
# check that compulsory options have values
# Remove DNADB??
# Removed ARRAY_DESIGN
foreach my $config_var ( qw( QUERYSEQS QUERYTYPE TARGETSEQS OUTDB DNADB OPTIONS HIT_SATURATION_LEVEL MAX_MISMATCHES ) ){ #We could make MAX_MISMATCHES optional and have % ID here instead?
# FILTER_METHOD
if ( not defined $self->$config_var ){ throw("You must define $config_var in config for logic '$logic'"); } } #Test DB config hashes
for my $db(qw(OUTDB DNADB)){ if (ref( $self->$db ) ne "HASH" || ! defined $self->$db->{-dbname}) { throw("$db in config for '$logic' must be a hash ref of db connection parameters."); } } } #sub ARRAY_DESIGN{
# my ( $self, $value ) = @_;
# $self->{'ARRAY_DESIGN'} = $value if defined $value;
# return $self->{'ARRAY_DESIGN'};
#}
#sub FILTER_METHOD{
# my ( $self, $value ) = @_;
# $self->{'_CONFIG_FILTER_METHOD'} = $value if defined $value;
# return $self->{'_CONFIG_FILTER_METHOD'} || undef;
#
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;

  throw("Can't run - no runnable objects") unless ( $self->runnable );

  my $runnable = @{ $self->runnable }[0];
  $runnable->run;


  #Can we rollback individual chunk output here
#so we never duplicate output
#If job fails we currently don't know whether it failed before storing any unmapped objects
#Can we do this optionally by setting an env var?
#We would set this in SubmitAlign via a parameter
#We need to set this in the config to maintain decoupling between the environment and the RunnableDB
#Could we record a status in the eFG DB to track this?
#Don't need both?
#does not fit with efg status as is not a table, but could be hijacked quite easily?
#Let's just set this manually. You will have to reset the jobs, so we could add a warning here to use the appropriate flag?
#These must be used in the rule_manager.pl
#Which is where write_output must also be called ffrom
#$self->output($features);#Moved this after filter/set_probe so we get the write output reported
$self->features($self->filter_features($runnable->output)); } ############################################################
#overrides RunnableDB::write_output
}
set_probe_and_slicedescriptionprevnextTop
sub set_probe_and_slice {
  my ( $self, $features ) = @_;

  my $db = $self->outdb;
  my $slice_adaptor = $db->get_SliceAdaptor;
  my $probe_adaptor = $db->get_ProbeAdaptor;
  my $trans_adaptor = $db->dnadb->get_TranscriptAdaptor;
  my $mapping_type  = $self->mapping_type;
  my $gene_adaptor  = $db->dnadb->get_GeneAdaptor;
  #my $dbe_adaptor   = $db->get_DBEntryAdaptor;
my $analysis = $self->analysis; my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb); my $edb_name = $self->outdb->species.'_core_Transcript'; my (%slices, $slice_id, $trans_mapper, $align_type, $align_length, @tmp, $gap_length); my (@trans_cigar_line, @genomic_blocks, $cigar_line, $gap_start, $block_end); my ($block_start, $slice, @gaps, @features, %gene_hits, $gene, @stranded_cigar_line, $gene_sid); my ($query_perc, $display_name, $gene_hit_key, %transcript_cache); foreach my $feature (@$features) { # get the slice based on the seqname stamped on in the runnable
my $seq_id = $feature->seqname; my $probe_id = $feature->probe_id; my $load_feature = 1; my $xref; my ($genomic_start, $genomic_end); #Get the slice
if($mapping_type eq 'transcript'){ if(! exists $transcript_cache{$seq_id}){ $transcript_cache{$seq_id} = $trans_adaptor->fetch_by_stable_id($seq_id); } $slice_id = $transcript_cache{$seq_id}->seq_region_name; if ( not exists $slices{$slice_id} ) { $slices{$slice_id} = $transcript_cache{$seq_id}->slice; } } else{ $slice_id= $seq_id; if ( not exists $slices{$slice_id} ) { # assumes genome seqs were named in the Ensembl API Slice naming
# convention, i.e. coord_syst:version:seq_reg_id:start:end:strand
$slices{$slice_id} = $slice_adaptor->fetch_by_name($slice_id); } } $slice = $slices{$slice_id}; #Now project onto genomic coords if we are on cdna
if($mapping_type eq 'transcript'){ #This will basically need to change the start and stops accordingly
#and insert D's the size of introns
#reject id no D's found as we will already have this as genomic mapping.
#The cigar line should always be displayed with reference to the +ve strand of the
#target feature i.e. the -ve strand if the transcript is on the -ve strand
#These are local stranded start and ends with respect to transcript strand
my $transcript_start = $feature->start; my $transcript_end = $feature->end; my $transcript_strand = $transcript_cache{$seq_id}->feature_Slice->strand; #warn "trans($seq_id) start end strand $transcript_start $transcript_end $transcript_strand" if $warn;
$trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript_cache{$seq_id}); @genomic_blocks = $trans_mapper->cdna2genomic($transcript_start, $transcript_end); #Next feature is this is an ungapped alignment (1 block)
#or just representing a flank seq overhang
#which will have been caught by the genomic mapping (2 blocks)
#print "Found genomic blocks @genomic_blocks\n";
next if(! (scalar(@genomic_blocks) >2)); #Coordinate object returned are genomic coords
#Introns(ProbeFeature deletions) are only represented by absent Coordindate blocks
#5'/3' overhangs(Genomic deletions) are represented by Gap objects
#We are not accounting for 5'/3' hanging alignment mismatches
#Which may actually be a sequence match or mismatch to the genomic sequence!
#else alter the start stop values and rebuild the cigarline
my $cigar_line = ''; @trans_cigar_line = split/:/, $feature->cigar_line; if($transcript_strand == -1){ #Always report start end and cigar line wrt the +ve strand of the target seq.
#Even if the match is against the -ve strand. This is what we do in compara.
#This is always the genome even for transcript mapping, as we a re projecting back.
#Need to flip genomic blocks around so that we're still dealing with +ve order
#This is counterintuitive to how the rest of the API deals with stranded features
#Normally returns lowest start first wrt to +ve strand even if you use a -ve strand slice, no?
@genomic_blocks = reverse(@genomic_blocks); @stranded_cigar_line = reverse(@trans_cigar_line); #This only makes a difference if there are any m's (seq mismatches)
#print "Transcript is -ve strand so reverse cigar is @stranded_cigar_line\n";
} else{ @stranded_cigar_line = @trans_cigar_line; } #Account for 5'/3' overhanging gaps here
my %gap_lengths = ( 5 => undef, 3 => undef, ); foreach my $block(@genomic_blocks){ #warn $block.' '.$block->start.' '.$block->end;#.' '.$block->strand;
if(! $genomic_start){ if($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){#Set genomic_start
if($gap_lengths{5}){#We have seen a gap
$genomic_start = $block->start - $gap_length; } else{#Just set to first start
$genomic_start = $block->start; } } else{#Must be 5' Gap
$gap_lengths{5} = $block->length; } } elsif($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){ $genomic_end = $block->end; } else{#Must be 3' Gap
$gap_lengths{3} = $block->length; $genomic_end += $gap_lengths{3}; } #So calculate gap coordinates
#We want to skip and Gap objects
#as these will have already been inserted into the cigarline by ExonerateProbe
next if $block->isa('Bio::EnsEMBL::Mapper::Gap'); if(@gaps){ push @gaps, ($block->start - $gaps[$#gaps]); } push @gaps, ($block->end + 1); } #remove last value as this is the end of the match and not a gap start
pop @gaps; #Insert intron gaps into cDNA cigarline
$gap_start = shift @gaps; $gap_length = shift @gaps; #Set this to a hypothetical previous block end
#So the initial block_start calculation will be valid
$block_end = ($genomic_start - 1); my $last_gap = 0; my $match_count = 0; my $mismatch_count = 0; my $score = 0; my $q_length = 0; #Account for 5'/3' overhangs here by simple bp matching
#We need to account for Gap length as we may get 2m
#Which might actually be 1m1M, where the 1M is an overhang Gap
my ($gap_block); foreach my $end(keys %gap_lengths){ if($gap_lengths{$end}){ if($end == 5){ $gap_block = shift @stranded_cigar_line; } else{ $gap_block = pop @stranded_cigar_line; } @tmp = split//, $gap_block; $align_type = pop @tmp; ($align_length = $gap_block) =~ s/$align_type//; if($align_type ne 'm' || $align_length > $gap_lengths{$end}){ throw("Found unexpected alignment block($gap_block) when handling ${end}' overhanging Gap"); } #Now match against genomic sequence???
#No way of doing this as we don't have the probe sequence!!!!
#Change this to U?
#Can we get the query seq from ExonerateProbe?
$gap_block = $gap_lengths{$end}.'U'; $align_length -= $gap_lengths{$end}; if($align_length){ if($end == 5 ){ $gap_block .= $align_length.$align_type; }else{ $gap_block = $align_length.$align_type.$gap_block; } } if($end == 5){ unshift @stranded_cigar_line, $gap_block; }else{ push @stranded_cigar_line, $gap_block; } } } foreach my $block(@stranded_cigar_line){ @tmp = split//, $block; $align_type = pop @tmp; ($align_length = $block) =~ s/$align_type//; $q_length += $align_length; if($align_type eq 'M'){ $match_count += $align_length; $score += ($align_length * 5); } else{# 'm' or U mismatch
$mismatch_count += $align_length; $score -= ($align_length * 4); } #We need to calculate the genomic end of the current block
#given the previous block end or the genomic_start
$block_start = ($block_end + 1); $block_end += $align_length; if($block_end >= $gap_start){ #Could have multiple deletions
while(($block_end >= $gap_start) && ! $last_gap){ #Insert the match first
$align_length = ($gap_start - $block_start); $cigar_line .= $align_length.$align_type if $align_length; #Deletion wrt probe i.e. intron
$cigar_line .= $gap_length.'D'; #Now redefine start and end values
$block_start += $align_length + $gap_length; $block_end += $gap_length; #Now grab the next gap
if(@gaps){ $gap_start = shift @gaps; $gap_length = shift @gaps; } else{ $last_gap = 1; } } #We have reached the end of the gaps in this block
#so just redefine block here
$align_length = ($block_end - $block_start + 1); $block = ($align_length) ? $align_length.$align_type : ''; } $cigar_line .= $block; } warn "$genomic_start $genomic_end $cigar_line"; #We could assign the start end directly
$feature->start($genomic_start); $feature->end($genomic_end); $feature->strand($transcript_strand); $feature->cigar_line($cigar_line); #warn "Final Feature $genomic_start $genomic_end $cigar_line" if $warn;
#Test if we have already seen this alignment
$gene = $gene_adaptor->fetch_by_transcript_stable_id($seq_id); #The only way of doing this is to test the genomic_start/end and the genomic cigarline with the gene_stable_id and the probe_id
$gene_sid = $gene->stable_id; $gene_hit_key = "${gene_sid}:${probe_id}:${genomic_start}:${genomic_end}:${cigar_line}"; if(exists $gene_hits{$gene_hit_key}){ $load_feature = 0; }else{ #No need to count hits here
$gene_hits{$gene_hit_key} = undef; } #Now store the IDXref for this probe transcript hit
#This will mean we don't have recalculate this during the probe2transcript mapping step
#% ID over aligned region
#Which can be different if we have I|Ds
#As these will give different seq lengths
#But this is the ungapped alignment
#So ignore Target ID?
#cigar_line is reported wrt to strand of target
#we filter out all -ve hits by this point so don't need to account for here.
$query_perc = ($match_count/($match_count + $mismatch_count)) * 100;
$display_name = $self->get_display_name_by_stable_id($seq_id, 'transcript'); #$id_xref = Bio::EnsEMBL::IdentityXref->new
$xref = Bio::EnsEMBL::DBEntry->new ( #-XREF_IDENTITY => $query_perc,
#-TARGET_IDENTITY => 90.1,
#-SCORE => $score,
#-EVALUE => 12,
#-CIGAR_LINE => $cigar_line,
#-XREF_START => 1,#We are currently padding with mismatches to full length of query
#-XREF_END => $q_length,
#-ENSEMBL_START => $transcript_start,#target/hit_start
#-ENSEMBL_END => $transcript_end,#target/hit_end
#-ANALYSIS => $analysis,
-PRIMARY_ID => $seq_id, -DISPLAY_ID => $display_name, -DBNAME => $edb_name, -release => $schema_build, -info_type => 'MISC', -info_text => 'TRANSCRIPT', -linkage_annotation => "ProbeTranscriptAlign",#Add query_perc here when we have analysis
#-info_text => , #? What is this for? Is used in unique key so we get duplicated if null!!!
#-version => , #version of transcript sid?
); #No strand here! Always +ve?!
#$dbe_adaptor->store($id_xref, $probe_id, 'Probe', 1);#Do we need ignore release flag here?
###This cannot be done until we have ox.analysis_id in place v54?
#Yes we can, just use the linkage_annotation for now!
#No store this as a ProbeFeature DBEntry in line with how the individual probe xrefs
#are stored in probe2transcript
#we don't really need this extra translation and score info here
#and we are storing the cigar_line as part of the probe_feature
#so we can reconstitute the alignment if required
#This will mean we can separate the individual feature xrefs from the Probe/ProbeSet level full xref annotation
#Will mean some duplication for single probe arrays
#But will allow different sets of rules between ProbeAlign and probe2transcript
#i.e. we could relax the mapping rules but keep the xreffin rules more stringent
#We don't know what the ProbeFeature dbID is yet so we will have to pass this to store with the feature
#$dbe_adaptor->store($xref, $probe_id, 'Probe', 1);#Do we need ignore release flag here?
} else{#No introns - reformat cigar line
$feature->cigar_line(join('', (split/:/, $feature->cigar_line))); } if($load_feature){ #Reset start ends for non-ref slices
if($slice->start != 1){ #Need to reslice, but we don't want to affect
#the slice in the cahse as this will screw up
#further feature start/end tranforms
my ($level, undef, $name) = split /:/, $slice->name; #warn "Resetting slice for ".$slice->name;
my $start = $feature->start + $slice->start - 1; my $end = $feature->end + $slice->start - 1; $feature->start($start); $feature->end($end); $slice = $slice_adaptor->fetch_by_region($level, $name, 1, $end); } $feature->slice($slice); # Recover the probe from the cache or DB
my $real_probe = $self->probes->{$probe_id}; if(!$real_probe){ $real_probe = $probe_adaptor->fetch_by_dbID($probe_id); if (!$real_probe){ throw "Trying to clean up features for persistence: cant find probe with dbID $probe_id in database!\n"; } $self->probes->{$probe_id} = $real_probe; } $feature->probe($real_probe); $feature->analysis($analysis); push @features, [ $feature, $xref ]; } } return $self->features(\@features);
}
unmapped_objectsdescriptionprevnextTop
sub unmapped_objects {
  my ( $self, $value ) = @_;

  $self->{'_unmnapped_objects'} = $value if defined $value;
  
  return $self->{'_unmapped_objects'};
  
}

#############################################################
#############################################################
}
write_outputdescriptionprevnextTop
sub write_output {
  my ( $self, @output ) = @_;

  my $outdb           = $self->outdb;
  my $probe_adaptor   = $outdb->get_ProbeAdaptor;
  my $feature_adaptor = $outdb->get_ProbeFeatureAdaptor;
  my $dbe_adaptor     = $outdb->get_DBEntryAdaptor;
  
  #Add analysis, slices to features, and make
#sure they're pointing at the persistent array / probe instances
#instead of the fake arrays & probes they were created with
my $features = $self->set_probe_and_slice($self->features); #Now set correct output
#This is only used to report the number of features
$self->output($features);#$self->features);
warn "Writing ProbeFeatures\n"; foreach my $feature_xref(@{$features}){ my ($feature, $xref) = @$feature_xref; if(! defined $feature->slice()){ warn "Skipping feature storage for Probe, no slice available for ".$feature->seqname(); next; } eval{ $feature_adaptor->store($feature)}; if ($@) { warn $feature->slice->name; $self->throw('Unable to store ProbeFeature for probe '.$feature->probe_id." !\n $@"); } if($xref){ #Remove ignore release flag so we store on the correct schema build
eval{ $dbe_adaptor->store($xref, $feature->dbID, 'ProbeFeature') }; if ($@) { $self->throw('Unable to store ProbeFeature DBEntry for probe '.$feature->dbID." !\n $@"); } } } } ############################################################
}
General documentation
CONTACTTop
Post general queries to ensembl-dev@ebi.ac.uk