Bio::EnsEMBL::Analysis::RunnableDB
ProbeAlign
Toolbar
Summary
Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign;
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $affy = Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign->new
(-db => $refdb,
-analysis => $analysis_obj,
-database => $EST_GENOMIC,
-query_seqs => \@sequences,
);
$affy->fetch_input();
$affy->run();
$affy->write_output(); #writes to DB
Description
This object maps probes to a genomic and transcript sequence, writing the results as
Bio::EnsEMBL::Funcgen::ProbeFeatures. UnmappedObjects are also written for those probes
which either do not map at all or exceed the maximum mapping threshold defined by HIT_SATURATION_LEVEL.
You must FIRST have created and imported all the necessary Arrays and Probe objects using the ImportArrays module.
Methods
DNADB | No description | Code |
HIT_SATURATION_LEVEL | No description | Code |
IIDREGEXP | No description | Code |
MAX_MISMATCHES | No description | Code |
OPTIONS | No description | Code |
OUTDB | No description | Code |
QUERYSEQS | No description | Code |
QUERYTYPE | No description | Code |
TARGETSEQS | No description | Code |
arrays | No description | Code |
features | No description | Code |
fetch_input | No description | Code |
filter_features | No description | Code |
get_display_name_by_stable_id | Description | Code |
mapping_type | No description | Code |
new | No description | Code |
outdb | No description | Code |
probes | No description | Code |
query_file | No description | Code |
read_and_check_config | No description | Code |
run | No description | Code |
set_probe_and_slice | No description | Code |
unmapped_objects | No description | Code |
write_output | No description | Code |
Methods description
Args [0] : stable ID from core DB. Args [1] : stable feature type e.g. gene, transcript, translation Example : $self->validate_and_store_feature_types; Description: Builds a cache of stable ID to display names. Returntype : string - display name Exceptions : Throws is type is not valid. Caller : General Status : At risk |
Methods code
sub DNADB
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_DNADB'} = $value;
}
if ( exists( $self->{'_CONFIG_DNADB'} ) ) {
return $self->{'_CONFIG_DNADB'};
} else {
return undef;
} } |
sub HIT_SATURATION_LEVEL
{ my ($self, $val) = @_;
if (defined $val) {
$self->{'_CONFIG_HIT_SATURATION_LEVEL'} = $val;
}
if (exists $self->{'_CONFIG_HIT_SATURATION_LEVEL'}) {
return $self->{'_CONFIG_HIT_SATURATION_LEVEL'};
} else {
return 100;
}
}
1; } |
sub IIDREGEXP
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_IIDREGEXP'} = $value;
}
if ( exists( $self->{'_CONFIG_IIDREGEXP'} ) ) {
return $self->{'_CONFIG_IIDREGEXP'};
} else {
return undef;
} } |
sub MAX_MISMATCHES
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_MAX_MISMATCHES'} = $value;
}
if ( exists( $self->{'_CONFIG_MAX_MISMATCHES'} ) ) {
return $self->{'_CONFIG_MAX_MISMATCHES'};
} else {
return undef;
} } |
sub OPTIONS
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_OPTIONS'} = $value;
}
if ( exists( $self->{'_CONFIG_OPTIONS'} ) ) {
return $self->{'_CONFIG_OPTIONS'};
} else {
return undef;
} } |
sub OUTDB
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_OUTDB'} = $value;
}
if ( exists( $self->{'_CONFIG_OUTDB'} ) ) {
return $self->{'_CONFIG_OUTDB'};
} else {
return undef;
} } |
sub QUERYSEQS
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_QUERYSEQS'} = $value;
}
if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) {
return $self->{'_CONFIG_QUERYSEQS'};
} else {
return undef;
} } |
sub QUERYTYPE
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_QUERYTYPE'} = $value;
}
if ( exists( $self->{'_CONFIG_QUERYTYPE'} ) ) {
return $self->{'_CONFIG_QUERYTYPE'};
} else {
return undef;
} } |
sub TARGETSEQS
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_CONFIG_TARGETSEQS'} = $value;
}
if ( exists( $self->{'_CONFIG_TARGETSEQS'} ) ) {
return $self->{'_CONFIG_TARGETSEQS'};
} else {
return undef;
} } |
sub arrays
{ my ( $self, $value ) = @_;
$self->{'_arrays'} = $value if defined $value;
return $self->{'_arrays'};
}
} |
sub features
{ my ( $self, $value ) = @_;
$self->{'_features'} = $value if defined $value;
return $self->{'_features'}; }
} |
sub fetch_input
{ my ($self) = @_;
my $target = $self->TARGETSEQS;
if ( -e $target ){
if(-d $target ) {
warn ("Target $target is a directory of files\n");
}elsif (-s $target){
warn ("Target $target is a whole-genome file\n");
}else{
throw("'$target' isn't a file or a directory?");
}
} else {
throw("'$target' could not be found");
}
my ($query_file, $chunk_number, $chunk_total);
my $query = $self->QUERYSEQS;
if ( -e $query and -s $query ) {
$query_file = $query;
my $iid_regexp = $self->IIDREGEXP;
if (not defined $iid_regexp){
throw("You must define IIDREGEXP in config to enable inference of chunk number and total from your single fasta file" )
}
( $chunk_number, $chunk_total ) = $self->input_id =~ /$iid_regexp/;
if(!$chunk_number || !$chunk_total){
throw "I can't make sense of your input id using the IIDREGEXP in the config!\n";
}
$self->query_file($query_file);
} else {
throw("'$query' must refer to a single fasta file with all probe sequences referenced by affy_probe_id\n");
}
my %parameters = %{ $self->parameters_hash };
if (not exists $parameters{-options}) {
if ($self->OPTIONS) {
$parameters{-options} = $self->OPTIONS;
} else {
throw('You have not options stored in the DB or accessible from the analysis config hash for '.$self->analysis->logic_name);
}
}
print STDERR "PROGRAM FILE: ".$self->analysis->program_file."\n";
my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateProbe->new
(
-program => $self->analysis->program_file,
-analysis => $self->analysis,
-target_file => $target,
-query_type => $self->QUERYTYPE, -query_file => $query_file,
-query_chunk_number => $chunk_number,
-query_chunk_total => $chunk_total,
-max_mismatches => $self->MAX_MISMATCHES,
-mapping_type => $self->mapping_type,
%parameters,
);
$self->runnable($runnable);
}
} |
sub filter_features
{ my ($self, $features) = @_;
my (%hits_by_probe, @kept_hits);
my $analysis = $self->analysis;
my $logic_name = $analysis->logic_name;
my $mapping_type = $self->mapping_type;
my $max_hits = $self->HIT_SATURATION_LEVEL; my $uo_adaptor = $self->outdb->get_UnmappedObjectAdaptor;
my $uo_type;
($uo_type = $logic_name) =~ s/^.*_//;
foreach my $hit (@$features) {
push @{$hits_by_probe{$hit->probe_id}}, $hit;
}
foreach my $probe_id (keys %hits_by_probe) {
my @hits = @{$hits_by_probe{$probe_id}};
my $num_hits = scalar(@hits);
my $all_hits = $num_hits;
if ($num_hits > $max_hits) {
@hits = grep { $_->mismatchcount == 0 } @hits;
$num_hits = scalar(@hits);
if ($num_hits <= $max_hits) {
warn "Keeping only perfect matches($num_hits/$all_hits) to $probe_id. Do we need to keep this in an UnmappedObject?\n";
}
else {
warn "UnmappedObject:\tToo many hits($num_hits|$all_hits/$max_hits) to $probe_id so rejecting all hits\n";
$uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new
(
-type => $uo_type, -analysis => $analysis,
-ensembl_id => $probe_id,
-ensembl_object_type => 'Probe',
-external_db_id => $self->{'_external_db_id'},
-identifier => $mapping_type,
-summary => 'Promiscuous probe',
-full_desc => "Probe exceeded maximum allowed number of $mapping_type mappings(${num_hits}/${max_hits})"
));
$num_hits = 0;
}
}
elsif($num_hits == 0){ warn "UnmappedObject:\t$probe_id has no $mapping_type mappings\n";
$uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new
(
-type => $uo_type, -analysis => $analysis,
-ensembl_id => $probe_id,
-ensembl_object_type => 'Probe',
-external_db_id => $self->{'_external_db_id'},
-identifier => $mapping_type,
-summary => 'Unmapped probe',
-full_desc => "Probe has no $mapping_type mappings",
));
}
if($num_hits) {
push @kept_hits, @hits;
}
}
return\@ kept_hits;
}
} |
sub get_display_name_by_stable_id
{ my ($self, $stable_id, $type) = @_;
$type = lc($type);
if($type !~ /(gene|transcript|translation)/){
throw("Cannot get display_name for stable_id $stable_id with type $type");
}
if(! exists $self->{'display_name_cache'}->{$stable_id}){
($self->{'display_name_cache'}->{$stable_id}) = $self->outdb->dnadb->dbc->db_handle->selectrow_array("SELECT x.display_label FROM ${type}_stable_id s, $type t, xref x where t.display_xref_id=x.xref_id and s.${type}_id=t.gene_id and s.stable_id='${stable_id}'");
}
return $self->{'display_name_cache'}->{$stable_id}; } |
sub mapping_type
{ return $_[0]->{'mapping_type'}; } |
sub new
{ my ( $class, @args ) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config;
$self->outdb->dbc->db_handle;
$self->outdb->dnadb->dbc->db_handle;
$self->arrays({});
$self->probes({});
my $logic = $self->analysis->logic_name;
my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb);
my $species = $self->outdb->species;
throw('Must provide a -species to the OUTDB in Bio::EnsEMBL::Analysis::Config::ProbeAlign') if ! $species;
my ($db_name, $display_name);
if($logic =~ /Transcript/){
$self->{'mapping_type'} = 'transcript';
$db_name = $species.'_core_Transcript';
$display_name = 'EnsemblTranscript';
}
else{
$self->{'mapping_type'} = 'genomic';
$db_name = $species.'_core_Genome';
$display_name = 'EnsemblGenome';
}
my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$schema_build'";
my ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql);
if(! $extdb_id){
warn 'No external_db found for '.$self->{'mapping_type'}." mapping, inserting $db_name $schema_build";
my $insert_sql = 'INSERT into external_db(db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type)'.
" values('$db_name', '$schema_build', 'KNOWNXREF', 1, 5, '$display_name', 'MISC')";
$self->outdb->db_handle->do($insert_sql);
($extdb_id) = $self->outdb->db_handle->selectrow_array($sql);
($extdb_id) = $self->outdb->db_handle->selectrow_array($sql);
if(! $extdb_id){
throw("Failed to store external_db properly:\t$db_name $schema_build");
}
}
$self->{'_external_db_id'} = $extdb_id;
return $self; } |
sub outdb
{ my ($self) = @_;
my $outdb;
my $dnadb;
if(! defined $self->{'efg_db'}){
my $dnadb;
if($self->DNADB->{-dbname}){
$dnadb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->DNADB });
}
$self->{'efg_db'} = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new
(
%{ $self->OUTDB },
-dnadb => $dnadb,
);
if(! $self->DNADB->{-dbname}){
print "WARNING: Using default DNADB ". $self->{'efg_db'}->dnadb->dbname."\n";
}
}
return $self->{'efg_db'}; } |
sub probes
{ my ( $self, $value ) = @_;
$self->{'_probes'} = $value if defined $value;
return $self->{'_probes'};
}
} |
sub query_file
{ my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_query_file'} = $value;
}
if ( exists( $self->{'_query_file'} ) ) {
return $self->{'_query_file'};
} else {
return undef;
}
}
} |
sub read_and_check_config
{ my $self = shift;
$self->SUPER::read_and_check_config($PROBE_CONFIG);
my $logic = $self->analysis->logic_name;
foreach my $config_var (
qw(
QUERYSEQS
QUERYTYPE
TARGETSEQS
OUTDB
DNADB
OPTIONS
HIT_SATURATION_LEVEL
MAX_MISMATCHES
)
){
if ( not defined $self->$config_var ){
throw("You must define $config_var in config for logic '$logic'");
}
}
for my $db(qw(OUTDB DNADB)){
if (ref( $self->$db ) ne "HASH" || ! defined $self->$db->{-dbname}) {
throw("$db in config for '$logic' must be a hash ref of db connection parameters.");
}
}
}
} |
sub run
{ my ($self) = @_;
throw("Can't run - no runnable objects") unless ( $self->runnable );
my $runnable = @{ $self->runnable }[0];
$runnable->run;
$self->features($self->filter_features($runnable->output));
}
} |
sub set_probe_and_slice
{ my ( $self, $features ) = @_;
my $db = $self->outdb;
my $slice_adaptor = $db->get_SliceAdaptor;
my $probe_adaptor = $db->get_ProbeAdaptor;
my $trans_adaptor = $db->dnadb->get_TranscriptAdaptor;
my $mapping_type = $self->mapping_type;
my $gene_adaptor = $db->dnadb->get_GeneAdaptor;
my $analysis = $self->analysis;
my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb);
my $edb_name = $self->outdb->species.'_core_Transcript';
my (%slices, $slice_id, $trans_mapper, $align_type, $align_length, @tmp, $gap_length);
my (@trans_cigar_line, @genomic_blocks, $cigar_line, $gap_start, $block_end);
my ($block_start, $slice, @gaps, @features, %gene_hits, $gene, @stranded_cigar_line, $gene_sid);
my ($query_perc, $display_name, $gene_hit_key, %transcript_cache);
foreach my $feature (@$features) {
my $seq_id = $feature->seqname;
my $probe_id = $feature->probe_id;
my $load_feature = 1;
my $xref;
my ($genomic_start, $genomic_end);
if($mapping_type eq 'transcript'){
if(! exists $transcript_cache{$seq_id}){
$transcript_cache{$seq_id} = $trans_adaptor->fetch_by_stable_id($seq_id);
}
$slice_id = $transcript_cache{$seq_id}->seq_region_name;
if ( not exists $slices{$slice_id} ) {
$slices{$slice_id} = $transcript_cache{$seq_id}->slice;
}
}
else{
$slice_id= $seq_id;
if ( not exists $slices{$slice_id} ) {
$slices{$slice_id} = $slice_adaptor->fetch_by_name($slice_id);
}
}
$slice = $slices{$slice_id};
if($mapping_type eq 'transcript'){
my $transcript_start = $feature->start;
my $transcript_end = $feature->end;
my $transcript_strand = $transcript_cache{$seq_id}->feature_Slice->strand;
$trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript_cache{$seq_id});
@genomic_blocks = $trans_mapper->cdna2genomic($transcript_start, $transcript_end);
next if(! (scalar(@genomic_blocks) >2));
my $cigar_line = '';
@trans_cigar_line = split/:/, $feature->cigar_line;
if($transcript_strand == -1){
@genomic_blocks = reverse(@genomic_blocks);
@stranded_cigar_line = reverse(@trans_cigar_line);
}
else{
@stranded_cigar_line = @trans_cigar_line;
}
my %gap_lengths = (
5 => undef,
3 => undef,
);
foreach my $block(@genomic_blocks){
if(! $genomic_start){
if($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){
if($gap_lengths{5}){ $genomic_start = $block->start - $gap_length;
}
else{ $genomic_start = $block->start;
}
}
else{ $gap_lengths{5} = $block->length;
}
}
elsif($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){
$genomic_end = $block->end;
}
else{ $gap_lengths{3} = $block->length;
$genomic_end += $gap_lengths{3};
}
next if $block->isa('Bio::EnsEMBL::Mapper::Gap');
if(@gaps){
push @gaps, ($block->start - $gaps[$#gaps]);
}
push @gaps, ($block->end + 1);
}
pop @gaps;
$gap_start = shift @gaps;
$gap_length = shift @gaps;
$block_end = ($genomic_start - 1);
my $last_gap = 0;
my $match_count = 0;
my $mismatch_count = 0;
my $score = 0;
my $q_length = 0;
my ($gap_block);
foreach my $end(keys %gap_lengths){
if($gap_lengths{$end}){
if($end == 5){
$gap_block = shift @stranded_cigar_line;
}
else{
$gap_block = pop @stranded_cigar_line;
}
@tmp = split//, $gap_block;
$align_type = pop @tmp;
($align_length = $gap_block) =~ s/$align_type//;
if($align_type ne 'm' ||
$align_length > $gap_lengths{$end}){
throw("Found unexpected alignment block($gap_block) when handling ${end}' overhanging Gap");
}
$gap_block = $gap_lengths{$end}.'U';
$align_length -= $gap_lengths{$end};
if($align_length){
if($end == 5 ){
$gap_block .= $align_length.$align_type;
}else{
$gap_block = $align_length.$align_type.$gap_block;
}
}
if($end == 5){
unshift @stranded_cigar_line, $gap_block;
}else{
push @stranded_cigar_line, $gap_block;
}
}
}
foreach my $block(@stranded_cigar_line){
@tmp = split//, $block;
$align_type = pop @tmp;
($align_length = $block) =~ s/$align_type//;
$q_length += $align_length;
if($align_type eq 'M'){
$match_count += $align_length;
$score += ($align_length * 5);
}
else{ $mismatch_count += $align_length;
$score -= ($align_length * 4);
}
$block_start = ($block_end + 1);
$block_end += $align_length;
if($block_end >= $gap_start){
while(($block_end >= $gap_start) && ! $last_gap){
$align_length = ($gap_start - $block_start);
$cigar_line .= $align_length.$align_type if $align_length;
$cigar_line .= $gap_length.'D';
$block_start += $align_length + $gap_length;
$block_end += $gap_length;
if(@gaps){
$gap_start = shift @gaps;
$gap_length = shift @gaps;
}
else{
$last_gap = 1;
}
}
$align_length = ($block_end - $block_start + 1);
$block = ($align_length) ? $align_length.$align_type : '';
}
$cigar_line .= $block;
}
warn "$genomic_start $genomic_end $cigar_line";
$feature->start($genomic_start);
$feature->end($genomic_end);
$feature->strand($transcript_strand);
$feature->cigar_line($cigar_line);
$gene = $gene_adaptor->fetch_by_transcript_stable_id($seq_id);
$gene_sid = $gene->stable_id;
$gene_hit_key = "${gene_sid}:${probe_id}:${genomic_start}:${genomic_end}:${cigar_line}";
if(exists $gene_hits{$gene_hit_key}){
$load_feature = 0;
}else{
$gene_hits{$gene_hit_key} = undef;
}
$query_perc = ($match_count/($match_count + $mismatch_count)) * 100; $display_name = $self->get_display_name_by_stable_id($seq_id, 'transcript');
$xref = Bio::EnsEMBL::DBEntry->new
(
-PRIMARY_ID => $seq_id,
-DISPLAY_ID => $display_name,
-DBNAME => $edb_name,
-release => $schema_build,
-info_type => 'MISC',
-info_text => 'TRANSCRIPT',
-linkage_annotation => "ProbeTranscriptAlign",
);
}
else{ $feature->cigar_line(join('', (split/:/, $feature->cigar_line)));
}
if($load_feature){
if($slice->start != 1){
my ($level, undef, $name) = split /:/, $slice->name;
my $start = $feature->start + $slice->start - 1;
my $end = $feature->end + $slice->start - 1;
$feature->start($start);
$feature->end($end);
$slice = $slice_adaptor->fetch_by_region($level, $name, 1, $end);
}
$feature->slice($slice);
my $real_probe = $self->probes->{$probe_id};
if(!$real_probe){
$real_probe = $probe_adaptor->fetch_by_dbID($probe_id);
if (!$real_probe){
throw "Trying to clean up features for persistence: cant find probe with dbID $probe_id in database!\n";
}
$self->probes->{$probe_id} = $real_probe;
}
$feature->probe($real_probe);
$feature->analysis($analysis);
push @features, [ $feature, $xref ];
}
}
return $self->features(\@features); } |
sub unmapped_objects
{ my ( $self, $value ) = @_;
$self->{'_unmnapped_objects'} = $value if defined $value;
return $self->{'_unmapped_objects'};
}
} |
sub write_output
{ my ( $self, @output ) = @_;
my $outdb = $self->outdb;
my $probe_adaptor = $outdb->get_ProbeAdaptor;
my $feature_adaptor = $outdb->get_ProbeFeatureAdaptor;
my $dbe_adaptor = $outdb->get_DBEntryAdaptor;
my $features = $self->set_probe_and_slice($self->features);
$self->output($features);
warn "Writing ProbeFeatures\n";
foreach my $feature_xref(@{$features}){
my ($feature, $xref) = @$feature_xref;
if(! defined $feature->slice()){
warn "Skipping feature storage for Probe, no slice available for ".$feature->seqname();
next;
}
eval{ $feature_adaptor->store($feature)};
if ($@) {
warn $feature->slice->name;
$self->throw('Unable to store ProbeFeature for probe '.$feature->probe_id." !\n $@");
}
if($xref){
eval{ $dbe_adaptor->store($xref, $feature->dbID, 'ProbeFeature') };
if ($@) {
$self->throw('Unable to store ProbeFeature DBEntry for probe '.$feature->dbID." !\n $@");
}
}
}
}
} |
General documentation
Post general queries to ensembl-dev@ebi.ac.uk