Bio::EnsEMBL::Analysis::Runnable HaplotypeMapper
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis
Bio::EnsEMBL::Analysis::Config::HaplotypeProjection qw ( )
Bio::EnsEMBL::Analysis::Runnable::Blastz
Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster
Bio::EnsEMBL::Attribute
Bio::EnsEMBL::DBEntry
Bio::EnsEMBL::DnaDnaAlignFeature
Bio::EnsEMBL::Exon
Bio::EnsEMBL::Gene
Bio::EnsEMBL::Root
Bio::EnsEMBL::Slice
Bio::EnsEMBL::Transcript
Inherit
Bio::EnsEMBL::Root
Synopsis
# This is the main analysis database
    my $genebuilder = new Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
(
'-hap_slice' => $self->query,
'-slice' => $self->target,
'-input_id' => $self->input_id,
);
Description
This module aligned the genomic sequence of a haplotype region and the corresponding
reference chromosome region and projects the gene annotations
Methods
check_transcript_in_discarded_db
No description
Code
clean_queries
No description
Code
create_alignmentDescriptionCode
discarded_db
No description
Code
ensembl_db
No description
Code
features
No description
Code
fetch_sequence
No description
Code
filter_alignmentDescriptionCode
gene_typesDescriptionCode
hits
No description
Code
input_idDescriptionCode
load_tables
No description
Code
make_map_regionsDescriptionCode
new
No description
Code
output_db
No description
Code
query
No description
Code
raw_hits
No description
Code
remove_from_list
No description
Code
target
No description
Code
trim_hit
No description
Code
Methods description
create_alignmentcode    nextTop
 Description: Generates Blastz alignment between the ref and the hap sequences 
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
filter_alignmentcodeprevnextTop
 Description: Filters the alignments between . 
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
gene_typescodeprevnextTop
 Description: get/set for the type(s) of genes (usually TGE_gw, similarity_genewise and combined_e2g genes) 
to be used in the genebuilder they get set in new()
Does not include the ab inition predictions
input_idcodeprevnextTop
 Function: get/set for input id
Returns : string
Args : string (it expects a string of the format chr_name.start_coord-end_coord
make_map_regionscodeprevnextTop
 Description: retrieves ensembl and havana gene annotations with supporting evidence. 
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
Methods code
check_transcript_in_discarded_dbdescriptionprevnextTop
sub check_transcript_in_discarded_db {
  my ($self, $tran) = @_;
 
  my @exons = @{$tran->get_all_Exons};

  my $discardedslice = $self->discarded_db->get_SliceAdaptor->fetch_by_region('toplevel',$tran->slice->seq_region_name,$tran->seq_region_start,$tran->seq_region_end);
  #print STDERR "Fetching discarded genes\n"; 
#print "NUMBER OF DISCARDED GENES: ",scalar(@{$discardedslice->get_all_Genes}),"\n";
DGENE: foreach my $dgene (@{$discardedslice->get_all_Genes}){ DTRANS:foreach my $dtran (@{$dgene->get_all_Transcripts}){ my @dexons = @{$dtran->get_all_Exons}; if(scalar(@exons) == scalar(@dexons)){ #print "Number of exons: ",scalar(@exons),"\n";
for (my $i=0; $i < scalar(@exons); $i++){ if ($exons[$i]->seq_region_start != $dexons[$i]->seq_region_start || $exons[$i]->strand != $dexons[$i]->strand || $exons[$i]->seq_region_end != $dexons[$i]->seq_region_end){ # if you enter here means that these two transcripts are not the same
#print "transcript exon coordinates are different\n";
next DTRANS; } } # If you are here means that both transcripts are the same and $trans must be discarded
print "transcript found in discarded db\n"; return 0; }else{ # if you enter here means that these two transcripts are not the same
#print "transcript number of exons is different\n";
next DGENE; } } } #If we reach here means that no transcript in the discarded db is the same as our transcript so we keep it
return 1;
}
clean_queriesdescriptionprevnextTop
sub clean_queries {
  my ($self,$sql_q) = @_;
  
  if ($sql_q) {
    push (@{$self->{_clean_queries}}, $sql_q);
  }

  return $self->{_clean_queries};
}

#fetches sequence from appropriate database
}
create_alignmentdescriptionprevnextTop
sub create_alignment {
  my ($self) = @_;

  my $target_slice = $self->target;
  my $query_slice =  $self->query;

  my $target_top_slice = $self->ensembl_db->get_SliceAdaptor->
      fetch_by_region($target_slice->coord_system->name,
                      $target_slice->seq_region_name,
                      undef,
                      undef,
                      1,
                      $target_slice->coord_system->version);
  
  my $query_top_slice = $self->ensembl_db->get_SliceAdaptor->
      fetch_by_region($query_slice->coord_system->name,
                      $query_slice->seq_region_name,
                      undef,
                      undef,
                      1,
                      $query_slice->coord_system->version);
  
#my $ana = $db->get_AnalysisAdaptor->fetch_by_logic_name($logic);
#if (not defined $ana) {
my $ana = Bio::EnsEMBL::Analysis->new(-logic_name => 'Hap_proj_blastz'); $self->ensembl_db->get_AnalysisAdaptor->store($ana); #}
print "Starting the Blastz alignment. This may take a while so sit and relax.\n"; my $run = Bio::EnsEMBL::Analysis::Runnable::Blastz->new( -analysis => $ana, -query => $query_slice, -options => "C=2 B=0", -database => [Bio::PrimarySeq-> new( -id => $target_slice->seq_region_name, -seq => $target_slice->seq)]); $run->run; my @hits; foreach my $f (@{$run->output}) { printf("PRELIM: %d %d %d %d %d %d %d\n", $f->start, $f->end, $f->strand, $f->hstart, $f->hend, $f->hstrand, $f->score); $f->analysis($ana); $f->seqname($query_slice->seq_region_name); $f->hseqname($target_slice->seq_region_name); $f->invert($target_slice); $f = $f->transfer($target_top_slice); $f->invert($query_slice); $f = $f->transfer($query_top_slice); printf("HIT: %s %d %d %d %s %d %d %d (%d, %.2f)\n", $f->seqname, $f->start, $f->end, $f->strand, $f->hseqname, $f->hstart, $f->hend, $f->hstrand, $f->score, $f->percent_id); # BEWARE this is a partial solution that may not really fix the problem
# May be better just to skip this cases and not add them to the @hits
push @hits, $f; } @hits = sort { $a->start <=> $b->start } @hits; $self->raw_hits(@hits); } ############################################################
}
discarded_dbdescriptionprevnextTop
sub discarded_db {
  my ($self, $discarded_db) = @_;

  if ( $discarded_db ){
    $self->{_discarded_db} = $discarded_db;;
  }

  return $self->{_discarded_db};
}


############################################################
}
ensembl_dbdescriptionprevnextTop
sub ensembl_db {
 my ($self,$ensembl_db) = @_;
 if ( $ensembl_db ){
   $self->{_ensembl_db} = $ensembl_db;
 }
 
 return $self->{_ensembl_db};
}
featuresdescriptionprevnextTop
sub features {
  my ($self,@features) = @_;
  
  if (!defined($self->{_feature})) {
    $self->{_feature} = [];
  }
  if ( scalar @features ) {
    push(@{$self->{_feature}},@features);
  }
  return @{$self->{_feature}};
}

############################################################
}
fetch_sequencedescriptionprevnextTop
sub fetch_sequence {
  my ($self, $name, $db) = @_;

  my $sa = $db->get_SliceAdaptor; 

  my $slice = $sa->fetch_by_name($name);

  return $slice;
}

1;
}
filter_alignmentdescriptionprevnextTop
sub filter_alignment {
  my ($self) = @_;

  my @hits = $self->raw_hits;

  my @keep = sort { $a->start <=> $b->start } @hits;
    
  @keep = sort { $b->score <=> $a->score } @hits;

  my @retained;
  
  foreach my $kept(@keep){ 
    push @retained, $self->trim_hit($kept, @retained);
  }

  @retained = sort { $a->start <=> $b->start } @retained;

  #Check integrity of retained hit. Check that end is bigger than start
my @final_hits; foreach my $retained (@retained){ unless($retained->hend < $retained->hstart && $retained->end < $retained->start){ push (@final_hits,$retained); } $self->hits(@final_hits); }
}
gene_typesdescriptionprevnextTop
sub gene_types {
  my ($self,$type) = @_;

  if (defined($type)) {
     push(@{$self->{_gene_types}},$type);
  }

  return @{$self->{_gene_types}};
}
hitsdescriptionprevnextTop
sub hits {
  my ($self,@hits) = @_;

  if (@hits) {
    push (@{$self->{_hits}}, @hits);
  }

  return @{$self->{_hits}};
}
input_iddescriptionprevnextTop
sub input_id {
  my ($self,$id) = @_;
  
  if (defined($id)) {
    $self->{_input_id} = $id;
  }
  return $self->{_input_id};
}
############################################################
}
load_tablesdescriptionprevnextTop
sub load_tables {
  my ($self, @mappings) = @_;

  print "REACH TEST 3\n";  
  print "MAppings: ",@mappings,"\n";
  foreach my $map(@mappings){
    print "YOUR SQL: ",$map,"\n";
    my $loading = $self->ensembl_db->dbc->prepare($map);
    $loading->execute();
    $loading->finish;
  }
  
  return 1;
}

############################################################
#
# GETSET METHODS
#
############################################################
# get/set method holding a reference to the db with genewise and combined genes,
# havana genes and discarded genes
# this reference is set in Bio::EnsEMBL::Analysis::RunnableDB::HaplotypeProjection
}
make_map_regionsdescriptionprevnextTop
sub make_map_regions {
  my ($self) = @_;
  
  my @mappings;
  my @assembly_writes;
  my $cutoff = 50.0;
  
  my $target_slice =  $self->target;
  
  my $tl_target_slice = $self->ensembl_db->get_SliceAdaptor->fetch_by_region($target_slice->coord_system->name,
                                                          $target_slice->seq_region_name);
  
  my $target_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($tl_target_slice);
  
  my @genes = @{$target_slice->get_all_Genes};
  @genes = map { $_->transfer($tl_target_slice) } @genes;
  
  my @ex_regs;
  foreach my $g (@genes) {
    foreach my $t (@{$g->get_all_Transcripts}) {
      foreach my $e (@{$t->get_all_Exons}) {
        push @ex_regs, {
          start => $e->start,
          end   => $e->end,
        }
      }
    }
  }
  @ex_regs = sort { $a->{start} <=> $b->{start} } @ex_regs;
  my @nr_ex_regs;
  foreach my $e (@ex_regs) {
    if (not @nr_ex_regs or
        $nr_ex_regs[-1]->{end} < $e->{start} - 1) {
      push @nr_ex_regs, $e;
    } else {
      if ($e->{end} > $nr_ex_regs[-1]->{end}) {
        $nr_ex_regs[-1]->{end} = $e->{end};
      }
    }
  }
  
  my $query_slice = $self->query;
  
  my $query_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($query_slice);
  
  my @f = $self->hits;
  
  print STDERR "Fetched ", scalar(@f), " features\n";
  
  @f = grep { $_->percent_id >= $cutoff } @f;
  @f = sort { $a->start <=> $b->start } @f;
  
  my @seq_bits;
  
  foreach my $bit (@{$query_slice->project('seqlevel')}) {
    if (not @seq_bits or $seq_bits[-1]->{end} < $bit->from_start - 1) {
      push @seq_bits, {
        start => $bit->from_start,
        end   => $bit->from_end,
      };
    } else {
      $seq_bits[-1]->{end} = $bit->from_end;
    }
  }
  
  my @ungapped;
  
  foreach my $gf (@f) { 
    #print "YOUR GF is: ",$gf,"\n";
print "start: ",$gf->start,"\n"; print "end: ",$gf->end,"\n"; print "Strand: ",$gf->strand,"\n"; print "hit_start: ",$gf->hstart,"\n"; print "hit_end: ",$gf->hend,"\n"; print "hit_Strand: ",$gf->hstrand,"\n"; foreach my $f ($gf->ungapped_features) { push @ungapped, $f; } } # merge consistent ungapped;
my @merged_ungapped; foreach my $f (@ungapped) { if (not @merged_ungapped or $f->start - $merged_ungapped[-1]->end != $f->hstart - $merged_ungapped[-1]->hend) { push @merged_ungapped, $f; } else { # need to make sure that be merging, we dont extend
# across a gap
my $bridge_start = $merged_ungapped[-1]->end + 1; my $bridge_end = $f->start - 1; my $inside_seq = 0; foreach my $b (@seq_bits) { if ($bridge_start >= $b->{start} and $bridge_end <= $b->{end}) { $inside_seq = 0; last; } } if ($inside_seq) { $merged_ungapped[-1]->end($f->end); $merged_ungapped[-1]->hend($f->hend); } else { push @merged_ungapped, $f; } } } foreach my $f (@merged_ungapped) { push @assembly_writes, [$query_dbid, $target_dbid, $f->start, $f->end, $f->hstart, $f->hend, 1]; } my $mapping_path = $query_slice->coord_system->name . ":" . $query_slice->seq_region_name . "#" . $target_slice->coord_system->name . ":" . $target_slice->coord_system->version; # Update the meta table with the new mapping between the reference chromosome and the haplotype
my $st = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\",\" ".$mapping_path."\")"; push (@mappings, $st); my $contig_mapping_path = $query_slice->coord_system->name . ":" . $query_slice->seq_region_name . "#contig"; my $cst = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\",\" ".$contig_mapping_path."\")"; push (@mappings, $cst); # calculate coverage of query sequence
# temporary add an entry in the coord_system table;
my $rq = $self->ensembl_db->dbc->prepare("select MAX(rank) from coord_system"); #print "select MAX(rank) from coord_system\n";
$rq->execute(); my @ranks = $rq->fetchrow_array; #print "RANKS: ",@ranks,"\n";
my $rank_val = $ranks[0]+1; # Create a new coordinate system where the Haplotype name looks like a version of the reference chromosome
my $cs = "INSERT into coord_system(name,version,rank,attrib) VALUES(\"chromosome\",\" ".$query_slice->seq_region_name."\", ".$rank_val.",\"\" )"; push (@mappings, $cs); my $total_len = 0; my $align_len = 0; map { $total_len += ($_->{end} - $_->{start} + 1) } @seq_bits; map { $align_len += ($_->end - $_->start + 1) } @f; printf("# Total seq length = %d, total align length = %d, = %.2f\%\n", $total_len, $align_len, 100 * ($align_len / $total_len));
# find bits of @seq_bits that are not covered by alignment
foreach my $f (@f) { @seq_bits = $self->remove_from_list($f,\@ seq_bits); } foreach my $bit (@seq_bits) { printf("# Missing: %d %d (%d)\n", $bit->{start}, $bit->{end}, $bit->{end} - $bit->{start} + 1); } my $total_ex_len = 0; my $total_ex_non_aln_len = 0; map { $total_ex_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs; foreach my $f (@f) { @nr_ex_regs = $self->remove_from_list($f,\@ nr_ex_regs, 1); } map { $total_ex_non_aln_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs; printf ("total length = %d\n ",$total_ex_len); printf ("covered length = %d\n ",$total_ex_len - $total_ex_non_aln_len); printf ("percent length = %.2f\n ",100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));
printf("# Total exon length = %d, covered by alignment = %d, = %.2f\n", $total_ex_len, $total_ex_len - $total_ex_non_aln_len, 100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));
#Dirty way to remove duplicates
my %unique_aw; foreach my $all_vals(@assembly_writes){ #print "All vals: ",@$all_vals,"\n";
$unique_aw{join(":",@$all_vals)}=1; } my @unique_assembly_writes; foreach my $uniq(keys %unique_aw){ push (@unique_assembly_writes, $uniq); } # Insert the new assembly mapping between the reference chromosome and the Haplotype
foreach my $el (@unique_assembly_writes) { my @real_vals = split(/:/,$el); my $stl = "INSERT into assembly values(".$real_vals[0].", ".$real_vals[1].", ".$real_vals[2].", ".$real_vals[3].", ".$real_vals[4].", ".$real_vals[5].", ".$real_vals[6].")"; push (@mappings, $stl); } #Get the new coord region for the Haplotype sequence
#my $cr = $self->ensembl_db->dbc->prepare("SELECT coord_system_id from coord_system where name ='chromosome' and version =?");
#$cr->execute($self->query->seq_region_name);
#my @coord_ids = $cr->fetchrow_array;
#print "This is what it returns: ",@coord_ids,"\n";
#$cr->finish;
# Update the Haplotaype sequence coord_system_id in the seq_region table to be able to perform the projection
my $sr = "UPDATE seq_region sr, coord_system cs set sr.coord_system_id =cs.coord_system_id where sr.name = cs.version and sr.name =\"".$self->query->seq_region_name."\""; push (@mappings, $sr); print "TEST REACH POINT 1\n"; return @mappings;
}
newdescriptionprevnextTop
sub new {
    my ($class,@args) = @_;

    my $self = $class->SUPER::new(@args);
    my ($hap_slice,$slice,$input_id) = $self->_rearrange([qw(HAP_SLICE SLICE INPUT_ID)],
					      @args);

    $self->throw("Must input a reference slice and a haplotype slice to HaplotypeProjection") unless (defined($slice) && defined($hap_slice));
    $self->{_final_genes} = [];
    $self->{_gene_types}  = [];

    $self->query($hap_slice);
    $self->target($slice);

    #$self->gene_types($GB_ENSEMBL_INPUT_GENETYPE);
$self->input_id($input_id); return $self; } ############################################################
}
output_dbdescriptionprevnextTop
sub output_db {
 my ($self,$output_db) = @_;

 if ( $output_db ){
   $self->{_output_db} = $output_db;
 }
 
 return $self->{_output_db};
}
querydescriptionprevnextTop
sub query {
  my ($self,$slice) = @_;
  
  if (defined($slice)) {
    $self->{_query} = $slice;
  }
  return $self->{_query};
}
raw_hitsdescriptionprevnextTop
sub raw_hits {
  my ($self,@raw_hits) = @_;
  
  if (@raw_hits) {
    push (@{$self->{_raw_hits}}, @raw_hits);
  }

  return @{$self->{_raw_hits}};
}
remove_from_listdescriptionprevnextTop
sub remove_from_list {
  my ($self, $f, $bits, $hit_coords) = @_;

  # $f will overlap at most one of @b
my $start = $f->start; my $end = $f->end; if (defined $hit_coords and $hit_coords) { $start = $f->hstart; $end = $f->hend; } my $overlap; my @keep; foreach my $bit (@$bits) { if ($start <= $bit->{end} and $end >= $bit->{start}) { $overlap = $bit; } else { push @keep, $bit; } } if (defined $overlap) { if ($start > $overlap->{start} and $end < $overlap->{end}) { my $new_b = { start => $end + 1, end => $overlap->{end}, }; push @keep, $new_b; $overlap->{end} = $start - 1; push @keep, $overlap; } elsif ($start > $overlap->{start}) { $overlap->{end} = $start - 1; push @keep, $overlap; } elsif ($end < $overlap->{end}) { $overlap->{start} = $end + 1; push @keep, $overlap; } } @keep = sort { $a->{start} <=> $b->{start} } @keep; return @keep; } ############################################################
}
targetdescriptionprevnextTop
sub target {
  my ($self,$slice) = @_;
  
  if (defined($slice)) {
    $self->{_target} = $slice;
  }
  return $self->{_target};
}
trim_hitdescriptionprevnextTop
sub trim_hit {
  my ($self, $h, @others) = @_;

  @others = sort { $a->start <=> $b->start } @others;

  # get list of "unique" parts of $h
my (@ov_regs, @cut_feats); foreach my $oh (@others) { if (($oh->end >= $h->start and $oh->start <= $h->end) or ($oh->hend >= $h->hstart and $oh->hstart <= $h->hend)) { my $ov_start = $h->start; $ov_start = $oh->start if $oh->start > $ov_start; my $ov_end = $h->end; $ov_end = $oh->end if $oh->end < $ov_end; my $ov_hstart = $h->hstart; $ov_hstart = $oh->hstart if $oh->hstart > $ov_hstart; my $ov_hend = $h->hend; $ov_hend = $oh->hend if $oh->hend < $ov_hend; my $f1 = $h->restrict_between_positions($ov_start, $ov_end, 'SEQ'); my $f2 = $h->restrict_between_positions($ov_hstart, $ov_hend, 'HSEQ'); if (defined $f2) { $ov_start = $f2->start if $f2->start < $ov_start; $ov_end = $f2->end if $f2->end > $ov_end; } if (defined $f1) { $ov_hstart = $f1->hstart if $f1->hstart < $ov_hstart; $ov_hend = $f1->hend if $f1->hend > $ov_hend; } push @ov_regs, { start => $ov_start, end => $ov_end, hstart => $ov_hstart, hend => $ov_hend, }; } } foreach my $reg (@ov_regs) { } if (@ov_regs) { if ($ov_regs[0]->{start} > $h->{start}) { my $new_f = $h->restrict_between_positions($h->start, $ov_regs[0]->{start} - 1, 'SEQ'); if (defined $new_f) { push @cut_feats, $new_f; } } for(my $i=1; $i < @ov_regs; $i++) { my $new_f = $h->restrict_between_positions($ov_regs[$i-1]->{end} + 1, $ov_regs[$i]->{start} - 1, 'SEQ'); if (defined $new_f) { push @cut_feats, $new_f; } } if ($ov_regs[-1]->{end} < $h->{end}) { my $new_f = $h->restrict_between_positions($ov_regs[-1]->{end} + 1, $h->end, 'SEQ'); if (defined $new_f) { push @cut_feats, $new_f; } } } else { @cut_feats = ($h); } my @all = sort { $a->start <=> $b->start } (@others, @cut_feats); my $consistent = 1; for(my $i=1; $i < @all; $i++) { if ($all[$i]->hstart <= $all[$i-1]->hend) { $consistent = 0; last; } } if ($consistent) { return @cut_feats; } else { return (); } } ############################################################
}
General documentation
CONTACTTop
ensembl-dev@ebi.ac.uk
APPENDIXTop
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _