HaplotypeMapper documentation.

Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection

No package variables defined.

# This is the main analysis database

    my $genebuilder = new Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
      (       
       '-hap_slice' => $self->query,
       '-slice'   => $self->target,
       '-input_id' => $self->input_id,
      );

This module aligned the genomic sequence of a haplotype region and the corresponding
reference chromosome region and projects the gene annotations

sub check_transcript_in_discarded_db {

  my ($self, $tran) = @_;
 
  my @exons = @{$tran->get_all_Exons};

  my $discardedslice = $self->discarded_db->get_SliceAdaptor->fetch_by_region('toplevel',$tran->slice->seq_region_name,$tran->seq_region_start,$tran->seq_region_end);
  #print STDERR "Fetching discarded genes\n"; 
  #print "NUMBER OF DISCARDED GENES: ",scalar(@{$discardedslice->get_all_Genes}),"\n"; 
  DGENE: 
  foreach my $dgene (@{$discardedslice->get_all_Genes}){
    DTRANS:foreach my $dtran (@{$dgene->get_all_Transcripts}){
      my @dexons = @{$dtran->get_all_Exons};
      if(scalar(@exons) == scalar(@dexons)){
        #print "Number of exons: ",scalar(@exons),"\n";
        for (my $i=0; $i < scalar(@exons); $i++){

          if ($exons[$i]->seq_region_start   != $dexons[$i]->seq_region_start ||
              $exons[$i]->strand  != $dexons[$i]->strand ||
              $exons[$i]->seq_region_end     != $dexons[$i]->seq_region_end){
            # if you enter here means that these two transcripts are not the same
            #print "transcript exon coordinates are different\n";
            next DTRANS;
          }
        }
        # If you are here means that both transcripts are the same and $trans must be discarded
        print "transcript found in discarded db\n";
        return 0;
      }else{
      # if you enter here means that these two transcripts are not the same
        #print "transcript number of exons is different\n";
        next DGENE;
      }
    }
  }
  #If we reach here means that no transcript in the discarded db is the same as our transcript so we keep it
  return 1;

}

sub clean_queries {

  my ($self,$sql_q) = @_;
  
  if ($sql_q) {
    push (@{$self->{_clean_queries}}, $sql_q);
  }

  return $self->{_clean_queries};
}

#fetches sequence from appropriate database

}

sub create_alignment {

  my ($self) = @_;

  my $target_slice = $self->target;
  my $query_slice =  $self->query;

  my $target_top_slice = $self->ensembl_db->get_SliceAdaptor->
      fetch_by_region($target_slice->coord_system->name,
                      $target_slice->seq_region_name,
                      undef,
                      undef,
                      1,
                      $target_slice->coord_system->version);
  
  my $query_top_slice = $self->ensembl_db->get_SliceAdaptor->
      fetch_by_region($query_slice->coord_system->name,
                      $query_slice->seq_region_name,
                      undef,
                      undef,
                      1,
                      $query_slice->coord_system->version);
  
#my $ana = $db->get_AnalysisAdaptor->fetch_by_logic_name($logic);
#if (not defined $ana) {
  my $ana = Bio::EnsEMBL::Analysis->new(-logic_name => 'Hap_proj_blastz');
  $self->ensembl_db->get_AnalysisAdaptor->store($ana);
#}
  print "Starting the Blastz alignment. This may take a while so sit and relax.\n";

  my $run = Bio::EnsEMBL::Analysis::Runnable::Blastz->new(
                                                          -analysis => $ana,
                                                          -query => $query_slice, 
                                                          -options => "C=2 B=0",
                                                          -database => [Bio::PrimarySeq->
                                                                        new(
                                                                            -id  => $target_slice->seq_region_name, 
                                                                            -seq => $target_slice->seq)]);
  
  $run->run;
  
  my @hits;
  foreach my $f (@{$run->output}) {
    
    printf("PRELIM: %d %d %d %d %d %d %d\n", 
           $f->start,
           $f->end,
           $f->strand,
           $f->hstart,
           $f->hend,
           $f->hstrand,
           $f->score);
    
    $f->analysis($ana);
    $f->seqname($query_slice->seq_region_name);
    $f->hseqname($target_slice->seq_region_name);
    
    $f->invert($target_slice);
    $f = $f->transfer($target_top_slice);
    $f->invert($query_slice);
    $f = $f->transfer($query_top_slice);

    printf("HIT: %s %d %d %d %s %d %d %d (%d, %.2f)\n", 
           $f->seqname,
           $f->start,
           $f->end,
           $f->strand,
           $f->hseqname,
           $f->hstart,
           $f->hend,
           $f->hstrand, 
           $f->score,
           $f->percent_id);

    # BEWARE this is a partial solution that may not really fix the problem
    # May be better just to skip this cases and not add them to the @hits
    
      push @hits, $f;
      
  }
  
  @hits = sort { $a->start <=> $b->start } @hits;
  
  $self->raw_hits(@hits);

}

############################################################

}

sub discarded_db {

  my ($self, $discarded_db) = @_;

  if ( $discarded_db ){
    $self->{_discarded_db} = $discarded_db;;
  }

  return $self->{_discarded_db};
}


############################################################

}

sub ensembl_db {

 my ($self,$ensembl_db) = @_;
 if ( $ensembl_db ){
   $self->{_ensembl_db} = $ensembl_db;
 }
 
 return $self->{_ensembl_db};

}

sub features {

  my ($self,@features) = @_;
  
  if (!defined($self->{_feature})) {
    $self->{_feature} = [];
  }
  if ( scalar @features ) {
    push(@{$self->{_feature}},@features);
  }
  return @{$self->{_feature}};
}

############################################################

}

sub fetch_sequence {

  my ($self, $name, $db) = @_;

  my $sa = $db->get_SliceAdaptor; 

  my $slice = $sa->fetch_by_name($name);

  return $slice;
}

1;

}

sub filter_alignment {

  my ($self) = @_;

  my @hits = $self->raw_hits;

  my @keep = sort { $a->start <=> $b->start } @hits;
    
  @keep = sort { $b->score <=> $a->score } @hits;

  my @retained;
  
  foreach my $kept(@keep){ 
    push @retained, $self->trim_hit($kept, @retained);
  }

  @retained = sort { $a->start <=> $b->start } @retained;

  #Check integrity of retained hit. Check that end is bigger than start
  my @final_hits;

  foreach my $retained (@retained){

    unless($retained->hend < $retained->hstart && $retained->end < $retained->start){
      push (@final_hits,$retained);
    }
    $self->hits(@final_hits);
  }

}

sub gene_types {

  my ($self,$type) = @_;

  if (defined($type)) {
     push(@{$self->{_gene_types}},$type);
  }

  return @{$self->{_gene_types}};

}

sub hits {

  my ($self,@hits) = @_;

  if (@hits) {
    push (@{$self->{_hits}}, @hits);
  }

  return @{$self->{_hits}};

}

sub input_id {

  my ($self,$id) = @_;
  
  if (defined($id)) {
    $self->{_input_id} = $id;
  }
  return $self->{_input_id};
}
############################################################

}

sub load_tables {

  my ($self, @mappings) = @_;

  print "REACH TEST 3\n";  
  print "MAppings: ",@mappings,"\n";
  foreach my $map(@mappings){
    print "YOUR SQL: ",$map,"\n";
    my $loading = $self->ensembl_db->dbc->prepare($map);
    $loading->execute();
    $loading->finish;
  }
  
  return 1;
}

############################################################
#
# GETSET METHODS
#
############################################################

# get/set method holding a reference to the db with genewise and combined genes,
# havana genes and discarded genes
# this reference is set in Bio::EnsEMBL::Analysis::RunnableDB::HaplotypeProjection

}

sub make_map_regions {

  my ($self) = @_;
  
  my @mappings;
  my @assembly_writes;
  my $cutoff = 50.0;
  
  my $target_slice =  $self->target;
  
  my $tl_target_slice = $self->ensembl_db->get_SliceAdaptor->fetch_by_region($target_slice->coord_system->name,
                                                          $target_slice->seq_region_name);
  
  my $target_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($tl_target_slice);
  
  my @genes = @{$target_slice->get_all_Genes};
  @genes = map { $_->transfer($tl_target_slice) } @genes;
  
  my @ex_regs;
  foreach my $g (@genes) {
    foreach my $t (@{$g->get_all_Transcripts}) {
      foreach my $e (@{$t->get_all_Exons}) {
        push @ex_regs, {
          start => $e->start,
          end   => $e->end,
        }
      }
    }
  }
  @ex_regs = sort { $a->{start} <=> $b->{start} } @ex_regs;
  my @nr_ex_regs;
  foreach my $e (@ex_regs) {
    if (not @nr_ex_regs or
        $nr_ex_regs[-1]->{end} < $e->{start} - 1) {
      push @nr_ex_regs, $e;
    } else {
      if ($e->{end} > $nr_ex_regs[-1]->{end}) {
        $nr_ex_regs[-1]->{end} = $e->{end};
      }
    }
  }
  
  my $query_slice = $self->query;
  
  my $query_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($query_slice);
  
  my @f = $self->hits;
  
  print STDERR "Fetched ", scalar(@f), " features\n";
  
  @f = grep { $_->percent_id >= $cutoff } @f;
  @f = sort { $a->start <=> $b->start } @f;
  
  my @seq_bits;
  
  foreach my $bit (@{$query_slice->project('seqlevel')}) {
    if (not @seq_bits or $seq_bits[-1]->{end} < $bit->from_start - 1) {
      push @seq_bits, {
        start => $bit->from_start,
        end   => $bit->from_end,
      };
    } else {
      $seq_bits[-1]->{end} = $bit->from_end;
    }
  }
  
  my @ungapped;
  
  foreach my $gf (@f) { 
    #print "YOUR GF is: ",$gf,"\n";
    print "start: ",$gf->start,"\n";
    print "end: ",$gf->end,"\n";
    print "Strand: ",$gf->strand,"\n";
    print "hit_start: ",$gf->hstart,"\n";
    print "hit_end: ",$gf->hend,"\n";
    print "hit_Strand: ",$gf->hstrand,"\n";

    foreach my $f ($gf->ungapped_features) {
      push @ungapped, $f;
    }
  }
  
  # merge consistent ungapped;
  my @merged_ungapped;
  foreach my $f (@ungapped) {
    if (not @merged_ungapped or
        $f->start - $merged_ungapped[-1]->end !=
        $f->hstart - $merged_ungapped[-1]->hend) {
      push @merged_ungapped, $f;
    } else {
      # need to make sure that be merging, we dont extend
      # across a gap
      my $bridge_start = $merged_ungapped[-1]->end + 1;
      my $bridge_end   = $f->start - 1;
      
      my $inside_seq = 0;
      foreach my $b (@seq_bits) {
        if ($bridge_start >= $b->{start} and
            $bridge_end   <= $b->{end}) {
          $inside_seq = 0;
          last;
        }
      }
      
      if ($inside_seq) {
        $merged_ungapped[-1]->end($f->end);
        $merged_ungapped[-1]->hend($f->hend);
      } else {
        push @merged_ungapped, $f;
      }
    }
  }
  
  foreach my $f (@merged_ungapped) {
    push @assembly_writes, [$query_dbid,
                            $target_dbid, 
                            $f->start, 
                            $f->end,
                            $f->hstart, 
                            $f->hend, 
                            1];
  }
  
  my $mapping_path = $query_slice->coord_system->name 
      . ":" 
      . $query_slice->seq_region_name 
      . "#" 
      . $target_slice->coord_system->name 
      . ":" 
      . $target_slice->coord_system->version;
  
  # Update the meta table with the new mapping between the reference chromosome and the haplotype
  my $st = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\",\" ".$mapping_path."\")"; 
  
  push (@mappings, $st);
  
  my $contig_mapping_path = $query_slice->coord_system->name 
      . ":" 
      . $query_slice->seq_region_name 
      . "#contig"; 

  my $cst = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\",\" ".$contig_mapping_path."\")"; 
  
  push (@mappings, $cst);


  # calculate coverage of query sequence
  
  # temporary add an entry in the coord_system table;
      
  my $rq = $self->ensembl_db->dbc->prepare("select MAX(rank) from coord_system");
  #print "select MAX(rank) from coord_system\n";
  $rq->execute();
  my @ranks  = $rq->fetchrow_array;
  #print "RANKS: ",@ranks,"\n";
  
  my $rank_val = $ranks[0]+1;
  
  # Create a new coordinate system where the Haplotype name looks like a version of the reference chromosome
  my $cs = "INSERT into coord_system(name,version,rank,attrib) VALUES(\"chromosome\",\" ".$query_slice->seq_region_name."\", ".$rank_val.",\"\" )";
  push (@mappings, $cs);

  my $total_len = 0;
  my $align_len = 0;
  
  map { $total_len += ($_->{end} - $_->{start} + 1) } @seq_bits;
  map { $align_len += ($_->end - $_->start + 1) } @f;
  
  printf("# Total seq length = %d, total align length = %d, = %.2f\%\n", 
         $total_len,
         $align_len,
         100 * ($align_len / $total_len));
  
  
  # find bits of @seq_bits that are not covered by alignment
  foreach my $f (@f) {
    @seq_bits = $self->remove_from_list($f,\@ seq_bits);
  }
  
  foreach my $bit (@seq_bits) {
    printf("# Missing: %d %d (%d)\n", 
           $bit->{start}, 
           $bit->{end}, $bit->{end} - $bit->{start} + 1);
  }
  
  my $total_ex_len = 0;
  my $total_ex_non_aln_len = 0;
  
  map { $total_ex_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs;
  foreach my $f (@f) {
    @nr_ex_regs = $self->remove_from_list($f,\@ nr_ex_regs, 1);
  }
  map { $total_ex_non_aln_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs;
  printf ("total length = %d\n ",$total_ex_len);
  printf ("covered length = %d\n ",$total_ex_len - $total_ex_non_aln_len);
  printf ("percent length = %.2f\n ",100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));

  printf("# Total exon length = %d, covered by alignment = %d, = %.2f\n", 
         $total_ex_len, 
         $total_ex_len - $total_ex_non_aln_len,
         100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));

  #Dirty way to remove duplicates
  my %unique_aw;
  foreach my $all_vals(@assembly_writes){
    #print "All vals: ",@$all_vals,"\n";
    $unique_aw{join(":",@$all_vals)}=1;
  }    
  
  my @unique_assembly_writes;
  foreach my $uniq(keys %unique_aw){
    push (@unique_assembly_writes, $uniq); 
    
  }

  # Insert the new assembly mapping between the reference chromosome and the Haplotype 
  foreach my $el (@unique_assembly_writes) {
    my @real_vals = split(/:/,$el);
    my $stl = "INSERT into assembly values(".$real_vals[0].", ".$real_vals[1].", ".$real_vals[2].", ".$real_vals[3].", ".$real_vals[4].", ".$real_vals[5].", ".$real_vals[6].")";
  push (@mappings, $stl);

  } 

  #Get the new coord region for the Haplotype sequence
  #my $cr = $self->ensembl_db->dbc->prepare("SELECT coord_system_id from coord_system where name ='chromosome' and version =?");
  #$cr->execute($self->query->seq_region_name);
  #my @coord_ids = $cr->fetchrow_array;
  #print "This is what it returns: ",@coord_ids,"\n";
  #$cr->finish;

  # Update the Haplotaype sequence coord_system_id in the seq_region table to be able to perform the projection
  my $sr = "UPDATE seq_region sr, coord_system cs set sr.coord_system_id =cs.coord_system_id where sr.name = cs.version and sr.name =\"".$self->query->seq_region_name."\"";
  push (@mappings, $sr);

  print "TEST REACH POINT 1\n";

  return @mappings;

}

sub new {

    my ($class,@args) = @_;

    my $self = $class->SUPER::new(@args);
    my ($hap_slice,$slice,$input_id) = $self->_rearrange([qw(HAP_SLICE SLICE INPUT_ID)],
					      @args);

    $self->throw("Must input a reference slice and a haplotype slice to HaplotypeProjection") unless (defined($slice) && defined($hap_slice));
    $self->{_final_genes} = [];
    $self->{_gene_types}  = [];

    $self->query($hap_slice);
    $self->target($slice);

    #$self->gene_types($GB_ENSEMBL_INPUT_GENETYPE);
  
    $self->input_id($input_id);

    return $self;
}

############################################################

}

sub output_db {

 my ($self,$output_db) = @_;

 if ( $output_db ){
   $self->{_output_db} = $output_db;
 }
 
 return $self->{_output_db};

}

sub query {

  my ($self,$slice) = @_;
  
  if (defined($slice)) {
    $self->{_query} = $slice;
  }
  return $self->{_query};

}

sub raw_hits {

  my ($self,@raw_hits) = @_;
  
  if (@raw_hits) {
    push (@{$self->{_raw_hits}}, @raw_hits);
  }

  return @{$self->{_raw_hits}};

}

sub remove_from_list {

  my ($self, $f, $bits, $hit_coords) = @_;

  # $f will overlap at most one of @b

  my $start = $f->start;
  my $end   = $f->end;
  if (defined $hit_coords and $hit_coords) {
    $start = $f->hstart;
    $end   = $f->hend;
  }

  my $overlap; 
  my @keep;

  foreach my $bit (@$bits) {
    if ($start <= $bit->{end} and
        $end   >= $bit->{start}) {
      $overlap = $bit;
    } else {
      push @keep, $bit;
    }
  }

  if (defined $overlap) {
    if ($start > $overlap->{start} and 
        $end < $overlap->{end}) {
      my $new_b = {
        start => $end + 1,
        end   => $overlap->{end},
      };      
      push @keep, $new_b;
      $overlap->{end} = $start - 1;
      push @keep, $overlap;
    } elsif ($start > $overlap->{start}) {
      $overlap->{end} = $start - 1;
      push @keep, $overlap;
    } elsif ($end < $overlap->{end}) {
      $overlap->{start} = $end + 1;
      push @keep, $overlap;
    }
  }

  @keep = sort { $a->{start} <=> $b->{start} } @keep;

  return @keep;
}

############################################################

}

sub target {

  my ($self,$slice) = @_;
  
  if (defined($slice)) {
    $self->{_target} = $slice;
  }
  return $self->{_target};

}

sub trim_hit {

  my ($self, $h, @others) = @_;

  @others = sort { $a->start <=> $b->start } @others;

  # get list of "unique" parts of $h
  my (@ov_regs, @cut_feats);

  foreach my $oh (@others) {
    if (($oh->end >= $h->start and $oh->start <= $h->end) or
        ($oh->hend >= $h->hstart and $oh->hstart <= $h->hend))  { 
      
      my $ov_start = $h->start;
      $ov_start = $oh->start if $oh->start > $ov_start;
      
      my $ov_end = $h->end;
      $ov_end = $oh->end if $oh->end < $ov_end;
      
      my $ov_hstart = $h->hstart;
      $ov_hstart = $oh->hstart if $oh->hstart > $ov_hstart;
      
      my $ov_hend = $h->hend;
      $ov_hend = $oh->hend if $oh->hend < $ov_hend;

      my $f1 = $h->restrict_between_positions($ov_start,
                                              $ov_end,
                                              'SEQ');
      my $f2 = $h->restrict_between_positions($ov_hstart,
                                              $ov_hend,
                                              'HSEQ');
      if (defined $f2) {
        $ov_start = $f2->start if $f2->start < $ov_start;
        $ov_end   = $f2->end   if $f2->end   > $ov_end;
      }
      if (defined $f1) {
        $ov_hstart = $f1->hstart if $f1->hstart < $ov_hstart;
        $ov_hend   = $f1->hend if $f1->hend > $ov_hend;
      }
      
      push @ov_regs, {
        start => $ov_start,
        end   => $ov_end,
        hstart => $ov_hstart,
        hend   => $ov_hend,
      };
    }
  }
  
  foreach my $reg (@ov_regs) {

  }


  if (@ov_regs) {
    if ($ov_regs[0]->{start} > $h->{start}) {
      my $new_f = $h->restrict_between_positions($h->start,
                                                 $ov_regs[0]->{start} - 1,
                                                 'SEQ');
      if (defined $new_f) {
        push @cut_feats, $new_f;
      }
    }
    for(my $i=1; $i < @ov_regs; $i++) {      
      my $new_f =  $h->restrict_between_positions($ov_regs[$i-1]->{end} + 1,
                                                  $ov_regs[$i]->{start} - 1,
                                                  'SEQ');
      if (defined $new_f) { 
        push @cut_feats, $new_f;
      }
    }
    if ($ov_regs[-1]->{end} < $h->{end}) {
      my $new_f =  $h->restrict_between_positions($ov_regs[-1]->{end} + 1,
                                                  $h->end,
                                                  'SEQ');
      if (defined $new_f) {
        push @cut_feats, $new_f;
      }
    }
  } else {
    @cut_feats = ($h);
  }

  my @all = sort { $a->start <=> $b->start } (@others, @cut_feats);
  my $consistent = 1;
  for(my $i=1; $i < @all; $i++) {
    if ($all[$i]->hstart <= $all[$i-1]->hend) {
      $consistent = 0;
      last;
    }
  }



  if ($consistent) {
    return @cut_feats;
  } else {
    return ();
  }
}

 

############################################################

}

General documentation

The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _

check_transcript_in_discarded_db	No description	Code
clean_queries	No description	Code
create_alignment	Description	Code
discarded_db	No description	Code
ensembl_db	No description	Code
features	No description	Code
fetch_sequence	No description	Code
filter_alignment	Description	Code
gene_types	Description	Code
hits	No description	Code
input_id	Description	Code
load_tables	No description	Code
make_map_regions	Description	Code
new	No description	Code
output_db	No description	Code
query	No description	Code
raw_hits	No description	Code
remove_from_list	No description	Code
target	No description	Code
trim_hit	No description	Code