Raw content of Bio::EnsEMBL::Analysis::Runnable::HaplotypeMapper
#
#
# BioPerl module for GeneBuilder
#
# Cared for by EnsEMBL
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
=head1 SYNOPSIS
# This is the main analysis database
my $genebuilder = new Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
(
'-hap_slice' => $self->query,
'-slice' => $self->target,
'-input_id' => $self->input_id,
);
=head1 DESCRIPTION
This module aligned the genomic sequence of a haplotype region and the corresponding
reference chromosome region and projects the gene annotations
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::Analysis::Runnable::HaplotypeMapper;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Slice;
use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Attribute;
use Bio::EnsEMBL::DnaDnaAlignFeature;
use Bio::EnsEMBL::Analysis::Runnable::Blastz;
use Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster;
use Bio::EnsEMBL::Analysis::Config::HaplotypeProjection qw (
);
use vars qw(@ISA);
use strict;
@ISA = qw(Bio::EnsEMBL::Root);
############################################################
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($hap_slice,$slice,$input_id) = $self->_rearrange([qw(HAP_SLICE SLICE INPUT_ID)],
@args);
$self->throw("Must input a reference slice and a haplotype slice to HaplotypeProjection") unless (defined($slice) && defined($hap_slice));
$self->{_final_genes} = [];
$self->{_gene_types} = [];
$self->query($hap_slice);
$self->target($slice);
#$self->gene_types($GB_ENSEMBL_INPUT_GENETYPE);
$self->input_id($input_id);
return $self;
}
############################################################
=head2 input_id
Function: get/set for input id
Returns : string
Args : string (it expects a string of the format chr_name.start_coord-end_coord
=cut
sub input_id {
my ($self,$id) = @_;
if (defined($id)) {
$self->{_input_id} = $id;
}
return $self->{_input_id};
}
############################################################
=head2 create_alignment
Description: Generates Blastz alignment between the ref and the hap sequences
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
=cut
sub create_alignment {
my ($self) = @_;
my $target_slice = $self->target;
my $query_slice = $self->query;
my $target_top_slice = $self->ensembl_db->get_SliceAdaptor->
fetch_by_region($target_slice->coord_system->name,
$target_slice->seq_region_name,
undef,
undef,
1,
$target_slice->coord_system->version);
my $query_top_slice = $self->ensembl_db->get_SliceAdaptor->
fetch_by_region($query_slice->coord_system->name,
$query_slice->seq_region_name,
undef,
undef,
1,
$query_slice->coord_system->version);
#my $ana = $db->get_AnalysisAdaptor->fetch_by_logic_name($logic);
#if (not defined $ana) {
my $ana = Bio::EnsEMBL::Analysis->new(-logic_name => 'Hap_proj_blastz');
$self->ensembl_db->get_AnalysisAdaptor->store($ana);
#}
print "Starting the Blastz alignment. This may take a while so sit and relax.\n";
my $run = Bio::EnsEMBL::Analysis::Runnable::Blastz->new(
-analysis => $ana,
-query => $query_slice,
-options => "C=2 B=0",
-database => [Bio::PrimarySeq->
new(
-id => $target_slice->seq_region_name,
-seq => $target_slice->seq)]);
$run->run;
my @hits;
foreach my $f (@{$run->output}) {
printf("PRELIM: %d %d %d %d %d %d %d\n",
$f->start,
$f->end,
$f->strand,
$f->hstart,
$f->hend,
$f->hstrand,
$f->score);
$f->analysis($ana);
$f->seqname($query_slice->seq_region_name);
$f->hseqname($target_slice->seq_region_name);
$f->invert($target_slice);
$f = $f->transfer($target_top_slice);
$f->invert($query_slice);
$f = $f->transfer($query_top_slice);
printf("HIT: %s %d %d %d %s %d %d %d (%d, %.2f)\n",
$f->seqname,
$f->start,
$f->end,
$f->strand,
$f->hseqname,
$f->hstart,
$f->hend,
$f->hstrand,
$f->score,
$f->percent_id);
# BEWARE this is a partial solution that may not really fix the problem
# May be better just to skip this cases and not add them to the @hits
push @hits, $f;
}
@hits = sort { $a->start <=> $b->start } @hits;
$self->raw_hits(@hits);
}
############################################################
=head2 filter_alignment
Description: Filters the alignments between .
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
=cut
sub filter_alignment {
my ($self) = @_;
my @hits = $self->raw_hits;
my @keep = sort { $a->start <=> $b->start } @hits;
@keep = sort { $b->score <=> $a->score } @hits;
my @retained;
foreach my $kept(@keep){
push @retained, $self->trim_hit($kept, @retained);
}
@retained = sort { $a->start <=> $b->start } @retained;
#Check integrity of retained hit. Check that end is bigger than start
my @final_hits;
foreach my $retained (@retained){
unless($retained->hend < $retained->hstart && $retained->end < $retained->start){
push (@final_hits,$retained);
}
$self->hits(@final_hits);
}
}
sub trim_hit {
my ($self, $h, @others) = @_;
@others = sort { $a->start <=> $b->start } @others;
# get list of "unique" parts of $h
my (@ov_regs, @cut_feats);
foreach my $oh (@others) {
if (($oh->end >= $h->start and $oh->start <= $h->end) or
($oh->hend >= $h->hstart and $oh->hstart <= $h->hend)) {
my $ov_start = $h->start;
$ov_start = $oh->start if $oh->start > $ov_start;
my $ov_end = $h->end;
$ov_end = $oh->end if $oh->end < $ov_end;
my $ov_hstart = $h->hstart;
$ov_hstart = $oh->hstart if $oh->hstart > $ov_hstart;
my $ov_hend = $h->hend;
$ov_hend = $oh->hend if $oh->hend < $ov_hend;
my $f1 = $h->restrict_between_positions($ov_start,
$ov_end,
'SEQ');
my $f2 = $h->restrict_between_positions($ov_hstart,
$ov_hend,
'HSEQ');
if (defined $f2) {
$ov_start = $f2->start if $f2->start < $ov_start;
$ov_end = $f2->end if $f2->end > $ov_end;
}
if (defined $f1) {
$ov_hstart = $f1->hstart if $f1->hstart < $ov_hstart;
$ov_hend = $f1->hend if $f1->hend > $ov_hend;
}
push @ov_regs, {
start => $ov_start,
end => $ov_end,
hstart => $ov_hstart,
hend => $ov_hend,
};
}
}
foreach my $reg (@ov_regs) {
}
if (@ov_regs) {
if ($ov_regs[0]->{start} > $h->{start}) {
my $new_f = $h->restrict_between_positions($h->start,
$ov_regs[0]->{start} - 1,
'SEQ');
if (defined $new_f) {
push @cut_feats, $new_f;
}
}
for(my $i=1; $i < @ov_regs; $i++) {
my $new_f = $h->restrict_between_positions($ov_regs[$i-1]->{end} + 1,
$ov_regs[$i]->{start} - 1,
'SEQ');
if (defined $new_f) {
push @cut_feats, $new_f;
}
}
if ($ov_regs[-1]->{end} < $h->{end}) {
my $new_f = $h->restrict_between_positions($ov_regs[-1]->{end} + 1,
$h->end,
'SEQ');
if (defined $new_f) {
push @cut_feats, $new_f;
}
}
} else {
@cut_feats = ($h);
}
my @all = sort { $a->start <=> $b->start } (@others, @cut_feats);
my $consistent = 1;
for(my $i=1; $i < @all; $i++) {
if ($all[$i]->hstart <= $all[$i-1]->hend) {
$consistent = 0;
last;
}
}
if ($consistent) {
return @cut_feats;
} else {
return ();
}
}
############################################################
=head2 make_map_regions
Description: retrieves ensembl and havana gene annotations with supporting evidence.
ReturnType : none, but $self->combined_Transcripts is filled
Args : none
=cut
sub make_map_regions {
my ($self) = @_;
my @mappings;
my @assembly_writes;
my $cutoff = 50.0;
my $target_slice = $self->target;
my $tl_target_slice = $self->ensembl_db->get_SliceAdaptor->fetch_by_region($target_slice->coord_system->name,
$target_slice->seq_region_name);
my $target_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($tl_target_slice);
my @genes = @{$target_slice->get_all_Genes};
@genes = map { $_->transfer($tl_target_slice) } @genes;
my @ex_regs;
foreach my $g (@genes) {
foreach my $t (@{$g->get_all_Transcripts}) {
foreach my $e (@{$t->get_all_Exons}) {
push @ex_regs, {
start => $e->start,
end => $e->end,
}
}
}
}
@ex_regs = sort { $a->{start} <=> $b->{start} } @ex_regs;
my @nr_ex_regs;
foreach my $e (@ex_regs) {
if (not @nr_ex_regs or
$nr_ex_regs[-1]->{end} < $e->{start} - 1) {
push @nr_ex_regs, $e;
} else {
if ($e->{end} > $nr_ex_regs[-1]->{end}) {
$nr_ex_regs[-1]->{end} = $e->{end};
}
}
}
my $query_slice = $self->query;
my $query_dbid = $self->ensembl_db->get_SliceAdaptor->get_seq_region_id($query_slice);
my @f = $self->hits;
print STDERR "Fetched ", scalar(@f), " features\n";
@f = grep { $_->percent_id >= $cutoff } @f;
@f = sort { $a->start <=> $b->start } @f;
my @seq_bits;
foreach my $bit (@{$query_slice->project('seqlevel')}) {
if (not @seq_bits or $seq_bits[-1]->{end} < $bit->from_start - 1) {
push @seq_bits, {
start => $bit->from_start,
end => $bit->from_end,
};
} else {
$seq_bits[-1]->{end} = $bit->from_end;
}
}
my @ungapped;
foreach my $gf (@f) {
#print "YOUR GF is: ",$gf,"\n";
print "start: ",$gf->start,"\n";
print "end: ",$gf->end,"\n";
print "Strand: ",$gf->strand,"\n";
print "hit_start: ",$gf->hstart,"\n";
print "hit_end: ",$gf->hend,"\n";
print "hit_Strand: ",$gf->hstrand,"\n";
foreach my $f ($gf->ungapped_features) {
push @ungapped, $f;
}
}
# merge consistent ungapped;
my @merged_ungapped;
foreach my $f (@ungapped) {
if (not @merged_ungapped or
$f->start - $merged_ungapped[-1]->end !=
$f->hstart - $merged_ungapped[-1]->hend) {
push @merged_ungapped, $f;
} else {
# need to make sure that be merging, we dont extend
# across a gap
my $bridge_start = $merged_ungapped[-1]->end + 1;
my $bridge_end = $f->start - 1;
my $inside_seq = 0;
foreach my $b (@seq_bits) {
if ($bridge_start >= $b->{start} and
$bridge_end <= $b->{end}) {
$inside_seq = 0;
last;
}
}
if ($inside_seq) {
$merged_ungapped[-1]->end($f->end);
$merged_ungapped[-1]->hend($f->hend);
} else {
push @merged_ungapped, $f;
}
}
}
foreach my $f (@merged_ungapped) {
push @assembly_writes, [$query_dbid,
$target_dbid,
$f->start,
$f->end,
$f->hstart,
$f->hend,
1];
}
my $mapping_path = $query_slice->coord_system->name
. ":"
. $query_slice->seq_region_name
. "#"
. $target_slice->coord_system->name
. ":"
. $target_slice->coord_system->version;
# Update the meta table with the new mapping between the reference chromosome and the haplotype
my $st = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\", \"".$mapping_path."\")";
push (@mappings, $st);
my $contig_mapping_path = $query_slice->coord_system->name
. ":"
. $query_slice->seq_region_name
. "#contig";
my $cst = "INSERT into meta(meta_key, meta_value) VALUES(\"assembly.mapping\", \"".$contig_mapping_path."\")";
push (@mappings, $cst);
# calculate coverage of query sequence
# temporary add an entry in the coord_system table;
my $rq = $self->ensembl_db->dbc->prepare("select MAX(rank) from coord_system");
#print "select MAX(rank) from coord_system\n";
$rq->execute();
my @ranks = $rq->fetchrow_array;
#print "RANKS: ",@ranks,"\n";
my $rank_val = $ranks[0]+1;
# Create a new coordinate system where the Haplotype name looks like a version of the reference chromosome
my $cs = "INSERT into coord_system(name,version,rank,attrib) VALUES(\"chromosome\", \"".$query_slice->seq_region_name."\", ".$rank_val.",\" \")";
push (@mappings, $cs);
my $total_len = 0;
my $align_len = 0;
map { $total_len += ($_->{end} - $_->{start} + 1) } @seq_bits;
map { $align_len += ($_->end - $_->start + 1) } @f;
printf("# Total seq length = %d, total align length = %d, = %.2f\%\n",
$total_len,
$align_len,
100 * ($align_len / $total_len));
# find bits of @seq_bits that are not covered by alignment
foreach my $f (@f) {
@seq_bits = $self->remove_from_list($f, \@seq_bits);
}
foreach my $bit (@seq_bits) {
printf("# Missing: %d %d (%d)\n",
$bit->{start},
$bit->{end}, $bit->{end} - $bit->{start} + 1);
}
my $total_ex_len = 0;
my $total_ex_non_aln_len = 0;
map { $total_ex_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs;
foreach my $f (@f) {
@nr_ex_regs = $self->remove_from_list($f, \@nr_ex_regs, 1);
}
map { $total_ex_non_aln_len += ($_->{end} - $_->{start} + 1) } @nr_ex_regs;
printf ("total length = %d \n",$total_ex_len);
printf ("covered length = %d \n",$total_ex_len - $total_ex_non_aln_len);
printf ("percent length = %.2f \n",100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));
printf("# Total exon length = %d, covered by alignment = %d, = %.2f\n",
$total_ex_len,
$total_ex_len - $total_ex_non_aln_len,
100 * (($total_ex_len - $total_ex_non_aln_len) / $total_ex_len));
#Dirty way to remove duplicates
my %unique_aw;
foreach my $all_vals(@assembly_writes){
#print "All vals: ",@$all_vals,"\n";
$unique_aw{join(":",@$all_vals)}=1;
}
my @unique_assembly_writes;
foreach my $uniq(keys %unique_aw){
push (@unique_assembly_writes, $uniq);
}
# Insert the new assembly mapping between the reference chromosome and the Haplotype
foreach my $el (@unique_assembly_writes) {
my @real_vals = split(/:/,$el);
my $stl = "INSERT into assembly values(".$real_vals[0].", ".$real_vals[1].", ".$real_vals[2].", ".$real_vals[3].", ".$real_vals[4].", ".$real_vals[5].", ".$real_vals[6].")";
push (@mappings, $stl);
}
#Get the new coord region for the Haplotype sequence
#my $cr = $self->ensembl_db->dbc->prepare("SELECT coord_system_id from coord_system where name ='chromosome' and version =?");
#$cr->execute($self->query->seq_region_name);
#my @coord_ids = $cr->fetchrow_array;
#print "This is what it returns: ",@coord_ids,"\n";
#$cr->finish;
# Update the Haplotaype sequence coord_system_id in the seq_region table to be able to perform the projection
my $sr = "UPDATE seq_region sr, coord_system cs set sr.coord_system_id =cs.coord_system_id where sr.name = cs.version and sr.name =\"".$self->query->seq_region_name."\"";
push (@mappings, $sr);
print "TEST REACH POINT 1\n";
return @mappings;
}
sub remove_from_list {
my ($self, $f, $bits, $hit_coords) = @_;
# $f will overlap at most one of @b
my $start = $f->start;
my $end = $f->end;
if (defined $hit_coords and $hit_coords) {
$start = $f->hstart;
$end = $f->hend;
}
my $overlap;
my @keep;
foreach my $bit (@$bits) {
if ($start <= $bit->{end} and
$end >= $bit->{start}) {
$overlap = $bit;
} else {
push @keep, $bit;
}
}
if (defined $overlap) {
if ($start > $overlap->{start} and
$end < $overlap->{end}) {
my $new_b = {
start => $end + 1,
end => $overlap->{end},
};
push @keep, $new_b;
$overlap->{end} = $start - 1;
push @keep, $overlap;
} elsif ($start > $overlap->{start}) {
$overlap->{end} = $start - 1;
push @keep, $overlap;
} elsif ($end < $overlap->{end}) {
$overlap->{start} = $end + 1;
push @keep, $overlap;
}
}
@keep = sort { $a->{start} <=> $b->{start} } @keep;
return @keep;
}
############################################################
sub check_transcript_in_discarded_db{
my ($self, $tran) = @_;
my @exons = @{$tran->get_all_Exons};
my $discardedslice = $self->discarded_db->get_SliceAdaptor->fetch_by_region('toplevel',$tran->slice->seq_region_name,$tran->seq_region_start,$tran->seq_region_end);
#print STDERR "Fetching discarded genes\n";
#print "NUMBER OF DISCARDED GENES: ",scalar(@{$discardedslice->get_all_Genes}),"\n";
DGENE:
foreach my $dgene (@{$discardedslice->get_all_Genes}){
DTRANS:foreach my $dtran (@{$dgene->get_all_Transcripts}){
my @dexons = @{$dtran->get_all_Exons};
if(scalar(@exons) == scalar(@dexons)){
#print "Number of exons: ",scalar(@exons),"\n";
for (my $i=0; $i < scalar(@exons); $i++){
if ($exons[$i]->seq_region_start != $dexons[$i]->seq_region_start ||
$exons[$i]->strand != $dexons[$i]->strand ||
$exons[$i]->seq_region_end != $dexons[$i]->seq_region_end){
# if you enter here means that these two transcripts are not the same
#print "transcript exon coordinates are different\n";
next DTRANS;
}
}
# If you are here means that both transcripts are the same and $trans must be discarded
print "transcript found in discarded db\n";
return 0;
}else{
# if you enter here means that these two transcripts are not the same
#print "transcript number of exons is different\n";
next DGENE;
}
}
}
#If we reach here means that no transcript in the discarded db is the same as our transcript so we keep it
return 1;
}
sub load_tables {
my ($self, @mappings) = @_;
print "REACH TEST 3\n";
print "MAppings: ",@mappings,"\n";
foreach my $map(@mappings){
print "YOUR SQL: ",$map,"\n";
my $loading = $self->ensembl_db->dbc->prepare($map);
$loading->execute();
$loading->finish;
}
return 1;
}
############################################################
#
# GETSET METHODS
#
############################################################
# get/set method holding a reference to the db with genewise and combined genes,
# havana genes and discarded genes
# this reference is set in Bio::EnsEMBL::Analysis::RunnableDB::HaplotypeProjection
sub ensembl_db{
my ($self,$ensembl_db) = @_;
if ( $ensembl_db ){
$self->{_ensembl_db} = $ensembl_db;
}
return $self->{_ensembl_db};
}
sub output_db{
my ($self,$output_db) = @_;
if ( $output_db ){
$self->{_output_db} = $output_db;
}
return $self->{_output_db};
}
sub discarded_db{
my ($self, $discarded_db) = @_;
if ( $discarded_db ){
$self->{_discarded_db} = $discarded_db;;
}
return $self->{_discarded_db};
}
############################################################
=head2 gene_types
Description: get/set for the type(s) of genes (usually TGE_gw, similarity_genewise and combined_e2g genes)
to be used in the genebuilder they get set in new()
Does not include the ab inition predictions
=cut
sub gene_types {
my ($self,$type) = @_;
if (defined($type)) {
push(@{$self->{_gene_types}},$type);
}
return @{$self->{_gene_types}};
}
sub features {
my ($self,@features) = @_;
if (!defined($self->{_feature})) {
$self->{_feature} = [];
}
if ( scalar @features ) {
push(@{$self->{_feature}},@features);
}
return @{$self->{_feature}};
}
############################################################
sub query {
my ($self,$slice) = @_;
if (defined($slice)) {
$self->{_query} = $slice;
}
return $self->{_query};
}
sub target {
my ($self,$slice) = @_;
if (defined($slice)) {
$self->{_target} = $slice;
}
return $self->{_target};
}
sub raw_hits {
my ($self,@raw_hits) = @_;
if (@raw_hits) {
push (@{$self->{_raw_hits}}, @raw_hits);
}
return @{$self->{_raw_hits}};
}
sub hits {
my ($self,@hits) = @_;
if (@hits) {
push (@{$self->{_hits}}, @hits);
}
return @{$self->{_hits}};
}
sub clean_queries {
my ($self,$sql_q) = @_;
if ($sql_q) {
push (@{$self->{_clean_queries}}, $sql_q);
}
return $self->{_clean_queries};
}
#fetches sequence from appropriate database
sub fetch_sequence{
my ($self, $name, $db) = @_;
my $sa = $db->get_SliceAdaptor;
my $slice = $sa->fetch_by_name($name);
return $slice;
}
1;