Bio::EnsEMBL::Analysis::Runnable
HaplotypeProjection
Toolbar
Summary
Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::HaplotypeProjection qw ( )
Inherit
Synopsis
# This is the main analysis database
my $genebuilder = new Bio::EnsEMBL::Analysis::Runnable::HaplotypeProjection
(
'-hap_slice' => $self->query,
'-slice' => $self->target,
'-input_id' => $self->input_id,
);
Description
This module aligned the genomic sequence of a haplotype region and the corresponding
reference chromosome region and projects the gene annotations
Methods
check_transcript_in_discarded_db | No description | Code |
check_translation | No description | Code |
clean_queries | No description | Code |
clean_tables | No description | Code |
compare_seqs | No description | Code |
discarded_db | No description | Code |
ensembl_db | No description | Code |
features | No description | Code |
fetch_sequence | No description | Code |
fix_coding_phases | No description | Code |
gene_types | Description | Code |
get_coding_part_of_transcript | No description | Code |
get_complete_transcript | No description | Code |
hits | No description | Code |
input_id | Description | Code |
log_compare_transcripts | No description | Code |
log_summarise_projection_by_exon | No description | Code |
new | No description | Code |
output_db | No description | Code |
project_genes | Description | Code |
project_transcript_the_hard_way | No description | Code |
query | No description | Code |
target | No description | Code |
transcript_is_missed | No description | Code |
Methods description
Description: get/set for the type(s) of genes (usually TGE_gw, similarity_genewise and combined_e2g genes) to be used in the genebuilder they get set in new() Does not include the ab inition predictions |
Function: get/set for input id Returns : string Args : string (it expects a string of the format chr_name.start_coord-end_coord |
Description: retrieves ensembl and havana gene annotations with supporting evidence. ReturnType : none, but $self->combined_Transcripts is filled Args : none |
Methods code
check_transcript_in_discarded_db | description | prev | next | Top |
sub check_transcript_in_discarded_db
{ my ($self, $tran) = @_;
my @exons = @{$tran->get_all_Exons};
my $discardedslice = $self->discarded_db->get_SliceAdaptor->fetch_by_region('toplevel',$tran->slice->seq_region_name,$tran->seq_region_start,$tran->seq_region_end);
DGENE:
foreach my $dgene (@{$discardedslice->get_all_Genes}){
DTRANS:foreach my $dtran (@{$dgene->get_all_Transcripts}){
my @dexons = @{$dtran->get_all_Exons};
if(scalar(@exons) == scalar(@dexons)){
for (my $i=0; $i < scalar(@exons); $i++){
if ($exons[$i]->seq_region_start != $dexons[$i]->seq_region_start ||
$exons[$i]->strand != $dexons[$i]->strand ||
$exons[$i]->seq_region_end != $dexons[$i]->seq_region_end){
next DTRANS;
}
}
print "transcript found in discarded db\n";
return 0;
}else{
next DGENE;
}
}
}
return 1;
}
} |
sub check_translation
{ my ($self, $trans) = @_;
my $tseq = $trans->translate();
if ( $tseq->seq =~ /\*/ ) {
$trans->{'translation'} = undef;
$trans->biotype("processed_transcript");
} } |
sub clean_queries
{ my ($self,$sql_q) = @_;
if ($sql_q) {
push (@{$self->{_clean_queries}}, $sql_q);
}
return $self->{_clean_queries};
}
} |
sub clean_tables
{
my ($self) = @_;
my $target_slice = $self->target;
my $query_slice = $self->query;
print "Cleaning meta table\n";
my $meta_q = "DELETE from meta where meta_key=\"assembly.mapping\" and meta_value like\" %".$query_slice->seq_region_name."\%\"";
print "CHECK $meta_q\n";
my $clean_meta = $self->ensembl_db->dbc->prepare($meta_q);
$clean_meta->execute();
$clean_meta->finish;
print "Cleaning meta_coord table\n";
my $meta_coord = $self->ensembl_db->dbc->prepare("DELETE mt.* from meta_coord mt, coord_system cs where mt.coord_system_id = cs.coord_system_id and cs.version = ?");
$meta_coord->execute($query_slice->seq_region_name);
$meta_coord->finish;
print "Cleaning coord_system table\n";
my $clean_coord_system = $self->ensembl_db->dbc->prepare("DELETE from coord_system where version = ?");
$clean_coord_system->execute($query_slice->seq_region_name);
$clean_coord_system->finish;
my $sr_id = $self->ensembl_db->dbc->prepare("SELECT seq_region_id from seq_region where name =?");
$sr_id->execute($query_slice->seq_region_name);
my @hap_sr = $sr_id->fetchrow_array;
$sr_id->execute($target_slice->seq_region_name);
my @chr_sr = $sr_id->fetchrow_array;
$sr_id->finish;
print "Cleaning assembly table\n";
my $clean_assembly = $self->ensembl_db->dbc->prepare("DELETE from assembly where asm_seq_region_id = ? and cmp_seq_region_id = ?");
$clean_assembly->execute($hap_sr[0],$chr_sr[0]);
$clean_assembly->finish;
my $old_sr = $self->ensembl_db->dbc->prepare("SELECT coord_system_id from seq_region where name =?");
$old_sr->execute($target_slice->seq_region_name);
my @old_coord = $old_sr->fetchrow_array;
$old_sr->finish;
print "Cleaning seq_region table\n";
my $clean_seq_req = $self->ensembl_db->dbc->prepare("UPDATE seq_region set coord_system_id = ? where name = ?");
$clean_seq_req->execute($old_coord[0],$query_slice->seq_region_name);
$clean_seq_req->finish;
print "Table clean up done\n"; } |
sub compare_seqs
{ my ($self, $seq1, $seq2) = @_;
my @seq1 = split(//, $seq1);
my @seq2 = split(//, $seq2);
my $total = 0;
my $diffs = 0;
for(my $i=0; $i < @seq1; $i++) {
if ($seq1[$i] ne $seq2[$i]) {
$diffs++;
}
$total++;
}
return $diffs;
}
} |
sub discarded_db
{ my ($self, $discarded_db) = @_;
if ( $discarded_db ){
$self->{_discarded_db} = $discarded_db;;
}
return $self->{_discarded_db};
}
} |
sub ensembl_db
{ my ($self,$ensembl_db) = @_;
if ( $ensembl_db ){
$self->{_ensembl_db} = $ensembl_db;
}
return $self->{_ensembl_db}; } |
sub features
{ my ($self,@features) = @_;
if (!defined($self->{_feature})) {
$self->{_feature} = [];
}
if ( scalar @features ) {
push(@{$self->{_feature}},@features);
}
return @{$self->{_feature}};
}
} |
sub fetch_sequence
{ my ($self, $name, $db) = @_;
my $sa = $db->get_SliceAdaptor;
my $slice = $sa->fetch_by_name($name);
return $slice;
}
1; } |
sub fix_coding_phases
{ my ($self, $trans) = @_;
if (defined($trans->translation)) {
my @exons = @{$trans->get_all_translateable_Exons};
my $prev_exon = $exons[0];
my $start_phase = $exons[0]->phase;
if ($start_phase == -1) {
$start_phase = 0;
}
my @calc_phases;
my $phase = $start_phase;
for (my $i=0; $i<scalar(@exons);$i++) {
push @calc_phases, $phase;
$phase = (($exons[$i]->length + $phase) % 3);
}
for (my $i=0; $i<scalar(@exons);$i++) {
my $exon=$exons[$i];
if ($exon->phase != $calc_phases[$i] && $i != 0) {
print "Had to fix coding exon phase in exon\n ";
$exon->phase($calc_phases[$i]);
if ($i == $#exons) {
$trans->translation->end_Exon->phase($calc_phases[$i]);
}
}
my $calc_endphase = (($exon->length + $calc_phases[$i]) % 3);
if ($exon->end_phase != $calc_endphase && $i != $#exons) {
print "Had to fix coding exon end_phase in exon\n ";
$exon->end_phase($calc_endphase);
if ($i == 0) {
$trans->translation->start_Exon->end_phase($calc_endphase);
}
}
if ($exon->phase != $prev_exon->end_phase && $i != 0) {
die "EndPhase/Phase mismatch\n "; }
$prev_exon = $exon;
}
} } |
sub gene_types
{ my ($self,$type) = @_;
if (defined($type)) {
push(@{$self->{_gene_types}},$type);
}
return @{$self->{_gene_types}}; } |
sub get_coding_part_of_transcript
{ my ($self, $t) = @_;
my @cds = @{$t->get_all_translateable_Exons};
my $cds_t = Bio::EnsEMBL::Transcript->new();
for(my $i=0; $i < @cds; $i++) {
$cds[$i]->get_all_supporting_features();
$cds_t->add_Exon($cds[$i]);
}
$cds_t->analysis($t->analysis);
$cds_t->biotype($t->biotype);
$cds_t->slice($t->slice);
$cds_t->status($t->status);
my $tr = Bio::EnsEMBL::Translation->new;
$tr->start_Exon($cds[0]);
$tr->start(1);
$tr->end_Exon($cds[-1]);
$tr->end($tr->end_Exon->length);
$cds_t->translation($tr);
return $cds_t; } |
sub get_complete_transcript
{ my ($self, $t) = @_;
my @exons = @{$t->get_all_Exons};
my $full_t = Bio::EnsEMBL::Transcript->new();
$full_t->add_supporting_features(@{$t->get_all_supporting_features});
for(my $i=0; $i < @exons; $i++) {
$exons[$i]->get_all_supporting_features();
$full_t->add_Exon($exons[$i]);
}
$full_t->analysis($t->analysis);
$full_t->biotype($t->biotype);
$full_t->slice($t->slice);
$full_t->status($t->status);
if($t->translation){
my @tr_exons = @{$t->get_all_translateable_Exons};
my $tr = Bio::EnsEMBL::Translation->new;
$tr->start($t->translation->start);
$tr->end($t->translation->end);
for(my $i=0; $i < @exons; $i++) {
if ($tr_exons[0]->dbID eq $exons[$i]->dbID){
$tr->start_Exon($exons[$i]);
}
if ($tr_exons[-1]->dbID eq $exons[$i]->dbID){
$tr->end_Exon($exons[$i]);
}
}
$full_t->translation($tr);
}
for(my $i=0; $i < @exons; $i++) {
$exons[$i]->{'adaptor'} = '';
$exons[$i]->{'dbID'} = '';
}
my $attribute = Bio::EnsEMBL::Attribute->new
(-CODE => 'SourceTran',
-NAME => 'source transcript',
-DESCRIPTION => 'source transcript',
-VALUE => $t->stable_id);
$full_t->add_Attributes($attribute);
return $full_t; } |
sub hits
{ my ($self,@hits) = @_;
if (@hits) {
push (@{$self->{_hits}}, @hits);
}
return @{$self->{_hits}}; } |
sub input_id
{ my ($self,$id) = @_;
if (defined($id)) {
$self->{_input_id} = $id;
}
return $self->{_input_id};
}
} |
sub log_compare_transcripts
{ my ($self, $old, $new) = @_;
my $cdnaseq_old = $old->spliced_seq;
my $cdnaseq_new = $new->spliced_seq;
my @exons_old = @{$old->get_all_Exons};
my @exons_new = @{$new->get_all_Exons};
my ($pepseq_old, $pepseq_new);
if ($old->translation) {
$pepseq_old = $old->translate;
}
if ($new->translation) {
$pepseq_new = $new->translate;
}
if ($cdnaseq_old eq $cdnaseq_new) {
print " transfer to identical DNA\n";
} else {
my $cdna_diffs = $self->compare_seqs($cdnaseq_old, $cdnaseq_new);
print " transfer to $cdna_diffs cDNA diffs";
if (defined $pepseq_old and defined $pepseq_new) {
my $prot_diffs = $self->compare_seqs($pepseq_old->seq, $pepseq_new->seq);
print " and $prot_diffs protein diffs";
my $stop_count = $pepseq_new->seq =~ tr/\*/\*/;
if ($stop_count) {
print " ($stop_count STOPS)";
}
}
print "\n";
} } |
sub log_summarise_projection_by_exon
{ my ($self, $t, $hapname) = @_;
my $exons_total = 0;
my $exons_transferred = 0;
my $exons_missed = 0;
my $exons_part_missed = 0;
my $exons_over_indel = 0;
foreach my $e (@{$t->get_all_Exons}) {
$exons_total++;
my $te = $e->transform("chromosome",
$hapname);
if (defined $te) {
$exons_transferred++;
} else {
my @bits = @{$e->project("chromosome",
$hapname)};
if (not @bits) {
$exons_missed++;
} elsif ($bits[0]->from_start != 1 or
$bits[-1]->from_end != $e->length) {
$exons_part_missed++;
} else {
$exons_over_indel++;
}
}
}
printf(" %d exons; %d transferred, %d missed, %d part missed, %d over indel\n",
$exons_total,
$exons_transferred,
$exons_missed,
$exons_part_missed,
$exons_over_indel); } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($hap_slice,$slice,$input_id) = $self->_rearrange([qw(HAP_SLICE SLICE INPUT_ID)],
@args);
$self->throw("Must input a reference slice and a haplotype slice to HaplotypeProjection") unless (defined($slice) && defined($hap_slice));
$self->{_final_genes} = [];
$self->{_gene_types} = [];
$self->query($hap_slice);
$self->target($slice);
$self->input_id($input_id);
return $self;
}
} |
sub output_db
{ my ($self,$output_db) = @_;
if ( $output_db ){
$self->{_output_db} = $output_db;
}
return $self->{_output_db}; } |
sub project_genes
{ my($self) = @_;
my @projected_genes;
my $target_slice = $self->target;
my $query_slice = $self->query;
my $full_query_slice = $self->ensembl_db->get_SliceAdaptor->fetch_by_region($query_slice->coord_system->name,
$query_slice->seq_region_name,
undef,
undef,
undef,
$query_slice->seq_region_name);
my $tl_slice = $self->ensembl_db->get_SliceAdaptor->fetch_by_region($target_slice->coord_system->name,
$target_slice->seq_region_name);
print "About to start fetching genes in reference sequence\n";
my @genes = @{$target_slice->get_all_Genes(undef,undef,1)};
@genes = map { $_->transfer($tl_slice) } @genes;
@genes = sort { $a->start <=> $b->start } @genes;
foreach my $g (@genes) {
foreach my $t (@{$g->get_all_Transcripts}) {
$t->translation;
foreach my $e (@{$t->get_all_Exons}) {
; }
}
}
my $projected = 0;
my $not_projected = 0;
my $query_version = $full_query_slice->coord_system->version;
my $query_name = $full_query_slice->seq_region_name;
print "Processing $query_name...\n";
foreach my $g (@genes) {
my $tg = Bio::EnsEMBL::Gene->new;
$tg->analysis($g->analysis);
$tg->biotype($g->biotype);
$tg->status($g->status);
printf("Transferring gene %s %s %d %d\n",
$g->stable_id,
$g->biotype,
$g->start,
$g->end);
my @proj_trans;
my $before = scalar(@{$g->get_all_Transcripts});
foreach my $t (@{$g->get_all_Transcripts}) {
my $complete_t = $self->get_complete_transcript($t);
my $tt = $complete_t->transform("chromosome", $query_name);
if (defined $tt) {
push @proj_trans, $tt;
printf(" %s %s %d %d transferred successfully:\n",
$t->stable_id,
$t->biotype,
$t->start,
$t->end);
} else {
if ($self->transcript_is_missed($t, $query_name)) {
} else {
if ($complete_t->translation) {
my $cds_t = $self->get_coding_part_of_transcript($complete_t);
my $tt = $cds_t->transform("chromosome", $query_name);
if (defined $tt) {
push @proj_trans, $tt;
} else {
my $new_t = $self->project_transcript_the_hard_way($complete_t, $query_name, $full_query_slice);
if ($new_t){
push @proj_trans, $new_t;
}
}
} else {
my $new_t = $self->project_transcript_the_hard_way($complete_t, $query_name, $full_query_slice);
if ($new_t){
push @proj_trans, $new_t;
}
}
}
}
}
if (@proj_trans) {
foreach my $proj_t (@proj_trans){
if ($proj_t->translation){
$self->check_translation($proj_t);
}
}
map { $tg->add_Transcript($_) } @proj_trans;
if($before > scalar(@{$tg->get_all_Transcripts})){
print "MISSING TRANSCRIPTS\n";
}
push(@projected_genes, $tg);
}
}
print "Gene projection Finished\n";
return @projected_genes; } |
sub project_transcript_the_hard_way
{ my ($self, $t, $hapname, $hap_slice) = @_;
my $new_t = Bio::EnsEMBL::Transcript->new();
$new_t->analysis($t->analysis);
$new_t->biotype($t->biotype);
$new_t->status($t->status);
my @new_e;
my @exons;
if($t->translation){
@exons = @{$t->get_all_translateable_Exons};
}else{
@exons = @{$t->get_all_Exons};
}
foreach my $e (@exons) {
$e->get_all_supporting_features();
my $te = $e->transform("chromosome", $hapname);
if (defined $te) {
push @new_e, $te;
} else {
my @bits = @{$e->project("chromosome",
$hapname)};
if (@bits) {
my ($ex_st, $ex_en, $strand);
foreach my $bit (@bits) {
if (not defined $ex_st or $bit->to_Slice->start < $ex_st) {
$ex_st = $bit->to_Slice->start;
}
if (not defined $ex_en or $bit->to_Slice->end > $ex_en) {
$ex_en = $bit->to_Slice->end;
}
if (not defined $strand) {
$strand = $bit->to_Slice->strand;
}
}
if (defined $ex_st and defined $ex_en) {
my $new_e = Bio::EnsEMBL::Exon->new;
$new_e->start($ex_st);
$new_e->end($ex_en);
$new_e->strand($strand);
$new_e->phase(-1);
$new_e->end_phase(-1);
$new_e->slice($hap_slice);
push @new_e, $new_e;
}
}
}
}
if($t->translation && scalar(@new_e) > 0){
my $tr = Bio::EnsEMBL::Translation->new;
$tr->start(1);
$tr->end($new_e[-1]->seq->length);
$tr->start_Exon($new_e[0]);
$tr->end_Exon($new_e[-1]);
$new_t->translation($tr);
}
if (scalar(@new_e) > 0){
map { $new_t->add_Exon($_) } @new_e;
if($new_t->translation){
$self->fix_coding_phases($new_t);
}
return $new_t;
}else{
return 0;
} } |
sub query
{ my ($self,$slice) = @_;
if (defined($slice)) {
$self->{_query} = $slice;
}
return $self->{_query}; } |
sub target
{ my ($self,$slice) = @_;
if (defined($slice)) {
$self->{_target} = $slice;
}
return $self->{_target}; } |
transcript_is_missed | description | prev | next | Top |
sub transcript_is_missed
{ my ($self, $t, $hapname) = @_;
my $found_part = 0;
foreach my $e (@{$t->get_all_Exons}) {
my @bits = @{$e->project("chromosome", $hapname)};
if (@bits) {
$found_part = 1;
last;
}
}
return not $found_part; } |
General documentation
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _