ensembl-pipeline
FlyBaseGff
Toolbar
Package variables
No package variables defined.
Included modules
Synopsis
No synopsis!
Description
No description!
Methods
Methods description
Title : create_analysis Usage : $obj->create_analysis ( $analysis_adaptor, $logic_name) Function : 1st creates an analysis-object of the values in the FlyBaseConf-file and stores the analysis by using the submitted AnalysisAdaptor (normally created out of the registry-file) Arguments : AnalysisAdaptor-Object, $type, $logic_name Return-Val : none |
Usage : $obj->db() || $obj->db($database_adaptor) Function : sets/gets the DatabaseAdaptor |
Title : db_entry Usage : Function : for xrefs Arguments : Return-Val : |
Title : get_ids_by_type Usage : $obj->get_ids_by_type('gene') Function : returns an Arrayref to all known IDs of a given type (in the scope of the gff) Arguments : ID Return-Val : Arrayref |
Usage : $obj->gff() || $obj->gff( $gff_filename) Function : sets/gets the filename of the current gff-file |
Title : new Usage : Function : new GFF object Arguments : Return-Val : |
Arg [1] : Bio::EnsEMBL::Gene Function : remove duplicate exons between two transcripts Returntype: Bio::EnsEMBL::Gene |
Usage : $obj->slice || $obj->slice($slice) Function : sets/gets the actual slice on which the features/genes are stored |
Title : store_as_gene_object Usage : store_as_gene_object ('gene', 'mRNA','flybase_gene') Function : looks in gff for entity of type 'gene' which has childs 'mRNA' and 'exon' and stores them as gene of type 'flybase_gene' in the database Decsription : Given a slice, parent type (eg. Gene), child type (eg. mRNA, tRNA, pseudogene) this goes through the gff hash and finds all parents of the given type and makes them into Genes. If they have associates Transcripts and Exons then these are added too. They are all stored in the database. Arguments : Return-Val : |
Title : store_as_simple_feature Usage : $obj->store_as_simple_feature($analysis_adaptor, $type, $logic_name) Function : 1st creates an analysis-object using the submitted analysis-adaptor and stores all features of gff of class $type (indicated by 3rd column in gff) as SimpleFeature of a given Analysis Arguments : AnalysisAdaptor-Object, $type, $logic_name Return-Val : none |
Usage : warn_inconsistency($warning) Function : prints out warning-statement |
Methods code
add_Translation | description | prev | next | Top |
sub add_Translation
{ my ($self, $transcript) = @_;
my @protein_ids;
foreach my $child_id (keys %{$self->get_childrenID_from_parentID($transcript->stable_id)}) {
if (${$self->get_features_from_ID($child_id)}{'type'} eq 'protein') {
push @protein_ids, $child_id;
print " Found protein for transcript ".$transcript->stable_id." (strand ".$transcript->strand.") : ";
}
}
if (scalar(@protein_ids) > 1) {
print "WARNING: More than one protein for transcript ".$transcript->stable_id." @protein_ids";
}
if (defined $protein_ids[0]) {
print $protein_ids[0]."\n";
my $cds_start = ${$self->get_features_from_ID($protein_ids[0])}{'start'};
my $cds_end = ${$self->get_features_from_ID($protein_ids[0])}{'end'};
my $cds_strand = ${$self->get_features_from_ID($protein_ids[0])}{'strand'};
print "Adding tln with protein ".$protein_ids[0]." having start $cds_start end $cds_end strand $cds_strand\n";
$transcript = add_TranslationObject($protein_ids[0], $transcript, $cds_start,$cds_end,$cds_strand);
$transcript = $self->setExonPhases($transcript);
} else {
print "No protein for transcript ".$transcript->stable_id." and phases not set\n";
}
return $transcript; } |
sub create_analysis
{ my ($self,$logic_name, $db_file) = @_;
my $ana_obj =Bio::EnsEMBL::Analysis->new(
-logic_name =>$logic_name,
-db => $ANA_DB,
-db_version => $ANA_DB_VERSION,
-db_file => $ANA_DB_FILE,
-program => $ANA_PROGRAM,
-program_version => $ANA_PROGRAM_VERSION,
-program_file => $ANA_PROGRAM_FILE,
-gff_source => $ANA_GFF_SOURCE,
-gff_feature => $ANA_GFF_FEATURE,
-module => $ANA_MODULE,
-module_version => $ANA_MODULE_VERSION,
-parameters => $ANA_PARAMETERS,
-created => $ANA_CREATED
);
if ($db_file) {
$ana_obj->db_file($db_file);
}
return $ana_obj; } |
sub db
{ my ($self,$db) = @_;
$self->{'db'} = $db if $db;
return $self->{'db'}; } |
sub db_entry
{ my ($self, $tr,$db_entry) = @_;
if($db_entry){
$ {$self->{db_entry}} {$tr}=$db_entry;
}
return $ { $self->{db_entry}}{$tr}; } |
sub featurehash
{ my ($self,$hashref) = @_;
$self->{featurehash} = $hashref if $hashref;
return $self->{featurehash};
}
} |
sub get_childrenID_from_parentID
{ my ($self,$id) = @_;
my $children_ref = {};
if (${$self->parent2child}{$id}) {
$children_ref = ${$self->parent2child}{$id};
}
return $children_ref; } |
sub get_dbxrefs
{ my ($self,$id, $xref_type) = @_;
my @dbes;
if (exists ${$self->get_features_from_ID($id)}{'Dbxref'}) {
DBXREF: foreach my $dbx (@{$self->get_features_from_ID($id)->{'Dbxref'}}) {
my ($db_name, @primary_id_string) = split /\:/,$dbx;
my $primary_id = join(':',@primary_id_string);
if ($db_name eq 'if' && $primary_id !~ m/www/) { # Interactive Fly = IF $primary_id = "http://www.sdbonline.org/fly".$primary_id; }
if ($db_name && $primary_id) {
print "got xref $db_name with pid $primary_id\n";
my $db_entry = Bio::EnsEMBL::DBEntry->new(
-DBNAME => $db_name,
-RELEASE => 1,
-PRIMARY_ID => $primary_id,
-DISPLAY_ID => $primary_id
);
push @dbes, $db_entry;
} else {
print "No xref db_name or primary_id for id $id ";
} } } else {
warning("No xrefs for id $id of type $xref_type\n");
@dbes = ();
}
if (exists ${$self->get_features_from_ID($id)}{'Name'}) {
foreach my $syn (@{${$self->get_features_from_ID($id)}{'Name'}}) {
$dbes[0]->add_synonym($syn);
print "Added synonym name $syn to ".$dbes[0]->primary_id."\n";
}
}
if (exists ${$self->get_features_from_ID($id)}{'Alias'}) {
foreach my $syn (@{${$self->get_features_from_ID($id)}{'Alias'}}) {
$dbes[0]->add_synonym($syn);
print "Added synonym alias $syn to ".$dbes[0]->primary_id."\n";
}
}
return\@ dbes; } |
sub get_features_from_ID
{ my ($self,$id) = @_;
return ${$self->featurehash}{$id};
}
} |
sub get_ids_by_type
{ my ($self,$type) = @_;
my $idref = [];
if (${$self->type2id}{$type}) {
$idref = ${$self->type2id}{$type}
}
return $idref;
}
1; } |
sub gff
{ my ($self,$gff)=@_;
if ($gff) {
$self->{gff}=$gff;
$self->throw("gff not found $gff: $!\n") unless (-e $gff);
}
return $self->{gff}; } |
sub make_Exons
{ my ($self, $transcript, $exon_ids, $time ) = @_;
my @exons;
EXON: foreach my $exon_id (@$exon_ids) {
if (${$self->get_features_from_ID($exon_id)}{'type'} ne 'exon') {
next EXON;
}
my $ex = Bio::EnsEMBL::Exon->new(
-START => ${$self->get_features_from_ID($exon_id)}{'start'},
-END => ${$self->get_features_from_ID($exon_id)}{'end'},
-STRAND => ${$self->get_features_from_ID($exon_id)}{'strand'},
-PHASE => 0,
-END_PHASE => 0,
-SLICE => $self->slice,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
-STABLE_ID => $exon_id,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-VERSION => $ATTRIBUTE_VERSION
);
print " For Transcript ".$transcript->stable_id.", have Exon ".$ex->stable_id." with ".
"start ".$ex->start." end ".$ex->end." strand ".$ex->strand."\n" ;
$transcript->add_Exon($ex);
}
return $transcript; } |
sub make_Transcripts
{ my ($self, $tref, $biotype, $time) = @_;
my @transcript_ids = @{$tref};
my @transcripts;
for my $transcript_id (@transcript_ids) {
my $new_transcript = new Bio::EnsEMBL::Transcript(
-STABLE_ID => $transcript_id,
-VERSION => $ATTRIBUTE_VERSION,
-TYPE => $biotype,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
);
my @exon_ids = keys %{$self->get_childrenID_from_parentID($transcript_id)};
if (scalar(@exon_ids) == 0) {
print "\n x No children for transcript $transcript_id, must be non-coding\n";
} else {
print "\nTranscript $transcript_id has ".@exon_ids." possible exons\n";
my $transcript = make_Exons($self, $new_transcript,\@ exon_ids, $time);
my $transcript_with_tln = add_Translation($self, $transcript);
push @transcripts, $transcript_with_tln;
}
} return\@ transcripts; } |
sub new
{ my ($class,@args) = @_;
my $self={};
bless $self,$class;
my ($db,$gff,$slice) = rearrange(
[qw(
DB
GFF
SLICE
)],@args);
$self->db($db);
$self->gff($gff);
$self->slice($slice);
$self->{_gene}=[];
$self->{exons}={};
$self->{db_entry}={};
$self->parse_gff();
print "\n* * *\ngff $gff parsed\n* * *\n ";
return $self; } |
sub parent2child
{ my ($self,$hashref) = @_;
$self->{parent2child} = $hashref if $hashref;
return $self->{parent2child}; } |
sub parse_gff
{ my ($self) = @_;
my %featurehash;
my %parent2child;
my %m_type2id;
open(GFF,$self->gff)|| die "Could not open ".$self->gff."\n";
LINE: while(<GFF>) {
chomp;
last if /^\#\#FASTA/;
next if /^\#/;
my @line = split/\t/;
my ($seqid,$source,$type,$start,$end,$score,$strand,$phase,$attrib) = @line;
if ($strand eq '-') {
$strand = $line[6] = "-1";
}
if ($strand eq '+') {
$strand = $line[6] = "+1";
}
if ($strand eq '.') {
$strand = $line[6] = "0";
}
if ($strand eq '?') {
$strand = $line[6] = "0";
}
my $unique_id;
my @attributes = split/;/,$attrib if $attrib;
my $seenID;
for (@attributes) { my ($attrib_name, $attrib_values ) = split /=/;
my @values = split /,/,$attrib_values;
if ($attrib_name eq 'ID') {
$unique_id = $values[0];
$seenID = 1;
if (exists($featurehash{$unique_id})) {
warning("Already seen ID $unique_id in line @line");
}
} elsif (!$seenID) {
throw("parsing problem, need ID @line");
}
foreach my $value (@values) {
push @{$featurehash{$unique_id}{$attrib_name}}, $value;
if ($attrib_name eq 'Parent' || $attrib_name eq 'Derives_from') {
my @parents = @values;
foreach my $parent (@parents) {
push @{$parent2child{$parent}{$unique_id}}, ($attrib_name, $type);
}
}
}
} push @{$m_type2id{$type}}, $unique_id;
$featurehash{$unique_id}{'seqid'} = $seqid;
$featurehash{$unique_id}{'source'} = $source;
$featurehash{$unique_id}{'type'} = $type;
$featurehash{$unique_id}{'start'} = $start;
$featurehash{$unique_id}{'end'} = $end;
$featurehash{$unique_id}{'score'} = $score;
$featurehash{$unique_id}{'strand'} = $strand;
$featurehash{$unique_id}{'phase'} = $phase;
if ($featurehash{$unique_id}{'type'} eq 'orthologous_to') {
my $new_unique_id = $unique_id."_".$featurehash{$unique_id}{'to_id'}[0];
if (exists $featurehash{$unique_id}{'to_id'}[0]) {
foreach my $old_attrib_name (keys %{$featurehash{$unique_id}}) {
$featurehash{$new_unique_id}{$old_attrib_name} = $featurehash{$unique_id}{$old_attrib_name};
} delete $featurehash{$unique_id};
for (my $x=0; $x < scalar(@{$m_type2id{$type}}); $x++) {
if (${$m_type2id{$type}}[$x] eq $unique_id) {
${$m_type2id{$type}}[$x] = $new_unique_id;
next;
}
}
foreach my $p (keys %parent2child) {
if (exists $parent2child{$p}{$unique_id}) {
@{$parent2child{$p}{$new_unique_id}} = @{$parent2child{$p}{$unique_id}};
delete $parent2child{$p}{$unique_id};
}
}
} else {
throw("Unable to make ID ".$featurehash{$unique_id}{'ID'}[0]." unique");
} } }
$self->featurehash(\%featurehash);
$self->parent2child(\%parent2child);
$self->type2id(\%m_type2id);
close GFF; } |
sub prune_Exons
{ my ($gene) = @_;
my @unique_Exons;
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon) {
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon) {
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exons;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene; } |
sub rename_exons
{ my ($gene_id, $all_new_transcripts) = @_;
my @transcripts = @$all_new_transcripts;
my %all_processed_exons;
my (%hk_name,%exoncheck);
foreach my $trans ( @transcripts ) {
print "Finding exons for transcript $trans with stable_id ".$trans->stable_id."\n";
foreach my $e ( @{$trans->get_all_Exons()} ) {
if (exists $exoncheck{$e->stable_id}) {
${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
}else{
${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
}
$hk_name{$e->hashkey}= $e->stable_id;
my @line = split /\-/,$e->hashkey;
${$all_processed_exons{"$line[0]-$line[1]-$line[2]"}}{$gene_id}=();
}
}
my @multiple_hashkey;
for (keys %exoncheck) {
if (scalar (keys %{$exoncheck{$_}} ) > 1) {
for my $hashkey (keys %{$exoncheck{$_}}) {
push @multiple_hashkey, $hashkey;
}
}
}
my %same_bounds;
for my $mh(@multiple_hashkey) {
my @line = split /\-/,$mh;
${$same_bounds{"$line[0]-$line[1]-$line[2]"}}{$mh}=();
}
for(keys %same_bounds) {
my %exon_hk = %{$same_bounds{$_}};
my $cnt = "A";
for my $ehk(keys %exon_hk) {
$hk_name{$ehk} = $hk_name{$ehk}."-$cnt";
$cnt++;
}
}
for my $mh(@multiple_hashkey) {
foreach my $trans ( @transcripts ) {
foreach my $e ( @{$trans->get_all_Exons} ) {
if ($e->hashkey eq $mh) {
$e->stable_id($hk_name{$mh});
}
}
}
}
foreach my $trans ( @transcripts ) {
foreach my $e ( @{$trans->get_all_Exons} ) {
my @line = split /\-/,$e->hashkey; my %genes_of_exon = %{$all_processed_exons{"$line[0]-$line[1]-$line[2]"} };
for my $gid (keys %genes_of_exon) {
if ($gid ne $gene_id) {
my $new_name = $e->stable_id . "--" . $gene_id ;
$e->stable_id ($new_name);
}
}
}
}
return\@ transcripts; } |
sub slice
{ my ($self,$slice) = @_;
$self->{'slice'} = $slice if $slice;
return $self->{'slice'}; } |
sub store_as_gene_object
{ my ($self, $parent_type, $child_type, $ensembl_gene_type) = @_;
my %all_processed_exons;
print "Parent feature_type=$parent_type and Child_feature_type=$child_type\n";
my @gene_ids = @{$self->get_ids_by_type($parent_type)};
if (!@gene_ids || scalar(@gene_ids) < 1) {
warning("No parent type '$parent_type' found in gff file ".$self->gff()." (child type $child_type)\n");
return;
} else {
print "Have ".scalar(@gene_ids)." genes of parent type $parent_type\n";
}
my $time = time();
GENE: foreach my $gene_id ( @gene_ids ) {
print "Looking at gene $gene_id\n";
my $all_new_transcripts;
my @transcript_ids = keys %{$self->get_childrenID_from_parentID($gene_id)};
my @keep;
if (scalar(@transcript_ids) == 0) {
print " x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
next GENE;
} else {
foreach my $t (@transcript_ids) {
if (${$self->get_childrenID_from_parentID($gene_id)}{$t}[1] eq $child_type) {
push @keep, $t;
}
}
if (scalar(@keep) == 0) {
print " x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
next GENE;
} else {
print "\n* * *\n>> Doing gene $gene_id\n * * *\n";
$all_new_transcripts = make_Transcripts($self,\@ keep, $child_type, $time);
}
}
print "Renaming exons\n";
my ($rt) = rename_exons($gene_id, $all_new_transcripts);
my @renamed_transcripts = @{$rt};
my $gene = Bio::EnsEMBL::Gene->new(
-START => ${$self->get_features_from_ID($gene_id)}{'start'},
-END => ${$self->get_features_from_ID($gene_id)}{'end'},
-STRAND => ${$self->get_features_from_ID($gene_id)}{'strand'},
-SLICE => $self->slice,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
-STABLE_ID => $gene_id,
-VERSION => $ATTRIBUTE_VERSION,
-TYPE => $ensembl_gene_type,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-SOURCE => ${$self->get_features_from_ID($gene_id)}{'source'},
);
foreach my $tr (@renamed_transcripts) {
$gene->add_Transcript($tr);
}
print "\nNow trying to store gene $gene_id....." ;
my $gene_DB_id = $self->db->get_GeneAdaptor->store($gene);
print "STORED gene $gene_DB_id\n\n " ;
} }
} |
sub store_as_simple_feature
{ my ($self, $sf_adaptor, $type) = @_;
if ($self->get_ids_by_type($type)) {
my @ids_of_given_type = @{$self->get_ids_by_type($type)};
print "\n* * *\n>> Doing type $type\n* * *\n";
print "Found ".scalar(@ids_of_given_type)." ids of type $type\n";
my %unique_ids;
for (@ids_of_given_type){
if (!exists $unique_ids{$_}) {
$unique_ids{$_} = 1;
} else {
warning("Non-unique entry of type $type : ID ".$_);
}
}
print "Found ".scalar(keys %unique_ids)." UNIQUE ids of type $type\n";
my @simple_features;
my %analyses;
SIMPLE: foreach my $unique_id (keys %unique_ids) { if (${$self->get_features_from_ID($unique_id)}{'type'} eq $type) {
my $analysis_logic = ${$self->get_features_from_ID($unique_id)}{'type'};
if (length($analysis_logic) > 40) {
my $old = $analysis_logic;
$analysis_logic = substr($analysis_logic,0,40);
warning("Shortened analysis logic_name from $old to $analysis_logic");
}
if (!exists $analyses{$analysis_logic}) {
$analyses{$analysis_logic} = $self->create_analysis($analysis_logic);
}
my $score = ${$self->get_features_from_ID($unique_id)}{'score'};
my $display = ${$self->get_features_from_ID($unique_id)}{'ID'}[0];
if (${$self->get_features_from_ID($unique_id)}{'Name'}[0]) {
$display = ${$self->get_features_from_ID($unique_id)}{'Name'}[0];
}
my $sf = Bio::EnsEMBL::SimpleFeature->new(
-start => ${$self->get_features_from_ID($unique_id)}{'start'},
-end => ${$self->get_features_from_ID($unique_id)}{'end'},
-strand => ${$self->get_features_from_ID($unique_id)}{'strand'},
-slice => $self->slice,
-analysis => $analyses{$analysis_logic},
-score => ($score eq '.' ? '0.0' : $score),
-display_label => $display,
-dbID => undef,
-adaptor => undef
);
push @simple_features, $sf;
} else {
next SIMPLE;
}
} if (scalar(@simple_features > 0)) {
$sf_adaptor->store(@simple_features);
return scalar @simple_features;
} else {
return 0;
}
} else {
warning("Can't find any simple features of type $type");
return 0;
}
}
} |
sub type2id
{ my ($self,$type2id_hash) = @_;
$self->{_type2id} = $type2id_hash if $type2id_hash;
return $self->{_type2id}; } |
sub warn_inconsistency
{ my $error = shift;
print STDERR "\tWARNING: UNEXPECTED INCONSISTENCY IN GFF-FILE !!\t\t$error";
return; } |
General documentation
get_all_attributes_by_id | Top |
Title : get_all_attributes_by_id
Usage : $obj->get_all_attributes_by_id('ID')
Function : access all stored attributes of a feature-id
Arguments : ID
Return-Val : arrayref to arrayrefs of attributes of a given ID
storing attributes, genes, SimpleFeatures | Top |
alternative getter/setters | Top |
get_dbxrefs_($id, $xref_type ) | Top |
Usage : $obj->get_field_attributes($id, $descriptor )
Function : returns the Dbxref-Identifier of a given ID like 'FBgn0033692' for type 'gene'
Returnvalue : Arrayref. to all DBxrefs of a given type
getting attributes by their type ('gene') | Top |