# Raw content of XrefParser::Flybase_dmel_GFFv3_Parser
#
# Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
#
# Files actually contain both types of xref, distinguished by ID line;
#
# NOTE(review): the two statements above talk about UniProt files, while the
# code in this module parses a FlyBase GFF file; they look like a leftover from
# another parser - TODO confirm and remove.
#
# This module reads the fly GFF text file and makes xrefs from the information in the file.
# First of all, it knows what all the gene, transcript and translation types are,
# as found in column 3 of the GFF file:
# Gene        = gene
# Transcript  = mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA
# Translation = protein (could include CDS here but haven't ?)
#
# ID=FBgn                                  => flybase_gene_id
# ID=FBtr                                  => flybase_transcript_id
# ID=FBpp                                  => flybase_polypeptide_id
# Name=CG0123                              => FlyBaseName_gene
# Name=CG0123-RA                           => FlyBaseName_transcript
# Name=CG0123-PA                           => FlyBaseName_translations
# Dbxref=FlyBase:FBan                      => flybase_annotation_id
# Dbxref=FlyBase_Annotation_IDs:CG0123     => gadfly_gene_cgid
# Dbxref=FlyBase_Annotation_IDs:CG0123-RA  => gadfly_transcript_cgid
# Dbxref=FlyBase_Annotation_IDs:CG0123-PA  => gadfly_translation_cgid
# Alias=                                   => flybase_synonym
#
# For each line in the GFF file that describes one of the genes, transcripts and
# translations listed above, the following happens:
# The unique_id is read in from ID= (FBgn, FBtr, FBpp). This is the direct xref for all xrefs of this entry.
# An xref is made for the entry, using the ID as the xref's accession. Synonyms from Alias= are added to this xref.
# The Name (Name=) is read in and added as an xref. Synonyms from Alias= are added to this xref.
# All entries from Dbxref= are added in as xrefs for the entry; they have no synonyms.
#
# Example lines:
#2L gene [...] ID=CG11023;Dbxref=FlyBase:FBan0011023,FlyBase:FBgn0031208;gbunit=AE003590;synonym=CG11023
#2L mRNA [...] ID=CG11023-RA;Dbxref=FlyBase:FBtr008,FlyBase:FBgn003;dbxref_2nd=Gadfly:CG11023-RA;synonym=CG23-RA
#3R FlyBase gene 8084471 8128509 . + .
#ID=FBgn0003651;Name=svp;Alias=FBgn0011337,FBgn0011492,FBgn0011510,FBgn0038010,FBgn0063263;Ontology_term=SO:0000010,SO:0000087,GO:0004872,GO:0007270,GO:0042331,GO:0005515,GO:0007419,GO:0007503,GO:0045449,GO:0004879,GO:0003700,GO:0005634,GO:0007465,GO:0007462,GO:0007464,GO:0007510,GO:0005737,GO:0007507,GO:0007417,GO:0001700,GO:0006357,GO:0007165,GO:0043565,GO:0003707,GO:0008270,GO:0048749,GO:0001752;Dbxref=FlyBase:FBan0011502,FlyBase_Annotation_IDs:CG11502,INTERPRO:IPR013088,GB:AC007724,GB:AE003695,GB_protein:AAF54773,GB_protein:AAN13541,GB_protein:AAF54774,GB:AI108883,GB:AI402121,GB:AY075272,GB_protein:AAL68139,GB:AY119490,GB_protein:AAM50144,GB:AY129452,GB_protein:AAM76194,GB:BG633933,GB:BI167911,GB:CZ468719,GB:CZ472606,GB:CZ475640,GB:CZ475641,GB:CZ477001,GB:CZ482253,GB:CZ485541,GB:CZ485543,GB:G00472,GB:M28863,GB_protein:AAA62770,GB:M28864,GB_protein:AAA03014,UniProt/Swiss-Prot:P16375,UniProt/Swiss-Prot:P16376,UniProt/TrEMBL:Q8MRP3,INTERPRO:IPR000536,INTERPRO:IPR001628,INTERPRO:IPR001723,INTERPRO:IPR003068,INTERPRO:IPR008946,INTERPRO:IPR013629,dedb:9161,flygrid:66603,hybrigenics:521960,if:/newgene/sevenup.htm,orthologs:ensAG:ENSANGG00000002454,orthologs:ensAM:ENSAPMG00000000116,orthologs:ensCF:ENSCAFG00000008076,orthologs:ensDM:CG12744,orthologs:ensDR:ENSDARG00000017168,orthologs:ensFR:SINFRUG00000127451,orthologs:ensGG:ENSGALG00000007000,orthologs:ensHS:ENSG00000185551,orthologs:ensMM:ENSMUSG00000030551,orthologs:ensPT:ENSPTRG00000007484,orthologs:ensRN:ENSRNOG00000010308,orthologs:ensTN:GSTENG00006911001,orthologs:modCB:WBGene00030075;cyto_range=87B4-87B5;gbunit=AE014297;

package XrefParser::Flybase_dmel_GFFv3_Parser;

use strict;
use POSIX qw(strftime);
use File::Basename;

use Bio::EnsEMBL::Utils::Exception;

use base qw( XrefParser::BaseParser );

# Cache of source name => source id, filled lazily by get_source().
my %cache_source =();
# Verbosity flag; assigned from the arguments of run().
my $verbose;

# --------------------------------------------------------------------------------
# Parse command line and run if being run directly
# NOTE(review): run() reads its arguments as (source_id, species_id, files,
# release_file, verbose), which does not match the call below - confirm whether
# running this file directly is still supported.

if (!defined(caller())) {

  my $argument_count = scalar(@ARGV);
  if ($argument_count != 1) {
    print "\nUsage: Flybase_dmel_GFFv3_Parser.pm file.gff\n";
    print $argument_count;
    exit(1);
  }

  run($ARGV[0], -1);

}

# --------------------------------------------------------------------------------

# Constructor.
# Builds the object through the parent class constructor (all arguments are
# passed on unchanged) and then stores the hard-coded configuration of this
# parser through its accessors:
#   - the GFF feature types (column 3) that are processed,
#   - the attribute tags looked for in column 9,
#   - the source name used for every kind of xref,
#   - which GFF types count as gene / transcript / translation.
# Finally the xref, direct-xref and synonym stores are initialised empty.
# Returns the new object.
sub new {
  my $class = shift;
  my $self  = $class->SUPER::new(@_);

  # Each entry is [ accessor name, value ]; the entries are applied in the
  # order in which they are listed.
  my @settings = (
    [ external_source_db_name => 'flybase_gff' ],

    # A previous list was: CDS exon gene mRNA ncRNA pseudogene rRNA snRNA snoRNA tRNA
    # This list may need to change between releases so check that it's updated.
    [ gff_object_types => [ qw( gene mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA) ] ],

    #
    # hard-coded field separators out of gff
    #
    [ gff_name     => "Name=" ],
    [ gff_ontology => "Ontology_term=" ],
    [ gff_synonym  => "Alias=" ],
    [ gff_dbxref   => "Dbxref=" ],

    #
    # hard-coded source-names for different objects out of ./sql/populate_metadata.sql
    #
    [ source_name_synonym     => 'flybase_synonym' ],          # source for any Alias
    [ source_name_name_prefix => 'FlyBaseName_' ],             # source for any Name

    [ source_name_fbgn => 'flybase_gene_id' ],                 # source-name for ID=FBgn
    [ source_name_fbtr => 'flybase_transcript_id' ],           # source-name for ID=FBtr
    [ source_name_fbpp => 'flybase_polypeptide_id' ],          # source-name for ID=FBpp
    [ source_name_fban => 'flybase_annotation_id' ],           # source-name for ID=FBan

    # For Dbxref=FlyBase_Annotation_IDs
    [ source_name_gadfly_gene        => 'gadfly_gene_cgid' ],
    [ source_name_gadfly_transcript  => 'gadfly_transcript_cgid' ],
    [ source_name_gadfly_translation => 'gadfly_translation_cgid' ],

    [ source_name_affymetrix     => 'AFFY_DrosGenome1' ],      # For Dbxref=Affymetrix
    [ source_name_dgrc1          => 'DGRC-1' ],                # For Dbxref=DGRC-1
    [ source_name_dgrc2          => 'DGRC-2' ],                # For Dbxref=DGRC-2
    [ source_name_drsc           => 'DRSC' ],                  # For Dbxref=DRSC
    [ source_name_epd            => 'EPD' ],                   # For Dbxref=EPD
    [ source_name_flyreg         => 'FlyReg' ],                # For Dbxref=FlyReg
    [ source_name_gb             => 'EMBL' ],                  # For Dbxref=GB
    [ source_name_gbprotein      => 'protein_id' ],            # For Dbxref=GB_protein
    [ source_name_gcr            => 'GPCR' ],                  # For Dbxref=GCR
    [ source_name_gi             => 'GI' ],                    # For Dbxref=GI
    [ source_name_go             => 'GO' ],                    # For Dbxref=GO
    [ source_name_genomeRNAi     => 'GenomeRNAi' ],            # For Dbxref=GenomeRNAi
    [ source_name_interpro       => 'Interpro' ],              # For Dbxref=INTERPRO
    [ source_name_merops         => 'MEROPS' ],                # For Dbxref=MEROPS
    [ source_name_miRBase        => 'miRBase' ],               # For Dbxref=miRBase
    [ source_name_mitodrome      => 'MitoDrome' ],             # For Dbxref=MitoDrome
    [ source_name_nrl3d          => 'PDB' ],                   # For Dbxref=NRL_3D
    [ source_name_pdb            => 'PDB' ],                   # For Dbxref=PDB
    [ source_name_rfam           => 'RFAM' ],                  # For Dbxref=Rfam
    [ source_name_tf             => 'TransFac' ],              # For Dbxref=TF
    [ source_name_uniprotsp      => 'Uniprot/SWISSPROT' ],     # For Dbxref=UniProt/Swiss-Prot
    [ source_name_uniprottr      => 'Uniprot/SPTREMBL' ],      # For Dbxref=UniProt/TrEMBL
    [ source_name_bdgpinsituexpr => 'BDGP_insitu_expr' ],      # For Dbxref=bdgpinsituexpr
    [ source_name_dedb           => 'DEDb' ],                  # For Dbxref=dedb
    [ source_name_drosdel        => 'DrosDel' ],               # For Dbxref=drosdel
    [ source_name_flygrid        => 'FlyGrid' ],               # For Dbxref=flygrid
    [ source_name_hybrigenics    => 'hybrigenics' ],           # For Dbxref=hybrigenics
    [ source_name_if             => 'InteractiveFly' ],        # For Dbxref=if

    [ source_name_prefix_ensAGgene => 'Ens_Ag_gene' ],         # For Dbxref=ensAG
    [ source_name_prefix_ensAMgene => 'Ens_Am_gene' ],         # For Dbxref=ensAM
    [ source_name_prefix_ensCEgene => 'Ens_Ce_gene' ],         # For Dbxref=ensCE
    [ source_name_prefix_ensCFgene => 'Ens_Cf_gene' ],         # For Dbxref=ensCF
    [ source_name_prefix_ensDMgene => 'Ens_Dm_gene' ],         # For Dbxref=ensDM
    [ source_name_prefix_ensDRgene => 'Ens_Dr_gene' ],         # For Dbxref=ensDR
    [ source_name_prefix_ensFRgene => 'Ens_Fr_gene' ],         # For Dbxref=ensFR
    [ source_name_prefix_ensGGgene => 'Ens_Gg_gene' ],         # For Dbxref=ensGG
    [ source_name_prefix_ensHSgene => 'Ens_Hs_gene' ],         # For Dbxref=ensHS
    [ source_name_prefix_ensMMgene => 'Ens_Mm_gene' ],         # For Dbxref=ensMM
    [ source_name_prefix_ensPTgene => 'Ens_Pt_gene' ],         # For Dbxref=ensPT
    [ source_name_prefix_ensRNgene => 'Ens_Rn_gene' ],         # For Dbxref=ensRN
    [ source_name_prefix_ensTNgene => 'Ens_Tn_gene' ],         # For Dbxref=ensTN
    [ source_name_prefix_modCBgene => 'modCB_gene' ],          # For Dbxref=modCB
    [ source_name_prefix_modCEgene => 'modCE_gene' ],          # For Dbxref=modCE
    [ source_name_prefix_modDDgene => 'modDD_gene' ],          # For Dbxref=modDD

    [ gene_types        => [ qw (gene) ] ],
    [ translation_types => [ qw (protein) ] ],
    # The transcript_types may change from release to release so check that this list is up-to-date
    [ transcript_types  => [ qw (mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA) ] ],
  );

  foreach my $setting (@settings) {
    my ($accessor, $value) = @{$setting};
    $self->$accessor($value);
  }

  # Empty result stores.
  $self->{'_xrefs'}        = [];
  $self->{'_direct_xrefs'} = [];
  $self->{'_synonyms'}     = {};

  return $self;
}

# --------------------------------------------------------------------------------
# large number of calls to SQL should now be speeded up as cached.
# Return the source id for a source name.
# The id is looked up through XrefParser::BaseParser the first time a name is
# seen and then served from the file-level %cache_source.
sub get_source {
  my ($self, $name) = @_;

  if (!defined($cache_source{$name})) {
    $cache_source{$name} = XrefParser::BaseParser->get_source_id_for_source_name($name);
  }
  return $cache_source{$name};
}


# Parse the GFF file and upload the resulting xrefs and direct xrefs.
#
# Arguments (after the invocant, which is only taken from the argument list
# when the sub has a caller one frame up):
#   source_id    - read but not used further in this sub
#   species_id   - when undefined it is looked up from the file name (get_species)
#   files        - array ref; only the first entry is parsed
#   release_file - read but not used further in this sub
#   verbose      - stored in the file-level $verbose
# Returns 1 when the file could not be parsed, 0 otherwise.
sub run {
  # Declared separately from the conditional shift: "my ... if ..." leaves the
  # variable in an unspecified state when the condition is false.
  my $self;
  $self = shift if (defined(caller(1)));

  my $source_id    = shift;
  my $species_id   = shift;
  my $files        = shift;
  my $release_file = shift;
  $verbose         = shift;

  my $file = ref($files) eq 'ARRAY' ? $files->[0] : undef;
  throw("No GFF file given to parse") if (!defined($file));

  my $species_name;
  if (!defined($species_id)) {
    ($species_id, $species_name) = get_species($file);
  }
  $self->species_id($species_id);

  my $external_source_db_name = $self->external_source_db_name();
  my $flybase_source_id       = $self->get_source($external_source_db_name);

  if (!$self->create_xrefs($flybase_source_id, $file)) {
    return 1;
  }

  my @xrefs = @{ $self->xrefs };

  # The copied list holds the same hash refs as the object, so the synonyms
  # attached here are visible through @xrefs as well.
  $self->relink_synonyms_to_xrefs();

  my @direct_xrefs = @{ $self->direct_xrefs };

  # delete previous if running directly rather than via BaseParser
  if (!defined(caller(1))) {
    print "Deleting previous xrefs for these sources\n" if($verbose);
    XrefParser::BaseParser->delete_by_source(\@xrefs);
  }
  print "... parsed.\n" if($verbose);

  print STDERR "uploading ".scalar(@xrefs)." xrefs's\n" if($verbose);
  XrefParser::BaseParser->upload_xref_object_graphs(\@xrefs);

  print STDERR "uploading ".scalar(@direct_xrefs)." direct-xrefs's\n" if($verbose);
  XrefParser::BaseParser->upload_direct_xrefs(\@direct_xrefs);

  return 0;
}


# Attach the synonyms collected during parsing to the ID and Name xrefs.
# Only xrefs whose source name starts with "FlyBaseName_" or looks like
# "flybase_..._id" are touched. The synonyms are looked up by the
# ENSEMBL_STABLE_ID stored on the xref hash; that key is present because the
# make_*_xref subs register one and the same hash as xref and as direct xref.
sub relink_synonyms_to_xrefs {
  my $self = shift;

  foreach my $x (@{ $self->xrefs }) {
    my $src_name = XrefParser::BaseParser->get_source_name_for_source_id($x->{SOURCE_ID});
    if ($src_name =~ m/^FlyBaseName_/ || $src_name =~ m/^flybase_.*_id$/) {
      $x->{SYNONYMS} = $self->get_synonyms($x->{ENSEMBL_STABLE_ID});
    }
  }
  return;
}

# --------------------------------------------------------------------------------
# Parse file into array of xref objects

# Read the GFF file line by line and create xrefs for every line whose type
# (column 3) is one of gff_object_types.
# For such a line the first attribute of column 9 must be the ID; an ID xref is
# made for it and every further attribute is offered to the synonym, Name and
# Dbxref handlers.
# Returns 0 when the file cannot be opened, 1 otherwise.
# Throws when the first attribute of a processed line is not an "ID=".
sub create_xrefs {
  my ($self, $flybase_source_id, $file) = @_;

  print STDERR "starting to parse $file...." if($verbose);

  my $gff_io = $self->get_filehandle($file);
  if ( !defined $gff_io ) {
    print STDERR "ERROR: Can't open the GFF file $file\n";
    return 0;
  }

  # A lexical line variable with an explicit defined() test: the global $_ is
  # left alone and a line that evaluates to false does not end the loop.
  while ( defined( my $line = $gff_io->getline() ) ) {
    chomp $line;
    my @col = split /\t/, $line;

    # we don't want to read the line unless it has a (true) 4th column ...
    next unless $col[3];
    # ... and describes an object we want (mRNA, gene, ...)
    next unless $self->line_contains_object_to_process( $col[2] );

    # work out if we have a gene, transcript or translation
    my $type = $self->set_ensembl_object_type($col[2]);

    # the 9th column contains all the attributes
    my @desc = split /\;/, $col[8];

    # the ID= is always the first element of this array
    my $unique_id = shift @desc;

    # "!~" is needed here: "!$unique_id =~ m/ID=/" negates $unique_id before
    # matching, so the check could never fail.
    if ( !defined($unique_id) || $unique_id !~ m/ID=/ ) {
      my $shown = defined($unique_id) ? $unique_id : '';
      throw("parse-error: There seems to be no Identifier: $shown. Suspicous!");
    }

    # for a gene, this will be FBgn, for a transcript this will be FBtr, etc
    $unique_id =~ s/ID=//g;
    $self->make_id_xref($unique_id, $type);

    # set up xref-entry for EVERY single item
    foreach my $item (@desc) {
      # remember the synonyms held in an "Alias=" item
      $self->set_flybase_synonyms($item, $unique_id);

      # make all xrefs for type "Name=" in desc-field
      # these are FlyBaseName_gene for genes, FlyBaseName_transcript for transcripts, etc
      $self->make_name_xref($item, $unique_id, $type);

      # make all xrefs for type "Dbxref=" in desc-field
      $self->make_dbxref_xref($item, $unique_id, $type);
    }
  }

  $gff_io->close();
  return 1;
}


# Map a GFF type (column 3) to 'gene', 'translation' or 'transcript' using the
# configured gene_types, translation_types and transcript_types, checked in
# that order.
# Returns nothing when the type is in none of the lists.
sub set_ensembl_object_type {
  my ($self, $t) = @_;   # $t is identifier in gff for object : CDS,mRNA,gene,pseudogene,snRNA,....

  return unless defined $t;

  my @lookups = (
    [ gene        => $self->gene_types ],
    [ translation => $self->translation_types ],
    [ transcript  => $self->transcript_types ],
  );

  foreach my $lookup (@lookups) {
    my ($object_type, $gff_types) = @{$lookup};
    foreach my $hc (@{$gff_types}) {
      # plain string comparison: the configured types are names, not patterns
      return $object_type if ($t eq $hc);
    }
  }
  return;
}


# Ordered rules that link the database prefix of a Dbxref entry to the accessor
# which returns the matching source name.
# Each rule is [ text looked for anywhere in the entry, accessor name ]; the
# first rule whose text is found wins. A rule without accessor marks entries
# that are recognised but deliberately not collected.
# Filled in a BEGIN block so that the table is ready as soon as the file has
# been compiled.
my @DBXREF_RULES;
BEGIN {
  @DBXREF_RULES = (
    [ 'Affymetrix:'         => 'source_name_affymetrix' ],
    [ 'DGRC-1:'             => 'source_name_dgrc1' ],
    [ 'DGRC-2:'             => 'source_name_dgrc2' ],
    [ 'DRSC:'               => 'source_name_drsc' ],
    [ 'EPD:'                => 'source_name_epd' ],
    [ 'FlyReg:'             => 'source_name_flyreg' ],
    [ 'GB:'                 => 'source_name_gb' ],
    [ 'GB_protein:'         => 'source_name_gbprotein' ],
    [ 'GCR:'                => 'source_name_gcr' ],
    [ 'GI:'                 => 'source_name_gi' ],
    [ 'GO:'                 => 'source_name_go' ],           # this is an ontology_term
    [ 'GenomeRNAi:'         => 'source_name_genomeRNAi' ],
    [ 'INTERPRO:'           => 'source_name_interpro' ],
    [ 'MEROPS:'             => 'source_name_merops' ],
    [ 'MIR:'                => 'source_name_miRBase' ],
    [ 'MITODROME:'          => 'source_name_mitodrome' ],
    [ 'NRL_3D:'             => 'source_name_nrl3d' ],
    [ 'PDB:'                => 'source_name_pdb' ],
    [ 'Rfam:'               => 'source_name_rfam' ],
    [ 'SO:'                 => undef ],                      # do nothing, we don't collect these
    [ 'TF:'                 => 'source_name_tf' ],
    [ 'UniProt/Swiss-Prot:' => 'source_name_uniprotsp' ],
    [ 'UniProt/TrEMBL:'     => 'source_name_uniprottr' ],
    [ 'bdgpinsituexpr:'     => 'source_name_bdgpinsituexpr' ],
    [ 'dedb:'               => 'source_name_dedb' ],
    [ 'drosdel:'            => 'source_name_drosdel' ],
    [ 'flygrid:'            => 'source_name_flygrid' ],
    [ 'hybrigenics:'        => 'source_name_hybrigenics' ],
    [ 'if:'                 => 'source_name_if' ],
    ( map { [ "orthologs:ens$_:" => "source_name_prefix_ens${_}gene" ] }
        qw( AG AM CE CF DM DR FR GG HS MM PT RN TN ) ),
    ( map { [ "orthologs:mod$_:" => "source_name_prefix_mod${_}gene" ] }
        qw( CB CE DD ) ),
  );
}


# Work out accession and source id for one entry of a Dbxref list.
# Arguments: the entry (e.g. "GB:AE003695") and the Ensembl object type of the
# line ('gene', 'transcript' or 'translation').
# Returns (accession, source_id). source_id is undef when no xref is to be
# made: the entry is unknown (a warning is issued), it is one that is not
# collected, or it is a FlyBase entry which does not fit the object type.
sub _resolve_dbxref {
  my ($self, $dbx, $type) = @_;

  $type = '' unless defined $type;

  if ($dbx =~ m/FlyBase:/) {
    $dbx =~ s/FlyBase://g;
    # FBgn / FBtr / FBpp are only accepted on an object of the matching type.
    my $accessor =
        ( $dbx =~ m/FBgn/ && $type eq "gene" )        ? 'source_name_fbgn'
      : ( $dbx =~ m/FBtr/ && $type eq "transcript" )  ? 'source_name_fbtr'
      : ( $dbx =~ m/FBpp/ && $type eq "translation" ) ? 'source_name_fbpp'
      : ( $dbx =~ m/FBan/ )                           ? 'source_name_fban'
      :                                                 undef;
    return ( $dbx, $accessor ? $self->get_source($self->$accessor()) : undef );
  }

  if ($dbx =~ m/FlyBase_Annotation_IDs:/) {
    $dbx =~ s/FlyBase_Annotation_IDs://g;
    my %accessor_for = (
      gene        => 'source_name_gadfly_gene',
      translation => 'source_name_gadfly_translation',
      transcript  => 'source_name_gadfly_transcript',
    );
    my $accessor = $accessor_for{$type};
    return ( $dbx, $accessor ? $self->get_source($self->$accessor()) : undef );
  }

  foreach my $rule (@DBXREF_RULES) {
    my ($wanted, $accessor) = @{$rule};
    next if ( index($dbx, $wanted) < 0 );

    # recognised, but not collected
    return ( $dbx, undef ) unless defined $accessor;

    # Ortholog entries keep their species code in the accession
    # (orthologs:ensHS:ENSG... becomes ensHS:ENSG...); everything else loses
    # the whole database prefix.
    my $strip = ( $wanted =~ m/^orthologs:/ ) ? 'orthologs:' : $wanted;
    $dbx =~ s/\Q$strip\E//g;

    return ( $dbx, $self->get_source($self->$accessor()) );
  }

  warning("Dbxref type not recognised : $dbx");
  return ( $dbx, undef );
}


# Make xrefs (and direct xrefs) for every entry of a "Dbxref=" attribute.
#   item      = attribute out of column 9
#   unique_id = ID of the line (FBgn, FBtr, FBpp)
#   type      = gene, transcript, translation
# Attributes that hold neither the Dbxref nor the ontology tag are ignored.
# NOTE(review): an "Ontology_term=" attribute passes the first test but only
# the Dbxref tag is used to split the item, so it yields no xrefs - confirm
# whether the GO terms listed there are meant to be loaded.
sub make_dbxref_xref {
  my ($self, $item, $unique_id, $type) = @_;

  my $tg1 = $self->gff_dbxref;
  my $tg2 = $self->gff_ontology;

  return unless ( $item =~ /$tg1/ || $item =~ /$tg2/ );

  # split the xrefs up into a list
  my $fields  = get_fields($item, $tg1);
  my @entries = $fields ? @{$fields} : ();

  foreach my $entry (@entries) {
    my ($accession, $src_id) = $self->_resolve_dbxref($entry, $type);

    # only add xref entry when a source could be worked out
    next unless $src_id;

    my $xref = {
      ACCESSION  => $accession,
      LABEL      => $accession,
      SOURCE_ID  => $src_id,
      SPECIES_ID => $self->species_id(),
    };
    $self->add_xref($xref);

    if ($type) {
      # The direct xref is the very same hash as the xref, so the stable id
      # and the type end up on the xref as well.
      $xref->{ENSEMBL_STABLE_ID} = $unique_id;
      $xref->{ENSEMBL_TYPE}      = $type;
      $self->add_direct_xref($xref);
    }
  }
  return;
}


# Store the synonyms held in an "Alias=" attribute for the given ID.
# Returns an array ref of the synonyms, or undef when the item does not hold
# the synonym tag.
sub set_flybase_synonyms {
  my ($self, $item, $unique_id) = @_;

  my $syn1 = $self->gff_synonym;
  if ($item =~ /$syn1/) {
    my $s1   = get_fields($item, $syn1);
    my @syns = $s1 ? @{$s1} : ();
    $self->add_synonym($unique_id, \@syns);
    return \@syns;
  }
  return undef;
}


# Make the xref for the ID of a line (FBgn / FBtr / FBpp) and register it as
# direct xref for the object with that ID.
# Throws when the type is not gene, transcript or translation.
sub make_id_xref {
  my ($self, $unique_id, $type) = @_;

  my %source_name_accessor_for = (
    gene        => 'source_name_fbgn',
    transcript  => 'source_name_fbtr',
    translation => 'source_name_fbpp',
  );
  my $accessor = defined($type) ? $source_name_accessor_for{$type} : undef;
  if (!$accessor) {
    my $shown = defined($type) ? $type : '';
    throw ("Type $shown not recognised");
  }

  my $xref = {
    ACCESSION  => $unique_id,
    LABEL      => $unique_id,
    SPECIES_ID => $self->species_id(),
    SYNONYMS   => $self->get_synonyms($unique_id),
    SOURCE_ID  => $self->get_source($self->$accessor()),
  };
  $self->add_xref($xref);

  # The direct xref is the very same hash as the xref; relink_synonyms_to_xrefs
  # relies on the ENSEMBL_STABLE_ID stored here.
  $xref->{ENSEMBL_STABLE_ID} = $unique_id;
  $xref->{ENSEMBL_TYPE}      = $type;
  $xref->{LINKAGE_TYPE}      = 'bla';
  $xref->{SYNONYMS}          = $self->get_synonyms($unique_id);
  $self->add_direct_xref($xref);

  return;
}


# Make the xref for a "Name=" attribute and register it as direct xref for the
# object with the given ID. Items without the name tag are ignored.
# The source name is the name prefix followed by the object type, where
# "translation" is turned into "translations".
# Throws when the attribute holds more than one name.
sub make_name_xref {
  my ($self, $item, $unique_id, $type) = @_;

  my $target = $self->gff_name;
  return unless ( $item =~ m/$target/ );   ##Name=

  # remove the Name= bit and split the names on a ','
  my $gff_gene_name = get_fields( $item, $target );
  throw("there is more than one id for item $item\n") if $$gff_gene_name[1];

  my $type_s = $type;
  if (defined($type) && $type eq "translation") {
    $type_s = $type."s";
  }
  $type_s = '' unless defined $type_s;

  my $xref = {
    ACCESSION  => $$gff_gene_name[0],
    LABEL      => $$gff_gene_name[0],
    SPECIES_ID => $self->species_id(),
    SYNONYMS   => $self->get_synonyms($unique_id),
    SOURCE_ID  => $self->get_source($self->source_name_name_prefix().$type_s),
  };
  $self->add_xref($xref);

  # A direct xref is made for every object type that is set (an older comment
  # here said "only allow Name on genes", which the code does not enforce).
  # As for the ID xref, it is the same hash as the xref itself.
  if ($type) {
    $xref->{ENSEMBL_STABLE_ID} = $unique_id;
    $xref->{ENSEMBL_TYPE}      = $type;
    $xref->{LINKAGE_TYPE}      = 'bla';
    $xref->{SYNONYMS}          = $self->get_synonyms($unique_id);
    $self->add_direct_xref($xref);
  }
  return;
}


# Split the value of an attribute into its comma-separated entries.
# Arguments: the attribute (e.g. "Alias=a,b") and its tag (e.g. "Alias=").
# Returns an array ref of the entries, or undef when the attribute does not
# hold the tag. The tag is used as a pattern and removed wherever it occurs.
sub get_fields {
  my ($item, $target) = @_;

  # the item does not hold information of the specific field
  return undef unless ( $item =~ m/$target/ );

  $item =~ s/$target//g;

  # check if there is more than one synonym / dbxref ...
  my @entrys = ( $item =~ /,/ ) ? split( /\,/, $item ) : ($item);
  return \@entrys;
}


# Simple get/set accessors.
# Each name below becomes a method that stores its argument (when one is given)
# under the listed hash key of the object and returns the stored value.
# The methods are installed in a BEGIN block so that they exist as soon as the
# file has been compiled.
BEGIN {
  my %hash_key_for = (
    source_name_name               => '_source_name_name',
    source_name_name_prefix        => '_source_name_name_prefix',
    source_name_synonym            => '_source_name_synonym',
    source_name_fbgn               => '_source_name_gene',
    source_name_gadfly_gene        => '_source_name_gadfly_gene',
    source_name_gadfly_transcript  => '_source_name_gadfly_transcript',
    source_name_gadfly_translation => '_source_name_gadfly_translation',
    source_name_fbtr               => '_source_name_transcript',
    source_name_fbpp               => '_source_name_fbpp',
    source_name_fban               => '_sn_fban',
    source_name_affymetrix         => '_sn_affymetrix',
    source_name_dgrc1              => '_sn_dgrc1',
    source_name_dgrc2              => '_sn_dgrc2',
    source_name_drsc               => '_sn_drsc',
    source_name_epd                => '_sn_epd',
    source_name_flyreg             => '_sn_flyreg',
    source_name_gb                 => '_sn_gb',
    source_name_gbprotein          => '_sn_gbprotein',
    source_name_gcr                => '_sn_gcr',
    source_name_gi                 => '_sn_gi',
    source_name_go                 => '_sn_go',
    source_name_genomeRNAi         => '_sn_genomeRNAi',
    source_name_interpro           => '_sn_interpro',
    source_name_merops             => '_sn_merops',
    source_name_miRBase            => '_sn_miRBase',
    source_name_mitodrome          => '_sn_mitodrome',
    source_name_nrl3d              => '_sn_nrl3d',
    source_name_pdb                => '_sn_pdb',
    source_name_rfam               => '_sn_rfam',
    source_name_tf                 => '_sn_tf',
    source_name_uniprotsp          => '_sn_uniprotsp',
    source_name_uniprottr          => '_sn_uniprottr',
    source_name_bdgpinsituexpr     => '_sn_bdgpinsituexpr',
    source_name_dedb               => '_sn_dedb',
    source_name_drosdel            => '_sn_drosdel',
    source_name_flygrid            => '_sn_flygrid',
    source_name_hybrigenics        => '_sn_hybrigenics',
    source_name_if                 => '_sn_if',
    source_name_prefix_ensAGgene   => '_sn_prefix_ensAG',
    source_name_prefix_ensAMgene   => '_sn_prefix_ensAM',
    source_name_prefix_ensCEgene   => '_sn_prefix_ensCE',
    source_name_prefix_ensCFgene   => '_sn_prefix_ensCF',
    source_name_prefix_ensDMgene   => '_sn_prefix_ensDM',
    source_name_prefix_ensDRgene   => '_sn_prefix_ensDR',
    source_name_prefix_ensFRgene   => '_sn_prefix_ensFR',
    source_name_prefix_ensGGgene   => '_sn_prefix_ensGG',
    source_name_prefix_ensHSgene   => '_sn_prefix_ensHS',
    source_name_prefix_ensMMgene   => '_sn_prefix_ensMM',
    source_name_prefix_ensPTgene   => '_sn_prefix_ensPT',
    source_name_prefix_ensRNgene   => '_sn_ensRN',
    source_name_prefix_ensTNgene   => '_sn_ensTN',
    source_name_prefix_modCBgene   => '_sn_modCB',
    source_name_prefix_modCEgene   => '_sn_modCE',
    source_name_prefix_modDDgene   => '_sn_modDD',
    gff_name                       => '_gff_name',
    gff_dbxref                     => '_gff_dbxref',
    gff_synonym                    => '_gff_synonym',
    gff_ontology                   => '_gff_ontology',
    species_id                     => '_species_id',
    xrefs                          => '_xrefs',
    direct_xrefs                   => '_direct_xrefs',
    synonyms                       => '_synonyms',
    gene_types                     => '_gene_types',
    transcript_types               => '_trans_types',
    translation_types              => '_tl_types',
  );

  foreach my $method (sort keys %hash_key_for) {
    my $hash_key = $hash_key_for{$method};
    no strict 'refs';
    *{ __PACKAGE__ . "::$method" } = sub {
      my $self = shift;
      $self->{$hash_key} = shift if @_ ;
      return $self->{$hash_key};
    };
  }
}


# Append one xref (hash ref) to the list of xrefs.
sub add_xref {
  my ($self, $add_xref) = @_;
  push @{ $self->xrefs() }, $add_xref;
  return;
}


# Append one direct xref (hash ref) to the list of direct xrefs.
sub add_direct_xref {
  my ($self, $dr) = @_;
  push @{ $self->direct_xrefs() }, $dr;
  return;
}


# Tell whether a GFF type (column 3: mRNA, gene, pseudogene, CDS, ...) is one
# of the configured gff_object_types.
# Returns 1 when it is, 0 otherwise.
sub line_contains_object_to_process {
  my ($self, $type_of_line) = @_;

  return 0 unless defined $type_of_line;

  foreach my $check_types ( @{ $self->gff_object_types } ) {
    # plain string comparison: the type read from the file is not a pattern
    return 1 if ($check_types eq $type_of_line);
  }
  return 0;
}


=pod

=head2 gff_object_types

 Title       : gff_object_types
 Usage       : $obj->gff_object_types(array-ref)
 Function    : contains gff-type-identifiers of gff-objects which have to be processed
 Arguments   : array-ref
 Return-Val  : array-ref

=cut

sub gff_object_types {
  my $self = shift;
  $self->{_gff_object_types} = shift if @_ ;
  return $self->{_gff_object_types};
}


=pod

=head2 external_source_db_name

 Title       : external_source_db_name
 Usage       : $obj->external_source_db_name(external db name)
 Function    : returns name of hardcoded external source db name
 Arguments   : external db name
 Return-Val  : string

=cut

sub external_source_db_name {
  my $self = shift;
  $self->{_external_source_db_name} = shift if @_ ;
  return $self->{_external_source_db_name};
}

# --------------------------------------------------------------------------------
# Get species (id and name) from file
# The part of the file's base name in front of the first '.' is used as the
# taxonomy ID and looked up in the species table.
# Returns (species_id, species_name); throws when no species is found.

sub get_species {
  my ($file) = @_;

  my ($taxonomy_id) = split(/\./, basename($file));

  my $sth = XrefParser::BaseParser->dbi()->prepare("SELECT species_id,name FROM species WHERE taxonomy_id=?");
  $sth->execute($taxonomy_id);

  # when several rows come back the last one wins
  my ($species_id, $species_name);
  while (my @row = $sth->fetchrow_array()) {
    ($species_id, $species_name) = @row[0, 1];
  }
  $sth->finish;

  if (defined $species_name) {
    print "Taxonomy ID " . $taxonomy_id . " corresponds to species ID " . $species_id . " name " . $species_name . "\n" if($verbose);
  }
  else {
    # report the taxonomy ID that was searched for; no species id exists here
    throw("Cannot find species corresponding to taxonomy ID " . $taxonomy_id . " - check species table\n");
  }

  return ($species_id, $species_name);
}


# Remember the synonyms (array ref) of an ID; nothing is stored when no
# synonym list is given.
sub add_synonym {
  my ($self, $unique_id, $synref) = @_;
  ${ $self->synonyms }{$unique_id} = $synref if($synref);
  return;
}


# Return the synonyms (array ref) stored for an ID, undef when there are none.
sub get_synonyms {
  my ($self, $unique_id) = @_;
  return ${ $self->synonyms }{$unique_id};
}

1;

# Drosophila v5.3 : xrefs
# Gff_file                external_db_id  db_name
# ==
# Affymetrix              3120            AFFY_DrosGenome1
# DGRC-1                  830             DGRC-1
# DGRC-2                  831             DGRC-2
# DRSC                    840             DRSC
# EPD                     10100           EPD
# FlyBase                 800             flybase_gene_id
# FlyBase_Annotation_IDs  804             flybase_annotation_id
# FlyReg                  850             FlyReg
# GB                      700             EMBL
# GB_protein              1700            protein_id
# GCR                     10200           GPCR
# GI                      10900           GI
# GO                      1000            GO
# GenomeRNAi              860             GenomeRNAi
# INTERPRO                1200            Interpro
# MEROPS                  10300           MEROPS
# MIR                     10400           miRBase
# MITODROME               870             MitoDrome
# NRL_3D                  1600            PDB
# PDB                     1600            PDB
# Rfam                    4200            RFAM
# TF                      10500           TransFac
# UniProt/Swiss-Prot      2200            Uniprot/SWISSPROT
# UniProt/TrEMBL          2000            Uniprot/SPTREMBL
# bdgpinsituexpr          880             BDGP_insitu_expr
# dedb                    890             DEDb
# drosdel                 881             DrosDel
# flygrid                 882             FlyGrid
# hybrigenics             883             hybrigenics
# if                      884             InteractiveFly
# ensAG                   6600            Ens_Ag_gene   # Anopheles gambiae
# ensAM                   6630            Ens_Am_gene   # apis mellifera?
# ensCE                   6660            Ens_Ce_gene   # C Elegans
# ensCF                   5700            Ens_Cf_gene   # Canis familiaris
# ensDM                   6690            Ens_Dm_gene   #
# ensDR                   5800            Ens_Dr_gene   # Danio rerio
# ensFR                   6720            Ens_Fr_gene   # Takifugu rubripes
# ensGG                   6400            Ens_Gg_gene   # Gallus gallus
# ensHS                   2700            Ens_Hs_gene   # Homo sapiens
# ensMM                   5000            Ens_Mm_gene   # mus musculus
# ensPT                   6750            Ens_Pt_gene   # Pan troglodytes
# ensRN                   6200            Ens_Rn_gene   # Rattus norvegicus
# ensTN                   6810            Ens_Tn_gene   # Tetraodon nigroviridis
# modCB                   10600           modCB         # InParanoid Model organism database, Caenorhabditis briggsae
# modCE                   10700           modCE         # Caenorhabditis elegans
# modDD                   10800           modDD         # Dictyostelium discoideum