Raw content of Bio::EnsEMBL::Compara::RunnableDB::LoadUniProt.

Raw content of Bio::EnsEMBL::Compara::RunnableDB::LoadUniProt # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Compara::RunnableDB::LoadUniProt =cut =head1 DESCRIPTION This object uses the getz and pfetch command line programs to access the SRS database of Uniprot sequences. Its purpose is to load protein sequences from Uniprot into the compara database. Right now it has hard coded filters of a minimum sequence length of 80 and taxon in metazoa and distinguishes SWISSPROT from SPTREMBL. The format of the input_id follows the format of a perl hash reference example: "{srs=>'uniprot', taxon_id=>4932}" #loads all uniprot for S. cerevisiae keys: srs => valid values 'swissprot', 'sptrembl', 'uniprot' taxon_id => <taxon_id> optional if one want to load from a specific species if not specified it will load all 'metazoa' from the srs source genome_db_id => <genome_db_id> optional: will associate this loaded set into the specified GenomeDB does not create genome_db entry, assumes it was already created. Use prudently since it does no checks accession_number => 0/1 (default is 1=on) optional if one want to load Accession Number (AC) (DEFAULT) as stable_id rather than Entry Name (ID) more examples: "{srs=>'swissprot'}" #loads all swissprot metazoa "{srs=>'swissprot', taxon_id=>4932}" "{srs=>'sptrembl', taxon_id=>4932}" =cut =head1 CONTACT Contact Jessica Severin on LoadUniprot implemetation/design detail: jessica@ebi.ac.uk Contact Abel Ureta-Vidal on EnsEMBL::Compara in general: abel@ebi.ac.uk Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk =cut =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::RunnableDB::LoadUniProt; use strict; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBLoader; use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor; use Bio::EnsEMBL::Compara::Member; use Bio::EnsEMBL::Compara::Homology; use Bio::EnsEMBL::Compara::Member; use Bio::EnsEMBL::Compara::Subset; use Bio::EnsEMBL::Hive; use Bio::EnsEMBL::Hive::Process; our @ISA = qw(Bio::EnsEMBL::Hive::Process); =head2 fetch_input Title : fetch_input Usage : $self->fetch_input Function: prepares global variables and DB connections Returns : none Args : none =cut sub fetch_input { my( $self) = @_; #create a Compara::DBAdaptor which shares the same DBI handle #with the DBAdaptor that is based into this runnable $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc); $self->{'source'} = 'SWISSPROT'; $self->{'taxon_id'} = undef; #no ncbi_taxid filter, get all metzoa $self->{'genome_db_id'} = undef; $self->{'accession_number'} = 1; $self->debug(0); if(defined($self->input_id)) { #print("input_id = ".$self->input_id."\n"); my $input_hash = eval($self->input_id); if(defined($input_hash)) { $self->{'accession_number'} = $input_hash->{'accession_number'} if(defined($input_hash->{'accession_number'})); $self->{'source'} = $input_hash->{'srs'} if(defined($input_hash->{'srs'})); $self->{'taxon_id'} = $input_hash->{'taxon_id'} if(defined($input_hash->{'taxon_id'})); $self->{'genome_db_id'} = $input_hash->{'genome_db_id'} if(defined($input_hash->{'genome_db_id'})); } } return 1; } sub run { my $self = shift; $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); $self->{internal_taxon_ids}; foreach my $genome_db (@{$self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all}) { $self->{internal_taxon_ids}{$genome_db->taxon_id} = 1; } return unless($self->{'source'}); my $subset_name = $self->{'source'}; my $uniprot_ids = []; my %allowed_taxon_ids = {}; if($self->{'taxon_id'}) { $subset_name .= " ncbi_taxid:" . $self->{'taxon_id'}; $uniprot_ids = $self->fetch_all_uniprot_ids_for_taxid($self->{'source'}, $self->{'taxon_id'}); $allowed_taxon_ids{$self->{'taxon_id'}} = 1; } else { $subset_name .= " metazoa"; $uniprot_ids = $self->get_metazoa_uniprot_ids($self->{'source'}); # Fungi/Metazoa group my $taxon_id = 33154; my $node = $self->{'comparaDBA'}->get_NCBITaxonAdaptor->fetch_node_by_taxon_id($taxon_id); #foreach my $leaf ( @{$node->get_all_leaves} ) { foreach my $leaf ( @{$node->get_all_leaves_indexed} ) { # the indexed method should be much faster when data has left and right indexes built $allowed_taxon_ids{$leaf->node_id} = 1; if ($leaf->rank ne "species") { $allowed_taxon_ids{$leaf->parent->node_id} = 1; } } $node->release_tree; } $self->{'subset'} = Bio::EnsEMBL::Compara::Subset->new(-name=>$subset_name); $self->{'comparaDBA'}->get_SubsetAdaptor->store($self->{'subset'}); $self->{'allowed_taxon_ids'} = \%allowed_taxon_ids; $self->loadMembersFromUniprotIdList($self->{'source'}, $uniprot_ids); return 1; } sub write_output { my $self = shift; my $outputHash = {}; $outputHash = eval($self->input_id) if(defined($self->input_id)); $outputHash->{'ss'} = $self->{'subset'}->dbID; $outputHash->{'gdb'} = $self->{'genome_db_id'} if($self->{'genome_db_id'}); my $output_id = $self->encode_hash($outputHash); $self->input_job->input_id($output_id); if($self->{'genome_db_id'}) { $self->dataflow_output_id($output_id, 2); } return 1; } ###################################### # # subroutines # ##################################### sub loadMembersFromUniprotIdList { my $self = shift; my $source = shift; my $uniprot_ids = shift; #array ref my @id_chunk; my $count = scalar(@$uniprot_ids); # while(@uniprot_ids) { # @id_chunk = splice(@uniprot_ids, 0, 30); # $self->pfetch_and_store_by_ids($source, @id_chunk); # } my $index=1; foreach my $id (@$uniprot_ids) { $index++; print("check/load $index ids\n") if($index % 100 == 0 and $self->debug); my $stable_id = $id; $stable_id = $1 if($id =~ /^(\S+)\.\d+$/); my $member = $self->{'comparaDBA'}->get_MemberAdaptor->fetch_by_source_stable_id('Uniprot/'.$source, $stable_id); if($member and $member->sequence_id) { #print("$source $stable_id : already loadled in compara\n"); } else { #print("need to load $index : $source $stable_id\n"); push @id_chunk, $id; } if(scalar(@id_chunk)>=30) { $self->pfetch_and_store_by_ids($source, @id_chunk); @id_chunk = (); } } $self->pfetch_and_store_by_ids($source, @id_chunk); printf("fetched %d ids from %s\n", $count, $source) if($self->debug); } sub get_metazoa_uniprot_ids { my $self = shift; my $source = shift; #'swissprot' or 'sptrembl' my @taxonomy_division = qw(FUN HUM MAM ROD VRT INV); my $division = "STD"; $division = "PRE" if ($source eq "SPTREMBL"); my @all_ids; foreach my $txd (@taxonomy_division) { my $cmd = "mfetch -d uniprot -v av -i 'txd:$txd&div:$division'"; print("$cmd\n") if($self->debug); my @ids = split(/\s/, qx/$cmd/); push @all_ids, @ids; } printf("fetched %d ids from %s\n", scalar(@all_ids), $source) if($self->debug); return \@all_ids; } sub fetch_all_uniprot_ids_for_taxid { my $self = shift; my $source = shift; #'uniprot', 'swissprot' or 'sptrembl' my $taxon_id = shift; my $division = "STD"; $division = "PRE" if ($source eq "SPTREMBL"); my $cmd = "mfetch -d uniprot -v av -i 'txi:$taxon_id&div:$division'"; print("$cmd\n") if($self->debug); my @ids = split(/\s/, qx/$cmd/); printf("fetched %d ids from %s\n", scalar(@ids), $source) if($self->debug); return \@ids; } sub pfetch_and_store_by_ids { my $self = shift; my $source = shift; # my @orig_ids = @_; # my @ids; my @ids = @_; # foreach my $id (@orig_ids) { # if($id =~ /(.*)\:(.*)/) { # #$source = $1; # $id = $2; # } # push @ids, $id; # } return unless(@ids); my $id_string = join(' ', @ids); open(IN, "pfetch -F $id_string |") or $self->throw("Error running pfetch for ids [$id_string]"); print STDERR "$id_string\n"; my $fh = Bio::SeqIO->new(-fh=>\*IN, -format=>"swiss"); my $nb_seq = 0; while (my $seq = $fh->next_seq){ next if ($seq->length < 80); #################################################################### # This bit is to avoid duplicated entries btw Ensembl and Uniprot # It only affects the Ensembl species dbs, and right now I am using # a home-brewed version of Bio::SeqIO::swiss to parse the PE entries # in a similar manner as comments (CC) but of type 'evidence' my $taxon_id; eval { $taxon_id = $seq->species->ncbi_taxid;}; if (defined($self->{internal_taxon_ids}{$taxon_id})) { my $evidence_annotations; eval { $evidence_annotations = $seq->get_Annotations('evidence');}; # old style if ($@) { my $annotation = $seq->annotation; $evidence_annotations = $annotation->get_Annotations('evidence'); } if (defined $evidence_annotations) { if ($evidence_annotations->text =~ /^4/) { print STDERR $seq->display_id, "PE discarded ", $evidence_annotations->text, "\n"; next; } # We dont want duplicated entries } } #################################################################### $self->store_bioseq($seq, $source); $nb_seq++; } close IN; if ($self->debug && $nb_seq != scalar @ids) { print "Expected ", scalar @ids," seqs but only $nb_seq seen from $id_string\n"; } } sub store_bioseq { my $self = shift; my $bioseq = shift; my $source = shift; if ($source =~ /swissprot/i) { $source = "Uniprot/SWISSPROT"; } elsif ($source =~ /sptrembl/i) { $source = "Uniprot/SPTREMBL"; } return unless($bioseq); my $species = $bioseq->species; return unless($species); if($self->debug) { printf("store_bioseq %s %s : %d : %s", $source, $bioseq->display_id, $species->ncbi_taxid, $species->species); } my $taxon = $self->{'comparaDBA'}->get_NCBITaxonAdaptor->fetch_node_by_taxon_id($species->ncbi_taxid); unless($taxon) { #taxon not in compara, do not store the member and warn warning("Taxon id " . $species->ncbi_taxid . " from $source " . $bioseq->accession_number ." not in the database. Member not stored."); return 1; } unless ($self->{'allowed_taxon_ids'}{$taxon->dbID}) { return 1; } my $member = new Bio::EnsEMBL::Compara::Member; if (defined $self->{'accession_number'} && $self->{'accession_number'} == 1) { $member->stable_id($bioseq->accession_number); } else { $member->stable_id($bioseq->display_id); } $member->display_label($bioseq->display_id); $member->taxon_id($taxon->dbID); $member->description($bioseq->desc); $member->source_name($source); $member->sequence($bioseq->seq); $member->genome_db_id($self->{'genome_db_id'}) if($self->{'genome_db_id'}); eval { $self->{'comparaDBA'}->get_MemberAdaptor->store($member); print(" --stored") if($self->debug); }; $self->{'subset'}->add_member($member); print("\n") if($self->debug); return 1; } 1;