LoadUniProt documentation.

This object uses the getz and pfetch command line programs to access
the SRS database of Uniprot sequences.
Its purpose is to load protein sequences from Uniprot into the compara
database.
Right now it has hard coded filters of a minimum sequence length of 80
and taxon in metazoa and distinguishes SWISSPROT from SPTREMBL.
The format of the input_id follows the format of a perl hash reference
example:
"{srs=>'uniprot', taxon_id=>4932}"
#loads all uniprot for S. cerevisiae
keys:
srs => valid values 'swissprot', 'sptrembl', 'uniprot'
taxon_id => <taxon_id>
optional if one want to load from a specific species
if not specified it will load all 'metazoa' from the srs source
genome_db_id => <genome_db_id>
optional: will associate this loaded set into the specified
GenomeDB does not create genome_db entry, assumes it was already
created. Use prudently since it does no checks
accession_number => 0/1 (default is 1=on)
optional if one want to load Accession Number (AC) (DEFAULT) as
stable_id rather than Entry Name (ID)
more examples:
"{srs=>'swissprot'}" #loads all swissprot metazoa
"{srs=>'swissprot', taxon_id=>4932}"
"{srs=>'sptrembl', taxon_id=>4932}"

sub fetch_all_uniprot_ids_for_taxid {

  my $self   = shift;
  my $source = shift;  #'uniprot', 'swissprot' or 'sptrembl'
  my $taxon_id = shift;
  my $division = "STD";
  $division = "PRE" if ($source eq "SPTREMBL");

  my $cmd = "mfetch -d uniprot -v av -i 'txi:$taxon_id&div:$division'";

  print("$cmd\n") if($self->debug);
  my @ids = split(/\s/, qx//$cmd/);
  printf("fetched %d ids from %s\n", scalar(@ids), $source) if($self->debug);
  return \@ids;

}

sub fetch_input {

  my( $self) = @_;

  #create a Compara::DBAdaptor which shares the same DBI handle
  #with the DBAdaptor that is based into this runnable
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);

  $self->{'source'} = 'SWISSPROT';
  $self->{'taxon_id'} = undef;  #no ncbi_taxid filter, get all metzoa
  $self->{'genome_db_id'} = undef;
  $self->{'accession_number'} = 1;
  $self->debug(0);
  if(defined($self->input_id)) {
    #print("input_id = ".$self->input_id."\n");
    my $input_hash = eval($self->input_id);
    if(defined($input_hash)) {
      $self->{'accession_number'} = $input_hash->{'accession_number'} if(defined($input_hash->{'accession_number'}));
      $self->{'source'} = $input_hash->{'srs'} if(defined($input_hash->{'srs'}));
      $self->{'taxon_id'} = $input_hash->{'taxon_id'} if(defined($input_hash->{'taxon_id'}));
      $self->{'genome_db_id'} = $input_hash->{'genome_db_id'} if(defined($input_hash->{'genome_db_id'}));
    }
  }
    
  return 1;

}

sub get_metazoa_uniprot_ids {

  my $self   = shift;
  my $source = shift;  #'swissprot' or 'sptrembl'

  my @taxonomy_division = qw(FUN HUM MAM ROD VRT INV);
  my $division = "STD";
  $division = "PRE" if ($source eq "SPTREMBL");
  my @all_ids;
  foreach my $txd (@taxonomy_division) {
    my $cmd = "mfetch -d uniprot -v av -i 'txd:$txd&div:$division'";
    print("$cmd\n") if($self->debug);
    my @ids = split(/\s/, qx//$cmd/);
    push @all_ids, @ids;
  }
  printf("fetched %d ids from %s\n", scalar(@all_ids), $source) if($self->debug);
  return \@all_ids;

}

sub loadMembersFromUniprotIdList {

  my $self = shift;
  my $source = shift;
  my $uniprot_ids = shift;  #array ref
  
  my @id_chunk; 

  my $count = scalar(@$uniprot_ids);
  
# while(@uniprot_ids) {
#   @id_chunk = splice(@uniprot_ids, 0, 30);
#   $self->pfetch_and_store_by_ids($source, @id_chunk);
# }

  my $index=1;
  foreach my $id (@$uniprot_ids) {
    $index++;
    print("check/load $index ids\n") if($index % 100 == 0 and $self->debug);
    my $stable_id = $id;
    $stable_id = $1 if($id =~ /^(\S+)\.\d+$/);
    my $member = $self->{'comparaDBA'}->get_MemberAdaptor->fetch_by_source_stable_id('Uniprot/'.$source, $stable_id);
    if($member and $member->sequence_id) {
      #print("$source $stable_id : already loadled in compara\n");
    } else {
      #print("need to load $index : $source $stable_id\n");
      push @id_chunk, $id;
    }
    if(scalar(@id_chunk)>=30) {
      $self->pfetch_and_store_by_ids($source, @id_chunk);
      @id_chunk = ();
    }
  }
  $self->pfetch_and_store_by_ids($source, @id_chunk);

  printf("fetched %d ids from %s\n", $count, $source) if($self->debug);

}

sub pfetch_and_store_by_ids {

  my $self = shift;
  my $source = shift;

#  my @orig_ids = @_;
#  my @ids;
  my @ids = @_;

#  foreach my $id (@orig_ids) {
#    if($id =~ /(.*)\:(.*)/) {
#      #$source = $1;
#      $id = $2;
#    }
#    push @ids, $id;
#  }
  return unless(@ids);
  my $id_string = join(' ', @ids);

  open(IN, "pfetch -F $id_string |")
    or $self->throw("Error running pfetch for ids [$id_string]");

  print STDERR "$id_string\n";
  my $fh = Bio::SeqIO->new(-fh=>\*IN, -format=>"swiss");
  my $nb_seq = 0;
  while (my $seq = $fh->next_seq){
    next if ($seq->length < 80);

    ####################################################################
    # This bit is to avoid duplicated entries btw Ensembl and Uniprot
    # It only affects the Ensembl species dbs, and right now I am using
    # a home-brewed version of Bio::SeqIO::swiss to parse the PE entries
    # in a similar manner as comments (CC) but of type 'evidence'
    my $taxon_id; eval { $taxon_id = $seq->species->ncbi_taxid;};
    if (defined($self->{internal_taxon_ids}{$taxon_id})) {
      my $evidence_annotations; 
      eval { $evidence_annotations = $seq->get_Annotations('evidence');}; # old style
      if ($@) {
        my $annotation = $seq->annotation;
        $evidence_annotations = $annotation->get_Annotations('evidence');
      }
      if (defined $evidence_annotations) {
        if ($evidence_annotations->text =~ /^4/) {
          print STDERR $seq->display_id, "PE discarded ", $evidence_annotations->text, "\n";
          next;
        }
        # We dont want duplicated entries
      }
    }
    ####################################################################

    $self->store_bioseq($seq, $source);
    $nb_seq++;
  }
  close IN;
  if ($self->debug && $nb_seq != scalar @ids) {
    print "Expected ", scalar @ids," seqs but only $nb_seq seen from $id_string\n";
  }

}

sub run {

  my $self = shift;

  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);

  $self->{internal_taxon_ids};
  foreach my $genome_db (@{$self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all}) {
    $self->{internal_taxon_ids}{$genome_db->taxon_id} = 1;
  }

  return unless($self->{'source'});

  my $subset_name = $self->{'source'};
  
  my $uniprot_ids = [];
  my %allowed_taxon_ids = {};
  if($self->{'taxon_id'}) {
    $subset_name .= " ncbi_taxid:" . $self->{'taxon_id'};
    $uniprot_ids = $self->fetch_all_uniprot_ids_for_taxid($self->{'source'}, $self->{'taxon_id'});
    $allowed_taxon_ids{$self->{'taxon_id'}} = 1;
  } else {
    $subset_name .= " metazoa";
    $uniprot_ids = $self->get_metazoa_uniprot_ids($self->{'source'});

    # Fungi/Metazoa group
    my $taxon_id = 33154;
    my $node = $self->{'comparaDBA'}->get_NCBITaxonAdaptor->fetch_node_by_taxon_id($taxon_id);

    #foreach my $leaf ( @{$node->get_all_leaves} ) {
    foreach my $leaf ( @{$node->get_all_leaves_indexed} ) {
    # the indexed method should be much faster when data has left and right indexes built
      $allowed_taxon_ids{$leaf->node_id} = 1;
      if ($leaf->rank ne "species") {
        $allowed_taxon_ids{$leaf->parent->node_id} = 1;
      }
    }
    $node->release_tree;
  }

  $self->{'subset'}  = Bio::EnsEMBL::Compara::Subset->new(-name=>$subset_name);
  $self->{'comparaDBA'}->get_SubsetAdaptor->store($self->{'subset'});
  $self->{'allowed_taxon_ids'} =\% allowed_taxon_ids;

  $self->loadMembersFromUniprotIdList($self->{'source'}, $uniprot_ids);
  
  return 1;

}

sub store_bioseq {

  my $self = shift;
  my $bioseq = shift;
  my $source = shift;

  if ($source =~ /swissprot/i) {
    $source = "Uniprot/SWISSPROT";
  } elsif ($source =~ /sptrembl/i) {
    $source = "Uniprot/SPTREMBL";
  }

  return unless($bioseq);
  my $species = $bioseq->species;
  return unless($species);

  if($self->debug) {
    printf("store_bioseq %s %s : %d : %s", $source, $bioseq->display_id, $species->ncbi_taxid, $species->species);
  }
   
  my $taxon = $self->{'comparaDBA'}->get_NCBITaxonAdaptor->fetch_node_by_taxon_id($species->ncbi_taxid);
  unless($taxon) {
    #taxon not in compara, do not store the member and warn
    warning("Taxon id " . $species->ncbi_taxid . " from $source " . $bioseq->accession_number ." not in the database.
Member not stored.");
    return 1;
  }
  unless ($self->{'allowed_taxon_ids'}{$taxon->dbID}) {
    return 1;
  }

  my $member = new Bio::EnsEMBL::Compara::Member;
  if (defined $self->{'accession_number'} && $self->{'accession_number'} == 1) {
    $member->stable_id($bioseq->accession_number);
  } else {
    $member->stable_id($bioseq->display_id);
  }
  $member->display_label($bioseq->display_id);
  $member->taxon_id($taxon->dbID);
  $member->description($bioseq->desc);
  $member->source_name($source);
  $member->sequence($bioseq->seq);
  $member->genome_db_id($self->{'genome_db_id'}) if($self->{'genome_db_id'});

  eval {
    $self->{'comparaDBA'}->get_MemberAdaptor->store($member);
    print(" --stored") if($self->debug);
  };

  $self->{'subset'}->add_member($member);
  print("\n") if($self->debug);
  return 1;
}


1;

}

sub write_output {

  
  my $self = shift;

  my $outputHash = {};
  $outputHash = eval($self->input_id) if(defined($self->input_id));
  $outputHash->{'ss'} = $self->{'subset'}->dbID;
  $outputHash->{'gdb'} = $self->{'genome_db_id'} if($self->{'genome_db_id'});
  my $output_id = $self->encode_hash($outputHash);
  
  $self->input_job->input_id($output_id);

  if($self->{'genome_db_id'}) {
    $self->dataflow_output_id($output_id, 2);
  }
  return 1;
}


######################################
#
# subroutines
#
#####################################

}

fetch_all_uniprot_ids_for_taxid	No description	Code
fetch_input	Description	Code
get_metazoa_uniprot_ids	No description	Code
loadMembersFromUniprotIdList	No description	Code
pfetch_and_store_by_ids	No description	Code
run	No description	Code
store_bioseq	No description	Code
write_output	No description	Code