Bio::EnsEMBL::Pipeline::SeqFetcher Mfetch
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
  Bio::EnsEMBL::Pipeline::SeqFetcher::Mfetch
Package variables
No package variables defined.
Included modules
Bio::DB::RandomAccessI
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::Seq
Inherit
Bio::DB::RandomAccessI
Synopsis
    my $obj = Bio::EnsEMBL::Pipeline::SeqFetcher::Mfetch->new(
-executable => $exe
);
my $seq = $obj->get_Seq_by_acc($acc);
Description
  Object to retrieve sequences as Bio::Seq, using mfetch.
Methods
_make_mfetch_prefix_command
No description
Code
executableDescriptionCode
get_Entry_Fields
No description
Code
get_Entry_Fields_BatchFetchDescriptionCode
get_Seq_by_accDescriptionCode
new
No description
Code
optionsDescriptionCode
verbose
No description
Code
Methods description
executablecode    nextTop
  Title   : executable
Usage : $self->executable('/path/to/executable');
Function: Get/set for the path to the executable being used by the module. If not set, the executable is looked for in $PATH.
Returns : string
Args : string
get_Entry_Fields_BatchFetch codeprevnextTop
  Title   : batch_fetch
Usage : $self->batch_retrieval(@accession_list);
Function: Retrieves multiple sequences via mfetch
Returns : reference to a list of Bio::Seq objects
Args : array of accession strings
get_Seq_by_acccodeprevnextTop
  Title   : get_Seq_by_acc
Usage : $self->get_eq_by_acc($accession);
Function: Does the sequence retrieval via mfetch
Returns : Bio::Seq
Args :
optionscodeprevnextTop
  Title   : options
Usage : $self->options('tc');
Function: Get/set for options to mfetch
Returns : string
Args : string
Methods code
_make_mfetch_prefix_commanddescriptionprevnextTop
sub _make_mfetch_prefix_command {
     my ($self, $f ) = @_ ; 
   my @fields = @$f ;  

   my $mfetch = $self->executable;  
   my $options = $self->options;
   if (defined($options)) { 
     unless ($options =~ /^-/ ) {
       $options = '-' . $options; 
     }
   }
  my $command = "$mfetch "; 
  if ( scalar(@fields) > 0 ) {   
    my $f = join (" ",@fields) ;  
    if ( $f=~m/acc/) {  
      $f =~s/acc//g; 
} $f = "acc " .$f; $command .= " -f\" $f\" " ; }else { $command .= " -v full" ; } if (defined $options){ $command .= "$options "; } print "PREFIX COMMAND $command\n" ; return $command ; } 1;
}
executabledescriptionprevnextTop
sub executable {
  my ($self, $exe) = @_;
  if ($exe)
    {
      $self->{'_exe'} = $exe;
    }
  return $self->{'_exe'};
}
get_Entry_FieldsdescriptionprevnextTop
sub get_Entry_Fields {
  my ($self, $acc_to_fetch,$fields) = @_; 

   print "Fields to get : " . join ( " " , @$fields )."\n" if $self->{verbose}; 

  if ( ref($acc_to_fetch)=~m/ARRAY/ ) {   
    print "BatchFetch fields to get : " . join ( " " , @$fields )."\n" if $self->{verbose}; 
    return $self->get_Entry_Fields_BatchFetch($acc_to_fetch,$fields); 
  } 
 
  if (!defined($acc_to_fetch)) {
    throw("No accession input");
  }  

  print " try to fetch $acc_to_fetch\n" ; 

  my $command ;  
  my %all_entries; 
  my @entries_not_found;   

  my $cmd_prefix = $self->_make_mfetch_prefix_command($fields);  
  $command = $cmd_prefix ." " .  $acc_to_fetch; 

  # extract acc if you do a wildcard like mfetch -i  "AJFLD%"  
my $acc_format = $acc_to_fetch; $acc_format=~s/acc://g;
$acc_format=~s/\%//g;
$acc_format=~s/\"//g;
print "cmd: $command\n"; open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc_to_fetch]: $command "); my @lines = <IN> ; close IN or throw("Error running mfetch for accession [$acc_format]: $command "); my %entry; for my $line ( @lines ) { chomp($line) ; print "LINE $line\n" ; # we're just parsing one entry so if we get a no_match we just return ....
if ( $line =~m/no match/ ) { print "no entry found for $acc_to_fetch with $command\n"; push @entries_not_found, $acc_to_fetch; return [\%all_entries ,\@ entries_not_found ] ; } my @l = split /\s+/, $line; my $key_field = shift @l ; # result can contain more than one line begining with same field identifier , ie
# AC Q4589;
# AC Q0999;
# not sure how this works if we only get AC's .... but why would we do this anyway ?
if ( $key_field =~/AC/) { if ( scalar(@l) > 1 ) { warning ("more than one AC number returned : " . join(" ", @l) . " - we ignore the others " . scalar(@l) . " objects\n") ; } } $entry{$key_field}.= join(" ", @l); #print "populating $key_field ... ".join(" ", @l) . "\n";
} # aim is to return $all_entries{Q4981}{AC} -> string
# aim is to return $all_entries{Q4981}{org} -> string
# aim is to return $all_entries{ ACC }{ FIELD } -> string
# populate all_entry-hash
for my $field ( keys %entry ) { $all_entries{$acc_format}{$field}=$entry{$field}; } # my $hit=0;
# for my $result ( @l ) {
#print "testing : $result <--> $tmp_acc .... \n" ;
# if ( $result =~m/$tmp_acc/ ) {
# $hit = 1 ;
# last;
# }
# }
for my $key ( keys %all_entries ) { print "KEY $key\n"; my %tmp = %{$all_entries{$key}} ; for ( keys %tmp ) { #if ( /AC/ ) {
# print "\t\t$_ --> " . join(" ", @{$tmp{$_}} ). "\n";
#}else {
print "\t\t$_ --> $tmp{$_}\n"; #}
} print "\n" ; } return [\%all_entries ,\@ entries_not_found ] ;
}
get_Entry_Fields_BatchFetchdescriptionprevnextTop
sub get_Entry_Fields_BatchFetch {
   my ($self, $acc,$fields) = @_;
  print "using batchFetch\n" ; 
  my @acc_to_fetch ; 
  if ( ref($acc) =~m/ARRAY/ ) {   
    # batch mode - more than 1 acc to fetch 
@acc_to_fetch = @$acc; }else { #have only single acc to fetch
push @acc_to_fetch,$acc; } my $seqstr; my $seq; my $mfetch = $self->executable; my $options = $self->options; if (defined($options)) { $options = '-' . $options unless $options =~ /^-/; } my $command = "$mfetch "; if ( scalar(@$fields) > 0 ) { my $f = join (" ",@$fields) ; if ( $f=~m/acc/) { $f =~s/acc//g;
} $f = "acc " .$f; $command .= " -f\" $f\" " ; }else { $command .= " -v full" ; } if (defined $options){ $command .= "$options "; } # we will hand over the acc to fetch as string.
# if in batch mode do not submit more than 50 entries / strings to mfetch at one time.
my @fetch_strings ; if ( scalar(@acc_to_fetch) > 500 ) { # more than 500 acc to fetch - make strings of 500 acc which is faster.
while ( @acc_to_fetch ) { my @tmp = splice (@acc_to_fetch, 0, 300 ) ; push @fetch_strings, join ( " ", @tmp) ; #print $fetch_strings[-1]."\n" ;
} } else { push @fetch_strings, join ( " ", @acc_to_fetch ) ; } my $cmd_prefix = $command ; my %all_entries; my @entries_not_found; my $last_acc; my %last_entry ; for my $acc_string ( @fetch_strings ) { $command = $cmd_prefix ." " . $acc_string; my @nr_acc = split /\s+/, $acc_string ; my @tmp; for my $acc_format ( @nr_acc) { $acc_format=~s/acc://g;
$acc_format=~s/\%//g;
$acc_format=~s/\"//g;
push @tmp, $acc_format ; } print "cmd: $command\n" if $self->{verbose}; open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc_string]: $mfetch"); @nr_acc = @tmp ; my $acc; my %entry_hash ; my $accPointer = -1 ; my $last_field =""; LINE: while (my $line=<IN>){ chomp($line) ; #print "line $line\n";
#print "pointer : $nr_acc[$accPointer] $accPointer\n" ;
if ( $line =~m/no match/ ) { #print "no entry found for $nr_acc[$accPointer] \n" ;
$accPointer++; #print "no -match - pointer val is : $accPointer\n" ;
#print "adding entry_not_found $nr_acc[$accPointer]\n";
push @entries_not_found, $nr_acc[$accPointer] ; next LINE; } my @l = split /\s+/, $line; my $field = shift @l ; #print "last_field : $last_field VS $field\n" ;
# result can contain more than one line begining with same field identifier , ie
# AC Q4589;
# AC Q0999;
# not sure how this works if we only get AC's .... but why would we do this anyway ?
#
if ($field =~m/AC/ ) { if ( $field eq $last_field ) { print "we got more than one line starting with AC ... wthis means it's not a new entry.\n " if $self->{verbose}; # as we don't store/process the multipe acc returned we just ignore this line. this will caus
# trouble if we only have acc's returned.
$last_field = $field ; next LINE; } else { print "NEW ENTRY : " . join ( " " , @l ) if $self->{verbose}; $acc = $l[0]; $acc=~s/\;//;
$accPointer++ unless ( length(@nr_acc) == 1 ) ; #if ( scalar(@l) > 1 ) {
# warning ("more than one AC number returned : " . join(" ", @l) . " - we ignore the others " . scalar(@l) . " objects\n") ;
#}
# this moves all already read information into the big hash before we
# read the new entry .
#
if ( $entry_hash{AC} ) { # entry is defined so we already read all AC information and we can add safely.
# problem if there's only one entry as it does not go here . .... is the last entry always lost ?
my @acc_for_entry = @{$entry_hash{AC}} ; for my $acc(@acc_for_entry) { for my $f ( keys %entry_hash ) { $all_entries{$acc}{$f}= $entry_hash{$f}; #print "adding $acc $f $entry_hash{$f};\n";
} } } %last_entry = %entry_hash ; undef %entry_hash; # we now got a clean new entry_hash for present entry
# $entry_hash{AC} = \@l;
#print "pointer val is : $accPointer\n" ;
my $tmp_acc = $nr_acc[$accPointer] ; #print "test : $acc versus $tmp_acc \n" ;
$last_acc = $tmp_acc ; my $hit=0; for my $result ( @l ) { #print "testing : $result <--> $tmp_acc .... \n" ;
if ( $result =~m/$tmp_acc/ ) { $hit = 1 ; last; } } unless ($tmp_acc =~m/$acc/ ) { if ( $hit == 0 ) { warning " acc does not match - somethings wrong with the pointers to acc. $tmp_acc vs $acc\n" ; } } #print "entries match - $tmp_acc $acc " ;
# print "other_entries: " . join(" " , @l) . "\n";
#$entry_hash{AC} = [$acc];
# print "populating $tmp_acc to entry_hash - NOT $acc\n";
$entry_hash{AC} = [$tmp_acc]; $last_field = $field ; next LINE ; } } #print "populating $field ... ".join(" ", @l) . "\n";
$entry_hash{$field}.= join (" ", @l); $last_field = $field ; } %last_entry = %entry_hash ; close IN or throw("Error running mfetch for accession [$acc_string]: $mfetch"); } # add last entry to all_entries .
unless ( $all_entries{$last_acc} ) { # add content of last hash last_ENTRY HERE - also populate last_entry as well beofre .
#print "need to add last_entry now \n" ;
if ( $last_entry{AC} ) { my @acc_for_entry = @{$last_entry{AC}} ; for my $acc(@acc_for_entry) { for my $f ( keys %last_entry ) { $all_entries{$acc}{$f}= $last_entry{$f}; #print "adding $acc $f $last_entry{$f};\n";
} } } } else { print "NO_ACC defined for last entry - skipping\n " if $self->{verbose} ; ; } if ( $self->{verbose} ) { for my $key ( keys %all_entries ) { print "KEY $key\n"; my %tmp = %{$all_entries{$key}} ; for ( keys %tmp ) { if ( /AC/ ) { print "\t\t$_ --> " . join(" ", @{$tmp{$_}} ). "\n"; }else { print "\t\t$_ --> $tmp{$_}\n"; } } print "\n" ; } } for ( @entries_not_found ) { print "no entry found for : $_\n" if $self->{verbose}; } return [\%all_entries ,\@ entries_not_found ] ;
}
get_Seq_by_accdescriptionprevnextTop
sub get_Seq_by_acc {
  my ($self, $acc) = @_;
  
  if (!defined($acc)) {
    throw("No accession input");
  }  
 
  if ( defined ( $self->{_seq_cache}{$acc})) {  
     return $self->{_seq_cache}{$acc};
  } 
  # seqence not cached, needs to be fetched 
my $seqstr; my $seq; my $mfetch = $self->executable; # the option for fetching a sequence only by mfetch is mfetch -v fasta
my $options = $self->options; if (defined($options)) { $options = '-' . $options unless $options =~ /^-/; } my $command = "$mfetch -v fasta "; if (defined $options){ $command .= "$options "; } $command .= $acc; print STDERR "$command\n" if $self->{verbose}; open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc]: $mfetch"); while (my $line=<IN>){ chomp($line) ; next if $line=~m/>/; $seqstr.=$line; } close IN or throw("Error running mfetch for accession [$acc]: $mfetch"); chomp($seqstr); eval{ if(defined $seqstr && $seqstr ne "no match") { $seq = new Bio::Seq('-seq' => $seqstr, '-accession_number' => $acc, '-display_id' => $acc); } }; if($@){ print STDERR "$@\n"; } throw("Could not mfetch sequence for [$acc]\n") unless defined $seq; $self->{_seq_cache}{$acc}=$seq; return $seq;
}
newdescriptionprevnextTop
sub new {
  my ($class, @args) = @_;
  my $self = bless {}, $class;

  my ($exe, $options) = rearrange(['EXECUTABLE', 'OPTIONS'], @args);
  
  if (!defined $exe) {
    $exe = 'mfetch';
  }
  $self->executable($exe);

  if (defined $options) {
    $self->options($options);
  } 
  # caching of sequences 
$self->{_seq_cache}={}; $self->{verbose} = 0 ; return $self; # success - we hope!
}
optionsdescriptionprevnextTop
sub options {
  my ($self, $options) = @_;
  if ($options)
    {
      $self->{'_options'} = $options;
    }
  return $self->{'_options'};
}
verbosedescriptionprevnextTop
sub verbose {
    my ($self, $arg ) = @_ ; 
   if ( $arg ) { 
      $self->{verbose}= $arg ;  
   }
}
General documentation
CONTACTTop
Describe contact details here
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
Method Bio::EnsEMBL::Root::_rearrange is deprecated.
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
rearrange(order, list); #instead
get_Entry_fieldsTop
  Title   : get_Entry_Fields
Usage : $self->get_Entry_Fields(\@accessions,\@fields);
: $self->get_Entry_Fields("Q5RFX5", \@fields);
Arg [0] : $self
arg [1] : ACC as string ( Q5RFX5 ) OR arrayref to array of acc
arg [2] : arrayref to fields which you want to receive
\@field = qw( pe taxon acc ) ;
Function: Does the retrieval of different files like Taxonomy_id or PE level via mfetch,
either for one acc or in batch mode.
Returns : arrayref to array of hashes for each acc.?
Args :