Bio::EnsEMBL::Pipeline::SeqFetcher
Mfetch
Toolbar
Summary
Bio::EnsEMBL::Pipeline::SeqFetcher::Mfetch
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $obj = Bio::EnsEMBL::Pipeline::SeqFetcher::Mfetch->new(
-executable => $exe
);
my $seq = $obj->get_Seq_by_acc($acc);
Description
Object to retrieve sequences as Bio::Seq, using mfetch.
Methods
Methods description
Title : executable Usage : $self->executable('/path/to/executable'); Function: Get/set for the path to the executable being used by the module. If not set, the executable is looked for in $PATH. Returns : string Args : string |
Title : batch_fetch Usage : $self->batch_retrieval(@accession_list); Function: Retrieves multiple sequences via mfetch Returns : reference to a list of Bio::Seq objects Args : array of accession strings |
Title : get_Seq_by_acc Usage : $self->get_eq_by_acc($accession); Function: Does the sequence retrieval via mfetch Returns : Bio::Seq Args : |
Title : options Usage : $self->options('tc'); Function: Get/set for options to mfetch Returns : string Args : string |
Methods code
_make_mfetch_prefix_command | description | prev | next | Top |
sub _make_mfetch_prefix_command
{ my ($self, $f ) = @_ ;
my @fields = @$f ;
my $mfetch = $self->executable;
my $options = $self->options;
if (defined($options)) {
unless ($options =~ /^-/ ) {
$options = '-' . $options;
}
}
my $command = "$mfetch ";
if ( scalar(@fields) > 0 ) {
my $f = join (" ",@fields) ;
if ( $f=~m/acc/) {
$f =~s/acc//g; }
$f = "acc " .$f;
$command .= " -f\" $f\" " ;
}else {
$command .= " -v full" ;
}
if (defined $options){
$command .= "$options ";
}
print "PREFIX COMMAND $command\n" ;
return $command ;
}
1; } |
sub executable
{ my ($self, $exe) = @_;
if ($exe)
{
$self->{'_exe'} = $exe;
}
return $self->{'_exe'}; } |
sub get_Entry_Fields
{ my ($self, $acc_to_fetch,$fields) = @_;
print "Fields to get : " . join ( " " , @$fields )."\n" if $self->{verbose};
if ( ref($acc_to_fetch)=~m/ARRAY/ ) {
print "BatchFetch fields to get : " . join ( " " , @$fields )."\n" if $self->{verbose};
return $self->get_Entry_Fields_BatchFetch($acc_to_fetch,$fields);
}
if (!defined($acc_to_fetch)) {
throw("No accession input");
}
print " try to fetch $acc_to_fetch\n" ;
my $command ;
my %all_entries;
my @entries_not_found;
my $cmd_prefix = $self->_make_mfetch_prefix_command($fields);
$command = $cmd_prefix ." " . $acc_to_fetch;
my $acc_format = $acc_to_fetch;
$acc_format=~s/acc://g; $acc_format=~s/\%//g; $acc_format=~s/\"//g;
print "cmd: $command\n";
open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc_to_fetch]: $command ");
my @lines = <IN> ;
close IN or throw("Error running mfetch for accession [$acc_format]: $command ");
my %entry;
for my $line ( @lines ) {
chomp($line) ;
print "LINE $line\n" ;
if ( $line =~m/no match/ ) {
print "no entry found for $acc_to_fetch with $command\n";
push @entries_not_found, $acc_to_fetch;
return [\%all_entries ,\@ entries_not_found ] ;
}
my @l = split /\s+/, $line;
my $key_field = shift @l ;
if ( $key_field =~/AC/) {
if ( scalar(@l) > 1 ) {
warning ("more than one AC number returned : " . join(" ", @l) . " - we ignore the others " . scalar(@l) . " objects\n") ;
}
}
$entry{$key_field}.= join(" ", @l);
}
for my $field ( keys %entry ) {
$all_entries{$acc_format}{$field}=$entry{$field};
}
for my $key ( keys %all_entries ) {
print "KEY $key\n";
my %tmp = %{$all_entries{$key}} ;
for ( keys %tmp ) {
print "\t\t$_ --> $tmp{$_}\n";
}
print "\n" ;
}
return [\%all_entries ,\@ entries_not_found ] ; } |
sub get_Entry_Fields_BatchFetch
{ my ($self, $acc,$fields) = @_;
print "using batchFetch\n" ;
my @acc_to_fetch ;
if ( ref($acc) =~m/ARRAY/ ) {
@acc_to_fetch = @$acc;
}else {
push @acc_to_fetch,$acc;
}
my $seqstr;
my $seq;
my $mfetch = $self->executable;
my $options = $self->options;
if (defined($options)) { $options = '-' . $options unless $options =~ /^-/; }
my $command = "$mfetch ";
if ( scalar(@$fields) > 0 ) {
my $f = join (" ",@$fields) ;
if ( $f=~m/acc/) {
$f =~s/acc//g; }
$f = "acc " .$f;
$command .= " -f\" $f\" " ;
}else {
$command .= " -v full" ;
}
if (defined $options){
$command .= "$options ";
}
my @fetch_strings ;
if ( scalar(@acc_to_fetch) > 500 ) {
while ( @acc_to_fetch ) {
my @tmp = splice (@acc_to_fetch, 0, 300 ) ;
push @fetch_strings, join ( " ", @tmp) ;
}
} else {
push @fetch_strings, join ( " ", @acc_to_fetch ) ;
}
my $cmd_prefix = $command ;
my %all_entries;
my @entries_not_found;
my $last_acc;
my %last_entry ;
for my $acc_string ( @fetch_strings ) {
$command = $cmd_prefix ." " . $acc_string;
my @nr_acc = split /\s+/, $acc_string ;
my @tmp;
for my $acc_format ( @nr_acc) {
$acc_format=~s/acc://g; $acc_format=~s/\%//g; $acc_format=~s/\"//g; push @tmp, $acc_format ;
}
print "cmd: $command\n" if $self->{verbose};
open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc_string]: $mfetch");
@nr_acc = @tmp ;
my $acc;
my %entry_hash ;
my $accPointer = -1 ;
my $last_field ="";
LINE: while (my $line=<IN>){
chomp($line) ;
if ( $line =~m/no match/ ) {
$accPointer++;
push @entries_not_found, $nr_acc[$accPointer] ;
next LINE;
}
my @l = split /\s+/, $line;
my $field = shift @l ;
if ($field =~m/AC/ ) {
if ( $field eq $last_field ) {
print "we got more than one line starting with AC ... wthis means it's not a new entry.\n " if $self->{verbose};
$last_field = $field ;
next LINE;
} else {
print "NEW ENTRY : " . join ( " " , @l ) if $self->{verbose};
$acc = $l[0];
$acc=~s/\;//;
$accPointer++ unless ( length(@nr_acc) == 1 ) ;
if ( $entry_hash{AC} ) {
my @acc_for_entry = @{$entry_hash{AC}} ;
for my $acc(@acc_for_entry) {
for my $f ( keys %entry_hash ) {
$all_entries{$acc}{$f}= $entry_hash{$f};
}
}
}
%last_entry = %entry_hash ;
undef %entry_hash;
my $tmp_acc = $nr_acc[$accPointer] ;
$last_acc = $tmp_acc ;
my $hit=0;
for my $result ( @l ) {
if ( $result =~m/$tmp_acc/ ) {
$hit = 1 ;
last;
}
}
unless ($tmp_acc =~m/$acc/ ) {
if ( $hit == 0 ) {
warning " acc does not match - somethings wrong with the pointers to acc. $tmp_acc vs $acc\n" ;
}
}
$entry_hash{AC} = [$tmp_acc];
$last_field = $field ;
next LINE ;
}
}
$entry_hash{$field}.= join (" ", @l);
$last_field = $field ;
}
%last_entry = %entry_hash ;
close IN or throw("Error running mfetch for accession [$acc_string]: $mfetch");
}
unless ( $all_entries{$last_acc} ) {
if ( $last_entry{AC} ) {
my @acc_for_entry = @{$last_entry{AC}} ;
for my $acc(@acc_for_entry) {
for my $f ( keys %last_entry ) {
$all_entries{$acc}{$f}= $last_entry{$f};
}
}
}
} else {
print "NO_ACC defined for last entry - skipping\n " if $self->{verbose} ; ;
}
if ( $self->{verbose} ) {
for my $key ( keys %all_entries ) {
print "KEY $key\n";
my %tmp = %{$all_entries{$key}} ;
for ( keys %tmp ) {
if ( /AC/ ) {
print "\t\t$_ --> " . join(" ", @{$tmp{$_}} ). "\n";
}else {
print "\t\t$_ --> $tmp{$_}\n";
}
}
print "\n" ;
}
}
for ( @entries_not_found ) {
print "no entry found for : $_\n" if $self->{verbose};
}
return [\%all_entries ,\@ entries_not_found ] ; } |
sub get_Seq_by_acc
{ my ($self, $acc) = @_;
if (!defined($acc)) {
throw("No accession input");
}
if ( defined ( $self->{_seq_cache}{$acc})) {
return $self->{_seq_cache}{$acc};
}
my $seqstr;
my $seq;
my $mfetch = $self->executable;
my $options = $self->options;
if (defined($options)) { $options = '-' . $options unless $options =~ /^-/; }
my $command = "$mfetch -v fasta ";
if (defined $options){
$command .= "$options ";
}
$command .= $acc;
print STDERR "$command\n" if $self->{verbose};
open(IN,"$command |") or throw("Error opening pipe to mfetch for accession [$acc]: $mfetch");
while (my $line=<IN>){
chomp($line) ;
next if $line=~m/>/;
$seqstr.=$line;
}
close IN or throw("Error running mfetch for accession [$acc]: $mfetch");
chomp($seqstr);
eval{
if(defined $seqstr && $seqstr ne "no match") {
$seq = new Bio::Seq('-seq' => $seqstr,
'-accession_number' => $acc,
'-display_id' => $acc);
}
};
if($@){
print STDERR "$@\n";
}
throw("Could not mfetch sequence for [$acc]\n") unless defined $seq;
$self->{_seq_cache}{$acc}=$seq;
return $seq; } |
sub new
{ my ($class, @args) = @_;
my $self = bless {}, $class;
my ($exe, $options) = rearrange(['EXECUTABLE', 'OPTIONS'], @args);
if (!defined $exe) {
$exe = 'mfetch';
}
$self->executable($exe);
if (defined $options) {
$self->options($options);
}
$self->{_seq_cache}={};
$self->{verbose} = 0 ;
return $self;
} |
sub options
{
my ($self, $options) = @_;
if ($options)
{
$self->{'_options'} = $options;
}
return $self->{'_options'}; } |
sub verbose
{ my ($self, $arg ) = @_ ;
if ( $arg ) {
$self->{verbose}= $arg ;
} } |
General documentation
Describe contact details here
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
Method Bio::EnsEMBL::Root::_rearrange is deprecated.
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
rearrange(order, list); #instead
Title : get_Entry_Fields
Usage : $self->get_Entry_Fields(\@accessions,\@fields);
: $self->get_Entry_Fields("Q5RFX5", \@fields);
Arg [0] : $self
arg [1] : ACC as string ( Q5RFX5 ) OR arrayref to array of acc
arg [2] : arrayref to fields which you want to receive
\@field = qw( pe taxon acc ) ;
Function: Does the retrieval of different files like Taxonomy_id or PE level via mfetch,
either for one acc or in batch mode.
Returns : arrayref to array of hashes for each acc.?
Args :