Bio::EnsEMBL::Pipeline::SeqFetcher Finished_Pfetch
Included librariesPackage variablesGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::DB::RandomAccessI
Bio::EnsEMBL::Pipeline::Tools::Embl
Bio::Root::Root
Bio::Seq
IO::Socket
Inherit
Bio::DB::RandomAccessI Bio::Root::Root
Synopsis
No synopsis!
Description
No description!
Methods
archive_port
No description
Code
fetch_descriptions
No description
Code
fetch_descriptions_by_accession
No description
Code
fetch_isoform_descriptions
No description
Code
fetch_lengths_from_archive
No description
Code
get_Seq_by_accDescriptionCode
get_acc_by_idDescriptionCode
get_hit_desc_sth
No description
Code
get_server
No description
Code
min
No description
Code
new
No description
Code
optionsDescriptionCode
portDescriptionCode
prepare_hit_desc_sth
No description
Code
serverDescriptionCode
write_descriptions
No description
Code
Methods description
get_Seq_by_acccode    nextTop
  Title   : get_Seq_by_acc
Usage : $self->get_eq_by_acc($accession);
Function: Does the sequence retrieval via pfetch
Returns : Bio::Seq
Args :
get_acc_by_idcodeprevnextTop
  Title   : get_acc_by_id
Usage : $self->get_acc_by_id($secondary_id);
Function: Retrieve the sequence accession using the id
Returns : string
Args : string
optionscodeprevnextTop
  Title   : options
Usage : $self->options('tc');
Function: Get/set for options to pfetch
Returns : string
Args : string
portcodeprevnextTop
  Title   : port
Usage : $self->port('port');
Function: Get/set for the port to the pfetch server.
Returns : string
Args : string
servercodeprevnextTop
  Title   : server
Usage : $self->server('address.of.server');
Function: Get/set for the path to the server being used by the module.
Returns : string
Args : string
Methods code
archive_portdescriptionprevnextTop
sub archive_port {
    my( $self, $archive_port ) = @_;

    if ($archive_port) {
        $self->{'_archive_port'} = $archive_port;
    }
    return $self->{'_archive_port'};
}
fetch_descriptionsdescriptionprevnextTop
sub fetch_descriptions {
    my( $self, $id_list, $chunk_size ) = @_;

	my $descriptions = {};	# results are stored here
# split protein isoform ids and normal ids
my $ids_iso = []; my $ids_no_iso = []; my %single_ids; foreach (@$id_list) { my $id = $_; if(/\-\d+/) { push @$ids_iso, $id ; $id =~ s/\-\d+//g; } $single_ids{$id} = 1; } my @ids = keys %single_ids; $ids_no_iso =\@ ids; # first pass fails when the sequence version has changed:
my $failed_first_pass = []; for (my $i = 0; $i < @$ids_no_iso; $i += $chunk_size) { my $j = $i + $chunk_size - 1; # Set second index to last element if we're off the end of the array
$j = $#$ids_no_iso if $#$ids_no_iso < $j; # Take a slice from the array
my $chunk = [@$ids_no_iso[$i..$j]]; push @$failed_first_pass, @{ $self->fetch_descriptions_by_accession($chunk, $descriptions) }; } my $failed_second_pass = []; for (my $i = 0; $i < @$failed_first_pass; $i += $chunk_size) { my $j = $i + $chunk_size - 1; $j = $#$failed_first_pass if $#$failed_first_pass < $j; my $chunk = [@$failed_first_pass[$i..$j]]; my ($succeeded_arch_len, $failed_arch_len) = $self->fetch_lengths_from_archive($chunk, $descriptions); my $failed_desc = $self->fetch_descriptions_by_accession($succeeded_arch_len, $descriptions, 1); push @$failed_second_pass, @$failed_arch_len, @$failed_desc; } # get isoform ids descriptions
my $failed_isoforms = []; for (my $i = 0; $i < @$ids_iso; $i += $chunk_size) { my $j = $i + $chunk_size - 1; # Set second index to last element if we're off the end of the array
$j = $#$ids_iso if $#$ids_iso < $j; # Take a slice from the array
my $chunk = [@$ids_iso[$i..$j]]; push @$failed_isoforms, @{ $self->fetch_isoform_descriptions($chunk, $descriptions) }; } push @$failed_second_pass, @$failed_isoforms; return ($descriptions, $failed_second_pass);
}
fetch_descriptions_by_accessiondescriptionprevnextTop
sub fetch_descriptions_by_accession {
    my( $self, $id_list, $descriptions, $trim_sv_flag ) = @_;

    my $srv = $self->get_server;
    warn "Fetching full entries for ", scalar(@$id_list), " identifiers\n";
    if ($trim_sv_flag) {
        my @req = @$id_list;
        foreach (@req) {
            s/\.\d+$//;
        }
        print $srv join(' ', '-F', @req), "\n";
    } else {
        print $srv join(' ', '-F', @$id_list), "\n";
    }

    # Set the input record separator to split on EMBL
# entries, which end with "//\n" on a line.
local ${/} = "//\n"; # ${/} is the same as $/, but doesn't mess up
# syntax highlighting in my editor!

my $embl_parser = Bio::EnsEMBL::Pipeline::Tools::Embl->new;

my $failed = [];
for (my $i = 0; $i < @$id_list; $i++) {
my $entry = <$srv>;

# Each entry may have one or more "no match" lines at the start
while ($entry =~ s/^no match\n//m) {
push(@$failed, $id_list->[$i]);
delete $descriptions->{$id_list->[$i]};
$i++;
}

# The end of the loop
last unless $entry;

my $full_name = $id_list->[$i] or die "No id at '
$i' in list:\n@$id_list";
$embl_parser->parse($entry);

my $name_without_version = $full_name;
$name_without_version =~ s/\.\d+$//;


my $found = 0;
NAMES: for my $one_of_names (@{ $embl_parser->accession }) {
if ($name_without_version eq $one_of_names) {
$found = 1;
# warn "Found '
$name_without_version'";

# NB: do not redefine any of these if already defined
$descriptions->{$full_name}{description} ||= $embl_parser->description;
$descriptions->{$full_name}{length} ||= $embl_parser->seq_length;
$descriptions->{$full_name}{taxonid} ||= $embl_parser->taxon;
$descriptions->{$full_name}{database} ||= $embl_parser->which_database;

last NAMES;
}
}
unless ($found) {
my $all_accs = $embl_parser->accession;
if (my $sv = $embl_parser->sequence_version) {
unshift @$all_accs, $sv;
}
warn "Expecting '
$full_name' but got (@$all_accs)\n";
push @$failed, $full_name;
}
}
return $failed;
}
fetch_isoform_descriptionsdescriptionprevnextTop
sub fetch_isoform_descriptions {
	my( $self, $id_list, $descriptions ) = @_;
	my $server = $self->get_server;

	# fetch isoform length
print $server join(' ', '-l', @$id_list), "\n"; my $length_succeeded = []; my $length_failed = []; my $i = 0; while (<$server>) { chomp; my $full_name = $id_list->[$i]; if($_ ne 'no match') { $descriptions->{$full_name}{length} = $_; push @$length_succeeded, $full_name; } else { warn "${full_name}'s length is not in pfetch server"; push @$length_failed, $full_name; } $i++; } # fetch isoform description
$server = $self->get_server; print $server join(' ', '-D', @$length_succeeded), "\n"; my $desc_succeeded = []; my $desc_failed = []; $i = 0; while (<$server>) { chomp; my $full_name = $id_list->[$i]; s/$full_name//g; if($_ ne 'no match') { $descriptions->{$full_name}{description} = $_; push @$desc_succeeded, $full_name; } else { warn "${full_name}'s description is not in pfetch server"; push @$desc_failed, $full_name; } $i++; } # copy isoform taxonid and database
foreach my $iso (@$desc_succeeded) { my $id = $iso; $id =~ s/\-\d+//g; $descriptions->{$iso}{taxonid} = $descriptions->{$id}{taxonid}; $descriptions->{$iso}{database} = $descriptions->{$id}{database}; if(!$descriptions->{$iso}{database} || !$descriptions->{$iso}{taxonid}){ delete $descriptions->{$iso}; push @$desc_failed,$iso; } } push @$length_failed,@$desc_failed; return $length_failed;
}
fetch_lengths_from_archivedescriptionprevnextTop
sub fetch_lengths_from_archive {
    my( $self, $id_list, $descriptions ) = @_;

	my $server = $self->get_server;
	print $server join(" ", '-l', @$id_list), "\n";

	my $succeeded = [];
	my $failed    = [];
	my $i = 0;
	while (<$server>) {
		chomp;
		my $full_name = $id_list->[$i];
		if($_ ne 'no match') {
			$descriptions->{$full_name}{length}	= $_;
			push @$succeeded, $full_name;
		} else {
			warn "${full_name}'s length is not in the archive";
			push @$failed, $full_name;
		}
		$i++;
	}
	return ($succeeded,$failed);
}
get_Seq_by_accdescriptionprevnextTop
sub get_Seq_by_acc {
    my ( $self, @id_list ) = @_;

    #confess "No names provided" unless @id_list;
unless (@id_list) { $self->throw("No accession input"); } my $server = $self->get_server(); print $server "-q @id_list\n"; my (@seq_list); for ( my $i = 0 ; $i < @id_list ; $i++ ) { chomp( my $seq_string = <$server> ); eval { if (defined $seq_string && $seq_string ne 'no match') { my $seq = new Bio::Seq( '-seq' => $seq_string, '-accession_number' => $id_list[$i], '-display_id' => $id_list[$i] ); $self->throw("Could not pfetch sequence for $id_list[[$i]]\n") unless defined $seq; $seq_list[$i] = $seq; } }; if ($@) { print STDERR "$@\n"; } } if (wantarray) { # an array was passed - return array of Bio:Seq objects
return @seq_list; } else { # one acc was passed then return the first(and only) element of the array
return $seq_list[0]; }
}
get_acc_by_iddescriptionprevnextTop
sub get_acc_by_id {
	my ( $self, $id ) = @_;

    #confess "No id provided" unless $id;
unless ($id) { $self->throw("No secondary id input"); } my $server = $self->get_server(); print $server "$id\n"; chomp( my $seq_string = <$server> ); my $sec_id; if (defined $seq_string && $seq_string ne 'no match') { ($sec_id) = $seq_string =~ />\S+\s+(\S+)\s+/; } return $sec_id ? $sec_id:undef;
}
get_hit_desc_sthdescriptionprevnextTop
sub get_hit_desc_sth {
    my( $self ) = @_;

    reuturn $self->{'_hit_desc_sth'};	# ------,,----- and restore it, if it's only used in 1 place?
} 1; __END__
}
get_serverdescriptionprevnextTop
sub get_server {
    my ($self) = @_;

    my $host = $self->server;
    my $port = $self->port;

    my $server = IO::Socket::INET->new(
        PeerAddr => $host,
        PeerPort => $port,
        #Proto    => 'tcp',
Proto => 6, Type => SOCK_STREAM, Timeout => 10, ); if ($server) { $server->autoflush(1); return $server; } else { $self->throw("Can't connect to '$host:$port' : $@"); }
}
mindescriptionprevnextTop
sub min {
	my ($a,$b) = @_;
	return ($a<$b) ? $a : $b;
}
newdescriptionprevnextTop
sub new {
    my ( $class, @args ) = @_;
    my $self = bless {}, $class;

    my ( $server, $port, $options, $archive_port ) = $self->_rearrange(
        [ 'PFETCH_SERVER', 'PFETCH_PORT', 'OPTIONS' ], @args );

    $self->server($server || 'cbi3.internal.sanger.ac.uk');
    $self->port($port || 22400);
    $self->options($options);

    return $self;
}
optionsdescriptionprevnextTop
sub options {
    my ( $self, $options ) = @_;
    if ($options) {
        $self->{'_options'} = $options;
    }
    return $self->{'_options'};
}
portdescriptionprevnextTop
sub port {
    my ( $self, $port ) = @_;
    if ($port) {
        $self->{'_port'} = $port;
    }
    return $self->{'_port'};
}
prepare_hit_desc_sthdescriptionprevnextTop
sub prepare_hit_desc_sth {
    my( $self, $dbobj ) = @_;

    my $sth = $dbobj->prepare(qq{
        REPLACE INTO hit_description (hit_name
              , hit_description
              , hit_length
              , hit_taxon
              , hit_db)
        VALUES (?,TRIM(?),?,?,?)
        });
    return $self->{'_hit_desc_sth'} = $sth; # James, do we really need to save it?
}
serverdescriptionprevnextTop
sub server {
    my ( $self, $server ) = @_;
    if ($server) {
        $self->{'_server'} = $server;
    }
    return $self->{'_server'};
}
write_descriptionsdescriptionprevnextTop
sub write_descriptions {
    my( $self, $dbobj, $id_list, $chunk_size ) = @_;

    $chunk_size ||= 200;
	my ($descriptions,$failed) = $self->fetch_descriptions($id_list, $chunk_size);
    my $sth = $self->prepare_hit_desc_sth($dbobj);
	for my $accession (keys %$descriptions) {
		my @desc = map { $descriptions->{$accession}{$_}; } ('description','length','taxonid','database');
		eval{
			$sth->execute(
				$accession,
				@desc
			);
		};
		if ($@) {
            print STDERR "Unable to fetch description for $accession [$@]\n";
        }
	}
    return $failed;
}
General documentation
No general documentation available.