Raw content of Bio::EnsEMBL::Pipeline::SeqFetcher::Finished_Pfetch package Bio::EnsEMBL::Pipeline::SeqFetcher::Finished_Pfetch; use strict; use Bio::Root::Root; use Bio::DB::RandomAccessI; use Bio::Seq; use IO::Socket; use Bio::EnsEMBL::Pipeline::Tools::Embl; use vars qw(@ISA); @ISA = qw(Bio::Root::Root Bio::DB::RandomAccessI); sub new { my ( $class, @args ) = @_; my $self = bless {}, $class; my ( $server, $port, $options, $archive_port ) = $self->_rearrange( [ 'PFETCH_SERVER', 'PFETCH_PORT', 'OPTIONS' ], @args ); $self->server($server || 'cbi3.internal.sanger.ac.uk'); $self->port($port || 22400); $self->options($options); return $self; } =head2 server Title : server Usage : $self->server('address.of.server'); Function: Get/set for the path to the server being used by the module. Returns : string Args : string =cut sub server { my ( $self, $server ) = @_; if ($server) { $self->{'_server'} = $server; } return $self->{'_server'}; } =head2 port Title : port Usage : $self->port('port'); Function: Get/set for the port to the pfetch server. Returns : string Args : string =cut sub port { my ( $self, $port ) = @_; if ($port) { $self->{'_port'} = $port; } return $self->{'_port'}; } sub archive_port { my( $self, $archive_port ) = @_; if ($archive_port) { $self->{'_archive_port'} = $archive_port; } return $self->{'_archive_port'}; } =head2 options Title : options Usage : $self->options('tc'); Function: Get/set for options to pfetch Returns : string Args : string =cut sub options { my ( $self, $options ) = @_; if ($options) { $self->{'_options'} = $options; } return $self->{'_options'}; } sub get_server { my ($self) = @_; my $host = $self->server; my $port = $self->port; my $server = IO::Socket::INET->new( PeerAddr => $host, PeerPort => $port, #Proto => 'tcp', Proto => 6, Type => SOCK_STREAM, Timeout => 10, ); if ($server) { $server->autoflush(1); return $server; } else { $self->throw("Can't connect to '$host:$port' : $@"); } } =head2 get_Seq_by_acc Title : get_Seq_by_acc Usage : $self->get_eq_by_acc($accession); Function: Does the sequence retrieval via pfetch Returns : Bio::Seq Args : =cut sub get_Seq_by_acc { my ( $self, @id_list ) = @_; #confess "No names provided" unless @id_list; unless (@id_list) { $self->throw("No accession input"); } my $server = $self->get_server(); print $server "-q @id_list\n"; my (@seq_list); for ( my $i = 0 ; $i < @id_list ; $i++ ) { chomp( my $seq_string = <$server> ); eval { if (defined $seq_string && $seq_string ne 'no match') { my $seq = new Bio::Seq( '-seq' => $seq_string, '-accession_number' => $id_list[$i], '-display_id' => $id_list[$i] ); $self->throw("Could not pfetch sequence for $id_list[[$i]]\n") unless defined $seq; $seq_list[$i] = $seq; } }; if ($@) { print STDERR "$@\n"; } } if (wantarray) { # an array was passed - return array of Bio:Seq objects return @seq_list; } else { # one acc was passed then return the first(and only) element of the array return $seq_list[0]; } } =head2 get_acc_by_id Title : get_acc_by_id Usage : $self->get_acc_by_id($secondary_id); Function: Retrieve the sequence accession using the id Returns : string Args : string =cut sub get_acc_by_id { my ( $self, $id ) = @_; #confess "No id provided" unless $id; unless ($id) { $self->throw("No secondary id input"); } my $server = $self->get_server(); print $server "$id\n"; chomp( my $seq_string = <$server> ); my $sec_id; if (defined $seq_string && $seq_string ne 'no match') { ($sec_id) = $seq_string =~ />\S+\s+(\S+)\s+/; } return $sec_id ? $sec_id:undef; } sub write_descriptions { my( $self, $dbobj, $id_list, $chunk_size ) = @_; $chunk_size ||= 200; my ($descriptions,$failed) = $self->fetch_descriptions($id_list, $chunk_size); my $sth = $self->prepare_hit_desc_sth($dbobj); for my $accession (keys %$descriptions) { my @desc = map { $descriptions->{$accession}{$_}; } ('description','length','taxonid','database'); eval{ $sth->execute( $accession, @desc ); }; if ($@) { print STDERR "Unable to fetch description for $accession [$@]\n"; } } return $failed; } sub min { my ($a,$b) = @_; return ($a<$b) ? $a : $b; } sub fetch_descriptions { my( $self, $id_list, $chunk_size ) = @_; my $descriptions = {}; # results are stored here # split protein isoform ids and normal ids my $ids_iso = []; my $ids_no_iso = []; my %single_ids; foreach (@$id_list) { my $id = $_; if(/\-\d+/) { push @$ids_iso, $id ; $id =~ s/\-\d+//g; } $single_ids{$id} = 1; } my @ids = keys %single_ids; $ids_no_iso = \@ids; # first pass fails when the sequence version has changed: my $failed_first_pass = []; for (my $i = 0; $i < @$ids_no_iso; $i += $chunk_size) { my $j = $i + $chunk_size - 1; # Set second index to last element if we're off the end of the array $j = $#$ids_no_iso if $#$ids_no_iso < $j; # Take a slice from the array my $chunk = [@$ids_no_iso[$i..$j]]; push @$failed_first_pass, @{ $self->fetch_descriptions_by_accession($chunk, $descriptions) }; } my $failed_second_pass = []; for (my $i = 0; $i < @$failed_first_pass; $i += $chunk_size) { my $j = $i + $chunk_size - 1; $j = $#$failed_first_pass if $#$failed_first_pass < $j; my $chunk = [@$failed_first_pass[$i..$j]]; my ($succeeded_arch_len, $failed_arch_len) = $self->fetch_lengths_from_archive($chunk, $descriptions); my $failed_desc = $self->fetch_descriptions_by_accession($succeeded_arch_len, $descriptions, 1); push @$failed_second_pass, @$failed_arch_len, @$failed_desc; } # get isoform ids descriptions my $failed_isoforms = []; for (my $i = 0; $i < @$ids_iso; $i += $chunk_size) { my $j = $i + $chunk_size - 1; # Set second index to last element if we're off the end of the array $j = $#$ids_iso if $#$ids_iso < $j; # Take a slice from the array my $chunk = [@$ids_iso[$i..$j]]; push @$failed_isoforms, @{ $self->fetch_isoform_descriptions($chunk, $descriptions) }; } push @$failed_second_pass, @$failed_isoforms; return ($descriptions, $failed_second_pass); } sub fetch_isoform_descriptions { my( $self, $id_list, $descriptions ) = @_; my $server = $self->get_server; # fetch isoform length print $server join(' ', '-l', @$id_list), "\n"; my $length_succeeded = []; my $length_failed = []; my $i = 0; while (<$server>) { chomp; my $full_name = $id_list->[$i]; if($_ ne 'no match') { $descriptions->{$full_name}{length} = $_; push @$length_succeeded, $full_name; } else { warn "${full_name}'s length is not in pfetch server"; push @$length_failed, $full_name; } $i++; } # fetch isoform description $server = $self->get_server; print $server join(' ', '-D', @$length_succeeded), "\n"; my $desc_succeeded = []; my $desc_failed = []; $i = 0; while (<$server>) { chomp; my $full_name = $id_list->[$i]; s/$full_name//g; if($_ ne 'no match') { $descriptions->{$full_name}{description} = $_; push @$desc_succeeded, $full_name; } else { warn "${full_name}'s description is not in pfetch server"; push @$desc_failed, $full_name; } $i++; } # copy isoform taxonid and database foreach my $iso (@$desc_succeeded) { my $id = $iso; $id =~ s/\-\d+//g; $descriptions->{$iso}{taxonid} = $descriptions->{$id}{taxonid}; $descriptions->{$iso}{database} = $descriptions->{$id}{database}; if(!$descriptions->{$iso}{database} || !$descriptions->{$iso}{taxonid}){ delete $descriptions->{$iso}; push @$desc_failed,$iso; } } push @$length_failed,@$desc_failed; return $length_failed; } sub fetch_descriptions_by_accession { my( $self, $id_list, $descriptions, $trim_sv_flag ) = @_; my $srv = $self->get_server; warn "Fetching full entries for ", scalar(@$id_list), " identifiers\n"; if ($trim_sv_flag) { my @req = @$id_list; foreach (@req) { s/\.\d+$//; } print $srv join(' ', '-F', @req), "\n"; } else { print $srv join(' ', '-F', @$id_list), "\n"; } # Set the input record separator to split on EMBL # entries, which end with "//\n" on a line. local ${/} = "//\n"; # ${/} is the same as $/, but doesn't mess up # syntax highlighting in my editor! my $embl_parser = Bio::EnsEMBL::Pipeline::Tools::Embl->new; my $failed = []; for (my $i = 0; $i < @$id_list; $i++) { my $entry = <$srv>; # Each entry may have one or more "no match" lines at the start while ($entry =~ s/^no match\n//m) { push(@$failed, $id_list->[$i]); delete $descriptions->{$id_list->[$i]}; $i++; } # The end of the loop last unless $entry; my $full_name = $id_list->[$i] or die "No id at '$i' in list:\n@$id_list"; $embl_parser->parse($entry); my $name_without_version = $full_name; $name_without_version =~ s/\.\d+$//; my $found = 0; NAMES: for my $one_of_names (@{ $embl_parser->accession }) { if ($name_without_version eq $one_of_names) { $found = 1; # warn "Found '$name_without_version'"; # NB: do not redefine any of these if already defined $descriptions->{$full_name}{description} ||= $embl_parser->description; $descriptions->{$full_name}{length} ||= $embl_parser->seq_length; $descriptions->{$full_name}{taxonid} ||= $embl_parser->taxon; $descriptions->{$full_name}{database} ||= $embl_parser->which_database; last NAMES; } } unless ($found) { my $all_accs = $embl_parser->accession; if (my $sv = $embl_parser->sequence_version) { unshift @$all_accs, $sv; } warn "Expecting '$full_name' but got (@$all_accs)\n"; push @$failed, $full_name; } } return $failed; } sub fetch_lengths_from_archive { my( $self, $id_list, $descriptions ) = @_; my $server = $self->get_server; print $server join(" ", '-l', @$id_list), "\n"; my $succeeded = []; my $failed = []; my $i = 0; while (<$server>) { chomp; my $full_name = $id_list->[$i]; if($_ ne 'no match') { $descriptions->{$full_name}{length} = $_; push @$succeeded, $full_name; } else { warn "${full_name}'s length is not in the archive"; push @$failed, $full_name; } $i++; } return ($succeeded,$failed); } sub prepare_hit_desc_sth { my( $self, $dbobj ) = @_; my $sth = $dbobj->prepare(qq{ REPLACE INTO hit_description (hit_name , hit_description , hit_length , hit_taxon , hit_db) VALUES (?,TRIM(?),?,?,?) }); return $self->{'_hit_desc_sth'} = $sth; # James, do we really need to save it? } sub get_hit_desc_sth { my( $self ) = @_; reuturn $self->{'_hit_desc_sth'}; # ------,,----- and restore it, if it's only used in 1 place? } 1; __END__