Raw content of Bio::EnsEMBL::Compara::Family package Bio::EnsEMBL::Compara::Family; use strict; use Bio::EnsEMBL::Compara::BaseRelation; use Bio::EnsEMBL::Utils::Argument; use Bio::EnsEMBL::Utils::Exception; use Bio::AlignIO; use Bio::SimpleAlign; use IO::File; our @ISA = qw(Bio::EnsEMBL::Compara::BaseRelation); =head2 new Arg [1] : Example : Description: Returntype : Bio::EnsEMBL::Compara::Family (but without members; caller has to fill using add_member) Exceptions : Caller : =cut sub new { my($class,@args) = @_; my $self = $class->SUPER::new(@args); if (scalar @args) { #do this explicitly. my ($description_score) = rearrange([qw(DESCRIPTION_SCORE)], @args); $description_score && $self->description_score($description_score); } return $self; } =head2 description_score Arg [1] : Example : Description: Returntype : Exceptions : Caller : =cut sub description_score { my $self = shift; $self->{'_description_score'} = shift if(@_); return $self->{'_description_score'}; } =head2 read_clustalw Arg [1] : string $file The name of the file containing the clustalw output Example : $family->read_clustalw('/tmp/clustalw.aln'); Description: Parses the output from clustalw and sets the alignment strings of each of the memebers of this family Returntype : none Exceptions : thrown if file cannot be parsed warning if alignment file contains identifiers for sequences which are not members of this family Caller : general =cut sub read_clustalw { my $self = shift; my $file = shift; my %align_hash; my $FH = IO::File->new(); $FH->open($file) || throw("Could not open alignment file [$file]"); <$FH>; #skip header while(<$FH>) { next if($_ =~ /^\s+/); #skip lines that start with space my ($id, $align) = split; $align_hash{$id} ||= ''; $align_hash{$id} .= $align; } $FH->close; #place all member attributes in a hash on their member name my @members_attributes; push @members_attributes,@{$self->get_Member_Attribute_by_source('ENSEMBLPEP')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SWISSPROT')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SPTREMBL')}; my %attribute_hash; foreach my $member_attribute (@members_attributes) { my ($member, $attribute) = @{$member_attribute}; $attribute_hash{$member->stable_id} = $attribute; } #assign cigar_line to each of the member attribute foreach my $id (keys %align_hash) { my $attribute = $attribute_hash{$id}; if($attribute) { my $alignment_string = $align_hash{$id}; $alignment_string =~ s/\-([A-Z])/\- $1/g; $alignment_string =~ s/([A-Z])\-/$1 \-/g; my @cigar_segments = split " ",$alignment_string; my $cigar_line = ""; foreach my $segment (@cigar_segments) { my $seglength = length($segment); $seglength = "" if ($seglength == 1); if ($segment =~ /^\-+$/) { $cigar_line .= $seglength . "D"; } else { $cigar_line .= $seglength . "M"; } } $attribute->cigar_line($cigar_line); } else { throw("No member for alignment portion: [$id]"); } } } sub load_cigars_from_fasta { my ($self, $file, $store) = @_; my $alignio = Bio::AlignIO->new (-file => "$file", -format => "fasta"); my $aln = $alignio->next_aln; #place all member attributes in a hash on their member name my @members_attributes = (); push @members_attributes,@{$self->get_Member_Attribute_by_source('ENSEMBLPEP')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SWISSPROT')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SPTREMBL')}; my %attribute_hash; foreach my $member_attribute (@members_attributes) { my ($member, $attribute) = @{$member_attribute}; $attribute_hash{$member->stable_id} = $attribute; } #assign cigar_line to each of the member attribute foreach my $seq ($aln->each_seq) { my $attribute = $attribute_hash{$seq->display_id}; if($attribute) { my $cigar_line = ''; while($seq->seq() =~/(?:\b|^)(.)(.*?)(?:\b|$)/g) { $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M'); } $attribute->cigar_line($cigar_line); } else { my $id = $seq->display_id; throw("No member for alignment portion: [$id]"); } } # either store everything or nothing: if($store) { my $family_adaptor = $self->adaptor(); foreach my $member_attribute (@members_attributes) { $family_adaptor->update_relation($member_attribute); } } } sub get_SimpleAlign { my $self = shift; my $sa = Bio::SimpleAlign->new(); #Hack to try to work with both bioperl 0.7 and 1.2: #Check to see if the method is called 'addSeq' or 'add_seq' my $bio07 = 0; if(!$sa->can('add_seq')) { $bio07 = 1; } my @members_attributes; push @members_attributes,@{$self->get_Member_Attribute_by_source('ENSEMBLPEP')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SWISSPROT')}; push @members_attributes,@{$self->get_Member_Attribute_by_source('Uniprot/SPTREMBL')}; foreach my $member_attribute (@members_attributes) { my ($member, $attribute) = @{$member_attribute}; my $seqstr = $attribute->alignment_string($member); next if(!$seqstr); my $seq = Bio::LocatableSeq->new(-SEQ => $seqstr, -START => 1, -END => length($seqstr), -ID => $member->stable_id, -STRAND => 0); if($bio07) { $sa->addSeq($seq); } else { $sa->add_seq($seq); } } return $sa; } =head2 get_all_taxa_by_member_source_name Arg [1] : string $source_name e.g. "ENSEMBLPEP" Example : Description: Returns the distinct taxons found in this family across the specified source. If you do not specify a source then the code will return all taxons in this family. Returntype : array reference of distinct Bio::EnsEMBL::Compara::NCBITaxon objects found in this family Exceptions : Caller : public =cut sub get_all_taxa_by_member_source_name { my ($self, $source_name) = @_; my $ncbi_ta = $self->adaptor->db->get_NCBITaxonAdaptor(); my $sub = sub { my ($row) = @_; return $ncbi_ta->fetch_node_by_taxon_id($row->[0]); }; my $results = $self->_run_finder_by_member_field_source_name( 'taxa', $source_name, $sub); return $results; } =head2 get_all_GenomeDBs_by_member_source_name Arg [1] : string $source_name e.g. "ENSEMBLPEP" Example : Description: Returns the distinct GenomeDBs found in this family. Please note that if you specify a source other than an EnsEMBL based one the chances of getting back GenomeDBs are very low. Returntype : array reference of distinct Bio::EnsEMBL::Compara::GenomeDB objects found in this family Exceptions : Caller : public =cut sub get_all_GenomeDBs_by_member_source_name { my ($self, $source_name) = @_; my $gdb_a = $self->adaptor->db->get_GenomeDBAdaptor(); my $sub = sub { my ($row) = @_; return $gdb_a->fetch_by_dbID($row->[0]); }; my $results = $self->_run_finder_by_member_field_source_name( 'genome_db', $source_name, $sub); return $results; } =head2 _run_finder_by_member_field_source_name Arg [1] : string $entity e.g. 'taxa' or 'genome_db' Arg [2] : string $source_name (not required) e.g. 'ENSEMBLPEP' Arg [3] : code $sub e.g. 'taxa' or 'genome_db' Example : Description: Returntype : array reference of distinct objects as returned by your subroutine Exceptions : If an unknown entity has been given Caller : private =cut sub _run_finder_by_member_field_source_name { my ($self, $entity, $source_name, $sub) = @_; my $field; if($entity eq 'genome_db') { $field = 'genome_db_id'; } elsif($entity eq 'taxa') { $field = 'taxon_id'; } exception("Unknown entity type given [$entity].") unless $field; my $sql = "SELECT distinct(m.${field}) FROM family_member fm,member m WHERE fm.family_id= ? AND fm.member_id=m.member_id"; my @args = ($self->dbID()); if (defined $source_name) { $sql .= ' AND m.source_name = ?'; push(@args, $source_name); } my @results; my $sth = $self->adaptor()->dbc()->prepare($sql); $sth->execute(@args); while (my $rowarray = $sth->fetchrow_arrayref()) { if(defined $rowarray->[0]) { my $val = $sub->($rowarray); push(@results, $val); } } $sth->finish(); return \@results; } 1;