Raw content of Bio::EnsEMBL::StrainSlice =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME Bio::EnsEMBL::StrainSlice - SubClass of the Slice. Represents the slice of the genome for a certain strain (applying the variations) =head1 SYNOPSIS $sa = $db->get_SliceAdaptor; $slice = $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); $strainSlice = $slice->get_by_strain($strain_name); # get the sequence from the Strain Slice my $seq = $strainSlice->seq(); print $seq; # get allele features between this StrainSlice and the reference my $afs = $strainSlice->get_all_AlleleFeatures_Slice(); foreach my $af ( @{$afs} ) { print "AlleleFeature in position ", $af->start, "-", $af->end, " in strain with allele ", $af->allele_string, "\n"; } # compare a strain against another strain my $strainSlice_2 = $slice->get_by_strain($strain_name_2); my $differences = $strainSlice->get_all_differences_StrainSlice($strainSlice_2); foreach my $difference ( @{$differences} ) { print "Difference in position ", $difference->start, "-", $difference->end(), " in strain with allele ", $difference->allele_string(), "\n"; } =head1 DESCRIPTION A StrainSlice object represents a region of a genome for a certain strain. It can be used to retrieve sequence or features from a strain. =head1 METHODS =cut package Bio::EnsEMBL::StrainSlice; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); use Bio::EnsEMBL::Slice; use Bio::EnsEMBL::Mapper; use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); @ISA = qw(Bio::EnsEMBL::Slice); =head2 new Arg[1] : Bio::EnsEMBL::Slice $slice Arg[2] : string $strain_name Example : $strainSlice = Bio::EnsEMBL::StrainSlice->new(-.... => , -strain_name => $strain_name); Description : Creates a new Bio::EnsEMBL::StrainSlice object that will contain a shallow copy of the Slice object, plus additional information such as the Strain this Slice refers to and listref of Bio::EnsEMBL::Variation::AlleleFeatures of differences with the reference sequence ReturnType : Bio::EnsEMBL::StrainSlice Exceptions : none Caller : general =cut sub new{ my $caller = shift; my $class = ref($caller) || $caller; my ($strain_name) = rearrange(['STRAIN_NAME'],@_); my $self = $class->SUPER::new(@_); $self->{'strain_name'} = $strain_name; if(!$self->adaptor()) { warning('Cannot get new StrainSlice features without attached adaptor'); return ''; } my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); unless($variation_db) { warning("Variation database must be attached to core database to " . "retrieve variation information" ); return ''; } my $af_adaptor = $variation_db->get_AlleleFeatureAdaptor; if( $af_adaptor ) { #get the Individual for the given strain my $ind_adaptor = $variation_db->get_IndividualAdaptor; if ($ind_adaptor){ my $individual = shift @{$ind_adaptor->fetch_all_by_name($self->{'strain_name'})}; #the name should be unique for a strain #check that the individua returned isin the database if (defined $individual){ my $allele_features = $af_adaptor->fetch_all_by_Slice($self,$individual); warning("No strain genotype data available for Slice ".$self->name." and Strain ".$individual->name) if ! defined $allele_features->[0]; my $vf_ids = {}; #hash containing the relation vf_id->af $self->{'_strain'} = $individual; map {defined $_->{'_variation_feature_id'} ? $vf_ids->{$_->{'_variation_feature_id'}} = $_ : '' } @{$allele_features}; # my $new_allele_features = $self->_filter_af_by_coverage($allele_features); # $self->{'alleleFeatures'} = $new_allele_features; $self->{'alleleFeatures'} = $allele_features; $self->{'_vf_ids'} = $vf_ids; return $self; } else{ warning("Strain not in the database"); return $self; } } else{ warning("Not possible to retrieve IndividualAdaptor from the variation database"); return ''; } } else { warning("Not possible to retrieve VariationFeatureAdaptor from variation database"); return ''; } } =head2 _filter_af_by_coverage Arg [1] : listref to Bio::EnsEMBL::Variation::AlleleFeatures $allele_features Example : my $new_list_allele_features = $strainSlice->_filter_af_by_coverage($allele_features); Description : For a list of allele features, gets a new list where they are filter depending on coverage ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature Exceptions : none Caller : internal function =cut sub _filter_af_by_coverage{ my $self = shift; my $allele_features = shift; my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); unless($variation_db) { warning("Variation database must be attached to core database to " . "retrieve variation information" ); return ''; } my $rc_adaptor = $variation_db->get_ReadCoverageAdaptor(); #this is ugly, but ReadCoverage is always defined in the positive strand my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'},1); my $new_af; foreach my $af (@{$allele_features}){ foreach my $rc (@{$rcs}){ if ($af->start <= $rc->end and $af->start >= $rc->start){ push @{$new_af}, $af; last; } } } return $new_af; } =head2 strain_name Arg [1] : (optional) string $strain_name Example : my $strain_name = $strainSlice->strain_name(); Description : Getter/Setter for the name of the strain ReturnType : string Exceptions : none Caller : general =cut sub strain_name{ my $self = shift; if (@_){ $self->{'strain_name'} = shift @_; } return $self->{'strain_name'}; } =head2 display_Slice_name Args : none Example : my $strain_name = $strainSlice->display_Slice_name(); Description : Getter for the name of the strain ReturnType : string Exceptions : none Caller : webteam =cut sub display_Slice_name{ my $self = shift; return $self->strain_name; } =head2 seq Arg [1] : int $with_coverage (optional) Example : print "SEQUENCE = ", $strainSlice->seq(); Description: Returns the sequence of the region represented by this slice formatted as a string in the strain. If flag with_coverage is set to 1, returns sequence if there is coverage in the region Returntype : string Exceptions : none Caller : general =cut sub seq { my $self = shift; my $with_coverage = shift; $with_coverage ||= 0; # special case for in-between (insert) coordinates return '' if($self->start() == $self->end() + 1); return $self->{'seq'} if($self->{'seq'}); if($self->adaptor()) { my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); my $reference_sequence = $seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1); #get the reference sequence for that slice #apply all differences to the reference sequence #first, in case there are any indels, create the new sequence (containing the '-' bases) # sort edits in reverse order to remove complication of # adjusting downstream edits my @indels_ordered = sort {$b->start() <=> $a->start()} @{$self->{'alignIndels'}} if (defined $self->{'alignIndels'}); foreach my $vf (@indels_ordered){ $vf->apply_edit($reference_sequence); #change, in the reference sequence, the vf } #need to find coverage information if diffe # sort edits in reverse order to remove complication of # adjusting downstream edits my @variation_features_ordered = sort {$b->start() <=> $a->start()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); foreach my $vf (@variation_features_ordered){ $vf->apply_edit($reference_sequence); #change, in the reference sequence, the vf } #need to find coverage information if different from reference my $indAdaptor = $self->adaptor->db->get_db_adaptor('variation')->get_IndividualAdaptor; my $ref_strain = $indAdaptor->get_reference_strain_name; $self->_add_coverage_information($reference_sequence) if ($with_coverage == 1 && $self->strain_name ne $ref_strain); return substr(${$reference_sequence},0,1) if ($self->length == 1); return substr(${$reference_sequence},0,$self->length); #returns the reference sequence, applying the variationFeatures. Remove additional bases added due to indels } # no attached sequence, and no db, so just return Ns return 'N' x $self->length(); } sub _add_coverage_information{ my $self = shift; my $reference_sequence = shift; my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); unless($variation_db) { warning("Variation database must be attached to core database to " . "retrieve variation information" ); return ''; } my $rc_adaptor = $variation_db->get_ReadCoverageAdaptor(); my $rcs = $rc_adaptor->fetch_all_by_Slice_Sample_depth($self,$self->{'_strain'},1); my $rcs_sorted; @{$rcs_sorted} = sort {$a->start <=> $b->start} @{$rcs} if ($self->strand == -1); $rcs = $rcs_sorted if ($self->strand == -1); my $start = 0; foreach my $rc (@{$rcs}){ $rc->start(1) if ($rc->start < 0); #if the region lies outside the boundaries of the slice $rc->end($self->end - $self->start + 1) if ($rc->end + $self->start > $self->end); substr($$reference_sequence, $start,($rc->start - $start - 1),'~' x ($rc->start - $start - 1)) if ($rc->start - 1 > $start); $start = $rc->end - 1; } substr($$reference_sequence, $start, ($self->length - $start) ,'~' x ($self->length - $start)) if ($self->length -1 > $start); } =head2 get_AlleleFeature Arg[1] : Bio::EnsEMBL::Variation::VariationFeature $vf Example : my $af = $strainSlice->get_AlleleFeature($vf); Description : Returns the AlleleFeature object associated with the VariationFeature (if any) ReturnType : Bio::EnsEMBL::Variation::AlleleFeature Exceptions : none Caller : general =cut sub get_AlleleFeature{ my $self = shift; my $vf = shift; my $af; #look at the hash containing the relation vf_id->alleleFeature, if present, return object, otherwise, undef $af = $self->{'_vf_ids'}->{$vf->dbID} if (defined $self->{'_vf_ids'}->{$vf->dbID}); return $af; } =head2 get_all_AlleleFeatures_Slice Arg[1] : int $with_coverage (optional) Example : my $af = $strainSlice->get_all_AlleleFeatures_Slice() Description : Gets all AlleleFeatures between the StrainSlice object and the Slice is defined. If argument $with_coverage set to 1, returns only AF if they have coverage information ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature Exceptions : none Caller : general =cut sub get_all_AlleleFeatures_Slice{ my $self = shift; my $with_coverage = shift; my $variation_db = $self->adaptor->db->get_db_adaptor('variation'); unless($variation_db) { warning("Variation database must be attached to core database to " . "retrieve variation information" ); return ''; } my $indAdaptor = $variation_db->get_IndividualAdaptor(); my $ref_name = $indAdaptor->get_reference_strain_name; return [] if ($self->strain_name eq $ref_name); $with_coverage ||= 0; #by default, get all AlleleFeatures if ($with_coverage == 1){ my $new_allele_features = $self->_filter_af_by_coverage($self->{'alleleFeatures'}); return $new_allele_features; } return $self->{'alleleFeatures'}; } =head2 get_all_differences_StrainSlice Arg[1] : Bio::EnsEMBL::StrainSlice $ss Example : my $differences = $strainSlice->get_all_differences_StrainSlice($ss) Description : Gets differences between 2 StrainSlice objects ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature Exceptions : thrown on bad argument Caller : general =cut sub get_all_differences_StrainSlice{ my $self = shift; my $strainSlice = shift; if (!ref($strainSlice) || !$strainSlice->isa('Bio::EnsEMBL::StrainSlice')){ throw('Bio::EnsEMBL::StrainSlice arg expected'); } if ( @{$self->{'alleleFeatures'}} == 0 && @{$strainSlice->{'alleleFeatures'}} == 0){ return undef; #there are no differences in any of the Strains } my $differences; #differences between strains if (@{$strainSlice->{'alleleFeatures'}} == 0){ #need to create a copy of VariationFeature foreach my $difference (@{$self->{'alleleFeatures'}}){ my %vf = %$difference; push @{$differences},bless \%vf,ref($difference); } } elsif (@{$self->{'alleleFeatures'}} == 0){ #need to create a copy of VariationFeature, but changing the allele by the allele in the reference foreach my $difference (@{$strainSlice->{'alleleFeatures'}}){ push @{$differences}, $strainSlice->_convert_difference($difference); } } else{ #both strains have differences #create a hash with the differences in the self strain slice my %variation_features_self = map {$_->start => $_} @{$self->{'alleleFeatures'}}; foreach my $difference (@{$strainSlice->{'alleleFeatures'}}){ #there is no difference in the other strain slice, convert the allele if (!defined $variation_features_self{$difference->start}){ push @{$differences},$strainSlice->_convert_difference($difference); } else{ #if it is defined and have the same allele, delete from the hash if ($variation_features_self{$difference->start}->allele_string eq $difference->allele_string){ delete $variation_features_self{$difference->start}; } } } #and copy the differences that in the self foreach my $difference (values %variation_features_self){ my %vf = %$difference; push @{$differences},bless \%vf,ref($difference); } } #need to map differences to the self my $mapper = $self->mapper(); #now that we have the differences, map them in the StrainSlice # print Dumper($mapper); my @results; foreach my $difference (@{$differences}){ @results = $mapper->map_coordinates('Slice',$difference->start,$difference->end,$difference->strand,'Slice'); #we can have 3 possibilities: #the difference is an insertion and when mapping returns the boundaries of the insertion in the StrainSlice if (@results == 2){ #the first position in the result is the beginning of the insertion if($results[0]->start < $results[1]->start){ $difference->start($results[0]->end+1); $difference->end($results[1]->start-1); } else{ $difference->start($results[1]->end+1); $difference->end($results[0]->start-1); } $difference->strand($results[0]->strand); } else{ #it can be either a SNP or a deletion, and we have the coordinates in the result, etither a Bio::EnsEMBL::Mapper::Coordinate # or a Bio::EnsEMBL::Mapper::IndelCoordinate # print "Difference: ",$difference->start, "-", $difference->end,"strand ",$difference->strand,"\n"; $difference->start($results[0]->start); $difference->end($results[0]->end); $difference->strand($results[0]->strand); } } return $differences; } #for a given VariationFeatures, converts the allele into the reference allele and returns a new list with #the converted VariationFeatures sub _convert_difference{ my $self = shift; my $difference = shift; my %new_vf = %$difference; #make a copy of the variationFeature #and change the allele with the one from the reference Slice $new_vf{'allele_string'} = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); return bless \%new_vf,ref($difference); } =head2 sub_Slice Arg 1 : int $start Arg 2 : int $end Arge [3] : int $strand Example : none Description: Makes another StrainSlice that covers only part of this slice with the appropriate differences to the reference Slice If a slice is requested which lies outside of the boundaries of this function will return undef. This means that behaviour will be consistant whether or not the slice is attached to the database (i.e. if there is attached sequence to the slice). Alternatively the expand() method or the SliceAdaptor::fetch_by_region method can be used instead. Returntype : Bio::EnsEMBL::StrainSlice or undef if arguments are wrong Exceptions : thrown when trying to get the subSlice in the middle of a insertion Caller : general =cut sub sub_Slice { my ( $self, $start, $end, $strand ) = @_; my $mapper = $self->mapper(); #finally map from the Slice to the Strain my @results = $mapper->map_coordinates('StrainSlice',$start,$end,$strand,'StrainSlice'); my $new_start; my $new_end; my $new_strand; my $new_seq; #Get need start and end for the subSlice of the StrainSlice my @results_ordered = sort {$a->start <=> $b->start} @results; $new_start = $results_ordered[0]->start(); $new_strand = $results_ordered[0]->strand() if (ref($results_ordered[0]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); $new_strand = $results_ordered[-1]->strand() if (ref($results_ordered[-1]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice my $subSlice = $self->SUPER::sub_Slice($new_start,$new_end,$new_strand); $subSlice->{'strain_name'} = $self->{'strain_name'}; my $new_variations; #reference to an array that will contain the variationFeatures in the new subSlice #update the VariationFeatures in the sub_Slice of the Strain my $vf_start; my $vf_end; my $offset = $subSlice->start - $self->start; foreach my $variationFeature (@{$self->{'alleleFeatures'}}){ #calculate the new position of the variation_feature in the subSlice $vf_start = $variationFeature->start - $offset; $vf_end = $variationFeature->end - $offset; if ($vf_start >= 1 and $vf_end <= $subSlice->length){ #copy the variationFeature my %new_vf; %new_vf = %$variationFeature; #and shift to the new coordinates $new_vf{'start'} = $vf_start; $new_vf{'end'} = $vf_end; my $test = bless \%new_vf, ref($variationFeature); push @{$new_variations}, $test; } } $subSlice->{'alleleFeatures'} = $new_variations; return $subSlice; } =head2 ref_subseq Arg [1] : int $startBasePair relative to start of slice, which is 1. Arg [2] : int $endBasePair relative to start of slice. Arg [3] : (optional) int $strand The strand of the slice to obtain sequence from. Default value is 1. Description: returns string of dna from reference sequence Returntype : txt Exceptions : end should be at least as big as start strand must be set Caller : general =cut sub ref_subseq{ my $self = shift; my $start = shift; my $end = shift; my $strand = shift; # special case for in-between (insert) coordinates return '' if($start == $end + 1); my $subseq; if($self->adaptor){ my $seqAdaptor = $self->adaptor->db->get_SequenceAdaptor(); $subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand ( $self, $start, $end, $strand )}; } else { ## check for gap at the beginning and pad it with Ns if ($start < 1) { $subseq = "N" x (1 - $start); $start = 1; } $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); ## check for gap at the end and pad it with Ns if ($end > $self->length()) { $subseq .= "N" x ($end - $self->length()); } reverse_comp(\$subseq) if($strand == -1); } return $subseq; } =head2 subseq Arg [1] : int $startBasePair relative to start of slice, which is 1. Arg [2] : int $endBasePair relative to start of slice. Arg [3] : (optional) int $strand The strand of the slice to obtain sequence from. Default value is 1. Description: returns string of dna sequence Returntype : txt Exceptions : end should be at least as big as start strand must be set Caller : general =cut sub subseq { my ( $self, $start, $end, $strand ) = @_; if ( $end+1 < $start ) { throw("End coord + 1 is less then start coord"); } # handle 'between' case for insertions return '' if( $start == $end + 1); $strand = 1 unless(defined $strand); if ( $strand != -1 && $strand != 1 ) { throw("Invalid strand [$strand] in call to Slice::subseq."); } my $subseq; my $seq; if($self->adaptor){ $seq = $self->seq; reverse_comp(\$seq) if ($strand == -1); $subseq = substr($seq,$start-1,$end - $start + 1); } else { ## check for gap at the beginning and pad it with Ns if ($start < 1) { $subseq = "N" x (1 - $start); $start = 1; } $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); ## check for gap at the end and pad it with Ns if ($end > $self->length()) { $subseq .= "N" x ($end - $self->length()); } reverse_comp(\$subseq) if($strand == -1); } return $subseq; } sub mapper{ my $self = shift; if (@_) { delete $self->{'mapper'}; } if(!defined $self->{'mapper'}){ #create the mapper between the Slice and StrainSlice my $mapper = Bio::EnsEMBL::Mapper->new('Slice','StrainSlice'); #align with Slice #get all the VariationFeatures in the strain Slice, from start to end in the Slice my @variation_features_ordered = sort {$a->start() <=> $b->start()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); my $start_slice = 1; my $end_slice; my $start_strain = 1; my $end_strain; my $length_allele; foreach my $variation_feature (@variation_features_ordered){ #we have a insertion/deletion: marks the beginning of new slice move coordinates if ($variation_feature->length_diff != 0){ $length_allele = $variation_feature->length + $variation_feature->length_diff(); $end_slice = $variation_feature->start() - 1; if ($end_slice >= $start_slice){ $end_strain = $end_slice - $start_slice + $start_strain; #add the sequence that maps $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'StrainSlice',$start_strain,$end_strain); #add the indel $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $variation_feature->length,1,'StrainSlice',$end_strain+1,$end_strain + $length_allele); $start_strain = $end_strain + $length_allele + 1; } else{ #add the indel $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $variation_feature->length,1,'StrainSlice',$end_strain+1,$end_strain + $length_allele); $start_strain += $length_allele; } $start_slice = $end_slice + $variation_feature->length+ 1; } } if ($start_slice <= $self->length){ $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'StrainSlice',$start_strain,$start_strain + $self->length - $start_slice); } $self->{'mapper'} = $mapper; } return $self->{'mapper'}; } =head2 get_all_differences_Slice Description : DEPRECATED use get_all_AlleleFeatures instead =cut sub get_all_differences_Slice{ my $self = shift; deprecate('Use get_all_differences_Slice instead'); return $self->get_all_AlleleFeatures_Slice(@_); } =head2 get_all_VariationFeatures Arg[1] : int $with_coverage (optional) Description :returns all alleleFeatures features on this slice. ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature Exceptions : none Caller : contigview, snpview =cut sub get_all_VariationFeatures { my $self = shift; my $with_coverage = shift; $with_coverage ||= 0; return $self->get_all_AlleleFeatures_Slice($with_coverage); } =head2 get_original_seq_region_position Arg [1] : int $position relative to start of slice, which is 1. Description: Placeholder method - this method has no explicit use beyond providiing compatibility with AlignSlice. To map positions between the StrainSlice and the reference slice, use the mapper and its methods. Returntype : ($strainSlice, $seq_region_position), an array where the first element is a Bio::EnsEMBL::StrainSlice and the second one is the requested seq_region_position. Exceptions : none Caller : general =cut sub get_original_seq_region_position { my $self = shift; my $position = shift; #coordinates in the AlignSlice and Slice are the same, so far will return the same Slice #and coordinate return ($self,$position); } 1;