Raw content of Bio::EnsEMBL::IndividualSlice
=head1 LICENSE
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at .
Questions may also be sent to the Ensembl help desk at
.
=cut
=head1 NAME
Bio::EnsEMBL::IndividualSlice - SubClass of the Slice. Represents the
slice of the genome for a certain individual (applying the alleles for
this individual)
=head1 SYNOPSIS
$sa = $db->get_SliceAdaptor;
$slice =
$sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
$individualSlice = $slice->get_by_Individual($individual_name);
# Get the sequence from the Individual Slice: will contain IUPAC codes
# for SNPs and Ensembl ambiguity codes for indels
my $seq = $individualSlice->seq();
print $seq;
# Get a subSlice of the Strain
my $subSlice_individual =
$individualSlice->sub_Slice( 5_000, 8_000, 1 )
# Compare two different individuals in the same Slice
my $sliceIndividual2 = $slice->get_by_Individual($individual_name2);
my $differences =
$individualSlice->get_all_differences_IndividualSlice(
$sliceIndividual2);
foreach my $af ( @{$differences} ) {
print
"There is a difference between $individual_name "
. "and $individual_name2 at ",
$af->start, "-", $af->end,
" with allele ", $af->allele_string(), "\n";
}
=head1 DESCRIPTION
A IndividualSlice object represents a region of a genome for a certain
individual. It can be used to retrieve sequence or features from a
individual.
=head1 METHODS
=cut
package Bio::EnsEMBL::IndividualSlice;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Slice;
use Bio::EnsEMBL::Mapper;
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning);
use Data::Dumper;
@ISA = qw(Bio::EnsEMBL::Slice);
=head2 new
Arg [1..N] : List of named arguments
Bio::EnsEMBL::CoordSystem COORD_SYSTEM
string SEQ_REGION_NAME,
int START,
int END,
string VERSION (optional, defaults to '')
int STRAND, (optional, defaults to 1)
Bio::EnsEMBL::DBSQL::SliceAdaptor ADAPTOR (optional)
Arg[N+1] : string $individual_name
Example : $individualSlice = Bio::EnsEMBL::IndividualSlice->new(-coord_system => $cs,
-start => 1,
-end => 10000,
-strand => 1,
-seq_region_name => 'X',
-seq_region_length => 12e6,
-individual_name => $individual_name);
Description : Creates a new Bio::EnsEMBL::IndividualSlice object that will contain a shallow copy of the
Slice object, plus additional information such as the individual this Slice refers to
and listref of Bio::EnsEMBL::Variation::AlleleFeatures of differences with the
reference sequence
ReturnType : Bio::EnsEMBL::IndividualSlice
Exceptions : none
Caller : general
=cut
sub new{
my $caller = shift;
my $class = ref($caller) || $caller;
#create the IndividualSlice object as the Slice, plus the individual attribute
my ($individual_name, $sample_id) = rearrange(['INDIVIDUAL', 'SAMPLE_ID'],@_);
my $self = $class->SUPER::new(@_);
$self->{'individual_name'} = $individual_name;
$self->{'sample_id'} = $sample_id;
return $self;
}
=head2 individual_name
Arg [1] : (optional) string $individual_name
Example : my $individual_name = $individualSlice->individual_name();
Description : Getter/Setter for the name of the individual in the slice
ReturnType : string
Exceptions : none
Caller : general
=cut
sub individual_name{
my $self = shift;
if (@_){
$self->{'individual_name'} = shift @_;
}
return $self->{'individual_name'};
}
=head2 seq
Arg [1] : none
Example : print "SEQUENCE = ", $strainSlice->seq();
Description: Returns the sequence of the region represented by this
StrainSlice formatted as a string.
Returntype : string
Exceptions : none
Caller : general
=cut
sub seq {
my $self = shift;
# special case for in-between (insert) coordinates
return '' if($self->start() == $self->end() + 1);
return $self->{'seq'} if($self->{'seq'});
if($self->adaptor()) {
my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor();
my $reference_sequence = $seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1); #get the reference sequence for that slice
#apply all differences to the reference sequence
# sort edits in reverse order to remove complication of
# adjusting downstream edits
my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'});
foreach my $af (@allele_features_ordered){
$af->apply_edit($reference_sequence); #change, in the reference sequence, the af
}
# return substr(${$reference_sequence},0,1) if ($self->length == 1);
return ${$reference_sequence}; #returns the reference sequence, applying the alleleFeatures
}
# no attached sequence, and no db, so just return Ns
return 'N' x $self->length();
}
=head2 get_all_differences_Slice
Args : none
Example : my $differences = $individualSlice->get_all_differences_Slice()
Description : Gets all differences between the IndividualSlice object and the Slice is defined
ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature
Exceptions : none
Caller : general
=cut
sub get_all_differences_Slice{
my $self = shift;
my $differences; #reference to the array with the differences between Slice and StrainSlice
my $ref_allele;
foreach my $difference (@{$self->{'alleleFeatures'}}){
if ($difference->length_diff == 0){
#the difference is a SNP, check if it is the same as the reference allele
$ref_allele = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand);
$ref_allele = '-' if ($ref_allele eq '');
if ($ref_allele ne $difference->allele_string){
#when the alleleFeature is different from the reference allele, add to the differences list
push @{$differences},$difference;
}
}
else{
push @{$differences},$difference;
}
}
return $differences;
}
=head2 get_all_differences_IndividualSlice
Arg[1] : Bio::EnsEMBL::IndividualSlice $is
Example : my $differences = $individualSlice->get_all_differences_IndividualSlice($individualslice)
Description : Gets differences between 2 IndividualSlice objects
ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature
Exceptions : thrown on bad argument
Caller : general
=cut
sub get_all_differences_IndividualSlice{
my $self = shift;
my $individualSlice = shift;
if (!ref($individualSlice) || !$individualSlice->isa('Bio::EnsEMBL::IndividualSlice')){
throw('Bio::EnsEMBL::IndividualSlice arg expected');
}
if ( @{$self->{'alleleFeatures'}} == 0 && @{$individualSlice->{'alleleFeatures'}} == 0){
return undef; #there are no differences in any of the Individuals
}
my $differences; #differences between individuals
if (@{$individualSlice->{'alleleFeatures'}} == 0){
#need to create a copy of alleleFeature for the first Individual
foreach my $difference (@{$self->{'alleleFeatures'}}){
my %vf = %$difference;
push @{$differences},bless \%vf,ref($difference);
}
}
elsif (@{$self->{'alleleFeatures'}} == 0){
#need to create a copy of AlleleFeature, but changing the allele by the allele in the reference sequence
foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){
push @{$differences}, $individualSlice->_convert_difference($difference);
}
}
else{
#both individuals have differences
#create a hash with the differences in the first slice
my %allele_features_self = map {$_->start.'-'.$_->end => $_} @{$self->{'alleleFeatures'}};
foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){
#there is no difference in the other individual slice, convert the allele
if (!defined $allele_features_self{$difference->start.'-'.$difference->end}){
push @{$differences},$individualSlice->_convert_difference($difference);
}
else{
#if it is defined and have the same allele, delete from the hash since it is not a difference
#between the individuals
if ($allele_features_self{$difference->start.'-'.$difference->end}->allele_string eq $difference->allele_string){
delete $allele_features_self{$difference->start.'-'.$difference->end};
}
}
}
#and finally, make a shallow copy of the differences in the first individual
foreach my $difference (values %allele_features_self){
my %vf = %$difference;
push @{$differences},bless \%vf,ref($difference);
}
}
#need to map differences to the first individual, self, since the coordinates are in the Slice coordinate system
my $mapper = $self->mapper(); #now that we have the differences, map them in the IndividualSlice
my @results;
foreach my $difference (@{$differences}){
@results = $mapper->map_coordinates('Slice',$difference->start,$difference->end,$difference->strand,'Slice');
#we can have 3 possibilities:
#the difference is an insertion and when mapping returns the boundaries of the insertion in the IndividualSlice
if (@results == 2){
#the first position in the result is the beginning of the insertion
if($results[0]->start < $results[1]->start){
$difference->start($results[0]->end+1);
$difference->end($results[1]->start-1);
}
else{
#it is the second position the beginning of the insertion
$difference->start($results[1]->end+1);
$difference->end($results[0]->start-1);
}
$difference->strand($results[0]->strand);
}
else{
#it can be either a SNP or a deletion, and we have the coordinates in the result, etither a Bio::EnsEMBL::Mapper::Coordinate
# or a Bio::EnsEMBL::Mapper::IndelCoordinate
$difference->start($results[0]->start);
$difference->end($results[0]->end);
$difference->strand($results[0]->strand);
}
}
return $differences;
}
#for a given AlleleFeature, converts the allele into the reference allele and returns
#the converted AlleleFeature
sub _convert_difference{
my $self = shift;
my $difference = shift;
my %new_af = %$difference; #make a copy of the alleleFeature
#and change the allele with the one from the reference Slice
$new_af{'allele_string'} = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand);
return bless \%new_af,ref($difference);
}
=head2 mapper
Args : none
Description: Getter for the mapper between the between the IndividualSlice and the Slice it refers to.
It is done automatically when necessary to create subSlice or to get the differences between individuals
Returntype : Bio::EnsEMBL::Mapper
Exceptions : none
Caller : Internal function
=cut
sub mapper{
my $self = shift;
if (@_) {
#allow to create again the mapper
delete $self->{'mapper'};
}
if(!defined $self->{'mapper'}){
#create the mapper between the Slice and StrainSlice
my $mapper = Bio::EnsEMBL::Mapper->new('Slice','IndividualSlice');
#align with Slice
#get all the VariationFeatures in the Individual Slice, from start to end in the Slice
my @allele_features_ordered = sort {$a->start() <=> $b->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'});
my $start_slice = 1;
my $end_slice;
my $start_individual = 1;
my $end_individual;
my $length_allele;
my $total_length_diff = 0;
#we will walk from left to right in the slice object, updating the start and end individual every time
#there is a new alleleFeature in the Individual
foreach my $allele_feature (@allele_features_ordered){
#we have a insertion/deletion: marks the beginning of new slice move coordinates
if ($allele_feature->length_diff != 0){
$total_length_diff += $allele_feature->length_diff;
$length_allele = $allele_feature->length + $allele_feature->length_diff(); #length of the allele in the Individual
$end_slice = $allele_feature->start() - 1; #set the end of the slice before the alleleFeature
if ($end_slice >= $start_slice){
#normal cases (not with gaps)
$end_individual = $end_slice - $start_slice + $start_individual; #set the end of the individual from the beginning plus the offset
#add the sequence that maps
$mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'IndividualSlice',$start_individual,$end_individual);
#and add the indel
$mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele);
$start_individual = $end_individual + $length_allele + 1; #set the beginning of the individual after the allele
}
else{
#add the indel
$mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele);
$start_individual += $length_allele;
}
$start_slice = $end_slice + $allele_feature->length+ 1; #set the beginning of the slice after the variation feature
}
}
if ($start_slice <= $self->length){
#if we haven't reached the end of the IndividualSlice, add the final map coordinates between the individual and the slice
$mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'IndividualSlice',$start_individual,$start_individual + $self->length - $start_slice);
}
$mapper->add_map_coordinates('Slice', -$self->start+1, 0,1, 'IndividualSlice', -$self->start +1,0) if ($self->start > 0); #before individualSlice
$mapper->add_map_coordinates('Slice', $self->length + 1,$self->seq_region_length - ($self->length +1),1, 'IndividualSlice', $self->length + 1 + $total_length_diff,$self->seq_region_length + $total_length_diff - ($self->length +1) ) if ($self->length <= $self->seq_region_length); #after strainSlice
$self->{'mapper'} = $mapper;
}
return $self->{'mapper'};
}
=head2 sub_Slice
Arg 1 : int $start
Arg 2 : int $end
Arge [3] : int $strand
Example : none
Description: Makes another IndividualSlice that covers only part of this IndividualSlice
with the appropriate differences to the reference Slice
If a slice is requested which lies outside of the boundaries
of this function will return undef. This means that
behaviour will be consistant whether or not the slice is
attached to the database (i.e. if there is attached sequence
to the slice). Alternatively the expand() method or the
SliceAdaptor::fetch_by_region method can be used instead.
Returntype : Bio::EnsEMBL::IndividualSlice or undef if arguments are wrong
Exceptions : thrown when trying to get the subSlice in the middle of a
insertion
Caller : general
=cut
sub sub_Slice {
my ( $self, $start, $end, $strand ) = @_;
my $mapper = $self->mapper();
#map from the Individual to the Slice to get the sub_Slice, and then, apply the differences in the subSlice
my @results = $mapper->map_coordinates('IndividualSlice',$start,$end,$strand,'IndividualSlice');
my $new_start;
my $new_end;
my $new_strand;
my $new_seq;
#Get need start and end for the subSlice of the IndividualSlice
my @results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results;
$new_start = $results_ordered[0]->start();
$new_strand = $results_ordered[0]->strand() if (ref($results_ordered[0]) eq 'Bio::EnsEMBL::Mapper::Coordinate');
# $new_strand = $results_ordered[-1]->strand() if (ref($results_ordered[-1]) eq 'Bio::EnsEMBL::Mapper::Coordinate');
$new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice
my $subSlice = $self->SUPER::sub_Slice($new_start,$new_end,$new_strand);
$subSlice->{'individual_name'} = $self->{'individual_name'};
my $new_alleles; #reference to an array that will contain the variationFeatures in the new subSlice
#update the VariationFeatures in the sub_Slice of the Individual
my %af;
my $new_allele_feature;
foreach my $alleleFeature (@{$self->{'alleleFeatures'}}){
$new_allele_feature = $alleleFeature->transfer($subSlice);
#only transfer the coordinates to the SubSlice that are within the boundaries
if ($new_allele_feature->start >= 1 && $new_allele_feature->end <= $subSlice->length){
push @{$new_alleles}, $new_allele_feature;
}
}
$subSlice->{'alleleFeatures'} = $new_alleles;
return $subSlice;
}
=head2 subseq
Arg [1] : int $startBasePair
relative to start of slice, which is 1.
Arg [2] : int $endBasePair
relative to start of slice.
Arg [3] : (optional) int $strand
The strand of the individual slice to obtain sequence from. Default
value is 1.
Description: returns string of dna sequence
Returntype : txt
Exceptions : end should be at least as big as start
strand must be set
Caller : general
=cut
sub subseq {
my ( $self, $start, $end, $strand ) = @_;
if ( $end+1 < $start ) {
throw("End coord + 1 is less then start coord");
}
# handle 'between' case for insertions
return '' if( $start == $end + 1);
$strand = 1 unless(defined $strand);
if ( $strand != -1 && $strand != 1 ) {
throw("Invalid strand [$strand] in call to Slice::subseq.");
}
my $subseq;
my $seq;
if($self->adaptor){
my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor();
$subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand($self,$start,$end,$strand)}; #get the reference sequence for that slice
#apply all differences to the reference sequence
# sort edits in reverse order to remove complication of
# adjusting downstream edits
my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'});
my $af_start;
my $af_end;
foreach my $af (@allele_features_ordered){
if (($af->start - $start +1 > 0) && ($end - $af->end > 0)){
#save the current start and end of the alleleFeature before changing for apply_edit
$af_start = $af->start;
$af_end = $af->end;
#apply the difference if the feature is in the new slice
$af->start($af->start - $start +1);
$af->end($af->end - $start +1);
$af->apply_edit(\$subseq); #change, in the reference sequence, the af
#restore the initial values of alleleFeature start and end
$af->start($af_start);
$af->end($af_end);
}
}
}
else {
## check for gap at the beginning and pad it with Ns
if ($start < 1) {
$subseq = "N" x (1 - $start);
$start = 1;
}
$subseq .= substr ($self->seq(), $start-1, $end - $start + 1);
## check for gap at the end and pad it with Ns
if ($end > $self->length()) {
$subseq .= "N" x ($end - $self->length());
}
reverse_comp(\$subseq) if($strand == -1);
}
return $subseq;
}
=head2 get_all_Transcripts
Args : None
Example : @transcripts = @{$individualslice->get_all_Transcripts)};
Description: Gets all transcripts which overlap this Individual Slice. If you want to
specify a particular analysis or type, then you are better off
using get_all_Genes or get_all_Genes_by_type and iterating
through the transcripts of each gene.
Returntype : reference to a list of Bio::EnsEMBL::Transcripts
Exceptions : none
Caller : general
=cut
sub get_all_Transcripts {
my $self = shift;
my $transcripts = $self->SUPER::get_all_Transcripts(1);
$self->map_to_Individual($transcripts);
return $transcripts;
}
=head2 get_all_Exons
Arg [1] : (optional) string $dbtype
The dbtype of exons to obtain. This assumes that the db has
been added to the DBAdaptor under this name (using the
DBConnection::add_db_adaptor method).
Example : @exons = @{$individualSlice->get_all_Exons};
Description: Gets all exons which overlap this IndividualSlice. Note that these exons
will not be associated with any transcripts, so this may not
be terribly useful.
Returntype : reference to a list of Bio::EnsEMBL::Exons
Exceptions : none
Caller : general
=cut
sub get_all_Exons {
my $self = shift;
my $dbtype = shift;
my $exons = $self->SUPER::get_all_Exons($dbtype);
$self->map_to_Individual($exons); #map the exons to the Individual
return $exons;
}
=head2 get_all_Genes
Arg [1] : (optional) string $logic_name
The name of the analysis used to generate the genes to retrieve
Arg [2] : (optional) string $dbtype
The dbtype of genes to obtain. This assumes that the db has
been added to the DBAdaptor under this name (using the
DBConnection::add_db_adaptor method).
Example : @genes = @{$individualSlice->get_all_Genes};
Description: Retrieves all genes that overlap this slice.
Returntype : listref of Bio::EnsEMBL::Genes
Exceptions : none
Caller : none
=cut
sub get_all_Genes{
my ($self, $logic_name, $dbtype) = @_;
my $genes = $self->SUPER::get_all_Genes($logic_name, $dbtype, 1);
$self->map_to_Individual($genes);
foreach my $gene (@{$genes}){
$self->map_to_Individual($gene->get_all_Exons); #map the Exons to the Individual
$self->map_to_Individual($gene->get_all_Transcripts); #map the Transcripts to the Individual
}
return $genes;
}
=head2 map_to_Individual
Arg[1] : ref $features
Example : $individualSlice->map_to_Individual($exons);
Description : Gets the features from the Slice and maps it in the IndividualSlice, using the mapper
between Slice and IndividualSlice
ReturnType : None
Exceptions : None
Caller : general
=cut
sub map_to_Individual{
my $self = shift;
my $features = shift;
my $mapper = $self->mapper();
my (@results, @results_ordered, $new_start, $new_end, $new_strand);
#foreach of the transcripts, map them to the IndividualSlice and replace the Slice with the IndividualSlice
foreach my $feature (@{$features}){
$feature->slice($self); #replace the IndividualSlice as the Slice for this feature (the Slice plus the AlleleFeatures)
#map from the Slice to the Individual Slice
my @results = $mapper->map_coordinates('Slice',$feature->start,$feature->end,$feature->strand,'Slice');
#from the results, order them but filter out those that are not coordinates
@results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results;
$new_start = $results_ordered[0]->start();
$new_strand = $results_ordered[0]->strand();
$new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice
$feature->start($new_start); #update new coordinates
$feature->end($new_end);
$feature->strand($new_strand);
}
}
sub alleleFeatures{
my $self = shift;
return $self->{'alleleFeatures'};
}
sub add_AlleleFeature{
my $self = shift;
if (@_){
if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::AlleleFeature')) {
throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected");
}
#add the alleleFeature to the individualSlice
push @{$self->{'alleleFeatures'}},shift;
}
}
1;