Bio::EnsEMBL::Analysis::RunnableDB MapCloneEnds
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds;
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::ExonerateCloneEnds
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Inherit
Bio::EnsEMBL::Analysis::RunnableDB
Synopsis
my $clonemap =
Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds->new(
-db => $refdb,
-analysis => $analysis_obj,
-database => $EST_GENOMIC,
);
$clonemap->fetch_input();
$clonemap->run();
$clonemap->write_output(); #writes to DB
Description
This object maps clone sequences to a genome by using the
exonerate alignment program,and write the results as Dna Align Features.
It relies on the following modules:
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds;
Bio::EnsEMBL::Analysis::Runnable::ExonerateCloneEnds;
In the analysis table, you have to add 3 analysis, one that calls
MapCloneEnds, and two for the different exonerate steps:
MapCloneEnds (module -> MapCloneEnds),
EXONERATE_CLONE_ENDS (module -> MapCloneEnds),
REFINE_CLONE_ENDS(module -> MapCloneEnds)
MapCloneEnds reads the input_id_chunks from a file where each line
contains a number of IDs separated by ":". In order to be able to correctly
parse the input ids, they should be given in a very specific format. The file
can be automaticaly generated from an XML using chunker.pl or manually generated
where each id should be in the format:
Clone_ID,length_of_clone,standard_deviation_of_length,Clone_end_ID,Direction_of_Clone
(i.e. CH243-307D10,184000.0,36800.0,1098421033278,F).
Methods
CHUNKSLIST
No description
Code
DNADB
No description
Code
GENOMICSEQS
No description
Code
IIDREGEXP
No description
Code
OPTIONS
No description
Code
OUTDB
No description
Code
QUERYSEQS
No description
Code
QUERYTYPE
No description
Code
clean_clusters
No description
Code
fetch_analysis
No description
Code
fetch_input
No description
Code
filter_alignments
No description
Code
new
No description
Code
read_and_check_config
No description
Code
refined_results
No description
Code
run
No description
Code
single_filter_alignments
No description
Code
write_output
No description
Code
Methods description
None available.
Methods code
CHUNKSLISTdescriptionprevnextTop
sub CHUNKSLIST {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_CHUNKSLIST'} = $value;
  }

  if ( exists( $self->{'_CONFIG_CHUNKSLIST'} ) ) {
    return $self->{'_CONFIG_CHUNKSLIST'};
  } else {
    return undef;
  }
}
DNADBdescriptionprevnextTop
sub DNADB {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_DNADB'} = $value;
  }

  if ( exists( $self->{'_CONFIG_DNADB'} ) ) {
    return $self->{'_CONFIG_DNADB'};
  } else {
    return undef;
  }
}
GENOMICSEQSdescriptionprevnextTop
sub GENOMICSEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_GENOMICSEQS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_GENOMICSEQS'} ) ) {
    return $self->{'_CONFIG_GENOMICSEQS'};
  } else {
    return undef;
  }
}
IIDREGEXPdescriptionprevnextTop
sub IIDREGEXP {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_IIDREGEXP'} = $value;
  }

  if ( exists( $self->{'_CONFIG_IIDREGEXP'} ) ) {
    return $self->{'_CONFIG_IIDREGEXP'};
  } else {
    return undef;
  }
}
OPTIONSdescriptionprevnextTop
sub OPTIONS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_OPTIONS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_OPTIONS'} ) ) {
    return $self->{'_CONFIG_OPTIONS'};
  } else {
    return undef;
  }
}
OUTDBdescriptionprevnextTop
sub OUTDB {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_OUTDB'} = $value;
  }

  if ( exists( $self->{'_CONFIG_OUTDB'} ) ) {
    return $self->{'_CONFIG_OUTDB'};
  } else {
    return undef;
  }
}
QUERYSEQSdescriptionprevnextTop
sub QUERYSEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_QUERYSEQS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) {
    return $self->{'_CONFIG_QUERYSEQS'};
  } else {
    return undef;
  }
}
QUERYTYPEdescriptionprevnextTop
sub QUERYTYPE {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_QUERYTYPE'} = $value;
  }

  if ( exists( $self->{'_CONFIG_QUERYTYPE'} ) ) {
    return $self->{'_CONFIG_QUERYTYPE'};
  } else {
    return undef;
  }
}
clean_clustersdescriptionprevnextTop
sub clean_clusters {
  my ($self, $clone_cluster_alignments) = @_;
 
  # Cluster alignments obtained for each clone
my %chr_cluster = (); foreach my $alignment(@{$clone_cluster_alignments}){ my $hit_name= $alignment->hseqname; my $chr_name= $alignment->seqname; # Generate a unique key that will be found only when a cloneEnd aligns more than
# once against the same chromosomal sequence
my $cluster_key = $chr_name."-".$hit_name; # check if there is any sequence aligned from the same clone against the chromosome
# In case there isn't, start the count of alignments for that clone.
if(!$chr_cluster{$cluster_key}){ $chr_cluster{$cluster_key} = []; } push (@{$chr_cluster{$cluster_key}}, $alignment); } my @selection = (); # For each clone check how many clusters were obtained.
# Check the clusters and selected the best alignment cluster
# to be used as template for the more exhaustive exonerate alignment.
foreach my $chr_cluster_key (keys %chr_cluster){ # If the cluster contains more than one sequence, order the sequences and check if they are
# separated by less than 4000 bases. If the separation is bigger don't count them as part of the cluster
if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){ # get the first aligment of the cluster to be used in the exonerate step
my $first_selected_alignment; # order the alignments in the cluster to be able to check the real distance among them
my @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}}; my $initial_end = 0; my $initial_alignment; my %cluster_alignments = (); # Use this value to avoid the comparison of the first alignment against null, that in case the start
# point of the alignment is lower than the cutoff which lead to the clustering of the first alignment
# with a NULL alignment
my $first_alignment = 0; my $alignment_added = 0; foreach my $ordered_align(@ordered_aligns){ # if the two condiditions are acomplished check if the alignments form a cluster
if ($first_alignment!=0){ my $cloneEnd_distance = abs(($ordered_align->start)-($initial_end)); # if the distance is sorter that the threshold the alignments cluster
if($cloneEnd_distance < 4000){ $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; if ($alignment_added==0){ push (@selection, $initial_alignment); $alignment_added = 1; } }else{ push (@selection, $ordered_align); $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; } } # Change this value after the first alignment is checked.
if ($first_alignment == 0){ $first_alignment = 1; $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; } } }else{ push (@selection, $chr_cluster{$chr_cluster_key}[0]); } } return\@ selection;
}
fetch_analysisdescriptionprevnextTop
sub fetch_analysis {
  my ($self, $logic_name) = @_;
 
  my  $db = $self->db;

  my $analysis_adaptor= $db->get_AnalysisAdaptor;

  my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);
  
  return $analysis;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
 
# It does nothing so no really code implementation here.
my ($self) = @_; } ##########################################################################
}
filter_alignmentsdescriptionprevnextTop
sub filter_alignments {
  my ($self, $clone_alignments, $clones_ref, $cloneEnd_ids_ref ) = @_;

  my %clone_cluster = ();
  my %aligned_clones = (); 
  my %clones = %{$clones_ref};
  my %cloneEnd_ids = %{$cloneEnd_ids_ref};
  my @selected_alignments;

  # Group alignments by clone name
foreach my $clone_alignment(@{$clone_alignments}){ # Get the cloneEnds name
my $clone_name = $clone_alignment->hseqname(); # Get the clone id to which the cloneEnds belongs
my $complete_clone_name = $cloneEnd_ids{$clone_name}; # Store the clone name in a hash, so only selected clones will be checked in the next step.
# the object stored in the hash is not important, what we really want to get is a list of
# unique clones(complete_clone_name) with no duplicate entries
$aligned_clones{$complete_clone_name} = $clone_name; if(!$clone_cluster{$clone_name}){ $clone_cluster{$clone_name} = []; } push (@{$clone_cluster{$clone_name}}, $clone_alignment); } foreach my $pair (keys %aligned_clones){ # Check if the clone has two cloneEnds
if ((scalar @{$clones{$pair}})/4 > 1){

# Check if at least one of the cluster pair alignments is selected.
# in case none is selected it will send cloneEnds to the single_filter
my
$cluster_selected = 0;
my %clean_cloneEnds = (); my %status = (); my %clone_dir = (); # Get the length of the clone and the name of the two cloneEnds
my $clone_length = $clones{$pair}[1]+$clones{$pair}[2]+200000; for (my $numberOfEnd=0;$numberOfEnd < scalar @{$clones{$pair}};$numberOfEnd+=4){ my $cloneEnd =$clones{$pair}[$numberOfEnd]; # Clone_dir used to avoid pairing of two cloneEnds in the same end as for some clones
# there are more than two cloneEnds where more than one correspond to the same end that
# was sequenced more than once.
$clone_dir{$numberOfEnd} = $clones{$pair}[$numberOfEnd+3]; my @cloneEnd_clean = @{$self->clean_clusters($clone_cluster{$cloneEnd})}; if (!$clean_cloneEnds{$numberOfEnd}){ $clean_cloneEnds{$numberOfEnd} =\@ cloneEnd_clean; } # Set initial status of cloneEnd to 0 which means that none of its alignments pair with the other cloneEnd
if (!$status{$numberOfEnd}){ $status{$numberOfEnd} = 0; } } foreach my $clean_cloneEnd1 (keys %clean_cloneEnds){ foreach my $clean_cloneEnd2 (keys %clean_cloneEnds){ # first check that one cloneEnd is not compared with itself. Then check that if we have pair
# A->B don't use again B->A and finally get only pairs of F + R cloneEnds
if ($clean_cloneEnd1 != $clean_cloneEnd2 && $clean_cloneEnd1 < $clean_cloneEnd2 && $clone_dir{$clean_cloneEnd1} ne $clone_dir{$clean_cloneEnd2}){ # Use this variable to check where an alignment occur and avoid duplicate alignments because of
# short consecutive sequences that align near by and are both selected for the next exonerate step
my $first_prev_chr_start = 0; my $first_chr_hit = 'Null'; foreach my $fa (@{$clean_cloneEnds{$clean_cloneEnd1}}){ # Check if two alignments don't belong to a cluster or they align in different chromosomes.
# This is made to avoid getting duplicate alignments in the next exonerate step when very close
# coordinates are selected for the target sequence
if (($fa->seqname() eq $first_chr_hit && ($fa->start()-$first_prev_chr_start) > 4000) || $fa->seqname() ne $first_chr_hit){ my $second_prev_chr_start = 0; foreach my $sa (@{$clean_cloneEnds{$clean_cloneEnd2}}){ if ($fa->seqname() eq $sa->seqname()){ my $diff = ($fa->start())-($sa->end()); my $abs_diff = abs($diff); # Check that the two cloneEnds are close enough as to be considered as paired and that
# the second cloneEnd don't belong to a cluster that was previously selected
# This is made to avoid the same cloneEnds to be paired more than once when there is more
# than one alignment in a short region.
if ($abs_diff <= $clone_length && (($sa->start()-$second_prev_chr_start)>4000)){ # In case two alignments are selected store them for the next exonerate step
push (@selected_alignments, $fa); push (@selected_alignments, $sa); $status{$clean_cloneEnd1} = 1; $status{$clean_cloneEnd2} = 1; $cluster_selected = 1; $first_prev_chr_start = $fa->start(); $second_prev_chr_start = $sa->start(); $first_chr_hit= $fa->seqname(); } } } } } } } } foreach my $clone_status (keys %status){ if ($status{$clone_status} == 0){ my $cloneEnd =$clones{$pair}[$clone_status]; my $selected_align = $self->single_filter_alignments($clone_cluster{$cloneEnd}); push (@selected_alignments, ${$selected_align}); } } }else{ # if the clone has only one cloneEnd do:
# get the name of the cloneEnds,
my $first_cloneEnd =$clones{$pair}[0]; # send all the alignments for that cloneEnd stored in clone_cluster to the single_filter_alignment
my $selected_align = $self->single_filter_alignments($clone_cluster{$first_cloneEnd}); # add the result of filtering to the selected alignments array.
push (@selected_alignments, ${$selected_align}); } } return\@ selected_alignments;
}
newdescriptionprevnextTop
sub new {
  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);
  $self->{_refined_results}=[] ;
  $self->read_and_check_config($CLONE_CONFIG);
  return $self;
}

##########################################################################
}
read_and_check_configdescriptionprevnextTop
sub read_and_check_config {
  my $self = shift;

  $self->SUPER::read_and_check_config($CLONE_CONFIG);

  ##########
# CHECKS
##########
my $logic = $self->analysis->logic_name; # check that compulsory options have values
foreach my $config_var ( qw( CHUNKSLIST ) ){ if ( not defined $self->$config_var ){ throw("You must define $config_var in config for logic '$logic'"); } }
}
refined_resultsdescriptionprevnextTop
sub refined_results {
  my ($self, $refine_result) = @_;

 # if(!$self->{'refined_results'}){
# $self->{'refined_results'}=[];
# }
if ($refine_result) { push @{$self->{_refined_results}}, $refine_result ; } return $self->{_refined_results};
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;

  my $trace_name = '';  # CloneEnd id as in database
my $clone_id = '';# Clone id
my $insert_size = '';# Length of the inserted sequence in clone
my $insert_stdev = '';# Standard deviation of the clone insert.
# The two insert values will be used in the filter process to check quality of alignments
# You need to run perl ensembl-personal/jb16/scripts/general/chunker.pl in order to create the
# input_analysis_ids and create this file. To change the path to the list of chunks, you should also do it in
# the chunker script
my $chunks_list = $self->CHUNKSLIST; my %clones = (); # hash where each clone object will be stored
my %cloneEnd_ids = (); # Open the file with the list of seq_ids per chunk and load it into an array to be used by fetch_input
open (INFILE,"<$chunks_list"); my @listOfIDs = (); while (my $line = <INFILE>){ chomp($line); # Split by ":" to get information of each clone as an independent element in an array.
my @clone = split (/:/,$line); my $listOfClones = ''; foreach my $clone (@clone){ # Split all the information of one cloneEnd and load each element into an array
my @clone_data = split (/,/,$clone); if (!$clones{$clone_data[0]}){ $clones{$clone_data[0]}=[]; } # Group information of each cloneEnd using clone_id as reference (will be used in the filter step)
push (@{$clones{$clone_data[0]}}, $clone_data[3]); # Add cloneEnd_name
push (@{$clones{$clone_data[0]}}, $clone_data[1]); # Add insert length
push (@{$clones{$clone_data[0]}}, $clone_data[2]); # Add insert length standard deviation
push (@{$clones{$clone_data[0]}}, $clone_data[4]); # Add cloneEnd direction (R or F)
$cloneEnd_ids{$clone_data[3]} = $clone_data[0]; if ($listOfClones eq ''){ $listOfClones.= $clone_data[3]; }else{ $listOfClones.= ":".$clone_data[3]; } } # Creates the array that contains the list of cloneEnd_ids that will go in each chunks
push (@listOfIDs,$listOfClones); } close INFILE; # Run ExonerateClones to get a first idea on the aligned/location of the clone Ends
my $exonerate = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new( -DB => $self->db, -INPUT_ID => $self->input_id, -ANALYSIS => $self->fetch_analysis("EXONERATE_CLONE_ENDS"), ); $exonerate->fetch_input(\@listOfIDs); $exonerate->run(); # Run exonerate and get the output.
my $clone_alignments = $exonerate->output(); my @selected_alignments = @{$self->filter_alignments($clone_alignments,\% clones,\% cloneEnd_ids)}; foreach my $selected_alignment(@selected_alignments){ if ($selected_alignment ne ''){ my $clone_id = $selected_alignment->hseqname; my $chr_id = $selected_alignment->seqname; my $start = ($selected_alignment->start)-2000; my $end = ($selected_alignment->end)+2000; my @chr_name = split (/:/, $chr_id); # Rebuild the input_id to send the coordinates for the target slice and the query sequence.
my $input_id = $chr_name[0].":".$chr_name[1].":".$chr_name[2].":".$start.":".$end.":".$chr_name[5].":".$clone_id; my $refine = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new( -DB => $self->db, -INPUT_ID => $input_id, -ANALYSIS => $self->fetch_analysis("REFINE_CLONE_ENDS"), ); $refine ->fetch_input(); $refine ->run(); $self->refined_results($refine); } } } ##########################################################################
}
single_filter_alignmentsdescriptionprevnextTop
sub single_filter_alignments {
  my ($self, $single_clone_alignments) = @_;
 
  # Cluster alignments obtained for each clone
my %chr_cluster = (); foreach my $alignment(@{$single_clone_alignments}){ my $hit_name= $alignment->hseqname; my $chr_name= $alignment->seqname; # Generate a unique key that will be found only when only clone aligns more than
# once against the same chromosomal sequence
my $cluster_key = $chr_name."-".$hit_name; # check if there is any sequence aligned from the same clone against the chromosome
# In case there isn't, start the count of alignments for that clone.
if(!$chr_cluster{$cluster_key}){ $chr_cluster{$cluster_key} = []; } push (@{$chr_cluster{$cluster_key}}, $alignment); } my $selected_alignment = ""; my $cluster_size = 0; my $biggest_score = 0; # For each clone check how many clusters were obtained.
# Check the clusters and selected the best alignment cluster
# to be used as template for the more exhaustive exonerate alignment.
foreach my $chr_cluster_key (keys %chr_cluster){ # If the cluster contains more than one sequence, order the sequences and check if they are
# separated by less than 1000 bases. If the separation is bigger don't count them as part of the cluster
if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){ # counter to get the real number of alignment within the cluster.
my $size_counter = 0; # get the first aligment of the cluster to be used in the exonerate step
my $first_selected_alignment; # order the alignments in the clster to be able to check the real distance among them
my @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}}; my $previous_end = 0; my $previous_alignment; my $end_of_cluster = 0; my %cluster_alignments = (); # Use this value to avoid the comparison of the first alignment against null, that in case the start
# point of the alignment is lower than the cutoff which lead to the clustering of the first alignment
# with a NULL alignment
my $first_alignment = 0; foreach my $ordered_align(@ordered_aligns){ # if the two condiditions are acomplished check if the alignments form a cluster
if ($first_alignment!=0 && $end_of_cluster == 0){ my $cloneEnd_distance = abs($ordered_align->start)-($previous_end); # if the distance is sorter that the threshold the alignments cluster
if($cloneEnd_distance<4000){ if(!$cluster_alignments{$ordered_align->hseqname}){ $cluster_alignments{$ordered_align->hseqname} = []; push (@{$cluster_alignments{$ordered_align->hseqname}}, $previous_alignment); $size_counter++; $first_selected_alignment = $previous_alignment; } push (@{$cluster_alignments{$ordered_align->hseqname}}, $ordered_align); $size_counter++; }elsif($cluster_alignments{$ordered_align->hseqname}){ $end_of_cluster = 1; } $previous_alignment = $ordered_align; $previous_end = $ordered_align->end; } # Change this value after the first alignment is checked.
if ($first_alignment == 0){ $first_alignment = 1; $previous_alignment = $ordered_align; $previous_end = $ordered_align->end; } } if ($size_counter > $cluster_size){ $cluster_size= $size_counter; $selected_alignment = $first_selected_alignment; } } elsif($cluster_size==0 && ${$chr_cluster{$chr_cluster_key}}[0]->score()> $biggest_score){ $biggest_score = ${$chr_cluster{$chr_cluster_key}}[0]->score(); $selected_alignment = $chr_cluster{$chr_cluster_key}[0]; } } return\$ selected_alignment;
}
write_outputdescriptionprevnextTop
sub write_output {
  my ( $self, @output ) = @_;
   
  foreach my $refine_object( @{ $self->refined_results }) {
      
      $refine_object->write_output();
  }
}


############################################################################################
}
General documentation
CONTACTTop
Post general queries, or on how to get chunker.pl, to <ensembl-dev@ebi.ac.uk>
APPENDIXTop