MapCloneEnds documentation.

This object maps clone sequences to a genome by using the
exonerate alignment program,and write the results as Dna Align Features.
It relies on the following modules:
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds;
Bio::EnsEMBL::Analysis::Runnable::ExonerateCloneEnds;
In the analysis table, you have to add 3 analysis, one that calls
MapCloneEnds, and two for the different exonerate steps:
MapCloneEnds (module -> MapCloneEnds),
EXONERATE_CLONE_ENDS (module -> MapCloneEnds),
REFINE_CLONE_ENDS(module -> MapCloneEnds)
MapCloneEnds reads the input_id_chunks from a file where each line
contains a number of IDs separated by ":". In order to be able to correctly
parse the input ids, they should be given in a very specific format. The file
can be automaticaly generated from an XML using chunker.pl or manually generated
where each id should be in the format:
Clone_ID,length_of_clone,standard_deviation_of_length,Clone_end_ID,Direction_of_Clone
(i.e. CH243-307D10,184000.0,36800.0,1098421033278,F).

sub clean_clusters {

  my ($self, $clone_cluster_alignments) = @_;
 
  # Cluster alignments obtained for each clone
  my %chr_cluster = ();
  
  foreach my $alignment(@{$clone_cluster_alignments}){
     
    my $hit_name= $alignment->hseqname;
    my $chr_name= $alignment->seqname;
  
    # Generate a unique key that will be found only when a cloneEnd aligns more than
    # once against the same chromosomal sequence
    my $cluster_key = $chr_name."-".$hit_name;

    # check if there is any sequence aligned from the same clone against the chromosome
    # In case there isn't, start the count of alignments for that clone.
    if(!$chr_cluster{$cluster_key}){
      $chr_cluster{$cluster_key} = [];
    }
    push (@{$chr_cluster{$cluster_key}}, $alignment);
  }

  my @selection = ();

  # For each clone check how many clusters were obtained.
  # Check the clusters and selected the best alignment cluster
  # to be used as template for the more exhaustive exonerate alignment.

  foreach my $chr_cluster_key (keys %chr_cluster){
    # If the cluster contains more than one sequence, order the sequences and check if they are
    # separated by less than 4000 bases. If the separation is bigger don't count them as part of the cluster

    if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){
      # get the first aligment of the cluster to be used in the exonerate step
      my $first_selected_alignment;

      # order the alignments in the cluster to be able to check the real distance among them
      my  @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}};    

      my $initial_end = 0;
      my $initial_alignment;
    
      my %cluster_alignments = ();

      # Use this value to avoid the comparison of the first alignment against null, that in case the start
      # point of the alignment is lower than the cutoff which lead to the clustering of the first alignment
      # with a NULL alignment
      my $first_alignment = 0;
      my $alignment_added = 0;
      
      foreach my $ordered_align(@ordered_aligns){
        
        # if the two condiditions are acomplished check if the alignments form a cluster
        if ($first_alignment!=0){
     
	  my $cloneEnd_distance = abs(($ordered_align->start)-($initial_end));
          # if the distance is sorter that the threshold the alignments cluster
          if($cloneEnd_distance < 4000){
 
            $initial_alignment = $ordered_align;
            $initial_end = $ordered_align->end;
            if ($alignment_added==0){
              push (@selection, $initial_alignment);
	      $alignment_added = 1;
	    }
          }else{
            push (@selection, $ordered_align);
            $initial_alignment = $ordered_align;
            $initial_end = $ordered_align->end;         
          }
        }
        # Change this value after the first alignment is checked.
        if ($first_alignment == 0){
          $first_alignment = 1;
          $initial_alignment = $ordered_align;
          $initial_end = $ordered_align->end;
        }
      }

    }else{
      push (@selection, $chr_cluster{$chr_cluster_key}[0]);
    }
  }
  return\@ selection;

}

sub filter_alignments {

  my ($self, $clone_alignments, $clones_ref, $cloneEnd_ids_ref ) = @_;

  my %clone_cluster = ();
  my %aligned_clones = (); 
  my %clones = %{$clones_ref};
  my %cloneEnd_ids = %{$cloneEnd_ids_ref};
  my @selected_alignments;

  # Group alignments by clone name
  foreach my $clone_alignment(@{$clone_alignments}){
   
    # Get the cloneEnds name
    my $clone_name = $clone_alignment->hseqname();

    # Get the clone id to which the cloneEnds belongs
    my $complete_clone_name = $cloneEnd_ids{$clone_name};

    # Store the clone name in a hash, so only selected clones will be checked in the next step.
    # the object stored in the hash is not important, what we really want to get is a list of 
    # unique clones(complete_clone_name) with no duplicate entries
    $aligned_clones{$complete_clone_name} = $clone_name;


    if(!$clone_cluster{$clone_name}){
      $clone_cluster{$clone_name} = [];
    }
    push (@{$clone_cluster{$clone_name}}, $clone_alignment);
  }


  foreach my $pair (keys %aligned_clones){
    
    # Check if the clone has two cloneEnds
    if ((scalar @{$clones{$pair}})/4 > 1){

      # Check if at least one of the cluster pair alignments is selected.
      # in case none is selected it will send cloneEnds to the single_filter
      my $cluster_selected = 0;

      my %clean_cloneEnds = ();
     
      my %status = ();

      my %clone_dir = ();

      # Get the length of the clone and the name of the two cloneEnds
      my $clone_length = $clones{$pair}[1]+$clones{$pair}[2]+200000;
      for (my $numberOfEnd=0;$numberOfEnd < scalar @{$clones{$pair}};$numberOfEnd+=4){
       
        my $cloneEnd =$clones{$pair}[$numberOfEnd];

        # Clone_dir used to avoid pairing of two cloneEnds in the same end as for some clones
        # there are more than two cloneEnds where more than one correspond to the same end that
        # was sequenced more than once.
        $clone_dir{$numberOfEnd} = $clones{$pair}[$numberOfEnd+3];
	
        my @cloneEnd_clean  = @{$self->clean_clusters($clone_cluster{$cloneEnd})};
	if (!$clean_cloneEnds{$numberOfEnd}){
          $clean_cloneEnds{$numberOfEnd} =\@ cloneEnd_clean;
        }
       
        # Set initial status of cloneEnd to 0 which means that none of its alignments pair with the other cloneEnd
        if (!$status{$numberOfEnd}){
	  $status{$numberOfEnd} = 0;
        }
      }

      foreach my $clean_cloneEnd1 (keys %clean_cloneEnds){

        foreach my $clean_cloneEnd2 (keys %clean_cloneEnds){
          # first check that one cloneEnd is not compared with itself. Then check that if we have pair
          # A->B don't use again B->A and finally get only pairs of F + R cloneEnds
      	  if ($clean_cloneEnd1 != $clean_cloneEnd2 && $clean_cloneEnd1 < $clean_cloneEnd2 
              && $clone_dir{$clean_cloneEnd1} ne $clone_dir{$clean_cloneEnd2}){

            # Use this variable to check where an alignment occur and avoid duplicate alignments because of
            # short consecutive sequences that align near by and are both selected for the next exonerate step
	    my $first_prev_chr_start = 0;
            my $first_chr_hit = 'Null';

            foreach my $fa (@{$clean_cloneEnds{$clean_cloneEnd1}}){

              # Check if two alignments don't belong to a cluster or they align in different chromosomes.
              # This is made to avoid getting duplicate alignments in the next exonerate step when very close
              # coordinates are selected for the target sequence
	      if (($fa->seqname() eq $first_chr_hit && ($fa->start()-$first_prev_chr_start) > 4000)
                   || $fa->seqname() ne $first_chr_hit){

	        my $second_prev_chr_start = 0;

                foreach my $sa (@{$clean_cloneEnds{$clean_cloneEnd2}}){
	
                  if ($fa->seqname() eq $sa->seqname()){

                    my $diff = ($fa->start())-($sa->end());
                    my $abs_diff = abs($diff);

                    # Check that the two cloneEnds are close enough as to be considered as paired and that
                    # the second cloneEnd don't belong to a cluster that was previously selected
                    # This is made to avoid the same cloneEnds to be paired more than once when there is more 
                    # than one alignment in a short region.
                    if ($abs_diff <= $clone_length  && (($sa->start()-$second_prev_chr_start)>4000)){
             
                      # In case two alignments are selected store them for the next exonerate step
                      push (@selected_alignments, $fa);
                      push (@selected_alignments, $sa); 
                    
                      $status{$clean_cloneEnd1} = 1;
                      $status{$clean_cloneEnd2} = 1;
                      $cluster_selected = 1;
                      $first_prev_chr_start = $fa->start();
                      $second_prev_chr_start = $sa->start();
                      $first_chr_hit= $fa->seqname();
     	            }
	          }
	        }
              }
            }
          }
        }
      }

      foreach my $clone_status (keys %status){
	if ($status{$clone_status} == 0){

          my $cloneEnd =$clones{$pair}[$clone_status];
          my $selected_align = $self->single_filter_alignments($clone_cluster{$cloneEnd});
          push (@selected_alignments, ${$selected_align});
        
        }
      }
    }else{
      # if the clone has only one cloneEnd do:
      # get the name of the cloneEnds,
      my $first_cloneEnd =$clones{$pair}[0];
      # send all the alignments for that cloneEnd stored in clone_cluster to the single_filter_alignment 
      my $selected_align = $self->single_filter_alignments($clone_cluster{$first_cloneEnd});
      # add the result of filtering to the selected alignments array.
      push (@selected_alignments, ${$selected_align});
      
    }
  }
  return\@ selected_alignments;

}

sub run {

  my ($self) = @_;

  my $trace_name = '';  # CloneEnd id as in database
  my $clone_id = '';# Clone id
  my $insert_size = '';# Length of the inserted sequence in clone
  my $insert_stdev = '';# Standard deviation of the clone insert. 
  # The two insert values will be used in the filter process to check quality of alignments

  # You need to run  perl ensembl-personal/jb16/scripts/general/chunker.pl in order to create the 
  # input_analysis_ids and create this file. To change the path to the list of chunks, you should also do it in
  # the chunker script
 
  my $chunks_list = $self->CHUNKSLIST;

  my %clones  = (); # hash where each clone object will be stored

  my %cloneEnd_ids = ();

  # Open the file with the list of seq_ids per chunk and load it into an array to be used by fetch_input
  open (INFILE,"<$chunks_list");
  my @listOfIDs = ();

  while (my $line = <INFILE>){

    chomp($line);
 
    # Split by ":" to get information of each clone as an independent element in an array. 
    my @clone = split (/:/,$line);
    my $listOfClones = '';

    foreach my $clone (@clone){

      # Split all the information of one cloneEnd and load each element into an array
      my @clone_data = split (/,/,$clone);

      if (!$clones{$clone_data[0]}){
        $clones{$clone_data[0]}=[];
      }
    
      # Group information of each cloneEnd using clone_id as reference (will be used in the filter step)  
      push (@{$clones{$clone_data[0]}}, $clone_data[3]); # Add cloneEnd_name
      push (@{$clones{$clone_data[0]}}, $clone_data[1]); # Add insert length
      push (@{$clones{$clone_data[0]}}, $clone_data[2]); # Add insert length standard deviation
      push (@{$clones{$clone_data[0]}}, $clone_data[4]); # Add cloneEnd direction (R or F)
      $cloneEnd_ids{$clone_data[3]} = $clone_data[0];

      if ($listOfClones eq ''){
	$listOfClones.= $clone_data[3];
      }else{
        $listOfClones.= ":".$clone_data[3];
      }
    }

    # Creates the array that contains the list of cloneEnd_ids that will go in each chunks
    push (@listOfIDs,$listOfClones);
  }

  close INFILE;

  # Run ExonerateClones to get a first idea on the aligned/location of the clone Ends 
    my $exonerate = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new(
                    -DB          => $self->db,
                    -INPUT_ID    => $self->input_id,
                    -ANALYSIS    => $self->fetch_analysis("EXONERATE_CLONE_ENDS"),
                    );

    $exonerate->fetch_input(\@listOfIDs);

    $exonerate->run();
  
  # Run exonerate and get the output.
  my $clone_alignments = $exonerate->output();

  my @selected_alignments = @{$self->filter_alignments($clone_alignments,\% clones,\% cloneEnd_ids)};

  foreach my $selected_alignment(@selected_alignments){
    
    if ($selected_alignment ne ''){
    
      my $clone_id = $selected_alignment->hseqname;
      my $chr_id   = $selected_alignment->seqname;
      my $start    = ($selected_alignment->start)-2000;
      my $end      = ($selected_alignment->end)+2000;

      my @chr_name = split (/:/, $chr_id);
  
      # Rebuild the input_id to send the coordinates for the target slice and the query sequence.
      my $input_id = $chr_name[0].":".$chr_name[1].":".$chr_name[2].":".$start.":".$end.":".$chr_name[5].":".$clone_id;

      my $refine = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new(
                   -DB          => $self->db,
                   -INPUT_ID    => $input_id,
                   -ANALYSIS    => $self->fetch_analysis("REFINE_CLONE_ENDS"),
                   );
    
      $refine ->fetch_input();
      $refine ->run();
      $self->refined_results($refine);
    }
  }
}

##########################################################################

}

sub single_filter_alignments {

  my ($self, $single_clone_alignments) = @_;
 
  # Cluster alignments obtained for each clone
  my %chr_cluster = ();
  
  foreach my $alignment(@{$single_clone_alignments}){
   
    my $hit_name= $alignment->hseqname;
    my $chr_name= $alignment->seqname;
    
    # Generate a unique key that will be found only when only clone aligns more than
    # once against the same chromosomal sequence
    my $cluster_key = $chr_name."-".$hit_name;

    # check if there is any sequence aligned from the same clone against the chromosome
    # In case there isn't, start the count of alignments for that clone.
    if(!$chr_cluster{$cluster_key}){
      $chr_cluster{$cluster_key} = [];
    }
    push (@{$chr_cluster{$cluster_key}}, $alignment);
  }

  my $selected_alignment = "";
  my $cluster_size = 0;
  my $biggest_score = 0;

  # For each clone check how many clusters were obtained.
  # Check the clusters and selected the best alignment cluster
  # to be used as template for the more exhaustive exonerate alignment.

  foreach my $chr_cluster_key (keys %chr_cluster){
    # If the cluster contains more than one sequence, order the sequences and check if they are
    # separated by less than 1000 bases. If the separation is bigger don't count them as part of the cluster
    if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){

      # counter to get the real number of alignment within the cluster.
      my $size_counter = 0;

      # get the first aligment of the cluster to be used in the exonerate step
      my $first_selected_alignment;

      # order the alignments in the clster to be able to check the real distance among them
      my  @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}};    

      my $previous_end = 0;
      my $previous_alignment;

      my $end_of_cluster = 0;
     
      my %cluster_alignments = ();

      # Use this value to avoid the comparison of the first alignment against null, that in case the start
      # point of the alignment is lower than the cutoff which lead to the clustering of the first alignment
      # with a NULL alignment
      my $first_alignment = 0;
      
      foreach my $ordered_align(@ordered_aligns){
        
        # if the two condiditions are acomplished check if the alignments form a cluster
        if ($first_alignment!=0 && $end_of_cluster == 0){
	  my $cloneEnd_distance = abs($ordered_align->start)-($previous_end);
          # if the distance is sorter that the threshold the alignments cluster
          if($cloneEnd_distance<4000){
          
            if(!$cluster_alignments{$ordered_align->hseqname}){
              $cluster_alignments{$ordered_align->hseqname} = [];
              push (@{$cluster_alignments{$ordered_align->hseqname}}, $previous_alignment);
              $size_counter++;  
              $first_selected_alignment = $previous_alignment;
            }
            push (@{$cluster_alignments{$ordered_align->hseqname}}, $ordered_align);
            $size_counter++;
         
          }elsif($cluster_alignments{$ordered_align->hseqname}){
            $end_of_cluster = 1;
            
	  }
          $previous_alignment = $ordered_align;
          $previous_end = $ordered_align->end;
        }

        # Change this value after the first alignment is checked.
        if ($first_alignment == 0){
          $first_alignment = 1;
          $previous_alignment = $ordered_align;
          $previous_end = $ordered_align->end;
        }
      }
      if ($size_counter > $cluster_size){
        $cluster_size= $size_counter;
        $selected_alignment = $first_selected_alignment;

      } 
    }
    elsif($cluster_size==0 && ${$chr_cluster{$chr_cluster_key}}[0]->score()> $biggest_score){
      $biggest_score = ${$chr_cluster{$chr_cluster_key}}[0]->score();
      $selected_alignment = $chr_cluster{$chr_cluster_key}[0];
    }
  }
  return\$ selected_alignment;

}

CHUNKSLIST	No description	Code
DNADB	No description	Code
GENOMICSEQS	No description	Code
IIDREGEXP	No description	Code
OPTIONS	No description	Code
OUTDB	No description	Code
QUERYSEQS	No description	Code
QUERYTYPE	No description	Code
clean_clusters	No description	Code
fetch_analysis	No description	Code
fetch_input	No description	Code
filter_alignments	No description	Code
new	No description	Code
read_and_check_config	No description	Code
refined_results	No description	Code
run	No description	Code
single_filter_alignments	No description	Code
write_output	No description	Code