Raw content of Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds; =head1 SYNOPSIS my $clonemap = Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds->new( -db => $refdb, -analysis => $analysis_obj, -database => $EST_GENOMIC, ); $clonemap->fetch_input(); $clonemap->run(); $clonemap->write_output(); #writes to DB =head1 DESCRIPTION This object maps clone sequences to a genome by using the exonerate alignment program,and write the results as Dna Align Features. It relies on the following modules: Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds; Bio::EnsEMBL::Analysis::Runnable::ExonerateCloneEnds; In the analysis table, you have to add 3 analysis, one that calls MapCloneEnds, and two for the different exonerate steps: MapCloneEnds (module -> MapCloneEnds), EXONERATE_CLONE_ENDS (module -> MapCloneEnds), REFINE_CLONE_ENDS(module -> MapCloneEnds) MapCloneEnds reads the input_id_chunks from a file where each line contains a number of IDs separated by ":". In order to be able to correctly parse the input ids, they should be given in a very specific format. The file can be automaticaly generated from an XML using chunker.pl or manually generated where each id should be in the format: Clone_ID,length_of_clone,standard_deviation_of_length,Clone_end_ID,Direction_of_Clone (i.e. CH243-307D10,184000.0,36800.0,1098421033278,F). =head1 CONTACT Post general queries, or on how to get chunker.pl, to <ensembl-dev@ebi.ac.uk> =head1 APPENDIX =cut package Bio::EnsEMBL::Analysis::RunnableDB::MapCloneEnds; use strict; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds; use Bio::EnsEMBL::Analysis::Config::ExonerateCloneEnds; use vars qw(@ISA); @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB); ########################################################################## sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->{_refined_results}=[] ; $self->read_and_check_config($CLONE_CONFIG); return $self; } ########################################################################## sub fetch_input { # It does nothing so no really code implementation here. my ($self) = @_; } ########################################################################## sub run { my ($self) = @_; my $trace_name = ''; # CloneEnd id as in database my $clone_id = '';# Clone id my $insert_size = '';# Length of the inserted sequence in clone my $insert_stdev = '';# Standard deviation of the clone insert. # The two insert values will be used in the filter process to check quality of alignments # You need to run perl ensembl-personal/jb16/scripts/general/chunker.pl in order to create the # input_analysis_ids and create this file. To change the path to the list of chunks, you should also do it in # the chunker script my $chunks_list = $self->CHUNKSLIST; my %clones = (); # hash where each clone object will be stored my %cloneEnd_ids = (); # Open the file with the list of seq_ids per chunk and load it into an array to be used by fetch_input open (INFILE,"<$chunks_list"); my @listOfIDs = (); while (my $line = <INFILE>){ chomp($line); # Split by ":" to get information of each clone as an independent element in an array. my @clone = split (/:/,$line); my $listOfClones = ''; foreach my $clone (@clone){ # Split all the information of one cloneEnd and load each element into an array my @clone_data = split (/,/,$clone); if (!$clones{$clone_data[0]}){ $clones{$clone_data[0]}=[]; } # Group information of each cloneEnd using clone_id as reference (will be used in the filter step) push (@{$clones{$clone_data[0]}}, $clone_data[3]); # Add cloneEnd_name push (@{$clones{$clone_data[0]}}, $clone_data[1]); # Add insert length push (@{$clones{$clone_data[0]}}, $clone_data[2]); # Add insert length standard deviation push (@{$clones{$clone_data[0]}}, $clone_data[4]); # Add cloneEnd direction (R or F) $cloneEnd_ids{$clone_data[3]} = $clone_data[0]; if ($listOfClones eq ''){ $listOfClones.= $clone_data[3]; }else{ $listOfClones.= ":".$clone_data[3]; } } # Creates the array that contains the list of cloneEnd_ids that will go in each chunks push (@listOfIDs,$listOfClones); } close INFILE; # Run ExonerateClones to get a first idea on the aligned/location of the clone Ends my $exonerate = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new( -DB => $self->db, -INPUT_ID => $self->input_id, -ANALYSIS => $self->fetch_analysis("EXONERATE_CLONE_ENDS"), ); $exonerate->fetch_input(\@listOfIDs); $exonerate->run(); # Run exonerate and get the output. my $clone_alignments = $exonerate->output(); my @selected_alignments = @{$self->filter_alignments($clone_alignments, \%clones, \%cloneEnd_ids)}; foreach my $selected_alignment(@selected_alignments){ if ($selected_alignment ne ''){ my $clone_id = $selected_alignment->hseqname; my $chr_id = $selected_alignment->seqname; my $start = ($selected_alignment->start)-2000; my $end = ($selected_alignment->end)+2000; my @chr_name = split (/:/, $chr_id); # Rebuild the input_id to send the coordinates for the target slice and the query sequence. my $input_id = $chr_name[0].":".$chr_name[1].":".$chr_name[2].":".$start.":".$end.":".$chr_name[5].":".$clone_id; my $refine = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateCloneEnds->new( -DB => $self->db, -INPUT_ID => $input_id, -ANALYSIS => $self->fetch_analysis("REFINE_CLONE_ENDS"), ); $refine ->fetch_input(); $refine ->run(); $self->refined_results($refine); } } } ########################################################################## sub write_output { my ( $self, @output ) = @_; foreach my $refine_object( @{ $self->refined_results }) { $refine_object->write_output(); } } ############################################################################################ sub fetch_analysis{ my ($self, $logic_name) = @_; my $db = $self->db; my $analysis_adaptor= $db->get_AnalysisAdaptor; my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name); return $analysis; } sub refined_results{ my ($self, $refine_result) = @_; # if(!$self->{'refined_results'}){ # $self->{'refined_results'}=[]; # } if ($refine_result) { push @{$self->{_refined_results}}, $refine_result ; } return $self->{_refined_results}; } sub read_and_check_config { my $self = shift; $self->SUPER::read_and_check_config($CLONE_CONFIG); ########## # CHECKS ########## my $logic = $self->analysis->logic_name; # check that compulsory options have values foreach my $config_var ( qw( CHUNKSLIST ) ){ if ( not defined $self->$config_var ){ throw("You must define $config_var in config for logic '$logic'"); } } } sub QUERYSEQS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_QUERYSEQS'} = $value; } if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) { return $self->{'_CONFIG_QUERYSEQS'}; } else { return undef; } } sub QUERYTYPE { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_QUERYTYPE'} = $value; } if ( exists( $self->{'_CONFIG_QUERYTYPE'} ) ) { return $self->{'_CONFIG_QUERYTYPE'}; } else { return undef; } } sub GENOMICSEQS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_GENOMICSEQS'} = $value; } if ( exists( $self->{'_CONFIG_GENOMICSEQS'} ) ) { return $self->{'_CONFIG_GENOMICSEQS'}; } else { return undef; } } sub IIDREGEXP { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_IIDREGEXP'} = $value; } if ( exists( $self->{'_CONFIG_IIDREGEXP'} ) ) { return $self->{'_CONFIG_IIDREGEXP'}; } else { return undef; } } sub OUTDB { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_OUTDB'} = $value; } if ( exists( $self->{'_CONFIG_OUTDB'} ) ) { return $self->{'_CONFIG_OUTDB'}; } else { return undef; } } sub DNADB { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_DNADB'} = $value; } if ( exists( $self->{'_CONFIG_DNADB'} ) ) { return $self->{'_CONFIG_DNADB'}; } else { return undef; } } sub OPTIONS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_OPTIONS'} = $value; } if ( exists( $self->{'_CONFIG_OPTIONS'} ) ) { return $self->{'_CONFIG_OPTIONS'}; } else { return undef; } } sub CHUNKSLIST { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_CHUNKSLIST'} = $value; } if ( exists( $self->{'_CONFIG_CHUNKSLIST'} ) ) { return $self->{'_CONFIG_CHUNKSLIST'}; } else { return undef; } } sub filter_alignments{ my ($self, $clone_alignments, $clones_ref, $cloneEnd_ids_ref ) = @_; my %clone_cluster = (); my %aligned_clones = (); my %clones = %{$clones_ref}; my %cloneEnd_ids = %{$cloneEnd_ids_ref}; my @selected_alignments; # Group alignments by clone name foreach my $clone_alignment(@{$clone_alignments}){ # Get the cloneEnds name my $clone_name = $clone_alignment->hseqname(); # Get the clone id to which the cloneEnds belongs my $complete_clone_name = $cloneEnd_ids{$clone_name}; # Store the clone name in a hash, so only selected clones will be checked in the next step. # the object stored in the hash is not important, what we really want to get is a list of # unique clones(complete_clone_name) with no duplicate entries $aligned_clones{$complete_clone_name} = $clone_name; if(!$clone_cluster{$clone_name}){ $clone_cluster{$clone_name} = []; } push (@{$clone_cluster{$clone_name}}, $clone_alignment); } foreach my $pair (keys %aligned_clones){ # Check if the clone has two cloneEnds if ((scalar @{$clones{$pair}})/4 > 1){ # Check if at least one of the cluster pair alignments is selected. # in case none is selected it will send cloneEnds to the single_filter my $cluster_selected = 0; my %clean_cloneEnds = (); my %status = (); my %clone_dir = (); # Get the length of the clone and the name of the two cloneEnds my $clone_length = $clones{$pair}[1]+$clones{$pair}[2]+200000; for (my $numberOfEnd=0;$numberOfEnd < scalar @{$clones{$pair}};$numberOfEnd+=4){ my $cloneEnd =$clones{$pair}[$numberOfEnd]; # Clone_dir used to avoid pairing of two cloneEnds in the same end as for some clones # there are more than two cloneEnds where more than one correspond to the same end that # was sequenced more than once. $clone_dir{$numberOfEnd} = $clones{$pair}[$numberOfEnd+3]; my @cloneEnd_clean = @{$self->clean_clusters($clone_cluster{$cloneEnd})}; if (!$clean_cloneEnds{$numberOfEnd}){ $clean_cloneEnds{$numberOfEnd} = \@cloneEnd_clean; } # Set initial status of cloneEnd to 0 which means that none of its alignments pair with the other cloneEnd if (!$status{$numberOfEnd}){ $status{$numberOfEnd} = 0; } } foreach my $clean_cloneEnd1 (keys %clean_cloneEnds){ foreach my $clean_cloneEnd2 (keys %clean_cloneEnds){ # first check that one cloneEnd is not compared with itself. Then check that if we have pair # A->B don't use again B->A and finally get only pairs of F + R cloneEnds if ($clean_cloneEnd1 != $clean_cloneEnd2 && $clean_cloneEnd1 < $clean_cloneEnd2 && $clone_dir{$clean_cloneEnd1} ne $clone_dir{$clean_cloneEnd2}){ # Use this variable to check where an alignment occur and avoid duplicate alignments because of # short consecutive sequences that align near by and are both selected for the next exonerate step my $first_prev_chr_start = 0; my $first_chr_hit = 'Null'; foreach my $fa (@{$clean_cloneEnds{$clean_cloneEnd1}}){ # Check if two alignments don't belong to a cluster or they align in different chromosomes. # This is made to avoid getting duplicate alignments in the next exonerate step when very close # coordinates are selected for the target sequence if (($fa->seqname() eq $first_chr_hit && ($fa->start()-$first_prev_chr_start) > 4000) || $fa->seqname() ne $first_chr_hit){ my $second_prev_chr_start = 0; foreach my $sa (@{$clean_cloneEnds{$clean_cloneEnd2}}){ if ($fa->seqname() eq $sa->seqname()){ my $diff = ($fa->start())-($sa->end()); my $abs_diff = abs($diff); # Check that the two cloneEnds are close enough as to be considered as paired and that # the second cloneEnd don't belong to a cluster that was previously selected # This is made to avoid the same cloneEnds to be paired more than once when there is more # than one alignment in a short region. if ($abs_diff <= $clone_length && (($sa->start()-$second_prev_chr_start)>4000)){ # In case two alignments are selected store them for the next exonerate step push (@selected_alignments, $fa); push (@selected_alignments, $sa); $status{$clean_cloneEnd1} = 1; $status{$clean_cloneEnd2} = 1; $cluster_selected = 1; $first_prev_chr_start = $fa->start(); $second_prev_chr_start = $sa->start(); $first_chr_hit= $fa->seqname(); } } } } } } } } foreach my $clone_status (keys %status){ if ($status{$clone_status} == 0){ my $cloneEnd =$clones{$pair}[$clone_status]; my $selected_align = $self->single_filter_alignments($clone_cluster{$cloneEnd}); push (@selected_alignments, ${$selected_align}); } } }else{ # if the clone has only one cloneEnd do: # get the name of the cloneEnds, my $first_cloneEnd =$clones{$pair}[0]; # send all the alignments for that cloneEnd stored in clone_cluster to the single_filter_alignment my $selected_align = $self->single_filter_alignments($clone_cluster{$first_cloneEnd}); # add the result of filtering to the selected alignments array. push (@selected_alignments, ${$selected_align}); } } return \@selected_alignments; } sub single_filter_alignments{ my ($self, $single_clone_alignments) = @_; # Cluster alignments obtained for each clone my %chr_cluster = (); foreach my $alignment(@{$single_clone_alignments}){ my $hit_name= $alignment->hseqname; my $chr_name= $alignment->seqname; # Generate a unique key that will be found only when only clone aligns more than # once against the same chromosomal sequence my $cluster_key = $chr_name."-".$hit_name; # check if there is any sequence aligned from the same clone against the chromosome # In case there isn't, start the count of alignments for that clone. if(!$chr_cluster{$cluster_key}){ $chr_cluster{$cluster_key} = []; } push (@{$chr_cluster{$cluster_key}}, $alignment); } my $selected_alignment = ""; my $cluster_size = 0; my $biggest_score = 0; # For each clone check how many clusters were obtained. # Check the clusters and selected the best alignment cluster # to be used as template for the more exhaustive exonerate alignment. foreach my $chr_cluster_key (keys %chr_cluster){ # If the cluster contains more than one sequence, order the sequences and check if they are # separated by less than 1000 bases. If the separation is bigger don't count them as part of the cluster if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){ # counter to get the real number of alignment within the cluster. my $size_counter = 0; # get the first aligment of the cluster to be used in the exonerate step my $first_selected_alignment; # order the alignments in the clster to be able to check the real distance among them my @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}}; my $previous_end = 0; my $previous_alignment; my $end_of_cluster = 0; my %cluster_alignments = (); # Use this value to avoid the comparison of the first alignment against null, that in case the start # point of the alignment is lower than the cutoff which lead to the clustering of the first alignment # with a NULL alignment my $first_alignment = 0; foreach my $ordered_align(@ordered_aligns){ # if the two condiditions are acomplished check if the alignments form a cluster if ($first_alignment!=0 && $end_of_cluster == 0){ my $cloneEnd_distance = abs($ordered_align->start)-($previous_end); # if the distance is sorter that the threshold the alignments cluster if($cloneEnd_distance<4000){ if(!$cluster_alignments{$ordered_align->hseqname}){ $cluster_alignments{$ordered_align->hseqname} = []; push (@{$cluster_alignments{$ordered_align->hseqname}}, $previous_alignment); $size_counter++; $first_selected_alignment = $previous_alignment; } push (@{$cluster_alignments{$ordered_align->hseqname}}, $ordered_align); $size_counter++; }elsif($cluster_alignments{$ordered_align->hseqname}){ $end_of_cluster = 1; } $previous_alignment = $ordered_align; $previous_end = $ordered_align->end; } # Change this value after the first alignment is checked. if ($first_alignment == 0){ $first_alignment = 1; $previous_alignment = $ordered_align; $previous_end = $ordered_align->end; } } if ($size_counter > $cluster_size){ $cluster_size= $size_counter; $selected_alignment = $first_selected_alignment; } } elsif($cluster_size==0 && ${$chr_cluster{$chr_cluster_key}}[0]->score()> $biggest_score){ $biggest_score = ${$chr_cluster{$chr_cluster_key}}[0]->score(); $selected_alignment = $chr_cluster{$chr_cluster_key}[0]; } } return \$selected_alignment; } sub clean_clusters{ my ($self, $clone_cluster_alignments) = @_; # Cluster alignments obtained for each clone my %chr_cluster = (); foreach my $alignment(@{$clone_cluster_alignments}){ my $hit_name= $alignment->hseqname; my $chr_name= $alignment->seqname; # Generate a unique key that will be found only when a cloneEnd aligns more than # once against the same chromosomal sequence my $cluster_key = $chr_name."-".$hit_name; # check if there is any sequence aligned from the same clone against the chromosome # In case there isn't, start the count of alignments for that clone. if(!$chr_cluster{$cluster_key}){ $chr_cluster{$cluster_key} = []; } push (@{$chr_cluster{$cluster_key}}, $alignment); } my @selection = (); # For each clone check how many clusters were obtained. # Check the clusters and selected the best alignment cluster # to be used as template for the more exhaustive exonerate alignment. foreach my $chr_cluster_key (keys %chr_cluster){ # If the cluster contains more than one sequence, order the sequences and check if they are # separated by less than 4000 bases. If the separation is bigger don't count them as part of the cluster if (scalar(@{$chr_cluster{$chr_cluster_key}})>1){ # get the first aligment of the cluster to be used in the exonerate step my $first_selected_alignment; # order the alignments in the cluster to be able to check the real distance among them my @ordered_aligns = sort{$a->hstart() <=> $b->hstart()} @{$chr_cluster{$chr_cluster_key}}; my $initial_end = 0; my $initial_alignment; my %cluster_alignments = (); # Use this value to avoid the comparison of the first alignment against null, that in case the start # point of the alignment is lower than the cutoff which lead to the clustering of the first alignment # with a NULL alignment my $first_alignment = 0; my $alignment_added = 0; foreach my $ordered_align(@ordered_aligns){ # if the two condiditions are acomplished check if the alignments form a cluster if ($first_alignment!=0){ my $cloneEnd_distance = abs(($ordered_align->start)-($initial_end)); # if the distance is sorter that the threshold the alignments cluster if($cloneEnd_distance < 4000){ $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; if ($alignment_added==0){ push (@selection, $initial_alignment); $alignment_added = 1; } }else{ push (@selection, $ordered_align); $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; } } # Change this value after the first alignment is checked. if ($first_alignment == 0){ $first_alignment = 1; $initial_alignment = $ordered_align; $initial_end = $ordered_align->end; } } }else{ push (@selection, $chr_cluster{$chr_cluster_key}[0]); } } return \@selection; }