Bio::EnsEMBL::Analysis::RunnableDB WGA2Genes
Other packages in the module: Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::General
Bio::EnsEMBL::Analysis::Config::WGA2Genes
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw ( replace_stops_with_introns )
Bio::EnsEMBL::Analysis::Tools::Logger
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::ChainUtils
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::SeqIO
DBD::mysql
DBI qw ( :sql_types )
Data::Dumper
IO::String
Inherit
Bio::EnsEMBL::Analysis::RunnableDB
Synopsis
  my $db      = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
->new (-db => $pipelinedb,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to stdout
Description
Methods
COMPARA_DB
No description
Code
FULLY_GAP_EXONS
No description
Code
INPUT_METHOD_LINK_TYPE
No description
Code
KILL_LIST
No description
Code
LONGEST_SOURCE_TRANSCRIPT
No description
Code
MAX_EDITABLE_STOPS_NON_PRIMARY
No description
Code
MAX_EDITABLE_STOPS_PRIMARY
No description
Code
MAX_EXON_READTHROUGH_DIST
No description
Code
MIN_CHAIN_ANNOTATION_OVERLAP
No description
Code
MIN_COVERAGE
No description
Code
MIN_NON_GAP
No description
Code
OUTPUT_DIR
No description
Code
OVERLAP_CHAIN_FILTER
No description
Code
PARTIAL_GAP_EXONS
No description
Code
QUERY_CORE_DB
No description
Code
TARGET_CORE_DB
No description
Code
TARGET_SPECIES_EXTERNAL_DBNAME
No description
Code
create_output_dir
No description
Code
fetch_inputDescriptionCode
gene_records
No description
Code
genomic_align_block_chains
No description
Code
get_external_db_id
No description
Code
make_gene_scaffold_and_project_genes
No description
Code
newDescriptionCode
process_transcript
No description
Code
query_slice
No description
Code
read_and_check_config
No description
Code
remove_contig_split_chains
No description
Code
runDescriptionCode
segregate_chains_and_generecords
No description
Code
target_slices
No description
Code
trim_exon_split_chains
No description
Code
write_agp
No description
Code
write_gene
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
newcodeprevnextTop
    Title   :   fetch_input
runcodeprevnextTop
    Title   :   run
write_outputcodeprevnextTop
    Title   :   write_output
Methods code
COMPARA_DBdescriptionprevnextTop
sub COMPARA_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_compara_db} = $db;
  }

  return $self->{_compara_db};
}
FULLY_GAP_EXONSdescriptionprevnextTop
sub FULLY_GAP_EXONS {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_fully_gap_exons} = $val; 
  }

  return $self->{_fully_gap_exons};
}
INPUT_METHOD_LINK_TYPEdescriptionprevnextTop
sub INPUT_METHOD_LINK_TYPE {
  my ($self, $type) = @_;

  if (defined $type) {
    $self->{_input_method_link_type} = $type;
  }

  return $self->{_input_method_link_type};
}
KILL_LISTdescriptionprevnextTop
sub KILL_LIST {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_kill_list} = $val;
  }

  return $self->{_kill_list};
}
LONGEST_SOURCE_TRANSCRIPTdescriptionprevnextTop
sub LONGEST_SOURCE_TRANSCRIPT {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_longest_source} = $val;
  }

  return $self->{_longest_source};
}
MAX_EDITABLE_STOPS_NON_PRIMARYdescriptionprevnextTop
sub MAX_EDITABLE_STOPS_NON_PRIMARY {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_max_editable_stops_non_primary} = $val; 
  }

  return $self->{_max_editable_stops_non_primary};
}
MAX_EDITABLE_STOPS_PRIMARYdescriptionprevnextTop
sub MAX_EDITABLE_STOPS_PRIMARY {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_max_editable_stops} = $val; 
  }

  return $self->{_max_editable_stops};
}
MAX_EXON_READTHROUGH_DISTdescriptionprevnextTop
sub MAX_EXON_READTHROUGH_DIST {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_max_exon_readthrough_dist} = $val; 
  }

  return $self->{_max_exon_readthrough_dist};
}
MIN_CHAIN_ANNOTATION_OVERLAPdescriptionprevnextTop
sub MIN_CHAIN_ANNOTATION_OVERLAP {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_min_chain_annotation_overlap} = $val;
  }

  if (not exists($self->{_min_chain_annotation_overlap})) {
    $self->{_min_chain_annotation_overlap} = 1;
  }

  return $self->{_min_chain_annotation_overlap};
}
MIN_COVERAGEdescriptionprevnextTop
sub MIN_COVERAGE {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_min_coverage} = $val;
  }

  return $self->{_min_coverage};
}
MIN_NON_GAPdescriptionprevnextTop
sub MIN_NON_GAP {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_min_non_gap} = $val;
  }

  return $self->{_min_non_gap};
}
OUTPUT_DIRdescriptionprevnextTop
sub OUTPUT_DIR {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_output_dir} = $val;
  }

  return $self->{_output_dir};
}
OVERLAP_CHAIN_FILTERdescriptionprevnextTop
sub OVERLAP_CHAIN_FILTER {
  my ($self, $val) = @_;
  
  if (defined $val) {
    $self->{_overlap_chain_filter} = $val;
  }
  if (not exists($self->{_overlap_chain_filter})) {
    $self->{_overlap_chain_filter} = 0;
  }

  return $self->{_overlap_chain_filter};
}


#
# transcript editing and filtering
#
}
PARTIAL_GAP_EXONSdescriptionprevnextTop
sub PARTIAL_GAP_EXONS {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_partial_gap_exons} = $val; 
  }

  return $self->{_partial_gap_exons};
}
QUERY_CORE_DBdescriptionprevnextTop
sub QUERY_CORE_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_query_core_db} = $db;
  }

  return $self->{_query_core_db};
}
TARGET_CORE_DBdescriptionprevnextTop
sub TARGET_CORE_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_target_core_db} = $db;
  }

  return $self->{_target_core_db};
}
TARGET_SPECIES_EXTERNAL_DBNAMEdescriptionprevnextTop
sub TARGET_SPECIES_EXTERNAL_DBNAME {
  my ($self, $target_species_external_dbname) = @_;

  if (defined $target_species_external_dbname) {
    $self->{_target_species_external_dbname} = $target_species_external_dbname;
  }

  return $self->{_target_species_external_dbname};
}

#
# chain filtering
#
}
create_output_dirdescriptionprevnextTop
sub create_output_dir {
  my ($self, $output_dir_number) = @_;
  $output_dir_number = 10 if(!$output_dir_number);
  my $num = int(rand($output_dir_number));
  my $output_dir = $self->OUTPUT_DIR;
  if (! -e $output_dir) {
    warning("Your output directory '" . $output_dir . "' does not exist");
    warning("- it will be created now\n");
    eval{
      system("mkdir -p " . $output_dir);
    };
    if($@){
      throw("Failed to make output directory " . $output_dir . "$@" );
    }
  }
  my $dir = $output_dir."/".$num;
  if(! -e $dir){
    my $command = "mkdir $dir";
    eval{
      system($command);
    };
    if($@){
      throw("Failed to make $dir $@");
    }
  }
  my $filename = $dir. "/" . $self->input_id."_".$self->analysis->logic_name."_";
  $filename .= int(rand(1000));
  $filename .= ".out";
  return $filename;
}

###########################
# gets/sets
###########################
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_; 
  
  my $input_id = $self->input_id;  
  throw("No input id") unless defined($input_id);

  my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->QUERY_CORE_DB});
  my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->TARGET_CORE_DB});
  my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->
      new(%{$self->COMPARA_DB});
  
  my $query_species = 
      $q_dbh->get_MetaContainerAdaptor->get_Species->binomial;
  my $target_species = 
      $t_dbh->get_MetaContainerAdaptor->get_Species->binomial;
  
  my $gdb_adap = $compara_dbh->get_GenomeDBAdaptor;
  my $q_gdb = $gdb_adap->fetch_by_name_assembly($query_species);
  my $t_gdb = $gdb_adap->fetch_by_name_assembly($target_species);

  ########
# check that the default assembly for the query and target agrees
# with that for the method_link_species_set GenomeDBs
########
my ($q_assembly_version, $t_assembly_version); eval { $q_assembly_version = $q_dbh->get_CoordSystemAdaptor-> fetch_by_name('toplevel', $q_gdb->assembly); $t_assembly_version = $t_dbh->get_CoordSystemAdaptor-> fetch_by_name('toplevel', $t_gdb->assembly); }; $@ and do { throw("Had trouble fetching coord systems for ". $q_gdb->assembly . " and " . $t_gdb->assembly . " from core dbs: $@"); }; ##############
# populate a list of genes and transcripts to ignore
##############
my $kill_list = {}; if ($self->KILL_LIST) { open(KILL, $self->KILL_LIST) or throw("Could not open kill list " . $self->KILL_LIST . " for reading"); while(<KILL>) { /^\#/ and next; /^(\S+)/ and $kill_list->{$1} = 1; } close(KILL); } ########
# fetch the genes; need to work in the coordinate space of the
# top-level slice to be consistent with compara
########
my $sa = $q_dbh->get_SliceAdaptor; my (@gene_records, $reg_start, $reg_end); if ($input_id =~ /:/) { # assume slice name
my $slice = $sa->fetch_by_name($input_id); $reg_start = $slice->start; $reg_end = $slice->end; $self->query_slice($sa->fetch_by_region('toplevel', $slice->seq_region_name)); foreach my $g (@{$slice->get_all_Genes}) { next if $g->biotype ne 'protein_coding'; next if exists $kill_list->{$g->stable_id}; $g = $g->transfer($self->query_slice); my @good_trans; foreach my $t (@{$g->get_all_Transcripts}) { next if $t->coding_region_start < $slice->start; next if $t->coding_region_end > $slice->end; next if exists $kill_list->{$t->stable_id}; push @good_trans, $t; } if (@good_trans) { if ($self->LONGEST_SOURCE_TRANSCRIPT) { # load seq up front. Otherwise get warnings in the sort
map { $_->translateable_seq } @good_trans; @good_trans = sort { CORE::length($b->translateable_seq) <=> CORE::length($a->translateable_seq); } @good_trans; @good_trans = ($good_trans[0]); } push @gene_records, Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord ->new(-gene => $g, -source_transcripts =>\@ good_trans); } } } else { # assume iid is a gene stable id; ignore the kill-list
my ($gene, $tran); eval { $gene = $q_dbh->get_GeneAdaptor->fetch_by_stable_id($input_id); }; eval { $tran = $q_dbh->get_TranscriptAdaptor->fetch_by_stable_id($input_id); }; if (not defined $gene and not defined $tran) { throw("Could not find gene or transcript with '$input_id' in query database"); } if (not defined $gene) { $gene = Bio::EnsEMBL::Gene->new; $gene->stable_id("None"); $gene->add_Transcript($tran); } $self->query_slice($sa->fetch_by_region('toplevel', $gene->seq_region_name)); $gene = $gene->transfer($self->query_slice); $reg_start = $gene->start; $reg_end = $gene->end; my @good_trans; foreach my $t (@{$gene->get_all_Transcripts}) { if (not exists($kill_list->{$t->stable_id})) { push @good_trans, $t; } } if (@good_trans) { if ($self->LONGEST_SOURCE_TRANSCRIPT) { # load seq up front. Otherwise get warnings in the sort
map { $_->translateable_seq } @good_trans; @good_trans = sort { CORE::length($b->translateable_seq) <=> CORE::length($a->translateable_seq); } @good_trans; @good_trans = ($good_trans[0]); } push @gene_records, Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord ->new(-gene => $gene, -source_transcripts =>\@ good_trans); } } $self->gene_records(\@ gene_records ); #########
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and all GenomicAlignBlocks
#########
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor ->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE, [$q_gdb, $t_gdb]); throw("No MethodLinkSpeciesSet for :\n" . $self->INPUT_METHOD_LINK_TYPE . "\n" . $query_species . "\n" . $target_species) if not $mlss; my $dnafrag = $compara_dbh->get_DnaFragAdaptor-> fetch_by_GenomeDB_and_name($q_gdb, $self->query_slice->seq_region_name); my $gaba = $compara_dbh->get_GenomicAlignBlockAdaptor; my $gen_al_blocks = $gaba->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag, $reg_start, $reg_end); my (%chains, @chains); foreach my $block (@$gen_al_blocks) { my $qga = $block->reference_genomic_align; my ($tga) = @{$block->get_all_non_reference_genomic_aligns}; # fetch the target slice for later reference
if (not exists $self->target_slices->{$tga->dnafrag->name}) { $self->target_slices->{$tga->dnafrag->name} = $t_dbh->get_SliceAdaptor->fetch_by_region('toplevel', $tga->dnafrag->name); } if ($block->reference_genomic_align->dnafrag_strand < 0) { $block->reverse_complement; } push @{$chains{$block->group_id}}, $block; } foreach my $chain_id (keys %chains) { push @chains, [ sort { $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start; } @{$chains{$chain_id}} ]; } @chains = sort { $b->[0]->score <=> $a->[0]->score } @chains; $self->genomic_align_block_chains(\@chains); } ############################################################
}
gene_recordsdescriptionprevnextTop
sub gene_records {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_gene_recs} = $val;
  }

  return $self->{_gene_recs};
}
genomic_align_block_chainsdescriptionprevnextTop
sub genomic_align_block_chains {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_gen_al_chains} = $val;
  }

  return $self->{_gen_al_chains};
}
get_external_db_iddescriptionprevnextTop
sub get_external_db_id {
  my ($self) = @_;

  my $query_db = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->QUERY_CORE_DB});
  my $external_dbname = $self->TARGET_SPECIES_EXTERNAL_DBNAME;

  my $sth = $query_db->prepare(
            "SELECT external_db_id ".
            "FROM external_db ".
            "WHERE db_name = ?"
                          );
  $sth->bind_param(1, $external_dbname, SQL_VARCHAR);
  $sth->execute();

  my ($external_db_id) = $sth->fetchrow();
  $sth->finish();

  return $external_db_id;
}

###########################################
# Local class Result to encapsulate results
############################################
package Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord; use Bio::EnsEMBL::Utils::Argument qw( rearrange );
}
make_gene_scaffold_and_project_genesdescriptionprevnextTop
sub make_gene_scaffold_and_project_genes {
  my ($self, 
      $blocks, 
      $res_genes, 
      $name,
      $max_stops,
      $min_coverage,
      $min_non_gap,
      $add_gaps,
      $extend_into_gaps,
      $external_db_id) = @_;

  my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold->
      new(-name => $name,
          -genomic_align_blocks => $blocks,
          -from_slice => $self->query_slice,
          -to_slices  => $self->target_slices,
          -transcripts   => [map {@{$_->source_transcripts}} @$res_genes],
          -max_readthrough_dist => $self->MAX_EXON_READTHROUGH_DIST,
          -add_gaps => $add_gaps,
          -extend_into_gaps => $extend_into_gaps,
          );
  
  foreach my $res_gene (@$res_genes) {
    $res_gene->name($name . "." . $res_gene->gene->stable_id);
    
    foreach my $tran (@{$res_gene->source_transcripts}) {
      my $proj_trans = $gene_scaffold->place_transcript($tran, 1, $external_db_id); 

     if ( $tran->analysis) { 
        $proj_trans->analysis($tran->analysis);     
        for my $e( @{$proj_trans->get_all_Exons } ) {  
            for my $sf ( @{ $e->get_all_supporting_features } ) {  
                 $sf->analysis($tran->analysis); 
             } 
        }
     } 
      $proj_trans = 
          $self->process_transcript($proj_trans, 
                                    $max_stops,
                                    $min_coverage,
                                    $min_non_gap,
                                    $tran->stable_id);
      
      if ($proj_trans) {
        push @{$res_gene->good_source_transcripts}, $tran;
        push @{$res_gene->projected_transcripts}, $proj_trans;
      }
    }
  }

  return $gene_scaffold;
}



#################################################################
# FUNCTION : process_transcript
#
# Description:
# Subjects the given transcript to a few tests, returning
# the transcript if they succeed, undef if not. If the
# transcript contains less than $max_stops stops, these
# are "spliced out"; otherwise the transcripts is rejected
#################################################################
}
newdescriptionprevnextTop
sub new {
  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);
  
  $self->read_and_check_config($WGA2GENES_CONFIG_BY_LOGIC);

  return $self;
}

############################################################
}
process_transcriptdescriptionprevnextTop
sub process_transcript {
  my ($self, 
      $tran, 
      $max_stops,
      $min_coverage,
      $min_non_gap,
      $source_id) = @_;
  
  return 0 if not $tran;

  my (@processed_transcripts, $num_stops, $prop_non_gap);

  my @exons = @{$tran->get_all_Exons};
  my ($tsf) = @{$tran->get_all_supporting_features};
  my $pep = $tran->translate->seq;

  ##################
# reject transcripts that have:
# zero length
# less than minimum coverage of parent peptide
# higher that maximum proportion of gap residues
##################
if (CORE::length($pep) == 0) { logger_info("Rejecting proj of $source_id because was just a stop codon"); return 0; } if ($tsf->hcoverage < $min_coverage) { logger_info("Rejecting proj of $source_id because coverage (" . $tsf->hcoverage . ") is below threshold"); return 0; } foreach my $attr (@{$tran->get_all_Attributes}) { if ($attr->code eq "PropNonGap") { $prop_non_gap = $attr->value; } elsif ($attr->code eq "NumStops") { $num_stops = $attr->value; } } if ($prop_non_gap < $min_non_gap) { logger_info("Rejecting proj of $source_id due to too many Ns ($prop_non_gap)"); return 0; } if ($num_stops > $max_stops) { logger_info("Rejecting proj of $source_id due to too many stops ($num_stops)"); return 0; } elsif ($num_stops == 0) { return $tran; } ##################
# number of stops is non-zero but acceptable. Need to
# operate on the transcript to jump over the stops
$tran = replace_stops_with_introns($tran); return $tran; } ###################################################################
# FUNCTION: remove_contig_split_chains
#
# Decription:
# This function removes chains that (a) interlock with higher
# scoring chains, and (b) this interlocking cannot be explained
# by contig mis-ordering within the involved scaffolds
###################################################################
}
query_slicedescriptionprevnextTop
sub query_slice {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_query_tl_slice} = $val;
  }

  return $self->{_query_tl_slice};
}
read_and_check_configdescriptionprevnextTop
sub read_and_check_config {
  my ($self, $hash) = @_;

  $self->SUPER::read_and_check_config($hash);
 
  my $logic = $self->analysis->logic_name;

  foreach my $var (qw(INPUT_METHOD_LINK_TYPE
                      QUERY_CORE_DB
                      TARGET_CORE_DB
                      COMPARA_DB
                      TARGET_SPECIES_EXTERNAL_DBNAME)) {

    throw("You must define $var in config for logic '$logic'" . 
          " or in the DEFAULT entry")
        if not $self->$var;
  }
}

#
# core options
#
}
remove_contig_split_chainsdescriptionprevnextTop
sub remove_contig_split_chains {
  my ($self, $input_chains) = @_;

  my @kept_chains;

  foreach my $c (@$input_chains) {
    my (%coords_by_tid, @all_coords, @merged_coords);

    my $consistent = 1;

    my @these_chains = (@kept_chains, $c);

    my $blocks = flatten_chains(\@these_chains, 1);

    foreach my $bl (@$blocks) {
      my ($tga) = @{$bl->get_all_non_reference_genomic_aligns};
      my $tgac = Bio::EnsEMBL::Mapper::Coordinate->new
          ($tga->dnafrag->name,
           $tga->dnafrag_start,
           $tga->dnafrag_end,
           $tga->dnafrag_strand);
      
      push @{$coords_by_tid{$tgac->id}}, $tgac;
      push @all_coords, $tgac;
    }

    foreach my $tgac (@all_coords) {
      my $merged = 0;

      if (@merged_coords and 
          $merged_coords[-1]->id eq $tgac->id and
          $merged_coords[-1]->strand eq $tgac->strand) {

        if ($tgac->strand > 0) {
          if ($tgac->start > $merged_coords[-1]->end) {
            my $can_merge = 1;
            foreach my $o_c (@{$coords_by_tid{$tgac->id}}) {
              if ($o_c->start > $merged_coords[-1]->end and
                  $o_c->end   < $tgac->start) {
                $can_merge = 0;
                last;
              }
            }
            if ($can_merge) {
              $merged_coords[-1]->end($tgac->end);
              $merged = 1;
            }
          }
        } else {
          if ($tgac->end < $merged_coords[-1]->start) {
            my $can_merge = 1;
            foreach my $o_c (@{$coords_by_tid{$tgac->id}}) {
              if ($o_c->start > $tgac->end and
                  $o_c->end < $merged_coords[-1]->start) {
                $can_merge = 0;
                last;
              }
            }
            if ($can_merge) {
              $merged_coords[-1]->start($tgac->start);
              $merged = 1;
            }
          }
        }
      }

      if (not $merged) {
        push @merged_coords, $tgac;
      }
    }

    %coords_by_tid = ();
    foreach my $c (@merged_coords) {
      push @{$coords_by_tid{$c->id}}, $c;
    }

    TID: foreach my $id (keys %coords_by_tid) {
      my @chain = @{$coords_by_tid{$id}};
      @chain = sort { $a->start <=> $b->start } @chain; 

      # each chain should be separable from the last at contig
# level
for(my $i=1; $i < @chain; $i++) { my $prev = $chain[$i-1]; my $this = $chain[$i]; my ($ex_prev) = extend_coord($prev, $self->target_slices->{$id}); my ($ex_this) = extend_coord($this, $self->target_slices->{$id}); if ($ex_prev->end >= $ex_this->start) { $consistent = 0; last TID; } } } if ($consistent) { push @kept_chains, $c; } } return\@ kept_chains; } ###################################################################
# FUNCTION: trim_exon_split_chains
#
# Decription:
# When a single exon overlaps blocks in more than one chain, the
# minority chains are trimmed back.
# This prevents the situation of a single-exon gene in the
# reference mapping down to a multi-scaffold GeneScaffold, and
# generally gives cleaner results.
###################################################################
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;

  my (@results, @original_genes, @chains_generecs_pairs);

  my $external_db_id = $self->get_external_db_id();

  logger_info("INITIAL GENES: " . join(" ", map {$_->gene->stable_id}
  @{$self->gene_records}) . "\n");

  logger_info("ORIGINAL_CHAINS\n" . 
              stringify_chains($self->genomic_align_block_chains));

  @chains_generecs_pairs = 
      $self->segregate_chains_and_generecords($self->genomic_align_block_chains,
                                              $self->gene_records,
                                              $self->MIN_CHAIN_ANNOTATION_OVERLAP);
      
  foreach my $chains_generecs (@chains_generecs_pairs) {
    my ($alignment_chains, $generecs) = @$chains_generecs;
    
    my $pair_id = join(":", 
                       map { $_->gene->stable_id } @$generecs);
    
    logger_info("CHAINS_FOR_GENESET $pair_id\n" . 
                stringify_chains($alignment_chains));
    
    my %blocks_from_used_chains;
    
    my @these_generecs = map { $_->clone } @$generecs;
    
    # Repeat transcript contruction until we had no rejected transcripts.
# This is because if a transcript is rejected, some of the underlying
# GeneScaffold components may be unnecessary
for(;;) { logger_info("Working with genes " . join(" ", map { $_->gene->stable_id} @these_generecs)); my @cds_feats = map { @{$_->transcript_cds_feats} } @these_generecs; my $filtered_chains = filter_irrelevant_chains($alignment_chains,\@ cds_feats); logger_info("RELEVANT_CHAINS\n" . stringify_chains($filtered_chains)); $filtered_chains = filter_inconsistent_chains($filtered_chains, $self->OVERLAP_CHAIN_FILTER); logger_info("CONSISTENT_CHAINS\n" . stringify_chains($filtered_chains)); $filtered_chains = $self->remove_contig_split_chains($filtered_chains); logger_info("NON_CONTIG_SPLIT_CHAINS\n" . stringify_chains($filtered_chains)); $filtered_chains = $self->trim_exon_split_chains($filtered_chains,\@ cds_feats); logger_info("TRIMMED_CHAINS\n" . stringify_chains($filtered_chains)); my @these_pairs = $self->segregate_chains_and_generecords($filtered_chains,\@ these_generecs, $self->MIN_CHAIN_ANNOTATION_OVERLAP); my @these_results; foreach my $pair (@these_pairs) { my ($subset_chains, $subset_generecs) = @$pair; my $net_blocks = flatten_chains($subset_chains, 1); my $gs_name = $subset_generecs->[0]->gene->stable_id . "-0"; my $gene_scaffold = $self->make_gene_scaffold_and_project_genes($net_blocks, $subset_generecs, $gs_name, $self->MAX_EDITABLE_STOPS_PRIMARY, $self->MIN_COVERAGE, $self->MIN_NON_GAP, $self->FULLY_GAP_EXONS, $self->PARTIAL_GAP_EXONS, $external_db_id); push @these_results, [$gene_scaffold, @$subset_generecs]; } # check that all transcripts were successfully projected. If not,
# the gene scaffolds may contain unnecessary pieces, in which case
# we remove the unsuccesful transcript and repeat.
my @kept_generecs; my $need_to_repeat = 0; foreach my $res (@these_results) { my ($gs, @res_genes) = @$res; foreach my $res_gene (@res_genes) { if (scalar(@{$res_gene->source_transcripts}) == scalar(@{$res_gene->projected_transcripts})) { push @kept_generecs, $res_gene; } else { if (@{$res_gene->good_source_transcripts}) { push @kept_generecs, $res_gene; } $need_to_repeat = 1; } } } if ($need_to_repeat) { if (@kept_generecs) { @these_generecs = @kept_generecs; foreach my $grec (@these_generecs) { $grec->source_transcripts($grec->good_source_transcripts); $grec->projected_transcripts([]); $grec->good_source_transcripts([]); } next; } else { last; } } else { if (@these_results) { push @results, @these_results; foreach my $pair (@these_pairs) { my ($these_chains) = @$pair; foreach my $chain (@$these_chains) { foreach my $block (@$chain) { $blocks_from_used_chains{$block} = $block; } } } } last; } } my $filtered_chains = filter_block_overlap_chains($alignment_chains, [values %blocks_from_used_chains]); logger_info("ITERATING over remaining single chains\n"); foreach my $chain (@$filtered_chains) { my ($single_pair) = $self->segregate_chains_and_generecords([$chain], $generecs, $self->MIN_CHAIN_ANNOTATION_OVERLAP); if (defined $single_pair) { logger_info("CHAIN_WITH_GENES\n" . stringify_chains([$chain])); my @grs = map { $_->clone } @{$single_pair->[1]};; my ($tg_al) = @{$chain->[0]->get_all_non_reference_genomic_aligns}; my $gs_name = $tg_al->dnafrag->name . "-" . $tg_al->dnafrag_start; my $gene_scaffold = $self->make_gene_scaffold_and_project_genes($chain,\@ grs, $gs_name, $self->MAX_EDITABLE_STOPS_NON_PRIMARY, $self->MIN_COVERAGE, $self->MIN_NON_GAP, $self->FULLY_GAP_EXONS, $self->PARTIAL_GAP_EXONS, $external_db_id); my @kept; foreach my $gr (@grs) { if (scalar(@{$gr->projected_transcripts})) { push @kept, $gr; } } if (@kept) { push @results, [$gene_scaffold, @kept]; } } } } $self->output(\@results); } ############################################################
}
segregate_chains_and_generecordsdescriptionprevnextTop
sub segregate_chains_and_generecords {
  my ($self, $chains, $generecs, $min_overlap) = @_;

  my (%generecs_by_id, %chains_by_id);

  foreach my $grec (@$generecs) {
    $generecs_by_id{$grec} = $grec;
  }

  my (%genes_per_chain);

  foreach my $c (@$chains) {
    $chains_by_id{$c} = $c;

    my @ref_gas = map { $_->reference_genomic_align } @$c;
    @ref_gas = sort { $a->dnafrag_start <=> $b->dnafrag_start } @ref_gas;
    
    my %overlapping;
    
    foreach my $grec (@$generecs) {
      my $overlap_bps = 0;

    BLOCK: 
      foreach my $bl (@ref_gas) {
        #printf " looking at block %s %d %d\n", $bl->dnafrag->name, $bl->dnafrag_start, $bl->dnafrag_end;
FEAT: foreach my $f (@{$grec->transcript_cds_feats}) { #printf " looking at cds feature %d %d\n", $f->start, $f->end;
if ($f->start > $bl->dnafrag_end) { # no exons in this transcript will overlap the block
next BLOCK; } elsif ($f->end < $bl->dnafrag_start) { next FEAT; } else { my $ov_start = $f->start; my $ov_end = $f->end; $ov_start = $bl->dnafrag_start if $bl->dnafrag_start > $ov_start; $ov_end = $bl->dnafrag_end if $bl->dnafrag_end < $ov_end; $overlap_bps += ($ov_end - $ov_start + 1); } } } if ($overlap_bps >= $min_overlap) { $overlapping{$grec} = 1; } } foreach my $grecid (keys %overlapping) { #print "FOUND A GENE OVERLAP\n";
$genes_per_chain{$c}->{$grecid} = 1; } } my (@clusters, @name_clusters, @res_clusters); foreach my $cref (keys %genes_per_chain) { my $clust = { chains => { $cref => 1 }, genes => {}, }; foreach my $grecid (keys %{$genes_per_chain{$cref}}) { $clust->{genes}->{$grecid} = 1; } push @clusters, $clust; } while(@clusters) { my $this_clust = shift @clusters; my @merged; do { @merged = (); for (my $i=0; $i < @clusters; $i++) { # possibly merge with this clust
# if merge, remove from @clusters and re-search
my $other_clust = $clusters[$i]; my $in_common = 0; foreach my $this_g (keys %{$this_clust->{genes}}) { if (exists $other_clust->{genes}->{$this_g}) { $in_common = 1; last; } } if ($in_common) { foreach my $cref (keys %{$other_clust->{chains}}) { $this_clust->{chains}->{$cref} = 1; } foreach my $gref (keys %{$other_clust->{genes}}) { $this_clust->{genes}->{$gref} = 1; } push @merged, $i; } } if (@merged) { foreach my $i (reverse @merged) { splice (@clusters, $i, 1); } } } while @merged; push @name_clusters, $this_clust; } # finally: form the clusters
foreach my $c (@name_clusters) { my (@chains, @genes); foreach my $cref (keys %{$c->{chains}}) { push @chains, $chains_by_id{$cref}; } foreach my $gref (keys %{$c->{genes}}) { push @genes, $generecs_by_id{$gref}; } @chains = sort { $b->[0]->score <=> $a->[0]->score } @chains; @genes = sort { $a->gene->start <=> $b->gene->start } @genes; push @res_clusters, [\@chains,\@ genes]; } return @res_clusters; } ###########################################################
# write methods
###########################################################
}
target_slicesdescriptionprevnextTop
sub target_slices {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_target_slices} = $val;
  }
  
  if (not exists $self->{_target_slices}) {
    $self->{_target_slices} = {};
  }

  return $self->{_target_slices};
}



####################################
# config variable holders
####################################
}
trim_exon_split_chainsdescriptionprevnextTop
sub trim_exon_split_chains {
  my ($self, $chains, $feats) = @_;

  # make nr list of coding regions
my (@nr_cds); foreach my $cds (sort { $a->start <=> $b->start } @$feats) { if (not @nr_cds or $cds->start > $nr_cds[-1]->end + 1) { push @nr_cds, $cds; } elsif ($cds->end > $nr_cds[-1]->end) { $nr_cds[-1]->end($cds->end); } } foreach my $cds (@nr_cds) { my (@ov_chains, @other_chains); CH: foreach my $ch (@$chains) { my $ov_bps = 0; foreach my $bl (@$ch) { my $ga = $bl->reference_genomic_align; if ($ga->dnafrag_start <= $cds->end and $ga->dnafrag_end >= $cds->start) { my ($st, $en) = ($cds->start, $cds->end); $st = $ga->dnafrag_start if $ga->dnafrag_start > $st; $en = $ga->dnafrag_end if $ga->dnafrag_end < $en; $ov_bps += ($en - $st + 1); } } if ($ov_bps) { push @ov_chains, { chain => $ch, ov_bps => $ov_bps, }; } else { push @other_chains, $ch; } } if (scalar(@ov_chains) > 1) { # more than one chain implicated. Find the "dominant" chain
# for this exon and trim back the blocks for the other ones
@ov_chains = sort { $b->{ov_bps} <=> $a->{ov_bps} } @ov_chains; my @new_chains = (@other_chains, $ov_chains[0]->{chain}); for (my $i=1; $i < @ov_chains; $i++) { my @retained_blocks; foreach my $bl (@{$ov_chains[$i]->{chain}}) { my $ga = $bl->reference_genomic_align; if ($ga->dnafrag_end < $cds->start or $ga->dnafrag_start > $cds->end) { push @retained_blocks, $bl; } else { if ($ga->dnafrag_start < $cds->start) { my $cut_bl = $bl->restrict_between_reference_positions($ga->dnafrag_start, $cds->start - 1); $cut_bl->score($bl->score); push @retained_blocks, $cut_bl; } if ($ga->dnafrag_end > $cds->end) { my $cut_bl = $bl->restrict_between_reference_positions($cds->end + 1, $ga->dnafrag_end); $cut_bl->score($bl->score); push @retained_blocks, $cut_bl; } } } if (@retained_blocks) { push @new_chains,\@ retained_blocks; } } $chains =\@ new_chains; } } return $chains; } ###################################################################
# FUNCTION: segregate_chains_and_generecords
#
# Decription:
# partition the chains and genes into (chain_list, gene_list)
# pairs, where each chain_list contains chains that touch
# one or more genes in gene_list
###################################################################
}
write_agpdescriptionprevnextTop
sub write_agp {
  my ($self,
      $gscaf,
      $fh) = @_;

  $fh =\* STDOUT if(!$fh);

  my $prefix = "##-AGP";

  my $line_count = 0;
  my @tsegs = $gscaf->project_down;
  my @qsegs = $gscaf->project_up;

  print $fh "$prefix\#\n ";
  $line_count++;

  printf($fh "$prefix\#\#  AGP for %s source-region=%s/%d-%d (iid=%s)\n", 
         $gscaf->seq_region_name,
         $qsegs[0]->to_Slice->seq_region_name,
         $qsegs[0]->to_Slice->start,
         $qsegs[-1]->to_Slice->end,
         $self->input_id);
  $line_count++;

  print $fh "$prefix\#\n ";
  $line_count++;

  my $piece_count = 1;

  for(my $i=0; $i < @tsegs; $i++) {
    my $seg = $tsegs[$i];

    printf($fh "$prefix %s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\n",
           $gscaf->seq_region_name,
           $seg->from_start,
           $seg->from_end,
           $piece_count++,
           "W",
           $seg->to_Slice->seq_region_name,
           $seg->to_Slice->start,
           $seg->to_Slice->end,
           $seg->to_Slice->strand < 0 ? "-" : "+"
           );
    $line_count++;

    if ($i < @tsegs - 1) {
      printf($fh "$prefix %s\t%d\t%d\t%s\t%s\t%d\n",
             $gscaf->seq_region_name,
             $seg->from_end + 1,
             $tsegs[$i+1]->from_start - 1,
             $piece_count++,
             "N",
             $tsegs[$i+1]->from_start - $seg->from_end - 1,
             );
      $line_count++;
    }
  }
  return $line_count;
}
write_genedescriptionprevnextTop
sub write_gene {
  my ($self,
      $gene_scaf,
      $g,
      $fh ) = @_;

  $fh =\* STDOUT if(!$fh);

  my $prefix = "##-GENES ";

  my $line_count = 0;

  my $seq_id = $gene_scaf->seq_region_name;
  my $gene_id = $g->name;

  printf $fh "$prefix\#  Gene report for $seq_id\n";
  $line_count++;

  foreach my $tran (@{$g->projected_transcripts}) {
    my ($sf) = @{$tran->get_all_supporting_features};
    my $tran_id = $gene_id . "_" . $sf->hseqname; 

    foreach my $attr (@{$tran->get_all_Attributes}) {
      printf($fh "$prefix\# #\-ATTRIBUTE transcript=$tran_id code=%s value=%s\n",
             $attr->code, 
             $attr->value);
      $line_count++;
    }

    my @exons = @{$tran->get_all_Exons};

    for (my $i=0; $i < @exons; $i++) {
      my $exon = $exons[$i];
      my $exon_id = $tran_id . "_" . $i;

      printf($fh "%s %s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s=%s; %s=%s; %s=%s\n", 
             $prefix,
             $seq_id,
             "WGA2Genes",
             "Exon",
             $exon->start,
             $exon->end,
             $exon->strand,
             $exon->phase,
             $exon->end_phase,
             "exon",
             $exon_id,
             "transcript",
             $tran_id,
             "gene",
             $gene_id);
      $line_count++;

      foreach my $sup_feat (@{$exon->get_all_supporting_features}) {
        foreach my $ug ($sup_feat->ungapped_features) {
          printf($fh "%s %s\t%s\t%s\t%d\t%d\t%d\t%s=%s; %s=%s; %s=%s; %s=%s; %s=%s; %s=%s\n", 
                 $prefix,
                 $seq_id,
                 "WGA2Genes",
                 "Supporting",
                 $ug->start,
                 $ug->end,
                 $ug->strand,
                 "exon",
                 $exon_id,
                 "hname",
                 $ug->hseqname,
                 "hstart",
                 $ug->hstart,
                 "hend",
                 $ug->hend,
                 "external_db_id",
                 $ug->external_db_id,
                 "hcoverage",
                 $ug->hcoverage);
          $line_count++;
        }
      }
    }

    my $pep = $tran->translate;
    $pep->id($tran_id);

    my $fasta_string;
    my $stringio = IO::String->new($fasta_string);
    my $seqio = Bio::SeqIO->new(-format => 'fasta',
                                -fh => $stringio);
    $seqio->write_seq($pep);
    logger_info("PROJECTED-PEPTIDE:\n$fasta_string\n"); 
    if ($pep->seq =~ /^X/ or $pep->seq =~ /X$/) {
      logger_info("Transcript $tran_id has Xs at the ends");
    }
  }
  return $line_count;
}
write_outputdescriptionprevnextTop
sub write_output {
  my ($self) = @_;

  my $outfile = $self->create_output_dir;

  # This is just to make sure that the output files written properly
# and contain all the information that they should. This is done by
# counting the number of lines in each file and comparing them to
# what is actually printed out.
open(FH, ">$outfile") or throw("Failed to open $outfile!\n$!"); my $fh =\* FH; my $line_numbers = 0; print ($fh "#\n"); $line_numbers++; printf($fh "# WGA2Genes output for %s\n", $self->input_id); $line_numbers++; printf($fh "# Genes are:"); $line_numbers++; foreach my $grec (@{$self->gene_records}) { print ($fh " " . $grec->gene->stable_id); } print ($fh "\n#\n"); $line_numbers++; #print "OUTFILE ".$outfile." used\n";
foreach my $obj (@{$self->output}) { my ($gs, @res_genes) = @$obj; $line_numbers += $self->write_agp($gs, $fh); foreach my $g (@res_genes) { $line_numbers += $self->write_gene($gs, $g, $fh); } } if (!-e $outfile) { throw("The $outfile was not created.\n"); } #print $line_numbers, "<-- counted number of lines\n";
my $cmd = "wc -l $outfile"; # Run the command and capture the output
my $cmd_string = `$cmd`; my @wc = (split(/ /, $cmd_string)); #print $wc[0], "<-- what should have been written\n";
if ($wc[0] !~ $line_numbers) { throw("The number of lines in the output files differ to what should have been saved. You might need to rerun the pipeline\n"); } close($fh); return; } ######################################
# internal methods
#####################################
#################################################################
# FUNCTION : make_gene_scaffold_and_project_genes
#
# Description:
# Obvious
#################################################################
}
General documentation
No general documentation available.