Raw content of WormBase # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code # mod fsk =pod =head1 NAME WormBase =head1 SYNOPSIS the methods are all used by wormbase_to_ensembl.pl script which should be in the same directory =head1 DESCRIPTION parse gff and agp files from the wormbase database for caenorhabditis elegans to create ensembl database. =head1 CONTACT ensembl-dev@ebi.ac.uk about code issues wormbase-hel@wormbase.org about data issues =head1 APPENDIX =cut use lib '~/PerlCode/ensembl-pipeline/scripts/DataConversion/wormbase'; package WormBase; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(get_seq_ids get_sequences_pfetch agp_parse parse_gff write_genes translation_check insert_agp_line display_exons non_translate process_file parse_operons write_simple_features parse_rnai parse_expr parse_SL1 parse_SL2 parse_pseudo_gff store_coord_system store_slice parse_tRNA parse_rRNA_genes parse_tRNA_genes parse_pseudo_files); use strict; use Bio::EnsEMBL::Exon; use Bio::EnsEMBL::Gene; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Translation; use Bio::EnsEMBL::SimpleFeature; use Bio::EnsEMBL::CoordSystem; use Bio::EnsEMBL::Slice; $|=1; #turns off buffering on STDOUT =head2 get_seq_ids Arg [1] : filehandle to an agp file Function : retrives sequence ids from an agp file Returntype: array ref of seq ids and array ref of ides which don't fit the format' Exceptions: non Caller : Example : &get_seq_ids($fh); =cut sub get_seq_ids{ my ($fh) = @_; my @seq_ids; my @non_ids; while(<$fh>){ chomp; #I 47490 107680 3 F AC024796.1 1 60191 + ##print; #print "\n"; my ($status, $contig) = (split)[4, 5]; if(!$contig =~ /\S+\.\d+/){ #print STDERR "contig doesn't match ".$contig." accepted format\n"; push(@non_ids, $contig); next; } if($contig =~ /\S+\.$/){ #print STDERR "contig doesn't match ".$contig." accepted format\n"; push(@non_ids, $contig); next; } #print STDERR "contig being added ".$contig."\n"; push(@seq_ids, $contig) } return \@seq_ids, \@non_ids; } =head2 get_sequences_pfetch Arg [1] : array ref of sequence ids which will be recognised by pfetch Arg [2] : a Bio::EnsEMBL::Pipeline::Seqfetcher::Pfetch object Function : gets sequences for the ids passed to it using pfetch Returntype: hash keyed on seq id containing Bio::Seq object Exceptions: throws if seqfetcher passed to it isn't pfetch' Caller : Example : %get_sequences_pfetch($seq_ids, $seqfetcher); =cut sub get_sequences_pfetch{ my ($seq_ids, $seqfetcher) = @_; unless($seqfetcher->isa("Bio::EnsEMBL::Pipeline::SeqFetcher::Pfetch")){ die("seqfetcher ".$seqfetcher." needs to be a pfetch for this too work"); } my %seqs; foreach my $id(@$seq_ids){ my $seq; eval{ $seq = $seqfetcher->get_Seq_by_acc($id); }; if($@){ warn "$id isn't most recent sequence trying archive\n"; $seqfetcher->options('-a'); $seq = $seqfetcher->get_Seq_by_acc($id); } if($seq){ $seqs{$id} = $seq; }else{ warn "sequence ".$id." wasn't found\n"; } } return(\%seqs); } =head2 agp_parse Arg [1] : filehandle to an agp file Arg [2] : chromosome id from chromsome table Arg [3] : agp type for assembly table Function : parses an agp file into a suitable format for inserting into the assembly table Returntype: hash ref, keyed on contig id, each hash element is itself a hash which contains all the info needed for the assembly table Exceptions: dies if the same contig id is found twice Caller : Example : =cut sub agp_parse{ my ($fh, $chr_id, $agp_type) = @_; my $chr_hash = {}; #print STDERR "parsing agp\n"; while(<$fh>){ chomp; #I 47490 107680 3 F AC024796.1 1 60191 + #print; #print "\n"; my ($chr, $chr_start, $chr_end, $gap, $contig, $raw_start, $raw_end, $raw_ori) = (split)[0, 1, 2, 4, 5, 6, 7, 8]; if(!$contig =~ /\S+\.\d+/){ next; } if($contig =~ /\S+\.$/){ next; } if ($raw_ori eq '+') { $raw_ori = 1; } elsif ($raw_ori eq '-') { $raw_ori = -1; } else { $raw_ori = 1; #print "$chr Contig $contig $chr_start \n"; #print "Warning assumed default orientation for $contig\n"; } #print "have raw coords ".$raw_start." ".$raw_end." ".$raw_ori."\n"; if($chr_hash->{$contig}){ die "contig ".$contig." has been found twice something odd is going on\n"; }else{ $chr_hash->{$contig} = { 'chromosome_id' => $chr_id, 'chr_start' => $chr_start, 'chr_end' => $chr_end, 'superctg_name' => $chr, 'superctg_start' => $chr_start, 'superctg_end' => $chr_end, 'superctg_ori' => 1, 'contig_start' => $raw_start, 'contig_end' => $raw_end, 'contig_ori' => $raw_ori, 'type' => $agp_type, } } } return $chr_hash; } =head2 parse_gff Arg [1] : filename of gff file Arg [2] : Bio::Seq object Arg [3] : Bio::EnsEMBL::Analysis object Function : parses gff file given into genes Returntype: array ref of Bio::EnEMBL::Genes Exceptions: dies if can't open file or seq isn't a Bio::Seq Caller : Example : =cut sub parse_gff{ my ($file, $seq, $analysis) = @_; use Storable qw(store retrieve freeze thaw dclone); #print STDERR "opening ".$file."\n"; open(FH, $file) or die "couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); my @genes; my ($transcripts, $five_prime, $three_prime) = &process_file(\*FH); print "there are ".keys(%$transcripts)." distinct transcripts\n"; my $genes = undef; my ($processed_transcripts, $five_start, $three_end, $trans_start_exon, $trans_end_exon) = &generate_transcripts($transcripts, $seq, $analysis, $five_prime, $three_prime); print "\nthere are ".keys(%$processed_transcripts)." transcripts\n"; #print keys(%$five_start)." transcripts have 5' UTRs and ".keys(%$three_end)." have 3' UTRs\n"; $genes = undef; $genes = &create_transcripts($processed_transcripts, $five_start, $three_end, $trans_start_exon, $trans_end_exon); print "\nPARSE GFF has ".keys(%$genes)." genes\n"; foreach my $gene_id(keys(%$genes)){ my $transcripts = $genes->{$gene_id}; my $unpruned = &create_gene($transcripts, $gene_id); #print STDERR "gene ".$unpruned."\n"; my $gene = &prune_Exons($unpruned); push(@genes, $gene); } close(FH); print "\nPARSE_GFF got ".@genes." genes\n"; return \@genes; } =head2 process_file Arg [1] : filehandle pointing to a gff file Function : parses out lines for exons Returntype: hash keyed on transcript id each containig array of lines for that transcript and two hashes of hashes with arrays containing 5 prime and 3 prime UTR lines Exceptions: Caller : Example : =cut sub process_file{ my ($fh) = @_; my %genes; my $transcript; my %five_prime; my %three_prime; LOOP: while(<$fh>){ #CHROMOSOME_I curated three_prime_UTR 11696828 11697110 . - . Transcript "T22H2.5a" #CHROMOSOME_I curated three_prime_UTR 11697157 11697230 . - . Transcript "T22H2.5a" #CHROMOSOME_I curated five_prime_UTR 11698944 11698954 . - . Transcript "T22H2.5a" chomp; my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split; my $element = $_; #print STDERR $element."\n" if($type eq 'exon'); if($chr =~ /sequence-region/){ #print STDERR $_; next LOOP; } if(/^##/){ next LOOP; } if(!$status && !$type){ #print "status and type no defined skipping\n"; next LOOP; } my $line = $status." ".$type; if( ($line eq 'Coding_transcript five_prime_UTR') or ($line eq 'Coding_transcript three_prime_UTR') ){ $gene =~ s/\"//g; $transcript = $gene; #remove transcript-specific part: Y105E8B.1a.2 $gene =~ s/(\.\w+)\.\d+$/$1/; my ($position) = $type; if($position =~/^five/){ #print STDERR "have 5 prime utr ".$gene." (".$transcript.")\n"; if(!$five_prime{$gene}){ $five_prime{$gene} = {}; if(!$five_prime{$gene}{$transcript}){ $five_prime{$gene}{$transcript} = []; } push(@{$five_prime{$gene}{$transcript}}, $element); } else{ if(!$five_prime{$gene}{$transcript}){ $five_prime{$gene}{$transcript} = []; } push(@{$five_prime{$gene}{$transcript}}, $element); } }elsif($position =~/^three/){ #print STDERR "have 3 prime utr ".$gene." (".$transcript.")\n"; if(!$three_prime{$gene}){ $three_prime{$gene} = {}; if(!$three_prime{$gene}{$transcript}){ $three_prime{$gene}{$transcript} = []; } push(@{$three_prime{$gene}{$transcript}}, $element); } else{ if(!$three_prime{$gene}{$transcript}){ $three_prime{$gene}{$transcript} = []; } push(@{$three_prime{$gene}{$transcript}}, $element); } } next LOOP; }elsif($line ne 'curated coding_exon'){ next LOOP; } $gene =~ s/\"//g; if(!$genes{$gene}){ $genes{$gene} = []; push(@{$genes{$gene}}, $element); }else{ push(@{$genes{$gene}}, $element); } } print STDERR "Have ".keys(%genes). " genes (CDS), ". keys(%five_prime)." have 5' UTR and ".keys(%three_prime)." have 3' UTR information\n"; return \%genes, \%five_prime, \%three_prime; } =head2 generate_transcripts Arg [1] : hash ref (as returned by process_file, containing information about the CDS) Arg [2] : Bio::EnsEMBL::Slice Arg [3] : Bio::EnsEMBL::Analysis Arg [4] : ref to hash of hashes (as returned by process_file, containing information about the 5' UTR regions Arg [5] : ref to hash of hashes (as returned by process_file, containing information about the 3' UTR regions Function : takes line representing a transcript and creates an exon for each one Returntype: hash ref hash keyed on transcript id containing an array of exons Exceptions: Caller : Example : =cut sub generate_transcripts{ my ($genesRef, $slice, $analysis, $five_prime, $three_prime) = @_; my %genes; my %transcripts; my %temp_transcripts; my %five_trans_start; my %three_trans_end; my %trans_start_exon; my %trans_end_exon; my $translation_end; my $genecount = 0; my @global_exons; my %overlapcheck; use Bio::EnsEMBL::Pipeline::Tools::ExonUtils; #go through all genes GENE: foreach my $gene_name(keys(%$genesRef)){ #print "\nGENE $gene_name"; #create gene-hash entry $genes{$gene_name} = []; my $transcriptcount = 0; %temp_transcripts = (); ## collect all "curated_coding_exons" for this gene ## my @lines = @{$$genesRef{$gene_name}}; #is this right? my @global_exons = (); my %three_prime_exons; my %five_prime_exons; foreach my $line(@lines){ my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line; $chr =~ s/CHROMOSOME_//; # we re currently not using singe nucleotide exons if($start == $end){ next; } my $exon = new Bio::EnsEMBL::Exon; my $phase = (3 - $frame)%3; # wormbase gff cotains frame which is effectively the opposite of phase # for a good explaination of phase see the Bio::EnsEMBL::Exon documentation #print STDERR "phase calculated to be ".$phase." \n"; $exon->start($start); $exon->end($end); $exon->analysis($analysis); $exon->slice($slice); $exon->phase($phase); my $end_phase = ($phase + ($exon->end-$exon->start) + 1)%3; #print STDERR "end phase calculated to be ".$end_phase."\n"; $exon->end_phase($end_phase); if($strand eq '+'){ $exon->strand(1); }else{ $exon->strand(-1); } #$exon->score(100); push(@global_exons, $exon); } #sort exons for this gene if($global_exons[0]->strand == -1){ @global_exons = sort{$b->start <=> $a->start} @global_exons; }else{ @global_exons = sort{$a->start <=> $b->start} @global_exons; } #save information if there is not further UTR info if(!defined($$five_prime{$gene_name}) and !defined($$three_prime{$gene_name})){ #print "\nno alternative transcripts..."; $transcripts{$gene_name} = \@global_exons; #print "\nCOUNT: ".keys %transcripts; next GENE; } ## check different transcripts using UTR information ## #collect 5' UTRs foreach my $transcript_name ( keys(%{$$five_prime{$gene_name}}) ){ #print "\nchecking 5' transcript $transcript_name. "; my @five_prime_exons = (); %overlapcheck = (); #more than one transcript at 5 prime level $temp_transcripts{$transcript_name} = 1; #get UTR lines foreach my $line(@{$$five_prime{$gene_name}{$transcript_name}}){ my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split(/\s+/, $line); #avoid saving two identical exons if(defined $overlapcheck{$start}){ #print "\nexon already defined"; next; } $overlapcheck{$start} = 1; my $exon = new Bio::EnsEMBL::Exon; my $phase = -1; $exon->start($start); $exon->end($end); $exon->analysis($analysis); $exon->slice($slice); $exon->phase($phase); my $end_phase = -1; $exon->end_phase($end_phase); if($strand eq '+'){ $exon->strand(1); }else{ $exon->strand(-1); } push(@five_prime_exons, $exon); } #sort exons for this transcript if($five_prime_exons[0]->strand == -1){ @five_prime_exons = sort{$b->start <=> $a->start} @five_prime_exons; }else{ @five_prime_exons = sort{$a->start <=> $b->start} @five_prime_exons; } #save them to transcript #print "\nhave ".scalar @five_prime_exons." UTR lines. "; $five_prime_exons{$transcript_name} = \@five_prime_exons; } ## collect 3' UTRs ## foreach my $transcript_name ( keys(%{$$three_prime{$gene_name}}) ){ #print "\nchecking 3' transcript $transcript_name. "; my @three_prime_exons = (); %overlapcheck = (); #more than one transcript at the 3 prime level, save the name $temp_transcripts{$transcript_name} = 1; #get UTR lines foreach my $line(@{$$three_prime{$gene_name}{$transcript_name}}){ my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line; #avoid saving two identical exons if(defined $overlapcheck{$start}){ #print "\nexon already defined"; next; } $overlapcheck{$start} = 1; my $exon = new Bio::EnsEMBL::Exon; my $phase = -1; $exon->start($start); $exon->end($end); $exon->analysis($analysis); $exon->slice($slice); $exon->phase($phase); my $end_phase = -1; $exon->end_phase($end_phase); if($strand eq '+'){ $exon->strand(1); }else{ $exon->strand(-1); } push(@three_prime_exons, $exon); } #sort exons for this transcript if($three_prime_exons[0]->strand == -1){ @three_prime_exons = sort{$b->start <=> $a->start} @three_prime_exons; }else{ @three_prime_exons = sort{$a->start <=> $b->start} @three_prime_exons; } #save them to transcript #print "\nhave ".scalar @three_prime_exons." UTR lines. "; $three_prime_exons{$transcript_name} = \@three_prime_exons; } ## combine exons, 5' and 3' for every transcript ## foreach my $transcript_name (keys %temp_transcripts){ print "transcript $transcript_name\n"; $transcriptcount++; my @exons = (); foreach my $temp_exon (@global_exons){ push(@exons, Bio::EnsEMBL::Pipeline::Tools::ExonUtils->_clone_Exon($temp_exon)); } my $translation_start = 1; my $first = 1; #set default translation range $trans_start_exon{$transcript_name} = 0; $trans_end_exon{$transcript_name} = $#exons; print "\ntrans-exons: ".$trans_start_exon{$transcript_name}." - ".$trans_end_exon{$transcript_name}." (".scalar @exons.")"; #check 5' exons if(defined($five_prime_exons{$transcript_name})){ my @five_prime_exons = @{$five_prime_exons{$transcript_name}}; print "\nworking on 5' of $transcript_name (".scalar @five_prime_exons.") "; #is last 5' UTR exon part of first coding exon? FIVEUTR: while(my $five_prime_exon = shift(@five_prime_exons)){ my $start = $five_prime_exon->start; my $end = $five_prime_exon->end; my $strand = $five_prime_exon->strand; #print "\n- 5'exon: $start - $end."; if($exons[$trans_start_exon{$transcript_name}]->strand == 1 and $strand == 1){ #forward strand if($start > $end){ #print "\n>>strange 5' UTR (+) exon: ".$end." - ".$start; next FIVEUTR; } if($end == ($exons[$trans_start_exon{$transcript_name}]->start)-1){ #combine exons, adjust translation start #print "\ncombine exons, adjust translation start..."; $translation_start = $exons[$trans_start_exon{$transcript_name}]->start - $start + 1; #print $translation_start."(+) "; $five_trans_start{$transcript_name} = $translation_start; $exons[$trans_start_exon{$transcript_name}]->start($start); } elsif($end < $exons[$trans_start_exon{$transcript_name}]->start -1){ #additional non-coding exon #add to exon array, keep translation start on last coding exon $trans_start_exon{$transcript_name}++; $trans_end_exon{$transcript_name}++; unshift(@exons, $five_prime_exon); print "\nadditional non-coding exon (+) ".$start." - ".$end; } else{ print STDERR "\n>>$transcript_name strange 5' UTR exon (+): $start - $end with 1.exons at ". $exons[$trans_start_exon{$transcript_name}]->start. " - ".$exons[$trans_start_exon{$transcript_name}]->end; next FIVEUTR; } } elsif($exons[$trans_start_exon{$transcript_name}]->strand == -1 and $strand == -1){ #reverse strand if($start > $end){ #print "\n>>strange 5' UTR (-) exon: ".$end." - ".$start; next FIVEUTR; } if($start == ($exons[$trans_start_exon{$transcript_name}]->end)+1){ #combine exons, adjust translation start #print "\ncombine exons, adjust translation start..."; $translation_start = ($end - $exons[$trans_start_exon{$transcript_name}]->end + 1); #print $translation_start."(-) "; $five_trans_start{$transcript_name} = $translation_start; $exons[$trans_start_exon{$transcript_name}]->end($end); } elsif($start > ($exons[$trans_start_exon{$transcript_name}]->end)+1){ #additional non-coding exon #add to exon array, keep translation start on last coding exon $trans_start_exon{$transcript_name}++; $trans_end_exon{$transcript_name}++; unshift(@exons, $five_prime_exon); #print "\nadditional 5' non-coding exon (-)".$start." - ".$end; } else{ print "\n>>$transcript_name strange 5' UTR exon (-): $start - $end with 1.exons at ". $exons[$trans_start_exon{$transcript_name}]->start. " - ".$exons[$trans_start_exon{$transcript_name}]->end; next FIVEUTR; } } else{ print STDERR "\n>>strand switch in UTR / coding!"; } } } #check 3' exons if(defined($three_prime_exons{$transcript_name})){ my @three_prime_exons = @{$three_prime_exons{$transcript_name}}; #print "\nworking on 3' of $transcript_name (".scalar @three_prime_exons.") "; #is first 3' UTR exon part of last coding exon? THREEUTR: while(my $three_prime_exon = shift(@three_prime_exons)){ my $start = $three_prime_exon->start; my $end = $three_prime_exon->end; my $strand = $three_prime_exon->strand; #print "\n- 3'exon: $start - $end."; if($exons[$trans_end_exon{$transcript_name}]->strand == 1 and $strand == 1){ #forward strand if($start > $end){ print STDERR "\n>>$transcript_name strange 3' UTR (+) exon: ".$start." - ".$end; next THREEUTR; } if($start == (($exons[$trans_end_exon{$transcript_name}]->end)+1)){ #combine exons, adjust translation start #print "\ncombine exons, keep current translation end..."; $translation_end = (($exons[$trans_end_exon{$transcript_name}]->end - $exons[$trans_end_exon{$transcript_name}]->start) + 1); #print $translation_end."(+) "; $three_trans_end{$transcript_name} = $translation_end; $exons[$trans_end_exon{$transcript_name}]->end($end); } elsif($start > (($exons[$trans_end_exon{$transcript_name}]->end)+1)){ #additional non-coding exon #add to exon array push(@exons, $three_prime_exon); #print "\nadditional 3' non-coding exon (+)"; } else{ print STDERR "\n$transcript_name strange 3' UTR exon (+): $start - $end with 1.exons at ".$exons[$trans_end_exon{$transcript_name}]->start; next THREEUTR; } } elsif($exons[$trans_end_exon{$transcript_name}]->strand == -1 and $strand == -1){ #reverse strand if($start > $end){ print STDERR "\n>>$transcript_name strange 3' UTR (-) exon: ".$start." - ".$end; next THREEUTR; } if($end == (($exons[$trans_end_exon{$transcript_name}]->start)-1)){ #combine exons, keep translation start #print "\ncombine exons, keep current translation end...."; $translation_end = (($exons[$trans_end_exon{$transcript_name}]->end - $exons[$trans_end_exon{$transcript_name}]->start) +1); #print $translation_end."(-) "; $three_trans_end{$transcript_name} = $translation_end; $exons[$trans_end_exon{$transcript_name}]->start($start); } elsif($end < (($exons[$trans_end_exon{$transcript_name}]->start)-1)){ #additional non-coding exon #add to exon array push(@exons, $three_prime_exon); #print "\nadditional 3' non-coding exon (-)".$start." - ".$end; } else{ print STDERR "\n$transcript_name strange 3' UTR exon (-): $start - $end with 1.exons at ".$exons[$trans_end_exon{$transcript_name}]->start; next THREEUTR; } } } #print "\ntrans-exons: ".$trans_start_exon{$transcript_name}." - ".$trans_end_exon{$transcript_name}." (".scalar @exons.")"; } #add exon data to transcript $transcripts{$transcript_name} = \@exons; } #print STDERR "\nCOUNT: ".keys %transcripts; } # my $c=0; # foreach my $tt (keys %transcripts){ # print "\ntranscript $tt: "; # foreach my $ex (@{$transcripts{$tt}}){ # print "..".$ex->start." -> ".$ex->end." (".$ex->strand."), "; # } # } return (\%transcripts, \%five_trans_start, \%three_trans_end, \%trans_start_exon, \%trans_end_exon); } =head2 create_transcripts Arg [1] : hash ref from process transcripts Function : creates actually transcript objects from the arrays of exons Returntype: hash ref keyed on gene id containg an array of transcripts Exceptions: Caller : Example : =cut sub create_transcripts{ my ($transcriptsRef, $five_start, $three_end, $trans_start_exon, $trans_end_exon) = @_; my @keys = keys(%$five_start); foreach my $key(@keys){ print STDERR "have start of translation for ".$key." ".$five_start->{$key}."\n"; } my %transcripts = %$transcriptsRef; my @non_translate; my %genes; my $gene_name; my $transcript_id; foreach my $transcript(keys(%transcripts)){ my $time = time; my @exons = @{$transcripts{$transcript}}; #print STDERR "\nWorking on $transcript.(".$exons[0]->strand.") "; #get the gene-name if($transcript =~ /(\w+\.\d+)[a-z A-Z]*/){#if($transcript =~ /\w+\.\d+[a-z A-Z]*/){ $gene_name = $1;#($gene_name) = $transcript =~ /(\w+\.\d+)[a-z A-Z]*/; $transcript_id = $transcript; }else{ $gene_name = $transcript; $transcript_id = $transcript; } print STDERR "\nNote: Gene name= ".$gene_name." Transcript_id= ".$transcript_id." (for transcript ".$transcript.")"; my $transcript = new Bio::EnsEMBL::Transcript; my $translation = new Bio::EnsEMBL::Translation; my @sorted_exons; if($exons[0]->strand == 1){ @sorted_exons = sort{$a->start <=> $b->start} @exons; }else{ @sorted_exons = sort{$b->start <=> $a->start} @exons; } my $exon_count = 1; my $phase = 0; foreach my $exon (@sorted_exons){ #$exon->created($time); #$exon->modified($time); $exon->version(1); $exon->stable_id($transcript_id.".".$exon_count); $exon_count++; eval{ $transcript->add_Exon($exon); #print "adding exon ".$exon->stable_id." \n"; }; if($@){ print STDERR "\n>>$transcript_id EXON ERROR: ".$@."\n"; } } my $start_exon_ind; if(exists($trans_start_exon->{$transcript_id})){ print "\nadjusting coding exons to ".$trans_start_exon->{$transcript_id}." - ".$trans_end_exon->{$transcript_id}; $translation->start_Exon($sorted_exons[$trans_start_exon->{$transcript_id}]); $translation->end_Exon ($sorted_exons[$trans_end_exon->{$transcript_id}]); $start_exon_ind = $trans_start_exon->{$transcript_id}; } else{ print "not defined\n"; $translation->start_Exon($sorted_exons[0]); $translation->end_Exon ($sorted_exons[$#sorted_exons]); $start_exon_ind = 0; } print " creating translation for ".$transcript_id."\n"; if (!exists ($trans_start_exon->{$transcript_id})){ print "dne: no trans_start_exon for ".$transcript_id."\n"; } if (exists($five_start->{$transcript_id})){ #print "five_start->{transcript_id} is defined\n"; print "1 setting translation start on transcript ".$transcript_id." to ".$five_start->{$transcript_id}."\n"; $translation->start($five_start->{$transcript_id}); } elsif($sorted_exons[$start_exon_ind]->phase == 0) { print "case 0\n"; $translation->start(1); } elsif ($sorted_exons[$start_exon_ind]->phase == 1) { print "case 1\n"; $translation->start(3); } elsif ($sorted_exons[$start_exon_ind]->phase == 2) { print "case 2\n"; $translation->start(2); } else{ print "dies here "; die "Strange phase in $transcript_id ".$sorted_exons[0]->phase; } #print "done five prime\n"; if((!defined($translation->start)) or ($translation->start <= 0) ){ print STDERR ">> no translation start info for ".$transcript_id; print STDERR "..".$five_start->{$transcript_id}."\n"; die(); } if(exists($three_end->{$transcript_id})){ print "2 setting translation end on transcript ".$transcript_id." to ".$three_end->{$transcript_id}." (1)\n"; $translation->end($three_end->{$transcript_id}); }else{ if(defined($trans_end_exon->{$transcript_id})){ $translation->end($sorted_exons[$trans_end_exon->{$transcript_id}]->end - $sorted_exons[$trans_end_exon->{$transcript_id}]->start +1); print "3 setting translation end on transcript ".$transcript_id." to ". ($exons[$trans_end_exon->{$transcript_id}]->end - $exons[$trans_end_exon->{$transcript_id}]->start +1)." (2)\n"; } else{ $translation->end($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start +1); print "4 setting translation end on transcript ".$transcript_id." to ". ($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start +1)." (2)\n"; } } $translation->stable_id($transcript_id); $translation->version(1); $transcript->translation($translation); $transcript->version(1); $transcript->stable_id($transcript_id); if(!$genes{$gene_name}){ $genes{$gene_name} = []; push(@{$genes{$gene_name}}, $transcript); }else{ push(@{$genes{$gene_name}}, $transcript); } #print "\nstored: $gene_name / $transcript_id"; } return \%genes; } =head2 create_gene Arg [1] : array ref of Bio::EnsEMBL::Transcript Arg [2] : name to be used as stable_id Function : take an array of transcripts and create a gene Returntype: Bio::EnsEMBL::Gene Exceptions: Caller : Example : =cut sub create_gene{ my ($transcripts, $name) = @_; my $time = time; my $gene = new Bio::EnsEMBL::Gene; my $exons = $transcripts->[0]->get_all_Exons; my $analysis = $exons->[0]->analysis; $gene->analysis($analysis); $gene->biotype($analysis->logic_name); $gene->version(1); $gene->stable_id($name); foreach my $transcript(@$transcripts){ $gene->add_Transcript($transcript); } return $gene; } =head2 prune_Exons Arg [1] : Bio::EnsEMBL::Gene Function : remove duplicate exons between two transcripts Returntype: Bio::EnsEMBL::Gene Exceptions: Caller : Example : =cut sub prune_Exons { my ($gene) = @_; my @unique_Exons; # keep track of all unique exons found so far to avoid making duplicates # need to be very careful about translation->start_Exon and translation->end_Exon foreach my $tran (@{$gene->get_all_Transcripts}) { my @newexons; foreach my $exon (@{$tran->get_all_Exons}) { my $found; #always empty UNI:foreach my $uni (@unique_Exons) { if ($uni->start == $exon->start && $uni->end == $exon->end && $uni->strand == $exon->strand && $uni->phase == $exon->phase && $uni->end_phase == $exon->end_phase ) { $found = $uni; last UNI; } } if (defined($found)) { push(@newexons,$found); if ($exon == $tran->translation->start_Exon){ $tran->translation->start_Exon($found); } if ($exon == $tran->translation->end_Exon){ $tran->translation->end_Exon($found); } } else { push(@newexons,$exon); push(@unique_Exons, $exon); } } $tran->flush_Exons; foreach my $exon (@newexons) { $tran->add_Exon($exon); } } return $gene; } =head2 write_genes Arg [1] : array ref of Bio::EnsEMBL::Genes Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor Function : transforms genes into raw conti coords then writes them to the db provided Returntype: hash ref of genes keyed on clone name which wouldn't transform Exceptions: dies if a gene could't be stored Caller : Example : =cut sub write_genes{ my ($genes, $db, $stable_id_check) = @_; my $e=0; my %stable_ids; if($stable_id_check){ my $sql = 'select stable_id from gene_stable_id'; my $sth = $db->dbc->prepare($sql); $sth->execute; while(my($stable_id) = $sth->fetchrow){ $stable_ids{$stable_id} = 1; } } my %stored; GENE: foreach my $gene(@$genes){ #print STDERR "BEFORE STORAGE \n"; #&display_exons(@{$gene->get_all_Exons}); if($stable_id_check){ if($stable_ids{$gene->stable_id}){ print STDERR $gene->stable_id." already exists\n"; my $id = $gene->stable_id; $id .= '.pseudo'; $gene->stable_id($id); foreach my $transcript(@{$gene->get_all_Transcripts}){ my $trans_id = $transcript->stable_id; $trans_id .= '.pseudo'; $transcript->stable_id($trans_id); foreach my $e(@{$transcript->get_all_Exons}){ my $id = $e->stable_id; $id .= '.pseudo'; $e->stable_id($id); } } } } if($stored{$gene->stable_id}){ print STDERR "we have stored ".$gene->stable_id." already\n"; next GENE; } my $gene_adaptor = $db->get_GeneAdaptor; eval{ $stored{$gene->stable_id} = 1; $gene_adaptor->store($gene); $e++; }; if($@){ die "couldn't store ".$gene->stable_id." problems ".$@; } } print "\nStored gene: ".$e."\n"; } =head2 translation_check Arg [1] : Bio::EnsEMBL::Gene Function : checks if the gene translates Returntype: Bio::EnsEMBL::Gene if translates undef if doesn't' Exceptions: Caller : Example : =cut sub translation_check{ my ($gene, $db) = @_; my @transcripts = @{$gene->get_all_Transcripts}; foreach my $t (@transcripts){ my $pep = $t->translate->seq; if($pep =~ /\*/g){ if($gene->stable_id eq 'C06G3.7' and $db){ #add Selenocysteine to translation. There seems to be only on Selenoc. in our worm... my $pos = pos($pep); print STDERR "transcript ".$t->stable_id." doesn't translate. Adding Selenocystein at position $pos.\n". "Please beware of problems during further analysis steps because of this."; selenocysteine($t, $pos, $db); } else{ print STDERR "transcript ".$t->stable_id." doesn't translate\n"; print STDERR "translation start ".$t->translation->start." end ".$t->translation->end."\n"; print STDERR "start exon coords ".$t->translation->start_Exon->start." ".$t->translation->start_Exon->end."\n"; print STDERR "end exon coords ".$t->translation->end_Exon->start." ".$t->translation->end_Exon->end."\n"; print STDERR "peptide ".$pep."\n"; &display_exons(@{$t->get_all_Exons}); &non_translate($t); return undef; } } } return $gene; } =head2 selenocysteine Arg [1] : transcript object to be modified Arg [2] : position (integer) in transcripts sequence to be modified Arg [3] : Bio::EnsEMBL::DBSQL::DBAdaptor Function : modifiy transcript/translation by adding a seleocysteine residue Returntype: Exceptions: Caller : Example : =cut sub selenocysteine{ my ($transcript, $pos, $db) = @_; print "\nmodifying ".$transcript->stable_id; my $seq_edit = Bio::EnsEMBL::SeqEdit->new( -CODE => '_selenocysteine', -NAME => 'Selenocysteine', -DESC => 'Selenocysteine', -START => $pos, -END => $pos, -ALT_SEQ => 'U' #the one-letter symbol for selenocysteine ); my $attribute = $seq_edit->get_Attribute(); my $translation = $transcript->translation(); my $attribute_adaptor = $db->get_AttributeAdaptor(); $attribute_adaptor->store_on_Translation($translation, [$attribute]); } =head2 insert_agp_line Arg [1] : the first 12 args are info for the assembly table Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor pointing to db where you want the assembly loaded Function : load the provided info into the assembly table Returntype: Exceptions: Caller : Example : =cut sub insert_agp_line{ my ($chr_id, $chr_start, $chr_end, $contig, $contig_start, $contig_end, $contig_ori, $db) = @_; if(!$contig){ #print STDERR "trying to insert into ".$chr_id." ".$chr_start." ".$chr_end."\n"; die "contig id must be defined for this to work\n"; } my $sql = "insert into assembly(asm_seq_region_id, asm_start, asm_end, cmp_seq_region_id, cmp_start, cmp_end, ori) values(?, ?, ?, ?, ?, ?, ?)"; my $sth = $db->dbc->prepare($sql); $sth->execute($chr_id, $chr_start, $chr_end, $contig, $contig_start, $contig_end, $contig_ori); } =head2 display_exons Arg [1] : array of Bio::EnsEMBL::Exons Function : displays the array of exons provided for debug purposes put here for safe keeping Returntype: Exceptions: Caller : Example : =cut sub display_exons{ my (@exons) = @_; @exons = sort{$a->start <=> $b->start || $a->end <=> $b->end} @exons if($exons[0]->strand == 1); @exons = sort{$b->start <=> $a->start || $b->end <=> $a->end} @exons if($exons[0]->strand == -1); foreach my $e(@exons){ print STDERR $e->stable_id."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n"; } } =head2 non_translate Arg [1] : array of Bio::EnsEMBL::Transcripts Function : displays the three frame translation of each exon here for safe keeping and debug purposes Returntype: Exceptions: Caller : Example : =cut sub non_translate{ my (@transcripts) = @_; foreach my $t(@transcripts){ my @exons = @{$t->get_all_Exons}; # print "transcript sequence :\n".$t->seq."\n"; foreach my $e(@exons){ print "exon ".$e->stable_id." ".$e->start." ".$e->end." ".$e->strand."\n"; my $seq = $e->seq; my $pep0 = $seq->translate('*', 'X', 0); my $pep1 = $seq->translate('*', 'X', 1); my $pep2 = $seq->translate('*', 'X', 2); print "exon sequence :\n".$e->seq->seq."\n\n"; print $e->seqname." ".$e->start." : ".$e->end." translation in 0 frame\n ".$pep0->seq."\n\n"; print $e->seqname." ".$e->start." : ".$e->end." translation in 1 phase\n ".$pep2->seq."\n\n"; print $e->seqname." ".$e->start." : ".$e->end." translation in 2 phase\n ".$pep1->seq."\n\n"; print "\n\n"; } } } =head2 parse_operons Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Function : parse operon information Returntype: Exceptions: Caller : Example : =cut sub parse_operons{ my ($file, $seq, $analysis) = @_; #print STDERR "opening ".$file."\n"; open(FH, $file) or die"couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); #CHROMOSOME_I operon transcription 13758388 13764178 . - . Operon "CEOP1716" my @operons; LINE: while(<FH>){ chomp; my($type, $start, $end, $strand, $id) = (split)[1,3,4,6,9]; if($type ne 'operon'){ next LINE; } if($strand eq '-'){ $strand = -1; }else{ $strand = 1; } my $simple_feature = Bio::EnsEMBL::SimpleFeature->new(); $simple_feature->start($start); $simple_feature->strand($strand); $simple_feature->end($end); $id =~ s/\"//g; $simple_feature->display_label($id); $simple_feature->slice($seq); $simple_feature->analysis($analysis); push(@operons, $simple_feature); } return \@operons ; } sub parse_rnai{ my ($file, $seq, $analysis) = @_; #print STDERR "opening ".$file."\n"; open(FH, $file) or die "couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); print "ONLY FETCHES RNAi_PRIMARY\n"; my @operons; LINE: while(<FH>){ #CHROMOSOME_IV RNAi experimental 3195031 3196094 . + . RNAi "JA:Y67D8A_375.b" #CHROMOSOME_I RNAi_primary RNAi_reagent 15040052 15041000 . . . Target "RNAi:WBRNAi00027818" 1371 423 #CHROMOSOME_I RNAi_secondary RNAi_reagent 15040723 15040982 . . . Target "RNAi:WBRNAi00027816" 475 216 #print; chomp; my @values = split; if($values[1] ne 'cDNA_for_RNAi'){ next LINE; } my ($start, $end, $strand, $id, $count); $count = 0; # if($values[2] ne 'experimental'){ # #print "have no experimental tag\n"; # $start = $values[2]; # $end = $values[3]; # $strand = $values[5]; # $id = $values[8]; # }else{ $start = $values[3]; $end = $values[4]; $strand = $values[6]; $id = $values[9]; # } # print $_."\n"; #print "have ".$start." ".$end." ".$strand." ".$id."\n"; if($strand eq '+'){ $strand = 1; }else{ $strand = -1; } $id =~ s/\"//g; my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis); push(@operons, $simple_feature); } return \@operons ; } =head2 parse_operons Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Function : parse expression information Returntype: Exceptions: Caller : Example : =cut sub parse_expr{ my ($file, $seq, $analysis) = @_; #print STDERR "opening ".$file."\n"; open(FH, $file) or die"couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); my @operons; LINE: while(<FH>){ #CHROMOSOME_IV RNAi experimental 3195031 3196094 . + . RNAi "JA:Y67D8A_375.b" #print; chomp; my @values = split; if($values[1] ne 'Expr_profile'){ next LINE; } my ($start, $end, $strand, $id, $count); $count = 0; $start = $values[3]; $end = $values[4]; $strand = $values[6]; $id = $values[9]; #print $_."\n"; #print "have ".$start." ".$end." ".$strand." ".$id."\n"; if($strand eq '+'){ $strand = 1; }else{ $strand = -1; } $id =~ s/\"//g; my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis); push(@operons, $simple_feature); } return \@operons ; } sub parse_SL1{ my ($file, $seq, $analysis) = @_; #print STDERR "opening ".$file."\n"; open(FH, $file) or die"couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); my @operons; LINE: while(<FH>){ #CHROMOSOME_I SL1 trans-splice_acceptor 6473786 6473787 . - . Note "SL1 trans-splice site; see yk719h1.5" #print; chomp; my @values = split; if($values[1] ne 'SL1'){ next LINE; } if($_ =~ /personal\s+communication/){ print STDERR "Can't put information in table about ".$_."\n"; next LINE; } my ($start, $end, $strand, $id, $count); $count = 0; $start = $values[3]; $end = $values[4]; $strand = $values[6]; $id = $values[14]; if(!$id){ $id = $values[13]; } if(($id =~ /cDNA/) || ($id =~ /EST/)){ $id = $values[15]; } if($id eq '"'){ $id = $values[13]; } if((!$id) || ($id eq '"')){ $id = $values[12]; } #print $_."\n"; #print "have ".$start." ".$end." ".$strand." ".$id."\n"; if($strand eq '+'){ $strand = 1; }else{ $strand = -1; } #print "have id ".$id."\n"; if((!$id) || ($id eq '.') || ($id =~ /EST/) || ($id =~ /cDNA/) || ($id eq '"')){ #foreach my $v(@values){ # print $count." ".$v."\n"; # $count++; #} print "line ".$_." produced a weird id ".$id."\n"; next LINE } $id =~ s/\"//g; my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis); push(@operons, $simple_feature); } return \@operons; } sub parse_SL2{ my ($file, $seq, $analysis) = @_; #print STDERR "opening ".$file."\n"; open(FH, $file) or die"couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); my @operons; LINE: while(<FH>){ #CHROMOSOME_I SL2 trans-splice_acceptor 6473786 6473787 . - . Note "SL2 trans-splice site; see yk729h2.5" #print; chomp; my @values = split; if($values[1] ne 'SL2'){ next LINE; } if($_ =~ /personal\s+communication/){ print STDERR "Can't put information in table about ".$_."\n"; next LINE; } my ($start, $end, $strand, $id, $count); $count = 0; $start = $values[3]; $end = $values[4]; $strand = $values[6]; $id = $values[14]; if(!$id){ $id = $values[13]; } if(($id =~ /cDNA/) || ($id =~ /EST/)){ $id = $values[15]; } if($id eq '"'){ $id = $values[13]; if($id eq 'ESTyk1004g06.5'){ $id =~ s/EST//g; } } if((!$id) || ($id eq '"')){ $id = $values[12]; } #print $_."\n"; #print "have ".$start." ".$end." ".$strand." ".$id."\n"; if($strand eq '+'){ $strand = 1; }else{ $strand = -1; } #print "have id ".$id."\n"; if((!$id) || ($id eq '.') || ($id =~ /EST/) || ($id =~ /cDNA/) || ($id eq '"')){ #foreach my $v(@values){ # print $count." ".$v."\n"; # $count++; #} print "line ".$_." produced a weird id ".$id."\n"; next LINE } $id =~ s/\"//g; my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis); push(@operons, $simple_feature); } return \@operons ; } sub create_simple_feature{ my ($start, $end, $strand, $id, $seq, $analysis) = @_; #warn "first: $start, $end, $strand, $id..."; my $simple_feature = Bio::EnsEMBL::SimpleFeature->new(); $simple_feature->start($start); $simple_feature->strand($strand); $simple_feature->end($end); $simple_feature->display_label($id); $simple_feature->slice($seq); $simple_feature->analysis($analysis); return $simple_feature; } sub write_simple_features{ my ($operons, $db) = @_; eval{ print "\n check 1: ".$$operons[0]->display_label }; eval{ print "\n check 2: ".$$operons[0]->start." - ".$$operons[0]->end."\n" }; my $operon_adaptor = $db->get_SimpleFeatureAdaptor; eval{ $operon_adaptor->store(@$operons); }; if($@){ die "couldn't store simple features problems ".$@; } } #not used at this time... sub parse_tRNA_genes{ my $type = "tRNA"; my ($file, $seq, $analysis) = @_; &parse_pseudo_files($file, $seq, $analysis, $type); } sub parse_rRNA_genes{ my $type = "rRNA"; my ($file, $seq, $analysis) = @_; &parse_pseudo_files($file, $seq, $analysis, $type); } =head2 parse_pseudo_gff Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Function : parse pseudogene information Returntype: Exceptions: Caller : Example : =cut sub parse_pseudo_gff{ my $type = "Pseudogene"; my ($file, $seq, $analysis) = @_; &parse_pseudo_files($file, $seq, $analysis, $type); } =head2 parse_pseudo_files Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Arg [4] : type of feature to be parsed Function : parse specific feature information from gff file Returntype: Exceptions: Caller : Example : =cut sub parse_pseudo_files{ my ($file, $seq, $analysis, $types) = @_; open(FH, $file) or die "couldn't open ".$file." $!"; die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") || $seq->isa("Bio::Seq") || $seq->isa("Bio::PrimarySeqI")); my @genes; my ($transcripts) = &process_pseudo_files(\*FH, $types); print "there are ".keys(%$transcripts)." distinct special transcripts\n"; my ($processed_transcripts) = &process_pseudo_transcripts($transcripts, $seq, $analysis); print "there are ".keys(%$processed_transcripts)." processed special transcripts\n"; my $genes = undef; $genes = &create_pseudo_transcripts($processed_transcripts); print "PARSE GFF there are ".keys(%$genes)." special genes\n"; foreach my $gene_id(keys(%$genes)){ my $transcripts = $genes->{$gene_id}; my $gene = &create_gene($transcripts, $gene_id); push(@genes, $gene); } close(FH); return \@genes; } #generic version sub process_pseudo_files{ my ($fh, $types) = @_; my %transcripts; LOOP: while(<$fh>){ chomp; if(/^##/){ next LOOP; } my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split; my $element = $_; if($chr =~ /sequence-region/){ #print STDERR $_; next LOOP; } if(!$status && !$type){ print "status and type no defined skipping\n"; next LOOP; } my $line = $status." ".$type; #print "\n".$types." exon (".$line.")\n"; if($line ne $types.' exon'){ #print "ignoring ".$line."\n"; next LOOP; } $gene =~ s/\"//g; # print "gene ".$gene."\n"; if(!$transcripts{$gene}){ $transcripts{$gene} = []; push(@{$transcripts{$gene}}, $element); }else{ push(@{$transcripts{$gene}}, $element); } } return \%transcripts; } sub process_pseudo_transcripts{ my ($transcripts, $slice, $analysis) = @_; my %genes; my %transcripts = %$transcripts; my @names = keys(%transcripts); #print STDERR "PROCESSING TRANSCRIPTS \n"; foreach my $name(@names){ my @lines = @{$transcripts{$name}}; $transcripts{$name} = []; my @exons; foreach my $line(@lines){ #print STDERR $line."\n"; my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line; $chr =~ s/CHROMOSOME_//; if($start == $end){ next; } my $exon = new Bio::EnsEMBL::Exon; if($frame eq '.'){ $frame = 0; } my $phase = (3 - $frame)%3; # wormbase gff cotains frame which is effectively the opposite of phase # for a good explaination of phase see the Bio::EnsEMBL::Exon documentation #print STDERR "phase calculated to be ".$phase."\n"; $exon->start($start); $exon->end($end); $exon->analysis($analysis); $exon->slice($slice); $exon->phase($phase); my $end_phase = ($phase + ($exon->end-$exon->start) + 1)%3; #print STDERR "end phase calculated to be ".$end_phase."\n"; $exon->end_phase($end_phase); if($strand eq '+'){ $exon->strand(1); }else{ $exon->strand(-1); } #$exon->score(100); push(@exons, $exon); } if($exons[0]->strand == -1){ @exons = sort{$b->start <=> $a->start} @exons; }else{ @exons = sort{$a->start <=> $b->start} @exons; } my $phase = 0; foreach my $e(@exons){ push(@{$transcripts{$name}}, $e); } } return (\%transcripts); } sub create_pseudo_transcripts{ my ($transcripts) = @_; my %transcripts = %$transcripts; my %genes; my $gene_name; my $transcript_id; foreach my $transcript(keys(%transcripts)){ my $time = time; my @exons = @{$transcripts{$transcript}}; if($transcript =~ /\w+\.\d+[a-z A-Z]/){ ($gene_name) = $transcript =~ /(\w+\.\d+)[a-z A-Z]/; $transcript_id = $transcript; }else{ $gene_name = $transcript; $transcript_id = $transcript; } my $transcript = new Bio::EnsEMBL::Transcript; my @sorted_exons; if($exons[0]->strand == 1){ @sorted_exons = sort{$a->start <=> $b->start} @exons }else{ @sorted_exons = sort{$b->start <=> $a->start} @exons } my $exon_count = 1; my $phase = 0; foreach my $exon(@sorted_exons){ $exon->version(1); $exon->stable_id($transcript_id.".".$exon_count); $exon_count++; $transcript->add_Exon($exon); } $transcript->version(1); $transcript->stable_id($transcript_id); if(!$genes{$gene_name}){ $genes{$gene_name} = []; push(@{$genes{$gene_name}}, $transcript); }else{ push(@{$genes{$gene_name}}, $transcript); } } return \%genes; } sub store_coord_system{ my ($db, $name, $version, $sequence_level, $default, $rank) = @_; my $csa = $db->get_CoordSystemAdaptor(); my $cs = Bio::EnsEMBL::CoordSystem->new ( -NAME => $name, -VERSION => $version, -DEFAULT => $default, -SEQUENCE_LEVEL => $sequence_level, #-TOP_LEVEL => $top_level, -RANK => $rank ); $csa->store($cs); return $cs; } sub store_slice{ my ($db, $name, $start, $end, $strand, $coord_system, $sequence) = @_; my $sa = $db->get_SliceAdaptor(); my $slice = Bio::EnsEMBL::Slice->new (-seq_region_name => $name, -start => $start, -end => $end, -strand => $strand, -coord_system => $coord_system); my $seq_ref; if($sequence){ $seq_ref = \$sequence; } $sa->store($slice, $seq_ref); $slice->adaptor($sa); return $slice; } 1;