ensembl-pipeline
WormBase
Toolbar
Summary
WormBase
Package variables
No package variables defined.
Included modules
Inherit
Exporter
Synopsis
the methods are all used by wormbase_to_ensembl.pl script which should be in the same directory
Description
parse gff and agp files from the wormbase database for caenorhabditis elegans to create ensembl database.
Methods
Methods description
Arg [1] : filehandle to an agp file Arg [2] : chromosome id from chromsome table Arg [3] : agp type for assembly table Function : parses an agp file into a suitable format for inserting into the assembly table Returntype: hash ref, keyed on contig id, each hash element is itself a hash which contains all the info needed for the assembly table Exceptions: dies if the same contig id is found twice Caller : Example : |
Arg [1] : array ref of Bio::EnsEMBL::Transcript Arg [2] : name to be used as stable_id Function : take an array of transcripts and create a gene Returntype: Bio::EnsEMBL::Gene Exceptions: Caller : Example : |
Arg [1] : hash ref from process transcripts Function : creates actually transcript objects from the arrays of exons Returntype: hash ref keyed on gene id containg an array of transcripts Exceptions: Caller : Example : |
Arg [1] : array of Bio::EnsEMBL::Exons Function : displays the array of exons provided for debug purposes put here for safe keeping Returntype: Exceptions: Caller : Example : |
Arg [1] : hash ref (as returned by process_file, containing information about the CDS) Arg [2] : Bio::EnsEMBL::Slice Arg [3] : Bio::EnsEMBL::Analysis Arg [4] : ref to hash of hashes (as returned by process_file, containing information about the 5' UTR regions Arg [5] : ref to hash of hashes (as returned by process_file, containing information about the 3' UTR regions Function : takes line representing a transcript and creates an exon for each one Returntype: hash ref hash keyed on transcript id containing an array of exons Exceptions: Caller : Example : |
Arg [1] : filehandle to an agp file Function : retrives sequence ids from an agp file Returntype: array ref of seq ids and array ref of ides which don't fit the format' Exceptions: non Caller : Example : &get_seq_ids($fh); |
Arg [1] : array ref of sequence ids which will be recognised by pfetch Arg [2] : a Bio::EnsEMBL::Pipeline::Seqfetcher::Pfetch object Function : gets sequences for the ids passed to it using pfetch Returntype: hash keyed on seq id containing Bio::Seq object Exceptions: throws if seqfetcher passed to it isn't pfetch' Caller : Example : %get_sequences_pfetch($seq_ids, $seqfetcher); |
Arg [1] : the first 12 args are info for the assembly table Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor pointing to db where you want the assembly loaded Function : load the provided info into the assembly table Returntype: Exceptions: Caller : Example : |
Arg [1] : array of Bio::EnsEMBL::Transcripts Function : displays the three frame translation of each exon here for safe keeping and debug purposes Returntype: Exceptions: Caller : Example : |
Arg [1] : filename of gff file Arg [2] : Bio::Seq object Arg [3] : Bio::EnsEMBL::Analysis object Function : parses gff file given into genes Returntype: array ref of Bio::EnEMBL::Genes Exceptions: dies if can't open file or seq isn't a Bio::Seq Caller : Example : |
Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Function : parse expression information Returntype: Exceptions: Caller : Example : |
Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Arg [4] : type of feature to be parsed Function : parse specific feature information from gff file Returntype: Exceptions: Caller : Example : |
Arg [1] : gff file path Arg [2] : Bio:Seq object Arg [3] : analysis object Function : parse pseudogene information Returntype: Exceptions: Caller : Example : |
Arg [1] : filehandle pointing to a gff file Function : parses out lines for exons Returntype: hash keyed on transcript id each containig array of lines for that transcript and two hashes of hashes with arrays containing 5 prime and 3 prime UTR lines Exceptions: Caller : Example : |
Arg [1] : Bio::EnsEMBL::Gene Function : remove duplicate exons between two transcripts Returntype: Bio::EnsEMBL::Gene Exceptions: Caller : Example : |
Arg [1] : transcript object to be modified Arg [2] : position (integer) in transcripts sequence to be modified Arg [3] : Bio::EnsEMBL::DBSQL::DBAdaptor Function : modifiy transcript/translation by adding a seleocysteine residue Returntype: Exceptions: Caller : Example : |
Arg [1] : Bio::EnsEMBL::Gene Function : checks if the gene translates Returntype: Bio::EnsEMBL::Gene if translates undef if doesn't' Exceptions: Caller : Example : |
Arg [1] : array ref of Bio::EnsEMBL::Genes Arg [2] : Bio::EnsEMBL::DBSQL::DBAdaptor Function : transforms genes into raw conti coords then writes them to the db provided Returntype: hash ref of genes keyed on clone name which wouldn't transform Exceptions: dies if a gene could't be stored Caller : Example : |
Methods code
sub agp_parse
{ my ($fh, $chr_id, $agp_type) = @_;
my $chr_hash = {};
while(<$fh>){
chomp;
my ($chr, $chr_start, $chr_end, $gap, $contig, $raw_start, $raw_end, $raw_ori) =
(split)[0, 1, 2, 4, 5, 6, 7, 8];
if(!$contig =~ /\S+\.\d+/){
next;
}
if($contig =~ /\S+\.$/){
next;
}
if ($raw_ori eq '+') {
$raw_ori = 1;
}
elsif ($raw_ori eq '-') {
$raw_ori = -1;
}
else {
$raw_ori = 1;
}
if($chr_hash->{$contig}){
die "contig ".$contig." has been found twice something odd is going on\n";
}else{
$chr_hash->{$contig} = { 'chromosome_id' => $chr_id,
'chr_start' => $chr_start,
'chr_end' => $chr_end,
'superctg_name' => $chr,
'superctg_start' => $chr_start,
'superctg_end' => $chr_end,
'superctg_ori' => 1,
'contig_start' => $raw_start,
'contig_end' => $raw_end,
'contig_ori' => $raw_ori,
'type' => $agp_type,
}
}
}
return $chr_hash; } |
sub create_gene
{ my ($transcripts, $name) = @_;
my $time = time;
my $gene = new Bio::EnsEMBL::Gene;
my $exons = $transcripts->[0]->get_all_Exons;
my $analysis = $exons->[0]->analysis;
$gene->analysis($analysis);
$gene->biotype($analysis->logic_name);
$gene->version(1);
$gene->stable_id($name);
foreach my $transcript(@$transcripts){
$gene->add_Transcript($transcript);
}
return $gene; } |
sub create_pseudo_transcripts
{ my ($transcripts) = @_;
my %transcripts = %$transcripts;
my %genes;
my $gene_name;
my $transcript_id;
foreach my $transcript(keys(%transcripts)){
my $time = time;
my @exons = @{$transcripts{$transcript}};
if($transcript =~ /\w+\.\d+[a-z A-Z]/){
($gene_name) = $transcript =~ /(\w+\.\d+)[a-z A-Z]/;
$transcript_id = $transcript;
}else{
$gene_name = $transcript;
$transcript_id = $transcript;
}
my $transcript = new Bio::EnsEMBL::Transcript;
my @sorted_exons;
if($exons[0]->strand == 1){
@sorted_exons = sort{$a->start <=> $b->start} @exons
}else{
@sorted_exons = sort{$b->start <=> $a->start} @exons
}
my $exon_count = 1;
my $phase = 0;
foreach my $exon(@sorted_exons){
$exon->version(1);
$exon->stable_id($transcript_id.".".$exon_count);
$exon_count++;
$transcript->add_Exon($exon);
}
$transcript->version(1);
$transcript->stable_id($transcript_id);
if(!$genes{$gene_name}){
$genes{$gene_name} = [];
push(@{$genes{$gene_name}}, $transcript);
}else{
push(@{$genes{$gene_name}}, $transcript);
}
}
return\% genes; } |
sub create_simple_feature
{ my ($start, $end, $strand, $id, $seq, $analysis) = @_;
my $simple_feature = Bio::EnsEMBL::SimpleFeature->new();
$simple_feature->start($start);
$simple_feature->strand($strand);
$simple_feature->end($end);
$simple_feature->display_label($id);
$simple_feature->slice($seq);
$simple_feature->analysis($analysis);
return $simple_feature; } |
sub create_transcripts
{ my ($transcriptsRef, $five_start, $three_end, $trans_start_exon, $trans_end_exon) = @_;
my @keys = keys(%$five_start);
foreach my $key(@keys){
print STDERR "have start of translation for ".$key." ".$five_start->{$key}."\n";
}
my %transcripts = %$transcriptsRef;
my @non_translate;
my %genes;
my $gene_name;
my $transcript_id;
foreach my $transcript(keys(%transcripts)){
my $time = time;
my @exons = @{$transcripts{$transcript}};
if($transcript =~ /(\w+\.\d+)[a-z A-Z]*/){ $gene_name = $1; $transcript_id = $transcript;
}else{
$gene_name = $transcript;
$transcript_id = $transcript;
}
print STDERR "\nNote: Gene name= ".$gene_name." Transcript_id= ".$transcript_id." (for transcript ".$transcript.")";
my $transcript = new Bio::EnsEMBL::Transcript;
my $translation = new Bio::EnsEMBL::Translation;
my @sorted_exons;
if($exons[0]->strand == 1){
@sorted_exons = sort{$a->start <=> $b->start} @exons;
}else{
@sorted_exons = sort{$b->start <=> $a->start} @exons;
}
my $exon_count = 1;
my $phase = 0;
foreach my $exon (@sorted_exons){
$exon->version(1);
$exon->stable_id($transcript_id.".".$exon_count);
$exon_count++;
eval{
$transcript->add_Exon($exon);
};
if($@){ print STDERR "\n>>$transcript_id EXON ERROR: ".$@."\n"; }
}
my $start_exon_ind;
if(exists($trans_start_exon->{$transcript_id})){
print "\nadjusting coding exons to ".$trans_start_exon->{$transcript_id}." - ".$trans_end_exon->{$transcript_id};
$translation->start_Exon($sorted_exons[$trans_start_exon->{$transcript_id}]);
$translation->end_Exon ($sorted_exons[$trans_end_exon->{$transcript_id}]);
$start_exon_ind = $trans_start_exon->{$transcript_id};
}
else{
print "not defined\n";
$translation->start_Exon($sorted_exons[0]);
$translation->end_Exon ($sorted_exons[$#sorted_exons]);
$start_exon_ind = 0;
}
print " creating translation for ".$transcript_id."\n";
if (!exists ($trans_start_exon->{$transcript_id})){
print "dne: no trans_start_exon for ".$transcript_id."\n";
}
if (exists($five_start->{$transcript_id})){
print "1 setting translation start on transcript ".$transcript_id." to ".$five_start->{$transcript_id}."\n";
$translation->start($five_start->{$transcript_id});
}
elsif($sorted_exons[$start_exon_ind]->phase == 0) {
print "case 0\n";
$translation->start(1);
}
elsif ($sorted_exons[$start_exon_ind]->phase == 1) {
print "case 1\n";
$translation->start(3);
}
elsif ($sorted_exons[$start_exon_ind]->phase == 2) {
print "case 2\n";
$translation->start(2);
}
else{
print "dies here ";
die "Strange phase in $transcript_id ".$sorted_exons[0]->phase;
}
if((!defined($translation->start)) or ($translation->start <= 0) ){
print STDERR ">> no translation start info for ".$transcript_id;
print STDERR "..".$five_start->{$transcript_id}."\n";
die();
}
if(exists($three_end->{$transcript_id})){
print "2 setting translation end on transcript ".$transcript_id." to ".$three_end->{$transcript_id}." (1)\n";
$translation->end($three_end->{$transcript_id});
}else{
if(defined($trans_end_exon->{$transcript_id})){
$translation->end($sorted_exons[$trans_end_exon->{$transcript_id}]->end - $sorted_exons[$trans_end_exon->{$transcript_id}]->start +1);
print "3 setting translation end on transcript ".$transcript_id." to ".
($exons[$trans_end_exon->{$transcript_id}]->end - $exons[$trans_end_exon->{$transcript_id}]->start +1)." (2)\n";
}
else{
$translation->end($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start +1);
print "4 setting translation end on transcript ".$transcript_id." to ".
($sorted_exons[$#sorted_exons]->end - $sorted_exons[$#sorted_exons]->start +1)." (2)\n";
}
}
$translation->stable_id($transcript_id);
$translation->version(1);
$transcript->translation($translation);
$transcript->version(1);
$transcript->stable_id($transcript_id);
if(!$genes{$gene_name}){
$genes{$gene_name} = [];
push(@{$genes{$gene_name}}, $transcript);
}else{
push(@{$genes{$gene_name}}, $transcript);
}
}
return\% genes; } |
sub display_exons
{ my (@exons) = @_;
@exons = sort{$a->start <=> $b->start || $a->end <=> $b->end} @exons if($exons[0]->strand == 1);
@exons = sort{$b->start <=> $a->start || $b->end <=> $a->end} @exons if($exons[0]->strand == -1);
foreach my $e(@exons){
print STDERR $e->stable_id."\t ".$e->start."\t ".$e->end."\t ".$e->strand."\t ".$e->phase."\t ".$e->end_phase."\n";
} } |
sub generate_transcripts
{ my ($genesRef, $slice, $analysis, $five_prime, $three_prime) = @_;
my %genes;
my %transcripts;
my %temp_transcripts;
my %five_trans_start;
my %three_trans_end;
my %trans_start_exon;
my %trans_end_exon;
my $translation_end;
my $genecount = 0;
my @global_exons;
my %overlapcheck;
use Bio::EnsEMBL::Pipeline::Tools::ExonUtils;
GENE: foreach my $gene_name(keys(%$genesRef)){
$genes{$gene_name} = [];
my $transcriptcount = 0;
%temp_transcripts = ();
my @lines = @{$$genesRef{$gene_name}}; my @global_exons = ();
my %three_prime_exons;
my %five_prime_exons;
foreach my $line(@lines){
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line;
$chr =~ s/CHROMOSOME_//;
if($start == $end){
next;
}
my $exon = new Bio::EnsEMBL::Exon;
my $phase = (3 - $frame)%3; $exon->start($start);
$exon->end($end);
$exon->analysis($analysis);
$exon->slice($slice);
$exon->phase($phase);
my $end_phase = ($phase + ($exon->end-$exon->start) + 1)%3;
$exon->end_phase($end_phase);
if($strand eq '+'){
$exon->strand(1);
}else{
$exon->strand(-1);
}
push(@global_exons, $exon);
}
if($global_exons[0]->strand == -1){
@global_exons = sort{$b->start <=> $a->start} @global_exons;
}else{
@global_exons = sort{$a->start <=> $b->start} @global_exons;
}
if(!defined($$five_prime{$gene_name}) and !defined($$three_prime{$gene_name})){
$transcripts{$gene_name} =\@ global_exons;
next GENE;
}
foreach my $transcript_name ( keys(%{$$five_prime{$gene_name}}) ){
my @five_prime_exons = ();
%overlapcheck = ();
$temp_transcripts{$transcript_name} = 1;
foreach my $line(@{$$five_prime{$gene_name}{$transcript_name}}){
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split(/\s+/, $line);
if(defined $overlapcheck{$start}){
next;
}
$overlapcheck{$start} = 1;
my $exon = new Bio::EnsEMBL::Exon;
my $phase = -1;
$exon->start($start);
$exon->end($end);
$exon->analysis($analysis);
$exon->slice($slice);
$exon->phase($phase);
my $end_phase = -1;
$exon->end_phase($end_phase);
if($strand eq '+'){
$exon->strand(1);
}else{
$exon->strand(-1);
}
push(@five_prime_exons, $exon);
}
if($five_prime_exons[0]->strand == -1){
@five_prime_exons = sort{$b->start <=> $a->start} @five_prime_exons;
}else{
@five_prime_exons = sort{$a->start <=> $b->start} @five_prime_exons;
}
$five_prime_exons{$transcript_name} =\@ five_prime_exons;
}
foreach my $transcript_name ( keys(%{$$three_prime{$gene_name}}) ){
my @three_prime_exons = ();
%overlapcheck = ();
$temp_transcripts{$transcript_name} = 1;
foreach my $line(@{$$three_prime{$gene_name}{$transcript_name}}){
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line;
if(defined $overlapcheck{$start}){
next;
}
$overlapcheck{$start} = 1;
my $exon = new Bio::EnsEMBL::Exon;
my $phase = -1;
$exon->start($start);
$exon->end($end);
$exon->analysis($analysis);
$exon->slice($slice);
$exon->phase($phase);
my $end_phase = -1;
$exon->end_phase($end_phase);
if($strand eq '+'){
$exon->strand(1);
}else{
$exon->strand(-1);
}
push(@three_prime_exons, $exon);
}
if($three_prime_exons[0]->strand == -1){
@three_prime_exons = sort{$b->start <=> $a->start} @three_prime_exons;
}else{
@three_prime_exons = sort{$a->start <=> $b->start} @three_prime_exons;
}
$three_prime_exons{$transcript_name} =\@ three_prime_exons;
}
foreach my $transcript_name (keys %temp_transcripts){
print "transcript $transcript_name\n";
$transcriptcount++;
my @exons = ();
foreach my $temp_exon (@global_exons){
push(@exons, Bio::EnsEMBL::Pipeline::Tools::ExonUtils->_clone_Exon($temp_exon));
}
my $translation_start = 1;
my $first = 1;
$trans_start_exon{$transcript_name} = 0;
$trans_end_exon{$transcript_name} = $#exons;
print "\ntrans-exons: ".$trans_start_exon{$transcript_name}." - ".$trans_end_exon{$transcript_name}." (".scalar @exons.")";
if(defined($five_prime_exons{$transcript_name})){
my @five_prime_exons = @{$five_prime_exons{$transcript_name}};
print "\nworking on 5' of $transcript_name (".scalar @five_prime_exons.") ";
FIVEUTR: while(my $five_prime_exon = shift(@five_prime_exons)){
my $start = $five_prime_exon->start;
my $end = $five_prime_exon->end;
my $strand = $five_prime_exon->strand;
if($exons[$trans_start_exon{$transcript_name}]->strand == 1 and $strand == 1){
if($start > $end){
next FIVEUTR;
}
if($end == ($exons[$trans_start_exon{$transcript_name}]->start)-1){
$translation_start = $exons[$trans_start_exon{$transcript_name}]->start - $start + 1;
$five_trans_start{$transcript_name} = $translation_start;
$exons[$trans_start_exon{$transcript_name}]->start($start);
}
elsif($end < $exons[$trans_start_exon{$transcript_name}]->start -1){
$trans_start_exon{$transcript_name}++;
$trans_end_exon{$transcript_name}++;
unshift(@exons, $five_prime_exon);
print "\nadditional non-coding exon (+) ".$start." - ".$end;
}
else{
print STDERR "\n>>$transcript_name strange 5' UTR exon (+): $start - $end with 1.exons at ".
$exons[$trans_start_exon{$transcript_name}]->start.
" - ".$exons[$trans_start_exon{$transcript_name}]->end;
next FIVEUTR;
}
}
elsif($exons[$trans_start_exon{$transcript_name}]->strand == -1 and $strand == -1){
if($start > $end){
next FIVEUTR;
}
if($start == ($exons[$trans_start_exon{$transcript_name}]->end)+1){
$translation_start = ($end - $exons[$trans_start_exon{$transcript_name}]->end + 1);
$five_trans_start{$transcript_name} = $translation_start;
$exons[$trans_start_exon{$transcript_name}]->end($end);
}
elsif($start > ($exons[$trans_start_exon{$transcript_name}]->end)+1){
$trans_start_exon{$transcript_name}++;
$trans_end_exon{$transcript_name}++;
unshift(@exons, $five_prime_exon);
}
else{
print "\n>>$transcript_name strange 5' UTR exon (-): $start - $end with 1.exons at ".
$exons[$trans_start_exon{$transcript_name}]->start.
" - ".$exons[$trans_start_exon{$transcript_name}]->end;
next FIVEUTR;
}
}
else{
print STDERR "\n>>strand switch in UTR / coding!";
}
}
}
if(defined($three_prime_exons{$transcript_name})){
my @three_prime_exons = @{$three_prime_exons{$transcript_name}};
THREEUTR: while(my $three_prime_exon = shift(@three_prime_exons)){
my $start = $three_prime_exon->start;
my $end = $three_prime_exon->end;
my $strand = $three_prime_exon->strand;
if($exons[$trans_end_exon{$transcript_name}]->strand == 1 and $strand == 1){
if($start > $end){
print STDERR "\n>>$transcript_name strange 3' UTR (+) exon: ".$start." - ".$end;
next THREEUTR;
}
if($start == (($exons[$trans_end_exon{$transcript_name}]->end)+1)){
$translation_end = (($exons[$trans_end_exon{$transcript_name}]->end - $exons[$trans_end_exon{$transcript_name}]->start) + 1);
$three_trans_end{$transcript_name} = $translation_end;
$exons[$trans_end_exon{$transcript_name}]->end($end);
}
elsif($start > (($exons[$trans_end_exon{$transcript_name}]->end)+1)){
push(@exons, $three_prime_exon);
}
else{
print STDERR "\n$transcript_name strange 3' UTR exon (+): $start - $end with 1.exons at ".$exons[$trans_end_exon{$transcript_name}]->start;
next THREEUTR;
}
}
elsif($exons[$trans_end_exon{$transcript_name}]->strand == -1 and $strand == -1){
if($start > $end){
print STDERR "\n>>$transcript_name strange 3' UTR (-) exon: ".$start." - ".$end;
next THREEUTR;
}
if($end == (($exons[$trans_end_exon{$transcript_name}]->start)-1)){
$translation_end = (($exons[$trans_end_exon{$transcript_name}]->end - $exons[$trans_end_exon{$transcript_name}]->start) +1);
$three_trans_end{$transcript_name} = $translation_end;
$exons[$trans_end_exon{$transcript_name}]->start($start);
}
elsif($end < (($exons[$trans_end_exon{$transcript_name}]->start)-1)){
push(@exons, $three_prime_exon);
}
else{
print STDERR "\n$transcript_name strange 3' UTR exon (-): $start - $end with 1.exons at ".$exons[$trans_end_exon{$transcript_name}]->start;
next THREEUTR;
}
}
}
}
$transcripts{$transcript_name} =\@ exons;
}
}
return (\%transcripts,\% five_trans_start,\% three_trans_end,\% trans_start_exon,\% trans_end_exon); } |
sub get_seq_ids
{ my ($fh) = @_;
my @seq_ids;
my @non_ids;
while(<$fh>){
chomp;
my ($status, $contig) =
(split)[4, 5];
if(!$contig =~ /\S+\.\d+/){
push(@non_ids, $contig);
next;
}
if($contig =~ /\S+\.$/){
push(@non_ids, $contig);
next;
}
push(@seq_ids, $contig)
}
return\@ seq_ids,\@ non_ids; } |
sub get_sequences_pfetch
{ my ($seq_ids, $seqfetcher) = @_;
unless($seqfetcher->isa("Bio::EnsEMBL::Pipeline::SeqFetcher::Pfetch")){
die("seqfetcher ".$seqfetcher." needs to be a pfetch for this too work");
}
my %seqs;
foreach my $id(@$seq_ids){
my $seq;
eval{
$seq = $seqfetcher->get_Seq_by_acc($id);
};
if($@){
warn "$id isn't most recent sequence trying archive\n";
$seqfetcher->options('-a');
$seq = $seqfetcher->get_Seq_by_acc($id);
}
if($seq){
$seqs{$id} = $seq;
}else{
warn "sequence ".$id." wasn't found\n";
}
}
return(\%seqs); } |
sub insert_agp_line
{ my ($chr_id, $chr_start, $chr_end, $contig, $contig_start, $contig_end, $contig_ori, $db) = @_;
if(!$contig){
die "contig id must be defined for this to work\n";
}
my $sql = "insert into assembly(asm_seq_region_id, asm_start, asm_end, cmp_seq_region_id, cmp_start, cmp_end, ori) values(?, ?, ?, ?, ?, ?, ?)";
my $sth = $db->dbc->prepare($sql);
$sth->execute($chr_id, $chr_start, $chr_end, $contig, $contig_start, $contig_end, $contig_ori); } |
sub non_translate
{ my (@transcripts) = @_;
foreach my $t(@transcripts){
my @exons = @{$t->get_all_Exons};
foreach my $e(@exons){
print "exon ".$e->stable_id." ".$e->start." ".$e->end." ".$e->strand."\n";
my $seq = $e->seq;
my $pep0 = $seq->translate('*', 'X', 0);
my $pep1 = $seq->translate('*', 'X', 1);
my $pep2 = $seq->translate('*', 'X', 2);
print "exon sequence :\n".$e->seq->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 0 frame\n ".$pep0->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 1 phase\n ".$pep2->seq."\n\n";
print $e->seqname." ".$e->start." : ".$e->end." translation in 2 phase\n ".$pep1->seq."\n\n";
print "\n\n";
}
} } |
sub parse_SL1
{ my ($file, $seq, $analysis) = @_;
open(FH, $file) or die"couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @operons;
LINE: while(<FH>){
chomp;
my @values = split;
if($values[1] ne 'SL1'){
next LINE;
}
if($_ =~ /personal\s+communication/){
print STDERR "Can't put information in table about ".$_."\n";
next LINE;
}
my ($start, $end, $strand, $id, $count);
$count = 0;
$start = $values[3];
$end = $values[4];
$strand = $values[6];
$id = $values[14];
if(!$id){
$id = $values[13];
}
if(($id =~ /cDNA/) || ($id =~ /EST/)){
$id = $values[15];
}
if($id eq '"'){
$id = $values[13];
}
if((!$id) || ($id eq '"')){
$id = $values[12];
}
if($strand eq '+'){
$strand = 1;
}else{
$strand = -1;
}
if((!$id) || ($id eq '.') || ($id =~ /EST/) || ($id =~ /cDNA/) || ($id eq '"')){
print "line ".$_." produced a weird id ".$id."\n";
next LINE
}
$id =~ s/\"//g;
my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis);
push(@operons, $simple_feature);
}
return\@ operons; } |
sub parse_SL2
{ my ($file, $seq, $analysis) = @_;
open(FH, $file) or die"couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @operons;
LINE: while(<FH>){
chomp;
my @values = split;
if($values[1] ne 'SL2'){
next LINE;
}
if($_ =~ /personal\s+communication/){
print STDERR "Can't put information in table about ".$_."\n";
next LINE;
}
my ($start, $end, $strand, $id, $count);
$count = 0;
$start = $values[3];
$end = $values[4];
$strand = $values[6];
$id = $values[14];
if(!$id){
$id = $values[13];
}
if(($id =~ /cDNA/) || ($id =~ /EST/)){
$id = $values[15];
}
if($id eq '"'){
$id = $values[13];
if($id eq 'ESTyk1004g06.5'){
$id =~ s/EST//g;
}
}
if((!$id) || ($id eq '"')){
$id = $values[12];
}
if($strand eq '+'){
$strand = 1;
}else{
$strand = -1;
}
if((!$id) || ($id eq '.') || ($id =~ /EST/) || ($id =~ /cDNA/) || ($id eq '"')){
print "line ".$_." produced a weird id ".$id."\n";
next LINE
}
$id =~ s/\"//g;
my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis);
push(@operons, $simple_feature);
}
return\@ operons ; } |
sub parse_expr
{ my ($file, $seq, $analysis) = @_;
open(FH, $file) or die"couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @operons;
LINE: while(<FH>){
chomp;
my @values = split;
if($values[1] ne 'Expr_profile'){
next LINE;
}
my ($start, $end, $strand, $id, $count);
$count = 0;
$start = $values[3];
$end = $values[4];
$strand = $values[6];
$id = $values[9];
if($strand eq '+'){
$strand = 1;
}else{
$strand = -1;
}
$id =~ s/\"//g;
my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis);
push(@operons, $simple_feature);
}
return\@ operons ; } |
sub parse_gff
{ my ($file, $seq, $analysis) = @_;
use Storable qw(store retrieve freeze thaw dclone);
open(FH, $file) or die "couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @genes;
my ($transcripts, $five_prime, $three_prime) = &process_file(\*FH);
print "there are ".keys(%$transcripts)." distinct transcripts\n";
my $genes = undef;
my ($processed_transcripts, $five_start, $three_end, $trans_start_exon, $trans_end_exon) = &generate_transcripts($transcripts, $seq, $analysis, $five_prime, $three_prime);
print "\nthere are ".keys(%$processed_transcripts)." transcripts\n";
$genes = undef;
$genes = &create_transcripts($processed_transcripts, $five_start, $three_end, $trans_start_exon, $trans_end_exon);
print "\nPARSE GFF has ".keys(%$genes)." genes\n";
foreach my $gene_id(keys(%$genes)){
my $transcripts = $genes->{$gene_id};
my $unpruned = &create_gene($transcripts, $gene_id);
my $gene = &prune_Exons($unpruned);
push(@genes, $gene);
}
close(FH);
print "\nPARSE_GFF got ".@genes." genes\n";
return\@ genes; } |
sub parse_operons
{ my ($file, $seq, $analysis) = @_;
open(FH, $file) or die"couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @operons;
LINE: while(<FH>){
chomp;
my($type, $start, $end, $strand, $id) = (split)[1,3,4,6,9];
if($type ne 'operon'){
next LINE;
}
if($strand eq '-'){
$strand = -1;
}else{
$strand = 1;
}
my $simple_feature = Bio::EnsEMBL::SimpleFeature->new();
$simple_feature->start($start);
$simple_feature->strand($strand);
$simple_feature->end($end);
$id =~ s/\"//g;
$simple_feature->display_label($id);
$simple_feature->slice($seq);
$simple_feature->analysis($analysis);
push(@operons, $simple_feature);
}
return\@ operons ; } |
sub parse_pseudo_files
{ my ($file, $seq, $analysis, $types) = @_;
open(FH, $file) or die "couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
my @genes;
my ($transcripts) = &process_pseudo_files(\*FH, $types);
print "there are ".keys(%$transcripts)." distinct special transcripts\n";
my ($processed_transcripts) = &process_pseudo_transcripts($transcripts, $seq, $analysis);
print "there are ".keys(%$processed_transcripts)." processed special transcripts\n";
my $genes = undef;
$genes = &create_pseudo_transcripts($processed_transcripts);
print "PARSE GFF there are ".keys(%$genes)." special genes\n";
foreach my $gene_id(keys(%$genes)){
my $transcripts = $genes->{$gene_id};
my $gene = &create_gene($transcripts, $gene_id);
push(@genes, $gene);
}
close(FH);
return\@ genes;
}
} |
sub parse_pseudo_gff
{ my $type = "Pseudogene";
my ($file, $seq, $analysis) = @_;
&parse_pseudo_files($file, $seq, $analysis, $type); } |
sub parse_rRNA_genes
{ my $type = "rRNA";
my ($file, $seq, $analysis) = @_;
&parse_pseudo_files($file, $seq, $analysis, $type); } |
sub parse_rnai
{ my ($file, $seq, $analysis) = @_;
open(FH, $file) or die "couldn't open ".$file." $!";
die " seq ".$seq." is not a Bio::Seq " unless($seq->isa("Bio::SeqI") ||
$seq->isa("Bio::Seq") ||
$seq->isa("Bio::PrimarySeqI"));
print "ONLY FETCHES RNAi_PRIMARY\n";
my @operons;
LINE: while(<FH>){
chomp;
my @values = split;
if($values[1] ne 'cDNA_for_RNAi'){
next LINE;
}
my ($start, $end, $strand, $id, $count);
$count = 0;
$start = $values[3];
$end = $values[4];
$strand = $values[6];
$id = $values[9];
if($strand eq '+'){
$strand = 1;
}else{
$strand = -1;
}
$id =~ s/\"//g;
my $simple_feature = &create_simple_feature($start, $end, $strand, $id, $seq, $analysis);
push(@operons, $simple_feature);
}
return\@ operons ; } |
sub parse_tRNA_genes
{ my $type = "tRNA";
my ($file, $seq, $analysis) = @_;
&parse_pseudo_files($file, $seq, $analysis, $type); } |
sub process_file
{ my ($fh) = @_;
my %genes;
my $transcript;
my %five_prime;
my %three_prime;
LOOP: while(<$fh>){
chomp;
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split;
my $element = $_;
if($chr =~ /sequence-region/){
next LOOP;
}
if(/^##/){
next LOOP;
}
if(!$status && !$type){
next LOOP;
}
my $line = $status." ".$type;
if( ($line eq 'Coding_transcript five_prime_UTR') or ($line eq 'Coding_transcript three_prime_UTR') ){
$gene =~ s/\"//g;
$transcript = $gene;
$gene =~ s/(\.\w+)\.\d+$/$1/;
my ($position) = $type;
if($position =~/^five/){
if(!$five_prime{$gene}){
$five_prime{$gene} = {};
if(!$five_prime{$gene}{$transcript}){
$five_prime{$gene}{$transcript} = [];
}
push(@{$five_prime{$gene}{$transcript}}, $element);
}
else{
if(!$five_prime{$gene}{$transcript}){
$five_prime{$gene}{$transcript} = [];
}
push(@{$five_prime{$gene}{$transcript}}, $element);
}
}elsif($position =~/^three/){
if(!$three_prime{$gene}){
$three_prime{$gene} = {};
if(!$three_prime{$gene}{$transcript}){
$three_prime{$gene}{$transcript} = [];
}
push(@{$three_prime{$gene}{$transcript}}, $element);
}
else{
if(!$three_prime{$gene}{$transcript}){
$three_prime{$gene}{$transcript} = [];
}
push(@{$three_prime{$gene}{$transcript}}, $element);
}
}
next LOOP;
}elsif($line ne 'curated coding_exon'){
next LOOP;
}
$gene =~ s/\"//g;
if(!$genes{$gene}){
$genes{$gene} = [];
push(@{$genes{$gene}}, $element);
}else{
push(@{$genes{$gene}}, $element);
}
}
print STDERR "Have ".keys(%genes). " genes (CDS), ".
keys(%five_prime)." have 5' UTR and ".keys(%three_prime)." have 3' UTR information\n";
return\% genes,\% five_prime,\% three_prime; } |
sub process_pseudo_files
{ my ($fh, $types) = @_;
my %transcripts;
LOOP: while(<$fh>){
chomp;
if(/^##/){
next LOOP;
}
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split;
my $element = $_;
if($chr =~ /sequence-region/){
next LOOP;
}
if(!$status && !$type){
print "status and type no defined skipping\n";
next LOOP;
}
my $line = $status." ".$type;
if($line ne $types.' exon'){
next LOOP;
}
$gene =~ s/\"//g;
if(!$transcripts{$gene}){
$transcripts{$gene} = [];
push(@{$transcripts{$gene}}, $element);
}else{
push(@{$transcripts{$gene}}, $element);
}
}
return\% transcripts; } |
sub process_pseudo_transcripts
{ my ($transcripts, $slice, $analysis) = @_;
my %genes;
my %transcripts = %$transcripts;
my @names = keys(%transcripts);
foreach my $name(@names){
my @lines = @{$transcripts{$name}};
$transcripts{$name} = [];
my @exons;
foreach my $line(@lines){
my($chr, $status, $type, $start, $end, $score, $strand, $frame, $sequence, $gene) = split /\s+/, $line;
$chr =~ s/CHROMOSOME_//;
if($start == $end){
next;
}
my $exon = new Bio::EnsEMBL::Exon;
if($frame eq '.'){
$frame = 0;
}
my $phase = (3 - $frame)%3; $exon->start($start);
$exon->end($end);
$exon->analysis($analysis);
$exon->slice($slice);
$exon->phase($phase);
my $end_phase = ($phase + ($exon->end-$exon->start) + 1)%3;
$exon->end_phase($end_phase);
if($strand eq '+'){
$exon->strand(1);
}else{
$exon->strand(-1);
}
push(@exons, $exon);
}
if($exons[0]->strand == -1){
@exons = sort{$b->start <=> $a->start} @exons;
}else{
@exons = sort{$a->start <=> $b->start} @exons;
}
my $phase = 0;
foreach my $e(@exons){
push(@{$transcripts{$name}}, $e);
}
}
return (\%transcripts); } |
sub prune_Exons
{ my ($gene) = @_;
my @unique_Exons;
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon){
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon){
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exons;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene; } |
sub selenocysteine
{ my ($transcript, $pos, $db) = @_;
print "\nmodifying ".$transcript->stable_id;
my $seq_edit = Bio::EnsEMBL::SeqEdit->new(
-CODE => '_selenocysteine',
-NAME => 'Selenocysteine',
-DESC => 'Selenocysteine',
-START => $pos,
-END => $pos,
-ALT_SEQ => 'U' );
my $attribute = $seq_edit->get_Attribute();
my $translation = $transcript->translation();
my $attribute_adaptor = $db->get_AttributeAdaptor();
$attribute_adaptor->store_on_Translation($translation, [$attribute]); } |
sub store_coord_system
{ my ($db, $name, $version, $sequence_level, $default, $rank) = @_;
my $csa = $db->get_CoordSystemAdaptor();
my $cs = Bio::EnsEMBL::CoordSystem->new
(
-NAME => $name,
-VERSION => $version,
-DEFAULT => $default,
-SEQUENCE_LEVEL => $sequence_level,
-RANK => $rank
);
$csa->store($cs);
return $cs; } |
sub store_slice
{ my ($db, $name, $start, $end, $strand, $coord_system, $sequence) = @_;
my $sa = $db->get_SliceAdaptor();
my $slice = Bio::EnsEMBL::Slice->new
(-seq_region_name => $name,
-start => $start,
-end => $end,
-strand => $strand,
-coord_system => $coord_system);
my $seq_ref;
if($sequence){
$seq_ref =\$ sequence;
}
$sa->store($slice, $seq_ref);
$slice->adaptor($sa);
return $slice;
}
1; } |
sub translation_check
{ my ($gene, $db) = @_;
my @transcripts = @{$gene->get_all_Transcripts};
foreach my $t (@transcripts){
my $pep = $t->translate->seq;
if($pep =~ /\*/g){
if($gene->stable_id eq 'C06G3.7' and $db){
my $pos = pos($pep);
print STDERR "transcript ".$t->stable_id." doesn't translate. Adding Selenocystein at position $pos.\n".
"Please beware of problems during further analysis steps because of this.";
selenocysteine($t, $pos, $db);
}
else{
print STDERR "transcript ".$t->stable_id." doesn't translate\n";
print STDERR "translation start ".$t->translation->start." end ".$t->translation->end."\n";
print STDERR "start exon coords ".$t->translation->start_Exon->start." ".$t->translation->start_Exon->end."\n";
print STDERR "end exon coords ".$t->translation->end_Exon->start." ".$t->translation->end_Exon->end."\n";
print STDERR "peptide ".$pep."\n";
&display_exons(@{$t->get_all_Exons});
&non_translate($t);
return undef;
}
}
}
return $gene; } |
sub write_genes
{ my ($genes, $db, $stable_id_check) = @_;
my $e=0;
my %stable_ids;
if($stable_id_check){
my $sql = 'select stable_id from gene_stable_id';
my $sth = $db->dbc->prepare($sql);
$sth->execute;
while(my($stable_id) = $sth->fetchrow){
$stable_ids{$stable_id} = 1;
}
}
my %stored;
GENE: foreach my $gene(@$genes){
if($stable_id_check){
if($stable_ids{$gene->stable_id}){
print STDERR $gene->stable_id." already exists\n";
my $id = $gene->stable_id;
$id .= '.pseudo';
$gene->stable_id($id);
foreach my $transcript(@{$gene->get_all_Transcripts}){
my $trans_id = $transcript->stable_id;
$trans_id .= '.pseudo';
$transcript->stable_id($trans_id);
foreach my $e(@{$transcript->get_all_Exons}){
my $id = $e->stable_id;
$id .= '.pseudo';
$e->stable_id($id);
}
}
}
}
if($stored{$gene->stable_id}){
print STDERR "we have stored ".$gene->stable_id." already\n";
next GENE;
}
my $gene_adaptor = $db->get_GeneAdaptor;
eval{
$stored{$gene->stable_id} = 1;
$gene_adaptor->store($gene);
$e++;
};
if($@){
die "couldn't store ".$gene->stable_id." problems ".$@;
}
}
print "\nStored gene: ".$e."\n"; } |
write_simple_features | description | prev | next | Top |
sub write_simple_features
{ my ($operons, $db) = @_;
eval{ print "\n check 1: ".$$operons[0]->display_label };
eval{ print "\n check 2: ".$$operons[0]->start." - ".$$operons[0]->end."\n" };
my $operon_adaptor = $db->get_SimpleFeatureAdaptor;
eval{
$operon_adaptor->store(@$operons);
};
if($@){
die "couldn't store simple features problems ".$@;
}
}
} |
General documentation