# Raw content of Bio::SeqIO::chado
# $Id: chado.pm,v 1.1 2002/12/03 08:13:55 cjm Exp $
#
# BioPerl module for Bio::SeqIO::chado
#
# Chris Mungall <cjm@fruitfly.org>
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::SeqIO::chado - chado sequence input/output stream

=head1 SYNOPSIS

It is probably best not to use this object directly, but
rather go through the SeqIO handler system. Go:

    $stream = Bio::SeqIO->new(-file => $filename, -format => 'chado');

    while ( my $seq = $stream->next_seq() ) {
        # do something with $seq
    }

=head1 DESCRIPTION

This object can transform Bio::Seq objects to and from chado flat
file databases. CURRENTLY ONLY TO: only writing Bio::Seq objects out
to chado is implemented; reading chado data back into Bio::Seq
objects is not yet supported.

=head2 Optional functions

=over 3

=item _show_dna()

(output only) shows the dna or not

=item _post_sort()

(output only) provides a sorting func which is applied to the FTHelpers
before printing

=back

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this
and other Bioperl modules. Send your comments and suggestions preferably
to one of the Bioperl mailing lists.
Your participation is much appreciated.

  bioperl-l@bioperl.org                 - General discussion
  http://www.bioperl.org/MailList.shtml - About the mailing lists

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution.
Bug reports can be submitted via email or the web:

  bioperl-bugs@bio.perl.org
  http://bio.perl.org/bioperl-bugs/

=head1 AUTHOR - Chris Mungall

Email cjm@fruitfly.org

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut

# Let the code begin...
package Bio::SeqIO::chado;
use vars qw(@ISA);
use strict;

use Bio::SeqIO;
use Bio::SeqFeature::Generic;
use Bio::Species;
use Bio::Seq::SeqFactory;
use Bio::Annotation::Collection;
use Bio::Annotation::Comment;
use Bio::Annotation::Reference;
use Bio::Annotation::DBLink;

use Data::Stag qw(:all);
use Scalar::Util qw(weaken);

@ISA = qw(Bio::SeqIO);

# Constructor hook: sets up the sequence factory (defaulting to one that
# builds Bio::Seq::RichSeq objects), instantiates the event handler and
# emits the opening "chado" start event on it.
sub _initialize {
    my ($self, @args) = @_;

    $self->SUPER::_initialize(@args);

    if ( !defined $self->sequence_factory ) {
        $self->sequence_factory(
            Bio::Seq::SeqFactory->new(
                -verbose => $self->verbose(),
                -type    => 'Bio::Seq::RichSeq',
            )
        );
    }

    my $wclass = $self->default_handler_class;
    $self->handler( $wclass->new );
    $self->{_end_of_data} = 0;
    $self->handler->S("chado");
    return;
}

# Makes sure the closing "chado" event is emitted before the stream
# object goes away, then defers to the parent class destructor.
sub DESTROY {
    my $self = shift;
    $self->end_of_data();
    $self->SUPER::DESTROY();
}

# Emits the closing "chado" end event on the handler.
#
# The event is emitted at most once per stream: the _end_of_data flag is
# checked so that an explicit call followed by the implicit call from
# DESTROY does not close the "chado" element twice. Nothing is emitted
# when no handler has been set.
sub end_of_data {
    my $self = shift;

    return if $self->{_end_of_data};
    $self->{_end_of_data} = 1;

    my $handler = $self->handler;
    return unless $handler;
    return $handler->E("chado");
}

# Name of the class instantiated by _initialize to receive events.
sub default_handler_class {
    return "Data::Stag::BaseHandler";
}

=head2 next_seq

 Title   : next_seq
 Usage   : $seq = $stream->next_seq()
 Function: returns the next sequence in the stream
 Returns : Bio::Seq object
 Args    :

=cut

# NOTE(review): no input is parsed here. Every call asks the sequence
# factory for a new object created without arguments, so a
# "while (my $seq = $stream->next_seq)" loop will not terminate on its
# own.
sub next_seq {
    my ($self, @args) = @_;

    my $seq = $self->sequence_factory->create(
        # '-verbose'  => $self->verbose(),
        # %params,
        # -seq        => $seqc,
        # -annotation => $annotation,
        # -features   => \@features
    );
    return $seq;
}

# Get/set accessor for the event handler object that receives the
# S/E/ev calls made by this module.
sub handler {
    my $self = shift;
    $self->{_handler} = shift if @_;
    return $self->{_handler};
}

=head2 write_seq

 Title   : write_seq
 Usage   : $stream->write_seq($seq)
 Function: writes the $seq object (must be seq) to the stream
 Returns : 1 for success (an exception is thrown if no seq is given)
 Args    : Bio::Seq

=cut

sub write_seq {
    my ($self, $seq) = @_;

    if ( !defined $seq ) {
        $self->throw("Attempting to write with no seq!");
    }

    if ( !ref $seq || !$seq->isa('Bio::SeqI') ) {
        $self->warn(" $seq is not a SeqI compliant module. \nAttempting to dump, but may fail!");
    }

    # get a handler - must inherit from Data::Stag::BaseHandler;
    my $w = $self->handler;

    # start of data
    $w->S("seqset");

    # my $seq_temp_uid = $self->get_temp_uid($seq);

    # Same accessor as the dbxrefstr below, so that both values are
    # built from one source.
    my $seq_temp_uid = $seq->accession_number . '.'
        . ( $seq->can('seq_version') ? $seq->seq_version : $seq->version );

    # data structure representing the core sequence for this record
    my $seqnode = Data::Stag->new(
        feature => [
            [ feature_id => $seq_temp_uid ],
            [ dbxrefstr  => $seq->accession_number ],
            [ name       => $seq->display_name ],
            [ residues   => $seq->seq ],
        ]
    );

    # soft properties
    # NOTE(review): %prop is collected but nothing below emits it.
    my %prop = ();

    my ($div, $mol);

    if ( $seq->can('division') ) {
        $div = $seq->division;
    }
    if ( !defined $div || !$div ) {
        $div = 'UNK';
    }

    if ( !$seq->can('molecule') || !defined( $mol = $seq->molecule() ) ) {
        $mol = $seq->alphabet || 'DNA';
    }

    my $circular = 'linear ';
    $circular = 'circular' if $seq->is_circular;

    # $mol, $div and $circular are lexicals, so they cannot be reached
    # through the symbol table; copy them in directly.
    @prop{qw(mol div circular)} = ( $mol, $div, $circular );

    for my $method (qw(desc keywords)) {
        $prop{$method} = $seq->$method() if $seq->can($method);
    }

    local ($^W) = 0;    # supressing warnings about uninitialized fields.

    # Organism lines
    # NOTE(review): the organism string is assembled but not emitted.
    if ( my $spec = $seq->species ) {
        my ($species, $genus, @class) = $spec->classification();
        my $OS;
        if ( $spec->common_name ) {
            $OS = $spec->common_name;
        }
        else {
            $OS = "$genus $species";
        }
        if ( my $ssp = $spec->sub_species ) {
            $OS .= " $ssp";
        }
    }

    # Reference lines
    foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
        # TODO
    }

    # Comment lines
    foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
        $seqnode->add_featureprop(
            [ [ pkey => 'comment' ], [ pval => $comment->text ] ] );
    }

    # throw the writer an event
    $w->ev(@$seqnode);

    $seqnode = undef;    # free memory

    # make events for all the features within the record
    foreach my $sf ( $seq->top_SeqFeatures ) {
        $self->write_sf( $sf, $seq_temp_uid );
    }

    # data end
    $w->E("seqset");
    return 1;
}

# ----
# writes a seq feature
#
# Args    : the feature to write, and the id of the sequence it is
#           located on (used as srcfeature_id unless the feature's
#           location is remote, in which case the location's seq_id is
#           used)
# Returns : the temporary feature_id assigned to the feature
#
# Split and remote locations are not written as a featureloc; their
# sub-locations are turned into subfeatures instead. Every subfeature is
# written recursively and linked to its parent with a
# feature_relationship event.
# ----
sub write_sf {
    my $self         = shift;
    my $sf           = shift;
    my $seq_temp_uid = shift;

    my $w = $self->handler;

    my %props = map { $_ => [ $sf->each_tag_value($_) ] } $sf->all_tags;

    my $loc      = $sf->location;
    my $name     = $sf->display_name;
    my $type     = $sf->primary_tag;
    my @subsfs   = $sf->sub_SeqFeature;
    my @locnodes = ();
    my $sid      = $loc->is_remote ? $loc->seq_id : $seq_temp_uid;

    if ( $loc->isa("Bio::Location::SplitLocationI") ) {
        # turn splitlocs into subfeatures
        for my $subloc ( $loc->each_Location ) {
            my $ssf = Bio::SeqFeature::Generic->new(
                -start   => $subloc->start,
                -end     => $subloc->end,
                -strand  => $subloc->strand,
                -primary => $self->subpartof($type),
            );
            if ( $subloc->is_remote ) {
                $ssf->location->is_remote(1);
                $ssf->location->seq_id( $subloc->seq_id );
            }
            push @subsfs, $ssf;
        }
    }
    elsif ( $loc->isa("Bio::Location::RemoteLocationI") ) {
        # turn the sub-locations of the remote location into subfeatures
        for my $subloc ( $loc->each_Location ) {
            push @subsfs, Bio::SeqFeature::Generic->new(
                # -name  => $name.'.'.$n++,
                -start   => $subloc->start,
                -end     => $subloc->end,
                -strand  => $subloc->strand,
                -primary => $self->subpartof($type),
            );
        }
    }
    else {
        my ($beg, $end, $strand) = $self->bp2ib($loc);
        @locnodes = (
            [
                featureloc => [
                    [ nbeg          => $beg ],
                    [ nend          => $end ],
                    [ strand        => $strand ],
                    [ srcfeature_id => $sid ],
                    [ group         => 0 ],
                    [ rank          => 0 ],
                ]
            ]
        );
    }

    my $feature_id = $self->get_temp_uid($sf);

    # one featureprop node per tag value
    my @propnodes;
    for my $key ( keys %props ) {
        for my $value ( @{ $props{$key} } ) {
            push @propnodes,
                [ featureprop => [ [ pkey => $key ], [ pval => $value ] ] ];
        }
    }

    my $fnode = [
        feature => [
            [ feature_id => $feature_id ],
            [ name       => $name ],
            [ typename   => $type ],
            @locnodes,
            @propnodes,
        ]
    ];
    $w->ev(@$fnode);

    foreach my $ssf (@subsfs) {
        my $ssfid = $self->write_sf( $ssf, $sid );
        $w->ev(
            feature_relationship => [
                [ subjfeature_id => $ssfid ],
                [ objfeature_id  => $feature_id ],
            ]
        );
    }
    return $feature_id;
}

# private;
# an ID for this session that should be
# unique... hmm
#
# Get/set accessor; when no id has been set yet, one is built by
# concatenating the process id and the current time.
sub session_id {
    my $self = shift;
    $self->{_session_id} = shift if @_;
    if ( !$self->{_session_id} ) {
        $self->{_session_id} = $$ . time;
    }
    return $self->{_session_id};
}

our $next_id     = 1;
our %obj2id_hash = ();

# Weak references to the objects that own the entries of %obj2id_hash,
# keyed like %obj2id_hash. An entry whose weak reference has become
# undef belonged to an object that has since been destroyed.
my %live_obj_for = ();

# Returns a temporary id of the form "<session_id>.<n>" for $ob, handing
# out the same <n> every time the same object is passed in.
#
# Objects are looked up by their stringified reference. Perl may reuse
# that string for a new object once the old one is freed (as happens
# with the temporary subfeatures built in write_sf), so an entry is only
# trusted while the object it was created for is still alive; otherwise
# a fresh id is assigned.
sub get_temp_uid {
    my $self = shift;
    my $ob   = shift;

    my $key = "$ob";
    my $id  = $obj2id_hash{$key};

    # stale entry: the original owner of this key no longer exists
    if ( $id && ref $ob && !defined $live_obj_for{$key} ) {
        $id = undef;
    }

    if ( !$id ) {
        $id = $next_id++;
        $obj2id_hash{$key} = $id;
        if ( ref $ob ) {
            $live_obj_for{$key} = $ob;
            weaken( $live_obj_for{$key} );    # do not keep $ob alive
        }
    }
    return $self->session_id . '.' . $id;
}

# interbase and directional semantics
#
# Args    : a location object (start/end/strand are read from it) or an
#           array ref [start, end, strand]
# Returns : (nbeg, nend, strand)
#
# The start is decremented to convert it to an interbase coordinate
# BEFORE the pair is swapped for the minus strand, so a minus-strand
# location (start, end) yields (end, start - 1).
sub bp2ib {
    my $self = shift;
    my $loc  = shift;

    my ($s, $e, $str) =
        ref($loc) eq "ARRAY"
        ? (@$loc)
        : ( $loc->start, $loc->end, $loc->strand );

    $s--;
    if ( defined $str && $str < 0 ) {
        ($s, $e) = ($e, $s);
    }
    return ($s, $e, $str);
}

# Maps a feature type to the type used for the subfeatures generated
# from its sub-locations: "CDS" gives "CDS_exon", a single word
# character followed by "RNA" (e.g. "mRNA") gives "exon", anything else
# gives "partof_<type>".
sub subpartof {
    my $self = shift;
    my $type = 'partof_' . shift;
    $type =~ s/partof_CDS/CDS_exon/;
    $type =~ s/partof_\wRNA/exon/;
    return $type;
}

1;