GenomicSequenceMod documentation.

A hidden Dataset containing sequence attributes that can be imported to other
visible Datasets which are compatible with its required data input, based
on the presence of one or more importable-exportable relationships.

Description

Dataset providing Genomic Sequence attributes, which can be imported into
other Datasets. GenomicSequence is itself not a visible Dataset.

Methods

BEGIN		Code
__processNewQuery	No description	Code
_addRow	No description	Code
_calcSeqOverLocations	No description	Code
_codingCdnaPeptideSequences	No description	Code
_continueWithBatch	No description	Code
_editSequence	No description	Code
_exonIntronFlankSequences	No description	Code
_exonSequences	No description	Code
_getConfigurationTree	No description	Code
_getLocationFrom	No description	Code
_getResultTable	No description	Code
_ignoreRow	No description	Code
_incrementBatch	No description	Code
_initializeDNAAdaptor	No description	Code
_initializeIndices	No description	Code
_initializeReturnRow	No description	Code
_modFlanks	No description	Code
_new	No description	Code
_processRow	No description	Code
_processSequence	No description	Code
_rawSequences	No description	Code
_rc	No description	Code
_snpSequences	No description	Code
_translate	No description	Code
_translate_ambiguous_codon	No description	Code
_unambiquous_codons	No description	Code
_utrSequences	No description	Code

Methods description

None available.

Methods code

BEGIN

Top

BEGIN {

    @NAMES =            #id
    (
     'Standard',        #1
     'Vertebrate Mitochondrial',#2
     'Yeast Mitochondrial',# 3
     'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4
     'Invertebrate Mitochondrial',#5
     'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
     '', '',
     'Echinoderm Mitochondrial',#9
     'Euplotid Nuclear',#10
     '"Bacterial"',# 11
     'Alternative Yeast Nuclear',# 12
     'Ascidian Mitochondrial',# 13
     'Flatworm Mitochondrial',# 14
     'Blepharisma Nuclear',# 15
     'Chlorophycean Mitochondrial',# 16
     '', '',  '', '',
     'Trematode Mitochondrial',# 21
     'Scenedesmus obliquus Mitochondrial', #22
     'Thraustochytrium Mitochondrial' #23
     );

    @TABLES =
    qw(
       FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
       FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
       FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       '' ''
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
       FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
       FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
       FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       '' '' '' ''
       FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
       FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
       );

    my @nucs = qw(t c a g);
    my $x = 0;
    ($CODONS, $TRCOL) = ({}, {});

    for my $i (@nucs) {
	for my $j (@nucs) {
	    for my $k (@nucs) {
		my $codon = "$i$j$k";

		$CODONS->{$codon} = $x;
		$TRCOL->{$x} = $codon;
		$x++;
	    }
	}
    }

    %IUPAC_DNA = ( A => [qw(A)],
		   C => [qw(C)],
		   G => [qw(G)],
		   T => [qw(T)],
		   U => [qw(U)],
		   M => [qw(A C)],
		   R => [qw(A G)],
		   W => [qw(A T)],
		   S => [qw(C G)],
		   Y => [qw(C T)],
		   K => [qw(G T)],
		   V => [qw(A C G)],
		   H => [qw(A C T)],
		   D => [qw(A G T)],
		   B => [qw(C G T)],
		   X => [qw(G A T C)],
		   N => [qw(G A T C)]
		   );

}

__processNewQuery

description

Top

sub __processNewQuery {

    my ($self, $query) = @_;

    my $attribute = $query->getAllAttributes($self->name)->[0];
    my $seq_name = $attribute->name;

    # hack to keep webservices working for 0_5 originating query XML
    if ($seq_name eq 'pkey'){
	$attribute = $query->getAllAttributes($self->name)->[-1];
	$seq_name = $attribute->name;
    }

    $self->set('seq_name', $seq_name);
    $self->set('translate', ($seq_name =~ m/peptide$/));

    my $ignore = IGNORE;
    if ($seq_name =~ m/(coding|cdna|peptide)$/) {
	# this is not actually going to ignore anything, but is
	# simply used to determine translation table for each gene
	# without creating a second instance variable
	$self->set('ignore_row', "type");
	$self->set('recipe', '_codingCdnaPeptideSequences');
    } 
    elsif ($seq_name =~ m/(exon_intron|flank)$/) {
	$self->set('recipe', '_exonIntronFlankSequences');
    } 
    elsif ($seq_name =~ m/raw/){
	$self->set('recipe', '_rawSequences');
    } 
    elsif ($seq_name =~ m/(gene_exon|transcript_exon|transcript_intron)$/) {
	# set the system to ignore rows with duplicate pkeys
	# for gene_exon
	$self->set('ignore', $ignore->{$1}); #undef for transcript_exon
	$self->set('ignore_row', "pkey");
	$self->set('recipe', '_exonSequences');
    } 
    elsif ($seq_name =~ m/utr$/) {
	$self->set('recipe', '_utrSequences');
    } 
    elsif ($seq_name =~ m/snp$/) {
	$self->set('recipe', '_snpSequences');
    } 
    else {
	BioMart::Exception::Configuration->throw("Unsupported sequence name $seq_name recieved by GenomicSequence\n");
    }

    $self->set('downstream_flank', 0);
    $self->set('upstream_flank', 0);
    $self->set('importable', undef);
    $self->set('lastPkey', undef);
    $self->set('importable_indices', undef);
    $self->set('returnRow_indices', undef);
    $self->set('locations', {});
    $self->set('outRow', undef);
    $self->set('calc_location', undef);
    $self->set('sequence', undef);

    #determine which BaseSequenceA object to create
    my $filters = $query->getAllFilters($self->name);

    foreach my $filt (@{$filters}) {
	if ($filt->isa("BioMart::Configuration::FilterList")) {
	    if ($filt->linkName) {
		if ($self->get('importable') ) {
		    BioMart::Exception::Configuration->throw("Recieved two importables, can only work with one\n");
		} 
		else {
		    $self->set('importable', $filt);
		}
	    } 
	    else {
		BioMart::Exception::Configuration->throw("Recieved invalid linkName ".
			     $filt->linkName."\n");
	    }
	} 
	else {
	    #must be a downstream or upstream valueFilter
	    unless ($filt->isa("BioMart::Configuration::ValueFilter")) {
		BioMart::Exception::Configuration->throw("Recieved unknown filter ".$filt->name." in GenomicSequence Dataset!\n");
	    }

	    if ($self->get($filt->name)) {
		BioMart::Exception::Configuration->throw("Recieved two ".$filt->name." flanking filters in GenomicSequence Dataset\n");
	    }

	    #could still be some strange ValueFilter that is not upstream or 
	    # downstream, but not likely. Will throw an exception if this is 
	    # the case
	    my $table = $filt->getTable;
	    my $row = $table->nextRow;
	    my $value = $row->[0];
	    if ($value) {
		$self->set($filt->name, $value);
	    }
	}
    }
    
    unless ($self->get('importable')) {
	BioMart::Exception::Configuration->throw("No Importable Recieved in GenomicSequence\n");
    }

}

_addRow

description

Top

sub _addRow {

    my ($self, $atable, $outRow, $sequence) = @_;
  
    push @{$outRow}, $sequence;
    $atable->addRow($outRow);
    $self->_incrementBatch;
}

#interface methods

}

_calcSeqOverLocations

description

Top

sub _calcSeqOverLocations {

    my ($self, $this_location) = @_;
    $this_location->{start} || return; # Sanity check
    $this_location->{end}   || return; # Sanitt check

    my $calc_location = $self->get('calc_location');

    if ($calc_location) {
	$calc_location->{"start"} = $this_location->{"start"}  
	      if ($this_location->{"start"} < $calc_location->{"start"});
	$calc_location->{"end"} = $this_location->{"end"}  
	      if ($this_location->{"end"} > $calc_location->{"end"});
    } 
    else {
	$calc_location = {};
	foreach my $key (keys %{$this_location}) {
	    $calc_location->{$key} = $this_location->{$key};
	}
    }
  
    $self->set('calc_location', $calc_location);

}

_codingCdnaPeptideSequences

description

Top

sub _codingCdnaPeptideSequences {

    my ($self, $atable, $curRow) = @_;
    
    # Determine this and last primary keys
    my $importable_indices = $self->get('importable_indices');
    # Get the primary sequence ID from this row. Use DUMMY if missing
    my $pkey     = $curRow ? 
	($curRow->[$importable_indices->{"pkey"}] || 'DUMMY') : undef;
    my $lastPkey = $self->get('lastPkey') || $pkey;
    
    my $locations = $self->get('locations');
    
    my $outRow = $self->get('outRow');
    
    if( ( ! defined $pkey ) or ( $pkey ne $lastPkey ) ){
	# Start of new row, or end of results; Dump the current sequence
  
	my $sequence;
	if( grep{ $locations->{$_}->{"start"} } keys %$locations ) {
	    $sequence = $self->_processSequence($locations);
	    $sequence = $self->_translate($sequence) 
		if ($self->get('translate'));
	    $self->_editSequence(\$sequence);
	}
    
	if ($sequence) { 
	    $self->_addRow($atable, $outRow, $sequence);
	} 
	else {      
	    $self->_addRow($atable, $outRow, "Sequence unavailable");
	}
	$locations = {};
	$outRow = undef;
    } # End sequence dumping

    if ($curRow) {
	# Update the location corresponding to this row
	my $rank = $curRow->[ $importable_indices->{"rank"} ];

	# Requesting for phase info as well, to fix the bug of additional 
	# Ns in the beginning - syed
	my $location = $self->_getLocationFrom($curRow, "chr", "start", "end", 
					       "strand", "phase"); 
    
	$location = $self->_modFlanks($location, 0);
	$locations->{$rank} = $location if ($location->{"start"});
    } 
    
    $outRow ||= $self->_initializeReturnRow($curRow);
    $self->set('locations', $locations);
    $self->set('lastPkey', $pkey);
    $self->set('outRow', $outRow);

}

_continueWithBatch

description

Top

sub _continueWithBatch {

    my ($self, $batchSize, $rtable) = @_;

    #always true if underlying table is an AttributeTable and it has rows
    my $continue = ($rtable->isa("BioMart::ResultTable")) 
                 ? $rtable->inCurrentBatch() 
                 : $rtable->hasMoreRows;

    if ($continue && $batchSize) {
	my $batchIndex = $self->get('batchIndex');
	$continue = ($batchIndex < $batchSize);
    }
    return $continue;

}

_editSequence

description

Top

sub _editSequence {

  my ($self, $seqref) = @_;

  my $seq_edits = $self->get('seq_edits');

  if ($$seqref && $seq_edits) {
      foreach my $seq_edit (split /\;/, $seq_edits) {
	  my ($start, $end, $alt_seq) = split /\,/, $seq_edit;

	  my $len = $end - $start + 1;
	  substr($$seqref, $start - 1, $len) = $alt_seq;
      }
  }

}

_exonIntronFlankSequences

description

Top

sub _exonIntronFlankSequences {

    my ($self, $atable, $curRow) = @_;
    
    my $rank = 1;
    
    # Determine this and last primary keys
    my $importable_indices = $self->get('importable_indices');
    # Get the primary sequence ID from this row. Use DUMMY if missing
    my $pkey     = $curRow ? 
	($curRow->[$importable_indices->{"pkey"}] || 'DUMMY') : undef;
    my $lastPkey = $self->get('lastPkey') || $pkey;

    my $outRow = $self->get('outRow');

    if( ( ! defined $pkey ) or ( $pkey ne $lastPkey ) ){
	# Start of new row, or end of results; Dump the current sequence
	my $shift = ($self->get('seq_name') =~ m/flank/);
	my $location = $self->_modFlanks( $self->get('calc_location'), 
					  $shift );
	$self->set('calc_location', undef); # Reset location cache

	my $sequence;
	if ($location->{"start"}) {
	    my $locations = { $rank => $location };
	    $sequence = $self->_processSequence($locations);
	    $self->_editSequence(\$sequence);
	}

	if ($sequence) { 
	    $self->_addRow($atable, $outRow, $sequence);
	} 
	else {      
	    $self->_addRow($atable, $outRow, "Sequence unavailable");
	}
	$outRow = undef;
    } # End sequence dumping

    if ($curRow) {
	# Update the location corresponding to this row
	my $location = $self->_getLocationFrom($curRow, "chr", "start", 
					       "end", "strand");
	$self->_calcSeqOverLocations( $location );
    }

    $outRow ||= $self->_initializeReturnRow($curRow);
    $self->set('lastPkey', $pkey);
    $self->set('outRow', $outRow);

}

_exonSequences

description

Top

sub _exonSequences {

    my ($self, $atable, $curRow) = @_;

    $curRow || return; # Process row-by-row; can discard last (empty) call
    
    return if ($self->_ignoreRow($curRow)); #ignore duplicate exons 
    my $rank = 1;

    my $locations = {};
    $locations->{$rank} = $self->_modFlanks( $self->_getLocationFrom($curRow, 
	   "chr", "start", "end", "strand"), 0 );

    my $sequence;
    if ($locations->{1}->{"start"}) {
	$sequence = $self->_processSequence($locations);
	$self->_editSequence(\$sequence);
    }
    if ($sequence) {
	$self->_addRow($atable, $self->_initializeReturnRow($curRow), 
		       $sequence);
    } 
    else {      
	$self->_addRow($atable, $self->_initializeReturnRow($curRow), 
		       "Sequence unavailable");
    }
  
    if ($self->get('ignore')) {
	#will only be true for gene_exon
	my $ignore = $self->get('ignore');
	my $ignore_row = $self->get('ignore_row');
	my $ref = $self->_getLocationFrom($curRow, $ignore_row);
	$ignore->{ $ref->{ $ignore_row  } } = 1; #skip duplicate pkeys
	$self->set('ignore', $ignore);
    }
    #else there will be no last entry

}

_getConfigurationTree

description

Top

sub _getConfigurationTree {

    my ($self,$interface,$dsCounter)=@_;;

    return $self->getParam('configurator')->getConfigurationTree(
       $self->virtualSchema, 
       $self->name,
       $interface,
       $dsCounter);

}

_getLocationFrom

description

Top

sub _getLocationFrom {

    my ($self, $curRow, @expectedFields) = @_;
    
    my $importable_indices = $self->get('importable_indices');
    my $location = {};
    
    foreach my $expectedField (@expectedFields) {
	$location->{$expectedField} = 
	    ( exists( $importable_indices->{$expectedField}  ) ) ? 
	    $curRow->[ $importable_indices->{$expectedField} ] : undef;
    }
    
    return $location;

}

_getResultTable

description

Top

sub _getResultTable {

    my ($self, @param) = @_;
  
    $self->set('batchIndex', 0);
    local($^W) = 0;  # prevent "odd number of elements" warning with -w.
    my(%param) = @param;

    my $query = $param{'query'};
    my $atable = $param{'table'};

    my $batch_size = $param{'batch_size'};
  
    if ($self->serverType eq "web"){  
	my $batch_start = $param{'batch_start'} || 0;
	
	my $location = $self->getParam('configurator')->get('location');
	my $xml = $query->toXML($batch_start,$batch_size,0);    
        	
	foreach my $el($location->getResultSet("","POST",$xml)){
	    if ($el =~ /No Sequence Returned/) {
		$self->_setExhausted(1);
		last;
	    }
	    my @clean=split(/\t/,$el);
	    $atable->addRow([@clean]);
	}
	
	return $atable;
    } else {
		$self->_initializeDNAAdaptor($query->
				     getInterfaceForDataset($self->name));
    }

    my $importable = $self->get('importable');
    my $rtable = $importable->getTable();

    my $attribute_count = @{$query->getAllAttributes};
    if ($rtable->hashedResults || $attribute_count > 1){
	$self->set('attribute_merge_required','1');
    }

    my $has_rows = $rtable->hasMoreRows;
    while ($has_rows && $self->_continueWithBatch($batch_size, $rtable)) {
	$self->_processRow( $atable, $rtable->nextRow);
    }
    # the last and final call to GenomicSequence, after the call which 
    # exhausts the importable, will result in the last sequence being 
    # processed and added to the resultTable.
    # the next call after this returns undef.
    unless ($has_rows) {
	$self->_setExhausted(1);
	$self->_processRow($atable);
    }
    $importable->setTable($rtable);
    $self->set('importable', $importable);

  	$self->get('dna')->close;
  		
    return $atable;
}
### --------------------------------------------------------------------
###     sequence __recipes__
### --------------------------------------------------------------------

}

_ignoreRow

description

Top

sub _ignoreRow {

  my ($self, $curRow) = @_;

  my $ignore = $self->get('ignore');
  return 0 unless ($ignore);  

  my $ignore_row = $self->get('ignore_row');
  my $test = $self->_getLocationFrom($curRow, $ignore_row);
  
  #if the actual value is false, return false, 
  # else, return ignore for the value
  return $test->{ $ignore_row } && $ignore->{ $test->{ $ignore_row  } };

}

_incrementBatch

description

Top

sub _incrementBatch {

    my $self = shift;
  
    my $batchIndex = $self->get('batchIndex');
    $batchIndex++;

    $self->set('batchIndex', $batchIndex);

}

_initializeDNAAdaptor

description

Top

sub _initializeDNAAdaptor {

    my ($self,$interface) = @_;
    
    my $dna_params = $self->getConfigurationTree($interface)->
	optionalParameters;
    unless ($dna_params) {
	BioMart::Exception::Configuration->throw("GenomicSequence Dataset requires optional_parameters to be set in the DatasetConfig\n");
    }

	my ($dnatablename, $chunk_name_fieldname, $chunk_start_fieldname, 
	    $seqfieldname,$chunk_size) = split /\,/, $dna_params;
	my $dna = BioMart::Dataset::GenomicSequence::DNAAdaptor->new(
		'seq_name' => $self->name,
		'dna_tablename' => $dnatablename,
		'seq_fieldname' => $seqfieldname,
		'chunk_name_fieldname' => $chunk_name_fieldname,
		'chunk_start_fieldname' => $chunk_start_fieldname,
		'chunk_size' => $chunk_size,
		'configurator' => $self->getParam('configurator'),
								     );

	unless ($dna) {
	    BioMart::Exception::Configuration->throw("Couldnt connect to DNAAdaptor\n");
	}

	$self->set('dna', $dna);

}

_initializeIndices

description

Top

sub _initializeIndices {

    my ($self, $numFields) = @_;

    my $returnRow_indices = {};
    my $importable_indices = {};

    my $filts = $self->get('importable')->getAllFilters;

    #define where the importable fields are in rtable
    my $index = 0;
    foreach my $filt (@{$filts}) {
	$importable_indices->{$filt->name} = $index;
	$index++;
    }

    # define where fields needing to be merged into final returnRow are 
    # in rtable
    my $resultIndex = 0;
    while ($index < $numFields) {
	$returnRow_indices->{$index} = $resultIndex;
	$index++;
	$resultIndex++;
    }

    $self->set('importable_indices', $importable_indices);
    $self->set('returnRow_indices', $returnRow_indices);

}

_initializeReturnRow

description

Top

sub _initializeReturnRow {

      my ($self, $curRow) = @_;

      # this method used to handle the structure attributes as FASTA headers
      # no longer necessary with new attribute merging code in DatasetI.pm

      # if hashed attributes exist from previous structure dataset then just 
      # return $curRow, otherwise return []
      # my $importable = $self->get('importable');
      # my $rtable = $importable->getTable();

      return $self->get('attribute_merge_required') ? $curRow : [];

}

_modFlanks

description

Top

sub _modFlanks {

    my ($self, $location, $shift) = @_;
    
    $location->{start} || return $location; # Sanity check
    $location->{end}   || return $location; # Sanity check

    if ($shift) {
	#shift for flanks only - if user accidentally chooses both flanks, 
	# assume upstream as the original martview
	if ($self->get('upstream_flank')) {
	    if ($location->{"strand"} < 0) {
		$location->{"start"} = $location->{"end"} + 1;
		$location->{"end"} += $self->get('upstream_flank');
	    } 
	    else {
		$location->{"end"} = $location->{"start"} - 1;
		$location->{"start"} -= $self->get('upstream_flank');
	    }
	} 
	elsif ($self->get('downstream_flank')) {
	    if ($location->{"strand"} < 0) {
		$location->{"end"} = $location->{"start"} - 1;
		$location->{"start"} -= $self->get('downstream_flank'); 
	    } 
	    else {
		$location->{"start"} = $location->{"end"} + 1;
		$location->{"end"} += $self->get('downstream_flank');
	    }
	} 
	else {
	    BioMart::Exception::Configuration->throw("Requests for flank sequence must be accompanied by an upstream_flank or downstream_flank request\n");
	}
    } 
    else {
	if ($location->{"strand"} < 0) {
	    $location->{"start"} -= $self->get('downstream_flank');
	    $location->{"end"} += $self->get('upstream_flank');
	} else {
	    $location->{"start"} -= $self->get('upstream_flank');
	    $location->{"end"} += $self->get('downstream_flank');
	}
    }

    #sometimes users request more flanking sequence than is avaiable
    $location->{"start"} = 1 if ($location->{"start"} < 1);
    return $location;

}

_new

description

Top

sub _new {

    my ($self, @param) = @_;
    $self->SUPER::_new(@param);

    $self->attr('dna', undef);
    $self->attr('dnaparams', undef);

    $self->attr('recipe', undef); #this will hold a subRef
    $self->attr('ignore', undef);
    $self->attr('ignore_row', undef);
    $self->attr('seq_edits', undef); 
    $self->attr('codon_table_id', 1); #codon table defaults to 1
    $self->attr('seq_name', undef); # this is linked to the Attribute->name, 
                                    # determines which sequence recipe to run
    $self->attr('translate', 0); # set to true for peptide
    $self->attr('downstream_flank', 0);
    $self->attr('upstream_flank', 0);
    $self->attr('importable', undef);
    $self->attr('lastPkey', undef);
    $self->attr('importable_indices', undef); # initialized when first row 
                                              # processed in first batch for a 
                                              # given query
    $self->attr('returnRow_indices', undef); # initialized when first row 
                                             # processed in first batch for a 
                                             # given query
    $self->attr('returnRow', undef);
    $self->attr('batchIndex', 0); # increment each time a new pkey is seen
    
    $self->attr('locations', {}); # not used by all sequences
    $self->attr('outRow', undef); # not used by all sequences
    
    #attributes calculated over sequence locations
    $self->attr('calc_location', undef);

    $self->attr('sequence', undef);
    $self->attr('attribute_merge_required', 0);
}

#private methods

}

_processRow

description

Top

sub _processRow {

    my ($self, $atable, $curRow) = @_;
    
    # if this is the very first row for a new query, initialize the indices 
    # using its length as numFields
    unless ($self->get('importable_indices')) {
	if ($self->get('exhausted')) {
	    $atable->addRow(["No Sequence Returned"]);
	} 
	else {
	    my $numFields = @{$curRow};
	    $self->_initializeIndices($numFields);
	}
    }

    my $method = $self->get('recipe');
    $self->$method($atable, $curRow);

}

_processSequence

description

Top

sub _processSequence {

    my ($self, $locations) = @_;

    my $seq = '';
    my $temp_Seq = '';
    my $first_coding_exon_flag = 0;
	
    my $dna = $self->get('dna');

    foreach my $rank (sort { $a <=> $b } keys %{$locations}) {
	my $location = $locations->{$rank}; #warn "M_processSequence Rank: $location\n";
	my $chr = $location->{'chr'};#warn "M_processSequence Chr: $chr\n";
	my $start = $location->{'start'};#warn "M_processSequence Start: $start\n";
	my $end = $location->{'end'};#warn "M_processSequence End: $end\n";
	my $strand = exists( $location->{'strand'}) ? $location->{'strand'} : 1; #warn "M_processSequence Strand: $strand\n";
   	my $phase = $location->{'phase'} || 0; #warn "M_processSequence Phase: $phase\n";
	
	if ($first_coding_exon_flag == 0) {
	    if ($strand < 0) {
		    $temp_Seq = $self->_rc( $dna->
			  getSequence( $chr, $start, $end ) );
	        }
		else {
		    $temp_Seq = $dna->getSequence( $chr, $start, $end );
  		}
		if($temp_Seq) { # incase its not the first coding exon, 
		                # undef is returned by DNAAdapter
		    if ($phase > 0) { # copying Ns in beginning, exactly as the
			              # value of phase of first coding exon.
			$seq = 'N'x$phase;
		    }	
		    $seq .= $temp_Seq;
		    $first_coding_exon_flag = 1;
		}
		
        }
	else {
	    if ($strand < 0) {
		$seq .= $self->_rc( $dna->getSequence( $chr, $start, $end ) );
	    } 
	    else {
		$seq .= $dna->getSequence( $chr, $start, $end );
	    }
    	}
     }

    if (length($seq)) {
	return $seq;
    
    }
    return undef

}

_rawSequences

description

Top

sub _rawSequences {

    my ($self, $atable, $curRow) = @_;
  
    my $rank = 1;

    if ($curRow) {
	my $importable_indices = $self->get('importable_indices');
	
	my $locations = {};
    my $location = $self->_getLocationFrom($curRow, "chr", "start", "end");
	
	$location->{"strand"} = ( exists( $importable_indices->{"strand"} ) ) ?
	    $curRow->[  $importable_indices->{"strand"} ] : 1;
	
	$locations->{$rank} = $location if ($location->{"start"});
	my $sequence = $self->_processSequence($locations);
	$self->_editSequence(\$sequence);
	if ($sequence) {
	    $self->_addRow($atable, $self->_initializeReturnRow($curRow), 
			   $sequence);
	}
    }

}

_rc

description

Top

sub _rc {

    my ($self, $seq) = @_;

    $seq = reverse($seq);
    $seq =~ tr/YABCDGHKMRSTUVyabcdghkmrstuv/RTVGHCDMKYSAABrtvghcdmkysaab/;
    return $seq;

}

_snpSequences

description

Top

sub _snpSequences {

    my ($self, $atable, $curRow) = @_;
  
    my $rank = 1;

    if ($curRow) {
	#since locations are just hashes, hack them
	my $location = $self->_getLocationFrom($curRow, "chr", "pos", 
					       "strand", "allele");
	
	if ($location->{"strand"} < 0) {
	    $location->{"start"} = $location->{"pos"} - 
		$self->get('downstream_flank');
	    $location->{"start"} = 1 
		if ($location->{"start"} < 1);
	    $location->{"end"} = $location->{"pos"} + 
		$self->get('upstream_flank');
	    $location->{"off"} = $self->get('upstream_flank');
	} 
	else {
	    $location->{"start"} = $location->{"pos"} - 
		$self->get('upstream_flank');
	    $location->{"end"} = $location->{"pos"} + 
		$self->get('downstream_flank');
	    $location->{"off"} = $self->get('upstream_flank');
	    if ($location->{"start"} < 1) {
		$location->{"off"} = $self->get('upstream_flank') + 
		    $location->{"start"} - 1;
		$location->{"start"} = 1;
	    }
	}

	my $locations = {};
	$locations->{$rank} = $location;
	my $sequence = $self->_processSequence($locations);
	$self->_editSequence(\$sequence);
	if ($sequence) {
	    substr($sequence, $location->{"off"}, 1) = "\n".
		$location->{"allele"}."\n";
	    $self->_addRow($atable, $self->_initializeReturnRow($curRow), 
			   $sequence);
	}    
    }
}

1;

}

_translate

description

Top

sub _translate {

   my ($self, $seq) = @_;
   
   BioMart::Exception::Configuration->throw("Calling translate without a seq argument!") 
       unless defined $seq;
   return '' unless $seq;

   my $id = $self->get('codon_table_id') || DEFAULTCODONTABLEID;
   my ($partial) = 0;
   $partial = 2 if length($seq) % 3 == 2;

   $seq = lc $seq;
   $seq =~ tr/u/t/;
   my $protein = "";
   if ($seq =~ /[^actg]/ ) { #ambiguous chars
       for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) {
	   my $triplet = substr($seq, $i, 3);
	   if (exists $CODONS->{$triplet}) {
	       $protein .= substr($TABLES[$id-1],
				  $CODONS->{$triplet},1);
	   } 
	   else {
	       $protein .= $self->_translate_ambiguous_codon($triplet);
	   }
       }
   } 
   else { # simple, strict translation
       for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) {
	   my $triplet = substr($seq, $i, 3);
	   if (exists $CODONS->{$triplet}) {
	       $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
	   } 
	   else {
	       $protein .= 'X';
	   }
       }
   }
   if ($partial == 2) { # 2 overhanging nucleotides
       my $triplet = substr($seq, ($partial -4)). "n";
       if (exists $CODONS->{$triplet}) {
	   my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1);
	   $protein .= $aa;
       } else {
	   $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
       }
   }
   return $protein;

}

_translate_ambiguous_codon

description

Top

sub _translate_ambiguous_codon {

    my ($self, $triplet, $partial) = @_;
    $partial ||= 0;
    my $id = $self->get('codon_table_id') || DEFAULTCODONTABLEID;
    my $aa;
    my @codons = _unambiquous_codons($triplet);

    my %aas =();
    foreach my $codon (@codons) {
	$aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1;
    }
    my $count = scalar keys %aas;
    
    if ( $count == 1 ) {
	$aa = (keys %aas)[0];
    }
    elsif ( $count == 2 ) {
	if ($aas{'D'} and $aas{'N'}) {
	    $aa = 'B';
	}
	elsif ($aas{'E'} and $aas{'Q'}) {
	    $aa = 'Z';
	} 
	else {
	    $partial ? ($aa = '') : ($aa = 'X');
	}
    } 
    else {
	$partial ? ($aa = '') :  ($aa = 'X');
    }
    return $aa;

}

_unambiquous_codons

description

Top

sub _unambiquous_codons {

    my ($value) = @_;
    my @nts = ();
    my @codons = ();
    my ($i, $j, $k);

    @nts = map { $IUPAC_DNA{uc $_} }  split(//, $value);

    for my $i (@{$nts[0]}) {
	for my $j (@{$nts[1]}) {
	    for my $k (@{$nts[2]}) {
		push @codons, lc "$i$j$k";
	    }
	}
    }
    return @codons;

}

_utrSequences

description

Top

sub _utrSequences {

    my ($self, $atable, $curRow) = @_;

    my $locations = $self->get('locations');
    my $outRow = $self->get('outRow');
    
    if ($curRow) {
	my $importable_indices = $self->get('importable_indices');

	my $pkey = $curRow->[ $importable_indices->{"pkey"} ];

	my $lastPkey = $self->get('lastPkey');
	if ( $lastPkey && ($pkey ne $lastPkey) ) {
	    my $hasUtr = keys %{$locations};
	    if ($hasUtr) {
		my @ranks = sort { $a <=> $b } keys %{$locations};
		my $low_rank = shift @ranks;
		my $hi_rank = ($hasUtr > 1) ? pop @ranks : $low_rank;

		my $calc_location = $self->get('calc_location');
		
		my $low_loc_strand = $locations->{$low_rank}->{"strand"};
		if ($self->get('upstream_flank')) {
		    if ($low_loc_strand < 0) {
			$calc_location->{"start"} = 
			    $calc_location->{"end"} + 1;
			$calc_location->{"end"} = 
			    $calc_location->{"start"} + 
			    $self->get('upstream_flank') - 1;
		    } 
		    else {
			$calc_location->{"end"} = 
			    $calc_location->{"start"} - 1;
			$calc_location->{"start"} = 
			    $calc_location->{"start"} - 
			    $self->get('upstream_flank') + 1;
			$calc_location->{"start"} = 1 
			    if ($calc_location->{"start"} < 1);
		    }
		    
		    #prepend to sequence
		    $locations->{$low_rank - 1} = $calc_location;
		} 
		elsif ($self->get('downstream_flank')) {
		    if ($low_loc_strand < 0) {
			$calc_location->{"end"} = 
			    $calc_location->{"start"} - 1;
			$calc_location->{"start"} = 
			    $calc_location->{"end"} - 
			    $self->get('downstream_flank') + 1;
		    } 
		    else {
			$calc_location->{"start"} = 
			    $calc_location->{"end"} + 1;
			$calc_location->{"end"} = 
			    $calc_location->{"start"} + 
			    $self->get('downstream_flank') - 1;
		    }
		    
		    #append to sequence
		    $locations->{$hi_rank + 1} = $calc_location;
		}      
		
		my $sequence = $self->_processSequence($locations);
		$self->_editSequence(\$sequence);
		if ($sequence) {
		    $locations = {};
		    $hasUtr = 0;
		    $self->_addRow($atable, $outRow, $sequence);
		    $outRow = undef;
		    $self->set('calc_location', undef);
		}
	    } 
	    else {
		$self->_addRow($atable, $outRow, 
			       "No UTR is annotated for this transcript");
		$outRow = undef;
	    }
	}

	unless ($outRow) { 
	    $outRow = $self->_initializeReturnRow($curRow);
	}

	#for this one we build and calc
	my $rank = $curRow->[ $importable_indices->{"rank"} ];
	my $location = $self->_getLocationFrom($curRow, "chr", "start", 
					       "end", "strand");
	if ($location->{"start"}) {
	    $locations->{$rank} = $location;  
	    $self->_calcSeqOverLocations($location);
	}

	$self->set('lastPkey', $pkey);
    } 
    else {
	# last entry
	my $hasUtr = keys %{$locations};
	if ($hasUtr) {
	    my @ranks = sort { $a <=> $b } keys %{$locations};
	    my $low_rank = shift @ranks;
	    my $hi_rank = ($hasUtr > 1) ? pop @ranks : $low_rank;
	    
	    my $calc_location = $self->get('calc_location');

	    my $low_loc_strand = $locations->{$low_rank}->{"strand"};
	    if ($self->get('upstream_flank')) {
		if ($low_loc_strand < 0) {
		    $calc_location->{"start"} = $calc_location->{"end"} + 1;
		    $calc_location->{"end"} = 
			$calc_location->{"start"} + 
			$self->get('upstream_flank') - 1;
		} 
		else {
		    $calc_location->{"end"} = $calc_location->{"start"} - 1;
		    $calc_location->{"start"} = 
			$calc_location->{"start"} - 
			$self->get('upstream_flank') + 1;
		    $calc_location->{"start"} = 1 
			if ($calc_location->{"start"} < 1);
		}

		#prepend to sequence
		$locations->{$low_rank - 1} = $calc_location;
	    } 
	    elsif ($self->get('downstream_flank')) {
		if ($low_loc_strand < 0) {
		    $calc_location->{"end"} = $calc_location->{"start"} - 1;
		    $calc_location->{"start"} = $calc_location->{"end"} - 
			$self->get('downstream_flank') + 1;
		} 
		else {
		    $calc_location->{"start"} = $calc_location->{"end"} + 1;
		    $calc_location->{"end"} = $calc_location->{"start"} + 
			$self->get('downstream_flank') - 1;
		}

		#append to sequence
		$locations->{$hi_rank + 1} = $calc_location;
	    }

	    my $sequence = $self->_processSequence($locations);
	    $self->_editSequence(\$sequence);
	    if ($sequence) {
		$locations = {};
		$hasUtr = 0;
		$self->_addRow($atable, $outRow, $sequence);
		$outRow = undef;
		$self->set('calc_location', undef);
	    }
	} 
	else {
	    $self->_addRow($atable, $outRow, 
			   "No UTR is annotated for this transcript");
	    $outRow = undef;
	}
    }
    
    $self->set('locations', $locations);
    $self->set('outRow', $outRow);

}

General documentation

AUTHOR - Darin London, Damian Smedley

Top

CONTACT

Top

This module is part of the BioMart project http://www.biomart.org
Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk

Developer Notes

Top

  The peptide translation algorithm is taken directly
  from the CodonTable module that is part of the
  BioPerl project.  For more information about the
  BioPerl project, visit:

  http://www.bioperl.org