GenomicAlign documentation.

A hidden Dataset containing sequence attributes that can be imported to other
visible Datasets which are compatible with its required data input, based
on the presence of one or more importable-exportable relationships.

Description

Dataset providing Align Sequence attributes, which can be imported into
other Datasets. AlignSequence is itself not a visible Dataset.

Methods

__processNewQuery	No description	Code
_addRow	No description	Code
_calcSeqOverLocations	No description	Code
_continueWithBatch	No description	Code
_editSequence	No description	Code
_getConfigurationTree	No description	Code
_getExportables	No description	Code
_getImportables	No description	Code
_getLocationFrom	No description	Code
_getResultTable	No description	Code
_ignoreRow	No description	Code
_incrementBatch	No description	Code
_initializeDNAAdaptor	No description	Code
_initializeIndices	No description	Code
_initializeReturnRow	No description	Code
_new	No description	Code
_processRow	No description	Code
_processSequence	No description	Code
_rawSequences	No description	Code
_rawSequencesOriginal	No description	Code
_rc	No description	Code

Methods description

None available.

Methods code

__processNewQuery

description

Top

sub __processNewQuery {

    my ($self, $query) = @_;

    #print STDERR ("##GenomicAlign  1_processNewQuery  ".localtime(time)."\n");
#    print "\nname:",$self->name," GenomicAlign\n";
#    print "\nquery: ",$query," GenomicAlign\n";
#    print "\ntableau: ",$query->getAllAttributes($self->name)," GenomicAlign\n";
    my $attribute = $query->getAllAttributes($self->name)->[0];
    my $seq_name = $attribute->name;
    $self->set('seq_name', $seq_name);
    ##$self->set('translate', ($seq_name =~ m/peptide$/));
    #print STDERR ("##GenomicAlign  2_processNewQuery  ".localtime(time)."\n");
    my $ignore = 'IGNORE';
    if ($seq_name =~ m/raw/){
	
	$self->set('recipe', '_rawSequences');
	
    } else {
	BioMart::Exception::Configuration->throw("Unsupported sequence name $seq_name recieved by GenomicAlign\n");    
    }
    #print STDERR ("##GenomicAlign  3_processNewQuery  ".localtime(time)."\n");
    #$self->set('downstream_flank', 0);
    #$self->set('upstream_flank', 0);
    $self->set('importable', undef);
    $self->set('lastPkey', undef);
    $self->set('importable_indices', undef);
    $self->set('returnRow_indices', undef);
    $self->set('locations', {});
    $self->set('outRow', undef);
    $self->set('calc_location', undef);
    $self->set('sequence', undef);
    
    #determine which BaseSequenceA object to create
    my $filters = $query->getAllFilters($self->name);
    #print STDERR ("##GenomicAlign  4_processNewQuery  ".localtime(time)."\n");
    foreach my $filt (@{$filters}) {
	if ($filt->isa("BioMart::Configuration::FilterList")) {
	    if ($filt->linkName) {
		if ($self->get('importable') ) {
		    BioMart::Exception::Configuration->throw("Recieved two importables, can only work with one\n");
		} else {
		    $self->set('importable', $filt);
		}
	    } else {
		BioMart::Exception::Configuration->throw("Recieved invalid linkName ".$filt->linkName."\n");
	    }
	} else {
	    #must be a downstream or upstream valueFilter
	    unless ($filt->isa("BioMart::Configuration::ValueFilter")) {
		BioMart::Exception::Configuration->throw("Recieved unknown filter ".$filt->name." in GenomicSequence Dataset!\n");
	    }
	    
	    if ($self->get($filt->name)) {
		BioMart::Exception::Configuration->throw("Recieved two ".$filt->name." flanking filters in GenomicSequence Dataset\n");
	    }
	    
	    #could still be some strange ValueFilter that is not upstream or downstream, but not likely
	    #will throw an exception if this is the case
	    my $table = $filt->getTable;
	    my $row = $table->nextRow;
	    my $value = $row->[0];
	    if ($value) {
		$self->set($filt->name, $value);
	    }
	}
    }
    #print STDERR ("##GenomicAlign  5_processNewQuery  ".localtime(time)."\n");
    unless ($self->get('importable')) {
	BioMart::Exception::Configuration->throw("No Importable Recieved in GenomicAlign\n");
    }

}

_addRow

description

Top

sub _addRow {

    my ($self, $atable, $outRow) = @_;
    $atable->addRow($outRow);
    $self->_incrementBatch;

}

_calcSeqOverLocations

description

Top

sub _calcSeqOverLocations {

  my ($self, $this_location) = @_;
  $this_location->{start} || return; # Sanity check
  $this_location->{end}   || return; # Sanitt check

  my $calc_location = $self->get('calc_location');

  if ($calc_location) {
    $calc_location->{"start"} = $this_location->{"start"}  if ($this_location->{"start"} < $calc_location->{"start"});
    $calc_location->{"end"} = $this_location->{"end"}  if ($this_location->{"end"} > $calc_location->{"end"});
  } else {
    $calc_location = {};
    foreach my $key (keys %{$this_location}) {
      $calc_location->{$key} = $this_location->{$key};
	}
  }
  
  $self->set('calc_location', $calc_location);

}

_continueWithBatch

description

Top

sub _continueWithBatch {

    my ($self, $batchSize, $rtable) = @_;
    
    #always true if underlying table is an AttributeTable and it has rows
    my $continue = ($rtable->isa("BioMart::ResultTable")) 
	? $rtable->inCurrentBatch() 
	: $rtable->hasMoreRows;
    
    if ($continue && $batchSize) {
	my $batchIndex = $self->get('batchIndex');
	$continue = ($batchIndex < $batchSize);
    }
    
    return $continue;

}

_editSequence

description

Top

sub _editSequence {

  my ($self, $seqref) = @_;

  my $seq_edits = $self->get('seq_edits');

  if ($$seqref && $seq_edits) {
    foreach my $seq_edit (split /\;/, $seq_edits) {
      my ($start, $end, $alt_seq) = split /\,/, $seq_edit;

      my $len = $end - $start + 1;
      substr($$seqref, $start - 1, $len) = $alt_seq;
	}
  }

}

_getConfigurationTree

description

Top

sub _getConfigurationTree {

  my $self = shift;
  return $self->getParam('configurator')->getConfigurationTree($self->virtualSchema, $self->name);

}

_getExportables

description

Top

sub _getExportables {

    my ($self, $linkName) = @_;

    my $exportables = $self->get('exportables');
    if ($linkName) {
      return [ $exportables->{$linkName} ];
	}

    my $ref = [];
    push @{$ref}, values %{$exportables};
    return $ref;

}

_getImportables

description

Top

sub _getImportables {

  my ($self, $linkName) = @_;

  my $importables = $self->get('importables');
  if ($linkName) {
    #return [ $importables->{$linkName} ];
    return $importables->{$linkName};
  }

  my $ref = [];
  push @{$ref}, values %{$importables};

  return $ref;

}

_getLocationFrom

description

Top

sub _getLocationFrom {

    my ($self, $curRow, @expectedFields) = @_;
    
    my $importable_indices = $self->get('importable_indices');
    my $location = {};
    
    foreach my $expectedField (@expectedFields) {
    $location->{$expectedField} = ( exists( $importable_indices->{$expectedField}  ) ) ? $curRow->[ $importable_indices->{$expectedField} ] : undef;
}
    
    return $location;
}


######################################################

}

_getResultTable

description

Top

sub _getResultTable {

   my ($self, @param) = @_;
  $self->set('batchIndex', 0);
  local($^W) = 0;  # prevent "odd number of elements" warning with -w.
  my(%param) = @param;

  my $query = $param{'query'};

  my $atable = $param{'table'};
  my $batch_size = $param{'batch_size'};
  
    if ($self->serverType eq "web"){  
	my $batch_start = $param{'batch_start'} || 0;
	
	my $location = $self->getParam('configurator')->get('location');
	my $xml = $query->toXML($batch_start,$batch_size,0);    
        	
	foreach my $el($location->getResultSet("","POST",$xml)){
	    if ($el =~ /No Sequence Returned/) {
		$self->_setExhausted(1);
		last;
	    }
	    my @clean=split(/\t/,$el);
	    $atable->addRow([@clean]);
	}
	
	return $atable;
    } else {
		$self->_initializeDNAAdaptor($query->
				     getInterfaceForDataset($self->name));
    }
    
# print STDERR ("##BATCH SIZE FOR ".$self->name." IS ".$batch_size."\n");
  
  my $importable = $self->get('importable');
  my $rtable = $importable->getTable();
  
  my $has_rows = $rtable->hasMoreRows;
  
  while ($has_rows && $self->_continueWithBatch($batch_size, $rtable)) {
      $self->_processRow( $atable, $rtable->nextRow );
  }
  
  # the last and final call to GenomicSequence, after the call which exhausts the importable,  
  # will result in the last sequence being processed and added to the resultTable.  
  # the next call after this returns undef.
  unless ($has_rows) {
      $self->_setExhausted(1);
#      $self->_processRow($atable);
  }
  
  $importable->setTable($rtable);
  $self->set('importable', $importable); 
  
  		my $dna = $self->get('dna');
		foreach my $attribute_name (keys %$dna) {
			 $dna->{$attribute_name}->close;
		}
  
  return $atable;
}

### sequence __recipes__

}

_ignoreRow

description

Top

sub _ignoreRow {

  my ($self, $curRow) = @_;

  my $ignore = $self->get('ignore');
  return 0 unless ($ignore);  

  my $ignore_row = $self->get('ignore_row');
  my $test = $self->_getLocationFrom($curRow, $ignore_row);
  
  #if the actual value is false, return false, else, return ignore for the value
  return $test->{ $ignore_row } && $ignore->{ $test->{ $ignore_row  } };

}

_incrementBatch

description

Top

sub _incrementBatch {

  my $self = shift;

  my $batchIndex = $self->get('batchIndex');
  $batchIndex++;

  $self->set('batchIndex', $batchIndex);

}

_initializeDNAAdaptor

description

Top

sub _initializeDNAAdaptor {

    my $self = shift;
    
    my $dna_params = $self->getConfigurationTree()->optionalParameters;
    unless ($dna_params) {
	BioMart::Exception::Configuration->throw("GenomicSequence Dataset requires optional_parameters to be set in the DatasetConfig\n");
    }
    
    my $dna = {};

## from the has_mmu_align_sequence  dataset	
## need to parse the attributes - to get different DNAadaptor for each species  
## Attribute : optional parameters
## hsapiens_sequence,hsapiens_genomic_sequence__dna_chunks__main,chr_name,chr_start,sequence,100000;mmus_oriented_raw_sequence,mmusculus_genomic_sequence__dna_chunks__main,chr_name,chr_start,sequence,100000
##
	foreach  my $dna_params4specie  ( split /\;/, $dna_params ){

	    #print STDERR ("##GenomicAlign _initializeDNAAdaptor  ".localtime(time)."\n");

#	    print "\n+In subroutine __dnaAdaptor dna_params4specie  $dna_params4specie\n";
	    ## hsa_oriented_raw_sequence,hsapiens_genomic_sequence__dna_chunks__main,chr_name,chr_start,sequence,100000
	    my ($attribute_name, $dnatablename, $chunk_name_fieldname, $chunk_start_fieldname, $seqfieldname,$chunk_size) = split /\,/, $dna_params4specie ;

#	    print "\n\n\nstoring dna for $attribute_name $dnatablename $chunk_name_fieldname $chunk_start_fieldname $seqfieldname $chunk_size\n\n\n";

#	    print "chunk_size: $chunk_size\n";
	    $dna->{$attribute_name} = BioMart::Dataset::GenomicSequence::DNAAdaptor->new('seq_name' => $attribute_name,
									 'dna_tablename' => $dnatablename,                   ##hsapiens_genomic_sequence__dna_chunks__main
									 'seq_fieldname' => $seqfieldname,                   ##sequence
									 'chunk_name_fieldname' => $chunk_name_fieldname,    ##chr_name
									 'chunk_start_fieldname' => $chunk_start_fieldname,  ##chr_start
									 'chunk_size' => $chunk_size,                        ##100000
									 'configurator' => $self->getParam('configurator')
									 );
	    
	    unless ($dna->{$attribute_name}) {
		BioMart::Exception::Configuration->throw("Couldnt connect to DNAAdaptor for $attribute_name\n\n");
	    }	    
	}
#	print "dna: ",$dna,"\n";
        $self->set('dna', $dna);
        
#    print "\n+Out subroutine: __dnaAdaptor GenomicAlign\n";

}

_initializeIndices

description

Top

sub _initializeIndices {

  my ($self, $numFields) = @_;

  my $importable_names = [];
  my $returnRow_indices = {};
  my $importable_indices = {};
  
  my $filts = $self->get('importable')->getAllFilters;

  #define where the importable fields are in rtable
  my $index = 0;
  foreach my $filt (@{$filts}) {
    push @{$importable_names}, $filt->name;
    $importable_indices->{$filt->name} = $index;
    $index++;
  }

  #define where fields needing to be merged into final returnRow are in rtable
  my $resultIndex = 0;
  while ($index < $numFields) {
    $returnRow_indices->{$index} = $resultIndex;
    $index++;
    $resultIndex++;
  }

  $self->set('importable_indices', $importable_indices);
  $self->set('returnRow_indices', $returnRow_indices);
  $self->set('importable_names', $importable_names);

}

_initializeReturnRow

description

Top

sub _initializeReturnRow {

      my ($self, $curRow) = @_;

      #This does __NOT__ concatenate fields that are many<->one with the pkey
      my $returnRow = [];
      
      foreach my $val (@{$curRow}) {
	push @{$returnRow}, $val;
      }
      return $returnRow;

}

_new

description

Top

sub _new {

	
    my ($self, @param) = @_;
    $self->SUPER::_new(@param);
    
    my $i=1; ##
    
    $self->attr('dna', undef);
    $self->attr('dnaparams', undef);
    ######### modified
    $self->attr('recipe', 'raw_sequence'); ## this will hold a subRef
    $self->attr('ignore', undef);
    $self->attr('ignore_row', undef);
    $self->attr('seq_edits', undef); #will hold seq_edits to be applied after sequence has been produced  
    #$self->attr('codon_table_id', 1); #codon table   defaults to 1  
    $self->attr('seq_name', undef); #this is linked to the Attribute->name, determines which sequence recipe to run  
    $self->attr('translate', 0); # set to true for peptide  
    #$self->attr('downstream_flank', 0);  
    #$self->attr('upstream_flank', 0);
    $self->attr('importable', undef);
    $self->attr('lastPkey', undef);
    $self->attr('importable_names', undef); # initialized when first row   processed in first batch for a given query  
    $self->attr('importable_indices', undef); # initialized when first row   processed in first batch for a given query  
    $self->attr('returnRow_indices', undef); # initialized when first row processed in first batch for a given query  
    $self->attr('returnRow', undef);  
    $self->attr('batchIndex', 0); #increment each time a new pkey is encountered.
    $self->attr('seq_species', undef);

    #process rows   until this equals batchSize
    
    $self->attr('locations', {}); # not used by all sequences
    $self->attr('outRow', undef); # not used by all sequences
    
    #attributes calculated over sequence locations
    $self->attr('calc_location', undef);
    
    $self->attr('sequence', undef);
}

#private methods

}

_processRow

description

Top

sub _processRow {

  my ($self, $atable, $curRow) = @_;

  # if this is the very first row for a new query, initialize the indices using its length as numFields
  unless ($self->get('importable_indices')) {
      if ($self->get('exhausted')) {
	  $atable->addRow(["No Sequence Returned"]);
      } else {
        my $numFields = @{$curRow};
        $self->_initializeIndices($numFields);
    }
  }

  my $method = $self->get('recipe');
  $self->$method($atable, $curRow);

}

_processSequence

description

Top

sub _processSequence {

    ############################  my ($self, $location, $attribute_name, $count) = @_;
  #@species_attribute_name contains hsa et mmu
  #print STDERR ("##GenomicAlign  _processSequence  starting ".localtime(time)."\n");
  #print "attribute_name $attribute_name\n";
  #print "count  $count\n";
 # my $i=1;

  my $seq    = ''; 
  my $dna    = $self->get('dna')->{$attribute_name};
  my $chr    = $location->{'chr'.$count};
  my $start  = $location->{'start'.$count};
  my $end    = $location->{'end'.$count};
  my $strand = $location->{'strand'.$count};
  #------------------------
  #my $seq = '';
  #my $dna = $self->get('dna')->{$attribute_name};
  #my $chr = $location->{'chr1'};
  #$chr = $location->{'chr2'} unless (defined $chr);
  #my $start = $location->{'start1'};
  #$start = $location->{'start2'} unless (defined $start);
  #my $end = $location->{'end1'};
  #$end = $location->{'end2'} unless (defined $end);
  #my $strand = $location->{'strand1'};
  #$strand = $location->{'strand2'} unless (defined $strand);
  #$strand = 1 unless (defined $strand);
  #print "coucou $chr $start $end $strand\n";
  
  
  if ($strand < 0) {
      #print STDERR ("##GenomicAlign  NO rc \n".$dna->getSequence( $chr, $start, $end )."\n");
      #print STDERR ("#GenomicAlign  start rc \n"._rc( $dna->getSequence( $chr, $start, $end ))."\n");
      $seq .= $self->_rc( $dna->getSequence( $chr, $start, $end ) );
      #print STDERR ("##GenomicAlign  start end ".localtime(time)."\n");
      #print STDERR ("##GenomicAlign  RC  \n".$seq."\n");
  } else {
      #print STDERR ("##GenomicAlign  start getSequence ".localtime(time)."\n");
      $seq .= $dna->getSequence( $chr, $start, $end );
      #print STDERR ("##GenomicAlign  start getSequence ".localtime(time)."\n");
  }
  $i++;
  #print STDERR ("##GenomicAlign  3_processSequence   $attribute_name ".localtime(time)."\n");
  
  if (length($seq)) {
      return $seq;
  }
 # print STDERR ("##GenomicAlign  4_processSequence   $attribute_name ".localtime(time)."\n");
  return undef;

}

_rawSequences

description

Top

sub _rawSequences {

    my ($self, $atable, $curRow) = @_;
    my $rank             = 1;
    my $overall_count    = 0;
    my $local_count      = 0;
    my $species_numbers  = 0;
    my $count            = 1;
    my $n = 0;
    my @importable_names = @{$self->get('importable_names')};
    my $dna_params = $self->getConfigurationTree()->optionalParameters;
    my @species_dna_params = split(/\;/, $dna_params);
    my @species_attribute_name;

#    print STDERR ("##GenomicAlign  1_rawSequences   ".localtime(time)."\n");
    foreach my $sdp (@species_dna_params) {
	my ($attribute_name) = split(/\,/,$sdp);
	push @species_attribute_name, $attribute_name;
	#print "## $attribute_name\n";
    }
#    print STDERR ("##GenomicAlign  2_rawSequences   ".localtime(time)."\n");

    my $initRow =  $self->_initializeReturnRow($curRow);
    
#    print STDERR ("##GenomicAlign  3_rawSequences   ".localtime(time)."\n");
	
    while (my $attribute_name = shift @species_attribute_name){
	
	my $importable_indices = $self->get('importable_indices');
#	print STDERR ("##GenomicAlign  4_rawSequences   ".localtime(time)."\n");
	
	my ($name, $start, $end, $strand);
	    foreach my $var (\$name,\$ start,\$ end,\$ strand) {
		$$var = shift @importable_names;
		last if (defined $strand);
		#print "strand $strand";
		#shift @importable_names;
	    }
#	print STDERR ("##GenomicAlign  5_rawSequences   $attribute_name".localtime(time)."\n");
	
	my $location = $self->_getLocationFrom($curRow, ($name, $start, $end, $strand));
	
#	print STDERR ("##GenomicAlign  6_rawSequences   $attribute_name ".localtime(time)."\n");
	#print "$attribute_name $name, $start, $end, $strand\n";
	my $sequence = $self->_processSequence($location, $attribute_name, $count);
#	print STDERR ("##GenomicAlign  7_rawSequences   $attribute_name ".localtime(time)."\n");
	if ($sequence) {
	    push @{$initRow}, $sequence;
	}
	## IMPORTANT remove the length as the coordinate aer as folow
	## name, start, end, strand, length
	shift @importable_names ;
#	print STDERR ("##GenomicAlign  8_rawSequences  $attribute_name ".localtime(time)."\n");
	$count++;
    }
    $self->_addRow($atable, $initRow);
#    print STDERR ("##GenomicAlign  9_rawSequences   ".localtime(time)."\n");
}
1;

}

_rawSequencesOriginal

description

Top

sub _rawSequencesOriginal {

           ## SHOULD BE REMOVED   my ($self, $atable, $curRow) = @_;  ## AS IT WAS FROM GENOMICSEQUENCE.PM
  my $rank = 1;
  
  if ($curRow) {
      my $importable_indices = $self->get('importable_indices');
      my $locations = {};
      my $location = $self->_getLocationFrom($curRow, "chr", "start", "end");
      
      $location->{"strand"} = ( exists( $importable_indices->{"strand"} ) ) ?  $curRow->[  $importable_indices->{"strand"} ] : 1;
      $locations->{$rank} = $location if ($location->{"start"});
      my $sequence = $self->_processSequence($locations);
      $self->_editSequence(\$sequence);
      if ($sequence) {
	  $self->_addRow($atable, $self->_initializeReturnRow($curRow), $sequence);
      }
  }
  #else there is no last entry

}

_rc

description

Top

sub _rc {

    my ($self, $seq) = @_;

    #print STDERR "GenomicAlign reverse start ".localtime(time)."\n";
    $seq = reverse($seq);
    #print STDERR "GenomicAlign reverse end tr start ".localtime(time)."\n";
    $seq =~ tr/YABCDGHKMRSTUVyabcdghkmrstuv/RTVGHCDMKYSAABrtvghcdmkysaab/;
    #print STDERR "GenomicAlign tr end _rc end ".localtime(time)."\n";
    return $seq;
}

#interface methods

}

General documentation

AUTHOR - Arek Kasprzyk, Darin London

Top

CONTACT

Top

This module is part of the BioMart project http://www.biomart.org
Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk

Developer Notes

Top

  The peptide translation algorithm is taken directly
  from the CodonTable module that is part of the
  BioPerl project.  For more information about the
  BioPerl project, visit:

  http://www.bioperl.org