BioMart::Dataset
GenomicSequence
Toolbar
Summary
BioMart::Dataset::GenomicSequence
Package variables
No package variables defined.
Included modules
Data::Dumper
Log::Log4perl
Inherit
Synopsis
A hidden Dataset containing sequence attributes that can be imported to other
visible Datasets which are compatible with its required data input, based
on the presence of one or more importable-exportable relationships.
Description
Dataset providing Genomic Sequence attributes, which can be imported into
other Datasets. GenomicSequence is itself not a visible Dataset.
Methods
BEGIN | | Code |
__processNewQuery | No description | Code |
_addRow | No description | Code |
_calcSeqOverLocations | No description | Code |
_codingCdnaPeptideSequences | No description | Code |
_continueWithBatch | No description | Code |
_editSequence | No description | Code |
_exonSequences | No description | Code |
_gene_exonIntronFlankSequences | No description | Code |
_getConfigurationTree | No description | Code |
_getLocationFrom | No description | Code |
_getResultTable | No description | Code |
_ignoreRow | No description | Code |
_incrementBatch | No description | Code |
_initializeDNAAdaptor | No description | Code |
_initializeIndices | No description | Code |
_initializeReturnRow | No description | Code |
_modFlanks | No description | Code |
_new | No description | Code |
_processRow | No description | Code |
_processSequence | No description | Code |
_rawSequences | No description | Code |
_rc | No description | Code |
_setCodingExonFlag | No description | Code |
_snpSequences | No description | Code |
_transcript_exonIntronFlankSequences | No description | Code |
_translate | No description | Code |
_translate_ambiguous_codon | No description | Code |
_unambiquous_codons | No description | Code |
_utrSequences | No description | Code |
set_seq_edits | No description | Code |
toString | Description | Code |
Methods description
Usage :
Description: Helper routine for Array comparison by converting
it to string first. Comparing arrays as "@A" eq "@B"
produces a flood of warnings
Returntype : String
Exceptions : none
Caller : caller |
Methods code
# Compile-time setup of the lookup data used by the translation routines:
# genetic code names, per-code amino acid strings, the codon <-> column maps
# and the IUPAC nucleotide ambiguity expansions.
BEGIN {
    # Names of the genetic codes. Empty strings are placeholders that keep
    # the remaining entries at their list positions.
    @NAMES = (
        'Standard',
        'Vertebrate Mitochondrial',
        'Yeast Mitochondrial',
        'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',
        'Invertebrate Mitochondrial',
        'Ciliate, Dasycladacean and Hexamita Nuclear',
        '',
        '',
        'Echinoderm Mitochondrial',
        'Euplotid Nuclear',
        '"Bacterial"',
        'Alternative Yeast Nuclear',
        'Ascidian Mitochondrial',
        'Flatworm Mitochondrial',
        'Blepharisma Nuclear',
        'Chlorophycean Mitochondrial',
        '',
        '',
        '',
        '',
        'Trematode Mitochondrial',
        'Scenedesmus obliquus Mitochondrial',
        'Thraustochytrium Mitochondrial',
    );

    # One 64-character amino acid string per genetic code; the translation
    # routines read $TABLES[id - 1] at the column stored in $CODONS.
    # The '' tokens are literal two-character placeholders, because qw()
    # does not interpret quotes.
    @TABLES = qw(
        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
        FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
        FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        '' ''
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
        FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
        FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
        FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        '' '' '' ''
        FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
        FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
        FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
    );

    # Enumerate the 64 codons in t, c, a, g order. A codon's ordinal is its
    # column in every @TABLES string; $TRCOL is the reverse map.
    ($CODONS, $TRCOL) = ({}, {});
    my @bases  = qw(t c a g);
    my $column = 0;
    for my $first (@bases) {
        for my $second (@bases) {
            for my $third (@bases) {
                my $triplet = $first . $second . $third;
                $CODONS->{$triplet} = $column;
                $TRCOL->{$column}   = $triplet;
                $column++;
            }
        }
    }

    # IUPAC nucleotide code => the concrete bases it may stand for.
    %IUPAC_DNA = (
        A => ['A'],
        C => ['C'],
        G => ['G'],
        T => ['T'],
        U => ['U'],
        M => ['A', 'C'],
        R => ['A', 'G'],
        W => ['A', 'T'],
        S => ['C', 'G'],
        Y => ['C', 'T'],
        K => ['G', 'T'],
        V => ['A', 'C', 'G'],
        H => ['A', 'C', 'T'],
        D => ['A', 'G', 'T'],
        B => ['C', 'G', 'T'],
        X => ['G', 'A', 'T', 'C'],
        N => ['G', 'A', 'T', 'C'],
    );
}
__processNewQuery | description | prev | next | Top |
# Prepares this dataset for a new query.
#
# Reads the requested sequence attribute from $query, selects the matching
# "recipe" method, resets all per-query state, and then collects the single
# importable FilterList and any flank ValueFilters from the query's filters.
#
# Throws BioMart::Exception::Configuration when the sequence name is not
# supported, when a filter is of an unexpected kind, when a filter is
# supplied twice, or when no importable is present.
sub __processNewQuery {
    my ($self, $query) = @_;

    my $attribute = $query->getAllAttributes($self->name)->[0];
    my $seq_name  = $attribute->name;
    # When the first attribute is 'pkey', the last attribute is used instead.
    if ($seq_name eq 'pkey') {
        $attribute = $query->getAllAttributes($self->name)->[-1];
        $seq_name  = $attribute->name;
    }

    $self->set('seq_name', $seq_name);
    # A failed match in list context returns an empty list, which would pass
    # no value at all to set(); force an explicit boolean instead.
    $self->set('translate', ($seq_name =~ m/peptide$/) ? 1 : 0);

    my $ignore = IGNORE;

    if ($seq_name =~ m/(coding|cdna|peptide)$/) {
        # this is not actually going to ignore anything, but is
        # simply used to determine translation table for each gene
        # without creating a second instance variable
        $self->set('ignore_row', "type");
        $self->set('recipe', '_codingCdnaPeptideSequences');
    }
    elsif ($seq_name =~ m/(transcript_exon_intron|transcript_flank|coding_transcript_flank)$/) {
        $self->set('recipe', '_transcript_exonIntronFlankSequences');
    }
    elsif ($seq_name =~ m/(gene_exon_intron|gene_flank|coding_gene_flank)$/) {
        $self->set('recipe', '_gene_exonIntronFlankSequences');
    }
    elsif ($seq_name =~ m/raw/) {
        $self->set('recipe', '_rawSequences');
    }
    elsif ($seq_name =~ m/(gene_exon|transcript_exon|transcript_intron)$/) {
        # set the system to ignore rows with duplicate pkeys
        # for gene_exon
        $self->set('ignore', $ignore->{$1}); #undef for transcript_exon
        $self->set('ignore_row', "pkey");
        $self->set('recipe', '_exonSequences');
    }
    elsif ($seq_name =~ m/utr$/ or $seq_name =~ m/intergenic$/) {
        $self->set('recipe', '_utrSequences');
    }
    elsif ($seq_name =~ m/snp$/) {
        $self->set('recipe', '_snpSequences');
    }
    else {
        BioMart::Exception::Configuration->throw("Unsupported sequence name $seq_name recieved by GenomicSequence\n");
    }

    # Reset everything the recipes accumulate between rows and batches.
    $self->set('downstream_flank', 0);
    $self->set('upstream_flank', 0);
    $self->set('importable', undef);
    $self->set('lastPkey', undef);
    $self->set('importable_indices', undef);
    $self->set('returnRow_indices', undef);
    $self->set('locations', {});
    $self->set('outRow', undef);
    $self->set('calc_location', undef);
    $self->set('sequence', undef);
    $self->set('rowsFromLastBatch', undef);
    $self->set('seqStorageHash', undef);
    $self->set('onHoldSeqsKeys', undef);
    $self->set('exon_idHash', undef);

    my $filters = $query->getAllFilters($self->name);
    foreach my $filt (@{$filters}) {
        if ($filt->isa("BioMart::Configuration::FilterList")) {
            # A FilterList with a linkName is the importable; only one is allowed.
            if ($filt->linkName) {
                if ($self->get('importable')) {
                    BioMart::Exception::Configuration->throw("Recieved two importables, can only work with one\n");
                }
                else {
                    $self->set('importable', $filt);
                }
            }
            else {
                BioMart::Exception::Configuration->throw("Recieved invalid linkName ".
                    $filt->linkName."\n");
            }
        }
        else {
            unless ($filt->isa("BioMart::Configuration::ValueFilter")) {
                BioMart::Exception::Configuration->throw("Recieved unknown filter ".$filt->name." in GenomicSequence Dataset!\n");
            }
            # The filter's name doubles as the attribute that stores its value
            # (the flank attributes were reset to 0 above).
            if ($self->get($filt->name)) {
                BioMart::Exception::Configuration->throw("Recieved two ".$filt->name." flanking filters in GenomicSequence Dataset\n");
            }
            my $table = $filt->getTable;
            my $row   = $table->nextRow;
            my $value = $row->[0];
            if ($value) {
                $self->set($filt->name, $value);
            }
        }
    }

    unless ($self->get('importable')) {
        BioMart::Exception::Configuration->throw("No Importable Recieved in GenomicSequence\n");
    }
}
# Appends $sequence as the last column of $outRow, adds the row to $atable
# and bumps the batch counter.
sub _addRow {
    my ($self, $atable, $outRow, $sequence) = @_;

    push @{$outRow}, $sequence;    # an undef $outRow becomes a one-column row
    $atable->addRow($outRow);

    $self->_incrementBatch;
}
# Widens the stored 'calc_location' so that it spans $this_location as well.
# The first location seen is copied wholesale; later ones only extend the
# start/end bounds. Locations without a true start or end are ignored.
sub _calcSeqOverLocations {
    my ($self, $this_location) = @_;

    return unless $this_location->{start};
    return unless $this_location->{end};

    my $span = $self->get('calc_location');
    if (!$span) {
        # First location: take a shallow copy of every field.
        $span = { %{$this_location} };
    }
    else {
        if ($this_location->{"start"} < $span->{"start"}) {
            $span->{"start"} = $this_location->{"start"};
        }
        if ($this_location->{"end"} > $span->{"end"}) {
            $span->{"end"} = $this_location->{"end"};
        }
    }

    $self->set('calc_location', $span);
}
# Recipe for coding / cdna / peptide sequences.
#
# Exon locations are collected per pkey, keyed by rank. When the pkey changes
# (or $curRow is undef, which is the final flush) the collected locations are
# turned into one sequence, optionally translated and edited, and added to
# $atable; "Sequence unavailable" is added when nothing could be built.
sub _codingCdnaPeptideSequences {
    my ($self, $atable, $curRow) = @_;

    my $importable_indices = $self->get('importable_indices');
    my $pkey = $curRow ?
        ($curRow->[$importable_indices->{"pkey"}] || 'DUMMY') : undef;
    my $lastPkey  = $self->get('lastPkey') || $pkey;
    my $locations = $self->get('locations');
    my $outRow    = $self->get('outRow');

    if ( (!defined $pkey) or ($pkey ne $lastPkey) ) {
        my @ranks = sort { $a <=> $b } keys %{$locations};
        my ($firstExon, $lastExon) = @ranks ? ($ranks[0], $ranks[-1]) : ();

        # Flanks are added to the outermost exons only; skipped when no exon
        # was collected, so we never look up an undefined rank.
        if (defined $firstExon && $self->get('upstream_flank')) {
            my $location = $self->_modFlanks($locations->{$firstExon}, 2, "up");
            $locations->{$firstExon} = $location if ($location->{"start"});
        }
        if (defined $lastExon && $self->get('downstream_flank')) {
            my $location = $self->_modFlanks($locations->{$lastExon}, 2, "down");
            $locations->{$lastExon} = $location if ($location->{"start"});
        }

        my $sequence;
        if ( grep { $locations->{$_}->{"start"} } keys %$locations ) {
            $sequence = $self->_processSequence($locations);
            # _processSequence returns undef when no sequence was fetched and
            # _translate throws on undef; skip translation in that case so the
            # row falls through to "Sequence unavailable".
            $sequence = $self->_translate($sequence)
                if ($self->get('translate') && defined $sequence);
            $self->_editSequence(\$sequence);
        }

        if ($sequence) {
            $self->_addRow($atable, $outRow, $sequence);
        }
        else {
            $self->_addRow($atable, $outRow, "Sequence unavailable");
        }

        $locations = {};
        $outRow    = undef;
    }

    if ($curRow) {
        my $rank     = $curRow->[ $importable_indices->{"rank"} ];
        my $location = $self->_getLocationFrom($curRow, "chr", "start", "end",
            "strand", "phase", "codon_table_id", "seq_edits");
        $self->set('codon_table_id', $location->{"codon_table_id"});
        $self->set_seq_edits($location->{"seq_edits"});
        $locations->{$rank} = $location if ($location->{"start"} && $rank);
    }

    $outRow ||= $self->_initializeReturnRow($curRow);
    $self->set('locations', $locations);
    $self->set('lastPkey', $pkey);
    $self->set('outRow', $outRow);
}
# Decides whether another row of $rtable should be processed in this batch.
# Returns false once the table has nothing more to give, or, when $batchSize
# is true, once 'batchIndex' has reached $batchSize.
sub _continueWithBatch {
    my ($self, $batchSize, $rtable) = @_;

    my $more;
    if ($rtable->isa("BioMart::ResultTable")) {
        $more = $rtable->inCurrentBatch();
    }
    else {
        $more = $rtable->hasMoreRows;
    }

    # No rows left, or no batch limit requested: the table's answer stands.
    return $more unless ($more && $batchSize);

    return ($self->get('batchIndex') < $batchSize);
}
# Applies the stored 'seq_edits' to the sequence behind $seqref, in place,
# and then clears 'seq_edits'. Edits are ';'-separated "start,end,replacement"
# triples with 1-based inclusive coordinates; they are applied in order.
sub _editSequence {
    my ($self, $seqref) = @_;

    my $pending = $self->get('seq_edits');
    if ($$seqref && $pending) {
        for my $edit (split /\;/, $pending) {
            my ($from, $to, $replacement) = split /\,/, $edit;
            substr($$seqref, $from - 1, $to - $from + 1) = $replacement;
        }
    }

    $self->set('seq_edits', "");
}
# Recipe for per-exon sequences: every accepted row yields one output row,
# holding either the (flank-extended, edited) sequence or the text
# "Sequence unavailable". When an 'ignore' hash is active, the row's
# 'ignore_row' value is recorded in it afterwards.
sub _exonSequences {
    my ($self, $atable, $curRow) = @_;

    return unless $curRow;
    return if ($self->_ignoreRow($curRow));

    my $exonLocation = $self->_modFlanks(
        $self->_getLocationFrom($curRow, "chr", "start", "end", "strand"), 0 );
    my $locations = { 1 => $exonLocation };

    my $sequence;
    if ($exonLocation->{"start"}) {
        $sequence = $self->_processSequence($locations);
        $self->_editSequence(\$sequence);
    }

    $self->_addRow($atable, $self->_initializeReturnRow($curRow),
        $sequence ? $sequence : "Sequence unavailable");

    if (my $ignore = $self->get('ignore')) {
        my $ignore_row = $self->get('ignore_row');
        my $seen       = $self->_getLocationFrom($curRow, $ignore_row);
        $ignore->{ $seen->{$ignore_row} } = 1;
        $self->set('ignore', $ignore);
    }
}
# Recipe for gene-level exon+intron / flank sequences.
#
# Rows are merged per pkey into a single covering location ('calc_location').
# When the pkey changes, or $curRow is undef (final flush), the sequence for
# the previous pkey is built. If that pkey is tracked in 'seqStorageHash',
# the row is only emitted once all of its transcripts have been seen (or
# lastDS() == 2); otherwise the sequence and row are parked in the hash.
sub _gene_exonIntronFlankSequences {
    my ($self, $atable, $curRow) = @_;

    my $rank = 1;
    my $importable_indices = $self->get('importable_indices');
    my $pkey = $curRow ?
        ($curRow->[$importable_indices->{"pkey"}] || 'DUMMY') : undef;
    my $lastPkey = $self->get('lastPkey') || $pkey;
    my $outRow   = $self->get('outRow');

    if ( (!defined $pkey) or ($pkey ne $lastPkey) ) {
        my $shift    = ($self->get('seq_name') =~ m/flank/);
        my $location = $self->_modFlanks( $self->get('calc_location'), $shift );
        $self->set('calc_location', undef);

        my $sequence;
        if ($location->{"start"}) {
            my $locations = { $rank => $location };
            $sequence = $self->_processSequence($locations);
            $self->_editSequence(\$sequence);
        }

        my $storageHash = $self->get('seqStorageHash');
        if (exists $storageHash->{$lastPkey}) {
            my $pending = $storageHash->{$lastPkey};
            if (($pending->{'transcriptCount'} >= $pending->{'totalTranscripts'})
                || $self->lastDS() == 2)
            {
                # A previously parked sequence takes precedence when present.
                $sequence = $pending->{'seq'} if ($pending->{'seq'});
                if ($sequence) {
                    $self->_addRow($atable, $outRow, $sequence);
                }
                else {
                    $self->_addRow($atable, $outRow, "Sequence unavailable");
                }
                delete $storageHash->{$lastPkey};
            }
            else {
                # Not all transcripts seen yet: park the result for later.
                $pending->{'seq'}    = $sequence;
                $pending->{'outRow'} = $outRow;
                $self->set('seqStorageHash', $storageHash);
                $self->_incrementBatch;
            }
        }
        else {
            if ($sequence) {
                $self->_addRow($atable, $outRow, $sequence);
            }
            else {
                $self->_addRow($atable, $outRow, "Sequence unavailable");
            }
        }
        $outRow = undef;
    }

    if ($curRow) {
        my $location = $self->_getLocationFrom($curRow, "pkey", "transcript_pkey",
            "chr", "start", "end", "strand", "transcript_count");
        $self->_calcSeqOverLocations($location);

        if ($location->{'transcript_pkey'}) {
            my $storageHash    = $self->get('seqStorageHash');
            my $prkey          = $location->{'pkey'};
            my $transcript_key = $location->{'transcript_pkey'};

            $storageHash->{$prkey}->{'totalTranscripts'} =
                $location->{'transcript_count'} || 1;

            # 'transcriptKeys' is a list of keys, each followed by a comma.
            # Look for the exact ",key," token: a plain pattern match would
            # treat key 12 as already seen once 123 is recorded, and would
            # interpret regex metacharacters in the key.
            my $seenKeys = exists $storageHash->{$prkey}->{'transcriptKeys'}
                ? $storageHash->{$prkey}->{'transcriptKeys'}
                : '';
            if (index(',' . $seenKeys, ',' . $transcript_key . ',') < 0) {
                $storageHash->{$prkey}->{'transcriptKeys'} = $seenKeys . $transcript_key . ',';
                $storageHash->{$prkey}->{'transcriptCount'}++;
            }
            $self->set('seqStorageHash', $storageHash);
        }
    }

    $outRow ||= $self->_initializeReturnRow($curRow);
    $self->set('lastPkey', $pkey);
    $self->set('outRow', $outRow);
}
# Delegates to the configurator, asking it for the configuration tree of
# this dataset (identified by virtual schema and name) for $interface.
sub _getConfigurationTree {
    my ($self, $interface, $dsCounter) = @_;

    my $configurator = $self->getParam('configurator');
    return $configurator->getConfigurationTree(
        $self->virtualSchema, $self->name, $interface, $dsCounter
    );
}
# Builds a location hash from $curRow holding the requested @expectedFields
# (undef for fields the importable does not provide).
#
# When the importable carries 'coding_start_offset', the start/end of the
# location are recomputed from the exon coordinates and coding offsets
# according to the current 'seq_name' (5utr, 3utr, coding*/peptide). Both are
# set to undef when the exon contributes nothing or the start/end exon ids
# are missing.
#
# Returns the location hash reference.
sub _getLocationFrom {
    my ($self, $curRow, @expectedFields) = @_;

    my $importable_indices = $self->get('importable_indices');
    my $location = {};
    foreach my $expectedField (@expectedFields) {
        $location->{$expectedField} = ( exists( $importable_indices->{$expectedField} ) ) ?
            $curRow->[ $importable_indices->{$expectedField} ] : undef;
    }

    # Without coding offsets the raw fields are all there is to report.
    return $location unless exists $importable_indices->{'coding_start_offset'};

    my $pkey       = $curRow->[$importable_indices->{'pkey'}];
    my $rank       = $curRow->[$importable_indices->{'exon_id'}];
    my $start_rank = $curRow->[$importable_indices->{'start_exon_id'}];
    my $end_rank   = $curRow->[$importable_indices->{'end_exon_id'}];
    my $exon_start = $curRow->[$importable_indices->{'start'}];
    my $exon_end   = $curRow->[$importable_indices->{'end'}];
    my $strand     = $curRow->[$importable_indices->{'strand'}];
    my $coding_start_offset = $curRow->[$importable_indices->{'coding_start_offset'}];
    my $coding_end_offset   = $curRow->[$importable_indices->{'coding_end_offset'}];
    my ($new_start, $new_end);

    # Flags recorded by _setCodingExonFlag, keyed by pkey . exon id.
    my $exon_idHash = $self->get('exon_idHash');
    my $seq_name    = $self->get('seq_name');

    if ($start_rank && $end_rank) {
        if ($seq_name eq '5utr') {
            if ($rank == $start_rank) {
                if ($strand == 1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_start + $coding_start_offset - 2;
                }
                if ($strand == -1) {
                    $new_start = $exon_end - $coding_start_offset + 2;
                    $new_end   = $exon_end;
                }
                $self->_setCodingExonFlag($pkey.$start_rank, 1);
            }
            else {
                # Once the coding start exon has been flagged, later exons
                # contribute nothing; before that, the whole exon is used.
                if ($exon_idHash->{$pkey.$start_rank} && $exon_idHash->{$pkey.$start_rank} == 1) {
                    $new_start = undef;
                    $new_end   = undef;
                }
                else {
                    $new_start = $exon_start;
                    $new_end   = $exon_end;
                }
            }
        }
        elsif ($seq_name eq '3utr') {
            if ($rank == $end_rank) {
                if ($strand == 1) {
                    $new_start = $exon_start + $coding_end_offset;
                    $new_end   = $exon_end;
                }
                if ($strand == -1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_end - $coding_end_offset;
                }
                $self->_setCodingExonFlag($pkey.$end_rank, 1);
            }
            else {
                # Only exons seen after the coding end exon was flagged count.
                if ($exon_idHash->{$pkey.$end_rank} && $exon_idHash->{$pkey.$end_rank} == 1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_end;
                }
                else {
                    $new_start = undef;
                    $new_end   = undef;
                }
            }
        }

        if ($seq_name =~ m/^coding.*?|peptide/) {
            # this includes sequence types : coding_gene_flank, coding_transcript_flank, coding, peptide
            if ($rank == $start_rank && $rank == $end_rank) {
                # coding start and finish on the same exon
                if ($strand == 1) {
                    $new_start = $exon_start + $coding_start_offset - 1;
                    $new_end   = $exon_start + $coding_end_offset - 1;
                }
                if ($strand == -1) {
                    $new_start = $exon_end - $coding_end_offset + 1;
                    $new_end   = $exon_end - $coding_start_offset + 1;
                }
            }
            elsif ($rank == $start_rank) {
                if ($strand == 1) {
                    $new_start = $exon_start + $coding_start_offset - 1;
                    $new_end   = $exon_end;
                }
                if ($strand == -1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_end - $coding_start_offset + 1;
                }
                $self->_setCodingExonFlag($pkey.$start_rank, 1);
            }
            elsif ($rank == $end_rank) {
                if ($strand == 1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_start + $coding_end_offset - 1;
                }
                if ($strand == -1) {
                    $new_start = $exon_end - $coding_end_offset + 1;
                    $new_end   = $exon_end;
                }
                # The coding region ends here: clear the flag set at its start.
                $self->_setCodingExonFlag($pkey.$start_rank, 0);
            }
            else {
                # Exons between the flagged start and the end are used whole.
                if ($exon_idHash->{$pkey.$start_rank} && $exon_idHash->{$pkey.$start_rank} == 1) {
                    $new_start = $exon_start;
                    $new_end   = $exon_end;
                }
                else {
                    $new_start = undef;
                    $new_end   = undef;
                }
            }
        }

        # An inverted range means this exon contributes nothing.
        if ($new_start && $new_end && ($new_start > $new_end)) {
            $new_start = undef;
            $new_end   = undef;
        }
        $location->{'start'} = $new_start;
        $location->{'end'}   = $new_end;
    }
    else {
        $location->{'start'} = undef;
        $location->{'end'}   = undef;
    }

    return $location;
}
# Fills $param{'table'} with sequence rows for $param{'query'} and returns it.
#
# For a "web" server type the query XML is posted to the configured location
# and the tab-separated response lines are added as rows. Otherwise the rows
# of the importable's table are fed, de-duplicated, through the current
# recipe until the batch is full. When the importable table had no rows at
# the start of the call, the dataset is marked exhausted, the recipe is
# flushed, and any sequences still parked in 'seqStorageHash' are emitted.
sub _getResultTable {
    my ($self, @param) = @_;

    $self->set('batchIndex', 0);
    local($^W) = 0;    # warnings are switched off for the duration of the call
    my (%param) = @param;

    my $query      = $param{'query'};
    my $atable     = $param{'table'};
    my $batch_size = $param{'batch_size'};

    if ($self->serverType eq "web") {
        my $batch_start = $param{'batch_start'} || 0;
        my $location    = $self->getParam('configurator')->get('location');
        my $xml         = $query->toXML($batch_start, $batch_size, 0);
        my $logger      = Log::Log4perl->get_logger(__PACKAGE__);
        $logger->info("QUERY XML: $xml");
        foreach my $el ($location->getResultSet("", "POST", $xml)) {
            # This marker line is what _processRow emits once exhausted.
            if ($el =~ /No Sequence Returned/) {
                $self->_setExhausted(1);
                last;
            }
            my @clean = split(/\t/, $el);
            $atable->addRow([@clean]);
        }
        return $atable;
    }
    else {
        $self->_initializeDNAAdaptor($query->
            getInterfaceForDataset($self->name));
    }

    my $importable = $self->get('importable');
    my $rtable     = $importable->getTable();
    my $attribute_count = @{$query->getAllAttributes};
    if ($rtable->hashedResults || $attribute_count > 1) {
        $self->set('attribute_merge_required', '1');
    }

    my $has_rows = $rtable->hasMoreRows;
    my %avoidDuplication = ();
    NEXTROW: while ($has_rows && $self->_continueWithBatch($batch_size, $rtable)) {
        my $curRow = $rtable->nextRow;
        # Skip empty rows and rows already seen in this call.
        my $rowAsString = $self->toString($curRow);
        next NEXTROW if (!$rowAsString || exists $avoidDuplication{$rowAsString});
        $avoidDuplication{$rowAsString} = '';
        $self->_processRow($atable, $curRow);
    }

    unless ($has_rows) {
        $self->_setExhausted(1);
        $self->_processRow($atable);    # no row: lets the recipe flush its state
        my $storageHash = $self->get('seqStorageHash');
        if ($storageHash) {
            # Iterate over the keys only. Iterating the hash itself also
            # yields the values, and looking those up autovivifies bogus
            # entries and emits rows that have no data.
            foreach my $fkey (keys %$storageHash) {
                my $sequence = $storageHash->{$fkey}->{'seq'};
                my $outRow   = $storageHash->{$fkey}->{'outRow'};
                $self->_addRow($atable, $outRow, $sequence);
            }
        }
    }

    $importable->setTable($rtable);
    $self->set('importable', $importable);
    $self->get('dna')->close;
    return $atable;
}
# Tells whether $curRow should be skipped: true when an 'ignore' hash is
# active and already contains the row's value for the 'ignore_row' field.
# Returns 0 when no 'ignore' hash is set.
sub _ignoreRow {
    my ($self, $curRow) = @_;

    my $seen = $self->get('ignore');
    return 0 unless ($seen);

    my $field = $self->get('ignore_row');
    my $value = $self->_getLocationFrom($curRow, $field)->{$field};
    return $value && $seen->{$value};
}
# Adds one to the 'batchIndex' counter.
sub _incrementBatch {
    my ($self) = @_;

    my $next_index = $self->get('batchIndex');
    $self->set('batchIndex', ++$next_index);
}
# Creates the DNAAdaptor for this dataset and stores it under 'dna'.
#
# The adaptor settings come from the configuration tree's optionalParameters,
# a comma-separated string read in the order: dna table name, chunk name
# field, chunk start field, sequence field, chunk size.
#
# Throws BioMart::Exception::Configuration when the parameters are missing or
# the adaptor cannot be created.
sub _initializeDNAAdaptor {
    my ($self, $interface) = @_;

    my $dna_params = $self->getConfigurationTree($interface)->
        optionalParameters;
    BioMart::Exception::Configuration->throw("GenomicSequence Dataset requires optional_parameters to be set in the DatasetConfig\n")
        unless ($dna_params);

    my ($dnatablename, $chunk_name_fieldname, $chunk_start_fieldname,
        $seqfieldname, $chunk_size) = split /\,/, $dna_params;

    my $adaptor = BioMart::Dataset::GenomicSequence::DNAAdaptor->new(
        'seq_name'              => $self->name,
        'dna_tablename'         => $dnatablename,
        'seq_fieldname'         => $seqfieldname,
        'chunk_name_fieldname'  => $chunk_name_fieldname,
        'chunk_start_fieldname' => $chunk_start_fieldname,
        'chunk_size'            => $chunk_size,
        'configurator'          => $self->getParam('configurator'),
    );
    BioMart::Exception::Configuration->throw("Couldnt connect to DNAAdaptor\n")
        unless ($adaptor);

    $self->set('dna', $adaptor);
}
# Works out which row columns belong to which importable filter.
#
# The importable's filters are mapped, in order, to columns 0..n-1 and stored
# by filter name in 'importable_indices'. The remaining columns, up to
# $numFields, are mapped to 0-based output positions in 'returnRow_indices'.
sub _initializeIndices {
    my ($self, $numFields) = @_;

    my (%importable_at, %return_at);

    my $column = 0;
    for my $filt (@{ $self->get('importable')->getAllFilters }) {
        $importable_at{ $filt->name } = $column;
        $column++;
    }

    for (my $slot = 0; $column < $numFields; $column++, $slot++) {
        $return_at{$column} = $slot;
    }

    $self->set('importable_indices', \%importable_at);
    $self->set('returnRow_indices', \%return_at);
}
# Returns the row that the sequence column is appended to: $curRow itself
# when 'attribute_merge_required' is set, otherwise a new empty array.
sub _initializeReturnRow {
    my ($self, $curRow) = @_;

    if ($self->get('attribute_merge_required')) {
        return $curRow;
    }
    return [];
}
# Adjusts the start/end of $location, in place, according to the requested
# flanks and returns it. Locations without a true start or end are returned
# untouched.
#
# $shift selects the mode:
#   1     - replace the location by the flank region next to it (upstream or
#           downstream, never both);
#   2     - extend only the side named by $flank ("up" or "down");
#   other - extend both sides by their respective flank sizes.
# A resulting start below 1 is clamped to 1.
#
# Throws BioMart::Exception::Usage in mode 1 when both or neither flank is
# set.
sub _modFlanks {
    my ($self, $location, $shift, $flank) = @_;

    return $location unless $location->{start};
    return $location unless $location->{end};

    my $upstream   = $self->get('upstream_flank');
    my $downstream = $self->get('downstream_flank');
    my $reverse    = ($location->{"strand"} < 0);

    if ($shift == 1) {
        if ($upstream && $downstream) {
            BioMart::Exception::Usage->throw("For this sequence option choose upstream OR downstream gene flanking sequence, NOT both, as makes no sense to simply concatenate them together.\n");
        }

        if ($upstream) {
            if ($reverse) {
                $location->{"start"} = $location->{"end"} + 1;
                $location->{"end"}  += $upstream;
            }
            else {
                $location->{"end"}    = $location->{"start"} - 1;
                $location->{"start"} -= $upstream;
            }
        }
        elsif ($downstream) {
            if ($reverse) {
                $location->{"end"}    = $location->{"start"} - 1;
                $location->{"start"} -= $downstream;
            }
            else {
                $location->{"start"} = $location->{"end"} + 1;
                $location->{"end"}  += $downstream;
            }
        }
        else {
            BioMart::Exception::Usage->throw("Requests for flank sequence must be accompanied by an upstream_flank or downstream_flank request\n");
        }
    }
    elsif ($shift == 2) {
        if ($upstream && ($flank eq "up")) {
            if ($reverse) {
                $location->{"end"} += $upstream;
            }
            else {
                $location->{"start"} -= $upstream;
            }
        }
        elsif ($downstream && ($flank eq "down")) {
            if ($reverse) {
                $location->{"start"} -= $downstream;
            }
            else {
                $location->{"end"} += $downstream;
            }
        }
    }
    else {
        if ($reverse) {
            $location->{"start"} -= $downstream;
            $location->{"end"}   += $upstream;
        }
        else {
            $location->{"start"} -= $upstream;
            $location->{"end"}   += $downstream;
        }
    }

    $location->{"start"} = 1 if ($location->{"start"} < 1);
    return $location;
}
# Constructor hook: lets the parent class initialise itself, then declares
# every attribute this dataset uses together with its default value.
sub _new {
    my ($self, @param) = @_;

    $self->SUPER::_new(@param);

    # [ attribute name, default ] pairs, registered in this order.
    my @defaults = (
        [ 'dna',                      undef ],
        [ 'dnaparams',                undef ],
        [ 'recipe',                   undef ],
        [ 'ignore',                   undef ],
        [ 'ignore_row',               undef ],
        [ 'seq_edits',                undef ],
        [ 'codon_table_id',           undef ],
        [ 'seq_name',                 undef ],
        [ 'translate',                0 ],
        [ 'downstream_flank',         0 ],
        [ 'upstream_flank',           0 ],
        [ 'importable',               undef ],
        [ 'lastPkey',                 undef ],
        [ 'importable_indices',       undef ],
        [ 'returnRow_indices',        undef ],
        [ 'returnRow',                undef ],
        [ 'batchIndex',               0 ],
        [ 'locations',                {} ],
        [ 'outRow',                   undef ],
        [ 'calc_location',            undef ],
        [ 'sequence',                 undef ],
        [ 'attribute_merge_required', 0 ],
        [ 'rowsFromLastBatch',        undef ],
        [ 'seqStorageHash',           undef ],
        [ 'onHoldSeqsKeys',           undef ],
    );
    foreach my $default (@defaults) {
        $self->attr( @{$default} );
    }

    # Kept as the final statement so the sub's implicit return value is
    # still the result of this call.
    $self->attr('exon_idHash', undef);
}
# Hands one importable row (or undef, to flush) to the current recipe.
#
# On the first call of a query, while 'importable_indices' is still unset,
# the column indices are initialised from the row's width; if the dataset is
# already exhausted at that point, a "No Sequence Returned" row is added
# instead. The recipe method is invoked in either case.
sub _processRow {
    my ($self, $atable, $curRow) = @_;

    if (!$self->get('importable_indices')) {
        if ($self->get('exhausted')) {
            $atable->addRow(["No Sequence Returned"]);
        }
        else {
            $self->_initializeIndices(scalar @{$curRow});
        }
    }

    my $recipe = $self->get('recipe');
    $self->$recipe($atable, $curRow);
}
# Fetches and concatenates the sequence for every location in $locations, in
# ascending rank order. Locations whose strand is negative are passed through
# _rc; a missing strand counts as 1.
#
# The first location that actually yields sequence is prefixed with one 'N'
# per unit of its phase; locations before it that yield nothing are skipped.
#
# Returns the assembled string, or undef when it is empty.
sub _processSequence {
    my ($self, $locations) = @_;

    my $dna        = $self->get('dna');
    my $assembled  = '';
    my $seen_first = 0;

    for my $rank (sort { $a <=> $b } keys %{$locations}) {
        my $loc    = $locations->{$rank};
        my $chr    = $loc->{'chr'};
        my $start  = $loc->{'start'};
        my $end    = $loc->{'end'};
        my $strand = exists($loc->{'strand'}) ? $loc->{'strand'} : 1;
        my $phase  = $loc->{'phase'} || 0;

        my $piece = ($strand < 0)
            ? $self->_rc( $dna->getSequence($chr, $start, $end) )
            : $dna->getSequence($chr, $start, $end);

        if ($seen_first) {
            $assembled .= $piece;
            next;
        }

        # Still waiting for the first location that yields sequence.
        next unless $piece;
        $assembled = 'N' x $phase if ($phase > 0);
        $assembled .= $piece;
        $seen_first = 1;
    }

    return $assembled if (length($assembled));
    return undef;
}
# Recipe for raw sequences: each row with a start coordinate is fetched as a
# single location (strand defaults to 1 when the importable has no strand
# filter). A row is added only when a sequence was obtained.
sub _rawSequences {
    my ($self, $atable, $curRow) = @_;

    if ($curRow) {
        my $indices  = $self->get('importable_indices');
        my $location = $self->_getLocationFrom($curRow, "chr", "start", "end");
        $location->{"strand"} = exists($indices->{"strand"})
            ? $curRow->[ $indices->{"strand"} ]
            : 1;

        my %locations;
        $locations{1} = $location if ($location->{"start"});

        my $sequence = $self->_processSequence(\%locations);
        $self->_editSequence(\$sequence);

        if ($sequence) {
            $self->_addRow($atable, $self->_initializeReturnRow($curRow),
                $sequence);
        }
    }
}
# Returns the reverse complement of $seq. IUPAC ambiguity codes are
# complemented as well, case is preserved, and U is complemented to A.
sub _rc {
    my ($self, $seq) = @_;

    (my $revcomp = reverse($seq))
        =~ tr/YABCDGHKMRSTUVyabcdghkmrstuv/RTVGHCDMKYSAABrtvghcdmkysaab/;

    return $revcomp;
}
# Records $val under $key in the 'exon_idHash' attribute, creating the hash
# on first use.
sub _setCodingExonFlag {
    my ($self, $key, $val) = @_;

    my $flags = $self->get('exon_idHash');
    $flags->{$key} = $val;    # autovivifies the hash when none is stored yet

    $self->set('exon_idHash', $flags);
}
# Recipe for SNP sequences: fetches the flanks around the 'pos' coordinate
# and replaces the single base at the SNP offset with "%<allele>%". A row is
# added only when a sequence was obtained.
sub _snpSequences {
    my ($self, $atable, $curRow) = @_;

    if ($curRow) {
        my $location = $self->_getLocationFrom($curRow, "chr", "pos",
            "strand", "allele");
        my $upstream   = $self->get('upstream_flank');
        my $downstream = $self->get('downstream_flank');
        my $pos        = $location->{"pos"};

        if ($location->{"strand"} < 0) {
            my $start = $pos - $downstream;
            $start = 1 if ($start < 1);
            $location->{"start"} = $start;
            $location->{"end"}   = $pos + $upstream;
            $location->{"off"}   = $upstream;
        }
        else {
            my $start  = $pos - $upstream;
            my $offset = $upstream;
            if ($start < 1) {
                # Clamped at the sequence start: the SNP sits closer to the
                # beginning of the fetched string than the full flank.
                $offset = $upstream + $start - 1;
                $start  = 1;
            }
            $location->{"start"} = $start;
            $location->{"end"}   = $pos + $downstream;
            $location->{"off"}   = $offset;
        }

        my $sequence = $self->_processSequence({ 1 => $location });
        $self->_editSequence(\$sequence);

        if ($sequence) {
            substr($sequence, $location->{"off"}, 1) = "%".$location->{"allele"}."%";
            $self->_addRow($atable, $self->_initializeReturnRow($curRow),
                $sequence);
        }
    }
}
_transcript_exonIntronFlankSequences | description | prev | next | Top |
# Recipe for transcript-level exon+intron / flank sequences.
#
# Rows are merged per pkey into one covering location ('calc_location').
# When the pkey changes, or $curRow is undef (final flush), that location is
# flank-adjusted, fetched and emitted as one row; "Sequence unavailable" is
# emitted when nothing could be built.
sub _transcript_exonIntronFlankSequences {
    my ($self, $atable, $curRow) = @_;

    my $importable_indices = $self->get('importable_indices');
    my $pkey = $curRow
        ? ($curRow->[$importable_indices->{"pkey"}] || 'DUMMY')
        : undef;
    my $lastPkey = $self->get('lastPkey') || $pkey;
    my $outRow   = $self->get('outRow');

    if ( (!defined $pkey) or ($pkey ne $lastPkey) ) {
        # Flank requests replace the region; everything else extends it.
        my $isFlank  = ($self->get('seq_name') =~ m/flank/);
        my $covering = $self->_modFlanks( $self->get('calc_location'),
            $isFlank );
        $self->set('calc_location', undef);

        my $sequence;
        if ($covering->{"start"}) {
            $sequence = $self->_processSequence({ 1 => $covering });
            $self->_editSequence(\$sequence);
        }

        $self->_addRow($atable, $outRow,
            $sequence ? $sequence : "Sequence unavailable");
        $outRow = undef;
    }

    if ($curRow) {
        $self->_calcSeqOverLocations(
            $self->_getLocationFrom($curRow, "chr", "start", "end", "strand")
        );
    }

    $outRow ||= $self->_initializeReturnRow($curRow);
    $self->set('lastPkey', $pkey);
    $self->set('outRow', $outRow);
}
# Translates the nucleotide string $seq into a peptide string using the
# genetic code selected by 'codon_table_id' (or DEFAULTCODONTABLEID).
#
# The sequence is lower-cased and u is read as t. Complete codons that are
# not plain a/c/g/t triplets are resolved through _translate_ambiguous_codon.
# When two bases are left over at the end they are padded with 'n' and
# translated as a partial codon.
#
# Returns '' for an empty sequence and throws
# BioMart::Exception::Configuration when $seq is undefined.
sub _translate {
    my ($self, $seq) = @_;

    unless (defined $seq) {
        BioMart::Exception::Configuration->throw("Calling translate without a seq argument!");
    }
    return '' unless $seq;

    my $id       = $self->get('codon_table_id') || DEFAULTCODONTABLEID;
    my $dangling = (length($seq) % 3 == 2);

    $seq = lc $seq;
    $seq =~ tr/u/t/;

    # Only sequences containing something other than a/c/g/t can have
    # triplets that are missing from $CODONS.
    my $has_ambiguity = ($seq =~ /[^actg]/);

    my $protein = "";
    for (my $i = 0; $i < (length($seq) - 2); $i += 3) {
        my $triplet = substr($seq, $i, 3);
        if (exists $CODONS->{$triplet}) {
            $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
        }
        elsif ($has_ambiguity) {
            $protein .= $self->_translate_ambiguous_codon($triplet);
        }
        else {
            $protein .= 'X';
        }
    }

    if ($dangling) {
        # The last two bases plus 'n' form the final, partial codon.
        my $triplet = substr($seq, -2) . "n";
        if (exists $CODONS->{$triplet}) {
            $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
        }
        else {
            $protein .= $self->_translate_ambiguous_codon($triplet, 2);
        }
    }

    return $protein;
}
# Translates a codon containing IUPAC ambiguity codes.
#
# Every concrete codon the triplet can stand for is translated. If they all
# give the same amino acid, that one is returned; D/N gives 'B' and E/Q gives
# 'Z'. Anything else yields 'X', or '' when $partial is true.
sub _translate_ambiguous_codon {
    my ($self, $triplet, $partial) = @_;

    $partial ||= 0;
    my $id = $self->get('codon_table_id') || DEFAULTCODONTABLEID;

    my %aas = ();
    for my $codon (_unambiquous_codons($triplet)) {
        $aas{ substr($TABLES[$id-1], $CODONS->{$codon}, 1) } = 1;
    }

    my $unresolved = $partial ? '' : 'X';
    my $count      = scalar keys %aas;

    my $aa;
    if ($count == 1) {
        ($aa) = keys %aas;
    }
    elsif ($count == 2 and $aas{'D'} and $aas{'N'}) {
        $aa = 'B';
    }
    elsif ($count == 2 and $aas{'E'} and $aas{'Q'}) {
        $aa = 'Z';
    }
    else {
        $aa = $unresolved;
    }

    return $aa;
}
# Expands a triplet of IUPAC codes into the list of lower-case concrete
# codons it can represent, using %IUPAC_DNA for each of the first three
# characters. Plain function, not a method.
sub _unambiquous_codons {
    my ($value) = @_;

    my ($first, $second, $third) =
        map { $IUPAC_DNA{uc $_} } split(//, $value);

    my @codons = ();
    for my $x (@{$first}) {
        for my $y (@{$second}) {
            for my $z (@{$third}) {
                push @codons, lc($x . $y . $z);
            }
        }
    }

    return @codons;
}
# Recipe for UTR / intergenic sequences.
#
# Locations are collected per pkey, keyed by rank. When a row with a new pkey
# arrives, or $curRow is undef (final flush), the collected locations are
# emitted through _utrFlushTranscript.
sub _utrSequences {
    my ($self, $atable, $curRow) = @_;

    my $locations = $self->get('locations');
    my $outRow    = $self->get('outRow');

    if ($curRow) {
        my $importable_indices = $self->get('importable_indices');
        my $pkey     = $curRow->[ $importable_indices->{"pkey"} ];
        my $lastPkey = $self->get('lastPkey');

        if ( $lastPkey && ($pkey ne $lastPkey) ) {
            ($locations, $outRow) = $self->_utrFlushTranscript(
                $atable, $locations, $outRow, "Sequence unavailable");
        }

        unless ($outRow) {
            $outRow = $self->_initializeReturnRow($curRow);
        }

        my $rank     = $curRow->[ $importable_indices->{"rank"} ];
        my $location = $self->_getLocationFrom($curRow, "chr", "start",
            "end", "strand");
        if ($location->{"start"}) {
            $locations->{$rank} = $location;
            $self->_calcSeqOverLocations($location);
        }
        $self->set('lastPkey', $pkey);
    }
    else {
        ($locations, $outRow) = $self->_utrFlushTranscript(
            $atable, $locations, $outRow,
            "No UTR is annotated for this transcript");
    }

    $self->set('locations', $locations);
    $self->set('outRow', $outRow);
}

# Emits the row for the locations collected so far and returns the
# ($locations, $outRow) pair the caller should continue with.
#
# With no locations, $emptyMessage is emitted in place of a sequence. With
# locations, an upstream flank (or else a downstream flank) of exactly the
# requested length is derived from 'calc_location' and added before the
# lowest / after the highest rank. State is only reset when a sequence was
# produced; otherwise the inputs are returned as they are.
#
# Both callers previously carried their own copy of this logic, and the
# end-of-data copy fetched one base too few for a forward-strand upstream
# flank.
sub _utrFlushTranscript {
    my ($self, $atable, $locations, $outRow, $emptyMessage) = @_;

    unless (keys %{$locations}) {
        $self->_addRow($atable, $outRow, $emptyMessage);
        return ($locations, undef);
    }

    my @ranks = sort { $a <=> $b } keys %{$locations};
    my ($low_rank, $hi_rank) = ($ranks[0], $ranks[-1]);
    my $calc_location = $self->get('calc_location');
    my $reverse = ($locations->{$low_rank}->{"strand"} < 0);

    if (my $upstream = $self->get('upstream_flank')) {
        if ($reverse) {
            $calc_location->{"start"} = $calc_location->{"end"} + 1;
            $calc_location->{"end"}   = $calc_location->{"start"} + $upstream - 1;
        }
        else {
            # 'end' is derived from the old start before start is moved.
            $calc_location->{"end"}   = $calc_location->{"start"} - 1;
            $calc_location->{"start"} = $calc_location->{"start"} - $upstream;
        }
        $calc_location->{"start"} = 1 if ($calc_location->{"start"} < 1);
        $locations->{$low_rank - 1} = $calc_location;
    }
    elsif (my $downstream = $self->get('downstream_flank')) {
        if ($reverse) {
            $calc_location->{"end"}   = $calc_location->{"start"} - 1;
            $calc_location->{"start"} = $calc_location->{"end"} - $downstream + 1;
        }
        else {
            $calc_location->{"start"} = $calc_location->{"end"} + 1;
            $calc_location->{"end"}   = $calc_location->{"start"} + $downstream - 1;
        }
        # Same lower bound that _modFlanks applies to every start.
        $calc_location->{"start"} = 1 if ($calc_location->{"start"} < 1);
        $locations->{$hi_rank + 1} = $calc_location;
    }

    my $sequence = $self->_processSequence($locations);
    $self->_editSequence(\$sequence);
    if ($sequence) {
        $self->_addRow($atable, $outRow, $sequence);
        $self->set('calc_location', undef);
        return ({}, undef);
    }

    return ($locations, $outRow);
}
# Stores sequence edits for later use by _editSequence.
#
# A value that already contains commas replaces the stored edits outright.
# Otherwise every whitespace character is turned into a comma and the
# resulting edit(s) are appended, ';'-separated, to the stored edits unless
# an identical edit is already present. A false value clears the edits.
sub set_seq_edits {
    my ($self, $seq_edits) = @_;

    if ($seq_edits) {
        my $temp_str;
        if ($seq_edits =~ m/\,/) {
            $temp_str = $seq_edits;
        }
        else {
            $temp_str = $self->get('seq_edits');
            $seq_edits =~ s/\s/\,/g;
            if ($temp_str) {
                # Compare whole edits. A pattern match on the stored string
                # would treat "3,3,X" as present when only "13,3,X" is stored,
                # and would interpret characters such as '*' as regex syntax.
                my %known = map { $_ => 1 } split /\;/, $temp_str;
                my @fresh = grep { !$known{$_}++ } split /\;/, $seq_edits;
                $temp_str = join(';', $temp_str, @fresh) if (@fresh);
            }
            else {
                $temp_str = $seq_edits;
            }
        }
        $self->set('seq_edits', $temp_str);
    }
    else {
        $self->set('seq_edits', "");
    }
}
# Usage      : my $key = $self->toString($row);
# Description: Concatenates the true-valued elements of the array behind
#              $curRow into one string, so that rows can be compared as
#              strings. False elements (undef, '', 0) are left out and no
#              separator is inserted.
# Returntype : String, or undef when no element is true
# Exceptions : none
sub toString {
    my ($self, $curRow) = @_;

    my @kept;
    foreach my $field (@{$curRow}) {
        push @kept, $field if ($field);
    }

    return @kept ? join('', @kept) : undef;
}
1; } |
General documentation
AUTHOR - Arek Kasprzyk, Syed Haider, Darin London, Damian Smedley | Top |
The peptide translation algorithm is taken directly
from the CodonTable module that is part of the
BioPerl project. For more information about the
BioPerl project, visit:
http://www.bioperl.org