Bio::EnsEMBL::Analysis::RunnableDB
ImportArrays
Toolbar
Summary
Bio::EnsEMBL::Analysis::RunnableDB::ImportArrays;
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $importer =
Bio::EnsEMBL::Analysis::RunnableDB::ImportArrays->new(
);
$importer->fetch_input();
$importer->run();
$importer->output();
$importer->write_output(); #writes to DB and a big fasta file
Description
This object imports array fasta files of a common array format and collapses redundant
probes into unique records based on their probeset name and sequence identity. The
probes supplied are redundant - a single probe (characterised by a unique sequence) may
occur in different positions of different arrays (i.e. for AFFY the chip coord is the
probe name). Parsing of the fasta file is facilitated by dynamic configuration of regular
expressions and array parameters based on the array format and array being imported. To
enable multiple instances of ImportArrays to run at the same time using different
configurations, the correct config type is read from an ImportArrays.config file,
using the input_id as the key.
A non-redundant array format specific output file is written, using the probe dbIDs in
the fasta headers. This can then be cat'd with other non-redundant array format fasta files
for use in the mapping carried out by ProbeAlign and ProbeTranscriptAlign.
Note that probes are defined as redundant when they share the same
- sequence and
- probeset
Methods
ARRAY_FORMAT | No description | Code |
ARRAY_PARAMS | No description | Code |
DNADB | No description | Code |
IFIELDORDER | No description | Code |
IIDREGEXP | No description | Code |
INPUT_FORMAT | No description | Code |
NAMES_FILE | No description | Code |
NON_REDUNDANT_PROBE_SEQS | No description | Code |
OUTDB | No description | Code |
OUTPUT_DIR | No description | Code |
QUERYSEQS | No description | Code |
add_array_chip_to_existing_probe | No description | Code |
create_new_array_chip | No description | Code |
create_new_probe | No description | Code |
fetch_input | No description | Code |
get_ARRAY_PARAMS_by_array_name | No description | Code |
get_IFIELDORDER | No description | Code |
get_IIDREGEXP | No description | Code |
helper | No description | Code |
new | No description | Code |
outdb | No description | Code |
probes | No description | Code |
query_file | No description | Code |
read_and_check_config | No description | Code |
run | No description | Code |
run_FASTA | No description | Code |
write_output | No description | Code |
Methods description
None available.
Methods code
# Combined getter/setter for the '_ARRAY_FORMAT' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub ARRAY_FORMAT {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_ARRAY_FORMAT'} = $value;
    }

    return $self->{'_ARRAY_FORMAT'};
}
# Combined getter/setter for the '_CONFIG_ARRAY_PARAMS' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub ARRAY_PARAMS {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_ARRAY_PARAMS'} = $value;
    }

    return $self->{'_CONFIG_ARRAY_PARAMS'};
}
} |
# Combined getter/setter for the '_CONFIG_DNADB' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub DNADB {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_DNADB'} = $value;
    }

    return $self->{'_CONFIG_DNADB'};
}
1; } |
# Combined getter/setter for the '_CONFIG_IFIELDORDER' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub IFIELDORDER {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_IFIELDORDER'} = $value;
    }

    return $self->{'_CONFIG_IFIELDORDER'};
}
# Combined getter/setter for the '_CONFIG_IIDREGEXP' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub IIDREGEXP {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_IIDREGEXP'} = $value;
    }

    return $self->{'_CONFIG_IIDREGEXP'};
}
# Combined getter/setter for the '_CONFIG_INPUT_FORMAT' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub INPUT_FORMAT {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_INPUT_FORMAT'} = $value;
    }

    return $self->{'_CONFIG_INPUT_FORMAT'};
}
# Combined getter/setter for the '_NAMES_FILE' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub NAMES_FILE {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_NAMES_FILE'} = $value;
    }

    return $self->{'_NAMES_FILE'};
}
} |
# Combined getter/setter for the '_NON_REDUNDANT_PROBE_SEQS' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub NON_REDUNDANT_PROBE_SEQS {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_NON_REDUNDANT_PROBE_SEQS'} = $value;
    }

    return $self->{'_NON_REDUNDANT_PROBE_SEQS'};
}
# Combined getter/setter for the '_CONFIG_OUTDB' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub OUTDB {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_OUTDB'} = $value;
    }

    return $self->{'_CONFIG_OUTDB'};
}
# Combined getter/setter for the '_CONFIG_OUTPUT_DIR' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub OUTPUT_DIR {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_CONFIG_OUTPUT_DIR'} = $value;
    }

    return $self->{'_CONFIG_OUTPUT_DIR'};
}
} |
# Combined getter/setter for the '_QUERYSEQS' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the value currently stored (undef if never set).
sub QUERYSEQS {
    my $self  = shift;
    my $value = shift;

    if (defined $value) {
        $self->{'_QUERYSEQS'} = $value;
    }

    return $self->{'_QUERYSEQS'};
}
# Registers another array chip / probe name pair on a probe object that
# already exists.
#   Arg [1] : probe object; must provide add_array_chip_probename
#   Arg [2] : array chip object; must provide dbID and get_Array
#   Arg [3] : probeset (accepted but not used here)
#   Arg [4] : probe name to record for this array chip
#   Returns : whatever add_array_chip_probename returns
sub add_array_chip_to_existing_probe {
    my $self = shift;
    my ($probe, $array_chip, $probeset, $probename) = @_;

    return $probe->add_array_chip_probename(
        $array_chip->dbID,
        $probename,
        $array_chip->get_Array,
    );
}
} |
# Fetches or creates the Array and ArrayChip records for an array name and
# design ID, and remembers the chip under the array's name in '_array_names'.
#   Arg [1] : array name (must be true)
#   Arg [2] : ArrayChip design ID (must be true)
#   Returns : the ArrayChip object (fetched or newly stored)
#   Throws  : if either argument is missing, or if the ArrayChip already has
#             the 'IMPORTED' status
sub create_new_array_chip {
    my ($self, $array_name, $design_id) = @_;

    unless ($array_name && $design_id) {
        throw('Need to pass an Array name and an ArrayChip design ID');
    }

    my $array_adaptor = $self->outdb->get_ArrayAdaptor;
    my $achip_adaptor = $self->outdb->get_ArrayChipAdaptor;
    my $array_params  = $self->get_ARRAY_PARAMS_by_array_name($array_name);

    # Store a new Array from the configured parameters if none exists yet.
    my $array = $array_adaptor->fetch_by_name_vendor($array_name, $array_params->{'-vendor'});
    unless (defined $array) {
        $array = Bio::EnsEMBL::Funcgen::Array->new(%{$array_params});
        ($array) = @{ $self->outdb->get_ArrayAdaptor->store($array) };
    }

    my $array_chip = $achip_adaptor->fetch_by_array_design_ids($array->dbID, $design_id);

    if ($array_chip) {
        # An existing chip is either finished (refuse) or rolled back at
        # the 'probe' level before being reused.
        if ($array_chip->has_status('IMPORTED')) {
            throw("$array_name ArrayChip has already been IMPORTED. Please rollback_ArrayChip or recreate your arrays_nr".$self->ARRAY_FORMAT.'.fasta file for alignment');
        }
        else {
            $self->helper->rollback_ArrayChip($array_chip, 'probe');
        }
    }
    else {
        $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
            -name      => $array->name,
            -design_id => $design_id,
            -array_id  => $array->dbID,
        );
        ($array_chip) = @{ $self->outdb->get_ArrayChipAdaptor->store($array_chip) };
    }

    # Only the first chip seen for an array name is recorded.
    $self->{'_array_names'}->{$array->name} = $array_chip
        unless exists $self->{'_array_names'}->{$array_name};

    return $array_chip;
}
} |
# Builds a Bio::EnsEMBL::Funcgen::Probe from a hash of constructor arguments.
# The hash passed in is modified: length, class, array chip ID and array are
# added, and the '-probe_set' entry is removed.
#   Arg [1] : array chip object; must provide dbID and get_Array
#   Arg [2] : hash ref of Probe constructor arguments (modified in place)
#   Arg [3] : probe length
#   Returns : the new Probe object
sub create_new_probe {
    my $self = shift;
    my ($array_chip, $probe_hash, $length) = @_;

    $probe_hash->{'-length'} = $length;
    $probe_hash->{'-class'}  = 'EXPERIMENTAL';

    my $chip_id = $array_chip->dbID;
    $probe_hash->{'-array_chip_id'} = $chip_id;

    my $array = $array_chip->get_Array;
    $probe_hash->{'-array'} = $array;

    # The probeset entry is dropped before the hash goes to the constructor.
    delete $probe_hash->{-probe_set};

    my $probe = Bio::EnsEMBL::Funcgen::Probe->new(%{$probe_hash});
    return $probe;
}
# Validates the probe input file named by QUERYSEQS and records it through
# query_file.
#   Throws : if QUERYSEQS is undefined, names a directory, or does not name
#            an existing non-empty file
sub fetch_input {
    my ($self) = @_;

    my $query = $self->QUERYSEQS;

    # Report a missing setting directly instead of running file tests on undef.
    if (! defined $query) {
        throw("QUERYSEQS is not defined, cannot locate the probe input file\n");
    }

    if ( -e $query and -d $query ) {
        throw("I need to have all affy probes input in one big file\n");
    } elsif ( -e $query and -s $query ) {
        $self->query_file($query);
    } else {
        # Covers both a missing path and an empty file.
        throw("'$query' refers to something that could not be made sense of\n");
    }
}
# Looks up the entry stored for one array name in '_CONFIG_ARRAY_PARAMS'.
#   Arg [1] : array name
#   Returns : the value stored under that name
#   Throws  : if there is no entry for the array name
sub get_ARRAY_PARAMS_by_array_name {
    my $self = shift;
    my ($array_name) = @_;

    unless (exists $self->{'_CONFIG_ARRAY_PARAMS'}{$array_name}) {
        throw("No ARRAY_PARAMS config available for $array_name. You must add this to the ImportArrays config before importing");
    }

    return $self->{'_CONFIG_ARRAY_PARAMS'}{$array_name};
}
# Read-only access to the '_CONFIG_IFIELDORDER' slot.
#   Returns : the stored value (undef if never set)
sub get_IFIELDORDER {
    my ($self) = @_;

    return $self->{'_CONFIG_IFIELDORDER'};
}
# Read-only access to the '_CONFIG_IIDREGEXP' slot.
#   Returns : the stored value (undef if never set)
sub get_IIDREGEXP {
    my ($self) = @_;

    return $self->{'_CONFIG_IIDREGEXP'};
}
# Read-only access to the '_helper' slot.
#   Returns : the stored value (undef if never set)
sub helper {
    my ($self) = @_;

    return $self->{'_helper'};
}
# Constructor. Delegates to the parent constructor, reads and validates the
# configuration, then initialises the '_array_names' and '_helper' slots.
#   Args    : passed unchanged to the parent constructor
#   Returns : the new object
sub new {
    my ( $class, @args ) = @_;

    my $self = $class->SUPER::new(@args);
    $self->read_and_check_config;

    # Result discarded; presumably this forces the output DB connection to be
    # made at construction time - TODO confirm.
    $self->outdb->dbc->db_handle;

    $self->{'_array_names'} = {};
    $self->{'_helper'}      = Bio::EnsEMBL::Funcgen::Utils::Helper->new( no_log => 1 );

    # The value is never used; the call is kept in case db() has side effects.
    my $db = $self->db;

    return $self;
}
# Returns the Funcgen DBAdaptor for the output database, creating it on the
# first call and caching it in '_outdb'.
#
# When the DNADB config has a '-dbname', a core DBAdaptor is built from it and
# passed as -dnadb; otherwise -dnadb is passed as undef and a warning naming
# the adaptor's dnadb is printed to STDOUT.
#   Returns : Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor
sub outdb {
    my ($self) = @_;

    if (! defined $self->{'_outdb'}) {
        # Read the DNADB config once instead of once per check.
        my $dnadb_params = $self->DNADB;
        my $has_dnadb    = ($dnadb_params && $dnadb_params->{-dbname}) ? 1 : 0;

        my $dnadb;
        if ($has_dnadb) {
            $dnadb = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{$dnadb_params});
        }

        $self->{'_outdb'} = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(
            %{ $self->OUTDB },
            -dnadb => $dnadb,
        );

        if (! $has_dnadb) {
            print "WARNING: Using default DNADB ". $self->{'_outdb'}->dnadb->dbname."\n";
        }
    }

    return $self->{'_outdb'};
}
} |
# Combined getter/setter for the '_probes' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the stored value, or undef when the slot has never been set
sub probes {
    my $self = shift;
    my ($value) = @_;

    $self->{'_probes'} = $value if defined $value;

    return exists $self->{'_probes'} ? $self->{'_probes'} : undef;
}
} |
# Combined getter/setter for the '_query_file' slot.
#   Arg [1] : (optional) new value; an undefined argument leaves the slot as is.
#   Returns : the stored value, or undef when the slot has never been set
sub query_file {
    my $self = shift;
    my ($value) = @_;

    $self->{'_query_file'} = $value if defined $value;

    return exists $self->{'_query_file'} ? $self->{'_query_file'} : undef;
}
} |
# Loads the configuration through the parent class, derives the array format
# from the input_id, checks the required settings and sets the three file
# paths built from OUTPUT_DIR and the array format.
#   Throws : if the input_id has no array format before the first ':', if a
#            required config variable is undefined, or if OUTDB is not a hash
#            ref with a '-dbname'
sub read_and_check_config {
    my $self = shift;

    $self->SUPER::read_and_check_config($ARRAY_CONFIG);

    my $logic = $self->analysis->logic_name;

    # The array format is everything before the first ':' of the input_id.
    my ($array_format) = split /\:/, $self->input_id;

    # Without a format the paths below would collapse to "arrays..fasta".
    if (! (defined $array_format && length $array_format)) {
        throw("Could not determine the array format from the input_id for logic '$logic'");
    }
    $self->ARRAY_FORMAT($array_format);

    # Each required variable is listed once.
    foreach my $config_var (qw(OUTDB OUTPUT_DIR IIDREGEXP INPUT_FORMAT IFIELDORDER)) {
        if ( ! defined $self->$config_var ){
            throw("You must define $config_var in config for logic '$logic'");
        }
    }

    my $output_dir = $self->OUTPUT_DIR;
    $self->QUERYSEQS("$output_dir/arrays.${array_format}.fasta");
    $self->NON_REDUNDANT_PROBE_SEQS("$output_dir/arrays_nr.${array_format}.fasta");
    $self->NAMES_FILE("$output_dir/arrays.${array_format}.names");

    if ( ref( $self->OUTDB ) ne "HASH" || ! defined $self->OUTDB->{-dbname}) {
        throw("OUTDB in config for '$logic' must be a hash ref of db connection pars.");
    }

    return;
}
} |
# Dispatches to the method named 'run_' followed by the configured
# INPUT_FORMAT (for example run_FASTA).
#   Returns : whatever the format-specific method returns
sub run {
    my ($self) = @_;

    my $format = $self->INPUT_FORMAT;
    my $method = 'run_' . $format;

    return $self->$method;
}
} |
# Parses the query FASTA file and collapses probes that share both probeset
# and sequence into one probe object.
#
# Header lines must match the configured IIDREGEXP; the capture groups are
# mapped to probe attributes through IFIELDORDER (attribute => 0-based group
# index, first five groups only). Sequence lines may contain only
# a/t/g/c/u in either case and are concatenated until the next header.
#
# The result, a hash of probeset => { sequence => probe }, is handed to
# probes().
#   Throws : on an invalid IFIELDORDER key, if the query file cannot be
#            opened, on a line that is neither header nor sequence, or when a
#            sequence has no current array chip
sub run_FASTA {
    my ($self) = @_;

    my (%probes_by_sequence, %array_chips, %probe_attrs);
    my ($current_array_chip, $current_sequence);
    my $header_regex = $self->get_IIDREGEXP;

    my %valid_fields = (
        -probe_set  => undef,
        -name       => undef,
        -array      => undef,
        -array_chip => undef,
    );
    my %field_order = %{ $self->get_IFIELDORDER };

    foreach my $config_field (keys %field_order) {
        if (! exists $valid_fields{$config_field}) {
            throw("Found invalid field on ImportArrays.pm config:\t$config_field\n".
                  "IFIELDORDER must only contain keys:\t".join("\t", keys %valid_fields));
        }
    }

    # Stores the record accumulated so far: a new probe for an unseen
    # probeset/sequence pair, otherwise another array chip / name on the
    # existing probe. Used at each new header and once at end of file.
    my $store_current_record = sub {
        if (! $current_array_chip) {
            throw("Have sequence $current_sequence but no current array chip!\n");
        }

        # Read the probeset first: create_new_probe removes it from the hash.
        my $probe_set = exists $probe_attrs{'-probe_set'} ? $probe_attrs{'-probe_set'} : undef;
        my $set_key   = defined $probe_set ? $probe_set : '';
        my $existing_probe = $probes_by_sequence{$set_key}{$current_sequence};

        if (! $existing_probe) {
            $probes_by_sequence{$set_key}{$current_sequence} = $self->create_new_probe(
                $current_array_chip,
                \%probe_attrs,
                length($current_sequence),
            );
        }
        else {
            $self->add_array_chip_to_existing_probe(
                $existing_probe,
                $current_array_chip,
                $probe_set,
                $probe_attrs{'-name'},
            );
        }
        return;
    };

    my $query_file = $self->query_file;
    open(my $probes_fh, '<', $query_file)
        or throw("Failed to open input file:\t$query_file\t$!");

    while (my $line = <$probes_fh>) {
        chomp $line;

        if ($line =~ /$header_regex$/) {
            # Copy the captures now, before any method call can run a regex.
            my @captures = ($1, $2, $3, $4, $5);

            if ($current_sequence) {
                $store_current_record->();
                $current_sequence = undef;
            }

            foreach my $field (keys %field_order) {
                $probe_attrs{$field} = $captures[ $field_order{$field} ];
            }

            $current_array_chip = $array_chips{$probe_attrs{-array_chip}};
            if (! $current_array_chip) {
                $current_array_chip = $self->create_new_array_chip($probe_attrs{-array}, $probe_attrs{-array_chip});
                $array_chips{$probe_attrs{-array_chip}} = $current_array_chip;
            }
        }
        elsif ($line =~ /^[atgcuATGCU]+$/) {
            # Sequences may span several lines.
            $current_sequence .= $line;
        }
        else {
            throw('Found header which does not match '.$self->INPUT_FORMAT." regex($header_regex):\n$line");
        }
    }
    close($probes_fh);

    # Store the last record, but only if a sequence was actually read.
    if (defined $current_sequence) {
        $store_current_record->();
    }

    $self->probes(\%probes_by_sequence);
    return;
}
# Stores the collapsed probes (and their probesets) through the output DB
# adaptors, then writes two files:
#   - NON_REDUNDANT_PROBE_SEQS : FASTA with the stored probe dbID as header
#   - NAMES_FILE               : one imported array name per line
# Each array chip recorded in '_array_names' is given the 'IMPORTED' status.
#   Args    : ignored
#   Returns : nothing
#   Throws  : if either output file cannot be opened or closed
sub write_output {
    my ( $self, @output ) = @_;

    my $outdb            = $self->outdb;
    my $probe_adaptor    = $outdb->get_ProbeAdaptor;
    my $probeset_adaptor = $outdb->get_ProbeSetAdaptor;

    my $fasta_file = $self->NON_REDUNDANT_PROBE_SEQS;
    open(my $fasta_fh, '>', $fasta_file)
        or throw("Failed to open ouput file:\t".$fasta_file);

    foreach my $probeset_name (keys %{ $self->probes }) {
        my %probes = %{ $self->probes->{$probeset_name} };

        # A false probeset name means the probes belong to no probeset.
        my $probeset;
        if ($probeset_name) {
            $probeset = Bio::EnsEMBL::Funcgen::ProbeSet->new(
                -name => $probeset_name,
                -size => scalar(keys %probes),
            );
            ($probeset) = @{ $probeset_adaptor->store($probeset) };
        }

        foreach my $sequence (keys %probes) {
            my $probe = $probes{$sequence};
            $probe->probeset($probeset) if $probeset;
            ($probe) = @{ $probe_adaptor->store($probe) };

            # The header uses the dbID of the stored probe.
            print {$fasta_fh} ">".$probe->dbID."\n".$sequence."\n";
        }
    }

    # Buffered write errors only show up at close time.
    close($fasta_fh)
        or throw("Failed to close output file:\t".$fasta_file);

    my $names_file = $self->NAMES_FILE;
    open(my $names_fh, '>', $names_file)
        or throw("Failed to open ouput file:\t".$names_file);

    foreach my $aname (keys %{ $self->{'_array_names'} }) {
        my $array_chip = $self->{'_array_names'}->{$aname};

        print {$names_fh} $aname."\n";
        $array_chip->add_status('IMPORTED');
        $array_chip->adaptor->store_states($array_chip);
    }

    close($names_fh)
        or throw("Failed to close output file:\t".$names_file);

    return;
}
} |
General documentation
This module was written by Nathan Johnson, based on the CollapseAffy/Oligo code.
Post general queries to ensembl-dev@ebi.ac.uk