Raw content of ScriptUtils # Utilities for scripts for doing tasks such as defining/filtering input package ScriptUtils; use Exporter; use vars qw(@ISA @EXPORT); use strict; @ISA=qw(Exporter); @EXPORT=qw( filter_to_chr_list get_chrlengths_v20 get_chrlengths_v19 sort_chr_names ); sub get_chrlengths_v20 { my $db = shift; my $type = shift; my $coordsystem = shift; my %chrhash; if ($coordsystem ne 'toplevel') { if (!$db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { die "get_chrlengths should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n"; } my $query = "select seq_region.name, seq_region.length as mce from seq_region,coord_system where" . " seq_region.coord_system_id=coord_system.coord_system_id and" . " coord_system.version = '" . $type . "' and coord_system.name='$coordsystem'"; my $sth = $db->dbc->prepare($query); $sth->execute; my $hashref; while (($hashref = $sth->fetchrow_hashref) && defined($hashref)) { $chrhash{$hashref->{'name'}} = $hashref->{mce}; # print $hashref->{'name'} . " " . $hashref->{'mce'} . "\n"; } } else { my $sa = $db->get_SliceAdaptor; my @slices = @{$sa->fetch_all('toplevel')}; foreach my $slice (@slices) { #print $slice->seq_region_name . " " . $slice->length . " " . $slice->coord_system->version . "\n"; if ($slice->coord_system->version eq $type) { $chrhash{$slice->seq_region_name} = $slice->length; } else { print STDERR "WARNING: " . $slice->seq_region_name . " " . $slice->length . " " . $slice->coord_system->version . " NOT on requested assembly version but is toplevel\n"; } } } return \%chrhash; } sub get_chrlengths_v19 { my $db = shift; my $type = shift; if (!$db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { die "get_chrlengths should be passed a Bio::EnsEMBL::DBSQL::DBAdaptor\n"; } my %chrhash; my $q = qq( SELECT chrom.name,max(chr_end) FROM assembly as ass, chromosome as chrom WHERE ass.type = '$type' and ass.chromosome_id = chrom.chromosome_id GROUP BY chrom.name ); my $sth = $db->dbc->prepare($q) || $db->throw("can't prepare: $q"); my $res = $sth->execute || $db->throw("can't execute: $q"); while( my ($chr, $length) = $sth->fetchrow_array) { $chrhash{$chr} = $length; } return \%chrhash; } sub sort_chr_names { my ($chrhash) = @_; my @sorted_names = reverse sort bychrnum keys %$chrhash; return \@sorted_names; } sub bychrnum { my @awords = split /_/, $a; my @bwords = split /_/, $b; my $anum = $awords[0]; my $bnum = $bwords[0]; # if ($anum !~ /^chr/ || $bnum !~ /^chr/) { # die "Chr name doesn't begin with chr for $a or $b"; # } $anum =~ s/chr//; $bnum =~ s/chr//; if ($anum !~ /^[0-9]*$/) { if ($bnum !~ /^[0-9]*$/) { return $anum cmp $bnum; } else { return 1; } } if ($bnum !~ /^[0-9]*$/) { return -1; } if ($anum <=> $bnum) { return $anum <=> $bnum; } else { if ($#awords == 0) { return -1; } elsif ($#bwords == 0) { return 1; } else { return $awords[1] cmp $bwords[1]; } } } sub filter_to_chr_list { my ($chromosomes, $chrhash, $dbname) = @_; #filter to specified chromosome names only if (scalar(@$chromosomes)) { foreach my $chr (@$chromosomes) { my $found = 0; foreach my $chr_from_hash (keys %$chrhash) { if ($chr_from_hash =~ /^${chr}$/) { $found = 1; last; } } if (!$found) { print "Didn't find chromosome named $chr in database " . $dbname . "\n"; } } HASH: foreach my $chr_from_hash (keys %$chrhash) { foreach my $chr (@$chromosomes) { if ($chr_from_hash =~ /^${chr}$/) { #print "Keeping $chr_from_hash\n"; next HASH; } } #print "Removing $chr_from_hash\n"; delete($chrhash->{$chr_from_hash}); } } return 1; } return 1;