# Raw content of XrefParser::UCSCParser
# $Id: UCSCParser.pm,v 1.4 2008/09/02 10:02:10 ianl Exp $

package XrefParser::UCSCParser;

use strict;
use warnings;

use base qw( XrefParser::CoordinateParser );

# run( $source_id, $species_id, $files, $release_file, $verbose )
#
# Reads the first file listed in @{$files} line by line, converts every
# data line into a hash of transcript coordinates and passes that hash
# to $self->add_xref() together with the source ID and species ID.
#
# Arguments (positional):
#   $source_id     Ignored; it is replaced by the ID that
#                  get_source_id_for_source_name('UCSC') returns.
#   $species_id    Passed through unchanged to add_xref().
#   $files         Array reference; only element 0 is read.
#   $release_file  Accepted, but not used by this parser.
#   $verbose       Accepted, but not used by this parser.
#
# Returns 0 once the whole file has been read and the handle closed.
#
# Blank lines and lines starting with '#' (e.g. a column header line)
# are skipped instead of being stored as xrefs.

sub run {
  # The invocant is only taken off @_ when caller(1) is defined, i.e.
  # when the code that called run() is itself inside a subroutine.
  # The declaration is kept separate from the conditional assignment:
  # 'my $x = ... if COND' has undefined behaviour in Perl and may leave
  # $x holding the value from a previous call.
  # NOTE(review): $self is used unconditionally below, so a call made
  # from top-level code will still fail on the first method call.
  my $self;
  if ( defined( caller(1) ) ) { $self = shift }

  my ( $source_id, $species_id, $files, $release_file, $verbose ) = @_;

  my $data_file = $files->[0];

  # Get the $source_id for the "UCSC" source.
  $source_id = $self->get_source_id_for_source_name('UCSC');

  # UCSC uses '+' and '-' for the strand, instead of -1, 0, or 1.
  # Anything that is not listed here is stored as strand 0.
  my %strand_for = ( '+' => 1, '-' => -1 );

  my $data_io = $self->get_filehandle($data_file);

  while ( defined( my $line = $data_io->getline() ) ) {
    chomp($line);

    # Nothing to parse on empty lines or on '#' comment/header lines.
    next if ( $line =~ /^\s*(?:#|$)/ );

    # Each line will have the following tab-delimited fields:
    #
    #  0.  name        (UCSC stable ID)
    #  1.  chrom       (chromosome name, in UCSC style)
    #  2.  strand      (plus or minus)
    #  3.  txStart     (transcript start)
    #  4.  txEnd       (transcript end)
    #  5.  cdsStart    (CDS start)
    #  6.  cdsEnd      (CDS end)
    #  7.  exonCount   (number of exons in transcript)
    #  8.  exonStarts  (comma-separated list of exon start positions)
    #  9.  exonEnds    (comma-separated list of exon end positions)
    # 10.  proteinID   (cross reference to a protein ID, e.g. UniProt)
    # 11.  alignID     (not sure what this is right now)
    #
    # Fields 7, 10 and 11 are not used.
    my ( $name,     $chrom,  $strand,     $txStart, $txEnd,
         $cdsStart, $cdsEnd, $exonStarts, $exonEnds )
      = ( split( /\t/, $line ) )[ 0 .. 6, 8, 9 ];

    # UCSC uses slightly different chromosome names, at least for
    # human and mouse, so chop off the 'chr' in the beginning.  We do
    # not yet translate the names of the special chromosomes, e.g.
    # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl).
    $chrom =~ s/^chr//;

    $strand = exists( $strand_for{$strand} ) ? $strand_for{$strand} : 0;

    # Non-coding transcripts have cdsStart == cdsEnd.  We would like
    # these to be stored as NULLs.
    if ( $cdsStart == $cdsEnd ) {
      undef($cdsStart);
      undef($cdsEnd);
    }

    # UCSC uses the same kind of "inbetween" coordinates as e.g.
    # exonerate, so increment all start coordinates by one.  split()
    # drops the empty field after a trailing comma, so the rebuilt
    # $exonStarts list has no trailing comma.
    $txStart += 1;
    $exonStarts =
      join( ',', map { $_ + 1 } split( /,/, $exonStarts ) );
    if ( defined($cdsStart) ) { $cdsStart += 1 }

    # Cut off the last comma from $exonEnds, if it exists.
    $exonEnds =~ s/,\z//;

    my %xref = ( 'accession'  => $name,
                 'chromosome' => $chrom,
                 'strand'     => $strand,
                 'txStart'    => $txStart,
                 'txEnd'      => $txEnd,
                 'cdsStart'   => $cdsStart,
                 'cdsEnd'     => $cdsEnd,
                 'exonStarts' => $exonStarts,
                 'exonEnds'   => $exonEnds );

    $self->add_xref( $source_id, $species_id, \%xref );
  } ## end while ( defined( my $line...

  $data_io->close();

  return 0;
} ## end sub run

1;