BioMart::Formatter MAF
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
BioMart::Formatter::MAF
Package variables
Privates (from "my" definitions)
$logger;
$aln_nb = 0
Included modules
Log::Log4perl
Inherit
BioMart::FormatterI
Synopsis
TODO: Synopsis here.
Description
  MAF Formatter

For more documentation see :

http://genome.ucsc.edu/FAQ/FAQformat.html#format5
Methods
_get_aligned_sequence_from_original_sequence_and_cigar_line
No description
Code
_new
No description
Code
_printHeader
No description
Code
_rc
No description
Code
_rcCigarLine
No description
Code
getDisplayNames
No description
Code
isSpecial
No description
Code
nextRow
No description
Code
preProcessRow
No description
Code
preProcessRow2
No description
Code
preProcessRowMlagan
No description
Code
preProcessRowMlaganOLD
No description
Code
processQuery
No description
Code
returnMAFline
No description
Code
returnMAFline4Mlagan
No description
Code
trimspe
No description
Code
Methods description
None available.
Methods code
_get_aligned_sequence_from_original_sequence_and_cigar_linedescriptionprevnextTop
sub _get_aligned_sequence_from_original_sequence_and_cigar_line {
    
    my ($original_sequence, $cigar_line) = @_;
    my $aligned_sequence = "";

    return undef if (!$original_sequence or !$cigar_line);
    
    my $seq_pos = 0;
    
    my @cig = ( $cigar_line =~ /(\d*[GMD])/g );
    for my $cigElem ( @cig ) {
	
	my $cigType = substr( $cigElem, -1, 1 );
	my $cigCount = substr( $cigElem, 0 ,-1 );
	$cigCount = 1 unless ($cigCount =~ /^\d+$/);
	#print "-- $cigElem $cigCount $cigType\n";
if( $cigType eq "M" ) { $aligned_sequence .= substr($original_sequence, $seq_pos, $cigCount); $seq_pos += $cigCount; } elsif( $cigType eq "G" or $cigType eq "D") { $aligned_sequence .= "-" x $cigCount; } } warn ("Cigar line ($seq_pos) does not match sequence lenght (".length($original_sequence).")") if ($seq_pos != length($original_sequence)); return $aligned_sequence; } #--------------------------------------------
}
_newdescriptionprevnextTop
sub _new {
    my ($self) = @_;
    $self->SUPER::_new();
    $aln_nb = 0 ;
    # Get reference to logger
$logger = Log::Log4perl->get_logger(__PACKAGE__);
}
_printHeaderdescriptionprevnextTop
sub _printHeader {
    my $date = localtime();
    my $p1 = sprintf "##maf version=1\n";
    my $p2 = sprintf "#".localtime()."\n";
    my $p3 = sprintf "#The start coordinate is a zero-based number.\n";
    my $p4 = sprintf "#For segments in the negative strand, the start\n";
    my $p5 = sprintf "#is relative to the end of the chromosome. Please, refer to\n";
    my $p6 = sprintf "#http://genome.ucsc.edu/FAQ/FAQformat#format5 for a\n";
    my $p7 = sprintf "#description of this file format.\n\n";
    #print $p1;
#print $p2;
return $p1.$p2.$p3.$p4.$p5.$p6.$p7; } # subroutines from AXT.pm <alpha version>
#--------------------------------------------
}
_rcdescriptionprevnextTop
sub _rc {
    my ($seq) = @_;

    $seq = reverse($seq);
    $seq =~ tr/YABCDGHKMRSTUVyabcdghkmrstuv/RTVGHCDMKYSAABrtvghcdmkysaab/;

    return $seq;
}
#--------------------------------------------
}
_rcCigarLinedescriptionprevnextTop
sub _rcCigarLine {
    my ($cigar_line) = @_;
        
    #print STDERR "###cigar_line $cigar_line\n";
my @cig = ( $cigar_line =~ /(\d*[GMD])/g ); my @rev_cigar = reverse(@cig); my $rev_cigar; for my $cigElem ( @rev_cigar ) { $rev_cigar.=$cigElem; } #print STDERR "###rev_cigar $rev_cigar\n";
return $rev_cigar; } #-----------------------------------------
}
getDisplayNamesdescriptionprevnextTop
sub getDisplayNames {
    my $self = shift;
    return '' ;
}
#--------------------------------------------
}
isSpecialdescriptionprevnextTop
sub isSpecial {
    return 1;
}
1;
}
nextRowdescriptionprevnextTop
sub nextRow {
    my $self = shift;
    my @data ;
    my $HEADER = "";
    my $PROCESSED_SEQS ;
    my $SCORE2 ;
    my $new_start_time = time();
    $logger->debug("START NEXT ROW ");
    my $rtable = $self->get('result_table');
    my $row = $rtable->nextRow;
    if (!$row){
        return;
    } 
    #**********
#my $aln_nb = 0;
#rint "\n\n+++++++++++++++++++ $aln_nb \n\n";
### Print maf comments using the $aln_nb variable
$HEADER = &_printHeader if ($aln_nb == 0); # Need to test if the data comes from MLAGAN
# in that case the row contain [seq1 seq2 seqN data1 data2 dataN]
if ( ( ($$row[0]=~/^(A|C|G|T|N)/) && ($$row[0]!~/^(Chr)/) ) && ( ($$row[1]=~/^(A|C|G|T|N)/) && ($$row[1]!~/^(Chr)/) ) ){ # 15/08/06 removed /i
# added a hack for 'Ch'
@data = &preProcessRowMlagan(\@{$row}); my $score ; # score is now in data[$i][8]
my $size_chro = 0 ; # calculate the size of the longuest chro # for sprintf
foreach my $value (@data){ my $chr = $value->[1]; if ($size_chro < length($chr) ){$size_chro = length($chr);} } foreach my $foo (@data){ my $seq = $foo->[0] ; my $chr = $foo->[1] ; my $start = $foo->[2] ; my $end = $foo->[3] ; my $strand = $foo->[4] ; my $length = $foo->[5] ; my $genome = $foo->[6] ; my $cigar = $foo->[7] ; $score = $foo->[8] ; if (($score) && (!$SCORE2)){ $SCORE2 .= "a score=$score\n"; } if ($seq ne 'N'){ # means that there is no data for that species
$PROCESSED_SEQS .= &returnMAFline4Mlagan($seq,$chr,$start,$end,$strand,$length,$genome,$size_chro,$cigar); } } $aln_nb++; #-- if score is undefined, put 0 instead of nothing
if (!$SCORE2){ $SCORE2 .= "a score=0\n";} return $HEADER . $SCORE2 . $PROCESSED_SEQS . "\n"; } else { warn "MAF.pm -- Error processing data (attribute list)\n"; } my $time_elapsed = round(time() - $new_start_time); $logger->debug("END NEXT ROW : $time_elapsed to format in MAF "); } #--------------------------------------------
}
preProcessRowdescriptionprevnextTop
sub preProcessRow {
    my $row =  shift ;
    my @want ;
    my $to = 0;
    my $score;
    my $size_row = @{$row};
    #print "size_row subroutine $size_row\n";
while ($size_row > 0) { #print "rentre loop while $to \n";
if ($to == 0) { for (my $i=0;$i<=6;$i++){ #print "==$to $i\n";
$want[$to][$i] = shift (@{$row}); #print " ---- $want[$to][$i]\n";
} $score = shift (@{$row}); #print "==score $to $score\n";
$to++; } else { for (my $i=0;$i<=6;$i++){ #print "==$to $i\n";
$want[$to][$i] = shift (@{$row}); #print " ---- $want[$to][$i]\n";
} $to++; } $size_row = @{$row}; } my $size = @want; return (@want, $score); } #--------------------------------------------
}
preProcessRow2descriptionprevnextTop
sub preProcessRow2 {
    my @row =  @_ ;
    my @want ;
    my $to = 0;
    my $score;
    my $size_row = @row;
    print "size_row subroutine :  $size_row\n";
    while ($size_row > 0) {
	#print "rendre loop while $to \n";	
if ($to == 0) { for (my $i=0;$i<=6;$i++){ #print "==$to $i\n";
$want[$to][$i] = shift (@row); #print " ---- $want[$to][$i]\n";
} $score = shift (@row); #print "==score $to $score\n";
$to++; } else { for (my $i=0;$i<=6;$i++){ #print "==$to $i\n";
$want[$to][$i] = shift (@row); #print " ---- $want[$to][$i]\n";
} $to++; } $size_row = @row; } my $size = @want; return (@want, $score); } #--------------------------------------------
}
preProcessRowMlagandescriptionprevnextTop
sub preProcessRowMlagan {
    my $row =  shift ;
    my @want ;
    my $score;
    my $k = 0;
    my $size_row = @{$row};
 #   print "\nsize_row subroutine :  $size_row\n";
while ( ($$row[0]=~/^(A|C|G|T|N)/) && ($$row[0]!~/^Chr/i) && ($$row[0]!~/\_/) ){ # get all seq out
$want[$k][0] = shift (@{$row}); $k++; } # my $o = 0;
# # $k-1 is equal to the number of seqs (=nb of species)
for (my $j=0;$j<=$k-1;$j++){ # print "== $j 0 ";print " ->>--- $want[$j][0]\n";
for (my $i=1;$i<=8;$i++){ #IMPORTANT changed from 7 to 8, as I have now a score for all species
# print "== $j $i ";
$want[$j][$i] = shift (@{$row}); # $want[$j][$i] = $row->[$o];
# print " ->>--- $want[$j][$i]\n";
# $o++;
} #### #if ($j == 0){#if ($j == 0){ #for the first species which contain the score
#### # $score = shift (@{$row});
#### # print "==score $j $score\n";
#### #}
} return (@want); } #--------------------------------------------
}
preProcessRowMlaganOLDdescriptionprevnextTop
sub preProcessRowMlaganOLD {
    my $row =  shift ;
    my @want ;
    my $score;
    my $k = 0;
    my $size_row = @{$row};
    print "\nsize_row subroutine :  $size_row\n";
    
    while ( ($$row[0]=~/^(A|C|G|T|N)/i) && ($$row[0]!~/^Chr/i) ) { # get all seq out
$want[$k][0] = shift (@{$row}); print "Get seq out: $want[$k][0]\n"; $k++; print "k= $k\n\n"; } # $k-1 is equal to the number of seqs (=nb of species)
for (my $j=0;$j<=$k-1;$j++){ print "== $j 0 ";print " ---- $want[$j][0]\n"; for (my $i=1;$i<=7;$i++){ print "== $j $i "; $want[$j][$i] = shift (@{$row}); print " ---- $want[$j][$i]\n"; } if ($j == 0){#if ($j == 0){ #for the first species which contain the score
$score = shift (@{$row}); print "==score $j $score\n"; } } return (@want, $score); } #--------------------------------------------
}
processQuerydescriptionprevnextTop
sub processQuery {
    my ($self, $query) = @_;
    $self->set('original_attributes',[@{$query->getAllAttributes()}]) if ($query->getAllAttributes());
    $self->set('query',$query);
    return $query;
}
returnMAFlinedescriptionprevnextTop
sub returnMAFline {
    my $size = @_;
    my ($seq,$chr,$start,$end,$strand,$length,$cigar) = @_;
    #warn "\n\n######  size returnMAFline $size  \n\n";
my ($length_seq,$hstrand,$hstart,$hend); if ($strand > 0){ $length_seq = length($seq); $hstrand = "+"; $hstart = $start; } elsif ($strand < 0){ $length_seq = length($seq); $hstrand = "-"; $hstart = $length - $end + 1; } else { warn "\n\n\nProblem in returning maf formated lines\n\n\n ";} my $formated_seq = _get_aligned_sequence_from_original_sequence_and_cigar_line($seq, $cigar); # was "%1s %16s %10d %10d %-5s %10d %10s \n","s",$chr etc.
my $maf_line = sprintf ("%1s %5s %10d %5s %-5s %10s %5s\n ","s",$chr ,$hstart ,$length_seq ,$hstrand ,$length ,$formated_seq); return $maf_line; } #--------------------------------------------
}
returnMAFline4MlagandescriptionprevnextTop
sub returnMAFline4Mlagan {
    my $size = @_;
    my ($seq,$chr,$start,$end,$strand,$length,$gdb_id,$size_chro,$cigar) = @_;
    my $chr2;
    
    if (length($gdb_id) > 3){
	$gdb_id = &trimspe($gdb_id);
    }else{
	$gdb_id = "sp".$gdb_id ;
    }
    
    if (length($chr) > 2){ $chr2 = $chr; }# add 'chr' to the chromosome name if <=2
else { $chr2 = "chr".$chr; } my ($length_seq,$hstrand,$hstart,$hend); if ($strand > 0){ $length_seq = length($seq); $hstrand = "+"; $hstart = $start; } elsif ($strand < 0){ $length_seq = length($seq); $hstrand = "-"; $hstart = $length - $end + 1; } else { warn "\n\n\nProblem in returning maf formated lines\n\n\n ";} my $formated_seq = _get_aligned_sequence_from_original_sequence_and_cigar_line($seq, $cigar); # was "%1s %16s %10d %10d %-5s %10d %10s \n","s",$chr etc.
# $size_chro+8 mean that I add 8 to make some space for 'hsap'+'.chr'
my $maf_line = sprintf ("%1s %-".($size_chro+8)."s %10d %7d %-5s %10s %5s\n ","s",$gdb_id.".".$chr2 ,$hstart ,$length_seq ,$hstrand ,$length ,$formated_seq); return $maf_line; } #--------------------------------------------
}
trimspedescriptionprevnextTop
sub trimspe {
    my $short_spec;
    my $spec = $_[0];

    $spec =~ tr[A-Z][a-z];
    if ($spec =~ /(\w+)\s+(\w+)/){
	$short_spec = substr($1,0,1).substr($2,0,3) ;
    }
    return $short_spec;        
}
#--------------------------------------------
}
General documentation
EXAMPLETop
    
##maf version=1 scoring=tba.v8
# tba.v8 (((human chimp) baboon) (mouse rat))
# multiz.v7
# maf_project.v5 _tba_right.maf3 mouse _tba_C
# single_cov2.v4 single_cov2 /dev/stdin

a score=23262.0
s hg16.chr7 27578828 38 + 158545518 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG
s panTro1.chr6 28741140 38 + 161576975 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG
s baboon 116834 38 + 4622798 AAA-GGGAATGTTAACCAAATGA---GTTGTCTCTTATGGTG
s mm4.chr6 53215344 38 + 151104725 -AATGGGAATGTTAAGCAAACGA---ATTGTCTCTCAGTGTG
s rn3.chr4 81344243 40 + 187371129 -AA-GGGGATGCTAAGCCAATGAGTTGTTGTCTCTCAATGTG
AUTHORSTop
    *
CONTACTTop
This module is part of the BioMart project
http://www.biomart.org
Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk