package EnsEMBL::Web::Text::FeatureParser;

=head1 NAME

EnsEMBL::Web::Text::FeatureParser;

=head1 SYNOPSIS

This object parses data supplied by the user and identifies sequence locations for use by other Ensembl objects

=head1 DESCRIPTION

  my $parser = EnsEMBL::Web::Text::FeatureParser->new();
    
      $parser->parse($data);

=head1 LICENCE

This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details

=head1 CONTACT

=cut

use strict;
use warnings;
no warnings "uninitialized";
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use Data::Dumper;

use EnsEMBL::Web::Text::FeatureParser::BED;
use EnsEMBL::Web::Text::FeatureParser::GFF;
use EnsEMBL::Web::Text::FeatureParser::GTF;
use EnsEMBL::Web::Text::FeatureParser::PSL;
use EnsEMBL::Web::Text::FeatureParser::DAS;
use EnsEMBL::Web::Text::FeatureParser::WIG;
use EnsEMBL::Web::Text::FeatureParser::GBrowse;
use EnsEMBL::Web::Text::Feature::generic;
use EnsEMBL::Web::SpeciesDefs;
use EnsEMBL::Web::CompressionSupport;
#----------------------------------------------------------------------

=head2 new

  Arg [1]   : Ensembl Object 
  Function  : creates a new FeatureParser object
  Returntype: EnsEMBL::Web::Text::FeatureParser
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub new {
  my $class = shift;
  my $data = {
  'filter' => {},
  'URLs'       => [],
  'browser_switches' => {},
  'tracks'       => {},
  '_current_key'   => 'default',
  };
  bless $data, $class;
  return $data;
}

sub species_defs {
  my $self = shift;
  return $self->{'_species_defs'} ||= EnsEMBL::Web::SpeciesDefs->new(); 
}

#----------------------------------------------------------------------

=head2 current_key

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub current_key {
  my $self = shift;
  $self->{'_current_key'} = shift if @_;
  return $self->{'_current_key'};
}
#----------------------------------------------------------------------

=head2 format

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub format {
  my $self = shift;
  $self->{_info}{format} = shift if @_;
  return $self->{'_info'}->{'format'};
}

sub get_format {
  my $self = shift;
  return $self->{'_info'}->{'format'};
}

#----------------------------------------------------------------------

=head2 set_filter

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub set_filter {
  my $self = shift;
  $self->{'filter'} = {
     'chr'  => $_[0] eq 'ALL' ? undef : $_[0],
     'start'  => $_[1],
     'end'  => $_[2],
  }
}
#----------------------------------------------------------------------

=head2 analyse

  Arg [1]   :  
  Function  : Analyses a data string (e.g. from a form input), with the intention of identifying file format and other contents
  Returntype: hash reference 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub analyse {
  my ($self, $data) = @_;
  return unless $data;
  my %info;
  foreach my $row ( split /\n/, $data ) {
    my @analysis = $self->analyse_row($row);
    if( $analysis[2] ) {
      $info{$analysis[0]}{$analysis[1]} = $analysis[2];
    } else {
      $info{$analysis[0]} = $analysis[1];
    }
  ## Should we halt the analysis once we have a file format? Will any other useful info appear later in the file?
    last if $analysis[0] eq 'format';
  }

  $self->format($info{'format'});
  return \%info;
}

#----------------------------------------------------------------------

=head2 analyse_row

  Arg [1]   :  
  Function  : Parses an individual row of data, i.e. a single feature
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub analyse_row {
  my( $self, $row ) = @_;
  chomp;
  $row =~ s/[\t\r\s]+$//g;
  if( $row =~ /^browser\s+(\w+)\s+(.*)/i ) {
    return ('browser_switches', $1, $2);
  } elsif ($row =~ s/^track\s+(.*)$/$1/i) {
    my %config;
    while( $row ne '' ) {
      if( $row =~ s/^(\w+)\s*=\s*\"([^\"]+)\"// ) {  
        my $key   = $1;
        my $value = $2;
        while( $value =~ s/\\$// && $row ne '') {
          if( $row =~ s/^([^\"]+)\"\s*// ) {
            $value .= "\"$1";
          } else {
            $value .= "\"$row"; 
            $row = '';
          }
        }
        $row =~ s/^\s*//;
        $config{$key} = $value;
      } 
      elsif( $row =~ s/(\w+)\s*=\s*(\S+)\s*// ) {
        $config{$1} = $2;
      } 
      else {
        $row ='';
      }
    }
    if (my $ttype = $config{type}) {
      return ('format', 'WIG') if ($ttype =~ /wiggle_0/i);
    }
  } else {
    return unless $row =~ /\d+/g ;
    if( $row =~ /^reference(\s+)?=(\s+)?(.+)/ ) {
      return ('format', 'GBrowse');
    }   
    my @tab_del = split /(\t|  +)/, $row;

    my $current_key = $self->{'_current_key'} ;
    if( $tab_del[12] eq '.' || $tab_del[12] eq '+' || $tab_del[12] eq '-' ) {
      if( $tab_del[16] =~ /^(gene_id|transcript_id) [^;]+(\; (gene_id|transcript_id) [^;]+)?/ ) { ## GTF format
        return ('format', 'GTF');   
      } else {     ## GFF format
        return ('format', 'GFF');   
      }
    } elsif ( $tab_del[14] eq '+' || $tab_del[14] eq '-' || $tab_del[14] eq '.') { # DAS format accepted by Ensembl
      return ('format', 'DAS');   
    } else {
      my @ws_delim = split /\s+/, $row;
      if( $ws_delim[8] =~/^[-+][-+]?$/  ) { ## PSL format
        return ('format', 'PSL');   
      } elsif ($ws_delim[0] =~/^>/ ) {  ## Simple format (chr/start/end/type
        return ('format', 'generic');   
      } else { 
        my $fcount = scalar(@ws_delim);
        if ($fcount > 2 and $fcount < 13) {
          if ($ws_delim[1] =~ /\d+/ && $ws_delim[2] =~ /\d+/) {
            return ('format', 'BED');   
          }
        }
      }
    } 
  }
}
#----------------------------------------------------------------------

=head2 parse

  Arg [1]   :  
  Function  : Parses a data string (e.g. from a form input)
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub parse {
  my ($self, $data, $format) = @_;
  return unless $data;
  foreach my $row ( split /\n/, $data ) {
    $self->parse_row($row, $format);
  }

}

sub parse_old {
  my ($self, $data, $format) = @_;
  return unless $data;
  if (!$format) {
    my $info = $self->analyse($data);
    $format = $info->{'format'};
  }
  foreach my $row ( split /\n/, $data ) {
   $self->parse_row($row, $format);
  }
}

#----------------------------------------------------------------------

=head2 parse_file

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub parse_file {
  my( $self, $file, $format ) = @_;
  return unless $file;

  if( !$format ) {
    while( <$file> ) {
      my @analysis = $self->analyse_row( $_ );
      if( $analysis[0] eq 'format') {
        $format = $analysis[1];
        last;
      }
    }   
  }

  while( <$file> ) {
    $self->parse_row( $_, $format );
  }   
}

#----------------------------------------------------------------------

=head2 parse_URL

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub parse_URL {
  my( $self, $url, $format ) = @_;
  my $useragent = LWP::UserAgent->new();  
  $useragent->proxy( 'http', $self->species_defs->ENSEMBL_WWW_PROXY ) if( $self->species_defs->ENSEMBL_WWW_PROXY );   
  foreach my $URL ( $url ) {  
    my $request = new HTTP::Request( 'GET', $URL );
    $request->header( 'Pragma'       => 'no-cache' );
    $request->header( 'Cache-control' => 'no-cache' );
    my $response = $useragent->request($request); 
    if( $response->is_success ) {
      my $content = $response->content;
      EnsEMBL::Web::CompressionSupport::uncomp( \$content ); 
      if (!$format) {
        my $info = $self->analyse( $content );
        $format = $info->{'format'};
      }
      $self->parse( $content, $format );
    } else {
       warn( "Failed to parse: $URL" );
    }
  }   
}

#----------------------------------------------------------------------

=head2 parse_row

  Arg [1]   :  
  Function  : Parses an individual row of data, i.e. a single feature
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub parse_row {
  my( $self, $row, $format ) = @_;
  return if ($row =~ /^\#/);
  $row =~ s/[\t\r\s]+$//g;
  if( $row =~ /^browser\s+(\w+)\s+(.*)/i ) {
    $self->{'browser_switches'}{$1}=$2;   
  } elsif ($row =~ s/^track\s+(.*)$/$1/i) {
    my %config;
    while( $row ne '' ) {
      if( $row =~ s/^(\w+)\s*=\s*"([^"]+)"// ) {  
        my $key   = $1;
        my $value = $2;
        while( $value =~ s/\\$// && $row ne '') {
          if( $row =~ s/^([^"]+)"\s*// ) {
            $value .= "\"$1";
          } else {
            $value .= "\"$row"; 
            $row = '';
          }
        }
        $row =~ s/^\s*//;
        $config{$key} = $value;
      } elsif( $row =~ s/(\w+)\s*=\s*(\S+)\s*// ) {
        $config{$1} = $2;
      } else {
        $row ='';
      }
    }
    $config{'name'} ||= 'default';
    my $current_key = $config{'name'};# || 'default';
    $self->{'tracks'}{ $current_key } ||= { 'features' => [], 'config' => \%config };
    $self->{'_current_key'} = $current_key;
  } else {
    return unless $row =~ /\d+/g ;
    my @tab_del = split /(\t|  +)/, $row;
    my $current_key = $self->{'_current_key'} ;
    if( $format =~ /^G[TF]F/ ) { ## Hack can't distinguish GFF from GTF cleanly
      $self->store_feature( $current_key, EnsEMBL::Web::Text::Feature::GFF->new( \@tab_del ) ) 
        if $self->filter($tab_del[0],$tab_del[6],$tab_del[8]);
#    } 
#  elsif ($format eq 'GTF')  { 
#    $self->store_feature( $current_key, EnsEMBL::Web::Text::Feature::GTF->new( \@tab_del ) ) 
#    if $self->filter($tab_del[0],$tab_del[6],$tab_del[8]);
    } elsif( $format eq 'DAS' ) { 
#      $current_key = $tab_del[2] if $current_key eq 'default';
      $self->store_feature( $current_key, EnsEMBL::Web::Text::Feature::DAS->new( \@tab_del ) ) 
        if $self->filter($tab_del[8],$tab_del[10],$tab_del[12]);
    } else {
      my @ws_delim = split /\s+/, $row; 
      if( $format eq 'PSL' ) {
        $self->store_feature( $current_key, EnsEMBL::Web::Text::Feature::PSL->new( \@ws_delim ) ) 
          if $self->filter($ws_delim[13],$ws_delim[15],$ws_delim[16]);
      } elsif( $format eq 'BED' ) {
#        $current_key = $ws_delim[3] if $current_key eq 'default';
        $self->store_feature( $current_key, EnsEMBL::Web::Text::Feature::BED->new( \@ws_delim ) )
          if $self->filter($ws_delim[0],$ws_delim[1],$ws_delim[2]);
      } else {
        $self->store_feature( $ws_delim[4], EnsEMBL::Web::Text::Feature::generic->new( \@ws_delim ) ) 
          if $self->filter($ws_delim[1],$ws_delim[2],$ws_delim[3]);
      } 
    } 
  }
}

#----------------------------------------------------------------------

=head2 

  Arg [1]   :  
  Function  : stores a feature in the parser object
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub store_feature {
  my ( $self, $key, $feature ) = @_;
  push @{$self->{'tracks'}{$key}{'features'}}, $feature;
}

#----------------------------------------------------------------------

=head2 

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub get_all_tracks{$_[0]->{'tracks'}}

#----------------------------------------------------------------------

=head2 

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub fetch_features_by_tracktype{
  my ( $self, $type ) = @_;
  return $self->{'tracks'}{ $type }{'features'} ;
}

#----------------------------------------------------------------------

=head2 

  Arg [1]   :  
  Function  : 
  Returntype: 
  Exceptions: 
  Caller  : 
  Example   : 

=cut

sub filter {
  my ( $self, $chr, $start, $end) = @_;
  return ( ! $self->{'filter'}{'chr'}   || $chr eq 'chr'.$self->{'filter'}{'chr'} 
        || $chr eq $self->{'filter'}{'chr'}   ) &&
     ( ! $self->{'filter'}{'end'}   || $start <= $self->{'filter'}{'end'}   ) &&
     ( ! $self->{'filter'}{'start'} || $end   >= $self->{'filter'}{'start'} )  ;
}

sub _check_data_row {
  my $self = shift;
  my @formatCheck = $self->my_spec;

  my @fields = ();
  for (my $i=0; $i<$#fields; $i++) {
    my $check = $formatCheck[$i] or return 'Unexpected field';
    my $regexp = $check->{'regexp'} or next; # Field can contain anything
    if ($fields[$i] =~ /$regexp/) {
      $formatCheck[$i]->{check_fail} = 0;
    } else {
      return 'Illegal field entry';
    }
  }
  foreach my $f (@formatCheck) {
    return 'Missing required field' if ($f->{'check_fail'});
  }
  return;
}

sub init {
  my ($self, $data) = @_;
  return unless $data;

  my %info;
  my $has_data = 0;
  foreach my $row ( split '\n', $data ) {
    next unless $row;
    $has_data++;
    my @analysis = $self->analyse_row($row);
    if( $analysis[2] ) {
      $info{$analysis[0]}{$analysis[1]} = $analysis[2];
    } else {
      $info{$analysis[0]} = $analysis[1];
    }
    ## Should we halt the analysis once we have a file format? Will any other useful info appear later in the file?
    last if $analysis[0] eq 'format';
    ## Yes it will all to do with what is in the file! but we can leave this for the moment!
  }
  $info{'count'} = $has_data;
  if (my $format = $info{'format'}) {
#     my $p =  __PACKAGE__."::$format";
#     $self = $p->new();
    bless $self, __PACKAGE__."::$format";
  }
  $self->{_info} = \%info;
  return $self;
}

1;