sub check_CDS_start_end_remarks
{ my $self = shift;
my $trans = shift;
my @remarks = @{$trans->get_all_Attributes('remark')};
my $coding_end = $trans->cdna_coding_end;
my $coding_start = $trans->cdna_coding_start;
my $trans_end = $trans->length;
my $trans_seq = $trans->seq->seq;
my $stop_codon = substr($trans_seq, $coding_end-3, 3);
my $start_codon = substr($trans_seq, $coding_start-1, 3);
my $results;
if (grep {$_->value eq 'CDS end not found'} @remarks) {
if ( ($coding_end != $trans_end)
&& ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
$results->{'END_EXTRA'} = 1;
}
}
if ( $coding_end == $trans_end ) {
if (! grep {$_->value eq 'CDS end not found'} @remarks) {
if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
$results->{'END_MISSING_2'} = 1;
}
else {
$results->{'END_MISSING_1'} = $stop_codon;
}
}
}
if (grep {$_->value eq 'CDS start not found'} @remarks) {
if ( ($coding_start != 1)
&& ($start_codon eq 'ATG') ) {
$results->{'START_EXTRA'} = 1;
}
}
if ( $coding_start == 1) {
if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
if ($start_codon eq 'ATG') {
$results->{'START_MISSING_2'} = 1;
} else {
$results->{'START_MISSING_1'} = $start_codon;
}
}
}
return $results; } |
sub check_CDS_start_end_remarks_loutre
{ my $self = shift;
my $trans = shift;
my @stops = qw(TGA TAA TAG);
my %attributes;
foreach my $attribute (@{$trans->get_all_Attributes()}) {
$attributes{$attribute->code} = $attribute;
}
my $coding_end = $trans->cdna_coding_end;
my $coding_start = $trans->cdna_coding_start;
my $trans_end = $trans->length;
my $trans_seq = $trans->seq->seq;
my $stop_codon = substr($trans_seq, $coding_end-3, 3);
my $start_codon = substr($trans_seq, $coding_start-1, 3);
my $results;
if ( ($attributes{'cds_end_NF'}->value == 1)
&& ($coding_end != $trans_end)
&& ( grep {$_ eq $stop_codon} @stops) ) {
$results->{'END_EXTRA'} = 1;
}
if ( $coding_end == $trans_end ) {
if ($attributes{'cds_end_NF'}->value == 0 ) {
if (grep {$_ eq $stop_codon} @stops) {
$results->{'END_MISSING_2'} = 1;
}
else {
$results->{'END_MISSING_1'} = $stop_codon;
}
}
}
if ( ($attributes{'cds_start_NF'}->value == 1 )
&& ($coding_start != 1)
&& ($start_codon eq 'ATG') ) {
$results->{'START_EXTRA'} = 1;
}
if ( $coding_start == 1) {
if ( $attributes{'cds_start_NF'}->value == 0 ) {
if ($start_codon eq 'ATG') {
$results->{'START_MISSING_2'} = 1;
} else {
$results->{'START_MISSING_1'} = $start_codon;
}
}
}
return $results; } |
sub check_for_stops
{ my $support = shift;
my ($gene,$seen_transcripts) = @_;
my $gname = $gene->get_all_Attributes('name')->[0]->value;
my $scodon = 'TGA';
my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
TRANS:
foreach my $trans (@{$gene->get_all_Transcripts}) {
my $tsi = $trans->stable_id;
my $tID = $trans->dbID;
my $tname = $trans->get_all_Attributes('name')->[0]->value;
$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
my $peptide;
next TRANS unless ($peptide = $trans->translate);
my $pseq = $peptide->seq;
my $orig_seq = $pseq;
next TRANS unless ($pseq =~ /\*/);
$support->log_verbose("Stops found in $tsi ($tname)\n",1);
my @found_stops;
my $mrna = $trans->translateable_seq;
my $offset = 0;
my $tstop;
while ($pseq =~ /([^\*]+)\*(.*)/) {
my $pseq1_f = $1;
$pseq = $2;
my $seq_flag = 0;
$offset += length($pseq1_f) * 3;
my $stop = substr($mrna, $offset, 3);
my $aaoffset = int($offset / 3)+1; push(@found_stops, [ $stop, $aaoffset ]);
$tstop .= "$aaoffset ";
$offset += 3;
}
my $num_stops = scalar(@found_stops);
my $num_tga = 0;
my $positions;
foreach my $stop (@found_stops) {
$positions .= $stop->[0]."(".$stop->[1].") ";
if ($stop->[0] eq $scodon) {
$num_tga++;
}
}
my $source = $gene->source;
if ($num_tga < $num_stops) {
if ($source eq 'havana') {
$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non\' $scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
}
else {
$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non\' $scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
}
}
else {
my $flag_remark = 0; my $flag_remark2 = 0; my $alabel = 'Annotation_remark- selenocysteine ';
my $alabel2 = 'selenocysteine ';
my $annot_stops;
my $remarks;
foreach my $remark_type ('remark','hidden_remark') {
foreach my $attrib ( @{$trans->get_all_Attributes($remark_type)}) {
push @{$remarks->{$remark_type}}, $attrib->value;
}
}
while (my ($attrib,$remarks)= each %$remarks) {
foreach my $text (@{$remarks}) {
if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
$annot_stops=$1;
}
elsif ($text =~ /^$alabel2(.*)/) {
$annot_stops=$1;
}
}
}
my @annotated_stops;
if ($annot_stops){
my $i = 0;
foreach my $offset (split(/\s+/, $annot_stops)) {
if ($offset!~/^\d+$/){
}
elsif ($found_stops[$i]->[1] == $offset) {
push @annotated_stops, $offset;
}
elsif (($found_stops[$i]->[1] * 3) == $offset) {
$support->log_warning("DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]\n");
}
elsif (($found_stops[$i]->[1]) == $offset+1) {
$support->log_warning("PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]\n");
}
else {
$support->log_warning("Annotated stop for transcript $tsi ($tname) does not match a TGA codon) [$mod_date]\n");
push @annotated_stops, $offset;
}
$i++;
}
}
foreach my $stop ( @found_stops ) {
my $pos = $stop->[1];
my $seq = $stop->[0];
unless ( grep { $pos == $_} @annotated_stops) {
if ($seen_transcripts->{$tsi}) {
$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
}
else {
$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
}
}
}
}
}
}
__DATA__
OTTHUMT00000144659 = FIXED- changed to transcript
OTTHUMT00000276377 = FIXED- changed to transcript
OTTHUMT00000257741 = FIXED- changed to nmd
OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
OTTHUMT00000285227 = FIXED- changed start site
OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
OTTHUMT00000157999 = FIXED- changed incorrect stop
OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
OTTHUMT00000246819 = FIXED- corrected frame
OTTHUMT00000314078 = FIXED- corrected frame
OTTHUMT00000080133 = FIXED- corrected frame
OTTHUMT00000286423 = FIXED- changed to transcript
OTTMUST00000055509 = FIXED- error
OTTMUST00000038729 = FIXED- corrected frame
OTTMUST00000021760 = FIXED- corrected frame
OTTMUST00000023057 = FIXED- corrected frame
OTTMUST00000015207 = FIXED- corrected frame
OTTMUST00000056646 = FIXED- error
OTTMUST00000059686 = FIXED- corrected frame
OTTMUST00000013426 = FIXED- corrected frame
OTTMUST00000044412 = FIXED- error } |
sub get_havana_seleno_comments
{ my $seen_translations;
while (<DATA>) {
next if /^\s+$/ or /#+/;
my ($obj,$comment) = split /=/;
$obj =~ s/^\s+|\s+$//g;
$comment =~ s/^\s+|\s+$//g;
$seen_translations->{$obj} = $comment;
}
return $seen_translations; } |