    /* GFFStreamFeature.java
     *
     * created: Tue Sep 14 1999
     *
     * This file is part of Artemis
     *
     * Copyright (C) 1999,2000,2001  Genome Research Limited
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public License
     * as published by the Free Software Foundation; either version 2
     * of the License, or (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with this program; if not, write to the Free Software
     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
     *
     * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/GFFStreamFeature.java,v 1.3 2005-01-06 11:21:06 tjc Exp $
     */
    
    package uk.ac.sanger.artemis.io;
    
    import uk.ac.sanger.artemis.util.*;
    
    import java.io.*;
    import java.util.Hashtable;
    import java.util.StringTokenizer;
    
    /**
     *  A StreamFeature that thinks it is a GFF feature.
     *
     *  @author Kim Rutherford
     *  @version $Id: GFFStreamFeature.java,v 1.3 2005-01-06 11:21:06 tjc Exp $
     **/
    
    
    public class GFFStreamFeature extends SimpleDocumentFeature
                           implements DocumentFeature, StreamFeature, ComparableFeature 
    {
    
    tjc's avatar
    tjc committed
      /**
       *  Create a new GFFStreamFeature object.  The feature should be added
       *  to an Entry (with Entry.add()).
       *  If the given qualifiers do not include "score", "gff_source" or
       *  "gff_seqname", those qualifiers are added with the default values
       *  ".", "artemis" and "." respectively.
       *  @param key The new feature key
       *  @param location The Location object for the new feature
       *  @param qualifiers The qualifiers for the new feature
       **/
      public GFFStreamFeature(final Key key, final Location location,
                              final QualifierVector qualifiers) 
      {
        super(null);
        try 
        {
          setKey(key);
          setLocation(location);
          setQualifiers(qualifiers);

          // make sure the qualifiers that back the fixed GFF columns exist
          if(getQualifierByName("score") == null)
            setQualifier(new Qualifier("score", "."));

          if(getQualifierByName("gff_source") == null)
            setQualifier(new Qualifier("gff_source", "artemis"));

          if(getQualifierByName("gff_seqname") == null)
            setQualifier(new Qualifier("gff_seqname", "."));
        } 
        catch(EntryInformationException e) 
        {
          // this should never happen because the feature will not be in an Entry;
          // keep the original exception as the cause so the stack trace survives
          throw new Error("internal error - unexpected exception: " + e, e);
        }
        catch(ReadOnlyException e) 
        {
          // this should never happen because the feature will not be in an Entry
          throw new Error("internal error - unexpected exception: " + e, e);
        } 
        catch(OutOfRangeException e) 
        {
          // this should never happen because the feature will not be in an Entry
          throw new Error("internal error - unexpected exception: " + e, e);
        }
      }
    
      /**
       *  Create a new GFFStreamFeature with the same key, location and
       *  qualifiers as the given feature.  The feature should be added to an
    
       *  Entry (with Entry.add()).
    
    tjc's avatar
    tjc committed
       *  @param feature The feature to copy.
       **/
    
      public GFFStreamFeature(final Feature feature) 
      {
        this(feature.getKey(), feature.getLocation(), feature.getQualifiers());
    
    tjc's avatar
    tjc committed
    
    
        if(feature instanceof GFFStreamFeature)
          gff_lines = new StringVector(((GFFStreamFeature)feature).gff_lines);
    
    tjc's avatar
    tjc committed
      }
    
      /**
       *  Create a new GFFStreamFeature from the given line.  The String should be
       *  in gene finder format: at least eight tab separated fields (seqname,
       *  source, feature key, start, end, score, strand, frame), optionally
       *  followed by a ninth field holding the attributes.
       *  @param line One line of GFF text.
       *  @exception ReadFormatException Thrown if the line has fewer than eight
       *    fields, if the start or end field is not an integer, if the start is
       *    greater than the end or if the start is negative.
       **/
      private GFFStreamFeature(final String line)
          throws ReadFormatException 
      {
        super(null);

        final StringVector line_bits = StringVector.getStrings(line, "\t", true);

        if(line_bits.size() < 8) 
          throw new ReadFormatException("invalid GFF line: 8 fields needed " +
                                        "(got " + line_bits.size () +
                                        " fields) from: " + line);

        final String start_base_string = line_bits.elementAt(3).trim();
        final String end_base_string   = line_bits.elementAt(4).trim();

        final int start_base;
        final int end_base;

        try 
        {
          start_base = Integer.parseInt(start_base_string);
        } 
        catch(NumberFormatException e)
        {
          throw new ReadFormatException("Could not understand the start base " +
                                        "of a GFF feature: " + start_base_string);
        }

        try 
        {
          end_base = Integer.parseInt(end_base_string);
        } 
        catch(NumberFormatException e) 
        {
          throw new ReadFormatException("Could not understand the end base " +
                                        "of a GFF feature: " + end_base_string);
        }

        // start of qualifier parsing and setting
        try 
        {
          final boolean complement_flag;
          final String strand = line_bits.elementAt(6);

          if(strand.equals("+")) 
            complement_flag = false;
          else if(strand.equals("-")) 
            complement_flag = true;
          else 
          {
            // must be unstranded
            complement_flag = false;

            // best we can do
            final String note_string = "this feature is unstranded";

            setQualifier(new Qualifier("note", note_string));
          }

          if(line_bits.size() == 9) 
          {
            // NOTE(review): the whole attribute field is decoded before it is
            // split, so an escaped ';' or '=' (%3B, %3D) turns into a separator
            // that parseAttributes() then acts on - confirm this is intended
            final String rest_of_line = decode(line_bits.elementAt(8));

            // parse the rest of the line as ACeDB format attributes
            final Hashtable attributes = parseAttributes(rest_of_line);

            for(final java.util.Enumeration attribute_enum = attributes.keys();
                attribute_enum.hasMoreElements();)
            {
              final String name = (String)attribute_enum.nextElement();
              final StringVector values = (StringVector)attributes.get(name);

              if(values.size() == 0)
                setQualifier(new Qualifier(name));
              else
                setQualifier(new Qualifier(name, values));
            }
          }

          // the fixed columns are stored as qualifiers so they can be written
          // back out by writeToStream()
          final Qualifier gff_seqname =
            new Qualifier("gff_seqname", decode(line_bits.elementAt(0)));

          setQualifier(gff_seqname);

          final Key key = new Key(line_bits.elementAt(2));

          setKey(key);

          final Qualifier source_qualifier =
            new Qualifier("gff_source", line_bits.elementAt(1));

          setQualifier(source_qualifier);

          final Qualifier score_qualifier =
            new Qualifier("score", line_bits.elementAt(5));

          setQualifier(score_qualifier);

          // map the zero based GFF frame onto a one based codon_start value;
          // anything that is not 0, 1 or 2 is treated as "no frame"
          String frame = line_bits.elementAt(7);

          if(frame.equals ("0"))
            frame = "1";
          else if(frame.equals("1"))
            frame = "2";
          else if(frame.equals("2")) 
            frame = "3";
          else
            frame = ".";

          // a codon_start qualifier is only stored for the values "2" and "3"
          if(!frame.equals("1") && !frame.equals(".")) 
          {
            final Qualifier codon_start_qualifier =
              new Qualifier("codon_start", frame);

            setQualifier(codon_start_qualifier);
          }

          if(start_base > end_base) 
            throw new ReadFormatException("start position is greater than end " +
                                          "position: " + start_base + " > " +
                                          end_base);

          // NOTE(review): the message says "positive" but a start of 0 passes
          // this test - confirm whether 0 should be rejected as well
          if(start_base < 0)
            throw new ReadFormatException("start position must be positive: " +
                                          start_base); 

          final Range location_range = new Range(start_base, end_base);

          final RangeVector location_ranges = new RangeVector(location_range);

          setLocation(new Location(location_ranges, complement_flag));
        }
        catch(ReadOnlyException e) 
        {
          // keep the original exception as the cause so the stack trace survives
          throw new Error("internal error - unexpected exception: " + e, e);
        } 
        catch(EntryInformationException e) 
        {
          throw new Error("internal error - unexpected exception: " + e, e);
        } 
        catch(OutOfRangeException e) 
        {
          throw new Error("internal error - unexpected exception: " + e, e);
        }

        // remember the text we were built from; writeToStream() writes it back
        this.gff_lines = new StringVector(line);
      }
    
      /**
       *
       * For gff-version 3:
       * http://song.sourceforge.net/gff3-jan04.shtml
       *
       * Undo the URL style escaping of a GFF field.  The replacements are
       * applied one rule after another, in this order: "%20" to space,
       * "%2C" to comma, "+" to space, "%3B" to semi-colon and "%3D" to
       * equals.
       *
       * @param s the escaped text
       * @return the text with every occurrence of the escapes replaced
       */
      private String decode(String s)
      {
        // escape sequence -> replacement, in the order they are applied
        final String[] escaped  = { "%20", "%2C", "+", "%3B", "%3D" };
        final String[] replaced = { " ",   ",",   " ", ";",   "="   };

        for(int rule = 0 ; rule < escaped.length ; ++rule)
        {
          final String pattern = escaped[rule];
          int hit = s.indexOf(pattern);

          if(hit < 0)
            continue;

          // copy the text between hits and drop in the replacement at each hit
          final StringBuffer rebuilt = new StringBuffer(s.length());
          int copied_up_to = 0;

          while(hit > -1)
          {
            rebuilt.append(s.substring(copied_up_to, hit));
            rebuilt.append(replaced[rule]);
            copied_up_to = hit + pattern.length();
            hit = s.indexOf(pattern, copied_up_to);
          }

          rebuilt.append(s.substring(copied_up_to));
          s = rebuilt.toString();
        }

        return s;
      }
       
      /**
       *  Return the reference of a new copy of this Feature.
       **/
      public Feature copy() 
      {
        final Feature return_value = new GFFStreamFeature(this);
        return return_value;
    
    tjc's avatar
    tjc committed
      }
    
      /**
       *  Helper method for the constructor - returns a String that is the
       *  concatenation of the Strings in the given StringVector.  The strings
       *  will be separated by four spaces
       **/
    
      private String joinStringVector(final StringVector string_vector) 
      {
        final StringBuffer return_buffer = new StringBuffer();
    
        for(int i = 0 ; i < string_vector.size() ; ++i) 
        {
          if(i != 0)
            return_buffer.append("    ");
          
          return_buffer.append(string_vector.elementAt(i));
    
    tjc's avatar
    tjc committed
        }
    
    
        return return_buffer.toString();
    
    tjc's avatar
    tjc committed
      }
    
      /**
       *  Read and return a GFFStreamFeature from a stream.  A feature must be the
       *  next thing in the stream.
       *  @param stream the Feature is read from this stream
       *  @exception IOException thrown if there is a problem reading the Feature -
       *    most likely ReadFormatException.
       *  @exception InvalidRelationException Thrown if this Feature cannot contain
       *    the given Qualifier.
       *  @return null if in_stream is at the end of file when the method is
       *    called
       */
    
      protected static GFFStreamFeature readFromStream(LinePushBackReader stream)
          throws IOException, InvalidRelationException 
      {
    
    tjc's avatar
    tjc committed
    
    
        String line = stream.readLine();
    
    tjc's avatar
    tjc committed
    
    
        if(line == null) 
    
    tjc's avatar
    tjc committed
          return null;
    
    
        try
        {
          final GFFStreamFeature new_feature = new GFFStreamFeature(line);
    
    tjc's avatar
    tjc committed
          return new_feature;
    
        } 
        catch(ReadFormatException exception) 
        {
    
    tjc's avatar
    tjc committed
          // re-throw the exception with the line number added
    
          final String new_error_string = exception.getMessage();
    
    tjc's avatar
    tjc committed
    
    
          throw new ReadFormatException(new_error_string,
                                        stream.getLineNumber());
    
    tjc's avatar
    tjc committed
        }
      }
    
      /**
       *  Read the details of a feature from an EMBL stream into the current
       *  object.
       *  @param entry_information The EntryInformation object of the Entry that
       *    will contain the Feature.
       *  @param in_stream the Feature is read from this stream
       *  @exception IOException thrown if there is a problem reading the Feature -
       *    most likely ReadFormatException if the stream does not contain GFF
       *    feature.
       **/
    
      public void setFromStream(final EntryInformation entry_information,
                                final LinePushBackReader in_stream)
          throws IOException, InvalidRelationException, ReadOnlyException 
      {
        throw new ReadOnlyException();
    
    tjc's avatar
    tjc committed
      }
    
      /**
       *  Write this Feature to the given stream.
       *  @param writer The stream to write to.
       *  @exception IOException thrown if there is an io problem while writing
       *    the Feature.
       **/
    
      public void writeToStream(final Writer writer)
          throws IOException 
      {
    
    tjc's avatar
    tjc committed
        // for now GFF features are read-only so just write what we read
    
        if(gff_lines == null) 
        {
          final RangeVector ranges = getLocation().getRanges();
    
          for(int i = 0 ; i < ranges.size() ; ++i) 
          {
            final Range this_range = ranges.elementAt(i);
            Qualifier   seqname    = getQualifierByName("gff_seqname");
            Qualifier   source     = getQualifierByName("gff_source");
            Qualifier   score      = getQualifierByName("score");
            Qualifier   group      = getQualifierByName("group");
    
            if(seqname == null) 
              seqname = new Qualifier("gff_seqname", "");
    
            if(source == null) 
              source = new Qualifier("source", "");
    
            if(score == null) 
              score = new Qualifier("score", "");
    
            if(group == null || group.getValues() == null ||
               group.getValues().elementAt(0).equals(""))
            {
              final Qualifier gene = getQualifierByName("gene");
    
              if(gene == null) 
                group = new Qualifier("group", "");
              else 
    
    tjc's avatar
    tjc committed
                group = gene;
            }
    
            String frame = ".";
    
    
            final Qualifier codon_start = getQualifierByName("codon_start");
    
    tjc's avatar
    tjc committed
    
    
            if(codon_start != null && i == 0) 
            {
              frame = codon_start.getValues().elementAt(0);
    
    tjc's avatar
    tjc committed
    
    
              if(frame.equals ("1")) 
    
    tjc's avatar
    tjc committed
                frame = "0";
    
              else
              {
                if(frame.equals("2")) 
    
    tjc's avatar
    tjc committed
                  frame = "1";
    
                else 
                {
                  if(frame.equals("3")) 
    
    tjc's avatar
    tjc committed
                    frame = "2";
    
    tjc's avatar
    tjc committed
                    frame = ".";
                }
              }
            }
    
    
            final String attribute_string = unParseAttributes();
    
    tjc's avatar
    tjc committed
    
    
            writer.write(seqname.getValues().elementAt(0) + "\t" +
                         source.getValues().elementAt(0) + "\t" +
                         getKey() + "\t" +
                         this_range.getStart() + "\t" +
                         this_range.getEnd() + "\t" +
                         score.getValues() .elementAt(0)+ "\t" +
                         (getLocation().isComplement() ? "-\t" : "+\t") +
    
    tjc's avatar
    tjc committed
                          frame + "\t" +
                          attribute_string + "\n");
          }
    
        } 
        else 
        {
          for(int i = 0 ; i < gff_lines.size() ; ++i) 
            writer.write(gff_lines.elementAt(i) + "\n");
    
    tjc's avatar
    tjc committed
        }
      }
    
      /**
       *  Return a String containing the qualifiers of this feature in a form
       *  suitable for using as the last field of a GFF line.  The codon_start
       *  attribute is not included since GFF has a frame field.  gff_seqname,
       *  gff_source and score aren't included since they have corresponding
       *  fields.
       **/
    
      private String unParseAttributes() 
      {
        final StringBuffer buffer = new StringBuffer();
        final QualifierVector qualifiers = getQualifiers();
        final QualifierVector qualifiers_to_write = new QualifierVector();
    
    tjc's avatar
    tjc committed
    
    
        for(int i = 0 ; i < qualifiers.size() ; ++i) 
        {
    
    tjc's avatar
    tjc committed
          final Qualifier this_qualifier = (Qualifier)qualifiers.elementAt(i);
    
    tjc's avatar
    tjc committed
    
    
          final String name = this_qualifier.getName();
    
    tjc's avatar
    tjc committed
    
    
          if(name.equals("codon_start") || name.equals("gff_source") ||
             name.equals("gff_seqname") || name.equals("score"))
    
    tjc's avatar
    tjc committed
            continue;
    
    
          if(i != 0)
            buffer.append(" ; ");
    
          final StringVector values = this_qualifier.getValues();
    
          buffer.append(name);
    
          if(values != null) 
          {
            for(int value_index = 0;
                value_index < values.size();
                ++value_index) 
            {
              final String this_value = values.elementAt(value_index);
              buffer.append(' ');
              try 
              {
                buffer.append(Integer.valueOf(this_value));
              } 
              catch(NumberFormatException _) 
              {
    
    tjc's avatar
    tjc committed
                // not an integer
    
                try 
                {
                  buffer.append(Double.valueOf(this_value));
                }
                catch (NumberFormatException __) 
                {
    
    tjc's avatar
    tjc committed
                  // not a double or integer so quote it
    
                  buffer.append('"' + this_value + '"');
    
        return buffer.toString();
    
    tjc's avatar
    tjc committed
      }
    
      /**
       *  Parse the given String as ACeDB format attributes.
       *  Adapted from code by Matthew Pocock for the BioJava project.
    
       *
       *  Modified for gff-version 3.
       *
    
    tjc's avatar
    tjc committed
       *  @return Return a Hashtable.  Each key is an attribute name and each value
       *    of the Hashtable is a StringVector containing the attribute values.
       *    If the attribute has no value then the Hashtable value will be a zero
       *    length vector.
       **/
    
      private Hashtable parseAttributes(final String att_val_list) 
      {
        Hashtable attributes = new Hashtable();
    
    tjc's avatar
    tjc committed
    
    
        StringTokenizer tokeniser = new StringTokenizer(att_val_list, ";", false);
    
    tjc's avatar
    tjc committed
    
    
        while(tokeniser.hasMoreTokens()) 
        {
    
    tjc's avatar
    tjc committed
          final String this_token = tokeniser.nextToken().trim();
    
          int index_of_first_space = this_token.indexOf(" ");
    
    tjc's avatar
    tjc committed
    
          final String att_name;
    
          final StringVector att_values = new StringVector();
    
          if(this_token.indexOf("=") > -1 &&
             this_token.indexOf("=") < index_of_first_space)
          {
            index_of_first_space = this_token.indexOf("=");
            att_name = this_token.substring(0, index_of_first_space);
            att_values.add(this_token.substring(index_of_first_space+1).trim());
          }
          else if(index_of_first_space == -1) 
    
    tjc's avatar
    tjc committed
            att_name = this_token;
    
          else 
          {
            att_name = this_token.substring(0, index_of_first_space);
    
    tjc's avatar
    tjc committed
    
            String rest_of_token =
    
              this_token.substring(index_of_first_space+1).trim();
    
    tjc's avatar
    tjc committed
    
    
            while(rest_of_token.length() > 0) 
            {
              if(rest_of_token.startsWith("\""))
              {
    
    tjc's avatar
    tjc committed
                int quote_index = 0;
    
    tjc's avatar
    tjc committed
                  quote_index++;
    
                  quote_index = rest_of_token.indexOf("\"", quote_index);
                } while(quote_index > -1 &&
                        rest_of_token.charAt(quote_index - 1) == '\\');
    
    tjc's avatar
    tjc committed
    
    
                if(quote_index < 0) 
                {
    
    tjc's avatar
    tjc committed
                  // no closing quote - panic
    
                  final Hashtable panic_attributes = new Hashtable();
                  final StringVector notes = new StringVector();
                  notes.add(att_val_list);
                  panic_attributes.put("note", notes);
    
    tjc's avatar
    tjc committed
    
                  return panic_attributes;
                }
    
    
                final String next_bit = rest_of_token.substring(1, quote_index);
                att_values.add(next_bit);
                rest_of_token = rest_of_token.substring(quote_index + 1).trim();
              } 
              else
              {
                final int index_of_next_space = rest_of_token.indexOf(" ");
    
                if(index_of_next_space == -1) 
                {
                  att_values.add(rest_of_token);
    
    tjc's avatar
    tjc committed
                  rest_of_token = "";
    
    tjc's avatar
    tjc committed
                  final String next_bit =
    
                    rest_of_token.substring(0, index_of_next_space);
    
    tjc's avatar
    tjc committed
    
    
                  att_values.add(next_bit);
    
    tjc's avatar
    tjc committed
                  rest_of_token =
    
                    rest_of_token.substring(index_of_next_space).trim();
    
    tjc's avatar
    tjc committed
                }
              }
            }
    
    
            if(!rest_of_token.equals(""))
              att_values.add(rest_of_token);
    
    tjc's avatar
    tjc committed
          }
    
    
          if(attributes.get(att_name) != null) 
            ((StringVector)attributes.get(att_name)).add(att_values);
          else 
            attributes.put(att_name, att_values);
    
    tjc's avatar
    tjc committed
        }
    
        return attributes;
      }
    
      /**
       *  The DocumentEntry object that contains this Feature as passed to the
       *  constructor.
       *  NOTE(review): nothing in this class assigns or reads this field -
       *  confirm whether it is still needed.
       **/
      private DocumentEntry entry;

      /**
       *  This is the line of GFF input that was read to get this
       *  GFFStreamFeature.  A GFFStreamFeature that was created from multiple GFF
       *  lines will have a gff_lines variable that contains multiple lines.
       *  The value is null for a feature that was not created from GFF text;
       *  when it is not null, writeToStream() writes these lines unchanged.
       **/
      StringVector gff_lines = null;
    }