Skip to content
Snippets Groups Projects
LineGroup.java 13.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • tjc's avatar
    tjc committed
    /* LineGroup.java
     *
     * created: Mon Oct 12 1998
     *
     * This file is part of Artemis
     *
     * Copyright (C) 1998,1999,2000  Genome Research Limited
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public License
     * as published by the Free Software Foundation; either version 2
     * of the License, or (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with this program; if not, write to the Free Software
     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
     *
    
     * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/LineGroup.java,v 1.5 2005-06-14 08:18:41 tjc Exp $
     */
    
    package uk.ac.sanger.artemis.io;
    
    import java.io.Writer;
    import java.io.IOException;
    import java.util.Hashtable;
    
    import uk.ac.sanger.artemis.util.LinePushBackReader;
    
    /**
     *  This class corresponds to a group of associated lines in an EMBL entry.
     *  An example of a group of associated lines is all the lines in an entry
     *  that start with FT.
     *
     *  @author Kim Rutherford
    
     *  @version $Id: LineGroup.java,v 1.5 2005-06-14 08:18:41 tjc Exp $
     *
     */
    
    abstract class LineGroup
        extends EMBLObject
    {
    
      /**
       *  The tag used for unidentified input.
       **/
      final static private int UNKNOWN = 0;

      /**
       *  The tag for the end of entry line: "//"
       **/
      final static int END_OF_ENTRY = 1;
      // the line prefix that getLineType () maps to END_OF_ENTRY
      final static String END_OF_ENTRY_STRING = "//";

      /**
       *  The tag for the start of sequence line.
       **/
      final static int SEQUENCE = 2;
      // the line prefix that getLineType () maps to SEQUENCE
      final static String EMBL_SEQUENCE_STRING = "SQ";

      /**
       *  The tag for an EMBL feature table line.
       **/
      final static int EMBL_FEATURE = 3;
      // the line prefix that getLineType () maps to EMBL_FEATURE
      final static String EMBL_FEATURE_STRING = "FT";

      /**
       *  The tag for an EMBL feature header line (FH ...)
       **/
      final static int EMBL_FEATURE_HEADER = 4;
      // the line prefix that getLineType () maps to EMBL_FEATURE_HEADER
      final static String EMBL_FEATURE_HEADER_STRING = "FH";

      /**
       *  The tag for a GENBANK feature table line.
       **/
      final static int GENBANK_FEATURE = 5;

      /**
       *  This is the tag for an EMBL LineGroup that we don't have a handler for.
       *  It will be stored in an object of type EmblMisc.
       **/
      final static int EMBL_MISC = 6;

      /**
       *  This is the tag for a Genbank LineGroup that we don't have a handler
       *  for.  It will be stored in an object of type GenbankMisc.
       **/
      final static int GENBANK_MISC = 7;

      /**
       *  This is the tag for a GFF LineGroup (generally a comment line) that we
       *  don't have a handler for.  It will be stored in an object of type
       *  GFFMisc.
       **/
      final static int GFF_MISC = 8;

      /**
       *  This is the tag for a GFF format line.
       **/
      final static int GFF_FEATURE = 9;

      /**
       *  This is the tag for lines generated by MSPcrunch -d
       **/
      final static int MSPCRUNCH_FEATURE = 10;

      /**
       *  This is the tag for lines generated by blast.
       **/
      final static int BLAST_FEATURE = 11;

      /**
       *  The tag for files that look like binary.
       **/
      final static int BINARY_CHARACTERS = 12;

      /**
       *  The tag for BSML XML files.
       **/
      final static int BSML_XML = 13;

      /**
       *  The tag for AGAVE XML files.
       **/
      final static int AGAVE_XML = 14;

      /**
       *  The tag for GAME XML files.
       **/
      final static int GAME_XML = 15;
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
      /**
       *  Lookup table of the GENBANK start of line keywords (LOCUS,
       *  DEFINITION, FEATURES etc.).  Each keyword is stored as both the key
       *  and the value, so a non-null result from get () means "known keyword".
       **/
      private static final Hashtable genbank_hash = new Hashtable ();

      static 
      {
        final String [] genbank_keywords = 
        {
          "LOCUS", "DEFINITION", "ACCESSION", "NID", "VERSION", "KEYWORDS",
          "SOURCE", "REFERENCE", "COMMENT", "FEATURES", "SEGMENT"
        };

        for(int i = 0 ; i < genbank_keywords.length ; ++i) 
          genbank_hash.put (genbank_keywords[i], genbank_keywords[i]);
      }
    
      /**
       *  Try to read and return a new LineGroup object from a stream.
       *  @param reader The stream to read from.
       *  @return A new LineGroup object or null if stream is at the end of file.
       *  @exception IOException Thrown if exception occurs while reading.
       *  @exception ReadFormatException Thrown if the format of the input is in
       *    error.
       *  @exception InvalidRelationException Thrown if this Feature cannot contain
       *    a particular Qualifier.
       **/
      public static LineGroup readNextLineGroup (LinePushBackReader reader)
          throws IOException, InvalidRelationException 
      {
    
        String line;
    
        // read until we get to a non-blank line
        LINES: while(true) 
        {
          line = reader.readLine ();
    
          if(line == null) 
            return null; // end of file
    
          // check for and ignore blank lines
          for(int i = 0 ; i < line.length () ; ++i) 
          {
            final char letter = line.charAt (i);
    
            if(letter != ' ' && letter != '\t') 
              break LINES;
          }
        }
    
        final int line_type = LineGroup.getLineType (line);
    
        reader.pushBack (line);
    
    
    tjc's avatar
    tjc committed
        switch (line_type) 
        {
          case SEQUENCE:
            return StreamSequenceFactory.makeStreamSequence (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case EMBL_FEATURE:
            return EmblStreamFeature.readFromStream (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case EMBL_FEATURE_HEADER:
            return new FeatureHeader (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case GENBANK_FEATURE:
            return GenbankStreamFeature.readFromStream (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case GFF_FEATURE:
            return GFFStreamFeature.readFromStream (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case BLAST_FEATURE:
            return BlastStreamFeature.readFromStream (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case MSPCRUNCH_FEATURE:
            return MSPcrunchStreamFeature.readFromStream (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case END_OF_ENTRY:
            // in this case we do want to read the line (which will be //) so that
            // the next call to readNextEntry () starts on the next entry
            reader.readLine ();
            return null;
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case EMBL_MISC:
            return new EmblMisc (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case GENBANK_MISC:
            return new GenbankMisc (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case GFF_MISC:
            return new GFFMisc (reader);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          case BINARY_CHARACTERS:
            throw new ReadFormatException ("cannot recognise format of binary file");
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          default:
            throw new ReadFormatException ("reader got confused - " +
                                           "unknown line type",
                                           reader.getLineNumber ());
    
    tjc's avatar
    tjc committed
        }
      }
    
      /**
       *  Return the embl line type of the line contained in the argument String.
       */
    
    tjc's avatar
    tjc committed
      public static int getLineType(String line)
    
    tjc's avatar
    tjc committed
      {
    
    tjc's avatar
    tjc committed
        if(line.startsWith ("<?xml")) 
    
    tjc's avatar
    tjc committed
          return GAME_XML;
    
    
    tjc's avatar
    tjc committed
        if(line.startsWith ("#")) 
    
    tjc's avatar
    tjc committed
          return GFF_MISC;
    
    
    tjc's avatar
    tjc committed
        if(line.length () >= 2 &&
           (line.charAt (0) == '/' || Character.isLetter (line.charAt (0))) &&
           (line.charAt (1) == '/' || Character.isLetter (line.charAt (1))) &&
           (line.length () == 2 ||
            line.length () == 3 && line.endsWith (" ") ||
            line.length () == 4 && line.endsWith ("  ") ||
            (line.length () >= 5 && line.substring (2,5).equals ("   ") || 
             line.startsWith("HD * confidential") )))                       // EMBL pre-submission line
    
    tjc's avatar
    tjc committed
        {
    
    
    tjc's avatar
    tjc committed
          if(line.startsWith(EMBL_FEATURE_STRING)) 
    
    tjc's avatar
    tjc committed
            return EMBL_FEATURE;
    
    
    tjc's avatar
    tjc committed
          if(line.startsWith(END_OF_ENTRY_STRING)) 
    
    tjc's avatar
    tjc committed
            return END_OF_ENTRY;
    
    
    tjc's avatar
    tjc committed
          if(line.startsWith(EMBL_SEQUENCE_STRING)) 
    
    tjc's avatar
    tjc committed
            return SEQUENCE;
    
    
    tjc's avatar
    tjc committed
          if(line.startsWith(EMBL_FEATURE_HEADER_STRING)) 
    
    tjc's avatar
    tjc committed
            return EMBL_FEATURE_HEADER;
    
          // this covers all the lines in the header
          return EMBL_MISC;
        }
    
        if(line.length () > 21 &&
            ((line.startsWith ("     ") &&
              (Character.isLetter (line.charAt (5)) ||
               Character.isDigit (line.charAt (5)) ||
               line.charAt (5) == '-') &&
              line.charAt (20) == ' ') ||
             (line.startsWith ("                    ") &&
              line.trim ().length () > 0))) 
          return GENBANK_FEATURE;
    
    
    tjc's avatar
    tjc committed
        final int genbank_type = getGenbankType(line);
    
    tjc's avatar
    tjc committed
    
        if(genbank_type != UNKNOWN) 
          return GENBANK_MISC;
        
    
        if(isGFFLine(line))
          return GFF_FEATURE;
    
    
    tjc's avatar
    tjc committed
        if(isMSPcrunchLine(line)) 
    
    tjc's avatar
    tjc committed
          return MSPCRUNCH_FEATURE;
    
    
    tjc's avatar
    tjc committed
        if(isBlastLine(line)) 
    
    tjc's avatar
    tjc committed
          return BLAST_FEATURE;
    
    
    tjc's avatar
    tjc committed
        if(looksLikeBinary(line)) 
    
    tjc's avatar
    tjc committed
          return BINARY_CHARACTERS;
    
        // default is sequence
        return SEQUENCE;
      }
    
      /**
       *  Return true if and only if at least 30 percent (using integer
       *  arithmetic) of the characters of the argument are binary characters.
       *  "binary" means an ISO control character other than tab, space,
       *  carriage return and new line, or any character with a value of 128 or
       *  more.  An empty line is never binary.  This is supposed to approximate
       *  the Perl -B test.
       *  @param line The line to examine.
       *  @return true if the line looks like binary data.
       **/
      private static boolean looksLikeBinary (final String line) 
      {
        final int length = line.length ();

        if(length == 0) 
          return false;

        int binary_count = 0;

        for(int index = 0 ; index < length ; ++index) 
        {
          final char c = line.charAt (index);

          final boolean is_plain_white_space =
            c == '\t' || c == ' ' || c == '\r' || c == '\n';

          final boolean is_binary_control =
            Character.isISOControl (c) && !is_plain_white_space;

          if(is_binary_control || c >= 128) 
            ++binary_count;
        }

        return binary_count * 100 / length >= 30;
      }
    
      /**
       *  Return true if and only if the given String appears to be a feature
       *  generated by MSPcrunch -d: after trimming, the line must start with a
       *  digit and contain at least one space.
       *  @param line The line to examine.
       **/
      private static boolean isMSPcrunchLine (final String line) 
      {
        final String stripped = line.trim ();

        if(stripped.length () == 0) 
          return false;

        return Character.isDigit (stripped.charAt (0)) &&
               stripped.indexOf (' ') != -1;
      }
    
      /**
       *  Return true if and only if the given String appears to be a feature
       *  generated by blast, which here means a non-empty line containing
       *  exactly 11 tab characters.  This method is easily fooled.
       *  @param line The line to examine.
       **/
      private static boolean isBlastLine (final String line) 
      {
        final int expected_tab_count = 11;

        return line.length () > 0 &&
               countChars (line, '\t') == expected_tab_count;
      }
    
      /**
       *  Return true if and only if the given String appears to be a GFF
       *  feature, which here means a non-empty line that contains between 7 and
       *  10 tab characters (inclusive) once leading and trailing white space
       *  has been trimmed.  This method is easily fooled.
       *  @param line The line to examine.
       **/
      private static boolean isGFFLine (final String line) 
      {
        if (line.length () == 0) 
          return false;

        final int tabs = countChars (line.trim (), '\t');

        return tabs >= 7 && tabs <= 10;
      }
    
      /**
       *  Return the number of occurrences of the character c in the String s.
       *  @param s The String to search.
       *  @param c The character to count.
       *  @return The number of positions in s that hold c (0 if there are none).
       **/
      private static int countChars (final String s, final char c)
      {
        int total = 0;

        // hop from one occurrence to the next until there are no more
        int position = s.indexOf (c);

        while (position != -1) 
        {
          ++total;
          position = s.indexOf (c, position + 1);
        }

        return total;
      }
    
      /**
       *  Return GENBANK_MISC if the given line starts with a letter and its
       *  first word (the text before the first space, or the whole line if
       *  there is no space) is one of the keywords in genbank_hash.  Otherwise
       *  return UNKNOWN.
       *  @param line The line to examine.
       **/
      private static int getGenbankType (final String line) 
      {
        if (line.length () == 0 || !Character.isLetter (line.charAt (0))) 
          return UNKNOWN;

        final int space_index = line.indexOf (' ');

        final String keyword;

        if (space_index == -1) 
          keyword = line;
        else 
          keyword = line.substring (0, space_index);

        if (genbank_hash.get (keyword) != null) 
          return GENBANK_MISC;

        return UNKNOWN;
      }
    
      /**
       *  Returns a String containing the contents of the line with the initial
       *  type string (two letters) and white space (three spaces) removed, that
       *  is, everything after the first five characters.
       *  @param line The line to strip.
       *  @return The remainder of the line, or "" if the line has five
       *    characters or fewer.
       */
      public static String getRestOfLine (String line) 
      {
        // two characters of line type plus three spaces
        final int prefix_length = 5;

        return line.length () > prefix_length ?
          line.substring (prefix_length) :
          "";
      }
    
      /**
       *  Write the end of entry marker - "//" - followed by a new line.
       *  @param writer The stream to write to.
       *  @exception IOException Thrown if the write fails.
       **/
      public static void writeEndOfEMBLEntry (Writer writer) throws IOException 
      {
        final String end_of_entry_line = END_OF_ENTRY_STRING + "\n";

        writer.write (end_of_entry_line);
      }
    
      /**
       *  Write this object to the given stream.  Each concrete LineGroup
       *  subclass must supply its own implementation.
       *  @param out_stream The stream to write to.
       *  @exception IOException May be thrown by implementations (declared in
       *    the throws clause).
       **/
      public abstract void writeToStream (final Writer out_stream)
          throws IOException;
    
    }