Newer
Older
/* LineGroup.java
*
* created: Mon Oct 12 1998
*
* This file is part of Artemis
*
* Copyright (C) 1998,1999,2000 Genome Research Limited
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/LineGroup.java,v 1.5 2005-06-14 08:18:41 tjc Exp $
*/
package uk.ac.sanger.artemis.io;
import java.io.Writer;
import java.io.IOException;
import java.util.Hashtable;
import uk.ac.sanger.artemis.util.LinePushBackReader;
/**
* This class corresponds to a group of associated lines in an EMBL entry.
* An example of a group of associated lines is all the lines in an entry
* that start with FT.
*
* @author Kim Rutherford
* @version $Id: LineGroup.java,v 1.5 2005-06-14 08:18:41 tjc Exp $
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
*
*/
abstract class LineGroup
extends EMBLObject
{
/**
* The tag used for unidentified input.
**/
final static private int UNKNOWN = 0;
/**
* The tag for the end of entry line: "//"
**/
final static int END_OF_ENTRY = 1;
/**
* The text of the end of entry marker line, as written by
* writeEndOfEMBLEntry ().
**/
final static String END_OF_ENTRY_STRING = "//";
/**
* The tag for the start of sequence line
**/
final static int SEQUENCE = 2;
/**
* Presumably the two letter prefix of the EMBL start of sequence line -
* TODO confirm against the line type detection code.
**/
final static String EMBL_SEQUENCE_STRING = "SQ";
/**
* The tag for an EMBL feature table line
**/
final static int EMBL_FEATURE = 3;
/**
* The two letter prefix of an EMBL feature table line (FT ...)
**/
final static String EMBL_FEATURE_STRING = "FT";
/**
* The tag for an EMBL feature header line (FH ...)
**/
final static int EMBL_FEATURE_HEADER = 4;
/**
* The two letter prefix of an EMBL feature header line (FH ...)
**/
final static String EMBL_FEATURE_HEADER_STRING = "FH";
/**
* The tag for a GENBANK feature table line
**/
final static int GENBANK_FEATURE = 5;
/**
* This is the tag for an EMBL LineGroup that we don't have a handler for.
* It will be stored in an object of type EmblMisc.
**/
final static int EMBL_MISC = 6;
/**
* This is the tag for a Genbank LineGroup that we don't have a handler
* for. It will be stored in an object of type GenbankMisc.
**/
final static int GENBANK_MISC = 7;
/**
* This is the tag for a GFF LineGroup (generally a comment line) that we
* don't have a handler for. It will be stored in an object of type
* GFFMisc.
**/
final static int GFF_MISC = 8;
/**
* This is the tag for a GFF format line.
**/
final static int GFF_FEATURE = 9;
/**
* This is the tag for lines generated by MSPcrunch -d
**/
final static int MSPCRUNCH_FEATURE = 10;
/**
* This is the tag for lines generated by blast
**/
final static int BLAST_FEATURE = 11;
/**
* The tag for files that look like binary.
**/
final static int BINARY_CHARACTERS = 12;
/**
* The tag for BSML XML files.
**/
final static int BSML_XML = 13;
/**
* The tag for AGAVE XML files.
**/
final static int AGAVE_XML = 14;
/**
* The tag for GAME XML files.
**/
final static int GAME_XML = 15;
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
/**
 * Lookup table of the keywords that can start a line in the header of a
 * GENBANK entry (LOCUS, DEFINITION, FEATURES etc.).  Each keyword is stored
 * as both the key and the value, so the table is effectively used as a set.
 **/
private static Hashtable genbank_hash = null;
static
{
  final String [] header_keywords =
  {
    "LOCUS", "DEFINITION", "ACCESSION", "NID", "VERSION", "KEYWORDS",
    "SOURCE", "REFERENCE", "COMMENT", "FEATURES", "SEGMENT"
  };

  genbank_hash = new Hashtable ();

  for (int k = 0 ; k < header_keywords.length ; ++k)
  {
    final String keyword = header_keywords[k];
    genbank_hash.put (keyword, keyword);
  }
}
/**
 * Read the next LineGroup from the given reader.  Lines that are empty or
 * that contain only spaces and tabs are skipped.  The first other line is
 * classified with getLineType (), pushed back onto the reader and then the
 * reader is handed to the class that parses that kind of line.
 * @param reader The stream to read from.
 * @return A new LineGroup object, or null if the stream is at the end of
 *   file or an end of entry line ("//") was read.
 * @exception IOException Thrown if exception occurs while reading.
 * @exception ReadFormatException Thrown if the line looks like binary data
 *   or if its type has no reader.
 * @exception InvalidRelationException Thrown if this Feature cannot contain
 *   a particular Qualifier.
 **/
public static LineGroup readNextLineGroup (LinePushBackReader reader)
    throws IOException, InvalidRelationException
{
  String current_line;
  boolean is_blank;

  // keep reading until a line holds something other than spaces and tabs
  do
  {
    current_line = reader.readLine ();

    if (current_line == null)
      return null;     // end of file

    is_blank = true;

    for (int pos = 0 ; is_blank && pos < current_line.length () ; ++pos)
    {
      final char c = current_line.charAt (pos);
      is_blank = (c == ' ' || c == '\t');
    }
  } while (is_blank);

  final int type_of_line = LineGroup.getLineType (current_line);

  // the specialised readers expect to see the line themselves
  reader.pushBack (current_line);

  if (type_of_line == END_OF_ENTRY)
  {
    // consume the "//" line again so that the next read starts on the
    // following entry
    reader.readLine ();
    return null;
  }

  if (type_of_line == BINARY_CHARACTERS)
    throw new ReadFormatException ("cannot recognise format of binary file");

  switch (type_of_line)
  {
    case EMBL_FEATURE_HEADER:
      return new FeatureHeader (reader);
    case EMBL_FEATURE:
      return EmblStreamFeature.readFromStream (reader);
    case GENBANK_FEATURE:
      return GenbankStreamFeature.readFromStream (reader);
    case GFF_FEATURE:
      return GFFStreamFeature.readFromStream (reader);
    case MSPCRUNCH_FEATURE:
      return MSPcrunchStreamFeature.readFromStream (reader);
    case BLAST_FEATURE:
      return BlastStreamFeature.readFromStream (reader);
    case SEQUENCE:
      return StreamSequenceFactory.makeStreamSequence (reader);
    default:
      throw new ReadFormatException ("reader got confused - " +
                                     "unknown line type",
                                     reader.getLineNumber ());
  }
}
/**
* Return the embl line type of the line contained in the argument String.
*/
if(line.length () >= 2 &&
(line.charAt (0) == '/' || Character.isLetter (line.charAt (0))) &&
(line.charAt (1) == '/' || Character.isLetter (line.charAt (1))) &&
(line.length () == 2 ||
line.length () == 3 && line.endsWith (" ") ||
line.length () == 4 && line.endsWith (" ") ||
(line.length () >= 5 && line.substring (2,5).equals (" ") ||
line.startsWith("HD * confidential") ))) // EMBL pre-submission line
return EMBL_FEATURE_HEADER;
// this covers all the lines in the header
return EMBL_MISC;
}
if(line.length () > 21 &&
((line.startsWith (" ") &&
(Character.isLetter (line.charAt (5)) ||
Character.isDigit (line.charAt (5)) ||
line.charAt (5) == '-') &&
line.charAt (20) == ' ') ||
(line.startsWith (" ") &&
line.trim ().length () > 0)))
return GENBANK_FEATURE;
if(isGFFLine(line))
return GFF_FEATURE;
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
return BINARY_CHARACTERS;
// default is sequence
return SEQUENCE;
}
/**
 * Return true if and only if at least 30% of the characters of the argument
 * (using integer arithmetic) look "binary".  A character counts as binary
 * if its code is 128 or above, or if it is an ISO control character other
 * than tab, carriage return and new line.  An empty String is never binary.
 * This is supposed to approximate the Perl -B test.
 **/
private static boolean looksLikeBinary (final String line)
{
  final int length = line.length ();

  if (length == 0)
    return false;

  int binary_total = 0;

  for (int index = 0 ; index < length ; ++index)
  {
    final char c = line.charAt (index);

    final boolean high_bit_set = c >= 128;
    final boolean harmless =
      c == '\t' || c == ' ' || c == '\r' || c == '\n';

    if (high_bit_set || (Character.isISOControl (c) && !harmless))
      ++binary_total;
  }

  return binary_total * 100 / length >= 30;
}
/**
 * Return true if and only if the given String appears to be a feature
 * generated by MSPcrunch -d.  After trimming, the line must start with a
 * digit and contain at least one space.
 **/
private static boolean isMSPcrunchLine (final String line)
{
  final String stripped = line.trim ();

  if (stripped.length () == 0)
    return false;

  return Character.isDigit (stripped.charAt (0)) &&
         stripped.indexOf (' ') >= 0;
}
/**
 * Return true if and only if the given String appears to be a feature
 * generated by blast, which here means that it contains exactly 11 tab
 * characters.  This method is easily fooled.
 **/
private static boolean isBlastLine (final String line)
{
  if (line.length () == 0)
    return false;

  return countChars (line, '\t') == 11;
}
/**
 * Return true if and only if the given String appears to be a GFF feature,
 * which here means that the trimmed line contains between 7 and 10 tab
 * characters (inclusive).  This method is easily fooled.
 **/
private static boolean isGFFLine (final String line)
{
  if (line.length () == 0)
    return false;

  // trim () also drops leading and trailing tabs, so only the tabs that
  // separate fields are counted
  final int separators = countChars (line.trim (), '\t');

  return separators >= 7 && separators <= 10;
}
/**
 * Return the number of occurrences of the character c in the String s.
 **/
private static int countChars (final String s, final char c)
{
  int total = 0;

  // hop from one occurrence to the next until indexOf () finds no more
  int found_at = s.indexOf (c);

  while (found_at != -1)
  {
    ++total;
    found_at = s.indexOf (c, found_at + 1);
  }

  return total;
}
/**
 * Return the type of GENBANK LineGroup that starts with the given String.
 * The first word of the line (everything before the first space, or the
 * whole line if there is no space) is looked up in the table of GENBANK
 * keywords.
 * @return GENBANK_MISC if the line starts with a letter and its first word
 *   is a known GENBANK keyword, otherwise UNKNOWN.
 **/
private static int getGenbankType (final String line)
{
  if (line.length () == 0 || !Character.isLetter (line.charAt (0)))
    return UNKNOWN;

  final int space_index = line.indexOf (' ');

  final String keyword =
    space_index == -1 ? line : line.substring (0, space_index);

  return genbank_hash.containsKey (keyword) ? GENBANK_MISC : UNKNOWN;
}
/**
 * Returns a String containing the contents of the line with the first five
 * characters removed - that is the initial type string (two letters) and
 * the white space (three spaces) that follows it.  Returns an empty String
 * if the line has five characters or fewer.
 */
public static String getRestOfLine (String line)
{
  final int prefix_width = 5;

  return line.length () > prefix_width
    ? line.substring (prefix_width)
    : "";
}
/**
 * Write the end of entry marker ("//") followed by a new line character.
 * @param writer The stream to write to.
 * @exception IOException Thrown if the write fails.
 **/
public static void writeEndOfEMBLEntry (Writer writer) throws IOException
{
  final String marker_line = END_OF_ENTRY_STRING + "\n";
  writer.write (marker_line);
}
/**
* Write this object to the given stream.
* @param out_stream The stream to write to.
* @exception IOException Declared so that implementations can report a
* failed write.
**/
public abstract void writeToStream (final Writer out_stream)
throws IOException;
}