/* GFFStreamFeature.java
*
* created: Tue Sep 14 1999
*
* This file is part of Artemis
*
* Copyright (C) 1999,2000,2001 Genome Research Limited
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/GFFStreamFeature.java,v 1.72 2009-08-28 10:33:12 tjc Exp $
*/
package uk.ac.sanger.artemis.io;

import java.io.IOException;
import java.io.Writer;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import uk.ac.sanger.artemis.Options;
import uk.ac.sanger.artemis.components.genebuilder.GeneUtils;
import uk.ac.sanger.artemis.components.genebuilder.ProteinMapPanel;
import uk.ac.sanger.artemis.components.genebuilder.ortholog.MatchPanel;
import uk.ac.sanger.artemis.util.LinePushBackReader;
import uk.ac.sanger.artemis.util.OutOfRangeException;
import uk.ac.sanger.artemis.util.ReadOnlyException;
import uk.ac.sanger.artemis.util.StringVector;
/**
 * A StreamFeature that thinks it is a GFF feature.
 */
public class GFFStreamFeature extends SimpleDocumentFeature implements
DocumentFeature, StreamFeature, ComparableFeature {
private static org.apache.log4j.Logger logger4j = org.apache.log4j.Logger
.getLogger(GFFStreamFeature.class);
/** store for spliced features containing id and range of each segment */
private Hashtable<String, Range> id_range_store;
/** store a record of the new and old uniquenames that have been changed */
private Hashtable<String, String> newIdMapToOldId;
/** combined feature_relationship.rank store for exons */
private Hashtable<String, Integer> feature_relationship_rank_store;
  /** first tabbed parameter */
  private String gffSeqName;
  /** second tabbed parameter */
  private String gffSource;
  /** duplication count, used to build "DUP<n>-" uniquename prefixes */
  private short duplicate = 0;
  /** the chado gene model this feature is part of */
  private ChadoCanonicalGene chadoGene;
  /** ranges of the contigs in a multi-sequence GFF, keyed by sequence name */
  protected static Hashtable<String, Range> contig_ranges;
  private boolean lazyLoaded = false;
  private org.gmod.schema.sequence.Feature chadoLazyFeature;
  private boolean readOnlyFeature = false;
  private static Set<String> attrs_to_filter = new HashSet<String>();
  /**
   * Registers an attribute not to be included in the GFF3 output for
   * GFFStreamFeatures
   *
   * @param attr
   *          The GFF3 attribute to remove
   */
  public static void removeAttribute(String attr) {
    attrs_to_filter.add(attr);
  }
  /**
   * Registers an attribute to be included in the GFF3 output for
   * GFFStreamFeatures
   *
   * @param attr
   *          The GFF3 attribute to include
   */
  public static void includeAttribute(String attr) {
    attrs_to_filter.remove(attr);
  }
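  /*
   * Illustrative usage (the attribute name is just an example): suppress an
   * attribute from the GFF3 output and later restore it.
   *
   *   GFFStreamFeature.removeAttribute("history");
   *   // ... write GFF3 ...
   *   GFFStreamFeature.includeAttribute("history");
   */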
  /**
   * Create a new GFFStreamFeature object. The feature should be added to an
   * Entry (with Entry.add()).
   *
   * @param key
   *          The new feature key
   * @param location
   *          The Location object for the new feature
   * @param qualifiers
   *          The qualifiers for the new feature
   */
  public GFFStreamFeature(final Key key, final Location location,
      final QualifierVector qualifiers) {
    super(null);
    try {
      setKey(key);
      setLocation(location);
      setQualifiers(qualifiers);

      if (getQualifierByName("ID") == null) {
        String idStr = null;
        StringVector v = Options.getOptions().getSystematicQualifierNames();
        for (int i = 0; i < v.size(); i++) {
          final String sysName = (String) v.get(i);
          if (getQualifierByName(sysName) != null) {
            idStr = (String) getQualifierByName(sysName).getValues().get(0);
            break;
          }
        }
        // autogenerate ID
        if (idStr == null)
          idStr = key.getKeyString() + ":" + location.toString();
        setQualifier(new Qualifier("ID", idStr));
      }
    } catch (EntryInformationException e) {
      // this should never happen because the feature will not be in an Entry
      throw new Error("internal error - unexpected exception: " + e);
    } catch (ReadOnlyException e) {
      // this should never happen because the feature will not be in an Entry
      throw new Error("internal error - unexpected exception: " + e);
    } catch (OutOfRangeException e) {
      // this should never happen because the feature will not be in an Entry
      throw new Error("internal error - unexpected exception: " + e);
    }
  }
  /**
   * Create a new GFFStreamFeature with the same key, location and qualifiers as
   * the given feature. The feature should be added to an Entry (with
   * Entry.add()).
   */
  public GFFStreamFeature(final Feature feature) {
    this(feature, false);
  }

  @SuppressWarnings("unchecked")
  public GFFStreamFeature(final Feature feature,
      final boolean isDuplicatedInChado) {
this(feature.getKey(), feature.getLocation(), feature.getQualifiers());
    if (feature instanceof GFFStreamFeature) {
      if (((GFFStreamFeature) feature).id_range_store != null)
        this.id_range_store = (Hashtable<String, Range>) (((GFFStreamFeature) feature).id_range_store)
            .clone();

      if (((GFFStreamFeature) feature).feature_relationship_rank_store != null)
        this.feature_relationship_rank_store = (Hashtable<String, Integer>) (((GFFStreamFeature) feature).feature_relationship_rank_store)
            .clone();

      this.setGffSeqName(((GFFStreamFeature) feature).getGffSeqName());
      this.setGffSource(((GFFStreamFeature) feature).getGffSource());

      if (isDuplicatedInChado) {
        try {
          final String duplicatePrefix;
          if (feature instanceof GFFStreamFeature) {
            ((GFFStreamFeature) feature).duplicate++;
            duplicatePrefix = "DUP"
                + Short.toString(((GFFStreamFeature) feature).duplicate) + "-";
          } else
            duplicatePrefix = "DUP";

          final String uniquename;
          if (id_range_store != null) {
            final Hashtable<String, Range> new_id_range_store = new Hashtable<String, Range>(
                id_range_store.size());
            final Enumeration<String> enumIdRangeStore = id_range_store.keys();
            while (enumIdRangeStore.hasMoreElements()) {
              final String keyId = enumIdRangeStore.nextElement();
              final Range range = id_range_store.get(keyId);
              new_id_range_store.put(duplicatePrefix + keyId, range);
            }
            id_range_store.clear();
            this.id_range_store = (Hashtable<String, Range>) new_id_range_store
                .clone();

            if (getLocation().getRanges().size() > 1)
              uniquename = getSegmentID(getLocation().getRanges());
            else {
              if (((String) getQualifierByName("ID").getValues().get(0))
                  .endsWith("}"))
                uniquename = getSegmentID(getLocation().getRanges());
              else
                uniquename = duplicatePrefix
                    + (String) getQualifierByName("ID").getValues().get(0);
            }
          } else
            uniquename = duplicatePrefix
                + (String) getQualifierByName("ID").getValues().get(0);
          setQualifier(new Qualifier("ID", uniquename));

          if (getQualifierByName("Parent") != null) {
            final String parent = (String) getQualifierByName("Parent")
                .getValues().get(0);
            setQualifier(new Qualifier("Parent", duplicatePrefix + parent));
          }

          if (getQualifierByName("Derives_from") != null) {
            final String derives_from = (String) getQualifierByName(
                "Derives_from").getValues().get(0);
            setQualifier(new Qualifier("Derives_from", duplicatePrefix
                + derives_from));
          }

          // remove qualifiers that don't get transferred to duplicate
          final String removeQualifierNames[] = { "feature_id",
              "timelastmodified", "feature_relationship_rank",
              ProteinMapPanel.POLYPEPTIDE_DOMAIN, ProteinMapPanel.TMHMM[0],
              ProteinMapPanel.TMHMM[1], ProteinMapPanel.TMHMM[2],
              ProteinMapPanel.TMHMM[3], MatchPanel.ORTHOLOG,
              MatchPanel.PARALOG };
          for (int i = 0; i < removeQualifierNames.length; i++)
            removeQualifierByName(removeQualifierNames[i]);
        } catch (ReadOnlyException e) {
        } catch (EntryInformationException e) {
        }
      } else {
        chadoGene = ((GFFStreamFeature) feature).chadoGene;
      }
    }
  }
  /**
   * Create a new GFFStreamFeature from the given line. The String should be in
   * gene finder format.
   */
  public GFFStreamFeature(final String line) throws ReadFormatException {
    super(null);
    final StringVector line_bits = StringVector.getStrings(line, "\t", true);
    if (line_bits.size() < 8)
      throw new ReadFormatException("invalid GFF line: 8 fields needed "
          + "(got " + line_bits.size() + " fields) from: " + line);

    final String start_base_str = line_bits.elementAt(3).trim();
    final String end_base_str = line_bits.elementAt(4).trim();
    final int start_base;
    final int end_base;
    try {
      start_base = Integer.parseInt(start_base_str);
      end_base = Integer.parseInt(end_base_str);
    } catch (NumberFormatException e) {
      throw new ReadFormatException(
          "Could not understand the start or end base " + "of a GFF feature: "
              + start_base_str + " " + end_base_str);
    }

    try {
      // column 7: "+" or "." map to forward, "-" to complement
      final boolean complement_flag = line_bits.elementAt(6).equals("-");

      if (line_bits.size() == 9) {
        final String rest_of_line = line_bits.elementAt(8);
        final Hashtable<String, StringVector> attributes = parseAttributes(rest_of_line);
        for (final Enumeration<String> attribute_enum = attributes.keys(); attribute_enum
            .hasMoreElements();) {
          String name = attribute_enum.nextElement();
          final StringVector values = attributes.get(name);

          if (MatchPanel.isClusterTag(name)) {
            List<ClusterLazyQualifierValue> lazyValues = new Vector<ClusterLazyQualifierValue>();
            for (int i = 0; i < values.size(); i++)
              lazyValues.add(new ClusterLazyQualifierValue((String) values
                  .get(i), name, this));
            setQualifier(new QualifierLazyLoading(name, lazyValues));
          } else {
            if (values.size() == 0)
              setQualifier(new Qualifier(name));
            else
              setQualifier(new Qualifier(name, values));
          }
        }
      }

      if (!line_bits.elementAt(0).equals("null"))
        setGffSeqName(GFF3Encoder.decode(line_bits.elementAt(0)));

      setKey(new Key(line_bits.elementAt(2)));
      setGffSource(line_bits.elementAt(1));

      if (!line_bits.elementAt(5).equals(".")) {
        final Qualifier score_qualifier = new Qualifier("score",
            line_bits.elementAt(5));
        setQualifier(score_qualifier);
      }

      // convert GFF phase (0/1/2) to codon_start (1/2/3)
      String frame = line_bits.elementAt(7);
      if (frame.equals("0"))
        frame = "1";
      else if (frame.equals("1"))
        frame = "2";
      else if (frame.equals("2"))
        frame = "3";
      else
        frame = ".";

      if (!frame.equals(".")) {
        final Qualifier codon_start_qualifier = new Qualifier("codon_start",
            frame);
        setQualifier(codon_start_qualifier);
      }

      if (start_base > end_base)
        throw new ReadFormatException("start position is greater than end "
            + "position: " + start_base + " > " + end_base + "\n" + line);

      if (start_base < 0)
        throw new ReadFormatException("start position must be positive: "
            + start_base);

      final Range location_range = new Range(start_base, end_base);
      final RangeVector location_ranges = new RangeVector(location_range);
      setLocation(new Location(location_ranges, complement_flag));
    } catch (ReadOnlyException e) {
      throw new Error("internal error - unexpected exception: " + e);
    } catch (EntryInformationException e) {
      throw new Error("internal error - unexpected exception: " + e);
    } catch (OutOfRangeException e) {
      throw new Error("internal error - unexpected exception: " + e);
    }
  }
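  // Illustrative example of a line accepted by the constructor above
  // (fields are tab-separated; the IDs are hypothetical):
  //
  //   chr1  EMBL  CDS  100  200  .  +  0  ID=SPAC1556.06.1:exon:1;Parent=SPAC1556.06.1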
  /**
   * Store for spliced regions of segments ID's and ranges.
   */
  public void setSegmentRangeStore(Hashtable<String, Range> id_range_store) {
    this.id_range_store = id_range_store;
  }

  public Hashtable<String, Range> getSegmentRangeStore() {
    if (id_range_store == null) {
      id_range_store = new Hashtable<String, Range>();
      id_range_store.put((String) this.getQualifierByName("ID").getValues()
          .get(0), this.getLocation().getTotalRange());
    }
    return id_range_store;
  }
public Hashtable<String, String> getNewIdMapToOldId() {
return newIdMapToOldId;
}
/**
* Used when changing spliced feature uniquenames
* @param newIdMapToOldId
*/
public void setNewIdMapToOldId(Hashtable<String, String> newIdMapToOldId) {
this.newIdMapToOldId = newIdMapToOldId;
}
  /**
   * Store for ID's and CHADO feature_relationship.rank
   */
public void setFeature_relationship_rank_store(
Hashtable<String, Integer> feature_relationship_rank_store) {
this.feature_relationship_rank_store = feature_relationship_rank_store;
}
  /**
   * Store for ID's and CHADO feature_relationship.rank
   */
  public Hashtable<String, Integer> getFeature_relationship_rank_store() {
    return feature_relationship_rank_store;
  }

  /**
   * Get the chado uniquename of the segment with the given range.
   */
  public String getSegmentID(final Range r) {
    if ( id_range_store != null &&
         getKey().getKeyString().indexOf("gene") == -1 &&
         getKey().getKeyString().indexOf("RNA") == -1 ) {
      int offset = 0;
      if (getGffSeqName() != null && contig_ranges != null
          && contig_ranges.containsKey(getGffSeqName())) {
        // adjust for coordinates in multi-sequence GFF
        Range offset_range = contig_ranges.get(getGffSeqName());
        offset = offset_range.getStart() - 1;
      }
      Enumeration<String> enum_ranges = id_range_store.keys();
      while (enum_ranges.hasMoreElements()) {
        String key = enum_ranges.nextElement();
        Range range = id_range_store.get(key);
        if (range.getStart() == r.getStart() - offset
            && range.getEnd() == r.getEnd() - offset)
          return key;
      }
    } else if (getQualifierByName("ID") != null) {
      return (String) getQualifierByName("ID").getValues().get(0);
    }
    logger4j.warn("RANGE NOT FOUND " + r.toString());
    return null;
  }
  /**
   * Get the feature ID based on the segments chado uniquename's.
   */
  public String getSegmentID(final RangeVector rv) {
    String id = "";
    if (id_range_store != null) {
      for (int i = 0; i < rv.size(); i++) {
        final Range range = (Range) rv.get(i);
        final String id_new = getSegmentID(range);
        // guard against segments with no stored ID
        final String prefix[] = (id_new == null) ? new String[2]
            : getPrefix(id_new, ':');
        if (prefix[0] != null) {
          int index = id.indexOf(prefix[0]);
          if (id.equals("") || index < 0) {
            if (!id.equals(""))
              id = id + ",";
            id = id + prefix[0] + "{" + prefix[1] + "}";
          } else {
            index = id.indexOf('}', index);
            id = id.substring(0, index) + "," + prefix[1] + id.substring(index);
          }
        } else if (id_new != null) {
          if (!id.equals(""))
            id = id + ",";
          id = id + id_new;
        }
      }
    }
    return id;
  }
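  // For example (hypothetical IDs): segments "SPAC1556.06.1:exon:1" and
  // "SPAC1556.06.1:exon:2" combine to the ID "SPAC1556.06.1:exon{1,2}".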
  /**
   * Get the ID prefix, e.g. for SPAC1556.06.1:exon:2 returns SPAC1556.06.1:exon
   * as the prefix and 2 as the index.
   */
  public String[] getPrefix(final String id, final char separator) {
    String prefix[] = new String[2];
    int index = id.lastIndexOf(separator);

    if (index > -1) {
      prefix[0] = id.substring(0, index);
      prefix[1] = id.substring(index + 1);
    }
    return prefix;
  }
  /**
   * Get the next automatic number to use for a new segment ID with the
   * given prefix, i.e. the first index not already in the segment range
   * store.
   */
  public int getAutoNumber(final String prefix, final char separator) {
    int auto = 1;
    String val = prefix + separator + auto;
    while (id_range_store.containsKey(val)) {
      auto++;
      val = prefix + separator + auto;
    }
    return auto;
  }
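  // For example (hypothetical ID): if "SPAC1556.06.1:exon:1" is already in
  // the segment range store, getAutoNumber("SPAC1556.06.1:exon", ':')
  // returns 2.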
  /**
   * Return the reference of a new copy of this Feature.
   */
  public Feature copy() {
    final Feature return_value = new GFFStreamFeature(this);
    return return_value;
  }
  /**
   * Read and return a GFFStreamFeature from a stream. A feature must be the
   * next thing in the stream.
   *
   * @param stream
   *          the Feature is read from this stream
   * @exception IOException
   *              thrown if there is a problem reading the Feature - most likely
   *              ReadFormatException.
   * @exception InvalidRelationException
   *              Thrown if this Feature cannot contain the given Qualifier.
   * @return null if the stream is at the end of file when the method is called
   */
  protected static GFFStreamFeature readFromStream(LinePushBackReader stream)
      throws IOException, InvalidRelationException {
    final String line = stream.readLine();
    if (line == null)
      return null;

    try {
      return new GFFStreamFeature(line);
    } catch (ReadFormatException exception) {
      // re-throw the exception with the line number added
      final String new_error_string = exception.getMessage();
      throw new ReadFormatException(new_error_string, stream.getLineNumber());
    }
  }
  /**
   * Read the details of a feature from an EMBL stream into the current object.
   *
   * @param entry_information
   *          The EntryInformation object of the Entry that will contain the
   *          Feature.
   * @param in_stream
   *          the Feature is read from this stream
   * @exception IOException
   *              thrown if there is a problem reading the Feature - most likely
   *              ReadFormatException if the stream does not contain a GFF
   *              feature.
   */
  public void setFromStream(final EntryInformation entry_information,
      final LinePushBackReader in_stream) throws IOException,
      InvalidRelationException, ReadOnlyException {
    // GFF features are created via readFromStream(), not re-read in place
    throw new ReadOnlyException();
  }
  /**
   * Write this Feature to the given stream.
   *
   * @param writer
   *          The stream to write to.
   * @exception IOException
   *              thrown if there is an io problem while writing the Feature.
   */
  public void writeToStream(final Writer writer) throws IOException {
    final RangeVector ranges = getLocation().getRanges();
    final int ranges_size = ranges.size();

    for (int i = 0; i < ranges_size; ++i) {
      Range this_range = (Range) ranges.elementAt(i);

      String seqname = getGffSeqName();
      String source = getGffSource();
      Qualifier score = getQualifierByName("score");
      Qualifier group = getQualifierByName("group");

      // the source becomes a Dbxref in chado
      String source_str = null;
      if (getQualifierByName("Dbxref") != null)
        source_str = getDbxrefGFFSource(getQualifierByName("Dbxref"));

      int start = this_range.getStart();
      int end = this_range.getEnd();

      if (seqname == null
          && ((GFFDocumentEntry) getEntry()).getDocument() != null)
        seqname = ((GFFDocumentEntry) getEntry()).getDocument().getName();
      if (seqname == null)
        seqname = deriveSeqName(start);
      if (source == null)
        source = "artemis";
      if (score == null)
        score = new Qualifier("score", ".");

      if (seqname != null && contig_ranges != null
          && contig_ranges.containsKey(seqname)) {
        // adjust for coordinates in multi-sequence GFF
        Range offset_range = contig_ranges.get(seqname);
        start = start - offset_range.getStart() + 1;
        end = end - offset_range.getStart() + 1;
      }

      if (group == null || group.getValues() == null
          || group.getValues().elementAt(0).equals("")) {
        final Qualifier gene = getQualifierByName("gene");
        group = (gene == null) ? new Qualifier("group", "") : gene;
      }

      // convert codon_start (1/2/3) to GFF phase (0/1/2)
      String frame = ".";
      final Qualifier codon_start = getQualifierByName("codon_start");
      if (codon_start != null) {
        frame = (String) (codon_start.getValues()).elementAt(0);
        if (frame.equals("1"))
          frame = "0";
        else if (frame.equals("2"))
          frame = "1";
        else if (frame.equals("3"))
          frame = "2";
        else
          frame = ".";
      }
      // phase is REQUIRED for all CDS features
      if (getKey().equals("CDS") && frame.equals("."))
        frame = "0";

      final String myId = getSegmentID(this_range);
      String attribute_string = unParseAttributes(myId);

      if (source_str == null && source != null)
        source_str = source;

      final String translation = getTranslation();
      if (translation != null)
        attribute_string = attribute_string + ";" + translation;

      writer.write(seqname + "\t" + source_str + "\t" + getKey().getKeyString()
          + "\t" + start + "\t" + end + "\t" + score.getValues().elementAt(0)
          + "\t" + (getLocation().isComplement() ? "-\t" : "+\t") + frame
          + "\t" + attribute_string + "\n");
    }
  }
  /**
   * If the seqname is not set for this feature try to derive the
   * contig/chromosome it is located on
   */
  private String deriveSeqName(int start) {
    String seqname = null;
    if (contig_ranges != null) {
      final Enumeration<String> contigEnum = contig_ranges.keys();
      while (contigEnum.hasMoreElements()) {
        final String key = contigEnum.nextElement();
        final Range r = contig_ranges.get(key);
        if (r.getStart() <= start && r.getEnd() >= start)
          return key;
      }
    } else {
      try {
        seqname = ((GFFStreamFeature) (getEntry().getAllFeatures()
            .elementAt(0))).getGffSeqName();
      } catch (Exception e) {
        seqname = null;
      }
    }
    if (seqname == null)
      seqname = "gff_seqname";
    return seqname;
  }
  /**
   * Return a String containing the qualifiers of this feature in a form
   * suitable for using as the last field of a GFF line.
   */
  private String unParseAttributes(final String myId) {
    final QualifierVector qualifiers = getQualifiers();

    GFF3AttributeBuilder abuf = new GFF3AttributeBuilder();
    prepareProcessors(abuf);
    for (String attr : attrs_to_filter) {
      abuf.ignore(attr);
    }

    final int names_length = abuf.reserved_a.length;

    // add ID attribute
    if (myId != null) {
      abuf.add("ID", myId);
    }

    // build reserved attributes
    for (int i = 1; i < names_length; i++) {
      Qualifier this_qualifier = qualifiers.getQualifierByName(abuf.reserved_a[i]);
      if (this_qualifier == null)
        continue;
      abuf.add(this_qualifier.getName(), this_qualifier.getValues());
    }

    // build remaining attributes
    for (Qualifier this_qualifier : qualifiers) {
      // skip reserved names
      boolean reserved = false;
      for (int j = 0; j < names_length; j++)
        if (this_qualifier.getName().equals(abuf.reserved_a[j])) {
          reserved = true;
          break;
        }
      if (reserved)
        continue;

      // skip suppressed qualifiers
      if ( (this_qualifier.getName().equals("private") &&
            System.getProperty("noprivate") != null) ||
           (this_qualifier.getName().equals("history") &&
            System.getProperty("nohistory") != null) ||
           this_qualifier.getName().equals("codon_start"))
        continue;

      abuf.add(this_qualifier.getName(), this_qualifier.getValues());
    }

    return abuf.toString();
  }
private static String strJoin(String[] aArr, String sSep) {
StringBuilder sbStr = new StringBuilder();
for (int i = 0, il = aArr.length; i < il; i++) {
if (i > 0)
sbStr.append(sSep);
sbStr.append(aArr[i]);
}
return sbStr.toString();
}
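  // e.g. strJoin(new String[] { "a", "b", "c" }, ",") returns "a,b,c"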
  /**
   * Set up the aggregators and attribute mappings used when writing the
   * GFF3 attribute field.
   */
  void prepareProcessors(GFF3AttributeBuilder abuf) {
GFF3AttributeAggregator productProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
String this_value = GFF3Encoder.encode(values.elementAt(value_index));
if (value_index > 0 && value_index < (values.size())) {
buffer.append(",");
}
buffer.append(this_value);
}
}
return buffer.toString();
}
};
GFF3AttributeAggregator ecProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
final String this_value = "EC:"
+ GFF3Encoder.encode(values.elementAt(value_index));
if (value_index > 0 && value_index < (values.size())) {
buffer.append(",");
}
buffer.append(this_value);
}
}
return buffer.toString();
}
};
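    // Illustrative: an EC_number value "1.1.1.1" is emitted as "EC:1.1.1.1";
    // the mappings set up below route it into the Dbxref attribute.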
GFF3AttributeAggregator psysIDProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
final String this_value;
int index = values.elementAt(value_index).indexOf(";current=");
if (index > -1)
this_value = GFF3Encoder.encode(values.elementAt(value_index)
.substring(0, index));
else
this_value = GFF3Encoder.encode(values.elementAt(value_index));
if (value_index > 0 && value_index < (values.size())) {
buffer.append(",");
}
buffer.append(this_value);
}
}
return buffer.toString();
}
};
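    // Illustrative: a previous_systematic_id value such as
    // "SPAC99.01;current=true" (hypothetical) is trimmed to "SPAC99.01".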
GFF3AttributeAggregator classProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
final String this_value;
int index = values.elementAt(value_index).indexOf("::");
if (index > -1)
this_value = GFF3Encoder.encode(values.elementAt(value_index)
.substring(0, index));
else
this_value = GFF3Encoder.encode(values.elementAt(value_index));
if (value_index > 0 && value_index < (values.size())) {
buffer.append(",");
}
buffer.append(this_value);
}
}
return buffer.toString();
}
};
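    // Illustrative: a class value such as "xxx::yyy" (hypothetical) is
    // trimmed at the first "::" to "xxx".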
GFF3AttributeAggregator startEndRangeProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
if (value_index > 0 && value_index < (values.size())) {
buffer.append(",");
}
buffer.append(values.elementAt(value_index));
}
}
return buffer.toString();
}
};
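    // Start_range/End_range values are passed through unencoded and
    // comma-joined, e.g. the partial-range notation ".,100" (illustrative).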
GFF3AttributeAggregator goProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
int goindex = values.elementAt(value_index).indexOf("GOid=");
int termindex = values.elementAt(value_index).indexOf(";term=");
if (goindex > -1 && termindex > -1) {
buffer.append(GFF3Encoder.encode(values.elementAt(value_index)
.substring(goindex + 5, termindex)));
if (value_index < (values.size()) - 1)
buffer.append(",");
}
}
}
return buffer.toString();
}
};
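    // Illustrative: a full_GO value such as
    // "aspect=F;GOid=GO:0004672;term=protein kinase activity" contributes
    // "GO:0004672" to the Ontology_term attribute.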
GFF3AttributeAggregator ucProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
Set<String> set = new HashSet<String>();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
String regex = "eupathdb_uc[:=]\"?([a-zA-Z0-9]+)";
Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(values.elementAt(value_index));
while (matcher.find()) {
set.add(GFF3Encoder.encode(matcher.group(1)));
}
}
}
return strJoin(set.toArray(new String[set.size()]), ",");
}
};
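    // Illustrative: a history value containing eupathdb_uc:ABC123 or
    // eupathdb_uc="ABC123" yields "ABC123"; the Set removes duplicates.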
GFF3AttributeAggregator curcomProc = new GFF3AttributeAggregator() {
@Override
public String process(StringVector values) {
StringBuilder buffer = new StringBuilder();
if (values != null && values.size() > 0) {
for (int value_index = 0; value_index < values.size(); ++value_index) {
buffer.append(GFF3Encoder.encode(values.elementAt(value_index)));
if (value_index < (values.size()) - 1)
buffer.append(" ");
}
}
return buffer.toString();
}
};
// map GO -> full_GO
abuf.setMapping("GO", "full_GO");
abuf.setGlue("full_GO", ",");
// merge curation and comment
abuf.setMapping("curation", "comment");
abuf.setGlue("comment", " ");
abuf.setAggregator("comment", curcomProc);
// also put GOs in Ontology_term
abuf.setClone("full_GO", "Ontology_term");
abuf.setAggregator("Ontology_term", goProc);
abuf.setGlue("Ontology_term", ",");
// also put EuPathDB UC numbers into separate attribute
abuf.setClone("history", "eupathdb_uc");
abuf.setAggregator("eupathdb_uc", ucProc);
abuf.setGlue("eupathdb_uc", ",");
// class
abuf.setAggregator("class", classProc);
// EC numbers go into Dbxref
abuf.setMapping("EC_number", "Dbxref");
abuf.setAggregator("EC_number", ecProc);
abuf.setGlue("Dbxref", ",");
// start/end ranges
abuf.setAggregator("Start_range", startEndRangeProc);
abuf.setAggregator("End_range", startEndRangeProc);
// previous_systematic_id
abuf.setAggregator("previous_systematic_id", psysIDProc);
// product
abuf.setAggregator("product", productProc);
}
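  // Illustrative end-to-end flow: a /GO qualifier is written out as full_GO,
  // and a copy is reduced by goProc to its GOid (e.g. "GO:0004672") in
  // Ontology_term; curation is merged into comment, joined with spaces.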
  /**
   * Get the translation qualifier string for polypeptide features.
   */
  private String getTranslation() {
    if (!getKey().getKeyString().equals("polypeptide"))
      return null;

    if (chadoGene != null) {
      if (getUserData() == null)
        new uk.ac.sanger.artemis.Feature(this);
      // the above line constructs the appropriate userData within this current
      // GFFStreamFeature object, which is required by the following
      // GeneUtils.deriveResidues()
      String residues = GeneUtils.deriveResidues(this);
      if (residues != null)
        return "translation=" + residues;
    }
    return null;
  }
  /**
   * Parse the given String as ACeDB format attributes. Adapted from code by
* Matthew Pocock for the BioJava project.
* @return Return a Hashtable. Each key is an attribute name and each value of
* the Hashtable is a StringVector containing the attribute values. If