Newer
Older
/* GFFStreamFeature.java
*
* created: Tue Sep 14 1999
*
* This file is part of Artemis
*
* Copyright (C) 1999,2000,2001 Genome Research Limited
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/GFFStreamFeature.java,v 1.72 2009-08-28 10:33:12 tjc Exp $
*/
package uk.ac.sanger.artemis.io;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import uk.ac.sanger.artemis.Options;
import uk.ac.sanger.artemis.components.genebuilder.GeneUtils;
tjc
committed
import uk.ac.sanger.artemis.components.genebuilder.ProteinMapPanel;
import uk.ac.sanger.artemis.components.genebuilder.ortholog.MatchPanel;
import uk.ac.sanger.artemis.util.LinePushBackReader;
import uk.ac.sanger.artemis.util.OutOfRangeException;
import uk.ac.sanger.artemis.util.ReadOnlyException;
import uk.ac.sanger.artemis.util.StringVector;
/**
* A StreamFeature that thinks it is a GFF feature.
* @author Kim Rutherford
**/
public class GFFStreamFeature extends SimpleDocumentFeature
implements DocumentFeature, StreamFeature, ComparableFeature
private static org.apache.log4j.Logger logger4j =
/** store for spliced features containing id and range of each segment */
private Hashtable<String, Range> id_range_store;
/** store a record of the new and old uniquenames that have been changed */
private Hashtable<String, String> newIdMapToOldId;
/** combined feature_relationship.rank store for exons */
private Hashtable<String, Integer> feature_relationship_rank_store;
/** first tabbed parameter */
private String gffSeqName;
/** second tabbed parameter */
private String gffSource;
/** duplication count */
private short duplicate = 0;
private boolean lazyLoaded = false;
private org.gmod.schema.sequence.Feature chadoLazyFeature;
private static String MAP_DECODE[][] = {
{ " ", "%20" }, // white space
{ ",", "%2C" }, // comma
{ ";", "%3B" }, // semi-colon
{ "=", "%3D" }, // equals
{ "\t", "%09" }, // tab
{ " ", "+" }, // white space
{ "+", "%2B" },
{ "(", "%28" }, // left bracket
{ ")", "%29" }, // right bracket
{ "'", "\"" }
private static String MAP_ENCODE[][] = {
// { " ", "%20" }, // white space
{ ";", "%3B" }, // semi-colon
{ "=", "%3D" }, // equals
{ "\t", "%09" }, // tab
{ "+", "%2B" },
{ " ", "+" }, // white space
{ "(", "%28" }, // left bracket
{ ")", "%29" }, // right bracket
/**
* Create a new GFFStreamFeature object. The feature should be added
* @param key The new feature key
* @param location The Location object for the new feature
* @param qualifiers The qualifiers for the new feature
**/
public GFFStreamFeature(final Key key, final Location location,
final QualifierVector qualifiers)
{
setKey(key);
setLocation(location);
setQualifiers(qualifiers);
{
String idStr = null;
StringVector v = Options.getOptions().getSystematicQualifierNames();
for(int i=0; i<v.size(); i++)
{
final String sysName = (String)v.get(i);
if(getQualifierByName(sysName) != null)
{
idStr = (String)getQualifierByName(sysName).getValues().get(0);
break;
}
}
// autogenerate ID
if(idStr == null)
idStr = key.getKeyString()+":"+location.toString();
setQualifier(new Qualifier("ID", idStr));
}
}
catch(EntryInformationException e)
// this should never happen because the feature will not be in an Entry
throw new Error("internal error - unexpected exception: " + e);
}
// this should never happen because the feature will not be in an Entry
throw new Error("internal error - unexpected exception: " + e);
}
catch(OutOfRangeException e)
// this should never happen because the feature will not be in an Entry
throw new Error("internal error - unexpected exception: " + e);
/**
 * Create a new GFFStreamFeature from the given feature. This simply
 * delegates to the two-argument constructor with isDuplicatedInChado
 * set to false.
 * @param feature The feature handed on to the two-argument constructor.
 **/
public GFFStreamFeature(final Feature feature)
{
  this(feature, false);
}
/**
* Create a new GFFStreamFeature with the same key, location and
* qualifiers as the given feature. The feature should be added to an
* Entry.
**/
public GFFStreamFeature(final Feature feature, final boolean isDuplicatedInChado)
{
this(feature.getKey(), feature.getLocation(), feature.getQualifiers());
if(feature instanceof GFFStreamFeature)
{
if(((GFFStreamFeature)feature).id_range_store != null)
(Hashtable)(((GFFStreamFeature)feature).id_range_store).clone();
if(((GFFStreamFeature)feature).feature_relationship_rank_store != null)
this.feature_relationship_rank_store =
(Hashtable)(((GFFStreamFeature)feature).feature_relationship_rank_store).clone();
this.setGffSeqName(((GFFStreamFeature)feature).getGffSeqName());
this.setGffSource(((GFFStreamFeature)feature).getGffSource());
if(isDuplicatedInChado)
{
try
{
final String uniquename;
if(feature instanceof GFFStreamFeature)
{
((GFFStreamFeature)feature).duplicate++;
duplicatePrefix = "DUP"+Short.toString(((GFFStreamFeature)feature).duplicate)+"-";
}
else
duplicatePrefix = "DUP";
final Hashtable<String, Range> new_id_range_store = new Hashtable<String, Range>(id_range_store.size());
final Enumeration<String> enumIdRangeStore = id_range_store.keys();
while(enumIdRangeStore.hasMoreElements())
{
final String keyId = enumIdRangeStore.nextElement();
final Range range = id_range_store.get(keyId);
new_id_range_store.put(duplicatePrefix+keyId, range);
}
id_range_store.clear();
this.id_range_store = (Hashtable) new_id_range_store.clone();
if(getLocation().getRanges().size() > 1)
uniquename = getSegmentID(getLocation().getRanges());
else
{
if( ((String)getQualifierByName("ID").getValues().get(0)).endsWith("}") )
uniquename = id_range_store.keys().nextElement();
else
uniquename = duplicatePrefix+ (String)getQualifierByName("ID").getValues().get(0);
}
uniquename = duplicatePrefix+ (String)getQualifierByName("ID").getValues().get(0);
if(getQualifierByName("Parent") != null)
{
final String parent =
(String) getQualifierByName("Parent").getValues().get(0);
setQualifier(new Qualifier("Parent", duplicatePrefix+parent));
if(getQualifierByName("Derives_from") != null)
{
final String derives_from =
(String) getQualifierByName("Derives_from").getValues().get(0);
setQualifier(new Qualifier("Derives_from", duplicatePrefix+derives_from));
tjc
committed
// remove qualifiers that don't get transferred to duplicate
final String removeQualifierNames[] =
{ "feature_id",
"timelastmodified",
"feature_relationship_rank",
tjc
committed
ProteinMapPanel.POLYPEPTIDE_DOMAIN,
ProteinMapPanel.TMHMM[0],
ProteinMapPanel.TMHMM[1],
ProteinMapPanel.TMHMM[2],
ProteinMapPanel.TMHMM[3],
MatchPanel.ORTHOLOG,
MatchPanel.ORTHOLOG
};
tjc
committed
for(int i=0;i<removeQualifierNames.length; i++)
removeQualifierByName(removeQualifierNames[i]);
}
catch(ReadOnlyException e){}
catch(EntryInformationException e){}
chadoGene = ((GFFStreamFeature)feature).chadoGene;
}
/**
* Create a new GFFStreamFeature from the given line. The String should be
* in gene finder format.
**/
final StringVector line_bits = StringVector.getStrings(line, "\t", true);
throw new ReadFormatException("invalid GFF line: 8 fields needed " +
"(got " + line_bits.size () +
" fields) from: " + line);
final String start_base_str = line_bits.elementAt(3).trim();
final String end_base_str = line_bits.elementAt(4).trim();
start_base = Integer.parseInt(start_base_str);
end_base = Integer.parseInt(end_base_str);
throw new ReadFormatException("Could not understand the start or end base " +
"of a GFF feature: " + start_base_str +
if(line_bits.elementAt(6).equals("+"))
final Hashtable<String, StringVector> attributes = parseAttributes(rest_of_line);
for(final Enumeration<String> attribute_enum = attributes.keys();
String name = attribute_enum.nextElement();
final StringVector values = attributes.get(name);
List<ClusterLazyQualifierValue> lazyValues = new Vector<ClusterLazyQualifierValue>();
new ClusterLazyQualifierValue( (String)values.get(i), name,
this ));
setQualifier(new QualifierLazyLoading(name, lazyValues));
}
{
if(values.size() == 0)
setQualifier(new Qualifier(name));
else
setQualifier(new Qualifier(name, values));
}
if( !line_bits.elementAt(0).equals("null") )
setGffSeqName( decode(line_bits.elementAt(0)) );
setKey(new Key(line_bits.elementAt(2)));
setGffSource(line_bits.elementAt(1));
{
final Qualifier score_qualifier =
setQualifier(score_qualifier);
}
throw new ReadFormatException("start position is greater than end " +
"position: " + start_base + " > " +
if(start_base < 0)
throw new ReadFormatException("start position must be positive: " +
final Range location_range = new Range(start_base, end_base);
final RangeVector location_ranges = new RangeVector(location_range);
setLocation(new Location(location_ranges, complement_flag));
}
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(EntryInformationException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(OutOfRangeException e)
{
throw new Error("internal error - unexpected exception: " + e);
/**
*
* Store for spliced regions of segments ID's and ranges.
*
*/
public void setSegmentRangeStore(Hashtable<String, Range> id_range_store)
public Hashtable<String, Range> getSegmentRangeStore()
if(id_range_store == null)
{
id_range_store = new Hashtable<String, Range>();
id_range_store.put((String)this.getQualifierByName("ID").getValues().get(0),
this.getLocation().getTotalRange());
}
/**
 * Get the record of the new and old uniquenames that have been changed.
 * @return the table currently held by this feature (the stored
 *         reference itself, not a copy)
 */
public Hashtable<String, String> getNewIdMapToOldId()
{
  return this.newIdMapToOldId;
}
/**
 * Replace the record of new and old uniquenames. Used when changing
 * spliced feature uniquenames.
 * @param newIdMapToOldId the table to hold; it is stored by reference
 */
public void setNewIdMapToOldId(final Hashtable<String, String> newIdMapToOldId)
{
  this.newIdMapToOldId = newIdMapToOldId;
}
* Store for ID's and CHADO feature_relationship.rank
* @param feature_relationship_rank_store
public void setFeature_relationship_rank_store(
Hashtable<String, Integer> feature_relationship_rank_store)
{
this.feature_relationship_rank_store = feature_relationship_rank_store;
}
/**
 * Get the store for ID's and CHADO feature_relationship.rank.
 * @return the table currently held by this feature (the stored
 *         reference itself, not a copy)
 */
public Hashtable<String, Integer> getFeature_relationship_rank_store()
{
  return this.feature_relationship_rank_store;
}
int offset = 0;
if(getGffSeqName() != null && contig_ranges != null &&
contig_ranges.containsKey(getGffSeqName()))
{
// adjust for coordinates in multi-sequence GFF
Range offset_range = contig_ranges.get(getGffSeqName());
offset = offset_range.getStart()-1;
}
Enumeration<String> enum_ranges = id_range_store.keys();
String key = enum_ranges.nextElement();
Range range = id_range_store.get(key);
if(range.getStart() == r.getStart()-offset &&
range.getEnd() == r.getEnd()-offset)
else if (getQualifierByName("ID") != null)
{
return (String)getQualifierByName("ID").getValues().get(0);
}
/**
* Get the feature ID based on the segments' chado
* uniquenames.
* @param rv
* @return
*/
{
String id = "";
if(id_range_store != null)
{
String id_new;
Range range;
for(int i=0; i<rv.size(); i++)
{
range = (Range)rv.get(i);
id_new = getSegmentID(range);
String prefix[] = getPrefix(id_new, ':');
if(prefix[0] != null)
{
index = id.indexOf(prefix[0]);
if(id.equals("") || index < 0)
{
if(!id.equals(""))
id = id +",";
id = id+prefix[0] + "{" + prefix[1] + "}";
continue;
}
id = id.substring(0,index) + "," +
prefix[1] + id.substring(index);
}
else if(id_new != null)
{
if(!id.equals(""))
id = id +",";
id = id+id_new;
}
}
}
/**
 * Split an ID at the last occurrence of the separator into a prefix and
 * an index, e.g. for SPAC1556.06.1:exon:2 with ':' as the separator this
 * gives SPAC1556.06.1:exon as the prefix and 2 as the index.
 * @param id the ID to split; may be null
 * @param separator the character separating the prefix from the index
 * @return a two element array with the prefix at [0] and the index at
 *         [1]; both elements are null when id is null or does not
 *         contain the separator
 */
public String[] getPrefix(final String id,
                          final char separator)
{
  final String prefix[] = new String[2];

  // Callers in this class pass in a segment ID and test it (and
  // prefix[0]) for null only after this call, so a null id has to give
  // the "no prefix" result rather than a NullPointerException.
  if(id == null)
    return prefix;

  final int index = id.lastIndexOf(separator);
  if(index > -1)
  {
    prefix[0] = id.substring(0,index);
    prefix[1] = id.substring(index+1);
  }
  return prefix;
}
/**
 * Used to automatically generate the number for a new ID: counting up
 * from 1, find the first number for which prefix + separator + number
 * is not already a key in the segment ID/range store.
 * @param prefix the ID prefix
 * @param separator the character placed between the prefix and the number
 * @return the first number not yet in use; 1 when there is no store
 */
public int getAutoNumber(final String prefix,
                         final char separator)
{
  int auto = 1;

  // The store field has no initializer and is null-checked elsewhere in
  // this class; with no stored IDs nothing can clash with number 1.
  if(id_range_store == null)
    return auto;

  while(id_range_store.containsKey(prefix + separator + auto))
    auto++;
  return auto;
}
/**
* Remove URL escaping rule (e.g. space="%20" or "+")
*/
enc = MAP_DECODE[i][1];
dec = MAP_DECODE[i][0];
while( (ind = s.indexOf(enc)) > -1)
s = s.substring(0,ind) + dec + s.substring(ind+enc.length());
}
/**
* Add URL escaping rule (e.g. space="%20" or "+")
*/
enc = MAP_ENCODE[i][1];
dec = MAP_ENCODE[i][0];
while( (ind = s.indexOf(dec)) > -1 )
s = s.substring(0,ind) + enc + s.substring(ind+1);
}
/**
* Return the reference of a new copy of this Feature.
**/
{
final Feature return_value = new GFFStreamFeature(this);
return return_value;
}
/**
* Read and return a GFFStreamFeature from a stream. A feature must be the
* next thing in the stream.
* @param stream the Feature is read from this stream
* @exception IOException thrown if there is a problem reading the Feature -
* most likely ReadFormatException.
* @exception InvalidRelationException Thrown if this Feature cannot contain
* the given Qualifier.
* @return null if in_stream is at the end of file when the method is
* called
*/
protected static GFFStreamFeature readFromStream(LinePushBackReader stream)
throws IOException, InvalidRelationException
}
catch(ReadFormatException exception)
final String new_error_string = exception.getMessage();
throw new ReadFormatException(new_error_string,
stream.getLineNumber());
}
}
/**
* Read the details of a feature from an EMBL stream into the current
* object.
* @param entry_information The EntryInformation object of the Entry that
* will contain the Feature.
* @param in_stream the Feature is read from this stream
* @exception IOException thrown if there is a problem reading the Feature -
* most likely ReadFormatException if the stream does not contain GFF
* feature.
**/
public void setFromStream(final EntryInformation entry_information,
final LinePushBackReader in_stream)
throws IOException, InvalidRelationException, ReadOnlyException
}
/**
* Write this Feature to the given stream.
* @param writer The stream to write to.
* @exception IOException thrown if there is an io problem while writing
* the Feature.
**/
public void writeToStream(final Writer writer)
final RangeVector ranges = getLocation().getRanges();
final int ranges_size = ranges.size();
// final Hashtable contig_ranges = SimpleDocumentEntry.getContigRanges();
for(int i = 0; i < ranges_size; ++i)
Range this_range = (Range)ranges.elementAt(i);
String seqname = getGffSeqName();
String source = getGffSource();
Qualifier score = getQualifierByName("score");
Qualifier group = getQualifierByName("group");
String source_str = null;
if(getQualifierByName("Dbxref") != null)
{
source_str = getDbxrefGFFSource(getQualifierByName("Dbxref"));
}
int start = this_range.getStart();
int end = this_range.getEnd();
if(seqname == null && ((GFFDocumentEntry)getEntry()).getDocument() != null)
seqname = ((GFFDocumentEntry)getEntry()).getDocument().getName();
contig_ranges.containsKey(seqname))
start = start-offset_range.getStart()+1;
end = end-offset_range.getStart()+1;
}
if(group == null || group.getValues() == null ||
group.getValues().elementAt(0).equals(""))
{
final Qualifier gene = getQualifierByName("gene");
String frame = ".";
frame = "0";
else if(frame.equals("2"))
frame = "1";
else if(frame.equals("3"))
frame = "2";
else
frame = ".";
// phase is REQUIRED for all CDS features
if(getKey().equals("CDS") && frame.equals("."))
frame = "0";
String attribute_string = unParseAttributes(myId);
source_str = source;
if(translation != null)
attribute_string = attribute_string + ";" + translation;
writer.write(seqname + "\t" +
score.getValues() .elementAt(0)+ "\t" +
(getLocation().isComplement() ? "-\t" : "+\t") +
frame + "\t" +
attribute_string + "\n");
}
/**
* If the seqname is not set for this feature, try to derive the contig/chromosome
* it is located on.
* @param start
* @return
*/
String seqname = null;
if(contig_ranges != null)
final Enumeration<String> contigEnum = contig_ranges.keys();
while(contigEnum.hasMoreElements())
final String key = contigEnum.nextElement();
final Range r = contig_ranges.get(key);
if(r.getStart() > start)
continue;
if(r.getEnd() > start)
return key;
else
{
try
{
seqname = ((GFFStreamFeature)(getEntry().getAllFeatures().elementAt(0))).getGffSeqName();
}
catch(Exception e) {}
}
if(seqname == null)
seqname = "gff_seqname";
return seqname;
/**
* Return a String containing the qualifiers of this feature in a form
* suitable for using as the last field of a GFF line. The codon_start
* attribute is not included since GFF has a frame field. gff_seqname,
* gff_source and score aren't included since they have corresponding
* fields.
**/
private String unParseAttributes(final String myId)
{
final StringBuffer buffer = new StringBuffer();
final QualifierVector qualifiers = getQualifiers();
"Target", "Gap", "Note",
"Dbxref", "Ontology_term",
if(myId != null)
{
buffer.append("ID=");
buffer.append(encode(myId));
count++;
}
final String this_qualifier_str = getQualifierString(this_qualifier, true);
if(this_qualifier_str == null)
continue;
if(count != 0)
buffer.append(";");
buffer.append(this_qualifier_str);
count++;
}
for(Qualifier this_qualifier: qualifiers)
if(this_qualifier.getName().equals(names[j]))
if( (this_qualifier.getName().equals("private") && System.getProperty("noprivate") != null) ||
(this_qualifier.getName().equals("history") && System.getProperty("nohistory") != null) )
continue;
final String this_qualifier_str = getQualifierString(this_qualifier, false);
if(this_qualifier_str == null)
continue;
buffer.append(this_qualifier_str);
}
return buffer.toString();
}
/**
* Get the translation qualifier string for polypeptide features.
*/
if (! getKey().getKeyString().equals("polypeptide"))
return null;
if (chadoGene != null)
{
if(getUserData() == null)
// the above line constructs the appropriate userData within this current GFFStreamFeature object,
// which is required by the following GeneUtils.deriveResidues()
String residues = GeneUtils.deriveResidues(this);
if (residues != null)
return "translation="+residues;
/**
* Used to write out the GFF attributes.
* @param q the qualifier to represent as a <code>String</code>
* @param reserved indicate if this is one of the reserved tags or not
* @return the <code>String</code> representation
*/
private String getQualifierString(Qualifier q, boolean reserved )
{
StringBuffer buffer = new StringBuffer();
final String name = q.getName();
if(name.equals("codon_start") || name.equals("gff_source") ||
name.equals("gff_seqname") || name.equals("score"))
return null;
/* ignore qualifiers with just one empty value, will mess up GFF3 output */
if(values != null && values.size() == 1)
{
if (values.elementAt(0).replaceAll("\\s+","").equals(""))
return null;
}
/*
* GSV :
* The Bio::FeatureIO perl module falls over if there are Uppercased
* attribute names for tags which aren't part of the standard reserved
* set. So we lowercase these, since in the specification it says :
* "All attributes that begin with an uppercase letter are reserved for
* later use. Attributes that begin with a lowercase letter can be used
* freely by applications."
* see http://www.sequenceontology.org/gff3.shtml
*/
String nameToBuffer = encode(name);
if (! reserved)
nameToBuffer = Character.toLowerCase(nameToBuffer.charAt(0)) + nameToBuffer.substring(1);
buffer.append(nameToBuffer);
if(values != null && values.size() > 0)
for(int value_index = 0; value_index < values.size();
++value_index)
if(value_index>0)
buffer.append("%2C");
if(name.equals("Parent"))
buffer.append(this_value);
else