/* GFFDocumentEntry.java
*
* created: Tue Sep 14 1999
*
* This file is part of Artemis
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/GFFDocumentEntry.java,v 1.46 2007-07-23 10:34:35 tjc Exp $
*/
package uk.ac.sanger.artemis.io;
import uk.ac.sanger.artemis.chado.SimilarityLazyQualifierValue;
import uk.ac.sanger.artemis.components.Splash;
import uk.ac.sanger.artemis.util.*;
import java.io.IOException;
import java.sql.Timestamp;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
/**
* A DocumentEntry that can read a GFF entry from a Document.
*
* @author Kim Rutherford
* @version $Id: GFFDocumentEntry.java,v 1.46 2007-07-23 10:34:35 tjc Exp $
**/
public class GFFDocumentEntry extends SimpleDocumentEntry
implements DocumentEntry
{
private boolean finished_constructor = false;
private boolean isReadOnly = false;
/**
* Create a new GFFDocumentEntry object associated with the given
* Document.
* @param document This is the file that we will read from. This is also
* used for saving the entry back to the file it came from and to give
* the new object a name.
* @param listener The object that will listen for ReadEvents.
* @exception IOException thrown if there is a problem reading the entry -
* most likely ReadFormatException.
**/
GFFDocumentEntry(final Document document, final ReadListener listener)
throws IOException, EntryInformationException
{
super(new GFFEntryInformation(), document, listener);
finished_constructor = true;
}
/**
* Create a new GFFDocumentEntry that will be a copy of the given Entry and
* has no Document associated with it. The new GFFDocumentEntry cannot be
* saved to a file with save() unless save(Document) has been called
* first. Some qualifier and location information will be lost.
* @param force If true then invalid qualifiers and any features with
* invalid keys in the new Entry will be quietly thrown away. "Invalid"
* means that the key/qualifier is not allowed to occur in an Entry of
* this type (probably determined by the EntryInformation object of this
* Entry). If false an EntryInformationException will be thrown for
* invalid keys or qualifiers.
**/
public GFFDocumentEntry(final Entry new_entry, final boolean force)
throws EntryInformationException
{
super(new GFFEntryInformation(), new_entry, force);
finished_constructor = true;
}
/**
* Create a new empty GFFDocumentEntry object that has no Document
* associated with it. The new GFFDocumentEntry cannot be saved to a
* file with save() unless save(Document) has been called first. The
* save(Document) method will assign a Document.
**/
public GFFDocumentEntry(final EntryInformation entry_information)
{
super(new GFFEntryInformation());
finished_constructor = true;
}
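// Example usage (illustrative sketch only; assumes a GFF file on disk, read
// through the uk.ac.sanger.artemis.util.FileDocument wrapper):
//
//   Document doc = new FileDocument(new java.io.File("example.gff"));
//   GFFDocumentEntry entry = new GFFDocumentEntry(doc, null);
//   FeatureVector features = entry.getAllFeatures();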
/**
* Return true if and only if this entry is read only.
**/
public boolean isReadOnly()
{
return isReadOnly;
}
public void setReadOnly(final boolean isReadOnly)
{
this.isReadOnly = isReadOnly;
}
/**
* If the given feature can be added directly to this Entry, then return
* it, otherwise create and return a new feature of the appropriate type.
* @param copy if true then always make a new copy of the Feature.
**/
protected SimpleDocumentFeature makeNativeFeature(final Feature feature,
final boolean copy)
{
if(!copy && feature instanceof GFFStreamFeature)
return (GFFStreamFeature)feature;
else
return new GFFStreamFeature(feature);
}
/**
* If the given Sequence can be added directly to this Entry, then return a
* copy of it, otherwise create and return a new sequence of the appropriate
* type for this Entry.
**/
protected StreamSequence makeNativeSequence(final Sequence sequence)
{
return new FastaStreamSequence(sequence);
}
/**
* Join the gene, transcript, exon and protein features of this entry into
* ChadoCanonicalGene objects, using the GFF3 Parent and Derives_from
* qualifiers.
**/
private void combineGeneFeatures()
{
final FeatureVector original_features = getAllFeatures();
Feature this_feature;
Hashtable chado_gene = new Hashtable();
// maps each transcript ID to its ChadoCanonicalGene, used below when
// attaching exons, UTRs and proteins to their transcript
Hashtable transcripts_lookup = new Hashtable();
try
{
// find the genes
for(int i = 0 ; i < original_features.size() ; ++i)
{
this_feature = original_features.featureAt(i);
String key = this_feature.getKey().getKeyString();
if(key.equals("gene") || key.equals("pseudogene"))
{
String id = (String)this_feature.getQualifierByName("ID").getValues().get(0);
ChadoCanonicalGene gene = new ChadoCanonicalGene();
gene.setGene(this_feature);
chado_gene.put(id, gene);
((GFFStreamFeature)this_feature).setChadoGene(gene);
}
}
for(int i = 0 ; i < original_features.size() ; ++i)
{
this_feature = original_features.featureAt(i);
// transcript
Qualifier parent_qualifier = this_feature.getQualifierByName("Parent");
if(parent_qualifier == null)
continue;
StringVector parents = parent_qualifier.getValues();
for(int j=0; j<parents.size(); j++)
{
String parent = (String)parents.get(j);
if(chado_gene.containsKey(parent))
{
// store transcript
ChadoCanonicalGene gene = (ChadoCanonicalGene)chado_gene.get(parent);
gene.addTranscript(this_feature);
// store the transcript ID with its ChadoCanonicalGene object
transcripts_lookup.put((String)this_feature.getQualifierByName("ID").getValues().get(0),
gene);
}
}
}
// find the exons, UTRs and proteins
String key;
for(int i = 0 ; i < original_features.size() ; ++i)
{
this_feature = original_features.featureAt(i);
key = this_feature.getKey().getKeyString();
// exons
//if(!key.equals("exon") && !key.equals("polypeptide") &&
// !key.endsWith("prime_UTR"))
// continue;
Qualifier parent_qualifier  = this_feature.getQualifierByName("Parent");
Qualifier derives_qualifier = this_feature.getQualifierByName("Derives_from");
if(parent_qualifier == null && derives_qualifier == null)
continue;
Qualifier featureRelationship =
this_feature.getQualifierByName("feature_relationship_rank");
// compare this features parent_id's to transcript id's in the
// chado gene hash to decide if it is part of it
final StringVector parent_id;
if(parent_qualifier != null)
parent_id = parent_qualifier.getValues();
else
parent_id = derives_qualifier.getValues();
for(int j=0; j<parent_id.size(); j++)
{
String parent = (String)parent_id.get(j);
if(transcripts_lookup.containsKey(parent))
{
ChadoCanonicalGene gene = (ChadoCanonicalGene)transcripts_lookup.get(parent);
if(parent_qualifier == null)
gene.addProtein(parent, this_feature);
else if(key.equals("three_prime_UTR"))
gene.add3PrimeUtr(parent, this_feature);
else if(key.equals("five_prime_UTR"))
gene.add5PrimeUtr(parent, this_feature);
else if(key.equals("exon") || featureRelationship != null ||
key.equals("pseudogenic_exon"))
else
gene.addOtherFeatures(parent, this_feature);
Enumeration enum_genes = chado_gene.elements();
while(enum_genes.hasMoreElements())
{
ChadoCanonicalGene gene = (ChadoCanonicalGene)enum_genes.nextElement();
combineChadoExons(gene);
}
if(getDocument() instanceof DatabaseDocument)
loadSimilarityLazyData(original_features);
}
catch(InvalidRelationException e)
{
e.printStackTrace();
}
}
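// combineGeneFeatures() expects the canonical chado/GFF3 gene model, e.g.
// (illustrative GFF3 fragment, not from any real data set):
//
//   chr1  source  gene         1000  9000  .  +  .  ID=gene01
//   chr1  source  mRNA         1000  9000  .  +  .  ID=mRNA01;Parent=gene01
//   chr1  source  exon         1000  1500  .  +  .  Parent=mRNA01
//   chr1  source  exon         3000  3902  .  +  .  Parent=mRNA01
//   chr1  source  polypeptide  1000  9000  .  +  .  ID=pep01;Derives_from=mRNA01
//
// The gene row seeds a ChadoCanonicalGene, the mRNA row is registered as a
// transcript via its Parent qualifier, and the exon/polypeptide rows are
// attached to that transcript through Parent/Derives_from.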
/**
* Bulk load the 'similarity' qualifier values for the given features.
* @param fv the features to load similarity matches for
* @throws InvalidRelationException
*/
private void loadSimilarityLazyData(final FeatureVector fv)
throws InvalidRelationException
{
DatabaseDocument doc = (DatabaseDocument)getDocument();
List matches;
if(fv.size() < 30 && fv.size() > 0) // if just a few features to look up e.g. for gene editorr
{
List featureIds = new Vector(fv.size());
for(int i=0;i<fv.size(); i++)
{
Qualifier featureIdQualifier = fv.featureAt(i).getQualifierByName("feature_id");
featureIds.add( (String)featureIdQualifier.getValues().get(0) );
}
matches = doc.getSimilarityMatches(featureIds);
}
else
matches = doc.getSimilarityMatches(null);
Hashtable temp_lookup_hash = new Hashtable(matches.size()/2);
String f_id;
for(int i=0; i<fv.size(); i++)
{
Feature f = fv.featureAt(i);
Qualifier qualifier = f.getQualifierByName("feature_id");
if(qualifier != null)
{
f_id = (String)qualifier.getValues().get(0);
temp_lookup_hash.put(f_id, f);
}
}
for(int i=0; i<matches.size(); i++)
{
org.gmod.schema.sequence.Feature matchFeature =
(org.gmod.schema.sequence.Feature)matches.get(i);
java.util.Collection featureLocs = matchFeature.getFeatureLocsForFeatureId();
java.util.Iterator it = featureLocs.iterator();
while(it.hasNext())
{
org.gmod.schema.sequence.FeatureLoc featureLoc =
(org.gmod.schema.sequence.FeatureLoc)it.next();
Feature queryFeature =
(Feature)temp_lookup_hash.get(Integer.toString(featureLoc.getSrcFeatureId()));
if(queryFeature != null)
{
Qualifier qualifier = queryFeature.getQualifierByName("similarity");
SimilarityLazyQualifierValue sim = new SimilarityLazyQualifierValue(matchFeature, featureLoc.getSrcFeatureId());
if(qualifier == null)
qualifier = new QualifierLazyLoading("similarity", sim);
else
((QualifierLazyLoading)qualifier).addValue(sim);
try
{
queryFeature.setQualifier(qualifier);
}
catch(ReadOnlyException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
catch(EntryInformationException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
break;
}
}
}
temp_lookup_hash.clear();
}
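// Note on loadSimilarityLazyData(): for fewer than 30 features the matches
// are fetched only for those feature_ids (e.g. when the gene editor opens a
// handful of features); otherwise all similarity matches for the entry are
// fetched in one query and filtered through temp_lookup_hash.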
/**
* Join the separate exons into one feature (if appropriate).
**/
/*private void combineFeatures()
{
final FeatureVector original_features = getAllFeatures();
// the key of these hashes will be the group name and the value is a
// FeatureVector containing the feature that are in that group
final Hashtable forward_feature_groups = new Hashtable();
final Hashtable reverse_feature_groups = new Hashtable();
for(int i = 0 ; i < original_features.size() ; ++i)
{
if(key.equals("CDS") || key.equals("polypeptide_domain") ||
key.equals("polypeptide") || key.equals("exon"))
if(this_feature.getQualifierByName("ID") != null &&
!key.equals("exon"))
{
values =
this_feature.getQualifierByName("ID").getValues();
group_name = group_name+values.elementAt(0);
}
final FeatureVector other_features =
(FeatureVector) this_strand_feature_groups.get(group_name);
if(other_features == null)
final FeatureVector new_feature_vector = new FeatureVector();
new_feature_vector.add(this_feature);
this_strand_feature_groups.put(group_name, new_feature_vector);
}
}
catch(InvalidRelationException e)
{
throw new Error("internal error - unexpected exception: " + e);
combineFeaturesFromHash(forward_feature_groups);
combineFeaturesFromHash(reverse_feature_groups);
}*/
/**
* Combine the features (which are exons) and delete the originals from this
* Entry. The key of this hash will be the group name and the value is a
* FeatureVector containing the feature that are in that group. Groups
* that have more than one member will be combined.
**/
public void combineChadoExons(ChadoCanonicalGene gene)
{
final List transcripts = gene.getTranscripts();
for(int i = 0; i < transcripts.size(); i++)
{
GFFStreamFeature transcript = (GFFStreamFeature)transcripts.get(i);
String transcript_id =
(String)(transcript.getQualifierByName("ID").getValues().get(0));
Set splicedSiteTypes = gene.getSpliceTypes(transcript_id);
if(splicedSiteTypes == null)
continue;
Iterator it = splicedSiteTypes.iterator();
Vector new_set = new Vector();
while(it.hasNext())
{
String type = (String)it.next();
List splicedSites = gene.getSpliceSitesOfTranscript(transcript_id, type);
if(splicedSites == null)
continue;
mergeFeatures(splicedSites, new_set,
(String)(transcript.getQualifierByName("ID").getValues().get(0)));
}
// now add the merged features back to this transcript of the
// ChadoCanonicalGene; the original loop here was truncated, so this is a
// minimal reconstruction that assumes the
// ChadoCanonicalGene.addSplicedFeatures(String, Feature[, boolean reset]) API
for(int j = 0; j < new_set.size(); j++)
{
if(j == 0)
gene.addSplicedFeatures(transcript_id, (Feature)new_set.get(j), true);
else
gene.addSplicedFeatures(transcript_id, (Feature)new_set.get(j));
}
}
}
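// For example, two exon features of one transcript with ranges 100..200 and
// 300..400 are merged by mergeFeatures() below into a single GFFStreamFeature
// whose location is join(100..200,300..400); the original ID of each exon is
// remembered in the segment range store of the merged feature.
/**
* Merge the given features into a single feature with a multi-range
* location, removing the originals from this Entry.
* @param gffFeatures the features (e.g. exons) to merge
* @param new_set the list that the merged feature is added to
* @param transcript_id the ID of the transcript the features belong to
**/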
private void mergeFeatures(final List gffFeatures,
final List new_set,
final String transcript_id)
{
Hashtable feature_relationship_rank_store = new Hashtable();
Hashtable id_range_store = new Hashtable();
RangeVector new_range_vector = new RangeVector();
QualifierVector qualifier_vector = new QualifierVector();
Timestamp lasttimemodified = null;
for(int j = 0; j < gffFeatures.size(); j++)
{
final GFFStreamFeature this_feature = (GFFStreamFeature)gffFeatures.get(j);
Integer rank;
Qualifier rankQualifier = this_feature
.getQualifierByName("feature_relationship_rank");
if(rankQualifier == null)
rank = new Integer(0);
else
{
rank = new Integer((String) (rankQualifier.getValues().get(0)));
this_feature.getQualifiers().removeQualifierByName("feature_relationship_rank");
}
// use the most current lastmodified datestamp
if(this_feature.getLastModified() != null
&& (lasttimemodified == null || this_feature.getLastModified()
.compareTo(lasttimemodified) > 0))
lasttimemodified = this_feature.getLastModified();
final Location this_feature_location = this_feature.getLocation();
if(this_feature_location.getRanges().size() > 1)
{
throw new Error("internal error - new location should have "
+ "exactly one range");
}
final Range new_range = (Range) this_feature_location.getRanges()
.elementAt(0);
Qualifier id_qualifier = this_feature.getQualifierByName("ID");
if(id_qualifier != null)
{
String id = (String) (id_qualifier.getValues()).elementAt(0);
id_range_store.put(id, new_range);
feature_relationship_rank_store.put(id, rank);
}
else
Splash.logger4j.warn("NO ID FOUND FOR FEATURE AT: "+
this_feature.getLocation().toString());
if(this_feature_location.isComplement())
new_range_vector.insertElementAt(new_range, 0);
else
new_range_vector.add(new_range);
removeInternal(this_feature);
qualifier_vector.addAll(this_feature.getQualifiers());
}
final Feature first_old_feature = (Feature)gffFeatures.get(0);
final Location new_location = new Location(new_range_vector,
first_old_feature.getLocation().isComplement());
qualifier_vector = mergeQualifiers(qualifier_vector, first_old_feature
.getLocation().isComplement());
final GFFStreamFeature new_feature = new GFFStreamFeature(first_old_feature
.getKey(), new_location, qualifier_vector);
if(lasttimemodified != null)
new_feature.setLastModified(lasttimemodified);
new_feature.setSegmentRangeStore(id_range_store);
new_feature
.setFeature_relationship_rank_store(feature_relationship_rank_store);
// set the ID
String ID;
try
{
ID = new_feature.getSegmentID(new_feature.getLocation().getRanges());
}
catch(NullPointerException npe)
{
if(new_feature.getQualifierByName("Parent") != null)
ID = ((String)new_feature.getQualifierByName("Parent").getValues().get(0)) +
":"+new_feature.getKey().getKeyString();
else
ID = new_feature.getKey().getKeyString();
}
final Qualifier id_qualifier = new_feature.getQualifierByName("ID");
id_qualifier.removeValue((String)(id_qualifier.getValues()).elementAt(0));
id_qualifier.addValue(ID);
try
{
new_feature.setLocation(new_location);
final Qualifier gene_qualifier = new_feature.getQualifierByName("gene");
if(gene_qualifier != null
&& gene_qualifier.getValues().size() > 0
&& ((String) (gene_qualifier.getValues()).elementAt(0))
.startsWith("Phat"))
{
// special case to handle incorrect output of the Phat gene
// prediction tool
new_feature.removeQualifierByName("codon_start");
}
else
{
final Qualifier old_codon_start_qualifier = first_old_feature
.getQualifierByName("codon_start");
if(old_codon_start_qualifier != null)
new_feature.setQualifier(old_codon_start_qualifier);
}
forcedAdd(new_feature);
//gene.addExon(transcript_id, new_feature, true );
new_set.add(new_feature);
}
catch(ReadOnlyException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(OutOfRangeException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(EntryInformationException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
}
/**
* Combine the features (which are exons) and delete the originals from this
* Entry. The key of this hash will be the group name and the value is a
* FeatureVector containing the feature that are in that group. Groups
* that have more than one member will be combined.
**/
/*private void combineFeaturesFromHash(final Hashtable feature_groups)
{
final Enumeration enumFeat = feature_groups.keys();
final RangeVector new_range_vector = new RangeVector();
QualifierVector qualifier_vector = new QualifierVector();
Hashtable id_range_store = new Hashtable();
Timestamp lasttimemodified = null;
lasttimemodified = this_feature.getLastModified();
final Location this_feature_location = this_feature.getLocation();
throw new Error("internal error - new location should have " +
(Range)this_feature_location.getRanges().elementAt(0);
Qualifier id_qualifier = this_feature.getQualifierByName("ID");
if(id_qualifier != null)
{
if(this_feature_location.isComplement())
new_range_vector.insertElementAt(new_range, 0);
else
new_range_vector.add(new_range);
final Location new_location = new Location(new_range_vector,
first_old_feature.getLocation().isComplement());
qualifier_vector = mergeQualifiers(qualifier_vector,
first_old_feature.getLocation().isComplement());
final GFFStreamFeature new_feature = new GFFStreamFeature(first_old_feature.getKey(),
new_location, qualifier_vector);
if(lasttimemodified != null)
new_feature.setLastModified(lasttimemodified);
if(gene_qualifier != null &&
gene_qualifier.getValues().size() > 0 &&
((String)(gene_qualifier.getValues()).elementAt(0)).startsWith("Phat"))
// special case to handle incorrect output of the Phat gene
// prediction tool
new_feature.removeQualifierByName("codon_start");
}
else
{
if(old_codon_start_qualifier != null)
new_feature.setQualifier(old_codon_start_qualifier);
forcedAdd(new_feature);
}
catch(ReadOnlyException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(OutOfRangeException e)
{
throw new Error("internal error - unexpected exception: " + e);
}
catch(EntryInformationException e)
{
throw new Error("internal error - unexpected exception: " + e);
}*/
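/**
* Merge the qualifiers of the features being combined. codon_start is kept
* once for a forward strand feature and set to the last value seen on the
* complement; Alias values are concatenated (e.g. "a" and "b" become "a,b");
* ID and feature_id are dropped so the merged feature gets a fresh ID.
**/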
private QualifierVector mergeQualifiers(QualifierVector qualifier_vector,
boolean complement)
{
QualifierVector merge_qualifier_vector = new QualifierVector();
boolean seen = false;
for(int i = 0 ; i < qualifier_vector.size() ; ++i)
{
Qualifier qual = (Qualifier)qualifier_vector.elementAt(i);
if(qual.getName().equals("codon_start"))
{
if(!complement && !seen)
{
merge_qualifier_vector.addElement(qual);
seen = true;
}
else if(complement)
merge_qualifier_vector.setQualifier(qual);
}
else if(qual.getName().equals("Alias"))
{
final Qualifier id_qualifier =
merge_qualifier_vector.getQualifierByName("Alias");
if(id_qualifier == null)
merge_qualifier_vector.addElement(qual);
else
{
String id1 = (String)(id_qualifier.getValues()).elementAt(0);
String id2 = (String)(qual.getValues()).elementAt(0);
id_qualifier.removeValue(id1);
id_qualifier.addValue(id1+","+id2);
}
}
else if(!qual.getName().equals("ID") &&
!qual.getName().equals("feature_id"))
merge_qualifier_vector.setQualifier(qual);
}
return merge_qualifier_vector;
}
}