From 521118ea44faf50ec49b2f8daaf580da9daf1d17 Mon Sep 17 00:00:00 2001
From: tcarver <tjc>
Date: Wed, 21 Aug 2013 14:13:56 +0100
Subject: [PATCH] add check for ID prefix in gene models

---
Reviewer notes (free text after the "---" separator; not part of the diff):

* What the patch adds: an ID_PREFIX pattern, a private static
  isIdPrefixConsistent(GFFStreamFeature) check that is reported at
  Level.FATAL, and an ID_PREDICATE shown as the "Check ID Settings" list.
  The existing "Partial Settings" list title becomes
  "Check Partial Settings".
* isIdPrefixConsistent applies ID_PREFIX ("^[^.:]+") with Matcher.matches(),
  so the startsWith comparison only runs when the gene's whole ID contains
  no '.' or ':'; for any other gene ID the method falls through and
  returns true.
* Any exception raised inside the check is printed with printStackTrace()
  and the method then returns true (the feature is treated as consistent).
* NOTE(review): this copy was restored from a whitespace-collapsed paste.
  Line counts agree with every hunk header and with the diffstat, but
  indentation and the position of blank context lines are reconstructed -
  confirm against the target file (or apply with --ignore-whitespace).

 uk/ac/sanger/artemis/io/ValidateFeature.java | 56 +++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/uk/ac/sanger/artemis/io/ValidateFeature.java b/uk/ac/sanger/artemis/io/ValidateFeature.java
index eb3551cdf..202710694 100644
--- a/uk/ac/sanger/artemis/io/ValidateFeature.java
+++ b/uk/ac/sanger/artemis/io/ValidateFeature.java
@@ -33,6 +33,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.Vector;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import javax.swing.JOptionPane;
@@ -71,6 +72,7 @@ public class ValidateFeature
   //##sequence-region seqid start end
   private static Pattern HEADER_SEQ_REGION = Pattern.compile("##sequence-region \\S+ \\d+ \\d+");
   private static Pattern CAPITAL_START = Pattern.compile("^[A-Z]+\\S*");
+  private static Pattern ID_PREFIX = Pattern.compile("^[^.:]+");
 
   private static String[] RESERVED_TAGS =
     { "ID", "Name", "Alias", "Parent",
@@ -133,7 +135,8 @@ public class ValidateFeature
       showFeatureList(STRAND_PREDICATE, "Gene Strand Errors", grp, sel, gotoSrc, plotGrp);
       showFeatureList(BOUNDARY_PREDICATE, "Gene Boundary Errors", grp, sel, gotoSrc, plotGrp);
       showFeatureList(COMPLETE_GENE_MODEL_PREDICATE, "Incomplete Gene Model", grp, sel, gotoSrc, plotGrp);
-      showFeatureList(PARTIAL_PREDICATE, "Partial Settings", grp, sel, gotoSrc, plotGrp);
+      showFeatureList(PARTIAL_PREDICATE, "Check Partial Settings", grp, sel, gotoSrc, plotGrp);
+      showFeatureList(ID_PREDICATE, "Check ID Settings", grp, sel, gotoSrc, plotGrp);
     }
 
     showFeatureList(INTERNAL_STOP, "Internal Stop Codons", grp, sel, gotoSrc, plotGrp);
@@ -219,6 +222,12 @@ public class ValidateFeature
         pass = false;
         report.put("Partial settings not consistent", Level.FATAL);
       }
+
+      if(!isIdPrefixConsistent(gffFeature))
+      {
+        pass = false;
+        report.put("Prefix of ID attribute not consistent within gene model", Level.FATAL);
+      }
     }
 
     if( (entryGrp == null || !GeneUtils.isDatabaseEntry(entryGrp)) && !isCDSPhaseOK(gffFeature))
@@ -544,6 +553,41 @@ public class ValidateFeature
     return true;
   }
 
+  /**
+   * Test if the ID GFF3 attribute prefix is consistent within a gene model
+   * @param gffFeature
+   * @return true if the prefix is the same within the gene model features
+   */
+  private static boolean isIdPrefixConsistent(final GFFStreamFeature gffFeature)
+  {
+    final ChadoCanonicalGene gene = gffFeature.getChadoGene();
+    if(gene == null)
+      return true;
+
+    try
+    {
+      if(gffFeature.getKey().getKeyString().endsWith("gene"))
+        return (gene.getGene().getQualifierByName("ID") != null);
+
+      if(gene.getGene().getQualifierByName("ID") == null)
+        return true;
+      if(gffFeature.getQualifierByName("ID") == null)
+        return false;
+
+      String id = gene.getGene().getQualifierByName("ID").getValues().elementAt(0);
+      final Matcher m = ID_PREFIX.matcher(id);
+      if(m.matches())
+      {
+        id = gffFeature.getQualifierByName("ID").getValues().elementAt(0);
+        return id.startsWith( m.group() );
+      }
+    }
+    catch (Exception e)
+    {
+      e.printStackTrace();
+    }
+    return true;
+  }
 
   /**
    * The phase is REQUIRED for all CDS features.
@@ -912,6 +956,16 @@ public class ValidateFeature
     }
   };
 
+  private static FeaturePredicate ID_PREDICATE = new FeaturePredicate()
+  {
+    public boolean testPredicate(uk.ac.sanger.artemis.Feature feature)
+    {
+      if( isIdPrefixConsistent((GFFStreamFeature) feature.getEmblFeature() ))
+        return false;
+      return true;
+    }
+  };
+
   private FeaturePredicate INTERNAL_STOP = new FeaturePredicate()
   {
     public boolean testPredicate(uk.ac.sanger.artemis.Feature feature)
-- 
GitLab