@Test public final void testNormalizeInoh() { // there're two Protein objects that have entityReference // rdf:ID="IMR_0100366_G_alpha_s_Canonical" (in fact a generic ProteinReference inproperly // defined); // and that PR has two UniProt UnificationXref: P63092 (human), P63095 (rat). // Identifiers.org URI for the PR should NOT be generated. // // Also, there was a bug when replacing the PR's URI (warning: IllegalBioPAXArgumentException: // Incompatible type!..) // The cause is that there was a (weird, invalid) PublicationXref having db:id as UniProt:P63092 // that gets the same URI! // To avoid such issues altogether (despite illegal pub. xrefs), the Normalizer won't use // Identifiers.org for PublicationXrefs anymore. Model model = simpleIO.convertFromOWL(getClass().getResourceAsStream("/test-inoh.owl")); Normalizer normalizer = new Normalizer(); normalizer.setXmlBase(""); normalizer.normalize(model); // A weird PublicationXref that uses a UniProt ID won't be normalized: assertFalse(model.containsID("http://identifiers.org/uniprot/P63092")); // A PR with two UniProt IDs/unif.xrefs - human, rat - won't be normalized! assertTrue(model.containsID(model.getXmlBase() + "IMR_0100366_G_alpha_s_Canonical")); assertEquals( "ProteinReference", model .getByID(model.getXmlBase() + "IMR_0100366_G_alpha_s_Canonical") .getModelInterface() .getSimpleName()); }
@Test public final void testNormalize2() { Model model = BioPAXLevel.L3.getDefaultFactory().createModel(); Xref ref = model.addNew(UnificationXref.class, "Xref1"); ref.setDb("uniprotkb"); // will be converted to 'uniprot' ref.setId("Q0VCL1"); Xref uniprotX = ref; ProteinReference pr = model.addNew(ProteinReference.class, "ProteinReference"); pr.setDisplayName("ProteinReference"); pr.addXref(uniprotX); ref = model.addNew(RelationshipXref.class, "Xref2"); ref.setDb("refseq"); ref.setId("NP_001734"); pr.addXref(ref); // normalizer won't merge diff. types of xref with the same db:id ref = model.addNew(PublicationXref.class, "Xref3"); ref.setDb("pubmed"); ref.setId("2549346"); // the same id pr.addXref(ref); ref = model.addNew(RelationshipXref.class, "Xref4"); ref.setDb("pubmed"); ref.setId("2549346"); // the same id pr.addXref(ref); Normalizer normalizer = new Normalizer(); normalizer.normalize(model); ProteinReference e = (ProteinReference) model.getByID("http://identifiers.org/uniprot/Q0VCL1"); assertNotNull(e); assertEquals(4, e.getXref().size()); // print(e, model); }
/**
 * Runs a {@code Normalizer} over {@code input} and asserts all three observable results.
 *
 * @param input text handed to the {@code Normalizer} constructor
 * @param status value {@code normalize()} is compared against
 * @param output value {@code getNormalized()} is compared against
 * @param rest value {@code getUnparsed()} is compared against
 * @throws ParserException declared by the original helper; presumably raised by the normalizer
 */
private void check(String input, Normalizer.Status status, String output, String rest)
    throws ParserException {
  final Normalizer subject = new Normalizer(input);

  // normalize() has to run first: the two getters below are read only after it has been called.
  Assert.assertEquals(subject.normalize(), status);

  Assert.assertEquals(subject.getNormalized(), output);
  Assert.assertEquals(subject.getUnparsed(), rest);
}
/**
 * Runs a {@code Normalizer} over {@code input} and asserts the returned status and, unless that
 * expected status is {@code NEEDMORE}, the normalized output as well.
 *
 * @param input text handed to the {@code Normalizer} constructor
 * @param status value {@code normalize()} is compared against
 * @param output value {@code getNormalized()} is compared against (ignored for {@code NEEDMORE})
 * @throws ParserException declared by the original helper; presumably raised by the normalizer
 */
private void check(String input, Normalizer.Status status, String output)
    throws ParserException {
  final Normalizer subject = new Normalizer(input);
  Assert.assertEquals(subject.normalize(), status);

  // The output is not checked when the expected status is NEEDMORE.
  if (status == Normalizer.Status.NEEDMORE) {
    return;
  }
  Assert.assertEquals(subject.getNormalized(), output);
}
/**
 * Checks {@code Normalizer.autoName} on a Provenance whose URI is
 * "http://identifiers.org/pid.pathway/": afterwards the names must include "PID" and
 * "NCI_Nature curated", and the placeholder standard name "foo" must have been replaced.
 */
@Test
public final void testAutoName() {
  Model emptyModel = BioPAXLevel.L3.getDefaultFactory().createModel();
  Provenance provenance =
      emptyModel.addNew(Provenance.class, "http://identifiers.org/pid.pathway/");
  provenance.setStandardName("foo"); // placeholder that autoName is expected to overwrite

  Normalizer.autoName(provenance);

  assertNotNull(provenance.getStandardName());
  assertTrue(provenance.getName().contains("PID"));
  assertTrue(provenance.getName().contains("NCI_Nature curated"));
  assertFalse(provenance.getStandardName().equals("foo"));
}
@Test public final void testNormalize3() { Model model = BioPAXLevel.L3.getDefaultFactory().createModel(); Xref ref = model.addNew(UnificationXref.class, "Xref1"); ref.setDb("uniprotkb"); // will be converted to 'uniprot' ref.setId("Q0VCL1"); ProteinReference pr = model.addNew(ProteinReference.class, "ProteinReference1"); pr.setDisplayName("A ProteinReference"); pr.addXref(ref); assertEquals(1, ref.getXrefOf().size()); // System.out.println("Before the model is normalized - "); // print(pr, model); // go normalize! Normalizer normalizer = new Normalizer(); normalizer.normalize(model); // System.out.println("After the model is normalized - "); // print(pr, model); assertFalse(model.contains(pr)); // replaced by new norm. PR in the model assertFalse(model.contains(ref)); // replaced by new norm. xref in the model // now xrefOf is consistent with xref for all objects inn the model (since after some paxtools // 4.1.3 snapshot) assertEquals(0, pr.getXref().size()); // old PR has xref removed! assertEquals( 0, ref.getXrefOf().size()); // because the old xref was replaced in all parent elements! ProteinReference e = (ProteinReference) model.getByID("http://identifiers.org/uniprot/Q0VCL1"); assertNotNull(e); assertEquals(1, e.getXref().size()); String normUri = Normalizer.uri(model.getXmlBase(), "UNIPROT", "Q0VCL1", UnificationXref.class); ref = (UnificationXref) model.getByID(normUri); assertNotNull(ref); assertEquals(1, ref.getXrefOf().size()); // print(e, model); }
/**
 * Rewrites diacritic-like characters of a string through NFKC normalization.
 *
 * <p>Every character whose Unicode general category is non-spacing mark, modifier symbol or
 * modifier letter is replaced by the trimmed result of {@code Normalizer.normalize(c, NFKC)};
 * all other characters are copied through unchanged.
 *
 * @param str String to be normalized; must not be {@code null}
 * @return the normalized string
 */
public String normalizeDiac(String str) {
  final int length = str.length();
  final StringBuilder result = new StringBuilder();
  for (int idx = 0; idx < length; idx++) {
    final char ch = str.charAt(idx);
    switch (Character.getType(ch)) {
      case Character.NON_SPACING_MARK:
      case Character.MODIFIER_SYMBOL:
      case Character.MODIFIER_LETTER:
        // Trim because some decompositions have an extra space, such as U+00B4.
        result.append(Normalizer.normalize(ch, Normalizer.NFKC).trim());
        break;
      default:
        result.append(ch);
        break;
    }
  }
  return result.toString();
}
/** * Normalize presentation forms of characters to the separate parts. * * @see TextNormalize.normalizePres(String) * @param str String to normalize * @return Normalized form */ public String normalizePres(String str) { StringBuilder builder = null; int p = 0; int q = 0; int strLength = str.length(); for (; q < strLength; q++) { // We only normalize if the codepoint is in a given range. // Otherwise, NFKC converts too many things that would cause // confusion. For example, it converts the micro symbol in // extended Latin to the value in the Greek script. We normalize // the Unicode Alphabetic and Arabic A&B Presentation forms. char c = str.charAt(q); if ((0xFB00 <= c && c <= 0xFDFF) || (0xFE70 <= c && c <= 0xFEFF)) { if (builder == null) { builder = new StringBuilder(strLength * 2); } builder.append(str.substring(p, q)); // Some fonts map U+FDF2 differently than the Unicode spec. // They add an extra U+0627 character to compensate. // This removes the extra character for those fonts. if (c == 0xFDF2 && q > 0 && (str.charAt(q - 1) == 0x0627 || str.charAt(q - 1) == 0xFE8D)) { builder.append("\u0644\u0644\u0647"); } else { // Trim because some decompositions have an extra space, // such as U+FC5E builder.append(Normalizer.normalize(c, Normalizer.NFKC).trim()); } p = q + 1; } } if (builder == null) { return str; } else { builder.append(str.substring(p, q)); return builder.toString(); } }
/**
 * Set the uriStartString attribute.
 *
 * <p>The value is passed through {@code Normalizer.normalizeURI} before it is stored, and the
 * stored (normalized) value is also used as this entry's match id.
 *
 * @param uriStartString The uriStartString attribute value; reported via
 *     {@code CatalogMessages.reportNPEOnNull} before it is used.
 */
public void setURIStartString(String uriStartString) {
  CatalogMessages.reportNPEOnNull("uriStartString", uriStartString);

  final String normalized = Normalizer.normalizeURI(uriStartString);
  this.uriStartString = normalized;
  setMatchId(normalized);
}
@Test public final void testUri() { // using null or non-standard db assertEquals( Normalizer.uri("test/", "foo", "bar", UnificationXref.class), Normalizer.uri("test/", "FOo", "bar", UnificationXref.class)); // 'pubchem' is a ambigous synonym (correct ones are: pubchem-substance, pubchem-compound, etc.) assertEquals( Normalizer.uri("", "pubchem", "bar", UnificationXref.class), Normalizer.uri("", "PubChem", "bar", UnificationXref.class)); assertEquals( Normalizer.uri("", null, "bar", UnificationXref.class), Normalizer.uri(null, null, "bar", UnificationXref.class)); assertFalse( Normalizer.uri(null, "foo", "bar", UnificationXref.class) .equals(Normalizer.uri(null, "foo", "BAR", UnificationXref.class))); assertFalse( Normalizer.uri(null, "foo", "bar", UnificationXref.class) .equals(Normalizer.uri(null, "foo", "bar", PublicationXref.class))); // using standard db names (Miriam is used to normalize name and/or get identifiers.org URI) - assertEquals( Normalizer.uri("test/", "pubmed", "12345", PublicationXref.class), Normalizer.uri("test/", "PubMED", "12345", PublicationXref.class)); assertEquals( "http://identifiers.org/pubmed/12345", Normalizer.uri("test/", "PubMED", "12345", PublicationXref.class)); assertFalse( "http://identifiers.org/pubmed/12345" .equals( Normalizer.uri( null, "PubMED", "12345", RelationshipXref.class))); // - not PublicationXref assertEquals( "http://identifiers.org/chebi/CHEBI:12345", Normalizer.uri("", "chebi", "CHEBI:12345", SmallMoleculeReference.class)); assertEquals( "http://identifiers.org/pubchem.substance/12345", Normalizer.uri("", "pubchem-substance", "12345", SmallMoleculeReference.class)); System.setProperty( "biopax.normalizer.uri.strategy", Normalizer.VALUE_NORMALIZER_URI_STRATEGY_SIMPLE); assertEquals( "SequenceModificationVocabulary_protein_modification_ontology_MOD_12345", Normalizer.uri("", "PSI-mod", "MOD:12345", SequenceModificationVocabulary.class)); assertEquals( "ControlledVocabulary_protein_modification_ontology_MOD_12345", 
Normalizer.uri("", "MOD", "MOD:12345", ControlledVocabulary.class)); System.setProperty( "biopax.normalizer.uri.strategy", Normalizer.VALUE_NORMALIZER_URI_STRATEGY_MD5); // wrong id (case-sens.) assertFalse( "http://identifiers.org/chebi/CHEBI:12345" .equals(Normalizer.uri("", "chebi", "chebi:12345", SmallMoleculeReference.class))); // no 'pubchem' namespace there assertFalse( "http://identifiers.org/pubchem/12345" .equals(Normalizer.uri("", "pubchem-substance", "12345", UnificationXref.class))); // when there're special symbols, spaces in the 'id' part assertEquals( "UnificationXref_foo_bar", Normalizer.uri(null, null, "foo bar", UnificationXref.class)); assertEquals( "http://identifiers.org/taxonomy/9606", Normalizer.uri(null, "taxonomy", "9606", BioSource.class)); assertEquals( "http://identifiers.org/taxonomy/9606", Normalizer.uri(null, "NCBI Taxonomy", "9606", BioSource.class)); assertEquals( "http://identifiers.org/taxonomy/9606", Normalizer.uri(null, "NEWT", "9606", BioSource.class)); // when organism's id is not taxID (e.g., if the BioSource has tissue, cellType CVs...) assertNotSame( "http://identifiers.org/taxonomy/9606", Normalizer.uri(null, "taxonomy", "9606_blah_blah", BioSource.class)); String uri = Normalizer.uri("", "UniProt", "W0C7J9", UnificationXref.class); assertEquals("UnificationXref_uniprot_knowledgebase_W0C7J9", uri); }
/**
 * End-to-end check of {@code Normalizer.normalize(Model)} on a hand-built Level 3 model.
 *
 * <p>The fixture contains eight ProteinReferences with UniProt / UniProt-isoform unification
 * xrefs plus RefSeq and PubMed xrefs, one BioSource with a taxonomy xref, two Provenance objects
 * attached to a pathway and its sub-pathway, and five "dangling" UniProt unification xrefs. After
 * normalization the test asserts which element IDs exist in the model, the types found under the
 * generated URIs, and how many ProteinReference and Provenance objects remain.
 */
@Test
public final void testNormalize() throws UnsupportedEncodingException {
  // Note: a UniProt AC version (e.g. P68250.1 .. P68250.94) is not the same thing as isoform ID!
  Model model = BioPAXLevel.L3.getDefaultFactory().createModel();
  Xref ref = model.addNew(UnificationXref.class, "Xref1");
  ref.setDb("uniprotkb");
  ref.setId("P68250");
  ProteinReference pr = model.addNew(ProteinReference.class, "ProteinReference1");
  pr.setDisplayName("ProteinReference1");
  pr.addXref(ref);
  ref = model.addNew(RelationshipXref.class, "Xref2");
  ref.setDb("refseq");
  ref.setId("NP_001734");
  ref.setIdVersion(
      "1"); // this xref won't be removed by norm. (version matters when comparing xrefs!)
  pr.addXref(ref);
  ref = model.addNew(UnificationXref.class, "Xref3");
  ref.setDb("uniprotkb"); // will be converted to 'uniprot knowledgebase'
  /* The following ID is the secondary accession of P68250,
   * but Normalizer won't complain (it's Validator's and - later - Merger's job)!
   * However, if it were P68250, the normalize(model) would throw exception
   * (because ProteinReference1 becomes ProteinReference2, both get RDFId= urn:miriam:uniprot:P68250!)
   */
  ref.setId("Q0VCL1");
  // Kept in its own variable because 'ref' is reassigned below while this xref is still needed
  // for ProteinReference3.
  Xref uniprotX = ref;
  pr = model.addNew(ProteinReference.class, "ProteinReference2");
  pr.setDisplayName("ProteinReference2");
  pr.addXref(uniprotX);
  ref = model.addNew(RelationshipXref.class, "Xref4");
  ref.setDb("refseq");
  ref.setId("NP_001734");
  pr.addXref(ref);
  // this ER is a duplicate (same uniprot xref as ProteinReference2's) and must be removed by
  // the normalizer
  pr = model.addNew(ProteinReference.class, "ProteinReference3");
  pr.setDisplayName("ProteinReference3");
  pr.addXref(uniprotX);
  ref = model.addNew(RelationshipXref.class, "Xref5");
  ref.setDb("refseq");
  ref.setId("NP_001734");
  pr.addXref(ref);
  // normalizer won't merge different types of xref with the same db:id
  ref = model.addNew(PublicationXref.class, "Xref6");
  ref.setDb("pubmed");
  ref.setId("2549346"); // the same id
  pr.addXref(ref);
  ref = model.addNew(RelationshipXref.class, "Xref7");
  ref.setDb("pubmed");
  ref.setId("2549346"); // the same id
  pr.addXref(ref);
  // add biosource
  ref = model.addNew(UnificationXref.class, "Xref8");
  ref.setDb("taxonomy");
  ref.setId("10090"); // NOTE(review): original comment said "the same id"; looks like a copy-paste
  BioSource bioSource = model.addNew(BioSource.class, "BioSource_Mouse_Tissue");
  bioSource.addXref((UnificationXref) ref);
  // Provenance (must set ID and standard names from a name)
  Provenance pro1 = model.addNew(Provenance.class, "pid");
  pro1.addName("nci_nature"); // must be case insensitive (recognized)
  pro1.setStandardName("foo"); // must be replaced
  // Provenance (must create names from urn)
  Provenance pro2 = model.addNew(Provenance.class, "http://identifiers.org/signaling-gateway/");
  // add some entities with props
  Pathway pw1 = model.addNew(Pathway.class, "pathway");
  pw1.addDataSource(pro1);
  pw1.setStandardName("Pathway");
  Pathway pw2 = model.addNew(Pathway.class, "sub_pathway");
  pw2.setStandardName("Sub-Pathway");
  pw2.addDataSource(pro2);
  pw1.addPathwayComponent(pw2);
  // add data to test uniprot isoform xref and PR normalization
  ref = model.addNew(UnificationXref.class, "Xref9");
  ref.setDb("UniProt"); // normalizer will change it to "uniprot isoform"
  ref.setId("P68250-2");
  pr = model.addNew(ProteinReference.class, "ProteinReference4");
  pr.setDisplayName("ProteinReference1isoformA");
  pr.addXref(ref);
  // next ones are to test normalizer can auto-fix 'uniprot' to 'uniprot isoform' xref,
  // and also merge xrefs #9,#10 and PRs #4,#5 into one PR with one xref
  // below, uniprot xref's idVersion='2' will be moved back to the id value, and db set to
  // "UniProt Isoform" -
  // NOTE(review): the paragraph above contradicts the inline comments on Xref10 just below
  // ("NOT to be replaced"); it appears to describe older behavior - confirm and prune.
  ref = model.addNew(UnificationXref.class, "Xref10");
  ref.setDb(
      "UniProtKb"); // NOT to be replaced with "UniProt Isoform" (version and isoform # are not
  // the same thing)
  ref.setId("P68250");
  ref.setIdVersion("2"); // may be lost after merging with two other P68250 xrefs
  // (version is not the same as isoform, unless db name is 'uniprot isoform')
  pr = model.addNew(ProteinReference.class, "ProteinReference5");
  pr.setDisplayName("ProteinReference1isoformB");
  pr.addXref(ref);
  // Following three Xrefs and PRs will be normalized to uniprot.isoform:P68250-1 and merged into
  // one
  ref = model.addNew(UnificationXref.class, "Xref11");
  ref.setDb("UniProtKb"); // will be replaced with "uniprot isoform"
  ref.setId("P68250-1");
  pr = model.addNew(ProteinReference.class, "ProteinReference6");
  pr.addXref(ref);
  ref = model.addNew(UnificationXref.class, "Xref12");
  ref.setDb("UniProt Isoform"); // because this is standard (isoform) db name (special case) ->
  ref.setId("P68250"); // - this id will be set to "P68250-1",
  ref.setIdVersion("1"); // - and idVersion will be cleared!
  pr = model.addNew(ProteinReference.class, "ProteinReference7");
  pr.addXref(ref);
  ref = model.addNew(UnificationXref.class, "Xref13");
  ref.setDb("UniProt Isoform");
  ref.setId("P68250-1");
  pr = model.addNew(ProteinReference.class, "ProteinReference8");
  pr.addXref(ref);
  // special dangling UXs to test/catch a weird bug that accidentally makes db='uniprot
  // isoform'...
  UnificationXref ux = model.addNew(UnificationXref.class, "UniprotUX1");
  ux.setDb("uniprot");
  ux.setId("W0C7J9");
  ux = model.addNew(UnificationXref.class, "UniprotUX2");
  ux.setDb("uniprot");
  ux.setId("W0C7J9.1"); // NOT to be changed to 'uniprot isoform'
  ux = model.addNew(UnificationXref.class, "UniprotUX3");
  ux.setDb("uniprot");
  ux.setId("W0C7J9"); // NOT to be changed to 'uniprot isoform'
  ux.setIdVersion("1");
  ux = model.addNew(UnificationXref.class, "UniprotUX4");
  ux.setDb("uniprot"); // will be changed to 'uniprot isoform'
  ux.setId("W0C7J9-1");
  ux = model.addNew(UnificationXref.class, "UniprotUX5");
  ux.setDb("uniprot"); // will be changed to 'uniprot isoform'
  ux.setId("P68250-3");
  // go normalize!
  Normalizer normalizer = new Normalizer();
  normalizer.normalize(model);
  // //tmp test print
  // ByteArrayOutputStream out = new ByteArrayOutputStream();
  // simpleIO.convertToOWL(model, out);
  // System.out.println(out.toString());
  // test for a bug that causes db='uniprot' to become 'uniprot isoform' (the id matches both
  // patterns)
  assertTrue(model.containsID("UnificationXref_uniprot_knowledgebase_W0C7J9"));
  assertFalse(model.containsID("UnificationXref_uniprot_isoform_W0C7J9"));
  assertTrue(model.containsID("UnificationXref_uniprot_knowledgebase_W0C7J9_1"));
  assertTrue(model.containsID("UnificationXref_uniprot_isoform_W0C7J9-1"));
  assertTrue(model.containsID("UnificationXref_uniprot_isoform_P68250-3"));
  // NOTE(review): the next assertion repeats the first one of this group verbatim.
  assertTrue(model.containsID("UnificationXref_uniprot_knowledgebase_W0C7J9"));
  // check Xref
  String normUri = Normalizer.uri(model.getXmlBase(), "uniprot", "P68250", UnificationXref.class);
  BioPAXElement bpe = model.getByID(normUri);
  assertTrue(bpe instanceof UnificationXref);
  // check PR
  bpe = model.getByID("http://identifiers.org/uniprot/Q0VCL1");
  assertTrue(bpe instanceof ProteinReference);
  assertTrue(
      model.containsID(
          "Xref7")); // RX is not normalized unless (by mistake) it has identifiers.org uri
  // test BioSource
  assertFalse(model.containsID("BioSource_Mouse_Tissue"));
  bpe =
      model.getByID(
          Normalizer.uri(
              model.getXmlBase(),
              "taxonomy",
              "10090",
              BioSource.class)); // "taxonomy" - capitalization can be any
  assertTrue(bpe instanceof BioSource);
  normUri = Normalizer.uri(model.getXmlBase(), "taxonomy", "10090", UnificationXref.class);
  bpe = model.getByID(normUri);
  assertTrue(bpe instanceof UnificationXref);
  // test that one of each pair ProteinReference, 2nd,3rd and 4th,5th is removed/merged:
  assertEquals(4, model.getObjects(ProteinReference.class).size());
  // Provenance is no longer normalized (Miriam is not enough for this task)!
  assertEquals(2, model.getObjects(Provenance.class).size());
  // dataSource property is not inferred/inherited from pw1 anymore (prop. inference feature was
  // removed)
  pw2 = (Pathway) model.getByID("sub_pathway");
  assertEquals(1, pw2.getDataSource().size());
  pw1 = (Pathway) model.getByID("pathway");
  assertEquals(1, pw1.getDataSource().size());
  // test uniprot isoform xrefs are detected and normalized the same way
  // get the expected xref URI first
  normUri =
      Normalizer.uri(model.getXmlBase(), "uniprot isoform", "P68250-2", UnificationXref.class);
  bpe = model.getByID(normUri);
  assertNotNull(bpe);
  assertEquals(1, ((Xref) bpe).getXrefOf().size());
}