/**
 * Builds a {@link ConLLWord} from one tab-separated corpus line.
 *
 * <p>Columns read (0-based): 0 word id, 1 word string, 2 lemma, 4 POS, 5 fine-grained POS,
 * 6 morphs, 8 phrase head id (parsed as an integer), 10 phrase type. Columns 3, 7 and 9 are
 * not read. NOTE(review): this layout looks like the CoNLL-2009 column order - confirm against
 * the corpora actually fed to this converter.
 *
 * @param line one token line of the corpus, fields separated by tab characters
 * @return a new word populated from the columns listed above
 * @throws ArrayIndexOutOfBoundsException if the line has fewer than 11 fields; trailing empty
 *     columns are dropped by {@link String#split(String)} and therefore count as missing
 * @throws NumberFormatException if column 8 is not a parsable integer
 */
private ConLLWord getWordFromCorpusLine(String line) {
  final int requiredFields = 11; // highest column read is index 10
  String[] conllFields = line.split("\t");
  if (conllFields.length < requiredFields) {
    // Same exception type the unchecked array access used to raise, so existing handlers keep
    // working - but now the offending line is named.
    throw new ArrayIndexOutOfBoundsException(
        "Expected at least "
            + requiredFields
            + " tab-separated fields but found "
            + conllFields.length
            + " in corpus line: "
            + line);
  }

  ConLLWord word = new ConLLWord();
  word.setWordId(conllFields[0]);
  word.setWordString(conllFields[1]);
  word.setLemma(conllFields[2]);
  word.setPos(conllFields[4]);
  word.setPosFine(conllFields[5]);
  word.setMorphs(conllFields[6]);

  int phraseHeadId;
  try {
    phraseHeadId = Integer.parseInt(conllFields[8]);
  } catch (NumberFormatException e) {
    // Re-throw with the same type, adding the line so the bad record can be located.
    NumberFormatException detailed =
        new NumberFormatException(
            "Phrase head id '" + conllFields[8] + "' is not an integer in corpus line: " + line);
    detailed.initCause(e);
    throw detailed;
  }
  word.setPhraseHeadId(phraseHeadId);

  word.setPhraseType(conllFields[10]);
  return word;
}
/** * Parse a sentence into a list of ConLLWord objects * * @param sentence list of line strings from the ConLL file * @return */ private List<ConLLWord> transformSentenceToObjects(List<String> sentence, int startOffset) { String sentenceString = ""; List<ConLLWord> sentenceObjects = new ArrayList<ConLLWord>(); int offset = startOffset; boolean addSpaceToOffset = false; for (String line : sentence) { ConLLWord word = getWordFromCorpusLine(line); // TODO: check if word is punctuation class word instead of this static check if (word.getWordString().matches("([\\.!?,;:)]|'+)")) { // remove whitespace before punctuation mark and decrement offset if (sentenceString.endsWith(" ")) { sentenceString = sentenceString.substring(0, sentenceString.length() - 1); offset = offset - 1; } sentenceString += word.getWordString(); // no space here } else if (word.getWordString().matches("([(]|`+)")) { sentenceString += word.getWordString(); // add a space after the word } else { sentenceString += word.getWordString() + " "; addSpaceToOffset = true; } word.setStart(offset); int wordEnd = offset + word.getWordString().length(); word.setEnd(wordEnd); offset = wordEnd; if (addSpaceToOffset) { offset++; addSpaceToOffset = false; } sentenceObjects.add(word); } this.tempSentence = sentenceString.trim(); // return the length of the sentence added to the offset return sentenceObjects; }
/** * Parse a dependency tree from generated objects TODO: There are NIF properties missing to * describe dependency relations from phrases to other phrases TODO: What exactly is the "root" * node? I will make this the sentence resource but that seems to be wrong. should there be an * artificial "root" phrase? what does it contain? what offsets does it have? * * @param sentenceObjects */ private void parseDependencyTree( List<ConLLWord> sentenceObjects, OntModel inputModel, Individual sentence, Individual context) { // create the tree // TODO: defining new property here, should be changed to work ootb ObjectProperty phraseHead = inputModel.createObjectProperty( "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#phraseHead"); phraseHead.addProperty(RDFS.comment, "The head of a Phrase."); phraseHead.addProperty(RDFS.domain, NIFOntClasses.Phrase.getUri()); phraseHead.addProperty(RDFS.range, NIFOntClasses.Phrase.getUri()); ObjectProperty depRelType = inputModel.createObjectProperty( "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#dependencyRelationType"); depRelType.addProperty( RDFS.comment, "Dependency relation to the HEAD. The set of dependency relations depends on the particular language. Note that depending on the original treebank annotation, the dependency relation may be meaningful or simply 'ROOT'. 
"); depRelType.addProperty(RDFS.domain, NIFOntClasses.Phrase.getUri()); ObjectProperty govTODep = NIFObjectProperties.dependency.getObjectProperty(inputModel); for (ConLLWord word : sentenceObjects) { int phraseHeadId = word.getPhraseHeadId(); Individual wordResource = word.getResource(); // every word is a phrase wordResource.addOntClass(NIFOntClasses.Phrase.getOntClass(inputModel)); if (phraseHeadId == 0) { // root node, making the sentence the head wordResource.addProperty(depRelType, word.getPhraseType()); } else { // ids start with 1, List<> indexes starts with 0 ConLLWord phraseHeadObject = sentenceObjects.get(phraseHeadId - 1); phraseHeadObject.getResource().addProperty(govTODep, wordResource); wordResource.addProperty(phraseHead, phraseHeadObject.getResource()); wordResource.addProperty( NIFDatatypeProperties.head.getDatatypeProperty(inputModel), String.valueOf(phraseHeadId)); wordResource.addProperty(depRelType, word.getPhraseType()); } } }
private List<ConLLWord> addWordResourcesToModel( OntModel outputModel, List<ConLLWord> wordObjectsOfSentence, Individual sentenceResource, Individual contextResource) { for (ConLLWord word : wordObjectsOfSentence) { // generate URI differently String uri = contextResource.getURI().substring(0, contextResource.getURI().lastIndexOf("=") + 1) + word.getStart() + "," + word.getEnd(); Individual wordResource = outputModel.createIndividual( uri, outputModel.createClass(NIFOntClasses.RFC5147String.getUri())); wordResource.addOntClass(NIFOntClasses.Word.getOntClass(outputModel)); wordResource.addOntClass(NIFOntClasses.String.getOntClass(outputModel)); wordResource.addProperty( NIFDatatypeProperties.beginIndex.getDatatypeProperty(outputModel), word.getStart() + ""); wordResource.addProperty( NIFDatatypeProperties.endIndex.getDatatypeProperty(outputModel), word.getEnd() + ""); wordResource.addLiteral( NIFDatatypeProperties.anchorOf.getDatatypeProperty(outputModel), outputModel.createLiteral(word.getWordString())); wordResource.addProperty( NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), word.getPos()); if (tagsetKnown) addOliaPos(wordResource, word.getPos(), outputModel); wordResource.addProperty( NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource); // TODO: we may need something different here if (!word.getPosFine().equals("_")) wordResource.addProperty( NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), word.getPosFine()); if (!word.getLemma().equals("_")) wordResource.addProperty( NIFDatatypeProperties.lemma.getDatatypeProperty(outputModel), word.getLemma()); // TODO: add genus, numerus etc here wordResource.addProperty( NIFObjectProperties.sentence.getObjectProperty(outputModel), sentenceResource); word.setResource(wordResource); sentenceResource.addProperty( NIFObjectProperties.word.getObjectProperty(outputModel), wordResource); sentenceResource.addProperty( 
NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource); } return wordObjectsOfSentence; }