private List<ConLLWord> addWordResourcesToModel( OntModel outputModel, List<ConLLWord> wordObjectsOfSentence, Individual sentenceResource, Individual contextResource) { for (ConLLWord word : wordObjectsOfSentence) { // generate URI differently String uri = contextResource.getURI().substring(0, contextResource.getURI().lastIndexOf("=") + 1) + word.getStart() + "," + word.getEnd(); Individual wordResource = outputModel.createIndividual( uri, outputModel.createClass(NIFOntClasses.RFC5147String.getUri())); wordResource.addOntClass(NIFOntClasses.Word.getOntClass(outputModel)); wordResource.addOntClass(NIFOntClasses.String.getOntClass(outputModel)); wordResource.addProperty( NIFDatatypeProperties.beginIndex.getDatatypeProperty(outputModel), word.getStart() + ""); wordResource.addProperty( NIFDatatypeProperties.endIndex.getDatatypeProperty(outputModel), word.getEnd() + ""); wordResource.addLiteral( NIFDatatypeProperties.anchorOf.getDatatypeProperty(outputModel), outputModel.createLiteral(word.getWordString())); wordResource.addProperty( NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), word.getPos()); if (tagsetKnown) addOliaPos(wordResource, word.getPos(), outputModel); wordResource.addProperty( NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource); // TODO: we may need something different here if (!word.getPosFine().equals("_")) wordResource.addProperty( NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), word.getPosFine()); if (!word.getLemma().equals("_")) wordResource.addProperty( NIFDatatypeProperties.lemma.getDatatypeProperty(outputModel), word.getLemma()); // TODO: add genus, numerus etc here wordResource.addProperty( NIFObjectProperties.sentence.getObjectProperty(outputModel), sentenceResource); word.setResource(wordResource); sentenceResource.addProperty( NIFObjectProperties.word.getObjectProperty(outputModel), wordResource); sentenceResource.addProperty( NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource); } return wordObjectsOfSentence; }
/** * Parse a sentence into a list of ConLLWord objects * * @param sentence list of line strings from the ConLL file * @return */ private List<ConLLWord> transformSentenceToObjects(List<String> sentence, int startOffset) { String sentenceString = ""; List<ConLLWord> sentenceObjects = new ArrayList<ConLLWord>(); int offset = startOffset; boolean addSpaceToOffset = false; for (String line : sentence) { ConLLWord word = getWordFromCorpusLine(line); // TODO: check if word is punctuation class word instead of this static check if (word.getWordString().matches("([\\.!?,;:)]|'+)")) { // remove whitespace before punctuation mark and decrement offset if (sentenceString.endsWith(" ")) { sentenceString = sentenceString.substring(0, sentenceString.length() - 1); offset = offset - 1; } sentenceString += word.getWordString(); // no space here } else if (word.getWordString().matches("([(]|`+)")) { sentenceString += word.getWordString(); // add a space after the word } else { sentenceString += word.getWordString() + " "; addSpaceToOffset = true; } word.setStart(offset); int wordEnd = offset + word.getWordString().length(); word.setEnd(wordEnd); offset = wordEnd; if (addSpaceToOffset) { offset++; addSpaceToOffset = false; } sentenceObjects.add(word); } this.tempSentence = sentenceString.trim(); // return the length of the sentence added to the offset return sentenceObjects; }