/** * Parse a sentence into a list of ConLLWord objects * * @param sentence list of line strings from the ConLL file * @return */ private List<ConLLWord> transformSentenceToObjects(List<String> sentence, int startOffset) { String sentenceString = ""; List<ConLLWord> sentenceObjects = new ArrayList<ConLLWord>(); int offset = startOffset; boolean addSpaceToOffset = false; for (String line : sentence) { ConLLWord word = getWordFromCorpusLine(line); // TODO: check if word is punctuation class word instead of this static check if (word.getWordString().matches("([\\.!?,;:)]|'+)")) { // remove whitespace before punctuation mark and decrement offset if (sentenceString.endsWith(" ")) { sentenceString = sentenceString.substring(0, sentenceString.length() - 1); offset = offset - 1; } sentenceString += word.getWordString(); // no space here } else if (word.getWordString().matches("([(]|`+)")) { sentenceString += word.getWordString(); // add a space after the word } else { sentenceString += word.getWordString() + " "; addSpaceToOffset = true; } word.setStart(offset); int wordEnd = offset + word.getWordString().length(); word.setEnd(wordEnd); offset = wordEnd; if (addSpaceToOffset) { offset++; addSpaceToOffset = false; } sentenceObjects.add(word); } this.tempSentence = sentenceString.trim(); // return the length of the sentence added to the offset return sentenceObjects; }