/** * Tokenize some text - not thread safe. * * @param text tokenize this * @return tokenized text */ public List<List<Token>> process(final String text) { final List<List<Token>> paragraph = new ArrayList<>(); currentSentence = new ArrayList<>(); final Tokens tokens = splitText(text); while (tokens.hasNext()) { final Token t = tokens.next(); final String trimmedWord = t.text.trim(); // skip spaces if (trimmedWord.isEmpty()) continue; if (((mode == WITH_PUNCTUATION) || (mode == WITHOUT_PUNCTUATION && isLetterOrDigit(initChar(t.text))))) { boolean canBreakSentence = true; if (t.text.contains("'")) { wordContainsApostrophe(t); } else if (".".equals(trimmedWord)) { canBreakSentence = wordIsFullStop(t); } else if (":".equals(trimmedWord)) { wordIsColon(tokens, t); } else currentSentence.add(t); // handling the end of a sentence if (canBreakSentence && equalss(trimmedWord, ".", ";", "?", "!")) { paragraph.add(currentSentence); currentSentence = new ArrayList<>(); } } } if (!currentSentence.isEmpty()) paragraph.add(currentSentence); return paragraph; }
private void wordIsColon(final Tokens tokens, final Token t) { // check we can get a previous and next word to merge together if (!currentSentence.isEmpty() && tokens.hasNext()) { // if the colon does not have a space on either side if (!isSpaceChar(lastChar(tokens.peekPrev().text)) && !isSpaceChar(initChar(tokens.peekNext().text))) { // try to merge the 3 tokens back together again final int prevWordIndex = currentSentence.size() - 1; final Token prevSentenceWord = currentSentence.get(prevWordIndex); mergeWordsIntoSentence(prevSentenceWord, t, tokens.next(), prevWordIndex); } else currentSentence.add(t); } else currentSentence.add(t); }
/**
 * Parses an additive expression: one multiply expression followed by any
 * number of {@code '+'} or {@code '-'} operators, each with a further
 * multiply expression as its operand.
 *
 * <p>If no leading multiply expression can be parsed, the token position
 * is restored to where it was on entry and {@code null} is returned. The
 * first character that is neither {@code '+'} nor {@code '-'} is pushed
 * back and ends the parse.
 *
 * @param tokens the token stream to read from
 * @return the single multiply expression when no operator follows it,
 *         otherwise an {@code AdditiveExpr} over all operands, or
 *         {@code null} when there is no leading operand
 * @throws QuerySyntaxException if an operator is not followed by a
 *         parsable multiply expression
 */
public static Expr parse(Tokens tokens) {
    final int startPosition = tokens.getPosition();
    final Expr firstExpr = MultiplyExpr.parse(tokens);
    if (firstExpr == null) {
        // nothing matched: rewind so the caller can try something else
        tokens.setPosition(startPosition);
        return null;
    }

    final List<Expr> operands = new LinkedList<Expr>();
    operands.add(firstExpr);

    // bit k is set when the k-th operator is '+', and left clear for '-'
    final BitSet operators = new BitSet();
    int operatorIndex = 0;
    while (tokens.hasNext()) {
        final char operator = tokens.nextChar();
        if (operator != '+' && operator != '-') {
            // not part of this expression: hand the character back
            tokens.pushback();
            break;
        }

        final Expr operand = MultiplyExpr.parse(tokens);
        if (operand == null) {
            throw new QuerySyntaxException(tokens);
        }
        operands.add(operand);

        if (operator == '+') {
            operators.set(operatorIndex);
        }
        operatorIndex++;
    }

    if (operands.size() == 1) {
        return firstExpr;
    }
    return new AdditiveExpr(operands, operators);
}