Exemple #1
0
  /**
   * Splits {@code text} into tokens and groups them into sentences.
   *
   * <p>Not thread safe: the sentence under construction is kept in the shared
   * {@code currentSentence} field, which is reassigned on every call.
   *
   * <p>Whitespace-only tokens are dropped. In {@code WITHOUT_PUNCTUATION} mode only tokens whose
   * first character satisfies {@code isLetterOrDigit} are handled; in {@code WITH_PUNCTUATION}
   * mode every non-blank token is handled. A sentence is closed after a ".", ";", "?" or "!"
   * token, except when {@code wordIsFullStop} reports that the full stop must not break it.
   *
   * @param text the text to tokenize
   * @return the sentences found, each one a list of tokens; a trailing non-empty sentence is
   *     included even when it has no terminator
   */
  public List<List<Token>> process(final String text) {
    final List<List<Token>> sentences = new ArrayList<>();
    currentSentence = new ArrayList<>();

    final Tokens tokens = splitText(text);
    while (tokens.hasNext()) {
      final Token token = tokens.next();
      final String word = token.text.trim();

      // whitespace-only tokens carry no content
      if (word.isEmpty()) {
        continue;
      }

      // in WITHOUT_PUNCTUATION mode, anything not starting with a letter or digit is ignored
      final boolean accepted =
          mode == WITH_PUNCTUATION
              || (mode == WITHOUT_PUNCTUATION && isLetterOrDigit(initChar(token.text)));
      if (!accepted) {
        continue;
      }

      // dispatch to the specialised handlers; only the full-stop handler can veto a break
      boolean mayEndSentence = true;
      if (token.text.contains("'")) {
        wordContainsApostrophe(token);
      } else if (".".equals(word)) {
        mayEndSentence = wordIsFullStop(token);
      } else if (":".equals(word)) {
        wordIsColon(tokens, token);
      } else {
        currentSentence.add(token);
      }

      // a terminator closes the current sentence and opens a fresh one
      if (mayEndSentence && equalss(word, ".", ";", "?", "!")) {
        sentences.add(currentSentence);
        currentSentence = new ArrayList<>();
      }
    }

    // keep whatever is left over after the last terminator
    if (!currentSentence.isEmpty()) {
      sentences.add(currentSentence);
    }
    return sentences;
  }
Exemple #2
0
 /**
  * Handles a ":" token: either merges it with its neighbours or appends it to the current
  * sentence as a token of its own.
  *
  * <p>A merge is attempted only when the current sentence already holds a word, another token
  * follows, and {@code isSpaceChar} rejects both the last character of the previous token's
  * text and the first character of the next token's text. In that case the next token is
  * consumed from {@code tokens} and handed, together with the last word of the current sentence
  * and the colon, to {@code mergeWordsIntoSentence}.
  *
  * @param tokens the token stream the colon was read from
  * @param t the colon token
  */
 private void wordIsColon(final Tokens tokens, final Token t) {
   // without a word before and a token after there is nothing to merge with
   final boolean hasNeighbours = !currentSentence.isEmpty() && tokens.hasNext();
   if (!hasNeighbours) {
     currentSentence.add(t);
     return;
   }

   // a space on either side means the colon stands alone
   final boolean spaceAround =
       isSpaceChar(lastChar(tokens.peekPrev().text))
           || isSpaceChar(initChar(tokens.peekNext().text));
   if (spaceAround) {
     currentSentence.add(t);
     return;
   }

   // try to merge the previous word, the colon and the next token back together
   final int lastIndex = currentSentence.size() - 1;
   final Token lastWord = currentSentence.get(lastIndex);
   mergeWordsIntoSentence(lastWord, t, tokens.next(), lastIndex);
 }
Exemple #3
0
  /**
   * Parses a chain of {@code MultiplyExpr} operands joined by '+' or '-'.
   *
   * <p>If no leading operand can be parsed, the token position is restored to where it was on
   * entry and {@code null} is returned. After the first operand, characters are read while they
   * are '+' or '-'; the first other character is pushed back and ends the chain.
   *
   * @param tokens the token stream to read from
   * @return {@code null} when no leading operand is present; the single operand itself when no
   *     operator follows it; otherwise an {@code AdditiveExpr} built from the operands and a
   *     {@code BitSet} in which bit {@code i} is set when the {@code i}-th operator is '+' and
   *     clear when it is '-'
   * @throws QuerySyntaxException if a '+' or '-' is not followed by a parsable operand
   */
  public static Expr parse(Tokens tokens) {
    final int startPosition = tokens.getPosition();
    final Expr head = MultiplyExpr.parse(tokens);
    if (head == null) {
      // nothing consumed from the caller's point of view
      tokens.setPosition(startPosition);
      return null;
    }

    final List<Expr> operands = new LinkedList<Expr>();
    operands.add(head);
    final BitSet plusFlags = new BitSet();

    int operatorIndex = 0;
    while (tokens.hasNext()) {
      final char symbol = tokens.nextChar();
      final boolean isPlus = symbol == '+';

      if (!isPlus && symbol != '-') {
        // not an additive operator: leave it for the caller
        tokens.pushback();
        break;
      }

      final Expr operand = MultiplyExpr.parse(tokens);
      if (operand == null) {
        throw new QuerySyntaxException(tokens);
      }
      operands.add(operand);

      if (isPlus) {
        plusFlags.set(operatorIndex);
      }
      operatorIndex++;
    }

    if (operands.size() == 1) {
      return head;
    }
    return new AdditiveExpr(operands, plusFlags);
  }