Пример #1
0
  /**
   * Applies every pattern/replacement pair in {@code patterns} to {@code text}, one after
   * another in iteration order, then strips leading and trailing whitespace.
   *
   * @param text the input text to transform
   * @param patterns pattern/replacement pairs applied sequentially via {@code replaceAll}
   * @return the rewritten, trimmed text
   */
  private static String rewrite(String text, Set<PatternStringTuple> patterns) {
    String result = text;
    for (PatternStringTuple tuple : patterns) {
      Matcher matcher = tuple.getPattern().matcher(result);
      result = matcher.replaceAll(tuple.getEntry());
    }
    return result.trim();
  }
Пример #2
0
  /**
   * Recursively segments {@code text} using the pattern at {@code index} in {@code tupleList}.
   * Spans matched by the pattern become single tokens tagged with the pattern's entry;
   * unmatched spans are handed to the next pattern. Once every pattern has been tried,
   * the remaining text is split by {@code tokenization} and each token gets a {@code null} tag.
   *
   * <p>NOTE(review): only patterns whose compiled form has at least one capturing group are
   * applied; group-less patterns are skipped — presumably intentional, confirm with callers.
   *
   * @param text the text fragment to tokenize (leading/trailing whitespace is trimmed as it
   *     is passed down the recursion)
   * @param index index of the current pattern in {@code tupleList}
   * @param tokenization tokenizer applied to text no pattern claimed
   * @return an ordered list of token/tag pairs covering {@code text}
   */
  private static List<TokenTagTuple> recursiveTokenize(
      String text, int index, Tokenizer tokenization) {
    if (index >= tupleList.size()) {
      // Base case: all patterns exhausted — tokenize plainly with no tag.
      List<String> tokens = tokenization.tokenize(text);
      List<TokenTagTuple> result = new ArrayList<>(tokens.size());
      for (String token : tokens) {
        result.add(new TokenTagTuple(token, null));
      }
      return result;
    }

    PatternStringTuple pst = tupleList.get(index);
    Pattern pattern = pst.getPattern();
    String tag = pst.getEntry();
    Matcher matcher = pattern.matcher(text);

    if (matcher.groupCount() == 0) {
      // Pattern has no capturing groups: skip it and try the next one.
      return recursiveTokenize(text.trim(), index + 1, tokenization);
    }

    List<List<TokenTagTuple>> arrays = new ArrayList<>();
    int lastEnd = 0;
    while (matcher.find()) {
      if (matcher.start() > lastEnd) {
        // Text between the previous match and this one goes to the next pattern,
        // unless it was all whitespace.
        String between = text.substring(lastEnd, matcher.start()).trim();
        if (!between.isEmpty()) {
          arrays.add(recursiveTokenize(between, index + 1, tokenization));
        }
      }
      // The matched span itself becomes a single tagged token.
      List<TokenTagTuple> matched = new ArrayList<>(1);
      matched.add(new TokenTagTuple(matcher.group(), tag));
      arrays.add(matched);
      lastEnd = matcher.end();
    }
    if (lastEnd < text.length()) {
      String tail = text.substring(lastEnd).trim();
      // Fix: skip an all-whitespace tail, mirroring the in-loop empty-fragment check.
      // Previously an empty trimmed tail was still recursed on, ultimately calling
      // tokenization.tokenize("") for text ending in whitespace after the last match.
      if (!tail.isEmpty()) {
        arrays.add(recursiveTokenize(tail, index + 1, tokenization));
      }
    }
    return concatAll(arrays);
  }