コード例 #1
0
ファイル: JapaneseTokenizer.java プロジェクト: fpapai/basex
  /**
   * Initializes the tokenizer with a new input text.
   *
   * <p>The text is converted to a string, optionally normalized (ideographic spaces
   * are replaced by ASCII spaces if {@code wc} is set), and handed to the tagger via
   * reflection. The returned morphemes are converted to {@link Morpheme} instances:
   * <ul>
   *   <li>text located between two consecutive morphemes is added as a separate
   *       morpheme tagged with {@code KIGOU_FEATURE};</li>
   *   <li>a surface that consists only of single-byte characters that are neither
   *       letters nor digits is split into one {@code KIGOU_FEATURE} morpheme per
   *       character;</li>
   *   <li>any other surface is added unchanged, together with its feature.</li>
   * </ul>
   *
   * <p>If an exception is raised while the morphemes are processed, it is reported
   * via {@code Util.errln}, and the morphemes collected up to that point are kept.
   *
   * @param txt input text
   * @return self reference
   */
  @Override
  public JapaneseTokenizer init(final byte[] txt) {
    String source = string(txt);
    if (wc) { // convert wide-space to space
      source = source.replace('\u3000', '\u0020');
    }
    final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source);
    final ArrayList<Morpheme> list = new ArrayList<>();
    try {
      // end offset (exclusive) of the previously processed morpheme
      int prev = 0;
      final int ms = morpheme.size();
      for (int i = 0; i < ms; i++) {
        final Object m = morpheme.get(i);
        // NOTE(review): surface/feature/start look like reflective field handles of the
        // tagger's morpheme class - confirm against their declarations
        final String srfc = surface.get(m).toString();
        final String ftr = feature.get(m).toString();
        final int strt = start.getInt(m);
        if (i != 0 && strt != prev) {
          // text between the end of the previous morpheme and the start of this one.
          // The gap spans [prev, strt); the former range (strt - 1, strt + l - 1) was
          // only correct for gaps of exactly one character
          list.add(new Morpheme(source.substring(prev, strt), KIGOU_FEATURE));
        }
        prev = srfc.length() + strt;

        // separates continuous mark (ASCII)
        boolean cont = true;
        final ArrayList<Morpheme> marks = new ArrayList<>();
        final int sl = srfc.length();
        for (int s = 0; s < sl; s++) {
          final String c = String.valueOf(srfc.charAt(s));
          final byte[] t = token(c);
          if (t.length == 1) {
            // single-byte character: letters and digits disqualify the surface
            if (letter(t[0]) || digit(t[0])) cont = false;
            else marks.add(new Morpheme(c, KIGOU_FEATURE));
          } else {
            // multi-byte character: surface is kept as a whole
            cont = false;
          }
        }

        if (cont) list.addAll(marks);
        else list.add(new Morpheme(srfc, ftr));
      }
    } catch (final Exception ex) {
      // best effort: report the problem and continue with what has been collected
      Util.errln(Util.className(this) + ": " + ex);
    }
    tokenList = list;
    tokens = list.iterator();

    return this;
  }
コード例 #2
0
ファイル: JapaneseTokenizer.java プロジェクト: fpapai/basex
  /**
   * Checks if another token is available while wildcards are enabled.
   *
   * <p>Starting at {@code cpos}, the token list is scanned and the surfaces that belong
   * to the next token are collected in a buffer. Backslashes, periods and curly braces
   * receive special treatment (presumably to keep wildcard expressions such as
   * {@code .{m,n}} and escaped characters in one token - confirm against the wildcard
   * syntax). If a token was found, it is assigned to {@code currToken}: either the
   * morpheme preceding the final {@code cpos}, if the buffer is empty, or a new morpheme
   * with the buffered text and {@code MEISHI_FEATURE}. {@code cpos} is advanced as a
   * side effect.
   *
   * @return {@code true} if a token was found
   */
  private boolean moreWC() {
    final StringBuilder buffer = new StringBuilder();
    final int count = tokenList.size();
    boolean afterPeriod = false;
    boolean escaped = false;
    boolean found = false;

    for (; cpos < count; cpos++) {
      final String current = tokenList.get(cpos).getSurface();
      final boolean currentIsMark = tokenList.get(cpos).isMark();
      final boolean hasSuccessor = cpos < count - 1;
      final String next = hasSuccessor ? tokenList.get(cpos + 1).getSurface() : null;
      final boolean nextIsMark = hasSuccessor && tokenList.get(cpos + 1).isMark();
      final boolean currentIsBackslash = "\\".equals(current);

      if (next == null) {
        // no following surface: this is the last token
        if (currentIsMark) {
          if (!currentIsBackslash) {
            if (buffer.length() != 0) {
              buffer.append(current);
            }
            found = true;
          }
          continue;
        }
      } else {
        if (currentIsBackslash) {
          escaped = true;
        }

        // delimiter: a mark that is no full-text character, or a backslash before a mark
        final boolean escapedMark = currentIsBackslash && nextIsMark;
        if (currentIsMark && !isFtChar(current) || escapedMark) {
          afterPeriod = false;
          escaped = false;
          if (buffer.length() != 0) {
            found = true;
            break;
          }
          // additionally skip the mark that follows the backslash
          if (escapedMark) {
            cpos++;
          }
          continue;
        }

        buffer.append(current);

        if (escaped || "\\".equals(next)) {
          found = true;
          continue;
        }

        if (".".equals(current) || ".".equals(next)) {
          afterPeriod = true;
          continue;
        }

        if (afterPeriod) {
          if (!"{".equals(current)) {
            continue;
          }
          // copy everything up to and including the closing brace
          cpos++;
          for (; cpos < count; cpos++) {
            final String part = tokenList.get(cpos).getSurface();
            buffer.append(part);
            if ("}".equals(part)) {
              found = true;
              break;
            }
          }
          cpos++;
          break;
        }
      }

      // regular token: finish the current word
      if (afterPeriod) {
        buffer.append(current);
      } else if (escaped) {
        if (isFtChar(current)) {
          buffer.setLength(0);
        } else {
          buffer.append(current);
        }
      }
      found = true;
      cpos++;
      break;
    }

    if (found) {
      if (buffer.length() == 0) {
        currToken = tokenList.get(cpos - 1);
      } else {
        currToken = new Morpheme(buffer.toString(), MEISHI_FEATURE);
      }
    }
    return found;
  }