Example #1
0
  @Override
  public JapaneseTokenizer init(final byte[] txt) {
    String source = string(txt);
    if (wc) { // convert wide-space to space
      source = source.replace('\u3000', '\u0020');
    }
    final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source);
    final ArrayList<Morpheme> list = new ArrayList<>();
    try {
      int prev = 0;
      final int ms = morpheme.size();
      for (int i = 0; i < ms; i++) {
        final Object m = morpheme.get(i);
        final String srfc = surface.get(m).toString();
        final String ftr = feature.get(m).toString();
        final int strt = start.getInt(m);
        if (i != 0) {
          final int l = strt - prev;
          if (l != 0) {
            list.add(new Morpheme(source.substring(strt - 1, strt + l - 1), KIGOU_FEATURE));
          }
        }
        prev = srfc.length() + strt;

        // separates continuous mark (ASCII)
        boolean cont = true;
        final ArrayList<Morpheme> marks = new ArrayList<>();
        final int sl = srfc.length();
        for (int s = 0; s < sl; s++) {
          final String c = String.valueOf(srfc.charAt(s));
          final byte[] t = token(c);
          if (t.length == 1) {
            if (letter(t[0]) || digit(t[0])) cont = false;
            else marks.add(new Morpheme(c, KIGOU_FEATURE));
          } else {
            cont = false;
          }
        }

        if (cont) list.addAll(marks);
        else list.add(new Morpheme(srfc, ftr));
      }
    } catch (final Exception ex) {
      Util.errln(Util.className(this) + ": " + ex);
    }
    tokenList = list;
    tokens = list.iterator();

    return this;
  }
Example #2
0
 /**
  * Reflection invoke XMLWriter.setOutputProperty().
  *
  * @param name property
  * @param value value
  */
 private static void opt(final String name, final String value) {
   Reflect.invoke(METHOD, WRITER, name, value);
 }