Пример #1
0
  @Override
  public JapaneseTokenizer init(final byte[] txt) {
    String source = string(txt);
    if (wc) { // convert wide-space to space
      source = source.replace('\u3000', '\u0020');
    }
    final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source);
    final ArrayList<Morpheme> list = new ArrayList<>();
    try {
      int prev = 0;
      final int ms = morpheme.size();
      for (int i = 0; i < ms; i++) {
        final Object m = morpheme.get(i);
        final String srfc = surface.get(m).toString();
        final String ftr = feature.get(m).toString();
        final int strt = start.getInt(m);
        if (i != 0) {
          final int l = strt - prev;
          if (l != 0) {
            list.add(new Morpheme(source.substring(strt - 1, strt + l - 1), KIGOU_FEATURE));
          }
        }
        prev = srfc.length() + strt;

        // separates continuous mark (ASCII)
        boolean cont = true;
        final ArrayList<Morpheme> marks = new ArrayList<>();
        final int sl = srfc.length();
        for (int s = 0; s < sl; s++) {
          final String c = String.valueOf(srfc.charAt(s));
          final byte[] t = token(c);
          if (t.length == 1) {
            if (letter(t[0]) || digit(t[0])) cont = false;
            else marks.add(new Morpheme(c, KIGOU_FEATURE));
          } else {
            cont = false;
          }
        }

        if (cont) list.addAll(marks);
        else list.add(new Morpheme(srfc, ftr));
      }
    } catch (final Exception ex) {
      Util.errln(Util.className(this) + ": " + ex);
    }
    tokenList = list;
    tokens = list.iterator();

    return this;
  }
Пример #2
0
  static {
    IOFile dic = null;
    if (Reflect.available(PATTERN)) {
      dic = new IOFile(LANG);
      if (!dic.exists()) {
        dic = new IOFile(Prop.HOME, "etc/" + LANG);
        if (!dic.exists()) {
          available = false;
        }
      }
    } else {
      available = false;
    }

    if (available) {
      Class<?> clz = Reflect.find(PATTERN);
      if (clz == null) {
        Util.debug("Could not initialize Igo Japanese lexer.");
      } else {
        /* Igo constructor. */
        final Constructor<?> tgr = Reflect.find(clz, String.class);
        tagger = Reflect.get(tgr, dic.path());
        if (tagger == null) {
          available = false;
          Util.debug("Could not initialize Igo Japanese lexer.");
        } else {
          parse = Reflect.method(clz, "parse", CharSequence.class);
          if (parse == null) {
            Util.debug("Could not initialize Igo lexer method.");
          }
          clz = Reflect.find("net.reduls.igo.Morpheme");
          surface = Reflect.field(clz, "surface");
          feature = Reflect.field(clz, "feature");
          start = Reflect.field(clz, "start");
        }
      }
    }
  }