@Override public JapaneseTokenizer init(final byte[] txt) { String source = string(txt); if (wc) { // convert wide-space to space source = source.replace('\u3000', '\u0020'); } final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source); final ArrayList<Morpheme> list = new ArrayList<>(); try { int prev = 0; final int ms = morpheme.size(); for (int i = 0; i < ms; i++) { final Object m = morpheme.get(i); final String srfc = surface.get(m).toString(); final String ftr = feature.get(m).toString(); final int strt = start.getInt(m); if (i != 0) { final int l = strt - prev; if (l != 0) { list.add(new Morpheme(source.substring(strt - 1, strt + l - 1), KIGOU_FEATURE)); } } prev = srfc.length() + strt; // separates continuous mark (ASCII) boolean cont = true; final ArrayList<Morpheme> marks = new ArrayList<>(); final int sl = srfc.length(); for (int s = 0; s < sl; s++) { final String c = String.valueOf(srfc.charAt(s)); final byte[] t = token(c); if (t.length == 1) { if (letter(t[0]) || digit(t[0])) cont = false; else marks.add(new Morpheme(c, KIGOU_FEATURE)); } else { cont = false; } } if (cont) list.addAll(marks); else list.add(new Morpheme(srfc, ftr)); } } catch (final Exception ex) { Util.errln(Util.className(this) + ": " + ex); } tokenList = list; tokens = list.iterator(); return this; }
/** * Returns whether the following token exists (using wildcards). * * @return result of check */ private boolean moreWC() { final StringBuilder word = new StringBuilder(); final int size = tokenList.size(); boolean period = false, bs = false, more = false; for (; cpos < size; cpos++) { String cSrfc = tokenList.get(cpos).getSurface(); final boolean cMark = tokenList.get(cpos).isMark(); String nSrfc = null; boolean nMark = false; if (cpos < size - 1) { nSrfc = tokenList.get(cpos + 1).getSurface(); nMark = tokenList.get(cpos + 1).isMark(); } if (nSrfc != null) { if ("\\".equals(cSrfc)) bs = true; // delimiter if (cMark && !isFtChar(cSrfc) || "\\".equals(cSrfc) && nMark) { period = false; bs = false; if (word.length() != 0) { more = true; break; } if ("\\".equals(cSrfc) && nMark) cpos++; continue; } word.append(cSrfc); if (bs || "\\".equals(nSrfc)) { more = true; continue; } if (".".equals(cSrfc) || ".".equals(nSrfc)) { period = true; continue; } if (period) { if ("{".equals(cSrfc)) { cpos++; for (; cpos < size; cpos++) { cSrfc = tokenList.get(cpos).getSurface(); word.append(cSrfc); if ("}".equals(cSrfc)) { more = true; break; } } cpos++; break; } continue; } } else { // last token. if (cMark) { if ("\\".equals(cSrfc)) continue; if (word.length() != 0) { word.append(cSrfc); } more = true; continue; } } if (period) { word.append(cSrfc); } else { if (bs) if (!isFtChar(cSrfc)) word.append(cSrfc); else word.setLength(0); } more = true; cpos++; break; } if (more) { currToken = word.length() == 0 ? tokenList.get(cpos - 1) : new Morpheme(word.toString(), MEISHI_FEATURE); } return more; }