@Override public JapaneseTokenizer init(final byte[] txt) { String source = string(txt); if (wc) { // convert wide-space to space source = source.replace('\u3000', '\u0020'); } final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source); final ArrayList<Morpheme> list = new ArrayList<>(); try { int prev = 0; final int ms = morpheme.size(); for (int i = 0; i < ms; i++) { final Object m = morpheme.get(i); final String srfc = surface.get(m).toString(); final String ftr = feature.get(m).toString(); final int strt = start.getInt(m); if (i != 0) { final int l = strt - prev; if (l != 0) { list.add(new Morpheme(source.substring(strt - 1, strt + l - 1), KIGOU_FEATURE)); } } prev = srfc.length() + strt; // separates continuous mark (ASCII) boolean cont = true; final ArrayList<Morpheme> marks = new ArrayList<>(); final int sl = srfc.length(); for (int s = 0; s < sl; s++) { final String c = String.valueOf(srfc.charAt(s)); final byte[] t = token(c); if (t.length == 1) { if (letter(t[0]) || digit(t[0])) cont = false; else marks.add(new Morpheme(c, KIGOU_FEATURE)); } else { cont = false; } } if (cont) list.addAll(marks); else list.add(new Morpheme(srfc, ftr)); } } catch (final Exception ex) { Util.errln(Util.className(this) + ": " + ex); } tokenList = list; tokens = list.iterator(); return this; }
/** * Converts an HTML document to XML. * * @param io io reference * @param opts html options * @return parser * @throws IOException I/O exception */ private static IO toXML(final IO io, final HtmlOptions opts) throws IOException { // reader could not be initialized; fall back to XML if (READER == null) return io; try { // tries to extract the encoding from the input final TextInput ti = new TextInput(io); String enc = ti.encoding(); final byte[] content = ti.content(); // looks for a charset definition final byte[] encoding = token("charset="); int cs = indexOf(content, encoding); if (cs > 0) { // extracts the encoding string cs += encoding.length; int ce = cs; final int cl = content.length; while (++ce < cl && content[ce] > 0x28) ; enc = string(substring(content, cs, ce)); } // define input final InputSource is = new InputSource(new ArrayInput(content)); is.setEncoding(supported(enc) ? normEncoding(enc) : UTF8); // define output final StringWriter sw = new StringWriter(); final XMLReader reader = (XMLReader) Reflect.get(READER); final Object writer = Reflect.get(WRITER, sw); // set TagSoup options if (opts.get(HtmlOptions.HTML)) { reader.setFeature("http://xml.org/sax/features/namespaces", false); opt("method", "html"); opt("omit-xml-declaration", "yes"); } if (opts.get(HtmlOptions.NONS)) reader.setFeature("http://xml.org/sax/features/namespaces", false); if (opts.get(HtmlOptions.OMITXML)) opt("omit-xml-declaration", "yes"); if (opts.get(HtmlOptions.NOBOGONS)) reader.setFeature(FEATURES + "ignore-bogons", true); if (opts.get(HtmlOptions.NODEFAULTS)) reader.setFeature(FEATURES + "default-attributes", false); if (opts.get(HtmlOptions.NOCOLONS)) reader.setFeature(FEATURES + "translate-colons", true); if (opts.get(HtmlOptions.NORESTART)) reader.setFeature(FEATURES + "restart-elements", false); if (opts.get(HtmlOptions.IGNORABLE)) reader.setFeature(FEATURES + "ignorable-whitespace", true); if (opts.get(HtmlOptions.EMPTYBOGONS)) reader.setFeature(FEATURES + "bogons-empty", true); if (opts.get(HtmlOptions.ANY)) reader.setFeature(FEATURES + "bogons-empty", false); if (opts.get(HtmlOptions.NOROOTBOGONS)) reader.setFeature(FEATURES + "root-bogons", false); if (opts.get(HtmlOptions.NOCDATA)) reader.setFeature(FEATURES + "cdata-elements", false); if (opts.get(HtmlOptions.LEXICAL)) reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer); if (opts.contains(HtmlOptions.METHOD)) opt("method", opts.get(HtmlOptions.METHOD)); if (opts.contains(HtmlOptions.DOCTYPESYS)) opt("doctype-system", opts.get(HtmlOptions.DOCTYPESYS)); if (opts.contains(HtmlOptions.DOCTYPEPUB)) opt("doctype-public", opts.get(HtmlOptions.DOCTYPEPUB)); if (opts.contains(HtmlOptions.ENCODING)) is.setEncoding(opts.get(HtmlOptions.ENCODING)); // end TagSoup options reader.setContentHandler((ContentHandler) writer); reader.parse(is); return new IOContent(token(sw.toString()), io.name()); } catch (final SAXException ex) { Util.errln(ex); return io; } }
/** Writes the properties to disk. */ public final synchronized void write() { final StringBuilder user = new StringBuilder(); BufferedReader br = null; try { // caches options specified by the user if (file.exists()) { br = new BufferedReader(new FileReader(file.file())); for (String line; (line = br.readLine()) != null; ) { if (line.equals(PROPUSER)) break; } for (String line; (line = br.readLine()) != null; ) { user.append(line).append(NL); } } } catch (final Exception ex) { Util.debug(ex); } finally { if (br != null) try { br.close(); } catch (final IOException e) { } } BufferedWriter bw = null; try { bw = new BufferedWriter(new FileWriter(file.file())); bw.write(PROPHEADER + NL); for (final Field f : getClass().getFields()) { final Object obj = f.get(null); if (!(obj instanceof Object[])) continue; final String key = ((Object[]) obj)[0].toString(); final Object val = props.get(key); if (val instanceof String[]) { final String[] str = (String[]) val; bw.write(key + " = " + str.length + NL); final int is = str.length; for (int i = 0; i < is; ++i) { if (str[i] != null) bw.write(key + (i + 1) + " = " + str[i] + NL); } } else if (val instanceof int[]) { final int[] num = (int[]) val; final int ns = num.length; for (int i = 0; i < ns; ++i) { bw.write(key + i + " = " + num[i] + NL); } } else { bw.write(key + " = " + val + NL); } } bw.write(NL + PROPUSER + NL); bw.write(user.toString()); } catch (final Exception ex) { Util.errln("% could not be written.", file); Util.debug(ex); } finally { if (bw != null) try { bw.close(); } catch (final IOException e) { } } }