Example #1
0
  @Override
  public JapaneseTokenizer init(final byte[] txt) {
    String source = string(txt);
    if (wc) { // convert wide-space to space
      source = source.replace('\u3000', '\u0020');
    }
    final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source);
    final ArrayList<Morpheme> list = new ArrayList<>();
    try {
      int prev = 0;
      final int ms = morpheme.size();
      for (int i = 0; i < ms; i++) {
        final Object m = morpheme.get(i);
        final String srfc = surface.get(m).toString();
        final String ftr = feature.get(m).toString();
        final int strt = start.getInt(m);
        if (i != 0) {
          final int l = strt - prev;
          if (l != 0) {
            list.add(new Morpheme(source.substring(strt - 1, strt + l - 1), KIGOU_FEATURE));
          }
        }
        prev = srfc.length() + strt;

        // separates continuous mark (ASCII)
        boolean cont = true;
        final ArrayList<Morpheme> marks = new ArrayList<>();
        final int sl = srfc.length();
        for (int s = 0; s < sl; s++) {
          final String c = String.valueOf(srfc.charAt(s));
          final byte[] t = token(c);
          if (t.length == 1) {
            if (letter(t[0]) || digit(t[0])) cont = false;
            else marks.add(new Morpheme(c, KIGOU_FEATURE));
          } else {
            cont = false;
          }
        }

        if (cont) list.addAll(marks);
        else list.add(new Morpheme(srfc, ftr));
      }
    } catch (final Exception ex) {
      Util.errln(Util.className(this) + ": " + ex);
    }
    tokenList = list;
    tokens = list.iterator();

    return this;
  }
Example #2
0
  /**
   * Converts an HTML document to XML.
   *
   * @param io io reference
   * @param opts html options
   * @return parser
   * @throws IOException I/O exception
   */
  private static IO toXML(final IO io, final HtmlOptions opts) throws IOException {
    // reader could not be initialized; fall back to XML
    if (READER == null) return io;

    try {
      // tries to extract the encoding from the input
      final TextInput ti = new TextInput(io);
      String enc = ti.encoding();
      final byte[] content = ti.content();

      // looks for a charset definition
      final byte[] encoding = token("charset=");
      int cs = indexOf(content, encoding);
      if (cs > 0) {
        // extracts the encoding string
        cs += encoding.length;
        int ce = cs;
        final int cl = content.length;
        while (++ce < cl && content[ce] > 0x28) ;
        enc = string(substring(content, cs, ce));
      }

      // define input
      final InputSource is = new InputSource(new ArrayInput(content));
      is.setEncoding(supported(enc) ? normEncoding(enc) : UTF8);
      // define output
      final StringWriter sw = new StringWriter();
      final XMLReader reader = (XMLReader) Reflect.get(READER);
      final Object writer = Reflect.get(WRITER, sw);

      // set TagSoup options
      if (opts.get(HtmlOptions.HTML)) {
        reader.setFeature("http://xml.org/sax/features/namespaces", false);
        opt("method", "html");
        opt("omit-xml-declaration", "yes");
      }
      if (opts.get(HtmlOptions.NONS))
        reader.setFeature("http://xml.org/sax/features/namespaces", false);
      if (opts.get(HtmlOptions.OMITXML)) opt("omit-xml-declaration", "yes");
      if (opts.get(HtmlOptions.NOBOGONS)) reader.setFeature(FEATURES + "ignore-bogons", true);
      if (opts.get(HtmlOptions.NODEFAULTS))
        reader.setFeature(FEATURES + "default-attributes", false);
      if (opts.get(HtmlOptions.NOCOLONS)) reader.setFeature(FEATURES + "translate-colons", true);
      if (opts.get(HtmlOptions.NORESTART)) reader.setFeature(FEATURES + "restart-elements", false);
      if (opts.get(HtmlOptions.IGNORABLE))
        reader.setFeature(FEATURES + "ignorable-whitespace", true);
      if (opts.get(HtmlOptions.EMPTYBOGONS)) reader.setFeature(FEATURES + "bogons-empty", true);
      if (opts.get(HtmlOptions.ANY)) reader.setFeature(FEATURES + "bogons-empty", false);
      if (opts.get(HtmlOptions.NOROOTBOGONS)) reader.setFeature(FEATURES + "root-bogons", false);
      if (opts.get(HtmlOptions.NOCDATA)) reader.setFeature(FEATURES + "cdata-elements", false);
      if (opts.get(HtmlOptions.LEXICAL))
        reader.setProperty("http://xml.org/sax/properties/lexical-handler", writer);
      if (opts.contains(HtmlOptions.METHOD)) opt("method", opts.get(HtmlOptions.METHOD));
      if (opts.contains(HtmlOptions.DOCTYPESYS))
        opt("doctype-system", opts.get(HtmlOptions.DOCTYPESYS));
      if (opts.contains(HtmlOptions.DOCTYPEPUB))
        opt("doctype-public", opts.get(HtmlOptions.DOCTYPEPUB));
      if (opts.contains(HtmlOptions.ENCODING)) is.setEncoding(opts.get(HtmlOptions.ENCODING));
      // end TagSoup options

      reader.setContentHandler((ContentHandler) writer);
      reader.parse(is);
      return new IOContent(token(sw.toString()), io.name());

    } catch (final SAXException ex) {
      Util.errln(ex);
      return io;
    }
  }
Example #3
0
  /** Writes the properties to disk. */
  public final synchronized void write() {
    final StringBuilder user = new StringBuilder();
    BufferedReader br = null;
    try {
      // caches options specified by the user
      if (file.exists()) {
        br = new BufferedReader(new FileReader(file.file()));
        for (String line; (line = br.readLine()) != null; ) {
          if (line.equals(PROPUSER)) break;
        }
        for (String line; (line = br.readLine()) != null; ) {
          user.append(line).append(NL);
        }
      }
    } catch (final Exception ex) {
      Util.debug(ex);
    } finally {
      if (br != null)
        try {
          br.close();
        } catch (final IOException e) {
        }
    }

    BufferedWriter bw = null;
    try {
      bw = new BufferedWriter(new FileWriter(file.file()));
      bw.write(PROPHEADER + NL);

      for (final Field f : getClass().getFields()) {
        final Object obj = f.get(null);
        if (!(obj instanceof Object[])) continue;
        final String key = ((Object[]) obj)[0].toString();

        final Object val = props.get(key);
        if (val instanceof String[]) {
          final String[] str = (String[]) val;
          bw.write(key + " = " + str.length + NL);
          final int is = str.length;
          for (int i = 0; i < is; ++i) {
            if (str[i] != null) bw.write(key + (i + 1) + " = " + str[i] + NL);
          }
        } else if (val instanceof int[]) {
          final int[] num = (int[]) val;
          final int ns = num.length;
          for (int i = 0; i < ns; ++i) {
            bw.write(key + i + " = " + num[i] + NL);
          }
        } else {
          bw.write(key + " = " + val + NL);
        }
      }
      bw.write(NL + PROPUSER + NL);
      bw.write(user.toString());
    } catch (final Exception ex) {
      Util.errln("% could not be written.", file);
      Util.debug(ex);
    } finally {
      if (bw != null)
        try {
          bw.close();
        } catch (final IOException e) {
        }
    }
  }