示例#1
0
  /**
   * Detects the probable character encodings of a file. Uses code from
   * http://jchardet.sourceforge.net/; see also
   * http://www-archive.mozilla.org/projects/intl/chardet.html
   *
   * <p>The file is read in 1 KiB chunks. Each chunk is first run through the detector's ASCII
   * check; from the first chunk that fails that check onwards, chunks are fed to the detector
   * until it reports that it is done or the file ends.
   *
   * @param file the file whose bytes are examined
   * @return a set holding only {@code "ASCII"} when every chunk read passed the ASCII check (this
   *     includes an empty file); otherwise the charsets the detector reports as probable
   * @throws IOException if the file cannot be opened or read
   */
  public static Set<String> detectCharset(File file) throws IOException {
    nsDetector det = new nsDetector(nsPSMDetector.ALL);

    boolean done = false;
    boolean isAscii = true;

    // try-with-resources guarantees the file handle is released even when read() throws.
    try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
      byte[] buf = new byte[1024];
      int len;
      // Once the detector is done, neither statement in the loop body would run again,
      // so there is no point in reading the remainder of the file.
      while (!done && (len = in.read(buf, 0, buf.length)) != -1) {
        if (isAscii) {
          isAscii = det.isAscii(buf, len);
        }
        if (!isAscii) {
          done = det.DoIt(buf, len, false);
        }
      }
    }
    // Signal end of input to the detector before asking for its candidates.
    det.DataEnd();

    Set<String> result = new HashSet<>();
    if (isAscii) {
      result.add("ASCII");
    } else {
      for (String candidate : det.getProbableCharsets()) {
        result.add(candidate);
      }
    }
    return result;
  }
示例#2
0
  /**
   * Detects the character encoding of a file with the {@code detector} field.
   *
   * <p>An observer is registered on the detector that stores the reported charset name in the
   * {@code charset} field and sets the {@code found} field. The file is then read in 2 KiB chunks:
   * each chunk is first run through the detector's ASCII check, and from the first chunk that
   * fails that check onwards chunks are fed to the detector until it reports that it is done or
   * the file ends.
   *
   * <p>NOTE(review): {@code found} and {@code charset} are not reset in this method, so a result
   * left over from an earlier call may be returned when the current run reports nothing. Confirm
   * whether callers reset this state (and the detector) between calls.
   *
   * @param file the file whose bytes are examined
   * @return {@code "us-ascii"} when every chunk read passed the ASCII check (this includes an
   *     empty file); otherwise the value of the {@code charset} field if {@code found} is set, or
   *     {@code null} if it is not
   * @throws FileNotFoundException if the file cannot be opened
   * @throws IOException if reading the file fails
   */
  public String detectFileCharset(File file) throws FileNotFoundException, IOException {

    detector.Init(
        new nsICharsetDetectionObserver() {

          @Override
          public void Notify(String arg0) {
            found = true;
            charset = arg0;
          }
        });

    boolean done = false;
    boolean isAscii = true;

    // try-with-resources guarantees the file handle is released even when read() throws.
    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file))) {
      byte[] buffer = new byte[2048];
      int length;

      // Once the detector is done, neither statement in the loop body would run again,
      // so there is no point in reading the remainder of the file.
      while (!done && (length = stream.read(buffer, 0, buffer.length)) != -1) {
        if (isAscii) {
          isAscii = detector.isAscii(buffer, length);
        }

        if (!isAscii) {
          done = detector.DoIt(buffer, length, false);
        }
      }
    }

    // Signal end of input to the detector.
    detector.DataEnd();

    if (isAscii) {
      charset = "us-ascii";
      found = true;
    }

    if (!found) {
      return null;
    }

    return charset;
  }