/** * auto-detect the charset of a file used code from http://jchardet.sourceforge.net/; see also: * http://www-archive.mozilla.org/projects/intl/chardet.html * * @param file * @return a set of probable charsets * @throws IOException */ public static Set<String> detectCharset(File file) throws IOException { // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: // http://www-archive.mozilla.org/projects/intl/chardet.html nsDetector det = new nsDetector(nsPSMDetector.ALL); BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file)); byte[] buf = new byte[1024]; int len; boolean done = false; boolean isAscii = true; while ((len = imp.read(buf, 0, buf.length)) != -1) { if (isAscii) isAscii = det.isAscii(buf, len); if (!isAscii && !done) done = det.DoIt(buf, len, false); } det.DataEnd(); Set<String> result = new HashSet<>(); if (isAscii) { result.add("ASCII"); } else { for (String c : det.getProbableCharsets()) result.add(c); } return result; }
/**
 * Detects the charset of a file using this instance's detector.
 *
 * <p>The file is read in 2 KiB chunks. As long as every chunk seen so far is reported as ASCII
 * by the detector, no statistical detection is run; once a non-ASCII chunk is seen, data is fed
 * to the detector until it reports that it is done. The detector announces its result through
 * the observer registered here, which stores it in the {@code charset} and {@code found} fields.
 *
 * @param file the file to inspect
 * @return {@code "us-ascii"} if every chunk read was ASCII (this includes an empty file), the
 *     charset name notified by the detector otherwise, or {@code null} if nothing was notified
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading the file fails
 */
public String detectFileCharset(File file) throws FileNotFoundException, IOException {
    // Clear any result left over from a previous call so that a stale charset is never
    // returned for this file.
    // NOTE(review): the detector object itself is reused as-is; confirm whether it also needs
    // resetting between files.
    found = false;
    charset = null;

    detector.Init(
        new nsICharsetDetectionObserver() {
            @Override
            public void Notify(String arg0) {
                found = true;
                charset = arg0;
            }
        });

    boolean isAscii = true;

    // try-with-resources: the stream is closed even when read() throws.
    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(file))) {
        byte[] buffer = new byte[2048];
        boolean done = false;
        int length;
        while ((length = stream.read(buffer, 0, buffer.length)) != -1) {
            if (isAscii) {
                isAscii = detector.isAscii(buffer, length);
            }
            if (!isAscii) {
                done = detector.DoIt(buffer, length, false);
                if (done) {
                    // Nothing more would be fed to the detector; skip reading the rest.
                    break;
                }
            }
        }
        detector.DataEnd();
    }

    if (isAscii) {
        charset = "us-ascii";
        found = true;
    }
    if (!found) {
        return null;
    }
    return charset;
}