public static void main(String[] args) throws IOException { if (args.length != 1) { System.err.println("Usage: EncodingDetector <file>"); System.exit(1); } Configuration conf = NutchConfiguration.create(); EncodingDetector detector = new EncodingDetector(NutchConfiguration.create()); // do everything as bytes; don't want any conversion BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0])); ByteArrayOutputStream ostr = new ByteArrayOutputStream(); byte[] bytes = new byte[1000]; boolean more = true; while (more) { int len = istr.read(bytes); if (len < bytes.length) { more = false; if (len > 0) { ostr.write(bytes, 0, len); } } else { ostr.write(bytes); } } byte[] data = ostr.toByteArray(); // make a fake Content Content content = new Content("", "", data, "text/html", new Metadata(), conf); detector.autoDetectClues(content, true); String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default")); System.out.println("Guessed encoding: " + encoding); }
/**
 * Exercises {@code EncodingDetector.guessEncoding} on a text/plain page in
 * four situations: no clue at all, a charset in the Content-Type header, an
 * explicitly added "sniffed" clue, and finally with auto-detection enabled.
 *
 * <p>Encoding names are lower-cased with {@code Locale.ROOT} before being
 * compared so that the outcome does not depend on the JVM's default locale.
 */
@Test
public void testGuessing() {
  // first disable auto detection
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

  EncodingDetector detector;
  String encoding;

  WebPage page = newPlainTextPage();
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(page, true);
  encoding = detector.guessEncoding(page, "windows-1252");
  // no information is available, so it should return default encoding
  assertEquals("windows-1252", encoding.toLowerCase(java.util.Locale.ROOT));

  // a charset declared in the Content-Type header is expected to win over
  // the default
  page = newPlainTextPage();
  page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8,
      new Utf8("text/plain; charset=UTF-16"));
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(page, true);
  encoding = detector.guessEncoding(page, "windows-1252");
  assertEquals("utf-16", encoding.toLowerCase(java.util.Locale.ROOT));

  // a clue added by hand after auto-detection is expected to win over the
  // default
  page = newPlainTextPage();
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(page, true);
  detector.addClue("windows-1254", "sniffed");
  encoding = detector.guessEncoding(page, "windows-1252");
  assertEquals("windows-1254", encoding.toLowerCase(java.util.Locale.ROOT));

  // enable autodetection
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
  page = newPlainTextPage();
  // Encode the metadata value with an explicit charset rather than the
  // platform default.
  page.putToMetadata(
      new Utf8(Response.CONTENT_TYPE),
      ByteBuffer.wrap("text/plain; charset=UTF-16"
          .getBytes(java.nio.charset.Charset.forName("UTF-8"))));
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(page, true);
  detector.addClue("utf-32", "sniffed");
  encoding = detector.guessEncoding(page, "windows-1252");
  // the test expects utf-8 here despite the UTF-16 metadata value and the
  // utf-32 clue (presumably the auto-detected result for contentInOctets;
  // verify against the fixture)
  assertEquals("utf-8", encoding.toLowerCase(java.util.Locale.ROOT));
}

/**
 * Builds a fresh page with base URL http://www.example.com/, content type
 * text/plain and a content buffer wrapping {@code contentInOctets}.
 */
private WebPage newPlainTextPage() {
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8("http://www.example.com/"));
  page.setContentType(new Utf8("text/plain"));
  page.setContent(ByteBuffer.wrap(contentInOctets));
  return page;
}