Exemplo n.º 1
0
  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.err.println("Usage: EncodingDetector <file>");
      System.exit(1);
    }

    Configuration conf = NutchConfiguration.create();
    EncodingDetector detector = new EncodingDetector(NutchConfiguration.create());

    // do everything as bytes; don't want any conversion
    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]));
    ByteArrayOutputStream ostr = new ByteArrayOutputStream();
    byte[] bytes = new byte[1000];
    boolean more = true;
    while (more) {
      int len = istr.read(bytes);
      if (len < bytes.length) {
        more = false;
        if (len > 0) {
          ostr.write(bytes, 0, len);
        }
      } else {
        ostr.write(bytes);
      }
    }

    byte[] data = ostr.toByteArray();

    // make a fake Content
    Content content = new Content("", "", data, "text/html", new Metadata(), conf);

    detector.autoDetectClues(content, true);
    String encoding =
        detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
    System.out.println("Guessed encoding: " + encoding);
  }
  @Test
  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    // Metadata metadata = new Metadata();
    EncodingDetector detector;
    // Content content;
    String encoding;

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToMetadata(
        new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("utf-8", encoding.toLowerCase());
  }