コード例 #1
0
ファイル: EncodingDetector.java プロジェクト: samhooyu/nutch
  /**
   * Collects charset clues for the given content from two sources: detection over
   * the raw bytes and the HTTP {@code Content-Type} response header.
   *
   * <p>Byte-level detection only runs when {@code minConfidence} is non-negative,
   * the content type is listed in {@code DETECTABLES}, and more than
   * {@code MIN_LENGTH} bytes are available. Every match reported by the detector is
   * recorded through {@code addClue} with the source {@code "detect"} and the match's
   * confidence. The header-derived clue is always added afterwards with the source
   * {@code "header"}, whether or not detection ran or succeeded.
   *
   * @param content the content whose bytes, content type and metadata are inspected
   * @param filter  value handed to the detector's {@code enableInputFilter} before
   *                detection runs
   */
  public void autoDetectClues(Content content, boolean filter) {
    byte[] data = content.getContent();

    // data != null keeps a content object without bytes from aborting the whole
    // method; the header clue below is still useful in that case
    if (minConfidence >= 0
        && DETECTABLES.contains(content.getContentType())
        && data != null
        && data.length > MIN_LENGTH) {
      CharsetMatch[] matches = null;

      // do all these in a try/catch; setText and detect/detectAll
      // will sometimes throw exceptions. Detection is best-effort, so a
      // failure is only logged and the header clue is used on its own.
      try {
        detector.enableInputFilter(filter);
        detector.setText(data);
        matches = detector.detectAll();
      } catch (Exception e) {
        // pass the throwable to the logger so message and stack trace stay together
        LOG.debug("Exception from ICU4J (ignoring): ", e);
      }

      if (matches != null) {
        for (CharsetMatch match : matches) {
          addClue(match.getName(), "detect", match.getConfidence());
        }
      }
    }

    // add character encoding coming from HTTP response header
    addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header");
  }
コード例 #2
0
  /**
   * Works out the charset name for a fetched document.
   *
   * <p>The charset declared in the {@code Content-Type} header (if any) is read
   * first and handed to the detector as a hint. The detector then analyses the
   * bytes with input filtering enabled; when it produces a match, that match's name
   * replaces the header value. If detection yields nothing or throws, the header
   * value is returned unchanged.
   *
   * @param content  raw bytes of the document; when {@code null}, detection is
   *                 skipped and only the header value is used
   * @param metadata metadata from which the {@code Content-Type} header is read
   * @return the detected charset name, otherwise the charset declared in the header,
   *         or {@code null} when neither is available
   */
  private String getContentCharset(byte[] content, Metadata metadata) {
    String charset = null;

    // check if the server specified a charset
    String specifiedContentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (specifiedContentType != null) {
      try {
        ContentType parsedContentType = ContentType.parse(specifiedContentType);
        // a header without a charset parameter yields no Charset; test for that
        // explicitly instead of relying on a caught NullPointerException
        if (parsedContentType.getCharset() != null) {
          charset = parsedContentType.getCharset().name();
        }
      } catch (Exception e) {
        // header could not be parsed or names an unusable charset:
        // carry on without a hint
        charset = null;
      }
    }

    // nothing to analyse: keep whatever the header declared
    if (content == null) {
      return charset;
    }

    // filter HTML tags
    CharsetDetector detector = new CharsetDetector();
    detector.enableInputFilter(true);
    // give it a hint
    detector.setDeclaredEncoding(charset);
    detector.setText(content);
    try {
      CharsetMatch charsetMatch = detector.detect();
      if (charsetMatch != null) {
        charset = charsetMatch.getName();
      }
    } catch (Exception e) {
      // ignore and leave the charset as-is
    }
    return charset;
  }