示例#1
0
  public void autoDetectClues(Content content, boolean filter) {
    byte[] data = content.getContent();

    if (minConfidence >= 0
        && DETECTABLES.contains(content.getContentType())
        && data.length > MIN_LENGTH) {
      CharsetMatch[] matches = null;

      // do all these in a try/catch; setText and detect/detectAll
      // will sometimes throw exceptions
      try {
        detector.enableInputFilter(filter);
        if (data.length > MIN_LENGTH) {
          detector.setText(data);
          matches = detector.detectAll();
        }
      } catch (Exception e) {
        LOG.debug("Exception from ICU4J (ignoring): ");
        e.printStackTrace(LogUtil.getDebugStream(LOG));
      }

      if (matches != null) {
        for (CharsetMatch match : matches) {
          addClue(match.getName(), "detect", match.getConfidence());
        }
      }
    }

    // add character encoding coming from HTTP response header
    addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header");
  }
示例#2
0
  private String getContentCharset(byte[] content, Metadata metadata) {
    String charset = null;

    // check if the server specified a charset
    String specifiedContentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    try {
      if (specifiedContentType != null) {
        ContentType parsedContentType = ContentType.parse(specifiedContentType);
        charset = parsedContentType.getCharset().name();
      }
    } catch (Exception e) {
      charset = null;
    }

    // filter HTML tags
    CharsetDetector detector = new CharsetDetector();
    detector.enableInputFilter(true);
    // give it a hint
    detector.setDeclaredEncoding(charset);
    detector.setText(content);
    try {
      CharsetMatch charsetMatch = detector.detect();
      if (charsetMatch != null) {
        charset = charsetMatch.getName();
      }
    } catch (Exception e) {
      // ignore and leave the charset as-is
    }
    return charset;
  }
  @Nullable
  public static String readFileAsStringGuessEncoding(final @NonNull File file) {
    try {
      final byte[] fileData = new byte[(int) file.length()];
      final DataInputStream dataInputStream = new DataInputStream(new FileInputStream(file));
      dataInputStream.readFully(fileData);
      dataInputStream.close();

      final CharsetMatch match = new CharsetDetector().setText(fileData).detect();

      if (match != null)
        try {
          return new String(fileData, match.getName());
        } catch (UnsupportedEncodingException ignored) {
        }
      return new String(fileData);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }
  }