Exemplo n.º 1
0
  /**
   * Guess the encoding with the previously specified list of clues.
   *
   * @param content Content instance
   * @param defaultValue Default encoding to return if no encoding can be detected with enough
   *     confidence. Note that this will <b>not</b> be normalized with {@link
   *     EncodingDetector#resolveEncodingAlias}
   * @return Guessed encoding or defaultValue
   */
  public String guessEncoding(Content content, String defaultValue) {
    /*
     * This algorithm could be replaced by something more sophisticated;
     * ideally we would gather a bunch of data on where various clues
     * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
     * the correct answer, and use machine learning/some statistical method
     * to generate a better heuristic.
     */

    String base = content.getBaseUrl();

    if (LOG.isTraceEnabled()) {
      findDisagreements(base, clues);
    }

    /*
     * Go down the list of encoding "clues". Use a clue if:
     *  1. Has a confidence value which meets our confidence threshold, OR
     *  2. Doesn't meet the threshold, but is the best try,
     *     since nothing else is available.
     */
    EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
    EncodingClue bestClue = defaultClue;

    for (EncodingClue clue : clues) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(base + ": charset " + clue);
      }
      String charset = clue.value;
      if (minConfidence >= 0 && clue.confidence >= minConfidence) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(
              base + ": Choosing encoding: " + charset + " with confidence " + clue.confidence);
        }
        return resolveEncodingAlias(charset).toLowerCase();
      } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
        bestClue = clue;
      }
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace(base + ": Choosing encoding: " + bestClue);
    }
    return bestClue.value.toLowerCase();
  }