Пример #1
0
  /**
   * Cleans the document held by the given context using the cleaner from
   * {@code BASE_CLEANERS} whose base document is closest to it.
   *
   * <p>Each cleaner's base document is compared with the context's document via
   * {@code SitesDistanceCalculator#distance}; the cleaner with the smallest distance is
   * selected (the earliest one in iteration order wins ties), applied to the document and
   * recorded on the context via {@code ctx.setCleaner}.
   *
   * @param ctx extraction context supplying the document and receiving the chosen cleaner
   * @return the same {@code Document} instance obtained from {@code ctx.getDoc()}, after the
   *     selected cleaner's {@code clean} has been invoked on it
   * @throws IllegalStateException if {@code BASE_CLEANERS} contains no cleaners
   */
  private Document cleanHtml(ExtractorContext ctx) {
    Document originalDoc = ctx.getDoc();

    int minDistance = Integer.MAX_VALUE;
    BaseHtmlCleaner bestSuitableCleaner = null;

    for (BaseHtmlCleaner cleaner : BASE_CLEANERS) {
      // NOTE(review): a fresh calculator is created per comparison, as in the original;
      // it could be hoisted out of the loop if SitesDistanceCalculator is stateless.
      int curDistance = new SitesDistanceCalculator().distance(cleaner.getBaseDoc(), originalDoc);

      // Always accept the first candidate so that a distance equal to Integer.MAX_VALUE
      // cannot leave us without a cleaner; afterwards only a strictly smaller distance wins.
      if (bestSuitableCleaner == null || curDistance < minDistance) {
        minDistance = curDistance;
        bestSuitableCleaner = cleaner;
      }
    }

    if (bestSuitableCleaner == null) {
      // Only reachable when BASE_CLEANERS is empty; fail with a clear message instead of
      // an anonymous NullPointerException on the clean() call below.
      throw new IllegalStateException("No HTML cleaners are registered in BASE_CLEANERS");
    }

    // The return value of clean(), if any, is ignored; the document is presumably
    // modified in place — confirm against BaseHtmlCleaner.
    bestSuitableCleaner.clean(originalDoc);
    ctx.setCleaner(bestSuitableCleaner);

    return originalDoc;
  }