Esempio n. 1
0
  /**
   * Selects the cleaner from {@code BASE_CLEANERS} whose base document has the smallest distance
   * to the context's document, hands the document to that cleaner, and records the chosen cleaner
   * on the context.
   *
   * @param ctx extraction context supplying the document and receiving the selected cleaner
   * @return the same {@code Document} instance obtained from {@code ctx.getDoc()}, after it has
   *     been passed to the selected cleaner
   * @throws IllegalStateException if no cleaner could be selected (for example when
   *     {@code BASE_CLEANERS} is empty)
   */
  private Document cleanHtml(ExtractorContext ctx) {

    Document originalDoc = ctx.getDoc();

    int minDistance = Integer.MAX_VALUE;
    BaseHtmlCleaner bestSuitableCleaner = null;

    for (BaseHtmlCleaner cleaner : BASE_CLEANERS) {

      int curDistance = new SitesDistanceCalculator().distance(cleaner.getBaseDoc(), originalDoc);

      // Strict comparison: on equal distances the cleaner seen first is kept.
      if (curDistance < minDistance) {
        minDistance = curDistance;
        bestSuitableCleaner = cleaner;
      }
    }

    // Without this guard an empty cleaner list surfaces as a bare NullPointerException below.
    if (bestSuitableCleaner == null) {
      throw new IllegalStateException(
          "No HTML cleaner could be selected for " + ctx.getUrl());
    }

    bestSuitableCleaner.clean(originalDoc);
    ctx.setCleaner(bestSuitableCleaner);

    return originalDoc;
  }
Esempio n. 2
0
  /**
   * Builds a {@link PageObject} for the current page and attaches it to a new extraction context.
   *
   * <p>The page is skipped, and {@code null} is returned, when any of the following holds:
   * <ul>
   *   <li>the parse data is not {@code HtmlParseData};</li>
   *   <li>the meta-tag rules in {@link #rejectedByMetaTags(ExtractorContext)} reject it;</li>
   *   <li>the cleaned text is shorter than {@code MIN_CONTENT_LENGTH} characters;</li>
   *   <li>the language detector does not classify the cleaned text as English.</li>
   * </ul>
   *
   * @return the context carrying the populated page object, or {@code null} if the page is skipped
   */
  public ExtractorContext extractContent() {

    // Non-HTML (or missing) parse data cannot be cast to HtmlParseData; skip such pages instead
    // of failing with a ClassCastException / NullPointerException.
    Object parseData = page.getParseData();
    if (!(parseData instanceof HtmlParseData)) {
      LOG.info("Skipping " + page.getWebURL().getURL() + " not HTML");
      return null;
    }
    HtmlParseData htmlParseData = (HtmlParseData) parseData;

    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());

    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    if (rejectedByMetaTags(ctx)) {
      return null;
    }

    Document clearedHtml = cleanHtml(ctx);

    // Computed once and shared by the two consecutive checks below.
    String clearedText = clearedHtml.text();

    if (clearedText.length() < MIN_CONTENT_LENGTH) {
      LOG.info("Skipping " + ctx.getUrl() + " less then " + MIN_CONTENT_LENGTH + " characters");
      return null;
    }

    // skip pages with not English content
    if (!LanguageDetector.INST.isEnglish(clearedText)) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);

    // The text is read again here (not reused from above) in case image extraction touched the
    // document held by the context; this keeps the id derived from the same state as before.
    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);

    // The first extracted block becomes the main block; the remaining ones are added as blocks.
    if (!blocks.isEmpty()) {
      pageObject.setMainBlock(blocks.get(0));
      blocks.remove(0);
      pageObject.addBlocks(blocks);
    }

    // Fall back to a text block's title when the page title is null or blank.
    // NOTE(review): the main block was removed from `blocks` before addBlocks, so if getBlocks()
    // only holds what addBlocks received, this reads the second extracted block rather than the
    // first one - confirm which block is intended.
    if ((pageObject.getTitle() == null || "".equals(pageObject.getTitle().trim()))
        && !pageObject.getBlocks().isEmpty()) {
      pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));

    if (!images.isEmpty()) {
      pageObject.addImages(images);
    }

    ctx.setPageObject(pageObject);

    return ctx;
  }

  /**
   * Applies the meta-tag based skip rules and logs the reason when a page is rejected.
   *
   * <p>A page is rejected when its content-language tag is present and, lower-cased, differs from
   * {@code ENGLISH_US}; or when its URL contains {@code YOUTUBE} and its keywords tag is missing
   * or, lower-cased, does not contain {@code SAP}.
   *
   * @param ctx context whose meta tags and URL are inspected
   * @return {@code true} if the page must be skipped, {@code false} otherwise
   */
  private boolean rejectedByMetaTags(ExtractorContext ctx) {

    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);

    // skip not english pages
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
    if (language != null && !ENGLISH_US.equals(language.toLowerCase(java.util.Locale.ROOT))) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return true;
    }

    if (!ctx.getUrl().contains(YOUTUBE)) {
      return false;
    }

    String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);

    if (keyWords == null) {
      LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
      return true;
    }

    if (!keyWords.toLowerCase(java.util.Locale.ROOT).contains(SAP)) {
      LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
      return true;
    }

    return false;
  }