/**
   * Encodes the given image into an in-memory byte array and wraps it in an
   * {@link ExtractedImage} whose id is the SHA-1 of the image URL string.
   *
   * @param imageUrl location the image was loaded from; its string form is used both to pick
   *     the output format and to derive the image id
   * @param planarImage decoded image to serialize
   * @param priority priority value passed through to the created {@link ExtractedImage}
   * @return the encoded image together with its id and priority
   * @throws IOException if encoding fails, or if no image writer is available for the
   *     extension derived from the URL
   */
  private ExtractedImage createImageObject(URL imageUrl, PlanarImage planarImage, int priority)
      throws IOException {
    // The URL string is needed twice (format lookup and id), so build it once.
    String imageLocation = imageUrl.toString();

    // NOTE(review): "jpg" is presumably the fallback used when the URL carries no extension —
    // confirm against FileUtils.extractExtension.
    String extension = FileUtils.extractExtension(imageLocation, "jpg");

    ByteArrayOutputStream imagesBytes = new ByteArrayOutputStream();

    // ImageIO.write reports "no appropriate writer found" by returning false rather than
    // throwing; ignoring that would hand back an image with an empty payload.
    if (!ImageIO.write(planarImage, extension, imagesBytes)) {
      throw new IOException(
          "No image writer available for format '" + extension + "' (" + imageLocation + ")");
    }

    return new ExtractedImage(
        CryptographyUtils.sha1(imageLocation), imagesBytes.toByteArray(), priority);
  }
Пример #2
0
  /**
   * Runs the extraction pipeline for the current {@code page} and returns the populated
   * context, or {@code null} when the page is filtered out.
   *
   * <p>A page is skipped (with an INFO log line) when any of the following holds:
   * <ul>
   *   <li>its parse data is not {@link HtmlParseData};</li>
   *   <li>it declares a content-language meta tag that is not {@code ENGLISH_US};</li>
   *   <li>its URL contains {@code YOUTUBE} and the keywords meta tag is missing or does not
   *       contain {@code SAP};</li>
   *   <li>the cleaned text is shorter than {@code MIN_CONTENT_LENGTH} characters;</li>
   *   <li>the language detector does not classify the cleaned text as English.</li>
   * </ul>
   *
   * @return the context with its {@link PageObject} set, or {@code null} if the page was skipped
   */
  public ExtractorContext extractContent() {

    // Guard the cast: a page without HTML parse data would otherwise fail with a
    // ClassCastException / NullPointerException instead of being skipped like other rejects.
    Object parseData = page.getParseData();
    if (!(parseData instanceof HtmlParseData)) {
      LOG.info("Skipping " + page.getWebURL().getURL() + " not HTML");
      return null;
    }
    HtmlParseData htmlParseData = (HtmlParseData) parseData;

    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());

    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);

    // Skip pages that declare a non-English language. A missing tag is not a reason to skip;
    // the content-based language check further down still applies.
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
    if (language != null && !ENGLISH_US.equals(language.toLowerCase(java.util.Locale.ROOT))) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    // YouTube pages are only kept when their keywords meta tag mentions SAP.
    if (ctx.getUrl().contains(YOUTUBE)) {

      String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);

      if (keyWords == null) {
        LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
        return null;
      }

      keyWords = keyWords.toLowerCase(java.util.Locale.ROOT);

      if (!keyWords.contains(SAP)) {
        LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
        return null;
      }
    }

    Document clearedHtml = cleanHtml(ctx);

    // Extract the text once for the two consecutive content checks below.
    String clearedText = clearedHtml.text();

    if (clearedText.length() < MIN_CONTENT_LENGTH) {
      LOG.info("Skipping " + ctx.getUrl() + " less then " + MIN_CONTENT_LENGTH + " characters");
      return null;
    }

    // Skip pages whose actual content is not English.
    if (!LanguageDetector.INST.isEnglish(clearedText)) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);

    // The page id is the SHA-1 of the cleaned text. The text is re-read here rather than
    // reusing clearedText because image extraction ran in between and it is not visible from
    // this method whether it touches the document.
    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);

    // The first extracted block becomes the main block; the remaining ones are added as
    // regular blocks.
    if (!blocks.isEmpty()) {
      pageObject.setMainBlock(blocks.get(0));
      blocks.remove(0);
      pageObject.addBlocks(blocks);
    }

    // Fall back to a block title when the page title is null or blank.
    // NOTE(review): the first extracted block was removed from the list above before
    // addBlocks, so getBlocks().get(0) may be the second extracted block rather than the main
    // one — confirm against PageObject whether this is intended.
    String currentTitle = pageObject.getTitle();
    if ((currentTitle == null || currentTitle.trim().isEmpty())
        && !pageObject.getBlocks().isEmpty()) {
      pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));

    if (!images.isEmpty()) {
      pageObject.addImages(images);
    }

    ctx.setPageObject(pageObject);

    return ctx;
  }