/**
 * Selects the cleaner from {@code BASE_CLEANERS} whose base document has the
 * smallest distance to the page document (as reported by
 * {@code SitesDistanceCalculator.distance}), runs that cleaner on the page
 * document and records the chosen cleaner on the context.
 *
 * <p>On ties the first cleaner in iteration order wins.
 *
 * @param ctx extraction context providing the document and receiving the chosen cleaner
 * @return the same {@code Document} instance obtained from {@code ctx.getDoc()},
 *         after it has been passed to the selected cleaner
 * @throws IllegalStateException if {@code BASE_CLEANERS} contains no cleaner
 */
private Document cleanHtml(ExtractorContext ctx) {
    Document originalDoc = ctx.getDoc();

    int minDistance = Integer.MAX_VALUE;
    BaseHtmlCleaner bestSuitableCleaner = null;
    for (BaseHtmlCleaner cleaner : BASE_CLEANERS) {
        // NOTE(review): a new calculator is created per cleaner, as before; it could be
        // hoisted out of the loop if SitesDistanceCalculator is stateless — confirm.
        int curDistance = new SitesDistanceCalculator().distance(cleaner.getBaseDoc(), originalDoc);

        // Always accept the first candidate: with a strict '<' alone, a distance of
        // Integer.MAX_VALUE would never be selected and no cleaner would be chosen.
        if (bestSuitableCleaner == null || curDistance < minDistance) {
            minDistance = curDistance;
            bestSuitableCleaner = cleaner;
        }
    }

    if (bestSuitableCleaner == null) {
        // Fail with a descriptive message instead of a bare NullPointerException below.
        throw new IllegalStateException("No HTML cleaner available: BASE_CLEANERS is empty");
    }

    bestSuitableCleaner.clean(originalDoc);
    ctx.setCleaner(bestSuitableCleaner);
    return originalDoc;
}
/**
 * Extracts the content of the current {@code page} into a {@code PageObject}
 * and returns the populated extraction context.
 *
 * <p>The page is skipped (the reason is logged and {@code null} is returned) when:
 * <ul>
 *   <li>its parse data is not {@code HtmlParseData};</li>
 *   <li>a content-language meta tag is present and, lower-cased, differs from {@code ENGLISH_US};</li>
 *   <li>its URL contains {@code YOUTUBE} and the keywords meta tag is missing
 *       or does not contain {@code SAP};</li>
 *   <li>the cleaned text is shorter than {@code MIN_CONTENT_LENGTH} characters;</li>
 *   <li>{@code LanguageDetector} does not classify the cleaned text as English.</li>
 * </ul>
 *
 * @return the context with its {@code PageObject} set, or {@code null} if the page was skipped
 */
public ExtractorContext extractContent() {
    // Guard the cast: a page whose parse data is not HTML would otherwise
    // fail with a ClassCastException instead of being skipped.
    Object parseData = page.getParseData();
    if (!(parseData instanceof HtmlParseData)) {
        LOG.info("Skipping " + page.getWebURL().getURL() + " not HTML");
        return null;
    }
    HtmlParseData htmlParseData = (HtmlParseData) parseData;

    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());
    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    // skip not english pages
    // Locale.ROOT keeps the comparison independent of the JVM default locale
    // (e.g. the Turkish dotless 'i' would break a plain toLowerCase()).
    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);
    if (language != null && !ENGLISH_US.equals(language.toLowerCase(java.util.Locale.ROOT))) {
        LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
        return null;
    }

    // YouTube pages are only kept when their keywords meta tag mentions SAP.
    if (ctx.getUrl().contains(YOUTUBE)) {
        String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);
        if (keyWords == null) {
            LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
            return null;
        }
        keyWords = keyWords.toLowerCase(java.util.Locale.ROOT);
        if (!keyWords.contains(SAP)) {
            LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
            return null;
        }
    }

    Document clearedHtml = cleanHtml(ctx);

    // Compute the text once for the two checks below; nothing touches the
    // document between them.
    String cleanedText = clearedHtml.text();
    if (cleanedText.length() < MIN_CONTENT_LENGTH) {
        LOG.info("Skipping " + ctx.getUrl() + " less then " + MIN_CONTENT_LENGTH + " characters");
        return null;
    }

    // skip pages with not English content
    if (!LanguageDetector.INST.isEnglish(cleanedText)) {
        LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
        return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);

    // The id is the SHA-1 of the cleaned text. The text is re-read here, after image
    // extraction, exactly as before.
    // NOTE(review): if the image extractor never modifies the document, cleanedText
    // could be reused here — confirm.
    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    // The first extracted block becomes the main block; the remaining ones are added as regular blocks.
    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);
    if (!blocks.isEmpty()) {
        pageObject.setMainBlock(blocks.get(0));
        blocks.remove(0);
        pageObject.addBlocks(blocks);
    }

    // set title from First text block if NULL or EMPTY.
    // NOTE(review): the first extracted block was removed from the list above, so
    // getBlocks().get(0) may actually be the second extracted block (and the fallback
    // is skipped when only one block exists) — verify against PageObject.
    if ((pageObject.getTitle() == null || "".equals(pageObject.getTitle().trim()))
            && !pageObject.getBlocks().isEmpty()) {
        pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));
    if (!images.isEmpty()) {
        pageObject.addImages(images);
    }

    ctx.setPageObject(pageObject);
    return ctx;
}