/** Image with the highest priority will be last in NavigableSet. */ public NavigableSet<ExtractedImage> extract(ExtractorContext ctx) { final NavigableSet<ExtractedImage> images = new TreeSet<>(); int curImgPriority = MAX_PRIORITY; String baseUrl = ctx.getUrl(); String imgUrlStr = null; try { Document doc = Jsoup.parse(ctx.getOriginalHtml()); for (Element singleImgLink : doc.select("img")) { String srcImgPath = singleImgLink.attr("src"); if (srcImgPath.contains("logo")) { continue; } String classAtrValue = singleImgLink.attr("class"); if (classAtrValue != null && classAtrValue.equals("thumb")) { continue; } if (!IMG_PATTERN.matcher(srcImgPath).matches()) { continue; } imgUrlStr = URLCanonicalizer.getCanonicalURL(srcImgPath, baseUrl); URL imageUrl = new URL(imgUrlStr); PlanarImage planarImage = createPlanarImage(imgUrlStr); int curArea = planarImage.getWidth() * planarImage.getHeight(); if (curArea >= MIN_AREA && curArea <= MAX_AREA) { images.add(createImageObject(imageUrl, planarImage, curImgPriority)); --curImgPriority; } } // check META tag String imgUrlFromTag = ctx.getMetaTags().get("image"); if (imgUrlFromTag != null) { imgUrlStr = URLCanonicalizer.getCanonicalURL(imgUrlFromTag, ctx.getUrl()); PlanarImage planarImage = createPlanarImage(URLCanonicalizer.getCanonicalURL(imgUrlFromTag, ctx.getUrl())); int curArea = planarImage.getWidth() * planarImage.getHeight(); if (curArea >= MIN_AREA && curArea <= MAX_AREA) { images.add(createImageObject(new URL(imgUrlStr), planarImage, curImgPriority)); --curImgPriority; } } } catch (Exception ex) { LOG.error( "Can't download image from url '" + imgUrlStr + "', exception message: " + ex.getMessage(), ex); } return images; }