/** Image with the highest priority will be last in NavigableSet. */ public NavigableSet<ExtractedImage> extract(ExtractorContext ctx) { final NavigableSet<ExtractedImage> images = new TreeSet<>(); int curImgPriority = MAX_PRIORITY; String baseUrl = ctx.getUrl(); String imgUrlStr = null; try { Document doc = Jsoup.parse(ctx.getOriginalHtml()); for (Element singleImgLink : doc.select("img")) { String srcImgPath = singleImgLink.attr("src"); if (srcImgPath.contains("logo")) { continue; } String classAtrValue = singleImgLink.attr("class"); if (classAtrValue != null && classAtrValue.equals("thumb")) { continue; } if (!IMG_PATTERN.matcher(srcImgPath).matches()) { continue; } imgUrlStr = URLCanonicalizer.getCanonicalURL(srcImgPath, baseUrl); URL imageUrl = new URL(imgUrlStr); PlanarImage planarImage = createPlanarImage(imgUrlStr); int curArea = planarImage.getWidth() * planarImage.getHeight(); if (curArea >= MIN_AREA && curArea <= MAX_AREA) { images.add(createImageObject(imageUrl, planarImage, curImgPriority)); --curImgPriority; } } // check META tag String imgUrlFromTag = ctx.getMetaTags().get("image"); if (imgUrlFromTag != null) { imgUrlStr = URLCanonicalizer.getCanonicalURL(imgUrlFromTag, ctx.getUrl()); PlanarImage planarImage = createPlanarImage(URLCanonicalizer.getCanonicalURL(imgUrlFromTag, ctx.getUrl())); int curArea = planarImage.getWidth() * planarImage.getHeight(); if (curArea >= MIN_AREA && curArea <= MAX_AREA) { images.add(createImageObject(new URL(imgUrlStr), planarImage, curImgPriority)); --curImgPriority; } } } catch (Exception ex) { LOG.error( "Can't download image from url '" + imgUrlStr + "', exception message: " + ex.getMessage(), ex); } return images; }
/**
 * Checks {@code URLCanonicalizer.getCanonicalURL} against a table of expected results.
 *
 * <p>Each row is {@code {expected, input, context}}. When {@code context} is {@code null} the
 * single-argument overload is called with {@code input}; otherwise the two-argument overload is
 * called with {@code (input, context)}. Rows are checked in order and the first mismatch fails
 * the test.
 */
public static void testCanonizalier() {
  final String[][] cases = {
    {
      "http://www.example.com/display?category=foo/bar+baz",
      "http://www.example.com/display?category=foo/bar+baz",
      null
    },
    {"http://www.example.com/?q=a+b", "http://www.example.com/?q=a+b", null},
    {
      "http://www.example.com/display?category=foo%2Fbar%2Bbaz",
      "http://www.example.com/display?category=foo%2Fbar%2Bbaz",
      null
    },
    {
      "http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
      "http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
      null
    },
    {"http://hostname.com/", "http://hostname.com", null},
    {"http://hostname.com/", "http://HOSTNAME.com", null},
    {"http://www.example.com/index.html", "http://www.example.com/index.html?&", null},
    {"http://www.example.com/index.html", "http://www.example.com/index.html?", null},
    {"http://www.example.com/", "http://www.example.com", null},
    {"http://www.example.com/bar.html", "http://www.example.com:80/bar.html", null},
    {
      "http://www.example.com/index.html?name=test&rame=base",
      "http://www.example.com/index.html?name=test&rame=base#123",
      null
    },
    {"http://www.example.com/~username/", "http://www.example.com/%7Eusername/", null},
    {"http://www.example.com/A/B/index.html", "http://www.example.com//A//B/index.html", null},
    {"http://www.example.com/index.html?x=y", "http://www.example.com/index.html?&x=y", null},
    {"http://www.example.com/a.html", "http://www.example.com/../../a.html", null},
    {
      "http://www.example.com/a/c/d.html",
      "http://www.example.com/../a/b/../c/./d.html",
      null
    },
    {"http://foo.bar.com/?baz=1", "http://foo.bar.com?baz=1", null},
    {
      "http://www.example.com/index.html?c=d&e=f&a=b",
      "http://www.example.com/index.html?&c=d&e=f&a=b",
      null
    },
    {
      "http://www.example.com/index.html?q=a b",
      "http://www.example.com/index.html?q=a b",
      null
    },
    {
      "http://www.example.com/search?width=100%&height=100%",
      "http://www.example.com/search?width=100%&height=100%",
      null
    },
    // Relative reference resolved against a context URL.
    {"http://foo.bar/mydir/myfile?page=2", "?page=2", "http://foo.bar/mydir/myfile"},
    {
      "http://www.lampsplus.com/products/s_%20/",
      "http://www.lampsplus.com/products/s_ /",
      null
    },
    {
      "http://www.vitacost.com/productResults.aspx?N=1300986+2009046",
      "http://www.vitacost.com/productResults.aspx?N=1300986+2009046",
      null
    },
    // Absolute path resolved against a context URL.
    {
      "http://www.pier1.com/Bright-Chenille-Striped-Rug/2527614,default,pd.html",
      "/Bright-Chenille-Striped-Rug/2527614,default,pd.html",
      "http://www.pier1.com/Bright-Chenille-Striped-Rug/2527614,default,pd.html"
    },
  };

  for (String[] row : cases) {
    final String expected = row[0];
    final String input = row[1];
    final String context = row[2];
    final String actual;
    if (context == null) {
      actual = URLCanonicalizer.getCanonicalURL(input);
    } else {
      actual = URLCanonicalizer.getCanonicalURL(input, context);
    }
    assertEquals(expected, actual);
  }
}