/**
   * Collects candidate images for the page described by {@code ctx}.
   *
   * <p>Every {@code <img>} element of the page HTML is considered in document order, followed by
   * the URL stored under the {@code "image"} meta tag (if any). An {@code <img>} is skipped when
   * its {@code src} contains {@code "logo"}, when its {@code class} attribute is exactly {@code
   * "thumb"}, or when its {@code src} does not match {@code IMG_PATTERN}. The remaining candidates
   * are downloaded and kept only if their pixel area lies within {@code [MIN_AREA, MAX_AREA]}.
   *
   * <p>Priorities are handed out starting at {@code MAX_PRIORITY} and decreasing by one for each
   * accepted image. Image with the highest priority will be last in NavigableSet.
   *
   * <p>A failure while processing one image (bad URL, download error, ...) is logged and does not
   * prevent the remaining images from being processed.
   *
   * @param ctx extraction context supplying the page URL, its original HTML and its meta tags
   * @return the accepted images; never {@code null}, possibly empty
   */
  public NavigableSet<ExtractedImage> extract(ExtractorContext ctx) {

    final NavigableSet<ExtractedImage> images = new TreeSet<>();
    final String baseUrl = ctx.getUrl();

    int curImgPriority = MAX_PRIORITY;

    try {
      Document doc = Jsoup.parse(ctx.getOriginalHtml());

      for (Element singleImgLink : doc.select("img")) {

        String srcImgPath = singleImgLink.attr("src");

        if (srcImgPath.contains("logo")) {
          continue;
        }

        // Constant-first equals keeps this null-safe without a separate null check.
        if ("thumb".equals(singleImgLink.attr("class"))) {
          continue;
        }

        if (!IMG_PATTERN.matcher(srcImgPath).matches()) {
          continue;
        }

        // Only accepted images consume a priority slot.
        if (addImageIfSuitable(images, srcImgPath, baseUrl, curImgPriority)) {
          --curImgPriority;
        }
      }

      // check META tag
      String imgUrlFromTag = ctx.getMetaTags().get("image");
      if (imgUrlFromTag != null) {
        addImageIfSuitable(images, imgUrlFromTag, baseUrl, curImgPriority);
      }
    } catch (Exception ex) {
      // Page-level failure (e.g. HTML could not be parsed); per-image failures are handled in
      // addImageIfSuitable and never reach this point.
      LOG.error(
          "Can't extract images from page '"
              + baseUrl
              + "', exception message: "
              + ex.getMessage(),
          ex);
    }

    return images;
  }

  /**
   * Resolves {@code rawUrl} against {@code baseUrl}, downloads the image and adds it to {@code
   * images} with the given priority when its area lies within {@code [MIN_AREA, MAX_AREA]}.
   *
   * <p>Any exception is logged and swallowed so that one broken image cannot abort the extraction
   * of the others.
   *
   * @return {@code true} if the image passed the area filter and was offered to {@code images}
   */
  private boolean addImageIfSuitable(
      NavigableSet<ExtractedImage> images, String rawUrl, String baseUrl, int priority) {

    // Start with the raw value so the log still names something if canonicalization throws.
    String imgUrlStr = rawUrl;

    try {
      imgUrlStr = URLCanonicalizer.getCanonicalURL(rawUrl, baseUrl);

      URL imageUrl = new URL(imgUrlStr);
      PlanarImage planarImage = createPlanarImage(imgUrlStr);

      // Widen before multiplying: an int product can overflow for very large images and wrap
      // around into the accepted range.
      long curArea = (long) planarImage.getWidth() * planarImage.getHeight();

      if (curArea < MIN_AREA || curArea > MAX_AREA) {
        return false;
      }

      images.add(createImageObject(imageUrl, planarImage, priority));
      return true;
    } catch (Exception ex) {
      LOG.error(
          "Can't download image from url '"
              + imgUrlStr
              + "', exception message: "
              + ex.getMessage(),
          ex);
      return false;
    }
  }
  // Example #2
  // 0
  /**
   * Runs {@code URLCanonicalizer.getCanonicalURL} over a fixed table of inputs and asserts that
   * each result equals the expected canonical form.
   *
   * <p>Rows whose third column is {@code null} exercise the single-argument overload; rows with a
   * third column exercise the two-argument overload, passing that column as the second argument.
   * Rows are checked in table order.
   */
  public static void testCanonizalier() {

    // Each row: { expected result, input URL, second argument or null }.
    final String[][] cases = {
      {
        "http://www.example.com/display?category=foo/bar+baz",
        "http://www.example.com/display?category=foo/bar+baz",
        null
      },
      {"http://www.example.com/?q=a+b", "http://www.example.com/?q=a+b", null},
      {
        "http://www.example.com/display?category=foo%2Fbar%2Bbaz",
        "http://www.example.com/display?category=foo%2Fbar%2Bbaz",
        null
      },
      {
        "http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
        "http://somedomain.com/uploads/1/0/2/5/10259653/6199347.jpg?1325154037",
        null
      },
      {"http://hostname.com/", "http://hostname.com", null},
      {"http://hostname.com/", "http://HOSTNAME.com", null},
      {"http://www.example.com/index.html", "http://www.example.com/index.html?&", null},
      {"http://www.example.com/index.html", "http://www.example.com/index.html?", null},
      {"http://www.example.com/", "http://www.example.com", null},
      {"http://www.example.com/bar.html", "http://www.example.com:80/bar.html", null},
      {
        "http://www.example.com/index.html?name=test&rame=base",
        "http://www.example.com/index.html?name=test&rame=base#123",
        null
      },
      {"http://www.example.com/~username/", "http://www.example.com/%7Eusername/", null},
      {
        "http://www.example.com/A/B/index.html",
        "http://www.example.com//A//B/index.html",
        null
      },
      {
        "http://www.example.com/index.html?x=y",
        "http://www.example.com/index.html?&x=y",
        null
      },
      {"http://www.example.com/a.html", "http://www.example.com/../../a.html", null},
      {
        "http://www.example.com/a/c/d.html",
        "http://www.example.com/../a/b/../c/./d.html",
        null
      },
      {"http://foo.bar.com/?baz=1", "http://foo.bar.com?baz=1", null},
      {
        "http://www.example.com/index.html?c=d&e=f&a=b",
        "http://www.example.com/index.html?&c=d&e=f&a=b",
        null
      },
      {
        "http://www.example.com/index.html?q=a b",
        "http://www.example.com/index.html?q=a b",
        null
      },
      {
        "http://www.example.com/search?width=100%&height=100%",
        "http://www.example.com/search?width=100%&height=100%",
        null
      },
      {"http://foo.bar/mydir/myfile?page=2", "?page=2", "http://foo.bar/mydir/myfile"},
      {
        "http://www.lampsplus.com/products/s_%20/",
        "http://www.lampsplus.com/products/s_ /",
        null
      },
      {
        "http://www.vitacost.com/productResults.aspx?N=1300986+2009046",
        "http://www.vitacost.com/productResults.aspx?N=1300986+2009046",
        null
      },
      {
        "http://www.pier1.com/Bright-Chenille-Striped-Rug/2527614,default,pd.html",
        "/Bright-Chenille-Striped-Rug/2527614,default,pd.html",
        "http://www.pier1.com/Bright-Chenille-Striped-Rug/2527614,default,pd.html"
      },
    };

    for (String[] row : cases) {
      final String expected = row[0];
      final String input = row[1];
      final String context = row[2];

      // Pick the overload by row shape rather than passing null to the two-argument form.
      final String actual;
      if (context == null) {
        actual = URLCanonicalizer.getCanonicalURL(input);
      } else {
        actual = URLCanonicalizer.getCanonicalURL(input, context);
      }

      assertEquals(expected, actual);
    }
  }