/**
  * Resolves a raw category label to its normalized category names.
  *
  * <p>The label is looked up in {@code mapping}; a hit is split on commas. When the label has no
  * mapping, a warning is logged and the single fallback category {@code "misc"} is returned.
  *
  * @param webPageEntity page being parsed; only its URL is used, in the warning
  * @param category raw category label to look up
  * @return the mapped categories, or an array holding only {@code "misc"} for an unknown label
  */
 private String[] getNormalizedCategories(WebPageEntity webPageEntity, String category) {
   // One lookup instead of containsKey + get; a key mapped to null counts as unknown
   // rather than being dereferenced.
   String mapped = mapping.get(category);
   if (mapped != null) {
     return mapped.split(",");
   }
   // Report the label that actually failed the lookup, not the page's own category.
   LOGGER.warn("Unknown category: {} url {}", category, webPageEntity.getUrl());
   return new String[] {"misc"};
 }
  /**
   * Converts a download result into {@code "productPage"} entities.
   *
   * <p>Each {@code .InfoArea a[title]} link in the document yields one entity built from the
   * link's absolute {@code href} and the source page's category. When the document has no such
   * link, a single entity is built from the source page's own URL instead. A result without a
   * document yields an empty observable.
   *
   * @param downloadResult download outcome providing the document and the source page
   * @return an observable over the created entities
   */
  private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
    Set<WebPageEntity> productPages = new HashSet<>(1);

    Document document = downloadResult.getDocument();
    if (document == null) {
      return Observable.from(productPages);
    }

    WebPageEntity sourcePage = downloadResult.getSourcePage();
    Elements links = document.select(".InfoArea a[title]");

    if (links.isEmpty()) {
      // No product links found: fall back to the source page's own URL.
      WebPageEntity selfPage =
          new WebPageEntity(
              sourcePage, "", "productPage", sourcePage.getUrl(), sourcePage.getCategory());
      LOGGER.info("productPageUrl={}", selfPage.getUrl());
      productPages.add(selfPage);
      return Observable.from(productPages);
    }

    for (Element link : links) {
      WebPageEntity linkedPage =
          new WebPageEntity(
              sourcePage, "", "productPage", link.attr("abs:href"), sourcePage.getCategory());
      LOGGER.info("productPageUrl={}", linkedPage.getUrl());
      productPages.add(linkedPage);
    }
    return Observable.from(productPages);
  }
 /**
  * Resolves the page's own category to its normalized category names.
  *
  * <p>The page category is looked up in {@code mapping}; a hit is split on commas. Without a
  * mapping, a warning is logged and the single fallback category {@code "misc"} is returned.
  *
  * @param webPageEntity page whose category is resolved and whose URL is logged on a miss
  * @return the mapped categories, or an array holding only {@code "misc"}
  */
 private String[] getNormalizedCategories(WebPageEntity webPageEntity) {
   String mapped = mapping.get(webPageEntity.getCategory());
   if (mapped == null) {
     LOGGER.warn(
         "Unknown category: {} url {}", webPageEntity.getCategory(), webPageEntity.getUrl());
     return new String[] {"misc"};
   }
   return mapped.split(",");
 }
  /**
   * Parses a downloaded product page into a {@link ProductEntity}.
   *
   * <p>A page containing an {@code .outofstock} element yields an empty observable. Any exception
   * raised while parsing is logged and likewise results in an empty observable; nothing is
   * propagated to the caller. Each emitted product increments {@code parseResultCounter}.
   *
   * @param webPageEntity page whose content, URL and category are read
   * @return an observable emitting at most one product
   */
  @Override
  public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
    HashSet<ProductEntity> result = new HashSet<>();
    try {
      String url = webPageEntity.getUrl();
      Document document = Jsoup.parse(webPageEntity.getContent(), url);

      String productName = document.select(".naitem").text();
      LOGGER.info("Parsing {}, page={}", productName, url);

      if (!document.select(".outofstock").isEmpty()) {
        return Observable.empty();
      }

      String productImage = document.select(".itemImgDiv img.itemDetailImg").attr("abs:src");

      // The original literal "\\xEF\\xBF\\xBD" only matches the backslash-escaped text, never
      // the Unicode replacement character U+FFFD (UTF-8 bytes EF BF BD) that a mis-decoded
      // symbol turns into. Keep the old replacement and also strip the real character.
      String priceText =
          document
              .select(".itemDetailPrice")
              .text()
              .replace("\\xEF\\xBF\\xBD", " ")
              .replace('\uFFFD', ' ');

      // Stays null when the price text does not match pricePattern.
      String regularPrice = null;
      Matcher matcher = pricePattern.matcher(priceText);
      if (matcher.find()) {
        regularPrice = matcher.group().replace(",", "");
      }

      // Never populated by this parser; passed through as null.
      String specialPrice = null;
      String description = document.select(".itemDescription").text();
      String[] category = getNormalizedCategories(webPageEntity);
      Map<String, String> attr = new HashMap<>();

      ProductEntity product =
          new ProductEntity(
              productName,
              url,
              regularPrice,
              specialPrice,
              productImage,
              description,
              attr,
              category);
      result.add(product);
    } catch (Exception e) {
      // Best effort: a page that cannot be parsed is logged and skipped.
      LOGGER.error("Failed to parse: {}", webPageEntity, e);
    }
    return Observable.from(result).doOnNext(e -> parseResultCounter.inc());
  }
Esempio n. 5
0
 /**
  * Fetches the parent page's URL through {@code client} and emits the pages that
  * {@code parseDocument} extracts from the download result.
  *
  * <p>Each emitted page increments {@code parseResultCounter}.
  *
  * @param parent page whose URL is fetched; also handed to the completion handler
  * @return an observable over the extracted pages
  */
 @Override
 public Observable<WebPageEntity> parse(WebPageEntity parent) {
   String url = parent.getUrl();
   DocumentCompletionHandler handler = new DocumentCompletionHandler(parent);
   return client
       .get(url, handler)
       .flatMap(downloadResult -> parseDocument(downloadResult))
       .doOnNext(page -> parseResultCounter.inc());
 }
 /**
  * Downloads the given page via {@code PageDownloader} using the {@code "productPageRaw"} tag.
  *
  * <p>Null results are dropped; every remaining emission increments
  * {@code parseResultCounter}.
  *
  * @param webPage page to download
  * @return an observable over the non-null download results
  */
 @Override
 public Observable<WebPageEntity> parse(WebPageEntity webPage) {
   LOGGER.trace("Processing productPage {}", webPage.getUrl());
   Observable<WebPageEntity> downloaded =
       PageDownloader.download(client, webPage, "productPageRaw");
   return downloaded
       .filter(page -> page != null)
       .doOnNext(page -> parseResultCounter.inc());
 }
  /**
   * Parses a downloaded product page into a {@link ProductEntity}.
   *
   * <p>Name, image, price and description are read from the page markup; the categories are the
   * page's own category string split on commas. Any exception raised while parsing is logged
   * and results in an empty observable. Each emitted product increments
   * {@code parseResultCounter}.
   *
   * @param webPageEntity page whose content, URL and category are read
   * @return an observable emitting at most one product
   */
  @Override
  public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
    HashSet<ProductEntity> parsed = new HashSet<>();

    try {
      String pageUrl = webPageEntity.getUrl();
      Document page = Jsoup.parse(webPageEntity.getContent(), webPageEntity.getUrl());

      String title = page.select(".product_title").text();
      LOGGER.info("Parsing {}, page={}", title, webPageEntity.getUrl());

      String imageUrl = page.select(".product .wp-post-image").attr("abs:src");
      String price = page.select("meta[itemprop=price]").attr("content");
      String details =
          page.select("#tab-description").text().replace("Product Description", "");
      String[] categories = webPageEntity.getCategory().split(",");

      // Neither a special price nor extra attributes are extracted by this parser.
      String salePrice = null;
      Map<String, String> attributes = new HashMap<>();

      parsed.add(
          new ProductEntity(
              title, pageUrl, price, salePrice, imageUrl, details, attributes, categories));
    } catch (Exception failure) {
      LOGGER.error("Failed to parse: {}", webPageEntity, failure);
    }
    return Observable.from(parsed).doOnNext(product -> parseResultCounter.inc());
  }
Esempio n. 8
0
  /**
   * Converts a download result into {@code "productList"} entities.
   *
   * <p>Each {@code .SideCategoryListFlyout a} link in the document yields one entity built from
   * the link's absolute {@code href} and its text. A result without a document yields an empty
   * observable.
   *
   * @param downloadResult download outcome providing the document and the source page
   * @return an observable over the created entities
   */
  private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
    Set<WebPageEntity> listings = new HashSet<>(1);

    Document document = downloadResult.getDocument();
    if (document == null) {
      return Observable.from(listings);
    }

    for (Element link : document.select(".SideCategoryListFlyout a")) {
      WebPageEntity sourcePage = downloadResult.getSourcePage();
      String listingUrl = link.attr("abs:href");
      String linkText = link.text();

      WebPageEntity listing =
          new WebPageEntity(sourcePage, "", "productList", listingUrl, linkText);
      LOGGER.info("Product page listing={}", listing.getUrl());
      listings.add(listing);
    }
    return Observable.from(listings);
  }
 /**
  * Extracts a price string from raw price text.
  *
  * <p>The first capture group of the first {@code pricePattern} match is returned with all
  * commas removed. When the pattern does not match, or the group did not take part in the
  * match, an error is logged and the raw text is returned unchanged.
  *
  * @param webPageEntity page the price came from; only its URL is used, for logging
  * @param price raw price text
  * @return the extracted price without commas, or {@code price} itself when extraction fails
  */
 private static String parsePrice(WebPageEntity webPageEntity, String price) {
   Matcher matcher = pricePattern.matcher(price);
   if (matcher.find()) {
     // group(1) is null when the group did not participate in the match. The previous
     // catch block re-ran the same group(1) call, so it could never produce a value.
     String amount = matcher.group(1);
     if (amount != null) {
       return amount.replace(",", "");
     }
   }
   LOGGER.error("failed to parse price {}, page {}", price, webPageEntity.getUrl());
   return price;
 }
  /**
   * Parses a downloaded product page into a {@link ProductEntity}.
   *
   * <p>No product is emitted when the page contains a {@code .saleImage} element, when the
   * product name is empty, or when parsing throws (the exception is logged, not propagated).
   * Rows of {@code table.productTbl} become attributes keyed by the lower-camel-cased label,
   * except a row whose key contains {@code "department"}, which is resolved to the product's
   * categories instead. Each emitted product increments {@code parseResultCounter}.
   *
   * @param webPageEntity page whose content and URL are read
   * @return an observable emitting at most one product
   */
  @Override
  public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
    HashSet<ProductEntity> result = new HashSet<>();

    try {
      String url = webPageEntity.getUrl();

      Document document = Jsoup.parse(webPageEntity.getContent(), url);
      if (!document.select(".saleImage").isEmpty()) {
        return Observable.empty();
      }

      String productName = document.select("div.innercontentDiv > div > div > h2").text();
      if (productName.isEmpty()) {
        return Observable.empty();
      }

      LOGGER.info("Parsing {}, page={}", productName, url);

      Map<String, String> attr = new HashMap<>();
      String manufacturer =
          document.select(".product-details__title .product__manufacturer").text();
      if (!manufacturer.isEmpty()) {
        attr.put("manufacturer", manufacturer);
      }

      // NOTE(review): this reads "src" from an anchor element - confirm the markup carries it;
      // the carousel image below is the fallback when it does not.
      String productImage = document.select("div.imgLiquidNoFill a").attr("abs:src");
      if (productImage.isEmpty()) {
        productImage = document.select(".es-carousel img").attr("abs:src");
      }

      String regularPrice =
          parsePrice(
              webPageEntity,
              document.select("#desPrice > li:nth-child(1) > span.pricetag.show").text());
      // The second price entry is only normalized when it is present.
      String specialPrice =
          document.select("#desPrice > li:nth-child(2) > span.pricetag.show").text();
      if (!specialPrice.isEmpty()) {
        specialPrice = parsePrice(webPageEntity, specialPrice);
      }
      String description = document.select("#TabbedPanels1 > div > div:nth-child(1)").text();

      Iterator<Element> labels =
          document.select("table.productTbl > tbody > tr > td:nth-child(1)").iterator();
      Iterator<Element> values =
          document.select("table.productTbl > tbody > tr > td:nth-child(2)").iterator();

      String[] category = null;
      // Both iterators advance in lockstep. Stop at the shorter one so a label cell without a
      // matching value cell cannot raise NoSuchElementException and discard the whole product.
      while (labels.hasNext() && values.hasNext()) {
        String specName =
            CaseFormat.LOWER_UNDERSCORE.to(
                CaseFormat.LOWER_CAMEL,
                labels.next().text().replace(' ', '_').replace(":", "").trim());
        String specValue = values.next().text();
        if (specName.contains("department")) {
          category = getNormalizedCategories(webPageEntity, specValue);
        } else {
          attr.put(specName, specValue);
        }
      }
      if (category == null) {
        // No department row was found; the product is still emitted with a null category.
        LOGGER.warn("Category not found");
      }

      ProductEntity product =
          new ProductEntity(
              productName,
              url,
              regularPrice,
              specialPrice,
              productImage,
              description,
              attr,
              category);
      result.add(product);
    } catch (Exception e) {
      // Best effort: a page that cannot be parsed is logged and skipped.
      LOGGER.error("Failed to parse: {}", webPageEntity, e);
    }
    return Observable.from(result).doOnNext(e -> parseResultCounter.inc());
  }
Esempio n. 11
0
 /**
  * Fetches the page's URL through {@code client} and emits the pages that
  * {@code parseDocument} extracts from the download result.
  *
  * <p>Each emitted page increments {@code parseResultCounter}.
  *
  * @param webPageEntity page whose URL is fetched; also handed to the completion handler
  * @return an observable over the extracted pages
  */
 @Override
 public Observable<WebPageEntity> parse(WebPageEntity webPageEntity) {
   return client
       .get(webPageEntity.getUrl(), new DocumentCompletionHandler(webPageEntity))
       .flatMap(downloadResult -> parseDocument(downloadResult))
       .doOnNext(page -> parseResultCounter.inc());
 }