/**
 * Resolves a raw category label to its normalized categories via {@code mapping}.
 *
 * <p>The mapped value is a comma-separated list that is split into an array. When the label has
 * no mapping, a warning is logged and the single fallback category {@code "misc"} is returned.
 *
 * @param webPageEntity page being parsed; only its URL is used, for the warning message
 * @param category raw category label to look up
 * @return the normalized categories, or {@code {"misc"}} when the label is unknown
 */
private String[] getNormalizedCategories(WebPageEntity webPageEntity, String category) {
  // Single lookup instead of containsKey + get; a key mapped to null is treated as unknown.
  String mapped = mapping.get(category);
  if (mapped != null) {
    return mapped.split(",");
  }
  // Report the label that was actually looked up, not the page's own category.
  LOGGER.warn("Unknown category: {} url {}", category, webPageEntity.getUrl());
  return new String[] {"misc"};
}
/**
 * Converts a download result into the {@code "productPage"} entities to process next.
 *
 * <p>Every {@code .InfoArea a[title]} link yields one entity pointing at the link's absolute
 * href. When the document contains no such link, the source page itself is emitted as the only
 * product page. A missing document yields an empty observable.
 *
 * @param downloadResult downloaded page together with the page it was requested for
 * @return the discovered product pages
 */
private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
  Set<WebPageEntity> productPages = new HashSet<>(1);
  Document page = downloadResult.getDocument();
  if (page == null) {
    return Observable.from(productPages);
  }

  Elements links = page.select(".InfoArea a[title]");
  if (links.isEmpty()) {
    // No product links found: fall back to the source page's own URL.
    WebPageEntity self =
        new WebPageEntity(
            downloadResult.getSourcePage(),
            "",
            "productPage",
            downloadResult.getSourcePage().getUrl(),
            downloadResult.getSourcePage().getCategory());
    LOGGER.info("productPageUrl={}", self.getUrl());
    productPages.add(self);
    return Observable.from(productPages);
  }

  for (Element link : links) {
    WebPageEntity linked =
        new WebPageEntity(
            downloadResult.getSourcePage(),
            "",
            "productPage",
            link.attr("abs:href"),
            downloadResult.getSourcePage().getCategory());
    LOGGER.info("productPageUrl={}", linked.getUrl());
    productPages.add(linked);
  }
  return Observable.from(productPages);
}
/**
 * Resolves the page's own category to its normalized categories via {@code mapping}.
 *
 * @param webPageEntity page whose category is looked up
 * @return the comma-separated mapping split into an array, or {@code {"misc"}} (after logging a
 *     warning) when the category has no mapping
 */
private String[] getNormalizedCategories(WebPageEntity webPageEntity) {
  String mapped = mapping.get(webPageEntity.getCategory());
  if (mapped == null) {
    LOGGER.warn(
        "Unknown category: {} url {}", webPageEntity.getCategory(), webPageEntity.getUrl());
    return new String[] {"misc"};
  }
  return mapped.split(",");
}
/**
 * Builds a {@link ProductEntity} from the stored HTML content of a product page.
 *
 * <p>Pages containing an {@code .outofstock} element produce no product. Any exception raised
 * while parsing is logged and results in an empty observable. {@code parseResultCounter} is
 * incremented for each emitted product.
 *
 * @param webPageEntity page whose content and URL are parsed
 * @return an observable emitting at most one product
 */
@Override
public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
  HashSet<ProductEntity> parsed = new HashSet<>();
  try {
    Map<String, String> attributes = new HashMap<>();
    String pageUrl = webPageEntity.getUrl();
    Document page = Jsoup.parse(webPageEntity.getContent(), webPageEntity.getUrl());

    String name = page.select(".naitem").text();
    LOGGER.info("Parsing {}, page={}", name, webPageEntity.getUrl());
    if (!page.select(".outofstock").isEmpty()) {
      return Observable.empty();
    }

    String imageUrl = page.select(".itemImgDiv img.itemDetailImg").attr("abs:src");

    // NOTE(review): this replaces the literal text "\xEF\xBF\xBD" (backslashes included), not
    // the U+FFFD character -- confirm that this is the intended target.
    String rawPrice = page.select(".itemDetailPrice").text().replace("\\xEF\\xBF\\xBD", " ");
    Matcher priceMatcher = pricePattern.matcher(rawPrice);
    String regularPrice = priceMatcher.find() ? priceMatcher.group().replace(",", "") : null;
    // This parser never extracts a special price.
    String specialPrice = null;

    String descriptionText = page.select(".itemDescription").text();
    String[] categories = getNormalizedCategories(webPageEntity);

    parsed.add(
        new ProductEntity(
            name,
            pageUrl,
            regularPrice,
            specialPrice,
            imageUrl,
            descriptionText,
            attributes,
            categories));
  } catch (Exception e) {
    LOGGER.error("Failed to parse: {}", webPageEntity, e);
  }
  return Observable.from(parsed).doOnNext(product -> parseResultCounter.inc());
}
/**
 * Downloads the parent page and emits the pages discovered in it.
 *
 * <p>The URL is fetched through {@code client}; each download result is expanded by {@code
 * parseDocument}, and {@code parseResultCounter} is incremented once per emitted page.
 *
 * @param parent page whose URL is downloaded
 * @return the pages extracted from the downloaded document
 */
@Override
public Observable<WebPageEntity> parse(WebPageEntity parent) {
  return client
      .get(parent.getUrl(), new DocumentCompletionHandler(parent))
      .flatMap(downloadResult -> parseDocument(downloadResult))
      .doOnNext(page -> parseResultCounter.inc());
}
/**
 * Downloads a product page through {@code PageDownloader}, passing the literal {@code
 * "productPageRaw"}.
 *
 * <p>Null emissions are dropped, and {@code parseResultCounter} is incremented for every page
 * that passes the filter.
 *
 * @param webPage product page to download
 * @return the downloaded pages
 */
@Override
public Observable<WebPageEntity> parse(WebPageEntity webPage) {
  LOGGER.trace("Processing productPage {}", webPage.getUrl());
  Observable<WebPageEntity> downloaded =
      PageDownloader.download(client, webPage, "productPageRaw");
  return downloaded
      .filter(page -> page != null)
      .doOnNext(page -> parseResultCounter.inc());
}
/**
 * Builds a {@link ProductEntity} from the stored HTML content of a product page.
 *
 * <p>The categories are the page's own category string split on commas. Any exception raised
 * while parsing is logged and results in an empty observable. {@code parseResultCounter} is
 * incremented for each emitted product.
 *
 * @param webPageEntity page whose content, URL and category are used
 * @return an observable emitting at most one product
 */
@Override
public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
  HashSet<ProductEntity> parsed = new HashSet<>();
  try {
    Map<String, String> attributes = new HashMap<>();
    String pageUrl = webPageEntity.getUrl();
    Document page = Jsoup.parse(webPageEntity.getContent(), webPageEntity.getUrl());

    String name = page.select(".product_title").text();
    LOGGER.info("Parsing {}, page={}", name, webPageEntity.getUrl());

    String imageUrl = page.select(".product .wp-post-image").attr("abs:src");
    String regularPrice = page.select("meta[itemprop=price]").attr("content");
    // This parser never extracts a special price.
    String specialPrice = null;
    // Strip the tab heading that precedes the actual description text.
    String descriptionText =
        page.select("#tab-description").text().replace("Product Description", "");
    String[] categories = webPageEntity.getCategory().split(",");

    parsed.add(
        new ProductEntity(
            name,
            pageUrl,
            regularPrice,
            specialPrice,
            imageUrl,
            descriptionText,
            attributes,
            categories));
  } catch (Exception e) {
    LOGGER.error("Failed to parse: {}", webPageEntity, e);
  }
  return Observable.from(parsed).doOnNext(product -> parseResultCounter.inc());
}
/**
 * Converts a download result into {@code "productList"} entities, one per category link.
 *
 * <p>Each {@code .SideCategoryListFlyout a} link yields an entity built from the link's absolute
 * href and its text (presumably used as the category -- confirm against the {@code
 * WebPageEntity} constructor). A missing document yields an empty observable.
 *
 * @param downloadResult downloaded page together with the page it was requested for
 * @return the discovered listing pages
 */
private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
  Set<WebPageEntity> listings = new HashSet<>(1);
  Document page = downloadResult.getDocument();
  if (page == null) {
    return Observable.from(listings);
  }

  Elements categoryLinks = page.select(".SideCategoryListFlyout a");
  for (Element link : categoryLinks) {
    WebPageEntity listing =
        new WebPageEntity(
            downloadResult.getSourcePage(),
            "",
            "productList",
            link.attr("abs:href"),
            link.text());
    LOGGER.info("Product page listing={}", listing.getUrl());
    listings.add(listing);
  }
  return Observable.from(listings);
}
/** * @param price * @return */ private static String parsePrice(WebPageEntity webPageEntity, String price) { Matcher matcher = pricePattern.matcher(price); if (matcher.find()) { try { return matcher.group(1).replace(",", ""); // return // NumberFormat.getInstance(Locale.US).parse(matcher.group(1)).toString(); } catch (Exception ignored) { return Double.valueOf(matcher.group(1)).toString(); } } else { LOGGER.error("failed to parse price {}, page {}", price, webPageEntity.getUrl()); return price; } }
/**
 * Builds a {@link ProductEntity} from the stored HTML content of a product page.
 *
 * <p>No product is produced when the page contains a {@code .saleImage} element or when the
 * product name is empty. Rows of {@code table.productTbl} are read as label/value pairs: a label
 * containing "department" supplies the category (through {@code getNormalizedCategories}), every
 * other pair becomes a product attribute keyed by the camel-cased label. Any exception raised
 * while parsing is logged and results in an empty observable. {@code parseResultCounter} is
 * incremented for each emitted product.
 *
 * @param webPageEntity page whose content and URL are parsed
 * @return an observable emitting at most one product
 */
@Override
public Observable<ProductEntity> parse(WebPageEntity webPageEntity) {
  HashSet<ProductEntity> result = new HashSet<>();
  try {
    Map<String, String> attr = new HashMap<>();
    String url = webPageEntity.getUrl();
    Document document = Jsoup.parse(webPageEntity.getContent(), webPageEntity.getUrl());
    if (!document.select(".saleImage").isEmpty()) {
      return Observable.empty();
    }
    String productName = document.select("div.innercontentDiv > div > div > h2").text();
    if (productName.isEmpty()) {
      return Observable.empty();
    }
    LOGGER.info("Parsing {}, page={}", productName, webPageEntity.getUrl());

    String manufacturer =
        document.select(".product-details__title .product__manufacturer").text();
    if (!manufacturer.isEmpty()) {
      attr.put("manufacturer", manufacturer);
    }

    // NOTE(review): the primary selector reads "src" from an <a> element; the carousel image
    // below is the fallback when that yields nothing -- confirm the primary selector is right.
    String productImage = document.select("div.imgLiquidNoFill a").attr("abs:src");
    if (productImage.isEmpty()) {
      productImage = document.select(".es-carousel img").attr("abs:src");
    }

    String regularPrice =
        parsePrice(
            webPageEntity,
            document.select("#desPrice > li:nth-child(1) > span.pricetag.show").text());
    // The special price stays an empty string when the page does not list one.
    String specialPrice =
        document.select("#desPrice > li:nth-child(2) > span.pricetag.show").text();
    if (!specialPrice.isEmpty()) {
      specialPrice = parsePrice(webPageEntity, specialPrice);
    }

    String description = document.select("#TabbedPanels1 > div > div:nth-child(1)").text();

    // Pair label and value per row. Reading them from two independent iterators threw
    // NoSuchElementException (dropping the whole product) when a row had no second cell, and
    // shifted values onto the wrong labels when a row's first cell was not a <td>.
    String[] category = null;
    for (Element row : document.select("table.productTbl > tbody > tr")) {
      if (row.children().size() < 2) {
        continue;
      }
      Element labelCell = row.child(0);
      Element valueCell = row.child(1);
      if (!"td".equalsIgnoreCase(labelCell.tagName())
          || !"td".equalsIgnoreCase(valueCell.tagName())) {
        continue;
      }
      String specName =
          CaseFormat.LOWER_UNDERSCORE.to(
              CaseFormat.LOWER_CAMEL,
              labelCell.text().replace(' ', '_').replace(":", "").trim());
      String specValue = valueCell.text();
      if (specName.contains("department")) {
        category = getNormalizedCategories(webPageEntity, specValue);
      } else {
        attr.put(specName, specValue);
      }
    }
    if (category == null) {
      // The product is still emitted, with a null category.
      LOGGER.warn("Category not found");
    }

    ProductEntity product =
        new ProductEntity(
            productName,
            url,
            regularPrice,
            specialPrice,
            productImage,
            description,
            attr,
            category);
    result.add(product);
  } catch (Exception e) {
    LOGGER.error("Failed to parse: {}", webPageEntity, e);
  }
  return Observable.from(result).doOnNext(e -> parseResultCounter.inc());
}
/**
 * Downloads the given page and emits the pages discovered in it.
 *
 * <p>The URL is fetched through {@code client}; each download result is expanded by {@code
 * parseDocument}, and {@code parseResultCounter} is incremented once per emitted page.
 *
 * @param webPageEntity page whose URL is downloaded
 * @return the pages extracted from the downloaded document
 */
@Override
public Observable<WebPageEntity> parse(WebPageEntity webPageEntity) {
  return client
      .get(webPageEntity.getUrl(), new DocumentCompletionHandler(webPageEntity))
      .flatMap(this::parseDocument)
      .doOnNext(page -> this.parseResultCounter.inc());
}