public static void startCrawling() { DealerPost.Dao dao = new AutohomeModule().getInstance(DealerPost.Dao.class); // TODO: fix DB Connection lost in a about 30 minutes issue (cause: auto finalize the DBCursor?) List<DealerPost> dealerList = new ArrayList<>(); dao.find().forEach(dealerList::add); log.info("DealerList size: {}", dealerList.size()); CrawlAutohomeDealerInfoTask crawlTask = new CrawlAutohomeDealerInfoTask(dealerList.iterator()); new TaskControl(crawlTask); }
public void exportDealersToCsv(String fileName, DealerPost.Dao dao) { List<DealerPost> dealerPosts = dao.find().toArray(); // Map<String, List<DealerPost>> dealerByCity = StreamSupport.stream(cursor.spliterator(), // false) // .collect((groupingBy(DealerPost::getCity))); // Sort by province first, then by city dealerPosts.sort( (a, b) -> { if (a.getCity() != null && b.getCity() != null) { return City.order(a.getCity(), a.getProvince()) - City.order(b.getCity(), b.getProvince()); } return a.getCity() != null ? 1 : -1; }); exportToCsv(fileName, dealerPosts.iterator()); }
/**
 * Fetches the dealer's info page, parses it into {@code dealerPost}, and on success persists the
 * post's {@code brandDetailList} through the DAO.
 *
 * <p>A post without a link URL is counted in {@code missingURls} and skipped. Otherwise the
 * fetch-and-parse step is attempted once and then retried up to {@code MAX_RETRY} more times
 * while it keeps failing (either an {@link IOException} or the parser returning {@code false}).
 * If the thread is interrupted, the interrupt flag is restored and no further attempts are made.
 * A post that never succeeds is counted in {@code failures}.
 *
 * @param dealerPost the dealer to crawl; updated in place by the parser
 * @return {@code true} if the page was fetched and parsed and the update was issued,
 *     {@code false} otherwise
 */
@Override
public boolean operate(DealerPost dealerPost) {
  if (dealerPost.getLinkToUrl() == null) {
    log.warn("No linkToUrl found, {}", dealerPost);
    missingURls.incrementAndGet();
    return false;
  }

  int i = 0;
  boolean isSuccess;
  do {
    try {
      Document dealerInfoPage =
          Jsoup.connect(dealerPost.getLinkToUrl())
              .timeout(10_000)
              .userAgent(AutohomeParser.USER_AGENT)
              .get();
      // Short pause after each successful fetch before continuing.
      Thread.sleep(200);
      isSuccess = parser.parseDealerInfo(dealerInfoPage, dealerPost);
    } catch (IOException e) {
      isSuccess = false;
      // i counts completed retries, so this failure is number i + 1.
      log.error(
          "Error connect to {}, already failed {} times -- {}",
          dealerPost.getLinkToUrl(),
          i + 1,
          e);
    } catch (InterruptedException e) {
      // Restore the interrupt flag for callers and stop retrying: with the flag set, every
      // further Thread.sleep would throw again immediately.
      Thread.currentThread().interrupt();
      log.warn("Interrupted while crawling {}, giving up", dealerPost.getLinkToUrl());
      isSuccess = false;
      break;
    }
  } while (!isSuccess && i++ < MAX_RETRY);

  if (isSuccess) {
    // NOTE(review): ImmutableMap.of rejects null values — confirm brandDetailList is always
    // non-null after a successful parse.
    dao.updateById(
        dealerPost.getId(),
        ImmutableMap.of("brandDetailList", dealerPost.getBrandDetailList()));
  } else {
    failures.incrementAndGet();
  }
  return isSuccess;
}