/**
 * Loads every {@code DealerPost} returned by the DAO into an in-memory list, logs how many were
 * loaded, and passes an iterator over that list to a new {@code CrawlAutohomeDealerInfoTask}
 * wrapped in a {@code TaskControl}.
 *
 * <p>NOTE(review): constructing {@code TaskControl} presumably starts the crawl; confirm against
 * that class.
 */
public static void startCrawling() {
   AutohomeModule module = new AutohomeModule();
   DealerPost.Dao dealerDao = module.getInstance(DealerPost.Dao.class);
   // TODO: fix the DB connection being lost after about 30 minutes
   // (possible cause: the DBCursor being finalized automatically?)
   List<DealerPost> dealers = new ArrayList<>();
   dealerDao.find().forEach(dealers::add);
   log.info("DealerList size: {}", dealers.size());
   new TaskControl(new CrawlAutohomeDealerInfoTask(dealers.iterator()));
 }
 // Example #2
 // 0
 /**
  * Exports every dealer returned by {@code dao.find()} to a CSV file, in sorted order.
  *
  * <p>Dealers that have a city are ordered by {@code City.order(city, province)} (per the
  * original note: province first, then city). Dealers without a city sort before all dealers
  * that have one, and compare as equal to each other.
  *
  * @param fileName target file name, passed through unchanged to {@code exportToCsv}
  * @param dao DAO whose {@code find()} result supplies the dealers to export
  */
 public void exportDealersToCsv(String fileName, DealerPost.Dao dao) {
   List<DealerPost> dealerPosts = dao.find().toArray();
   dealerPosts.sort(
       (a, b) -> {
         boolean aHasCity = a.getCity() != null;
         boolean bHasCity = b.getCity() != null;
         if (aHasCity && bHasCity) {
           // Integer.compare avoids the overflow that subtracting two ranks could cause.
           return Integer.compare(
               City.order(a.getCity(), a.getProvince()),
               City.order(b.getCity(), b.getProvince()));
         }
         // Missing city sorts first; two missing cities must compare as equal (0), otherwise
         // the comparator is not symmetric and List.sort may throw IllegalArgumentException.
         return Boolean.compare(aHasCity, bHasCity);
       });
   exportToCsv(fileName, dealerPosts.iterator());
 }
  /**
   * Fetches the dealer's info page, lets the parser fill {@code dealerPost} from it, and on
   * success writes the dealer's {@code brandDetailList} back through the DAO.
   *
   * <p>The fetch-and-parse step runs once and is repeated while it fails, up to {@code MAX_RETRY}
   * additional times. A failure is either an {@link IOException} while connecting or the parser
   * returning {@code false}. If the thread is interrupted, the interrupt flag is restored and no
   * further attempts are made.
   *
   * @param dealerPost dealer to crawl; its {@code linkToUrl} is the page that is fetched
   * @return {@code true} if the page was parsed successfully and the update was issued;
   *     {@code false} if the URL is missing, every attempt failed, or the thread was interrupted
   */
  @Override
  public boolean operate(DealerPost dealerPost) {
    if (dealerPost.getLinkToUrl() == null) {
      log.warn("No linkToUrl found, {}", dealerPost);
      missingURls.incrementAndGet();
      return false;
    }
    int i = 0;
    boolean isSuccess;
    do {
      try {
        Document dealerInfoPage =
            Jsoup.connect(dealerPost.getLinkToUrl())
                .timeout(10_000)
                .userAgent(AutohomeParser.USER_AGENT)
                .get();
        // Pause after each successful fetch (presumably to throttle requests to the site).
        Thread.sleep(200);
        isSuccess = parser.parseDealerInfo(dealerInfoPage, dealerPost);
      } catch (IOException e) {
        isSuccess = false;
        // i counts the failures before this attempt, so this one is failure number i + 1.
        log.error(
            "Error connect to {}, already failed {} times -- {}",
            dealerPost.getLinkToUrl(),
            i + 1,
            e);
      } catch (InterruptedException e) {
        // Restore the interrupt flag for callers and stop retrying: any further sleep would
        // be interrupted again immediately.
        Thread.currentThread().interrupt();
        log.warn("Interrupted while crawling {}", dealerPost.getLinkToUrl());
        isSuccess = false;
        break;
      }
    } while (!isSuccess && i++ < MAX_RETRY);

    if (isSuccess) {
      dao.updateById(
          dealerPost.getId(), ImmutableMap.of("brandDetailList", dealerPost.getBrandDetailList()));
    } else {
      failures.incrementAndGet();
    }

    return isSuccess;
  }