Example #1
0
  /**
   * Runs the keyword pipeline on {@code semiData} and returns the result as JSON.
   *
   * <p>The steps, in order: {@code buildIndex}, {@code getIndexInfo}, {@code generateWekaFile}
   * and {@code Cluster}. On success the returned object holds:
   *
   * <ul>
   *   <li>{@code "maxFreq"} - the value of {@code this.maxFreq} after the pipeline ran,
   *   <li>{@code "totalNum"} - the summed length of every inner array returned by {@code Cluster},
   *   <li>{@code "WordList"} - the array returned by {@code Cluster}.
   * </ul>
   *
   * <p>If any step throws an {@link Exception}, the failure is printed and the returned object
   * carries an {@code "error"} entry instead (best effort). The index directory and the weka file
   * are deleted afterwards in every case. {@link Error}s are not swallowed; they propagate to the
   * caller once the cleanup has run.
   *
   * @param semiData input handed unchanged to {@code buildIndex}
   * @return the JSON object described above; never {@code null}
   */
  public JSONObject getKeyWords(JSONObject semiData) {
    JSONObject keyWords = new JSONObject();

    try {
      this.buildIndex(semiData);
      // NOTE(review): the meaning of the literals 4 and 7 is not visible from this method;
      // confirm against getIndexInfo / Cluster before changing them.
      this.getIndexInfo(this.indexDir, 4);
      this.generateWekaFile(this.termList, this.maxDocNum, this.wekaFile);
      JSONArray array = this.Cluster(this.wekaFile, 7);
      int totalNum = 0;
      for (int i = 0; i < array.length(); i++) {
        totalNum += array.getJSONArray(i).length();
      }
      keyWords.put("maxFreq", this.maxFreq);
      keyWords.put("totalNum", totalNum);
      keyWords.put("WordList", array);
    } catch (WeiboException e) {
      System.out.print("getKeyWords++++++weibo\n");
      System.out.print("error:" + e.getError() + "toString:" + e.toString());
      try {
        keyWords.put("error", e.getError());
      } catch (Exception putFailure) {
        // Recording the error is best effort; never let it mask the original failure.
        putFailure.printStackTrace();
      }
      e.printStackTrace();
    } catch (Exception e) {
      System.out.print("getKeyWords++++++Exception");
      try {
        keyWords.put("error", e.toString());
      } catch (Exception putFailure) {
        // Recording the error is best effort; never let it mask the original failure.
        putFailure.printStackTrace();
      }
      e.printStackTrace();
    } finally {
      // Clean up each artifact independently so that a failure deleting the index
      // directory does not leave the weka file behind.
      try {
        this.myDelete(this.indexDir);
      } catch (Exception cleanupFailure) {
        cleanupFailure.printStackTrace();
      }
      try {
        this.myDelete(this.wekaFile);
      } catch (Exception cleanupFailure) {
        cleanupFailure.printStackTrace();
      }
    }
    // Returning here rather than from inside the finally block keeps Errors from being
    // silently discarded.
    return keyWords;
  }
  /**
   * Runs the crawl loop for the configured uids.
   *
   * <p>If the start time has not been reached yet, the method first sleeps until then. Each loop
   * iteration then:
   *
   * <ol>
   *   <li>stops when {@code CrawlTool.checkStopCrawlTime} reports the stop time was reached, or
   *       when the crawl-count check against {@code requestedCrawlingTimes} fails;
   *   <li>picks the next key via {@code CrawlTool.getNextKey};
   *   <li>advances to the next uid when there is no current uid, when the previous response was
   *       the empty list {@code "[]"}, or when {@code page} exceeds {@code maxPageCount}; the
   *       loop ends when {@code getNextUidTillEnd} returns the sentinel {@code "finish"};
   *   <li>sleeps for {@code interval}, requests the next page and logs the response: usable
   *       responses go to the JSON log, empty/error responses go to the plain log.
   * </ol>
   *
   * <p>A {@code WeiboException} raised while crawling is logged and the loop carries on with the
   * next page of the same uid.
   */
  private void process() {
    // check start crawl time
    if (!CrawlTool.checkStartCrawlTime(this.startCrawlTimeStamp)) {
      // wait to start
      long nowTimeStamp = new Date().getTime();
      int waitTimeSecond =
          CrawlTool.safeLongToInt((this.startCrawlTimeStamp - nowTimeStamp) / 1000);
      System.out.println(
          this.logSign
              + " Sleep a while to start: "
              + waitTimeSecond
              + "s, or:"
              + waitTimeSecond / 60
              + "m.");
      CrawlTool.sleep(waitTimeSecond, this.logSign);
    }

    // loop state carried from one iteration to the next
    String currentKey = "";
    String currentUid = "";
    int page = 0;
    boolean isResultEmpty = false;

    while (true) {

      // check if it is time to stop
      if (CrawlTool.checkStopCrawlTime(this.stopCrawlTimeStamp)) {
        System.out.println("Time's up. Stop crawling now.");
        break;
      }

      // check if it is the max times to crawl
      // NOTE(review): crawledTimes is incremented before each request below, so if it starts
      // at 0 the ">=" allows requestedCrawlingTimes + 1 requests - confirm whether ">" was
      // intended. Left unchanged because the field's initial value is not visible here.
      if (!(this.requestedCrawlingTimes > 0 && this.requestedCrawlingTimes >= this.crawledTimes)) {
        return;
      }

      // prepare the key
      currentKey = CrawlTool.getNextKey(keyList, currentKey);

      // prepare the uid, page, count
      if (null == currentUid || "".equals(currentUid)) {
        currentUid = getNextUidTillEnd(currentUid);
        if ("finish".equals(currentUid)) {
          reportAllUidsCrawled();
          break;
        }
        page = 0; // because later there will be page++
      }
      if (isResultEmpty || page > this.maxPageCount) {
        // empty result or page > max page: go to the next uid and restart paging
        if (isResultEmpty) {
          System.out.println("The result is empty.");
        } else {
          System.out.println("The max page count is reached.");
        }
        currentUid = getNextUidTillEnd(currentUid);
        page = 0;
        if ("finish".equals(currentUid)) {
          reportAllUidsCrawled();
          break;
        }
      }

      page++;

      // prepare the parameter map
      Map<String, String> map = getParaMap(currentUid, page, count);
      System.out.println(map.toString());
      this.crawledTimes++;
      System.out.println("Crawling for the times: " + this.crawledTimes);

      // sleep interval
      CrawlTool.sleep(this.interval, this.logSign);

      // Reset before the attempt: if the request throws, a stale "empty" flag from the
      // previous uid must not cause the current uid to be skipped on the next iteration.
      isResultEmpty = false;

      // try to crawl
      try {
        String line =
            "uid="
                + currentUid
                + ", page="
                + page
                + ", count="
                + count
                + ", Crawl times: "
                + this.crawledTimes;
        ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line);

        // try to get the data
        String result = ExpUtils.crawlData(api, map, currentKey);

        isResultEmpty = "[]".equals(result);
        // A null response is treated like an error payload instead of triggering an NPE.
        boolean isError = result == null || result.startsWith("{\"error\"");

        if (!isResultEmpty && !isError) {
          ExpUtils.mylogJson(CrawlTool.splitFileNameByHour(JsonlogName), result);
        } else {
          // something wrong
          ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), String.valueOf(result));
        }
      } catch (WeiboException e) {
        e.printStackTrace();
        ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), e.getError());
      }
    }
  }

  /** Prints and logs the notice that every uid has been crawled. */
  private void reportAllUidsCrawled() {
    String line = "All uids have been crawled, now finish! " + new Date();
    System.out.println(line);
    ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line);
  }