/**
 * Builds a temporary index from {@code semiData}, extracts term statistics, clusters the
 * terms, and returns the result as JSON with keys {@code maxFreq}, {@code totalNum}
 * (total words across all clusters) and {@code WordList} (array of clusters).
 * On failure an {@code error} entry is set instead. Temporary files under
 * {@code indexDir} / {@code wekaFile} are deleted in all cases (best effort).
 *
 * @param semiData input data to index — presumably raw post/term JSON; TODO confirm schema
 * @return JSON result, never {@code null}; contains {@code error} on failure
 */
public JSONObject getKeyWords(JSONObject semiData) {
    JSONObject keyWords = new JSONObject();
    try {
        this.buildIndex(semiData);
        // 4: argument meaning not visible from here — TODO confirm (field count? min freq?)
        this.getIndexInfo(this.indexDir, 4);
        this.generateWekaFile(this.termList, this.maxDocNum, this.wekaFile);
        // 7: requested number of clusters — TODO confirm
        JSONArray array = this.Cluster(this.wekaFile, 7);
        // Sum the sizes of all clusters to get the total word count.
        int totalNum = 0;
        for (int i = 0; i < array.length(); i++) {
            totalNum += array.getJSONArray(i).length();
        }
        keyWords.put("maxFreq", this.maxFreq);
        keyWords.put("totalNum", totalNum);
        keyWords.put("WordList", array);
    } catch (WeiboException e) {
        System.out.print("getKeyWords++++++weibo\n");
        System.out.print("error:" + e.getError() + "toString:" + e.toString());
        keyWords.put("error", e.getError());
        e.printStackTrace();
    } catch (Exception e) {
        System.out.print("getKeyWords++++++Exception");
        keyWords.put("error", e.toString());
        e.printStackTrace();
    } finally {
        // Best-effort cleanup of the temporary index and weka files.
        try {
            this.myDelete(this.indexDir);
            this.myDelete(this.wekaFile);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // FIX: return moved out of the finally block. A return inside finally silently
    // discards any exception pending at that point and forced the
    // @SuppressWarnings("finally") annotation; both are gone now.
    return keyWords;
}
private void process() { // check start crawl time if (!CrawlTool.checkStartCrawlTime(this.startCrawlTimeStamp)) { // wait // to // start Long nowTimeStamp = new Date().getTime(); int waitTimeSecond = CrawlTool.safeLongToInt((this.startCrawlTimeStamp - nowTimeStamp) / 1000); System.out.println( this.logSign + " Sleep a while to start: " + waitTimeSecond + "s, or:" + waitTimeSecond / 60 + "m."); CrawlTool.sleep(waitTimeSecond, this.logSign); } // the variables and flags String currentKey = ""; int page = 0; String currentUid = ""; boolean isResultEmpty = false; boolean isResultTooOld = false; boolean isError = false; while (true) { // check if it is time to stop boolean stopCrawl = CrawlTool.checkStopCrawlTime(this.stopCrawlTimeStamp); if (stopCrawl) { // report this round System.out.println("Time's up. Stop crawling now."); break; } // check if it is the max times to crawl if (!(this.requestedCrawlingTimes > 0 && this.requestedCrawlingTimes >= this.crawledTimes)) { return; } // prepare the key currentKey = CrawlTool.getNextKey(keyList, currentKey); // prepare the uid, page, count if (null == currentUid || "".equals(currentUid)) { currentUid = getNextUidTillEnd(currentUid); if (currentUid == "finish" || "finish".equals(currentUid)) { // all // uids // have // been // crawled String line = "All uids have been crawled, now finish! " + new Date(); System.out.println(line); ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line); break; } page = 0; // because later there will be page++ } if (isResultEmpty || page > this.maxPageCount) { // empty result get or page > max page, goto next id, page = 1 System.out.println("The result is empty."); currentUid = getNextUidTillEnd(currentUid); page = 0; if (currentUid == "finish" || "finish".equals(currentUid)) { // all // uids // have // been // crawled String line = "All uids have been crawled, now finish! 
" + new Date(); System.out.println(line); ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line); break; } } // if the posts got is older than the required date, then stop next // page, and change to the next userid // if (isResultTooOld) { // System.out.println("The last post is too old."); // currentUid = getNextUidTillEnd(currentUid); // page = 0; // // if (currentUid == "finish" || "finish".equals(currentUid)) {// // all // // uids // // have // // been // // crawled // String line = "All uids have been crawled, now finish! " // + new Date(); // System.out.println(line); // ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line); // break; // } // // } page++; // prepare the parameter map Map<String, String> map = getParaMap(currentUid, page, count); System.out.println(map.toString()); this.crawledTimes++; System.out.println("Crawling for the times: " + this.crawledTimes); // sleep interval CrawlTool.sleep(this.interval, this.logSign); // try to crawl try { String line = "uid=" + currentUid + ", page=" + page + ", count=" + count + ", Crawl times: " + this.crawledTimes; ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), line); // try to get the data String result = ExpUtils.crawlData(api, map, currentKey); // result = StringEscapeUtils.unescapeJava(result); // init and set the flags isResultEmpty = false; isError = false; if ("[]".equals(result)) { isResultEmpty = true; } else { isResultEmpty = false; } if (result.startsWith("{\"error\"")) { isError = true; } else { isError = false; } // if (isResultTooOld(result)) { // isResultTooOld = true; // } else { // isResultTooOld = false; // } if (!isResultEmpty && !isError) { ExpUtils.mylogJson(CrawlTool.splitFileNameByHour(JsonlogName), result); } else { // something wrong ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), result); } } catch (WeiboException e) { e.printStackTrace(); ExpUtils.mylog(CrawlTool.splitFileNameByHour(logName), e.getError()); continue; } } }