/* * 将前面产生的Prepare的BatchJob文件 添加上起始标志,并且重新生成新的文件,(不含Prepare的后缀了) * */ public static void createAllOtherCityBatchJobReadyFile() { // 提取所有的城市和子域名的对应关系 initCityAndSubdomainList(); for (Pair pair : cityAndSubdomainList) { String URL_SUBDOMAIN = pair.getValue(); String CITY_PINYING = URL_SUBDOMAIN.toUpperCase().charAt(0) + URL_SUBDOMAIN.substring(1); String BATCH_FILE_PREPARE_PATH = "./data/batch/" + SITE_NAME_PINYING + "/" + SITE_NAME_PINYING + CITY_PINYING + TYPE_PINYING + "BatchCreateJobFilePrepare.txt"; String BATCH_FILE_READY_PATH = "./data/batch/" + SITE_NAME_PINYING + "/" + SITE_NAME_PINYING + CITY_PINYING + TYPE_PINYING + "BatchCreateJobFile.txt"; String fileContent = FileUtil.getDataFile2StrKeepReturn(BATCH_FILE_PREPARE_PATH, "utf-8"); fileContent = "^" + "\n" + fileContent + "$" + "\n"; FileUtil.writeStr2File(fileContent, BATCH_FILE_READY_PATH, "utf-8"); System.out.println(BATCH_FILE_READY_PATH + " is completed!"); } }
/*
 * Collects all the "other" cities from the page at ALL_OTHER_CITY_URL
 * (http://www.anjuke.com/index/ according to the original note): every city
 * except Beijing and Shanghai, while Guangzhou and Shenzhen are kept.
 * Each kept city is emitted as one "href,cityName" line; the result is
 * printed and written to ALL_OTHER_CITY_FILE_PATH.
 */
@Test
public void allOtherCityWhole() {
    Element cityContainer = ParserUtil.parseUrlWithRegexAndResultIndex(ALL_OTHER_CITY_URL, ALL_OTHER_CITY_REGEX, 0);
    // "a" presumably selects the anchor elements inside the container -- confirm against ParserUtil.
    Elements links = ParserUtil.parseElementWithRegex(cityContainer, "a");

    StringBuilder lines = new StringBuilder();
    for (Element link : links) {
        String href = link.attr("href");
        String cityName = link.text();

        // Beijing and Shanghai are deliberately left out of this list.
        boolean excluded = cityName.equals("北京") || cityName.equals("上海");
        if (excluded) {
            continue;
        }

        lines.append(href).append(",").append(cityName).append("\n");
    }

    String output = lines.toString();
    System.out.println(output);
    FileUtil.writeStr2File(output, ALL_OTHER_CITY_FILE_PATH, "utf-8");
}
// 提取所有的城市和子域名的对应关系 public static void initCityAndSubdomainList() { String content = FileUtil.getDataFile2StrKeepReturn(ALL_OTHER_CITY_FILE_PATH, "utf-8"); String[] lineArr = content.split("\n"); cityAndSubdomainList = new ArrayList<Pair>(); // 提取所有的城市和子域名的对应关系 for (String line : lineArr) { String[] attrArr = line.split(","); String href = attrArr[0]; String city = attrArr[1]; String subDomain = href.substring(href.lastIndexOf("/") + 1, href.indexOf(".", href.lastIndexOf("/"))); Pair pair = new Pair(); pair.setKey(city); pair.setValue(subDomain); cityAndSubdomainList.add(pair); System.out.println(city + " : " + subDomain); } System.out.println("/----------------------------------------------------------------/"); System.out.println("all cities and their subdomains extraction is completed!"); System.out.println("/----------------------------------------------------------------/"); }