/** * 递归遍历html文件 * * @param file * @throws * @throws Exception */ private static void show(File file, Bucket bucket, AdministrationUtils util) throws Exception { String variable = null; String html = null; ArchivesVO arch = null; Map<String, List<String>> list = null; List<ArchivesVO> listarchs = null; Document doc; listarchs = new ArrayList<ArchivesVO>(); int i = 0; if (file.isFile()) { arch = new ArchivesVO(); String suffix = file.getName(); suffix = suffix.substring(suffix.indexOf(".") + 1, suffix.length()); suffix = MAPS.get(suffix); if (null == suffix) { return; } logger.info("网址:" + file.getPath()); for (String val : charset) { // 匹配不同编码格式 doc = Jsoup.parse(file, val); html = doc.body().text(); // 取页面body标签中所有内容 boolean Garbled = getErrorCode(html); // 判断编码是否错误 if (Garbled == false) { logger.info(val + "编码错误!!!"); i++; if (i == 5) { html = null; } continue; } i = 0; variable = getReplaceAll(doc.title()); if (variable != null && !"".equals(variable)) { arch.setTitle(variable.trim()); // 标题 √ } logger.info("标题:" + arch.getTitle()); break; } if (html == null || "".equals(html)) { logger.info("内容为空的HTML页面:" + file.getPath()); } html = getReplaceAll(html).trim(); logger.info("所有内容:" + html); list = ExtractthepeopleText.getPersonName(html); arch.setPlaintiff(getKeyName(list, 1)); // 原告相关人 √ logger.info("<<------------------------------------------------------>>"); logger.info("原告相关人:" + arch.getPlaintiff()); arch.setDefendant(getKeyName(list, 2)); // 被告相关人 √ logger.info("被告相关人:" + arch.getDefendant()); variable = getCourtName(html); if (variable != null && !"".equals(variable)) { arch.setCourtName(variable); // 法院 √ } if (variable == null || "".equals(variable)) { arch.setCourtName(getAtherthe(html)); // 法院 √ } logger.info("法院:" + arch.getCourtName()); arch.setCaseCause(StringCause(html)); // 案由 √ logger.info("案由:" + arch.getCaseCause()); arch.setApprovalDate(getConcludeDate(html)); // 审结日期 √ logger.info("审结日期:" + arch.getApprovalDate()); arch.setApproval(getTheVerdictData(html)); // 判决结果 √ logger.info("判决结果:" + arch.getApproval()); arch.setCatalog(getCatalog(html)); // 文书类型 √ logger.info("文书类型:" + arch.getCatalog()); variable = getCaseNum(html); if (!"".equals(variable) && variable != null) { arch.setCaseNum(variable); // 案号 √ } if ("".equals(variable) || variable == null) { arch.setCaseNum(getSentenceNo3(html)); // 案号 √ } logger.info("案号:" + arch.getCaseNum()); logger.info("<<------------------------------------------------------>>"); arch.setUuid(file.getName().substring(0, file.getName().lastIndexOf("."))); listarchs.add(arch); boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(file.getPath() + ":更新失败"); } count += listarchs.size(); logger.info("<<------------------------count------------------------------>>" + count); listarchs = null; return; } File[] files = file.listFiles(); for (File fi : files) { if (fi.isFile()) { arch = new ArchivesVO(); String suffix = fi.getName(); suffix = suffix.substring(suffix.indexOf(".") + 1, suffix.length()); suffix = MAPS.get(suffix); if (null == suffix) { return; } // logger.info("网址:" + fi.getPath()); for (String val : charset) { // 匹配不同编码格式 doc = Jsoup.parse(fi, val); html = doc.body().text(); // html = doc.text(); // html = getDataAll(html); boolean Garbled = getErrorCode(html); // 判断编码是否错误 if (Garbled == false) { i++; if (i == 5) { html = null; } // 判断编码格式都不匹配的时候赋予空值 continue; } i = 0; variable = getReplaceAll(doc.title()); if (variable != null && !"".equals(variable)) { // 防止title标签为空的情况 arch.setTitle(variable.trim()); // 标题 √ } break; } if (html == null || "".equals(html)) { logger.info("内容为空的HTML页面:" + fi.getPath()); continue; } html = getReplaceAll(html).trim(); // 所有内容去掉特殊字符√ list = ExtractthepeopleText.getPersonName(html); arch.setPlaintiff(getKeyName(list, 1)); // 原告相关人√ arch.setDefendant(getKeyName(list, 2)); // 被告相关人√ arch.setCatalog(getCatalog(html)); // 文书类型 √ variable = getCourtName(html); if (variable != null && !"".equals(variable)) { arch.setCourtName(variable); // 法院 √ } if (variable == null || "".equals(variable)) { arch.setCourtName(getAtherthe(html)); // 法院 √ } arch.setCaseCause(StringCause(html)); // 案由 √ arch.setApprovalDate(getConcludeDate(html)); // 审结日期√ arch.setApproval(getTheVerdictData(html)); // 判决结果√ variable = getCaseNum(html); if (!"".equals(variable) && variable != null) { arch.setCaseNum(variable); // 案号 √ } if ("".equals(variable) || variable == null) { arch.setCaseNum(getSentenceNo3(html)); // 案号 √ } arch.setUuid(fi.getName().substring(0, fi.getName().lastIndexOf("."))); // showData(arch); // 打印所有截取字段 listarchs.add(arch); if (listarchs.size() >= 1000) { boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(fi.getPath() + ":更新失败1"); } count += listarchs.size(); logger.info("<<------------------------count------------------------------>>" + count); listarchs = null; listarchs = new ArrayList<ArchivesVO>(); } } else if (fi.isDirectory()) { logger.info(fi.getName()); show(fi, bucket, util); } else { continue; } } if (null != listarchs && listarchs.size() > 0) { boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(":更新失败2"); } count += listarchs.size(); listarchs = null; arch = null; return; } }