/** 裁判文书 抓取word,HTML修改court桶 */ public static boolean updateJsonData( List<ArchivesVO> list, Bucket bucket, AdministrationUtils util) throws Exception { if (null == list || list.size() <= 0) { return false; } // util.initData(); // 查询行政区 String[] array = null; JsonDocument doc = null; JsonObject obj2 = null; com.google.gson.JsonObject json = null; Gson gson = new Gson(); ArchivesVO archs = null; try { for (ArchivesVO arch : list) { SUM++; // 查询数据 doc = JsonDocument.create(arch.getUuid()); // 获取ID obj2 = bucket.get(doc) == null ? null : bucket.get(doc).content(); if (obj2 == null) { logger.info("匹配不到UUID:" + arch.getUuid()); continue; } archs = new ArchivesVO(); json = gson.fromJson(obj2.toString(), com.google.gson.JsonObject.class); archs = gson.fromJson(json, ArchivesVO.class); if (null != arch.getTitle() && !"".equals(arch.getTitle())) { archs.setTitle(arch.getTitle()); } if (null != obj2.get("title") && !"".equals(obj2.get("title"))) { archs.setTitle(obj2.get("title").toString()); // 标题 } if (null != arch.getCaseNum() && !"".equals(arch.getCaseNum())) { archs.setCaseNum(arch.getCaseNum()); } if (null != obj2.get("caseNum") && !"".equals(obj2.get("caseNum"))) { archs.setCaseNum(obj2.get("caseNum").toString()); // 案号 } if (null != arch.getCourtName() && !"".equals(arch.getCourtName())) { archs.setCourtName(arch.getCourtName()); } if (null != obj2.get("courtName") && !"".equals(obj2.get("courtName"))) { archs.setCourtName(obj2.get("courtName").toString()); // 法院名 } if (null != arch.getCatalog() && !"".equals(arch.getCatalog())) { archs.setCatalog(arch.getCatalog()); } if (null != obj2.get("catalog") && !"".equals(obj2.get("catalog"))) { archs.setCatalog(obj2.get("catalog").toString()); // 分类 } if (null != arch.getApproval() && !"".equals(arch.getApproval())) { archs.setApproval(arch.getApproval()); } if (null != obj2.get("approval") && !"".equals(obj2.get("approval"))) { archs.setApproval(obj2.get("approval").toString()); // 审批结果 } if (null != arch.getCaseCause() && !"".equals(arch.getCaseCause())) { archs.setCaseCause(arch.getCaseCause()); } if (null != obj2.get("caseCause") && !"".equals(obj2.get("caseCause"))) { archs.setCaseCause(obj2.get("caseCause").toString()); // 案由 } if (null != arch.getPlaintiff() && !"".equals(arch.getPlaintiff())) { archs.setPlaintiff(arch.getPlaintiff()); } if (null != obj2.get("plaintiff") && !"".equals(obj2.get("plaintiff"))) { archs.setPlaintiff(obj2.get("plaintiff").toString()); // 原告 } if (null != arch.getDefendant() && !"".equals(arch.getDefendant())) { archs.setDefendant(arch.getDefendant()); } if (null != obj2.get("defendant") && !"".equals(obj2.get("defendant"))) { archs.setDefendant(obj2.get("defendant").toString()); // 被告 } if (null != arch.getApprovalDate() && !"".equals(arch.getApprovalDate())) { archs.setApprovalDate(arch.getApprovalDate()); } if (null != obj2.get("approvalDate") && !"".equals(obj2.get("approvalDate"))) { archs.setApprovalDate(obj2.get("approvalDate").toString()); // 审结日期 } if (null != arch.getSummary() && !"".equals(arch.getSummary())) { archs.setSummary(arch.getSummary()); } if (null != obj2.get("summary") && !"".equals(obj2.get("summary"))) { archs.setSummary(obj2.get("summary").toString()); // 摘要 } if (null != obj2.get("detailLink") && !"".equals(obj2.get("detailLink"))) { archs.setDetailLink(obj2.get("detailLink").toString()); // url } if (null != obj2.get("publishDate") && !"".equals(obj2.get("publishDate"))) { archs.setPublishDate(getReplaceAllDate(obj2.get("publishDate").toString())); // 发布日期 } if (null != obj2.get("province") && !"".equals(obj2.get("province"))) { archs.setProvince(obj2.get("province").toString()); // 省 } if (null != obj2.get("city") && !"".equals(obj2.get("city"))) { archs.setCity(obj2.get("city").toString()); // 市 } if (null != obj2.get("area") && !"".equals(obj2.get("area"))) { archs.setArea(obj2.get("area").toString()); // 县 } if (null != archs.getCourtName() && !"".equals(archs.getCourtName())) { array = util.utils(arch.getCourtName()); } if (null != obj2.get("courtName") && !"".equals(obj2.get("courtName"))) { array = util.utils(obj2.get("courtName").toString()); } if (null != array) { if (null != array[0] && !"".equals(array[0])) { archs.setProvince(array[0]); } if (null != array[1] && !"".equals(array[1])) { archs.setCity(array[1]); } if (null != array[2] && !"".equals(array[2])) { archs.setArea(array[2]); } } if (null != obj2.get("collectDate") && !"".equals(obj2.get("collectDate"))) { archs.setCollectDate(getReplaceAllDate(obj2.get("collectDate").toString())); // 采集时间 } if (null != obj2.get("suitDate") && !"".equals(obj2.get("suitDate"))) { archs.setSuitDate(obj2.get("suitDate").toString()); // 起诉日期 } String jsonss = gson.toJson(archs); doc = JsonDocument.create(arch.getUuid(), JsonObject.fromJson(jsonss)); logger.info("更新条数:" + SUM + "---省:" + array[0] + "---市:" + array[1] + "---县/区:" + array[2]); bucket.upsert(doc); } } catch (Exception e) { logger.error(e.getMessage()); return false; } finally { array = null; gson = null; json = null; archs = null; obj2 = null; doc = null; } return true; }
/** * 递归遍历html文件 * * @param file * @throws * @throws Exception */ private static void show(File file, Bucket bucket, AdministrationUtils util) throws Exception { String variable = null; String html = null; ArchivesVO arch = null; Map<String, List<String>> list = null; List<ArchivesVO> listarchs = null; Document doc; listarchs = new ArrayList<ArchivesVO>(); int i = 0; if (file.isFile()) { arch = new ArchivesVO(); String suffix = file.getName(); suffix = suffix.substring(suffix.indexOf(".") + 1, suffix.length()); suffix = MAPS.get(suffix); if (null == suffix) { return; } logger.info("网址:" + file.getPath()); for (String val : charset) { // 匹配不同编码格式 doc = Jsoup.parse(file, val); html = doc.body().text(); // 取页面body标签中所有内容 boolean Garbled = getErrorCode(html); // 判断编码是否错误 if (Garbled == false) { logger.info(val + "编码错误!!!"); i++; if (i == 5) { html = null; } continue; } i = 0; variable = getReplaceAll(doc.title()); if (variable != null && !"".equals(variable)) { arch.setTitle(variable.trim()); // 标题 √ } logger.info("标题:" + arch.getTitle()); break; } if (html == null || "".equals(html)) { logger.info("内容为空的HTML页面:" + file.getPath()); } html = getReplaceAll(html).trim(); logger.info("所有内容:" + html); list = ExtractthepeopleText.getPersonName(html); arch.setPlaintiff(getKeyName(list, 1)); // 原告相关人 √ logger.info("<<------------------------------------------------------>>"); logger.info("原告相关人:" + arch.getPlaintiff()); arch.setDefendant(getKeyName(list, 2)); // 被告相关人 √ logger.info("被告相关人:" + arch.getDefendant()); variable = getCourtName(html); if (variable != null && !"".equals(variable)) { arch.setCourtName(variable); // 法院 √ } if (variable == null || "".equals(variable)) { arch.setCourtName(getAtherthe(html)); // 法院 √ } logger.info("法院:" + arch.getCourtName()); arch.setCaseCause(StringCause(html)); // 案由 √ logger.info("案由:" + arch.getCaseCause()); arch.setApprovalDate(getConcludeDate(html)); // 审结日期 √ logger.info("审结日期:" + arch.getApprovalDate()); arch.setApproval(getTheVerdictData(html)); // 判决结果 √ logger.info("判决结果:" + arch.getApproval()); arch.setCatalog(getCatalog(html)); // 文书类型 √ logger.info("文书类型:" + arch.getCatalog()); variable = getCaseNum(html); if (!"".equals(variable) && variable != null) { arch.setCaseNum(variable); // 案号 √ } if ("".equals(variable) || variable == null) { arch.setCaseNum(getSentenceNo3(html)); // 案号 √ } logger.info("案号:" + arch.getCaseNum()); logger.info("<<------------------------------------------------------>>"); arch.setUuid(file.getName().substring(0, file.getName().lastIndexOf("."))); listarchs.add(arch); boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(file.getPath() + ":更新失败"); } count += listarchs.size(); logger.info("<<------------------------count------------------------------>>" + count); listarchs = null; return; } File[] files = file.listFiles(); for (File fi : files) { if (fi.isFile()) { arch = new ArchivesVO(); String suffix = fi.getName(); suffix = suffix.substring(suffix.indexOf(".") + 1, suffix.length()); suffix = MAPS.get(suffix); if (null == suffix) { return; } // logger.info("网址:" + fi.getPath()); for (String val : charset) { // 匹配不同编码格式 doc = Jsoup.parse(fi, val); html = doc.body().text(); // html = doc.text(); // html = getDataAll(html); boolean Garbled = getErrorCode(html); // 判断编码是否错误 if (Garbled == false) { i++; if (i == 5) { html = null; } // 判断编码格式都不匹配的时候赋予空值 continue; } i = 0; variable = getReplaceAll(doc.title()); if (variable != null && !"".equals(variable)) { // 防止title标签为空的情况 arch.setTitle(variable.trim()); // 标题 √ } break; } if (html == null || "".equals(html)) { logger.info("内容为空的HTML页面:" + fi.getPath()); continue; } html = getReplaceAll(html).trim(); // 所有内容去掉特殊字符√ list = ExtractthepeopleText.getPersonName(html); arch.setPlaintiff(getKeyName(list, 1)); // 原告相关人√ arch.setDefendant(getKeyName(list, 2)); // 被告相关人√ arch.setCatalog(getCatalog(html)); // 文书类型 √ variable = getCourtName(html); if (variable != null && !"".equals(variable)) { arch.setCourtName(variable); // 法院 √ } if (variable == null || "".equals(variable)) { arch.setCourtName(getAtherthe(html)); // 法院 √ } arch.setCaseCause(StringCause(html)); // 案由 √ arch.setApprovalDate(getConcludeDate(html)); // 审结日期√ arch.setApproval(getTheVerdictData(html)); // 判决结果√ variable = getCaseNum(html); if (!"".equals(variable) && variable != null) { arch.setCaseNum(variable); // 案号 √ } if ("".equals(variable) || variable == null) { arch.setCaseNum(getSentenceNo3(html)); // 案号 √ } arch.setUuid(fi.getName().substring(0, fi.getName().lastIndexOf("."))); // showData(arch); // 打印所有截取字段 listarchs.add(arch); if (listarchs.size() >= 1000) { boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(fi.getPath() + ":更新失败1"); } count += listarchs.size(); logger.info("<<------------------------count------------------------------>>" + count); listarchs = null; listarchs = new ArrayList<ArchivesVO>(); } } else if (fi.isDirectory()) { logger.info(fi.getName()); show(fi, bucket, util); } else { continue; } } if (null != listarchs && listarchs.size() > 0) { boolean result = updateJsonData(listarchs, bucket, util); if (!result) { logger.info(":更新失败2"); } count += listarchs.size(); listarchs = null; arch = null; return; } }