Пример #1
0
  /**
   * Merges fields parsed from crawled judgment documents (Word/HTML) into the matching documents
   * of the court bucket.
   *
   * <p>For every entry the stored document is looked up by UUID; entries with no stored document
   * are logged and skipped. A non-empty value already present in the stored document wins, and the
   * value parsed into {@code list} only fills a field the stored document lacks. Province, city
   * and area are then overwritten with the non-empty elements returned by {@code util.utils(...)}
   * for the merged court name, and the merged document is upserted.
   *
   * @param list parsed archives, each carrying the UUID of the document to update; {@code null}
   *     or empty yields {@code false}
   * @param bucket bucket that is read from and upserted into
   * @param util resolver whose {@code utils(courtName)} result is read as
   *     {@code [province, city, area]}
   * @return {@code true} when the whole list was processed; {@code false} for empty input or when
   *     an exception aborted the batch
   * @throws Exception kept for caller compatibility; failures inside the loop are logged and
   *     reported through the return value instead
   */
  public static boolean updateJsonData(
      List<ArchivesVO> list, Bucket bucket, AdministrationUtils util) throws Exception {
    if (null == list || list.isEmpty()) {
      return false;
    }
    Gson gson = new Gson();
    try {
      for (ArchivesVO arch : list) {
        SUM++;
        // Fetch the stored document once; the previous code issued the same lookup twice.
        JsonDocument stored = bucket.get(JsonDocument.create(arch.getUuid()));
        JsonObject obj2 = stored == null ? null : stored.content();
        if (obj2 == null) {
          logger.info("匹配不到UUID:" + arch.getUuid());
          continue;
        }
        ArchivesVO archs = gson.fromJson(obj2.toString(), ArchivesVO.class);

        String value = pickNonEmpty(obj2.get("title"), arch.getTitle()); // title
        if (value != null) {
          archs.setTitle(value);
        }
        value = pickNonEmpty(obj2.get("caseNum"), arch.getCaseNum()); // case number
        if (value != null) {
          archs.setCaseNum(value);
        }
        value = pickNonEmpty(obj2.get("courtName"), arch.getCourtName()); // court name
        if (value != null) {
          archs.setCourtName(value);
        }
        value = pickNonEmpty(obj2.get("catalog"), arch.getCatalog()); // document category
        if (value != null) {
          archs.setCatalog(value);
        }
        value = pickNonEmpty(obj2.get("approval"), arch.getApproval()); // ruling result
        if (value != null) {
          archs.setApproval(value);
        }
        value = pickNonEmpty(obj2.get("caseCause"), arch.getCaseCause()); // cause of action
        if (value != null) {
          archs.setCaseCause(value);
        }
        value = pickNonEmpty(obj2.get("plaintiff"), arch.getPlaintiff()); // plaintiff
        if (value != null) {
          archs.setPlaintiff(value);
        }
        value = pickNonEmpty(obj2.get("defendant"), arch.getDefendant()); // defendant
        if (value != null) {
          archs.setDefendant(value);
        }
        value = pickNonEmpty(obj2.get("approvalDate"), arch.getApprovalDate()); // conclusion date
        if (value != null) {
          archs.setApprovalDate(value);
        }
        value = pickNonEmpty(obj2.get("summary"), arch.getSummary()); // summary
        if (value != null) {
          archs.setSummary(value);
        }

        // The remaining fields are only ever taken from the stored document.
        value = pickNonEmpty(obj2.get("detailLink"), null); // url
        if (value != null) {
          archs.setDetailLink(value);
        }
        value = pickNonEmpty(obj2.get("publishDate"), null); // publish date
        if (value != null) {
          archs.setPublishDate(getReplaceAllDate(value));
        }
        value = pickNonEmpty(obj2.get("province"), null); // province
        if (value != null) {
          archs.setProvince(value);
        }
        value = pickNonEmpty(obj2.get("city"), null); // city
        if (value != null) {
          archs.setCity(value);
        }
        value = pickNonEmpty(obj2.get("area"), null); // county / district
        if (value != null) {
          archs.setArea(value);
        }

        // Resolve the region per record from the merged court name. Keeping the array local to
        // the iteration stops one record's region from leaking into the next one.
        String[] region = null;
        String courtName = archs.getCourtName();
        if (null != courtName && !"".equals(courtName)) {
          region = util.utils(courtName);
        }
        String province = regionPart(region, 0);
        String city = regionPart(region, 1);
        String area = regionPart(region, 2);
        if (null != province && !"".equals(province)) {
          archs.setProvince(province);
        }
        if (null != city && !"".equals(city)) {
          archs.setCity(city);
        }
        if (null != area && !"".equals(area)) {
          archs.setArea(area);
        }

        value = pickNonEmpty(obj2.get("collectDate"), null); // collection time
        if (value != null) {
          archs.setCollectDate(getReplaceAllDate(value));
        }
        value = pickNonEmpty(obj2.get("suitDate"), null); // filing date
        if (value != null) {
          archs.setSuitDate(value);
        }

        JsonDocument merged =
            JsonDocument.create(arch.getUuid(), JsonObject.fromJson(gson.toJson(archs)));
        // Logs the resolved parts (or "null") instead of dereferencing a possibly-null array.
        logger.info("更新条数:" + SUM + "---省:" + province + "---市:" + city + "---县/区:" + area);
        bucket.upsert(merged);
      }
    } catch (Exception e) {
      // Pass the throwable so the stack trace is not lost.
      logger.error(e.getMessage(), e);
      return false;
    }
    return true;
  }

  /**
   * Chooses the value to write for one merged field: the stored value when it is non-empty,
   * otherwise the parsed value when that is non-empty, otherwise {@code null} (leave unchanged).
   */
  private static String pickNonEmpty(Object stored, String parsed) {
    if (null != stored && !"".equals(stored)) {
      return stored.toString();
    }
    if (null != parsed && !"".equals(parsed)) {
      return parsed;
    }
    return null;
  }

  /** Returns {@code region[index]}, or {@code null} when the array is missing or too short. */
  private static String regionPart(String[] region, int index) {
    if (region == null || index >= region.length) {
      return null;
    }
    return region[index];
  }
Пример #2
0
  /**
   * Recursively walks {@code file}, extracts case metadata from every supported HTML file and
   * hands it to {@code updateJsonData}.
   *
   * <p>A regular file is parsed and pushed on its own. For a directory, files are parsed and
   * pushed in batches of 1000, sub-directories are processed recursively, and any remaining
   * records are pushed at the end. Unsupported or undecodable files are skipped without
   * interrupting the traversal.
   *
   * @param file file or directory to process
   * @param bucket bucket passed through to {@code updateJsonData}
   * @param util region resolver passed through to {@code updateJsonData}
   * @throws Exception if reading or parsing a file fails
   */
  private static void show(File file, Bucket bucket, AdministrationUtils util) throws Exception {
    if (file.isFile()) {
      ArchivesVO arch = parseHtmlArchive(file, true);
      if (arch == null) {
        return;
      }
      List<ArchivesVO> single = new ArrayList<ArchivesVO>();
      single.add(arch);
      flushArchives(single, bucket, util, file.getPath() + ":更新失败");
      logger.info("<<------------------------count------------------------------>>" + count);
      return;
    }

    File[] files = file.listFiles();
    if (files == null) {
      // listFiles() returns null when the path is not a directory or cannot be read.
      logger.info("无法列出目录:" + file.getPath());
      return;
    }

    List<ArchivesVO> batch = new ArrayList<ArchivesVO>();
    for (File fi : files) {
      if (fi.isFile()) {
        ArchivesVO arch = parseHtmlArchive(fi, false);
        if (arch == null) {
          // Skip only this file; returning here would drop the batch collected so far.
          continue;
        }
        batch.add(arch);
        if (batch.size() >= 1000) {
          flushArchives(batch, bucket, util, fi.getPath() + ":更新失败1");
          logger.info("<<------------------------count------------------------------>>" + count);
          batch = new ArrayList<ArchivesVO>();
        }
      } else if (fi.isDirectory()) {
        logger.info(fi.getName());
        show(fi, bucket, util);
      }
    }
    if (!batch.isEmpty()) {
      flushArchives(batch, bucket, util, ":更新失败2");
    }
  }

  /**
   * Parses one HTML file into an {@code ArchivesVO}, or returns {@code null} when the file should
   * be skipped: its extension has no entry in {@code MAPS}, none of the configured charsets
   * decodes it, or its body text is empty.
   *
   * @param file HTML file to parse
   * @param verbose when {@code true}, logs the path, failed charsets and every extracted field
   */
  private static ArchivesVO parseHtmlArchive(File file, boolean verbose) throws Exception {
    String fileName = file.getName();
    // The extension is everything after the FIRST dot, as in the original lookup.
    String extension = fileName.substring(fileName.indexOf(".") + 1);
    if (null == MAPS.get(extension)) {
      return null;
    }
    if (verbose) {
      logger.info("网址:" + file.getPath());
    }

    ArchivesVO arch = new ArchivesVO();
    String html = null;
    // Try each configured charset; html stays null unless one of them decodes cleanly, so the
    // check no longer depends on a hard-coded number of charsets or on state from earlier files.
    for (String encoding : charset) {
      Document doc = Jsoup.parse(file, encoding);
      String text = doc.body().text(); // all text inside the body tag
      if (!getErrorCode(text)) { // false means the text is garbled under this charset
        if (verbose) {
          logger.info(encoding + "编码错误!!!");
        }
        continue;
      }
      html = text;
      String title = getReplaceAll(doc.title());
      if (title != null && !"".equals(title)) { // guards against an empty title tag
        arch.setTitle(title.trim());
      }
      if (verbose) {
        logger.info("标题:" + arch.getTitle());
      }
      break;
    }
    if (html == null || "".equals(html)) {
      logger.info("内容为空的HTML页面:" + file.getPath());
      return null;
    }

    html = getReplaceAll(html).trim(); // strip special characters
    if (verbose) {
      logger.info("所有内容:" + html);
    }

    Map<String, List<String>> people = ExtractthepeopleText.getPersonName(html);
    arch.setPlaintiff(getKeyName(people, 1)); // plaintiff-side parties
    arch.setDefendant(getKeyName(people, 2)); // defendant-side parties

    String court = getCourtName(html);
    if (court != null && !"".equals(court)) {
      arch.setCourtName(court);
    } else {
      arch.setCourtName(getAtherthe(html)); // fallback court extraction
    }

    arch.setCaseCause(StringCause(html)); // cause of action
    arch.setApprovalDate(getConcludeDate(html)); // conclusion date
    arch.setApproval(getTheVerdictData(html)); // ruling result
    arch.setCatalog(getCatalog(html)); // document type

    String caseNum = getCaseNum(html);
    if (caseNum != null && !"".equals(caseNum)) {
      arch.setCaseNum(caseNum);
    } else {
      arch.setCaseNum(getSentenceNo3(html)); // fallback case-number extraction
    }

    // The UUID is the file name without its last extension.
    int lastDot = fileName.lastIndexOf(".");
    arch.setUuid(lastDot >= 0 ? fileName.substring(0, lastDot) : fileName);

    if (verbose) {
      logger.info("<<------------------------------------------------------>>");
      logger.info("原告相关人:" + arch.getPlaintiff());
      logger.info("被告相关人:" + arch.getDefendant());
      logger.info("法院:" + arch.getCourtName());
      logger.info("案由:" + arch.getCaseCause());
      logger.info("审结日期:" + arch.getApprovalDate());
      logger.info("判决结果:" + arch.getApproval());
      logger.info("文书类型:" + arch.getCatalog());
      logger.info("案号:" + arch.getCaseNum());
      logger.info("<<------------------------------------------------------>>");
    }
    return arch;
  }

  /**
   * Pushes one batch through {@code updateJsonData}, logs {@code failureMessage} when it reports
   * failure, and adds the batch size to the running {@code count} either way.
   */
  private static void flushArchives(
      List<ArchivesVO> batch, Bucket bucket, AdministrationUtils util, String failureMessage)
      throws Exception {
    if (!updateJsonData(batch, bucket, util)) {
      logger.info(failureMessage);
    }
    count += batch.size();
  }