Beispiel #1
0
  /**
   * Recursively walks {@code file} and uploads the archive record extracted from every
   * supported file it finds.
   *
   * <p>When {@code file} is a regular file it is parsed and handed to {@code updateJsonData}
   * on its own. Otherwise its children are listed: regular files are parsed and uploaded in
   * batches of 1000, sub-directories are processed by a recursive call, and whatever is left
   * in the last partial batch is uploaded after the loop. The static {@code count} is
   * increased by the size of every batch handed to {@code updateJsonData}, whether or not
   * that call reported success.
   *
   * <p>A file that has to be skipped (unsupported extension, no usable text) never stops the
   * traversal of its siblings and never discards records that are already queued.
   *
   * @param file file or directory to process
   * @param bucket passed through unchanged to {@code updateJsonData}
   * @param util passed through unchanged to {@code updateJsonData}
   * @throws Exception propagated from Jsoup parsing or from the helpers this method calls
   */
  private static void show(File file, Bucket bucket, AdministrationUtils util) throws Exception {
    final int batchSize = 1000;

    if (file.isFile()) {
      ArchivesVO single = parseArchiveFile(file, true);
      if (single == null) {
        return;
      }
      List<ArchivesVO> records = new ArrayList<ArchivesVO>();
      records.add(single);
      if (!updateJsonData(records, bucket, util)) {
        logger.info(file.getPath() + ":更新失败");
      }
      count += records.size();
      logger.info("<<------------------------count------------------------------>>" + count);
      return;
    }

    // listFiles() yields null when the path is not a directory or cannot be read.
    File[] children = file.listFiles();
    if (children == null) {
      logger.info("无法读取目录:" + file.getPath());
      return;
    }

    List<ArchivesVO> batch = new ArrayList<ArchivesVO>();
    for (File child : children) {
      if (child.isFile()) {
        ArchivesVO record = parseArchiveFile(child, false);
        if (record == null) {
          // Skip only this file; siblings and the pending batch must still be processed.
          continue;
        }
        batch.add(record);
        if (batch.size() >= batchSize) {
          if (!updateJsonData(batch, bucket, util)) {
            logger.info(child.getPath() + ":更新失败1");
          }
          count += batch.size();
          logger.info("<<------------------------count------------------------------>>" + count);
          // Start a fresh list instead of clearing, in case the uploader kept a reference.
          batch = new ArrayList<ArchivesVO>();
        }
      } else if (child.isDirectory()) {
        logger.info(child.getName());
        show(child, bucket, util);
      }
    }

    if (!batch.isEmpty()) {
      if (!updateJsonData(batch, bucket, util)) {
        logger.info(file.getPath() + ":更新失败2");
      }
      count += batch.size();
    }
  }

  /**
   * Builds an {@code ArchivesVO} from one file, or returns {@code null} when the file is
   * skipped.
   *
   * <p>A file is skipped when the text after the last dot of its name has no entry in
   * {@code MAPS}, when {@code getErrorCode} rejects the text decoded with every encoding in
   * {@code charset}, or when the decoded body text is empty.
   *
   * @param source regular file to parse
   * @param verbose whether to log the path, rejected encodings, title, cleaned text and the
   *     extracted fields (used by {@code show} for a single file, not for directory scans)
   * @return the populated record, or {@code null} if the file was skipped
   * @throws Exception propagated from Jsoup parsing or from the extraction helpers
   */
  private static ArchivesVO parseArchiveFile(File source, boolean verbose) throws Exception {
    String fileName = source.getName();
    // Cut at the last dot so that "a.b.html" is recognised as "html"; the uuid below is cut
    // at the same position.
    int dot = fileName.lastIndexOf('.');
    if (dot < 0 || null == MAPS.get(fileName.substring(dot + 1))) {
      return null;
    }
    if (verbose) {
      logger.info("网址:" + source.getPath());
    }

    ArchivesVO arch = new ArchivesVO();
    // Stays null unless one encoding is accepted, so a failure on one file can never leak
    // into the next one and the number of configured encodings does not matter.
    String text = null;
    for (String encoding : charset) {
      Document doc = Jsoup.parse(source, encoding);
      String bodyText = doc.body().text();
      // getErrorCode returning false is treated as "decoded with the wrong encoding".
      if (!getErrorCode(bodyText)) {
        if (verbose) {
          logger.info(encoding + "编码错误!!!");
        }
        continue;
      }
      text = bodyText;
      String title = getReplaceAll(doc.title());
      if (title != null && !"".equals(title)) { // leave the title unset when <title> is empty
        arch.setTitle(title.trim());
      }
      if (verbose) {
        logger.info("标题:" + arch.getTitle());
      }
      break;
    }
    if (text == null || "".equals(text)) {
      logger.info("内容为空的HTML页面:" + source.getPath());
      return null;
    }

    text = getReplaceAll(text).trim();
    if (verbose) {
      logger.info("所有内容:" + text);
    }

    Map<String, List<String>> people = ExtractthepeopleText.getPersonName(text);
    arch.setPlaintiff(getKeyName(people, 1));
    arch.setDefendant(getKeyName(people, 2));

    // Court name: fall back to the secondary extractor when the primary one finds nothing.
    String court = getCourtName(text);
    if (court != null && !"".equals(court)) {
      arch.setCourtName(court);
    } else {
      arch.setCourtName(getAtherthe(text));
    }

    arch.setCaseCause(StringCause(text));
    arch.setApprovalDate(getConcludeDate(text));
    arch.setApproval(getTheVerdictData(text));
    arch.setCatalog(getCatalog(text));

    // Case number: same primary/fallback scheme as the court name.
    String caseNum = getCaseNum(text);
    if (caseNum != null && !"".equals(caseNum)) {
      arch.setCaseNum(caseNum);
    } else {
      arch.setCaseNum(getSentenceNo3(text));
    }

    if (verbose) {
      logger.info("<<------------------------------------------------------>>");
      logger.info("原告相关人:" + arch.getPlaintiff());
      logger.info("被告相关人:" + arch.getDefendant());
      logger.info("法院:" + arch.getCourtName());
      logger.info("案由:" + arch.getCaseCause());
      logger.info("审结日期:" + arch.getApprovalDate());
      logger.info("判决结果:" + arch.getApproval());
      logger.info("文书类型:" + arch.getCatalog());
      logger.info("案号:" + arch.getCaseNum());
      logger.info("<<------------------------------------------------------>>");
    }

    // The file name without its extension serves as the record id.
    arch.setUuid(fileName.substring(0, dot));
    return arch;
  }