コード例 #1
0
  /**
   * Converts a flat list of content parts into a nested {@link DocumentContentStructure}.
   *
   * <p>When {@code level > 0} the first part provides the header text and the paragraphs of the
   * returned node, and is then removed from {@code contentParts} (the argument list is modified in
   * place). The remaining parts are cut into runs: a new run begins at every part whose level id
   * equals the level id of the first remaining part. Each run is converted recursively with
   * {@code level + 1} and added to the returned node as a sub-part.
   *
   * @param contentParts parts to convert, in document order; must support {@code remove(0)} when
   *     it is non-empty and {@code level > 0}
   * @param level depth of the node being built; no header or paragraphs are set for level 0
   * @return the structure built from the parts; nothing is added to it when the list is empty
   */
  private DocumentContentStructure convertContentParts(
      List<BxDocContentPart> contentParts, int level) {
    final DocumentContentStructure result = new DocumentContentStructure();
    if (contentParts.isEmpty()) {
      return result;
    }

    if (level > 0) {
      // Below the root, the leading part describes this node itself.
      final BxDocContentPart ownPart = contentParts.get(0);
      result.setHeader(new DocumentHeader(level, ownPart.getCleanHeaderText(), result));
      for (String paragraphText : ownPart.getCleanContentTexts()) {
        result.addParagraph(new DocumentParagraph(paragraphText, result));
      }

      // Consume the leading part; whatever is left forms the children of this node.
      contentParts.remove(0);
      if (contentParts.isEmpty()) {
        return result;
      }
    }

    final int childLevel = level + 1;
    // Every part carrying this level id opens a new child section.
    final int boundaryLevelId = contentParts.get(0).getLevelId();
    final List<BxDocContentPart> run = new ArrayList<BxDocContentPart>();

    for (BxDocContentPart part : contentParts) {
      final boolean opensNextRun = part.getLevelId() == boundaryLevelId && !run.isEmpty();
      if (opensNextRun) {
        // Close the run collected so far before starting the next one.
        result.addPart(convertContentParts(run, childLevel));
        run.clear();
      }
      run.add(part);
    }

    // Convert the trailing run, which no later boundary part has closed.
    if (!run.isEmpty()) {
      result.addPart(convertContentParts(run, childLevel));
    }

    return result;
  }
コード例 #2
0
ファイル: ContentCleaner.java プロジェクト: matfed/CERMINE
  /**
   * Computes and stores the cleaned header text and the cleaned paragraph texts of every part of
   * the given content structure.
   *
   * <p>For each part, the header lines are joined into one string and stored through
   * {@code setCleanHeaderText}; the content lines are grouped into paragraphs and stored through
   * {@code setCleanContentTexts}. All produced texts are trimmed and passed through
   * {@code cleanLigatures}.
   *
   * @param contentStructure structure whose parts are updated in place
   */
  public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
      contentPart.setCleanHeaderText(joinHeaderLines(contentPart.getHeaderLines()));
      contentPart.setCleanContentTexts(splitIntoParagraphs(contentPart.getContentLines()));
    }
  }

  /** Joins header lines into one cleaned string, merging words hyphenated across line breaks. */
  private String joinHeaderLines(List<BxLine> headerLines) {
    StringBuilder header = new StringBuilder();
    for (BxLine headerLine : headerLines) {
      String lineText = headerLine.toText();
      if (lineText.endsWith("-")) {
        // Drop the hyphen and add no separator, so the next line continues the broken word.
        header.append(lineText, 0, lineText.length() - 1);
      } else {
        header.append(lineText).append(' ');
      }
    }
    return cleanLigatures(header.toString().trim());
  }

  /** Groups content lines into cleaned paragraph texts, starting a paragraph at high-score lines. */
  private List<String> splitIntoParagraphs(List<BxLine> contentLines) {
    List<String> paragraphs = new ArrayList<String>();

    // Width of the widest line; the indent and short-line thresholds are scaled by it.
    double maxLen = Double.NEGATIVE_INFINITY;
    for (BxLine line : contentLines) {
      if (line.getWidth() > maxLen) {
        maxLen = line.getWidth();
      }
    }

    StringBuilder paragraph = new StringBuilder();
    for (BxLine line : contentLines) {
      String lineText = line.toText();

      // A line that looks like a paragraph opening closes the paragraph collected so far.
      if (firstLineScore(line, lineText, maxLen) >= firstParagraphLineMinScore
          && paragraph.length() > 0) {
        paragraphs.add(cleanLigatures(paragraph.toString().trim()));
        paragraph.setLength(0);
      }

      if (lineText.endsWith("-")) {
        String stripped = lineText.substring(0, lineText.length() - 1);
        int lastSpace = stripped.lastIndexOf(' ');
        if (lastSpace < 0) {
          // The whole line is one word fragment: glue it directly to the next line.
          paragraph.append(stripped);
        } else {
          // Move the hyphenated word fragment onto its own line so it joins the next line's start.
          paragraph
              .append(stripped, 0, lastSpace)
              .append('\n')
              .append(stripped, lastSpace + 1, stripped.length());
        }
      } else {
        paragraph.append(lineText).append('\n');
      }
    }
    if (paragraph.length() > 0) {
      paragraphs.add(cleanLigatures(paragraph.toString().trim()));
    }

    return paragraphs;
  }

  /**
   * Scores how strongly a line looks like the first line of a paragraph. One point is given for
   * each of: the text starts with A-Z; the line is indented relative to the previous line within
   * the configured bounds; the previous line is short compared to {@code maxLen}; the previous
   * line ends with a period; the line is indented relative to the next line within the bounds.
   */
  private int firstLineScore(BxLine line, String lineText, double maxLen) {
    int score = 0;
    BxLine prev = line.getPrev();
    BxLine next = line.getNext();

    if (lineText.matches("^[A-Z].*$")) {
      score++;
    }
    if (prev != null) {
      if (line.getX() > prev.getX()
          && line.getX() - prev.getX() < paragraphLineIndentMultiplier * maxLen
          && line.getX() - prev.getX() > minParagraphIndent) {
        score++;
      }
      if (prev.getWidth() < lastParagraphLineLengthMult * maxLen) {
        score++;
      }
      if (prev.toText().endsWith(".")) {
        score++;
      }
    }
    if (next != null
        && line.getX() > next.getX()
        && line.getX() - next.getX() < paragraphLineIndentMultiplier * maxLen
        && line.getX() - next.getX() > minParagraphIndent) {
      score++;
    }
    return score;
  }