/**
 * Recursively converts a flat list of content parts into a nested
 * {@link DocumentContentStructure}.
 *
 * <p>When {@code level > 0}, the first part supplies the header and the paragraphs of the
 * returned structure, and only the parts after it are turned into children. When
 * {@code level == 0}, no header is set and every part is turned into a child.
 *
 * <p>Children are formed by cutting the remaining parts into consecutive runs: a new run
 * starts at every part whose level id equals the level id of the first remaining part.
 * Each run is converted by a recursive call with {@code level + 1}.
 *
 * <p>The list passed in is never modified; the head element is skipped through a
 * {@code subList} view instead of being removed.
 *
 * @param contentParts parts to convert
 * @param level depth of the structure being built; 0 produces a structure without header
 * @return the converted structure; an empty structure if {@code contentParts} is empty
 */
private DocumentContentStructure convertContentParts(
        List<BxDocContentPart> contentParts, int level) {
    DocumentContentStructure dcs = new DocumentContentStructure();
    if (contentParts.isEmpty()) {
        return dcs;
    }

    // Parts still to be distributed among the children of dcs.
    List<BxDocContentPart> remainingParts = contentParts;
    if (level > 0) {
        BxDocContentPart headPart = contentParts.get(0);
        dcs.setHeader(new DocumentHeader(level, headPart.getCleanHeaderText(), dcs));
        for (String contentText : headPart.getCleanContentTexts()) {
            dcs.addParagraph(new DocumentParagraph(contentText, dcs));
        }
        // Skip the head through a view rather than remove(0), so the caller's list
        // stays intact and unmodifiable lists are accepted.
        remainingParts = contentParts.subList(1, contentParts.size());
        if (remainingParts.isEmpty()) {
            return dcs;
        }
    }

    // Every part carrying this level id opens a new child section.
    int topClusterNum = remainingParts.get(0).getLevelId();
    List<BxDocContentPart> sectionContentParts = new ArrayList<BxDocContentPart>();
    for (BxDocContentPart contentPart : remainingParts) {
        if (contentPart.getLevelId() == topClusterNum && !sectionContentParts.isEmpty()) {
            dcs.addPart(convertContentParts(sectionContentParts, level + 1));
            sectionContentParts.clear();
        }
        sectionContentParts.add(contentPart);
    }
    // Flush the trailing section, which no later boundary part closed.
    if (!sectionContentParts.isEmpty()) {
        dcs.addPart(convertContentParts(sectionContentParts, level + 1));
    }
    return dcs;
}
/**
 * Computes and stores the cleaned header text and the cleaned paragraph texts for every
 * part of the given content structure.
 *
 * <p>For each part, the header lines are joined into one string and passed to
 * {@code setCleanHeaderText}; the content lines are grouped into paragraphs and passed
 * to {@code setCleanContentTexts}.
 *
 * @param contentStructure structure whose parts are updated in place
 */
public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
        contentPart.setCleanHeaderText(buildCleanHeaderText(contentPart.getHeaderLines()));
        contentPart.setCleanContentTexts(buildCleanContentTexts(contentPart.getContentLines()));
    }
}

/**
 * Joins header lines into a single trimmed string with ligatures cleaned.
 *
 * <p>A line ending with {@code "-"} is appended without the hyphen and without a
 * separator, so the following line continues the broken word. Any other line is
 * followed by a single space.
 */
private String buildCleanHeaderText(List<BxLine> headerLines) {
    StringBuilder header = new StringBuilder();
    for (BxLine headerLine : headerLines) {
        String lineText = headerLine.toText();
        if (lineText.endsWith("-")) {
            // Drop only the trailing hyphen; the rest of the line is kept verbatim.
            header.append(lineText, 0, lineText.length() - 1);
        } else {
            header.append(lineText).append(' ');
        }
    }
    return cleanLigatures(header.toString().trim());
}

/**
 * Groups content lines into paragraph texts.
 *
 * <p>Each line gets a score: one point if its text matches {@code ^[A-Z].*$}, plus the
 * points from {@link #neighbourScore}. A line scoring at least
 * {@code firstParagraphLineMinScore} closes the paragraph collected so far and starts a
 * new one. Lines within a paragraph are separated by {@code '\n'}. For a line ending
 * with {@code "-"}, the hyphen is removed and the last space of the line is replaced by
 * {@code '\n'}, so the broken word is moved to, and joined with, the next line.
 *
 * @return the trimmed, ligature-cleaned paragraph texts in order of appearance
 */
private List<String> buildCleanContentTexts(List<BxLine> contentLines) {
    // Widest line of this part; the reference length for all relative thresholds.
    double maxLen = Double.NEGATIVE_INFINITY;
    for (BxLine line : contentLines) {
        if (line.getWidth() > maxLen) {
            maxLen = line.getWidth();
        }
    }

    // Compiled once per call rather than once per line.
    java.util.regex.Pattern capitalStart = java.util.regex.Pattern.compile("^[A-Z].*$");

    List<String> contentTexts = new ArrayList<String>();
    StringBuilder paragraph = new StringBuilder();
    for (BxLine line : contentLines) {
        String lineText = line.toText();

        int score = neighbourScore(line, maxLen);
        if (capitalStart.matcher(lineText).matches()) {
            score++;
        }
        if (score >= firstParagraphLineMinScore) {
            // This line opens a new paragraph: flush what has been collected so far.
            if (paragraph.length() > 0) {
                contentTexts.add(cleanLigatures(paragraph.toString().trim()));
            }
            paragraph.setLength(0);
        }

        if (lineText.endsWith("-")) {
            String stem = lineText.substring(0, lineText.length() - 1);
            int lastSpace = stem.lastIndexOf(' ');
            if (lastSpace < 0) {
                paragraph.append(stem);
            } else {
                // Break before the hyphenated word so its first half sits directly in
                // front of the next line's text.
                paragraph.append(stem, 0, lastSpace)
                        .append('\n')
                        .append(stem, lastSpace + 1, stem.length());
            }
        } else {
            paragraph.append(lineText).append('\n');
        }
    }
    if (paragraph.length() > 0) {
        contentTexts.add(cleanLigatures(paragraph.toString().trim()));
    }
    return contentTexts;
}

/**
 * Scores how strongly the neighbours of {@code line} suggest that it starts a paragraph.
 *
 * <p>One point each is given when: the line is indented relative to the previous line;
 * the previous line is narrower than {@code lastParagraphLineLengthMult * maxLen}; the
 * previous line's text ends with {@code "."}; the line is indented relative to the next
 * line. Checks against a missing neighbour are skipped.
 */
private int neighbourScore(BxLine line, double maxLen) {
    int score = 0;
    BxLine prev = line.getPrev();
    if (prev != null) {
        if (isIndentedPast(line, prev, maxLen)) {
            score++;
        }
        if (prev.getWidth() < lastParagraphLineLengthMult * maxLen) {
            score++;
        }
        if (prev.toText().endsWith(".")) {
            score++;
        }
    }
    BxLine next = line.getNext();
    if (next != null && isIndentedPast(line, next, maxLen)) {
        score++;
    }
    return score;
}

/**
 * Tells whether {@code line} starts to the right of {@code other} by more than
 * {@code minParagraphIndent} and by less than
 * {@code paragraphLineIndentMultiplier * maxLen}.
 */
private boolean isIndentedPast(BxLine line, BxLine other, double maxLen) {
    return line.getX() > other.getX()
            && line.getX() - other.getX() < paragraphLineIndentMultiplier * maxLen
            && line.getX() - other.getX() > minParagraphIndent;
}