/**
 * Decides whether a candidate line should be dropped.
 *
 * <p>The checks run in a fixed order and stop at the first one that rejects the line:
 * <ol>
 *   <li>the line has no most-popular font name;</li>
 *   <li>the z-score of its height is below {@code candMinHeightZScore};</li>
 *   <li>it looks like an equation, a figure or a table;</li>
 *   <li>it does not contain mostly letters, contains no word, or starts with a large number;</li>
 *   <li>none of its height, font, distance to the previous line and indentation reaches the
 *       corresponding {@code outl*ZScore} threshold (a line without a previous line passes
 *       the distance test automatically);</li>
 *   <li>the first {@code maxHeaderLineCount + 1} lines following it all fail to start with an
 *       upper-case ASCII letter.</li>
 * </ol>
 *
 * @param line                  the line being examined
 * @param heightPopulation      population used to obtain the z-score of the line height
 * @param fontPopulation        population used to obtain the z-score of the line's font index
 * @param distancePopulation    population used to obtain the z-score of the vertical offset
 *                              from the previous line
 * @param indentationPopulation population used to obtain the z-score of the line's x position
 * @return {@code true} if any of the checks above rejects the line, {@code false} otherwise
 */
private boolean shouldBeRemoved(
        BxLine line,
        Population heightPopulation,
        Population fontPopulation,
        Population distancePopulation,
        Population indentationPopulation) {

    // Unconditional rejections; the short-circuit chain keeps the evaluation order intact.
    boolean rejectedOutright =
            line.getMostPopularFontName() == null
            || heightPopulation.getZScore(line.getHeight()) < candMinHeightZScore
            || looksLikeEquation(line)
            || looksLikeFigure(line)
            || looksLikeTable(line)
            || !containsMostlyLetters(line)
            || !containsWord(line)
            || startsWithLargeNumber(line);
    if (rejectedOutright) {
        return true;
    }

    // A line is rejected when it is below every outlier threshold at once.
    if (heightPopulation.getZScore(line.getHeight()) < outlHeightZScore) {
        if (Math.abs(fontPopulation.getZScore(getFontIndex(line))) < outlFontZScore) {
            boolean distanceBelowThreshold = !line.hasPrev()
                    || distancePopulation.getZScore(line.getY() - line.getPrev().getY())
                            < outlDistanceZScore;
            if (distanceBelowThreshold
                    && Math.abs(indentationPopulation.getZScore(line.getX())) < outlIndentZScore) {
                return true;
            }
        }
    }

    // Look ahead: stop at the first following line that starts with an upper-case letter,
    // and reject once too many following lines have been seen without finding one.
    int examined = 0;
    for (BxLine following = line; following.hasNext(); ) {
        following = following.getNext();
        if (following.toText().matches("[A-Z].*")) {
            return false;
        }
        if (examined == maxHeaderLineCount) {
            return true;
        }
        examined++;
    }
    return false;
}
/**
 * Produces the cleaned header text and the cleaned paragraph texts for every part of the
 * given content structure and stores them on the part.
 *
 * <p><b>Header.</b> The texts of the header lines are concatenated. A line ending with
 * {@code "-"} contributes its text without that trailing hyphen and without a separator, so
 * the next line continues the word; any other line is followed by a single space. The result
 * is trimmed, passed through {@code cleanLigatures} and set as the clean header text.
 *
 * <p><b>Content.</b> Content lines are grouped into paragraphs. Each line gets one point for
 * each of the following that holds:
 * <ul>
 *   <li>its text starts with an upper-case ASCII letter;</li>
 *   <li>it is indented to the right of the previous line by more than
 *       {@code minParagraphIndent} and less than
 *       {@code paragraphLineIndentMultiplier * maxLen};</li>
 *   <li>the previous line is narrower than {@code lastParagraphLineLengthMult * maxLen};</li>
 *   <li>the previous line's text ends with {@code "."};</li>
 *   <li>it is indented to the right of the next line within the same bounds as above.</li>
 * </ul>
 * where {@code maxLen} is the greatest width among the part's content lines. A line scoring at
 * least {@code firstParagraphLineMinScore} starts a new paragraph. Within a paragraph, lines
 * are joined with {@code '\n'}; for a line ending with {@code "-"} the hyphen is dropped and
 * the line break is moved in front of the line's last word so that the word is contiguous
 * with the following line. Every non-empty paragraph is trimmed and passed through
 * {@code cleanLigatures}.
 *
 * @param contentStructure the structure whose parts are updated in place
 */
public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
        // ---- header -------------------------------------------------------------------
        List<BxLine> headerLines = contentPart.getHeaderLines();
        StringBuilder headerText = new StringBuilder();
        for (BxLine headerLine : headerLines) {
            String lineText = headerLine.toText();
            if (lineText.endsWith("-")) {
                // Drop the hyphen and add no separator: the next line continues the word.
                headerText.append(lineText, 0, lineText.length() - 1);
            } else {
                headerText.append(lineText).append(' ');
            }
        }
        contentPart.setCleanHeaderText(cleanLigatures(headerText.toString().trim()));

        // ---- content ------------------------------------------------------------------
        List<BxLine> contentLines = contentPart.getContentLines();
        List<String> contentTexts = new ArrayList<String>();

        double maxLen = Double.NEGATIVE_INFINITY;
        for (BxLine line : contentLines) {
            if (line.getWidth() > maxLen) {
                maxLen = line.getWidth();
            }
        }
        // Both thresholds depend only on maxLen, so compute them once per content part.
        double maxParagraphIndent = paragraphLineIndentMultiplier * maxLen;
        double shortLineWidth = lastParagraphLineLengthMult * maxLen;

        StringBuilder paragraph = new StringBuilder();
        for (BxLine line : contentLines) {
            BxLine prev = line.getPrev();
            BxLine next = line.getNext();
            String lineText = line.toText();
            double lineX = line.getX();

            int score = 0;
            if (lineText.matches("^[A-Z].*$")) {
                score++;
            }
            if (prev != null) {
                double prevX = prev.getX();
                if (lineX > prevX
                        && lineX - prevX < maxParagraphIndent
                        && lineX - prevX > minParagraphIndent) {
                    score++;
                }
                if (prev.getWidth() < shortLineWidth) {
                    score++;
                }
                if (prev.toText().endsWith(".")) {
                    score++;
                }
            }
            if (next != null) {
                double nextX = next.getX();
                if (lineX > nextX
                        && lineX - nextX < maxParagraphIndent
                        && lineX - nextX > minParagraphIndent) {
                    score++;
                }
            }

            if (score >= firstParagraphLineMinScore) {
                // This line opens a new paragraph: flush whatever was collected so far.
                if (paragraph.length() > 0) {
                    contentTexts.add(cleanLigatures(paragraph.toString().trim()));
                }
                paragraph.setLength(0);
            }

            if (lineText.endsWith("-")) {
                int end = lineText.length() - 1;
                int lastSpace = lineText.lastIndexOf(' ', end - 1);
                if (lastSpace < 0) {
                    paragraph.append(lineText, 0, end);
                } else {
                    // Move the line break in front of the broken word so the word stays
                    // contiguous with its continuation on the next line.
                    paragraph.append(lineText, 0, lastSpace)
                            .append('\n')
                            .append(lineText, lastSpace + 1, end);
                }
            } else {
                paragraph.append(lineText).append('\n');
            }
        }
        if (paragraph.length() > 0) {
            contentTexts.add(cleanLigatures(paragraph.toString().trim()));
        }
        contentPart.setCleanContentTexts(contentTexts);
    }
}