/**
 * Tells whether letters make up more than half of the characters of the line's text.
 *
 * @param line the line whose text is examined
 * @return {@code true} when twice the number of characters accepted by
 *         {@link Character#isLetter(char)} is strictly greater than the text length;
 *         an empty text therefore yields {@code false}
 */
private boolean containsMostlyLetters(BxLine line) {
    final String text = line.toText();
    long letters = 0;
    for (int idx = 0; idx < text.length(); idx++) {
        if (Character.isLetter(text.charAt(idx))) {
            letters++;
        }
    }
    return 2 * letters > text.length();
}
/**
 * Decides whether the given line should be removed, by running a fixed sequence of
 * rejection checks; the first check that fires makes the method return {@code true}.
 *
 * <p>Checks, in order: missing most-popular font name; height z-score below
 * {@code candMinHeightZScore}; text that looks like an equation, a figure caption or a
 * table caption; text that is not mostly letters or contains no word; text starting with
 * a large number; a line whose height, font, distance-to-previous-line and indentation
 * z-scores are all below their {@code outl*} thresholds; and finally a line followed by
 * more than {@code maxHeaderLineCount} lines without reaching one whose text starts with
 * an upper-case ASCII letter.
 *
 * @param line                  the line being evaluated
 * @param heightPopulation      population used to score the line height
 * @param fontPopulation        population used to score the line's font index
 * @param distancePopulation    population used to score the vertical distance to the previous line
 * @param indentationPopulation population used to score the line's x position
 * @return {@code true} if the line should be removed, {@code false} otherwise
 */
private boolean shouldBeRemoved(
        BxLine line,
        Population heightPopulation,
        Population fontPopulation,
        Population distancePopulation,
        Population indentationPopulation) {
    if (line.getMostPopularFontName() == null
            || heightPopulation.getZScore(line.getHeight()) < candMinHeightZScore) {
        return true;
    }
    if (looksLikeEquation(line) || looksLikeFigure(line) || looksLikeTable(line)) {
        return true;
    }
    if (!(containsMostlyLetters(line) && containsWord(line)) || startsWithLargeNumber(line)) {
        return true;
    }

    // The line is dropped when none of its four measurements stands out. The distance
    // test is skipped (treated as satisfied) for a line that has no predecessor.
    boolean nothingStandsOut =
            heightPopulation.getZScore(line.getHeight()) < outlHeightZScore
            && Math.abs(fontPopulation.getZScore(getFontIndex(line))) < outlFontZScore
            && (!line.hasPrev()
                || distancePopulation.getZScore(line.getY() - line.getPrev().getY())
                    < outlDistanceZScore)
            && Math.abs(indentationPopulation.getZScore(line.getX())) < outlIndentZScore;
    if (nothingStandsOut) {
        return true;
    }

    // Walk forward through the following lines: reaching a line that starts with an
    // upper-case ASCII letter keeps the line; passing the limit first removes it.
    int skipped = 0;
    for (BxLine following = line; following.hasNext();) {
        following = following.getNext();
        if (following.toText().matches("[A-Z].*")) {
            return false;
        }
        if (skipped == maxHeaderLineCount) {
            return true;
        }
        skipped++;
    }
    return false;
}
/**
 * Computes and stores the clean header text and the clean content texts of every part
 * of the given content structure.
 *
 * <p>Header: the texts of the header lines are concatenated, each followed by a space;
 * a line ending with {@code '-'} contributes its text without that hyphen and without
 * the separating space, so it is glued to the next line. The trimmed result is passed
 * through {@code cleanLigatures} and stored with {@code setCleanHeaderText}.
 *
 * <p>Content: each content line gets a score (one point per satisfied criterion, see the
 * inline comments). When the score reaches {@code firstParagraphLineMinScore}, the text
 * accumulated so far is flushed as one entry and a new one is started. Lines are joined
 * with {@code '\n'}; for a line ending with {@code '-'} the hyphen is dropped and the last
 * space of the line (if any) is replaced by {@code '\n'}, so the split word ends up
 * joined with the start of the following line. Every flushed entry is trimmed and passed
 * through {@code cleanLigatures}; the list is stored with {@code setCleanContentTexts}.
 *
 * @param contentStructure the structure whose parts are updated in place
 */
public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
        List<BxLine> headerLines = contentPart.getHeaderLines();
        StringBuilder header = new StringBuilder();
        for (BxLine headerLine : headerLines) {
            String lineText = headerLine.toText();
            if (lineText.endsWith("-")) {
                // Drop the trailing hyphen and add no separator.
                header.append(lineText, 0, lineText.length() - 1);
            } else {
                header.append(lineText).append(' ');
            }
        }
        contentPart.setCleanHeaderText(cleanLigatures(header.toString().trim()));

        List<BxLine> contentLines = contentPart.getContentLines();
        List<String> contentTexts = new ArrayList<String>();

        // Widest content line; the indentation and line-length thresholds scale with it.
        double maxLen = Double.NEGATIVE_INFINITY;
        for (BxLine line : contentLines) {
            if (line.getWidth() > maxLen) {
                maxLen = line.getWidth();
            }
        }

        // A builder avoids re-copying the accumulated text for every appended line.
        StringBuilder current = new StringBuilder();
        for (BxLine line : contentLines) {
            int score = 0;
            BxLine prev = line.getPrev();
            BxLine next = line.getNext();

            // Criterion: the line starts with an upper-case ASCII letter.
            if (line.toText().matches("^[A-Z].*$")) {
                score++;
            }
            if (prev != null) {
                // Criterion: the line is indented relative to the previous line, by more
                // than minParagraphIndent but less than the scaled maximum.
                if (line.getX() > prev.getX()
                        && line.getX() - prev.getX() < paragraphLineIndentMultiplier * maxLen
                        && line.getX() - prev.getX() > minParagraphIndent) {
                    score++;
                }
                // Criterion: the previous line is short compared to the widest line.
                if (prev.getWidth() < lastParagraphLineLengthMult * maxLen) {
                    score++;
                }
                // Criterion: the previous line ends with a full stop.
                if (prev.toText().endsWith(".")) {
                    score++;
                }
            }
            // Criterion: the line is indented relative to the next line, within the same bounds.
            if (next != null
                    && line.getX() > next.getX()
                    && line.getX() - next.getX() < paragraphLineIndentMultiplier * maxLen
                    && line.getX() - next.getX() > minParagraphIndent) {
                score++;
            }

            if (score >= firstParagraphLineMinScore) {
                if (current.length() > 0) {
                    contentTexts.add(cleanLigatures(current.toString().trim()));
                }
                current.setLength(0);
            }

            String lineText = line.toText();
            if (lineText.endsWith("-")) {
                String stem = lineText.substring(0, lineText.length() - 1);
                int lastSpace = stem.lastIndexOf(' ');
                if (lastSpace < 0) {
                    current.append(stem);
                } else {
                    // Move the broken word onto its own line so that it is directly
                    // followed by the continuation at the start of the next line.
                    current.append(stem, 0, lastSpace)
                            .append('\n')
                            .append(stem, lastSpace + 1, stem.length());
                }
            } else {
                current.append(lineText).append('\n');
            }
        }
        if (current.length() > 0) {
            contentTexts.add(cleanLigatures(current.toString().trim()));
        }
        contentPart.setCleanContentTexts(contentTexts);
    }
}
/**
 * Tells whether the line's text begins with at least two ASCII digits.
 *
 * @param line the line whose text is examined
 * @return {@code true} if the whole text matches the pattern "two digits followed by anything"
 */
private boolean startsWithLargeNumber(BxLine line) {
    final String text = line.toText();
    final boolean twoLeadingDigits = text.matches("[0-9][0-9].*");
    return twoLeadingDigits;
}
/**
 * Tells whether the line's text contains a run of at least four consecutive ASCII
 * letters, ignoring case.
 *
 * @param line the line whose text is examined
 * @return {@code true} if the lower-cased text contains four consecutive characters in {@code a-z}
 */
private boolean containsWord(BxLine line) {
    // Locale.ROOT makes the case mapping independent of the JVM's default locale.
    // With a Turkish or Azeri default locale 'I' lower-cases to dotless 'ı', which is
    // outside [a-z], so upper-case words containing 'I' would not be recognised.
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches(".*[a-z][a-z][a-z][a-z].*");
}
/**
 * Tells whether the line's text, lower-cased, starts with the word "table" followed by a space.
 *
 * @param line the line whose text is examined
 * @return {@code true} if the lower-cased text matches {@code "table .*"}
 */
private boolean looksLikeTable(BxLine line) {
    final String lowered = line.toText().toLowerCase();
    final boolean tableCaption = lowered.matches("table .*");
    return tableCaption;
}
/**
 * Tells whether the line's text, ignoring case, starts with "fig ", "fig. " or "figure ".
 *
 * @param line the line whose text is examined
 * @return {@code true} if the lower-cased text matches {@code "fig\\.? .*"} or {@code "figure .*"}
 */
private boolean looksLikeFigure(BxLine line) {
    // Locale.ROOT makes the case mapping independent of the JVM's default locale.
    // With a Turkish or Azeri default locale "FIGURE" lower-cases to "fıgure"
    // (dotless i) and neither pattern would match.
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches("fig\\.? .*") || lowered.matches("figure .*");
}
/**
 * Tells whether the line's text contains an equals sign.
 *
 * @param line the line whose text is examined
 * @return {@code true} if the text contains {@code '='} anywhere
 */
private boolean looksLikeEquation(BxLine line) {
    final String text = line.toText();
    return text.indexOf('=') >= 0;
}
/**
 * Tells whether the line's text has one of three accepted beginnings: an upper-case
 * ASCII letter; a digit 1-9 with an ASCII letter somewhere after it; or a letter
 * {@code a}-{@code h} followed by {@code ')'} with an ASCII letter somewhere after it.
 *
 * @param line the line whose text is examined
 * @return {@code true} if the text matches any of the three patterns
 */
private boolean looksLikeHeader(BxLine line) {
    final String text = line.toText();
    if (text.matches("^[A-Z].*")) {
        return true;
    }
    if (text.matches("^[1-9].*[a-zA-Z].*")) {
        return true;
    }
    return text.matches("^[a-h]\\).*[a-zA-Z].*");
}
/**
 * Returns 1 when the line's text has the form: one lower-case ASCII letter, a closing
 * parenthesis, a space, then an upper-case ASCII letter followed by anything; 0 otherwise.
 *
 * @param line the line whose text is examined
 * @param page not used by this method
 * @return {@code 1.0} if the text matches, {@code 0.0} otherwise
 */
@Override
public double calculateFeatureValue(BxLine line, BxPage page) {
    final boolean letteredItem = line.toText().matches("^[a-z]\\) [A-Z].*$");
    if (letteredItem) {
        return 1.0;
    }
    return 0.0;
}