/**
 * Tells whether letters make up more than half of the characters of the line's text.
 *
 * @param line the line whose text is inspected
 * @return {@code true} if strictly more than half of the UTF-16 chars of
 *         {@code line.toText()} satisfy {@link Character#isLetter(char)};
 *         {@code false} otherwise, including for empty text
 */
private boolean containsMostlyLetters(BxLine line) {
    // Fetch the text once; it is needed both for the scan and for the length.
    String text = line.toText();
    int letterCount = 0;
    for (int i = 0; i < text.length(); i++) {
        if (Character.isLetter(text.charAt(i))) {
            letterCount++;
        }
    }
    // Widen to long so that doubling the count can never overflow.
    return 2L * letterCount > text.length();
}
/**
 * Decides whether the given line should be removed, by applying a fixed sequence
 * of rejection rules; the first rule that fires makes the method return {@code true}.
 *
 * @param line                  the line being evaluated
 * @param heightPopulation      population queried for the z-score of the line's height
 * @param fontPopulation        population queried for the z-score of the line's font index
 * @param distancePopulation    population queried for the z-score of the vertical offset
 *                              between the line and its predecessor
 * @param indentationPopulation population queried for the z-score of the line's x coordinate
 * @return {@code true} if any rejection rule matches, {@code false} otherwise
 */
private boolean shouldBeRemoved(
        BxLine line,
        Population heightPopulation,
        Population fontPopulation,
        Population distancePopulation,
        Population indentationPopulation) {
    // A line without a most popular font name is rejected outright.
    if (line.getMostPopularFontName() == null) {
        return true;
    }

    // Reject lines whose height z-score is below the candidate minimum.
    if (heightPopulation.getZScore(line.getHeight()) < candMinHeightZScore) {
        return true;
    }

    // Text-based rejections; short-circuit evaluation keeps the checks in their fixed order.
    boolean rejectedByContent = looksLikeEquation(line)
            || looksLikeFigure(line)
            || looksLikeTable(line)
            || !containsMostlyLetters(line)
            || !containsWord(line)
            || startsWithLargeNumber(line);
    if (rejectedByContent) {
        return true;
    }

    // Reject lines that stay under every outlier threshold (height, font, distance to the
    // previous line when there is one, and indentation). The && chain must remain
    // short-circuiting: getPrev() is only consulted after hasPrev() has been checked.
    boolean underAllOutlierThresholds =
            heightPopulation.getZScore(line.getHeight()) < outlHeightZScore
            && Math.abs(fontPopulation.getZScore(getFontIndex(line))) < outlFontZScore
            && (!line.hasPrev()
                || distancePopulation.getZScore(line.getY() - line.getPrev().getY())
                    < outlDistanceZScore)
            && Math.abs(indentationPopulation.getZScore(line.getX())) < outlIndentZScore;
    if (underAllOutlierThresholds) {
        return true;
    }

    // Walk the following lines: stop with "keep" as soon as one starts with an uppercase
    // ASCII letter; reject if maxHeaderLineCount non-matching lines have already been
    // passed when yet another non-matching line is encountered.
    BxLine cursor = line;
    int passedLines = 0;
    while (cursor.hasNext()) {
        cursor = cursor.getNext();
        if (cursor.toText().matches("[A-Z].*")) {
            return false;
        }
        if (passedLines == maxHeaderLineCount) {
            return true;
        }
        passedLines++;
    }
    return false;
}
/**
 * Checks whether the line's text begins with two consecutive ASCII digits.
 *
 * @param line the line whose text is tested
 * @return {@code true} if the whole text matches the pattern "two digits, then anything"
 */
private boolean startsWithLargeNumber(BxLine line) {
    String text = line.toText();
    return java.util.regex.Pattern.matches("[0-9][0-9].*", text);
}
/**
 * Checks whether the line's text contains a run of at least four consecutive
 * ASCII letters, ignoring case.
 *
 * @param line the line whose text is tested
 * @return {@code true} if the lower-cased text contains four adjacent characters
 *         in the range {@code a-z}
 */
private boolean containsWord(BxLine line) {
    // Lower-case with Locale.ROOT: under a Turkish or Azeri default locale an
    // upper-case 'I' would become dotless 'ı', which [a-z] does not match, so
    // all-caps words such as "TITLE" would wrongly fail this check.
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches(".*[a-z][a-z][a-z][a-z].*");
}
/**
 * Checks whether the line's text, once lower-cased, starts with the word
 * "table" followed by a space.
 *
 * @param line the line whose text is tested
 * @return {@code true} if the lower-cased text matches {@code "table .*"}
 */
private boolean looksLikeTable(BxLine line) {
    String lowered = line.toText().toLowerCase();
    return java.util.regex.Pattern.matches("table .*", lowered);
}
/**
 * Checks whether the line's text starts, ignoring case, with "fig " / "fig. "
 * or with "figure " .
 *
 * @param line the line whose text is tested
 * @return {@code true} if the lower-cased text matches {@code "fig\\.? .*"} or
 *         {@code "figure .*"}
 */
private boolean looksLikeFigure(BxLine line) {
    // Lower-case once, with Locale.ROOT: under a Turkish or Azeri default locale
    // "FIG." / "FIGURE" would become "fıg." / "fıgure" (dotless 'ı') and no longer
    // match either pattern.
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches("fig\\.? .*") || lowered.matches("figure .*");
}
/**
 * Checks whether the line's text contains an equals sign anywhere.
 *
 * @param line the line whose text is tested
 * @return {@code true} if the text contains the character {@code '='}
 */
private boolean looksLikeEquation(BxLine line) {
    String text = line.toText();
    return text.indexOf('=') >= 0;
}
/**
 * Checks whether the line's text has one of three accepted opening shapes:
 * an uppercase ASCII letter; a digit 1-9 followed somewhere by an ASCII letter;
 * or a letter a-h, a closing parenthesis, and somewhere after it an ASCII letter.
 *
 * @param line the line whose text is tested
 * @return {@code true} if the text matches at least one of the three patterns
 */
private boolean looksLikeHeader(BxLine line) {
    String content = line.toText();
    String[] acceptedShapes = {
        "^[A-Z].*",
        "^[1-9].*[a-zA-Z].*",
        "^[a-h]\\).*[a-zA-Z].*"
    };
    // Patterns are tried in order and the first match wins.
    for (String shape : acceptedShapes) {
        if (content.matches(shape)) {
            return true;
        }
    }
    return false;
}