/**
 * Tells whether more than half of the characters of the line's text are letters.
 *
 * @param line line whose text is inspected
 * @return {@code true} when letters strictly outnumber all other characters
 */
private boolean containsMostlyLetters(BxLine line) {
    String text = line.toText();
    int letters = 0;
    for (int idx = 0; idx < text.length(); idx++) {
        if (Character.isLetter(text.charAt(idx))) {
            letters++;
        }
    }
    // Same test as "2 * letters > length", written without the multiplication.
    return letters > text.length() - letters;
}
/**
 * Creates a {@code Line} element for the given line, fills it with the line's
 * properties and words, and attaches it to {@code parent}.
 *
 * @param doc    document used to create the new element
 * @param parent element that receives the new {@code Line} element as its last child
 * @param line   line being serialized
 * @param hints  passed through unchanged to {@code appendBounds} and {@code appendWord}
 */
private void appendLine(Document doc, Element parent, BxLine line, Object... hints) {
    Element lineElement = doc.createElement("Line");

    // Properties are written in a fixed order: id, corners, next id, char count.
    appendPropertyIfNotNull(doc, lineElement, "LineID", line.getId());
    appendBounds(doc, lineElement, "LineCorners", line.getBounds(), hints);
    appendPropertyIfNotNull(doc, lineElement, "LineNext", line.getNextId());
    // The character count property is always emitted with an empty value.
    appendProperty(doc, lineElement, "LineNumChars", "");

    for (BxWord lineWord : line.getWords()) {
        appendWord(doc, lineElement, lineWord, hints);
    }

    // The element is attached only after it has been fully populated.
    parent.appendChild(lineElement);
}
/**
 * Builds a {@link BxLine} from a line element.
 *
 * <p>Bounds are taken from the first {@code LineCorners} child when one exists;
 * every {@code Word} child is parsed and added to the line in order.
 *
 * @param lineE element describing the line
 * @return the newly created line
 */
private BxLine parseLineElement(Element lineE) {
    BxLine parsedLine = new BxLine();

    List<Element> cornerElements = getChildren("LineCorners", lineE);
    if (!cornerElements.isEmpty()) {
        parsedLine.setBounds(parseElementContainingVertexes(cornerElements.get(0)));
    }

    for (Element wordElement : getChildren("Word", lineE)) {
        BxWord parsedWord = parseWordElement(wordElement);
        parsedLine.addWord(parsedWord);
    }
    return parsedLine;
}
/**
 * Computes the horizontal distance between the right edge (x + width) of the
 * zone that {@code refs} reports for the line and the right edge of the line itself.
 *
 * @param refLine reference line being evaluated
 * @param refs    provides the zone of the line via {@code getZone}
 * @return zone right edge minus line right edge
 */
@Override
public double calculateFeatureValue(BxLine refLine, BxDocumentBibReferences refs) {
    double zoneX = refs.getZone(refLine).getBounds().getX();
    double zoneWidth = refs.getZone(refLine).getBounds().getWidth();
    double lineX = refLine.getBounds().getX();
    double lineWidth = refLine.getBounds().getWidth();
    // Evaluated left to right to keep the same floating-point result.
    return zoneX + zoneWidth - lineX - lineWidth;
}
/**
 * Computes the zone area that is not covered by its chunks: the zone's area
 * minus the summed area of every chunk of every word of every line in the zone.
 *
 * @param zone zone being evaluated
 * @param page not used by this computation
 * @return the remaining area, or {@code 0.0} when the difference is negative
 */
@Override
public double calculateFeatureValue(BxZone zone, BxPage page) {
    double occupiedArea = 0.0;
    for (BxLine zoneLine : zone.getLines()) {
        for (BxWord lineWord : zoneLine.getWords()) {
            for (BxChunk wordChunk : lineWord.getChunks()) {
                occupiedArea += wordChunk.getArea();
            }
        }
    }
    double freeArea = zone.getArea() - occupiedArea;
    return freeArea < 0 ? 0.0 : freeArea;
}
/**
 * Computes the width of the previous line relative to the mean width of all
 * lines in all zones of the page.
 *
 * @param line line whose predecessor is measured
 * @param page page supplying the lines used for the mean width
 * @return {@code 1.0} when the line has no predecessor; {@code 0} when the page
 *         has no lines or their summed width is zero; otherwise previous-line
 *         width divided by the mean line width
 */
@Override
public double calculateFeatureValue(BxLine line, BxPage page) {
    if (!line.hasPrev()) {
        return 1.0;
    }

    double totalWidth = 0;
    int lineCount = 0;
    for (BxZone pageZone : page.getZones()) {
        for (BxLine zoneLine : pageZone.getLines()) {
            lineCount++;
            totalWidth += zoneLine.getBounds().getWidth();
        }
    }

    if (lineCount == 0 || totalWidth == 0) {
        return 0;
    }

    double meanWidth = totalWidth / lineCount;
    return line.getPrev().getBounds().getWidth() / meanWidth;
}
/**
 * Selects header lines from the body zones of a document and groups the
 * remaining body lines under them.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Collect height, width, x-position, positive vertical gap and font-index
 *       statistics over all lines of {@code BODY_CONTENT} / {@code GEN_BODY} zones;
 *       lines that are first in their zone and look like a header become candidates.</li>
 *   <li>Drop candidates rejected by {@code shouldBeRemoved} or whose width z-score
 *       exceeds {@code candMaxLengthZScore}.</li>
 *   <li>Collect fonts shared by at least three remaining candidates whose font-index
 *       z-score magnitude exceeds {@code outlFontZScore}.</li>
 *   <li>Add every body line that looks like a header and uses one of those fonts,
 *       then filter again (this time against {@code candMaxLengthZScore2}).</li>
 *   <li>Drop candidates that have no similar candidate or more than
 *       {@code maxSimilarLinesCount} similar ones, together with the lines similar to them.</li>
 *   <li>Cluster the candidates in document order and keep only those falling into the
 *       first three distinct clusters encountered.</li>
 *   <li>Build the content structure: each kept candidate starts a header; every other
 *       body line is attached to the most recent header (a placeholder {@code "--"}
 *       header is created if content precedes the first header).</li>
 * </ol>
 *
 * @param document document to analyse
 * @return content structure after {@code headerLinesCompletener.completeLines} has processed it
 * @throws AnalysisException declared by the overridden method
 */
@Override
public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {
    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    Set<BxLine> candidates = new HashSet<BxLine>();

    // Step 1: statistics over body lines plus the initial candidates.
    for (BxPage page : document) {
        for (BxZone zone : page) {
            BxZoneLabel label = zone.getLabel();
            if (!label.equals(BxZoneLabel.BODY_CONTENT) && !label.equals(BxZoneLabel.GEN_BODY)) {
                continue;
            }
            for (BxLine line : zone) {
                heightPopulation.addObservation(line.getHeight());
                lengthPopulation.addObservation(line.getWidth());
                indentationPopulation.addObservation(line.getX());
                if (line.hasPrev()) {
                    double gap = line.getY() - line.getPrev().getY();
                    // Only strictly positive gaps are recorded.
                    if (gap > 0) {
                        distancePopulation.addObservation(gap);
                    }
                }
                fontPopulation.addObservation(getFontIndex(line));
                if (isFirstInZone(line) && looksLikeHeader(line)) {
                    candidates.add(line);
                }
            }
        }
    }

    // Step 2: first filtering round.
    Set<BxLine> rejected = new HashSet<BxLine>();
    for (BxLine candidate : candidates) {
        boolean failsHeuristics = shouldBeRemoved(
                candidate, heightPopulation, fontPopulation, distancePopulation, indentationPopulation);
        boolean tooLong = lengthPopulation.getZScore(candidate.getWidth()) > candMaxLengthZScore;
        if (failsHeuristics || tooLong) {
            rejected.add(candidate);
        }
    }
    candidates.removeAll(rejected);
    rejected.clear();

    // Step 3: fonts shared by some triple of candidates with an outlying font index.
    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);
    int candidateCount = candidatesList.size();
    for (int i = 0; i < candidateCount; i++) {
        BxLine first = candidatesList.get(i);
        for (int j = i + 1; j < candidateCount; j++) {
            BxLine second = candidatesList.get(j);
            for (int k = j + 1; k < candidateCount; k++) {
                BxLine third = candidatesList.get(k);
                boolean sameFont =
                        first.getMostPopularFontName().equals(second.getMostPopularFontName())
                        && third.getMostPopularFontName().equals(second.getMostPopularFontName());
                if (sameFont
                        && Math.abs(fontPopulation.getZScore(getFontIndex(first))) > outlFontZScore) {
                    headerFonts.add(first.getMostPopularFontName());
                }
            }
        }
    }

    // Step 4a: extend the candidates with every body line written in a header font.
    for (BxPage page : document) {
        for (BxZone zone : page) {
            BxZoneLabel label = zone.getLabel();
            if (!label.equals(BxZoneLabel.BODY_CONTENT) && !label.equals(BxZoneLabel.GEN_BODY)) {
                continue;
            }
            for (BxLine line : zone) {
                if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
                    candidates.add(line);
                }
            }
        }
    }

    // Step 4b: second filtering round, using the second length threshold.
    for (BxLine candidate : candidates) {
        boolean failsHeuristics = shouldBeRemoved(
                candidate, heightPopulation, fontPopulation, distancePopulation, indentationPopulation);
        boolean tooLong = lengthPopulation.getZScore(candidate.getWidth()) > candMaxLengthZScore2;
        if (failsHeuristics || tooLong) {
            rejected.add(candidate);
        }
    }
    candidates.removeAll(rejected);
    rejected.clear();

    // Step 5: reject candidates with no similar peer or with too many of them.
    for (BxLine candidate : candidates) {
        int similarCount = 0;
        for (BxLine other : candidates) {
            if (candidate.equals(other)) {
                continue;
            }
            if (areSimilar(candidate, other)) {
                similarCount++;
            }
        }
        if (similarCount == 0 || similarCount > maxSimilarLinesCount) {
            rejected.add(candidate);
            for (BxLine other : candidates) {
                if (areSimilar(candidate, other)) {
                    rejected.add(other);
                }
            }
        }
    }
    candidates.removeAll(rejected);

    // Step 6: order the candidates as they occur in the document (all zones), then cluster.
    List<BxLine> orderedCandidates = new ArrayList<BxLine>();
    for (BxPage page : document) {
        for (BxZone zone : page) {
            for (BxLine line : zone) {
                if (candidates.contains(line)) {
                    orderedCandidates.add(line);
                }
            }
        }
    }

    int[] clusters = headersClusterizer.clusterLines(orderedCandidates);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int idx = 0; idx < clusters.length; idx++) {
        int cluster = clusters[idx];
        // Only the first three distinct cluster ids are retained.
        if (keptClusters.size() < 3) {
            keptClusters.add(cluster);
        }
        if (!keptClusters.contains(cluster)) {
            candidates.remove(orderedCandidates.get(idx));
        }
    }

    // Step 7: assemble headers and their content lines.
    BxContentStructure contentStructure = new BxContentStructure();
    BxLine currentHeader = null;
    for (BxPage page : document) {
        for (BxZone zone : page) {
            BxZoneLabel label = zone.getLabel();
            if (!label.equals(BxZoneLabel.BODY_CONTENT) && !label.equals(BxZoneLabel.GEN_BODY)) {
                continue;
            }
            for (BxLine line : zone) {
                if (candidates.contains(line)) {
                    contentStructure.addFirstHeaderLine(page, line);
                    currentHeader = line;
                    continue;
                }
                if (currentHeader == null) {
                    // Content before any header is attached to a synthetic "--" header.
                    BxChunk chunk = new BxChunk(new BxBounds(), "--");
                    BxWord word = new BxWord().addChunk(chunk);
                    currentHeader = new BxLine().addWord(word);
                    contentStructure.addFirstHeaderLine(page, currentHeader);
                }
                contentStructure.addContentLine(currentHeader, line);
            }
        }
    }

    headerLinesCompletener.completeLines(contentStructure);
    return contentStructure;
}
/**
 * Tells whether two lines share the same most popular font name and differ in
 * height by less than {@code maxHeightSimilarity}.
 *
 * @param line1 first line
 * @param line2 second line
 * @return {@code true} when both conditions hold
 */
private boolean areSimilar(BxLine line1, BxLine line2) {
    if (!line1.getMostPopularFontName().equals(line2.getMostPopularFontName())) {
        return false;
    }
    double heightDifference = Math.abs(line1.getHeight() - line2.getHeight());
    return heightDifference < maxHeightSimilarity;
}
/**
 * Decides whether a header candidate should be discarded.
 *
 * <p>A candidate is discarded when any of the following holds, checked in this order:
 * it has no most popular font name; its height z-score is below
 * {@code candMinHeightZScore}; it looks like an equation, figure or table caption;
 * it does not consist mostly of letters; it contains no word; it starts with a
 * large number; none of its height, font, vertical distance and indentation
 * z-scores is an outlier; or more than {@code maxHeaderLineCount} following lines
 * are passed before one starting with {@code A-Z} is found.
 *
 * @param line                  candidate line
 * @param heightPopulation      line height statistics
 * @param fontPopulation        font index statistics
 * @param distancePopulation    statistics of vertical distance to the previous line
 * @param indentationPopulation line x-position statistics
 * @return {@code true} when the candidate should be removed
 */
private boolean shouldBeRemoved(
        BxLine line,
        Population heightPopulation,
        Population fontPopulation,
        Population distancePopulation,
        Population indentationPopulation) {
    if (line.getMostPopularFontName() == null) {
        return true;
    }

    double heightZScore = heightPopulation.getZScore(line.getHeight());
    if (heightZScore < candMinHeightZScore) {
        return true;
    }

    if (looksLikeEquation(line) || looksLikeFigure(line) || looksLikeTable(line)) {
        return true;
    }
    if (!containsMostlyLetters(line) || !containsWord(line) || startsWithLargeNumber(line)) {
        return true;
    }

    // The line is ordinary in every measured respect: nothing marks it as a header.
    // Nested checks keep the original evaluation order and short-circuiting.
    if (heightZScore < outlHeightZScore
            && Math.abs(fontPopulation.getZScore(getFontIndex(line))) < outlFontZScore) {
        boolean distanceOrdinary = !line.hasPrev()
                || distancePopulation.getZScore(line.getY() - line.getPrev().getY())
                        < outlDistanceZScore;
        if (distanceOrdinary
                && Math.abs(indentationPopulation.getZScore(line.getX())) < outlIndentZScore) {
            return true;
        }
    }

    // Scan the following lines until one starts with an uppercase ASCII letter.
    int passed = 0;
    for (BxLine following = line; following.hasNext();) {
        following = following.getNext();
        if (following.toText().matches("[A-Z].*")) {
            return false;
        }
        if (passed == maxHeaderLineCount) {
            return true;
        }
        passed++;
    }
    return false;
}
/**
 * Computes the x-coordinate of the line's bounds relative to the x-coordinate
 * of the page.
 *
 * @param object  line being evaluated
 * @param context page the offset is measured against
 * @return line bounds x minus page x
 */
@Override
public double calculateFeatureValue(BxLine object, BxPage context) {
    double lineX = object.getBounds().getX();
    double pageX = context.getX();
    return lineX - pageX;
}
/**
 * Tells whether the line's text begins with at least two decimal digits.
 *
 * @param line line whose text is inspected
 * @return {@code true} when the whole text matches {@code [0-9][0-9].*}
 */
private boolean startsWithLargeNumber(BxLine line) {
    String text = line.toText();
    return text.matches("[0-9][0-9].*");
}
/**
 * Tells whether the line's text, compared case-insensitively, matches
 * {@code table .*} (the word "table", a space, then anything).
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT} so the result does not
 * depend on the JVM's default locale.
 *
 * @param line line whose text is inspected
 * @return {@code true} when the lower-cased text matches the pattern
 */
private boolean looksLikeTable(BxLine line) {
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches("table .*");
}
/**
 * Produces cleaned header text and paragraph texts for every part of the
 * content structure.
 *
 * <p>Header lines are joined with single spaces; a line ending in {@code "-"}
 * loses the hyphen and is joined to the next line without a space. The result
 * is trimmed, passed through {@code cleanLigatures} and stored with
 * {@code setCleanHeaderText}.
 *
 * <p>Content lines are split into paragraphs. Each line gets a score (one point
 * per satisfied heuristic, see inline comments); when the score reaches
 * {@code firstParagraphLineMinScore} the line starts a new paragraph. Lines
 * inside a paragraph are separated by {@code "\n"}; for a line ending in
 * {@code "-"} the hyphen is dropped and the last word of the line is moved
 * onto the next output line so that it joins its continuation.
 *
 * @param contentStructure structure whose parts are updated in place
 */
public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
        // ---- header text ----
        StringBuilder headerText = new StringBuilder();
        for (BxLine headerLine : contentPart.getHeaderLines()) {
            String lineText = headerLine.toText();
            if (lineText.endsWith("-")) {
                // Drop the hyphen; no separator, so the word continues with the next line.
                headerText.append(lineText, 0, lineText.length() - 1);
            } else {
                headerText.append(lineText).append(" ");
            }
        }
        contentPart.setCleanHeaderText(cleanLigatures(headerText.toString().trim()));

        // ---- content paragraphs ----
        List<BxLine> contentLines = contentPart.getContentLines();
        List<String> contentTexts = new ArrayList<String>();

        double maxLen = Double.NEGATIVE_INFINITY;
        for (BxLine line : contentLines) {
            if (line.getWidth() > maxLen) {
                maxLen = line.getWidth();
            }
        }

        // A builder avoids re-copying the whole paragraph on every appended line.
        StringBuilder paragraph = new StringBuilder();
        for (BxLine line : contentLines) {
            String lineText = line.toText();
            BxLine prev = line.getPrev();
            BxLine next = line.getNext();

            int score = 0;
            // Starts with an uppercase ASCII letter.
            if (lineText.matches("^[A-Z].*$")) {
                score++;
            }
            if (prev != null) {
                // Indented relative to the previous line.
                if (isIndentedRelativeTo(line, prev, maxLen)) {
                    score++;
                }
                // Previous line is short compared with the widest line.
                if (prev.getWidth() < lastParagraphLineLengthMult * maxLen) {
                    score++;
                }
                // Previous line ends with a full stop.
                if (prev.toText().endsWith(".")) {
                    score++;
                }
            }
            // Indented relative to the next line.
            if (next != null && isIndentedRelativeTo(line, next, maxLen)) {
                score++;
            }

            if (score >= firstParagraphLineMinScore) {
                if (paragraph.length() > 0) {
                    contentTexts.add(cleanLigatures(paragraph.toString().trim()));
                }
                paragraph.setLength(0);
            }

            if (lineText.endsWith("-")) {
                String stem = lineText.substring(0, lineText.length() - 1);
                int lastSpace = stem.lastIndexOf(' ');
                if (lastSpace < 0) {
                    paragraph.append(stem);
                } else {
                    // Break before the hyphenated word so it joins its continuation.
                    paragraph.append(stem, 0, lastSpace)
                            .append("\n")
                            .append(stem, lastSpace + 1, stem.length());
                }
            } else {
                paragraph.append(lineText).append("\n");
            }
        }
        if (paragraph.length() > 0) {
            contentTexts.add(cleanLigatures(paragraph.toString().trim()));
        }
        contentPart.setCleanContentTexts(contentTexts);
    }
}

/**
 * Tells whether {@code line} starts to the right of {@code reference} by more than
 * {@code minParagraphIndent} and less than {@code paragraphLineIndentMultiplier * maxLen}.
 */
private boolean isIndentedRelativeTo(BxLine line, BxLine reference, double maxLen) {
    return line.getX() > reference.getX()
            && line.getX() - reference.getX() < paragraphLineIndentMultiplier * maxLen
            && line.getX() - reference.getX() > minParagraphIndent;
}
/**
 * Tells whether the line's text, compared case-insensitively, matches
 * {@code fig\.? .*} or {@code figure .*}.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT}: with a default locale
 * such as Turkish, an uppercase {@code I} would lower-case to a dotless
 * {@code ı} and "FIGURE" / "FIG." would never match.
 *
 * @param line line whose text is inspected
 * @return {@code true} when the lower-cased text matches either pattern
 */
private boolean looksLikeFigure(BxLine line) {
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches("fig\\.? .*") || lowered.matches("figure .*");
}
/**
 * Tells whether the line's text contains an equals sign.
 *
 * @param line line whose text is inspected
 * @return {@code true} when {@code '='} occurs anywhere in the text
 */
private boolean looksLikeEquation(BxLine line) {
    String text = line.toText();
    return text.indexOf('=') >= 0;
}
/**
 * Tells whether the line's text has one of the shapes accepted for a header:
 * it starts with an uppercase ASCII letter; or it starts with a digit 1-9 and
 * contains an ASCII letter later on; or it starts with a letter a-h followed by
 * {@code ")"} and contains an ASCII letter later on.
 *
 * @param line line whose text is inspected
 * @return {@code true} when any of the three patterns matches the whole text
 */
private boolean looksLikeHeader(BxLine line) {
    String text = line.toText();
    if (text.matches("^[A-Z].*")) {
        return true;
    }
    if (text.matches("^[1-9].*[a-zA-Z].*")) {
        return true;
    }
    return text.matches("^[a-h]\\).*[a-zA-Z].*");
}
/**
 * Tells whether the line is the first one of its parent: either it has no
 * previous line, or the previous line's parent is a different object.
 *
 * @param line line being checked
 * @return {@code true} when the line has no predecessor with the same parent
 */
private boolean isFirstInZone(BxLine line) {
    if (!line.hasPrev()) {
        return true;
    }
    // Identity comparison: the parents must be the very same object.
    return line.getParent() != line.getPrev().getParent();
}
/**
 * Returns the position of the line's most popular font name within the sorted
 * list of font names obtained from the line's great-grandparent
 * ({@code getParent().getParent().getParent()}).
 *
 * @param line line whose font is looked up
 * @return zero-based index in the sorted list, or {@code -1} when the name is absent
 */
private double getFontIndex(BxLine line) {
    List<String> sortedFontNames =
            Lists.newArrayList(line.getParent().getParent().getParent().getFontNames());
    Collections.sort(sortedFontNames);
    String lineFont = line.getMostPopularFontName();
    return sortedFontNames.indexOf(lineFont);
}
/**
 * Tells whether the line's text contains a run of at least four consecutive
 * ASCII letters, ignoring case.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT}: with a default locale
 * such as Turkish, an uppercase {@code I} would lower-case to a dotless
 * {@code ı}, which is outside {@code [a-z]} and would break up words like
 * "TITLE".
 *
 * @param line line whose text is inspected
 * @return {@code true} when the lower-cased text matches {@code .*[a-z][a-z][a-z][a-z].*}
 */
private boolean containsWord(BxLine line) {
    String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
    return lowered.matches(".*[a-z][a-z][a-z][a-z].*");
}
/**
 * Binary feature: the line's text starts with a lowercase ASCII letter, a
 * closing parenthesis, a space and an uppercase ASCII letter (e.g. {@code "a) Text"}).
 *
 * @param line line whose text is inspected
 * @param page not used by this computation
 * @return {@code 1} when the text matches, {@code 0} otherwise
 */
@Override
public double calculateFeatureValue(BxLine line, BxPage page) {
    String text = line.toText();
    if (text.matches("^[a-z]\\) [A-Z].*$")) {
        return 1;
    }
    return 0;
}