/**
 * Reads one page from the given character stream and returns it as a
 * single-element list.
 *
 * @param reader source of the data handed to {@code importSource}
 * @param hints  accepted for interface compatibility; not used by this method
 * @return a new mutable list holding exactly the page produced by {@code importSource}
 * @throws TransformationException wrapping any {@link IOException},
 *         {@link ParserConfigurationException} or {@link SAXException}
 *         raised while importing the source
 */
public List<BxPage> read(Reader reader, Object... hints) throws TransformationException {
    final InputSource source = new InputSource(reader);
    final BxPage importedPage;
    try {
        importedPage = importSource(source);
    } catch (IOException ioFailure) {
        throw new TransformationException(ioFailure);
    } catch (ParserConfigurationException configFailure) {
        throw new TransformationException(configFailure);
    } catch (SAXException parseFailure) {
        throw new TransformationException(parseFailure);
    }

    List<BxPage> result = new ArrayList<BxPage>();
    result.add(importedPage);
    return result;
}
/**
 * Selects header lines among the body zones of a document and groups the
 * remaining body lines under them.
 *
 * <p>The method works in phases:
 * <ol>
 *   <li>collect height, width, x-position, line-distance and font-index
 *       statistics over all lines of body zones, and seed the candidate set with
 *       lines that are first in their zone and look like headers;</li>
 *   <li>drop candidates rejected by {@code shouldBeRemoved} or whose width
 *       z-score exceeds {@code candMaxLengthZScore};</li>
 *   <li>determine "header fonts" (fonts shared by at least three candidates whose
 *       font index is an outlier) and add every header-looking body line set in
 *       such a font;</li>
 *   <li>filter again, this time against {@code candMaxLengthZScore2};</li>
 *   <li>drop candidates that have no similar candidate, or more than
 *       {@code maxSimilarLinesCount} of them, together with the lines similar
 *       to them;</li>
 *   <li>cluster the survivors in document order and keep only lines belonging
 *       to the first three distinct clusters encountered;</li>
 *   <li>build the content structure and let {@code headerLinesCompletener}
 *       complete the header lines.</li>
 * </ol>
 *
 * <p>Zones without a label are treated as non-body zones and skipped.
 *
 * @param document the document to analyse
 * @return the content structure holding header lines and the content lines
 *         assigned to each of them
 * @throws AnalysisException propagated from the analysis steps invoked here
 */
@Override
public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {
    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    // Phase 1: statistics over body lines + initial candidates.
    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
        for (BxZone zone : page) {
            if (!isBodyZone(zone)) {
                continue;
            }
            for (BxLine line : zone) {
                heightPopulation.addObservation(line.getHeight());
                lengthPopulation.addObservation(line.getWidth());
                indentationPopulation.addObservation(line.getX());
                // Only positive vertical offsets from the previous line are recorded.
                if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
                    distancePopulation.addObservation(line.getY() - line.getPrev().getY());
                }
                fontPopulation.addObservation(getFontIndex(line));
                if (isFirstInZone(line) && looksLikeHeader(line)) {
                    candidates.add(line);
                }
            }
        }
    }

    // Phase 2: first outlier filter.
    removeRejectedCandidates(candidates, heightPopulation, fontPopulation, distancePopulation,
            indentationPopulation, lengthPopulation, candMaxLengthZScore);

    // Phase 3: widen the candidate set using fonts typical for the surviving candidates.
    Set<String> headerFonts = findHeaderFonts(new ArrayList<BxLine>(candidates), fontPopulation);
    for (BxPage page : document) {
        for (BxZone zone : page) {
            if (!isBodyZone(zone)) {
                continue;
            }
            for (BxLine line : zone) {
                if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
                    candidates.add(line);
                }
            }
        }
    }

    // Phase 4: second outlier filter, with its own length threshold.
    removeRejectedCandidates(candidates, heightPopulation, fontPopulation, distancePopulation,
            indentationPopulation, lengthPopulation, candMaxLengthZScore2);

    // Phase 5: similarity-based filter.
    removeDissimilarCandidates(candidates);

    // Phase 6: clustering needs the candidates in document order, not in hash order.
    List<BxLine> candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
        for (BxZone zone : page) {
            for (BxLine line : zone) {
                if (candidates.contains(line)) {
                    candidatesList.add(line);
                }
            }
        }
    }
    int[] clusters = headersClusterizer.clusterLines(candidatesList);
    final int maxKeptClusters = 3;
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
        int cluster = clusters[clusterIdx];
        // The first distinct cluster ids seen (in document order) are the ones kept.
        if (keptClusters.size() < maxKeptClusters) {
            keptClusters.add(cluster);
        }
        if (!keptClusters.contains(cluster)) {
            candidates.remove(candidatesList.get(clusterIdx));
        }
    }

    // Phase 7: assemble the result.
    BxContentStructure contentStructure = buildContentStructure(document, candidates);
    headerLinesCompletener.completeLines(contentStructure);
    return contentStructure;
}

/** Tells whether a zone is labelled as body content; a zone without a label is not. */
private boolean isBodyZone(BxZone zone) {
    // Constant-first comparison so that a missing label does not raise a NullPointerException.
    return BxZoneLabel.BODY_CONTENT.equals(zone.getLabel())
            || BxZoneLabel.GEN_BODY.equals(zone.getLabel());
}

/**
 * Removes, in place, every candidate rejected by {@code shouldBeRemoved} or whose
 * width z-score is greater than {@code maxLengthZScore}.
 */
private void removeRejectedCandidates(Set<BxLine> candidates, Population heightPopulation,
        Population fontPopulation, Population distancePopulation,
        Population indentationPopulation, Population lengthPopulation,
        double maxLengthZScore) throws AnalysisException {
    Set<BxLine> rejected = new HashSet<BxLine>();
    for (BxLine line : candidates) {
        if (shouldBeRemoved(line, heightPopulation, fontPopulation, distancePopulation,
                indentationPopulation)
                || lengthPopulation.getZScore(line.getWidth()) > maxLengthZScore) {
            rejected.add(line);
        }
    }
    candidates.removeAll(rejected);
}

/**
 * Returns the font names of candidates that are followed in the list by at least two
 * other candidates with the same font and whose font-index z-score magnitude exceeds
 * {@code outlFontZScore}. Candidates without a font name are ignored.
 */
private Set<String> findHeaderFonts(List<BxLine> orderedCandidates, Population fontPopulation)
        throws AnalysisException {
    Set<String> headerFonts = new HashSet<String>();
    int size = orderedCandidates.size();
    // A line needs two later lines to complete a triple, so the last two can be skipped.
    for (int first = 0; first + 2 < size; first++) {
        BxLine line = orderedCandidates.get(first);
        String fontName = line.getMostPopularFontName();
        if (fontName == null || headerFonts.contains(fontName)) {
            continue;
        }
        int followers = 0;
        for (int next = first + 1; next < size && followers < 2; next++) {
            if (fontName.equals(orderedCandidates.get(next).getMostPopularFontName())) {
                followers++;
            }
        }
        if (followers == 2
                && Math.abs(fontPopulation.getZScore(getFontIndex(line))) > outlFontZScore) {
            headerFonts.add(fontName);
        }
    }
    return headerFonts;
}

/**
 * Removes, in place, every candidate that has no similar candidate or more than
 * {@code maxSimilarLinesCount} similar candidates, along with all candidates similar to it.
 */
private void removeDissimilarCandidates(Set<BxLine> candidates) throws AnalysisException {
    Set<BxLine> rejected = new HashSet<BxLine>();
    for (BxLine line : candidates) {
        int similarCount = 0;
        for (BxLine other : candidates) {
            if (!line.equals(other) && areSimilar(line, other)) {
                similarCount++;
            }
        }
        if (similarCount == 0 || similarCount > maxSimilarLinesCount) {
            rejected.add(line);
            for (BxLine other : candidates) {
                if (areSimilar(line, other)) {
                    rejected.add(other);
                }
            }
        }
    }
    candidates.removeAll(rejected);
}

/**
 * Walks body zones in document order, registering each header line and attaching every
 * other body line to the most recent header. Lines seen before any header are attached
 * to a placeholder header whose text is {@code "--"}.
 */
private BxContentStructure buildContentStructure(BxDocument document, Set<BxLine> headers)
        throws AnalysisException {
    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
        for (BxZone zone : page) {
            if (!isBodyZone(zone)) {
                continue;
            }
            for (BxLine line : zone) {
                if (headers.contains(line)) {
                    contentStructure.addFirstHeaderLine(page, line);
                    lastHeaderLine = line;
                    continue;
                }
                if (lastHeaderLine == null) {
                    BxChunk chunk = new BxChunk(new BxBounds(), "--");
                    BxWord word = new BxWord().addChunk(chunk);
                    lastHeaderLine = new BxLine().addWord(word);
                    contentStructure.addFirstHeaderLine(page, lastHeaderLine);
                }
                contentStructure.addContentLine(lastHeaderLine, line);
            }
        }
    }
    return contentStructure;
}
/**
 * Returns the position of the line's most popular font name within the sorted list
 * of font names obtained from the line's third-level ancestor
 * ({@code line.getParent().getParent().getParent()}).
 *
 * @param line the line whose font is looked up
 * @return the zero-based index of the font name in natural (sorted) order, widened to
 *         {@code double}; {@code -1} when the name is not among the ancestor's font names
 */
private double getFontIndex(BxLine line) {
    List<String> sortedFontNames =
            Lists.newArrayList(line.getParent().getParent().getParent().getFontNames());
    Collections.sort(sortedFontNames);

    String dominantFont = line.getMostPopularFontName();
    int position = sortedFontNames.indexOf(dominantFont);
    return position;
}