/**
 * Segments every page of the given document using two parallel passes.
 *
 * <p>The first pass runs a {@code ComponentCounter} for each page and gathers the
 * resulting components per page; these are handed to
 * {@code computeDocumentOrientation}. The second pass runs a {@code SingleSegmenter}
 * for each page. Segmented pages are put back in the order of the input document and
 * only pages whose bounds are not {@code null} are added to the returned document.
 *
 * @param document the document whose pages are to be segmented
 * @return a new document containing the segmented pages that have bounds
 * @throws AnalysisException if any page task fails, or if the calling thread is
 *         interrupted while waiting for the tasks (the interrupt flag is restored
 *         before the exception is thrown)
 */
@Override
public BxDocument segmentDocument(BxDocument document) throws AnalysisException {
    // Pass 1: collect the components of every page in parallel.
    Map<BxPage, List<Component>> componentMap = new HashMap<BxPage, List<Component>>();
    List<Callable<NumBxPage>> countingTasks = new ArrayList<Callable<NumBxPage>>();
    for (BxPage page : document.getPages()) {
        countingTasks.add(new ComponentCounter(page));
    }

    ExecutorService countingPool =
            Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    try {
        for (Future<NumBxPage> result : countingPool.invokeAll(countingTasks)) {
            NumBxPage counted = result.get();
            componentMap.put(counted.page, counted.components);
        }
    } catch (ExecutionException ex) {
        throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
        // Keep the interruption visible to callers further up the stack.
        Thread.currentThread().interrupt();
        throw new AnalysisException("Cannot segment pages!", ex);
    } finally {
        // Always release the worker threads, also when invokeAll/get fails.
        countingPool.shutdown();
    }

    this.computeDocumentOrientation(componentMap);

    // Pass 2: segment every page in parallel, remembering its original position.
    BxPage[] pages = new BxPage[document.getPages().size()];
    List<Callable<NumBxPage>> segmentingTasks = new ArrayList<Callable<NumBxPage>>();
    int i = 0;
    for (BxPage page : document.getPages()) {
        segmentingTasks.add(new SingleSegmenter(page, i++));
    }

    ExecutorService segmentingPool =
            Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    try {
        for (Future<NumBxPage> result : segmentingPool.invokeAll(segmentingTasks)) {
            NumBxPage segmented = result.get();
            pages[segmented.index] = segmented.page;
        }
    } catch (ExecutionException ex) {
        throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
        Thread.currentThread().interrupt();
        throw new AnalysisException("Cannot segment pages!", ex);
    } finally {
        segmentingPool.shutdown();
    }

    // Pages without bounds are left out of the result.
    BxDocument output = new BxDocument();
    for (BxPage p : pages) {
        if (p.getBounds() != null) {
            output.addPage(p);
        }
    }
    return output;
}
/**
 * Builds a page from the "Zone" children of the given element.
 *
 * <p>Each child is parsed with {@code parseZoneNode} and added to the page. The page
 * bounds are set to the smallest rectangle covering the bounds of all parsed zones;
 * when there are no zones the values passed to {@code BxBounds} are all zero. The
 * page's zones are then sorted by the Y coordinate of their bounds, with ties broken
 * by the X coordinate.
 *
 * @param elem the element whose "Zone" children describe the page
 * @return the result of {@code setBounds} on the newly built page
 */
private BxPage parsePageNode(Element elem) {
    BxPage page = new BxPage();

    double minX = 0, minY = 0, maxX = 0, maxY = 0;
    boolean started = false;

    List<Element> zoneElements = getChildren("Zone", elem);
    for (Element zoneElement : zoneElements) {
        BxZone zone = parseZoneNode(zoneElement);
        page.addZone(zone);

        BxBounds zoneBounds = zone.getBounds();
        double left = zoneBounds.getX();
        double top = zoneBounds.getY();
        double right = zoneBounds.getX() + zoneBounds.getWidth();
        double bottom = zoneBounds.getY() + zoneBounds.getHeight();

        if (!started) {
            // The first zone seeds the extremes.
            minX = left;
            minY = top;
            maxX = right;
            maxY = bottom;
            started = true;
        } else {
            if (left < minX) {
                minX = left;
            }
            if (right > maxX) {
                maxX = right;
            }
            if (top < minY) {
                minY = top;
            }
            if (bottom > maxY) {
                maxY = bottom;
            }
        }
    }

    // Typed comparator: no raw types and no unchecked casts.
    Collections.sort(page.getZones(), new Comparator<BxZone>() {
        @Override
        public int compare(BxZone first, BxZone second) {
            int ret = Double.compare(first.getBounds().getY(), second.getBounds().getY());
            if (ret == 0) {
                ret = Double.compare(first.getBounds().getX(), second.getBounds().getX());
            }
            return ret;
        }
    });

    return page.setBounds(new BxBounds(minX, minY, maxX - minX, maxY - minY));
}
/**
 * Creates a "Page" element for the given page and appends it to {@code parent}.
 *
 * <p>Children are added to the new element in this order: "PageID" (via
 * {@code appendPropertyIfNotNull}), "PageType", "PageNumber" and "PageColumns" (each
 * with an empty string value), "PageNext" (via {@code appendPropertyIfNotNull}),
 * "PageZones" (empty string value), and then one {@code appendZone} call per zone of
 * the page.
 *
 * @param doc    the document used to create the element
 * @param parent the element that receives the new "Page" element
 * @param page   the page being written
 * @param hints  passed through unchanged to {@code appendZone}
 * @throws TransformationException declared by this method; presumably raised by the
 *         helpers it calls
 */
private void appendPage(Document doc, Element parent, BxPage page, Object... hints)
        throws TransformationException {
    final Element pageElement = doc.createElement("Page");

    appendPropertyIfNotNull(doc, pageElement, "PageID", page.getId());

    // These properties are always written with an empty value.
    final String[] emptyProperties = {"PageType", "PageNumber", "PageColumns"};
    for (String propertyName : emptyProperties) {
        appendProperty(doc, pageElement, propertyName, "");
    }

    appendPropertyIfNotNull(doc, pageElement, "PageNext", page.getNextId());
    appendProperty(doc, pageElement, "PageZones", "");

    for (BxZone pageZone : page.getZones()) {
        appendZone(doc, pageElement, pageZone, hints);
    }

    // Attach to the parent only after the element is fully populated.
    parent.appendChild(pageElement);
}
/**
 * Returns the width of the line preceding {@code line}, divided by the mean width of
 * all lines found in all zones of {@code page}.
 *
 * @param line the line whose predecessor is measured
 * @param page the page providing the lines used for the mean width
 * @return {@code 1.0} when the line has no predecessor; {@code 0} when the page has
 *         no lines or their widths sum to zero; otherwise the ratio described above
 */
@Override
public double calculateFeatureValue(BxLine line, BxPage page) {
    if (!line.hasPrev()) {
        return 1.0;
    }

    // Accumulate the total width and the number of lines across the whole page.
    int lineTotal = 0;
    double widthSum = 0;
    for (BxZone pageZone : page.getZones()) {
        for (BxLine pageLine : pageZone.getLines()) {
            lineTotal++;
            widthSum += pageLine.getBounds().getWidth();
        }
    }

    // Nothing to average over (also avoids dividing by zero below).
    boolean noUsableWidths = lineTotal == 0 || widthSum == 0;
    if (noUsableWidths) {
        return 0;
    }

    double meanWidth = widthSum / lineTotal;
    double previousWidth = line.getPrev().getBounds().getWidth();
    return previousWidth / meanWidth;
}
/**
 * Indicates whether {@code page} has no following page.
 *
 * @param zone not used by this computation
 * @param page the page whose successor is checked
 * @return {@code 1.0} when {@code page.getNext()} is {@code null}, {@code 0.0} otherwise
 */
@Override
public double calculateFeatureValue(BxZone zone, BxPage page) {
    if (page.getNext() == null) {
        return 1.0;
    }
    return 0.0;
}
/**
 * Returns the X coordinate of the line's bounds minus the X value of the page.
 *
 * @param object  the line whose bounds are read
 * @param context the page whose {@code getX()} value is subtracted
 * @return the difference between the two X values
 */
@Override
public double calculateFeatureValue(BxLine object, BxPage context) {
    final double lineX = object.getBounds().getX();
    final double pageX = context.getX();
    return lineX - pageX;
}