/**
   * Segments every page of the given document and returns a new document made of the segmented
   * pages.
   *
   * <p>The work is done in two parallel passes over {@code document.getPages()}:
   *
   * <ol>
   *   <li>a {@code ComponentCounter} task per page, whose resulting components are collected per
   *       page and handed to {@link #computeDocumentOrientation};
   *   <li>a {@code SingleSegmenter} task per page, whose resulting pages are stored at the index
   *       reported by each task.
   * </ol>
   *
   * Pages whose bounds are {@code null} after the second pass are left out of the output.
   *
   * @param document the document whose pages are to be segmented
   * @return a new document containing the segmented pages that have bounds
   * @throws AnalysisException if any page task fails or the calling thread is interrupted while
   *     waiting for the tasks
   */
  @Override
  public BxDocument segmentDocument(BxDocument document) throws AnalysisException {
    // Pass 1: gather the components of every page.
    List<Callable<NumBxPage>> tasks = new ArrayList<Callable<NumBxPage>>();
    for (BxPage page : document.getPages()) {
      tasks.add(new ComponentCounter(page));
    }

    Map<BxPage, List<Component>> componentMap = new HashMap<BxPage, List<Component>>();
    for (NumBxPage counted : runPageTasks(tasks)) {
      componentMap.put(counted.page, counted.components);
    }

    this.computeDocumentOrientation(componentMap);

    // Pass 2: segment every page; each task carries the position of its page.
    tasks = new ArrayList<Callable<NumBxPage>>();
    int i = 0;
    for (BxPage page : document.getPages()) {
      tasks.add(new SingleSegmenter(page, i++));
    }

    BxPage[] pages = new BxPage[document.getPages().size()];
    for (NumBxPage segmented : runPageTasks(tasks)) {
      pages[segmented.index] = segmented.page;
    }

    BxDocument output = new BxDocument();
    for (BxPage p : pages) {
      if (p.getBounds() != null) {
        output.addPage(p);
      }
    }
    return output;
  }

  /**
   * Runs the given page tasks on a fresh fixed-size thread pool and returns their results in task
   * order.
   *
   * <p>The pool is always shut down, also when a task fails or the wait is interrupted, so its
   * worker threads are never left behind.
   *
   * @param tasks the tasks to execute
   * @return the task results, in the same order as {@code tasks}
   * @throws AnalysisException if a task throws or the calling thread is interrupted
   */
  private List<NumBxPage> runPageTasks(List<Callable<NumBxPage>> tasks) throws AnalysisException {
    ExecutorService exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    try {
      List<NumBxPage> finished = new ArrayList<NumBxPage>(tasks.size());
      // invokeAll returns only after every task has completed, so get() does not block here.
      for (Future<NumBxPage> result : exec.invokeAll(tasks)) {
        finished.add(result.get());
      }
      return finished;
    } catch (ExecutionException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
      // Restore the interrupt flag so callers further up can still observe the interruption.
      Thread.currentThread().interrupt();
      throw new AnalysisException("Cannot segment pages!", ex);
    } finally {
      exec.shutdown();
    }
  }
  /**
   * Builds a page from the {@code "Zone"} children of the given element.
   *
   * <p>Every child is parsed with {@code parseZoneNode} and added to the page. The page bounds are
   * the smallest rectangle enclosing the bounds of all zones; when there are no zones the bounds
   * are {@code (0, 0, 0, 0)}. Before returning, the page's zones are sorted by the Y coordinate of
   * their bounds, with ties broken by the X coordinate.
   *
   * @param elem the element whose {@code "Zone"} children describe the page
   * @return the value returned by {@code page.setBounds(...)} for the populated page
   */
  private BxPage parsePageNode(Element elem) {
    BxPage page = new BxPage();

    double minX = 0, minY = 0, maxX = 0, maxY = 0;
    boolean started = false;

    for (Element zoneElem : getChildren("Zone", elem)) {
      BxZone zone = parseZoneNode(zoneElem);
      page.addZone(zone);

      BxBounds zoneBounds = zone.getBounds();
      double left = zoneBounds.getX();
      double top = zoneBounds.getY();
      double right = zoneBounds.getX() + zoneBounds.getWidth();
      double bottom = zoneBounds.getY() + zoneBounds.getHeight();

      if (!started) {
        // The first zone seeds the bounding box.
        minX = left;
        minY = top;
        maxX = right;
        maxY = bottom;
        started = true;
      } else {
        // Later zones can only grow the box.
        if (left < minX) {
          minX = left;
        }
        if (right > maxX) {
          maxX = right;
        }
        if (top < minY) {
          minY = top;
        }
        if (bottom > maxY) {
          maxY = bottom;
        }
      }
    }

    Collections.sort(
        page.getZones(),
        new Comparator<BxZone>() {

          @Override
          public int compare(BxZone z1, BxZone z2) {
            int ret = Double.compare(z1.getBounds().getY(), z2.getBounds().getY());
            if (ret == 0) {
              ret = Double.compare(z1.getBounds().getX(), z2.getBounds().getX());
            }
            return ret;
          }
        });
    return page.setBounds(new BxBounds(minX, minY, maxX - minX, maxY - minY));
  }
 /**
  * Creates a {@code "Page"} element for the given page, fills it with the page properties and
  * one entry per zone, and appends it to {@code parent}.
  *
  * <p>{@code PageID} and {@code PageNext} are written through {@code appendPropertyIfNotNull};
  * the remaining properties are written with an empty value.
  *
  * @param doc the document used to create elements
  * @param parent the element that receives the new {@code "Page"} element
  * @param page the page to serialize
  * @param hints passed through unchanged to {@code appendZone}
  * @throws TransformationException declared for failures while appending the page content
  */
 private void appendPage(Document doc, Element parent, BxPage page, Object... hints)
     throws TransformationException {
   final Element pageElement = doc.createElement("Page");

   // The order of these calls defines the order of the child elements.
   appendPropertyIfNotNull(doc, pageElement, "PageID", page.getId());
   appendProperty(doc, pageElement, "PageType", "");
   appendProperty(doc, pageElement, "PageNumber", "");
   appendProperty(doc, pageElement, "PageColumns", "");
   appendPropertyIfNotNull(doc, pageElement, "PageNext", page.getNextId());
   appendProperty(doc, pageElement, "PageZones", "");

   for (BxZone pageZone : page.getZones()) {
     appendZone(doc, pageElement, pageZone, hints);
   }

   // Attach the page only once it is fully built.
   parent.appendChild(pageElement);
 }
  /**
   * Returns the width of the line preceding {@code line}, relative to the mean width of all lines
   * on the page.
   *
   * @param line the line being evaluated
   * @param page the page whose zones and lines provide the mean line width
   * @return {@code 1.0} when {@code line} has no previous line; {@code 0} when the page has no
   *     lines or their widths sum to zero; otherwise the previous line's width divided by the
   *     mean line width
   */
  @Override
  public double calculateFeatureValue(BxLine line, BxPage page) {
    if (!line.hasPrev()) {
      return 1.0;
    }

    // Sum the widths of every line in every zone of the page.
    double totalWidth = 0;
    int lineCount = 0;
    for (BxZone pageZone : page.getZones()) {
      for (BxLine pageLine : pageZone.getLines()) {
        totalWidth += pageLine.getBounds().getWidth();
        lineCount++;
      }
    }

    boolean hasMeasurableLines = lineCount != 0 && totalWidth != 0;
    if (!hasMeasurableLines) {
      return 0;
    }

    double meanWidth = totalWidth / lineCount;
    return line.getPrev().getBounds().getWidth() / meanWidth;
  }
Beispiel #5
0
 /**
  * Indicates whether the page has no following page.
  *
  * @param zone not used by this computation
  * @param page the page whose successor is checked
  * @return {@code 1.0} when {@code page.getNext()} is {@code null}, otherwise {@code 0.0}
  */
 @Override
 public double calculateFeatureValue(BxZone zone, BxPage page) {
   if (page.getNext() == null) {
     return 1.0;
   }
   return 0.0;
 }
 /**
  * Returns the horizontal offset of the line relative to its page.
  *
  * @param object the line whose bounds supply the X coordinate
  * @param context the page whose X coordinate is subtracted
  * @return the X coordinate of the line's bounds minus the X coordinate of the page
  */
 @Override
 public double calculateFeatureValue(BxLine object, BxPage context) {
   double horizontalOffset = object.getBounds().getX() - context.getX();
   return horizontalOffset;
 }