@Override
  public BxDocument segmentDocument(BxDocument document) throws AnalysisException {
    Map<BxPage, List<Component>> componentMap = new HashMap<BxPage, List<Component>>();

    ExecutorService exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    ArrayList<Callable<NumBxPage>> tasks = new ArrayList<Callable<NumBxPage>>();
    for (BxPage page : document.getPages()) {
      tasks.add(new ComponentCounter(page));
    }

    List<Future<NumBxPage>> results;
    try {
      results = exec.invokeAll(tasks);
      exec.shutdown();

      for (Future<NumBxPage> result : results) {
        NumBxPage p = result.get();
        componentMap.put(p.page, p.components);
      }
    } catch (ExecutionException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    }

    this.computeDocumentOrientation(componentMap);

    BxDocument output = new BxDocument();
    BxPage[] pages = new BxPage[document.getPages().size()];

    exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    tasks = new ArrayList<Callable<NumBxPage>>();
    int i = 0;
    for (BxPage page : document.getPages()) {
      tasks.add(new SingleSegmenter(page, i++));
    }

    try {
      results = exec.invokeAll(tasks);
      exec.shutdown();

      for (Future<NumBxPage> result : results) {
        NumBxPage p = result.get();
        pages[p.index] = p.page;
      }
      for (BxPage p : pages) {
        if (p.getBounds() != null) {
          output.addPage(p);
        }
      }
      return output;
    } catch (ExecutionException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    }
  }
Exemple #2
0
 public BxDocument setPages(Collection<BxPage> pages) {
   if (pages != null) {
     this.pages.clear();
     curPageNumber = 0;
     for (BxPage page : pages) {
       addPage(page);
     }
   }
   return this;
 }