Example #1
0
  public void run(String[] args)
      throws ParseException, TransformationException, IOException, AnalysisException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");
    options.addOption("ext", true, "extension");
    CommandLineParser parser = new DefaultParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");
    String extension = line.getOptionValue("ext");

    File dir = new File(inDir);

    for (File f : FileUtils.listFiles(dir, new String[] {extension}, true)) {
      TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();
      List<BxPage> pages = tvReader.read(new FileReader(f));
      BxDocument doc = new BxDocument().setPages(pages);
      doc.setFilename(f.getName());

      BxDocument rewritten = transform(doc);

      File f2 = new File(outDir + doc.getFilename());
      BxDocumentToTrueVizWriter wrt = new BxDocumentToTrueVizWriter();
      boolean created = f2.createNewFile();
      if (!created) {
        throw new IOException("Cannot create file: ");
      }
      FileWriter fw = new FileWriter(f2);
      wrt.write(fw, Lists.newArrayList(rewritten));
      fw.flush();
      fw.close();
    }
  }
  @Override
  public BxDocument segmentDocument(BxDocument document) throws AnalysisException {
    Map<BxPage, List<Component>> componentMap = new HashMap<BxPage, List<Component>>();

    ExecutorService exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    ArrayList<Callable<NumBxPage>> tasks = new ArrayList<Callable<NumBxPage>>();
    for (BxPage page : document.getPages()) {
      tasks.add(new ComponentCounter(page));
    }

    List<Future<NumBxPage>> results;
    try {
      results = exec.invokeAll(tasks);
      exec.shutdown();

      for (Future<NumBxPage> result : results) {
        NumBxPage p = result.get();
        componentMap.put(p.page, p.components);
      }
    } catch (ExecutionException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    }

    this.computeDocumentOrientation(componentMap);

    BxDocument output = new BxDocument();
    BxPage[] pages = new BxPage[document.getPages().size()];

    exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    tasks = new ArrayList<Callable<NumBxPage>>();
    int i = 0;
    for (BxPage page : document.getPages()) {
      tasks.add(new SingleSegmenter(page, i++));
    }

    try {
      results = exec.invokeAll(tasks);
      exec.shutdown();

      for (Future<NumBxPage> result : results) {
        NumBxPage p = result.get();
        pages[p.index] = p.page;
      }
      for (BxPage p : pages) {
        if (p.getBounds() != null) {
          output.addPage(p);
        }
      }
      return output;
    } catch (ExecutionException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
      throw new AnalysisException("Cannot segment pages!", ex);
    }
  }
 @Override
 protected ClassificationResults compareDocuments(BxDocument expected, BxDocument actual) {
   ClassificationResults ret = newResults();
   for (Integer idx = 0; idx < actual.asZones().size(); ++idx) {
     ClassificationResults itemResults =
         compareItems(expected.asZones().get(idx), actual.asZones().get(idx));
     ret.add(itemResults);
   }
   return ret;
 }
 @Override
 protected Results compareItems(BxDocument expected, BxDocument actual) {
   Results results = new Results();
   for (int i = 0; i < expected.childrenCount(); i++) {
     BxPage expPage = expected.getChild(i);
     BxPage actPage = actual.getChild(i);
     results.zoneLevel.add(compareZones(expPage, actPage));
     results.lineLevel.add(compareLines(expPage, actPage));
     results.wordLevel.add(compareWords(expPage, actPage));
   }
   return results;
 }
  public static void main(String[] args)
      throws TransformationException, IOException, AnalysisException, ParseException,
          CloneNotSupportedException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");

    File dir = new File(inDir);

    for (File f : FileUtils.listFiles(dir, new String[] {"xml"}, true)) {
      TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();
      List<BxPage> pages = tvReader.read(new FileReader(f));
      BxDocument doc = new BxDocument().setPages(pages);
      doc.setFilename(f.getName());

      int all = 0;
      int good = 0;
      for (BxZone z : doc.asZones()) {
        all++;
        if (!z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) {
          good++;
        }
      }
      int intcov = 0;
      if (all > 0) {
        intcov = good * 100 / all;
      }
      System.out.println(doc.getFilename() + " " + intcov);

      File f2 = new File(outDir + doc.getFilename() + "." + intcov);
      FileUtils.copyFile(f, f2);
    }
  }
 @Override
 protected void preprocessDocumentForEvaluation(BxDocument doc) {
   for (BxZone zone : doc.asZones()) zone.setLabel(zone.getLabel().getGeneralLabel());
 }
Example #7
0
  public static void main(String[] args)
      throws ParseException, IOException, TransformationException, AnalysisException,
          CloneNotSupportedException {
    Options options = new Options();

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    if (args.length != 2) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(" [-options] input-directory extension", options);
      System.exit(1);
    }
    String inputDirPath = line.getArgs()[0];
    File inputDirFile = new File(inputDirPath);

    Integer docIdx = 0;

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, line.getArgs()[1]);

    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder =
        SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder =
        SVMInitialZoneClassifier.getFeatureVectorBuilder();

    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    FileWriter initialStream = new FileWriter("initial_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmInitialFile = new BufferedWriter(initialStream);

    FileWriter metaStream = new FileWriter("meta_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmMetaFile = new BufferedWriter(metaStream);

    for (BxDocument doc : iter) {
      System.out.println(docIdx + ": " + doc.getFilename());
      String filename = doc.getFilename();
      doc = ror.resolve(doc);
      doc.setFilename(filename);

      for (BxZone zone : doc.asZones()) {
        if (zone.getLabel() != null) {
          if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
            zone.setLabel(zone.getLabel().getGeneralLabel());
          }
        } else {
          zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
        }
      }
      List<TrainingSample<BxZoneLabel>> newMetaSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, metaVectorBuilder, BxZoneLabel.getIdentityMap());
      newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

      List<TrainingSample<BxZoneLabel>> newInitialSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

      for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
        toLibSVM(sample, svmMetaFile);
      }
      for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
        toLibSVM(sample, svmInitialFile);
      }
      ++docIdx;
    }
    svmInitialFile.close();
    svmMetaFile.close();
  }