예제 #1
0
  /**
   * Command-line entry point: evaluates segmentation on every {@code xml} file listed under the
   * directory named by the first argument and prints progress, average scores and a summary.
   *
   * <p>For each file the expected document is read and prepared, the actual document is derived
   * from it, and the two are compared. The per-document result is merged into an aggregate
   * {@code Results} object, which is printed at the end via {@code printSummary()}.
   *
   * @param args command-line arguments; {@code args[0]} is the directory to scan
   * @throws AnalysisException propagated from the evaluator calls
   * @throws IOException if a file cannot be opened, read or closed
   * @throws TransformationException propagated from the evaluator calls
   */
  public static void main(String[] args)
      throws AnalysisException, IOException, TransformationException {
    if (args.length < 1) {
      // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
      System.err.println("Usage: <input-directory>");
      System.exit(1);
    }

    SegmentationEvaluator evaluator = new SegmentationEvaluator();
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_TABLE);
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_FIGURE);
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_EQUATION);

    File inputDir = new File(args[0]);
    Collection<File> files = FileUtils.listFiles(inputDir, new String[] {"xml"}, true);
    Results results = evaluator.newResults();

    int i = 0;
    double zoneScores = 0;
    double lineScores = 0;
    double wordScores = 0;

    for (File filee : files) {
      System.out.println(new Date());
      System.out.println(filee.getName());

      BxDocument origDoc;
      // Close the reader as soon as the document has been loaded so that file handles do not
      // accumulate while a large directory is processed.
      try (FileReader reader = new FileReader(filee)) {
        origDoc = evaluator.prepareExpectedDocument(evaluator.readDocument(reader));
      }
      BxDocument testDoc = evaluator.prepareActualDocument(origDoc);

      Results docRes = evaluator.compareDocuments(origDoc, testDoc);
      results.add(docRes);

      // Sum the score of THIS document. Reading the scores from the aggregate `results` here
      // would add up running totals and make the averages printed below meaningless.
      zoneScores += docRes.zoneLevel.getScore();
      lineScores += docRes.lineLevel.getScore();
      wordScores += docRes.wordLevel.getScore();
      System.out.println(++i);
    }

    // With no input documents these divisions yield NaN (0.0 / 0), which is printed as-is.
    zoneScores /= i;
    lineScores /= i;
    wordScores /= i;
    System.out.println("Documents: " + i);
    System.out.println("Average zone score: " + zoneScores);
    System.out.println("Average line score: " + lineScores);
    System.out.println("Average word score: " + wordScores);
    results.printSummary();
  }
예제 #2
0
  /**
   * Command-line entry point: converts the documents of an input directory into two LibSVM
   * training files, {@code initial_<dir>.dat} and {@code meta_<dir>.dat}, written to the current
   * working directory.
   *
   * <p>Each document is passed through the reading-order resolver, and its zone labels are
   * normalised: zones without a label become {@code OTH_UNKNOWN}, and labels outside
   * {@code CAT_METADATA} are replaced by their general label. Training samples are then built with
   * the metadata and initial feature-vector builders. The metadata samples are additionally
   * passed through a {@code SampleFilter} before being written.
   *
   * @param args command-line arguments; after option parsing exactly two positional arguments
   *     are expected: the input directory and the file extension handed to the documents iterator
   * @throws ParseException if the command line cannot be parsed
   * @throws IOException if an output file cannot be created, written or closed
   * @throws TransformationException propagated from document processing
   * @throws AnalysisException propagated from document processing
   * @throws CloneNotSupportedException propagated from document processing
   */
  public static void main(String[] args)
      throws ParseException, IOException, TransformationException, AnalysisException,
          CloneNotSupportedException {
    Options options = new Options();

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    // Validate the positional arguments that are actually read below, rather than the raw
    // argument count, so the check and the reads cannot disagree.
    String[] positionalArgs = line.getArgs();
    if (positionalArgs.length != 2) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(" [-options] input-directory extension", options);
      System.exit(1);
    }
    String inputDirPath = positionalArgs[0];
    File inputDirFile = new File(inputDirPath);

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, positionalArgs[1]);

    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder =
        SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder =
        SVMInitialZoneClassifier.getFeatureVectorBuilder();

    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    // try-with-resources flushes and closes both writers even when processing a document
    // throws; closing them only after the loop would leak them on any failure.
    try (BufferedWriter svmInitialFile =
            new BufferedWriter(new FileWriter("initial_" + inputDirFile.getName() + ".dat"));
        BufferedWriter svmMetaFile =
            new BufferedWriter(new FileWriter("meta_" + inputDirFile.getName() + ".dat"))) {

      // Primitive counter: a boxed Integer would be unboxed and re-boxed on every increment.
      int docIdx = 0;

      for (BxDocument doc : iter) {
        String filename = doc.getFilename();
        System.out.println(docIdx + ": " + filename);

        // The resolver returns a document object; re-apply the filename on the result.
        doc = ror.resolve(doc);
        doc.setFilename(filename);

        for (BxZone zone : doc.asZones()) {
          if (zone.getLabel() == null) {
            zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
          } else if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
            // Non-metadata labels are collapsed to their general label.
            zone.setLabel(zone.getLabel().getGeneralLabel());
          }
        }

        List<TrainingSample<BxZoneLabel>> newMetaSamples =
            BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                doc, metaVectorBuilder, BxZoneLabel.getIdentityMap());
        newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

        List<TrainingSample<BxZoneLabel>> newInitialSamples =
            BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                doc, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

        for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
          toLibSVM(sample, svmMetaFile);
        }
        for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
          toLibSVM(sample, svmInitialFile);
        }
        ++docIdx;
      }
    }
  }