Ejemplo n.º 1
0
 public void setLabels(Collection<BxZoneLabel> labels) {
   ignoredLabels.addAll(EnumSet.allOf(BxZoneLabel.class));
   ignoredLabels.removeAll(labels);
 }
Ejemplo n.º 2
0
/** @author krusek */
public class SegmentationEvaluator
    extends AbstractSingleInputEvaluator<
        BxDocument, BxDocument, BxDocument, SegmentationEvaluator.Results> {

  private static final Pattern FILENAME_PATTERN = Pattern.compile("(.+)\\.xml");

  private DocumentSegmenter pageSegmenter = new DocstrumSegmenter();

  private final Set<BxZoneLabel> ignoredLabels = EnumSet.noneOf(BxZoneLabel.class);

  private UnsegmentedPagesFlattener flattener = new UnsegmentedPagesFlattener();

  private final ReadingOrderResolver resolver = new HierarchicalReadingOrderResolver();

  private TrueVizToBxDocumentReader reader = new TrueVizToBxDocumentReader();

  private BxDocumentToTrueVizWriter writer = new BxDocumentToTrueVizWriter();

  @Override
  protected Pattern getFilenamePattern() {
    return FILENAME_PATTERN;
  }

  public void setPageSegmenter(DocumentSegmenter pageSegmenter) {
    this.pageSegmenter = pageSegmenter;
  }

  public void setIgnoredLabels(Collection<BxZoneLabel> labels) {
    ignoredLabels.clear();
    ignoredLabels.addAll(labels);
  }

  public void setLabels(Collection<BxZoneLabel> labels) {
    ignoredLabels.addAll(EnumSet.allOf(BxZoneLabel.class));
    ignoredLabels.removeAll(labels);
  }

  @Override
  protected void preprocessDocument(BxDocument document) {
    flattener.process(document);
  }

  @Override
  protected BxDocument processDocument(BxDocument document) throws AnalysisException {
    return pageSegmenter.segmentDocument(document);
  }

  @Override
  protected Results compareItems(BxDocument expected, BxDocument actual) {
    Results results = new Results();
    for (int i = 0; i < expected.childrenCount(); i++) {
      BxPage expPage = expected.getChild(i);
      BxPage actPage = actual.getChild(i);
      results.zoneLevel.add(compareZones(expPage, actPage));
      results.lineLevel.add(compareLines(expPage, actPage));
      results.wordLevel.add(compareWords(expPage, actPage));
    }
    return results;
  }

  private void printSeparator() {
    System.out.print(" +----------+");
    Results.printSeparator();
  }

  @Override
  protected void printDocumentStart() {
    System.out.print(" |   Page   |");
    Results.printLevelHeader();
    System.out.print(" |          |");
    Results.printColumnHeader();
    printSeparator();
  }

  @Override
  protected void printItemResults(
      BxDocument expected, BxDocument actual, int idx, Results results) {
    printItemResults(idx, results);
  }

  protected void printItemResults(int pageIndex, Results results) {
    Formatter formatter = new Formatter(System.out, Locale.US);
    formatter.format(" | %8d |", pageIndex + 1);
    results.printResults(formatter);
  }

  @Override
  protected void printDocumentResults(Results results) {
    printSeparator();
    Formatter formatter = new Formatter(System.out, Locale.US);
    formatter.format(" |   Total: |");
    results.printResults(formatter);
  }

  @Override
  protected Results newResults() {
    return new Results();
  }

  @Override
  protected void printFinalResults(Results results) {
    results.printSummary();
  }

  private LevelResults compareWords(BxPage expected, BxPage actual) {
    Map<BxChunk, BxWord> map = BxModelUtils.mapChunksToWords(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      for (BxLine expectedLine : expectedZone) {
        for (BxWord expectedWord : expectedLine) {
          Set<BxWord> actualWords = new HashSet<BxWord>();
          for (BxChunk chunk : expectedWord) {
            actualWords.add(map.get(chunk));
          }
          if (actualWords.size() == 1) {
            for (BxWord actualWord : actualWords) {
              if (actualWord.childrenCount() == expectedWord.childrenCount()) {
                results.matched++;
              }
            }
          }
          results.all++;
        }
      }
    }

    return results;
  }

  private LevelResults compareLines(BxPage expected, BxPage actual) {
    Map<BxChunk, BxLine> map = BxModelUtils.mapChunksToLines(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      for (BxLine expectedLine : expectedZone) {
        Set<BxLine> actualLines = new HashSet<BxLine>();
        for (BxWord word : expectedLine) {
          for (BxChunk chunk : word) {
            actualLines.add(map.get(chunk));
          }
        }
        if (actualLines.size() == 1) {
          for (BxLine actualLine : actualLines) {
            if (BxModelUtils.countChunks(actualLine) == BxModelUtils.countChunks(expectedLine)) {
              results.matched++;
            }
          }
        }
        results.all++;
      }
    }

    return results;
  }

  private LevelResults compareZones(BxPage expected, BxPage actual) {
    Map<BxChunk, BxZone> map = BxModelUtils.mapChunksToZones(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      Set<BxZone> actualZones = new HashSet<BxZone>();
      for (BxLine line : expectedZone) {
        for (BxWord word : line) {
          for (BxChunk chunk : word) {
            actualZones.add(map.get(chunk));
          }
        }
      }
      if (actualZones.size() == 1) {
        for (BxZone actualZone : actualZones) {
          if (BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
            results.matched++;
          }
        }
      }
      results.all++;
    }

    return results;
  }

  @Override
  protected BxDocument prepareActualDocument(BxDocument document) throws AnalysisException {
    document = BxModelUtils.deepClone(document);
    preprocessDocument(document);
    return processDocument(document);
  }

  @Override
  protected BxDocument prepareExpectedDocument(BxDocument document) throws AnalysisException {
    resolver.resolve(document);
    return document;
  }

  @Override
  protected BxDocument readDocument(Reader input) throws TransformationException {
    return new BxDocument().setPages(reader.read(input));
  }

  @Override
  protected void writeDocument(BxDocument document, Writer output) throws TransformationException {
    writer.write(output, Lists.newArrayList(document));
  }

  @Override
  protected Iterator<BxDocument> iterateItems(final BxDocument document) {
    return new Iterator<BxDocument>() {
      private boolean used = false;

      @Override
      public boolean hasNext() {
        return !used;
      }

      @Override
      public BxDocument next() {
        used = true;
        return document;
      }

      @Override
      public void remove() {
        used = true;
      }
    };
  }

  public static class Results implements AbstractEvaluator.Results<Results> {
    private LevelResults zoneLevel = new LevelResults();
    private LevelResults lineLevel = new LevelResults();
    private LevelResults wordLevel = new LevelResults();

    @Override
    public void add(Results results) {
      zoneLevel.add(results.zoneLevel);
      lineLevel.add(results.lineLevel);
      wordLevel.add(results.wordLevel);
    }

    public void printResults(Formatter formatter) {
      zoneLevel.printResults(formatter);
      lineLevel.printResults(formatter);
      wordLevel.printResults(formatter);
      formatter.format("%n");
    }

    public static void printLevelHeader() {
      System.out.print("                    Zones                     |");
      System.out.print("                    Lines                     |");
      System.out.print("                    Words                     |");
      System.out.println();
    }

    public static void printColumnHeader() {
      LevelResults.printHeader();
      LevelResults.printHeader();
      LevelResults.printHeader();
      System.out.println();
    }

    public static void printSeparator() {
      LevelResults.printSeparator();
      LevelResults.printSeparator();
      LevelResults.printSeparator();
      System.out.println();
    }

    public void printSummary() {
      Formatter formatter = new Formatter(System.out, Locale.US);
      System.out.println("  * zones");
      zoneLevel.printSummary(formatter);
      System.out.println("  * lines");
      lineLevel.printSummary(formatter);
      System.out.println("  * words");
      wordLevel.printSummary(formatter);
    }
  }

  public static class LevelResults {
    private int all = 0;
    private int matched = 0;

    public void add(LevelResults results) {
      all += results.all;
      matched += results.matched;
    }

    public void printResults(Formatter formatter) {
      formatter.format(" %8d %8d %7.2f%% |", all, matched, getScore() * 100);
    }

    public static void printHeader() {
      System.out.print("   All    Matched  Score   |");
    }

    public static void printSeparator() {
      System.out.print("----------------------------------------------+");
    }

    public void printSummary(Formatter formatter) {
      formatter.format("      * all      : %8d%n", all);
      formatter.format("      * matched  : %8d%n", matched);
      formatter.format("      * score    : %7.2f%%%n", getScore() * 100);
    }

    public double getScore() {
      if (all == 0) {
        return 1.0;
      } else {
        return ((double) matched) / all;
      }
    }
  }

  public static void main(String[] args)
      throws AnalysisException, IOException, TransformationException {

    SegmentationEvaluator evaluator = new SegmentationEvaluator();
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_TABLE);
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_FIGURE);
    evaluator.ignoredLabels.add(BxZoneLabel.BODY_EQUATION);

    File file = new File(args[0]);
    Collection<File> files = FileUtils.listFiles(file, new String[] {"xml"}, true);
    Results results = evaluator.newResults();
    int i = 0;

    double zoneScores = 0;
    double lineScores = 0;
    double wordScores = 0;
    BxDocument origDoc;
    BxDocument testDoc;
    FileReader reader;
    for (File filee : files) {
      System.out.println(new Date(System.currentTimeMillis()));
      System.out.println(filee.getName());

      reader = new FileReader(filee);
      origDoc = evaluator.prepareExpectedDocument(evaluator.readDocument(reader));
      testDoc = evaluator.prepareActualDocument(origDoc);
      Results docRes = evaluator.compareDocuments(origDoc, testDoc);
      results.add(docRes);
      zoneScores += results.zoneLevel.getScore();
      lineScores += results.lineLevel.getScore();
      wordScores += results.wordLevel.getScore();
      System.out.println(++i);
    }
    zoneScores /= i;
    lineScores /= i;
    wordScores /= i;
    System.out.println("Documents: " + i);
    System.out.println("Average zone score: " + zoneScores);
    System.out.println("Average line score: " + lineScores);
    System.out.println("Average word score: " + wordScores);
    results.printSummary();
  }
}