private void appendZone(Document doc, Element parent, BxZone zone, Object... hints)
     throws TransformationException {
   Element node = doc.createElement("Zone");
   appendPropertyIfNotNull(doc, node, "ZoneID", zone.getId());
   appendBounds(doc, node, "ZoneCorners", zone.getBounds(), hints);
   appendPropertyIfNotNull(doc, node, "ZoneNext", zone.getNextId());
   Element insetsNode = doc.createElement("ZoneInsets");
   insetsNode.setAttribute("Top", "");
   insetsNode.setAttribute("Bottom", "");
   insetsNode.setAttribute("Left", "");
   insetsNode.setAttribute("Right", "");
   node.appendChild(insetsNode);
   appendProperty(doc, node, "ZoneLines", "");
   if (zone.getLabel() != null) {
     if (ZONE_LABEL_MAP.get(zone.getLabel()) != null
         && !ZONE_LABEL_MAP.get(zone.getLabel()).isEmpty()) {
       appendClassification(doc, node, ZONE_LABEL_MAP.get(zone.getLabel()).toUpperCase(), "");
     } else {
       throw new TransformationException("Writing down an unknown zone label: " + zone.getLabel());
     }
   }
   for (BxLine line : zone.getLines()) {
     appendLine(doc, node, line, hints);
   }
   parent.appendChild(node);
 }
  private LevelResults compareZones(BxPage expected, BxPage actual) {
    Map<BxChunk, BxZone> map = BxModelUtils.mapChunksToZones(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      Set<BxZone> actualZones = new HashSet<BxZone>();
      for (BxLine line : expectedZone) {
        for (BxWord word : line) {
          for (BxChunk chunk : word) {
            actualZones.add(map.get(chunk));
          }
        }
      }
      if (actualZones.size() == 1) {
        for (BxZone actualZone : actualZones) {
          if (BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
            results.matched++;
          }
        }
      }
      results.all++;
    }

    return results;
  }
  private LevelResults compareWords(BxPage expected, BxPage actual) {
    Map<BxChunk, BxWord> map = BxModelUtils.mapChunksToWords(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      for (BxLine expectedLine : expectedZone) {
        for (BxWord expectedWord : expectedLine) {
          Set<BxWord> actualWords = new HashSet<BxWord>();
          for (BxChunk chunk : expectedWord) {
            actualWords.add(map.get(chunk));
          }
          if (actualWords.size() == 1) {
            for (BxWord actualWord : actualWords) {
              if (actualWord.childrenCount() == expectedWord.childrenCount()) {
                results.matched++;
              }
            }
          }
          results.all++;
        }
      }
    }

    return results;
  }
  public static void main(String[] args)
      throws TransformationException, IOException, AnalysisException, ParseException,
          CloneNotSupportedException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");

    File dir = new File(inDir);

    for (File f : FileUtils.listFiles(dir, new String[] {"xml"}, true)) {
      TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();
      List<BxPage> pages = tvReader.read(new FileReader(f));
      BxDocument doc = new BxDocument().setPages(pages);
      doc.setFilename(f.getName());

      int all = 0;
      int good = 0;
      for (BxZone z : doc.asZones()) {
        all++;
        if (!z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) {
          good++;
        }
      }
      int intcov = 0;
      if (all > 0) {
        intcov = good * 100 / all;
      }
      System.out.println(doc.getFilename() + " " + intcov);

      File f2 = new File(outDir + doc.getFilename() + "." + intcov);
      FileUtils.copyFile(f, f2);
    }
  }
 @Override
 protected void preprocessDocumentForEvaluation(BxDocument doc) {
   for (BxZone zone : doc.asZones()) zone.setLabel(zone.getLabel().getGeneralLabel());
 }
  @Override
  public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {

    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            heightPopulation.addObservation(line.getHeight());
            lengthPopulation.addObservation(line.getWidth());
            indentationPopulation.addObservation(line.getX());
            if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
              distancePopulation.addObservation(line.getY() - line.getPrev().getY());
            }
            fontPopulation.addObservation(getFontIndex(line));

            if (isFirstInZone(line) && looksLikeHeader(line)) {
              candidates.add(line);
            }
          }
        }
      }
    }

    Set<BxLine> toDelete = new HashSet<BxLine>();

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);

    for (int x = 0; x < candidatesList.size(); x++) {
      BxLine line1 = candidatesList.get(x);
      for (int y = x + 1; y < candidatesList.size(); y++) {
        BxLine line2 = candidatesList.get(y);
        for (int z = y + 1; z < candidatesList.size(); z++) {
          BxLine line3 = candidatesList.get(z);
          if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
            headerFonts.add(line1.getMostPopularFontName());
          }
        }
      }
    }

    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
              candidates.add(line);
            }
          }
        }
      }
    }

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore2) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    for (BxLine line : candidates) {
      int i = 0;
      for (BxLine line2 : candidates) {
        if (line.equals(line2)) {
          continue;
        }
        if (areSimilar(line, line2)) {
          i++;
        }
      }
      if (i == 0 || i > maxSimilarLinesCount) {
        toDelete.add(line);
        for (BxLine line2 : candidates) {
          if (areSimilar(line, line2)) {
            toDelete.add(line2);
          }
        }
      }
    }

    candidates.removeAll(toDelete);

    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            candidatesList.add(line);
          }
        }
      }
    }
    int clusters[] = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
      int cluster = clusters[clusterIdx];
      if (keptClusters.size() < 3) {
        keptClusters.add(cluster);
      }
      if (!keptClusters.contains(cluster)) {
        candidates.remove(candidatesList.get(clusterIdx));
      }
    }

    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (candidates.contains(line)) {
              contentStructure.addFirstHeaderLine(page, line);
              lastHeaderLine = line;
            } else if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
              if (lastHeaderLine == null) {
                BxChunk chunk = new BxChunk(new BxBounds(), "--");
                BxWord word = new BxWord().addChunk(chunk);
                lastHeaderLine = new BxLine().addWord(word);
                contentStructure.addFirstHeaderLine(page, lastHeaderLine);
              }
              contentStructure.addContentLine(lastHeaderLine, line);
            }
          }
        }
      }
    }

    headerLinesCompletener.completeLines(contentStructure);

    return contentStructure;
  }
Exemple #7
0
  public static void main(String[] args)
      throws ParseException, IOException, TransformationException, AnalysisException,
          CloneNotSupportedException {
    Options options = new Options();

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    if (args.length != 2) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(" [-options] input-directory extension", options);
      System.exit(1);
    }
    String inputDirPath = line.getArgs()[0];
    File inputDirFile = new File(inputDirPath);

    Integer docIdx = 0;

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, line.getArgs()[1]);

    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder =
        SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder =
        SVMInitialZoneClassifier.getFeatureVectorBuilder();

    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    FileWriter initialStream = new FileWriter("initial_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmInitialFile = new BufferedWriter(initialStream);

    FileWriter metaStream = new FileWriter("meta_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmMetaFile = new BufferedWriter(metaStream);

    for (BxDocument doc : iter) {
      System.out.println(docIdx + ": " + doc.getFilename());
      String filename = doc.getFilename();
      doc = ror.resolve(doc);
      doc.setFilename(filename);

      for (BxZone zone : doc.asZones()) {
        if (zone.getLabel() != null) {
          if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
            zone.setLabel(zone.getLabel().getGeneralLabel());
          }
        } else {
          zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
        }
      }
      List<TrainingSample<BxZoneLabel>> newMetaSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, metaVectorBuilder, BxZoneLabel.getIdentityMap());
      newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

      List<TrainingSample<BxZoneLabel>> newInitialSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

      for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
        toLibSVM(sample, svmMetaFile);
      }
      for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
        toLibSVM(sample, svmInitialFile);
      }
      ++docIdx;
    }
    svmInitialFile.close();
    svmMetaFile.close();
  }