private LevelResults compareWords(BxPage expected, BxPage actual) {
    Map<BxChunk, BxWord> map = BxModelUtils.mapChunksToWords(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      for (BxLine expectedLine : expectedZone) {
        for (BxWord expectedWord : expectedLine) {
          Set<BxWord> actualWords = new HashSet<BxWord>();
          for (BxChunk chunk : expectedWord) {
            actualWords.add(map.get(chunk));
          }
          if (actualWords.size() == 1) {
            for (BxWord actualWord : actualWords) {
              if (actualWord.childrenCount() == expectedWord.childrenCount()) {
                results.matched++;
              }
            }
          }
          results.all++;
        }
      }
    }

    return results;
  }
  private LevelResults compareZones(BxPage expected, BxPage actual) {
    Map<BxChunk, BxZone> map = BxModelUtils.mapChunksToZones(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      Set<BxZone> actualZones = new HashSet<BxZone>();
      for (BxLine line : expectedZone) {
        for (BxWord word : line) {
          for (BxChunk chunk : word) {
            actualZones.add(map.get(chunk));
          }
        }
      }
      if (actualZones.size() == 1) {
        for (BxZone actualZone : actualZones) {
          if (BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
            results.matched++;
          }
        }
      }
      results.all++;
    }

    return results;
  }
 @Override
 protected boolean enhanceMetadata(BxZone zone, DocumentMetadata metadata) {
   Matcher simpleMatcher = simplePattern.matcher(zone.toText());
   if (simpleMatcher.find()) {
     MatchResult result = simpleMatcher.toMatchResult();
     String day = result.group(2);
     String month = null;
     for (int i = 0; i < 12; i++) {
       if (result.group(i + 3) != null) {
         month = String.valueOf(i + 1);
         break;
       }
     }
     String year = result.group(15);
     enhanceMetadata(metadata, day, month, year);
     return true;
   }
   Matcher anotherMatcher = anotherPattern.matcher(zone.toText());
   if (anotherMatcher.find()) {
     MatchResult result = anotherMatcher.toMatchResult();
     String day = result.group(14);
     String month = null;
     for (int i = 0; i < 12; i++) {
       if (result.group(i + 2) != null) {
         month = String.valueOf(i + 1);
         break;
       }
     }
     String year = result.group(15);
     enhanceMetadata(metadata, day, month, year);
     return true;
   }
   return false;
 }
 @Override
 public double calculateFeatureValue(BxZone zone, BxPage page) {
   return (zone.toText().matches("^.*[\\u0391-\\u03A9].*$")
           || zone.toText().matches("^.*[\\u03B1-\\u03C9].*$"))
       ? 1
       : 0;
 }
 @Override
 public double calculateFeatureValue(BxZone object, BxPage context) {
   for (BxZone zone : context) {
     if (zone.getWidth() > object.getWidth()) {
       return 0.0;
     }
   }
   return 1.0;
 }
 private void appendZone(Document doc, Element parent, BxZone zone, Object... hints)
     throws TransformationException {
   Element node = doc.createElement("Zone");
   appendPropertyIfNotNull(doc, node, "ZoneID", zone.getId());
   appendBounds(doc, node, "ZoneCorners", zone.getBounds(), hints);
   appendPropertyIfNotNull(doc, node, "ZoneNext", zone.getNextId());
   Element insetsNode = doc.createElement("ZoneInsets");
   insetsNode.setAttribute("Top", "");
   insetsNode.setAttribute("Bottom", "");
   insetsNode.setAttribute("Left", "");
   insetsNode.setAttribute("Right", "");
   node.appendChild(insetsNode);
   appendProperty(doc, node, "ZoneLines", "");
   if (zone.getLabel() != null) {
     if (ZONE_LABEL_MAP.get(zone.getLabel()) != null
         && !ZONE_LABEL_MAP.get(zone.getLabel()).isEmpty()) {
       appendClassification(doc, node, ZONE_LABEL_MAP.get(zone.getLabel()).toUpperCase(), "");
     } else {
       throw new TransformationException("Writing down an unknown zone label: " + zone.getLabel());
     }
   }
   for (BxLine line : zone.getLines()) {
     appendLine(doc, node, line, hints);
   }
   parent.appendChild(node);
 }
  private BxPage parsePageNode(Element elem) {
    BxPage page = new BxPage();

    double minX = 0, minY = 0, maxX = 0, maxY = 0;
    boolean started = false;

    List<Element> e = getChildren("Zone", elem);
    for (Element zo : e) {
      BxZone zon = parseZoneNode(zo);
      page.addZone(zon);

      BxBounds zoneBounds = zon.getBounds();
      if (!started) {
        minX = zoneBounds.getX();
        minY = zoneBounds.getY();
        maxX = zoneBounds.getX() + zoneBounds.getWidth();
        maxY = zoneBounds.getY() + zoneBounds.getHeight();
        started = true;
      }

      if (zoneBounds.getX() < minX) {
        minX = zoneBounds.getX();
      }
      if (zoneBounds.getX() + zoneBounds.getWidth() > maxX) {
        maxX = zoneBounds.getX() + zoneBounds.getWidth();
      }
      if (zoneBounds.getY() < minY) {
        minY = zoneBounds.getY();
      }
      if (zoneBounds.getY() + zoneBounds.getHeight() > maxY) {
        maxY = zoneBounds.getY() + zoneBounds.getHeight();
      }
    }

    Collections.sort(
        page.getZones(),
        new Comparator() {

          @Override
          public int compare(Object t, Object t1) {
            BxZone z1 = (BxZone) t;
            BxZone z2 = (BxZone) t1;
            int ret = Double.compare(z1.getBounds().getY(), z2.getBounds().getY());
            if (ret == 0) {
              ret = Double.compare(z1.getBounds().getX(), z2.getBounds().getX());
            }
            return ret;
          }
        });
    return page.setBounds(new BxBounds(minX, minY, maxX - minX, maxY - minY));
  }
 private BxZone parseZoneNode(Element zoneE) {
   BxZone zone = new BxZone();
   if (!getChildren("Classification", zoneE).isEmpty()) {
     zone.setLabel(parseClassification(getChildren("Classification", zoneE).get(0)));
   }
   if (!getChildren("ZoneCorners", zoneE).isEmpty()) {
     zone.setBounds(parseElementContainingVertexes(getChildren("ZoneCorners", zoneE).get(0)));
   }
   List<Element> e = getChildren("Line", zoneE);
   for (Element lin : e) {
     BxLine li = parseLineElement(lin);
     zone.addLine(li);
   }
   return zone;
 }
 @Override
 public double calculateFeatureValue(BxZone zone, BxPage page) {
   double charSpace = 0.0;
   for (BxLine line : zone.getLines()) {
     for (BxWord word : line.getWords()) {
       for (BxChunk chunk : word.getChunks()) {
         charSpace += chunk.getArea();
       }
     }
   }
   double ret = zone.getArea() - charSpace;
   if (ret < 0) {
     return 0.0;
   } else {
     return ret;
   }
 }
 @Override
 public double calculateFeatureValue(BxZone object, BxPage context) {
   String text = object.toText();
   String[] words = text.split("\\s");
   List<Integer> wordLengths = new ArrayList<Integer>(words.length);
   for (String word : words) wordLengths.add(word.length());
   Collections.sort(wordLengths);
   return wordLengths.get(wordLengths.size() / 2);
 }
示例#11
0
  @Override
  public double calculateFeatureValue(BxLine line, BxPage page) {
    if (!line.hasPrev()) {
      return 1.0;
    }
    double avLength = 0;
    int linesCount = 0;
    for (BxZone zone : page.getZones()) {
      for (BxLine l : zone.getLines()) {
        linesCount++;
        avLength += l.getBounds().getWidth();
      }
    }

    if (linesCount == 0 || avLength == 0) {
      return 0;
    }

    avLength /= linesCount;

    return line.getPrev().getBounds().getWidth() / avLength;
  }
  @Override
  public double calculateFeatureValue(BxZone zone, BxPage page) {
    String[] keywords = {"abstract", "keywords", "key words"
      // , "background", "methods", "results", "conclusions", "purpose", "trial", "discussion",
      // "summary", "conclusion"
    };

    for (String keyword : keywords) {
      if (zone.toText().toLowerCase().startsWith(keyword)) {
        return 1;
      }
    }
    return 0;
  }
  public static void main(String[] args)
      throws TransformationException, IOException, AnalysisException, ParseException,
          CloneNotSupportedException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");

    File dir = new File(inDir);

    for (File f : FileUtils.listFiles(dir, new String[] {"xml"}, true)) {
      TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();
      List<BxPage> pages = tvReader.read(new FileReader(f));
      BxDocument doc = new BxDocument().setPages(pages);
      doc.setFilename(f.getName());

      int all = 0;
      int good = 0;
      for (BxZone z : doc.asZones()) {
        all++;
        if (!z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) {
          good++;
        }
      }
      int intcov = 0;
      if (all > 0) {
        intcov = good * 100 / all;
      }
      System.out.println(doc.getFilename() + " " + intcov);

      File f2 = new File(outDir + doc.getFilename() + "." + intcov);
      FileUtils.copyFile(f, f2);
    }
  }
 @Override
 protected void preprocessDocumentForEvaluation(BxDocument doc) {
   for (BxZone zone : doc.asZones()) zone.setLabel(zone.getLabel().getGeneralLabel());
 }
  @Override
  public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {

    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            heightPopulation.addObservation(line.getHeight());
            lengthPopulation.addObservation(line.getWidth());
            indentationPopulation.addObservation(line.getX());
            if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
              distancePopulation.addObservation(line.getY() - line.getPrev().getY());
            }
            fontPopulation.addObservation(getFontIndex(line));

            if (isFirstInZone(line) && looksLikeHeader(line)) {
              candidates.add(line);
            }
          }
        }
      }
    }

    Set<BxLine> toDelete = new HashSet<BxLine>();

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);

    for (int x = 0; x < candidatesList.size(); x++) {
      BxLine line1 = candidatesList.get(x);
      for (int y = x + 1; y < candidatesList.size(); y++) {
        BxLine line2 = candidatesList.get(y);
        for (int z = y + 1; z < candidatesList.size(); z++) {
          BxLine line3 = candidatesList.get(z);
          if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
            headerFonts.add(line1.getMostPopularFontName());
          }
        }
      }
    }

    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
              candidates.add(line);
            }
          }
        }
      }
    }

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore2) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    for (BxLine line : candidates) {
      int i = 0;
      for (BxLine line2 : candidates) {
        if (line.equals(line2)) {
          continue;
        }
        if (areSimilar(line, line2)) {
          i++;
        }
      }
      if (i == 0 || i > maxSimilarLinesCount) {
        toDelete.add(line);
        for (BxLine line2 : candidates) {
          if (areSimilar(line, line2)) {
            toDelete.add(line2);
          }
        }
      }
    }

    candidates.removeAll(toDelete);

    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            candidatesList.add(line);
          }
        }
      }
    }
    int clusters[] = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
      int cluster = clusters[clusterIdx];
      if (keptClusters.size() < 3) {
        keptClusters.add(cluster);
      }
      if (!keptClusters.contains(cluster)) {
        candidates.remove(candidatesList.get(clusterIdx));
      }
    }

    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (candidates.contains(line)) {
              contentStructure.addFirstHeaderLine(page, line);
              lastHeaderLine = line;
            } else if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
              if (lastHeaderLine == null) {
                BxChunk chunk = new BxChunk(new BxBounds(), "--");
                BxWord word = new BxWord().addChunk(chunk);
                lastHeaderLine = new BxLine().addWord(word);
                contentStructure.addFirstHeaderLine(page, lastHeaderLine);
              }
              contentStructure.addContentLine(lastHeaderLine, line);
            }
          }
        }
      }
    }

    headerLinesCompletener.completeLines(contentStructure);

    return contentStructure;
  }
示例#16
0
  public static void main(String[] args)
      throws ParseException, IOException, TransformationException, AnalysisException,
          CloneNotSupportedException {
    Options options = new Options();

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    if (args.length != 2) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(" [-options] input-directory extension", options);
      System.exit(1);
    }
    String inputDirPath = line.getArgs()[0];
    File inputDirFile = new File(inputDirPath);

    Integer docIdx = 0;

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, line.getArgs()[1]);

    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder =
        SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder =
        SVMInitialZoneClassifier.getFeatureVectorBuilder();

    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    FileWriter initialStream = new FileWriter("initial_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmInitialFile = new BufferedWriter(initialStream);

    FileWriter metaStream = new FileWriter("meta_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmMetaFile = new BufferedWriter(metaStream);

    for (BxDocument doc : iter) {
      System.out.println(docIdx + ": " + doc.getFilename());
      String filename = doc.getFilename();
      doc = ror.resolve(doc);
      doc.setFilename(filename);

      for (BxZone zone : doc.asZones()) {
        if (zone.getLabel() != null) {
          if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
            zone.setLabel(zone.getLabel().getGeneralLabel());
          }
        } else {
          zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
        }
      }
      List<TrainingSample<BxZoneLabel>> newMetaSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, metaVectorBuilder, BxZoneLabel.getIdentityMap());
      newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

      List<TrainingSample<BxZoneLabel>> newInitialSamples =
          BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
              doc, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

      for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
        toLibSVM(sample, svmMetaFile);
      }
      for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
        toLibSVM(sample, svmInitialFile);
      }
      ++docIdx;
    }
    svmInitialFile.close();
    svmMetaFile.close();
  }