예제 #1
0
  /**
   * Counts how many zones of the expected page are reproduced as exactly one zone of the actual
   * page.
   *
   * <p>For every expected zone whose label is not in {@code ignoredLabels}, the chunks of the zone
   * are looked up in the chunk-to-zone map returned by {@code BxModelUtils.mapChunksToZones(actual)}.
   * The zone is counted as matched when all of its chunks resolve to one and the same actual zone
   * and that zone has the same chunk count as the expected zone. Ignored zones contribute to
   * neither counter.
   *
   * @param expected the reference page
   * @param actual the page being compared against the reference
   * @return a result object whose {@code all} is the number of considered expected zones and whose
   *     {@code matched} is the number of those that were matched
   */
  private LevelResults compareZones(BxPage expected, BxPage actual) {
    Map<BxChunk, BxZone> chunkToActualZone = BxModelUtils.mapChunksToZones(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      Set<BxZone> actualZones = new HashSet<BxZone>();
      for (BxLine line : expectedZone) {
        for (BxWord word : line) {
          for (BxChunk chunk : word) {
            // A chunk missing from the map contributes null to the set.
            actualZones.add(chunkToActualZone.get(chunk));
          }
        }
      }
      if (actualZones.size() == 1) {
        BxZone actualZone = actualZones.iterator().next();
        // A lone null means none of the expected chunks was found in the actual page;
        // that is a mismatch, not a reason to dereference null.
        if (actualZone != null
            && BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
          results.matched++;
        }
      }
      results.all++;
    }

    return results;
  }
예제 #2
0
  /**
   * Counts how many words of the expected page are reproduced as exactly one word of the actual
   * page.
   *
   * <p>Zones whose label is in {@code ignoredLabels} are skipped together with all their words.
   * For every remaining expected word, its chunks are looked up in the chunk-to-word map returned
   * by {@code BxModelUtils.mapChunksToWords(actual)}. The word is counted as matched when all of
   * its chunks resolve to one and the same actual word and that word has the same number of
   * children as the expected word.
   *
   * @param expected the reference page
   * @param actual the page being compared against the reference
   * @return a result object whose {@code all} is the number of considered expected words and whose
   *     {@code matched} is the number of those that were matched
   */
  private LevelResults compareWords(BxPage expected, BxPage actual) {
    Map<BxChunk, BxWord> chunkToActualWord = BxModelUtils.mapChunksToWords(actual);

    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
      if (ignoredLabels.contains(expectedZone.getLabel())) {
        continue;
      }
      for (BxLine expectedLine : expectedZone) {
        for (BxWord expectedWord : expectedLine) {
          Set<BxWord> actualWords = new HashSet<BxWord>();
          for (BxChunk chunk : expectedWord) {
            // A chunk missing from the map contributes null to the set.
            actualWords.add(chunkToActualWord.get(chunk));
          }
          if (actualWords.size() == 1) {
            BxWord actualWord = actualWords.iterator().next();
            // A lone null means none of the expected chunks was found in the actual page;
            // treat it as a mismatch instead of dereferencing null.
            if (actualWord != null
                && actualWord.childrenCount() == expectedWord.childrenCount()) {
              results.matched++;
            }
          }
          results.all++;
        }
      }
    }

    return results;
  }
 /**
  * Builds a "Zone" element for the given zone and appends it to {@code parent}.
  *
  * <p>The element receives, in this order: the "ZoneID" property (if the id is not null), the
  * "ZoneCorners" bounds, the "ZoneNext" property (if the next id is not null), a "ZoneInsets"
  * element with four empty attributes, an empty "ZoneLines" property, the classification derived
  * from the zone label (only when the label is not null), and one child per line of the zone. The
  * element is attached to {@code parent} only after all of this succeeded.
  *
  * @param doc the document used to create elements
  * @param parent the element the new "Zone" element is appended to
  * @param zone the zone to write
  * @param hints passed through unchanged to {@code appendBounds} and {@code appendLine}
  * @throws TransformationException if the zone has a label for which {@code ZONE_LABEL_MAP} holds
  *     no value or an empty value
  */
 private void appendZone(Document doc, Element parent, BxZone zone, Object... hints)
     throws TransformationException {
   Element node = doc.createElement("Zone");
   appendPropertyIfNotNull(doc, node, "ZoneID", zone.getId());
   appendBounds(doc, node, "ZoneCorners", zone.getBounds(), hints);
   appendPropertyIfNotNull(doc, node, "ZoneNext", zone.getNextId());
   Element insetsNode = doc.createElement("ZoneInsets");
   insetsNode.setAttribute("Top", "");
   insetsNode.setAttribute("Bottom", "");
   insetsNode.setAttribute("Left", "");
   insetsNode.setAttribute("Right", "");
   node.appendChild(insetsNode);
   appendProperty(doc, node, "ZoneLines", "");
   if (zone.getLabel() != null) {
     // Look the mapping up once instead of once per use.
     String mappedLabel = ZONE_LABEL_MAP.get(zone.getLabel());
     if (mappedLabel == null || mappedLabel.isEmpty()) {
       throw new TransformationException("Writing down an unknown zone label: " + zone.getLabel());
     }
     // Locale.ROOT keeps the written category independent of the JVM's default locale
     // (the no-argument toUpperCase() maps "i" to a dotted capital under e.g. a Turkish locale).
     appendClassification(doc, node, mappedLabel.toUpperCase(java.util.Locale.ROOT), "");
   }
   for (BxLine line : zone.getLines()) {
     appendLine(doc, node, line, hints);
   }
   parent.appendChild(node);
 }
  /**
   * Parses a page element: reads every "Zone" child, adds it to a new page, sorts the page's zones
   * and sets the page bounds to the smallest rectangle covering all zone bounds.
   *
   * <p>Zones are sorted by the y coordinate of their bounds, ties broken by the x coordinate. When
   * the element has no zone with bounds, the page bounds are set to {@code (0, 0, 0, 0)}.
   *
   * @param elem the page element whose "Zone" children are parsed
   * @return the value returned by {@code page.setBounds(...)} for the newly built page
   */
  private BxPage parsePageNode(Element elem) {
    BxPage page = new BxPage();

    double minX = 0, minY = 0, maxX = 0, maxY = 0;
    boolean started = false;

    List<Element> zoneElements = getChildren("Zone", elem);
    for (Element zoneElement : zoneElements) {
      BxZone zone = parseZoneNode(zoneElement);
      page.addZone(zone);

      BxBounds zoneBounds = zone.getBounds();
      if (zoneBounds == null) {
        // NOTE(review): parseZoneNode only sets bounds when a "ZoneCorners" child exists;
        // presumably the bounds stay null otherwise - such a zone cannot extend the page box.
        continue;
      }
      double left = zoneBounds.getX();
      double top = zoneBounds.getY();
      double right = zoneBounds.getX() + zoneBounds.getWidth();
      double bottom = zoneBounds.getY() + zoneBounds.getHeight();

      if (!started) {
        // The first zone with bounds seeds the bounding box.
        minX = left;
        minY = top;
        maxX = right;
        maxY = bottom;
        started = true;
        continue;
      }
      if (left < minX) {
        minX = left;
      }
      if (right > maxX) {
        maxX = right;
      }
      if (top < minY) {
        minY = top;
      }
      if (bottom > maxY) {
        maxY = bottom;
      }
    }

    Collections.sort(
        page.getZones(),
        new Comparator<BxZone>() {

          @Override
          public int compare(BxZone z1, BxZone z2) {
            BxBounds b1 = z1.getBounds();
            BxBounds b2 = z2.getBounds();
            if (b1 == null || b2 == null) {
              // Zones without bounds are ordered after all positioned zones.
              if (b1 == b2) {
                return 0;
              }
              return b1 == null ? 1 : -1;
            }
            int ret = Double.compare(b1.getY(), b2.getY());
            if (ret == 0) {
              ret = Double.compare(b1.getX(), b2.getX());
            }
            return ret;
          }
        });
    return page.setBounds(new BxBounds(minX, minY, maxX - minX, maxY - minY));
  }
 /**
  * Builds a zone from a zone element.
  *
  * <p>The label is taken from the first "Classification" child and the bounds from the first
  * "ZoneCorners" child; either is left unset when the element has no such child. Every "Line"
  * child is parsed and added to the zone in the order returned by {@code getChildren}.
  *
  * @param zoneE the zone element to read
  * @return the newly created zone
  */
 private BxZone parseZoneNode(Element zoneE) {
   BxZone parsedZone = new BxZone();

   List<Element> classifications = getChildren("Classification", zoneE);
   if (!classifications.isEmpty()) {
     parsedZone.setLabel(parseClassification(classifications.get(0)));
   }

   List<Element> cornerNodes = getChildren("ZoneCorners", zoneE);
   if (!cornerNodes.isEmpty()) {
     parsedZone.setBounds(parseElementContainingVertexes(cornerNodes.get(0)));
   }

   for (Element lineElement : getChildren("Line", zoneE)) {
     parsedZone.addLine(parseLineElement(lineElement));
   }
   return parsedZone;
 }
  /**
   * Picks header lines among the body zones of a document and groups the remaining body lines
   * under them.
   *
   * <p>Only zones labelled {@code BODY_CONTENT} or {@code GEN_BODY} are inspected; zones without a
   * label are skipped. The method proceeds in these steps:
   *
   * <ol>
   *   <li>Collect height, width, x position, positive distance to the previous line and font index
   *       of every body line into populations; lines that are first in their zone and look like a
   *       header become the initial candidates.
   *   <li>Drop candidates rejected by {@code shouldBeRemoved} or whose width z-score exceeds
   *       {@code candMaxLengthZScore}.
   *   <li>Record a font name whenever three distinct candidates share it and the font-index z-score
   *       of the earliest of the three (in list order) exceeds {@code outlFontZScore} in absolute
   *       value; then add every header-like body line using a recorded font as a candidate.
   *   <li>Filter again as in step 2, this time against {@code candMaxLengthZScore2}.
   *   <li>Drop candidates that have no similar candidate, or more than
   *       {@code maxSimilarLinesCount} of them, together with the candidates similar to them.
   *   <li>Cluster the remaining candidates in document order and keep only lines belonging to the
   *       first three distinct clusters encountered.
   *   <li>Walk the body lines in document order: a candidate starts a new header, any other line is
   *       attached as content to the most recent header. If content appears before any header, a
   *       placeholder header line with the text "--" is created for it.
   * </ol>
   *
   * <p>Finally {@code headerLinesCompletener.completeLines} is applied to the result.
   *
   * @param document the document to analyse
   * @return the content structure holding the detected header lines and their content lines
   * @throws AnalysisException declared by the signature; presumably propagated from the helpers
   *     called here
   */
  @Override
  public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {

    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    // Step 1: gather statistics and the initial candidates.
    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (!hasBodyLabel(zone)) {
          continue;
        }
        for (BxLine line : zone) {
          heightPopulation.addObservation(line.getHeight());
          lengthPopulation.addObservation(line.getWidth());
          indentationPopulation.addObservation(line.getX());
          if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
            distancePopulation.addObservation(line.getY() - line.getPrev().getY());
          }
          fontPopulation.addObservation(getFontIndex(line));

          if (isFirstInZone(line) && looksLikeHeader(line)) {
            candidates.add(line);
          }
        }
      }
    }

    // Step 2: first filtering pass.
    Set<BxLine> toDelete = new HashSet<BxLine>();

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    // Step 3: find fonts shared by candidate triples and widen the candidate set with them.
    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);

    for (int x = 0; x < candidatesList.size(); x++) {
      BxLine line1 = candidatesList.get(x);
      for (int y = x + 1; y < candidatesList.size(); y++) {
        BxLine line2 = candidatesList.get(y);
        for (int z = y + 1; z < candidatesList.size(); z++) {
          BxLine line3 = candidatesList.get(z);
          if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
            headerFonts.add(line1.getMostPopularFontName());
          }
        }
      }
    }

    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (!hasBodyLabel(zone)) {
          continue;
        }
        for (BxLine line : zone) {
          if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
            candidates.add(line);
          }
        }
      }
    }

    // Step 4: second filtering pass, with its own length threshold.
    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore2) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    // Step 5: a candidate with no similar candidate, or with too many, is dropped together
    // with everything similar to it.
    for (BxLine line : candidates) {
      int similarCount = 0;
      for (BxLine other : candidates) {
        if (line.equals(other)) {
          continue;
        }
        if (areSimilar(line, other)) {
          similarCount++;
        }
      }
      if (similarCount == 0 || similarCount > maxSimilarLinesCount) {
        toDelete.add(line);
        for (BxLine other : candidates) {
          if (areSimilar(line, other)) {
            toDelete.add(other);
          }
        }
      }
    }

    candidates.removeAll(toDelete);

    // Step 6: cluster the survivors in document order; only the first three distinct
    // clusters encountered are kept.
    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            candidatesList.add(line);
          }
        }
      }
    }
    int[] clusters = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
      int cluster = clusters[clusterIdx];
      if (keptClusters.size() < 3) {
        keptClusters.add(cluster);
      }
      if (!keptClusters.contains(cluster)) {
        candidates.remove(candidatesList.get(clusterIdx));
      }
    }

    // Step 7: assemble the structure - headers first, content lines under the latest header.
    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (!hasBodyLabel(zone)) {
          continue;
        }
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            contentStructure.addFirstHeaderLine(page, line);
            lastHeaderLine = line;
          } else {
            if (lastHeaderLine == null) {
              // Content before the first detected header gets a placeholder header.
              BxChunk chunk = new BxChunk(new BxBounds(), "--");
              BxWord word = new BxWord().addChunk(chunk);
              lastHeaderLine = new BxLine().addWord(word);
              contentStructure.addFirstHeaderLine(page, lastHeaderLine);
            }
            contentStructure.addContentLine(lastHeaderLine, line);
          }
        }
      }
    }

    headerLinesCompletener.completeLines(contentStructure);

    return contentStructure;
  }

  /**
   * Tells whether a zone is labelled {@code BODY_CONTENT} or {@code GEN_BODY}. The comparison is
   * made from the constant's side, so a zone without a label simply yields {@code false}.
   */
  private boolean hasBodyLabel(BxZone zone) {
    return BxZoneLabel.BODY_CONTENT.equals(zone.getLabel())
        || BxZoneLabel.GEN_BODY.equals(zone.getLabel());
  }