コード例 #1
0
 public List<BxPage> read(Reader reader, Object... hints) throws TransformationException {
   List<BxPage> pages = new ArrayList<BxPage>();
   try {
     pages.add(importSource(new InputSource(reader)));
   } catch (IOException ex) {
     throw new TransformationException(ex);
   } catch (ParserConfigurationException ex) {
     throw new TransformationException(ex);
   } catch (SAXException ex) {
     throw new TransformationException(ex);
   }
   return pages;
 }
コード例 #2
0
  @Override
  public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {

    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            heightPopulation.addObservation(line.getHeight());
            lengthPopulation.addObservation(line.getWidth());
            indentationPopulation.addObservation(line.getX());
            if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
              distancePopulation.addObservation(line.getY() - line.getPrev().getY());
            }
            fontPopulation.addObservation(getFontIndex(line));

            if (isFirstInZone(line) && looksLikeHeader(line)) {
              candidates.add(line);
            }
          }
        }
      }
    }

    Set<BxLine> toDelete = new HashSet<BxLine>();

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);

    for (int x = 0; x < candidatesList.size(); x++) {
      BxLine line1 = candidatesList.get(x);
      for (int y = x + 1; y < candidatesList.size(); y++) {
        BxLine line2 = candidatesList.get(y);
        for (int z = y + 1; z < candidatesList.size(); z++) {
          BxLine line3 = candidatesList.get(z);
          if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
            headerFonts.add(line1.getMostPopularFontName());
          }
        }
      }
    }

    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
              candidates.add(line);
            }
          }
        }
      }
    }

    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore2) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    for (BxLine line : candidates) {
      int i = 0;
      for (BxLine line2 : candidates) {
        if (line.equals(line2)) {
          continue;
        }
        if (areSimilar(line, line2)) {
          i++;
        }
      }
      if (i == 0 || i > maxSimilarLinesCount) {
        toDelete.add(line);
        for (BxLine line2 : candidates) {
          if (areSimilar(line, line2)) {
            toDelete.add(line2);
          }
        }
      }
    }

    candidates.removeAll(toDelete);

    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            candidatesList.add(line);
          }
        }
      }
    }
    int clusters[] = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
      int cluster = clusters[clusterIdx];
      if (keptClusters.size() < 3) {
        keptClusters.add(cluster);
      }
      if (!keptClusters.contains(cluster)) {
        candidates.remove(candidatesList.get(clusterIdx));
      }
    }

    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (candidates.contains(line)) {
              contentStructure.addFirstHeaderLine(page, line);
              lastHeaderLine = line;
            } else if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
              if (lastHeaderLine == null) {
                BxChunk chunk = new BxChunk(new BxBounds(), "--");
                BxWord word = new BxWord().addChunk(chunk);
                lastHeaderLine = new BxLine().addWord(word);
                contentStructure.addFirstHeaderLine(page, lastHeaderLine);
              }
              contentStructure.addContentLine(lastHeaderLine, line);
            }
          }
        }
      }
    }

    headerLinesCompletener.completeLines(contentStructure);

    return contentStructure;
  }
コード例 #3
0
 private double getFontIndex(BxLine line) {
   List<String> fonts =
       Lists.newArrayList(line.getParent().getParent().getParent().getFontNames());
   Collections.sort(fonts);
   return fonts.indexOf(line.getMostPopularFontName());
 }