/**
  * Tells whether letters account for more than half of the characters in the line's text.
  *
  * @param line line whose text is inspected
  * @return {@code true} when twice the number of letter characters exceeds the text length
  */
 private boolean containsMostlyLetters(BxLine line) {
   String text = line.toText();
   int letters = 0;
   for (int idx = 0; idx < text.length(); idx++) {
     if (Character.isLetter(text.charAt(idx))) {
       letters++;
     }
   }
   return 2 * letters > text.length();
 }
 /**
  * Creates a {@code Line} element for the given line, fills it with the line's properties and
  * words, and attaches it to {@code parent}.
  *
  * <p>The {@code LineNumChars} property is always written with an empty value.
  *
  * @param doc document used to create the new element
  * @param parent element that receives the new {@code Line} child
  * @param line line being serialized
  * @param hints passed through unchanged to the bounds and word writers
  */
 private void appendLine(Document doc, Element parent, BxLine line, Object... hints) {
   final Element lineElement = doc.createElement("Line");

   appendPropertyIfNotNull(doc, lineElement, "LineID", line.getId());
   appendBounds(doc, lineElement, "LineCorners", line.getBounds(), hints);
   appendPropertyIfNotNull(doc, lineElement, "LineNext", line.getNextId());
   appendProperty(doc, lineElement, "LineNumChars", "");

   for (BxWord child : line.getWords()) {
     appendWord(doc, lineElement, child, hints);
   }

   // The element is attached only after it has been fully populated.
   parent.appendChild(lineElement);
 }
 /**
  * Builds a {@link BxLine} from a line element.
  *
  * <p>Bounds are taken from the first {@code LineCorners} child when one exists; every
  * {@code Word} child is parsed and added to the line in order.
  *
  * @param lineE element describing the line
  * @return the newly created line
  */
 private BxLine parseLineElement(Element lineE) {
   BxLine parsed = new BxLine();

   List<Element> corners = getChildren("LineCorners", lineE);
   if (!corners.isEmpty()) {
     parsed.setBounds(parseElementContainingVertexes(corners.get(0)));
   }

   for (Element wordElement : getChildren("Word", lineE)) {
     parsed.addWord(parseWordElement(wordElement));
   }
   return parsed;
 }
Esempio n. 4
0
 /**
  * Computes the horizontal distance between the right edge of the line's zone and the right
  * edge of the line itself (zone x + zone width - line x - line width).
  *
  * @param refLine line being evaluated
  * @param refs source of the zone that {@code refLine} is looked up in
  * @return the difference described above; may be negative
  */
 @Override
 public double calculateFeatureValue(BxLine refLine, BxDocumentBibReferences refs) {
   BxBounds zoneBounds = refs.getZone(refLine).getBounds();
   BxBounds lineBounds = refLine.getBounds();
   // Operands are kept in the original left-to-right order so rounding is unchanged.
   return zoneBounds.getX() + zoneBounds.getWidth() - lineBounds.getX() - lineBounds.getWidth();
 }
 /**
  * Computes the zone area left over after subtracting the summed area of all its chunks.
  *
  * @param zone zone whose lines, words and chunks are traversed
  * @param page not used by this computation
  * @return zone area minus total chunk area, or {@code 0.0} when that difference is negative
  */
 @Override
 public double calculateFeatureValue(BxZone zone, BxPage page) {
   double occupied = 0.0;
   for (BxLine zoneLine : zone.getLines()) {
     for (BxWord lineWord : zoneLine.getWords()) {
       for (BxChunk wordChunk : lineWord.getChunks()) {
         occupied += wordChunk.getArea();
       }
     }
   }
   double remaining = zone.getArea() - occupied;
   return remaining < 0 ? 0.0 : remaining;
 }
Esempio n. 6
0
  /**
   * Relates the width of the previous line to the mean width of all lines on the page.
   *
   * @param line line whose predecessor is measured
   * @param page page providing the lines used for the mean width
   * @return {@code 1.0} when the line has no predecessor; {@code 0} when the page has no lines
   *     or their widths sum to zero; otherwise previous-line width divided by the mean width
   */
  @Override
  public double calculateFeatureValue(BxLine line, BxPage page) {
    if (!line.hasPrev()) {
      return 1.0;
    }

    double widthSum = 0;
    int lineTotal = 0;
    for (BxZone pageZone : page.getZones()) {
      for (BxLine pageLine : pageZone.getLines()) {
        widthSum += pageLine.getBounds().getWidth();
        lineTotal++;
      }
    }

    // Guard against dividing by zero below.
    if (lineTotal == 0 || widthSum == 0) {
      return 0;
    }

    double meanWidth = widthSum / lineTotal;
    return line.getPrev().getBounds().getWidth() / meanWidth;
  }
  /**
   * Selects header lines among the body zones of a document and groups the remaining body
   * lines under them.
   *
   * <p>The method works in several passes over lines of zones labelled {@code BODY_CONTENT} or
   * {@code GEN_BODY}: it gathers per-line statistics, picks initial candidates, filters them,
   * widens the candidate set using fonts shared by several candidates, filters again, discards
   * candidates by similarity count and by cluster, and finally builds the content structure.
   *
   * @param document document whose pages, zones and lines are traversed
   * @return content structure with header lines and the content lines assigned to them
   * @throws AnalysisException declared by the signature; no statement in this method throws it
   *     directly, so it presumably originates from a callee (TODO confirm)
   */
  @Override
  public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {

    // Statistics over body lines; used later as reference distributions for z-scores.
    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    // Pass 1: collect statistics and the initial candidates (first line of a zone that looks
    // like a header).
    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            heightPopulation.addObservation(line.getHeight());
            lengthPopulation.addObservation(line.getWidth());
            indentationPopulation.addObservation(line.getX());
            // Only strictly positive vertical distances to the previous line are recorded.
            if (line.hasPrev() && line.getY() - line.getPrev().getY() > 0) {
              distancePopulation.addObservation(line.getY() - line.getPrev().getY());
            }
            fontPopulation.addObservation(getFontIndex(line));

            if (isFirstInZone(line) && looksLikeHeader(line)) {
              candidates.add(line);
            }
          }
        }
      }
    }

    // Removals are collected separately so the set is not modified while it is iterated.
    Set<BxLine> toDelete = new HashSet<BxLine>();

    // Pass 2: drop candidates rejected by shouldBeRemoved or whose width z-score exceeds
    // candMaxLengthZScore.
    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);

    // Pass 3: a font becomes a "header font" when some triple of distinct candidates shares it
    // and the font index of the first line of the triple has an absolute z-score above
    // outlFontZScore. Every triple is examined, so this is cubic in the number of candidates.
    for (int x = 0; x < candidatesList.size(); x++) {
      BxLine line1 = candidatesList.get(x);
      for (int y = x + 1; y < candidatesList.size(); y++) {
        BxLine line2 = candidatesList.get(y);
        for (int z = y + 1; z < candidatesList.size(); z++) {
          BxLine line3 = candidatesList.get(z);
          if (line1.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && line3.getMostPopularFontName().equals(line2.getMostPopularFontName())
              && Math.abs(fontPopulation.getZScore(getFontIndex(line1))) > outlFontZScore) {
            headerFonts.add(line1.getMostPopularFontName());
          }
        }
      }
    }

    // Pass 4: widen the candidate set with every body line (not only first-in-zone lines) that
    // looks like a header and uses one of the header fonts.
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (looksLikeHeader(line) && headerFonts.contains(line.getMostPopularFontName())) {
              candidates.add(line);
            }
          }
        }
      }
    }

    // Pass 5: same filtering as pass 2, but with the second width threshold
    // (candMaxLengthZScore2).
    for (BxLine line : candidates) {
      if (shouldBeRemoved(
          line, heightPopulation, fontPopulation, distancePopulation, indentationPopulation)) {
        toDelete.add(line);
      }
      if (lengthPopulation.getZScore(line.getWidth()) > candMaxLengthZScore2) {
        toDelete.add(line);
      }
    }

    candidates.removeAll(toDelete);
    toDelete.clear();

    // Pass 6: count, for each candidate, the other candidates similar to it. Candidates with no
    // similar peer, or with more than maxSimilarLinesCount of them, are dropped together with
    // every candidate similar to them.
    for (BxLine line : candidates) {
      int i = 0;
      for (BxLine line2 : candidates) {
        if (line.equals(line2)) {
          continue;
        }
        if (areSimilar(line, line2)) {
          i++;
        }
      }
      if (i == 0 || i > maxSimilarLinesCount) {
        toDelete.add(line);
        for (BxLine line2 : candidates) {
          if (areSimilar(line, line2)) {
            toDelete.add(line2);
          }
        }
      }
    }

    candidates.removeAll(toDelete);

    // Rebuild the candidate list in document traversal order (the HashSet has no defined order).
    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
      for (BxZone zone : page) {
        for (BxLine line : zone) {
          if (candidates.contains(line)) {
            candidatesList.add(line);
          }
        }
      }
    }
    // Pass 7: keep only candidates belonging to the first three distinct cluster ids encountered
    // in list order. clusters[k] is used as the cluster id of candidatesList.get(k).
    int clusters[] = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int clusterIdx = 0; clusterIdx < clusters.length; clusterIdx++) {
      int cluster = clusters[clusterIdx];
      if (keptClusters.size() < 3) {
        keptClusters.add(cluster);
      }
      if (!keptClusters.contains(cluster)) {
        candidates.remove(candidatesList.get(clusterIdx));
      }
    }

    // Pass 8: build the result. A candidate line opens a new header; any other body line is
    // attached as content to the most recently seen header.
    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
      for (BxZone zone : page) {
        if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
            || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
          for (BxLine line : zone) {
            if (candidates.contains(line)) {
              contentStructure.addFirstHeaderLine(page, line);
              lastHeaderLine = line;
            // NOTE(review): this condition repeats the enclosing zone-label check, so it is
            // always true here.
            } else if (zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                || zone.getLabel().equals(BxZoneLabel.GEN_BODY)) {
              // Content seen before any header gets a synthetic placeholder header with the
              // text "--" and empty bounds.
              if (lastHeaderLine == null) {
                BxChunk chunk = new BxChunk(new BxBounds(), "--");
                BxWord word = new BxWord().addChunk(chunk);
                lastHeaderLine = new BxLine().addWord(word);
                contentStructure.addFirstHeaderLine(page, lastHeaderLine);
              }
              contentStructure.addContentLine(lastHeaderLine, line);
            }
          }
        }
      }
    }

    // Post-processing of the structure is delegated to headerLinesCompletener.
    headerLinesCompletener.completeLines(contentStructure);

    return contentStructure;
  }
 /**
  * Tells whether two lines share their most popular font and have nearly the same height.
  *
  * @param line1 first line; its most popular font name must not be {@code null}
  * @param line2 second line
  * @return {@code true} when the font names are equal and the absolute height difference is
  *     below {@code maxHeightSimilarity}
  */
 private boolean areSimilar(BxLine line1, BxLine line2) {
   if (!line1.getMostPopularFontName().equals(line2.getMostPopularFontName())) {
     return false;
   }
   double heightGap = Math.abs(line1.getHeight() - line2.getHeight());
   return heightGap < maxHeightSimilarity;
 }
 /**
  * Decides whether a header candidate should be discarded.
  *
  * <p>A candidate is discarded when it has no most popular font, its height z-score is below
  * {@code candMinHeightZScore}, it looks like an equation, figure or table caption, it is not
  * mostly letters, it contains no word, it starts with a large number, it is not an outlier in
  * any of height, font, distance to the previous line and indentation, or none of the lines
  * following it within the allowed look-ahead starts with an upper-case ASCII letter.
  *
  * @param line candidate line
  * @param heightPopulation reference distribution of line heights
  * @param fontPopulation reference distribution of font indices
  * @param distancePopulation reference distribution of vertical distances to the previous line
  * @param indentationPopulation reference distribution of line x positions
  * @return {@code true} when the candidate should be removed
  */
 private boolean shouldBeRemoved(
     BxLine line,
     Population heightPopulation,
     Population fontPopulation,
     Population distancePopulation,
     Population indentationPopulation) {
   // Cheap content-based rejections, evaluated left to right with short-circuiting.
   if (line.getMostPopularFontName() == null
       || heightPopulation.getZScore(line.getHeight()) < candMinHeightZScore
       || looksLikeEquation(line)
       || looksLikeFigure(line)
       || looksLikeTable(line)
       || !containsMostlyLetters(line)
       || !containsWord(line)
       || startsWithLargeNumber(line)) {
     return true;
   }

   // A line that stands out in none of the measured dimensions is treated as ordinary text.
   boolean unremarkable =
       heightPopulation.getZScore(line.getHeight()) < outlHeightZScore
           && Math.abs(fontPopulation.getZScore(getFontIndex(line))) < outlFontZScore
           && (!line.hasPrev()
               || distancePopulation.getZScore(line.getY() - line.getPrev().getY())
                   < outlDistanceZScore)
           && Math.abs(indentationPopulation.getZScore(line.getX())) < outlIndentZScore;
   if (unremarkable) {
     return true;
   }

   // Walk forward until a line starting with an upper-case ASCII letter is found; give up once
   // maxHeaderLineCount non-matching lines have already been skipped.
   int skipped = 0;
   for (BxLine following = line; following.hasNext(); ) {
     following = following.getNext();
     if (following.toText().matches("[A-Z].*")) {
       return false;
     }
     if (skipped == maxHeaderLineCount) {
       return true;
     }
     skipped++;
   }
   return false;
 }
Esempio n. 10
0
 /**
  * Computes the horizontal offset of the line relative to the page.
  *
  * @param object line whose bounds supply the x coordinate
  * @param context page whose x coordinate is subtracted
  * @return line x minus page x
  */
 @Override
 public double calculateFeatureValue(BxLine object, BxPage context) {
   BxBounds lineBounds = object.getBounds();
   return lineBounds.getX() - context.getX();
 }
 /**
  * Tells whether the line's text begins with at least two ASCII digits.
  *
  * @param line line whose text is tested
  * @return {@code true} when the whole text matches two leading digits followed by anything
  */
 private boolean startsWithLargeNumber(BxLine line) {
   String text = line.toText();
   return text.matches("[0-9][0-9].*");
 }
 /**
  * Tells whether the line's text, compared case-insensitively, starts with "table " .
  *
  * @param line line whose text is tested
  * @return {@code true} when the lower-cased text matches {@code table .*}
  */
 private boolean looksLikeTable(BxLine line) {
   String lowered = line.toText().toLowerCase();
   return lowered.matches("table .*");
 }
Esempio n. 13
0
  /**
   * Produces cleaned header text and paragraph texts for every part of the content structure.
   *
   * <p>Header lines are joined with single spaces; a line ending with "-" loses the hyphen and
   * is joined to the next line without a space. Content lines are split into paragraphs with a
   * scoring heuristic and joined with newlines; for a line ending with "-", the hyphen is
   * dropped and the line break is moved in front of the line's last word so that the word
   * continues directly into the next line. All resulting texts are trimmed and passed through
   * {@code cleanLigatures}.
   *
   * @param contentStructure structure whose parts are updated in place through
   *     {@code setCleanHeaderText} and {@code setCleanContentTexts}
   */
  public void cleanupContent(BxDocContentStructure contentStructure) {
    for (BxDocContentPart contentPart : contentStructure.getParts()) {
      StringBuilder headerText = new StringBuilder();
      for (BxLine headerLine : contentPart.getHeaderLines()) {
        String lineText = headerLine.toText();
        if (lineText.endsWith("-")) {
          // Drop the trailing hyphen and add no separator, so the next line continues the word.
          headerText.append(lineText, 0, lineText.length() - 1);
        } else {
          headerText.append(lineText).append(" ");
        }
      }
      contentPart.setCleanHeaderText(cleanLigatures(headerText.toString().trim()));

      List<BxLine> contentLines = contentPart.getContentLines();
      List<String> contentTexts = new ArrayList<String>();

      // Width of the widest content line; the paragraph heuristics scale with it.
      double maxLen = Double.NEGATIVE_INFINITY;
      for (BxLine line : contentLines) {
        if (line.getWidth() > maxLen) {
          maxLen = line.getWidth();
        }
      }

      // Accumulates the text of the paragraph currently being built.
      StringBuilder paragraph = new StringBuilder();
      for (BxLine line : contentLines) {
        String lineText = line.toText();
        BxLine prev = line.getPrev();
        BxLine next = line.getNext();

        // Each satisfied hint that this line starts a paragraph adds one point.
        int score = 0;
        if (lineText.matches("^[A-Z].*$")) {
          score++;
        }
        if (prev != null) {
          // Indented to the right of the previous line, within the allowed range.
          if (line.getX() > prev.getX()
              && line.getX() - prev.getX() < paragraphLineIndentMultiplier * maxLen
              && line.getX() - prev.getX() > minParagraphIndent) {
            score++;
          }
          // Previous line is short relative to the widest line.
          if (prev.getWidth() < lastParagraphLineLengthMult * maxLen) {
            score++;
          }
          if (prev.toText().endsWith(".")) {
            score++;
          }
        }
        // Indented to the right of the next line, within the allowed range.
        if (next != null
            && line.getX() > next.getX()
            && line.getX() - next.getX() < paragraphLineIndentMultiplier * maxLen
            && line.getX() - next.getX() > minParagraphIndent) {
          score++;
        }

        // Enough hints: flush the paragraph collected so far and start a new one.
        if (score >= firstParagraphLineMinScore) {
          if (paragraph.length() > 0) {
            contentTexts.add(cleanLigatures(paragraph.toString().trim()));
          }
          paragraph.setLength(0);
        }

        if (lineText.endsWith("-")) {
          String stem = lineText.substring(0, lineText.length() - 1);
          int lastSpace = stem.lastIndexOf(' ');
          if (lastSpace < 0) {
            paragraph.append(stem);
          } else {
            // Replace the last space with the line break so the split word stays in one piece.
            paragraph
                .append(stem, 0, lastSpace)
                .append("\n")
                .append(stem, lastSpace + 1, stem.length());
          }
        } else {
          paragraph.append(lineText).append("\n");
        }
      }
      if (paragraph.length() > 0) {
        contentTexts.add(cleanLigatures(paragraph.toString().trim()));
      }

      contentPart.setCleanContentTexts(contentTexts);
    }
  }
 /**
  * Tells whether the line's text, compared case-insensitively, starts like a figure caption
  * ("fig ", "fig. " or "figure ").
  *
  * @param line line whose text is tested
  * @return {@code true} when the lower-cased text matches one of the caption prefixes
  */
 private boolean looksLikeFigure(BxLine line) {
   // Locale.ROOT keeps the result independent of the JVM default locale; under a Turkish
   // locale "FIGURE" would otherwise lower-case to "fıgure" (dotless i) and never match.
   String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
   return lowered.matches("fig\\.? .*") || lowered.matches("figure .*");
 }
 /**
  * Tells whether the line's text contains an equals sign.
  *
  * @param line line whose text is tested
  * @return {@code true} when {@code '='} occurs anywhere in the text
  */
 private boolean looksLikeEquation(BxLine line) {
   return line.toText().indexOf('=') >= 0;
 }
 /**
  * Tells whether the line's text has one of the shapes accepted for a header: it starts with
  * an upper-case ASCII letter, or it starts with a digit 1-9 or with "a)".."h)" and contains
  * an ASCII letter later on.
  *
  * @param line line whose text is tested
  * @return {@code true} when the text matches any of the three patterns
  */
 private boolean looksLikeHeader(BxLine line) {
   String text = line.toText();
   if (text.matches("^[A-Z].*")) {
     return true;
   }
   if (text.matches("^[1-9].*[a-zA-Z].*")) {
     return true;
   }
   return text.matches("^[a-h]\\).*[a-zA-Z].*");
 }
 /**
  * Tells whether the line is the first one of its parent.
  *
  * @param line line to test
  * @return {@code true} when the line has no predecessor or its predecessor has a different
  *     parent object (compared by reference)
  */
 private boolean isFirstInZone(BxLine line) {
   if (!line.hasPrev()) {
     return true;
   }
   return line.getPrev().getParent() != line.getParent();
 }
 /**
  * Returns the position of the line's most popular font in the sorted list of font names
  * obtained from the line's great-grandparent.
  *
  * @param line line whose font is looked up
  * @return zero-based index in the sorted list, or {@code -1} when the name is not in it
  */
 private double getFontIndex(BxLine line) {
   String lineFont = line.getMostPopularFontName();
   List<String> sortedFonts =
       Lists.newArrayList(line.getParent().getParent().getParent().getFontNames());
   // Sorting a copy gives every font name a stable, order-independent index.
   Collections.sort(sortedFonts);
   int position = sortedFonts.indexOf(lineFont);
   return position;
 }
 /**
  * Tells whether the line's text contains a run of at least four consecutive ASCII letters,
  * ignoring case.
  *
  * @param line line whose text is tested
  * @return {@code true} when such a run exists
  */
 private boolean containsWord(BxLine line) {
   // Locale.ROOT keeps the result independent of the JVM default locale; under a Turkish
   // locale 'I' would otherwise lower-case to dotless 'ı', which [a-z] does not match.
   String lowered = line.toText().toLowerCase(java.util.Locale.ROOT);
   return lowered.matches(".*[a-z][a-z][a-z][a-z].*");
 }
Esempio n. 20
0
 /**
  * Flags lines that look like a lettered list item, e.g. {@code "a) Text"}.
  *
  * @param line line whose text is tested
  * @param page not used by this computation
  * @return {@code 1} when the text is a lower-case ASCII letter, ")", a space and then an
  *     upper-case ASCII letter followed by anything; {@code 0} otherwise
  */
 @Override
 public double calculateFeatureValue(BxLine line, BxPage page) {
   boolean letteredItem = line.toText().matches("^[a-z]\\) [A-Z].*$");
   if (letteredItem) {
     return 1;
   }
   return 0;
 }