Java Block 예제들

프로그래밍 언어: Java

네임스페이스/패키지 이름: org.grobid.core.layout

클래스/타입: Block

hotexamples.com에서의 예제들: 6

Java Block - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Java의 org.grobid.core.layout.Block에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

setText(3)

getText(3)

getStartToken(2)

getWidth(2)

getBold(2)

getFont(2)

getFontSize(2)

getY(2)

setPage(2)

setItalic(1)

setNbTokens(1)

addToken(1)

setFontSize(1)

setStartToken(1)

setWidth(1)

setX(1)

setHeight(1)

setBold(1)

setFont(1)

setEndToken(1)

setColorFont(1)

getX(1)

getTokens(1)

getItalic(1)

getHeight(1)

getEndToken(1)

getColorFont(1)

setY(1)

예제 #1

파일 보기

파일: BasicStructureBuilder.java 프로젝트: Immortalin/grobid

  /**
   * Cluster the blocks following the font, style and size aspects
   *
   * <p>-> not used at this stage, but could be an interesting feature in the full text model in the
   * future
   *
   * @param b integer
   * @param doc a document
   */
  private static void addBlockToCluster(Integer b, Document doc) {
    // get block features
    Block block = doc.getBlocks().get(b);
    String font = block.getFont();
    boolean bold = block.getBold();
    boolean italic = block.getItalic();
    double fontSize = block.getFontSize();
    boolean found = false;

    if (font == null) {
      font = "unknown";
    }
    // System.out.println(font + " " + bold + " " + italic + " " + fontSize );

    if (doc.getClusters() == null) {
      doc.setClusters(new ArrayList<Cluster>());
    } else {
      for (Cluster cluster : doc.getClusters()) {
        String font2 = cluster.getFont();
        if (font2 == null) font2 = "unknown";
        if (font.equals(font2)
            && (bold == cluster.getBold())
                & (italic == cluster.getItalic())
                & (fontSize == cluster.getFontSize())) {
          cluster.addBlock2(b);
          found = true;
        }
      }
    }

    if (!found) {
      Cluster cluster = new Cluster();
      cluster.setFont(font);
      cluster.setBold(bold);
      cluster.setItalic(italic);
      cluster.setFontSize(fontSize);
      cluster.addBlock2(b);
      doc.getClusters().add(cluster);
    }
  }

예제 #2

파일 보기

파일: BasicStructureBuilder.java 프로젝트: Immortalin/grobid

  /**
   * Filter out line numbering possibly present in the document. This can be frequent for document
   * in a review/submission format and degrades strongly the machine learning extraction results.
   *
   * <p>-> Not used !
   *
   * @param doc a document
   * @return if found numbering
   */
  public boolean filterLineNumber(Document doc) {
    // we first test if we have a line numbering by checking if we have an increasing integer
    // at the begin or the end of each block
    boolean numberBeginLine = false;
    boolean numberEndLine = false;

    boolean foundNumbering = false;

    int currentNumber = -1;
    int lastNumber = -1;
    int i = 0;
    for (Block block : doc.getBlocks()) {
      //            Integer ii = i;

      String localText = block.getText();
      List<LayoutToken> tokens = block.tokens;

      if ((localText != null) && (tokens != null)) {
        if (tokens.size() > 0) {
          // we get the first and last token iof the block
          // String tok1 = tokens.get(0).getText();
          // String tok2 = tokens.get(tokens.size()).getText();
          localText = localText.trim();

          Matcher ma1 = startNum.matcher(localText);
          Matcher ma2 = endNum.matcher(localText);

          if (ma1.find()) {
            String groupStr = ma1.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberBeginLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          } else if (ma2.find()) {
            String groupStr = ma2.group(0);
            try {
              currentNumber = Integer.parseInt(groupStr);
              numberEndLine = true;
            } catch (NumberFormatException e) {
              currentNumber = -1;
            }
          }

          if (lastNumber != -1) {
            if (currentNumber == lastNumber + 1) {
              foundNumbering = true;
              break;
            }
          } else lastNumber = currentNumber;
        }
      }
      i++;

      if (i > 5) {
        break;
      }
    }

    i = 0;
    if (foundNumbering) {
      // we have a line numbering, so we filter them
      int counter = 1; // we start at 1, if the actual start is 0,
      // it will remain (as it is negligeable)

      for (Block block : doc.getBlocks()) {

        String localText = block.getText();
        List<LayoutToken> tokens = block.tokens;

        if ((localText != null) && (tokens.size() > 0)) {

          if (numberEndLine) {
            Matcher ma2 = endNum.matcher(localText);

            if (ma2.find()) {
              String groupStr = ma2.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(0, localText.length() - groupStr.length());
                block.setText(localText);
                tokens.remove(tokens.size() - 1);
                counter++;
              }
            }

          } else if (numberBeginLine) {
            Matcher ma1 = endNum.matcher(localText);

            if (ma1.find()) {
              String groupStr = ma1.group(0);
              if (groupStr.trim().equals("" + counter)) {
                localText = localText.substring(groupStr.length(), localText.length() - 1);
                block.setText(localText);
                tokens.remove(0);
                counter++;
              }
            }
          }
        }
        i++;
      }
    }

    return foundNumbering;
  }

예제 #3

파일 보기

파일: BasicStructureBuilder.java 프로젝트: Immortalin/grobid

  public static Document generalResultSegmentation(
      Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    /*try {
          	FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult);
    	FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString());
    }
    catch(Exception e) {
    	e.printStackTrace();
    }*/

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0;
    int blockIndex = 0;
    int p = 0; // position in the labeled result
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
      int endToken = docBlocks.get(i).getEndToken();
      if (endToken != -1) {
        lastTokenInd = endToken;
        break;
      }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
        Iterables.concat(
            labeledTokens,
            Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
      if (labeledTokenPair == null) {
        p++;
        continue;
      }

      // as we process the document segmentation line by line, we don't use the usual
      // tokenization to rebuild the text flow, but we get each line again from the
      // text stored in the document blocks (similarly as when generating the features)
      line = null;
      while ((line == null) && (blockIndex < docBlocks.size())) {
        Block block = docBlocks.get(blockIndex);
        List<LayoutToken> tokens = block.getTokens();
        String localText = block.getText();
        if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        }
        String[] lines = localText.split("[\\n\\r]");
        if ((lines.length == 0) || (indexLine >= lines.length)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        } else {
          line = lines[indexLine];
          indexLine++;
          if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
            line = null;
            continue;
          }

          if (currentLineStartPos > lastTokenInd) continue;

          // adjust the start token position in documentTokens to this non trivial line
          // first skip possible space characters and tabs at the beginning of the line
          while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                  || documentTokens.get(currentLineStartPos).t().equals("\t"))
              && (currentLineStartPos != lastTokenInd)) {
            currentLineStartPos++;
          }
          if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
            while (currentLineStartPos < block.getEndToken()) {
              if (documentTokens.get(currentLineStartPos).t().equals("\n")
                  || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                // move to the start of the next line, but ignore space characters and tabs
                currentLineStartPos++;
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                        || documentTokens.get(currentLineStartPos).t().equals("\t"))
                    && (currentLineStartPos != lastTokenInd)) {
                  currentLineStartPos++;
                }
                if ((currentLineStartPos != lastTokenInd)
                    && labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                  break;
                }
              }
              currentLineStartPos++;
            }
          }

          // what is then the position of the last token of this line?
          currentLineEndPos = currentLineStartPos;
          while (currentLineEndPos < block.getEndToken()) {
            if (documentTokens.get(currentLineEndPos).t().equals("\n")
                || documentTokens.get(currentLineEndPos).t().equals("\r")) {
              currentLineEndPos--;
              break;
            }
            currentLineEndPos++;
          }
        }
      }
      curLabel = labeledTokenPair.b;
      curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

      /*System.out.println("-------------------------------");
      System.out.println("block: " + blockIndex);
      System.out.println("line: " + line);
      System.out.println("token: " + labeledTokenPair.a);
      System.out.println("curPlainLabel: " + curPlainLabel);
      System.out.println("lastPlainLabel: " + lastPlainLabel);
      if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1))
      	System.out.println("currentLineStartPos: " + currentLineStartPos +
      								" (" + documentTokens.get(currentLineStartPos) + ")");
      if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1))
      	System.out.println("currentLineEndPos: " + currentLineEndPos +
      								" (" + documentTokens.get(currentLineEndPos) + ")");*/

      if (blockIndex == docBlocks.size()) {
        break;
      }

      currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

      // either a new entity starts or a new beginning of the same type of entity
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
        }
        pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        // System.out.println("add segment for: " + lastPlainLabel + ", until " +
        // (currentLineStartPos-2));
      }

      // updating stuff for next iteration
      lastPlainLabel = curPlainLabel;
      lastPointer = currentPointer;
      currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
      p++;
    }

    if (blockIndex == docBlocks.size()) {
      // the last labelled piece has still to be added
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
          // System.out.println("add segment for: " + lastPlainLabel + ", until " +
          // (currentLineStartPos-2));
        }
      }
    }

    return doc;
  }

예제 #4

파일 보기

파일: BasicStructureBuilder.java 프로젝트: Immortalin/grobid

  /**
   * First pass to detect basic structures: remove page header/footer, identify section numbering,
   * identify Figure and table blocks.
   *
   * <p>-> to be removed at some point!
   *
   * @param doc a document
   */
  public static void firstPass(Document doc) {
    if (doc == null) {
      throw new NullPointerException();
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException();
    }

    int i = 0;
    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
    List<Integer> blockTables = new ArrayList<Integer>();
    List<Integer> blockFigures = new ArrayList<Integer>();
    List<Integer> blockHeadTables = new ArrayList<Integer>();
    List<Integer> blockHeadFigures = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();

    doc.setTitleMatchNum(false);
    try {
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
        Matcher ma2 = BasicStructureBuilder.references.matcher(localText);

        if ((ma1.find()) || (ma2.find())) {
          if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
              || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
              || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
          // System.out.println("Title section identified: block " + i + ", " + localText);
          blockSectionTitles.add(i);
        } else {
          StringTokenizer st = new StringTokenizer(localText, "\n");
          while (st.hasMoreTokens()) {
            String token = st.nextToken();

            if (token.startsWith("@PAGE")) {
              // current block should give the header/footors
              if (i > 4) {
                if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                  Integer i2 = i - 5;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 3) {
                if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                  Integer i2 = i - 4;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 2) {
                if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                  Integer i2 = i - 3;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 1) {
                if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                  Integer i2 = i - 2;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              if (i > 0) {
                if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                  Integer i2 = i - 1;
                  if (!blockFooters.contains(i2)) blockFooters.add(i2);
                }
              }
              blockFooters.add(i);

              // page header candidates
              blockHeaders.add(i);
              if (i < doc.getBlocks().size() - 1) {
                if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                  Integer i2 = i + 1;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
                }
              }
              if (i < doc.getBlocks().size() - 2) {
                if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                  Integer i2 = i + 2;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
                }
              }
              if (i < doc.getBlocks().size() - 3) {
                if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                  Integer i2 = i + 3;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
                }
              }
              if (i < doc.getBlocks().size() - 4) {
                if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                  Integer i2 = i + 4;
                  if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
                }
              }
              // more ??
            }
          }
        }

        // clustering of blocks per font (for section header and figure/table detections)
        addBlockToCluster(i, doc);

        i++;
      }

      // try to find the cluster of section titles
      Cluster candidateCluster = null;
      // System.out.println("nb clusters: " + clusters.size());
      for (Cluster cluster : doc.getClusters()) {
        if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
            && (cluster.getNbBlocks() < 20)) {
          List<Integer> blo = cluster.getBlocks2();
          for (Integer b : blo) {
            if (blockSectionTitles.contains(b)) {
              if (candidateCluster == null) {
                candidateCluster = cluster;
                break;
              }
              // else if (cluster.getFontSize() >= candidateCluster.getFontSize())
              //	candidateCluster = cluster;
            }
          }
        }
      }
      if (candidateCluster != null) {
        List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
        for (Integer bl : blockSectionTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }

        List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
        if (blockClusterTitles.size() < 20) {
          for (Integer bl : blockClusterTitles) {
            if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
          }
        }

        blockSectionTitles = newBlockSectionTitles;
      }

      // aknowledgement section recognition
      boolean ackn = false;
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // System.out.println(i + ": " + localText+"\n");

        Integer iii = i;
        Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
        if ((m3.find()) && (blockSectionTitles.contains(iii))) {
          acknowledgementBlocks.add(iii);
          ackn = true;
          // int index = blockSectionTitles.indexOf(iii);
          // blockSectionTitles.remove(index);
        } else if ((ackn) && (blockSectionTitles.contains(iii))) {
          ackn = false;
          break;
        } else if (ackn) {
          Matcher m4 = BasicStructureBuilder.references.matcher(localText);
          if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
            acknowledgementBlocks.add(iii);
          } else if (m4.find()) {
            ackn = false;
            break;
          }
        }
        i++;
      }

      // we remove references headers in blockSectionTitles
      int index = -1;
      for (Integer ii : blockSectionTitles) {
        Block block = doc.getBlocks().get(ii);
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if (m4.find()) {
          index = blockSectionTitles.indexOf(ii);
          break;
        }
      }
      if (index != -1) {
        blockSectionTitles.remove(index);
      }

      // we check headers repetition from page to page to decide if it is an header or not
      ArrayList<Integer> toRemove = new ArrayList<Integer>();
      for (Integer ii : blockHeaders) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("header candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockHeaders) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                // System.out.println("dist with " + localText2 + " : " + dist);
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockHeaders.remove(ii);
      }

      // same for footers
      toRemove = new ArrayList<Integer>();
      for (Integer ii : blockFooters) {
        String localText = (doc.getBlocks().get(ii)).getText().trim();
        localText = TextUtilities.shadowNumbers(localText);
        int length = localText.length();
        if (length > 160) toRemove.add(ii);
        else {
          // System.out.println("footer candidate: " + localText);
          // evaluate distance with other potential headers
          boolean valid = false;
          for (Integer ii2 : blockFooters) {
            if (ii.intValue() != ii2.intValue()) {
              String localText2 = doc.getBlocks().get(ii2).getText().trim();
              if (localText2.length() < 160) {
                localText2 = TextUtilities.shadowNumbers(localText2);
                double dist =
                    (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
                if (dist < 0.25) {
                  valid = true;
                  break;
                }
              }
            }
          }
          if (!valid) {
            toRemove.add(ii);
          }
        }
      }

      for (Integer ii : toRemove) {
        blockFooters.remove(ii);
      }

      // a special step for added banner repositoryies such HAL
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        // HAL
        if (localText.startsWith("Author manuscript, published in")) {
          Double y = block.getY();
          // System.out.println("HAL banner found, " + "block " + i + ", y = " + y);
          if (Math.abs(y - 12.538) < 2) { // reference position
            // blockHeaders.add(new Integer(i));
            blockDocumentHeaders.add(i);
            // System.out.println("HAL banner added as header block");
            break;
          }
        }

        // ACM publications
        // System.out.println("test ACM " + i);
        // System.out.println(localText);
        if (localText.startsWith("Permission to make digital or hard copies")) {
          blockFooters.add(i);
          break;
        }

        // arXiv, etc. put here
        // IOP

        if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
          blockDocumentHeaders.add(i);
          // System.out.println("IOP banner added as header block");
          break;
        }
        i++;
      }

      // we try to recognize here table and figure blocks
      // the idea is that the textual elements are not located as the normal text blocks
      // this is recognized by exploiting the cluster of blocks starting up and down front the block
      // containing a table or a figure marker
      // two different runs, one for figures and one for tables (everything could be done in one
      // step)
      i = 0;
      for (Block block : doc.getBlocks()) {
        String localText = block.getText().trim();
        localText = localText.replace("\n", " ");
        localText = localText.replace("  ", " ");
        localText = localText.trim();

        Matcher m = BasicStructureBuilder.figure.matcher(localText);
        Matcher m2 = BasicStructureBuilder.table.matcher(localText);

        double width = block.getWidth();
        boolean bold = block.getBold();

        // table
        // if ( (m2.find()) && (localText.length() < 200) ) {
        if ((m2.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadTables.contains(i)) {
            blockHeadTables.add(i);
          }
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().length() < 160) || (width < 50)) {
                if ((!blockTables.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockTables.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        // figure
        // else if ( (m.find()) && (localText.length() < 200) ) {
        else if ((m.find()) && ((bold) || (localText.length() < 200))) {
          if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
          // we also put all the small blocks before and after the marker
          int j = i - 1;
          boolean imageFound = false;
          while ((j > i - 15) && (j > 0)) {
            Block b = doc.getBlocks().get(j);

            if (b.getText() != null) {
              String localText2 = b.getText().trim();
              // localText = localText.replace("\n", " ");
              localText2 = localText2.replace("  ", " ");
              localText2 = localText2.trim();

              if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
                // System.out.println(localText2);
                block.setText(block.getText() + " " + localText2);
                // System.out.println(block.getText());
                imageFound = true;
              }

              if ((localText2.length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = 0;
            }
            j--;
          }

          j = i + 1;
          while ((j < i + 15) && (j < doc.getBlocks().size())) {
            Block b = doc.getBlocks().get(j);
            if (b.getText() != null) {
              if ((b.getText().trim().length() < 160) || (width < 50)) {
                if ((!blockFigures.contains(j))
                    && (!blockSectionTitles.contains(j))
                    && (!blockHeaders.contains(j))
                    && (!blockFooters.contains(j))) blockFigures.add(j);
              } else j = doc.getBlocks().size();
            }
            j++;
          }
        }
        i++;
      }
    } finally {
      doc.setBlockHeaders(blockHeaders);
      doc.setBlockFooters(blockFooters);
      doc.setBlockSectionTitles(blockSectionTitles);
      doc.setAcknowledgementBlocks(acknowledgementBlocks);
      doc.setBlockTables(blockTables);
      doc.setBlockFigures(blockFigures);
      doc.setBlockHeadTables(blockHeadTables);
      doc.setBlockHeadFigures(blockHeadFigures);
      doc.setBlockDocumentHeaders(blockDocumentHeaders);
    }
  }

예제 #5

파일 보기

파일: PDF2XMLSaxParser.java 프로젝트: BigBoss21X/grobid

  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("PAGE")) {
      int length = atts.getLength();
      currentPage++;

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {;
          } else if (name.equals("number")) {

          } else if (name.equals("width")) {

          } else if (name.equals("height")) {

          }
        }
      }

      /*
       * if (block != null) { blabla.append("\n");
       * tokenizations.add("\n"); block.setText(blabla.toString());
       * block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0
       * = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0);
       * doc.addBlock(block0);
       */
      /*
       * block = new Block(); blabla = new StringBuffer(); nbTokens = 0;
       * //blabla.append("\n@block\n"); tokenizations.add("\n");
       */
    } else if (qName.equals("BLOCK")) {
      block = new Block();
      blabla = new StringBuffer();
      nbTokens = 0;
      block.setPage(currentPage);
      // blabla.append("\n@block\n");
    } else if (qName.equals("IMAGE")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("href")) {
            // if (images == null)
            // images = new ArrayList<String>();
            images.add(value);
          } else if (name.equals("x")) {
            double x = Double.parseDouble(value);
            if (x != currentX) {
              currentX = x;
            }
          } else if (name.equals("y")) {
            double y = Double.parseDouble(value);
            if (y != currentY) {
              currentY = y;
            }
          } else if (name.equals("width")) {
            double width = Double.parseDouble(value);
            if (width != currentWidth) {
              currentWidth = width;
            }
          } else if (name.equals("height")) {
            double height = Double.parseDouble(value);
            if (height != currentHeight) {
              currentHeight = height;
            }
          }
        }
      }

    } else if (qName.equals("TEXT")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {

          } else if (name.equals("x")) {

          } else if (name.equals("y")) {

          } else if (name.equals("width")) {

          } else if (name.equals("height")) {

          }
        }
      }
    } else if (qName.equals("TOKEN")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {;
          } else if (name.equals("font-name")) {
            if (!value.equals(currentFont)) {
              currentFont = value;
              blabla.append(" ");
            }
          } else if (name.equals("font-size")) {
            double fontSize = Double.parseDouble(value);
            if (fontSize != currentFontSize) {
              currentFontSize = fontSize;

              blabla.append(" ");
            }
          } else if (name.equals("bold")) {
            if (value.equals("yes")) {
              currentBold = true;
            } else {
              currentBold = false;
            }
          } else if (name.equals("italic")) {
            if (value.equals("yes")) {
              currentItalic = true;
            } else {
              currentItalic = false;
            }
          } else if (name.equals("font-color")) {
            if (!value.equals(colorFont)) {
              colorFont = value;
            }
          } else if (name.equals("rotation")) {
            if (value.equals("0")) currentRotation = false;
            else currentRotation = true;
          } else if (name.equals("x")) {
            double x = Double.parseDouble(value);
            if (x != currentX) {
              currentX = x;
            }
          } else if (name.equals("y")) {
            double y = Double.parseDouble(value);
            if (y != currentY) {
              currentY = y;
            }
          } else if (name.equals("base")) {
            double base = Double.parseDouble(value);

          } else if (name.equals("width")) {
            double width = Double.parseDouble(value);
            if (width != currentWidth) {
              currentWidth = width;
            }
          } else if (name.equals("height")) {
            double height = Double.parseDouble(value);
            if (height != currentHeight) {
              currentHeight = height;
            }
          }
        }
      }
    } else if (qName.equals("xi:include")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("href")) {
            // if (images == null)
            // images = new ArrayList<String>();
            images.add(value);
          }
        }
      }
    }
    // accumulator.setLength(0);
  }

예제 #6

파일 보기

파일: PDF2XMLSaxParser.java 프로젝트: BigBoss21X/grobid

  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              else token.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page marker are usefull to detect headers (same first line(s)
      // appearing on each page)
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }