Java Block.getStartToken Examples

Programming Language: Java

Namespace/Package Name: org.grobid.core.layout

Class/Type: Block

Method/Function: getStartToken

Examples at hotexamples.com: 2

Java Block.getStartToken - 2 examples found. These are the top rated real world Java examples of org.grobid.core.layout.Block.getStartToken extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

setText(3)

getText(3)

getStartToken(2)

getWidth(2)

getBold(2)

getFont(2)

getFontSize(2)

getY(2)

setPage(2)

setItalic(1)

setNbTokens(1)

addToken(1)

setFontSize(1)

setStartToken(1)

setWidth(1)

setX(1)

setHeight(1)

setBold(1)

setFont(1)

setEndToken(1)

setColorFont(1)

getX(1)

getTokens(1)

getItalic(1)

getHeight(1)

getEndToken(1)

getColorFont(1)

setY(1)

Example #1

Show file

File: BasicStructureBuilder.java Project: Immortalin/grobid

  public static Document generalResultSegmentation(
      Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    /*try {
          	FileUtils.writeStringToFile(new File("/tmp/x1.txt"), labeledResult);
    	FileUtils.writeStringToFile(new File("/tmp/x2.txt"), documentTokens.toString());
    }
    catch(Exception e) {
    	e.printStackTrace();
    }*/

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0;
    int blockIndex = 0;
    int p = 0; // position in the labeled result
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
      int endToken = docBlocks.get(i).getEndToken();
      if (endToken != -1) {
        lastTokenInd = endToken;
        break;
      }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
        Iterables.concat(
            labeledTokens,
            Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
      if (labeledTokenPair == null) {
        p++;
        continue;
      }

      // as we process the document segmentation line by line, we don't use the usual
      // tokenization to rebuild the text flow, but we get each line again from the
      // text stored in the document blocks (similarly as when generating the features)
      line = null;
      while ((line == null) && (blockIndex < docBlocks.size())) {
        Block block = docBlocks.get(blockIndex);
        List<LayoutToken> tokens = block.getTokens();
        String localText = block.getText();
        if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        }
        String[] lines = localText.split("[\\n\\r]");
        if ((lines.length == 0) || (indexLine >= lines.length)) {
          blockIndex++;
          indexLine = 0;
          if (blockIndex < docBlocks.size()) {
            block = docBlocks.get(blockIndex);
            currentLineStartPos = block.getStartToken();
          }
          continue;
        } else {
          line = lines[indexLine];
          indexLine++;
          if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
            line = null;
            continue;
          }

          if (currentLineStartPos > lastTokenInd) continue;

          // adjust the start token position in documentTokens to this non trivial line
          // first skip possible space characters and tabs at the beginning of the line
          while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                  || documentTokens.get(currentLineStartPos).t().equals("\t"))
              && (currentLineStartPos != lastTokenInd)) {
            currentLineStartPos++;
          }
          if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
            while (currentLineStartPos < block.getEndToken()) {
              if (documentTokens.get(currentLineStartPos).t().equals("\n")
                  || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                // move to the start of the next line, but ignore space characters and tabs
                currentLineStartPos++;
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                        || documentTokens.get(currentLineStartPos).t().equals("\t"))
                    && (currentLineStartPos != lastTokenInd)) {
                  currentLineStartPos++;
                }
                if ((currentLineStartPos != lastTokenInd)
                    && labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                  break;
                }
              }
              currentLineStartPos++;
            }
          }

          // what is then the position of the last token of this line?
          currentLineEndPos = currentLineStartPos;
          while (currentLineEndPos < block.getEndToken()) {
            if (documentTokens.get(currentLineEndPos).t().equals("\n")
                || documentTokens.get(currentLineEndPos).t().equals("\r")) {
              currentLineEndPos--;
              break;
            }
            currentLineEndPos++;
          }
        }
      }
      curLabel = labeledTokenPair.b;
      curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

      /*System.out.println("-------------------------------");
      System.out.println("block: " + blockIndex);
      System.out.println("line: " + line);
      System.out.println("token: " + labeledTokenPair.a);
      System.out.println("curPlainLabel: " + curPlainLabel);
      System.out.println("lastPlainLabel: " + lastPlainLabel);
      if ((currentLineStartPos < lastTokenInd) && (currentLineStartPos != -1))
      	System.out.println("currentLineStartPos: " + currentLineStartPos +
      								" (" + documentTokens.get(currentLineStartPos) + ")");
      if ((currentLineEndPos < lastTokenInd) && (currentLineEndPos != -1))
      	System.out.println("currentLineEndPos: " + currentLineEndPos +
      								" (" + documentTokens.get(currentLineEndPos) + ")");*/

      if (blockIndex == docBlocks.size()) {
        break;
      }

      currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

      // either a new entity starts or a new beginning of the same type of entity
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
        }
        pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        // System.out.println("add segment for: " + lastPlainLabel + ", until " +
        // (currentLineStartPos-2));
      }

      // updating stuff for next iteration
      lastPlainLabel = curPlainLabel;
      lastPointer = currentPointer;
      currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
      p++;
    }

    if (blockIndex == docBlocks.size()) {
      // the last labelled piece has still to be added
      if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
        if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
            && (pointerA.getTokenDocPos() != -1)) {
          labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
          // System.out.println("add segment for: " + lastPlainLabel + ", until " +
          // (currentLineStartPos-2));
        }
      }
    }

    return doc;
  }

Example #2

Show file

File: PDF2XMLSaxParser.java Project: BigBoss21X/grobid

  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);

    if (qName.equals("TEXT")) {
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        boolean diaresis = false;
        boolean accent = false;
        boolean keepLast = false;
        while (st.hasMoreTokens()) {

          diaresis = false;
          accent = false;
          keepLast = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null)
                && (tok != null)
                && (previousToken.length() > 0)
                && (tok.length() > 0)
                && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // assert false;
                    // keeping characters, but setting class
                    // to not a modifier
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  tok = ""; // resetting current token as it
                  // is a single-item
                }
              }
            }

            if (tok != null) {
              // actually in certain cases, the extracted string under token can be a chunk of text
              // with separators that need to be preserved
              // tok = tok.replace(" ", "");
            }

            if ((!diaresis) && (!accent)) {
              // blabla.append(" ");
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
              keepLast = true;
            }

            /*
             * StringTokenizer st0 = new StringTokenizer(tok0,
             * TextUtilities.fullPunctuations, true);
             * while(st0.hasMoreTokens()) { String tok =
             * st0.nextToken(); tokenizations.add(tok); }
             * tokenizations.add(" ");
             */

            /*
             * boolean punct1 = false; boolean punct2 = false;
             * boolean punct3 = false; String content = null; int i
             * = 0; for(; i<TextUtilities.punctuations.length();
             * i++) { if (tok.length() > 0) { if
             * (tok.charAt(tok.length()-1) ==
             * TextUtilities.punctuations.charAt(i)) { punct1 =
             * true; content = tok.substring(0, tok.length()-1); if
             * (tok.length() > 1) { int j = 0; for(;
             * j<TextUtilities.punctuations.length(); j++) { if
             * (tok.charAt(tok.length()-2) ==
             * TextUtilities.punctuations.charAt(j)) { punct3 =
             * true; content = tok.substring(0, tok.length()-2); } }
             * } break; } } } if (tok.length() > 0) { if (
             * (tok.startsWith("(")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("("); } else if (
             * (tok.startsWith("[")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("["); } else if (
             * (tok.startsWith("\"")) && (tok.length() > 1) ) { if
             * ((punct3) && (tok.length() > 2)) content =
             * tok.substring(1, tok.length()-2); else if (punct1)
             * content = tok.substring(1, tok.length()-1); else
             * content = tok.substring(1, tok.length()); punct2 =
             * true; token.setText("\""); } }
             */
            if (currentRotation) currentFontSize = currentFontSize / 2;

            /*
             * if (punct2) { if (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * token = new LayoutToken(); token.setText(content); }
             * if (punct1) { token.setText(content); if (currentFont
             * != null) token.setFont(currentFont.toLowerCase());
             * else token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token);
             *
             * if (punct3) { token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-2)); if
             * (currentFont != null)
             * token.setFont(currentFont.toLowerCase()); else
             * token.setFont("default");
             * token.setItalic(currentItalic);
             * token.setBold(currentBold);
             * token.setRotation(currentRotation);
             * token.setColorFont(colorFont); token.setX(currentX);
             * token.setY(currentY); token.setWidth(currentWidth);
             * token.setHeight(currentHeight);
             * token.setFontSize(currentFontSize);
             * block.addToken(token); }
             *
             * token = new LayoutToken();
             * token.setText(""+tok.charAt(tok.length()-1)); }
             */
            if (currentFont != null) token.setFont(currentFont.toLowerCase());
            else token.setFont("default");
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            if (block.getFont() == null) {
              if (currentFont != null) block.setFont(currentFont.toLowerCase());
              else token.setFont("default");
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) block.setColorFont(colorFont);
            if (block.getX() == 0.0) block.setX(currentX);
            if (block.getY() == 0.0) block.setY(currentY);
            if (block.getWidth() == 0.0) block.setWidth(currentWidth);
            if (block.getHeight() == 0.0) block.setHeight(currentHeight);
            if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // page marker are usefull to detect headers (same first line(s)
      // appearing on each page)
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      // blabla.append("\n@block\n");
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) block.setX(currentX);
      if (block.getY() == 0.0) block.setY(currentY);
      if (block.getWidth() == 0.0) block.setWidth(currentWidth);
      if (block.getHeight() == 0.0) block.setHeight(currentHeight);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) {
     * blabla.append("\n"); block.setText(blabla.toString());
     * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new
     * Block(); block.setPage(currentPage); blabla = new StringBuffer();
     * blabla.append("@IMAGE " + "vectorial \n");
     * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if
     * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0)
     * block.setY(currentY); if (block.getWidth() == 0.0)
     * block.setWidth(currentWidth); if (block.getHeight() == 0.0)
     * block.setHeight(currentHeight); doc.addBlock(block); blabla = new
     * StringBuffer(); nbTokens = 0; block = new Block();
     * block.setPage(currentPage); }
     */
    else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla = new StringBuffer();
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }

    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */

  }