Code example #1
Score: 0
  /**
   * Set the main segments of the document based on the full text parsing results.
   *
   * <p>Walks the labeled tagger output (one token per line, tab-separated fields, the last
   * field being the label, possibly prefixed with {@code I-} to mark a segment start) in
   * parallel with the original tokenization. Block indices are recorded in the section list
   * matching their label (document header, page header/footer, section title), and the raw
   * text of bibliographical references is accumulated into {@link BibDataSet} objects.
   *
   * @param doc a document whose blocks have already been populated
   * @param labeledResult the labeled result produced by the full-text model
   * @param tokenizations the original tokens of the document, whitespace tokens included
   * @return the same {@code doc} instance, with its segment lists and bib data sets filled in
   * @throws NullPointerException if {@code doc} or its blocks are {@code null}
   */
  public static Document resultSegmentation(
      Document doc, String labeledResult, List<String> tokenizations) {
    if (doc == null) {
      throw new NullPointerException("Document is null");
    }
    if (doc.getBlocks() == null) {
      throw new NullPointerException("Blocks of the documents are null");
    }

    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();

    SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

    doc.setBibDataSets(new ArrayList<BibDataSet>());

    String[] lines = labeledResult.split("\n");

    String currentTag = null;
    String s2 = null; // surface form of the current token
    String lastTag = null;
    String lastPlainTag = null;

    int p = 0; // index in the original tokenization
    int blockIndex = 0;

    BibDataSet bib = null;

    DocumentPointer pointerA = null; // start of the <references> segment being accumulated
    DocumentPointer currentPointer;
    DocumentPointer lastPointer = null;

    for (String line : lines) {
      line = line.trim();
      // BUGFIX: a blank line used to trigger a NullPointerException when first (currentTag
      // still null), or re-apply the stale token/tag and wrongly advance p otherwise.
      if (line.length() == 0) {
        continue;
      }

      // advance blockIndex up to the block containing token p
      for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
        int endTok = doc.getBlocks().get(blockIndex).getEndToken();
        if (endTok >= p) {
          break;
        }
      }

      boolean addSpace = false;

      StringTokenizer stt = new StringTokenizer(line, "\t");
      int j = 0;
      int ll = stt.countTokens();
      while (stt.hasMoreTokens()) {
        String s = stt.nextToken().trim();
        if (j == 0) {
          s2 = s;
          // re-synchronize with the original tokenization, consuming whitespace tokens
          boolean synced = false;
          while (!synced && p < tokenizations.size()) {
            String tokOriginal = tokenizations.get(p);
            // BUGFIX: short-circuit || instead of the accidental bitwise |
            if (tokOriginal.equals(" ")
                || tokOriginal.equals("\n")
                || tokOriginal.equals("\r")
                || tokOriginal.equals("\t")) {
              addSpace = true;
              p++;
            } else if (tokOriginal.equals("")) {
              p++;
            } else {
              synced = true;
            }
          }
        } else if (j == ll - 1) {
          currentTag = s; // the last field is the label
        }
        // intermediate fields are feature columns, not used here
        j++;
      }

      // strip the "I-" (segment start) prefix to get the plain labels
      if (lastTag != null) {
        if (lastTag.startsWith("I-")) {
          lastPlainTag = lastTag.substring(2);
        } else {
          lastPlainTag = lastTag;
        }
      }

      String currentPlainTag = null;
      if (currentTag != null) {
        if (currentTag.startsWith("I-")) {
          currentPlainTag = currentTag.substring(2);
        } else {
          currentPlainTag = currentTag;
        }
      }

      currentPointer = new DocumentPointer(doc, blockIndex, p);

      // close the running <references> segment when the label changes
      // (BUGFIX: comparison flipped to be null-safe w.r.t. currentPlainTag)
      if (lastPlainTag != null
          && !lastPlainTag.equals(currentPlainTag)
          && lastPlainTag.equals("<references>")) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
        pointerA = currentPointer;
      }

      // BUGFIX: guard the dispatch below against a null label (e.g. malformed line)
      if (currentPlainTag != null) {
        if (currentPlainTag.equals("<header>")) {
          if (!blockDocumentHeaders.contains(blockIndex)) {
            blockDocumentHeaders.add(blockIndex);
          }
        } else if (currentPlainTag.equals("<references>")) {
          if (currentTag.equals("I-<references>")) {
            // a new reference entry starts here
            pointerA = new DocumentPointer(doc, blockIndex, p);
            if (bib != null) {
              if (bib.getRawBib() != null) {
                doc.getBibDataSets().add(bib);
                bib = new BibDataSet();
              }
            } else {
              bib = new BibDataSet();
            }
            bib.setRawBib(s2);
          } else {
            // continuation of the current reference entry; addSpace restores the
            // whitespace that was present in the original tokenization
            if (bib == null) {
              bib = new BibDataSet();
              bib.setRawBib(addSpace ? " " + s2 : s2);
            } else {
              bib.setRawBib(bib.getRawBib() + (addSpace ? " " + s2 : s2));
            }
          }
        } else if (currentPlainTag.equals("<page_footnote>")) {
          if (!blockFooters.contains(blockIndex)) {
            blockFooters.add(blockIndex);
          }
        } else if (currentPlainTag.equals("<page_header>")) {
          if (!blockHeaders.contains(blockIndex)) {
            blockHeaders.add(blockIndex);
          }
        } else if (currentPlainTag.equals("<section>")) {
          if (!blockSectionTitles.contains(blockIndex)) {
            blockSectionTitles.add(blockIndex);
          }
        }
      }

      lastTag = currentTag;
      p++;
      lastPointer = currentPointer;
    }

    // flush the last pending bibliographical entry
    if (bib != null) {
      doc.getBibDataSets().add(bib);
    }

    // close a trailing <references> segment
    // BUGFIX: guard against an empty labeled result, where lastPointer is still null
    if (lastPointer != null && !lastPointer.equals(pointerA)) {
      if ("<references>".equals(lastPlainTag)) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
      }
    }

    doc.setBlockHeaders(blockHeaders);
    doc.setBlockFooters(blockFooters);
    doc.setBlockDocumentHeaders(blockDocumentHeaders);
    doc.setBlockReferences(blockReferences);
    doc.setBlockSectionTitles(blockSectionTitles);

    return doc;
  }
Code example #2
Score: 0
File: FullTextParser.java — Project: snpts/grobid
  /**
   * Full processing of a PDF document: PDF-to-XML conversion, full-text model tagging,
   * document segmentation, header parsing, language identification, citation parsing and
   * final TEI serialization.
   *
   * @param input absolute path of the PDF file to process
   * @param consolidateHeader whether to consolidate the parsed header metadata
   * @param consolidateCitations whether to consolidate the parsed citations
   * @return the TEI representation of the document
   * @throws GrobidResourceException if the input file or temp path is missing or invalid
   * @throws Exception any processing failure, rethrown wrapped in a {@link GrobidException}
   */
  public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
      throws Exception {
    // validate the input file and the temporary working directory before doing any work
    if (input == null) {
      throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because input file '"
              + inputFile.getAbsolutePath()
              + "' does not exists.");
    }
    if (tmpPath == null) {
      throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
      throw new GrobidResourceException(
          "Cannot process pdf file, because temp path '"
              + tmpPath.getAbsolutePath()
              + "' does not exists.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
      // -1/-1 presumably means "process all pages" — confirm against Document.pdf2xml
      int startPage = -1;
      int endPage = -1;
      pathXML =
          doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
      // with timeout,
      // no force pdf reloading
      // input is the pdf absolute path, tmpPath is the temp. directory for the temp. lxml file,
      // path is the resource path
      // and we process images in the pdf file
      if (pathXML == null) {
        throw new Exception("PDF parsing fails");
      }
      doc.setPathXML(pathXML);
      ArrayList<String> tokenizations = doc.addFeaturesDocument();

      if (doc.getBlocks() == null) {
        throw new Exception("PDF parsing resulted in empty content");
      }

      String fulltext = doc.getFulltextFeatured(true, true);

      // run the sequence-labelling model over the featured full text, one line per token
      StringTokenizer st = new StringTokenizer(fulltext, "\n");
      feedTaggerAndParse(st);

      // rebuild the labeled result: tab-separated feature columns followed by the
      // predicted label, one token per line
      StringBuilder res = new StringBuilder();
      for (int i = 0; i < tagger.size(); i++) {
        for (int j = 0; j < tagger.xsize(); j++) {
          res.append(tagger.x(i, j)).append("\t");
        }
        res.append(tagger.y2(i));
        res.append("\n");
      }

      // buffer for the fulltext block
      String rese = res.toString();

      // set the different sections of the Document object
      doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);

      // header processing (parser is created lazily and reused across calls)
      if (headerParser == null) {
        headerParser = new HeaderParser();
      }
      resHeader = new BiblioItem();
      headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);
      // the language identification is normally done during the header parsing, but only
      // based on header information.
      // LanguageUtilities languageUtilities = LanguageUtilities.getInstance();
      Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
      if (langu != null) {
        String lang = langu.getLangId();
        doc.setLanguage(lang);
        resHeader.setLanguage(lang);
      }

      // citation processing (parser is created lazily and reused across calls)
      if (citationParser == null) {
        citationParser = new CitationParser();
      }
      ArrayList<BibDataSet> resCitations;

      // ArrayList<String> tokenizationsRef = doc.getTokenizationsReferences();
      // System.out.println(tokenizationsRef.toString());

      // resCitations = BasicStructureBuilder.giveReferenceSegments(doc);
      resCitations = doc.bibDataSets;

      if (resCitations != null) {
        for (BibDataSet bds : resCitations) {
          String marker = bds.getRefSymbol();
          if (marker != null) {
            // normalize the reference marker by stripping dots and spaces
            marker = marker.replace(".", "");
            marker = marker.replace(" ", "");
            bds.setRefSymbol(marker);
          }
          // parse each raw reference string into a structured bibliographical item
          BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
          bds.setResBib(bib);
        }
      }

      // final combination
      return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    } finally {
      // keep it clean when leaving...
      doc.cleanLxmlFile(pathXML, false);
    }
  }
Code example #3
Score: 0
  /**
   * Returns the text accumulated so far, with recognized patent and non-patent-literature
   * (NPL) citations wrapped in TEI {@code <ref>} elements.
   *
   * <p>When {@code counting} is active, patent items whose begin/end offsets fall entirely
   * within the current window ({@code offset} to {@code offset + text.length()}) have their
   * context string replaced by a {@code <ref type="patent">} element; article items starting
   * at or after {@code offset} are wrapped in {@code <ref type="npl">}. The running
   * {@code offset} is then advanced by the length of the text.
   *
   * <p>NOTE(review): offsets appear to be character offsets into the accumulated text stream
   * — confirm against the code that fills {@code patents}/{@code articles}.
   *
   * @return the (possibly annotated) accumulated text, or the empty string if it is blank
   */
  public String getText() {
    String text = accumulator.toString();
    if (text.trim().length() == 0) {
      return "";
    }
    /*text = text.replace("\n", " ");
    text = text.replace("  ", " ");*/
    if (counting) {
      /*


      StringTokenizer st = new StringTokenizer(text, delimiters, true);
      int count = 0;

      while(st.hasMoreTokens()) {
      	String token = st.nextToken().trim();
      	if (token.length() == 0) {
      		continue;
      	}
      	count++;
      }
      */

      // resume the patent scan where the previous chunk left off
      int i = currentPatentIndex;
      // window size = length of the current text chunk
      int count = text.length();

      while (i < patents.size()) {
        PatentItem currentPatent = patents.get(i);
        if (currentPatent != null) {
          int startOffset = currentPatent.getOffsetBegin();
          int endOffset = currentPatent.getOffsetEnd();

          // only annotate patents fully contained in the current window
          if ((startOffset >= offset) && (endOffset <= offset + count)) {
            String context = currentPatent.getContext();

            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            // keep a leading space outside the <ref> element
            // NOTE(review): charAt(0) assumes context is non-empty — confirm upstream
            String target = "";
            if (context.charAt(0) == ' ') {
              target = " <ref type=\"patent\">" + context.substring(1, context.length()) + "</ref>";
            } else {
              target = "<ref type=\"patent\">" + context + "</ref>";
            }

            // NOTE(review): String.replace substitutes every occurrence of context,
            // not just the one at this offset — confirm this is intended
            text = text.replace(context, target);
            currentPatentIndex = i;
          }
        }

        i++;
      }

      // i = currentArticleIndex;
      i = 0;
      while (i < articles.size()) {
        BibDataSet currentArticle = articles.get(i);
        if (currentArticle != null) {
          List<Integer> offsets = currentArticle.getOffsets();
          int startOffset = -1;
          int endOffset = -1;
          String context = currentArticle.getRawBib().trim();
          if (offsets.size() > 0) {
            if (offsets.get(0) != null) {
              startOffset = offsets.get(0).intValue();
              /*StringTokenizer stt = new StringTokenizer(context, delimiters, true);
              int count2 = 0;
              while(stt.hasMoreTokens()) {
              	String token2 = stt.nextToken().trim();
              	if (token2.length() == 0) {
              		continue;
              	}
              	count2++;
              }*/
              // endOffset = offsets.get(1).intValue();
              // end offset derived from the raw bib length rather than the stored value
              endOffset = startOffset + context.length();
            }
          }

          // if ( (startOffset >= offset) && (endOffset <= offset+count) ) {
          // NOTE(review): unlike the patent loop above, the end-of-window check is
          // deliberately(?) disabled here — only the start offset is tested
          if ((startOffset >= offset)) {
            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            String target = " <ref type=\"npl\">" + context + "</ref> ";
            text = text.replace(context, target);
            currentArticleIndex = i;
          }
        }

        i++;
      }

      // advance the running offset past the current chunk
      offset += count;
    }

    return text;
  }