/**
 * Set the main segments of the document based on the full text parsing results.
 *
 * @param doc           the document to segment
 * @param labeledResult the labeled sequence produced by the tagger, one token per line,
 *                      with features separated by tabs and the label in the last column
 * @param tokenizations the original tokenization of the document
 * @return the document with its block segments set
 */
public static Document resultSegmentation(
        Document doc, String labeledResult, List<String> tokenizations) {
    if (doc == null) {
        throw new NullPointerException("Document is null");
    }
    if (doc.getBlocks() == null) {
        throw new NullPointerException("Blocks of the document are null");
    }

    List<Integer> blockHeaders = new ArrayList<Integer>();
    List<Integer> blockFooters = new ArrayList<Integer>();
    List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
    List<Integer> blockSectionTitles = new ArrayList<Integer>();
    SortedSet<DocumentPiece> blockReferences = new TreeSet<DocumentPiece>();

    doc.setBibDataSets(new ArrayList<BibDataSet>());

    String[] lines = labeledResult.split("\n");
    String currentTag = null;
    String s2 = null;
    String lastTag = null;
    String lastPlainTag = null;

    int p = 0; // index in the tokenization
    int blockIndex = 0;

    BibDataSet bib = null;
    DocumentPointer pointerA = null;
    DocumentPointer currentPointer;
    DocumentPointer lastPointer = null;

    for (String line : lines) {
        // advance blockIndex to the block containing the current token index
        for (; blockIndex < doc.getBlocks().size() - 1; blockIndex++) {
            int endTok = doc.getBlocks().get(blockIndex).getEndToken();
            if (endTok >= p) {
                break;
            }
        }

        List<String> localFeatures = new ArrayList<String>();
        boolean addSpace = false;

        line = line.trim();
        StringTokenizer stt = new StringTokenizer(line, "\t");
        int j = 0;
        boolean newLine = false;
        int ll = stt.countTokens();
        while (stt.hasMoreTokens()) {
            String s = stt.nextToken().trim();
            if (j == 0) {
                s2 = s;
                // re-synchronize with the original tokenization, skipping whitespace tokens
                boolean strop = false;
                while (!strop && (p < tokenizations.size())) {
                    String tokOriginal = tokenizations.get(p);
                    if (tokOriginal.equals(" ")
                            || tokOriginal.equals("\n")
                            || tokOriginal.equals("\r")
                            || tokOriginal.equals("\t")) {
                        addSpace = true;
                        p++;
                    } else if (tokOriginal.equals("")) {
                        p++;
                    } else {
                        strop = true;
                    }
                }
            } else if (j == ll - 1) {
                currentTag = s; // the last column holds the current label
            } else {
                if (s.equals("LINESTART")) {
                    newLine = true;
                }
                localFeatures.add(s);
            }
            j++;
        }

        // strip the "I-" prefix marking the first token of a field
        if (lastTag != null) {
            if (lastTag.startsWith("I-")) {
                lastPlainTag = lastTag.substring(2);
            } else {
                lastPlainTag = lastTag;
            }
        }
        String currentPlainTag = null;
        if (currentTag != null) {
            if (currentTag.startsWith("I-")) {
                currentPlainTag = currentTag.substring(2);
            } else {
                currentPlainTag = currentTag;
            }
        }

        currentPointer = new DocumentPointer(doc, blockIndex, p);

        // close the current reference segment when leaving a <references> field
        if (lastPlainTag != null
                && !lastPlainTag.equals(currentPlainTag)
                && lastPlainTag.equals("<references>")) {
            blockReferences.add(new DocumentPiece(pointerA, lastPointer));
            pointerA = currentPointer;
        }

        if ("<header>".equals(currentPlainTag)) {
            if (!blockDocumentHeaders.contains(blockIndex)) {
                blockDocumentHeaders.add(blockIndex);
            }
        } else if ("<references>".equals(currentPlainTag)) {
            if (currentTag.equals("I-<references>")) {
                // a new bibliographical reference starts here
                pointerA = new DocumentPointer(doc, blockIndex, p);
                if (bib != null) {
                    if (bib.getRawBib() != null) {
                        doc.getBibDataSets().add(bib);
                        bib = new BibDataSet();
                    }
                } else {
                    bib = new BibDataSet();
                }
                bib.setRawBib(s2);
            } else {
                // accumulate the raw reference string token by token
                if (bib == null) {
                    bib = new BibDataSet();
                    bib.setRawBib(addSpace ? " " + s2 : s2);
                } else {
                    bib.setRawBib(bib.getRawBib() + (addSpace ? " " : "") + s2);
                }
            }
        } else if ("<page_footnote>".equals(currentPlainTag)) {
            if (!blockFooters.contains(blockIndex)) {
                blockFooters.add(blockIndex);
            }
        } else if ("<page_header>".equals(currentPlainTag)) {
            if (!blockHeaders.contains(blockIndex)) {
                blockHeaders.add(blockIndex);
            }
        } else if ("<section>".equals(currentPlainTag)) {
            if (!blockSectionTitles.contains(blockIndex)) {
                blockSectionTitles.add(blockIndex);
            }
        }

        lastTag = currentTag;
        p++;
        lastPointer = currentPointer;
    }

    if (bib != null) {
        doc.getBibDataSets().add(bib);
    }

    // close a trailing <references> segment, if any
    if (lastPointer != null
            && !lastPointer.equals(pointerA)
            && "<references>".equals(lastPlainTag)) {
        blockReferences.add(new DocumentPiece(pointerA, lastPointer));
    }

    doc.setBlockHeaders(blockHeaders);
    doc.setBlockFooters(blockFooters);
    doc.setBlockDocumentHeaders(blockDocumentHeaders);
    doc.setBlockReferences(blockReferences);
    doc.setBlockSectionTitles(blockSectionTitles);
    return doc;
}
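The segmentation above relies on the BIO-style convention emitted by the tagger: an "I-" prefix marks the first token of a field, and a segment closes when the plain label changes or a new "I-" label appears. A minimal, self-contained sketch of that convention follows; the Segment class and group method are hypothetical, introduced only for illustration, and only the label strings mirror the code above.

import java.util.ArrayList;
import java.util.List;

public class LabelSegmenter {
    // hypothetical holder for a contiguous run of identically-labeled tokens
    public static class Segment {
        public final String label;
        public final int start; // inclusive token index
        public final int end;   // inclusive token index
        public Segment(String label, int start, int end) {
            this.label = label;
            this.start = start;
            this.end = end;
        }
    }

    // group token labels such as "I-<references>", "<references>", "<header>"
    // into contiguous segments, stripping the "I-" field-start prefix
    public static List<Segment> group(List<String> labels) {
        List<Segment> segments = new ArrayList<Segment>();
        String lastPlain = null;
        int segStart = 0;
        for (int p = 0; p < labels.size(); p++) {
            String tag = labels.get(p);
            String plain = tag.startsWith("I-") ? tag.substring(2) : tag;
            // an explicit "I-" prefix or a label change both open a new segment
            if (lastPlain != null && (!plain.equals(lastPlain) || tag.startsWith("I-"))) {
                segments.add(new Segment(lastPlain, segStart, p - 1));
                segStart = p;
            }
            lastPlain = plain;
        }
        if (lastPlain != null) {
            segments.add(new Segment(lastPlain, segStart, labels.size() - 1));
        }
        return segments;
    }
}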
public String processing2(String input, boolean consolidateHeader, boolean consolidateCitations)
        throws Exception {
    if (input == null) {
        throw new GrobidResourceException("Cannot process pdf file, because input file was null.");
    }
    File inputFile = new File(input);
    if (!inputFile.exists()) {
        throw new GrobidResourceException("Cannot process pdf file, because input file '"
                + inputFile.getAbsolutePath() + "' does not exist.");
    }
    if (tmpPath == null) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
    }
    if (!tmpPath.exists()) {
        throw new GrobidResourceException("Cannot process pdf file, because temp path '"
                + tmpPath.getAbsolutePath() + "' does not exist.");
    }
    doc = new Document(input, tmpPath.getAbsolutePath());
    try {
        // convert the complete PDF (all pages), with timeout, without forcing PDF reloading;
        // input is the pdf absolute path, tmpPath holds the temporary lxml file, and images
        // in the pdf are processed as well
        int startPage = -1;
        int endPage = -1;
        pathXML = doc.pdf2xml(true, false, startPage, endPage, input, tmpPath.getAbsolutePath(), true);
        if (pathXML == null) {
            throw new Exception("PDF parsing fails");
        }
        doc.setPathXML(pathXML);
        ArrayList<String> tokenizations = doc.addFeaturesDocument();

        if (doc.getBlocks() == null) {
            throw new Exception("PDF parsing resulted in empty content");
        }

        // label the featured full text with the tagger
        String fulltext = doc.getFulltextFeatured(true, true);
        StringTokenizer st = new StringTokenizer(fulltext, "\n");
        feedTaggerAndParse(st);

        // rebuild the labeled result: features followed by the predicted label, one token per line
        StringBuilder res = new StringBuilder();
        for (int i = 0; i < tagger.size(); i++) {
            for (int j = 0; j < tagger.xsize(); j++) {
                res.append(tagger.x(i, j)).append("\t");
            }
            res.append(tagger.y2(i));
            res.append("\n");
        }
        String rese = res.toString();

        // set the different sections of the Document object
        doc = BasicStructureBuilder.resultSegmentation(doc, rese, tokenizations);

        // header processing
        if (headerParser == null) {
            headerParser = new HeaderParser();
        }
        resHeader = new BiblioItem();
        headerParser.processingHeaderBlock(consolidateHeader, doc, resHeader);

        // language identification is normally done during header parsing, but only based
        // on header information; redo it here using the body as well
        Language langu = languageUtilities.runLanguageId(resHeader.getTitle() + "\n" + doc.getBody());
        if (langu != null) {
            String lang = langu.getLangId();
            doc.setLanguage(lang);
            resHeader.setLanguage(lang);
        }

        // citation processing
        if (citationParser == null) {
            citationParser = new CitationParser();
        }
        List<BibDataSet> resCitations = doc.getBibDataSets();
        if (resCitations != null) {
            for (BibDataSet bds : resCitations) {
                // normalize the reference marker before parsing the raw citation
                String marker = bds.getRefSymbol();
                if (marker != null) {
                    marker = marker.replace(".", "");
                    marker = marker.replace(" ", "");
                    bds.setRefSymbol(marker);
                }
                BiblioItem bib = citationParser.processing(bds.getRawBib(), consolidateCitations);
                bds.setResBib(bib);
            }
        }

        // final combination
        return toTEI(doc, rese, tokenizations, resHeader, false, null, false);
    } catch (Exception e) {
        throw new GrobidException("An exception occurred while running Grobid.", e);
    } finally {
        // keep it clean when leaving...
        doc.cleanLxmlFile(pathXML, false);
    }
}
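A sketch of how processing2 might be driven from a caller. The enclosing parser class name, its no-argument constructor, and the command-line handling are assumptions for illustration; only the processing2(String, boolean, boolean) signature comes from the method above.

import java.io.File;
import java.io.FileWriter;

public class ProcessPdfExample {
    public static void main(String[] args) throws Exception {
        // args[0]: input PDF path, args[1]: output TEI path (both assumed)
        String pdfPath = new File(args[0]).getAbsolutePath();
        FullTextParser parser = new FullTextParser(); // assumed constructor
        // consolidateHeader=true refines header metadata against external sources,
        // consolidateCitations=false keeps citation parsing purely local
        String tei = parser.processing2(pdfPath, true, false);
        try (FileWriter writer = new FileWriter(args[1])) {
            writer.write(tei);
        }
    }
}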
public String getText() {
    String text = accumulator.toString();
    if (text.trim().length() == 0) {
        return "";
    }

    if (counting) {
        int count = text.length();

        // annotate patent references whose offsets fall inside the current text window
        int i = currentPatentIndex;
        while (i < patents.size()) {
            PatentItem currentPatent = patents.get(i);
            if (currentPatent != null) {
                int startOffset = currentPatent.getOffsetBegin();
                int endOffset = currentPatent.getOffsetEnd();
                if ((startOffset >= offset) && (endOffset <= offset + count)) {
                    String context = currentPatent.getContext();
                    String target;
                    if (context.charAt(0) == ' ') {
                        target = " <ref type=\"patent\">" + context.substring(1) + "</ref>";
                    } else {
                        target = "<ref type=\"patent\">" + context + "</ref>";
                    }
                    text = text.replace(context, target);
                    currentPatentIndex = i;
                }
            }
            i++;
        }

        // annotate non-patent literature (article) references
        i = 0;
        while (i < articles.size()) {
            BibDataSet currentArticle = articles.get(i);
            if (currentArticle != null) {
                List<Integer> offsets = currentArticle.getOffsets();
                int startOffset = -1;
                int endOffset = -1;
                String context = currentArticle.getRawBib().trim();
                if (offsets.size() > 0 && offsets.get(0) != null) {
                    startOffset = offsets.get(0).intValue();
                    // the end-bound check against endOffset is currently disabled
                    endOffset = startOffset + context.length();
                }
                if (startOffset >= offset) {
                    String target = " <ref type=\"npl\">" + context + "</ref> ";
                    text = text.replace(context, target);
                    currentArticleIndex = i;
                }
            }
            i++;
        }

        offset += count;
    }
    return text;
}
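The bookkeeping above depends on a sliding character offset: entity offsets are computed over the whole input stream, while each getText() call sees only one chunk covering the window [offset, offset + chunk.length()). A minimal sketch of that idea, under the assumption that entities arrive with stream-global begin/end offsets; all names here are illustrative, not part of the original class.

public class OffsetWindowSketch {
    private int offset = 0; // characters consumed by previous chunks

    // wrap an entity in a ref element when its [begin, end) range
    // falls entirely inside the current chunk's window
    public String annotate(String chunk, int entityBegin, int entityEnd, String surface) {
        int count = chunk.length();
        String result = chunk;
        if (entityBegin >= offset && entityEnd <= offset + count) {
            result = chunk.replace(surface, "<ref type=\"patent\">" + surface + "</ref>");
        }
        offset += count; // advance the window for the next chunk
        return result;
    }
}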