/** * Create the set of training and evaluation sets from the annotated examples with extraction of * citations in the patent description body. * * @param rank rank associated to the set for n-fold data generation * @param type type of data to be created, 0 is training data, 1 is evaluation data */ public void createDataSet( String setName, String rank, String corpusPath, String outputPath, int type) { int nbFiles = 0; int nbNPLRef = 0; int nbPatentRef = 0; int maxRef = 0; try { // PATENT REF. textual data // we use a SAX parser on the patent XML files MarecSaxParser sax = new MarecSaxParser(); sax.patentReferences = true; sax.nplReferences = false; int srCitations = 0; int previousSrCitations = 0; int withSR = 0; List<OffsetPosition> journalsPositions = null; List<OffsetPosition> abbrevJournalsPositions = null; List<OffsetPosition> conferencesPositions = null; List<OffsetPosition> publishersPositions = null; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. sax.setN(-1); } // get a factory /*SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); LinkedList<File> fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { // n-fold evaluation fileList.add(new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } Writer writer = null; if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/patent.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/patent." 
+ setName), false), "UTF-8"); } else { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + setName + "ing" + rank + "/patent." + setName), false), "UTF-8"); } while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) fileList.addLast(subFile); } else { if (file.getName().endsWith(".xml")) { nbFiles++; System.out.println(file.getAbsolutePath()); try { //get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.getName()); p.parse(in, sax); //writer1.write("\n"); nbPatentRef += sax.getNbPatentRef(); if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); if (text.trim().length() > 0) { // add features for the patent tokens addFeatures(text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n \n"); } } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } }*/ // NPL REF. textual data /*sax = new MarecSaxParser(); sax.patentReferences = false; sax.nplReferences = true; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. 
sax.setN(-1); } // get a factory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { fileList.add(new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/npl.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/npl." + setName), false), "UTF-8"); } else { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + File.separator + setName + "ing" + rank + File.separator + "npl." + setName), false), "UTF-8"); } while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) fileList.addLast(subFile); } else { if (file.getName().endsWith(".xml")) { //nbFiles++; //String text = Files.readFromFile(file,"UTF-8"); try { //get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.toString()); p.parse(in, sax); //writer2.write("\n"); nbNPLRef += sax.getNbNPLRef(); if (sax.nbAllRef > maxRef) { maxRef = sax.nbAllRef; } if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; //totalLength += sax.totalLength; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); // add features for NPL 
addFeatures(text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n"); } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } } if (sax.citations != null) srCitations += sax.citations.size();*/ // Patent + NPL REF. textual data (the "all" model) sax = new MarecSaxParser(); sax.patentReferences = true; sax.nplReferences = true; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. sax.setN(-1); } // get a factory SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); LinkedList<File> fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { fileList.add( new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } Writer writer = null; if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter( new FileOutputStream(new File(outputPath + File.separator + "all.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter( new FileOutputStream( new File(outputPath + File.separator + "all." + setName), false), "UTF-8"); } else { writer = new OutputStreamWriter( new FileOutputStream( new File( outputPath + File.separator + setName + "ing" + rank + File.separator + "all." 
+ setName), false), "UTF-8"); } // int totalLength = 0; while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) { fileList.addLast(subFile); } } else { if (file.getName().endsWith(".xml")) { nbFiles++; try { // get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.toString()); p.parse(in, sax); // writer3.write("\n"); nbNPLRef += sax.getNbNPLRef(); nbPatentRef += sax.getNbPatentRef(); if (sax.nbAllRef > maxRef) { maxRef = sax.nbAllRef; } if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; // totalLength += sax.totalLength; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); // add features for patent+NPL addFeatures( text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n"); } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } } if (sax.citations != null) { srCitations += sax.citations.size(); } if (setName != null) { System.out.println(setName + "ing on " + nbFiles + " files"); } else { System.out.println("training on " + nbFiles + " files"); } // System.out.println("Number of file with search report: " + withSR); System.out.println("Number of references: " + (nbNPLRef + nbPatentRef)); System.out.println("Number of patent references: " + nbPatentRef); System.out.println("Number of NPL references: " + nbNPLRef); // System.out.println("Number of search report citations: " + srCitations); System.out.println( "Average number of references: " + TextUtilities.formatTwoDecimals((double) (nbNPLRef + 
nbPatentRef) / nbFiles)); System.out.println("Max number of references in file: " + maxRef); /*if ((setName == null) || (setName.length() == 0)) { System.out.println("patent data set under: " + outputPath + "/patent.train"); } else { System.out.println("patent data set under: " + outputPath + "/patent." + setName); } if ((setName == null) || (setName.length() == 0)) { System.out.println("npl data set under: " + outputPath + "/npl.train"); } else { System.out.println("npl data set under: " + outputPath + "/npl." + setName); }*/ if ((setName == null) || (setName.length() == 0)) { System.out.println("common data set under: " + outputPath + "/all.train"); } else { System.out.println("common data set under: " + outputPath + "/all." + setName); } } catch (Exception e) { throw new GrobidException("An exception occurred while running Grobid.", e); } }
/**
 * Extract results from a labelled full text in the training format without any string
 * modification. Each line of {@code result} is one token followed by feature columns and a
 * final label; the method re-aligns tokens with {@code tokenizations} and emits TEI-like
 * markup into the returned buffer.
 *
 * @param result the labelled result, one "token features... label" line per token
 * @param tokenizations the original tokenization of the text, used to restore spacing
 * @return buffer containing the TEI-style training markup
 */
private StringBuffer trainingExtraction(String result, ArrayList<String> tokenizations) {
  // this is the main buffer for the whole full text
  StringBuffer buffer = new StringBuffer();
  try {
    StringTokenizer st = new StringTokenizer(result, "\n");
    String s1 = null; // label of the current line
    String s2 = null; // HTML-encoded lexical token of the current line
    String lastTag = null;
    // current token position in tokenizations
    int p = 0;
    boolean start = true;
    // state of an open <figure> environment and its sub-elements
    boolean openFigure = false;
    boolean headFigure = false;
    boolean descFigure = false;
    boolean tableBlock = false;
    while (st.hasMoreTokens()) {
      boolean addSpace = false;
      String tok = st.nextToken().trim();
      if (tok.length() == 0) {
        continue;
      }
      StringTokenizer stt = new StringTokenizer(tok, " \t");
      // NOTE(review): localFeatures is filled but never read afterwards
      ArrayList<String> localFeatures = new ArrayList<String>();
      int i = 0;
      boolean newLine = false;
      int ll = stt.countTokens();
      while (stt.hasMoreTokens()) {
        String s = stt.nextToken().trim();
        if (i == 0) {
          s2 = TextUtilities.HTMLEncode(s); // lexical token
          // advance in the original tokenization up to this token, remembering
          // whether a space separator was crossed
          boolean strop = false;
          while ((!strop) && (p < tokenizations.size())) {
            String tokOriginal = tokenizations.get(p);
            if (tokOriginal.equals(" ")) {
              addSpace = true;
            } else if (tokOriginal.equals(s)) {
              strop = true;
            }
            p++;
          }
        } else if (i == ll - 1) {
          s1 = s; // current tag
        } else {
          // middle columns are features; LINESTART marks a layout line break
          if (s.equals("LINESTART")) newLine = true;
          localFeatures.add(s);
        }
        i++;
      }
      if (newLine && !start) {
        buffer.append("<lb/>");
      }
      // strip the "I-" (begin) prefix to get the plain previous/current labels
      String lastTag0 = null;
      if (lastTag != null) {
        if (lastTag.startsWith("I-")) {
          lastTag0 = lastTag.substring(2, lastTag.length());
        } else {
          lastTag0 = lastTag;
        }
      }
      String currentTag0 = null;
      if (s1 != null) {
        if (s1.startsWith("I-")) {
          currentTag0 = s1.substring(2, s1.length());
        } else {
          currentTag0 = s1;
        }
      }
      boolean closeParagraph = false;
      if (lastTag != null) {
        closeParagraph = testClosingTag(buffer, currentTag0, lastTag0, s1);
      }
      boolean output = false;
      // NOTE(review): currentTag0 may still be null here if the first line carries no
      // label — currentTag0.equals(...) would then throw NPE; confirm labels are
      // always present
      // leaving a figure environment: close it and reset its state flags
      if (!currentTag0.equals("<table>")
          && !currentTag0.equals("<trash>")
          && !currentTag0.equals("<figure_head>")
          && !currentTag0.equals("<label>")) {
        if (openFigure) {
          buffer.append("\n\t\t\t</figure>\n\n");
        }
        openFigure = false;
        headFigure = false;
        descFigure = false;
        tableBlock = false;
      }
      // try each label in turn until one writer accepts the token
      output = writeField(buffer, s1, lastTag0, s2, "<header>", "<front>", addSpace, 3);
      if (!output) {
        output =
            writeField(buffer, s1, lastTag0, s2, "<other>", "<note type=\"other\">", addSpace, 3);
      }
      // for paragraph we must distinguish starting and closing tags
      if (!output) {
        if (closeParagraph) {
          output = writeFieldBeginEnd(buffer, s1, "", s2, "<paragraph>", "<p>", addSpace, 3);
        } else {
          output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<paragraph>", "<p>", addSpace, 3);
        }
      }
      if (!output) {
        output =
            writeField(
                buffer, s1, lastTag0, s2, "<page_header>", "<note place=\"headnote\">", addSpace,
                3);
      }
      if (!output) {
        output =
            writeField(
                buffer, s1, lastTag0, s2, "<page_footnote>", "<note place=\"footnote\">", addSpace,
                3);
      }
      if (!output) {
        output = writeField(buffer, s1, lastTag0, s2, "<page>", "<page>", addSpace, 3);
      }
      if (!output) {
        output =
            writeFieldBeginEnd(buffer, s1, lastTag0, s2, "<reference>", "<bibl>", addSpace, 3);
      }
      if (!output) {
        if (closeParagraph) {
          output = writeField(buffer, s1, "", s2, "<reference_marker>", "<label>", addSpace, 3);
        } else
          output =
              writeField(buffer, s1, lastTag0, s2, "<reference_marker>", "<label>", addSpace, 3);
      }
      if (!output) {
        output =
            writeField(
                buffer, s1, lastTag0, s2, "<citation_marker>", "<ref type=\"biblio\">", addSpace,
                3);
      }
      if (!output) {
        output = writeField(buffer, s1, lastTag0, s2, "<section>", "<head>", addSpace, 3);
      }
      if (!output) {
        output = writeField(buffer, s1, lastTag0, s2, "<subsection>", "<head>", addSpace, 3);
      }
      if (!output) {
        // trash opens a figure environment if none is open yet
        if (openFigure) {
          output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 4);
        } else {
          output = writeField(buffer, s1, lastTag0, s2, "<trash>", "<trash>", addSpace, 3);
          if (output) {
            openFigure = true;
          }
        }
      }
      if (!output) {
        output = writeField(buffer, s1, lastTag0, s2, "<equation>", "<formula>", addSpace, 3);
      }
      if (!output) {
        output =
            writeField(
                buffer, s1, lastTag0, s2, "<figure_marker>", "<ref type=\"figure\">", addSpace, 3);
      }
      if (!output) {
        // table handling: re-open a fresh figure when a new table starts inside one
        if (openFigure) {
          if (tableBlock && (!lastTag0.equals("<table>")) && (currentTag0.equals("<table>"))) {
            buffer.append("\n\t\t\t</figure>\n\n");
            // NOTE(review): the field/opening-tag arguments here look swapped compared
            // with the sibling branches — confirm against writeField's signature
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<figure>\n\t\t\t\t<table>", "<figure>", addSpace,
                    3);
            if (output) {
              tableBlock = true;
              descFigure = false;
              headFigure = false;
            }
          } else {
            output = writeField(buffer, s1, lastTag0, s2, "<table>", "<table>", addSpace, 4);
            if (output) {
              tableBlock = true;
            }
          }
        } else {
          output =
              writeField(
                  buffer, s1, lastTag0, s2, "<table>", "<figure>\n\t\t\t\t<table>", addSpace, 3);
          if (output) {
            openFigure = true;
            tableBlock = true;
          }
        }
      }
      if (!output) {
        // figure description (label) handling, analogous to tables
        if (openFigure) {
          if (descFigure && (!lastTag0.equals("<label>")) && (currentTag0.equals("<label>"))) {
            buffer.append("\n\t\t\t</figure>\n\n");
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<label>", "<figure>\n\t\t\t\t<figDesc>", addSpace,
                    3);
            if (output) {
              descFigure = true;
              tableBlock = false;
              headFigure = false;
            }
          } else {
            output = writeField(buffer, s1, lastTag0, s2, "<label>", "<figDesc>", addSpace, 4);
            if (output) {
              descFigure = true;
            }
          }
        } else {
          output =
              writeField(
                  buffer, s1, lastTag0, s2, "<label>", "<figure>\n\t\t\t\t<figDesc>", addSpace, 3);
          if (output) {
            openFigure = true;
            descFigure = true;
          }
        }
      }
      if (!output) {
        // figure head handling, analogous to tables and labels
        if (openFigure) {
          if (headFigure
              && (!lastTag0.equals("<figure_head>"))
              && (currentTag0.equals("<figure_head>"))) {
            buffer.append("\n\t\t\t</figure>\n\n");
            output =
                writeField(
                    buffer, s1, lastTag0, s2, "<figure_head>", "<figure>\n\t\t\t\t<head>",
                    addSpace, 3);
            if (output) {
              descFigure = false;
              tableBlock = false;
              headFigure = true;
            }
          } else {
            output =
                writeField(buffer, s1, lastTag0, s2, "<figure_head>", "<head>", addSpace, 4);
            if (output) {
              headFigure = true;
            }
          }
        } else {
          output =
              writeField(
                  buffer, s1, lastTag0, s2, "<figure_head>", "<figure>\n\t\t\t\t<head>", addSpace,
                  3);
          if (output) {
            openFigure = true;
            headFigure = true;
          }
        }
      }
      // for item we must distinguish starting and closing tags
      if (!output) {
        output = writeFieldBeginEnd(buffer, s1, lastTag, s2, "<item>", "<item>", addSpace, 3);
      }
      lastTag = s1;
      if (!st.hasMoreTokens()) {
        // last line: close whatever tag and figure environment are still open
        if (lastTag != null) {
          testClosingTag(buffer, "", currentTag0, s1);
        }
        if (openFigure) {
          buffer.append("\n\t\t\t</figure>\n\n");
        }
      }
      if (start) {
        start = false;
      }
    }
    return buffer;
  } catch (Exception e) {
    throw new GrobidException("An exception occured while running Grobid.", e);
  }
}
/**
 * Maps a labelled segmentation result back onto the document: for each labelled line, the
 * corresponding span of document tokens is located and contiguous same-label spans are stored
 * as {@link DocumentPiece}s in the document's labeled-blocks multimap.
 *
 * @param doc the document being segmented (mutated: labeled blocks are set)
 * @param labeledResult the raw labelled output, one labelled line per text line
 * @param documentTokens the full token stream of the document
 * @return the same document instance, with labeled blocks attached
 */
public static Document generalResultSegmentation(
    Document doc, String labeledResult, List<LayoutToken> documentTokens) {
  List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);
  SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
  doc.setLabeledBlocks(labeledBlocks);
  List<Block> docBlocks = doc.getBlocks();
  int indexLine = 0;
  int blockIndex = 0;
  int p = 0; // position in the labeled result
  int currentLineEndPos = 0; // position in the global doc. tokenization of the last
  // token of the current line
  int currentLineStartPos = 0; // position in the global doc.
  // tokenization of the first token of the current line
  String line = null;
  DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
  DocumentPointer currentPointer = null;
  DocumentPointer lastPointer = null;
  String curLabel;
  String curPlainLabel = null;
  String lastPlainLabel = null;
  // index of the last document token that belongs to any block (scan from the end,
  // skipping blocks without an end token)
  int lastTokenInd = -1;
  for (int i = docBlocks.size() - 1; i >= 0; i--) {
    int endToken = docBlocks.get(i).getEndToken();
    if (endToken != -1) {
      lastTokenInd = endToken;
      break;
    }
  }
  // we do this concatenation trick so that we don't have to process stuff after the main loop
  // no copying of lists happens because of this, so it's ok to concatenate
  String ignoredLabel = "@IGNORED_LABEL@";
  for (Pair<String, String> labeledTokenPair :
      Iterables.concat(
          labeledTokens,
          Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
    if (labeledTokenPair == null) {
      p++;
      continue;
    }
    // as we process the document segmentation line by line, we don't use the usual
    // tokenization to rebuild the text flow, but we get each line again from the
    // text stored in the document blocks (similarly as when generating the features)
    line = null;
    while ((line == null) && (blockIndex < docBlocks.size())) {
      Block block = docBlocks.get(blockIndex);
      List<LayoutToken> tokens = block.getTokens();
      String localText = block.getText();
      // skip empty blocks, resetting the line index and start position
      if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
        blockIndex++;
        indexLine = 0;
        if (blockIndex < docBlocks.size()) {
          block = docBlocks.get(blockIndex);
          currentLineStartPos = block.getStartToken();
        }
        continue;
      }
      String[] lines = localText.split("[\\n\\r]");
      if ((lines.length == 0) || (indexLine >= lines.length)) {
        // current block exhausted: move to the next block
        blockIndex++;
        indexLine = 0;
        if (blockIndex < docBlocks.size()) {
          block = docBlocks.get(blockIndex);
          currentLineStartPos = block.getStartToken();
        }
        continue;
      } else {
        line = lines[indexLine];
        indexLine++;
        // blank/filtered lines are not part of the labelled result
        if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
          line = null;
          continue;
        }
        if (currentLineStartPos > lastTokenInd) continue;
        // adjust the start token position in documentTokens to this non trivial line
        // first skip possible space characters and tabs at the beginning of the line
        while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                || documentTokens.get(currentLineStartPos).t().equals("\t"))
            && (currentLineStartPos != lastTokenInd)) {
          currentLineStartPos++;
        }
        // if the labelled token does not match here, resynchronize by scanning for the
        // next line start inside the block
        if (!labeledTokenPair.a.startsWith(documentTokens.get(currentLineStartPos).getText())) {
          while (currentLineStartPos < block.getEndToken()) {
            if (documentTokens.get(currentLineStartPos).t().equals("\n")
                || documentTokens.get(currentLineStartPos).t().equals("\r")) {
              // move to the start of the next line, but ignore space characters and tabs
              currentLineStartPos++;
              while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                      || documentTokens.get(currentLineStartPos).t().equals("\t"))
                  && (currentLineStartPos != lastTokenInd)) {
                currentLineStartPos++;
              }
              if ((currentLineStartPos != lastTokenInd)
                  && labeledTokenPair.a.startsWith(
                      documentTokens.get(currentLineStartPos).getText())) {
                break;
              }
            }
            currentLineStartPos++;
          }
        }
        // what is then the position of the last token of this line?
        currentLineEndPos = currentLineStartPos;
        while (currentLineEndPos < block.getEndToken()) {
          if (documentTokens.get(currentLineEndPos).t().equals("\n")
              || documentTokens.get(currentLineEndPos).t().equals("\r")) {
            currentLineEndPos--;
            break;
          }
          currentLineEndPos++;
        }
      }
    }
    curLabel = labeledTokenPair.b;
    curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);
    // all blocks consumed: stop before creating a pointer past the document
    if (blockIndex == docBlocks.size()) {
      break;
    }
    currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);
    // either a new entity starts or a new beginning of the same type of entity
    if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
      // close the previous span (only if its pointers are ordered and valid)
      if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
          && (pointerA.getTokenDocPos() != -1)) {
        labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
      }
      pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
    }
    // updating stuff for next iteration
    lastPlainLabel = curPlainLabel;
    lastPointer = currentPointer;
    currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
    p++;
  }
  if (blockIndex == docBlocks.size()) {
    // the last labelled piece has still to be added
    // NOTE(review): curPlainLabel could be null here if the loop body never ran — confirm
    if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
      if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
          && (pointerA.getTokenDocPos() != -1)) {
        labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
      }
    }
  }
  return doc;
}
/**
 * First pass to detect basic structures: remove page header/footer, identify section numbering,
 * identify Figure and table blocks.
 *
 * <p>-> to be removed at some point!
 *
 * <p>All detected block index lists are stored on the document in the {@code finally} clause,
 * so partially computed results are kept even if an exception interrupts the pass.
 *
 * @param doc a document; must be non-null and have non-null blocks
 */
public static void firstPass(Document doc) {
  if (doc == null) {
    throw new NullPointerException();
  }
  if (doc.getBlocks() == null) {
    throw new NullPointerException();
  }
  int i = 0;
  // candidate block indices per structural role
  List<Integer> blockHeaders = new ArrayList<Integer>();
  List<Integer> blockFooters = new ArrayList<Integer>();
  List<Integer> blockSectionTitles = new ArrayList<Integer>();
  List<Integer> acknowledgementBlocks = new ArrayList<Integer>();
  List<Integer> blockTables = new ArrayList<Integer>();
  List<Integer> blockFigures = new ArrayList<Integer>();
  List<Integer> blockHeadTables = new ArrayList<Integer>();
  List<Integer> blockHeadFigures = new ArrayList<Integer>();
  List<Integer> blockDocumentHeaders = new ArrayList<Integer>();
  doc.setTitleMatchNum(false);
  try {
    for (Block block : doc.getBlocks()) {
      // normalize the block text to a single line for pattern matching
      String localText = block.getText().trim();
      localText = localText.replace("\n", " ");
      localText = localText.replace(" ", " ");
      localText = localText.trim();
      Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText);
      Matcher ma2 = BasicStructureBuilder.references.matcher(localText);
      if ((ma1.find()) || (ma2.find())) {
        // numbered section title ("1.", "2 ", "Contents", ...) found
        if (((localText.startsWith("1.")) || (localText.startsWith("1 ")))
            || ((localText.startsWith("2.")) || (localText.startsWith("2 ")))
            || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true);
        blockSectionTitles.add(i);
      } else {
        StringTokenizer st = new StringTokenizer(localText, "\n");
        while (st.hasMoreTokens()) {
          String token = st.nextToken();
          if (token.startsWith("@PAGE")) {
            // current block should give the header/footers: the few short blocks
            // just before a page break are footer candidates
            if (i > 4) {
              if (doc.getBlocks().get(i - 5).getNbTokens() < 20) {
                Integer i2 = i - 5;
                if (!blockFooters.contains(i2)) blockFooters.add(i2);
              }
            }
            if (i > 3) {
              if (doc.getBlocks().get(i - 4).getNbTokens() < 20) {
                Integer i2 = i - 4;
                if (!blockFooters.contains(i2)) blockFooters.add(i2);
              }
            }
            if (i > 2) {
              if (doc.getBlocks().get(i - 3).getNbTokens() < 20) {
                Integer i2 = i - 3;
                if (!blockFooters.contains(i2)) blockFooters.add(i2);
              }
            }
            if (i > 1) {
              if (doc.getBlocks().get(i - 2).getNbTokens() < 20) {
                Integer i2 = i - 2;
                if (!blockFooters.contains(i2)) blockFooters.add(i2);
              }
            }
            if (i > 0) {
              if (doc.getBlocks().get(i - 1).getNbTokens() < 20) {
                Integer i2 = i - 1;
                if (!blockFooters.contains(i2)) blockFooters.add(i2);
              }
            }
            blockFooters.add(i);
            // page header candidates: the few short blocks just after the page break
            blockHeaders.add(i);
            if (i < doc.getBlocks().size() - 1) {
              if (doc.getBlocks().get(i + 1).getNbTokens() < 20) {
                Integer i2 = i + 1;
                if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1);
              }
            }
            if (i < doc.getBlocks().size() - 2) {
              if (doc.getBlocks().get(i + 2).getNbTokens() < 20) {
                Integer i2 = i + 2;
                if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2);
              }
            }
            if (i < doc.getBlocks().size() - 3) {
              if (doc.getBlocks().get(i + 3).getNbTokens() < 20) {
                Integer i2 = i + 3;
                if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3);
              }
            }
            if (i < doc.getBlocks().size() - 4) {
              if (doc.getBlocks().get(i + 4).getNbTokens() < 20) {
                Integer i2 = i + 4;
                if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4);
              }
            }
            // more ??
          }
        }
      }
      // clustering of blocks per font (for section header and figure/table detections)
      addBlockToCluster(i, doc);
      i++;
    }
    // try to find the cluster of section titles
    Cluster candidateCluster = null;
    for (Cluster cluster : doc.getClusters()) {
      // keep only small clusters (rare fonts) as section-title candidates
      if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5))
          && (cluster.getNbBlocks() < 20)) {
        List<Integer> blo = cluster.getBlocks2();
        for (Integer b : blo) {
          if (blockSectionTitles.contains(b)) {
            if (candidateCluster == null) {
              candidateCluster = cluster;
              break;
            }
          }
        }
      }
    }
    if (candidateCluster != null) {
      // merge the candidate cluster's blocks into the section-title list (dedup)
      List<Integer> newBlockSectionTitles = new ArrayList<Integer>();
      for (Integer bl : blockSectionTitles) {
        if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
      }
      List<Integer> blockClusterTitles = candidateCluster.getBlocks2();
      if (blockClusterTitles.size() < 20) {
        for (Integer bl : blockClusterTitles) {
          if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl);
        }
      }
      blockSectionTitles = newBlockSectionTitles;
    }
    // acknowledgement section recognition
    boolean ackn = false;
    i = 0;
    for (Block block : doc.getBlocks()) {
      String localText = block.getText().trim();
      localText = localText.replace("\n", " ");
      localText = localText.replace(" ", " ");
      localText = localText.trim();
      Integer iii = i;
      Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText);
      if ((m3.find()) && (blockSectionTitles.contains(iii))) {
        // acknowledgement title: start collecting blocks
        acknowledgementBlocks.add(iii);
        ackn = true;
      } else if ((ackn) && (blockSectionTitles.contains(iii))) {
        // next section title ends the acknowledgement section
        ackn = false;
        break;
      } else if (ackn) {
        Matcher m4 = BasicStructureBuilder.references.matcher(localText);
        if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) {
          acknowledgementBlocks.add(iii);
        } else if (m4.find()) {
          ackn = false;
          break;
        }
      }
      i++;
    }
    // we remove references headers in blockSectionTitles
    int index = -1;
    for (Integer ii : blockSectionTitles) {
      Block block = doc.getBlocks().get(ii);
      String localText = block.getText().trim();
      localText = localText.replace("\n", " ");
      localText = localText.replace(" ", " ");
      localText = localText.trim();
      Matcher m4 = BasicStructureBuilder.references.matcher(localText);
      if (m4.find()) {
        index = blockSectionTitles.indexOf(ii);
        break;
      }
    }
    if (index != -1) {
      blockSectionTitles.remove(index);
    }
    // we check headers repetition from page to page to decide if it is an header or not
    ArrayList<Integer> toRemove = new ArrayList<Integer>();
    for (Integer ii : blockHeaders) {
      String localText = (doc.getBlocks().get(ii)).getText().trim();
      localText = TextUtilities.shadowNumbers(localText);
      int length = localText.length();
      if (length > 160) toRemove.add(ii);
      else {
        // evaluate distance with other potential headers: a real page header repeats
        // (nearly) identically on other pages
        boolean valid = false;
        for (Integer ii2 : blockHeaders) {
          if (ii.intValue() != ii2.intValue()) {
            String localText2 = doc.getBlocks().get(ii2).getText().trim();
            if (localText2.length() < 160) {
              localText2 = TextUtilities.shadowNumbers(localText2);
              double dist =
                  (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
              if (dist < 0.25) {
                valid = true;
                break;
              }
            }
          }
        }
        if (!valid) {
          toRemove.add(ii);
        }
      }
    }
    for (Integer ii : toRemove) {
      blockHeaders.remove(ii);
    }
    // same for footers
    toRemove = new ArrayList<Integer>();
    for (Integer ii : blockFooters) {
      String localText = (doc.getBlocks().get(ii)).getText().trim();
      localText = TextUtilities.shadowNumbers(localText);
      int length = localText.length();
      if (length > 160) toRemove.add(ii);
      else {
        // evaluate distance with other potential footers
        boolean valid = false;
        for (Integer ii2 : blockFooters) {
          if (ii.intValue() != ii2.intValue()) {
            String localText2 = doc.getBlocks().get(ii2).getText().trim();
            if (localText2.length() < 160) {
              localText2 = TextUtilities.shadowNumbers(localText2);
              double dist =
                  (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length;
              if (dist < 0.25) {
                valid = true;
                break;
              }
            }
          }
        }
        if (!valid) {
          toRemove.add(ii);
        }
      }
    }
    for (Integer ii : toRemove) {
      blockFooters.remove(ii);
    }
    // a special step for added banners of repositories such as HAL
    i = 0;
    for (Block block : doc.getBlocks()) {
      String localText = block.getText().trim();
      localText = localText.replace("\n", " ");
      localText = localText.replace(" ", " ");
      localText = localText.trim();
      // HAL
      if (localText.startsWith("Author manuscript, published in")) {
        Double y = block.getY();
        if (Math.abs(y - 12.538) < 2) { // reference position
          blockDocumentHeaders.add(i);
          break;
        }
      }
      // ACM publications
      if (localText.startsWith("Permission to make digital or hard copies")) {
        blockFooters.add(i);
        break;
      }
      // arXiv, etc. put here
      // IOP
      if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) {
        blockDocumentHeaders.add(i);
        break;
      }
      i++;
    }
    // we try to recognize here table and figure blocks
    // the idea is that the textual elements are not located as the normal text blocks
    // this is recognized by exploiting the cluster of blocks starting up and down front the block
    // containing a table or a figure marker
    // two different runs, one for figures and one for tables (everything could be done in one
    // step)
    i = 0;
    for (Block block : doc.getBlocks()) {
      String localText = block.getText().trim();
      localText = localText.replace("\n", " ");
      localText = localText.replace(" ", " ");
      localText = localText.trim();
      Matcher m = BasicStructureBuilder.figure.matcher(localText);
      Matcher m2 = BasicStructureBuilder.table.matcher(localText);
      double width = block.getWidth();
      boolean bold = block.getBold();
      // table
      if ((m2.find()) && ((bold) || (localText.length() < 200))) {
        if (!blockHeadTables.contains(i)) {
          blockHeadTables.add(i);
        }
        // we also put all the small blocks before and after the marker
        int j = i - 1;
        while ((j > i - 15) && (j > 0)) {
          Block b = doc.getBlocks().get(j);
          if (b.getText() != null) {
            if ((b.getText().length() < 160) || (width < 50)) {
              if ((!blockTables.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) blockTables.add(j);
            } else j = 0; // a long block ends the backward scan
          }
          j--;
        }
        j = i + 1;
        while ((j < i + 15) && (j < doc.getBlocks().size())) {
          Block b = doc.getBlocks().get(j);
          if (b.getText() != null) {
            if ((b.getText().length() < 160) || (width < 50)) {
              if ((!blockTables.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) blockTables.add(j);
            } else j = doc.getBlocks().size(); // a long block ends the forward scan
          }
          j++;
        }
      }
      // figure
      else if ((m.find()) && ((bold) || (localText.length() < 200))) {
        if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i);
        // we also put all the small blocks before and after the marker
        int j = i - 1;
        boolean imageFound = false;
        while ((j > i - 15) && (j > 0)) {
          Block b = doc.getBlocks().get(j);
          if (b.getText() != null) {
            String localText2 = b.getText().trim();
            localText2 = localText2.replace(" ", " ");
            localText2 = localText2.trim();
            // attach the first @IMAGE marker found to the figure-head block text
            if ((localText2.startsWith("@IMAGE")) && (!imageFound)) {
              block.setText(block.getText() + " " + localText2);
              imageFound = true;
            }
            if ((localText2.length() < 160) || (width < 50)) {
              if ((!blockFigures.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) blockFigures.add(j);
            } else j = 0;
          }
          j--;
        }
        j = i + 1;
        while ((j < i + 15) && (j < doc.getBlocks().size())) {
          Block b = doc.getBlocks().get(j);
          if (b.getText() != null) {
            if ((b.getText().trim().length() < 160) || (width < 50)) {
              if ((!blockFigures.contains(j))
                  && (!blockSectionTitles.contains(j))
                  && (!blockHeaders.contains(j))
                  && (!blockFooters.contains(j))) blockFigures.add(j);
            } else j = doc.getBlocks().size();
          }
          j++;
        }
      }
      i++;
    }
  } finally {
    // always publish whatever was computed, even on exception
    doc.setBlockHeaders(blockHeaders);
    doc.setBlockFooters(blockFooters);
    doc.setBlockSectionTitles(blockSectionTitles);
    doc.setAcknowledgementBlocks(acknowledgementBlocks);
    doc.setBlockTables(blockTables);
    doc.setBlockFigures(blockFigures);
    doc.setBlockHeadTables(blockHeadTables);
    doc.setBlockHeadFigures(blockHeadFigures);
    doc.setBlockDocumentHeaders(blockDocumentHeaders);
  }
}
public String printVector() { if (string == null) return null; if (string.length() == 0) return null; StringBuilder res = new StringBuilder(); // token string (1) res.append(string); // lowercase string (1) res.append(" ").append(string.toLowerCase()); // prefix (4) res.append(" " + TextUtilities.prefix(string, 1)); res.append(" " + TextUtilities.prefix(string, 2)); res.append(" " + TextUtilities.prefix(string, 3)); res.append(" " + TextUtilities.prefix(string, 4)); // suffix (4) res.append(" " + TextUtilities.suffix(string, 1)); res.append(" " + TextUtilities.suffix(string, 2)); res.append(" " + TextUtilities.suffix(string, 3)); res.append(" " + TextUtilities.suffix(string, 4)); // line information (1) res.append(" ").append(lineStatus); // capitalisation (1) if (digit.equals("ALLDIGIT")) res.append(" NOCAPS"); else res.append(" ").append(capitalisation); // digit information (1) res.append(" ").append(digit); // character information (1) if (singleChar) res.append(" 1"); else res.append(" 0"); // lexical information (8) if (properName) res.append(" 1"); else res.append(" 0"); if (commonName) res.append(" 1"); else res.append(" 0"); if (firstName) res.append(" 1"); else res.append(" 0"); if (locationName) res.append(" 1"); else res.append(" 0"); if (year) res.append(" 1"); else res.append(" 0"); if (month) res.append(" 1"); else res.append(" 0"); if (email) res.append(" 1"); else res.append(" 0"); if (http) res.append(" 1"); else res.append(" 0"); // bibliographical information(4) if (isKnownJournalTitle || isKnownAbbrevJournalTitle) res.append(" 1"); else res.append(" 0"); if (isKnownConferenceTitle) res.append(" 1"); else res.append(" 0"); if (isKnownPublisher) res.append(" 1"); else res.append(" 0"); // punctuation information (1) res.append(" ").append(punctType); // in case the token is a punctuation (NO otherwise) // relative position in the sequence (1) res.append(" ").append(relativePosition); // label - for training data (1) if (label != null) 
res.append(" ").append(label).append("\n"); else res.append(" 0\n"); return res.toString(); }
/**
 * SAX end-of-element callback: flushes the text accumulated for the closed
 * element into the current layout structures (current block, global
 * tokenization list, and the document).
 *
 * Dispatches on the pdf2xml element vocabulary: TEXT, METADATA, TOKEN,
 * PAGE, IMAGE, BLOCK and xi:include. The TOKEN branch re-tokenizes the raw
 * token on punctuation and merges free-standing modifier characters
 * (diaeresis, accents, caron, tilde, cedilla, ...) with their base
 * character via classifyChar()/modifyCharacter(), rewinding the tail of
 * the already-emitted token stream when a merge happens.
 *
 * @param uri the element's namespace URI
 * @param localName the element's local name
 * @param qName the element's qualified name, used for dispatch
 * @throws SAXException on SAX processing error
 */
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
        throws SAXException {
    // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") &&
    // !qName.equals("TEXT"))
    // System.out.println(qName);
    if (qName.equals("TEXT")) {
        // close a text run: materialize the newline both in the block's
        // token list and in the global tokenization sequence
        blabla.append("\n");
        LayoutToken token = new LayoutToken();
        token.setText("\n");
        block.addToken(token);
        accumulator.setLength(0);
        tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
        // metadata content is not kept
        accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
        String tok0 = TextUtilities.clean(getText());
        if (block.getStartToken() == -1) {
            block.setStartToken(tokenizations.size());
        }
        if (tok0.length() > 0) {
            // split the raw token on punctuation, keeping the separators as tokens
            StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
            boolean diaresis = false;
            boolean accent = false;
            boolean keepLast = false; // NOTE(review): assigned below but never read in this method
            while (st.hasMoreTokens()) {
                diaresis = false;
                accent = false;
                keepLast = false;
                String tok = st.nextToken();
                if (tok.length() > 0) {
                    LayoutToken token = new LayoutToken();
                    if ((previousToken != null) && (tok != null)
                            && (previousToken.length() > 0) && (tok.length() > 0)
                            && blabla.length() > 0) {
                        // check whether the boundary between the previous token and
                        // this one involves a detached modifier char to be merged
                        Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
                        Character rightChar = tok.charAt(0);
                        ModifierClass leftClass = classifyChar(leftChar);
                        ModifierClass rightClass = classifyChar(rightChar);
                        ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;
                        if (leftClass != ModifierClass.NOT_A_MODIFIER || rightClass != ModifierClass.NOT_A_MODIFIER) {
                            Character baseChar = null;
                            Character modifierChar = null;
                            if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                                    // assert false;
                                    // keeping characters, but setting class
                                    // to not a modifier
                                    baseChar = leftChar;
                                    modifierChar = rightChar;
                                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                                } else {
                                    baseChar = rightChar;
                                    modifierChar = leftChar;
                                    modifierClass = leftClass;
                                }
                            } else {
                                baseChar = leftChar;
                                modifierChar = rightChar;
                                modifierClass = rightClass;
                            }
                            // recombine base + modifier into one character and rewrite
                            // the tail of the already-emitted token stream accordingly
                            String updatedChar = modifyCharacter(baseChar, modifierChar);
                            tokenizations.remove(tokenizations.size() - 1);
                            if (tokenizations.size() > 0) {
                                tokenizations.remove(tokenizations.size() - 1);
                            }
                            blabla.deleteCharAt(blabla.length() - 1);
                            if (blabla.length() > 0) {
                                blabla.deleteCharAt(blabla.length() - 1);
                            }
                            removeLastCharacterIfPresent(previousTok);
                            if (updatedChar != null) {
                                blabla.append(updatedChar);
                                previousTok.setText(previousTok.getText() + updatedChar);
                            }
                            blabla.append(tok.substring(1, tok.length()));
                            previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                            tokenizations.add(previousTok.getText());
                            diaresis = (modifierClass == ModifierClass.DIAERESIS
                                    || modifierClass == ModifierClass.NORDIC_RING
                                    || modifierClass == ModifierClass.CZECH_CARON
                                    || modifierClass == ModifierClass.TILDE
                                    || modifierClass == ModifierClass.CEDILLA);
                            accent = (modifierClass == ModifierClass.ACUTE_ACCENT
                                    || modifierClass == ModifierClass.CIRCUMFLEX
                                    || modifierClass == ModifierClass.GRAVE_ACCENT);
                            if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                                tok = ""; // resetting current token as it
                                // is a single-item
                            }
                        }
                    }
                    if (tok != null) {
                        // actually in certain cases, the extracted string under token can be a chunk of text
                        // with separators that need to be preserved
                        // tok = tok.replace(" ", "");
                    }
                    if ((!diaresis) && (!accent)) {
                        // blabla.append(" ");
                        blabla.append(tok);
                        token.setText(tok);
                        tokenizations.add(tok);
                    } else {
                        // the token was merged into previousTok above; drop it here
                        tok = "";
                        keepLast = true;
                    }
                    // (commented-out legacy code condensed for readability: an earlier
                    // tokenization scheme splitting leading '(' '[' '"' and trailing
                    // punctuation of a token into separate LayoutTokens)
                    if (currentRotation)
                        currentFontSize = currentFontSize / 2;
                    // (commented-out legacy code condensed for readability: emission of
                    // the extra punctuation LayoutTokens for the scheme above)
                    if (currentFont != null)
                        token.setFont(currentFont.toLowerCase());
                    else
                        token.setFont("default");
                    token.setItalic(currentItalic);
                    token.setBold(currentBold);
                    token.setRotation(currentRotation);
                    token.setColorFont(colorFont);
                    token.setX(currentX);
                    token.setY(currentY);
                    token.setWidth(currentWidth);
                    token.setHeight(currentHeight);
                    token.setFontSize(currentFontSize);
                    if (!diaresis && !accent) {
                        block.addToken(token);
                    }
                    // first token of the block fixes the block-level attributes
                    if (block.getFont() == null) {
                        if (currentFont != null)
                            block.setFont(currentFont.toLowerCase());
                        else
                            token.setFont("default"); // NOTE(review): sets the token, not the block — looks like a copy/paste slip; confirm
                    }
                    if (nbTokens == 0) {
                        block.setItalic(currentItalic);
                        block.setBold(currentBold);
                    }
                    if (block.getColorFont() == null)
                        block.setColorFont(colorFont);
                    if (block.getX() == 0.0)
                        block.setX(currentX);
                    if (block.getY() == 0.0)
                        block.setY(currentY);
                    if (block.getWidth() == 0.0)
                        block.setWidth(currentWidth);
                    if (block.getHeight() == 0.0)
                        block.setHeight(currentHeight);
                    if (block.getFontSize() == 0.0)
                        block.setFontSize(currentFontSize);
                    if (!diaresis && !accent) {
                        previousToken = tok;
                        previousTok = token;
                    } else {
                        previousToken = previousTok.getText();
                    }
                    nbTokens++;
                    accumulator.setLength(0);
                }
            }
            if (tokenizations.size() > 0) {
                String justBefore = tokenizations.get(tokenizations.size() - 1);
                // no separating space after a trailing hyphen, so hyphenated
                // words can be rebuilt later
                if (!justBefore.endsWith("-")) {
                    tokenizations.add(" ");
                    blabla.append(" ");
                }
            }
        }
        block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
        // page marker are usefull to detect headers (same first line(s)
        // appearing on each page)
        if (block != null) {
            blabla.append("\n");
            tokenizations.add("\n");
            block.setText(blabla.toString());
            block.setNbTokens(nbTokens);
            doc.addBlock(block);
        }
        // emit a dedicated @PAGE marker block, then start a fresh block
        Block block0 = new Block();
        block0.setText("@PAGE\n");
        block0.setNbTokens(0);
        block0.setPage(currentPage);
        doc.addBlock(block0);
        block = new Block();
        block.setPage(currentPage);
        blabla = new StringBuffer();
        nbTokens = 0;
        // blabla.append("\n@block\n");
        tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
        // close the current block and emit a dedicated @IMAGE block
        // referencing the last collected image path
        if (block != null) {
            blabla.append("\n");
            block.setText(blabla.toString());
            block.setNbTokens(nbTokens);
            doc.addBlock(block);
        }
        block = new Block();
        block.setPage(currentPage);
        blabla = new StringBuffer();
        if (images.size() > 0) {
            blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
        }
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        if (block.getX() == 0.0)
            block.setX(currentX);
        if (block.getY() == 0.0)
            block.setY(currentY);
        if (block.getWidth() == 0.0)
            block.setWidth(currentWidth);
        if (block.getHeight() == 0.0)
            block.setHeight(currentHeight);
        doc.addBlock(block);
        blabla = new StringBuffer();
        nbTokens = 0;
        block = new Block();
        block.setPage(currentPage);
    }
    // (commented-out legacy VECTORIALIMAGES branch condensed for readability:
    // it emitted an "@IMAGE vectorial" block the same way as IMAGE above)
    else if (qName.equals("BLOCK")) {
        // close the current layout block: finalize its text and geometry
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        block.setWidth(currentX - block.getX() + currentWidth);
        block.setHeight(currentY - block.getY() + currentHeight);
        doc.addBlock(block);
        // blabla = new StringBuffer();
        nbTokens = 0;
        block = null;
    } else if (qName.equals("xi:include")) {
        // included image reference: flush the current block and emit an
        // @IMAGE block (no emptiness check on images here, unlike IMAGE)
        if (block != null) {
            blabla.append("\n");
            block.setText(blabla.toString());
            block.setNbTokens(nbTokens);
            doc.addBlock(block);
        }
        block = new Block();
        block.setPage(currentPage);
        blabla = new StringBuffer();
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
        blabla = new StringBuffer();
        nbTokens = 0;
        block = new Block();
        block.setPage(currentPage);
    }
    /*
     * else if (qName.equals("DOCUMENT")) {
     * System.out.println(blabla.toString()); }
     */
}