/** * Cluster the blocks following the font, style and size aspects * * <p>-> not used at this stage, but could be an interesting feature in the full text model in the * future * * @param b integer * @param doc a document */ private static void addBlockToCluster(Integer b, Document doc) { // get block features Block block = doc.getBlocks().get(b); String font = block.getFont(); boolean bold = block.getBold(); boolean italic = block.getItalic(); double fontSize = block.getFontSize(); boolean found = false; if (font == null) { font = "unknown"; } // System.out.println(font + " " + bold + " " + italic + " " + fontSize ); if (doc.getClusters() == null) { doc.setClusters(new ArrayList<Cluster>()); } else { for (Cluster cluster : doc.getClusters()) { String font2 = cluster.getFont(); if (font2 == null) font2 = "unknown"; if (font.equals(font2) && (bold == cluster.getBold()) & (italic == cluster.getItalic()) & (fontSize == cluster.getFontSize())) { cluster.addBlock2(b); found = true; } } } if (!found) { Cluster cluster = new Cluster(); cluster.setFont(font); cluster.setBold(bold); cluster.setItalic(italic); cluster.setFontSize(fontSize); cluster.addBlock2(b); doc.getClusters().add(cluster); } }
/** * Filter out line numbering possibly present in the document. This can be frequent for document * in a review/submission format and degrades strongly the machine learning extraction results. * * <p>-> Not used ! * * @param doc a document * @return if found numbering */ public boolean filterLineNumber(Document doc) { // we first test if we have a line numbering by checking if we have an increasing integer // at the begin or the end of each block boolean numberBeginLine = false; boolean numberEndLine = false; boolean foundNumbering = false; int currentNumber = -1; int lastNumber = -1; int i = 0; for (Block block : doc.getBlocks()) { // Integer ii = i; String localText = block.getText(); List<LayoutToken> tokens = block.tokens; if ((localText != null) && (tokens != null)) { if (tokens.size() > 0) { // we get the first and last token iof the block // String tok1 = tokens.get(0).getText(); // String tok2 = tokens.get(tokens.size()).getText(); localText = localText.trim(); Matcher ma1 = startNum.matcher(localText); Matcher ma2 = endNum.matcher(localText); if (ma1.find()) { String groupStr = ma1.group(0); try { currentNumber = Integer.parseInt(groupStr); numberBeginLine = true; } catch (NumberFormatException e) { currentNumber = -1; } } else if (ma2.find()) { String groupStr = ma2.group(0); try { currentNumber = Integer.parseInt(groupStr); numberEndLine = true; } catch (NumberFormatException e) { currentNumber = -1; } } if (lastNumber != -1) { if (currentNumber == lastNumber + 1) { foundNumbering = true; break; } } else lastNumber = currentNumber; } } i++; if (i > 5) { break; } } i = 0; if (foundNumbering) { // we have a line numbering, so we filter them int counter = 1; // we start at 1, if the actual start is 0, // it will remain (as it is negligeable) for (Block block : doc.getBlocks()) { String localText = block.getText(); List<LayoutToken> tokens = block.tokens; if ((localText != null) && (tokens.size() > 0)) { if (numberEndLine) { Matcher ma2 = 
endNum.matcher(localText); if (ma2.find()) { String groupStr = ma2.group(0); if (groupStr.trim().equals("" + counter)) { localText = localText.substring(0, localText.length() - groupStr.length()); block.setText(localText); tokens.remove(tokens.size() - 1); counter++; } } } else if (numberBeginLine) { Matcher ma1 = endNum.matcher(localText); if (ma1.find()) { String groupStr = ma1.group(0); if (groupStr.trim().equals("" + counter)) { localText = localText.substring(groupStr.length(), localText.length() - 1); block.setText(localText); tokens.remove(0); counter++; } } } } i++; } } return foundNumbering; }
/**
 * Maps a line-level labelling result back onto the document: walks the labelled tokens in
 * parallel with the text lines of the document blocks and, each time the plain label changes,
 * stores a {@link DocumentPiece} for the finished run of lines under that label in a multimap
 * that is set on the document via {@code doc.setLabeledBlocks}.
 *
 * @param doc the document whose blocks are walked and which receives the labelled pieces
 * @param labeledResult the labelling output, parsed with
 *     {@code GenericTaggerUtils.getTokensAndLabels}
 * @param documentTokens the document tokenization; it is indexed with the start/end token
 *     positions reported by the blocks
 * @return the same {@code doc} instance
 */
public static Document generalResultSegmentation(
        Document doc, String labeledResult, List<LayoutToken> documentTokens) {
    List<Pair<String, String>> labeledTokens = GenericTaggerUtils.getTokensAndLabels(labeledResult);

    SortedSetMultimap<String, DocumentPiece> labeledBlocks = TreeMultimap.create();
    doc.setLabeledBlocks(labeledBlocks);

    List<Block> docBlocks = doc.getBlocks();
    int indexLine = 0; // index of the next line to read inside the current block
    int blockIndex = 0; // index of the current block in docBlocks
    int p = 0; // position in the labeled result (only incremented, never read in this method)
    int currentLineEndPos = 0; // position in the global doc. tokenization of the last
    // token of the current line
    int currentLineStartPos = 0; // position in the global doc.
    // tokenization of the first token of the current line
    String line = null;

    // start of the piece currently being accumulated
    DocumentPointer pointerA = DocumentPointer.START_DOCUMENT_POINTER;
    DocumentPointer currentPointer = null;
    DocumentPointer lastPointer = null;

    String curLabel;
    String curPlainLabel = null;
    String lastPlainLabel = null;

    // end token of the last block that reports one (-1 is treated as "no end token")
    int lastTokenInd = -1;
    for (int i = docBlocks.size() - 1; i >= 0; i--) {
        int endToken = docBlocks.get(i).getEndToken();
        if (endToken != -1) {
            lastTokenInd = endToken;
            break;
        }
    }

    // we do this concatenation trick so that we don't have to process stuff after the main loop
    // no copying of lists happens because of this, so it's ok to concatenate
    String ignoredLabel = "@IGNORED_LABEL@";
    for (Pair<String, String> labeledTokenPair :
            Iterables.concat(
                    labeledTokens,
                    Collections.singleton(new Pair<String, String>("IgnoredToken", ignoredLabel)))) {
        if (labeledTokenPair == null) {
            p++;
            continue;
        }

        // as we process the document segmentation line by line, we don't use the usual
        // tokenization to rebuild the text flow, but we get each line again from the
        // text stored in the document blocks (similarly as when generating the features)
        line = null;
        while ((line == null) && (blockIndex < docBlocks.size())) {
            Block block = docBlocks.get(blockIndex);
            List<LayoutToken> tokens = block.getTokens();
            String localText = block.getText();
            // blocks without tokens or without visible text are skipped entirely
            if ((tokens == null) || (localText == null) || (localText.trim().length() == 0)) {
                blockIndex++;
                indexLine = 0;
                if (blockIndex < docBlocks.size()) {
                    block = docBlocks.get(blockIndex);
                    currentLineStartPos = block.getStartToken();
                }
                continue;
            }
            String[] lines = localText.split("[\\n\\r]");
            if ((lines.length == 0) || (indexLine >= lines.length)) {
                // all lines of this block are consumed: move on to the next block
                blockIndex++;
                indexLine = 0;
                if (blockIndex < docBlocks.size()) {
                    block = docBlocks.get(blockIndex);
                    currentLineStartPos = block.getStartToken();
                }
                continue;
            } else {
                line = lines[indexLine];
                indexLine++;
                // blank lines and lines rejected by TextUtilities.filterLine are not matched
                // against a labelled token
                if ((line.trim().length() == 0) || (TextUtilities.filterLine(line))) {
                    line = null;
                    continue;
                }

                // line is non-null here, so this continue leaves the while loop with the
                // token positions unchanged
                if (currentLineStartPos > lastTokenInd) continue;

                // adjust the start token position in documentTokens to this non trivial line
                // first skip possible space characters and tabs at the beginning of the line
                while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                                || documentTokens.get(currentLineStartPos).t().equals("\t"))
                        && (currentLineStartPos != lastTokenInd)) {
                    currentLineStartPos++;
                }
                if (!labeledTokenPair.a.startsWith(
                        documentTokens.get(currentLineStartPos).getText())) {
                    // the labelled token does not start with the token at the current
                    // position: scan forward, line by line, within the current block
                    while (currentLineStartPos < block.getEndToken()) {
                        if (documentTokens.get(currentLineStartPos).t().equals("\n")
                                || documentTokens.get(currentLineStartPos).t().equals("\r")) {
                            // move to the start of the next line, but ignore space characters and tabs
                            currentLineStartPos++;
                            while ((documentTokens.get(currentLineStartPos).t().equals(" ")
                                            || documentTokens.get(currentLineStartPos).t().equals("\t"))
                                    && (currentLineStartPos != lastTokenInd)) {
                                currentLineStartPos++;
                            }
                            if ((currentLineStartPos != lastTokenInd)
                                    && labeledTokenPair.a.startsWith(
                                            documentTokens.get(currentLineStartPos).getText())) {
                                break;
                            }
                        }
                        currentLineStartPos++;
                    }
                }

                // what is then the position of the last token of this line?
                currentLineEndPos = currentLineStartPos;
                while (currentLineEndPos < block.getEndToken()) {
                    if (documentTokens.get(currentLineEndPos).t().equals("\n")
                            || documentTokens.get(currentLineEndPos).t().equals("\r")) {
                        // step back onto the last token before the end-of-line token
                        currentLineEndPos--;
                        break;
                    }
                    currentLineEndPos++;
                }
            }
        }
        curLabel = labeledTokenPair.b;
        curPlainLabel = GenericTaggerUtils.getPlainLabel(curLabel);

        // every block has been consumed: the remaining piece is handled after the loop
        if (blockIndex == docBlocks.size()) {
            break;
        }

        currentPointer = new DocumentPointer(doc, blockIndex, currentLineEndPos);

        // either a new entity starts or a new beginning of the same type of entity
        if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
            // close the previous piece, provided its start does not lie after its end
            if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
                    && (pointerA.getTokenDocPos() != -1)) {
                labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
            }
            pointerA = new DocumentPointer(doc, blockIndex, currentLineStartPos);
        }

        // updating stuff for next iteration
        lastPlainLabel = curPlainLabel;
        lastPointer = currentPointer;
        currentLineStartPos = currentLineEndPos + 2; // one shift for the EOL, one for the next line
        p++;
    }

    if (blockIndex == docBlocks.size()) {
        // the last labelled piece has still to be added
        // NOTE(review): the piece is only stored when the label of the token on which the loop
        // stopped differs from the previous label; when both are equal nothing is added here
        // -- confirm this is intended
        if ((!curPlainLabel.equals(lastPlainLabel)) && (lastPlainLabel != null)) {
            if ((pointerA.getTokenDocPos() <= lastPointer.getTokenDocPos())
                    && (pointerA.getTokenDocPos() != -1)) {
                labeledBlocks.put(lastPlainLabel, new DocumentPiece(pointerA, lastPointer));
            }
        }
    }

    return doc;
}
/** * First pass to detect basic structures: remove page header/footer, identify section numbering, * identify Figure and table blocks. * * <p>-> to be removed at some point! * * @param doc a document */ public static void firstPass(Document doc) { if (doc == null) { throw new NullPointerException(); } if (doc.getBlocks() == null) { throw new NullPointerException(); } int i = 0; List<Integer> blockHeaders = new ArrayList<Integer>(); List<Integer> blockFooters = new ArrayList<Integer>(); List<Integer> blockSectionTitles = new ArrayList<Integer>(); List<Integer> acknowledgementBlocks = new ArrayList<Integer>(); List<Integer> blockTables = new ArrayList<Integer>(); List<Integer> blockFigures = new ArrayList<Integer>(); List<Integer> blockHeadTables = new ArrayList<Integer>(); List<Integer> blockHeadFigures = new ArrayList<Integer>(); List<Integer> blockDocumentHeaders = new ArrayList<Integer>(); doc.setTitleMatchNum(false); try { for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher ma1 = BasicStructureBuilder.introduction.matcher(localText); Matcher ma2 = BasicStructureBuilder.references.matcher(localText); if ((ma1.find()) || (ma2.find())) { if (((localText.startsWith("1.")) || (localText.startsWith("1 "))) || ((localText.startsWith("2.")) || (localText.startsWith("2 "))) || (localText.startsWith("Contents"))) doc.setTitleMatchNum(true); // System.out.println("Title section identified: block " + i + ", " + localText); blockSectionTitles.add(i); } else { StringTokenizer st = new StringTokenizer(localText, "\n"); while (st.hasMoreTokens()) { String token = st.nextToken(); if (token.startsWith("@PAGE")) { // current block should give the header/footors if (i > 4) { if (doc.getBlocks().get(i - 5).getNbTokens() < 20) { Integer i2 = i - 5; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 3) { if 
(doc.getBlocks().get(i - 4).getNbTokens() < 20) { Integer i2 = i - 4; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 2) { if (doc.getBlocks().get(i - 3).getNbTokens() < 20) { Integer i2 = i - 3; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 1) { if (doc.getBlocks().get(i - 2).getNbTokens() < 20) { Integer i2 = i - 2; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } if (i > 0) { if (doc.getBlocks().get(i - 1).getNbTokens() < 20) { Integer i2 = i - 1; if (!blockFooters.contains(i2)) blockFooters.add(i2); } } blockFooters.add(i); // page header candidates blockHeaders.add(i); if (i < doc.getBlocks().size() - 1) { if (doc.getBlocks().get(i + 1).getNbTokens() < 20) { Integer i2 = i + 1; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 1); } } if (i < doc.getBlocks().size() - 2) { if (doc.getBlocks().get(i + 2).getNbTokens() < 20) { Integer i2 = i + 2; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 2); } } if (i < doc.getBlocks().size() - 3) { if (doc.getBlocks().get(i + 3).getNbTokens() < 20) { Integer i2 = i + 3; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 3); } } if (i < doc.getBlocks().size() - 4) { if (doc.getBlocks().get(i + 4).getNbTokens() < 20) { Integer i2 = i + 4; if (!blockHeaders.contains(i2)) blockHeaders.add(i + 4); } } // more ?? 
} } } // clustering of blocks per font (for section header and figure/table detections) addBlockToCluster(i, doc); i++; } // try to find the cluster of section titles Cluster candidateCluster = null; // System.out.println("nb clusters: " + clusters.size()); for (Cluster cluster : doc.getClusters()) { if ((cluster.getNbBlocks() < (doc.getBlocks().size() / 5)) && (cluster.getNbBlocks() < 20)) { List<Integer> blo = cluster.getBlocks2(); for (Integer b : blo) { if (blockSectionTitles.contains(b)) { if (candidateCluster == null) { candidateCluster = cluster; break; } // else if (cluster.getFontSize() >= candidateCluster.getFontSize()) // candidateCluster = cluster; } } } } if (candidateCluster != null) { List<Integer> newBlockSectionTitles = new ArrayList<Integer>(); for (Integer bl : blockSectionTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } List<Integer> blockClusterTitles = candidateCluster.getBlocks2(); if (blockClusterTitles.size() < 20) { for (Integer bl : blockClusterTitles) { if (!newBlockSectionTitles.contains(bl)) newBlockSectionTitles.add(bl); } } blockSectionTitles = newBlockSectionTitles; } // aknowledgement section recognition boolean ackn = false; i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // System.out.println(i + ": " + localText+"\n"); Integer iii = i; Matcher m3 = BasicStructureBuilder.acknowledgement.matcher(localText); if ((m3.find()) && (blockSectionTitles.contains(iii))) { acknowledgementBlocks.add(iii); ackn = true; // int index = blockSectionTitles.indexOf(iii); // blockSectionTitles.remove(index); } else if ((ackn) && (blockSectionTitles.contains(iii))) { ackn = false; break; } else if (ackn) { Matcher m4 = BasicStructureBuilder.references.matcher(localText); if ((ackn) && (!blockFooters.contains(iii)) && (!m4.find())) { acknowledgementBlocks.add(iii); 
} else if (m4.find()) { ackn = false; break; } } i++; } // we remove references headers in blockSectionTitles int index = -1; for (Integer ii : blockSectionTitles) { Block block = doc.getBlocks().get(ii); String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m4 = BasicStructureBuilder.references.matcher(localText); if (m4.find()) { index = blockSectionTitles.indexOf(ii); break; } } if (index != -1) { blockSectionTitles.remove(index); } // we check headers repetition from page to page to decide if it is an header or not ArrayList<Integer> toRemove = new ArrayList<Integer>(); for (Integer ii : blockHeaders) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("header candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 : blockHeaders) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; // System.out.println("dist with " + localText2 + " : " + dist); if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockHeaders.remove(ii); } // same for footers toRemove = new ArrayList<Integer>(); for (Integer ii : blockFooters) { String localText = (doc.getBlocks().get(ii)).getText().trim(); localText = TextUtilities.shadowNumbers(localText); int length = localText.length(); if (length > 160) toRemove.add(ii); else { // System.out.println("footer candidate: " + localText); // evaluate distance with other potential headers boolean valid = false; for (Integer ii2 
: blockFooters) { if (ii.intValue() != ii2.intValue()) { String localText2 = doc.getBlocks().get(ii2).getText().trim(); if (localText2.length() < 160) { localText2 = TextUtilities.shadowNumbers(localText2); double dist = (double) TextUtilities.getLevenshteinDistance(localText, localText2) / length; if (dist < 0.25) { valid = true; break; } } } } if (!valid) { toRemove.add(ii); } } } for (Integer ii : toRemove) { blockFooters.remove(ii); } // a special step for added banner repositoryies such HAL i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); // HAL if (localText.startsWith("Author manuscript, published in")) { Double y = block.getY(); // System.out.println("HAL banner found, " + "block " + i + ", y = " + y); if (Math.abs(y - 12.538) < 2) { // reference position // blockHeaders.add(new Integer(i)); blockDocumentHeaders.add(i); // System.out.println("HAL banner added as header block"); break; } } // ACM publications // System.out.println("test ACM " + i); // System.out.println(localText); if (localText.startsWith("Permission to make digital or hard copies")) { blockFooters.add(i); break; } // arXiv, etc. 
put here // IOP if (localText.startsWith("Confidential: ") && (localText.contains("IOP"))) { blockDocumentHeaders.add(i); // System.out.println("IOP banner added as header block"); break; } i++; } // we try to recognize here table and figure blocks // the idea is that the textual elements are not located as the normal text blocks // this is recognized by exploiting the cluster of blocks starting up and down front the block // containing a table or a figure marker // two different runs, one for figures and one for tables (everything could be done in one // step) i = 0; for (Block block : doc.getBlocks()) { String localText = block.getText().trim(); localText = localText.replace("\n", " "); localText = localText.replace(" ", " "); localText = localText.trim(); Matcher m = BasicStructureBuilder.figure.matcher(localText); Matcher m2 = BasicStructureBuilder.table.matcher(localText); double width = block.getWidth(); boolean bold = block.getBold(); // table // if ( (m2.find()) && (localText.length() < 200) ) { if ((m2.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadTables.contains(i)) { blockHeadTables.add(i); } // we also put all the small blocks before and after the marker int j = i - 1; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().length() < 160) || (width < 50)) { if ((!blockTables.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockTables.add(j); } else j = doc.getBlocks().size(); } j++; } } // figure // else if ( (m.find()) && (localText.length() < 200) ) { else if 
((m.find()) && ((bold) || (localText.length() < 200))) { if (!blockHeadFigures.contains(i)) blockHeadFigures.add(i); // we also put all the small blocks before and after the marker int j = i - 1; boolean imageFound = false; while ((j > i - 15) && (j > 0)) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { String localText2 = b.getText().trim(); // localText = localText.replace("\n", " "); localText2 = localText2.replace(" ", " "); localText2 = localText2.trim(); if ((localText2.startsWith("@IMAGE")) && (!imageFound)) { // System.out.println(localText2); block.setText(block.getText() + " " + localText2); // System.out.println(block.getText()); imageFound = true; } if ((localText2.length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = 0; } j--; } j = i + 1; while ((j < i + 15) && (j < doc.getBlocks().size())) { Block b = doc.getBlocks().get(j); if (b.getText() != null) { if ((b.getText().trim().length() < 160) || (width < 50)) { if ((!blockFigures.contains(j)) && (!blockSectionTitles.contains(j)) && (!blockHeaders.contains(j)) && (!blockFooters.contains(j))) blockFigures.add(j); } else j = doc.getBlocks().size(); } j++; } } i++; } } finally { doc.setBlockHeaders(blockHeaders); doc.setBlockFooters(blockFooters); doc.setBlockSectionTitles(blockSectionTitles); doc.setAcknowledgementBlocks(acknowledgementBlocks); doc.setBlockTables(blockTables); doc.setBlockFigures(blockFigures); doc.setBlockHeadTables(blockHeadTables); doc.setBlockHeadFigures(blockHeadFigures); doc.setBlockDocumentHeaders(blockDocumentHeaders); } }
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (qName.equals("PAGE")) { int length = atts.getLength(); currentPage++; // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if ((name != null) && (value != null)) { if (name.equals("id")) {; } else if (name.equals("number")) { } else if (name.equals("width")) { } else if (name.equals("height")) { } } } /* * if (block != null) { blabla.append("\n"); * tokenizations.add("\n"); block.setText(blabla.toString()); * block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0 * = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0); * doc.addBlock(block0); */ /* * block = new Block(); blabla = new StringBuffer(); nbTokens = 0; * //blabla.append("\n@block\n"); tokenizations.add("\n"); */ } else if (qName.equals("BLOCK")) { block = new Block(); blabla = new StringBuffer(); nbTokens = 0; block.setPage(currentPage); // blabla.append("\n@block\n"); } else if (qName.equals("IMAGE")) { int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if ((name != null) && (value != null)) { if (name.equals("href")) { // if (images == null) // images = new ArrayList<String>(); images.add(value); } else if (name.equals("x")) { double x = Double.parseDouble(value); if (x != currentX) { currentX = x; } } else if (name.equals("y")) { double y = Double.parseDouble(value); if (y != currentY) { currentY = y; } } else if (name.equals("width")) { double width = Double.parseDouble(value); if (width != currentWidth) { currentWidth = width; } } else if (name.equals("height")) { double height = Double.parseDouble(value); if (height != currentHeight) { currentHeight = height; } } } } } else if (qName.equals("TEXT")) 
{ int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if ((name != null) && (value != null)) { if (name.equals("id")) { } else if (name.equals("x")) { } else if (name.equals("y")) { } else if (name.equals("width")) { } else if (name.equals("height")) { } } } } else if (qName.equals("TOKEN")) { int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if ((name != null) && (value != null)) { if (name.equals("id")) {; } else if (name.equals("font-name")) { if (!value.equals(currentFont)) { currentFont = value; blabla.append(" "); } } else if (name.equals("font-size")) { double fontSize = Double.parseDouble(value); if (fontSize != currentFontSize) { currentFontSize = fontSize; blabla.append(" "); } } else if (name.equals("bold")) { if (value.equals("yes")) { currentBold = true; } else { currentBold = false; } } else if (name.equals("italic")) { if (value.equals("yes")) { currentItalic = true; } else { currentItalic = false; } } else if (name.equals("font-color")) { if (!value.equals(colorFont)) { colorFont = value; } } else if (name.equals("rotation")) { if (value.equals("0")) currentRotation = false; else currentRotation = true; } else if (name.equals("x")) { double x = Double.parseDouble(value); if (x != currentX) { currentX = x; } } else if (name.equals("y")) { double y = Double.parseDouble(value); if (y != currentY) { currentY = y; } } else if (name.equals("base")) { double base = Double.parseDouble(value); } else if (name.equals("width")) { double width = Double.parseDouble(value); if (width != currentWidth) { currentWidth = width; } } else if (name.equals("height")) { double height = Double.parseDouble(value); if (height != currentHeight) { currentHeight = height; } } } } } 
else if (qName.equals("xi:include")) { int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if ((name != null) && (value != null)) { if (name.equals("href")) { // if (images == null) // images = new ArrayList<String>(); images.add(value); } } } } // accumulator.setLength(0); }
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { // if (!qName.equals("TOKEN") && !qName.equals("BLOCK") && // !qName.equals("TEXT")) // System.out.println(qName); if (qName.equals("TEXT")) { blabla.append("\n"); LayoutToken token = new LayoutToken(); token.setText("\n"); block.addToken(token); accumulator.setLength(0); tokenizations.add("\n"); } else if (qName.equals("METADATA")) { accumulator.setLength(0); } else if (qName.equals("TOKEN")) { String tok0 = TextUtilities.clean(getText()); if (block.getStartToken() == -1) { block.setStartToken(tokenizations.size()); } if (tok0.length() > 0) { StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true); boolean diaresis = false; boolean accent = false; boolean keepLast = false; while (st.hasMoreTokens()) { diaresis = false; accent = false; keepLast = false; String tok = st.nextToken(); if (tok.length() > 0) { LayoutToken token = new LayoutToken(); if ((previousToken != null) && (tok != null) && (previousToken.length() > 0) && (tok.length() > 0) && blabla.length() > 0) { Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1); Character rightChar = tok.charAt(0); ModifierClass leftClass = classifyChar(leftChar); ModifierClass rightClass = classifyChar(rightChar); ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER; if (leftClass != ModifierClass.NOT_A_MODIFIER || rightClass != ModifierClass.NOT_A_MODIFIER) { Character baseChar = null; Character modifierChar = null; if (leftClass != ModifierClass.NOT_A_MODIFIER) { if (rightClass != ModifierClass.NOT_A_MODIFIER) { // assert false; // keeping characters, but setting class // to not a modifier baseChar = leftChar; modifierChar = rightChar; modifierClass = ModifierClass.NOT_A_MODIFIER; } else { baseChar = rightChar; modifierChar = leftChar; modifierClass = leftClass; } } else { baseChar = leftChar; modifierChar = rightChar; modifierClass = 
rightClass; } String updatedChar = modifyCharacter(baseChar, modifierChar); tokenizations.remove(tokenizations.size() - 1); if (tokenizations.size() > 0) { tokenizations.remove(tokenizations.size() - 1); } blabla.deleteCharAt(blabla.length() - 1); if (blabla.length() > 0) { blabla.deleteCharAt(blabla.length() - 1); } removeLastCharacterIfPresent(previousTok); if (updatedChar != null) { blabla.append(updatedChar); previousTok.setText(previousTok.getText() + updatedChar); } blabla.append(tok.substring(1, tok.length())); previousTok.setText(previousTok.getText() + tok.substring(1, tok.length())); tokenizations.add(previousTok.getText()); diaresis = (modifierClass == ModifierClass.DIAERESIS || modifierClass == ModifierClass.NORDIC_RING || modifierClass == ModifierClass.CZECH_CARON || modifierClass == ModifierClass.TILDE || modifierClass == ModifierClass.CEDILLA); accent = (modifierClass == ModifierClass.ACUTE_ACCENT || modifierClass == ModifierClass.CIRCUMFLEX || modifierClass == ModifierClass.GRAVE_ACCENT); if (rightClass != ModifierClass.NOT_A_MODIFIER) { tok = ""; // resetting current token as it // is a single-item } } } if (tok != null) { // actually in certain cases, the extracted string under token can be a chunk of text // with separators that need to be preserved // tok = tok.replace(" ", ""); } if ((!diaresis) && (!accent)) { // blabla.append(" "); blabla.append(tok); token.setText(tok); tokenizations.add(tok); } else { tok = ""; keepLast = true; } /* * StringTokenizer st0 = new StringTokenizer(tok0, * TextUtilities.fullPunctuations, true); * while(st0.hasMoreTokens()) { String tok = * st0.nextToken(); tokenizations.add(tok); } * tokenizations.add(" "); */ /* * boolean punct1 = false; boolean punct2 = false; * boolean punct3 = false; String content = null; int i * = 0; for(; i<TextUtilities.punctuations.length(); * i++) { if (tok.length() > 0) { if * (tok.charAt(tok.length()-1) == * TextUtilities.punctuations.charAt(i)) { punct1 = * true; content = 
tok.substring(0, tok.length()-1); if * (tok.length() > 1) { int j = 0; for(; * j<TextUtilities.punctuations.length(); j++) { if * (tok.charAt(tok.length()-2) == * TextUtilities.punctuations.charAt(j)) { punct3 = * true; content = tok.substring(0, tok.length()-2); } } * } break; } } } if (tok.length() > 0) { if ( * (tok.startsWith("(")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("("); } else if ( * (tok.startsWith("[")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("["); } else if ( * (tok.startsWith("\"")) && (tok.length() > 1) ) { if * ((punct3) && (tok.length() > 2)) content = * tok.substring(1, tok.length()-2); else if (punct1) * content = tok.substring(1, tok.length()-1); else * content = tok.substring(1, tok.length()); punct2 = * true; token.setText("\""); } } */ if (currentRotation) currentFontSize = currentFontSize / 2; /* * if (punct2) { if (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * token = new LayoutToken(); token.setText(content); } * if (punct1) { token.setText(content); if (currentFont * != null) token.setFont(currentFont.toLowerCase()); * else token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * 
token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); * * if (punct3) { token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-2)); if * (currentFont != null) * token.setFont(currentFont.toLowerCase()); else * token.setFont("default"); * token.setItalic(currentItalic); * token.setBold(currentBold); * token.setRotation(currentRotation); * token.setColorFont(colorFont); token.setX(currentX); * token.setY(currentY); token.setWidth(currentWidth); * token.setHeight(currentHeight); * token.setFontSize(currentFontSize); * block.addToken(token); } * * token = new LayoutToken(); * token.setText(""+tok.charAt(tok.length()-1)); } */ if (currentFont != null) token.setFont(currentFont.toLowerCase()); else token.setFont("default"); token.setItalic(currentItalic); token.setBold(currentBold); token.setRotation(currentRotation); token.setColorFont(colorFont); token.setX(currentX); token.setY(currentY); token.setWidth(currentWidth); token.setHeight(currentHeight); token.setFontSize(currentFontSize); if (!diaresis && !accent) { block.addToken(token); } if (block.getFont() == null) { if (currentFont != null) block.setFont(currentFont.toLowerCase()); else token.setFont("default"); } if (nbTokens == 0) { block.setItalic(currentItalic); block.setBold(currentBold); } if (block.getColorFont() == null) block.setColorFont(colorFont); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); if (block.getFontSize() == 0.0) block.setFontSize(currentFontSize); if (!diaresis && !accent) { previousToken = tok; previousTok = token; } else { previousToken = previousTok.getText(); } nbTokens++; accumulator.setLength(0); } } if (tokenizations.size() > 0) { String justBefore = 
tokenizations.get(tokenizations.size() - 1); if (!justBefore.endsWith("-")) { tokenizations.add(" "); blabla.append(" "); } } } block.setEndToken(tokenizations.size()); } else if (qName.equals("PAGE")) { // page marker are usefull to detect headers (same first line(s) // appearing on each page) if (block != null) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0 = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0); block0.setPage(currentPage); doc.addBlock(block0); block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); nbTokens = 0; // blabla.append("\n@block\n"); tokenizations.add("\n"); } else if (qName.equals("IMAGE")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); if (images.size() > 0) { blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); } block.setText(blabla.toString()); block.setNbTokens(nbTokens); if (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) block.setY(currentY); if (block.getWidth() == 0.0) block.setWidth(currentWidth); if (block.getHeight() == 0.0) block.setHeight(currentHeight); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("VECTORIALIMAGES")) { if (block != null) { * blabla.append("\n"); block.setText(blabla.toString()); * block.setNbTokens(nbTokens); doc.addBlock(block); } block = new * Block(); block.setPage(currentPage); blabla = new StringBuffer(); * blabla.append("@IMAGE " + "vectorial \n"); * block.setText(blabla.toString()); block.setNbTokens(nbTokens); if * (block.getX() == 0.0) block.setX(currentX); if (block.getY() == 0.0) * block.setY(currentY); if (block.getWidth() == 0.0) * block.setWidth(currentWidth); if 
(block.getHeight() == 0.0) * block.setHeight(currentHeight); doc.addBlock(block); blabla = new * StringBuffer(); nbTokens = 0; block = new Block(); * block.setPage(currentPage); } */ else if (qName.equals("BLOCK")) { blabla.append("\n"); tokenizations.add("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); block.setWidth(currentX - block.getX() + currentWidth); block.setHeight(currentY - block.getY() + currentHeight); doc.addBlock(block); // blabla = new StringBuffer(); nbTokens = 0; block = null; } else if (qName.equals("xi:include")) { if (block != null) { blabla.append("\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); } block = new Block(); block.setPage(currentPage); blabla = new StringBuffer(); blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n"); block.setText(blabla.toString()); block.setNbTokens(nbTokens); doc.addBlock(block); blabla = new StringBuffer(); nbTokens = 0; block = new Block(); block.setPage(currentPage); } /* * else if (qName.equals("DOCUMENT")) { * System.out.println(blabla.toString()); } */ }