/**
 * Validates every column and remembers the cells that were reported as misplaced.
 *
 * <p>For each column whose {@code checkFirstThreeCells()} check fails, the result of {@code
 * getWrongCells()} is stored in {@code missPlacedCells}. The field is assigned rather than
 * appended to, so after the loop it holds the wrong cells of the last failing column only.
 */
private void checkColumns() {
  for (Column2 candidate : dataInColumns) {
    boolean columnIsConsistent = candidate.checkFirstThreeCells();
    if (columnIsConsistent) {
      continue;
    }
    this.missPlacedCells = candidate.getWrongCells();
  }
}
/**
 * Flags the table as a false positive when two or more of its columns contain exactly one cell.
 *
 * <p>The flag is only ever set to {@code true} here; it is never cleared by this method.
 */
public void checkForFalsePositive() {
  int singleCellColumns = 0;
  for (Column2 candidate : dataInColumns) {
    singleCellColumns += (candidate.getNumberOfCells() == 1) ? 1 : 0;
  }
  boolean looksLikeFalsePositive = singleCellColumns >= 2;
  if (looksLikeFalsePositive) {
    validation.setFalsePositive(true);
  }
}
/**
 * Calculates the bounding box (X1, X2, Y1 and Y2) of the table and stores it in the {@code minX1},
 * {@code maxX2}, {@code minY1} and {@code maxY2} fields.
 *
 * <ul>
 *   <li>Y1 is the lowest Y1 of the first title entry of the semantic framework.
 *   <li>Y2 is the largest Y2 boundary over all columns.
 *   <li>X1 is the X1 boundary of the first column, X2 the X2 boundary of the last column.
 * </ul>
 *
 * @param semanticFramework the semantic framework whose first title entry supplies the top edge
 */
private void setTableBoundaries(SemanticFramework semanticFramework) {
  this.minY1 = semanticFramework.getTitle().get(0).getLowestY1();

  // Largest Y2 over all columns. The previous implementation computed this maximum but then
  // stored the Y2 of the last column instead, so the running maximum gets a distinct name here.
  int largestColumnY2 = Integer.MIN_VALUE;
  for (Column2 column : dataInColumns) {
    largestColumnY2 = Math.max(largestColumnY2, column.getColumnBoundaryY2());
  }
  this.maxY2 = largestColumnY2;

  this.minX1 = dataInColumns.get(0).getColumnBoundaryX1();
  this.maxX2 = dataInColumns.get(dataInColumns.size() - 1).getColumnBoundaryX2();
}
/** * This method is used for extraction of tables with lot of empty cells in it. It is required for * the successful extraction of most Matrix tables. */ private void fillBlankCells() { // We say: cells get a line number. If a column does not contain a cell on a certain line, add a // whitespace. // Any cell that is not filled must be empty: for (Line line : data) { int lineNumber = line.getLineNumber(); COLUMNLOOP: for (Column2 column : dataInColumns) { for (Cell cell : column.getCellObjects()) { if (cell.getLineNumber() == lineNumber) { break; } if (cell.getLineNumber() > line.getLineNumber()) { // the last cell? // Add a blank cell to this column. // System.out.println("Add line to :" + column + " in line: " + // line.getLineNumber()); // <span class='ocrx_word' id='word_9' title="bbox 2175 514 2346 555">were</span> Tag t = Tag.valueOf("span"); Attributes attributes = new Attributes(); attributes.put("class", "ocrx_word"); attributes.put("id", "word_ADDEDBYTEA"); attributes.put( "title", "bbox " + column.getAverageX1() + " " + (int) line.getAverageY1() + " " + column.getAverageX2() + " " + (int) line.getAverageY2()); Element newElement = new Element(t, "localhost:8080", attributes); newElement.text(" "); ArrayList<Element> newCell = new ArrayList<Element>(); newCell.add(newElement); // System.out.println("adding: " +newElement.text()); column.addCell(newCell); break COLUMNLOOP; } } } } }
private void replaceMissPlacedCells() { for (ArrayList<Element> cluster : missPlacedCells) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: column.addCell(cluster); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: column.addCell(cluster); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + column); column.addCell(cluster); } } } }
/** * This method adds lines with missing partitions to the current columns. It loops trough lines * that were flagged for containing missing data and then adds the ones to columns that they fit * in (or to columns that fit in the partition). If this fails it will also try to check if the * partition merely touches a column, although this will return a lower validation score if * successful. */ private void addLinesWithMissingDataToColumns() { ArrayList<Cell> cellsWithMissingDataAdded = new ArrayList<Cell>(); ArrayList<Column2> newDataInColumns = new ArrayList<Column2>(); for (Line line : linesWithMissingData) { ArrayList<ArrayList<Element>> clusters = line.getClusters(); for (ArrayList<Element> cluster : clusters) { for (Column2 column : dataInColumns) { if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) && column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 3, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.fitsInColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster)) || column.columnFitsIn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // then we need to add this cluster to that column: newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 2, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } else if (column.touchesColumn(Line.getClusterX1(cluster), Line.getClusterX2(cluster))) { // System.out.println("CLUSTER: " + cluster + " touches: " + // column); newDataInColumns.remove(column); column.addCell(cluster); newDataInColumns.add(column); Cell cell = new Cell(cluster, 1, line.getLineNumber()); cellsWithMissingDataAdded.add(cell); } } } } validation.setCellsWithMissingDataAdded(cellsWithMissingDataAdded.size()); 
validation.setCellsWithMissingDataAddedScores(cellsWithMissingDataAdded); }
/** * This is the new write method that uses XML methods to generate an XML file. * * @param location The location where the new file should be stored. * @param file The file that was being used to create this table. * @param tableID The ID of the table. * @param semanticFramework The semantic framework object. This contains get methods that are * required for the output File. * @throws IOException If the given location doesn't exist. */ private void write2(String location, File file, int tableID, SemanticFramework semanticFramework) throws IOException { try { DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); Document doc = docBuilder.newDocument(); org.w3c.dom.Element rootElement = doc.createElement("TEAFile"); doc.appendChild(rootElement); // provenance: org.w3c.dom.Element provenance = doc.createElement("provenance"); rootElement.appendChild(provenance); org.w3c.dom.Element detectionID = doc.createElement("DetectionID"); detectionID.appendChild(doc.createTextNode(tableID + "")); provenance.appendChild(detectionID); org.w3c.dom.Element fromFile = doc.createElement("fromFile"); fromFile.appendChild(doc.createTextNode(file.getName())); provenance.appendChild(fromFile); org.w3c.dom.Element fromPath = doc.createElement("fromPath"); fromPath.appendChild(doc.createTextNode(file.getAbsolutePath())); provenance.appendChild(fromPath); org.w3c.dom.Element fromPage = doc.createElement("fromPage"); fromPage.appendChild(doc.createTextNode(Integer.toString(pageNumber))); provenance.appendChild(fromPage); org.w3c.dom.Element horizontalThresholdModifier = doc.createElement("horizontalThresholdModifier"); horizontalThresholdModifier.appendChild( doc.createTextNode(this.horizontalThresholdModifier + "")); provenance.appendChild(horizontalThresholdModifier); org.w3c.dom.Element verticalThresholdModifier = doc.createElement("horizontalThresholdModifier"); verticalThresholdModifier.appendChild( 
doc.createTextNode(this.verticalThresholdModifier + "")); provenance.appendChild(verticalThresholdModifier); // Results: org.w3c.dom.Element results = doc.createElement("results"); rootElement.appendChild(results); org.w3c.dom.Element minX1 = doc.createElement("TableBoundaryX1"); minX1.appendChild(doc.createTextNode(this.minX1 + "")); results.appendChild(minX1); org.w3c.dom.Element maxX2 = doc.createElement("TableBoundaryX2"); maxX2.appendChild(doc.createTextNode(this.maxX2 + "")); results.appendChild(maxX2); org.w3c.dom.Element minY1 = doc.createElement("TableBoundaryY1"); minY1.appendChild(doc.createTextNode(this.minY1 + "")); results.appendChild(minY1); org.w3c.dom.Element maxY2 = doc.createElement("TableBoundaryY2"); maxY2.appendChild(doc.createTextNode(this.maxY2 + "")); results.appendChild(maxY2); org.w3c.dom.Element title1 = doc.createElement("title1"); title1.appendChild(doc.createTextNode(name)); results.appendChild(title1); org.w3c.dom.Element title2 = doc.createElement("title2"); title2.appendChild(doc.createTextNode(titleAndHeaders.toString())); results.appendChild(title2); org.w3c.dom.Element columns = doc.createElement("columns"); results.appendChild(columns); for (Column2 columnContent : dataInColumns) { org.w3c.dom.Element column = doc.createElement("column"); column.appendChild(doc.createTextNode(columnContent.toString().replace("�", ""))); columns.appendChild(column); } org.w3c.dom.Element lines = doc.createElement("lines"); results.appendChild(lines); ArrayList<ArrayList<Cell>> table = recreateTableLines(semanticFramework); for (ArrayList<Cell> line : table) { org.w3c.dom.Element XMLLine = doc.createElement("line"); XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", ""))); lines.appendChild(XMLLine); } /*System.out.println("table:" + table); for(Line line : data){ if(line.getHighestY2()<=this.maxY2&&line.getLowestY1()>=this.minY1&&line.getClusterSize()>1){ org.w3c.dom.Element XMLLine = doc.createElement("line"); 
XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", ""))); lines.appendChild(XMLLine); } else{ System.out.println("NO GOOD: " + line); } } */ if (rowSpanners.size() > 0) { org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners"); results.appendChild(rowSpanners); for (Line line : this.rowSpanners) { org.w3c.dom.Element rowSpanner = doc.createElement("rowSpanner"); rowSpanner.appendChild(doc.createTextNode(line.toString())); rowSpanners.appendChild(rowSpanner); } } // Semantics: org.w3c.dom.Element semantics = doc.createElement("tableSemantics"); rootElement.appendChild(semantics); org.w3c.dom.Element title = doc.createElement("title"); title.appendChild(doc.createTextNode(semanticFramework.getTitle().toString())); semantics.appendChild(title); org.w3c.dom.Element titleConfidence = doc.createElement("titleConfidence"); Double semanticFrameworkDouble = semanticFramework.getTitleConfidence(); titleConfidence.appendChild(doc.createTextNode(semanticFrameworkDouble.toString())); semantics.appendChild(titleConfidence); if (!semanticFramework.getRowSpanners().isEmpty()) { org.w3c.dom.Element rowSpanners = doc.createElement("subHeaders"); rowSpanners.appendChild(doc.createTextNode(semanticFramework.getRowSpanners().toString())); semantics.appendChild(rowSpanners); org.w3c.dom.Element IdentifiersConfidenceAlignment = doc.createElement("subHeadersConfidenceAlignment"); IdentifiersConfidenceAlignment.appendChild( doc.createTextNode(semanticFramework.getIdentifiersConfidenceAlignment().toString())); semantics.appendChild(IdentifiersConfidenceAlignment); org.w3c.dom.Element getIdentifiersConfidenceColumnsSpanned = doc.createElement("subHeadersConfidenceColumnsSpanned"); getIdentifiersConfidenceColumnsSpanned.appendChild( doc.createTextNode( semanticFramework.getIdentifiersConfidenceColumnsSpanned().toString())); semantics.appendChild(getIdentifiersConfidenceColumnsSpanned); org.w3c.dom.Element IdentifiersConfidenceLineDistance = 
doc.createElement("subHeadersConfidenceLineDistance"); IdentifiersConfidenceLineDistance.appendChild( doc.createTextNode( semanticFramework.getIdentifiersConfidenceLineDistance().toString())); semantics.appendChild(IdentifiersConfidenceLineDistance); } if (!semanticFramework.getValidatedRowSpanners().isEmpty()) { org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners"); rowSpanners.appendChild( doc.createTextNode(semanticFramework.getValidatedRowSpanners().toString())); semantics.appendChild(rowSpanners); org.w3c.dom.Element rowSpannersConfidenceAlignment = doc.createElement("rowSpannersConfidenceAlignment"); rowSpannersConfidenceAlignment.appendChild( doc.createTextNode(semanticFramework.getRowSpannersConfidenceAlignment().toString())); semantics.appendChild(rowSpannersConfidenceAlignment); org.w3c.dom.Element rowSpannersConfidenceColumnsSpanned = doc.createElement("rowSpannersConfidenceColumnsSpanned"); rowSpannersConfidenceColumnsSpanned.appendChild( doc.createTextNode( semanticFramework.getRowSpannersConfidenceColumnsSpanned().toString())); semantics.appendChild(rowSpannersConfidenceColumnsSpanned); org.w3c.dom.Element rowSpannersConfidenceLineDistance = doc.createElement("rowSpannersConfidenceLineDistance"); rowSpannersConfidenceLineDistance.appendChild( doc.createTextNode( semanticFramework.getRowSpannersConfidenceLineDistance().toString())); semantics.appendChild(rowSpannersConfidenceLineDistance); } org.w3c.dom.Element headers = doc.createElement("headers"); headers.appendChild(doc.createTextNode(semanticFramework.getHeaders().toString())); semantics.appendChild(headers); org.w3c.dom.Element headersConfidence = doc.createElement("headersConfidence"); headersConfidence.appendChild( doc.createTextNode(semanticFramework.getHeaderConfidence().toString())); semantics.appendChild(headersConfidence); // validation: org.w3c.dom.Element validation = doc.createElement("validation"); rootElement.appendChild(validation); org.w3c.dom.Element clusterCertainty = 
doc.createElement("columnConfidence"); clusterCertainty.appendChild( doc.createTextNode(this.validation.getClusterCertainty().toString())); validation.appendChild(clusterCertainty); org.w3c.dom.Element mostFrequentNumberOfClusters = doc.createElement("mostFrequentNumberOfClusters"); mostFrequentNumberOfClusters.appendChild( doc.createTextNode(this.validation.getMostFrequentNumberOfClusters() + "")); validation.appendChild(mostFrequentNumberOfClusters); org.w3c.dom.Element highestAmountOfClusters = doc.createElement("highestAmountOfClusters"); highestAmountOfClusters.appendChild( doc.createTextNode(this.validation.getHighestAmountOfClusters() + "")); validation.appendChild(highestAmountOfClusters); org.w3c.dom.Element highestAmountOfClustersOccurrences = doc.createElement("highestAmountOfClustersOccurrences"); highestAmountOfClustersOccurrences.appendChild( doc.createTextNode(this.validation.getHighestAmountOfClustersOccurrences() + "")); validation.appendChild(highestAmountOfClustersOccurrences); org.w3c.dom.Element clusterThreshold = doc.createElement("clusterThreshold"); clusterThreshold.appendChild(doc.createTextNode(this.validation.getLineThreshold() + "")); validation.appendChild(clusterThreshold); org.w3c.dom.Element cellsWithMissingDataAdded = doc.createElement("cellsWithMissingDataAdded"); cellsWithMissingDataAdded.appendChild( doc.createTextNode(this.validation.getCellsWithMissingDataAdded() + "")); validation.appendChild(cellsWithMissingDataAdded); if (this.validation.getCellsWithMissingDataAdded() > 0) { org.w3c.dom.Element cellsWithMissingDataAddedScores = doc.createElement("cellsWithMissingDataAddedScores"); cellsWithMissingDataAddedScores.appendChild( doc.createTextNode(this.validation.getCellsWithMissingDataAddedObjects() + "")); validation.appendChild(cellsWithMissingDataAddedScores); } org.w3c.dom.Element averageDistanceBetweenRows = doc.createElement("averageDistanceBetweenRows"); averageDistanceBetweenRows.appendChild( 
doc.createTextNode(this.validation.getAverageDistanceBetweenRows() + "")); validation.appendChild(averageDistanceBetweenRows); if (this.validation.getTitleConfidence().size() > 0) { org.w3c.dom.Element TitleConfidence = doc.createElement("TitleConfidence"); TitleConfidence.appendChild(doc.createTextNode(this.validation.getTitleConfidence() + "")); validation.appendChild(TitleConfidence); } org.w3c.dom.Element falsePositive = doc.createElement("falsePositive"); falsePositive.appendChild(doc.createTextNode(this.validation.getFalsePositive() + "")); validation.appendChild(falsePositive); TransformerFactory transformerFactory = TransformerFactory.newInstance(); Transformer transformer = transformerFactory.newTransformer(); DOMSource source = new DOMSource(doc); LOGGER.info( "Written file: " + location + "\\results\\" + file.getName().substring(0, file.getName().length() - 5) + "-" + tableID + ".xml"); File file2 = new File( location + "\\results\\" + file.getName().substring(0, file.getName().length() - 5) + "-" + tableID + ".xml"); Writer output = new BufferedWriter(new FileWriter(file2)); StreamResult result = new StreamResult(output); // Output to console for testing transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.transform(source, result); output.close(); System.out.println("File saved."); } catch (ParserConfigurationException pce) { pce.printStackTrace(); } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } }
/** * This is the constructor of the table class. It takes it's parameters and sets them as local * variables. It also puts the default values for the rest of the table and then starts calling * the other methods in this class. to extract the table according to the rules of TEA. * * @param spans These are the words below the table detection from the Page class. * @param charLengthThreshold This is the character length threshold as calculated in the Page * class. * @param file This is the File that was used to extract the table from. It is only used for the * creation of provenance. * @param workspace This is the workspace as specified by the user. * @param tableID This is the ID of the detected table. It is mainly used for the creation of the * output file and for provenance. * @param verticalThresholdModifier The modifier from the configuration file that should be used * to indicate how much space there should be between lines * @param horizontalThresholdModifier The modifier used for creating the threshold in horizontal * partitioning. * @param averageLineDistance The average (vertical) distance between lines as calculated in the * Page class. * @param debugging is true if the program is in debugging mode. * @param allowedHeaderIterations The amount of iterations that the program is allowed to run, * searching for headers. * @param allowedHeaderSize The amount of headers supported by the program. Implemented as a last * cut-off if thresholding fails. 
* @throws IOException When one of the files cant be found */ public Table2( Elements spans, double charLengthThreshold, File file, int pageNumber, String workspace, int tableID, double verticalThresholdModifier, double horizontalThresholdModifier, double averageLineDistance, boolean debugging, int allowedHeaderSize, int allowedHeaderIterations) throws IOException { String debugContent = ""; this.averageLineDistance = averageLineDistance; this.maxY1 = 0; this.spans = spans; this.name = ""; this.horizontalThresholdModifier = horizontalThresholdModifier; this.verticalThresholdModifier = verticalThresholdModifier; this.validation = new Validation(); this.validation.setAverageDistanceBetweenRows(averageLineDistance); this.pageNumber = pageNumber; if (spans.size() > 0) { setMaxY1(); this.table = new ArrayList<Line>(); createLines(charLengthThreshold); separateDataByCluster(); filterLinesThatAreAboveY1(); if (data.size() > 1) { System.out.println(getRawTable()); debugContent = debugContent + getRawTable() + "\n"; filterEmptyLines(); findMissingData(); findColumns(); createColumns(charLengthThreshold); checkColumns(); debugContent = debugContent + "lines with missing data: " + linesWithMissingData + "\n"; if (linesWithMissingData != null) { addLinesWithMissingDataToColumns(); } fillBlankCells(); } else { LOGGER.info( "The word Table was detected but no clusters were found.\n" + "It was found at position: " + maxY1); } if (data.size() > 1) { for (Line line : data) { validation.setClusterCertainty(line.getDistances(), line.getDistanceThreshold()); validation.setLineThreshold(line.getDistanceThreshold()); } LOGGER.info("Table: " + getName()); System.out.println("In Table: " + getName()); System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println("Data in this table is: "); ArrayList<Integer> distances = new ArrayList<Integer>(); Column2 lastColumn = null; for (Column2 column : dataInColumns) { 
System.out.println(column); if (dataInColumns.indexOf(column) == 0) { lastColumn = column; continue; } if (lastColumn != null) { distances.add(column.getAverageX1() - lastColumn.getAverageX2()); } } validation.setClusterCertainty( distances, averageLineDistance * horizontalThresholdModifier); if (linesWithMissingData != null && linesWithMissingData.size() > 0) { System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); System.out.println( "The following lines were detected for having missing data or it was a line that had more clusters then the rest of the table.: "); for (Line line : linesWithMissingData) { System.out.println(line); } } System.out.println( "~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-"); if (rowSpanners.size() > 0) { System.out.println("Potential rowspanners: "); for (Line line : rowSpanners) { System.out.println(line); } } System.out.println("Validation:\n" + validation); System.out.println(table); setClusterCertainties(); System.out.println("Checking out the semantics."); SemanticFramework semanticFramework = new SemanticFramework( dataInColumns, (averageLineDistance * verticalThresholdModifier), rowSpanners, charLengthThreshold * horizontalThresholdModifier, table, validation, titleAndHeaders, allowedHeaderSize, allowedHeaderIterations); System.out.println("Checking for false positive..."); checkForFalsePositive(); System.out.println("False positive: " + validation.getFalsePositive()); LOGGER.info("False positive: " + validation.getFalsePositive()); System.out.println(); fillBlankCells(); System.out.println(semanticFramework); System.out.println("Calculating final table statistics."); setTableBoundaries(semanticFramework); System.out.println("Now writing to file."); write2( (workspace), file, tableID, semanticFramework); // write: getXMLContent(file, tableID, semanticFramework.getXML()), if (debugging) { writeDebugFile(debugContent, workspace, file); } } else { 
LOGGER.info("All the found data was filtered out!"); } } }
// This method will try to recreate the original lines of the table using all the previous // information about the table. private ArrayList<ArrayList<Cell>> recreateTableLines(SemanticFramework semanticFramework) { ArrayList<ArrayList<Cell>> table = new ArrayList<ArrayList<Cell>>(); for (Column2 column : dataInColumns) { ArrayList<Cell> cells = column.getCellObjects(); table.add(cells); } int maxLength = Integer.MIN_VALUE; for (ArrayList<Cell> column : table) { int length = column.size(); if (length > maxLength) { maxLength = length; } } ArrayList<ArrayList<Cell>> newTable = new ArrayList<ArrayList<Cell>>(); ArrayList<Column2> incompleteRowColumns = new ArrayList<Column2>(); ArrayList<Cell> col = new ArrayList<Cell>(); int counter = 0; while (counter < maxLength) { ArrayList<Cell> line = new ArrayList<Cell>(); newTable.add(line); counter++; } for (Column2 column : dataInColumns) { if (column.getCellObjects().size() == maxLength) { col = column.getCellObjects(); for (ArrayList<Cell> line : newTable) { if (line.size() >= dataInColumns.indexOf(column)) { line.add(dataInColumns.indexOf(column), col.get(newTable.indexOf(line))); } else { line.add(col.get(newTable.indexOf(line))); } } } else { incompleteRowColumns.add(column); // gonna be mapped } } for (Column2 column : incompleteRowColumns) { ArrayList<Cell> cells = column.getCellObjects(); for (Cell cellOfNewColumn : cells) { System.out.println(cellOfNewColumn); boolean breaking = false; Cells: for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { if (cell.getY1() >= cellOfNewColumn .getY2()) { // ||(CommonMethods.calcDistance(cell.getY2(), // cellOfNewColumn.getY1())>(averageLineDistance*verticalThresholdModifier))&&line.indexOf(cellOfNewColumn)!=0) System.out.println( newTable.indexOf(line) + " " + cell + " " + cellOfNewColumn + " " + newTable.size() + " " + dataInColumns.indexOf(column) + " " + line.size()); if (newTable.indexOf(line) != 0) { System.out.println( newTable.get(newTable.indexOf(line) - 
1).size() + " " + dataInColumns.indexOf(column)); if (newTable.get(newTable.indexOf(line) - 1).size() >= dataInColumns.indexOf(column)) { newTable .get(newTable.indexOf(line) - 1) .add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; break Cells; } else { newTable.get(newTable.indexOf(line) - 1).add(cellOfNewColumn); breaking = true; break Cells; } } else { newTable.add(0, new ArrayList<Cell>()); newTable.get(0).add(cellOfNewColumn); breaking = true; break Cells; } } } } if (!breaking) { System.out.println("wasn't breaking. Now it is!"); System.out.println( newTable.size() + " " + dataInColumns.indexOf(column) + " " + newTable.get(newTable.size() - 1).size()); System.out.println(dataInColumns.indexOf(column) + " " + (newTable.size() - 1)); if (dataInColumns.indexOf(column) <= newTable.get(newTable.size() - 1).size() - 1) { newTable.get(newTable.size() - 1).add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; } else { newTable.get(newTable.size() - 1).add(cellOfNewColumn); breaking = true; } } } } // TODO: Implement missing data by looking at the touchcolumn of the rowspan and add all the // columns from data in columns as empty cells. 
double totalY2 = 0; double averageY2; for (Line rowSpanner : semanticFramework.getValidatedRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } averageY2 = totalY2 / line.size(); if (rowSpanner.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); newTable.get(newTable.indexOf(line) - 1).add(rowSpanner.getCellObject()); break; } } } for (Line subHeader : semanticFramework.getRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } int firstMappedColumn = 0; averageY2 = totalY2 / line.size(); if (subHeader.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); for (Column2 column : dataInColumns) { newTable.get(newTable.indexOf(line) - 1).add(new Cell()); if (column.touchesColumn(subHeader.getFirstX1(), subHeader.getLastX2())) { firstMappedColumn = dataInColumns.indexOf(column); } } newTable .get(newTable.indexOf(line) - 1) .set(firstMappedColumn, subHeader.getCellObject()); break; } } } System.out.println("NEWTABLE:"); System.out.println(newTable); return newTable; }