/**
 * Calculates the X1, X2, Y1 and Y2 boundaries of the table and stores them in the corresponding
 * fields.
 *
 * <p>Y1 is read from the first entry of the title, Y2 is the largest Y2 boundary found over all
 * columns, X1 is the X1 boundary of the first column and X2 is the X2 boundary of the last
 * column.
 *
 * @param semanticFramework The semantic framework object whose first title entry supplies the Y1
 *     boundary.
 */
private void setTableBoundaries(SemanticFramework semanticFramework) {
  this.minY1 = semanticFramework.getTitle().get(0).getLowestY1();

  int highestY2 = Integer.MIN_VALUE;
  for (Column2 column : dataInColumns) {
    highestY2 = Math.max(highestY2, column.getColumnBoundaryY2());
  }
  // Store the maximum over all columns, not the Y2 of whichever column was visited last.
  this.maxY2 = highestY2;

  this.minX1 = dataInColumns.get(0).getColumnBoundaryX1();
  this.maxX2 = dataInColumns.get(dataInColumns.size() - 1).getColumnBoundaryX2();
}
/** * This is the new write method that uses XML methods to generate an XML file. * * @param location The location where the new file should be stored. * @param file The file that was being used to create this table. * @param tableID The ID of the table. * @param semanticFramework The semantic framework object. This contains get methods that are * required for the output File. * @throws IOException If the given location doesn't exist. */ private void write2(String location, File file, int tableID, SemanticFramework semanticFramework) throws IOException { try { DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); Document doc = docBuilder.newDocument(); org.w3c.dom.Element rootElement = doc.createElement("TEAFile"); doc.appendChild(rootElement); // provenance: org.w3c.dom.Element provenance = doc.createElement("provenance"); rootElement.appendChild(provenance); org.w3c.dom.Element detectionID = doc.createElement("DetectionID"); detectionID.appendChild(doc.createTextNode(tableID + "")); provenance.appendChild(detectionID); org.w3c.dom.Element fromFile = doc.createElement("fromFile"); fromFile.appendChild(doc.createTextNode(file.getName())); provenance.appendChild(fromFile); org.w3c.dom.Element fromPath = doc.createElement("fromPath"); fromPath.appendChild(doc.createTextNode(file.getAbsolutePath())); provenance.appendChild(fromPath); org.w3c.dom.Element fromPage = doc.createElement("fromPage"); fromPage.appendChild(doc.createTextNode(Integer.toString(pageNumber))); provenance.appendChild(fromPage); org.w3c.dom.Element horizontalThresholdModifier = doc.createElement("horizontalThresholdModifier"); horizontalThresholdModifier.appendChild( doc.createTextNode(this.horizontalThresholdModifier + "")); provenance.appendChild(horizontalThresholdModifier); org.w3c.dom.Element verticalThresholdModifier = doc.createElement("horizontalThresholdModifier"); verticalThresholdModifier.appendChild( 
doc.createTextNode(this.verticalThresholdModifier + "")); provenance.appendChild(verticalThresholdModifier); // Results: org.w3c.dom.Element results = doc.createElement("results"); rootElement.appendChild(results); org.w3c.dom.Element minX1 = doc.createElement("TableBoundaryX1"); minX1.appendChild(doc.createTextNode(this.minX1 + "")); results.appendChild(minX1); org.w3c.dom.Element maxX2 = doc.createElement("TableBoundaryX2"); maxX2.appendChild(doc.createTextNode(this.maxX2 + "")); results.appendChild(maxX2); org.w3c.dom.Element minY1 = doc.createElement("TableBoundaryY1"); minY1.appendChild(doc.createTextNode(this.minY1 + "")); results.appendChild(minY1); org.w3c.dom.Element maxY2 = doc.createElement("TableBoundaryY2"); maxY2.appendChild(doc.createTextNode(this.maxY2 + "")); results.appendChild(maxY2); org.w3c.dom.Element title1 = doc.createElement("title1"); title1.appendChild(doc.createTextNode(name)); results.appendChild(title1); org.w3c.dom.Element title2 = doc.createElement("title2"); title2.appendChild(doc.createTextNode(titleAndHeaders.toString())); results.appendChild(title2); org.w3c.dom.Element columns = doc.createElement("columns"); results.appendChild(columns); for (Column2 columnContent : dataInColumns) { org.w3c.dom.Element column = doc.createElement("column"); column.appendChild(doc.createTextNode(columnContent.toString().replace("�", ""))); columns.appendChild(column); } org.w3c.dom.Element lines = doc.createElement("lines"); results.appendChild(lines); ArrayList<ArrayList<Cell>> table = recreateTableLines(semanticFramework); for (ArrayList<Cell> line : table) { org.w3c.dom.Element XMLLine = doc.createElement("line"); XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", ""))); lines.appendChild(XMLLine); } /*System.out.println("table:" + table); for(Line line : data){ if(line.getHighestY2()<=this.maxY2&&line.getLowestY1()>=this.minY1&&line.getClusterSize()>1){ org.w3c.dom.Element XMLLine = doc.createElement("line"); 
XMLLine.appendChild(doc.createTextNode(line.toString().replace("�", ""))); lines.appendChild(XMLLine); } else{ System.out.println("NO GOOD: " + line); } } */ if (rowSpanners.size() > 0) { org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners"); results.appendChild(rowSpanners); for (Line line : this.rowSpanners) { org.w3c.dom.Element rowSpanner = doc.createElement("rowSpanner"); rowSpanner.appendChild(doc.createTextNode(line.toString())); rowSpanners.appendChild(rowSpanner); } } // Semantics: org.w3c.dom.Element semantics = doc.createElement("tableSemantics"); rootElement.appendChild(semantics); org.w3c.dom.Element title = doc.createElement("title"); title.appendChild(doc.createTextNode(semanticFramework.getTitle().toString())); semantics.appendChild(title); org.w3c.dom.Element titleConfidence = doc.createElement("titleConfidence"); Double semanticFrameworkDouble = semanticFramework.getTitleConfidence(); titleConfidence.appendChild(doc.createTextNode(semanticFrameworkDouble.toString())); semantics.appendChild(titleConfidence); if (!semanticFramework.getRowSpanners().isEmpty()) { org.w3c.dom.Element rowSpanners = doc.createElement("subHeaders"); rowSpanners.appendChild(doc.createTextNode(semanticFramework.getRowSpanners().toString())); semantics.appendChild(rowSpanners); org.w3c.dom.Element IdentifiersConfidenceAlignment = doc.createElement("subHeadersConfidenceAlignment"); IdentifiersConfidenceAlignment.appendChild( doc.createTextNode(semanticFramework.getIdentifiersConfidenceAlignment().toString())); semantics.appendChild(IdentifiersConfidenceAlignment); org.w3c.dom.Element getIdentifiersConfidenceColumnsSpanned = doc.createElement("subHeadersConfidenceColumnsSpanned"); getIdentifiersConfidenceColumnsSpanned.appendChild( doc.createTextNode( semanticFramework.getIdentifiersConfidenceColumnsSpanned().toString())); semantics.appendChild(getIdentifiersConfidenceColumnsSpanned); org.w3c.dom.Element IdentifiersConfidenceLineDistance = 
doc.createElement("subHeadersConfidenceLineDistance"); IdentifiersConfidenceLineDistance.appendChild( doc.createTextNode( semanticFramework.getIdentifiersConfidenceLineDistance().toString())); semantics.appendChild(IdentifiersConfidenceLineDistance); } if (!semanticFramework.getValidatedRowSpanners().isEmpty()) { org.w3c.dom.Element rowSpanners = doc.createElement("rowSpanners"); rowSpanners.appendChild( doc.createTextNode(semanticFramework.getValidatedRowSpanners().toString())); semantics.appendChild(rowSpanners); org.w3c.dom.Element rowSpannersConfidenceAlignment = doc.createElement("rowSpannersConfidenceAlignment"); rowSpannersConfidenceAlignment.appendChild( doc.createTextNode(semanticFramework.getRowSpannersConfidenceAlignment().toString())); semantics.appendChild(rowSpannersConfidenceAlignment); org.w3c.dom.Element rowSpannersConfidenceColumnsSpanned = doc.createElement("rowSpannersConfidenceColumnsSpanned"); rowSpannersConfidenceColumnsSpanned.appendChild( doc.createTextNode( semanticFramework.getRowSpannersConfidenceColumnsSpanned().toString())); semantics.appendChild(rowSpannersConfidenceColumnsSpanned); org.w3c.dom.Element rowSpannersConfidenceLineDistance = doc.createElement("rowSpannersConfidenceLineDistance"); rowSpannersConfidenceLineDistance.appendChild( doc.createTextNode( semanticFramework.getRowSpannersConfidenceLineDistance().toString())); semantics.appendChild(rowSpannersConfidenceLineDistance); } org.w3c.dom.Element headers = doc.createElement("headers"); headers.appendChild(doc.createTextNode(semanticFramework.getHeaders().toString())); semantics.appendChild(headers); org.w3c.dom.Element headersConfidence = doc.createElement("headersConfidence"); headersConfidence.appendChild( doc.createTextNode(semanticFramework.getHeaderConfidence().toString())); semantics.appendChild(headersConfidence); // validation: org.w3c.dom.Element validation = doc.createElement("validation"); rootElement.appendChild(validation); org.w3c.dom.Element clusterCertainty = 
doc.createElement("columnConfidence"); clusterCertainty.appendChild( doc.createTextNode(this.validation.getClusterCertainty().toString())); validation.appendChild(clusterCertainty); org.w3c.dom.Element mostFrequentNumberOfClusters = doc.createElement("mostFrequentNumberOfClusters"); mostFrequentNumberOfClusters.appendChild( doc.createTextNode(this.validation.getMostFrequentNumberOfClusters() + "")); validation.appendChild(mostFrequentNumberOfClusters); org.w3c.dom.Element highestAmountOfClusters = doc.createElement("highestAmountOfClusters"); highestAmountOfClusters.appendChild( doc.createTextNode(this.validation.getHighestAmountOfClusters() + "")); validation.appendChild(highestAmountOfClusters); org.w3c.dom.Element highestAmountOfClustersOccurrences = doc.createElement("highestAmountOfClustersOccurrences"); highestAmountOfClustersOccurrences.appendChild( doc.createTextNode(this.validation.getHighestAmountOfClustersOccurrences() + "")); validation.appendChild(highestAmountOfClustersOccurrences); org.w3c.dom.Element clusterThreshold = doc.createElement("clusterThreshold"); clusterThreshold.appendChild(doc.createTextNode(this.validation.getLineThreshold() + "")); validation.appendChild(clusterThreshold); org.w3c.dom.Element cellsWithMissingDataAdded = doc.createElement("cellsWithMissingDataAdded"); cellsWithMissingDataAdded.appendChild( doc.createTextNode(this.validation.getCellsWithMissingDataAdded() + "")); validation.appendChild(cellsWithMissingDataAdded); if (this.validation.getCellsWithMissingDataAdded() > 0) { org.w3c.dom.Element cellsWithMissingDataAddedScores = doc.createElement("cellsWithMissingDataAddedScores"); cellsWithMissingDataAddedScores.appendChild( doc.createTextNode(this.validation.getCellsWithMissingDataAddedObjects() + "")); validation.appendChild(cellsWithMissingDataAddedScores); } org.w3c.dom.Element averageDistanceBetweenRows = doc.createElement("averageDistanceBetweenRows"); averageDistanceBetweenRows.appendChild( 
doc.createTextNode(this.validation.getAverageDistanceBetweenRows() + "")); validation.appendChild(averageDistanceBetweenRows); if (this.validation.getTitleConfidence().size() > 0) { org.w3c.dom.Element TitleConfidence = doc.createElement("TitleConfidence"); TitleConfidence.appendChild(doc.createTextNode(this.validation.getTitleConfidence() + "")); validation.appendChild(TitleConfidence); } org.w3c.dom.Element falsePositive = doc.createElement("falsePositive"); falsePositive.appendChild(doc.createTextNode(this.validation.getFalsePositive() + "")); validation.appendChild(falsePositive); TransformerFactory transformerFactory = TransformerFactory.newInstance(); Transformer transformer = transformerFactory.newTransformer(); DOMSource source = new DOMSource(doc); LOGGER.info( "Written file: " + location + "\\results\\" + file.getName().substring(0, file.getName().length() - 5) + "-" + tableID + ".xml"); File file2 = new File( location + "\\results\\" + file.getName().substring(0, file.getName().length() - 5) + "-" + tableID + ".xml"); Writer output = new BufferedWriter(new FileWriter(file2)); StreamResult result = new StreamResult(output); // Output to console for testing transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.transform(source, result); output.close(); System.out.println("File saved."); } catch (ParserConfigurationException pce) { pce.printStackTrace(); } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerException e) { e.printStackTrace(); } }
// This method will try to recreate the original lines of the table using all the previous // information about the table. private ArrayList<ArrayList<Cell>> recreateTableLines(SemanticFramework semanticFramework) { ArrayList<ArrayList<Cell>> table = new ArrayList<ArrayList<Cell>>(); for (Column2 column : dataInColumns) { ArrayList<Cell> cells = column.getCellObjects(); table.add(cells); } int maxLength = Integer.MIN_VALUE; for (ArrayList<Cell> column : table) { int length = column.size(); if (length > maxLength) { maxLength = length; } } ArrayList<ArrayList<Cell>> newTable = new ArrayList<ArrayList<Cell>>(); ArrayList<Column2> incompleteRowColumns = new ArrayList<Column2>(); ArrayList<Cell> col = new ArrayList<Cell>(); int counter = 0; while (counter < maxLength) { ArrayList<Cell> line = new ArrayList<Cell>(); newTable.add(line); counter++; } for (Column2 column : dataInColumns) { if (column.getCellObjects().size() == maxLength) { col = column.getCellObjects(); for (ArrayList<Cell> line : newTable) { if (line.size() >= dataInColumns.indexOf(column)) { line.add(dataInColumns.indexOf(column), col.get(newTable.indexOf(line))); } else { line.add(col.get(newTable.indexOf(line))); } } } else { incompleteRowColumns.add(column); // gonna be mapped } } for (Column2 column : incompleteRowColumns) { ArrayList<Cell> cells = column.getCellObjects(); for (Cell cellOfNewColumn : cells) { System.out.println(cellOfNewColumn); boolean breaking = false; Cells: for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { if (cell.getY1() >= cellOfNewColumn .getY2()) { // ||(CommonMethods.calcDistance(cell.getY2(), // cellOfNewColumn.getY1())>(averageLineDistance*verticalThresholdModifier))&&line.indexOf(cellOfNewColumn)!=0) System.out.println( newTable.indexOf(line) + " " + cell + " " + cellOfNewColumn + " " + newTable.size() + " " + dataInColumns.indexOf(column) + " " + line.size()); if (newTable.indexOf(line) != 0) { System.out.println( newTable.get(newTable.indexOf(line) - 
1).size() + " " + dataInColumns.indexOf(column)); if (newTable.get(newTable.indexOf(line) - 1).size() >= dataInColumns.indexOf(column)) { newTable .get(newTable.indexOf(line) - 1) .add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; break Cells; } else { newTable.get(newTable.indexOf(line) - 1).add(cellOfNewColumn); breaking = true; break Cells; } } else { newTable.add(0, new ArrayList<Cell>()); newTable.get(0).add(cellOfNewColumn); breaking = true; break Cells; } } } } if (!breaking) { System.out.println("wasn't breaking. Now it is!"); System.out.println( newTable.size() + " " + dataInColumns.indexOf(column) + " " + newTable.get(newTable.size() - 1).size()); System.out.println(dataInColumns.indexOf(column) + " " + (newTable.size() - 1)); if (dataInColumns.indexOf(column) <= newTable.get(newTable.size() - 1).size() - 1) { newTable.get(newTable.size() - 1).add(dataInColumns.indexOf(column), cellOfNewColumn); breaking = true; } else { newTable.get(newTable.size() - 1).add(cellOfNewColumn); breaking = true; } } } } // TODO: Implement missing data by looking at the touchcolumn of the rowspan and add all the // columns from data in columns as empty cells. 
double totalY2 = 0; double averageY2; for (Line rowSpanner : semanticFramework.getValidatedRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } averageY2 = totalY2 / line.size(); if (rowSpanner.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); newTable.get(newTable.indexOf(line) - 1).add(rowSpanner.getCellObject()); break; } } } for (Line subHeader : semanticFramework.getRowSpanners()) { for (ArrayList<Cell> line : newTable) { for (Cell cell : line) { totalY2 = totalY2 + cell.getY2(); } int firstMappedColumn = 0; averageY2 = totalY2 / line.size(); if (subHeader.getAverageY1() < averageY2) { newTable.add(newTable.indexOf(line), new ArrayList<Cell>()); for (Column2 column : dataInColumns) { newTable.get(newTable.indexOf(line) - 1).add(new Cell()); if (column.touchesColumn(subHeader.getFirstX1(), subHeader.getLastX2())) { firstMappedColumn = dataInColumns.indexOf(column); } } newTable .get(newTable.indexOf(line) - 1) .set(firstMappedColumn, subHeader.getCellObject()); break; } } } System.out.println("NEWTABLE:"); System.out.println(newTable); return newTable; }