public static void generateTrainingDataFromFile( String fileLocation) // Requires that the original file had the metadata and requires that this // file is formated the same in first sheet { testDataLL = (LinkedList<String[]>) dataLL.clone(); actualClassifications = (LinkedList<String>) classificationsLL.clone(); FileInputStream file; try { file = new FileInputStream(new File(fileLocation)); Workbook excelFile = new HSSFWorkbook(file); Sheet sheet1 = excelFile.getSheetAt(0); // Data sheet for (Row row : sheet1) { String data[] = new String[row.getPhysicalNumberOfCells() - 1]; String classification = ""; int offset = 0; // Used so that we can declare an array of the size of the attributes without the // classification for (Cell cell : row) { int index = cell.getColumnIndex(); if (classificationLocation != index) { data[index - offset] = cell.toString(); } else { classification = cell.toString(); offset++; } } // Even though data and classifications are not really used add it onto the end so it is // still complete for in the event they end up being used in a later version dataLL.add(data); classificationsLL.add(classification); trainingDataLL.add(data); knownClassifications.add(classification); // Check to see if we have seen that classification yet int occurrences = 0; for (int i = 0; i < classificationTypes.size() && occurrences == 0; i++) { if (classificationTypes.get(i).compareTo(classification) == 0) { occurrences = 1; } } if (occurrences == 0) { classificationTypes.add(classification); } } excelFile.close(); } catch (FileNotFoundException e) { System.out.println("Error file not found"); System.exit(0); } catch (IOException e) { System.out.println("Unable to read file, disk drive may be failing"); e.printStackTrace(); System.exit(0); } }
public static void writeExcelFile(String fileLocation) { try { FileOutputStream fileOut = new FileOutputStream(fileLocation); HSSFWorkbook workbook = new HSSFWorkbook(); Font bold = workbook.createFont(); // Create font bold.setBoldweight(Font.BOLDWEIGHT_BOLD); // Make font bold CellStyle correctCell = workbook.createCellStyle(); correctCell.setFillForegroundColor(HSSFColor.GREEN.index); correctCell.setFillBackgroundColor(HSSFColor.GREEN.index); correctCell.setFillPattern(CellStyle.SOLID_FOREGROUND); CellStyle incorrectCell = workbook.createCellStyle(); incorrectCell.setFillForegroundColor(HSSFColor.RED.index); incorrectCell.setFillBackgroundColor(HSSFColor.RED.index); incorrectCell.setFillPattern(CellStyle.SOLID_FOREGROUND); CellStyle classificationCells = workbook.createCellStyle(); classificationCells.setFillForegroundColor(HSSFColor.YELLOW.index); classificationCells.setFillBackgroundColor(HSSFColor.YELLOW.index); classificationCells.setFillPattern(CellStyle.SOLID_FOREGROUND); CellStyle attributeNameCells = workbook.createCellStyle(); attributeNameCells.setFont(bold); CellStyle classificationAttributeCell = workbook.createCellStyle(); classificationAttributeCell.setFillForegroundColor(HSSFColor.YELLOW.index); classificationAttributeCell.setFillBackgroundColor(HSSFColor.YELLOW.index); classificationAttributeCell.setFillPattern(CellStyle.SOLID_FOREGROUND); classificationAttributeCell.setFont(bold); Sheet worksheet = workbook.createSheet("Results"); Row currRow = worksheet.createRow(0); for (int attribute = 0; attribute < metadataLL.size() + 1; attribute++) { Cell currCell = currRow.createCell(attribute); if (attribute < metadataLL.size()) { currCell.setCellValue(metadataLL.get(attribute)[0]); if (metadataLL.get(attribute)[2].compareTo("classifier") == 0) { currCell.setCellStyle(classificationAttributeCell); } else { currCell.setCellStyle(attributeNameCells); } } else { currCell.setCellValue("Guessed Classification"); currCell.setCellStyle(attributeNameCells); } } int correct = 0; int incorrect = 0; for (int node = 0; node < testDataLL.size(); node++) { currRow = worksheet.createRow(node + 1); // Offset by one since first row is header data int classifierCompleted = 0; // Necessary for if data does not end in classifier for (int attribute = 0; attribute < metadataLL.size() + 2; attribute++) // +1 for the row for guessed data +1 for the row that contains { Cell currCell = currRow.createCell(attribute); if (attribute < metadataLL.size()) // Print testingData { if (metadataLL.get(attribute)[2].compareTo("classifier") == 0) { currCell.setCellValue(actualClassifications.get(node)); currCell.setCellStyle(classificationCells); classifierCompleted++; } else { currCell.setCellValue(testDataLL.get(node)[attribute - classifierCompleted]); } } else if (attribute == metadataLL.size()) // Print guessed classification { currCell.setCellValue(guessedClassifications.get(node)); if (guessedClassifications.get(node).compareTo(actualClassifications.get(node)) == 0) { currCell.setCellStyle(correctCell); correct++; } else { currCell.setCellStyle(incorrectCell); incorrect++; } if (node == testDataLL.size() - 1) // If this is the last loop { double precentRight = (double) correct / (correct + incorrect); currRow = worksheet.createRow(node + 2); currCell = currRow.createCell(attribute); currCell.setCellValue(precentRight); if (precentRight > .90) { correctCell.setDataFormat(workbook.createDataFormat().getFormat("0.000%")); currCell.setCellStyle(correctCell); } else { incorrectCell.setDataFormat(workbook.createDataFormat().getFormat("0.000%")); currCell.setCellStyle(incorrectCell); } } } else if (attribute == metadataLL.size() + 1) // Print potential bad training data if the flag is true { if (unseenDataFlag.get(node)) { currCell.setCellValue( "This node an attribute value not in the training set, classifier selected is based on most frequent classifier. If laplacian smoothing is 1 or more this likely wont happen"); // TODO make this a bit shorter } } } } worksheet = workbook.createSheet("Training Data"); currRow = worksheet.createRow(0); for (int attribute = 0; attribute < metadataLL.size(); attribute++) { Cell currCell = currRow.createCell(attribute); currCell.setCellValue(metadataLL.get(attribute)[0]); currCell.setCellStyle(attributeNameCells); } for (int node = 0; node < trainingDataLL.size(); node++) { currRow = worksheet.createRow(node + 1); // Offset by one since first row is header data int classifierCompleted = 0; // Necessary for if data does not end in classifier for (int attribute = 0; attribute < metadataLL.size(); attribute++) { Cell currCell = currRow.createCell(attribute); if (metadataLL.get(attribute)[2].compareTo("classifier") == 0) { currCell.setCellValue(knownClassifications.get(node)); classifierCompleted++; } else { currCell.setCellValue(trainingDataLL.get(node)[attribute - classifierCompleted]); } } } worksheet = workbook.createSheet("Likelihood"); currRow = worksheet.createRow(0); int largestAttributeSize = 0; for (int attribute = 0; attribute < classifier.size(); attribute++) { if (classifier.get(attribute).size() > largestAttributeSize) { largestAttributeSize = classifier.get(attribute).size(); } } // Label attributes along the top for (int i = 0; i < metadataLL.size(); i++) { if (i == 0) { Cell currCell = currRow.createCell(i); // currCell.setCellValue("Attributes"); currCell.setCellStyle(attributeNameCells); } else { Cell currCell = currRow.createCell(i); currCell.setCellValue( metadataLL .get(i - 1)[0]); // -1 since the first cell does not contain a attribute name currCell.setCellStyle(attributeNameCells); } } // List possible classifications on the side and classification likelihoods at the end for (int i = 0; i < (largestAttributeSize * (classificationTypes.size() + 1) + classificationTypes.size() + 1); i++) // +1 since the first row of each stride lists each attributes string of what // occurrence the likelihoods are displaying { // +classificationTypes.size() so we can list the classification types likelihood at the end currRow = worksheet.createRow(i + 1); // +1 since first row is attribute names Cell currCell = currRow.createCell(0); int currentClassificationType = i % (classificationTypes.size() + 1); // +1 since the first row of each stride lists each attributes string of // what occurrence the likelihoods are displaying // List the classification type of each row along the side if (i < largestAttributeSize * (classificationTypes.size() + 1)) // +1 since the first row of each stride lists each attributes string of // what occurrence the likelihoods are displaying { for (int j = 0; j < classificationTypes.size() + 1; j++) // +1 since the first row of each stride lists each attributes string of what // occurrence the likelihoods are displaying { if (currentClassificationType == 0) { // Do nothing for now may have it say something later } else if (currentClassificationType == j) { currCell.setCellValue( classificationTypes.get( j - 1)); // -1 since the first cell does not contain a classification type } } } else // List the likelihood of each classification at the end { for (int j = 0; j < classificationTypes.size() + 1; j++) // +1 since the first row of each stride lists each attributes string of what // occurrence the likelihoods are displaying { if (currentClassificationType == 0) { // Do nothing for now may have it say value later } else if (currentClassificationType == j) { currCell.setCellValue( "Likelihood of: " + classificationTypes.get(j - 1) + " is " + classificationLikelihood.get( j - 1)); // -1 since the first cell does not contain a classification type } } } currCell.setCellStyle(attributeNameCells); } // List the data for (int attribute = 0; attribute < classifier.size(); attribute++) { for (int occurrences = 0; occurrences < classifier.get(attribute).size(); occurrences++) { for (int classification = 0; classification < classifier.get(attribute).get(occurrences).length; classification++) { currRow = worksheet.getRow( (occurrences * classifier.get(attribute).get(occurrences).length + classification) + 1); // +1 since first row is attribute names Cell currCell = currRow.createCell( (attribute) + 1); // TODO figure out why this errors out at i:0 j:4 k:0 // largestAttributeSize:105 on kidney dataset currCell.setCellValue(classifier.get(attribute).get(occurrences)[classification]); } } } workbook.write(fileOut); workbook.close(); workbook.close(); } catch (FileNotFoundException e) { System.out.println("Error file not found"); e.printStackTrace(); System.exit(0); } catch (IOException e) { System.out.println("Unable to output file, is the output destination writelocked?"); e.printStackTrace(); System.exit(0); } }
public static void readExcelFile(String fileName) { FileInputStream file; try { file = new FileInputStream(new File(fileName)); Workbook excelFile = new HSSFWorkbook(file); Sheet sheet1 = excelFile.getSheetAt(0); // Data sheet // Set just in case metadata is incomplete or malformed classificationLocation = sheet1.getRow(0).getPhysicalNumberOfCells() - 1; // Minus one since classificationLocation includes 0 and getPhysicalNumberOfCells // does not Sheet sheet2 = excelFile.getSheetAt(1); // Metadata sheet // Loop based on number of attribute names for (int i = 0; i < sheet2.getRow(0).getPhysicalNumberOfCells(); i++) { String[] metadata = new String[METADATASIZE]; // Construct metadata Row currRow = sheet2.getRow(0); // This should be a row of names metadata[0] = currRow.getCell(i).toString(); currRow = sheet2.getRow(1); // This should be a row of data types (discrete or continuous) metadata[1] = currRow.getCell(i).toString(); currRow = sheet2.getRow(2); // This should say which one is the classifier if (currRow.getCell(i) == null || currRow.getCell(i).getCellType() == Cell.CELL_TYPE_BLANK) { metadata[2] = "attribute"; } else { metadata[2] = "classifier"; classificationLocation = i; } metadataLL.add(metadata); } for (Row row : sheet1) { String data[] = new String[row.getPhysicalNumberOfCells() - 1]; int offset = 0; // Used so that we can declare an array of the size of the attributes without the // classification for (Cell cell : row) { int index = cell.getColumnIndex(); if (classificationLocation != index) { data[index - offset] = cell.toString(); } else { classificationsLL.add(cell.toString()); // Moved to generate training data so that we do not get possible classifications from // unknown data since some denote unknown by saying ? // //Check to see if we have seen it yet // // int occurrences = 0; // for(int i = 0; i < classificationTypes.size(); i++) // { // if(classificationTypes.get(i).compareTo(cell.toString()) == 0) // { // occurrences++; // } // } // if(occurrences == 0) // { // classificationTypes.add(cell.toString()); // } offset++; } } dataLL.add(data); // classCount = temp.length; } excelFile.close(); } catch (FileNotFoundException e) { System.out.println("Error file not found"); System.exit(0); } catch (IOException e) { System.out.println("Unable to read file, disk drive may be failing"); e.printStackTrace(); System.exit(0); } }