public Instances getInstances(List<ImageData> data) {
    CSVLoader loader = new CSVLoader();
    Instances instances;
    try {
        // Create a temp csv file
        tempFile = new File("tmp");
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(tempFile);
        } catch (FileNotFoundException e) {
            throw new Error(e);
        }
        // Write the header: one numbered column per feature, then the class column
        for (int i = 0; i < Reader.featureSize; i++) {
            pw.print(i + ",");
        }
        pw.println("class");
        // Write one row per image: its feature values followed by its class label
        for (int i = 0; i < data.size(); i++) {
            List<Double> features = data.get(i).getFeatures();
            for (int j = 0; j < features.size(); j++) {
                pw.print(features.get(j) + ",");
            }
            pw.println(data.get(i).getClassType());
        }
        pw.close(); // flush and release the file before the loader reads it
        // Load the instances from the temp csv file
        loader.setSource(tempFile);
        instances = loader.getDataSet();
        instances.setClass(instances.attribute("class"));
        return instances;
    } catch (IOException e) {
        throw new Error(e);
    } finally {
        if (tempFile != null) {
            tempFile.delete();
            tempFile = null;
        }
    }
}
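// A minimal alternative sketch (not part of the original example): the same CSV
// round trip using File.createTempFile and try-with-resources, so the temp file
// gets a unique name and the writer is always closed. The method name
// getInstancesViaTempFile is made up for illustration; ImageData and
// Reader.featureSize are taken from the example above.
public Instances getInstancesViaTempFile(List<ImageData> data) throws IOException {
    File tempFile = File.createTempFile("features", ".csv");
    try {
        try (PrintWriter pw = new PrintWriter(tempFile)) {
            // header: one numbered column per feature plus the class column
            for (int i = 0; i < Reader.featureSize; i++) {
                pw.print(i + ",");
            }
            pw.println("class");
            for (ImageData d : data) {
                for (Double feature : d.getFeatures()) {
                    pw.print(feature + ",");
                }
                pw.println(d.getClassType());
            }
        }
        CSVLoader loader = new CSVLoader();
        loader.setSource(tempFile);
        Instances instances = loader.getDataSet();
        instances.setClass(instances.attribute("class"));
        return instances;
    } finally {
        tempFile.delete();
    }
}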
/**
 * Loads results from a set of instances contained in the supplied file.
 *
 * @param f a value of type 'File'
 */
protected void setInstancesFromFile(File f) {
    String fileType = f.getName();
    try {
        m_FromLab.setText("Reading from file...");
        if (f.getName().toLowerCase().endsWith(Instances.FILE_EXTENSION)) {
            fileType = "arff";
            Reader r = new BufferedReader(new FileReader(f));
            setInstances(new Instances(r));
            r.close();
        } else if (f.getName().toLowerCase().endsWith(CSVLoader.FILE_EXTENSION)) {
            fileType = "csv";
            CSVLoader cnv = new CSVLoader();
            cnv.setSource(f);
            Instances inst = cnv.getDataSet();
            setInstances(inst);
        } else {
            throw new Exception("Unrecognized file type");
        }
    } catch (Exception ex) {
        m_FromLab.setText("File '" + f.getName() + "' not recognised as an " + fileType + " file.");
        if (JOptionPane.showOptionDialog(
                ResultsPanel.this,
                "File '" + f.getName() + "' not recognised as an " + fileType + " file.\n"
                    + "Reason:\n" + ex.getMessage(),
                "Load Instances",
                0,
                JOptionPane.ERROR_MESSAGE,
                null,
                new String[] {"OK"},
                null) == 1) {
        }
    }
}
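// The extension dispatch above can also be left to Weka itself. A sketch
// (not part of the original example) using
// weka.core.converters.ConverterUtils.DataSource, which picks an appropriate
// loader (ARFF, CSV, ...) from the file extension; setInstances is the same
// panel method used above.
protected void setInstancesFromFileViaDataSource(File f) throws Exception {
    ConverterUtils.DataSource source = new ConverterUtils.DataSource(f.getAbsolutePath());
    setInstances(source.getDataSet());
}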
public static void test_NHBS_old() throws Exception {
    // load the data
    CSVLoader loader = new CSVLoader();
    // these must come before the getDataSet()
    // loader.setEnclosureCharacters(",\'\"S");
    // loader.setNominalAttributes("16,71"); // zip code, drug name
    // loader.setStringAttributes("");
    // loader.setDateAttributes("0,1");
    // loader.setSource(new File("hcv/data/NHBS/IDU2_HCV_model_012913_cleaned_for_weka.csv"));
    loader.setSource(new File("/home/sasha/hcv/code/data/IDU2_HCV_model_012913_cleaned.csv"));
    loader.setMissingValue("NOVALUE"); // must be set before getDataSet(), see above
    // loader.setMissingValue("");
    Instances nhbs_data = loader.getDataSet();
    nhbs_data.deleteAttributeAt(12); // zip code
    nhbs_data.deleteAttributeAt(1); // date - redundant with age
    nhbs_data.deleteAttributeAt(0); // date
    System.out.println("classifying attribute:");
    nhbs_data.setClassIndex(1); // new index 3->2->1
    nhbs_data.attribute(1).getMetadata().toString(); // HCVEIARSLT1
    // wishlist: perhaps it would be smarter to throw out unclassified instances? they
    // interfere with the scoring
    nhbs_data.deleteWithMissingClass();
    // nhbs_data.setClass(new Attribute("HIVRSLT")); //.setClassIndex(1); // 2nd column, mostly negative
    // nhbs_data.setClass(new Attribute("HCVEIARSLT1")); //.setClassIndex(2); // 3rd column
    // #14, i.e. rds_fem, should be made numeric

    System.out.println("NHBS IDU 2009 Dataset");
    System.out.println("Summary of input:");
    // System.out.println(nhbs_data.toSummaryString());
    System.out.println(" Num of classes: " + nhbs_data.numClasses());
    System.out.println(" Num of attributes: " + nhbs_data.numAttributes());
    for (int idx = 0; idx < nhbs_data.numAttributes(); ++idx) {
        Attribute attr = nhbs_data.attribute(idx);
        System.out.println("" + idx + ": " + attr.toString());
        System.out.println("  distinct values: " + nhbs_data.numDistinctValues(idx));
        // System.out.println("" + attr.enumerateValues());
    }
    // System.exit(0);
    // nhbs_data.deleteAttributeAt(0); // response ID
    // nhbs_data.deleteAttributeAt(16); // zip

    // Classifier classifier = new NNge(); // best nearest-neighbor classifier: 40.00
    // Classifier classifier = new MINND();
    // Classifier classifier = new CitationKNN();
    // Classifier classifier = new LibSVM(); // requires LibSVM classes; only gets 37.7%
    // Classifier classifier = new SMOreg();
    // Classifier classifier = new LinearNNSearch();
    // LinearRegression: Cannot handle multi-valued nominal class!
    // Classifier classifier = new LinearRegression();
    Classifier classifier = new RandomForest();
    // -I trees, -K features per tree; generally, might want to optimize (or not:
    // https://cwiki.apache.org/confluence/display/MAHOUT/Random+Forests)
    String[] options = {"-I", "100", "-K", "4"};
    classifier.setOptions(options);
    // Classifier classifier = new Logistic();
    // KStar classifier = new KStar();
    // classifier.setGlobalBlend(20); // the amount of not-greedy, in percent; does poorly
    // Classifier classifier = new AdaBoostM1();
    // Classifier classifier = new MultiBoostAB();
    // Classifier classifier = new Stacking();

    // building a C45 tree classifier
    // J48 classifier = new J48(); // new instance of tree
    // String[] options = new String[1];
    // options[0] = "-U"; // unpruned tree
    // classifier.setOptions(options); // set the options
    // classifier.buildClassifier(nhbs_data); // build classifier

    // wishlist: remove infrequent values
    // weka.filters.unsupervised.instance.RemoveFrequentValues()
    Filter f1 = new RemoveUseless();
    f1.setInputFormat(nhbs_data);
    nhbs_data = Filter.useFilter(nhbs_data, f1);

    // evaluation
    Evaluation eval = new Evaluation(nhbs_data);
    eval.crossValidateModel(classifier, nhbs_data, 10, new Random(1));
    System.out.println(eval.toSummaryString("\nResults\n\n", false));
    System.out.println(eval.toClassDetailsString());
    // System.out.println(eval.toCumulativeMarginDistributionString());
}
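// A follow-on sketch (not in the original test): crossValidateModel only trains
// internal copies, so a deployable model still has to be built on the full data
// set and persisted. The method name trainAndSave and the model path are made up
// for illustration; SerializationHelper is weka.core.SerializationHelper.
public static void trainAndSave(Classifier classifier, Instances data, String path) throws Exception {
    classifier.buildClassifier(data); // train on the complete data set
    SerializationHelper.write(path, classifier); // e.g. "nhbs.model"
}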
public static void main(String[] args) {
    if (args.length < 1) {
        System.out.println("usage: C4_5TweetTopicCategorization <root_path>");
        System.exit(-1);
    }
    String rootPath = args[0];
    File dataFolder = new File(rootPath + "/data");
    String resultFolderPath = rootPath + "/results/C4_5/";

    CrisisMailer crisisMailer = CrisisMailer.getCrisisMailer();
    Logger logger = Logger.getLogger(C4_5TweetTopicCategorization.class);
    PropertyConfigurator.configure(Constants.LOG4J_PROPERTIES_FILE_PATH);

    File resultFolder = new File(resultFolderPath);
    if (!resultFolder.exists()) {
        resultFolder.mkdir();
    }

    CSVLoader csvLoader = new CSVLoader();
    try {
        for (File dataSetName : dataFolder.listFiles()) {
            Instances data = null;
            try {
                csvLoader.setSource(dataSetName);
                csvLoader.setStringAttributes("2");
                data = csvLoader.getDataSet();
            } catch (IOException ioe) {
                logger.error(ioe);
                crisisMailer.sendEmailAlert(ioe);
                System.exit(-1);
            }

            data.setClassIndex(data.numAttributes() - 1);
            data.deleteWithMissingClass();

            Instances vectorizedData = null;
            StringToWordVector stringToWordVectorFilter = new StringToWordVector();
            try {
                stringToWordVectorFilter.setInputFormat(data);
                stringToWordVectorFilter.setAttributeIndices("2");
                stringToWordVectorFilter.setIDFTransform(true);
                stringToWordVectorFilter.setLowerCaseTokens(true);
                stringToWordVectorFilter.setOutputWordCounts(false);
                stringToWordVectorFilter.setUseStoplist(true);
                vectorizedData = Filter.useFilter(data, stringToWordVectorFilter);
                vectorizedData.deleteAttributeAt(0);
                // System.out.println(vectorizedData);
            } catch (Exception exception) {
                logger.error(exception);
                crisisMailer.sendEmailAlert(exception);
                System.exit(-1);
            }

            J48 j48Classifier = new J48();
            /*
             * FilteredClassifier filteredClassifier = new FilteredClassifier();
             * filteredClassifier.setFilter(stringToWordVectorFilter);
             * filteredClassifier.setClassifier(j48Classifier);
             */

            try {
                Evaluation eval = new Evaluation(vectorizedData);
                eval.crossValidateModel(
                    j48Classifier, vectorizedData, 5, new Random(System.currentTimeMillis()));

                FileOutputStream resultOutputStream =
                    new FileOutputStream(new File(resultFolderPath + dataSetName.getName()));
                resultOutputStream.write(eval.toSummaryString("=== Summary ===", false).getBytes());
                resultOutputStream.write(eval.toMatrixString().getBytes());
                resultOutputStream.write(eval.toClassDetailsString().getBytes());
                resultOutputStream.close();
            } catch (Exception exception) {
                logger.error(exception);
                crisisMailer.sendEmailAlert(exception);
                System.exit(-1);
            }
        }
    } catch (Exception exception) {
        logger.error(exception);
        crisisMailer.sendEmailAlert(exception);
        System.exit(-1);
    }
}
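// The FilteredClassifier commented out above can replace the manual
// vectorization step. A sketch (not in the original): the StringToWordVector is
// re-fitted inside every cross-validation fold, so IDF statistics never leak
// from the held-out fold; the deleteAttributeAt(0) step of the original is not
// replicated here, and the fixed seed is chosen for reproducibility. The method
// name evaluateWithFilteredClassifier is made up.
static Evaluation evaluateWithFilteredClassifier(
        Instances data, StringToWordVector filter, J48 classifier) throws Exception {
    FilteredClassifier filteredClassifier = new FilteredClassifier();
    filteredClassifier.setFilter(filter);          // applied per training fold
    filteredClassifier.setClassifier(classifier);
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(filteredClassifier, data, 5, new Random(1));
    return eval;
}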
public static void main(String[] args) throws Exception {
    /*
     * First we load our predictions from the CSV formatted file.
     */
    CSVLoader predictCsvLoader = new CSVLoader();
    predictCsvLoader.setSource(new File("predict.csv"));

    /*
     * Since we are not using the ARFF format here, we have to give the
     * loader a little bit of information about the data types. Columns
     * 3,8,10 need to be of type string and columns 1,4,11 are nominal
     * types.
     */
    predictCsvLoader.setStringAttributes("3,8,10");
    predictCsvLoader.setNominalAttributes("1,4,11");
    Instances predictDataSet = predictCsvLoader.getDataSet();

    /*
     * Here we set the attribute we want to test the predictions with.
     */
    Attribute testAttribute = predictDataSet.attribute(0);
    predictDataSet.setClass(testAttribute);

    /*
     * We still have to remove all string attributes before we can test.
     */
    predictDataSet.deleteStringAttributes();

    /*
     * Next we load the training data from our ARFF file.
     */
    ArffLoader trainLoader = new ArffLoader();
    trainLoader.setSource(new File("train.arff"));
    trainLoader.setRetrieval(Loader.BATCH);
    Instances trainDataSet = trainLoader.getDataSet();

    /*
     * Now we tell the data set which attribute we want to classify, in our
     * case, we want to classify the first column: survived.
     */
    Attribute trainAttribute = trainDataSet.attribute(0);
    trainDataSet.setClass(trainAttribute);

    /*
     * The RandomForest implementation cannot handle columns of type string,
     * so we remove them for now.
     */
    trainDataSet.deleteStringAttributes();

    /*
     * Now we read in the serialized model from disk.
     */
    Classifier classifier = (Classifier) SerializationHelper.read("titanic.model");

    /*
     * Next we will use an Evaluation class to evaluate the performance of
     * our Classifier.
     */
    Evaluation evaluation = new Evaluation(trainDataSet);
    evaluation.evaluateModel(classifier, predictDataSet, new Object[] {});

    /*
     * After we evaluate the Classifier, we write out the summary
     * information to the screen.
     */
    System.out.println(classifier);
    System.out.println(evaluation.toSummaryString());
}
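// A per-instance variant (a sketch, not from the original example): instead of
// the aggregate Evaluation, classifyInstance prints one prediction per row of
// the prediction set. This assumes a nominal class attribute (here, survived);
// the method name printPredictions is made up.
public static void printPredictions(Classifier classifier, Instances dataSet) throws Exception {
    for (int i = 0; i < dataSet.numInstances(); i++) {
        double prediction = classifier.classifyInstance(dataSet.instance(i));
        // map the predicted index back to its nominal label
        System.out.println(i + ": " + dataSet.classAttribute().value((int) prediction));
    }
}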
public void processFolder(File folder) throws Exception {
    if (!folder.isDirectory()) {
        // manipulate file here
        String fileName = folder.getName();
        System.out.println(fileName);
        // String extension = getFileExtension(fileName);
        if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) {
            CSVLoader loader = new CSVLoader();
            loader.setSource(new File(folder.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
        } else if (!fileName.startsWith(".") && fileName.contains(".txt")) {
            TextDirectoryLoader loader = new TextDirectoryLoader();
            System.out.println("About to load text file " + fileName);
            System.out.println("Name of path " + folder.getAbsolutePath());
            loader.setSource(folder);
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
        } else if (!fileName.startsWith(".") && fileName.contains(".json")) {
            JSONLoader loader = new JSONLoader();
            loader.setSource(new File(folder.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
        } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) {
            XRFFLoader loader = new XRFFLoader();
            loader.setSource(new File(folder.getAbsolutePath()));
            traindata = loader.getDataSet();
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
        } else if (!fileName.startsWith(".") && fileName.contains(".arff")) {
            // only ARFF files can be read directly into Instances; other
            // formats would fail to parse here
            traindata = new Instances(new BufferedReader(new FileReader(folder.getAbsolutePath())));
            testdata = new Instances(new BufferedReader(new FileReader(folder)));
            System.out.println(traindata.toSummaryString());
            this.chooseClassifier();
        } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) {
            DatabaseConnection loader = new DatabaseConnection();
            loader.connectToDatabase();
            InstanceQuery query = new InstanceQuery();
            query.setUsername("lamogha");
            query.setPassword("l@mmyPHD");
            query.setQuery("select * from customers");
            // You can declare that your data set is sparse
            // query.setSparseData(true);
            Instances data = query.retrieveInstances();
            System.out.println(data.toSummaryString());
            this.chooseClassifier();
        }
    } else {
        for (final File fileEntry : folder.listFiles()) {
            if (fileEntry.isDirectory()) {
                this.processFolder(fileEntry);
            } else {
                // manipulate file here
                String fileName = fileEntry.getName();
                System.out.println(fileName);
                if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) {
                    CSVLoader loader = new CSVLoader();
                    loader.setSource(new File(fileEntry.getAbsolutePath()));
                    traindata = loader.getDataSet();
                    System.out.println(traindata.toSummaryString());
                    this.chooseClassifier();
                } else if (!fileName.startsWith(".") && fileName.contains(".txt")) {
                    TextDirectoryLoader loader = new TextDirectoryLoader();
                    System.out.println("About to load text file " + fileName);
                    System.out.println("Name of path " + fileEntry.getAbsolutePath());
                    loader.setSource(folder);
                    traindata = loader.getDataSet();
                    System.out.println(traindata.toSummaryString());
                    this.chooseClassifier();
                } else if (!fileName.startsWith(".") && fileName.contains(".json")) {
                    JSONLoader loader = new JSONLoader();
                    loader.setSource(new File(fileEntry.getAbsolutePath()));
                    traindata = loader.getDataSet();
                    System.out.println(traindata.toSummaryString());
                    this.chooseClassifier();
                } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) {
                    XRFFLoader loader = new XRFFLoader();
                    loader.setSource(new File(fileEntry.getAbsolutePath()));
                    traindata = loader.getDataSet();
                    System.out.println(traindata.toSummaryString());
                    this.chooseClassifier();
                } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) {
                    // this check must come before the catch-all branch below,
                    // or it would never be reached
                    DatabaseConnection loader = new DatabaseConnection();
                    loader.connectToDatabase();
                    InstanceQuery query = new InstanceQuery();
                    query.setUsername("lamogha");
                    query.setPassword("l@mmyPHD");
                    query.setQuery("select * from customers");
                    // You can declare that your data set is sparse
                    // query.setSparseData(true);
                    Instances data = query.retrieveInstances();
                    System.out.println(data.toSummaryString());
                    this.chooseClassifier();
                } else if (!fileName.startsWith(".")) {
                    // fall back to reading the file as ARFF
                    traindata = new Instances(new BufferedReader(new FileReader(fileEntry.getAbsolutePath())));
                    System.out.println(traindata.toSummaryString());
                    this.chooseClassifier();
                }
            }
        }
        // System.exit(0);
    }
}
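// The commented-out getFileExtension call above suggests dispatching on a real
// extension instead of fileName.contains(...), which also matches names like
// "data.csv.bak". A sketch of that helper (the original only references it in a
// comment, so this implementation is an assumption):
private static String getFileExtension(String fileName) {
    int dot = fileName.lastIndexOf('.');
    // empty string when there is no extension at all
    return (dot < 0) ? "" : fileName.substring(dot + 1).toLowerCase();
}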
public static Instances loadCSV(String fileName) throws IOException {
    CSVLoader algebraFeaturesCSVLoader = new CSVLoader();
    algebraFeaturesCSVLoader.setFile(new File(fileName));
    return algebraFeaturesCSVLoader.getDataSet();
}
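// A typical call site for the helper above (a sketch, not from the original):
// CSVLoader does not set a class attribute, so most Weka classifiers need one
// picked explicitly afterwards. Assuming here that the class is the last
// column; the method name loadCSVWithClass is made up.
public static Instances loadCSVWithClass(String fileName) throws IOException {
    Instances data = loadCSV(fileName);
    data.setClassIndex(data.numAttributes() - 1);
    return data;
}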