public static void analyze_accuracy_NHBS(int rng_seed) throws Exception { HashMap<String, Object> population_params = load_defaults(null); RawLoader rl = new RawLoader(population_params, true, false, rng_seed); List<DrugUser> learningData = rl.getLearningData(); Instances nhbs_data = new Instances("learning_instances", DrugUser.getAttInfo(), learningData.size()); for (DrugUser du : learningData) { nhbs_data.add(du.getInstance()); } System.out.println(nhbs_data.toSummaryString()); nhbs_data.setClass(DrugUser.getAttribMap().get("hcv_state")); // wishlist: remove infrequent values // weka.filters.unsupervised.instance.RemoveFrequentValues() Filter f1 = new RemoveUseless(); f1.setInputFormat(nhbs_data); nhbs_data = Filter.useFilter(nhbs_data, f1); System.out.println("NHBS IDU 2009 Dataset"); System.out.println("Summary of input:"); // System.out.printlnnhbs_data.toSummaryString()); System.out.println(" Num of classes: " + nhbs_data.numClasses()); System.out.println(" Num of attributes: " + nhbs_data.numAttributes()); for (int idx = 0; idx < nhbs_data.numAttributes(); ++idx) { Attribute attr = nhbs_data.attribute(idx); System.out.println("" + idx + ": " + attr.toString()); System.out.println(" distinct values:" + nhbs_data.numDistinctValues(idx)); // System.out.println("" + attr.enumerateValues()); } ArrayList<String> options = new ArrayList<String>(); options.add("-Q"); options.add("" + rng_seed); // System.exit(0); // nhbs_data.deleteAttributeAt(0); //response ID // nhbs_data.deleteAttributeAt(16); //zip // Classifier classifier = new NNge(); //best nearest-neighbor classifier: 40.00 // ROC=0.60 // Classifier classifier = new MINND(); // Classifier classifier = new CitationKNN(); // Classifier classifier = new LibSVM(); //requires LibSVM classes. only gets 37.7% // Classifier classifier = new SMOreg(); Classifier classifier = new Logistic(); // ROC=0.686 // Classifier classifier = new LinearNNSearch(); // LinearRegression: Cannot handle multi-valued nominal class! // Classifier classifier = new LinearRegression(); // Classifier classifier = new RandomForest(); // String[] options = {"-I", "100", "-K", "4"}; //-I trees, -K features per tree. generally, // might want to optimize (or not // https://cwiki.apache.org/confluence/display/MAHOUT/Random+Forests) // options.add("-I"); options.add("100"); options.add("-K"); options.add("4"); // ROC=0.673 // KStar classifier = new KStar(); // classifier.setGlobalBlend(20); //the amount of not greedy, in percent // ROC=0.633 // Classifier classifier = new AdaBoostM1(); // ROC=0.66 // Classifier classifier = new MultiBoostAB(); // ROC=0.67 // Classifier classifier = new Stacking(); // ROC=0.495 // J48 classifier = new J48(); // new instance of tree //building a C45 tree classifier // ROC=0.585 // String[] options = new String[1]; // options[0] = "-U"; // unpruned tree // classifier.setOptions(options); // set the options classifier.setOptions((String[]) options.toArray(new String[0])); // not needed before CV: http://weka.wikispaces.com/Use+WEKA+in+your+Java+code // classifier.buildClassifier(nhbs_data); // build classifier // evaluation Evaluation eval = new Evaluation(nhbs_data); eval.crossValidateModel(classifier, nhbs_data, 10, new Random(1)); // 10-fold cross validation System.out.println(eval.toSummaryString("\nResults\n\n", false)); System.out.println(eval.toClassDetailsString()); // System.out.println(eval.toCumulativeMarginDistributionString()); }
public void processFolder(File folder) throws Exception { if (!folder.isDirectory()) { // manipulate file here String fileName = folder.getName(); System.out.println(fileName); // String extension = getFileExtension(fileName); testdata = new Instances(new BufferedReader(new FileReader(folder))); if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) { CSVLoader loader = new CSVLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".txt")) { TextDirectoryLoader loader = new TextDirectoryLoader(); System.out.println("About to load text file " + fileName); System.out.println("Name of path " + folder.getAbsolutePath()); loader.setSource(folder); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".json")) { JSONLoader loader = new JSONLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) { XRFFLoader loader = new XRFFLoader(); loader.setSource(new File(folder.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".arff")) { traindata = new Instances(new BufferedReader(new FileReader(folder.getAbsolutePath()))); testdata = new Instances(new BufferedReader(new FileReader(folder))); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) { DatabaseConnection loader = new DatabaseConnection(); loader.connectToDatabase(); InstanceQuery query = new InstanceQuery(); query.setUsername("lamogha"); query.setPassword("l@mmyPHD"); query.setQuery("select * from customers"); // You can declare that your data set is sparse // query.setSparseData(true); Instances data = query.retrieveInstances(); System.out.println(data.toSummaryString()); this.chooseClassifier(); } } else { for (final File fileEntry : folder.listFiles()) { if (fileEntry.isDirectory()) { this.processFolder(fileEntry); } else { // manipulate file here String fileName = fileEntry.getName(); System.out.println(fileName); if (!fileName.startsWith(".") && (fileName.contains(".csv") || fileName.contains(".xls"))) { CSVLoader loader = new CSVLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".txt")) { TextDirectoryLoader loader = new TextDirectoryLoader(); System.out.println("About to load text file " + fileName); System.out.println("Name of path " + fileEntry.getAbsolutePath()); loader.setSource(folder); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".json")) { JSONLoader loader = new JSONLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".xrff")) { XRFFLoader loader = new XRFFLoader(); loader.setSource(new File(fileEntry.getAbsolutePath())); traindata = loader.getDataSet(); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".")) { traindata = new Instances(new BufferedReader(new FileReader(fileEntry.getAbsolutePath()))); System.out.println(traindata.toSummaryString()); this.chooseClassifier(); } else if (!fileName.startsWith(".") && fileName.contains(".mdf")) { DatabaseConnection loader = new DatabaseConnection(); loader.connectToDatabase(); InstanceQuery query = new InstanceQuery(); query.setUsername("lamogha"); query.setPassword("l@mmyPHD"); query.setQuery("select * from customers"); // You can declare that your data set is sparse // query.setSparseData(true); Instances data = query.retrieveInstances(); System.out.println(data.toSummaryString()); this.chooseClassifier(); } } } // System.exit(0); } }