// Randomly prune the data set down to a subsample of m instances.
public void SubSample(Instances inst, int m) {
  inst.randomize(new Random());
  // guard with > so the loop terminates even if m exceeds the current size
  while (inst.numInstances() > m) {
    inst.delete(0);
  }
  // System.out.println("subsample:=" + inst.numInstances() + " m:=" + m);
}
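// Hypothetical usage sketch (not from the original source): the same subsampling can be done
// with WEKA's Resample filter, which keeps the result reproducible via an explicit seed (the
// seedless new Random() above is not). Percentage and seed values here are illustrative, and
// the filter options may differ slightly in older WEKA versions.
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.instance.Resample;

public class SubSampleSketch {
  public static Instances subSample(Instances data, int m) throws Exception {
    Resample resample = new Resample();
    resample.setNoReplacement(true);                              // plain subsampling, no bootstrap
    resample.setRandomSeed(42);                                   // illustrative seed
    resample.setSampleSizePercent(100.0 * m / data.numInstances());
    resample.setInputFormat(data);
    return Filter.useFilter(data, resample);                      // roughly m instances
  }
}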
/**
 * buildClassifier selects a classifier from the set of classifiers by minimising error on the
 * training data.
 *
 * @param data the training data to be used for generating the selected classifier.
 * @exception Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

  if (m_Classifiers.length == 0) {
    throw new Exception("No base classifiers have been set!");
  }

  Instances newData = new Instances(data);
  newData.deleteWithMissingClass();
  newData.randomize(new Random(m_Seed));
  if (newData.classAttribute().isNominal() && (m_NumXValFolds > 1)) {
    newData.stratify(m_NumXValFolds);
  }

  Instances train = newData; // train on all data by default
  Instances test = newData;  // test on training data by default
  Classifier bestClassifier = null;
  int bestIndex = -1;
  double bestPerformance = Double.NaN;
  int numClassifiers = m_Classifiers.length;

  for (int i = 0; i < numClassifiers; i++) {
    Classifier currentClassifier = getClassifier(i);
    Evaluation evaluation;
    if (m_NumXValFolds > 1) {
      evaluation = new Evaluation(newData);
      for (int j = 0; j < m_NumXValFolds; j++) {
        train = newData.trainCV(m_NumXValFolds, j);
        test = newData.testCV(m_NumXValFolds, j);
        currentClassifier.buildClassifier(train);
        evaluation.setPriors(train);
        evaluation.evaluateModel(currentClassifier, test);
      }
    } else {
      currentClassifier.buildClassifier(train);
      evaluation = new Evaluation(train);
      evaluation.evaluateModel(currentClassifier, test);
    }

    double error = evaluation.errorRate();
    if (m_Debug) {
      System.err.println("Error rate: " + Utils.doubleToString(error, 6, 4)
          + " for classifier " + currentClassifier.getClass().getName());
    }

    if ((i == 0) || (error < bestPerformance)) {
      bestClassifier = currentClassifier;
      bestPerformance = error;
      bestIndex = i;
    }
  }

  m_ClassifierIndex = bestIndex;
  m_Classifier = bestClassifier;
  if (m_NumXValFolds > 1) {
    m_Classifier.buildClassifier(newData);
  }
}
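// Hypothetical usage sketch (not part of the original source): the same "pick the lowest
// cross-validation error" idea expressed with Evaluation.crossValidateModel. The candidate
// classifiers, fold count and seed are illustrative assumptions.
import java.util.Random;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.trees.J48;
import weka.core.Instances;

public class SelectBySimpleCV {
  public static Classifier selectBest(Instances data, int folds, long seed) throws Exception {
    Classifier[] candidates = { new NaiveBayes(), new J48() };  // illustrative candidates
    Classifier best = null;
    double bestError = Double.MAX_VALUE;
    for (Classifier c : candidates) {
      Evaluation eval = new Evaluation(data);
      eval.crossValidateModel(c, data, folds, new Random(seed));
      if (eval.errorRate() < bestError) {
        bestError = eval.errorRate();
        best = c;
      }
    }
    best.buildClassifier(data);  // rebuild the winner on all data, as in the method above
    return best;
  }
}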
// Prepare the training data set (copies the instances, drops the "close" attribute and
// randomizes the order; the actual 70% cut is not performed in this method)
public void generateTrainingDataSet() {
  trainingDataSet = new Instances(instances);
  int size = trainingDataSet.numInstances();

  // Remove closing price "close" attribute
  trainingDataSet.deleteAttributeAt(0);

  // Randomize data set
  trainingDataSet.randomize(trainingDataSet.getRandomNumberGenerator(1));
}
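// Hypothetical follow-up sketch (not from the original source): one way to take the actual
// 70/30 cut after the randomization above, using the Instances(Instances, int, int) copy
// constructor. The method and variable names are illustrative.
import weka.core.Instances;

public class SeventyThirtySplitSketch {
  public static Instances[] split(Instances randomized) {
    int trainSize = (int) Math.round(randomized.numInstances() * 0.7);
    int testSize = randomized.numInstances() - trainSize;
    Instances train = new Instances(randomized, 0, trainSize);          // first 70%
    Instances test = new Instances(randomized, trainSize, testSize);    // remaining 30%
    return new Instances[] { train, test };
  }
}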
public void split() throws IOException {
  FileSystem fs = FileSystem.get(new Configuration());
  fs.delete(splitsDir, true);
  instances.randomize(new Random(1));
  instances.stratify(numberOfSplits);
  for (int i = 0; i < numberOfSplits; i++) {
    // testCV(n, i) returns the i-th fold, so each "train_split_i" file holds one disjoint fold
    BufferedWriter bw = new BufferedWriter(
        new OutputStreamWriter(fs.create(new Path(splitsDir, "train_split_" + i + ".arff"))));
    bw.write(instances.testCV(numberOfSplits, i).toString());
    bw.close();
  }
}
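// Hypothetical variant (not from the original source): if complementary train/test pairs are
// wanted per fold, trainCV/testCV can be written side by side. This sketch writes to the local
// file system with ArffSaver instead of HDFS; it assumes the class attribute is already set
// and nominal (required by stratify), and the file names are illustrative.
import java.io.File;
import java.util.Random;
import weka.core.Instances;
import weka.core.converters.ArffSaver;

public class FoldWriterSketch {
  public static void writeFolds(Instances data, int folds, File dir) throws Exception {
    data.randomize(new Random(1));
    data.stratify(folds);
    for (int i = 0; i < folds; i++) {
      ArffSaver trainSaver = new ArffSaver();
      trainSaver.setInstances(data.trainCV(folds, i));
      trainSaver.setFile(new File(dir, "train_" + i + ".arff"));
      trainSaver.writeBatch();

      ArffSaver testSaver = new ArffSaver();
      testSaver.setInstances(data.testCV(folds, i));
      testSaver.setFile(new File(dir, "test_" + i + ".arff"));
      testSaver.writeBatch();
    }
  }
}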
/**
 * Generates the classifier.
 *
 * @param instances set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
public void buildClassifier(Instances instances) throws Exception {

  // can classifier handle the data?
  getCapabilities().testWithFail(instances);

  // remove instances with missing class
  Instances trainData = new Instances(instances);
  trainData.deleteWithMissingClass();

  if (!(m_Classifier instanceof OptionHandler)) {
    throw new IllegalArgumentException("Base classifier should be OptionHandler.");
  }

  m_InitOptions = ((OptionHandler) m_Classifier).getOptions();
  m_BestPerformance = -99;
  m_NumAttributes = trainData.numAttributes();
  Random random = new Random(m_Seed);
  trainData.randomize(random);
  m_TrainFoldSize = trainData.trainCV(m_NumFolds, 0).numInstances();

  // Check whether there are any parameters to optimize
  if (m_CVParams.size() == 0) {
    m_Classifier.buildClassifier(trainData);
    m_BestClassifierOptions = m_InitOptions;
    return;
  }

  if (trainData.classAttribute().isNominal()) {
    trainData.stratify(m_NumFolds);
  }
  m_BestClassifierOptions = null;

  // Set up m_ClassifierOptions -- take getOptions() and remove
  // those being optimised.
  m_ClassifierOptions = ((OptionHandler) m_Classifier).getOptions();
  for (int i = 0; i < m_CVParams.size(); i++) {
    Utils.getOption(((CVParameter) m_CVParams.elementAt(i)).m_ParamChar, m_ClassifierOptions);
  }

  findParamsByCrossValidation(0, trainData, random);

  String[] options = (String[]) m_BestClassifierOptions.clone();
  ((OptionHandler) m_Classifier).setOptions(options);
  m_Classifier.buildClassifier(trainData);
}
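// Hypothetical sketch (not from the original source): the parameter search driven by
// findParamsByCrossValidation above can be approximated by a plain grid search over one
// option, scoring each setting by cross-validated error. The classifier (J48), its -C
// confidence-factor grid, the fold count and the seed are illustrative assumptions.
import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.Utils;

public class GridSearchSketch {
  public static String[] bestOptions(Instances data) throws Exception {
    double bestError = Double.MAX_VALUE;
    String[] best = null;
    for (double c : new double[] { 0.1, 0.25, 0.5 }) {   // candidate confidence factors
      J48 tree = new J48();
      tree.setOptions(Utils.splitOptions("-C " + c));
      Evaluation eval = new Evaluation(data);
      eval.crossValidateModel(tree, data, 10, new Random(1));
      if (eval.errorRate() < bestError) {
        bestError = eval.errorRate();
        best = tree.getOptions();
      }
    }
    return best;                                          // options of the best-scoring setting
  }
}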
// Experiment: inject an increasing percentage of missing values into the spambase data and
// compare Logistic regression when the missing values are in the training set only, in the
// test set only, or in both; one CSV row per percentage is written to tablelog.csv.
public static void main(String[] args) throws Exception {

  BufferedReader reader = new BufferedReader(new FileReader("spambase.arff"));
  Instances data = new Instances(reader);
  reader.close();

  // setting class attribute
  data.setClassIndex(data.numAttributes() - 1);
  int i = data.numInstances();
  int j = data.numAttributes() - 1;

  File file = new File("tablelog.csv");
  Writer output = new BufferedWriter(new FileWriter(file));
  output.write("%missing,auc1,correct1,fmeasure1,auc2,correct2,fmeasure2,auc3,correct3,fmeasure3\n");

  Random randomGenerator = new Random();
  data.randomize(randomGenerator);
  int numBlock = data.numInstances() / 2;
  double num0 = 0, num1 = 0, num2 = 0;
  /*mdata.instance(0).setMissing(0);
  mdata.deleteWithMissing(0);
  System.out.println(mdata.numInstances()+","+data.numInstances());*/
  // Instances traindata=null;
  // Instances testdata=null;
  // System.out.println(data.instance(3).stringValue(1));

  for (int perc = 10; perc < 101; perc = perc + 10) {
    Instances mdata = new Instances(data);
    int numMissing = perc * numBlock / 100;
    double y11[] = new double[2];
    double y21[] = new double[2];
    double y31[] = new double[2];
    double y12[] = new double[2];
    double y22[] = new double[2];
    double y32[] = new double[2];
    double y13[] = new double[2];
    double y23[] = new double[2];
    double y33[] = new double[2];

    // 2-fold cross-validation: p indexes the fold
    for (int p = 0; p < 2; p++) {
      Instances traindata = mdata.trainCV(2, p);
      Instances testdata = mdata.testCV(2, p);

      // class counts in the training fold, used as weights for AUC and F-measure
      num0 = 0;
      num1 = 0;
      num2 = 0;
      for (int t = 0; t < numBlock; t++) {
        if (traindata.instance(t).classValue() == 0) num0++;
        if (traindata.instance(t).classValue() == 1) num1++;
        // if (traindata.instance(t).classValue()==2) num2++;
      }
      // System.out.println(mdata.instance(0).classValue());

      // copies of the folds with injected missing values
      Instances trainwithmissing = new Instances(traindata);
      Instances testwithmissing = new Instances(testdata);
      for (int q = 0; q < j; q++) {
        int r = randomGenerator.nextInt((int) i / 2);
        for (int k = 0; k < numMissing; k++) {
          // int r = randomGenerator.nextInt((int) i/2);
          // int c = randomGenerator.nextInt(j);
          trainwithmissing.instance((r + k) % numBlock).setMissing(q);
          testwithmissing.instance((r + k) % numBlock).setMissing(q);
        }
      }
      // trainwithmissing.deleteWithMissing(0); System.out.println(traindata.numInstances()+","+trainwithmissing.numInstances());

      // 1) missing values in the training set only
      Classifier cModel = (Classifier) new Logistic(); // try for different classifiers and datasets
      cModel.buildClassifier(trainwithmissing);
      Evaluation eTest1 = new Evaluation(trainwithmissing);
      eTest1.evaluateModel(cModel, testdata);
      // eTest.crossValidateModel(cModel,mdata,10,mdata.getRandomNumberGenerator(1));
      y11[p] = num0 / numBlock * eTest1.areaUnderROC(0)
          + num1 / numBlock * eTest1.areaUnderROC(1) /*+num2/numBlock*eTest1.areaUnderROC(2)*/;
      y21[p] = eTest1.correct();
      y31[p] = num0 / numBlock * eTest1.fMeasure(0)
          + num1 / numBlock * eTest1.fMeasure(1) /*+num2/numBlock*eTest1.fMeasure(2)*/;

      // 2) missing values in the test set only
      Classifier cModel2 = (Classifier) new Logistic();
      cModel2.buildClassifier(traindata);
      Evaluation eTest2 = new Evaluation(traindata);
      eTest2.evaluateModel(cModel2, testwithmissing);
      y12[p] = num0 / numBlock * eTest2.areaUnderROC(0)
          + num1 / numBlock * eTest2.areaUnderROC(1) /*+num2/numBlock*eTest2.areaUnderROC(2)*/;
      y22[p] = eTest2.correct();
      y32[p] = num0 / numBlock * eTest2.fMeasure(0)
          + num1 / numBlock * eTest2.fMeasure(1) /*+num2/numBlock*eTest2.fMeasure(2)*/;

      // 3) missing values in both training and test sets
      Classifier cModel3 = (Classifier) new Logistic();
      cModel3.buildClassifier(trainwithmissing);
      Evaluation eTest3 = new Evaluation(trainwithmissing);
      eTest3.evaluateModel(cModel3, testwithmissing);
      y13[p] = num0 / numBlock * eTest3.areaUnderROC(0)
          + num1 / numBlock * eTest3.areaUnderROC(1) /*+num2/numBlock*eTest3.areaUnderROC(2)*/;
      y23[p] = eTest3.correct();
      y33[p] = num0 / numBlock * eTest3.fMeasure(0)
          + num1 / numBlock * eTest3.fMeasure(1) /*+num2/numBlock*eTest3.fMeasure(2)*/;
      // System.out.println(num0+","+num1+","+num2+"\n");
    }

    // average the two folds and write one CSV row for this missing-value percentage
    double auc1 = (y11[0] + y11[1]) / 2;
    double auc2 = (y12[0] + y12[1]) / 2;
    double auc3 = (y13[0] + y13[1]) / 2;
    double corr1 = (y21[0] + y21[1]) / i;
    double corr2 = (y22[0] + y22[1]) / i;
    double corr3 = (y23[0] + y23[1]) / i;
    double fm1 = (y31[0] + y31[1]) / 2;
    double fm2 = (y32[0] + y32[1]) / 2;
    double fm3 = (y33[0] + y33[1]) / 2;
    output.write(perc + "," + auc1 + "," + corr1 + "," + fm1 + "," + auc2 + "," + corr2 + ","
        + fm2 + "," + auc3 + "," + corr3 + "," + fm3 + "\n");
    // System.out.println(num0);
    // mdata=data;
  }
  output.close();
}
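// Hypothetical refactoring sketch (not part of the original source): the class-prior-weighted
// AUC / F-measure expressions above repeat three times; a small helper keeps them in one place.
// The weighting by training-fold class frequencies mirrors the num0/numBlock and num1/numBlock
// terms in the code above.
import weka.classifiers.Evaluation;

public class WeightedMetricsSketch {
  // weights[c] = relative frequency of class c in the training fold
  public static double weightedAUC(Evaluation eval, double[] weights) {
    double auc = 0;
    for (int c = 0; c < weights.length; c++) {
      auc += weights[c] * eval.areaUnderROC(c);
    }
    return auc;
  }

  public static double weightedFMeasure(Evaluation eval, double[] weights) {
    double fm = 0;
    for (int c = 0; c < weights.length; c++) {
      fm += weights[c] * eval.fMeasure(c);
    }
    return fm;
  }
}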
/**
 * Carry out the bias-variance decomposition
 *
 * @throws Exception if the decomposition couldn't be carried out
 */
public void decompose() throws Exception {

  Reader dataReader = new BufferedReader(new FileReader(m_DataFileName));
  Instances data = new Instances(dataReader);

  if (m_ClassIndex < 0) {
    data.setClassIndex(data.numAttributes() - 1);
  } else {
    data.setClassIndex(m_ClassIndex);
  }
  if (data.classAttribute().type() != Attribute.NOMINAL) {
    throw new Exception("Class attribute must be nominal");
  }
  int numClasses = data.numClasses();

  data.deleteWithMissingClass();
  if (data.checkForStringAttributes()) {
    throw new Exception("Can't handle string attributes!");
  }

  if (data.numInstances() < 2 * m_TrainPoolSize) {
    throw new Exception("The dataset must contain at least " + (2 * m_TrainPoolSize) + " instances");
  }

  Random random = new Random(m_Seed);
  data.randomize(random);
  Instances trainPool = new Instances(data, 0, m_TrainPoolSize);
  Instances test = new Instances(data, m_TrainPoolSize, data.numInstances() - m_TrainPoolSize);
  int numTest = test.numInstances();
  double[][] instanceProbs = new double[numTest][numClasses];

  m_Error = 0;
  for (int i = 0; i < m_TrainIterations; i++) {
    if (m_Debug) {
      System.err.println("Iteration " + (i + 1));
    }
    trainPool.randomize(random);
    Instances train = new Instances(trainPool, 0, m_TrainPoolSize / 2);

    Classifier current = AbstractClassifier.makeCopy(m_Classifier);
    current.buildClassifier(train);

    // Evaluate the classifier on test, updating BVD stats
    for (int j = 0; j < numTest; j++) {
      int pred = (int) current.classifyInstance(test.instance(j));
      if (pred != test.instance(j).classValue()) {
        m_Error++;
      }
      instanceProbs[j][pred]++;
    }
  }
  m_Error /= (m_TrainIterations * numTest);

  // Average the BV over each instance in test.
  m_Bias = 0;
  m_Variance = 0;
  m_Sigma = 0;
  for (int i = 0; i < numTest; i++) {
    Instance current = test.instance(i);
    double[] predProbs = instanceProbs[i];
    double pActual, pPred;
    double bsum = 0, vsum = 0, ssum = 0;
    for (int j = 0; j < numClasses; j++) {
      pActual = (current.classValue() == j) ? 1 : 0; // Or via 1NN from test data?
      pPred = predProbs[j] / m_TrainIterations;
      bsum += (pActual - pPred) * (pActual - pPred)
          - pPred * (1 - pPred) / (m_TrainIterations - 1);
      vsum += pPred * pPred;
      ssum += pActual * pActual;
    }
    m_Bias += bsum;
    m_Variance += (1 - vsum);
    m_Sigma += (1 - ssum);
  }
  m_Bias /= (2 * numTest);
  m_Variance /= (2 * numTest);
  m_Sigma /= (2 * numTest);

  if (m_Debug) {
    System.err.println("Decomposition finished");
  }
}
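// Hypothetical illustration (not from the original source): the inner loop above appears to
// follow a Kohavi-Wolpert style decomposition, where for one test instance with true class
// indicator vector pActual and averaged prediction distribution pPred the contributions are
//   bias  ~ sum_j (pActual_j - pPred_j)^2  (with a small-sample correction term),
//   variance ~ 1 - sum_j pPred_j^2,  sigma ~ 1 - sum_j pActual_j^2.
// This toy sketch evaluates those sums for one made-up instance; all numbers are illustrative.
public class BiasVarianceTermsSketch {
  public static void main(String[] args) {
    int trainIterations = 10;            // illustrative number of resampled models
    double[] pActual = { 1.0, 0.0 };     // true class is class 0
    double[] pPred = { 0.7, 0.3 };       // fraction of models predicting each class
    double bsum = 0, vsum = 0, ssum = 0;
    for (int j = 0; j < pActual.length; j++) {
      bsum += (pActual[j] - pPred[j]) * (pActual[j] - pPred[j])
          - pPred[j] * (1 - pPred[j]) / (trainIterations - 1);
      vsum += pPred[j] * pPred[j];
      ssum += pActual[j] * pActual[j];
    }
    System.out.println("bias term: " + bsum / 2);            // halved, as in decompose()
    System.out.println("variance term: " + (1 - vsum) / 2);
    System.out.println("sigma term: " + (1 - ssum) / 2);
  }
}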
@Override public Void doInBackground() { BufferedReader reader; publish("Computing features..."); int testingSamples = p.getAllFeatures2(path, "testing_data"); try { publish("Reading data..."); reader = new BufferedReader(new FileReader("testing_data.arff")); final Instances testingdata = new Instances(reader); reader.close(); // setting class attribute testingdata.setClassIndex(13); testingdata.randomize(new Random(1)); long startTime = System.nanoTime(); Classifier ann = (Classifier) weka.core.SerializationHelper.read("mlp.model"); publish("Evaluating ANN..."); evalANN = new Evaluation(testingdata); startTime = System.nanoTime(); evalANN.evaluateModel(ann, testingdata); long runningTimeANN = (System.nanoTime() - startTime) / 1000000; // runningTimeANN /= 100; publish("Done evaluating ANN"); publish("Evaluating SVM..."); Classifier svm = (Classifier) weka.core.SerializationHelper.read("svm.model"); evalSVM = new Evaluation(testingdata); startTime = System.nanoTime(); evalSVM.evaluateModel(svm, testingdata); long runningTimeSVM = (System.nanoTime() - startTime) / 1000000; // runningTimeSVM /= 100; publish("Done evaluating SVM"); publish("Evaluating NB..."); Classifier nb = (Classifier) weka.core.SerializationHelper.read("naivebayes.model"); evalNB = new Evaluation(testingdata); startTime = System.nanoTime(); evalNB.evaluateModel(nb, testingdata); long runningTimeNB = (System.nanoTime() - startTime) / 1000000; // runningTimeNB /= 100; publish("Done evaluating ANN"); Platform.runLater( new Runnable() { @Override public void run() { bc.getData() .get(0) .getData() .get(0) .setYValue(evalANN.correct() / testingdata.size() * 100); bc.getData() .get(0) .getData() .get(1) .setYValue(evalSVM.correct() / testingdata.size() * 100); bc.getData() .get(0) .getData() .get(2) .setYValue(evalNB.correct() / testingdata.size() * 100); for (int i = 0; i < NUM_CLASSES; i++) { lineChart.getData().get(0).getData().get(i).setYValue(evalANN.recall(i) * 100); lineChart.getData().get(1).getData().get(i).setYValue(evalSVM.recall(i) * 100); lineChart.getData().get(2).getData().get(i).setYValue(evalNB.recall(i) * 100); } } }); panel.fillConfTable(evalSVM.confusionMatrix()); summaryTable.setValueAt(evalANN.correct() / testingdata.size() * 100., 0, 1); summaryTable.setValueAt(evalSVM.correct() / testingdata.size() * 100, 0, 2); summaryTable.setValueAt(evalNB.correct() / testingdata.size() * 100, 0, 3); summaryTable.setValueAt(runningTimeANN, 1, 1); summaryTable.setValueAt(runningTimeSVM, 1, 2); summaryTable.setValueAt(runningTimeNB, 1, 3); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); } return null; }
@Override
public Void doInBackground() {
  BufferedReader reader;
  try {
    publish("Reading data...");
    reader = new BufferedReader(new FileReader("cross_validation_data.arff"));
    final Instances trainingdata = new Instances(reader);
    reader.close();
    // setting class attribute
    trainingdata.setClassIndex(13);
    trainingdata.randomize(new Random(1));

    long startTime = System.nanoTime();

    publish("Training Naive Bayes Classifier...");
    NaiveBayes nb = new NaiveBayes();
    startTime = System.nanoTime();
    nb.buildClassifier(trainingdata);
    double runningTimeNB = (System.nanoTime() - startTime) / 1000000;
    runningTimeNB /= 1000;
    // saving the naive bayes model
    weka.core.SerializationHelper.write("naivebayes.model", nb);
    System.out.println("running time" + runningTimeNB);
    publish("Done training NB.\nEvaluating NB using 10-fold cross-validation...");
    evalNB = new Evaluation(trainingdata);
    evalNB.crossValidateModel(nb, trainingdata, 10, new Random(1));
    publish("Done evaluating NB.");
    // System.out.println(evalNB.toSummaryString("\nResults for Naive Bayes\n======\n", false));

    MultilayerPerceptron mlp = new MultilayerPerceptron();
    mlp.setOptions(Utils.splitOptions("-L 0.3 -M 0.2 -N 500 -V 0 -S 0 -E 20 -H a"));
    publish("Training ANN...");
    startTime = System.nanoTime();
    mlp.buildClassifier(trainingdata);
    long runningTimeANN = (System.nanoTime() - startTime) / 1000000;
    runningTimeANN /= 1000;
    // saving the MLP model
    weka.core.SerializationHelper.write("mlp.model", mlp);
    publish("Done training ANN.\nEvaluating ANN using 10-fold cross-validation...");
    // NOTE: despite the message above, the ANN is evaluated on the training data here;
    // the cross-validation call below is commented out.
    evalANN = new Evaluation(trainingdata);
    evalANN.evaluateModel(mlp, trainingdata);
    // evalMLP.crossValidateModel(mlp, trainingdata, 10, new Random(1));
    publish("Done evaluating ANN.");

    publish("Training SVM...");
    SMO svm = new SMO();
    startTime = System.nanoTime();
    svm.buildClassifier(trainingdata);
    long runningTimeSVM = (System.nanoTime() - startTime) / 1000000;
    runningTimeSVM /= 1000;
    weka.core.SerializationHelper.write("svm.model", svm);
    publish("Done training SVM.\nEvaluating SVM using 10-fold cross-validation...");
    // NOTE: as above, this is an evaluation on the training data, not cross-validation.
    evalSVM = new Evaluation(trainingdata);
    evalSVM.evaluateModel(svm, trainingdata);
    publish("Done evaluating SVM.");

    Platform.runLater(
        new Runnable() {
          @Override
          public void run() {
            bc.getData().get(0).getData().get(0).setYValue(evalANN.correct() / trainingdata.size() * 100);
            bc.getData().get(0).getData().get(1).setYValue(evalSVM.correct() / trainingdata.size() * 100);
            bc.getData().get(0).getData().get(2).setYValue(evalNB.correct() / trainingdata.size() * 100);
            for (int i = 0; i < NUM_CLASSES; i++) {
              lineChart.getData().get(0).getData().get(i).setYValue(evalANN.recall(i) * 100);
              lineChart.getData().get(1).getData().get(i).setYValue(evalSVM.recall(i) * 100);
              lineChart.getData().get(2).getData().get(i).setYValue(evalNB.recall(i) * 100);
            }
          }
        });

    panel.fillConfTable(evalSVM.confusionMatrix());
    summaryTable.setValueAt(evalANN.correct() / trainingdata.size() * 100., 0, 1);
    summaryTable.setValueAt(evalSVM.correct() / trainingdata.size() * 100, 0, 2);
    summaryTable.setValueAt(evalNB.correct() / trainingdata.size() * 100, 0, 3);
    summaryTable.setValueAt(runningTimeANN, 1, 1);
    summaryTable.setValueAt(runningTimeSVM, 1, 2);
    summaryTable.setValueAt(runningTimeNB, 1, 3);
  } catch (Exception e1) {
    e1.printStackTrace();
  }
  return null;
}
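// Hypothetical sketch (not part of the original source): a minimal, GUI-free version of the
// training worker above that scores all three classifiers with the same 10-fold
// cross-validation, so the reported numbers are directly comparable. The file names mirror
// the ones used above; classifier options are left at their defaults for brevity.
import java.util.Random;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.functions.MultilayerPerceptron;
import weka.classifiers.functions.SMO;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;

public class TrainAndCrossValidateSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("cross_validation_data.arff");
    data.setClassIndex(13);

    Classifier[] models = { new NaiveBayes(), new MultilayerPerceptron(), new SMO() };
    String[] files = { "naivebayes.model", "mlp.model", "svm.model" };

    for (int m = 0; m < models.length; m++) {
      models[m].buildClassifier(data);          // train on all data for the saved model
      SerializationHelper.write(files[m], models[m]);

      Evaluation eval = new Evaluation(data);   // but report 10-fold CV performance
      eval.crossValidateModel(models[m], data, 10, new Random(1));
      System.out.println(files[m] + ": " + eval.pctCorrect() + "% correct (10-fold CV)");
    }
  }
}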
/**
 * Evaluates a feature subset by cross validation
 *
 * @param feature_set the subset to be evaluated
 * @param num_atts the number of attributes in the subset
 * @return the estimated accuracy
 * @throws Exception if subset can't be evaluated
 */
protected double estimatePerformance(BitSet feature_set, int num_atts) throws Exception {

  m_evaluation = new Evaluation(m_theInstances);
  int i;
  int[] fs = new int[num_atts];

  double[] instA = new double[num_atts];
  int classI = m_theInstances.classIndex();

  int index = 0;
  for (i = 0; i < m_numAttributes; i++) {
    if (feature_set.get(i)) {
      fs[index++] = i;
    }
  }

  // create new hash table
  m_entries = new Hashtable((int) (m_theInstances.numInstances() * 1.5));

  // insert instances into the hash table
  for (i = 0; i < m_numInstances; i++) {
    Instance inst = m_theInstances.instance(i);
    for (int j = 0; j < fs.length; j++) {
      if (fs[j] == classI) {
        instA[j] = Double.MAX_VALUE; // missing for the class
      } else if (inst.isMissing(fs[j])) {
        instA[j] = Double.MAX_VALUE;
      } else {
        instA[j] = inst.value(fs[j]);
      }
    }
    insertIntoTable(inst, instA);
  }

  if (m_CVFolds == 1) {
    // calculate leave one out error
    for (i = 0; i < m_numInstances; i++) {
      Instance inst = m_theInstances.instance(i);
      for (int j = 0; j < fs.length; j++) {
        if (fs[j] == classI) {
          instA[j] = Double.MAX_VALUE; // missing for the class
        } else if (inst.isMissing(fs[j])) {
          instA[j] = Double.MAX_VALUE;
        } else {
          instA[j] = inst.value(fs[j]);
        }
      }
      evaluateInstanceLeaveOneOut(inst, instA);
    }
  } else {
    m_theInstances.randomize(m_rr);
    m_theInstances.stratify(m_CVFolds);

    // calculate the m_CVFolds-fold cross-validation error
    for (i = 0; i < m_CVFolds; i++) {
      Instances insts = m_theInstances.testCV(m_CVFolds, i);
      evaluateFoldCV(insts, fs);
    }
  }

  switch (m_evaluationMeasure) {
    case EVAL_DEFAULT:
      if (m_classIsNominal) {
        return m_evaluation.pctCorrect();
      }
      return -m_evaluation.rootMeanSquaredError();
    case EVAL_ACCURACY:
      return m_evaluation.pctCorrect();
    case EVAL_RMSE:
      return -m_evaluation.rootMeanSquaredError();
    case EVAL_MAE:
      return -m_evaluation.meanAbsoluteError();
    case EVAL_AUC:
      double[] classPriors = m_evaluation.getClassPriors();
      Utils.normalize(classPriors);
      double weightedAUC = 0;
      for (i = 0; i < m_theInstances.classAttribute().numValues(); i++) {
        double tempAUC = m_evaluation.areaUnderROC(i);
        if (!Utils.isMissingValue(tempAUC)) {
          weightedAUC += (classPriors[i] * tempAUC);
        } else {
          System.err.println("Undefined AUC!!");
        }
      }
      return weightedAUC;
  }
  // shouldn't get here
  return 0.0;
}
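// Hypothetical sketch (not from the original source): a generic, wrapper-style way to score a
// feature subset is to keep only the selected attributes (plus the class) with the Remove
// filter and cross-validate any classifier on the reduced data. The classifier (J48), fold
// count and seed are illustrative assumptions; the filter is assumed to carry the class index
// through to the reduced dataset.
import java.util.BitSet;
import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

public class SubsetScoreSketch {
  public static double estimateAccuracy(Instances data, BitSet featureSet) throws Exception {
    // collect indices of the selected attributes, always keeping the class attribute
    int count = 0;
    int[] keep = new int[featureSet.cardinality() + 1];
    for (int a = 0; a < data.numAttributes(); a++) {
      if (featureSet.get(a) || a == data.classIndex()) {
        keep[count++] = a;
      }
    }
    int[] indices = java.util.Arrays.copyOf(keep, count);

    Remove remove = new Remove();
    remove.setAttributeIndicesArray(indices);
    remove.setInvertSelection(true);          // keep the listed attributes, drop the rest
    remove.setInputFormat(data);
    Instances reduced = Filter.useFilter(data, remove);

    Evaluation eval = new Evaluation(reduced);
    eval.crossValidateModel(new J48(), reduced, 10, new Random(1));
    return eval.pctCorrect();                  // estimated accuracy of this subset
  }
}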