public SVM train(InstanceList trainingList) { svm_problem problem = new svm_problem(); problem.l = trainingList.size(); problem.x = new svm_node[problem.l][]; problem.y = new double[problem.l]; for (int i = 0; i < trainingList.size(); i++) { Instance instance = trainingList.get(i); svm_node[] input = SVM.getSvmNodes(instance); if (input == null) { continue; } int labelIndex = ((Label) instance.getTarget()).getIndex(); problem.x[i] = input; problem.y[i] = labelIndex; } int max_index = trainingList.getDataAlphabet().size(); if (param.gamma == 0 && max_index > 0) { param.gamma = 1.0 / max_index; } // int numLabels = trainingList.getTargetAlphabet().size(); // int[] weight_label = new int[numLabels]; // double[] weight = trainingList.targetLabelDistribution().getValues(); // double minValue = Double.MAX_VALUE; // // for (int i = 0; i < weight.length; i++) { // if (minValue > weight[i]) { // minValue = weight[i]; // } // } // // for (int i = 0; i < weight.length; i++) { // weight_label[i] = i; // weight[i] = weight[i] / minValue; // } // // param.weight_label = weight_label; // param.weight = weight; String error_msg = svm.svm_check_parameter(problem, param); if (error_msg != null) { System.err.print("Error: " + error_msg + "\n"); System.exit(1); } svm_model model = svm.svm_train(problem, param); classifier = new SVM(model, trainingList.getPipe()); return classifier; }
/**
 * Builds a support vector machine (SVM) model with the SGD-based trainer and
 * evaluates it against the supplied test set.
 *
 * @param sparkContext JavaSparkContext initialized with the application
 * @param modelID Model ID
 * @param trainingData Training data as a JavaRDD of LabeledPoints
 * @param testingData Testing data as a JavaRDD of LabeledPoints
 * @param workflow Machine learning workflow (supplies the hyper-parameters)
 * @param mlModel Deployable machine learning model (receives the trained SVMModel)
 * @param includedFeatures feature index -&gt; name map used to report feature weights
 * @return a {@code ProbabilisticClassificationModelSummary} carrying accuracy,
 *         confusion matrix, feature weights and dataset version
 * @throws MLModelBuilderException if the response variable is not binary or any
 *         step of training/evaluation fails
 */
private ModelSummary buildSVMModel(
        JavaSparkContext sparkContext,
        long modelID,
        JavaRDD<LabeledPoint> trainingData,
        JavaRDD<LabeledPoint> testingData,
        Workflow workflow,
        MLModel mlModel,
        SortedMap<Integer, String> includedFeatures)
        throws MLModelBuilderException {
    // This SVM implementation is strictly binary; reject multi-class targets up front.
    if (getNoOfClasses(mlModel) > 2) {
        throw new MLModelBuilderException(
                "A binary classification algorithm cannot have more than "
                        + "two distinct values in response variable.");
    }
    try {
        SVM svm = new SVM();
        // All training knobs come from the workflow's hyper-parameter map.
        Map<String, String> hyperParameters = workflow.getHyperParameters();
        SVMModel svmModel = svm.train(
                trainingData,
                Integer.parseInt(hyperParameters.get(MLConstants.ITERATIONS)),
                hyperParameters.get(MLConstants.REGULARIZATION_TYPE),
                Double.parseDouble(hyperParameters.get(MLConstants.REGULARIZATION_PARAMETER)),
                Double.parseDouble(hyperParameters.get(MLConstants.LEARNING_RATE)),
                Double.parseDouble(hyperParameters.get(MLConstants.SGD_DATA_FRACTION)));

        // remove from cache
        trainingData.unpersist();
        // add test data to cache (it is iterated twice below, then unpersisted)
        testingData.cache();

        // Guard against a diverged SGD run producing null/infinite weights
        // before spending time on evaluation.
        Vector weights = svmModel.weights();
        if (!isValidWeights(weights)) {
            throw new MLModelBuilderException(
                    "Weights of the model generated are null or infinity. [Weights] "
                            + vectorToString(weights));
        }

        // getting scores and labels without clearing threshold to get confusion matrix
        JavaRDD<Tuple2<Object, Object>> scoresAndLabelsThresholded =
                svm.test(svmModel, testingData);
        MulticlassMetrics multiclassMetrics =
                new MulticlassMetrics(JavaRDD.toRDD(scoresAndLabelsThresholded));
        MulticlassConfusionMatrix multiclassConfusionMatrix =
                getMulticlassConfusionMatrix(multiclassMetrics, mlModel);

        // After clearing the threshold, test() yields raw scores rather than 0/1
        // labels; the probabilistic summary is generated from those raw scores.
        svmModel.clearThreshold();
        JavaRDD<Tuple2<Object, Object>> scoresAndLabels = svm.test(svmModel, testingData);
        ProbabilisticClassificationModelSummary probabilisticClassificationModelSummary =
                SparkModelUtils.generateProbabilisticClassificationModelSummary(
                        sparkContext, testingData, scoresAndLabels);

        // remove from cache
        testingData.unpersist();

        mlModel.setModel(new MLClassificationModel(svmModel));

        // Expose the learned per-feature weights as a feature-importance report.
        List<FeatureImportance> featureWeights =
                getFeatureWeights(includedFeatures, svmModel.weights().toArray());
        probabilisticClassificationModelSummary.setFeatures(
                includedFeatures.values().toArray(new String[0]));
        probabilisticClassificationModelSummary.setFeatureImportance(featureWeights);
        probabilisticClassificationModelSummary.setAlgorithm(SUPERVISED_ALGORITHM.SVM.toString());
        probabilisticClassificationModelSummary.setMulticlassConfusionMatrix(
                multiclassConfusionMatrix);
        Double modelAccuracy = getModelAccuracy(multiclassMetrics);
        probabilisticClassificationModelSummary.setModelAccuracy(modelAccuracy);
        probabilisticClassificationModelSummary.setDatasetVersion(workflow.getDatasetVersion());

        return probabilisticClassificationModelSummary;
    } catch (Exception e) {
        // Boundary catch: wrap everything (parse errors, Spark failures) with context,
        // preserving the original cause.
        throw new MLModelBuilderException(
                "An error occurred while building SVM model: " + e.getMessage(), e);
    }
}
/**
 * Entry point: preprocesses the census-income data, trains six base classifiers
 * (decision tree, AdaBoost, Naive Bayes, Bagging, SMO/SVM, k-NN) and combines their
 * predictions on the test set by majority vote over the ">50K" / "<=50K" classes,
 * printing the ensemble's accuracy.
 *
 * @param args the command line arguments (unused)
 * @throws Exception if preprocessing, file I/O, or any classifier build/prediction fails
 */
public static void main(String[] args) throws Exception {
    // Preprocess: SMOTE-balance the training set; plain preprocessing for the test set.
    PreProcessor p = new PreProcessor("census-income.data", "census-income-preprocessed.arff");
    p.smote();
    PreProcessor p_test =
            new PreProcessor("census-income.test", "census-income-test-preprocessed.arff");
    p_test.run();

    Instances traininstance;
    Instances testinstance;
    // try-with-resources: the original leaked both readers if Instances(...) threw.
    try (BufferedReader traindata =
                    new BufferedReader(new FileReader("census-income-preprocessed.arff"));
            BufferedReader testdata =
                    new BufferedReader(new FileReader("census-income-test-preprocessed.arff"))) {
        traininstance = new Instances(traindata);
        testinstance = new Instances(testdata);
    }

    // The class attribute is the last column in both ARFF files.
    traininstance.setClassIndex(traininstance.numAttributes() - 1);
    testinstance.setClassIndex(testinstance.numAttributes() - 1);

    int numOfAttributes = testinstance.numAttributes();
    int numOfInstances = testinstance.numInstances();

    // Train the six ensemble members on the preprocessed training file.
    NaiveBayesClassifier nb = new NaiveBayesClassifier("census-income-preprocessed.arff");
    Classifier cnaive = nb.NBClassify();
    DecisionTree dt = new DecisionTree("census-income-preprocessed.arff");
    Classifier cls = dt.DTClassify();
    AdaBoost ab = new AdaBoost("census-income-preprocessed.arff");
    AdaBoostM1 m1 = ab.AdaBoostDTClassify();
    BaggingMethod b = new BaggingMethod("census-income-preprocessed.arff");
    Bagging bag = b.BaggingDTClassify();
    SVM s = new SVM("census-income-preprocessed.arff");
    SMO svm = s.SMOClassifier();
    knn knnclass = new knn("census-income-preprocessed.arff");
    IBk knnc = knnclass.knnclassifier();
    // NOTE(review): the original also trained a weka Logistic model here but never
    // consulted its predictions in the vote; that dead training pass was removed.

    int match = 0;
    int error = 0;
    for (int i = 0; i < numOfInstances; i++) {
        // Majority vote over the six base classifiers. With an even number of voters
        // a 3-3 tie resolves to "<=50K" (greater > less is strict), as before.
        int greater = 0;
        int less = 0;
        // Sized to the actual number of voters (was 8 with two slots never written).
        double[] predictions = new double[6];
        predictions[0] = cls.classifyInstance(testinstance.instance(i));
        predictions[1] = m1.classifyInstance(testinstance.instance(i));
        predictions[2] = cnaive.classifyInstance(testinstance.instance(i));
        predictions[3] = bag.classifyInstance(testinstance.instance(i));
        predictions[4] = svm.classifyInstance(testinstance.instance(i));
        predictions[5] = knnc.classifyInstance(testinstance.instance(i));

        for (int j = 0; j < predictions.length; j++) {
            // Map each numeric class index back to its nominal label and tally the vote.
            if (testinstance.instance(i).classAttribute().value((int) predictions[j])
                    .compareTo(">50K") == 0) {
                greater++;
            } else {
                less++;
            }
        }
        String predicted = (greater > less) ? ">50K" : "<=50K";

        // Compare the vote against the gold label stored in the last attribute.
        if (testinstance.instance(i).stringValue(numOfAttributes - 1).compareTo(predicted) == 0) {
            match++;
        } else {
            error++;
        }
    }

    System.out.println("Correctly classified Instances: " + match);
    System.out.println("Misclassified Instances: " + error);
    double accuracy = (double) match / (double) numOfInstances * 100;
    double error_percent = 100 - accuracy;
    System.out.println("Accuracy: " + accuracy + "%");
    System.out.println("Error: " + error_percent + "%");
}