/**
 * Builds a supervised machine learning model according to the algorithm selected in the workflow.
 *
 * <p>Validates the response variable, splits the pre-processed dataset into train/test fractions,
 * dispatches to the algorithm-specific builder, and persists the resulting model summary via the
 * database service.
 *
 * @return a deployable {@link MLModel} populated from the workflow configuration
 * @throws MLModelBuilderException if the response variable is invalid for the chosen algorithm,
 *     the algorithm name is unknown, or any error occurs while building the model
 */
public MLModel build() throws MLModelBuilderException {
    MLModelConfigurationContext context = getContext();
    JavaSparkContext sparkContext = null;
    DatabaseService databaseService = MLCoreServiceValueHolder.getInstance().getDatabaseService();
    MLModel mlModel = new MLModel();
    try {
        sparkContext = context.getSparkContext();
        Workflow workflow = context.getFacts();
        long modelId = context.getModelId();
        // Verify validity of response variable
        String typeOfResponseVariable =
                getTypeOfResponseVariable(workflow.getResponseVariable(), workflow.getFeatures());
        if (typeOfResponseVariable == null) {
            throw new MLModelBuilderException(
                    "Type of response variable cannot be null for supervised learning "
                            + "algorithms.");
        }
        // Stops model building if a categorical attribute is used with numerical prediction
        if (workflow.getAlgorithmClass().equals(AlgorithmType.NUMERICAL_PREDICTION.getValue())
                && typeOfResponseVariable.equals(FeatureType.CATEGORICAL)) {
            throw new MLModelBuilderException(
                    "Categorical attribute "
                            + workflow.getResponseVariable()
                            + " cannot be used as the response variable of the Numerical Prediction algorithm: "
                            + workflow.getAlgorithmName());
        }
        // generate train and test datasets by converting tokens to labeled points
        int responseIndex = context.getResponseIndex();
        SortedMap<Integer, String> includedFeatures =
                MLUtils.getIncludedFeaturesAfterReordering(
                        workflow, context.getNewToOldIndicesList(), responseIndex);
        // gets the pre-processed dataset; cached so the split does not recompute preProcess()
        JavaRDD<LabeledPoint> labeledPoints = preProcess().cache();
        // fixed seed keeps the train/test split reproducible across runs
        JavaRDD<LabeledPoint>[] dataSplit =
                labeledPoints.randomSplit(
                        new double[] {
                            workflow.getTrainDataFraction(), 1 - workflow.getTrainDataFraction()
                        },
                        MLConstants.RANDOM_SEED);
        // remove from cache
        labeledPoints.unpersist();
        // training split is cached because the algorithm builders iterate over it repeatedly
        JavaRDD<LabeledPoint> trainingData = dataSplit[0].cache();
        JavaRDD<LabeledPoint> testingData = dataSplit[1];
        // create a deployable MLModel object
        mlModel.setAlgorithmName(workflow.getAlgorithmName());
        mlModel.setAlgorithmClass(workflow.getAlgorithmClass());
        mlModel.setFeatures(workflow.getIncludedFeatures());
        mlModel.setResponseVariable(workflow.getResponseVariable());
        mlModel.setEncodings(context.getEncodings());
        mlModel.setNewToOldIndicesList(context.getNewToOldIndicesList());
        mlModel.setResponseIndex(responseIndex);
        ModelSummary summaryModel = null;
        // only populated for tree-based algorithms, which need categorical feature metadata
        Map<Integer, Integer> categoricalFeatureInfo;
        // build a machine learning model according to user selected algorithm
        SUPERVISED_ALGORITHM supervisedAlgorithm =
                SUPERVISED_ALGORITHM.valueOf(workflow.getAlgorithmName());
        switch (supervisedAlgorithm) {
            case LOGISTIC_REGRESSION:
                // final flag selects SGD (true) vs. L-BFGS (false) optimization
                summaryModel =
                        buildLogisticRegressionModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures,
                                true);
                break;
            case LOGISTIC_REGRESSION_LBFGS:
                summaryModel =
                        buildLogisticRegressionModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures,
                                false);
                break;
            case DECISION_TREE:
                categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
                summaryModel =
                        buildDecisionTreeModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures,
                                categoricalFeatureInfo);
                break;
            case RANDOM_FOREST:
                categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
                summaryModel =
                        buildRandomForestTreeModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures,
                                categoricalFeatureInfo);
                break;
            case SVM:
                summaryModel =
                        buildSVMModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures);
                break;
            case NAIVE_BAYES:
                summaryModel =
                        buildNaiveBayesModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures);
                break;
            case LINEAR_REGRESSION:
                summaryModel =
                        buildLinearRegressionModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures);
                break;
            case RIDGE_REGRESSION:
                summaryModel =
                        buildRidgeRegressionModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures);
                break;
            case LASSO_REGRESSION:
                summaryModel =
                        buildLassoRegressionModel(
                                sparkContext,
                                modelId,
                                trainingData,
                                testingData,
                                workflow,
                                mlModel,
                                includedFeatures);
                break;
            default:
                throw new AlgorithmNameException("Incorrect algorithm name");
        }
        // persist model summary
        databaseService.updateModelSummary(modelId, summaryModel);
        return mlModel;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building supervised machine learning model: "
                        + e.getMessage(),
                e);
    }
}
/** * A utility method to generate class classification model summary * * @param predictionsAndLabels Predictions and actual labels * @return Class classification model summary */ public static ClassClassificationAndRegressionModelSummary getClassClassificationModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaPairRDD<Double, Double> predictionsAndLabels) { ClassClassificationAndRegressionModelSummary classClassificationModelSummary = new ClassClassificationAndRegressionModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(scoreAndLabel._1()); predictedVsActual.setActual(scoreAndLabel._2()); predictedVsActuals.add(predictedVsActual); } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. 
actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } classClassificationModelSummary.setTestResultDataPointsSample(testResultDataPointsSample); classClassificationModelSummary.setPredictedVsActuals(predictedVsActuals); // calculate test error double error = 1.0 * predictionsAndLabels .filter( new Function<Tuple2<Double, Double>, Boolean>() { private static final long serialVersionUID = -3063364114286182333L; @Override public Boolean call(Tuple2<Double, Double> pl) { return !pl._1().equals(pl._2()); } }) .count() / predictionsAndLabels.count(); classClassificationModelSummary.setError(error); return classClassificationModelSummary; }
/** * A utility method to generate probabilistic classification model summary * * @param scoresAndLabels Tuple2 containing scores and labels * @return Probabilistic classification model summary */ public static ProbabilisticClassificationModelSummary generateProbabilisticClassificationModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaRDD<Tuple2<Object, Object>> scoresAndLabels) { ProbabilisticClassificationModelSummary probabilisticClassificationModelSummary = new ProbabilisticClassificationModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT); for (Tuple2<Object, Object> scoreAndLabel : scoresAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1()))); predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2()))); predictedVsActuals.add(predictedVsActual); if (log.isTraceEnabled()) { log.trace( "Predicted: " + predictedVsActual.getPredicted() + " ------ Actual: " + predictedVsActual.getActual()); } } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. 
actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } probabilisticClassificationModelSummary.setTestResultDataPointsSample( testResultDataPointsSample); probabilisticClassificationModelSummary.setPredictedVsActuals(predictedVsActuals); // generate binary classification metrics BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(JavaRDD.toRDD(scoresAndLabels)); // store AUC probabilisticClassificationModelSummary.setAuc(metrics.areaUnderROC()); // store ROC data points List<Tuple2<Object, Object>> rocData = metrics.roc().toJavaRDD().collect(); JSONArray rocPoints = new JSONArray(); for (int i = 0; i < rocData.size(); i += 1) { JSONArray point = new JSONArray(); point.put(decimalFormat.format(rocData.get(i)._1())); point.put(decimalFormat.format(rocData.get(i)._2())); rocPoints.put(point); } probabilisticClassificationModelSummary.setRoc(rocPoints.toString()); return probabilisticClassificationModelSummary; }
/** * A utility method to generate regression model summary * * @param predictionsAndLabels Tuple2 containing predicted and actual values * @return Regression model summary */ public static ClassClassificationAndRegressionModelSummary generateRegressionModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaRDD<Tuple2<Double, Double>> predictionsAndLabels) { ClassClassificationAndRegressionModelSummary regressionModelSummary = new ClassClassificationAndRegressionModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT); for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1()))); predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2()))); predictedVsActuals.add(predictedVsActual); } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. 
actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } regressionModelSummary.setTestResultDataPointsSample(testResultDataPointsSample); regressionModelSummary.setPredictedVsActuals(predictedVsActuals); // calculate mean squared error (MSE) double meanSquaredError = new JavaDoubleRDD( predictionsAndLabels .map( new Function<Tuple2<Double, Double>, Object>() { private static final long serialVersionUID = -162193633199074816L; public Object call(Tuple2<Double, Double> pair) { return Math.pow(pair._1() - pair._2(), 2.0); } }) .rdd()) .mean(); regressionModelSummary.setError(meanSquaredError); return regressionModelSummary; }