/** * A utility method to generate class classification model summary * * @param predictionsAndLabels Predictions and actual labels * @return Class classification model summary */ public static ClassClassificationAndRegressionModelSummary getClassClassificationModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaPairRDD<Double, Double> predictionsAndLabels) { ClassClassificationAndRegressionModelSummary classClassificationModelSummary = new ClassClassificationAndRegressionModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(scoreAndLabel._1()); predictedVsActual.setActual(scoreAndLabel._2()); predictedVsActuals.add(predictedVsActual); } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } classClassificationModelSummary.setTestResultDataPointsSample(testResultDataPointsSample); classClassificationModelSummary.setPredictedVsActuals(predictedVsActuals); // calculate test error double error = 1.0 * predictionsAndLabels .filter( new Function<Tuple2<Double, Double>, Boolean>() { private static final long serialVersionUID = -3063364114286182333L; @Override public Boolean call(Tuple2<Double, Double> pl) { return !pl._1().equals(pl._2()); } }) .count() / predictionsAndLabels.count(); classClassificationModelSummary.setError(error); return classClassificationModelSummary; }
/** * A utility method to generate probabilistic classification model summary * * @param scoresAndLabels Tuple2 containing scores and labels * @return Probabilistic classification model summary */ public static ProbabilisticClassificationModelSummary generateProbabilisticClassificationModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaRDD<Tuple2<Object, Object>> scoresAndLabels) { ProbabilisticClassificationModelSummary probabilisticClassificationModelSummary = new ProbabilisticClassificationModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT); for (Tuple2<Object, Object> scoreAndLabel : scoresAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1()))); predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2()))); predictedVsActuals.add(predictedVsActual); if (log.isTraceEnabled()) { log.trace( "Predicted: " + predictedVsActual.getPredicted() + " ------ Actual: " + predictedVsActual.getActual()); } } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } probabilisticClassificationModelSummary.setTestResultDataPointsSample( testResultDataPointsSample); probabilisticClassificationModelSummary.setPredictedVsActuals(predictedVsActuals); // generate binary classification metrics BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(JavaRDD.toRDD(scoresAndLabels)); // store AUC probabilisticClassificationModelSummary.setAuc(metrics.areaUnderROC()); // store ROC data points List<Tuple2<Object, Object>> rocData = metrics.roc().toJavaRDD().collect(); JSONArray rocPoints = new JSONArray(); for (int i = 0; i < rocData.size(); i += 1) { JSONArray point = new JSONArray(); point.put(decimalFormat.format(rocData.get(i)._1())); point.put(decimalFormat.format(rocData.get(i)._2())); rocPoints.put(point); } probabilisticClassificationModelSummary.setRoc(rocPoints.toString()); return probabilisticClassificationModelSummary; }
/** * A utility method to generate regression model summary * * @param predictionsAndLabels Tuple2 containing predicted and actual values * @return Regression model summary */ public static ClassClassificationAndRegressionModelSummary generateRegressionModelSummary( JavaSparkContext sparkContext, JavaRDD<LabeledPoint> testingData, JavaRDD<Tuple2<Double, Double>> predictionsAndLabels) { ClassClassificationAndRegressionModelSummary regressionModelSummary = new ClassClassificationAndRegressionModelSummary(); // store predictions and actuals List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>(); DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT); for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) { PredictedVsActual predictedVsActual = new PredictedVsActual(); predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1()))); predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2()))); predictedVsActuals.add(predictedVsActual); } // create a list of feature values List<double[]> features = new ArrayList<double[]>(); for (LabeledPoint labeledPoint : testingData.collect()) { if (labeledPoint != null && labeledPoint.features() != null) { double[] rowFeatures = labeledPoint.features().toArray(); features.add(rowFeatures); } } // create a list of feature values with predicted vs. actuals List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>(); for (int i = 0; i < features.size(); i++) { TestResultDataPoint testResultDataPoint = new TestResultDataPoint(); testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i)); testResultDataPoint.setFeatureValues(features.get(i)); testResultDataPoints.add(testResultDataPoint); } // covert List to JavaRDD JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD = sparkContext.parallelize(testResultDataPoints); // collect RDD as a sampled list List<TestResultDataPoint> testResultDataPointsSample; if (testResultDataPointsJavaRDD.count() > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) { testResultDataPointsSample = testResultDataPointsJavaRDD.takeSample( true, MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()); } else { testResultDataPointsSample = testResultDataPointsJavaRDD.collect(); } regressionModelSummary.setTestResultDataPointsSample(testResultDataPointsSample); regressionModelSummary.setPredictedVsActuals(predictedVsActuals); // calculate mean squared error (MSE) double meanSquaredError = new JavaDoubleRDD( predictionsAndLabels .map( new Function<Tuple2<Double, Double>, Object>() { private static final long serialVersionUID = -162193633199074816L; public Object call(Tuple2<Double, Double> pair) { return Math.pow(pair._1() - pair._2(), 2.0); } }) .rdd()) .mean(); regressionModelSummary.setError(meanSquaredError); return regressionModelSummary; }