/** * This method builds a linear regression model * * @param sparkContext JavaSparkContext initialized with the application * @param modelID Model ID * @param trainingData Training data as a JavaRDD of LabeledPoints * @param testingData Testing data as a JavaRDD of LabeledPoints * @param workflow Machine learning workflow * @param mlModel Deployable machine learning model * @throws MLModelBuilderException */ private ModelSummary buildLinearRegressionModel( JavaSparkContext sparkContext, long modelID, JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow, MLModel mlModel, SortedMap<Integer, String> includedFeatures) throws MLModelBuilderException { try { LinearRegression linearRegression = new LinearRegression(); Map<String, String> hyperParameters = workflow.getHyperParameters(); LinearRegressionModel linearRegressionModel = linearRegression.train( trainingData, Integer.parseInt(hyperParameters.get(MLConstants.ITERATIONS)), Double.parseDouble(hyperParameters.get(MLConstants.LEARNING_RATE)), Double.parseDouble(hyperParameters.get(MLConstants.SGD_DATA_FRACTION))); // remove from cache trainingData.unpersist(); // add test data to cache testingData.cache(); Vector weights = linearRegressionModel.weights(); if (!isValidWeights(weights)) { throw new MLModelBuilderException( "Weights of the model generated are null or infinity. [Weights] " + vectorToString(weights)); } JavaRDD<Tuple2<Double, Double>> predictionsAndLabels = linearRegression.test(linearRegressionModel, testingData).cache(); ClassClassificationAndRegressionModelSummary regressionModelSummary = SparkModelUtils.generateRegressionModelSummary( sparkContext, testingData, predictionsAndLabels); // remove from cache testingData.unpersist(); mlModel.setModel(new MLGeneralizedLinearModel(linearRegressionModel)); List<FeatureImportance> featureWeights = getFeatureWeights(includedFeatures, linearRegressionModel.weights().toArray()); regressionModelSummary.setFeatures(includedFeatures.values().toArray(new String[0])); regressionModelSummary.setFeatureImportance(featureWeights); regressionModelSummary.setAlgorithm(SUPERVISED_ALGORITHM.LINEAR_REGRESSION.toString()); RegressionMetrics regressionMetrics = getRegressionMetrics(sparkContext, predictionsAndLabels); predictionsAndLabels.unpersist(); Double meanSquaredError = regressionMetrics.meanSquaredError(); regressionModelSummary.setMeanSquaredError(meanSquaredError); regressionModelSummary.setDatasetVersion(workflow.getDatasetVersion()); return regressionModelSummary; } catch (Exception e) { throw new MLModelBuilderException( "An error occurred while building linear regression model: " + e.getMessage(), e); } }