/** Build a supervised model. */ public MLModel build() throws MLModelBuilderException { MLModelConfigurationContext context = getContext(); JavaSparkContext sparkContext = null; DatabaseService databaseService = MLCoreServiceValueHolder.getInstance().getDatabaseService(); MLModel mlModel = new MLModel(); try { sparkContext = context.getSparkContext(); Workflow workflow = context.getFacts(); long modelId = context.getModelId(); // Verify validity of response variable String typeOfResponseVariable = getTypeOfResponseVariable(workflow.getResponseVariable(), workflow.getFeatures()); if (typeOfResponseVariable == null) { throw new MLModelBuilderException( "Type of response variable cannot be null for supervised learning " + "algorithms."); } // Stops model building if a categorical attribute is used with numerical prediction if (workflow.getAlgorithmClass().equals(AlgorithmType.NUMERICAL_PREDICTION.getValue()) && typeOfResponseVariable.equals(FeatureType.CATEGORICAL)) { throw new MLModelBuilderException( "Categorical attribute " + workflow.getResponseVariable() + " cannot be used as the response variable of the Numerical Prediction algorithm: " + workflow.getAlgorithmName()); } // generate train and test datasets by converting tokens to labeled points int responseIndex = context.getResponseIndex(); SortedMap<Integer, String> includedFeatures = MLUtils.getIncludedFeaturesAfterReordering( workflow, context.getNewToOldIndicesList(), responseIndex); // gets the pre-processed dataset JavaRDD<LabeledPoint> labeledPoints = preProcess().cache(); JavaRDD<LabeledPoint>[] dataSplit = labeledPoints.randomSplit( new double[] {workflow.getTrainDataFraction(), 1 - workflow.getTrainDataFraction()}, MLConstants.RANDOM_SEED); // remove from cache labeledPoints.unpersist(); JavaRDD<LabeledPoint> trainingData = dataSplit[0].cache(); JavaRDD<LabeledPoint> testingData = dataSplit[1]; // create a deployable MLModel object mlModel.setAlgorithmName(workflow.getAlgorithmName()); mlModel.setAlgorithmClass(workflow.getAlgorithmClass()); mlModel.setFeatures(workflow.getIncludedFeatures()); mlModel.setResponseVariable(workflow.getResponseVariable()); mlModel.setEncodings(context.getEncodings()); mlModel.setNewToOldIndicesList(context.getNewToOldIndicesList()); mlModel.setResponseIndex(responseIndex); ModelSummary summaryModel = null; Map<Integer, Integer> categoricalFeatureInfo; // build a machine learning model according to user selected algorithm SUPERVISED_ALGORITHM supervisedAlgorithm = SUPERVISED_ALGORITHM.valueOf(workflow.getAlgorithmName()); switch (supervisedAlgorithm) { case LOGISTIC_REGRESSION: summaryModel = buildLogisticRegressionModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures, true); break; case LOGISTIC_REGRESSION_LBFGS: summaryModel = buildLogisticRegressionModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures, false); break; case DECISION_TREE: categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings()); summaryModel = buildDecisionTreeModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures, categoricalFeatureInfo); break; case RANDOM_FOREST: categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings()); summaryModel = buildRandomForestTreeModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures, categoricalFeatureInfo); break; case SVM: summaryModel = buildSVMModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures); break; case NAIVE_BAYES: summaryModel = buildNaiveBayesModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures); break; case LINEAR_REGRESSION: summaryModel = buildLinearRegressionModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures); break; case RIDGE_REGRESSION: summaryModel = buildRidgeRegressionModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures); break; case LASSO_REGRESSION: summaryModel = buildLassoRegressionModel( sparkContext, modelId, trainingData, testingData, workflow, mlModel, includedFeatures); break; default: throw new AlgorithmNameException("Incorrect algorithm name"); } // persist model summary databaseService.updateModelSummary(modelId, summaryModel); return mlModel; } catch (Exception e) { throw new MLModelBuilderException( "An error occurred while building supervised machine learning model: " + e.getMessage(), e); } }