private int getNoOfClasses(MLModel mlModel) {
    if (mlModel.getEncodings() == null) {
        return -1;
    }
    int responseIndex = mlModel.getEncodings().size() - 1;
    return mlModel.getEncodings().get(responseIndex) != null
            ? mlModel.getEncodings().get(responseIndex).size()
            : -1;
}
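/**
 * Illustrative sketch, not part of the original source: shows the encoding
 * layout getNoOfClasses relies on. The response variable's encoding map is
 * assumed to sit at the last index of the encodings list, so its size is the
 * number of classes. The class labels below are hypothetical.
 */
private static int noOfClassesSketch() {
    List<Map<String, Integer>> encodings = new ArrayList<Map<String, Integer>>();
    Map<String, Integer> responseEncoding = new HashMap<String, Integer>();
    responseEncoding.put("positive", 0); // hypothetical label -> encoded value
    responseEncoding.put("negative", 1);
    encodings.add(responseEncoding); // response encoding is the last entry
    // a model carrying these encodings yields getNoOfClasses(...) == 2;
    // a model with no encodings yields -1
    return encodings.get(encodings.size() - 1).size();
}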
/**
 * This method builds a decision tree model.
 *
 * @param sparkContext           JavaSparkContext initialized with the application
 * @param modelID                Model ID
 * @param trainingData           Training data as a JavaRDD of LabeledPoints
 * @param testingData            Testing data as a JavaRDD of LabeledPoints
 * @param workflow               Machine learning workflow
 * @param mlModel                Deployable machine learning model
 * @param includedFeatures       Included features, keyed by feature index
 * @param categoricalFeatureInfo Categorical feature indices mapped to their category counts
 * @return Model summary of the built decision tree model
 * @throws MLModelBuilderException
 */
private ModelSummary buildDecisionTreeModel(JavaSparkContext sparkContext, long modelID,
        JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow,
        MLModel mlModel, SortedMap<Integer, String> includedFeatures,
        Map<Integer, Integer> categoricalFeatureInfo) throws MLModelBuilderException {
    try {
        Map<String, String> hyperParameters = workflow.getHyperParameters();
        DecisionTree decisionTree = new DecisionTree();
        DecisionTreeModel decisionTreeModel = decisionTree.train(trainingData, getNoOfClasses(mlModel),
                categoricalFeatureInfo, hyperParameters.get(MLConstants.IMPURITY),
                Integer.parseInt(hyperParameters.get(MLConstants.MAX_DEPTH)),
                Integer.parseInt(hyperParameters.get(MLConstants.MAX_BINS)));

        // remove training data from cache and cache test data instead
        trainingData.unpersist();
        testingData.cache();

        JavaPairRDD<Double, Double> predictionsAndLabels =
                decisionTree.test(decisionTreeModel, testingData).cache();
        ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary =
                SparkModelUtils.getClassClassificationModelSummary(sparkContext, testingData,
                        predictionsAndLabels);

        // remove test data from cache
        testingData.unpersist();

        mlModel.setModel(new MLDecisionTreeModel(decisionTreeModel));

        classClassificationAndRegressionModelSummary.setFeatures(
                includedFeatures.values().toArray(new String[0]));
        classClassificationAndRegressionModelSummary.setAlgorithm(
                SUPERVISED_ALGORITHM.DECISION_TREE.toString());

        MulticlassMetrics multiclassMetrics = getMulticlassMetrics(sparkContext, predictionsAndLabels);
        predictionsAndLabels.unpersist();

        classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix(
                getMulticlassConfusionMatrix(multiclassMetrics, mlModel));
        Double modelAccuracy = getModelAccuracy(multiclassMetrics);
        classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy);
        classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion());

        return classClassificationAndRegressionModelSummary;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building decision tree model: " + e.getMessage(), e);
    }
}
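/**
 * Illustrative sketch, not part of the original source: the shape of the
 * hyperparameter map buildDecisionTreeModel reads from the workflow, keyed by
 * the MLConstants names used above. The values shown are hypothetical; note
 * that MAX_DEPTH and MAX_BINS are stored as strings and parsed with
 * Integer.parseInt.
 */
private static Map<String, String> decisionTreeHyperParametersSketch() {
    Map<String, String> hyperParameters = new HashMap<String, String>();
    hyperParameters.put(MLConstants.IMPURITY, "gini"); // impurity measure, e.g. "gini" or "entropy"
    hyperParameters.put(MLConstants.MAX_DEPTH, "5");   // maximum tree depth
    hyperParameters.put(MLConstants.MAX_BINS, "100");  // maximum number of bins per feature
    return hyperParameters;
}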
/**
 * This method applies a stacked autoencoder model to a given dataset and makes predictions.
 *
 * @param ctxt              JavaSparkContext
 * @param deeplearningModel Stacked autoencoder model
 * @param test              Testing dataset as a JavaRDD of labeled points
 * @param mlModel           Deployable machine learning model
 * @return Predictions and actual labels as a JavaPairRDD
 * @throws MLModelBuilderException
 */
public JavaPairRDD<Double, Double> test(JavaSparkContext ctxt, final DeepLearningModel deeplearningModel,
        JavaRDD<LabeledPoint> test, MLModel mlModel) throws MLModelBuilderException {

    if (deeplearningModel == null) {
        throw new MLModelBuilderException("DeepLearningModel is null.");
    }

    Scope.enter();

    int numberOfFeatures = mlModel.getFeatures().size();
    List<Feature> features = mlModel.getFeatures();
    String[] names = new String[numberOfFeatures + 1];
    for (int i = 0; i < numberOfFeatures; i++) {
        names[i] = features.get(i).getName();
    }
    names[numberOfFeatures] = mlModel.getResponseVariable();

    Frame testData = DeeplearningModelUtils.javaRDDToFrame(names, test);
    Frame testDataWithoutLabels = testData.subframe(0, testData.numCols() - 1);
    int numRows = (int) testDataWithoutLabels.numRows();

    Vec predictionsVector = deeplearningModel.score(testDataWithoutLabels).vec(0);
    double[] predictionValues = new double[numRows];
    for (int i = 0; i < numRows; i++) {
        predictionValues[i] = predictionsVector.at(i);
    }
    Vec labelsVector = testData.vec(testData.numCols() - 1);
    double[] labels = new double[numRows];
    for (int i = 0; i < numRows; i++) {
        labels[i] = labelsVector.at(i);
    }

    Scope.exit();

    ArrayList<Tuple2<Double, Double>> tupleList = new ArrayList<Tuple2<Double, Double>>();
    for (int i = 0; i < labels.length; i++) {
        tupleList.add(new Tuple2<Double, Double>(predictionValues[i], labels[i]));
    }

    return ctxt.parallelizePairs(tupleList);
}
/**
 * This method builds a Naive Bayes model.
 *
 * @param sparkContext     JavaSparkContext initialized with the application
 * @param modelID          Model ID
 * @param trainingData     Training data as a JavaRDD of LabeledPoints
 * @param testingData      Testing data as a JavaRDD of LabeledPoints
 * @param workflow         Machine learning workflow
 * @param mlModel          Deployable machine learning model
 * @param includedFeatures Included features, keyed by feature index
 * @return Model summary of the built Naive Bayes model
 * @throws MLModelBuilderException
 */
private ModelSummary buildNaiveBayesModel(JavaSparkContext sparkContext, long modelID,
        JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow,
        MLModel mlModel, SortedMap<Integer, String> includedFeatures) throws MLModelBuilderException {
    try {
        Map<String, String> hyperParameters = workflow.getHyperParameters();
        NaiveBayesClassifier naiveBayesClassifier = new NaiveBayesClassifier();
        NaiveBayesModel naiveBayesModel = naiveBayesClassifier.train(trainingData,
                Double.parseDouble(hyperParameters.get(MLConstants.LAMBDA)));

        // remove training data from cache and cache test data instead
        trainingData.unpersist();
        testingData.cache();

        JavaPairRDD<Double, Double> predictionsAndLabels =
                naiveBayesClassifier.test(naiveBayesModel, testingData).cache();
        ClassClassificationAndRegressionModelSummary classClassificationAndRegressionModelSummary =
                SparkModelUtils.getClassClassificationModelSummary(sparkContext, testingData,
                        predictionsAndLabels);

        // remove test data from cache
        testingData.unpersist();

        mlModel.setModel(new MLClassificationModel(naiveBayesModel));

        classClassificationAndRegressionModelSummary.setFeatures(
                includedFeatures.values().toArray(new String[0]));
        classClassificationAndRegressionModelSummary.setAlgorithm(
                SUPERVISED_ALGORITHM.NAIVE_BAYES.toString());

        MulticlassMetrics multiclassMetrics = getMulticlassMetrics(sparkContext, predictionsAndLabels);
        predictionsAndLabels.unpersist();

        classClassificationAndRegressionModelSummary.setMulticlassConfusionMatrix(
                getMulticlassConfusionMatrix(multiclassMetrics, mlModel));
        Double modelAccuracy = getModelAccuracy(multiclassMetrics);
        classClassificationAndRegressionModelSummary.setModelAccuracy(modelAccuracy);
        classClassificationAndRegressionModelSummary.setDatasetVersion(workflow.getDatasetVersion());

        return classClassificationAndRegressionModelSummary;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building Naive Bayes model: " + e.getMessage(), e);
    }
}
/**
 * This method returns the multiclass confusion matrix for a given multiclass metrics object.
 *
 * @param multiclassMetrics Multiclass metrics object
 * @param mlModel           Deployable machine learning model
 * @return Multiclass confusion matrix
 */
private MulticlassConfusionMatrix getMulticlassConfusionMatrix(MulticlassMetrics multiclassMetrics,
        MLModel mlModel) {
    MulticlassConfusionMatrix multiclassConfusionMatrix = new MulticlassConfusionMatrix();
    if (multiclassMetrics != null) {
        int size = multiclassMetrics.confusionMatrix().numCols();
        double[] matrixArray = multiclassMetrics.confusionMatrix().toArray();
        double[][] matrix = new double[size][size];
        // copy the column-major flat array into a 2D array
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                matrix[i][j] = matrixArray[(j * size) + i];
            }
        }
        multiclassConfusionMatrix.setMatrix(matrix);

        List<Map<String, Integer>> encodings = mlModel.getEncodings();
        // decode only if encodings are available
        if (encodings != null) {
            // last index is the response variable encoding
            Map<String, Integer> encodingMap = encodings.get(encodings.size() - 1);
            List<String> decodedLabels = new ArrayList<String>();
            for (double label : multiclassMetrics.labels()) {
                Integer labelInt = (int) label;
                String decodedLabel = MLUtils.getKeyByValue(encodingMap, labelInt);
                if (decodedLabel != null) {
                    decodedLabels.add(decodedLabel);
                }
            }
            multiclassConfusionMatrix.setLabels(decodedLabels);
        } else {
            List<String> labelList = toStringList(multiclassMetrics.labels());
            multiclassConfusionMatrix.setLabels(labelList);
        }
        multiclassConfusionMatrix.setSize(size);
    }
    return multiclassConfusionMatrix;
}
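/**
 * Illustrative sketch, not part of the original source: MLlib's
 * Matrix#toArray() returns entries in column-major order, which is why
 * getMulticlassConfusionMatrix indexes the flat array as
 * matrixArray[(j * size) + i]. For a hypothetical 2x2 confusion matrix
 * {{5, 1}, {2, 7}}, toArray() yields {5, 2, 1, 7} (column 0 first).
 */
private static double[][] unpackColumnMajorSketch() {
    double[] matrixArray = new double[] {5, 2, 1, 7}; // column-major flat array
    int size = 2;
    double[][] matrix = new double[size][size];
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            matrix[i][j] = matrixArray[(j * size) + i]; // row i, column j
        }
    }
    return matrix; // {{5.0, 1.0}, {2.0, 7.0}}
}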
/**
 * This method builds a lasso regression model.
 *
 * @param sparkContext     JavaSparkContext initialized with the application
 * @param modelID          Model ID
 * @param trainingData     Training data as a JavaRDD of LabeledPoints
 * @param testingData      Testing data as a JavaRDD of LabeledPoints
 * @param workflow         Machine learning workflow
 * @param mlModel          Deployable machine learning model
 * @param includedFeatures Included features, keyed by feature index
 * @return Model summary of the built lasso regression model
 * @throws MLModelBuilderException
 */
private ModelSummary buildLassoRegressionModel(JavaSparkContext sparkContext, long modelID,
        JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow,
        MLModel mlModel, SortedMap<Integer, String> includedFeatures) throws MLModelBuilderException {
    try {
        LassoRegression lassoRegression = new LassoRegression();
        Map<String, String> hyperParameters = workflow.getHyperParameters();
        LassoModel lassoModel = lassoRegression.train(trainingData,
                Integer.parseInt(hyperParameters.get(MLConstants.ITERATIONS)),
                Double.parseDouble(hyperParameters.get(MLConstants.LEARNING_RATE)),
                Double.parseDouble(hyperParameters.get(MLConstants.REGULARIZATION_PARAMETER)),
                Double.parseDouble(hyperParameters.get(MLConstants.SGD_DATA_FRACTION)));

        // remove training data from cache and cache test data instead
        trainingData.unpersist();
        testingData.cache();

        Vector weights = lassoModel.weights();
        if (!isValidWeights(weights)) {
            throw new MLModelBuilderException(
                    "Weights of the generated model are null or infinite. [Weights] "
                            + vectorToString(weights));
        }

        JavaRDD<Tuple2<Double, Double>> predictionsAndLabels =
                lassoRegression.test(lassoModel, testingData).cache();
        ClassClassificationAndRegressionModelSummary regressionModelSummary =
                SparkModelUtils.generateRegressionModelSummary(sparkContext, testingData,
                        predictionsAndLabels);

        // remove test data from cache
        testingData.unpersist();

        mlModel.setModel(new MLGeneralizedLinearModel(lassoModel));

        List<FeatureImportance> featureWeights =
                getFeatureWeights(includedFeatures, lassoModel.weights().toArray());
        regressionModelSummary.setFeatures(includedFeatures.values().toArray(new String[0]));
        regressionModelSummary.setAlgorithm(SUPERVISED_ALGORITHM.LASSO_REGRESSION.toString());
        regressionModelSummary.setFeatureImportance(featureWeights);

        RegressionMetrics regressionMetrics = getRegressionMetrics(sparkContext, predictionsAndLabels);
        predictionsAndLabels.unpersist();

        Double meanSquaredError = regressionMetrics.meanSquaredError();
        regressionModelSummary.setMeanSquaredError(meanSquaredError);
        regressionModelSummary.setDatasetVersion(workflow.getDatasetVersion());

        return regressionModelSummary;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building lasso regression model: " + e.getMessage(), e);
    }
}
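/**
 * Illustrative sketch, not part of the original source: the kind of check
 * isValidWeights is assumed to perform. A divergent SGD run (for instance,
 * with too large a learning rate) can produce NaN or infinite weights, which
 * the builders above reject before generating a model summary.
 */
private static boolean validWeightsSketch(Vector weights) {
    for (double weight : weights.toArray()) {
        if (Double.isNaN(weight) || Double.isInfinite(weight)) {
            return false;
        }
    }
    return true;
}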
/**
 * This method builds a support vector machine (SVM) model.
 *
 * @param sparkContext     JavaSparkContext initialized with the application
 * @param modelID          Model ID
 * @param trainingData     Training data as a JavaRDD of LabeledPoints
 * @param testingData      Testing data as a JavaRDD of LabeledPoints
 * @param workflow         Machine learning workflow
 * @param mlModel          Deployable machine learning model
 * @param includedFeatures Included features, keyed by feature index
 * @return Model summary of the built SVM model
 * @throws MLModelBuilderException
 */
private ModelSummary buildSVMModel(JavaSparkContext sparkContext, long modelID,
        JavaRDD<LabeledPoint> trainingData, JavaRDD<LabeledPoint> testingData, Workflow workflow,
        MLModel mlModel, SortedMap<Integer, String> includedFeatures) throws MLModelBuilderException {

    if (getNoOfClasses(mlModel) > 2) {
        throw new MLModelBuilderException("A binary classification algorithm cannot have more than "
                + "two distinct values in the response variable.");
    }

    try {
        SVM svm = new SVM();
        Map<String, String> hyperParameters = workflow.getHyperParameters();
        SVMModel svmModel = svm.train(trainingData,
                Integer.parseInt(hyperParameters.get(MLConstants.ITERATIONS)),
                hyperParameters.get(MLConstants.REGULARIZATION_TYPE),
                Double.parseDouble(hyperParameters.get(MLConstants.REGULARIZATION_PARAMETER)),
                Double.parseDouble(hyperParameters.get(MLConstants.LEARNING_RATE)),
                Double.parseDouble(hyperParameters.get(MLConstants.SGD_DATA_FRACTION)));

        // remove training data from cache and cache test data instead
        trainingData.unpersist();
        testingData.cache();

        Vector weights = svmModel.weights();
        if (!isValidWeights(weights)) {
            throw new MLModelBuilderException(
                    "Weights of the generated model are null or infinite. [Weights] "
                            + vectorToString(weights));
        }

        // get scores and labels without clearing the threshold, to build the confusion matrix
        JavaRDD<Tuple2<Object, Object>> scoresAndLabelsThresholded = svm.test(svmModel, testingData);
        MulticlassMetrics multiclassMetrics =
                new MulticlassMetrics(JavaRDD.toRDD(scoresAndLabelsThresholded));
        MulticlassConfusionMatrix multiclassConfusionMatrix =
                getMulticlassConfusionMatrix(multiclassMetrics, mlModel);

        svmModel.clearThreshold();
        JavaRDD<Tuple2<Object, Object>> scoresAndLabels = svm.test(svmModel, testingData);
        ProbabilisticClassificationModelSummary probabilisticClassificationModelSummary =
                SparkModelUtils.generateProbabilisticClassificationModelSummary(sparkContext,
                        testingData, scoresAndLabels);

        // remove test data from cache
        testingData.unpersist();

        mlModel.setModel(new MLClassificationModel(svmModel));

        List<FeatureImportance> featureWeights =
                getFeatureWeights(includedFeatures, svmModel.weights().toArray());
        probabilisticClassificationModelSummary.setFeatures(
                includedFeatures.values().toArray(new String[0]));
        probabilisticClassificationModelSummary.setFeatureImportance(featureWeights);
        probabilisticClassificationModelSummary.setAlgorithm(SUPERVISED_ALGORITHM.SVM.toString());
        probabilisticClassificationModelSummary.setMulticlassConfusionMatrix(multiclassConfusionMatrix);

        Double modelAccuracy = getModelAccuracy(multiclassMetrics);
        probabilisticClassificationModelSummary.setModelAccuracy(modelAccuracy);
        probabilisticClassificationModelSummary.setDatasetVersion(workflow.getDatasetVersion());

        return probabilisticClassificationModelSummary;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building SVM model: " + e.getMessage(), e);
    }
}
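/**
 * Illustrative sketch, not part of the original source: why buildSVMModel
 * tests the model twice. While a threshold is set, SVMModel#predict returns
 * hard 0/1 labels, which feed the confusion matrix; after clearThreshold() it
 * returns raw margin scores, which feed the probabilistic (ROC-style) summary.
 */
private static void svmThresholdSketch(SVMModel svmModel, Vector features) {
    double hardLabel = svmModel.predict(features); // 0.0 or 1.0 while the threshold is set
    svmModel.clearThreshold();
    double rawScore = svmModel.predict(features);  // unbounded margin score
    System.out.println("label: " + hardLabel + ", score: " + rawScore);
}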
/**
 * Build a supervised machine learning model.
 *
 * @return Deployable machine learning model
 * @throws MLModelBuilderException
 */
public MLModel build() throws MLModelBuilderException {
    MLModelConfigurationContext context = getContext();
    JavaSparkContext sparkContext = null;
    DatabaseService databaseService = MLCoreServiceValueHolder.getInstance().getDatabaseService();
    MLModel mlModel = new MLModel();
    try {
        sparkContext = context.getSparkContext();
        Workflow workflow = context.getFacts();
        long modelId = context.getModelId();

        // verify validity of the response variable
        String typeOfResponseVariable =
                getTypeOfResponseVariable(workflow.getResponseVariable(), workflow.getFeatures());

        if (typeOfResponseVariable == null) {
            throw new MLModelBuilderException(
                    "Type of response variable cannot be null for supervised learning algorithms.");
        }

        // stop model building if a categorical attribute is used with numerical prediction
        if (workflow.getAlgorithmClass().equals(AlgorithmType.NUMERICAL_PREDICTION.getValue())
                && typeOfResponseVariable.equals(FeatureType.CATEGORICAL)) {
            throw new MLModelBuilderException("Categorical attribute " + workflow.getResponseVariable()
                    + " cannot be used as the response variable of the numerical prediction algorithm: "
                    + workflow.getAlgorithmName());
        }

        // generate train and test datasets by converting tokens to labeled points
        int responseIndex = context.getResponseIndex();
        SortedMap<Integer, String> includedFeatures = MLUtils.getIncludedFeaturesAfterReordering(
                workflow, context.getNewToOldIndicesList(), responseIndex);

        // get the pre-processed dataset
        JavaRDD<LabeledPoint> labeledPoints = preProcess().cache();

        JavaRDD<LabeledPoint>[] dataSplit = labeledPoints.randomSplit(
                new double[] {workflow.getTrainDataFraction(), 1 - workflow.getTrainDataFraction()},
                MLConstants.RANDOM_SEED);

        // remove from cache
        labeledPoints.unpersist();

        JavaRDD<LabeledPoint> trainingData = dataSplit[0].cache();
        JavaRDD<LabeledPoint> testingData = dataSplit[1];

        // create a deployable MLModel object
        mlModel.setAlgorithmName(workflow.getAlgorithmName());
        mlModel.setAlgorithmClass(workflow.getAlgorithmClass());
        mlModel.setFeatures(workflow.getIncludedFeatures());
        mlModel.setResponseVariable(workflow.getResponseVariable());
        mlModel.setEncodings(context.getEncodings());
        mlModel.setNewToOldIndicesList(context.getNewToOldIndicesList());
        mlModel.setResponseIndex(responseIndex);

        ModelSummary summaryModel = null;
        Map<Integer, Integer> categoricalFeatureInfo;

        // build a machine learning model according to the algorithm selected by the user
        SUPERVISED_ALGORITHM supervisedAlgorithm =
                SUPERVISED_ALGORITHM.valueOf(workflow.getAlgorithmName());
        switch (supervisedAlgorithm) {
        case LOGISTIC_REGRESSION:
            summaryModel = buildLogisticRegressionModel(sparkContext, modelId, trainingData,
                    testingData, workflow, mlModel, includedFeatures, true);
            break;
        case LOGISTIC_REGRESSION_LBFGS:
            summaryModel = buildLogisticRegressionModel(sparkContext, modelId, trainingData,
                    testingData, workflow, mlModel, includedFeatures, false);
            break;
        case DECISION_TREE:
            categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
            summaryModel = buildDecisionTreeModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures, categoricalFeatureInfo);
            break;
        case RANDOM_FOREST:
            categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
            summaryModel = buildRandomForestTreeModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures, categoricalFeatureInfo);
            break;
        case SVM:
            summaryModel = buildSVMModel(sparkContext, modelId, trainingData, testingData, workflow,
                    mlModel, includedFeatures);
            break;
        case NAIVE_BAYES:
            summaryModel = buildNaiveBayesModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures);
            break;
        case LINEAR_REGRESSION:
            summaryModel = buildLinearRegressionModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures);
            break;
        case RIDGE_REGRESSION:
            summaryModel = buildRidgeRegressionModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures);
            break;
        case LASSO_REGRESSION:
            summaryModel = buildLassoRegressionModel(sparkContext, modelId, trainingData, testingData,
                    workflow, mlModel, includedFeatures);
            break;
        default:
            throw new AlgorithmNameException("Incorrect algorithm name: " + workflow.getAlgorithmName());
        }

        // persist the model summary
        databaseService.updateModelSummary(modelId, summaryModel);
        return mlModel;
    } catch (Exception e) {
        throw new MLModelBuilderException(
                "An error occurred while building supervised machine learning model: " + e.getMessage(),
                e);
    }
}
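/**
 * Illustrative sketch, not part of the original source: how build() derives
 * the weights passed to JavaRDD#randomSplit from the workflow's train data
 * fraction. A fraction of 0.7 yields {0.7, 0.3}, i.e. roughly 70% training
 * data and 30% testing data, split with the fixed MLConstants.RANDOM_SEED
 * for reproducibility.
 */
private static double[] splitWeightsSketch(double trainDataFraction) {
    return new double[] {trainDataFraction, 1 - trainDataFraction};
}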
/**
 * This method trains a stacked autoencoder.
 *
 * @param trainData      Training dataset as a JavaRDD
 * @param batchSize      Size of a training mini-batch
 * @param layerSizes     Number of neurons in each hidden layer
 * @param activationType Type of the activation function
 * @param epochs         Number of epochs to train
 * @param responseColumn Name of the response column
 * @param modelName      Name of the model
 * @param mlModel        Deployable machine learning model
 * @param modelID        Model ID
 * @return DeepLearningModel
 */
public DeepLearningModel train(JavaRDD<LabeledPoint> trainData, int batchSize, int[] layerSizes,
        String activationType, int epochs, String responseColumn, String modelName, MLModel mlModel,
        long modelID) {
    // build the stacked autoencoder by training the model with the training data
    double trainingFraction = 1;
    try {
        Scope.enter();
        if (trainData != null) {
            int numberOfFeatures = mlModel.getFeatures().size();
            List<Feature> features = mlModel.getFeatures();
            String[] names = new String[numberOfFeatures + 1];
            for (int i = 0; i < numberOfFeatures; i++) {
                names[i] = features.get(i).getName();
            }
            names[numberOfFeatures] = mlModel.getResponseVariable();

            Frame frame = DeeplearningModelUtils.javaRDDToFrame(names, trainData);

            // H2O uses C<x> as the default column header
            // String classifColName = "C" + frame.numCols();
            String classifColName = mlModel.getResponseVariable();

            // convert the response column to categorical
            int ci = frame.find(classifColName);
            Scope.track(frame.replace(ci, frame.vecs()[ci].toEnum())._key);

            // split the frame into train and validation frames;
            // using FrameSplitter (instead of ShuffleSplitFrame) gives a weird exception:
            // barrier onExCompletion for hex.deeplearning.DeepLearning$DeepLearningDriver@78ec854
            double[] ratios = new double[] {trainingFraction, 1 - trainingFraction};
            @SuppressWarnings("unchecked")
            Frame[] splits = ShuffleSplitFrame.shuffleSplitFrame(frame,
                    generateNumKeys(frame._key, ratios.length), ratios, 123456789);
            Frame trainFrame = splits[0];
            Frame vframe = splits[1];

            if (log.isDebugEnabled()) {
                log.debug("Creating deep learning parameters");
            }
            DeepLearningParameters deeplearningParameters = new DeepLearningParameters();

            // convert the model name into a valid key name
            String dlModelName = modelName.replace('.', '_').replace('-', '_');

            // populate model parameters
            deeplearningParameters._model_id = Key.make(dlModelName + "_dl");
            deeplearningParameters._train = trainFrame._key;
            deeplearningParameters._valid = vframe._key;
            deeplearningParameters._response_column = classifColName; // last column is the response
            // enabling autoencoder mode was causing all the predictions to be 0.0
            // deeplearningParameters._autoencoder = true;
            deeplearningParameters._activation = getActivationType(activationType);
            deeplearningParameters._hidden = layerSizes;
            deeplearningParameters._train_samples_per_iteration = batchSize;
            deeplearningParameters._input_dropout_ratio = 0.2;
            deeplearningParameters._l1 = 1e-5;
            deeplearningParameters._max_w2 = 10;
            deeplearningParameters._epochs = epochs;

            // speed up training
            deeplearningParameters._adaptive_rate = true; // use adaptive per-weight learning rate
            deeplearningParameters._replicate_training_data = true; // replicate the training data on
                                                                    // every node for load balancing
            deeplearningParameters._overwrite_with_best_model = true; // no need to keep the best model
                                                                      // around separately
            deeplearningParameters._diagnostics = false; // no need to compute statistics during training
            deeplearningParameters._classification_stop = -1;
            deeplearningParameters._score_interval = 60; // score and print a progress report (only)
                                                         // every 60 seconds
            deeplearningParameters._score_training_samples = batchSize / 10; // only score on a small
                                                                             // sample of the training set ->
                                                                             // don't want to spend too much
                                                                             // time scoring (note: there will
                                                                             // be at least 1 row per chunk)

            DKV.put(trainFrame);
            DKV.put(vframe);
            deeplearning = new DeepLearning(deeplearningParameters);

            if (log.isDebugEnabled()) {
                log.debug("Start training deep learning model ....");
            }
            try {
                dlModel = deeplearning.trainModel().get();
                if (log.isDebugEnabled()) {
                    log.debug("Successfully finished training deep learning model.");
                }
            } catch (RuntimeException ex) {
                log.error("Error in training stacked autoencoder classifier model", ex);
            }
        } else {
            log.error("Training data is null.");
        }
    } catch (RuntimeException ex) {
        log.error("Failed to train the deep learning model [id] " + modelID + ". " + ex.getMessage());
    } finally {
        Scope.exit();
    }
    return dlModel;
}
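/**
 * Illustrative sketch, not part of the original source: both train(...) and
 * test(...) build the H2O frame header the same way, feature names first and
 * the response variable name last, matching the column order produced by
 * DeeplearningModelUtils.javaRDDToFrame.
 */
private static String[] frameHeaderSketch(MLModel mlModel) {
    List<Feature> features = mlModel.getFeatures();
    String[] names = new String[features.size() + 1];
    for (int i = 0; i < features.size(); i++) {
        names[i] = features.get(i).getName(); // feature columns in model order
    }
    names[features.size()] = mlModel.getResponseVariable(); // response column last
    return names;
}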