  /**
   * Builds a supervised machine learning model for the algorithm selected in the workflow.
   *
   * @return a deployable {@link MLModel}
   * @throws MLModelBuilderException if the response variable is invalid or model building fails
   */
  public MLModel build() throws MLModelBuilderException {
    MLModelConfigurationContext context = getContext();
    JavaSparkContext sparkContext = null;
    DatabaseService databaseService = MLCoreServiceValueHolder.getInstance().getDatabaseService();
    MLModel mlModel = new MLModel();
    try {
      sparkContext = context.getSparkContext();
      Workflow workflow = context.getFacts();
      long modelId = context.getModelId();

      // Verify validity of response variable
      String typeOfResponseVariable =
          getTypeOfResponseVariable(workflow.getResponseVariable(), workflow.getFeatures());

      if (typeOfResponseVariable == null) {
        throw new MLModelBuilderException(
            "Type of response variable cannot be null for supervised learning algorithms.");
      }

      // Stops model building if a categorical attribute is used with numerical prediction
      if (workflow.getAlgorithmClass().equals(AlgorithmType.NUMERICAL_PREDICTION.getValue())
          && typeOfResponseVariable.equals(FeatureType.CATEGORICAL)) {
        throw new MLModelBuilderException(
            "Categorical attribute "
                + workflow.getResponseVariable()
                + " cannot be used as the response variable of the Numerical Prediction algorithm: "
                + workflow.getAlgorithmName());
      }

      // generate train and test datasets by converting tokens to labeled points
      int responseIndex = context.getResponseIndex();
      SortedMap<Integer, String> includedFeatures =
          MLUtils.getIncludedFeaturesAfterReordering(
              workflow, context.getNewToOldIndicesList(), responseIndex);

      // get the pre-processed dataset and cache it
      JavaRDD<LabeledPoint> labeledPoints = preProcess().cache();

      JavaRDD<LabeledPoint>[] dataSplit =
          labeledPoints.randomSplit(
              new double[] {workflow.getTrainDataFraction(), 1 - workflow.getTrainDataFraction()},
              MLConstants.RANDOM_SEED);

      // remove from cache
      labeledPoints.unpersist();

      JavaRDD<LabeledPoint> trainingData = dataSplit[0].cache();
      JavaRDD<LabeledPoint> testingData = dataSplit[1];
      // create a deployable MLModel object
      mlModel.setAlgorithmName(workflow.getAlgorithmName());
      mlModel.setAlgorithmClass(workflow.getAlgorithmClass());
      mlModel.setFeatures(workflow.getIncludedFeatures());
      mlModel.setResponseVariable(workflow.getResponseVariable());
      mlModel.setEncodings(context.getEncodings());
      mlModel.setNewToOldIndicesList(context.getNewToOldIndicesList());
      mlModel.setResponseIndex(responseIndex);

      ModelSummary summaryModel = null;
      Map<Integer, Integer> categoricalFeatureInfo;

      // build a machine learning model according to the user-selected algorithm
      SUPERVISED_ALGORITHM supervisedAlgorithm =
          SUPERVISED_ALGORITHM.valueOf(workflow.getAlgorithmName());
      switch (supervisedAlgorithm) {
        case LOGISTIC_REGRESSION:
          summaryModel =
              buildLogisticRegressionModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures,
                  true);
          break;
        case LOGISTIC_REGRESSION_LBFGS:
          summaryModel =
              buildLogisticRegressionModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures,
                  false);
          break;
        case DECISION_TREE:
          categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
          summaryModel =
              buildDecisionTreeModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures,
                  categoricalFeatureInfo);
          break;
        case RANDOM_FOREST:
          categoricalFeatureInfo = getCategoricalFeatureInfo(context.getEncodings());
          summaryModel =
              buildRandomForestTreeModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures,
                  categoricalFeatureInfo);
          break;
        case SVM:
          summaryModel =
              buildSVMModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures);
          break;
        case NAIVE_BAYES:
          summaryModel =
              buildNaiveBayesModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures);
          break;
        case LINEAR_REGRESSION:
          summaryModel =
              buildLinearRegressionModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures);
          break;
        case RIDGE_REGRESSION:
          summaryModel =
              buildRidgeRegressionModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures);
          break;
        case LASSO_REGRESSION:
          summaryModel =
              buildLassoRegressionModel(
                  sparkContext,
                  modelId,
                  trainingData,
                  testingData,
                  workflow,
                  mlModel,
                  includedFeatures);
          break;
        default:
          throw new AlgorithmNameException(
              "Incorrect algorithm name: " + workflow.getAlgorithmName());
      }

      // persist model summary
      databaseService.updateModelSummary(modelId, summaryModel);
      return mlModel;
    } catch (Exception e) {
      throw new MLModelBuilderException(
          "An error occurred while building supervised machine learning model: " + e.getMessage(),
          e);
    }
  }
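
The DECISION_TREE and RANDOM_FOREST cases above depend on getCategoricalFeatureInfo, because Spark MLlib's tree trainers take categorical feature metadata as a Map<Integer, Integer> from feature index to category count. A minimal sketch of that helper, assuming context.getEncodings() returns one category-to-index map per feature (empty for numerical features) with the response variable's encoding last:

  // Sketch under the assumptions above; requires java.util.HashMap, java.util.List,
  // java.util.Map. Not necessarily the original implementation.
  private Map<Integer, Integer> getCategoricalFeatureInfo(List<Map<String, Integer>> encodings) {
    Map<Integer, Integer> categoricalFeatureInfo = new HashMap<Integer, Integer>();
    // Skip the last encoding, assumed to belong to the response variable.
    for (int i = 0; i < encodings.size() - 1; i++) {
      if (!encodings.get(i).isEmpty()) {
        // A non-empty map means the feature was label-encoded; its size is the
        // number of distinct categories the tree trainer must treat as unordered.
        categoricalFeatureInfo.put(i, encodings.get(i).size());
      }
    }
    return categoricalFeatureInfo;
  }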
Example #2
  /**
   * A utility method to generate a class classification model summary.
   *
   * @param sparkContext Spark context used to parallelize the sampled test results
   * @param testingData Test dataset as labeled points
   * @param predictionsAndLabels Predictions and actual labels
   * @return Class classification model summary
   */
  public static ClassClassificationAndRegressionModelSummary getClassClassificationModelSummary(
      JavaSparkContext sparkContext,
      JavaRDD<LabeledPoint> testingData,
      JavaPairRDD<Double, Double> predictionsAndLabels) {
    ClassClassificationAndRegressionModelSummary classClassificationModelSummary =
        new ClassClassificationAndRegressionModelSummary();
    // store predictions and actuals
    List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>();
    for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) {
      PredictedVsActual predictedVsActual = new PredictedVsActual();
      predictedVsActual.setPredicted(scoreAndLabel._1());
      predictedVsActual.setActual(scoreAndLabel._2());
      predictedVsActuals.add(predictedVsActual);
    }
    // create a list of feature values
    List<double[]> features = new ArrayList<double[]>();
    for (LabeledPoint labeledPoint : testingData.collect()) {
      if (labeledPoint != null && labeledPoint.features() != null) {
        double[] rowFeatures = labeledPoint.features().toArray();
        features.add(rowFeatures);
      }
    }
    // create a list of feature values with predicted vs. actuals
    List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>();
    for (int i = 0; i < features.size(); i++) {
      TestResultDataPoint testResultDataPoint = new TestResultDataPoint();
      testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i));
      testResultDataPoint.setFeatureValues(features.get(i));
      testResultDataPoints.add(testResultDataPoint);
    }
    // convert List to JavaRDD
    JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD =
        sparkContext.parallelize(testResultDataPoints);
    // collect RDD as a sampled list
    List<TestResultDataPoint> testResultDataPointsSample;
    if (testResultDataPointsJavaRDD.count()
        > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) {
      testResultDataPointsSample =
          testResultDataPointsJavaRDD.takeSample(
              true,
              MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize());
    } else {
      testResultDataPointsSample = testResultDataPointsJavaRDD.collect();
    }
    classClassificationModelSummary.setTestResultDataPointsSample(testResultDataPointsSample);
    classClassificationModelSummary.setPredictedVsActuals(predictedVsActuals);
    // calculate test error: the fraction of misclassified test points
    double error =
        1.0
            * predictionsAndLabels
                .filter(
                    new Function<Tuple2<Double, Double>, Boolean>() {
                      private static final long serialVersionUID = -3063364114286182333L;

                      @Override
                      public Boolean call(Tuple2<Double, Double> pl) {
                        return !pl._1().equals(pl._2());
                      }
                    })
                .count()
            / predictionsAndLabels.count();
    classClassificationModelSummary.setError(error);
    return classClassificationModelSummary;
  }
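
A hedged usage sketch: the predictionsAndLabels argument is typically built by scoring the held-out split with a trained classifier and pairing each prediction with the point's actual label. Here a DecisionTreeModel is trained inline purely for illustration; the hyperparameters (2 classes, gini impurity, depth 5, 32 bins) are placeholders.

    // Requires org.apache.spark.mllib.tree.DecisionTree,
    // org.apache.spark.mllib.tree.model.DecisionTreeModel,
    // org.apache.spark.api.java.function.PairFunction and scala.Tuple2.
    final DecisionTreeModel model =
        DecisionTree.trainClassifier(trainingData, 2, categoricalFeatureInfo, "gini", 5, 32);
    JavaPairRDD<Double, Double> predictionsAndLabels =
        testingData.mapToPair(
            new PairFunction<LabeledPoint, Double, Double>() {
              private static final long serialVersionUID = 1L; // illustrative

              @Override
              public Tuple2<Double, Double> call(LabeledPoint point) {
                // pair the model's prediction with the known label for this point
                return new Tuple2<Double, Double>(model.predict(point.features()), point.label());
              }
            });
    ClassClassificationAndRegressionModelSummary summary =
        getClassClassificationModelSummary(sparkContext, testingData, predictionsAndLabels);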
Example #3
 /**
  * A utility method to generate a probabilistic classification model summary.
  *
  * @param sparkContext Spark context used to parallelize the sampled test results
  * @param testingData Test dataset as labeled points
  * @param scoresAndLabels Tuple2 containing scores and labels
  * @return Probabilistic classification model summary
  */
 public static ProbabilisticClassificationModelSummary
     generateProbabilisticClassificationModelSummary(
         JavaSparkContext sparkContext,
         JavaRDD<LabeledPoint> testingData,
         JavaRDD<Tuple2<Object, Object>> scoresAndLabels) {
   ProbabilisticClassificationModelSummary probabilisticClassificationModelSummary =
       new ProbabilisticClassificationModelSummary();
   // store predictions and actuals
   List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>();
   DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT);
   for (Tuple2<Object, Object> scoreAndLabel : scoresAndLabels.collect()) {
     PredictedVsActual predictedVsActual = new PredictedVsActual();
     predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1())));
     predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2())));
     predictedVsActuals.add(predictedVsActual);
     if (log.isTraceEnabled()) {
       log.trace(
           "Predicted: "
               + predictedVsActual.getPredicted()
               + " ------ Actual: "
               + predictedVsActual.getActual());
     }
   }
   // create a list of feature values
   List<double[]> features = new ArrayList<double[]>();
   for (LabeledPoint labeledPoint : testingData.collect()) {
     if (labeledPoint != null && labeledPoint.features() != null) {
       double[] rowFeatures = labeledPoint.features().toArray();
       features.add(rowFeatures);
     }
   }
   // create a list of feature values with predicted vs. actuals
   List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>();
   for (int i = 0; i < features.size(); i++) {
     TestResultDataPoint testResultDataPoint = new TestResultDataPoint();
     testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i));
     testResultDataPoint.setFeatureValues(features.get(i));
     testResultDataPoints.add(testResultDataPoint);
   }
   // convert List to JavaRDD
   JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD =
       sparkContext.parallelize(testResultDataPoints);
   // collect RDD as a sampled list
   List<TestResultDataPoint> testResultDataPointsSample;
   if (testResultDataPointsJavaRDD.count()
       > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) {
     testResultDataPointsSample =
         testResultDataPointsJavaRDD.takeSample(
             true,
             MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize());
   } else {
     testResultDataPointsSample = testResultDataPointsJavaRDD.collect();
   }
   probabilisticClassificationModelSummary.setTestResultDataPointsSample(
       testResultDataPointsSample);
   probabilisticClassificationModelSummary.setPredictedVsActuals(predictedVsActuals);
   // generate binary classification metrics
   BinaryClassificationMetrics metrics =
       new BinaryClassificationMetrics(JavaRDD.toRDD(scoresAndLabels));
   // store AUC
   probabilisticClassificationModelSummary.setAuc(metrics.areaUnderROC());
   // store ROC data points
   List<Tuple2<Object, Object>> rocData = metrics.roc().toJavaRDD().collect();
   JSONArray rocPoints = new JSONArray();
    for (int i = 0; i < rocData.size(); i++) {
     JSONArray point = new JSONArray();
     point.put(decimalFormat.format(rocData.get(i)._1()));
     point.put(decimalFormat.format(rocData.get(i)._2()));
     rocPoints.put(point);
   }
   probabilisticClassificationModelSummary.setRoc(rocPoints.toString());
   return probabilisticClassificationModelSummary;
 }
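
BinaryClassificationMetrics expects raw scores rather than hard 0/1 predictions, otherwise the AUC and ROC curve degenerate. A sketch of how the scoresAndLabels input might be produced from a logistic regression model; the training call (100 iterations) is purely illustrative, and clearThreshold() makes predict() return the raw score:

   // Requires org.apache.spark.mllib.classification.LogisticRegressionModel,
   // org.apache.spark.mllib.classification.LogisticRegressionWithSGD,
   // org.apache.spark.api.java.function.Function and scala.Tuple2.
   final LogisticRegressionModel model =
       LogisticRegressionWithSGD.train(trainingData.rdd(), 100);
   model.clearThreshold(); // predict() now returns the raw score, not a 0/1 label
   JavaRDD<Tuple2<Object, Object>> scoresAndLabels =
       testingData.map(
           new Function<LabeledPoint, Tuple2<Object, Object>>() {
             private static final long serialVersionUID = 1L; // illustrative

             @Override
             public Tuple2<Object, Object> call(LabeledPoint point) {
               return new Tuple2<Object, Object>(model.predict(point.features()), point.label());
             }
           });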
Example #4
  /**
   * A utility method to generate a regression model summary.
   *
   * @param sparkContext Spark context used to parallelize the sampled test results
   * @param testingData Test dataset as labeled points
   * @param predictionsAndLabels Tuple2 containing predicted and actual values
   * @return Regression model summary
   */
  public static ClassClassificationAndRegressionModelSummary generateRegressionModelSummary(
      JavaSparkContext sparkContext,
      JavaRDD<LabeledPoint> testingData,
      JavaRDD<Tuple2<Double, Double>> predictionsAndLabels) {
    ClassClassificationAndRegressionModelSummary regressionModelSummary =
        new ClassClassificationAndRegressionModelSummary();
    // store predictions and actuals
    List<PredictedVsActual> predictedVsActuals = new ArrayList<PredictedVsActual>();
    DecimalFormat decimalFormat = new DecimalFormat(MLConstants.DECIMAL_FORMAT);
    for (Tuple2<Double, Double> scoreAndLabel : predictionsAndLabels.collect()) {
      PredictedVsActual predictedVsActual = new PredictedVsActual();
      predictedVsActual.setPredicted(Double.parseDouble(decimalFormat.format(scoreAndLabel._1())));
      predictedVsActual.setActual(Double.parseDouble(decimalFormat.format(scoreAndLabel._2())));
      predictedVsActuals.add(predictedVsActual);
    }
    // create a list of feature values
    List<double[]> features = new ArrayList<double[]>();
    for (LabeledPoint labeledPoint : testingData.collect()) {
      if (labeledPoint != null && labeledPoint.features() != null) {
        double[] rowFeatures = labeledPoint.features().toArray();
        features.add(rowFeatures);
      }
    }
    // create a list of feature values with predicted vs. actuals
    List<TestResultDataPoint> testResultDataPoints = new ArrayList<TestResultDataPoint>();
    for (int i = 0; i < features.size(); i++) {
      TestResultDataPoint testResultDataPoint = new TestResultDataPoint();
      testResultDataPoint.setPredictedVsActual(predictedVsActuals.get(i));
      testResultDataPoint.setFeatureValues(features.get(i));
      testResultDataPoints.add(testResultDataPoint);
    }
    // convert List to JavaRDD
    JavaRDD<TestResultDataPoint> testResultDataPointsJavaRDD =
        sparkContext.parallelize(testResultDataPoints);
    // collect RDD as a sampled list
    List<TestResultDataPoint> testResultDataPointsSample;
    if (testResultDataPointsJavaRDD.count()
        > MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize()) {
      testResultDataPointsSample =
          testResultDataPointsJavaRDD.takeSample(
              true,
              MLCoreServiceValueHolder.getInstance().getSummaryStatSettings().getSampleSize());
    } else {
      testResultDataPointsSample = testResultDataPointsJavaRDD.collect();
    }
    regressionModelSummary.setTestResultDataPointsSample(testResultDataPointsSample);
    regressionModelSummary.setPredictedVsActuals(predictedVsActuals);
    // calculate mean squared error (MSE)
    double meanSquaredError =
        new JavaDoubleRDD(
                predictionsAndLabels
                    .map(
                        new Function<Tuple2<Double, Double>, Object>() {
                          private static final long serialVersionUID = -162193633199074816L;

                          @Override
                          public Object call(Tuple2<Double, Double> pair) {
                            return Math.pow(pair._1() - pair._2(), 2.0);
                          }
                        })
                    .rdd())
            .mean();
    regressionModelSummary.setError(meanSquaredError);
    return regressionModelSummary;
  }
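
A hedged usage sketch for the regression path: train a model (LinearRegressionWithSGD with 100 iterations, purely illustrative), pair its predictions with the actual values, and hand them to the summary generator, whose error field then holds the mean squared error computed above.

    // Requires org.apache.spark.mllib.regression.LinearRegressionModel,
    // org.apache.spark.mllib.regression.LinearRegressionWithSGD,
    // org.apache.spark.api.java.function.Function and scala.Tuple2.
    final LinearRegressionModel model =
        LinearRegressionWithSGD.train(trainingData.rdd(), 100);
    JavaRDD<Tuple2<Double, Double>> predictionsAndLabels =
        testingData.map(
            new Function<LabeledPoint, Tuple2<Double, Double>>() {
              private static final long serialVersionUID = 1L; // illustrative

              @Override
              public Tuple2<Double, Double> call(LabeledPoint point) {
                return new Tuple2<Double, Double>(model.predict(point.features()), point.label());
              }
            });
    ClassClassificationAndRegressionModelSummary summary =
        generateRegressionModelSummary(sparkContext, testingData, predictionsAndLabels);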