/**
   * Turns the raw text lines held by the given context into numeric feature rows.
   *
   * <p>The steps, in order:
   *
   * <ol>
   *   <li>Build the encodings via {@code buildEncodings} and store them back on the context with
   *       {@code setEncodings}.
   *   <li>Filter the lines with {@code MLUtils.filterRows}, using the feature indices configured
   *       for {@code MLConstants.DISCARD} imputation, to drop rows with missing values.
   *   <li>Map every row through {@code RemoveDiscardedFeatures} and then {@code BasicEncoder}.
   *   <li>If any feature is configured for {@code MLConstants.MEAN_IMPUTATION}, apply {@code
   *       MeanImputation}; otherwise convert the tokens with {@code StringArrayToDoubleArray}.
   * </ol>
   *
   * @param context model configuration context supplying the Spark context, workflow, input lines,
   *     header row, column separator, summary statistics, index mapping and response index; its
   *     encodings are updated as a side effect
   * @return a JavaRDD of double arrays, one per retained row
   * @throws DatasetPreProcessingException if the dataset cannot be pre-processed
   */
  public static JavaRDD<double[]> preProcess(MLModelConfigurationContext context)
      throws DatasetPreProcessingException {
    // Pull everything needed out of the context up front.
    JavaSparkContext sparkContext = context.getSparkContext();
    Workflow facts = context.getFacts();
    JavaRDD<String> rawLines = context.getLines();
    String header = context.getHeaderRow();
    String separator = context.getColumnSeparator();
    Map<String, String> summaryStats = context.getSummaryStatsOfFeatures();
    List<Integer> newToOldIndices = context.getNewToOldIndicesList();
    int responseIdx = context.getResponseIndex();

    // The encodings are published on the context as well as being used below.
    List<Map<String, Integer>> featureEncodings =
        buildEncodings(facts.getFeatures(), summaryStats, newToOldIndices, responseIdx);
    context.setEncodings(featureEncodings);

    // Discard rows with missing values in the features marked for DISCARD imputation.
    List<Integer> discardIndices =
        MLUtils.getImputeFeatureIndices(facts, new ArrayList<Integer>(), MLConstants.DISCARD);
    JavaRDD<String[]> retainedRows =
        MLUtils.filterRows(separator, header, rawLines, discardIndices);

    // Remove the discarded feature columns, then apply the encodings to what is left.
    JavaRDD<String[]> encodedRows =
        retainedRows
            .map(new RemoveDiscardedFeatures(newToOldIndices, responseIdx))
            .map(new BasicEncoder(featureEncodings));

    // Feature indices configured for mean imputation.
    List<Integer> meanIndices =
        MLUtils.getImputeFeatureIndices(facts, newToOldIndices, MLConstants.MEAN_IMPUTATION);
    if (meanIndices.isEmpty()) {
      // The mean imputation mapper converts string tokens to doubles as part of its work; when
      // no column needs imputation the conversion has to be done explicitly.
      return encodedRows.map(new StringArrayToDoubleArray());
    }

    // NOTE(review): the last argument looks like a sample fraction (0.01), i.e. the means would
    // come from a sample rather than the whole dataset (1.0) - confirm against getMeans.
    Map<Integer, Double> columnMeans = getMeans(sparkContext, encodedRows, meanIndices, 0.01);
    // Replace missing values in the imputed columns with the mean for that column.
    return encodedRows.map(new MeanImputation(columnMeans));
  }