/**
   * Extract fields from a file into a numeric array for machine learning.
   *
   * @param analyst The analyst to use.
   * @param headers The headers for the input data.
   * @param csv The CSV that holds the input data.
   * @param outputLength The length of the returned array.
   * @param skipOutput True if the output should be skipped.
   * @return The encoded data.
   */
  public static final double[] extractFields(
      final EncogAnalyst analyst,
      final CSVHeaders headers,
      final ReadCSV csv,
      final int outputLength,
      final boolean skipOutput) {
    final double[] output = new double[outputLength];
    int outputIndex = 0;
    for (final AnalystField stat : analyst.getScript().getNormalize().getNormalizedFields()) {

      stat.init();

      if (stat.getAction() == NormalizationAction.Ignore) {
        continue;
      }

      if (stat.isOutput() && skipOutput) {
        continue;
      }

      int index = headers.find(stat.getName());
      final String str = csv.get(index).trim();

      // is this an unknown value?
      if (str.equals("?") || str.length() == 0) {
        HandleMissingValues handler = analyst.getScript().getNormalize().getMissingValues();
        double[] d = handler.handleMissing(analyst, stat);

        // should we skip the entire row?
        if (d == null) {
          return null;
        }

        // copy the returned values in place of the missing values
        for (int i = 0; i < d.length; i++) {
          output[outputIndex++] = d[i];
        }
      } else {
        // known value
        if (stat.getAction() == NormalizationAction.Normalize) {
          double d = csv.getFormat().parse(str);
          d = stat.normalize(d);
          output[outputIndex++] = d;
        } else if (stat.getAction() == NormalizationAction.PassThrough) {
          double d = csv.getFormat().parse(str);
          output[outputIndex++] = d;
        } else {
          final double[] d = stat.encode(str);
          for (final double element : d) {
            output[outputIndex++] = element;
          }
        }
      }
    }

    return output;
  }
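A minimal usage sketch for extractFields, under assumed context: "script.ega", "input.csv", and the encoded width of 10 are placeholders, the usual Encog and java.io.File imports are assumed to be in scope, and the method is assumed to be callable unqualified (same class or static import).

// Hypothetical setup; file names and the encoded width are placeholders.
final EncogAnalyst analyst = new EncogAnalyst();
analyst.load(new File("script.ega"));

final File inputFile = new File("input.csv");
final CSVHeaders headers = new CSVHeaders(inputFile, true, CSVFormat.ENGLISH);
final ReadCSV csv = new ReadCSV(inputFile.toString(), true, CSVFormat.ENGLISH);

final int outputLength = 10; // assumption: total number of encoded columns
while (csv.next()) {
  final double[] encoded = extractFields(analyst, headers, csv, outputLength, false);
  if (encoded == null) {
    continue; // the missing-value handler asked to skip this row
  }
  // hand "encoded" to the machine learning method here
}
csv.close();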
  /**
   * Process the file and cluster.
   *
   * @param outputFile The output file.
   * @param clusters The number of clusters.
   * @param theAnalyst The analyst to use.
   * @param iterations The number of iterations to use.
   */
  public void process(
      final File outputFile,
      final int clusters,
      final EncogAnalyst theAnalyst,
      final int iterations) {

    final PrintWriter tw =
        this.prepareOutputFile(
            outputFile, analyst.getScript().getNormalize().countActiveFields() - 1, 1);

    resetStatus();

    final KMeansClustering cluster = new KMeansClustering(clusters, this.data);
    cluster.iteration(iterations);

    int clusterNum = 0;
    for (final MLCluster cl : cluster.getClusters()) {
      for (final MLData item : cl.getData()) {
        // append the cluster number as an extra final column on the output row
        final int clsIndex = item.size();
        final LoadedRow lr = new LoadedRow(this.getFormat(), item.getData(), 1);
        lr.getData()[clsIndex] = "" + clusterNum;
        writeRow(tw, lr);
      }
      clusterNum++;
    }

    reportDone(false);
    tw.close();
  }
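A brief usage sketch for process, under assumed context: clusterCsv stands for an already-analyzed instance of the enclosing class, and the output file name, cluster count, and iteration count are placeholders. Each row written by process is a clustered data row with the cluster number appended as the final column.

// Hypothetical call: 4 clusters, 100 k-means iterations, placeholder output file.
clusterCsv.process(new File("clusters.csv"), 4, analyst, 100);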
  /**
   * Determine the mode; this is the class item that has the most instances.
   *
   * @param analyst The analyst to use.
   * @return The index of the class item that has the most instances.
   */
  public int determineMode(EncogAnalyst analyst) {
    if (!this.isClassify()) {
      throw new AnalystError("Can only calculate the mode for a class.");
    }

    DataField df = analyst.getScript().findDataField(this.name);
    AnalystClassItem m = null;
    int result = 0;
    int idx = 0;
    for (AnalystClassItem item : df.getClassMembers()) {
      if (m == null || m.getCount() < item.getCount()) {
        m = item;
        result = idx;
      }
      idx++;
    }

    return result;
  }
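A usage sketch for determineMode, under assumed context: field stands for a classification AnalystField, analyst for a loaded EncogAnalyst, and only getters that already appear in this section are used to look the class item back up.

// Find the index of the most frequent class for this field (hypothetical variables).
final int modeIndex = field.determineMode(analyst);
final AnalystClassItem mode = analyst.getScript()
    .findDataField(field.getName())
    .getClassMembers()
    .get(modeIndex);
System.out.println("Mode is class index " + modeIndex
    + " with " + mode.getCount() + " instances.");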
  /**
   * Analyze the file.
   *
   * @param inputFilename The input file.
   * @param expectInputHeaders True, if input headers are present.
   * @param inputFormat The format.
   * @param theAnalyst The analyst to use.
   */
  public void analyze(
      final File inputFilename,
      final boolean expectInputHeaders,
      final CSVFormat inputFormat,
      final EncogAnalyst theAnalyst) {
    this.setInputFilename(inputFilename);
    this.setInputFormat(inputFormat);
    this.setExpectInputHeaders(expectInputHeaders);
    this.analyst = theAnalyst;
    this.setAnalyzed(true);

    this.analystHeaders = new CSVHeaders(inputFilename, expectInputHeaders, inputFormat);

    for (final AnalystField field : analyst.getScript().getNormalize().getNormalizedFields()) {
      field.init();
    }

    this.series = new TimeSeriesUtil(analyst, true, this.analystHeaders.getHeaders());
  }
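A usage sketch for analyze, under assumed context: norm stands for an instance of the enclosing class, and the script and CSV file names are placeholders. analyze must run before any per-row processing so that the input settings, headers, normalized fields, and time-series helper are initialized.

// Hypothetical setup; "script.ega" and "input.csv" are placeholder file names.
final EncogAnalyst analyst = new EncogAnalyst();
analyst.load(new File("script.ega"));
norm.analyze(new File("input.csv"), true, CSVFormat.ENGLISH, analyst);
// norm now has its headers and time-series state ready for processing.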