/**
 * Extract fields from a file into a numeric array for machine learning.
 *
 * @param analyst The analyst to use.
 * @param headers The headers for the input data.
 * @param csv The CSV that holds the input data.
 * @param outputLength The length of the returned array.
 * @param skipOutput True if the output should be skipped.
 * @return The encoded data.
 */
public static final double[] extractFields(final EncogAnalyst analyst,
        final CSVHeaders headers, final ReadCSV csv,
        final int outputLength, final boolean skipOutput) {
    final double[] output = new double[outputLength];
    int outputIndex = 0;

    for (final AnalystField stat : analyst.getScript().getNormalize()
            .getNormalizedFields()) {
        stat.init();

        if (stat.getAction() == NormalizationAction.Ignore) {
            continue;
        }

        if (stat.isOutput() && skipOutput) {
            continue;
        }

        final int index = headers.find(stat.getName());
        final String str = csv.get(index).trim();

        // is this an unknown value?
        if (str.equals("?") || str.length() == 0) {
            final HandleMissingValues handler = analyst.getScript()
                    .getNormalize().getMissingValues();
            final double[] d = handler.handleMissing(analyst, stat);

            // should we skip the entire row?
            if (d == null) {
                return null;
            }

            // copy the returned values in place of the missing values
            for (int i = 0; i < d.length; i++) {
                output[outputIndex++] = d[i];
            }
        } else {
            // known value
            if (stat.getAction() == NormalizationAction.Normalize) {
                double d = csv.getFormat().parse(str);
                d = stat.normalize(d);
                output[outputIndex++] = d;
            } else if (stat.getAction() == NormalizationAction.PassThrough) {
                final double d = csv.getFormat().parse(str);
                output[outputIndex++] = d;
            } else {
                final double[] d = stat.encode(str);

                for (final double element : d) {
                    output[outputIndex++] = element;
                }
            }
        }
    }

    return output;
}
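// A minimal usage sketch for extractFields, assuming analyst and headers are
// already set up; "data.csv" is an illustrative file name, and outputLength
// is assumed to have been computed elsewhere from the normalized fields.
final ReadCSV csv = new ReadCSV("data.csv", true, CSVFormat.ENGLISH);
while (csv.next()) {
    final double[] row = extractFields(analyst, headers, csv,
            outputLength, false);
    if (row == null) {
        continue; // the missing-value handler asked to skip this row
    }
    // row now holds the normalized/encoded values for this CSV line
}
csv.close();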
/**
 * Process the file and cluster.
 *
 * @param outputFile The output file.
 * @param clusters The number of clusters.
 * @param theAnalyst The analyst to use.
 * @param iterations The number of iterations to use.
 */
public void process(final File outputFile, final int clusters,
        final EncogAnalyst theAnalyst, final int iterations) {
    final PrintWriter tw = this.prepareOutputFile(outputFile, this.analyst
            .getScript().getNormalize().countActiveFields() - 1, 1);

    resetStatus();
    final KMeansClustering cluster = new KMeansClustering(clusters,
            this.data);
    cluster.iteration(iterations);

    int clusterNum = 0;
    for (final MLCluster cl : cluster.getClusters()) {
        for (final MLData item : cl.getData()) {
            final int clsIndex = item.size();
            final LoadedRow lr = new LoadedRow(this.getFormat(),
                    item.getData(), 1);
            lr.getData()[clsIndex] = "" + clusterNum;
            writeRow(tw, lr);
        }
        clusterNum++;
    }
    reportDone(false);
    tw.close();
}
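// A standalone sketch of the KMeansClustering calls that process() wraps,
// assuming data is any loaded MLDataSet; 3 clusters and 50 iterations are
// illustrative values only.
final KMeansClustering kmeans = new KMeansClustering(3, data);
kmeans.iteration(50);
for (final MLCluster cl : kmeans.getClusters()) {
    System.out.println("cluster size: " + cl.getData().size());
}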
/**
 * Determine the mode; this is the class item that has the most instances.
 *
 * @param analyst The analyst to use, which supplies the class counts.
 * @return The index of the class item with the most instances.
 */
public int determineMode(final EncogAnalyst analyst) {
    if (!this.isClassify()) {
        throw new AnalystError("Can only calculate the mode for a class.");
    }

    final DataField df = analyst.getScript().findDataField(this.name);
    AnalystClassItem m = null;
    int result = 0;
    int idx = 0;
    for (final AnalystClassItem item : df.getClassMembers()) {
        if (m == null || m.getCount() < item.getCount()) {
            m = item;
            result = idx;
        }
        idx++;
    }

    return result;
}
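// A brief sketch of where determineMode fits, assuming field is an
// AnalystField configured for classification: the mode can stand in for a
// missing categorical value. getClasses() and getName() are assumed
// accessors on the field's class-item list.
if (field.isClassify()) {
    final int modeIndex = field.determineMode(analyst);
    final String modeClass = field.getClasses().get(modeIndex).getName();
    final double[] encoded = field.encode(modeClass); // encode like any known value
}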
/**
 * Analyze the file.
 *
 * @param inputFilename The input file.
 * @param expectInputHeaders True, if input headers are present.
 * @param inputFormat The format.
 * @param theAnalyst The analyst to use.
 */
public void analyze(final File inputFilename,
        final boolean expectInputHeaders, final CSVFormat inputFormat,
        final EncogAnalyst theAnalyst) {
    this.setInputFilename(inputFilename);
    this.setInputFormat(inputFormat);
    this.setExpectInputHeaders(expectInputHeaders);
    this.analyst = theAnalyst;
    this.setAnalyzed(true);

    this.analystHeaders = new CSVHeaders(inputFilename, expectInputHeaders,
            inputFormat);

    for (final AnalystField field : this.analyst.getScript().getNormalize()
            .getNormalizedFields()) {
        field.init();
    }

    this.series = new TimeSeriesUtil(this.analyst, true,
            this.analystHeaders.getHeaders());
}
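// A sketch of the expected call order, assuming these methods live on a
// CSV-clustering class such as AnalystClusterCSV; the file names, cluster
// count (5), and iteration count (100) are illustrative. analyze() must run
// first so the headers, normalized fields, and time-series utility are
// initialized before process() clusters the data and writes the output.
final AnalystClusterCSV csvCluster = new AnalystClusterCSV();
csvCluster.analyze(new File("input.csv"), true, CSVFormat.ENGLISH, analyst);
csvCluster.process(new File("clustered.csv"), 5, analyst, 100);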