/**
 * Groups the given agents into clusters using the provided Weka clusterer.
 *
 * @param scope the current simulation scope
 * @param clusterer a Weka clusterer, configured but not yet built
 * @param attributes names of the agent attributes used as clustering features
 * @param agents the agents to cluster; dataset row i corresponds to agents.get(scope, i)
 * @return one list of agents per cluster, or null if Weka fails to build or apply the clusterer
 * @throws GamaRuntimeException if the dataset cannot be built from the agents
 */
private static IList<IList<IAgent>> clusteringUsingWeka(final IScope scope, final Clusterer clusterer,
		final IList<String> attributes, final IAddressableContainer<Integer, IAgent, Integer, IAgent> agents)
		throws GamaRuntimeException {
	final Instances dataset = convertToInstances(scope, attributes, agents);
	try {
		clusterer.buildClusterer(dataset);

		// One (initially empty) group per cluster discovered by the algorithm.
		final IList<IList<IAgent>> groupes = GamaListFactory.create(Types.LIST.of(Types.AGENT));
		for (int i = 0; i < clusterer.numberOfClusters(); i++) {
			groupes.add(GamaListFactory.<IAgent> create(Types.AGENT));
		}

		// Dataset rows were built in agent order, so row i maps back to agent i.
		for (int i = 0; i < dataset.numInstances(); i++) {
			final Instance inst = dataset.instance(i);
			final int clusterIndex = clusterer.clusterInstance(inst);
			groupes.get(clusterIndex).add(agents.get(scope, i));
		}
		return groupes;
	} catch (final Exception e) {
		// NOTE(review): any Weka failure is silently mapped to a null result, so
		// callers must null-check. Consider wrapping in GamaRuntimeException instead.
		return null;
	}
}
/** * Prepare a clustering pass on the indicated data. * * @param points The array of dataPoints to be clustered. * @param ids Array of cluster numbers which this call will fill in, defining which cluster each * point belongs to. The caller must leave the data here intact between iterations. * @param means Array of x,y values in which to place centroids of the clusters. * @param region The region of the plane in which the points lie. */ @Override public void prepare(Point[] points, int[] ids, double[][] means, Region region) { super.prepare(points, ids, means, region); // Save the data arrays. dataPoints = points; pointClusters = ids; clusterMeans = means; numPoints = points.length; numClusters = means.length; // Set up the strengths array. clusterStrengths = new double[numPoints][numClusters]; // Set the initial cluster centroids to be random values // within the data region. double x = region.getX1(); double y = region.getY1(); double w = region.getWidth(); double h = region.getHeight(); for (int i = 0; i < numClusters; ++i) { means[i][0] = random.nextDouble() * w + x; means[i][1] = random.nextDouble() * h + y; } // Make an initial assignment of points to clusters, so on the first // iteration we have a basis for computing centroids. assignPoints(ids, means); }
/**
 * Main method for testing this class.
 *
 * @param args the options; args[0] must name a clusterer class, the remaining
 *     entries are passed through to {@link #evaluateClusterer(Clusterer, String[])}
 */
public static void main(String[] args) {
    try {
        if (args.length == 0) {
            throw new Exception("The first argument must be the name of a clusterer");
        }
        // Consume the clusterer name so it is not treated as an option below.
        String clustererName = args[0];
        args[0] = "";
        Clusterer newClusterer = Clusterer.forName(clustererName, null);
        System.out.println(evaluateClusterer(newClusterer, args));
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}
/** * Make up the help string giving all the command line options * * @param clusterer the clusterer to include options for * @return a string detailing the valid command line options */ private static String makeOptionString(Clusterer clusterer) { StringBuffer optionsText = new StringBuffer(""); // General options optionsText.append("\n\nGeneral options:\n\n"); optionsText.append("-t <name of training file>\n"); optionsText.append("\tSets training file.\n"); optionsText.append("-T <name of test file>\n"); optionsText.append("-l <name of input file>\n"); optionsText.append("\tSets model input file.\n"); optionsText.append("-d <name of output file>\n"); optionsText.append("\tSets model output file.\n"); optionsText.append("-p <attribute range>\n"); optionsText.append( "\tOutput predictions. Predictions are for " + "training file" + "\n\tif only training file is specified," + "\n\totherwise predictions are for the test file." + "\n\tThe range specifies attribute values to be output" + "\n\twith the predictions. Use '-p 0' for none.\n"); optionsText.append("-x <number of folds>\n"); optionsText.append("\tOnly Distribution Clusterers can be cross " + "validated.\n"); optionsText.append("-s <random number seed>\n"); optionsText.append("-c <class index>\n"); optionsText.append("\tSet class attribute. If supplied, class is ignored"); optionsText.append("\n\tduring clustering but is used in a classes to"); optionsText.append("\n\tclusters evaluation.\n"); // Get scheme-specific options if (clusterer instanceof OptionHandler) { optionsText.append("\nOptions specific to " + clusterer.getClass().getName() + ":\n\n"); Enumeration enu = ((OptionHandler) clusterer).listOptions(); while (enu.hasMoreElements()) { Option option = (Option) enu.nextElement(); optionsText.append(option.synopsis() + '\n'); optionsText.append(option.description() + "\n"); } } return optionsText.toString(); }
/** * Print the cluster assignments for either the training or the testing data. * * @param clusterer the clusterer to use for cluster assignments * @return a string containing the instance indexes and cluster assigns. * @exception if cluster assignments can't be printed */ private static String printClusterings( Clusterer clusterer, Instances train, String testFileName, Range attributesToOutput) throws Exception { StringBuffer text = new StringBuffer(); int i = 0; int cnum; if (testFileName.length() != 0) { BufferedReader testStream = null; try { testStream = new BufferedReader(new FileReader(testFileName)); } catch (Exception e) { throw new Exception("Can't open file " + e.getMessage() + '.'); } Instances test = new Instances(testStream, 1); while (test.readInstance(testStream)) { try { cnum = clusterer.clusterInstance(test.instance(0)); text.append( i + " " + cnum + " " + attributeValuesString(test.instance(0), attributesToOutput) + "\n"); } catch (Exception e) { /* throw new Exception('\n' + "Unable to cluster instance\n" + e.getMessage()); */ text.append( i + " Unclustered " + attributeValuesString(test.instance(0), attributesToOutput) + "\n"); } test.delete(0); i++; } } else // output for training data { for (i = 0; i < train.numInstances(); i++) { try { cnum = clusterer.clusterInstance(train.instance(i)); text.append( i + " " + cnum + " " + attributeValuesString(train.instance(i), attributesToOutput) + "\n"); } catch (Exception e) { /* throw new Exception('\n' + "Unable to cluster instance\n" + e.getMessage()); */ text.append( i + " Unclustered " + attributeValuesString(train.instance(i), attributesToOutput) + "\n"); } } } return text.toString(); }
/** * Print the cluster statistics for either the training or the testing data. * * @param clusterer the clusterer to use for generating statistics. * @return a string containing cluster statistics. * @exception if statistics can't be generated. */ private static String printClusterStats(Clusterer clusterer, String fileName) throws Exception { StringBuffer text = new StringBuffer(); int i = 0; int cnum; double loglk = 0.0; double[] dist; double temp; int cc = clusterer.numberOfClusters(); double[] instanceStats = new double[cc]; int unclusteredInstances = 0; if (fileName.length() != 0) { BufferedReader inStream = null; try { inStream = new BufferedReader(new FileReader(fileName)); } catch (Exception e) { throw new Exception("Can't open file " + e.getMessage() + '.'); } Instances inst = new Instances(inStream, 1); while (inst.readInstance(inStream)) { try { cnum = clusterer.clusterInstance(inst.instance(0)); if (clusterer instanceof DensityBasedClusterer) { loglk += ((DensityBasedClusterer) clusterer).logDensityForInstance(inst.instance(0)); // temp = Utils.sum(dist); } instanceStats[cnum]++; } catch (Exception e) { unclusteredInstances++; } inst.delete(0); i++; } /* // count the actual number of used clusters int count = 0; for (i = 0; i < cc; i++) { if (instanceStats[i] > 0) { count++; } } if (count > 0) { double [] tempStats = new double [count]; count=0; for (i=0;i<cc;i++) { if (instanceStats[i] > 0) { tempStats[count++] = instanceStats[i]; } } instanceStats = tempStats; cc = instanceStats.length; } */ int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1); int numInstFieldWidth = (int) ((Math.log(i) / Math.log(10)) + 1); double sum = Utils.sum(instanceStats); loglk /= sum; text.append("Clustered Instances\n"); for (i = 0; i < cc; i++) { if (instanceStats[i] > 0) { text.append( Utils.doubleToString((double) i, clustFieldWidth, 0) + " " + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " (" + Utils.doubleToString((instanceStats[i] / sum * 
100.0), 3, 0) + "%)\n"); } } if (unclusteredInstances > 0) { text.append("\nUnclustered Instances : " + unclusteredInstances); } if (clusterer instanceof DensityBasedClusterer) { text.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n"); } } return text.toString(); }
/**
 * Evaluates a clusterer with the options given in an array of strings. It takes the string
 * indicated by "-t" as training file, the string indicated by "-T" as test file. If the test file
 * is missing, a stratified ten-fold cross-validation is performed (distribution clusterers only).
 * Using "-x" you can change the number of folds to be used, and using "-s" the random seed. If
 * the "-p" option is present it outputs the classification for each test instance. If you provide
 * the name of an object file using "-l", a clusterer will be loaded from the given file. If you
 * provide the name of an object file using "-d", the clusterer built from the training data will
 * be saved to the given file.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @exception Exception if model could not be evaluated successfully
 * @return a string describing the results
 */
public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {
    int seed = 1, folds = 10; // defaults; overridden by -s and -x
    boolean doXval = false;
    Instances train = null;
    Instances test = null; // NOTE(review): never assigned or read in this method
    Random random;
    String trainFileName,
        testFileName,
        seedString,
        foldsString,
        objectInputFileName,
        objectOutputFileName,
        attributeRangeString;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    ObjectInputStream objectInputStream = null;
    ObjectOutputStream objectOutputStream = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering

    // ---- Phase 1: parse the generic options; any failure aborts with the help text. ----
    try {
        if (Utils.getFlag('h', options)) {
            throw new Exception("Help requested.");
        }

        // Get basic options (options the same for all clusterers).
        // Note: getOption consumes the option from the array as a side effect.
        objectInputFileName = Utils.getOption('l', options);
        objectOutputFileName = Utils.getOption('d', options);
        trainFileName = Utils.getOption('t', options);
        testFileName = Utils.getOption('T', options);

        // Check -p option: it takes an attribute range argument, not a bare flag.
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(
                e.getMessage()
                    + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }

        if (attributeRangeString.length() != 0) {
            printClusterAssignments = true;
            // "-p 0" means: print assignments but echo no attribute values.
            if (!attributeRangeString.equals("0")) attributesToOutput = new Range(attributeRangeString);
        }

        // Validate the combination of training / model / test inputs.
        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object " + "input file given.");
            }
            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test file given.");
            }
        } else {
            if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                throw new Exception("Can't use both train and model file " + "unless -p specified.");
            }
        }

        seedString = Utils.getOption('s', options);
        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }

        foldsString = Utils.getOption('x', options);
        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
            doXval = true;
        }
    } catch (Exception e) {
        // Re-throw with the full option synopsis appended so the user sees usage help.
        throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer));
    }

    // ---- Phase 2: load data and open any model streams. ----
    try {
        if (trainFileName.length() != 0) {
            train = new Instances(new BufferedReader(new FileReader(trainFileName)));

            // -c selects the class attribute for classes-to-clusters evaluation.
            // "first"/"last" are accepted as symbolic indices (1-based).
            String classString = Utils.getOption('c', options);
            if (classString.length() != 0) {
                if (classString.compareTo("last") == 0) {
                    theClass = train.numAttributes();
                } else if (classString.compareTo("first") == 0) {
                    theClass = 1;
                } else {
                    theClass = Integer.parseInt(classString);
                }

                // Class-based evaluation is only defined on the training data,
                // and only for a clusterer built here (not one loaded with -l).
                if (doXval || testFileName.length() != 0) {
                    throw new Exception("Can only do class based evaluation on the " + "training data");
                }
                if (objectInputFileName.length() != 0) {
                    throw new Exception("Can't load a clusterer and do class based " + "evaluation");
                }
            }

            if (theClass != -1) {
                // theClass is 1-based here; Instances uses 0-based indices.
                if (theClass < 1 || theClass > train.numAttributes()) {
                    throw new Exception("Class is out of range!");
                }
                if (!train.attribute(theClass - 1).isNominal()) {
                    throw new Exception("Class must be nominal!");
                }
                train.setClassIndex(theClass - 1);
            }
        }

        if (objectInputFileName.length() != 0) {
            objectInputStream = new ObjectInputStream(new FileInputStream(objectInputFileName));
        }

        if (objectOutputFileName.length() != 0) {
            objectOutputStream = new ObjectOutputStream(new FileOutputStream(objectOutputFileName));
        }
    } catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options before the clusterer consumes its scheme-specific ones,
    // so cross-validation below can re-apply the full option set.
    if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0) {
        Utils.checkForRemainingOptions(options);
    }

    // Set scheme-specific options for the clusterer; anything left over is an error.
    if (clusterer instanceof OptionHandler) {
        ((OptionHandler) clusterer).setOptions(options);
    }
    Utils.checkForRemainingOptions(options);

    if (objectInputFileName.length() != 0) {
        // Load the clusterer from file.
        // NOTE(review): Java-native deserialization — only load model files you trust.
        clusterer = (Clusterer) objectInputStream.readObject();
        objectInputStream.close();
    } else {
        // Build the clusterer if no object file provided.
        if (theClass == -1) {
            clusterer.buildClusterer(train);
        } else {
            // Classes-to-clusters evaluation: build on the data WITHOUT the class
            // attribute, then evaluate against the class on the full data.
            Remove removeClass = new Remove();
            removeClass.setAttributeIndices("" + theClass);
            removeClass.setInvertSelection(false);
            removeClass.setInputFormat(train);
            Instances clusterTrain = Filter.useFilter(train, removeClass);
            clusterer.buildClusterer(clusterTrain);
            ClusterEvaluation ce = new ClusterEvaluation();
            ce.setClusterer(clusterer);
            ce.evaluateClusterer(train);
            // Early return: class-based evaluation replaces the normal report.
            return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
        }
    }

    // Output cluster predictions only (for the test data if specified,
    // otherwise for the training data) — early return, no stats report.
    if (printClusterAssignments) {
        return printClusterings(clusterer, train, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
        "\n\n=== Clustering stats for training data ===\n\n"
            + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
        text.append(
            "\n\n=== Clustering stats for testing data ===\n\n"
                + printClusterStats(clusterer, testFileName));
    }

    // Cross-validation of log likelihood: density-based clusterers only, and only
    // when no test file or pre-built model was supplied.
    if ((clusterer instanceof DensityBasedClusterer)
        && (doXval == true)
        && (testFileName.length() == 0)
        && (objectInputFileName.length() == 0)) {
        random = new Random(seed);
        random.setSeed(seed);
        train.randomize(random);
        text.append(
            crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided.
    if (objectOutputFileName.length() != 0) {
        objectOutputStream.writeObject(clusterer);
        objectOutputStream.flush();
        objectOutputStream.close();
    }
    return text.toString();
}
/** * Evaluate the clusterer on a set of instances. Calculates clustering statistics and stores * cluster assigments for the instances in m_clusterAssignments * * @param test the set of instances to cluster * @exception Exception if something goes wrong */ public void evaluateClusterer(Instances test) throws Exception { int i = 0; int cnum; double loglk = 0.0; double[] dist; double temp; int cc = m_Clusterer.numberOfClusters(); m_numClusters = cc; int numInstFieldWidth = (int) ((Math.log(test.numInstances()) / Math.log(10)) + 1); double[] instanceStats = new double[cc]; m_clusterAssignments = new double[test.numInstances()]; Instances testCopy = test; boolean hasClass = (testCopy.classIndex() >= 0); int unclusteredInstances = 0; // If class is set then do class based evaluation as well if (hasClass) { if (testCopy.classAttribute().isNumeric()) { throw new Exception("ClusterEvaluation: Class must be nominal!"); } Remove removeClass = new Remove(); removeClass.setAttributeIndices("" + (testCopy.classIndex() + 1)); removeClass.setInvertSelection(false); removeClass.setInputFormat(testCopy); testCopy = Filter.useFilter(testCopy, removeClass); } for (i = 0; i < testCopy.numInstances(); i++) { cnum = -1; try { if (m_Clusterer instanceof DensityBasedClusterer) { loglk += ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(testCopy.instance(i)); // temp = Utils.sum(dist); // Utils.normalize(dist); cnum = m_Clusterer.clusterInstance(testCopy.instance(i)); // Utils.maxIndex(dist); m_clusterAssignments[i] = (double) cnum; } else { cnum = m_Clusterer.clusterInstance(testCopy.instance(i)); m_clusterAssignments[i] = (double) cnum; } } catch (Exception e) { unclusteredInstances++; } if (cnum != -1) { instanceStats[cnum]++; } } /* // count the actual number of used clusters int count = 0; for (i = 0; i < cc; i++) { if (instanceStats[i] > 0) { count++; } } if (count > 0) { double [] tempStats = new double [count]; double [] map = new double [m_clusterAssignments.length]; 
count=0; for (i=0;i<cc;i++) { if (instanceStats[i] > 0) { tempStats[count] = instanceStats[i]; map[i] = count; count++; } } instanceStats = tempStats; cc = instanceStats.length; for (i=0;i<m_clusterAssignments.length;i++) { m_clusterAssignments[i] = map[(int)m_clusterAssignments[i]]; } } */ double sum = Utils.sum(instanceStats); loglk /= sum; m_logL = loglk; m_clusteringResults.append(m_Clusterer.toString()); m_clusteringResults.append("Clustered Instances\n\n"); int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1); for (i = 0; i < cc; i++) { if (instanceStats[i] > 0) { m_clusteringResults.append( Utils.doubleToString((double) i, clustFieldWidth, 0) + " " + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " (" + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0) + "%)\n"); } } if (unclusteredInstances > 0) { m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances); } if (m_Clusterer instanceof DensityBasedClusterer) { m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n"); } if (hasClass) { evaluateClustersWithRespectToClass(test); } }