Example #1
  private static IList<IList<IAgent>> clusteringUsingWeka(
      final IScope scope,
      final Clusterer clusterer,
      final IList<String> attributes,
      final IAddressableContainer<Integer, IAgent, Integer, IAgent> agents)
      throws GamaRuntimeException {
    // Convert the agents' attribute values into a Weka Instances dataset.
    Instances dataset = convertToInstances(scope, attributes, agents);
    try {
      // Train the clusterer on the converted dataset.
      clusterer.buildClusterer(dataset);

      // Create one (initially empty) agent list per cluster.
      IList<IList<IAgent>> groupes = GamaListFactory.create(Types.LIST.of(Types.AGENT));
      for (int i = 0; i < clusterer.numberOfClusters(); i++) {
        groupes.add(GamaListFactory.<IAgent>create(Types.AGENT));
      }
      // Assign each agent to the group matching its instance's cluster index.
      for (int i = 0; i < dataset.numInstances(); i++) {
        Instance inst = dataset.instance(i);
        int clusterIndex = clusterer.clusterInstance(inst);
        IList<IAgent> groupe = groupes.get(clusterIndex);
        groupe.add(agents.get(scope, i));
      }
      return groupes;
    } catch (Exception e) {
      // If Weka fails to build the model or assign an instance, give up and return null.
      return null;
    }
  }
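
A minimal, self-contained sketch of the same build-then-group pattern using plain Weka types, outside GAMA. It assumes the Weka 3.7+ API; SimpleKMeans, the attribute names, and the toy coordinates are illustrative choices, not part of the snippet above.

import java.util.ArrayList;
import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class ClusterGroupingSketch {
  public static void main(String[] args) throws Exception {
    // Build a small two-attribute numeric dataset (illustrative values).
    ArrayList<Attribute> attrs = new ArrayList<>();
    attrs.add(new Attribute("x"));
    attrs.add(new Attribute("y"));
    Instances dataset = new Instances("points", attrs, 0);
    double[][] values = { {1.0, 1.0}, {1.2, 0.9}, {8.0, 8.0}, {8.1, 7.9} };
    for (double[] v : values) {
      dataset.add(new DenseInstance(1.0, v));
    }

    // Build the clusterer, then group instance indexes by cluster index,
    // just as clusteringUsingWeka groups agents above.
    SimpleKMeans clusterer = new SimpleKMeans();
    clusterer.setNumClusters(2);
    clusterer.buildClusterer(dataset);

    ArrayList<ArrayList<Integer>> groups = new ArrayList<>();
    for (int c = 0; c < clusterer.numberOfClusters(); c++) {
      groups.add(new ArrayList<>());
    }
    for (int i = 0; i < dataset.numInstances(); i++) {
      groups.get(clusterer.clusterInstance(dataset.instance(i))).add(i);
    }
    System.out.println(groups); // e.g. [[0, 1], [2, 3]]
  }
}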
Example #2
  /**
   * Prepare a clustering pass on the indicated data.
   *
   * @param points The array of dataPoints to be clustered.
   * @param ids Array of cluster numbers which this call will fill in, defining which cluster each
   *     point belongs to. The caller must leave the data here intact between iterations.
   * @param means Array of x,y values in which to place centroids of the clusters.
   * @param region The region of the plane in which the points lie.
   */
  @Override
  public void prepare(Point[] points, int[] ids, double[][] means, Region region) {
    super.prepare(points, ids, means, region);

    // Save the data arrays.
    dataPoints = points;
    pointClusters = ids;
    clusterMeans = means;
    numPoints = points.length;
    numClusters = means.length;

    // Set up the strengths array.
    clusterStrengths = new double[numPoints][numClusters];

    // Set the initial cluster centroids to be random values
    // within the data region.
    double x = region.getX1();
    double y = region.getY1();
    double w = region.getWidth();
    double h = region.getHeight();
    for (int i = 0; i < numClusters; ++i) {
      means[i][0] = random.nextDouble() * w + x;
      means[i][1] = random.nextDouble() * h + y;
    }

    // Make an initial assignment of points to clusters, so on the first
    // iteration we have a basis for computing centroids.
    assignPoints(ids, means);
  }
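
The prepare() pass ends by calling assignPoints(ids, means) so that the first iteration already has a point-to-cluster assignment to compute centroids from. That helper is not shown here; the following is a minimal, standalone sketch of what such a step typically does (nearest-centroid assignment). Plain double[] pairs stand in for the Point class, whose API is not shown either.

public class NearestCentroidSketch {
  // Assign each point to the index of its nearest centroid.
  static void assignToNearestCentroid(double[][] points, int[] ids, double[][] means) {
    for (int p = 0; p < points.length; ++p) {
      int best = 0;
      double bestDist = Double.MAX_VALUE;
      for (int c = 0; c < means.length; ++c) {
        double dx = points[p][0] - means[c][0];
        double dy = points[p][1] - means[c][1];
        double d = dx * dx + dy * dy; // squared distance is enough for an argmin
        if (d < bestDist) {
          bestDist = d;
          best = c;
        }
      }
      ids[p] = best;
    }
  }

  public static void main(String[] args) {
    double[][] points = { {0, 0}, {0, 1}, {9, 9} };
    double[][] means = { {0, 0.5}, {9, 9} };
    int[] ids = new int[points.length];
    assignToNearestCentroid(points, ids, means);
    System.out.println(java.util.Arrays.toString(ids)); // [0, 0, 1]
  }
}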
  /**
   * Main method for testing this class.
   *
   * @param args the options
   */
  public static void main(String[] args) {
    try {
      if (args.length == 0) {
        throw new Exception("The first argument must be the name of a " + "clusterer");
      }

      String ClustererString = args[0];
      args[0] = "";
      Clusterer newClusterer = Clusterer.forName(ClustererString, null);
      System.out.println(evaluateClusterer(newClusterer, args));
    } catch (Exception e) {
      System.out.println(e.getMessage());
    }
  }
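
main() only resolves the clusterer class named by the first argument and hands the remaining options to evaluateClusterer(). The same entry point can be driven programmatically; a short sketch, assuming these methods belong to Weka's weka.clusterers.ClusterEvaluation (as the code below suggests), with EM and the ARFF file name chosen purely for illustration.

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;

public class EvaluateClustererSketch {
  public static void main(String[] args) throws Exception {
    // Roughly what main() does once args[0] has been resolved to a Clusterer:
    // train on training.arff and cross-validate the log likelihood over 5 folds.
    String[] options = { "-t", "training.arff", "-x", "5" };
    System.out.println(ClusterEvaluation.evaluateClusterer(new EM(), options));
  }
}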
  /**
   * Make up the help string giving all the command line options
   *
   * @param clusterer the clusterer to include options for
   * @return a string detailing the valid command line options
   */
  private static String makeOptionString(Clusterer clusterer) {
    StringBuffer optionsText = new StringBuffer("");
    // General options
    optionsText.append("\n\nGeneral options:\n\n");
    optionsText.append("-t <name of training file>\n");
    optionsText.append("\tSets training file.\n");
    optionsText.append("-T <name of test file>\n");
    optionsText.append("\tSets test file.\n");
    optionsText.append("-l <name of input file>\n");
    optionsText.append("\tSets model input file.\n");
    optionsText.append("-d <name of output file>\n");
    optionsText.append("\tSets model output file.\n");
    optionsText.append("-p <attribute range>\n");
    optionsText.append(
        "\tOutput predictions. Predictions are for "
            + "training file"
            + "\n\tif only training file is specified,"
            + "\n\totherwise predictions are for the test file."
            + "\n\tThe range specifies attribute values to be output"
            + "\n\twith the predictions. Use '-p 0' for none.\n");
    optionsText.append("-x <number of folds>\n");
    optionsText.append("\tSets the number of folds for cross-validation.\n");
    optionsText.append("\tOnly Distribution Clusterers can be cross " + "validated.\n");
    optionsText.append("-s <random number seed>\n");
    optionsText.append("\tSets the random number seed (default 1).\n");
    optionsText.append("-c <class index>\n");
    optionsText.append("\tSet class attribute. If supplied, class is ignored");
    optionsText.append("\n\tduring clustering but is used in a classes to");
    optionsText.append("\n\tclusters evaluation.\n");

    // Get scheme-specific options
    if (clusterer instanceof OptionHandler) {
      optionsText.append("\nOptions specific to " + clusterer.getClass().getName() + ":\n\n");
      Enumeration enu = ((OptionHandler) clusterer).listOptions();

      while (enu.hasMoreElements()) {
        Option option = (Option) enu.nextElement();
        optionsText.append(option.synopsis() + '\n');
        optionsText.append(option.description() + "\n");
      }
    }

    return optionsText.toString();
  }
  /**
   * Print the cluster assignments for either the training or the testing data.
   *
   * @param clusterer the clusterer to use for cluster assignments
   * @param train the training data
   * @param testFileName the name of the test file, or the empty string to use the training data
   * @param attributesToOutput the range of attribute values to print alongside each assignment
   * @return a string containing the instance indexes and cluster assignments
   * @exception Exception if cluster assignments can't be printed
   */
  private static String printClusterings(
      Clusterer clusterer, Instances train, String testFileName, Range attributesToOutput)
      throws Exception {
    StringBuffer text = new StringBuffer();
    int i = 0;
    int cnum;

    if (testFileName.length() != 0) {
      BufferedReader testStream = null;

      try {
        testStream = new BufferedReader(new FileReader(testFileName));
      } catch (Exception e) {
        throw new Exception("Can't open file " + e.getMessage() + '.');
      }

      Instances test = new Instances(testStream, 1);

      while (test.readInstance(testStream)) {
        try {
          cnum = clusterer.clusterInstance(test.instance(0));

          text.append(
              i
                  + " "
                  + cnum
                  + " "
                  + attributeValuesString(test.instance(0), attributesToOutput)
                  + "\n");
        } catch (Exception e) {
          // throw new Exception('\n' + "Unable to cluster instance\n" + e.getMessage());
          text.append(
              i
                  + " Unclustered "
                  + attributeValuesString(test.instance(0), attributesToOutput)
                  + "\n");
        }
        test.delete(0);
        i++;
      }
    } else { // output for training data
      for (i = 0; i < train.numInstances(); i++) {
        try {
          cnum = clusterer.clusterInstance(train.instance(i));

          text.append(
              i
                  + " "
                  + cnum
                  + " "
                  + attributeValuesString(train.instance(i), attributesToOutput)
                  + "\n");
        } catch (Exception e) {
          // throw new Exception('\n' + "Unable to cluster instance\n" + e.getMessage());
          text.append(
              i
                  + " Unclustered "
                  + attributeValuesString(train.instance(i), attributesToOutput)
                  + "\n");
        }
      }
    }

    return text.toString();
  }
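
Both the test-file branch above and printClusterStats() below read their data incrementally: the Instances constructor with capacity 1 parses only the ARFF header, and readInstance()/delete(0) keep a one-instance window so arbitrarily large files never sit in memory. A minimal, self-contained sketch of that idiom (the file name is illustrative):

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class IncrementalReadSketch {
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new FileReader("data.arff"));
    Instances data = new Instances(in, 1); // header only, capacity 1
    while (data.readInstance(in)) {        // reads one instance; returns false at end of file
      // ... process data.instance(0) here (it is the only instance in the set) ...
      data.delete(0);                      // drop it before reading the next one
    }
    in.close();
  }
}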
  /**
   * Print the cluster statistics for either the training or the testing data.
   *
   * @param clusterer the clusterer to use for generating statistics
   * @param fileName the name of the file containing the data, or the empty string for none
   * @return a string containing cluster statistics
   * @exception Exception if statistics can't be generated
   */
  private static String printClusterStats(Clusterer clusterer, String fileName) throws Exception {
    StringBuffer text = new StringBuffer();
    int i = 0;
    int cnum;
    double loglk = 0.0;
    double[] dist;
    double temp;
    int cc = clusterer.numberOfClusters();
    double[] instanceStats = new double[cc];
    int unclusteredInstances = 0;

    if (fileName.length() != 0) {
      BufferedReader inStream = null;

      try {
        inStream = new BufferedReader(new FileReader(fileName));
      } catch (Exception e) {
        throw new Exception("Can't open file " + e.getMessage() + '.');
      }

      Instances inst = new Instances(inStream, 1);

      while (inst.readInstance(inStream)) {
        try {
          cnum = clusterer.clusterInstance(inst.instance(0));

          if (clusterer instanceof DensityBasedClusterer) {
            loglk += ((DensityBasedClusterer) clusterer).logDensityForInstance(inst.instance(0));
            //	    temp = Utils.sum(dist);
          }
          instanceStats[cnum]++;
        } catch (Exception e) {
          unclusteredInstances++;
        }
        inst.delete(0);
        i++;
      }

      /*
      // count the actual number of used clusters
      int count = 0;
      for (i = 0; i < cc; i++) {
        if (instanceStats[i] > 0) {
          count++;
        }
      }
      if (count > 0) {
        double[] tempStats = new double[count];
        count = 0;
        for (i = 0; i < cc; i++) {
          if (instanceStats[i] > 0) {
            tempStats[count++] = instanceStats[i];
          }
        }
        instanceStats = tempStats;
        cc = instanceStats.length;
      }
      */

      int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
      int numInstFieldWidth = (int) ((Math.log(i) / Math.log(10)) + 1);
      double sum = Utils.sum(instanceStats);
      loglk /= sum;
      text.append("Clustered Instances\n");

      for (i = 0; i < cc; i++) {
        if (instanceStats[i] > 0) {
          text.append(
              Utils.doubleToString((double) i, clustFieldWidth, 0)
                  + "      "
                  + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0)
                  + " ("
                  + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0)
                  + "%)\n");
        }
      }
      if (unclusteredInstances > 0) {
        text.append("\nUnclustered Instances : " + unclusteredInstances);
      }

      if (clusterer instanceof DensityBasedClusterer) {
        text.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
      }
    }

    return text.toString();
  }
  /**
   * Evaluates a clusterer with the options given in an array of strings. It takes the string
   * indicated by "-t" as training file, the string indicated by "-T" as test file. If the test file
   * is missing, a stratified ten-fold cross-validation is performed (distribution clusterers only).
   * Using "-x" you can change the number of folds to be used, and using "-s" the random seed. If
   * the "-p" option is present it outputs the classification for each test instance. If you provide
   * the name of an object file using "-l", a clusterer will be loaded from the given file. If you
   * provide the name of an object file using "-d", the clusterer built from the training data will
   * be saved to the given file.
   *
   * @param clusterer machine learning clusterer
   * @param options the array of string containing the options
   * @exception Exception if model could not be evaluated successfully
   * @return a string describing the results
   */
  public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {
    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Instances test = null;
    Random random;
    String trainFileName,
        testFileName,
        seedString,
        foldsString,
        objectInputFileName,
        objectOutputFileName,
        attributeRangeString;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    ObjectInputStream objectInputStream = null;
    ObjectOutputStream objectOutputStream = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering

    try {
      if (Utils.getFlag('h', options)) {
        throw new Exception("Help requested.");
      }

      // Get basic options (options that are the same for all clusterers)
      // printClusterAssignments = Utils.getFlag('p', options);
      objectInputFileName = Utils.getOption('l', options);
      objectOutputFileName = Utils.getOption('d', options);
      trainFileName = Utils.getOption('t', options);
      testFileName = Utils.getOption('T', options);

      // Check -p option
      try {
        attributeRangeString = Utils.getOption('p', options);
      } catch (Exception e) {
        throw new Exception(
            e.getMessage()
                + "\nNOTE: the -p option has changed. "
                + "It now expects a parameter specifying a range of attributes "
                + "to list with the predictions. Use '-p 0' for none.");
      }
      if (attributeRangeString.length() != 0) {
        printClusterAssignments = true;
        if (!attributeRangeString.equals("0")) attributesToOutput = new Range(attributeRangeString);
      }

      if (trainFileName.length() == 0) {
        if (objectInputFileName.length() == 0) {
          throw new Exception("No training file and no object " + "input file given.");
        }

        if (testFileName.length() == 0) {
          throw new Exception("No training file and no test file given.");
        }
      } else {
        if ((objectInputFileName.length() != 0) && !printClusterAssignments) {
          throw new Exception("Can't use both train and model file " + "unless -p specified.");
        }
      }

      seedString = Utils.getOption('s', options);

      if (seedString.length() != 0) {
        seed = Integer.parseInt(seedString);
      }

      foldsString = Utils.getOption('x', options);

      if (foldsString.length() != 0) {
        folds = Integer.parseInt(foldsString);
        doXval = true;
      }
    } catch (Exception e) {
      throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer));
    }

    try {
      if (trainFileName.length() != 0) {
        train = new Instances(new BufferedReader(new FileReader(trainFileName)));

        String classString = Utils.getOption('c', options);
        if (classString.length() != 0) {
          if (classString.compareTo("last") == 0) {
            theClass = train.numAttributes();
          } else if (classString.compareTo("first") == 0) {
            theClass = 1;
          } else {
            theClass = Integer.parseInt(classString);
          }
          if (doXval || testFileName.length() != 0) {
            throw new Exception("Can only do class based evaluation on the " + "training data");
          }

          if (objectInputFileName.length() != 0) {
            throw new Exception("Can't load a clusterer and do class based " + "evaluation");
          }
        }

        if (theClass != -1) {
          if (theClass < 1 || theClass > train.numAttributes()) {
            throw new Exception("Class is out of range!");
          }
          if (!train.attribute(theClass - 1).isNominal()) {
            throw new Exception("Class must be nominal!");
          }
          train.setClassIndex(theClass - 1);
        }
      }

      if (objectInputFileName.length() != 0) {
        objectInputStream = new ObjectInputStream(new FileInputStream(objectInputFileName));
      }

      if (objectOutputFileName.length() != 0) {
        objectOutputStream = new ObjectOutputStream(new FileOutputStream(objectOutputFileName));
      }
    } catch (Exception e) {
      throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
      savedOptions = new String[options.length];
      System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0) {
      Utils.checkForRemainingOptions(options);
    }

    // Set options for clusterer
    if (clusterer instanceof OptionHandler) {
      ((OptionHandler) clusterer).setOptions(options);
    }

    Utils.checkForRemainingOptions(options);

    if (objectInputFileName.length() != 0) {
      // Load the clusterer from file
      clusterer = (Clusterer) objectInputStream.readObject();
      objectInputStream.close();
    } else {
      // Build the clusterer if no object file provided
      if (theClass == -1) {
        clusterer.buildClusterer(train);
      } else {
        Remove removeClass = new Remove();
        removeClass.setAttributeIndices("" + theClass);
        removeClass.setInvertSelection(false);
        removeClass.setInputFormat(train);
        Instances clusterTrain = Filter.useFilter(train, removeClass);
        clusterer.buildClusterer(clusterTrain);
        ClusterEvaluation ce = new ClusterEvaluation();
        ce.setClusterer(clusterer);
        ce.evaluateClusterer(train);

        return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
      }
    }

    /* Output cluster predictions only (for the test data if specified,
    otherwise for the training data) */
    if (printClusterAssignments) {
      return printClusterings(clusterer, train, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
        "\n\n=== Clustering stats for training data ===\n\n"
            + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
      text.append(
          "\n\n=== Clustering stats for testing data ===\n\n"
              + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DensityBasedClusterer)
        && doXval
        && (testFileName.length() == 0)
        && (objectInputFileName.length() == 0)) {
      // cross validate the log likelihood on the training data
      random = new Random(seed);
      train.randomize(random);
      text.append(
          crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
      objectOutputStream.writeObject(clusterer);
      objectOutputStream.flush();
      objectOutputStream.close();
    }

    return text.toString();
  }
  /**
   * Evaluate the clusterer on a set of instances. Calculates clustering statistics and stores
   * cluster assignments for the instances in m_clusterAssignments
   *
   * @param test the set of instances to cluster
   * @exception Exception if something goes wrong
   */
  public void evaluateClusterer(Instances test) throws Exception {
    int i = 0;
    int cnum;
    double loglk = 0.0;
    double[] dist;
    double temp;
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    int numInstFieldWidth = (int) ((Math.log(test.numInstances()) / Math.log(10)) + 1);
    double[] instanceStats = new double[cc];
    m_clusterAssignments = new double[test.numInstances()];
    Instances testCopy = test; // re-assigned to a filtered copy below if a class attribute is set
    boolean hasClass = (testCopy.classIndex() >= 0);
    int unclusteredInstances = 0;

    // If class is set then do class based evaluation as well
    if (hasClass) {
      if (testCopy.classAttribute().isNumeric()) {
        throw new Exception("ClusterEvaluation: Class must be nominal!");
      }
      Remove removeClass = new Remove();
      removeClass.setAttributeIndices("" + (testCopy.classIndex() + 1));
      removeClass.setInvertSelection(false);
      removeClass.setInputFormat(testCopy);
      testCopy = Filter.useFilter(testCopy, removeClass);
    }

    for (i = 0; i < testCopy.numInstances(); i++) {
      cnum = -1;
      try {
        if (m_Clusterer instanceof DensityBasedClusterer) {
          loglk +=
              ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(testCopy.instance(i));
          //	  temp = Utils.sum(dist);

          //	  Utils.normalize(dist);
          cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
          // Utils.maxIndex(dist);
          m_clusterAssignments[i] = (double) cnum;
        } else {
          cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
          m_clusterAssignments[i] = (double) cnum;
        }
      } catch (Exception e) {
        unclusteredInstances++;
      }

      if (cnum != -1) {
        instanceStats[cnum]++;
      }
    }

    /*
    // count the actual number of used clusters
    int count = 0;
    for (i = 0; i < cc; i++) {
      if (instanceStats[i] > 0) {
        count++;
      }
    }
    if (count > 0) {
      double[] tempStats = new double[count];
      double[] map = new double[m_clusterAssignments.length];
      count = 0;
      for (i = 0; i < cc; i++) {
        if (instanceStats[i] > 0) {
          tempStats[count] = instanceStats[i];
          map[i] = count;
          count++;
        }
      }
      instanceStats = tempStats;
      cc = instanceStats.length;
      for (i = 0; i < m_clusterAssignments.length; i++) {
        m_clusterAssignments[i] = map[(int) m_clusterAssignments[i]];
      }
    }
    */

    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;

    m_clusteringResults.append(m_Clusterer.toString());
    m_clusteringResults.append("Clustered Instances\n\n");
    int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
    for (i = 0; i < cc; i++) {
      if (instanceStats[i] > 0) {
        m_clusteringResults.append(
            Utils.doubleToString((double) i, clustFieldWidth, 0)
                + "      "
                + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0)
                + " ("
                + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0)
                + "%)\n");
      }
    }

    if (unclusteredInstances > 0) {
      m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);
    }

    if (m_Clusterer instanceof DensityBasedClusterer) {
      m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
    }

    if (hasClass) {
      evaluateClustersWithRespectToClass(test);
    }
  }
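
For completeness, the instance-based path can also be driven directly, mirroring the calls the option-string variant makes in its class-based branch (setClusterer, evaluateClusterer(Instances), clusterResultsToString). A sketch under the assumption that the surrounding class is Weka's ClusterEvaluation; SimpleKMeans and the ARFF file name are illustrative.

import java.io.BufferedReader;
import java.io.FileReader;
import weka.clusterers.ClusterEvaluation;
import weka.clusterers.SimpleKMeans;
import weka.core.Instances;

public class InstanceEvaluationSketch {
  public static void main(String[] args) throws Exception {
    Instances data = new Instances(new BufferedReader(new FileReader("training.arff")));

    SimpleKMeans clusterer = new SimpleKMeans();
    clusterer.buildClusterer(data);        // the clusterer must be built first

    ClusterEvaluation eval = new ClusterEvaluation();
    eval.setClusterer(clusterer);
    eval.evaluateClusterer(data);          // fills statistics and cluster assignments
    System.out.println(eval.clusterResultsToString());
  }
}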