コード例 #1
0
  /**
   * Evaluates a clusterer with the options given in an array of strings. It takes the string
   * indicated by "-t" as training file, the string indicated by "-T" as test file. If the test file
   * is missing, a stratified ten-fold cross-validation is performed (distribution clusterers only).
   * Using "-x" you can change the number of folds to be used, and using "-s" the random seed. If
   * the "-p" option is present it outputs the classification for each test instance. If you provide
   * the name of an object file using "-l", a clusterer will be loaded from the given file. If you
   * provide the name of an object file using "-d", the clusterer built from the training data will
   * be saved to the given file.
   *
   * @param clusterer machine learning clusterer
   * @param options the array of string containing the options
   * @exception Exception if model could not be evaluated successfully
   * @return a string describing the results
   */
  public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {
    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Instances test = null;
    Random random;
    String trainFileName,
        testFileName,
        seedString,
        foldsString,
        objectInputFileName,
        objectOutputFileName,
        attributeRangeString;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    ObjectInputStream objectInputStream = null;
    ObjectOutputStream objectOutputStream = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering

    try {
      if (Utils.getFlag('h', options)) {
        throw new Exception("Help requested.");
      }

      // Get basic options (options the same for all clusterers
      // printClusterAssignments = Utils.getFlag('p', options);
      objectInputFileName = Utils.getOption('l', options);
      objectOutputFileName = Utils.getOption('d', options);
      trainFileName = Utils.getOption('t', options);
      testFileName = Utils.getOption('T', options);

      // Check -p option
      try {
        attributeRangeString = Utils.getOption('p', options);
      } catch (Exception e) {
        throw new Exception(
            e.getMessage()
                + "\nNOTE: the -p option has changed. "
                + "It now expects a parameter specifying a range of attributes "
                + "to list with the predictions. Use '-p 0' for none.");
      }
      if (attributeRangeString.length() != 0) {
        printClusterAssignments = true;
        if (!attributeRangeString.equals("0")) attributesToOutput = new Range(attributeRangeString);
      }

      if (trainFileName.length() == 0) {
        if (objectInputFileName.length() == 0) {
          throw new Exception("No training file and no object " + "input file given.");
        }

        if (testFileName.length() == 0) {
          throw new Exception("No training file and no test file given.");
        }
      } else {
        if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
          throw new Exception("Can't use both train and model file " + "unless -p specified.");
        }
      }

      seedString = Utils.getOption('s', options);

      if (seedString.length() != 0) {
        seed = Integer.parseInt(seedString);
      }

      foldsString = Utils.getOption('x', options);

      if (foldsString.length() != 0) {
        folds = Integer.parseInt(foldsString);
        doXval = true;
      }
    } catch (Exception e) {
      throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer));
    }

    try {
      if (trainFileName.length() != 0) {
        train = new Instances(new BufferedReader(new FileReader(trainFileName)));

        String classString = Utils.getOption('c', options);
        if (classString.length() != 0) {
          if (classString.compareTo("last") == 0) {
            theClass = train.numAttributes();
          } else if (classString.compareTo("first") == 0) {
            theClass = 1;
          } else {
            theClass = Integer.parseInt(classString);
          }
          if (doXval || testFileName.length() != 0) {
            throw new Exception("Can only do class based evaluation on the " + "training data");
          }

          if (objectInputFileName.length() != 0) {
            throw new Exception("Can't load a clusterer and do class based " + "evaluation");
          }
        }

        if (theClass != -1) {
          if (theClass < 1 || theClass > train.numAttributes()) {
            throw new Exception("Class is out of range!");
          }
          if (!train.attribute(theClass - 1).isNominal()) {
            throw new Exception("Class must be nominal!");
          }
          train.setClassIndex(theClass - 1);
        }
      }

      if (objectInputFileName.length() != 0) {
        objectInputStream = new ObjectInputStream(new FileInputStream(objectInputFileName));
      }

      if (objectOutputFileName.length() != 0) {
        objectOutputStream = new ObjectOutputStream(new FileOutputStream(objectOutputFileName));
      }
    } catch (Exception e) {
      throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
      savedOptions = new String[options.length];
      System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0) {
      Utils.checkForRemainingOptions(options);
    }

    // Set options for clusterer
    if (clusterer instanceof OptionHandler) {
      ((OptionHandler) clusterer).setOptions(options);
    }

    Utils.checkForRemainingOptions(options);

    if (objectInputFileName.length() != 0) {
      // Load the clusterer from file
      clusterer = (Clusterer) objectInputStream.readObject();
      objectInputStream.close();
    } else {
      // Build the clusterer if no object file provided
      if (theClass == -1) {
        clusterer.buildClusterer(train);
      } else {
        Remove removeClass = new Remove();
        removeClass.setAttributeIndices("" + theClass);
        removeClass.setInvertSelection(false);
        removeClass.setInputFormat(train);
        Instances clusterTrain = Filter.useFilter(train, removeClass);
        clusterer.buildClusterer(clusterTrain);
        ClusterEvaluation ce = new ClusterEvaluation();
        ce.setClusterer(clusterer);
        ce.evaluateClusterer(train);

        return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
      }
    }

    /* Output cluster predictions only (for the test data if specified,
    otherwise for the training data */
    if (printClusterAssignments) {
      return printClusterings(clusterer, train, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
        "\n\n=== Clustering stats for training data ===\n\n"
            + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
      text.append(
          "\n\n=== Clustering stats for testing data ===\n\n"
              + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DensityBasedClusterer)
        && (doXval == true)
        && (testFileName.length() == 0)
        && (objectInputFileName.length() == 0)) {
      // cross validate the log likelihood on the training data
      random = new Random(seed);
      random.setSeed(seed);
      train.randomize(random);
      text.append(
          crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
      objectOutputStream.writeObject(clusterer);
      objectOutputStream.flush();
      objectOutputStream.close();
    }

    return text.toString();
  }
コード例 #2
0
  /**
   * Evaluate the clusterer on a set of instances. Calculates clustering statistics and stores
   * cluster assigments for the instances in m_clusterAssignments
   *
   * @param test the set of instances to cluster
   * @exception Exception if something goes wrong
   */
  public void evaluateClusterer(Instances test) throws Exception {
    int i = 0;
    int cnum;
    double loglk = 0.0;
    double[] dist;
    double temp;
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    int numInstFieldWidth = (int) ((Math.log(test.numInstances()) / Math.log(10)) + 1);
    double[] instanceStats = new double[cc];
    m_clusterAssignments = new double[test.numInstances()];
    Instances testCopy = test;
    boolean hasClass = (testCopy.classIndex() >= 0);
    int unclusteredInstances = 0;

    // If class is set then do class based evaluation as well
    if (hasClass) {
      if (testCopy.classAttribute().isNumeric()) {
        throw new Exception("ClusterEvaluation: Class must be nominal!");
      }
      Remove removeClass = new Remove();
      removeClass.setAttributeIndices("" + (testCopy.classIndex() + 1));
      removeClass.setInvertSelection(false);
      removeClass.setInputFormat(testCopy);
      testCopy = Filter.useFilter(testCopy, removeClass);
    }

    for (i = 0; i < testCopy.numInstances(); i++) {
      cnum = -1;
      try {
        if (m_Clusterer instanceof DensityBasedClusterer) {
          loglk +=
              ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(testCopy.instance(i));
          //	  temp = Utils.sum(dist);

          //	  Utils.normalize(dist);
          cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
          // Utils.maxIndex(dist);
          m_clusterAssignments[i] = (double) cnum;
        } else {
          cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
          m_clusterAssignments[i] = (double) cnum;
        }
      } catch (Exception e) {
        unclusteredInstances++;
      }

      if (cnum != -1) {
        instanceStats[cnum]++;
      }
    }

    /* // count the actual number of used clusters
       int count = 0;
       for (i = 0; i < cc; i++) {
         if (instanceStats[i] > 0) {
    count++;
         }
       }
       if (count > 0) {
         double [] tempStats = new double [count];
         double [] map = new double [m_clusterAssignments.length];
         count=0;
         for (i=0;i<cc;i++) {
    if (instanceStats[i] > 0) {
      tempStats[count] = instanceStats[i];
      map[i] = count;
      count++;
    }
         }
         instanceStats = tempStats;
         cc = instanceStats.length;
         for (i=0;i<m_clusterAssignments.length;i++) {
    m_clusterAssignments[i] = map[(int)m_clusterAssignments[i]];
         }
         } */

    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;

    m_clusteringResults.append(m_Clusterer.toString());
    m_clusteringResults.append("Clustered Instances\n\n");
    int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
    for (i = 0; i < cc; i++) {
      if (instanceStats[i] > 0) {
        m_clusteringResults.append(
            Utils.doubleToString((double) i, clustFieldWidth, 0)
                + "      "
                + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0)
                + " ("
                + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0)
                + "%)\n");
      }
    }

    if (unclusteredInstances > 0) {
      m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);
    }

    if (m_Clusterer instanceof DensityBasedClusterer) {
      m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
    }

    if (hasClass) {
      evaluateClustersWithRespectToClass(test);
    }
  }