/**
   * Evaluates cluster assignments with respect to actual class labels. Assumes that m_Clusterer has
   * been trained and tested on inst (minus the class), so that m_numClusters and
   * m_clusterAssignments already describe inst.
   *
   * <p>Tallies a cluster-by-class count table, hands it to mapClasses to obtain the class mapped to
   * each cluster, appends the matrix, the mapping and the error figures to m_clusteringResults, and
   * stores the mapping in m_classToCluster.
   *
   * @param inst the instances (including class) to evaluate with respect to
   * @exception Exception if something goes wrong
   */
  private void evaluateClustersWithRespectToClass(Instances inst) throws Exception {
    int numClasses = inst.classAttribute().numValues();
    int[][] counts = new int[m_numClusters][numClasses];
    int[] clusterTotals = new int[m_numClusters];
    // best[i] for i < m_numClusters is read below as the class index mapped to cluster i (negative
    // meaning no class); best[m_numClusters] is read as the number of incorrectly clustered
    // instances.
    double[] best = new double[m_numClusters + 1];
    // NOTE(review): presumably scratch space for the mapClasses search - verify against mapClasses.
    double[] current = new double[m_numClusters + 1];

    for (int i = 0; i < inst.numInstances(); i++) {
      double classValue = inst.instance(i).classValue();
      // Casting NaN to int yields 0, which would silently count the instance as the first class.
      // NOTE(review): presumably a missing class value is represented as NaN - confirm.
      if (Double.isNaN(classValue)) {
        continue;
      }
      int cluster = (int) m_clusterAssignments[i];
      counts[cluster][(int) classValue]++;
      clusterTotals[cluster]++;
    }

    // Start from the largest possible error before handing the table to mapClasses.
    best[m_numClusters] = Double.MAX_VALUE;
    mapClasses(0, counts, clusterTotals, current, best, 0);

    m_clusteringResults.append("\n\nClass attribute: " + inst.classAttribute().name() + "\n");
    m_clusteringResults.append("Classes to Clusters:\n");
    String matrixString = toMatrixString(counts, clusterTotals, inst);
    m_clusteringResults.append(matrixString).append("\n");

    // Count digits instead of truncating log(n) / log(10): that quotient falls just below the true
    // value for some exact powers of ten (e.g. 1000), which gave a width one digit too small.
    int clusterWidth = Integer.toString(m_numClusters).length();

    // add the minimum error assignment
    for (int i = 0; i < m_numClusters; i++) {
      if (clusterTotals[i] <= 0) {
        // Clusters that received no counted instance are not listed.
        continue;
      }
      m_clusteringResults.append("Cluster " + Utils.doubleToString((double) i, clusterWidth, 0));
      m_clusteringResults.append(" <-- ");

      if (best[i] < 0) {
        m_clusteringResults.append("No class\n");
      } else {
        m_clusteringResults.append(inst.classAttribute().value((int) best[i])).append("\n");
      }
    }

    m_clusteringResults.append(
        "\nIncorrectly clustered instances :\t"
            + best[m_numClusters]
            + "\t"
            + (Utils.doubleToString((best[m_numClusters] / inst.numInstances() * 100.0), 8, 4))
            + " %\n");

    // copy the class assignments
    m_classToCluster = new int[m_numClusters];
    for (int i = 0; i < m_numClusters; i++) {
      m_classToCluster[i] = (int) best[i];
    }
  }
  /**
   * Returns a "confusion" style matrix of classes to clusters assignments.
   *
   * <p>The first row lists the indices of the clusters whose total is greater than zero; each
   * following row holds the counts of one class in those clusters, followed by the class label.
   *
   * @param counts the counts of classes for each cluster, indexed as counts[cluster][class]
   * @param clusterTotals total number of examples in each cluster
   * @param inst the training instances (with class)
   * @return the formatted matrix
   * @exception Exception if matrix can't be generated
   */
  private String toMatrixString(int[][] counts, int[] clusterTotals, Instances inst)
      throws Exception {
    StringBuilder ms = new StringBuilder();

    // The largest cell value decides how wide a column has to be.
    int maxval = 0;
    for (int i = 0; i < m_numClusters; i++) {
      for (int j = 0; j < counts[i].length; j++) {
        maxval = Math.max(maxval, counts[i][j]);
      }
    }

    // Count digits instead of truncating log(n) / log(10): that quotient falls just below the true
    // value for some exact powers of ten (e.g. 1000), which gave a width one digit too small.
    int cellWidth =
        Math.max(Integer.toString(maxval).length(), Integer.toString(m_numClusters).length());

    ms.append("\n");

    // Header row: indices of the non-empty clusters.
    for (int i = 0; i < m_numClusters; i++) {
      if (clusterTotals[i] > 0) {
        ms.append(" ").append(Utils.doubleToString((double) i, cellWidth, 0));
      }
    }
    ms.append("  <-- assigned to cluster\n");

    // An empty table has no class rows; guard the counts[0] lookup.
    int numClasses = (counts.length > 0) ? counts[0].length : 0;
    for (int i = 0; i < numClasses; i++) {
      for (int j = 0; j < m_numClusters; j++) {
        if (clusterTotals[j] > 0) {
          ms.append(" ").append(Utils.doubleToString((double) counts[j][i], cellWidth, 0));
        }
      }
      ms.append(" | ").append(inst.classAttribute().value(i)).append("\n");
    }

    return ms.toString();
  }
  /**
   * Performs a cross-validation for a DensityBasedClusterer clusterer on a set of instances.
   *
   * <p>Instantiates the named clusterer by reflection, applies the options when it is an
   * OptionHandler, and delegates the actual cross-validation to the overload that takes a
   * DensityBasedClusterer.
   *
   * @param clustererString a string naming the class of the clusterer
   * @param data the data on which the cross-validation is to be performed
   * @param numFolds the number of folds for the cross-validation
   * @param options the options to the clusterer
   * @param random a random number generator
   * @return a string containing the cross validated log likelihood
   * @exception Exception if a clusterer could not be generated, is not a DensityBasedClusterer, or
   *     the options could not be parsed
   */
  public static String crossValidateModel(
      String clustererString, Instances data, int numFolds, String[] options, Random random)
      throws Exception {
    // The clusterer is handed a copy of the options, never the caller's array.
    String[] savedOptions = (options == null) ? null : options.clone();

    // Cross-validate a copy of the data rather than the caller's object.
    data = new Instances(data);

    // create clusterer
    Clusterer clusterer;
    try {
      clusterer = (Clusterer) Class.forName(clustererString).newInstance();
    } catch (Exception e) {
      // Keep the cause: lookup, instantiation and cast failures all end up here.
      throw new Exception("Can't find class with name " + clustererString + '.', e);
    }

    if (!(clusterer instanceof DensityBasedClusterer)) {
      throw new Exception(clustererString + " must be a distribution clusterer.");
    }

    // Parse options
    if (clusterer instanceof OptionHandler) {
      try {
        ((OptionHandler) clusterer).setOptions(savedOptions);
        Utils.checkForRemainingOptions(savedOptions);
      } catch (Exception e) {
        throw new Exception("Can't parse given options in " + "cross-validation!", e);
      }
    }

    double cvAverage =
        crossValidateModel((DensityBasedClusterer) clusterer, data, numFolds, random);

    return "\n"
        + numFolds
        + " fold CV Log Likelihood: "
        + Utils.doubleToString(cvAverage, 6, 4)
        + "\n";
  }
  // Example #4
  // 0
  /**
   * Builds a textual description of the bagged classifier.
   *
   * @return a placeholder message when no model has been built; otherwise the string form of every
   *     base classifier, followed by the out-of-bag error when m_CalcOutOfBag is set
   */
  @Override
  public String toString() {
    if (m_Classifiers == null) {
      return "Bagging: No model built yet.";
    }

    StringBuilder description = new StringBuilder("All the base classifiers: \n\n");

    // One paragraph per base classifier.
    for (Object baseClassifier : m_Classifiers) {
      description.append(baseClassifier.toString()).append("\n\n");
    }

    if (m_CalcOutOfBag) {
      description
          .append("Out of bag error: ")
          .append(Utils.doubleToString(m_OutOfBagError, 4))
          .append("\n\n");
    }

    return description.toString();
  }
  /**
   * Formats a number through Utils.doubleToString, deriving the width and the number of decimal
   * places from the magnitude of the value.
   *
   * @param num the number to format
   * @return the formatted number
   */
  private static String numToString(double num) {
    final double magnitude = Math.abs(num);
    final int whole = (int) magnitude;
    final double decimal = magnitude - whole;

    // Truncated base-10 logarithm of the integer part; 1 when the integer part is zero.
    int nondecimal = 1;
    if (whole > 0) {
      nondecimal = (int) (Math.log(whole) / Math.log(10));
    }

    // Only values with a fractional part get more than one decimal place.
    int precision = 1;
    if (decimal > 0) {
      precision = (int) Math.abs(Math.log(magnitude) / Math.log(10)) + 2;
    }
    // Anything beyond five places falls back to a single one.
    if (precision > 5) {
      precision = 1;
    }

    return reconcile.weka.core.Utils.doubleToString(num, nondecimal + 1 + precision, precision);
  }
  /**
   * Print the cluster statistics for either the training or the testing data.
   *
   * <p>Reads the file one instance at a time, assigns each instance to a cluster, and reports how
   * many instances fell into each cluster. For a DensityBasedClusterer the average log density of
   * the clustered instances is reported as well.
   *
   * @param clusterer the clusterer to use for generating statistics.
   * @param fileName the file to read the instances from; an empty name yields an empty string.
   * @return a string containing cluster statistics.
   * @exception Exception if statistics can't be generated.
   */
  private static String printClusterStats(Clusterer clusterer, String fileName) throws Exception {
    int cc = clusterer.numberOfClusters();
    double[] instanceStats = new double[cc];

    if (fileName.length() == 0) {
      return "";
    }

    BufferedReader inStream;
    try {
      inStream = new BufferedReader(new FileReader(fileName));
    } catch (Exception e) {
      throw new Exception("Can't open file " + e.getMessage() + '.', e);
    }

    int numRead = 0;
    int unclusteredInstances = 0;
    double loglk = 0.0;

    try {
      Instances inst = new Instances(inStream, 1);

      // Only one instance is held at a time: read it, cluster it, then drop it again.
      while (inst.readInstance(inStream)) {
        try {
          int cnum = clusterer.clusterInstance(inst.instance(0));

          if (clusterer instanceof DensityBasedClusterer) {
            loglk += ((DensityBasedClusterer) clusterer).logDensityForInstance(inst.instance(0));
          }
          instanceStats[cnum]++;
        } catch (Exception ignored) {
          // Best effort: an instance the clusterer fails on is reported as unclustered.
          unclusteredInstances++;
        }
        inst.delete(0);
        numRead++;
      }
    } finally {
      // Release the file handle whether or not reading succeeded.
      inStream.close();
    }

    // Count digits instead of truncating log(n) / log(10): that quotient falls just below the true
    // value for some exact powers of ten (e.g. 1000), which gave a width one digit too small.
    int clustFieldWidth = Integer.toString(cc).length();
    int numInstFieldWidth = Integer.toString(numRead).length();

    double sum = Utils.sum(instanceStats);
    loglk /= sum;

    StringBuilder text = new StringBuilder();
    text.append("Clustered Instances\n");

    // Empty clusters are left out of the listing.
    for (int c = 0; c < cc; c++) {
      if (instanceStats[c] > 0) {
        text.append(
            Utils.doubleToString((double) c, clustFieldWidth, 0)
                + "      "
                + Utils.doubleToString(instanceStats[c], numInstFieldWidth, 0)
                + " ("
                + Utils.doubleToString((instanceStats[c] / sum * 100.0), 3, 0)
                + "%)\n");
      }
    }

    if (unclusteredInstances > 0) {
      text.append("\nUnclustered Instances : " + unclusteredInstances);
    }

    if (clusterer instanceof DensityBasedClusterer) {
      text.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
    }

    return text.toString();
  }
  /**
   * Evaluate the clusterer on a set of instances. Calculates clustering statistics and stores
   * cluster assignments for the instances in m_clusterAssignments.
   *
   * <p>When the test set has a class attribute, the class is removed before clustering and the
   * clusters are afterwards evaluated against the class labels as well. The textual report is
   * appended to m_clusteringResults and the average log density is stored in m_logL.
   *
   * @param test the set of instances to cluster
   * @exception Exception if something goes wrong, or if the class attribute is numeric
   */
  public void evaluateClusterer(Instances test) throws Exception {
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;

    // Count digits instead of truncating log(n) / log(10): that quotient falls just below the true
    // value for some exact powers of ten (e.g. 1000), which gave a width one digit too small.
    int numInstFieldWidth = Integer.toString(test.numInstances()).length();
    int clustFieldWidth = Integer.toString(cc).length();

    double[] instanceStats = new double[cc];
    m_clusterAssignments = new double[test.numInstances()];
    Instances testCopy = test;
    boolean hasClass = (testCopy.classIndex() >= 0);
    int unclusteredInstances = 0;
    double loglk = 0.0;

    // If class is set then do class based evaluation as well
    if (hasClass) {
      if (testCopy.classAttribute().isNumeric()) {
        throw new Exception("ClusterEvaluation: Class must be nominal!");
      }
      // Cluster on a copy without the class attribute (attribute indices are 1-based here).
      Remove removeClass = new Remove();
      removeClass.setAttributeIndices("" + (testCopy.classIndex() + 1));
      removeClass.setInvertSelection(false);
      removeClass.setInputFormat(testCopy);
      testCopy = Filter.useFilter(testCopy, removeClass);
    }

    for (int i = 0; i < testCopy.numInstances(); i++) {
      int cnum = -1;
      try {
        if (m_Clusterer instanceof DensityBasedClusterer) {
          loglk +=
              ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(testCopy.instance(i));
        }
        cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
        m_clusterAssignments[i] = (double) cnum;
      } catch (Exception ignored) {
        // Best effort: an instance the clusterer fails on is reported as unclustered.
        // NOTE(review): its m_clusterAssignments entry keeps the array default 0.0, which cannot
        // be told apart from an assignment to cluster 0 by later readers of that array.
        unclusteredInstances++;
      }

      if (cnum != -1) {
        instanceStats[cnum]++;
      }
    }

    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;

    m_clusteringResults.append(m_Clusterer.toString());
    m_clusteringResults.append("Clustered Instances\n\n");

    // Empty clusters are left out of the listing.
    for (int c = 0; c < cc; c++) {
      if (instanceStats[c] > 0) {
        m_clusteringResults.append(
            Utils.doubleToString((double) c, clustFieldWidth, 0)
                + "      "
                + Utils.doubleToString(instanceStats[c], numInstFieldWidth, 0)
                + " ("
                + Utils.doubleToString((instanceStats[c] / sum * 100.0), 3, 0)
                + "%)\n");
      }
    }

    if (unclusteredInstances > 0) {
      m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);
    }

    if (m_Clusterer instanceof DensityBasedClusterer) {
      m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
    }

    if (hasClass) {
      // The class-based evaluation needs the original instances, class attribute included.
      evaluateClustersWithRespectToClass(test);
    }
  }