Exemple #1
0
  /**
   * Widens the tracked per-attribute value range so that it covers the given instance.
   *
   * <p>For each attribute index of the instance, an entry of {@code m_Min} that is still NaN is
   * treated as "nothing recorded yet" and both {@code m_Min} and {@code m_Max} are set to the
   * instance's value. Otherwise the value replaces the stored minimum when it is smaller, or the
   * stored maximum when it is larger.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {

    for (int attr = 0; attr < instance.numAttributes(); attr++) {
      final double value = instance.value(attr);

      // first value seen for this attribute: it is both the minimum and the maximum
      if (Double.isNaN(m_Min[attr])) {
        m_Min[attr] = value;
        m_Max[attr] = value;
        continue;
      }

      if (value < m_Min[attr]) {
        m_Min[attr] = value;
      } else if (value > m_Max[attr]) {
        m_Max[attr] = value;
      }
    }
  }
  /**
   * Collects the instances that belong to one branch of this split.
   *
   * <p>Branch {@code -1} selects the instances whose split attribute is missing. Branch {@code 0}
   * selects the non-missing instances whose split attribute value is below the split point; any
   * other branch index selects the non-missing instances whose value is at or above it.
   *
   * @param branch the index of the branch
   * @param instances the instances from which to find the subset
   * @return the set of instances that apply
   */
  public ReferenceInstances instancesDownBranch(int branch, Instances instances) {

    ReferenceInstances subset = new ReferenceInstances(instances, 1);

    Enumeration en = instances.enumerateInstances();
    while (en.hasMoreElements()) {
      Instance candidate = (Instance) en.nextElement();
      boolean missing = candidate.isMissing(attIndex);

      boolean selected;
      if (branch == -1) {
        // the "no branch" subset is exactly the instances with a missing split attribute
        selected = missing;
      } else if (missing) {
        selected = false;
      } else if (branch == 0) {
        selected = candidate.value(attIndex) < splitPoint;
      } else {
        selected = candidate.value(attIndex) >= splitPoint;
      }

      if (selected) {
        subset.addReference(candidate);
      }
    }
    return subset;
  }
  /**
   * Determines which branch of the split an instance falls into.
   *
   * @param inst the instance
   * @return -1 if the split attribute is missing for the instance, 0 if its value is below the
   *     split point, and 1 otherwise
   */
  public int branchInstanceGoesDown(Instance inst) {

    if (inst.isMissing(attIndex)) {
      return -1;
    }
    return (inst.value(attIndex) < splitPoint) ? 0 : 1;
  }
  /**
   * Re-estimates the attribute weights of {@code m_metric} from the current cluster assignments
   * and the pairwise constraints, then installs them via {@code m_metric.setWeights}.
   *
   * <p>If {@code clusterIdx} is -1, all instances are used (a single metric for all clusters is
   * used) and each instance is compared against the centroid of the cluster it is assigned to.
   * Otherwise only instances assigned to {@code clusterIdx} contribute, and they are compared
   * against the current {@code m_centroid} (presumably prepared by {@code Init} - confirm there).
   *
   * <p>For an attribute whose accumulated value is not positive, the previous weight is kept and
   * a message is printed to standard output.
   *
   * @param clusterIdx index of the cluster whose metric is trained, or -1 for all instances
   * @return always {@code true}
   * @throws Exception if one of the invoked operations fails
   */
  public boolean trainMetric(int clusterIdx) throws Exception {
    Init(clusterIdx);

    // per-attribute accumulator: instance-to-centroid terms plus constraint penalty terms
    double[] weights = new double[m_numAttributes];
    // number of instances that contributed to the accumulator
    int numInstances = 0;

    for (int instIdx = 0; instIdx < m_instances.numInstances(); instIdx++) {
      int assignment = m_clusterAssignments[instIdx];

      // only instances assigned to this cluster are of importance
      if (assignment == clusterIdx || clusterIdx == -1) {
        numInstances++;
        if (clusterIdx < 0) {
          // single-metric mode: use the centroid of the cluster this instance is assigned to
          m_centroid = m_kmeans.getClusterCentroids().instance(assignment);
        }

        // accumulate variance
        Instance instance = m_instances.instance(instIdx);
        Instance diffInstance = m_metric.createDiffInstance(instance, m_centroid);
        for (int attr = 0; attr < m_numAttributes; attr++) {
          weights[attr] += diffInstance.value(attr);
        }

        // check all constraints for this instance
        Object list = m_instanceConstraintMap.get(Integer.valueOf(instIdx));
        if (list != null) { // there are constraints associated with this instance
          ArrayList constraintList = (ArrayList) list;
          for (int i = 0; i < constraintList.size(); i++) {
            InstancePair pair = (InstancePair) constraintList.get(i);
            int linkType = pair.linkType;
            int firstIdx = pair.first;
            int secondIdx = pair.second;
            Instance instance1 = m_instances.instance(firstIdx);
            Instance instance2 = m_instances.instance(secondIdx);

            // cluster assignment of the other instance taking part in the constraint
            int otherIdx =
                (firstIdx == instIdx)
                    ? m_clusterAssignments[secondIdx]
                    : m_clusterAssignments[firstIdx];

            if (otherIdx != -1) { // check whether the constraint is violated
              if (otherIdx != assignment && linkType == InstancePair.MUST_LINK) {
                // violated must-link: the pair ended up in different clusters
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  weights[attr] += 0.5 * m_MLweight * diffInstance.value(attr);
                }
              } else if (otherIdx == assignment && linkType == InstancePair.CANNOT_LINK) {
                // violated cannot-link: the pair ended up in the same cluster
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  // this constraint will be counted twice, hence 0.5
                  weights[attr] += 0.5 * m_CLweight * m_maxCLDiffInstance.value(attr);
                  weights[attr] -= 0.5 * m_CLweight * diffInstance.value(attr);
                }
              }
            }
          }
        }
      }
    }

    // check the weights
    double[] newWeights = new double[m_numAttributes];
    double[] currentWeights = m_metric.getWeights();

    // NOTE(review): this flag is never set to true in this method, so the Newton-Raphson
    // fallback below is currently unreachable.
    boolean needNewtonRaphson = false;
    for (int attr = 0; attr < m_numAttributes; attr++) {
      if (weights[attr] <= 0) { // check to avoid divide by 0 - TODO!
        System.out.println(
            "Negative weight "
                + weights[attr]
                + " for clusterIdx="
                + clusterIdx
                + "; using prev value="
                + currentWeights[attr]);
        newWeights[attr] = currentWeights[attr];
      } else {
        if (m_regularize) { // solution of quadratic equation - TODO!
          // NOTE(review): this branch uses the total number of instances, whereas the
          // unregularized branch uses the count of contributing instances - confirm intent.
          int n = m_instances.numInstances();
          double ratio = (m_logTermWeight * n) / (2 * weights[attr]);
          newWeights[attr] =
              ratio + Math.sqrt(ratio * ratio + (m_regularizerTermWeight * n) / weights[attr]);
        } else {
          newWeights[attr] = m_logTermWeight * numInstances / weights[attr];
        }
      }
    }

    // do NR if needed
    if (needNewtonRaphson) {
      System.out.println("GOING TO NEWTON-RAPHSON!!!\n");
      newWeights = updateWeightsUsingNewtonRaphson(currentWeights, weights);
    }

    m_metric.setWeights(newWeights);
    return true;
  }
  /**
   * Evaluates a feature subset by cross validation.
   *
   * <p>A fresh {@code Evaluation} and a fresh hash table of all training instances (keyed on the
   * selected attributes) are built. If {@code m_CVFolds} is 1, leave-one-out evaluation is run;
   * otherwise the data is randomized, stratified and evaluated fold by fold.
   *
   * @param feature_set the subset to be evaluated
   * @param num_atts the number of attributes in the subset; it sizes the index array, so it must
   *     not be smaller than the number of bits set in {@code feature_set} below {@code
   *     m_numAttributes}
   * @return the merit of the subset according to {@code m_evaluationMeasure}: percent correct,
   *     the negated RMSE, the negated MAE, or the class-prior weighted AUC
   * @throws Exception if subset can't be evaluated
   */
  protected double estimatePerformance(BitSet feature_set, int num_atts) throws Exception {

    m_evaluation = new Evaluation(m_theInstances);
    int[] fs = new int[num_atts];
    double[] instA = new double[num_atts];
    int classI = m_theInstances.classIndex();

    // translate the bit set into an array of selected attribute indices
    int index = 0;
    for (int i = 0; i < m_numAttributes; i++) {
      if (feature_set.get(i)) {
        fs[index++] = i;
      }
    }

    // create new hash table
    m_entries = new Hashtable((int) (m_theInstances.numInstances() * 1.5));

    // insert instances into the hash table
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_theInstances.instance(i);
      fillSubsetKeyValues(inst, fs, classI, instA);
      insertIntoTable(inst, instA);
    }

    if (m_CVFolds == 1) {
      // calculate leave one out error
      for (int i = 0; i < m_numInstances; i++) {
        Instance inst = m_theInstances.instance(i);
        fillSubsetKeyValues(inst, fs, classI, instA);
        evaluateInstanceLeaveOneOut(inst, instA);
      }
    } else {
      m_theInstances.randomize(m_rr);
      m_theInstances.stratify(m_CVFolds);

      // calculate the m_CVFolds-fold cross validation error
      for (int fold = 0; fold < m_CVFolds; fold++) {
        Instances insts = m_theInstances.testCV(m_CVFolds, fold);
        evaluateFoldCV(insts, fs);
      }
    }

    switch (m_evaluationMeasure) {
      case EVAL_DEFAULT:
        if (m_classIsNominal) {
          return m_evaluation.pctCorrect();
        }
        return -m_evaluation.rootMeanSquaredError();
      case EVAL_ACCURACY:
        return m_evaluation.pctCorrect();
      case EVAL_RMSE:
        return -m_evaluation.rootMeanSquaredError();
      case EVAL_MAE:
        return -m_evaluation.meanAbsoluteError();
      case EVAL_AUC:
        // average the per-class AUC values, weighted by the normalized class priors
        double[] classPriors = m_evaluation.getClassPriors();
        Utils.normalize(classPriors);
        double weightedAUC = 0;
        for (int c = 0; c < m_theInstances.classAttribute().numValues(); c++) {
          double tempAUC = m_evaluation.areaUnderROC(c);
          if (!Utils.isMissingValue(tempAUC)) {
            weightedAUC += (classPriors[c] * tempAUC);
          } else {
            System.err.println("Undefined AUC!!");
          }
        }
        return weightedAUC;
    }
    // shouldn't get here
    return 0.0;
  }

  /**
   * Fills {@code target} with the hash-key values of {@code inst} for the selected attributes.
   * The class attribute and missing values are both encoded as {@code Double.MAX_VALUE}.
   *
   * @param inst the instance to read values from
   * @param fs indices of the selected attributes
   * @param classI index of the class attribute
   * @param target the array to fill; must hold at least {@code fs.length} entries
   */
  private void fillSubsetKeyValues(Instance inst, int[] fs, int classI, double[] target) {
    for (int j = 0; j < fs.length; j++) {
      if (fs[j] == classI || inst.isMissing(fs[j])) {
        target[j] = Double.MAX_VALUE;
      } else {
        target[j] = inst.value(fs[j]);
      }
    }
  }
  /**
   * Evaluates one test fold for the internal cross validation of feature sets.
   *
   * <p>The fold's instances are first subtracted from their entries in {@code m_entries} (and,
   * for a nominal class, from {@code m_classPriorCounts}). They are then classified from the
   * reduced counts, with each prediction recorded in {@code m_evaluation}, and finally added
   * back so that the table is restored.
   *
   * @param fold set of instances to be "left out" and classified
   * @param fs currently selected feature set
   * @return always 0.0; the results of the fold are accumulated in {@code m_evaluation}
   * @throws Exception if something goes wrong
   */
  double evaluateFoldCV(Instances fold, int[] fs) throws Exception {

    int numFold = fold.numInstances();
    int numCl = m_theInstances.classAttribute().numValues();
    int classI = m_theInstances.classIndex();

    // class_distribs[i] aliases the table entry of fold instance i (no copy is made), so the
    // in-place updates below modify the table itself
    double[][] class_distribs = new double[numFold][];
    double[] instA = new double[fs.length];
    double[] normDist = m_classIsNominal ? new double[numCl] : new double[2];

    // first *remove* instances
    for (int i = 0; i < numFold; i++) {
      Instance inst = fold.instance(i);
      for (int j = 0; j < fs.length; j++) {
        if (fs[j] == classI || inst.isMissing(fs[j])) {
          instA[j] = Double.MAX_VALUE; // missing, or masked because it is the class
        } else {
          instA[j] = inst.value(fs[j]);
        }
      }
      DecisionTableHashKey thekey = new DecisionTableHashKey(instA);
      double[] entry = (double[]) m_entries.get(thekey);
      if (entry == null) {
        throw new Error("This should never happen!");
      }
      class_distribs[i] = entry;

      if (m_classIsNominal) {
        entry[(int) inst.classValue()] -= inst.weight();
        // Class prior counts are indexed by class label, so they only make sense for a
        // nominal class; a numeric class value must not be used as an array index.
        m_classPriorCounts[(int) inst.classValue()] -= inst.weight();
      } else {
        entry[0] -= (inst.classValue() * inst.weight());
        entry[1] -= inst.weight();
      }
    }

    // the priors are only consulted when classifying a nominal class
    double[] classPriors = null;
    if (m_classIsNominal) {
      classPriors = m_classPriorCounts.clone();
      Utils.normalize(classPriors);
    }

    // now classify instances
    for (int i = 0; i < numFold; i++) {
      Instance inst = fold.instance(i);
      System.arraycopy(class_distribs[i], 0, normDist, 0, normDist.length);
      if (m_classIsNominal) {
        boolean ok = false;
        for (int j = 0; j < normDist.length; j++) {
          if (Utils.gr(normDist[j], 1.0)) {
            ok = true;
            break;
          }
        }

        if (!ok) { // majority class
          normDist = classPriors.clone();
        }

        Utils.normalize(normDist);
        if (m_evaluationMeasure == EVAL_AUC) {
          m_evaluation.evaluateModelOnceAndRecordPrediction(normDist, inst);
        } else {
          m_evaluation.evaluateModelOnce(normDist, inst);
        }
      } else {
        // numeric class: predict the weighted mean stored in the entry, or fall back to
        // m_majority when no weight is left in it
        double[] prediction = new double[1];
        prediction[0] = Utils.eq(normDist[1], 0.0) ? m_majority : normDist[0] / normDist[1];
        m_evaluation.evaluateModelOnce(prediction, inst);
      }
    }

    // now re-insert instances
    for (int i = 0; i < numFold; i++) {
      Instance inst = fold.instance(i);

      if (m_classIsNominal) {
        m_classPriorCounts[(int) inst.classValue()] += inst.weight();
        class_distribs[i][(int) inst.classValue()] += inst.weight();
      } else {
        class_distribs[i][0] += (inst.classValue() * inst.weight());
        class_distribs[i][1] += inst.weight();
      }
    }
    return 0.0;
  }