Пример #1
0
  /**
   * Moves the current solution to a random neighbor, i.e. a solution that differs from it in
   * exactly one medoid, and returns the cost of that neighbor.
   *
   * <p>A medoid slot is chosen at random and overwritten in place with a randomly drawn element
   * of {@code data} that is not reference-equal to any of the first {@code k} entries of
   * {@code medoids}. The draw is repeated until such an element turns up, so {@code data} must
   * contain at least one object that is not currently a medoid, otherwise this method never
   * returns.
   *
   * <p>Afterwards {@code y} and {@code d} are updated in place: an object moves to the new
   * medoid when that medoid is strictly closer than its recorded distance; an object that was
   * attached to the replaced slot and is not closer to the new medoid is re-matched against all
   * medoids.
   *
   * @param data the objects being clustered.
   * @param medoids the current medoids; one entry is replaced in place.
   * @param y for each object, the index of the medoid it is attached to; updated in place.
   * @param d for each object, the distance to the medoid it is attached to; updated in place.
   * @return the sum of the updated entries of {@code d}.
   */
  private double getRandomNeighbor(T[] data, T[] medoids, int[] y, double[] d) {
    final int n = data.length;

    // Slot to be replaced. It is drawn before any candidate so that the sequence of
    // random numbers consumed stays the same.
    final int index = Math.randomInt(k);

    // Rejection sampling: keep drawing until the candidate is not already a medoid.
    T medoid;
    boolean dup;
    do {
      medoid = data[Math.randomInt(n)];
      dup = false;
      for (int i = 0; i < k && !dup; i++) {
        dup = (medoid == medoids[i]);
      }
    } while (dup);

    medoids[index] = medoid;

    for (int i = 0; i < n; i++) {
      final double toNew = distance.d(data[i], medoid);

      if (toNew < d[i]) {
        // The new medoid beats whatever this object was attached to.
        d[i] = toNew;
        y[i] = index;
        continue;
      }

      if (y[i] != index) {
        // Attached to an untouched medoid that is at least as close: nothing changes.
        continue;
      }

      // The object was attached to the replaced slot. Start from the new occupant of that
      // slot (y[i] already equals index) and look for a strictly closer medoid elsewhere.
      d[i] = toNew;
      for (int j = 0; j < k; j++) {
        if (j == index) {
          continue;
        }
        final double toOther = distance.d(data[i], medoids[j]);
        if (toOther < d[i]) {
          d[i] = toOther;
          y[i] = j;
        }
      }
    }

    return Math.sum(d);
  }
Пример #2
0
    /**
     * Searches attribute j for the split with the largest impurity reduction at the current node.
     *
     * <p>Nominal attribute: every attribute value is tried in turn, with the true side holding
     * the samples whose value equals it. Numeric attribute: samples are visited in the order
     * given by {@code order[j]} and a cutoff halfway between the previous and the current value
     * is tried whenever both the attribute value and the class label differ from the previously
     * visited sample; the true side holds the samples visited so far. In both cases a candidate
     * is ignored when either side would hold fewer than {@code nodeSize} samples, and a candidate
     * only replaces the current best when its gain is strictly greater than the best node's
     * {@code splitScore}.
     *
     * @param n the number instances in this node.
     * @param count the sample count in each class.
     * @param falseCount an array to store sample count in each class for false child node. It is
     *     scratch space: on return it holds the counts of the last candidate that was evaluated,
     *     which is not necessarily the best one.
     * @param impurity the impurity of this node.
     * @param j the attribute to split on.
     * @return a newly created node describing the best split found; when no candidate qualified,
     *     the node is returned exactly as its constructor left it.
     * @throws IllegalStateException if attribute j is neither nominal nor numeric.
     */
    public Node findBestSplit(int n, int[] count, int[] falseCount, double impurity, int j) {
      Node best = new Node();

      if (attributes[j].getType() == Attribute.Type.NOMINAL) {
        int levels = ((NominalAttribute) attributes[j]).size();

        // trueCount[v][c]: sample count with attribute value v and class c.
        int[][] trueCount = new int[levels][k];
        for (int i = 0; i < x.length; i++) {
          if (samples[i] > 0) {
            int[] row = trueCount[(int) x[i][j]];
            row[y[i]] += samples[i];
          }
        }

        for (int level = 0; level < levels; level++) {
          int[] inTrue = trueCount[level];
          int tc = Math.sum(inTrue);
          int fc = n - tc;

          // Ignore this value if either child would hold fewer than nodeSize samples.
          boolean tooSmall = tc < nodeSize || fc < nodeSize;
          if (!tooSmall) {
            for (int c = 0; c < k; c++) {
              falseCount[c] = count[c] - inTrue[c];
            }

            int trueLabel = Math.whichMax(inTrue);
            int falseLabel = Math.whichMax(falseCount);

            double trueShare = (double) tc / n;
            double falseShare = (double) fc / n;
            double gain =
                impurity
                    - trueShare * impurity(inTrue, tc)
                    - falseShare * impurity(falseCount, fc);

            if (gain > best.splitScore) {
              // Strictly better than anything seen so far.
              best.splitFeature = j;
              best.splitValue = level;
              best.splitScore = gain;
              best.trueChildOutput = trueLabel;
              best.falseChildOutput = falseLabel;
            }
          }
        }
      } else if (attributes[j].getType() == Attribute.Type.NUMERIC) {
        // Class counts of the samples visited so far, i.e. the true side of a cutoff
        // placed just before the current sample.
        int[] trueCount = new int[k];
        double prevx = Double.NaN;
        int prevy = -1;

        // NOTE(review): order[j] presumably lists sample indices sorted by attribute j - confirm.
        for (int i : order[j]) {
          if (samples[i] > 0) {
            double value = x[i][j];
            int label = y[i];

            // A cutoff is only considered once a previous sample exists and both the
            // attribute value and the class label differ from it.
            boolean candidate = !Double.isNaN(prevx) && value != prevx && label != prevy;
            if (candidate) {
              int tc = Math.sum(trueCount);
              int fc = n - tc;

              // Ignore this cutoff if either child would hold fewer than nodeSize samples.
              boolean tooSmall = tc < nodeSize || fc < nodeSize;
              if (!tooSmall) {
                for (int c = 0; c < k; c++) {
                  falseCount[c] = count[c] - trueCount[c];
                }

                int trueLabel = Math.whichMax(trueCount);
                int falseLabel = Math.whichMax(falseCount);

                double trueShare = (double) tc / n;
                double falseShare = (double) fc / n;
                double gain =
                    impurity
                        - trueShare * impurity(trueCount, tc)
                        - falseShare * impurity(falseCount, fc);

                if (gain > best.splitScore) {
                  // Strictly better than anything seen so far.
                  best.splitFeature = j;
                  best.splitValue = (value + prevx) / 2;
                  best.splitScore = gain;
                  best.trueChildOutput = trueLabel;
                  best.falseChildOutput = falseLabel;
                }
              }
            }

            // Whether or not a cutoff was evaluated, the current sample now joins the true side.
            prevx = value;
            prevy = label;
            trueCount[label] += samples[i];
          }
        }
      } else {
        throw new IllegalStateException("Unsupported attribute type: " + attributes[j].getType());
      }

      return best;
    }