Example #1
File: FLD.java Project: grue/smile
  /**
   * Constructor. Learns Fisher's linear discriminant.
   *
   * @param x training instances.
   * @param y training labels in [0, k), where k is the number of classes.
   * @param L the dimensionality of mapped space.
   * @param tol a tolerance to decide if a covariance matrix is singular; it will reject variables
   *     whose variance is less than tol<sup>2</sup>.
   */
  public FLD(double[][] x, int[] y, int L, double tol) {
    if (x.length != y.length) {
      throw new IllegalArgumentException(
          String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
    }

    // class label set.
    int[] labels = Math.unique(y);
    Arrays.sort(labels);

    for (int i = 0; i < labels.length; i++) {
      if (labels[i] < 0) {
        throw new IllegalArgumentException("Negative class label: " + labels[i]);
      }

      if (i > 0 && labels[i] - labels[i - 1] > 1) {
        throw new IllegalArgumentException("Missing class: " + labels[i] + 1);
      }
    }

    k = labels.length;
    if (k < 2) {
      throw new IllegalArgumentException("Only one class.");
    }

    if (tol < 0.0) {
      throw new IllegalArgumentException("Invalid tol: " + tol);
    }

    if (x.length <= k) {
      throw new IllegalArgumentException(
          String.format("Sample size is too small: %d <= %d", x.length, k));
    }

    if (L >= k) {
      throw new IllegalArgumentException(
          String.format("The dimensionality of mapped space is too high: %d >= %d", L, k));
    }

    if (L <= 0) {
      L = k - 1;
    }

    final int n = x.length;
    p = x[0].length;

    // The number of instances in each class.
    int[] ni = new int[k];
    // Common mean vector.
    mean = Math.colMean(x);
    // Common covariance.
    double[][] T = new double[p][p];
    // Class mean vectors (centered at the common mean by the loops below).
    mu = new double[k][p];

    for (int i = 0; i < n; i++) {
      int c = y[i];
      ni[c]++;
      for (int j = 0; j < p; j++) {
        mu[c][j] += x[i][j];
      }
    }

    for (int i = 0; i < k; i++) {
      for (int j = 0; j < p; j++) {
        mu[i][j] = mu[i][j] / ni[i] - mean[j];
      }
    }

    for (int i = 0; i < n; i++) {
      for (int j = 0; j < p; j++) {
        for (int l = 0; l <= j; l++) {
          T[j][l] += (x[i][j] - mean[j]) * (x[i][l] - mean[l]);
        }
      }
    }

    for (int j = 0; j < p; j++) {
      for (int l = 0; l <= j; l++) {
        T[j][l] /= n;
        T[l][j] = T[j][l];
      }
    }

    // Between class scatter
    double[][] B = new double[p][p];
    for (int i = 0; i < k; i++) {
      for (int j = 0; j < p; j++) {
        for (int l = 0; l <= j; l++) {
          B[j][l] += mu[i][j] * mu[i][l];
        }
      }
    }

    for (int j = 0; j < p; j++) {
      for (int l = 0; l <= j; l++) {
        B[j][l] /= k;
        B[l][j] = B[j][l];
      }
    }

    // Fisher's criterion leads to the generalized eigenproblem B w = lambda T w.
    // Solve it by forming inv(T) B via the symmetric eigendecomposition T = U diag(lambda) U'.
    EigenValueDecomposition eigen = EigenValueDecomposition.decompose(T, true);

    tol = tol * tol;
    double[] s = eigen.getEigenValues();
    for (int i = 0; i < s.length; i++) {
      if (s[i] < tol) {
        throw new IllegalArgumentException("The covariance matrix is close to singular.");
      }

      s[i] = 1.0 / s[i];
    }

    double[][] U = eigen.getEigenVectors();
    double[][] UB = Math.atbmm(U, B);

    for (int i = 0; i < k; i++) {
      for (int j = 0; j < p; j++) {
        UB[i][j] *= s[j];
      }
    }

    Math.abmm(U, UB, B);
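    // B now holds inv(T) * B; its leading eigenvectors span the discriminant subspace.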

    eigen = EigenValueDecomposition.decompose(B, true);

    U = eigen.getEigenVectors();
    scaling = new double[p][L];
    for (int i = 0; i < p; i++) {
      System.arraycopy(U[i], 0, scaling[i], 0, L);
    }

    smean = new double[L];
    Math.atx(scaling, mean, smean);
    smu = Math.abmm(mu, scaling);
  }
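
A minimal usage sketch for the constructor above (a hypothetical caller, not part of the source; the toy data is invented, and fld.predict is assumed to behave as in smile's Classifier interface):

  // Two well-separated classes in 2D: k = 2 classes, p = 2 variables.
  double[][] x = {
    {0.0, 0.1}, {0.2, -0.1}, {0.1, 0.0},  // class 0
    {3.0, 3.1}, {3.2, 2.9}, {2.9, 3.0}    // class 1
  };
  int[] y = {0, 0, 0, 1, 1, 1};

  // Passing L <= 0 falls back to L = k - 1 = 1; tol = 1E-4 rejects
  // variables whose variance is below 1E-8.
  FLD fld = new FLD(x, y, -1, 1E-4);
  int label = fld.predict(new double[] {2.8, 3.0});  // should map near class 1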
Example #2
  /**
   * Constructor. Learns a classification tree for AdaBoost and Random Forest.
   *
   * @param attributes the attribute properties.
   * @param x the training instances.
   * @param y the response variable.
   * @param maxNodes the maximum number of leaf nodes in the tree.
   * @param nodeSize the minimum size of leaf nodes.
   * @param mtry the number of input variables to pick to split on at each node. Typically, sqrt(p)
   *     gives good performance, where p is the number of variables.
   * @param rule the splitting rule.
   * @param samples the sample set of instances for stochastic learning. samples[i] is the number of
   *     times instance i is sampled.
   * @param order the index of training values in ascending order. Note that only numeric attributes
   *     need to be sorted.
   */
  public DecisionTree(
      Attribute[] attributes,
      double[][] x,
      int[] y,
      int maxNodes,
      int nodeSize,
      int mtry,
      SplitRule rule,
      int[] samples,
      int[][] order) {
    if (x.length != y.length) {
      throw new IllegalArgumentException(
          String.format("The sizes of X and Y don't match: %d != %d", x.length, y.length));
    }

    if (mtry < 1 || mtry > x[0].length) {
      throw new IllegalArgumentException(
          "Invalid number of variables to split on at a node of the tree: " + mtry);
    }

    if (maxNodes < 2) {
      throw new IllegalArgumentException("Invalid maximum leaves: " + maxNodes);
    }

    if (nodeSize < 1) {
      throw new IllegalArgumentException("Invalid minimum size of leaf nodes: " + nodeSize);
    }

    // class label set.
    int[] labels = Math.unique(y);
    Arrays.sort(labels);

    for (int i = 0; i < labels.length; i++) {
      if (labels[i] < 0) {
        throw new IllegalArgumentException("Negative class label: " + labels[i]);
      }

      if (i > 0 && labels[i] - labels[i - 1] > 1) {
        throw new IllegalArgumentException("Missing class: " + labels[i] + 1);
      }
    }

    k = labels.length;
    if (k < 2) {
      throw new IllegalArgumentException("Only one class.");
    }

    if (attributes == null) {
      int p = x[0].length;
      attributes = new Attribute[p];
      for (int i = 0; i < p; i++) {
        attributes[i] = new NumericAttribute("V" + (i + 1));
      }
    }

    this.attributes = attributes;
    this.mtry = mtry;
    this.nodeSize = nodeSize;
    this.maxNodes = maxNodes;
    this.rule = rule;
    importance = new double[attributes.length];

    if (order != null) {
      this.order = order;
    } else {
      int n = x.length;
      int p = x[0].length;

      double[] a = new double[n];
      this.order = new int[p][];

      // Pre-sort each numeric column once; split search can then scan values in order.
      for (int j = 0; j < p; j++) {
        if (attributes[j] instanceof NumericAttribute) {
          for (int i = 0; i < n; i++) {
            a[i] = x[i][j];
          }
          this.order[j] = QuickSort.sort(a);
        }
      }
    }

    // Priority queue for best-first tree growing.
    PriorityQueue<TrainNode> nextSplits = new PriorityQueue<>();

    int n = y.length;
    int[] count = new int[k];
    if (samples == null) {
      samples = new int[n];
      for (int i = 0; i < n; i++) {
        samples[i] = 1;
        count[y[i]]++;
      }
    } else {
      for (int i = 0; i < n; i++) {
        count[y[i]] += samples[i];
      }
    }

    double[] posteriori = new double[k];
    for (int i = 0; i < k; i++) {
      posteriori[i] = (double) count[i] / n;
    }
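    // The root initially predicts the majority (weighted) class with these priors.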
    root = new Node(Math.whichMax(count), posteriori);

    TrainNode trainRoot = new TrainNode(root, x, y, samples);
    // Now add splits to the tree until max tree size is reached
    if (trainRoot.findBestSplit()) {
      nextSplits.add(trainRoot);
    }

    // Pop best leaf from priority queue, split it, and push
    // children nodes into the queue if possible.
    for (int leaves = 1; leaves < this.maxNodes; leaves++) {
      // parent is the leaf to split
      TrainNode node = nextSplits.poll();
      if (node == null) {
        break;
      }

      node.split(nextSplits); // Split the parent node into two children nodes
    }
  }
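
A hedged usage sketch for the constructor above (a hypothetical caller, not part of the source; the toy data is invented, and SplitRule.GINI is assumed to be one of the enum's constants). Passing null for attributes, samples, and order exercises the defaults built in the constructor:

  double[][] x = {
    {1.0, 5.0}, {1.2, 4.8}, {0.9, 5.1},  // class 0
    {4.0, 1.0}, {4.1, 0.9}, {3.9, 1.2}   // class 1
  };
  int[] y = {0, 0, 0, 1, 1, 1};

  // At most 4 leaves, at least 1 instance per leaf, and try both
  // variables (mtry = 2) at each split.
  DecisionTree tree =
      new DecisionTree(null, x, y, 4, 1, 2, DecisionTree.SplitRule.GINI, null, null);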