예제 #1
0
  /**
   * Draws an instance out of a cluster and refreshes that cluster's statistics.
   *
   * <p>The cluster prior {@code T.Pt[t]} and column {@code t} of {@code T.Py_t} are
   * recomputed from every member of the cluster except {@code instIdx}.
   *
   * @param instIdx index of the instance to be drawn out
   * @param t index of the cluster which the instance previously belonged to
   * @param T the current working partition
   * @param input the input statistics
   */
  private void reduce_x(int instIdx, int t, Partition T, Input input) {
    ArrayList<Integer> members = T.find(t);

    // New cluster prior: total prior mass of the members that stay behind.
    double prior = 0.0;
    for (int member : members) {
      if (member != instIdx) {
        prior += input.Px[member];
      }
    }
    T.Pt[t] = prior;

    // A negative prior is reported and clamped to zero.
    if (T.Pt[t] < 0) {
      System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
      T.Pt[t] = 0;
    }

    // Per-attribute probability within the cluster: the remaining members' entries
    // of the Pyx row, divided by the new cluster prior.
    // NOTE(review): if the prior is 0 the division yields NaN/Infinity; presumably
    // callers never draw the last instance out of a cluster - confirm.
    double[][] joint = input.Pyx.getArray();
    for (int attr = 0; attr < m_numAttributes; attr++) {
      double mass = 0.0;
      for (int member : members) {
        if (member != instIdx) {
          mass += joint[attr][member];
        }
      }
      T.Py_t.set(attr, t, mass / T.Pt[t]);
    }
  }
예제 #2
0
  /**
   * Process the input and compute the statistics of the training data.
   *
   * <p>Instances whose values sum to zero or less are first deleted from
   * {@code m_data} (and {@code m_numInstances} is reduced accordingly). The
   * remaining data is then turned into the matrices and prior vectors stored in
   * the returned {@code Input} object.
   *
   * @return an Input object which holds the statistics about the training data
   */
  private Input sIB_ProcessInput() {
    // Remove instances whose values sum to <= 0. The index only advances when the
    // current instance is kept: after a deletion the next instance moves into the
    // same position and must be examined too, otherwise the second of two
    // consecutive offending instances would be skipped.
    int inst = 0;
    while (inst < m_numInstances) {
      double valSum = 0.0;
      for (int v = 0; v < m_data.instance(inst).numValues(); v++) {
        valSum += m_data.instance(inst).valueSparse(v);
      }
      if (valSum <= 0) {
        if (m_verbose) {
          System.out.format("Instance %s sum of value = %s <= 0, removed.\n", inst, valSum);
        }
        m_data.delete(inst);
        m_numInstances--;
      } else {
        inst++;
      }
    }

    // get the term-document matrix
    Input input = new Input();
    input.Py_x = getTransposedNormedMatrix(m_data);
    if (m_uniformPrior) {
      // With a uniform prior the joint matrix starts from the normalized matrix;
      // presumably normalizePrior rescales m_data so the total sum below matches.
      input.Pyx = input.Py_x.copy();
      normalizePrior(m_data);
    } else {
      input.Pyx = getTransposedMatrix(m_data);
    }
    // Scale the matrix by the reciprocal of the data's total sum.
    input.sumVals = getTotalSum(m_data);
    input.Pyx.timesEquals(1 / input.sumVals);

    // prior probability of documents, ie. sum the columns from the Pyx matrix
    input.Px = new double[m_numInstances];
    for (int i = 0; i < m_numInstances; i++) {
      for (int j = 0; j < m_numAttributes; j++) {
        input.Px[i] += input.Pyx.get(j, i);
      }
    }

    // prior probability of terms, ie. sum the rows from the Pyx matrix
    input.Py = new double[m_numAttributes];
    for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
      for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
        input.Py[i] += input.Pyx.get(i, j);
      }
    }

    // Presumably computes the mutual information of Pyx and stores it on input.
    MI(input.Pyx, input);
    return input;
  }
예제 #3
0
  /**
   * Builds the initial partition: the instances are put into a random order,
   * dealt out to the clusters in consecutive chunks of (at most) the average
   * cluster size, and each cluster's statistics are then computed.
   *
   * @param input object holding the statistics of the training data
   * @return the initialized partition
   */
  private Partition sIB_InitT(Input input) {
    Partition T = new Partition();

    // Random ordering of the instance indices: repeatedly pull a random entry out
    // of the pool of indices that have not been drawn yet.
    ArrayList<Integer> pool = new ArrayList<Integer>();
    for (int idx = 0; idx < m_numInstances; idx++) {
      pool.add(idx);
    }
    ArrayList<Integer> shuffled = new ArrayList<Integer>();
    while (!pool.isEmpty()) {
      int pick = random.nextInt(pool.size());
      shuffled.add(pool.remove(pick));
    }

    // Hand out consecutive chunks of the shuffled order, one chunk per cluster;
    // the later clusters receive whatever is left once the supply runs short.
    int chunk = (int) Math.ceil((double) m_numInstances / m_numCluster);
    int cursor = 0;
    for (int cluster = 0; cluster < m_numCluster; cluster++) {
      int end = cursor + Math.min(chunk, shuffled.size() - cursor);
      while (cursor < end) {
        T.Pt_x[shuffled.get(cursor)] = cluster;
        cursor++;
      }
    }

    // Prior of each cluster (sum of its members' priors) and, per attribute, the
    // members' summed Pyx entries divided by that cluster prior.
    for (int cluster = 0; cluster < m_numCluster; cluster++) {
      ArrayList<Integer> members = T.find(cluster);
      for (int member : members) {
        T.Pt[cluster] += input.Px[member];
      }
      double[][] joint = input.Pyx.getArray();
      for (int attr = 0; attr < m_numAttributes; attr++) {
        double mass = 0.0;
        for (int member : members) {
          mass += joint[attr][member];
        }
        T.Py_t.set(attr, cluster, mass / T.Pt[cluster]);
      }
    }

    if (m_verbose) {
      System.out.println("Initializing...");
    }
    return T;
  }