/**
 * Draw an instance out of a cluster.
 *
 * @param instIdx index of the instance to be drawn out
 * @param t index of the cluster to which the instance previously belonged
 * @param T the current working partition
 * @param input the input statistics
 */
private void reduce_x(int instIdx, int t, Partition T, Input input) {
  // Update the prior probability of the cluster
  ArrayList<Integer> indices = T.find(t);
  double sum = 0.0;
  for (int i = 0; i < indices.size(); i++) {
    if (indices.get(i) == instIdx) {
      continue;
    }
    sum += input.Px[indices.get(i)];
  }
  T.Pt[t] = sum;

  if (T.Pt[t] < 0) {
    System.out.format("Warning: probability < 0 (%s)\n", T.Pt[t]);
    T.Pt[t] = 0;
  }

  // Update the probability of each attribute in the cluster
  double[][] mArray = input.Pyx.getArray();
  for (int i = 0; i < m_numAttributes; i++) {
    sum = 0.0;
    for (int j = 0; j < indices.size(); j++) {
      if (indices.get(j) == instIdx) {
        continue;
      }
      sum += mArray[i][indices.get(j)];
    }
    T.Py_t.set(i, t, sum / T.Pt[t]);
  }
}
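// Note: the two loops in reduce_x above recompute the statistics of cluster t
// from scratch with instance x left out,
//
//   p(t)   = sum of p(x') over the remaining members x' of t
//   p(y|t) = ( sum of p(y, x') over the remaining members x' of t ) / p(t)
//
// where p(x') is read from input.Px and the joint p(y, x') from input.Pyx.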
/**
 * Process the input and compute the statistics of the training data.
 *
 * @return an Input object which holds the statistics about the training data
 */
private Input sIB_ProcessInput() {
  double valSum = 0.0;
  for (int i = 0; i < m_numInstances; i++) {
    valSum = 0.0;
    for (int v = 0; v < m_data.instance(i).numValues(); v++) {
      valSum += m_data.instance(i).valueSparse(v);
    }
    if (valSum <= 0) {
      if (m_verbose) {
        System.out.format("Instance %s sum of value = %s <= 0, removed.\n", i, valSum);
      }
      m_data.delete(i);
      m_numInstances--;
      i--; // re-check the instance that shifted into this position
    }
  }

  // get the term-document matrix
  Input input = new Input();
  input.Py_x = getTransposedNormedMatrix(m_data);
  if (m_uniformPrior) {
    input.Pyx = input.Py_x.copy();
    normalizePrior(m_data);
  } else {
    input.Pyx = getTransposedMatrix(m_data);
  }
  input.sumVals = getTotalSum(m_data);
  input.Pyx.timesEquals(1 / input.sumVals);

  // prior probability of documents, i.e. sum the columns of the Pyx matrix
  input.Px = new double[m_numInstances];
  for (int i = 0; i < m_numInstances; i++) {
    for (int j = 0; j < m_numAttributes; j++) {
      input.Px[i] += input.Pyx.get(j, i);
    }
  }

  // prior probability of terms, i.e. sum the rows of the Pyx matrix
  input.Py = new double[m_numAttributes];
  for (int i = 0; i < input.Pyx.getRowDimension(); i++) {
    for (int j = 0; j < input.Pyx.getColumnDimension(); j++) {
      input.Py[i] += input.Pyx.get(i, j);
    }
  }

  MI(input.Pyx, input);
  return input;
}
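// Note: after the division by sumVals, input.Pyx.get(y, x) holds the empirical
// joint probability p(y, x) of term y and document x; input.Px and input.Py are
// its column and row sums, i.e. the document and term marginals. The final call
// to MI(...) presumably derives the mutual information of this joint
// distribution from these statistics.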
/**
 * Initialize the partition.
 *
 * @param input object holding the statistics of the training data
 * @return the initialized partition
 */
private Partition sIB_InitT(Input input) {
  Partition T = new Partition();
  int avgSize = (int) Math.ceil((double) m_numInstances / m_numCluster);

  // generate a random permutation of the instance indices
  ArrayList<Integer> permInstsIdx = new ArrayList<Integer>();
  ArrayList<Integer> unassigned = new ArrayList<Integer>();
  for (int i = 0; i < m_numInstances; i++) {
    unassigned.add(i);
  }
  while (unassigned.size() != 0) {
    int t = random.nextInt(unassigned.size());
    permInstsIdx.add(unassigned.get(t));
    unassigned.remove(t);
  }

  // assign the permuted instances to clusters in consecutive blocks
  for (int i = 0; i < m_numCluster; i++) {
    int r2 = avgSize > permInstsIdx.size() ? permInstsIdx.size() : avgSize;
    for (int j = 0; j < r2; j++) {
      T.Pt_x[permInstsIdx.get(j)] = i;
    }
    for (int j = 0; j < r2; j++) {
      permInstsIdx.remove(0);
    }
  }

  // initialize the prior probability of each cluster, and the probability
  // of each attribute within the cluster
  for (int i = 0; i < m_numCluster; i++) {
    ArrayList<Integer> indices = T.find(i);
    for (int j = 0; j < indices.size(); j++) {
      T.Pt[i] += input.Px[indices.get(j)];
    }
    double[][] mArray = input.Pyx.getArray();
    for (int j = 0; j < m_numAttributes; j++) {
      double sum = 0.0;
      for (int k = 0; k < indices.size(); k++) {
        sum += mArray[j][indices.get(k)];
      }
      sum /= T.Pt[i];
      T.Py_t.set(j, i, sum);
    }
  }

  if (m_verbose) {
    System.out.println("Initializing...");
  }
  return T;
}
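// Note: sIB_InitT produces a random, roughly balanced initial partition. The
// instances are randomly permuted and assigned to the m_numCluster clusters in
// consecutive blocks of at most avgSize = ceil(m_numInstances / m_numCluster)
// members each. The cluster priors Pt and the per-cluster attribute
// distributions Py_t are then accumulated from input.Px and input.Pyx in the
// same way reduce_x recomputes them, only over all members of each cluster.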