/**
 * Calculates the centroid pivot of a node based on the list of points that it
 * contains (the two lists of its children are provided).
 *
 * @param list1 The point index list of the first child.
 * @param list2 The point index list of the second child.
 * @param insts The insts object on which the tree is being built (for header
 *          information).
 * @return The centroid pivot of the node.
 */
public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
  int classIdx = m_Instances.classIndex();
  double[] attrVals = new double[insts.numAttributes()];

  Instance temp;
  for (int i = 0; i < list1.length(); i++) {
    temp = insts.instance(((ListNode) list1.get(i)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) {
        continue;
      }
      // valueSparse(k) belongs to attribute temp.index(k), not attribute k
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0; j < list2.length(); j++) {
    temp = insts.instance(((ListNode) list2.get(j)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) {
        continue;
      }
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0, numInsts = list1.length() + list2.length(); j < attrVals.length; j++) {
    attrVals[j] /= numInsts;
  }
  return new DenseInstance(1.0, attrVals);
}
/**
 * Computes the mutual information (MI) between instances and attributes.
 *
 * @param m the term-document matrix
 * @param input object that describes the statistics about the training data
 */
private void MI(Matrix m, Input input) {
  int minDimSize = m.getColumnDimension() < m.getRowDimension()
    ? m.getColumnDimension() : m.getRowDimension();
  if (minDimSize < 2) {
    System.err.println("Warning : This is not a JOINT distribution");
    input.Hx = Entropy(m);
    input.Hy = 0;
    input.Ixy = 0;
    return;
  }

  input.Hx = Entropy(input.Px);
  input.Hy = Entropy(input.Py);

  // I(X;Y) = H(X) + H(Y) - H(X,Y): the loop adds sum p*log(p) over the
  // non-zero joint entries, i.e. it subtracts the joint entropy
  double entropy = input.Hx + input.Hy;
  for (int i = 0; i < m_numInstances; i++) {
    Instance inst = m_data.instance(i);
    for (int v = 0; v < inst.numValues(); v++) {
      double tmp = m.get(inst.index(v), i);
      if (tmp <= 0) {
        continue;
      }
      entropy += tmp * Math.log(tmp);
    }
  }
  input.Ixy = entropy;
  if (m_verbose) {
    System.out.println("Ixy = " + input.Ixy);
  }
}
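// Illustrative sketch (not from the original source): MI() above relies on
// the identity I(X;Y) = H(X) + H(Y) - H(X,Y). A minimal standalone demo of
// the same identity on plain arrays; all names here are hypothetical, and
// the joint distribution must sum to 1.
public class MIDemo {

  static double entropy(double[] p) {
    double h = 0;
    for (double v : p) {
      if (v > 0) {
        h -= v * Math.log(v);
      }
    }
    return h;
  }

  public static void main(String[] args) {
    double[][] pxy = { { 0.25, 0.25 }, { 0.25, 0.25 } }; // independent joint
    double[] px = { 0.5, 0.5 };
    double[] py = { 0.5, 0.5 };
    double mi = entropy(px) + entropy(py);
    for (double[] row : pxy) {
      for (double v : row) {
        if (v > 0) {
          mi += v * Math.log(v); // adds -H(X,Y)
        }
      }
    }
    System.out.println("I(X;Y) = " + mi); // prints 0.0 for an independent joint
  }
}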
/**
 * Inserts an instance into this cluster feature vector, updating the point
 * count, the timestamp sums, and the per-attribute linear and squared sums.
 *
 * @param instance the instance to insert
 * @param timestamp the time at which the instance arrives
 */
public void insert(Instance instance, long timestamp) {
  N++;
  LST += timestamp;
  SST += timestamp * timestamp;
  // value(i) takes an attribute index, so iterate over all attributes
  // (numValues() only counts the stored values of a sparse instance)
  for (int i = 0; i < instance.numAttributes(); i++) {
    LS[i] += instance.value(i);
    SS[i] += instance.value(i) * instance.value(i);
  }
}
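// Illustrative sketch: the point of maintaining N, LS and SS in insert() is
// that the cluster's center and a variance-based radius can be recovered from
// them at any time. Hypothetical helpers (not from the original class)
// showing the standard cluster-feature derivation:
public double[] center(double[] LS, double N) {
  double[] c = new double[LS.length];
  for (int i = 0; i < LS.length; i++) {
    c[i] = LS[i] / N; // mean of attribute i
  }
  return c;
}

public double radius(double[] LS, double[] SS, double N) {
  double sum = 0;
  for (int i = 0; i < LS.length; i++) {
    double mean = LS[i] / N;
    sum += SS[i] / N - mean * mean; // variance of attribute i
  }
  return Math.sqrt(Math.max(sum, 0)); // guard against negative rounding error
}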
/**
 * Transposes the document-term matrix to a term-document matrix.
 *
 * @param data instances with document-term info
 * @return a term-document matrix transposed from the input dataset
 */
private Matrix getTransposedMatrix(Instances data) {
  double[][] temp = new double[data.numAttributes()][data.numInstances()];
  for (int i = 0; i < data.numInstances(); i++) {
    Instance inst = data.instance(i);
    for (int v = 0; v < inst.numValues(); v++) {
      temp[inst.index(v)][i] = inst.valueSparse(v);
    }
  }
  return new Matrix(temp);
}
/**
 * Calculates the distance between two instances.
 *
 * @param first the first instance
 * @param second the second instance
 * @return the distance between the two given instances, between 0 and 1
 */
protected double distance(Instance first, Instance second) {
  double distance = 0;
  int firstI, secondI;

  // Walk the two (possibly sparse) instances in parallel over their stored
  // values; an exhausted side is treated as "past the last attribute".
  for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) {
    if (p1 >= first.numValues()) {
      firstI = m_instances.numAttributes();
    } else {
      firstI = first.index(p1);
    }
    if (p2 >= second.numValues()) {
      secondI = m_instances.numAttributes();
    } else {
      secondI = second.index(p2);
    }
    if (firstI == m_instances.classIndex()) {
      p1++;
      continue;
    }
    if (secondI == m_instances.classIndex()) {
      p2++;
      continue;
    }
    double diff;
    if (firstI == secondI) {
      diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
      p1++;
      p2++;
    } else if (firstI > secondI) {
      // attribute secondI is not stored (zero) in the first instance
      diff = difference(secondI, 0, second.valueSparse(p2));
      p2++;
    } else {
      // attribute firstI is not stored (zero) in the second instance
      diff = difference(firstI, first.valueSparse(p1), 0);
      p1++;
    }
    distance += diff * diff;
  }

  return Math.sqrt(distance / m_instances.numAttributes());
}
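// Illustrative sketch: the loop in distance() is the usual parallel walk over
// two sorted sparse vectors. The same idiom on plain index/value arrays, as a
// hypothetical standalone method (no class attribute, no normalisation):
public static double sparseSquaredDiff(int[] idx1, double[] val1,
  int[] idx2, double[] val2) {
  double sum = 0;
  int p1 = 0, p2 = 0;
  while (p1 < idx1.length || p2 < idx2.length) {
    int i1 = (p1 < idx1.length) ? idx1[p1] : Integer.MAX_VALUE;
    int i2 = (p2 < idx2.length) ? idx2[p2] : Integer.MAX_VALUE;
    double d;
    if (i1 == i2) {
      d = val1[p1++] - val2[p2++]; // both vectors store this index
    } else if (i1 < i2) {
      d = val1[p1++]; // index only in the first vector; other side is 0
    } else {
      d = -val2[p2++]; // index only in the second vector
    }
    sum += d * d;
  }
  return sum;
}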
/**
 * Checks if an instance contains an item set.
 *
 * @param instance the instance to be tested
 * @return true if the given instance contains this item set
 */
public boolean containedByTreatZeroAsMissing(Instance instance) {
  if (instance instanceof weka.core.SparseInstance) {
    int numInstVals = instance.numValues();
    int numItemSetVals = m_items.length;

    for (int p1 = 0, p2 = 0; p1 < numInstVals || p2 < numItemSetVals;) {
      int instIndex = Integer.MAX_VALUE;
      if (p1 < numInstVals) {
        instIndex = instance.index(p1);
      }
      int itemIndex = p2;

      if (m_items[itemIndex] > -1) {
        // the item set requires a value for this attribute
        if (itemIndex != instIndex) {
          return false; // attribute not stored, i.e. zero/missing
        } else {
          if (instance.isMissingSparse(p1)) {
            return false;
          }
          if (m_items[itemIndex] != (int) instance.valueSparse(p1)) {
            return false;
          }
        }
        p1++;
        p2++;
      } else {
        // item set doesn't care about this attribute; advance the pointers
        if (itemIndex < instIndex) {
          p2++;
        } else if (itemIndex == instIndex) {
          p2++;
          p1++;
        }
      }
    }
  } else {
    for (int i = 0; i < instance.numAttributes(); i++) {
      if (m_items[i] > -1) {
        if (instance.isMissing(i) || (int) instance.value(i) == 0) {
          return false;
        }
        if (m_items[i] != (int) instance.value(i)) {
          return false;
        }
      }
    }
  }
  return true;
}
/**
 * Computes the SVM output for the given instance.
 *
 * @param inst the instance to compute the output for
 * @return the SVM output
 * @throws Exception if something goes wrong
 */
public double SVMOutput(Instance inst) throws Exception {
  double result = -m_b;

  // Is the machine linear?
  if (m_weights != null) {
    // Is weight vector stored in sparse format?
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != m_classIndex) {
        result += m_weights[inst.index(i)] * inst.valueSparse(i);
      }
    }
  } else {
    for (int i = m_supportVectors.getNext(-1); i != -1; i = m_supportVectors.getNext(i)) {
      result += (m_alpha[i] - m_alphaStar[i]) * m_kernel.eval(-1, i, inst);
    }
  }
  return result;
}
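// Illustrative sketch: SVMOutput() evaluates the standard SVM regression dual
// f(x) = sum_i (alpha_i - alphaStar_i) * K(x_i, x) - b; with a linear kernel
// the sum collapses into the single weight vector used in the first branch.
// A hypothetical standalone version on dense arrays with a linear kernel:
public static double dualOutput(double[] alpha, double[] alphaStar,
  double[][] supportVectors, double[] x, double b) {
  double result = -b;
  for (int i = 0; i < supportVectors.length; i++) {
    result += (alpha[i] - alphaStar[i]) * dot(supportVectors[i], x);
  }
  return result;
}

private static double dot(double[] a, double[] b) {
  double s = 0;
  for (int i = 0; i < a.length; i++) {
    s += a[i] * b[i];
  }
  return s;
}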
/**
 * log(N!) + (for all the words)(log(Pi^ni) - log(ni!))
 *
 * <p>
 * where N is the total number of words, Pi is the probability of obtaining
 * word i, and ni is the number of times the word at index i occurs in the
 * document.
 *
 * @param inst The instance to be classified
 * @param classIndex The index of the class we are calculating the probability
 *          with respect to
 * @return The log of the probability of the document occurring given the class
 */
private double probOfDocGivenClass(Instance inst, int classIndex) {
  double answer = 0;
  // double totalWords = 0; //no need as we are not calculating the factorial at all.

  double freqOfWordInDoc; // should be double
  for (int i = 0; i < inst.numValues(); i++) {
    if (inst.index(i) != inst.classIndex()) {
      freqOfWordInDoc = inst.valueSparse(i);
      // totalWords += freqOfWordInDoc;
      answer += (freqOfWordInDoc * m_probOfWordGivenClass[classIndex][inst.index(i)]);
      // - lnFactorial(freqOfWordInDoc));
    }
  }
  // answer += lnFactorial(totalWords); //The factorial terms don't make
  // any difference to the classifier's accuracy, so not needed.
  return answer;
}
/**
 * Computes the JS divergence between an instance and a cluster, used for test data.
 *
 * @param inst instance to be clustered
 * @param t index of the cluster
 * @param pi1 weight of the instance distribution
 * @param pi2 weight of the cluster distribution
 * @return the JS divergence
 */
private double JS(Instance inst, int t, double pi1, double pi2) {
  if (Math.min(pi1, pi2) <= 0) {
    System.out.format(
      "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
      pi1, pi2);
    return 0;
  }
  double sum = Utils.sum(inst.toDoubleArray());
  double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
  for (int i = 0; i < inst.numValues(); i++) {
    tmp = inst.valueSparse(i) / sum;
    if (tmp != 0) {
      kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * bestT.Py_t.get(inst.index(i), t)));
    }
  }
  for (int i = 0; i < m_numAttributes; i++) {
    if ((tmp = bestT.Py_t.get(i, t)) != 0) {
      kl2 += tmp * Math.log(tmp / (inst.value(i) * pi1 / sum + pi2 * tmp));
    }
  }
  return pi1 * kl1 + pi2 * kl2;
}
/**
 * Computes the JS divergence between an instance and a cluster, used for training data.
 *
 * @param instIdx index of the instance
 * @param input statistics of the input data
 * @param T the whole partition
 * @param t index of the cluster
 * @param pi1 weight of the instance distribution
 * @param pi2 weight of the cluster distribution
 * @return the JS divergence
 */
private double JS(int instIdx, Input input, Partition T, int t, double pi1, double pi2) {
  if (Math.min(pi1, pi2) <= 0) {
    System.out.format(
      "Warning: zero or negative weights in JS calculation! (pi1 %s, pi2 %s)\n",
      pi1, pi2);
    return 0;
  }
  Instance inst = m_data.instance(instIdx);
  double kl1 = 0.0, kl2 = 0.0, tmp = 0.0;
  for (int i = 0; i < inst.numValues(); i++) {
    tmp = input.Py_x.get(inst.index(i), instIdx);
    if (tmp != 0) {
      kl1 += tmp * Math.log(tmp / (tmp * pi1 + pi2 * T.Py_t.get(inst.index(i), t)));
    }
  }
  for (int i = 0; i < m_numAttributes; i++) {
    if ((tmp = T.Py_t.get(i, t)) != 0) {
      kl2 += tmp * Math.log(tmp / (input.Py_x.get(i, instIdx) * pi1 + pi2 * tmp));
    }
  }
  return pi1 * kl1 + pi2 * kl2;
}
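// Illustrative sketch: both JS() methods compute the weighted Jensen-Shannon
// divergence JS(p, q) = pi1 * KL(p || m) + pi2 * KL(q || m), where the
// mixture is m = pi1*p + pi2*q. A hypothetical standalone version for two
// dense distributions over the same support:
public static double jensenShannon(double[] p, double[] q, double pi1, double pi2) {
  double kl1 = 0, kl2 = 0;
  for (int i = 0; i < p.length; i++) {
    double m = pi1 * p[i] + pi2 * q[i]; // mixture distribution
    if (p[i] > 0) {
      kl1 += p[i] * Math.log(p[i] / m);
    }
    if (q[i] > 0) {
      kl2 += q[i] * Math.log(q[i] / m);
    }
  }
  return pi1 * kl1 + pi2 * kl2;
}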
public void fillWekaInstances(weka.core.Instances winsts) {
  // set name
  setName(winsts.relationName());

  // set attributes
  List<Attribute> onto_attrs = new ArrayList<Attribute>();
  for (int i = 0; i < winsts.numAttributes(); i++) {
    Attribute a = new Attribute();
    a.fillWekaAttribute(winsts.attribute(i));
    onto_attrs.add(a);
  }
  setAttributes(onto_attrs);

  // set instances
  List<Instance> onto_insts = new ArrayList<Instance>();
  for (int i = 0; i < winsts.numInstances(); i++) {
    Instance inst = new Instance();
    weka.core.Instance winst = winsts.instance(i);
    List<Double> instvalues = new ArrayList<Double>();
    List<Boolean> instmis = new ArrayList<Boolean>();
    // isMissing(j) and value(j) take attribute indices, so iterate over all
    // attributes rather than the stored (sparse) values
    for (int j = 0; j < winst.numAttributes(); j++) {
      if (winst.isMissing(j)) {
        instvalues.add(0.0);
        instmis.add(true);
      } else {
        instvalues.add(winst.value(j));
        instmis.add(false);
      }
    }
    inst.setValues(instvalues);
    inst.setMissing(instmis);
    onto_insts.add(inst);
  }
  setInstances(onto_insts);

  setClass_index(winsts.classIndex());
}
/**
 * Generates the classifier.
 *
 * @param instances set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
public void buildClassifier(Instances instances) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(instances);

  // remove instances with missing class
  instances = new Instances(instances);
  instances.deleteWithMissingClass();

  m_headerInfo = new Instances(instances, 0);
  m_numClasses = instances.numClasses();
  m_numAttributes = instances.numAttributes();
  m_probOfWordGivenClass = new double[m_numClasses][];

  /*
   * initialising the matrix of word counts NOTE: Laplace estimator introduced
   * in case a word that does not appear for a class in the training set does
   * so for the test set
   */
  for (int c = 0; c < m_numClasses; c++) {
    m_probOfWordGivenClass[c] = new double[m_numAttributes];
    for (int att = 0; att < m_numAttributes; att++) {
      m_probOfWordGivenClass[c][att] = 1;
    }
  }

  // enumerate through the instances
  Instance instance;
  int classIndex;
  double numOccurrences;
  double[] docsPerClass = new double[m_numClasses];
  double[] wordsPerClass = new double[m_numClasses];

  java.util.Enumeration enumInsts = instances.enumerateInstances();
  while (enumInsts.hasMoreElements()) {
    instance = (Instance) enumInsts.nextElement();
    classIndex = (int) instance.value(instance.classIndex());
    docsPerClass[classIndex] += instance.weight();

    for (int a = 0; a < instance.numValues(); a++) {
      if (instance.index(a) != instance.classIndex()) {
        // a is a position in the sparse representation, so use the sparse
        // accessor for the missing-value check
        if (!instance.isMissingSparse(a)) {
          numOccurrences = instance.valueSparse(a) * instance.weight();
          if (numOccurrences < 0) {
            throw new Exception(
              "Numeric attribute values must all be greater than or equal to zero.");
          }
          wordsPerClass[classIndex] += numOccurrences;
          m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurrences;
        }
      }
    }
  }

  /*
   * normalising probOfWordGivenClass values and saving each value as the log
   * of each value
   */
  for (int c = 0; c < m_numClasses; c++) {
    for (int v = 0; v < m_numAttributes; v++) {
      m_probOfWordGivenClass[c][v] = Math.log(m_probOfWordGivenClass[c][v]
        / (wordsPerClass[c] + m_numAttributes - 1));
    }
  }

  /*
   * calculating Pr(H) NOTE: Laplace estimator introduced in case a class does
   * not get mentioned in the set of training instances
   */
  final double numDocs = instances.sumOfWeights() + m_numClasses;
  m_probOfClass = new double[m_numClasses];
  for (int h = 0; h < m_numClasses; h++) {
    m_probOfClass[h] = (docsPerClass[h] + 1) / numDocs;
  }
}
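// Illustrative sketch: at prediction time the trained statistics combine as
// an unnormalised log-posterior, log(m_probOfClass[c]) plus, for every word
// w, count(w) * m_probOfWordGivenClass[c][w] (the word probabilities are
// already stored as logs, mirroring probOfDocGivenClass above). A
// hypothetical standalone version with plain arrays standing in for the
// Weka types:
public static int predictClass(double[] probOfClass,
  double[][] logProbOfWordGivenClass, double[] wordCounts) {
  int best = 0;
  double bestLogProb = Double.NEGATIVE_INFINITY;
  for (int c = 0; c < probOfClass.length; c++) {
    double lp = Math.log(probOfClass[c]);
    for (int w = 0; w < wordCounts.length; w++) {
      lp += wordCounts[w] * logProbOfWordGivenClass[c][w];
    }
    if (lp > bestLogProb) {
      bestLogProb = lp;
      best = c;
    }
  }
  return best;
}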