/**
 * Tests whether the given instance contains this item set, where a value of
 * zero (or an attribute with no stored value in a sparse instance) never
 * satisfies an item.
 *
 * <p>Entries of {@code m_items} greater than -1 are required values for the
 * attribute at that position; all other entries impose no constraint.
 *
 * @param instance the instance to be tested
 * @return true if every required item is matched by a non-missing, equal
 *         value in the instance; false otherwise
 */
public boolean containedByTreatZeroAsMissing(Instance instance) {
  if (!(instance instanceof weka.core.SparseInstance)) {
    // Dense representation: check each attribute position directly.
    for (int attr = 0; attr < instance.numAttributes(); attr++) {
      final int required = m_items[attr];
      if (required <= -1) {
        continue; // no item at this attribute
      }
      if (instance.isMissing(attr)) {
        return false;
      }
      final int actual = (int) instance.value(attr);
      // A zero is handled like a missing value, so it can never match.
      if (actual == 0 || required != actual) {
        return false;
      }
    }
    return true;
  }

  // Sparse representation: advance through the stored values and the item
  // positions in lock step.
  // NOTE(review): relies on instance.index(..) being strictly increasing
  // and below m_items.length - confirm against SparseInstance.
  final int storedCount = instance.numValues();
  final int itemCount = m_items.length;
  int storedPos = 0;
  int itemPos = 0;
  while (storedPos < storedCount || itemPos < itemCount) {
    // Once the stored values are exhausted, use a sentinel larger than any
    // item position so the remaining items are still examined.
    final int storedAttr =
        (storedPos < storedCount) ? instance.index(storedPos) : Integer.MAX_VALUE;
    final int required = m_items[itemPos];

    if (required > -1) {
      // The required attribute must be the one currently stored, must not
      // be missing, and must carry the same value.
      if (itemPos != storedAttr
          || instance.isMissingSparse(storedPos)
          || required != (int) instance.valueSparse(storedPos)) {
        return false;
      }
      storedPos++;
      itemPos++;
    } else if (itemPos < storedAttr) {
      // Unconstrained attribute that the instance does not store.
      itemPos++;
    } else if (itemPos == storedAttr) {
      // Unconstrained attribute that the instance does store: skip both.
      itemPos++;
      storedPos++;
    }
  }
  return true;
}
/** * Generates the classifier. * * @param instances set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ public void buildClassifier(Instances instances) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_headerInfo = new Instances(instances, 0); m_numClasses = instances.numClasses(); m_numAttributes = instances.numAttributes(); m_probOfWordGivenClass = new double[m_numClasses][]; /* initialising the matrix of word counts NOTE: Laplace estimator introduced in case a word that does not appear for a class in the training set does so for the test set */ for (int c = 0; c < m_numClasses; c++) { m_probOfWordGivenClass[c] = new double[m_numAttributes]; for (int att = 0; att < m_numAttributes; att++) { m_probOfWordGivenClass[c][att] = 1; } } // enumerate through the instances Instance instance; int classIndex; double numOccurences; double[] docsPerClass = new double[m_numClasses]; double[] wordsPerClass = new double[m_numClasses]; java.util.Enumeration<Instance> enumInsts = instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { instance = (Instance) enumInsts.nextElement(); classIndex = (int) instance.value(instance.classIndex()); docsPerClass[classIndex] += instance.weight(); for (int a = 0; a < instance.numValues(); a++) if (instance.index(a) != instance.classIndex()) { if (!instance.isMissingSparse(a)) { numOccurences = instance.valueSparse(a) * instance.weight(); if (numOccurences < 0) throw new Exception("Numeric attribute values must all be greater or equal to zero."); wordsPerClass[classIndex] += numOccurences; m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences; } } } /* normalising probOfWordGivenClass values and saving each value as the log of each value */ for (int c = 0; c < m_numClasses; c++) for (int 
v = 0; v < m_numAttributes; v++) m_probOfWordGivenClass[c][v] = Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1)); /* calculating Pr(H) NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of training instances */ final double numDocs = instances.sumOfWeights() + m_numClasses; m_probOfClass = new double[m_numClasses]; for (int h = 0; h < m_numClasses; h++) m_probOfClass[h] = (double) (docsPerClass[h] + 1) / numDocs; }