/** * Constructor that generates a sparse instance from the given instance. Reference to the dataset * is set to null. (ie. the instance doesn't have access to information about the attribute types) * * @param instance the instance from which the attribute values and the weight are to be copied */ public SparseInstance(Instance instance) { m_Weight = instance.weight(); m_Dataset = null; m_NumAttributes = instance.numAttributes(); if (instance instanceof SparseInstance) { m_AttValues = ((SparseInstance) instance).m_AttValues; m_Indices = ((SparseInstance) instance).m_Indices; } else { double[] tempValues = new double[instance.numAttributes()]; int[] tempIndices = new int[instance.numAttributes()]; int vals = 0; for (int i = 0; i < instance.numAttributes(); i++) { if (instance.value(i) != 0) { tempValues[vals] = instance.value(i); tempIndices[vals] = i; vals++; } } m_AttValues = new double[vals]; m_Indices = new int[vals]; System.arraycopy(tempValues, 0, m_AttValues, 0, vals); System.arraycopy(tempIndices, 0, m_Indices, 0, vals); } }
/**
 * Sums the weights of all instances in the given data whose class value (truncated to an int)
 * equals the consequent of this rule.
 *
 * @param data the data in question
 * @return the total weight of the instances whose class matches the consequent
 */
private double computeDefAccu(Instances data) {
  double matchingWeight = 0;
  int index = 0;
  while (index < data.numInstances()) {
    final Instance current = data.instance(index);
    index++;
    if ((int) current.classValue() != (int) m_Consequent) {
      continue;
    }
    matchingWeight += current.weight();
  }
  return matchingWeight;
}
/** * Processes the given data (may change the provided dataset) and returns the modified version. * This method is called in batchFinished(). * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected Instances process(Instances instances) throws Exception { Instances result; int i; int n; double[] values; String value; Instance inst; Instance newInst; // we need the complete input data! if (!isFirstBatchDone()) setOutputFormat(determineOutputFormat(getInputFormat())); result = new Instances(getOutputFormat()); for (i = 0; i < instances.numInstances(); i++) { inst = instances.instance(i); values = inst.toDoubleArray(); for (n = 0; n < values.length; n++) { if (!m_Cols.isInRange(n) || !instances.attribute(n).isNumeric() || inst.isMissing(n)) continue; // get index of value if (instances.attribute(n).type() == Attribute.DATE) value = inst.stringValue(n); else value = Utils.doubleToString(inst.value(n), MAX_DECIMALS); values[n] = result.attribute(n).indexOfValue(value); } // generate new instance if (inst instanceof SparseInstance) newInst = new SparseInstance(inst.weight(), values); else newInst = new DenseInstance(inst.weight(), values); // copy possible string, relational values newInst.setDataset(getOutputFormat()); copyValues(newInst, false, inst.dataset(), getOutputFormat()); result.add(newInst); } return result; }
/** * Implements the splitData function. This procedure is to split the data into bags according to * the nominal attribute value The infoGain for each bag is also calculated. * * @param data the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ public Instances[] splitData(Instances data, double defAcRt, double cl) { int bag = att.numValues(); Instances[] splitData = new Instances[bag]; for (int x = 0; x < bag; x++) { splitData[x] = new Instances(data, data.numInstances()); accurate[x] = 0; coverage[x] = 0; } for (int x = 0; x < data.numInstances(); x++) { Instance inst = data.instance(x); if (!inst.isMissing(att)) { int v = (int) inst.value(att); splitData[v].add(inst); coverage[v] += inst.weight(); if ((int) inst.classValue() == (int) cl) accurate[v] += inst.weight(); } } for (int x = 0; x < bag; x++) { double t = coverage[x] + 1.0; double p = accurate[x] + 1.0; double infoGain = // Utils.eq(defAcRt, 1.0) ? // accurate[x]/(double)numConds : accurate[x] * (Utils.log2(p / t) - Utils.log2(defAcRt)); if (infoGain > maxInfoGain) { maxInfoGain = infoGain; cover = coverage[x]; accu = accurate[x]; accuRate = p / t; value = (double) x; } } return splitData; }
/** * Inserts an instance into the hash table * * @param inst instance to be inserted * @param instA to create the hash key from * @throws Exception if the instance can't be inserted */ private void insertIntoTable(Instance inst, double[] instA) throws Exception { double[] tempClassDist2; double[] newDist; DecisionTableHashKey thekey; if (instA != null) { thekey = new DecisionTableHashKey(instA); } else { thekey = new DecisionTableHashKey(inst, inst.numAttributes(), false); } // see if this one is already in the table tempClassDist2 = (double[]) m_entries.get(thekey); if (tempClassDist2 == null) { if (m_classIsNominal) { newDist = new double[m_theInstances.classAttribute().numValues()]; // Leplace estimation for (int i = 0; i < m_theInstances.classAttribute().numValues(); i++) { newDist[i] = 1.0; } newDist[(int) inst.classValue()] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } else { newDist = new double[2]; newDist[0] = inst.classValue() * inst.weight(); newDist[1] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } } else { // update the distribution for this instance if (m_classIsNominal) { tempClassDist2[(int) inst.classValue()] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } else { tempClassDist2[0] += (inst.classValue() * inst.weight()); tempClassDist2[1] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } } }
/** * Prune all the possible final sequences of the rule using the pruning data. The measure used to * prune the rule is based on flag given. * * @param pruneData the pruning data used to prune the rule * @param useWhole flag to indicate whether use the error rate of the whole pruning data instead * of the data covered */ public void prune(Instances pruneData, boolean useWhole) { Instances data = pruneData; double total = data.sumOfWeights(); if (!Utils.gr(total, 0.0)) return; /* The default accurate # and rate on pruning data */ double defAccu = computeDefAccu(data); if (m_Debug) System.err.println( "Pruning with " + defAccu + " positive data out of " + total + " instances"); int size = m_Antds.size(); if (size == 0) return; // Default rule before pruning double[] worthRt = new double[size]; double[] coverage = new double[size]; double[] worthValue = new double[size]; for (int w = 0; w < size; w++) { worthRt[w] = coverage[w] = worthValue[w] = 0.0; } /* Calculate accuracy parameters for all the antecedents in this rule */ double tn = 0.0; // True negative if useWhole for (int x = 0; x < size; x++) { Antd antd = (Antd) m_Antds.elementAt(x); Instances newData = data; data = new Instances(newData, 0); // Make data empty for (int y = 0; y < newData.numInstances(); y++) { Instance ins = newData.instance(y); if (antd.covers(ins) > 0) { // Covered by this antecedent coverage[x] += ins.weight(); data.add(ins); // Add to data for further pruning if ((int) ins.classValue() == (int) m_Consequent) // Accurate prediction worthValue[x] += ins.weight(); } else if (useWhole) { // Not covered if ((int) ins.classValue() != (int) m_Consequent) tn += ins.weight(); } } if (useWhole) { worthValue[x] += tn; worthRt[x] = worthValue[x] / total; } else // Note if coverage is 0, accuracy is 0.5 worthRt[x] = (worthValue[x] + 1.0) / (coverage[x] + 2.0); } double maxValue = (defAccu + 1.0) / (total + 2.0); int maxIndex = -1; for (int i = 0; i < worthValue.length; i++) { if (m_Debug) { double 
denom = useWhole ? total : coverage[i]; System.err.println( i + "(useAccuray? " + !useWhole + "): " + worthRt[i] + "=" + worthValue[i] + "/" + denom); } if (worthRt[i] > maxValue) { // Prefer to the maxValue = worthRt[i]; // shorter rule maxIndex = i; } } if (maxIndex == -1) return; /* Prune the antecedents according to the accuracy parameters */ for (int z = size - 1; z > maxIndex; z--) m_Antds.removeElementAt(z); }
/**
 * Calculates the class membership probabilities for the given test instance. The nearest
 * neighbours of the instance are retrieved, their distances are scaled by the distance of the
 * k-th neighbour and passed through the configured weighting kernel, the resulting weights are
 * applied to the neighbours (rescaled so that their total weight is preserved), and the base
 * classifier is built on the weighted neighbours and asked for its prediction.
 *
 * @param instance the instance to be classified
 * @return predicted class probability distribution
 * @throws Exception if distribution can't be computed successfully
 */
public double[] distributionForInstance(Instance instance) throws Exception {

  // default model?
  if (m_ZeroR != null) {
    return m_ZeroR.distributionForInstance(instance);
  }

  if (m_Train.numInstances() == 0) {
    throw new Exception("No training instances!");
  }

  m_NNSearch.addInstanceInfo(instance);

  final int trainSize = m_Train.numInstances();
  int k = (!m_UseAllK && (m_kNN < trainSize)) ? m_kNN : trainSize;

  final Instances neighbours = m_NNSearch.kNearestNeighbours(instance, k);
  // holds the distances at first and is turned into the kernel weights further down
  final double[] dist = m_NNSearch.getDistances();

  if (m_Debug) {
    System.out.println("Test Instance: " + instance);
    System.out.println(
        "For "
            + k
            + " kept "
            + neighbours.numInstances()
            + " out of "
            + m_Train.numInstances()
            + " instances.");
  }

  // IF LinearNN has skipped so much that <k neighbours are remaining.
  k = Math.min(k, dist.length);

  if (m_Debug) {
    System.out.println("Instance Distances");
    for (double d : dist) {
      System.out.println("" + d);
    }
  }

  // Determine the bandwidth
  final double bandwidth = dist[k - 1];

  // if the kth distance is zero then all instances get the same (unit) distance
  final boolean zeroBandwidth = bandwidth <= 0;

  // Rescale the distances by the bandwidth and pass them through the weighting kernel
  for (int i = 0; i < dist.length; i++) {
    final double scaled = zeroBandwidth ? 1 : dist[i] / bandwidth;
    switch (m_WeightKernel) {
      case LINEAR:
        dist[i] = 1.0001 - scaled;
        break;
      case EPANECHNIKOV:
        dist[i] = 3 / 4D * (1.0001 - scaled * scaled);
        break;
      case TRICUBE:
        dist[i] = Math.pow((1.0001 - Math.pow(scaled, 3)), 3);
        break;
      case CONSTANT:
        dist[i] = 1;
        break;
      case INVERSE:
        dist[i] = 1.0 / (1.0 + scaled);
        break;
      case GAUSS:
        dist[i] = Math.exp(-scaled * scaled);
        break;
      default:
        // unknown kernel: keep the rescaled distance as it is
        dist[i] = scaled;
        break;
    }
  }

  if (m_Debug) {
    System.out.println("Instance Weights");
    for (double w : dist) {
      System.out.println("" + w);
    }
  }

  // Set the weights on the training data
  double sumOfWeights = 0;
  double newSumOfWeights = 0;
  for (int i = 0; i < dist.length; i++) {
    final Instance neighbour = (Instance) neighbours.instance(i);
    final double oldWeight = neighbour.weight();
    final double kernelWeight = oldWeight * dist[i];
    sumOfWeights += oldWeight;
    newSumOfWeights += kernelWeight;
    neighbour.setWeight(kernelWeight);
  }

  // Rescale weights so that the neighbours keep their original total weight
  for (int i = 0; i < neighbours.numInstances(); i++) {
    final Instance neighbour = neighbours.instance(i);
    neighbour.setWeight(neighbour.weight() * sumOfWeights / newSumOfWeights);
  }

  // Create a weighted classifier
  m_Classifier.buildClassifier(neighbours);

  if (m_Debug) {
    System.out.println("Classifying test instance: " + instance);
    System.out.println("Built base classifier:\n" + m_Classifier.toString());
  }

  // Return the classifier's predictions
  return m_Classifier.distributionForInstance(instance);
}
public void buildClassifier(Instances insts) throws Exception { // Compute mean of target value double yMean = insts.meanOrMode(insts.classIndex()); // Choose best attribute double minMsq = Double.MAX_VALUE; m_attribute = null; int chosen = -1; double chosenSlope = Double.NaN; double chosenIntercept = Double.NaN; for (int i = 0; i < insts.numAttributes(); i++) { if (i != insts.classIndex()) { if (!insts.attribute(i).isNumeric()) { throw new Exception("UnivariateLinearRegression: Only numeric attributes!"); } m_attribute = insts.attribute(i); // Compute slope and intercept double xMean = insts.meanOrMode(i); double sumWeightedXDiffSquared = 0; double sumWeightedYDiffSquared = 0; m_slope = 0; for (int j = 0; j < insts.numInstances(); j++) { Instance inst = insts.instance(j); if (!inst.isMissing(i) && !inst.classIsMissing()) { double xDiff = inst.value(i) - xMean; double yDiff = inst.classValue() - yMean; double weightedXDiff = inst.weight() * xDiff; double weightedYDiff = inst.weight() * yDiff; m_slope += weightedXDiff * yDiff; sumWeightedXDiffSquared += weightedXDiff * xDiff; sumWeightedYDiffSquared += weightedYDiff * yDiff; } } // Skip attribute if not useful if (sumWeightedXDiffSquared == 0) { continue; } double numerator = m_slope; m_slope /= sumWeightedXDiffSquared; m_intercept = yMean - m_slope * xMean; // Compute sum of squared errors double msq = sumWeightedYDiffSquared - m_slope * numerator; // Check whether this is the best attribute if (msq < minMsq) { minMsq = msq; chosen = i; chosenSlope = m_slope; chosenIntercept = m_intercept; } } } // Set parameters if (chosen == -1) { System.err.println("----- no useful attribute found"); m_attribute = null; m_slope = 0; m_intercept = yMean; } else { m_attribute = insts.attribute(chosen); m_slope = chosenSlope; m_intercept = chosenIntercept; } }
/** * Calculates the accuracy on a test fold for internal cross validation of feature sets * * @param fold set of instances to be "left out" and classified * @param fs currently selected feature set * @return the accuracy for the fold * @throws Exception if something goes wrong */ double evaluateFoldCV(Instances fold, int[] fs) throws Exception { int i; int ruleCount = 0; int numFold = fold.numInstances(); int numCl = m_theInstances.classAttribute().numValues(); double[][] class_distribs = new double[numFold][numCl]; double[] instA = new double[fs.length]; double[] normDist; DecisionTableHashKey thekey; double acc = 0.0; int classI = m_theInstances.classIndex(); Instance inst; if (m_classIsNominal) { normDist = new double[numCl]; } else { normDist = new double[2]; } // first *remove* instances for (i = 0; i < numFold; i++) { inst = fold.instance(i); for (int j = 0; j < fs.length; j++) { if (fs[j] == classI) { instA[j] = Double.MAX_VALUE; // missing for the class } else if (inst.isMissing(fs[j])) { instA[j] = Double.MAX_VALUE; } else { instA[j] = inst.value(fs[j]); } } thekey = new DecisionTableHashKey(instA); if ((class_distribs[i] = (double[]) m_entries.get(thekey)) == null) { throw new Error("This should never happen!"); } else { if (m_classIsNominal) { class_distribs[i][(int) inst.classValue()] -= inst.weight(); } else { class_distribs[i][0] -= (inst.classValue() * inst.weight()); class_distribs[i][1] -= inst.weight(); } ruleCount++; } m_classPriorCounts[(int) inst.classValue()] -= inst.weight(); } double[] classPriors = m_classPriorCounts.clone(); Utils.normalize(classPriors); // now classify instances for (i = 0; i < numFold; i++) { inst = fold.instance(i); System.arraycopy(class_distribs[i], 0, normDist, 0, normDist.length); if (m_classIsNominal) { boolean ok = false; for (int j = 0; j < normDist.length; j++) { if (Utils.gr(normDist[j], 1.0)) { ok = true; break; } } if (!ok) { // majority class normDist = classPriors.clone(); } // if (ok) { 
Utils.normalize(normDist); if (m_evaluationMeasure == EVAL_AUC) { m_evaluation.evaluateModelOnceAndRecordPrediction(normDist, inst); } else { m_evaluation.evaluateModelOnce(normDist, inst); } /* } else { normDist[(int)m_majority] = 1.0; if (m_evaluationMeasure == EVAL_AUC) { m_evaluation.evaluateModelOnceAndRecordPrediction(normDist, inst); } else { m_evaluation.evaluateModelOnce(normDist, inst); } } */ } else { if (Utils.eq(normDist[1], 0.0)) { double[] temp = new double[1]; temp[0] = m_majority; m_evaluation.evaluateModelOnce(temp, inst); } else { double[] temp = new double[1]; temp[0] = normDist[0] / normDist[1]; m_evaluation.evaluateModelOnce(temp, inst); } } } // now re-insert instances for (i = 0; i < numFold; i++) { inst = fold.instance(i); m_classPriorCounts[(int) inst.classValue()] += inst.weight(); if (m_classIsNominal) { class_distribs[i][(int) inst.classValue()] += inst.weight(); } else { class_distribs[i][0] += (inst.classValue() * inst.weight()); class_distribs[i][1] += inst.weight(); } } return acc; }
/** * Classifies an instance for internal leave one out cross validation of feature sets * * @param instance instance to be "left out" and classified * @param instA feature values of the selected features for the instance * @return the classification of the instance * @throws Exception if something goes wrong */ double evaluateInstanceLeaveOneOut(Instance instance, double[] instA) throws Exception { DecisionTableHashKey thekey; double[] tempDist; double[] normDist; thekey = new DecisionTableHashKey(instA); if (m_classIsNominal) { // if this one is not in the table if ((tempDist = (double[]) m_entries.get(thekey)) == null) { throw new Error("This should never happen!"); } else { normDist = new double[tempDist.length]; System.arraycopy(tempDist, 0, normDist, 0, tempDist.length); normDist[(int) instance.classValue()] -= instance.weight(); // update the table // first check to see if the class counts are all zero now boolean ok = false; for (int i = 0; i < normDist.length; i++) { if (Utils.gr(normDist[i], 1.0)) { ok = true; break; } } // downdate the class prior counts m_classPriorCounts[(int) instance.classValue()] -= instance.weight(); double[] classPriors = m_classPriorCounts.clone(); Utils.normalize(classPriors); if (!ok) { // majority class normDist = classPriors; } m_classPriorCounts[(int) instance.classValue()] += instance.weight(); // if (ok) { Utils.normalize(normDist); if (m_evaluationMeasure == EVAL_AUC) { m_evaluation.evaluateModelOnceAndRecordPrediction(normDist, instance); } else { m_evaluation.evaluateModelOnce(normDist, instance); } return Utils.maxIndex(normDist); /*} else { normDist = new double [normDist.length]; normDist[(int)m_majority] = 1.0; if (m_evaluationMeasure == EVAL_AUC) { m_evaluation.evaluateModelOnceAndRecordPrediction(normDist, instance); } else { m_evaluation.evaluateModelOnce(normDist, instance); } return m_majority; } */ } // return Utils.maxIndex(tempDist); } else { // see if this one is already in the table if ((tempDist = 
(double[]) m_entries.get(thekey)) != null) { normDist = new double[tempDist.length]; System.arraycopy(tempDist, 0, normDist, 0, tempDist.length); normDist[0] -= (instance.classValue() * instance.weight()); normDist[1] -= instance.weight(); if (Utils.eq(normDist[1], 0.0)) { double[] temp = new double[1]; temp[0] = m_majority; m_evaluation.evaluateModelOnce(temp, instance); return m_majority; } else { double[] temp = new double[1]; temp[0] = normDist[0] / normDist[1]; m_evaluation.evaluateModelOnce(temp, instance); return temp[0]; } } else { throw new Error("This should never happen!"); } } // shouldn't get here // return 0.0; }
/** * Generates the classifier. * * @param data set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ public void buildClassifier(Instances data) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(data); // remove instances with missing class m_theInstances = new Instances(data); m_theInstances.deleteWithMissingClass(); m_rr = new Random(1); if (m_theInstances.classAttribute().isNominal()) { // Set up class priors m_classPriorCounts = new double[data.classAttribute().numValues()]; Arrays.fill(m_classPriorCounts, 1.0); for (int i = 0; i < data.numInstances(); i++) { Instance curr = data.instance(i); m_classPriorCounts[(int) curr.classValue()] += curr.weight(); } m_classPriors = m_classPriorCounts.clone(); Utils.normalize(m_classPriors); } setUpEvaluator(); if (m_theInstances.classAttribute().isNumeric()) { m_disTransform = new weka.filters.unsupervised.attribute.Discretize(); m_classIsNominal = false; // use binned discretisation if the class is numeric ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setBins(10); ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setInvertSelection(true); // Discretize all attributes EXCEPT the class String rangeList = ""; rangeList += (m_theInstances.classIndex() + 1); // System.out.println("The class col: "+m_theInstances.classIndex()); ((weka.filters.unsupervised.attribute.Discretize) m_disTransform) .setAttributeIndices(rangeList); } else { m_disTransform = new weka.filters.supervised.attribute.Discretize(); ((weka.filters.supervised.attribute.Discretize) m_disTransform).setUseBetterEncoding(true); m_classIsNominal = true; } m_disTransform.setInputFormat(m_theInstances); m_theInstances = Filter.useFilter(m_theInstances, m_disTransform); m_numAttributes = m_theInstances.numAttributes(); m_numInstances = m_theInstances.numInstances(); m_majority = 
m_theInstances.meanOrMode(m_theInstances.classAttribute()); // Perform the search int[] selected = m_search.search(m_evaluator, m_theInstances); m_decisionFeatures = new int[selected.length + 1]; System.arraycopy(selected, 0, m_decisionFeatures, 0, selected.length); m_decisionFeatures[m_decisionFeatures.length - 1] = m_theInstances.classIndex(); // reduce instances to selected features m_delTransform = new Remove(); m_delTransform.setInvertSelection(true); // set features to keep m_delTransform.setAttributeIndicesArray(m_decisionFeatures); m_delTransform.setInputFormat(m_theInstances); m_dtInstances = Filter.useFilter(m_theInstances, m_delTransform); // reset the number of attributes m_numAttributes = m_dtInstances.numAttributes(); // create hash table m_entries = new Hashtable((int) (m_dtInstances.numInstances() * 1.5)); // insert instances into the hash table for (int i = 0; i < m_numInstances; i++) { Instance inst = m_dtInstances.instance(i); insertIntoTable(inst, null); } // Replace the global table majority with nearest neighbour? if (m_useIBk) { m_ibk = new IBk(); m_ibk.buildClassifier(m_theInstances); } // Save memory if (m_saveMemory) { m_theInstances = new Instances(m_theInstances, 0); m_dtInstances = new Instances(m_dtInstances, 0); } m_evaluation = null; }