/** * Compare two datasets to see if they differ. * * @param data1 one set of instances * @param data2 the other set of instances * @throws Exception if the datasets differ */ protected void compareDatasets(Instances data1, Instances data2) throws Exception { if (m_CheckHeader) { if (!data2.equalHeaders(data1)) { throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1)); } } if (!(data2.numInstances() == data1.numInstances())) { throw new Exception("number of instances has changed"); } for (int i = 0; i < data2.numInstances(); i++) { Instance orig = data1.instance(i); Instance copy = data2.instance(i); for (int j = 0; j < orig.numAttributes(); j++) { if (orig.isMissing(j)) { if (!copy.isMissing(j)) { throw new Exception("instances have changed"); } } else { if (m_CompareValuesAsString) { if (!orig.toString(j).equals(copy.toString(j))) { throw new Exception("instances have changed"); } } else { if (Math.abs(orig.value(j) - copy.value(j)) > m_MaxDiffValues) { throw new Exception("instances have changed"); } } } if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) { throw new Exception("instance weights have changed"); } } } }
/** Computes average class values for each attribute and value */ private void computeAverageClassValues() { double totalCounts, sum; Instance instance; double[] counts; double[][] avgClassValues = new double[getInputFormat().numAttributes()][0]; m_Indices = new int[getInputFormat().numAttributes()][0]; for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (att.isNominal()) { avgClassValues[j] = new double[att.numValues()]; counts = new double[att.numValues()]; for (int i = 0; i < getInputFormat().numInstances(); i++) { instance = getInputFormat().instance(i); if (!instance.classIsMissing() && (!instance.isMissing(j))) { counts[(int) instance.value(j)] += instance.weight(); avgClassValues[j][(int) instance.value(j)] += instance.weight() * instance.classValue(); } } sum = Utils.sum(avgClassValues[j]); totalCounts = Utils.sum(counts); if (Utils.gr(totalCounts, 0)) { for (int k = 0; k < att.numValues(); k++) { if (Utils.gr(counts[k], 0)) { avgClassValues[j][k] /= counts[k]; } else { avgClassValues[j][k] = sum / totalCounts; } } } m_Indices[j] = Utils.sort(avgClassValues[j]); } } }
/** * Input an instance for filtering. Ordinarily the instance is processed and made available for * output immediately. Some filters require all instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @exception IllegalStateException if no input format has been defined. * @exception Exception if there was a problem during the filtering. */ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } double[] vals = new double[instance.numAttributes() + 1]; for (int i = 0; i < instance.numAttributes(); i++) { if (instance.isMissing(i)) { vals[i] = Instance.missingValue(); } else { vals[i] = instance.value(i); } } evaluateExpression(vals); Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new Instance(instance.weight(), vals); } copyStringValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); return true; }
/** * Convert a single instance over. The converted instance is added to the end of the output queue. * * @param instance the instance to convert */ protected void convertInstance(Instance instance) { int index = 0; double[] vals = new double[outputFormatPeek().numAttributes()]; // Copy and convert the values for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (m_DiscretizeCols.isInRange(i) && getInputFormat().attribute(i).isNumeric()) { int j; double currentVal = instance.value(i); if (m_CutPoints[i] == null) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { vals[index] = 0; } index++; } else { if (!m_MakeBinary) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (currentVal <= m_CutPoints[i][j]) { break; } } vals[index] = j; } index++; } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else if (currentVal <= m_CutPoints[i][j]) { vals[index] = 0; } else { vals[index] = 1; } index++; } } } } else { vals[index] = instance.value(i); index++; } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); }
/** * Input an instance for filtering. Ordinarily the instance is processed and made available for * output immediately. Some filters require all instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input structure has been defined. */ @Override public boolean input(Instance instance) { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (getOutputFormat().numAttributes() == 0) { return false; } if (m_selectedAttributes.length == 0) { push(instance); } else { double vals[] = new double[getOutputFormat().numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { double currentV = instance.value(i); if (!m_selectedCols.isInRange(i)) { vals[i] = currentV; } else { if (currentV == Utils.missingValue()) { vals[i] = currentV; } else { String currentS = instance.attribute(i).value((int) currentV); String replace = m_ignoreCase ? m_renameMap.get(currentS.toLowerCase()) : m_renameMap.get(currentS); if (replace == null) { vals[i] = currentV; } else { vals[i] = getOutputFormat().attribute(i).indexOfValue(replace); } } } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); } return true; }
/** * Constructs an instance suitable for passing to the model for scoring * * @param incoming the incoming instance * @return an instance with values mapped to be consistent with what the model is expecting */ protected Instance mapIncomingFieldsToModelFields(Instance incoming) { Instances modelHeader = m_model.getHeader(); double[] vals = new double[modelHeader.numAttributes()]; for (int i = 0; i < modelHeader.numAttributes(); i++) { if (m_attributeMap[i] < 0) { // missing or type mismatch vals[i] = Utils.missingValue(); continue; } Attribute modelAtt = modelHeader.attribute(i); Attribute incomingAtt = incoming.dataset().attribute(m_attributeMap[i]); if (incoming.isMissing(incomingAtt.index())) { vals[i] = Utils.missingValue(); continue; } if (modelAtt.isNumeric()) { vals[i] = incoming.value(m_attributeMap[i]); } else if (modelAtt.isNominal()) { String incomingVal = incoming.stringValue(m_attributeMap[i]); int modelIndex = modelAtt.indexOfValue(incomingVal); if (modelIndex < 0) { vals[i] = Utils.missingValue(); } else { vals[i] = modelIndex; } } else if (modelAtt.isString()) { vals[i] = 0; modelAtt.setStringValue(incoming.stringValue(m_attributeMap[i])); } } if (modelHeader.classIndex() >= 0) { // set class to missing value vals[modelHeader.classIndex()] = Utils.missingValue(); } Instance newInst = null; if (incoming instanceof SparseInstance) { newInst = new SparseInstance(incoming.weight(), vals); } else { newInst = new DenseInstance(incoming.weight(), vals); } newInst.setDataset(modelHeader); return newInst; }
/** * Convert a single instance over. The converted instance is added to the end of the output queue. * * @param instance the instance to convert */ private void convertInstance(Instance instance) { Instance inst = null; if (instance instanceof SparseInstance) { double[] newVals = new double[instance.numAttributes()]; int[] newIndices = new int[instance.numAttributes()]; double[] vals = instance.toDoubleArray(); int ind = 0; for (int j = 0; j < instance.numAttributes(); j++) { double value; if (instance.attribute(j).isNumeric() && (!Instance.isMissingValue(vals[j])) && (getInputFormat().classIndex() != j)) { value = vals[j] - m_Means[j]; if (value != 0.0) { newVals[ind] = value; newIndices[ind] = j; ind++; } } else { value = vals[j]; if (value != 0.0) { newVals[ind] = value; newIndices[ind] = j; ind++; } } } double[] tempVals = new double[ind]; int[] tempInd = new int[ind]; System.arraycopy(newVals, 0, tempVals, 0, ind); System.arraycopy(newIndices, 0, tempInd, 0, ind); inst = new SparseInstance(instance.weight(), tempVals, tempInd, instance.numAttributes()); } else { double[] vals = instance.toDoubleArray(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { if (instance.attribute(j).isNumeric() && (!Instance.isMissingValue(vals[j])) && (getInputFormat().classIndex() != j)) { vals[j] = (vals[j] - m_Means[j]); } } inst = new Instance(instance.weight(), vals); } inst.setDataset(instance.dataset()); push(inst); }
/** * Convert a single instance over if the class is nominal. The converted instance is added to the * end of the output queue. * * @param instance the instance to convert */ private void convertInstanceNominal(Instance instance) { if (!m_needToTransform) { push(instance); return; } double[] vals = new double[outputFormatPeek().numAttributes()]; int attSoFar = 0; for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if ((!att.isNominal()) || (j == getInputFormat().classIndex())) { vals[attSoFar] = instance.value(j); attSoFar++; } else { if ((att.numValues() <= 2) && (!m_TransformAll)) { vals[attSoFar] = instance.value(j); attSoFar++; } else { if (instance.isMissing(j)) { for (int k = 0; k < att.numValues(); k++) { vals[attSoFar + k] = instance.value(j); } } else { for (int k = 0; k < att.numValues(); k++) { if (k == (int) instance.value(j)) { vals[attSoFar + k] = 1; } else { vals[attSoFar + k] = 0; } } } attSoFar += att.numValues(); } } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); }
@Override public void addResult(Instance inst, double[] classVotes) { double weight = inst.weight(); int trueClass = (int) inst.classValue(); if (weight > 0.0) { if (TotalweightObserved == 0) { reset(inst.dataset().numClasses()); } this.TotalweightObserved += weight; this.weightObserved.add(weight); int predictedClass = Utils.maxIndex(classVotes); if (predictedClass == trueClass) { this.weightCorrect.add(weight); } else { this.weightCorrect.add(0); } // Add Kappa statistic information for (int i = 0; i < this.numClasses; i++) { this.rowKappa[i].add(i == predictedClass ? weight : 0); this.columnKappa[i].add(i == trueClass ? weight : 0); } if (this.lastSeenClass == trueClass) { this.weightCorrectNoChangeClassifier.add(weight); } else { this.weightCorrectNoChangeClassifier.add(0); } this.classAccuracy[trueClass].add(predictedClass == trueClass ? weight : 0.0); this.lastSeenClass = trueClass; } }
/** * Convert an input instance * * @param current the input instance to convert * @return a transformed instance * @throws Exception if a problem occurs */ protected Instance convertInstance(Instance current) throws Exception { double[] vals = new double[getOutputFormat().numAttributes()]; int index = 0; for (int j = 0; j < current.numAttributes(); j++) { if (j != current.classIndex()) { if (m_unchanged != null && m_unchanged.attribute(current.attribute(j).name()) != null) { vals[index++] = current.value(j); } else { Estimator[] estForAtt = m_estimatorLookup.get(current.attribute(j).name()); for (int k = 0; k < current.classAttribute().numValues(); k++) { if (current.isMissing(j)) { vals[index++] = Utils.missingValue(); } else { double e = estForAtt[k].getProbability(current.value(j)); vals[index++] = e; } } } } } vals[vals.length - 1] = current.classValue(); DenseInstance instNew = new DenseInstance(current.weight(), vals); return instNew; }
/** * Splits the given set of instances into subsets. * * @exception Exception if something goes wrong */ public final Instances[] split(Instances data) throws Exception { Instances[] instances = new Instances[m_numSubsets]; double[] weights; double newWeight; Instance instance; int subset, i, j; for (j = 0; j < m_numSubsets; j++) instances[j] = new Instances((Instances) data, data.numInstances()); for (i = 0; i < data.numInstances(); i++) { instance = ((Instances) data).instance(i); weights = weights(instance); subset = whichSubset(instance); if (subset > -1) instances[subset].add(instance); else for (j = 0; j < m_numSubsets; j++) if (Utils.gr(weights[j], 0)) { newWeight = weights[j] * instance.weight(); instances[j].add(instance); instances[j].lastInstance().setWeight(newWeight); } } for (j = 0; j < m_numSubsets; j++) instances[j].compactify(); return instances; }
/** * Find all the instances in the dataset covered/not covered by the rule in given index, and the * correponding simple statistics and predicted class distributions are stored in the given double * array, which can be obtained by getSimpleStats() and getDistributions().<br> * * @param index the given index, assuming correct * @param insts the dataset to be covered by the rule * @param stats the given double array to hold stats, side-effected * @param dist the given array to hold class distributions, side-effected if null, the * distribution is not necessary * @return the instances covered and not covered by the rule */ private Instances[] computeSimpleStats( int index, Instances insts, double[] stats, double[] dist) { Rule rule = (Rule) m_Ruleset.elementAt(index); Instances[] data = new Instances[2]; data[0] = new Instances(insts, insts.numInstances()); data[1] = new Instances(insts, insts.numInstances()); for (int i = 0; i < insts.numInstances(); i++) { Instance datum = insts.instance(i); double weight = datum.weight(); if (rule.covers(datum)) { data[0].add(datum); // Covered by this rule stats[0] += weight; // Coverage if ((int) datum.classValue() == (int) rule.getConsequent()) stats[2] += weight; // True positives else stats[4] += weight; // False positives if (dist != null) dist[(int) datum.classValue()] += weight; } else { data[1].add(datum); // Not covered by this rule stats[1] += weight; if ((int) datum.classValue() != (int) rule.getConsequent()) stats[3] += weight; // True negatives else stats[5] += weight; // False negatives } } return data; }
/** * Boosting method. Boosts any classifier that can handle weighted instances. * * @param data the training data to be used for generating the boosted classifier. * @throws Exception if the classifier could not be built successfully */ protected void buildClassifierWithWeights(Instances data) throws Exception { Instances trainData, training, trainingWeightsNotNormalized; int numInstances = data.numInstances(); Random randomInstance = new Random(m_Seed); double minLoss = Double.MAX_VALUE; // Create a copy of the data so that when the weights are diddled // with it doesn't mess up the weights for anyone else trainingWeightsNotNormalized = new Instances(data, 0, numInstances); // Do boostrap iterations for (m_NumIterationsPerformed = -1; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) { if (m_Debug) { System.err.println("Training classifier " + (m_NumIterationsPerformed + 1)); } training = new Instances(trainingWeightsNotNormalized); normalizeWeights(training, m_SumOfWeights); // Select instances to train the classifier on if (m_WeightThreshold < 100) { trainData = selectWeightQuantile(training, (double) m_WeightThreshold / 100); } else { trainData = new Instances(training, 0, numInstances); } // Build classifier if (m_NumIterationsPerformed == -1) { m_ZeroR = new weka.classifiers.rules.ZeroR(); m_ZeroR.buildClassifier(data); } else { if (m_Classifiers[m_NumIterationsPerformed] instanceof Randomizable) ((Randomizable) m_Classifiers[m_NumIterationsPerformed]) .setSeed(randomInstance.nextInt()); m_Classifiers[m_NumIterationsPerformed].buildClassifier(trainData); } // Update instance weights setWeights(trainingWeightsNotNormalized, m_NumIterationsPerformed); // Has progress been made? double loss = 0; for (Instance inst : trainingWeightsNotNormalized) { loss += Math.log(inst.weight()); } if (m_Debug) { System.err.println("Current loss on log scale: " + loss); } if ((m_NumIterationsPerformed > -1) && (loss > minLoss)) { if (m_Debug) { System.err.println("Loss has increased: bailing out."); } break; } minLoss = loss; } }
/** * Adds the prediction intervals as additional attributes at the end. Since classifiers can * returns varying number of intervals per instance, the dataset is filled with missing values for * non-existing intervals. */ protected void addPredictionIntervals() { int maxNum; int num; int i; int n; FastVector preds; FastVector atts; Instances data; Instance inst; Instance newInst; double[] values; double[][] predInt; // determine the maximum number of intervals maxNum = 0; preds = m_Evaluation.predictions(); for (i = 0; i < preds.size(); i++) { num = ((NumericPrediction) preds.elementAt(i)).predictionIntervals().length; if (num > maxNum) maxNum = num; } // create new header atts = new FastVector(); for (i = 0; i < m_PlotInstances.numAttributes(); i++) atts.addElement(m_PlotInstances.attribute(i)); for (i = 0; i < maxNum; i++) { atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-lowerBoundary")); atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-upperBoundary")); atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-width")); } data = new Instances(m_PlotInstances.relationName(), atts, m_PlotInstances.numInstances()); data.setClassIndex(m_PlotInstances.classIndex()); // update data for (i = 0; i < m_PlotInstances.numInstances(); i++) { inst = m_PlotInstances.instance(i); // copy old values values = new double[data.numAttributes()]; System.arraycopy(inst.toDoubleArray(), 0, values, 0, inst.numAttributes()); // add interval data predInt = ((NumericPrediction) preds.elementAt(i)).predictionIntervals(); for (n = 0; n < maxNum; n++) { if (n < predInt.length) { values[m_PlotInstances.numAttributes() + n * 3 + 0] = predInt[n][0]; values[m_PlotInstances.numAttributes() + n * 3 + 1] = predInt[n][1]; values[m_PlotInstances.numAttributes() + n * 3 + 2] = predInt[n][1] - predInt[n][0]; } else { values[m_PlotInstances.numAttributes() + n * 3 + 0] = Utils.missingValue(); values[m_PlotInstances.numAttributes() + n * 3 + 1] = Utils.missingValue(); values[m_PlotInstances.numAttributes() + n * 3 + 2] = Utils.missingValue(); } } // create new Instance newInst = new DenseInstance(inst.weight(), values); data.add(newInst); } m_PlotInstances = data; }
/** * Normalize the weights for the next iteration. * * @param training the training instances * @throws Exception if something goes wrong */ protected void normalizeWeights(Instances training, double oldSumOfWeights) throws Exception { // Renormalize weights double newSumOfWeights = training.sumOfWeights(); for (Instance instance : training) { instance.setWeight(instance.weight() * oldSumOfWeights / newSumOfWeights); } }
/** * Normalize the instance * * @param inst instance to be normalized * @return a new Instance with normalized values */ private Instance normalizeInstance(Instance inst) { double[] vals = inst.toDoubleArray(); double sum = Utils.sum(vals); for (int i = 0; i < vals.length; i++) { vals[i] /= sum; } return new DenseInstance(inst.weight(), vals); }
protected void updateClassifier(Instance instance, boolean updateDictionary) throws Exception { if (!instance.classIsMissing()) { int classIndex = (int) instance.classValue(); m_probOfClass[classIndex] += instance.weight(); tokenizeInstance(instance, updateDictionary); m_t++; } }
/** * Processes the given data (may change the provided dataset) and returns the modified version. * This method is called in batchFinished(). * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected Instances process(Instances instances) throws Exception { Instances result; int i; int n; double[] values; String value; Instance inst; Instance newInst; // we need the complete input data! if (!isFirstBatchDone()) setOutputFormat(determineOutputFormat(getInputFormat())); result = new Instances(getOutputFormat()); for (i = 0; i < instances.numInstances(); i++) { inst = instances.instance(i); values = inst.toDoubleArray(); for (n = 0; n < values.length; n++) { if (!m_Cols.isInRange(n) || !instances.attribute(n).isNumeric() || inst.isMissing(n)) continue; // get index of value if (instances.attribute(n).type() == Attribute.DATE) value = inst.stringValue(n); else value = Utils.doubleToString(inst.value(n), MAX_DECIMALS); values[n] = result.attribute(n).indexOfValue(value); } // generate new instance if (inst instanceof SparseInstance) newInst = new SparseInstance(inst.weight(), values); else newInst = new DenseInstance(inst.weight(), values); // copy possible string, relational values newInst.setDataset(getOutputFormat()); copyValues(newInst, false, inst.dataset(), getOutputFormat()); result.add(newInst); } return result; }
/** * Compare two datasets to see if they differ. * * @param data1 one set of instances * @param data2 the other set of instances * @throws Exception if the datasets differ */ protected void compareDatasets(Instances data1, Instances data2) throws Exception { if (data1.numAttributes() != data2.numAttributes()) throw new Exception("number of attributes has changed"); if (!(data2.numInstances() == data1.numInstances())) throw new Exception("number of instances has changed"); for (int i = 0; i < data2.numInstances(); i++) { Instance orig = data1.instance(i); Instance copy = data2.instance(i); for (int j = 0; j < orig.numAttributes(); j++) { if (orig.isMissing(j)) { if (!copy.isMissing(j)) throw new Exception("instances have changed"); } else if (!orig.toString(j).equals(copy.toString(j))) { throw new Exception("instances have changed"); } if (orig.weight() != copy.weight()) throw new Exception("instance weights have changed"); } } }
/** * Turn the list of nearest neighbors into a probability distribution. * * @param neighbours the list of nearest neighboring instances * @param distances the distances of the neighbors * @return the probability distribution * @throws Exception if computation goes wrong or has no class attribute */ protected double[] makeDistribution(Instances neighbours, double[] distances) throws Exception { double total = 0, weight; double[] distribution = new double[m_NumClasses]; // Set up a correction to the estimator if (m_ClassType == Attribute.NOMINAL) { for (int i = 0; i < m_NumClasses; i++) { distribution[i] = 1.0 / Math.max(1, m_Train.numInstances()); } total = (double) m_NumClasses / Math.max(1, m_Train.numInstances()); } for (int i = 0; i < neighbours.numInstances(); i++) { // Collect class counts Instance current = neighbours.instance(i); distances[i] = distances[i] * distances[i]; distances[i] = Math.sqrt(distances[i] / m_NumAttributesUsed); switch (m_DistanceWeighting) { case WEIGHT_INVERSE: weight = 1.0 / (distances[i] + 0.001); // to avoid div by zero break; case WEIGHT_SIMILARITY: weight = 1.0 - distances[i]; break; default: // WEIGHT_NONE: weight = 1.0; break; } weight *= current.weight(); try { switch (m_ClassType) { case Attribute.NOMINAL: distribution[(int) current.classValue()] += weight; break; case Attribute.NUMERIC: distribution[0] += current.classValue() * weight; break; } } catch (Exception ex) { throw new Error("Data has no class attribute!"); } total += weight; } // Normalise distribution if (total > 0) { Utils.normalize(distribution, total); } return distribution; }
/** * Creates a new instance the same as one instance (the "destination") but with some attribute * values copied from another instance (the "source") * * @param source the source instance * @param dest the destination instance * @return the new merged instance */ protected Instance mergeInstances(Instance source, Instance dest) { Instances outputFormat = outputFormatPeek(); double[] vals = new double[outputFormat.numAttributes()]; for (int i = 0; i < vals.length; i++) { if ((i != outputFormat.classIndex()) && (m_SelectedCols.isInRange(i))) { if (source != null) { vals[i] = source.value(i); } else { vals[i] = Instance.missingValue(); } } else { vals[i] = dest.value(i); } } Instance inst = null; if (dest instanceof SparseInstance) { inst = new SparseInstance(dest.weight(), vals); } else { inst = new Instance(dest.weight(), vals); } inst.setDataset(dest.dataset()); return inst; }
/** * processes the given instance (may change the provided instance) and returns the modified * version. * * @param instance the instance to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instance process(Instance instance) throws Exception { Instance result; double[] values; int i; values = new double[m_Indices.length]; for (i = 0; i < m_Indices.length; i++) values[i] = instance.value(m_Indices[i]); result = new DenseInstance(instance.weight(), values); result.setDataset(getOutputFormat()); copyValues(result, false, instance.dataset(), getOutputFormat()); result.setDataset(getOutputFormat()); return result; }
/** * Inserts an instance into the hash table * * @param inst instance to be inserted * @param instA to create the hash key from * @throws Exception if the instance can't be inserted */ private void insertIntoTable(Instance inst, double[] instA) throws Exception { double[] tempClassDist2; double[] newDist; DecisionTableHashKey thekey; if (instA != null) { thekey = new DecisionTableHashKey(instA); } else { thekey = new DecisionTableHashKey(inst, inst.numAttributes(), false); } // see if this one is already in the table tempClassDist2 = (double[]) m_entries.get(thekey); if (tempClassDist2 == null) { if (m_classIsNominal) { newDist = new double[m_theInstances.classAttribute().numValues()]; // Leplace estimation for (int i = 0; i < m_theInstances.classAttribute().numValues(); i++) { newDist[i] = 1.0; } newDist[(int) inst.classValue()] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } else { newDist = new double[2]; newDist[0] = inst.classValue() * inst.weight(); newDist[1] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } } else { // update the distribution for this instance if (m_classIsNominal) { tempClassDist2[(int) inst.classValue()] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } else { tempClassDist2[0] += (inst.classValue() * inst.weight()); tempClassDist2[1] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } } }
/** * processes the given instance (may change the provided instance) and returns the modified * version. * * @param instance the instance to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instance process(Instance instance) throws Exception { Instance result; Attribute att; double[] values; int i; // adjust indices values = new double[instance.numAttributes()]; for (i = 0; i < instance.numAttributes(); i++) { att = instance.attribute(i); if (!att.isNominal() || !m_AttributeIndices.isInRange(i) || instance.isMissing(i)) values[i] = instance.value(i); else values[i] = m_NewOrder[i][(int) instance.value(i)]; } // create new instance result = new DenseInstance(instance.weight(), values); return result; }
@Override public void updateNode(Instance inst) throws Exception { super.updateDistribution(inst); for (int i = 0; i < inst.numAttributes(); i++) { Attribute a = inst.attribute(i); if (i != inst.classIndex()) { ConditionalSufficientStats stats = m_nodeStats.get(a.name()); if (stats == null) { if (a.isNumeric()) { stats = new GaussianConditionalSufficientStats(); } else { stats = new NominalConditionalSufficientStats(); } m_nodeStats.put(a.name(), stats); } stats.update( inst.value(a), inst.classAttribute().value((int) inst.classValue()), inst.weight()); } } }
@Override public void trainOnInstanceImpl(Instance inst) { double lambda_d = 1.0; for (int i = 0; i < this.ensemble.length; i++) { double k = this.pureBoostOption.isSet() ? lambda_d : MiscUtils.poisson(lambda_d, this.classifierRandom); if (k > 0.0) { Instance weightedInst = (Instance) inst.copy(); weightedInst.setWeight(inst.weight() * k); this.ensemble[i].trainOnInstance(weightedInst); } if (this.ensemble[i].correctlyClassifies(inst)) { this.scms[i] += lambda_d; lambda_d *= this.trainingWeightSeenByModel / (2 * this.scms[i]); } else { this.swms[i] += lambda_d; lambda_d *= this.trainingWeightSeenByModel / (2 * this.swms[i]); } } }
/** * Sets the weights for the next iteration. * * @param training the training instances * @throws Exception if something goes wrong */ protected void setWeights(Instances training, int iteration) throws Exception { for (Instance instance : training) { double reweight = 1; double prob = 1, shrinkage = m_Shrinkage; if (iteration == -1) { prob = m_ZeroR.distributionForInstance(instance)[0]; shrinkage = 1.0; } else { prob = m_Classifiers[iteration].distributionForInstance(instance)[0]; // Make sure that probabilities are never 0 or 1 using ad-hoc smoothing prob = (m_SumOfWeights * prob + 1) / (m_SumOfWeights + 2); } if (instance.classValue() == 1) { reweight = shrinkage * 0.5 * (Math.log(prob) - Math.log(1 - prob)); } else { reweight = shrinkage * 0.5 * (Math.log(1 - prob) - Math.log(prob)); } instance.setWeight(instance.weight() * Math.exp(reweight)); } }
protected void tokenizeInstance(Instance instance, boolean updateDictionary) { if (m_inputVector == null) { m_inputVector = new LinkedHashMap<String, Count>(); } else { m_inputVector.clear(); } if (m_useStopList && m_stopwords == null) { m_stopwords = new Stopwords(); try { if (getStopwords().exists() && !getStopwords().isDirectory()) { m_stopwords.read(getStopwords()); } } catch (Exception ex) { ex.printStackTrace(); } } for (int i = 0; i < instance.numAttributes(); i++) { if (instance.attribute(i).isString() && !instance.isMissing(i)) { m_tokenizer.tokenize(instance.stringValue(i)); while (m_tokenizer.hasMoreElements()) { String word = m_tokenizer.nextElement(); if (m_lowercaseTokens) { word = word.toLowerCase(); } word = m_stemmer.stem(word); if (m_useStopList) { if (m_stopwords.is(word)) { continue; } } Count docCount = m_inputVector.get(word); if (docCount == null) { m_inputVector.put(word, new Count(instance.weight())); } else { docCount.m_count += instance.weight(); } } } } if (updateDictionary) { int classValue = (int) instance.classValue(); LinkedHashMap<String, Count> dictForClass = m_probOfWordGivenClass.get(classValue); // document normalization double iNorm = 0; double fv = 0; if (m_normalize) { for (Count c : m_inputVector.values()) { // word counts or bag-of-words? fv = (m_wordFrequencies) ? c.m_count : 1.0; iNorm += Math.pow(Math.abs(fv), m_lnorm); } iNorm = Math.pow(iNorm, 1.0 / m_lnorm); } for (Map.Entry<String, Count> feature : m_inputVector.entrySet()) { String word = feature.getKey(); double freq = (m_wordFrequencies) ? feature.getValue().m_count : 1.0; // double freq = (feature.getValue().m_count / iNorm * m_norm); if (m_normalize) { freq /= (iNorm * m_norm); } // check all classes for (int i = 0; i < m_data.numClasses(); i++) { LinkedHashMap<String, Count> dict = m_probOfWordGivenClass.get(i); if (dict.get(word) == null) { dict.put(word, new Count(m_leplace)); m_wordsPerClass[i] += m_leplace; } } Count dictCount = dictForClass.get(word); /* * if (dictCount == null) { dictForClass.put(word, new Count(m_leplace + * freq)); m_wordsPerClass[classValue] += (m_leplace + freq); } else { */ dictCount.m_count += freq; m_wordsPerClass[classValue] += freq; // } } pruneDictionary(); } }
/** * Generates the classifier. * * @param instances set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ public void buildClassifier(Instances instances) throws Exception { if (!m_weightByConfidence) { TINY = 0.0; } // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_ClassIndex = instances.classIndex(); m_NumClasses = instances.numClasses(); m_globalCounts = new double[m_NumClasses]; m_maxEntrop = Math.log(m_NumClasses) / Math.log(2); m_Instances = new Instances(instances, 0); // Copy the structure for ref m_intervalBounds = new double[instances.numAttributes()][2 + (2 * m_NumClasses)]; for (int j = 0; j < instances.numAttributes(); j++) { boolean alt = false; for (int i = 0; i < m_NumClasses * 2 + 2; i++) { if (i == 0) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; } else if (i == m_NumClasses * 2 + 1) { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; } else { if (alt) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; alt = false; } else { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; alt = true; } } } } // find upper and lower bounds for numeric attributes for (int j = 0; j < instances.numAttributes(); j++) { if (j != m_ClassIndex && instances.attribute(j).isNumeric()) { for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); if (!inst.isMissing(j)) { if (inst.value(j) < m_intervalBounds[j][((int) inst.classValue() * 2 + 1)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 1)] = inst.value(j); } if (inst.value(j) > m_intervalBounds[j][((int) inst.classValue() * 2 + 2)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 2)] = inst.value(j); } } } } } m_counts = new double[instances.numAttributes()][][]; // sort intervals for (int i = 0; i < instances.numAttributes(); i++) { if (instances.attribute(i).isNumeric()) { int[] sortedIntervals = Utils.sort(m_intervalBounds[i]); // remove any duplicate bounds int count = 1; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { count++; } } double[] reordered = new double[count]; count = 1; reordered[0] = m_intervalBounds[i][sortedIntervals[0]]; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { reordered[count] = m_intervalBounds[i][sortedIntervals[j]]; count++; } } m_intervalBounds[i] = reordered; m_counts[i] = new double[count][m_NumClasses]; } else if (i != m_ClassIndex) { // nominal attribute m_counts[i] = new double[instances.attribute(i).numValues()][m_NumClasses]; } } // collect class counts for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); m_globalCounts[(int) instances.instance(i).classValue()] += inst.weight(); for (int j = 0; j < instances.numAttributes(); j++) { if (!inst.isMissing(j) && j != m_ClassIndex) { if (instances.attribute(j).isNumeric()) { double val = inst.value(j); int k; for (k = m_intervalBounds[j].length - 1; k >= 0; k--) { if (val > m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += inst.weight(); break; } else if (val == m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += (inst.weight() / 2.0); m_counts[j][k - 1][(int) inst.classValue()] += (inst.weight() / 2.0); ; break; } } } else { // nominal attribute m_counts[j][(int) inst.value(j)][(int) inst.classValue()] += inst.weight(); ; } } } } }
/** * Generates the classifier. * * @param instances set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ public void buildClassifier(Instances instances) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_headerInfo = new Instances(instances, 0); m_numClasses = instances.numClasses(); m_numAttributes = instances.numAttributes(); m_probOfWordGivenClass = new double[m_numClasses][]; /* initialising the matrix of word counts NOTE: Laplace estimator introduced in case a word that does not appear for a class in the training set does so for the test set */ for (int c = 0; c < m_numClasses; c++) { m_probOfWordGivenClass[c] = new double[m_numAttributes]; for (int att = 0; att < m_numAttributes; att++) { m_probOfWordGivenClass[c][att] = 1; } } // enumerate through the instances Instance instance; int classIndex; double numOccurences; double[] docsPerClass = new double[m_numClasses]; double[] wordsPerClass = new double[m_numClasses]; java.util.Enumeration enumInsts = instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { instance = (Instance) enumInsts.nextElement(); classIndex = (int) instance.value(instance.classIndex()); docsPerClass[classIndex] += instance.weight(); for (int a = 0; a < instance.numValues(); a++) if (instance.index(a) != instance.classIndex()) { if (!instance.isMissing(a)) { numOccurences = instance.valueSparse(a) * instance.weight(); if (numOccurences < 0) throw new Exception("Numeric attribute values must all be greater or equal to zero."); wordsPerClass[classIndex] += numOccurences; m_probOfWordGivenClass[classIndex][instance.index(a)] += numOccurences; } } } /* normalising probOfWordGivenClass values and saving each value as the log of each value */ for (int c = 0; c < m_numClasses; c++) for (int v = 0; v < m_numAttributes; v++) m_probOfWordGivenClass[c][v] = Math.log(m_probOfWordGivenClass[c][v] / (wordsPerClass[c] + m_numAttributes - 1)); /* calculating Pr(H) NOTE: Laplace estimator introduced in case a class does not get mentioned in the set of training instances */ final double numDocs = instances.sumOfWeights() + m_numClasses; m_probOfClass = new double[m_numClasses]; for (int h = 0; h < m_numClasses; h++) m_probOfClass[h] = (double) (docsPerClass[h] + 1) / numDocs; }