/** * Input an instance for filtering. Ordinarily the instance is processed and made available for * output immediately. Some filters require all instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @exception IllegalStateException if no input format has been defined. * @exception Exception if there was a problem during the filtering. */ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } double[] vals = new double[instance.numAttributes() + 1]; for (int i = 0; i < instance.numAttributes(); i++) { if (instance.isMissing(i)) { vals[i] = Instance.missingValue(); } else { vals[i] = instance.value(i); } } evaluateExpression(vals); Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new Instance(instance.weight(), vals); } copyStringValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); return true; }
@Override public double[] recommendRanking(Instance inst) throws Exception { double[] features = new double[inst.numAttributes() - 1]; double[] ranking; for (int i = 0; i < inst.numAttributes() - 1; i++) { features[i] = inst.value(i); } ranking = m_LRT.lrTr.getLabelRanking(features); return ranking; }
/** * Convert a single instance over. The converted instance is added to the end of the output queue. * * @param instance the instance to convert */ private void convertInstance(Instance instance) { Instance inst = null; if (instance instanceof SparseInstance) { double[] newVals = new double[instance.numAttributes()]; int[] newIndices = new int[instance.numAttributes()]; double[] vals = instance.toDoubleArray(); int ind = 0; for (int j = 0; j < instance.numAttributes(); j++) { double value; if (instance.attribute(j).isNumeric() && (!Instance.isMissingValue(vals[j])) && (getInputFormat().classIndex() != j)) { value = vals[j] - m_Means[j]; if (value != 0.0) { newVals[ind] = value; newIndices[ind] = j; ind++; } } else { value = vals[j]; if (value != 0.0) { newVals[ind] = value; newIndices[ind] = j; ind++; } } } double[] tempVals = new double[ind]; int[] tempInd = new int[ind]; System.arraycopy(newVals, 0, tempVals, 0, ind); System.arraycopy(newIndices, 0, tempInd, 0, ind); inst = new SparseInstance(instance.weight(), tempVals, tempInd, instance.numAttributes()); } else { double[] vals = instance.toDoubleArray(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { if (instance.attribute(j).isNumeric() && (!Instance.isMissingValue(vals[j])) && (getInputFormat().classIndex() != j)) { vals[j] = (vals[j] - m_Means[j]); } } inst = new Instance(instance.weight(), vals); } inst.setDataset(instance.dataset()); push(inst); }
public double[] normalizedInstance(Instance inst) { // Normalize Instance double[] normalizedInstance = new double[inst.numAttributes()]; for (int j = 0; j < inst.numAttributes() - 1; j++) { int instAttIndex = modelAttIndexToInstanceAttIndex(j, inst); double mean = perceptronattributeStatistics.getValue(j) / perceptronYSeen; double sd = computeSD( squaredperceptronattributeStatistics.getValue(j), perceptronattributeStatistics.getValue(j), perceptronYSeen); if (sd > SD_THRESHOLD) normalizedInstance[j] = (inst.value(instAttIndex) - mean) / sd; else normalizedInstance[j] = inst.value(instAttIndex) - mean; } return normalizedInstance; }
/** * Compare two datasets to see if they differ. * * @param data1 one set of instances * @param data2 the other set of instances * @throws Exception if the datasets differ */ protected void compareDatasets(Instances data1, Instances data2) throws Exception { if (m_CheckHeader) { if (!data2.equalHeaders(data1)) { throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1)); } } if (!(data2.numInstances() == data1.numInstances())) { throw new Exception("number of instances has changed"); } for (int i = 0; i < data2.numInstances(); i++) { Instance orig = data1.instance(i); Instance copy = data2.instance(i); for (int j = 0; j < orig.numAttributes(); j++) { if (orig.isMissing(j)) { if (!copy.isMissing(j)) { throw new Exception("instances have changed"); } } else { if (m_CompareValuesAsString) { if (!orig.toString(j).equals(copy.toString(j))) { throw new Exception("instances have changed"); } } else { if (Math.abs(orig.value(j) - copy.value(j)) > m_MaxDiffValues) { throw new Exception("instances have changed"); } } } if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) { throw new Exception("instance weights have changed"); } } } }
@Override public void buildClassifier(Instances data) throws Exception { trainingData = data; Attribute classAttribute = data.classAttribute(); prototypes = new ArrayList<>(); classedData = new HashMap<String, ArrayList<Sequence>>(); indexClassedDataInFullData = new HashMap<String, ArrayList<Integer>>(); for (int c = 0; c < data.numClasses(); c++) { classedData.put(data.classAttribute().value(c), new ArrayList<Sequence>()); indexClassedDataInFullData.put(data.classAttribute().value(c), new ArrayList<Integer>()); } sequences = new Sequence[data.numInstances()]; classMap = new String[sequences.length]; for (int i = 0; i < sequences.length; i++) { Instance sample = data.instance(i); MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1]; int shift = (sample.classIndex() == 0) ? 1 : 0; for (int t = 0; t < sequence.length; t++) { sequence[t] = new MonoDoubleItemSet(sample.value(t + shift)); } sequences[i] = new Sequence(sequence); String clas = sample.stringValue(classAttribute); classMap[i] = clas; classedData.get(clas).add(sequences[i]); indexClassedDataInFullData.get(clas).add(i); // System.out.println("Element "+i+" of train is classed "+clas+" and went to element // "+(indexClassedDataInFullData.get(clas).size()-1)); } buildSpecificClassifier(data); }
/** * Convert an input instance * * @param current the input instance to convert * @return a transformed instance * @throws Exception if a problem occurs */ protected Instance convertInstance(Instance current) throws Exception { double[] vals = new double[getOutputFormat().numAttributes()]; int index = 0; for (int j = 0; j < current.numAttributes(); j++) { if (j != current.classIndex()) { if (m_unchanged != null && m_unchanged.attribute(current.attribute(j).name()) != null) { vals[index++] = current.value(j); } else { Estimator[] estForAtt = m_estimatorLookup.get(current.attribute(j).name()); for (int k = 0; k < current.classAttribute().numValues(); k++) { if (current.isMissing(j)) { vals[index++] = Utils.missingValue(); } else { double e = estForAtt[k].getProbability(current.value(j)); vals[index++] = e; } } } } } vals[vals.length - 1] = current.classValue(); DenseInstance instNew = new DenseInstance(current.weight(), vals); return instNew; }
/** * Adds the prediction intervals as additional attributes at the end. Since classifiers can * returns varying number of intervals per instance, the dataset is filled with missing values for * non-existing intervals. */ protected void addPredictionIntervals() { int maxNum; int num; int i; int n; FastVector preds; FastVector atts; Instances data; Instance inst; Instance newInst; double[] values; double[][] predInt; // determine the maximum number of intervals maxNum = 0; preds = m_Evaluation.predictions(); for (i = 0; i < preds.size(); i++) { num = ((NumericPrediction) preds.elementAt(i)).predictionIntervals().length; if (num > maxNum) maxNum = num; } // create new header atts = new FastVector(); for (i = 0; i < m_PlotInstances.numAttributes(); i++) atts.addElement(m_PlotInstances.attribute(i)); for (i = 0; i < maxNum; i++) { atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-lowerBoundary")); atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-upperBoundary")); atts.addElement(new Attribute("predictionInterval_" + (i + 1) + "-width")); } data = new Instances(m_PlotInstances.relationName(), atts, m_PlotInstances.numInstances()); data.setClassIndex(m_PlotInstances.classIndex()); // update data for (i = 0; i < m_PlotInstances.numInstances(); i++) { inst = m_PlotInstances.instance(i); // copy old values values = new double[data.numAttributes()]; System.arraycopy(inst.toDoubleArray(), 0, values, 0, inst.numAttributes()); // add interval data predInt = ((NumericPrediction) preds.elementAt(i)).predictionIntervals(); for (n = 0; n < maxNum; n++) { if (n < predInt.length) { values[m_PlotInstances.numAttributes() + n * 3 + 0] = predInt[n][0]; values[m_PlotInstances.numAttributes() + n * 3 + 1] = predInt[n][1]; values[m_PlotInstances.numAttributes() + n * 3 + 2] = predInt[n][1] - predInt[n][0]; } else { values[m_PlotInstances.numAttributes() + n * 3 + 0] = Utils.missingValue(); values[m_PlotInstances.numAttributes() + n * 3 + 1] = Utils.missingValue(); values[m_PlotInstances.numAttributes() + n * 3 + 2] = Utils.missingValue(); } } // create new Instance newInst = new DenseInstance(inst.weight(), values); data.add(newInst); } m_PlotInstances = data; }
/** * @param outFilePath full path to the output file * @param data the instances object containing the data on which the quantizer is learner * @param numClusters the number of clusters in k-means * @param maxIterations the maximum number of k-means iterations * @param seed the seed given to k-means * @param numSlots the number of execution slots to use (>1 = parallel execution) * @param kMeansPlusPlus whether to use kmeans++ for the initialization of the centroids * (true/false) * @throws Exception */ public static void learnAndWriteQuantizer( String outFilePath, Instances data, int numClusters, int maxIterations, int seed, int numSlots, boolean kMeansPlusPlus) throws Exception { System.out.println("--" + data.numInstances() + " vectors loaded--"); System.out.println("Vector dimensionality: " + data.numAttributes()); System.out.println("Clustering settings:"); System.out.println("Num clusters: " + numClusters); System.out.println("Max iterations: " + maxIterations); System.out.println("Seed: " + seed); System.out.println("Clustering started"); long start = System.currentTimeMillis(); // create a new instance for the Clusterer SimpleKMeansWithOutput clusterer = new SimpleKMeansWithOutput(); clusterer.setInitializeUsingKMeansPlusPlusMethod(kMeansPlusPlus); clusterer.setSeed(seed); clusterer.setNumClusters(numClusters); clusterer.setMaxIterations(maxIterations); clusterer.setNumExecutionSlots(numSlots); clusterer.setFastDistanceCalc(false); // build the clusterer clusterer.buildClusterer(data); long end = System.currentTimeMillis(); System.out.println("Clustering completed in " + (end - start) + " ms"); System.out.println("Writing quantizer in file"); // create a new file to store the codebook BufferedWriter out = new BufferedWriter(new FileWriter(new File(outFilePath))); // write the results of the clustering to the new file (csv formated) Instances clusterCentroids = clusterer.getClusterCentroids(); for (int j = 0; j < clusterCentroids.numInstances(); j++) { Instance centroid = clusterCentroids.instance(j); for (int k = 0; k < centroid.numAttributes() - 1; k++) { out.write(centroid.value(k) + ","); } out.write(centroid.value(centroid.numAttributes() - 1) + "\n"); } out.close(); }
public double updateWeights(Instance inst, double learningRatio) { // Normalize Instance double[] normalizedInstance = normalizedInstance(inst); // Compute the Normalized Prediction of Perceptron double normalizedPredict = prediction(normalizedInstance); double normalizedY = normalizeActualClassValue(inst); double sumWeights = 0.0; double delta = normalizedY - normalizedPredict; for (int j = 0; j < inst.numAttributes() - 1; j++) { int instAttIndex = modelAttIndexToInstanceAttIndex(j, inst); if (inst.attribute(instAttIndex).isNumeric()) { this.weightAttribute[j] += learningRatio * delta * normalizedInstance[j]; sumWeights += Math.abs(this.weightAttribute[j]); } } this.weightAttribute[inst.numAttributes() - 1] += learningRatio * delta; sumWeights += Math.abs(this.weightAttribute[inst.numAttributes() - 1]); if (sumWeights > inst.numAttributes()) { // Lasso regression for (int j = 0; j < inst.numAttributes() - 1; j++) { int instAttIndex = modelAttIndexToInstanceAttIndex(j, inst); if (inst.attribute(instAttIndex).isNumeric()) { this.weightAttribute[j] = this.weightAttribute[j] / sumWeights; } } this.weightAttribute[inst.numAttributes() - 1] = this.weightAttribute[inst.numAttributes() - 1] / sumWeights; } return denormalizedPrediction(normalizedPredict); }
/** * Input an instance for filtering. Ordinarily the instance is processed and made available for * output immediately. Some filters require all instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input structure has been defined. */ @Override public boolean input(Instance instance) { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (getOutputFormat().numAttributes() == 0) { return false; } if (m_selectedAttributes.length == 0) { push(instance); } else { double vals[] = new double[getOutputFormat().numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { double currentV = instance.value(i); if (!m_selectedCols.isInRange(i)) { vals[i] = currentV; } else { if (currentV == Utils.missingValue()) { vals[i] = currentV; } else { String currentS = instance.attribute(i).value((int) currentV); String replace = m_ignoreCase ? m_renameMap.get(currentS.toLowerCase()) : m_renameMap.get(currentS); if (replace == null) { vals[i] = currentV; } else { vals[i] = getOutputFormat().attribute(i).indexOfValue(replace); } } } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); } return true; }
/** * processes the given instance (may change the provided instance) and returns the modified * version. * * @param instance the instance to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instance process(Instance instance) throws Exception { Instance result; Attribute att; double[] values; int i; // adjust indices values = new double[instance.numAttributes()]; for (i = 0; i < instance.numAttributes(); i++) { att = instance.attribute(i); if (!att.isNominal() || !m_AttributeIndices.isInRange(i) || instance.isMissing(i)) values[i] = instance.value(i); else values[i] = m_NewOrder[i][(int) instance.value(i)]; } // create new instance result = new DenseInstance(instance.weight(), values); return result; }
/** * Inserts an instance into the hash table * * @param inst instance to be inserted * @param instA to create the hash key from * @throws Exception if the instance can't be inserted */ private void insertIntoTable(Instance inst, double[] instA) throws Exception { double[] tempClassDist2; double[] newDist; DecisionTableHashKey thekey; if (instA != null) { thekey = new DecisionTableHashKey(instA); } else { thekey = new DecisionTableHashKey(inst, inst.numAttributes(), false); } // see if this one is already in the table tempClassDist2 = (double[]) m_entries.get(thekey); if (tempClassDist2 == null) { if (m_classIsNominal) { newDist = new double[m_theInstances.classAttribute().numValues()]; // Leplace estimation for (int i = 0; i < m_theInstances.classAttribute().numValues(); i++) { newDist[i] = 1.0; } newDist[(int) inst.classValue()] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } else { newDist = new double[2]; newDist[0] = inst.classValue() * inst.weight(); newDist[1] = inst.weight(); // add to the table m_entries.put(thekey, newDist); } } else { // update the distribution for this instance if (m_classIsNominal) { tempClassDist2[(int) inst.classValue()] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } else { tempClassDist2[0] += (inst.classValue() * inst.weight()); tempClassDist2[1] += inst.weight(); // update the table m_entries.put(thekey, tempClassDist2); } } }
/** Update the model using the provided instance */ public void trainOnInstanceImpl(Instance inst) { accumulatedError = Math.abs(this.prediction(inst) - inst.classValue()) + fadingFactor * accumulatedError; nError = 1 + fadingFactor * nError; // Initialise Perceptron if necessary if (this.initialisePerceptron == true) { this.fadingFactor = this.fadingFactorOption.getValue(); this.classifierRandom.setSeed(randomSeedOption.getValue()); this.initialisePerceptron = false; // not in resetLearningImpl() because it needs Instance! this.weightAttribute = new double[inst.numAttributes()]; for (int j = 0; j < inst.numAttributes(); j++) { weightAttribute[j] = 2 * this.classifierRandom.nextDouble() - 1; } // Update Learning Rate learningRatio = learningRatioOption.getValue(); this.learningRateDecay = learningRateDecayOption.getValue(); } // Update attribute statistics this.perceptronInstancesSeen++; this.perceptronYSeen++; for (int j = 0; j < inst.numAttributes() - 1; j++) { perceptronattributeStatistics.addToValue(j, inst.value(j)); squaredperceptronattributeStatistics.addToValue(j, inst.value(j) * inst.value(j)); } this.perceptronsumY += inst.classValue(); this.squaredperceptronsumY += inst.classValue() * inst.classValue(); if (constantLearningRatioDecayOption.isSet() == false) { learningRatio = learningRatioOption.getValue() / (1 + perceptronInstancesSeen * learningRateDecay); } // double prediction = this.updateWeights(inst,learningRatio); // accumulatedError= Math.abs(prediction-inst.classValue()) + fadingFactor*accumulatedError; this.updateWeights(inst, learningRatio); }
/** * Checks if an instance contains an item set. * * @param instance the instance to be tested * @return true if the given instance contains this item set */ public boolean containedByTreatZeroAsMissing(Instance instance) { if (instance instanceof weka.core.SparseInstance) { int numInstVals = instance.numValues(); int numItemSetVals = m_items.length; for (int p1 = 0, p2 = 0; p1 < numInstVals || p2 < numItemSetVals; ) { int instIndex = Integer.MAX_VALUE; if (p1 < numInstVals) { instIndex = instance.index(p1); } int itemIndex = p2; if (m_items[itemIndex] > -1) { if (itemIndex != instIndex) { return false; } else { if (instance.isMissingSparse(p1)) { return false; } if (m_items[itemIndex] != (int) instance.valueSparse(p1)) { return false; } } p1++; p2++; } else { if (itemIndex < instIndex) { p2++; } else if (itemIndex == instIndex) { p2++; p1++; } } } } else { for (int i = 0; i < instance.numAttributes(); i++) { if (m_items[i] > -1) { if (instance.isMissing(i) || (int) instance.value(i) == 0) { return false; } if (m_items[i] != (int) instance.value(i)) { return false; } } } } return true; }
/** * Checks if an instance contains an item set. * * @param instance the instance to be tested * @return true if the given instance contains this item set */ public boolean containedBy(Instance instance) { for (int i = 0; i < instance.numAttributes(); i++) { if (m_items[i] > -1) { if (instance.isMissing(i)) { return false; } if (m_items[i] != (int) instance.value(i)) { return false; } } } return true; }
/** * turns the instance into a libsvm row * * @param inst the instance to transform * @return the generated libsvm row */ protected String instanceToLibsvm(Instance inst) { StringBuffer result; int i; // class result = new StringBuffer("" + inst.classValue()); // attributes for (i = 0; i < inst.numAttributes(); i++) { if (i == inst.classIndex()) continue; if (inst.value(i) == 0) continue; result.append(" " + (i + 1) + ":" + inst.value(i)); } return result.toString(); }
/** * Calculate the dependent value for a given instance for a given regression model. * * @param transformedInstance the input instance * @param selectedAttributes an array of flags indicating which attributes are included in the * regression model * @param coefficients an array of coefficients for the regression model * @return the regression value for the instance. * @throws Exception if the class attribute of the input instance is not assigned */ private double regressionPrediction( Instance transformedInstance, boolean[] selectedAttributes, double[] coefficients) throws Exception { double result = 0; int column = 0; for (int j = 0; j < transformedInstance.numAttributes(); j++) { if ((m_ClassIndex != j) && (selectedAttributes[j])) { result += coefficients[column] * transformedInstance.value(j); column++; } } result += coefficients[column]; return result; }
/** * Calculates the class membership probabilities for the given test instance. * * @param instance the instance to be classified * @return predicted class probability distribution * @throws Exception if distribution can't be computed */ public double[] distributionForInstance(Instance instance) throws Exception { DecisionTableHashKey thekey; double[] tempDist; double[] normDist; m_disTransform.input(instance); m_disTransform.batchFinished(); instance = m_disTransform.output(); m_delTransform.input(instance); m_delTransform.batchFinished(); instance = m_delTransform.output(); thekey = new DecisionTableHashKey(instance, instance.numAttributes(), false); // if this one is not in the table if ((tempDist = (double[]) m_entries.get(thekey)) == null) { if (m_useIBk) { tempDist = m_ibk.distributionForInstance(instance); } else { if (!m_classIsNominal) { tempDist = new double[1]; tempDist[0] = m_majority; } else { tempDist = m_classPriors.clone(); /*tempDist = new double [m_theInstances.classAttribute().numValues()]; tempDist[(int)m_majority] = 1.0; */ } } } else { if (!m_classIsNominal) { normDist = new double[1]; normDist[0] = (tempDist[0] / tempDist[1]); tempDist = normDist; } else { // normalise distribution normDist = new double[tempDist.length]; System.arraycopy(tempDist, 0, normDist, 0, tempDist.length); Utils.normalize(normDist); tempDist = normDist; } } return tempDist; }
/** * Updates the minimum and maximum values for all the attributes based on a new instance. * * @param instance the new instance */ private void updateMinMax(Instance instance) { for (int j = 0; j < instance.numAttributes(); j++) { if (Double.isNaN(m_Min[j])) { m_Min[j] = instance.value(j); m_Max[j] = instance.value(j); } else { if (instance.value(j) < m_Min[j]) { m_Min[j] = instance.value(j); } else { if (instance.value(j) > m_Max[j]) { m_Max[j] = instance.value(j); } } } } }
@Override public void trainOnInstanceImpl(Instance inst) { if (this.initialized == false) { this.dimension = inst.numAttributes(); manager = new BucketManager(this.length, this.dimension, this.coresetsize, this.clustererRandom); this.initialized = true; } manager.insertPoint(new Point(inst, this.numberInstances)); this.numberInstances++; if (this.numberInstances % widthOption.getValue() == 0) { Point[] streamingCoreset = manager.getCoresetFromManager(dimension); // compute 5 clusterings of the coreset with kMeans++ and take the best double minCost = 0.0; double curCost = 0.0; minCost = lloydPlusPlus( numberOfCentres, coresetsize, dimension, streamingCoreset, centresStreamingCoreset); curCost = minCost; for (int i = 1; i < 5; i++) { Point[] tmpCentresStreamingCoreset = new Point[0]; curCost = lloydPlusPlus( numberOfCentres, coresetsize, dimension, streamingCoreset, tmpCentresStreamingCoreset); if (curCost < minCost) { minCost = curCost; centresStreamingCoreset = tmpCentresStreamingCoreset; } } } }
public double classifyInstance(Instance sample) throws Exception { // transform instance to sequence MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1]; int shift = (sample.classIndex() == 0) ? 1 : 0; for (int t = 0; t < sequence.length; t++) { sequence[t] = new MonoDoubleItemSet(sample.value(t + shift)); } Sequence seq = new Sequence(sequence); double minD = Double.MAX_VALUE; String classValue = null; for (ClassedSequence s : prototypes) { double tmpD = seq.distance(s.sequence); if (tmpD < minD) { minD = tmpD; classValue = s.classValue; } } // System.out.println(prototypes.size()); return sample.classAttribute().indexOfValue(classValue); }
/** * test on one sample * * @param sample * @return p(y|sample) forall y * @throws Exception */ public double classifyInstance(Instance sample) throws Exception { // transform instance to sequence MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1]; int shift = (sample.classIndex() == 0) ? 1 : 0; for (int t = 0; t < sequence.length; t++) { sequence[t] = new MonoDoubleItemSet(sample.value(t + shift)); } Sequence seq = new Sequence(sequence); // for each class String classValue = null; double maxProb = 0.0; double[] pr = new double[classedData.keySet().size()]; for (String clas : classedData.keySet()) { int c = trainingData.classAttribute().indexOfValue(clas); double prob = 0.0; for (int k = 0; k < centroidsPerClass[c].length; k++) { // compute P(Q|k_c) if (sigmasPerClass[c][k] == Double.NaN || sigmasPerClass[c][k] == 0) { System.err.println("sigma=NAN||sigma=0"); continue; } double dist = seq.distanceEuc(centroidsPerClass[c][k]); double p = computeProbaForQueryAndCluster(sigmasPerClass[c][k], dist); prob += p / centroidsPerClass[c].length; // prob += p*prior[c][k]; if (p > maxProb) { maxProb = p; classValue = clas; } } // if (prob > maxProb) { // maxProb = prob; // classValue = clas; // } } // System.out.println(Arrays.toString(pr)); // System.out.println(classValue); return sample.classAttribute().indexOfValue(classValue); }
/** * Compare two datasets to see if they differ. * * @param data1 one set of instances * @param data2 the other set of instances * @throws Exception if the datasets differ */ protected void compareDatasets(Instances data1, Instances data2) throws Exception { if (data1.numAttributes() != data2.numAttributes()) throw new Exception("number of attributes has changed"); if (!(data2.numInstances() == data1.numInstances())) throw new Exception("number of instances has changed"); for (int i = 0; i < data2.numInstances(); i++) { Instance orig = data1.instance(i); Instance copy = data2.instance(i); for (int j = 0; j < orig.numAttributes(); j++) { if (orig.isMissing(j)) { if (!copy.isMissing(j)) throw new Exception("instances have changed"); } else if (!orig.toString(j).equals(copy.toString(j))) { throw new Exception("instances have changed"); } if (orig.weight() != copy.weight()) throw new Exception("instance weights have changed"); } } }
@Override public void updateNode(Instance inst) throws Exception { super.updateDistribution(inst); for (int i = 0; i < inst.numAttributes(); i++) { Attribute a = inst.attribute(i); if (i != inst.classIndex()) { ConditionalSufficientStats stats = m_nodeStats.get(a.name()); if (stats == null) { if (a.isNumeric()) { stats = new GaussianConditionalSufficientStats(); } else { stats = new NominalConditionalSufficientStats(); } m_nodeStats.put(a.name(), stats); } stats.update( inst.value(a), inst.classAttribute().value((int) inst.classValue()), inst.weight()); } } }
public Map<FV, Collection<FV>> extractValuesFromData(Instances inst) { Multimap<FV, FV> fv_list = ArrayListMultimap.create(); // Instances outFormat = getOutputFormat(); for (int i = 0; i < inst.numInstances(); i++) { Instance ins = inst.instance(i); // Skip the class label for (int x = 0; x < ins.numAttributes() - 1; x++) { Object value = null; try { value = ins.stringValue(x); } catch (Exception e) { value = ins.value(x); } FV fv = new FV(x, value, ins.classValue()); fv.setNumLabels(inst.numClasses()); if (!fv_list.put(fv, fv)) { System.err.println("Couldn't put duplicates: " + fv); } } } Map<FV, Collection<FV>> original_map = fv_list.asMap(); return original_map; }
/** * Input an instance for filtering. The instance is processed and made available for output * immediately. * * @param instance the input instance. * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input structure has been defined. */ public boolean input(Instance instance) { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isOutputFormatDefined()) { Instance newInstance = (Instance) instance.copy(); // make sure that we get the right indexes set for the converted // string attributes when operating on a second batch of instances for (int i = 0; i < newInstance.numAttributes(); i++) { if (newInstance.attribute(i).isString() && !newInstance.isMissing(i) && m_AttIndices.isInRange(i)) { Attribute outAtt = getOutputFormat().attribute(newInstance.attribute(i).name()); String inVal = newInstance.stringValue(i); int outIndex = outAtt.indexOfValue(inVal); if (outIndex < 0) { newInstance.setMissing(i); } else { newInstance.setValue(i, outIndex); } } } push(newInstance); return true; } bufferInput(instance); return false; }
protected void tokenizeInstance(Instance instance, boolean updateDictionary) { if (m_inputVector == null) { m_inputVector = new LinkedHashMap<String, Count>(); } else { m_inputVector.clear(); } if (m_useStopList && m_stopwords == null) { m_stopwords = new Stopwords(); try { if (getStopwords().exists() && !getStopwords().isDirectory()) { m_stopwords.read(getStopwords()); } } catch (Exception ex) { ex.printStackTrace(); } } for (int i = 0; i < instance.numAttributes(); i++) { if (instance.attribute(i).isString() && !instance.isMissing(i)) { m_tokenizer.tokenize(instance.stringValue(i)); while (m_tokenizer.hasMoreElements()) { String word = m_tokenizer.nextElement(); if (m_lowercaseTokens) { word = word.toLowerCase(); } word = m_stemmer.stem(word); if (m_useStopList) { if (m_stopwords.is(word)) { continue; } } Count docCount = m_inputVector.get(word); if (docCount == null) { m_inputVector.put(word, new Count(instance.weight())); } else { docCount.m_count += instance.weight(); } } } } if (updateDictionary) { int classValue = (int) instance.classValue(); LinkedHashMap<String, Count> dictForClass = m_probOfWordGivenClass.get(classValue); // document normalization double iNorm = 0; double fv = 0; if (m_normalize) { for (Count c : m_inputVector.values()) { // word counts or bag-of-words? fv = (m_wordFrequencies) ? c.m_count : 1.0; iNorm += Math.pow(Math.abs(fv), m_lnorm); } iNorm = Math.pow(iNorm, 1.0 / m_lnorm); } for (Map.Entry<String, Count> feature : m_inputVector.entrySet()) { String word = feature.getKey(); double freq = (m_wordFrequencies) ? feature.getValue().m_count : 1.0; // double freq = (feature.getValue().m_count / iNorm * m_norm); if (m_normalize) { freq /= (iNorm * m_norm); } // check all classes for (int i = 0; i < m_data.numClasses(); i++) { LinkedHashMap<String, Count> dict = m_probOfWordGivenClass.get(i); if (dict.get(word) == null) { dict.put(word, new Count(m_leplace)); m_wordsPerClass[i] += m_leplace; } } Count dictCount = dictForClass.get(word); /* * if (dictCount == null) { dictForClass.put(word, new Count(m_leplace + * freq)); m_wordsPerClass[classValue] += (m_leplace + freq); } else { */ dictCount.m_count += freq; m_wordsPerClass[classValue] += freq; // } } pruneDictionary(); } }
/** * Saves an instances incrementally. Structure has to be set by using the setStructure() method or * setInstances() method. * * @param inst the instance to save * @throws IOException throws IOEXception if an instance cannot be saved incrementally. */ public void writeIncremental(Instance inst) throws IOException { int writeMode = getWriteMode(); Instances structure = getInstances(); PrintWriter outW = null; if (structure != null) { if (structure.classIndex() == -1) { structure.setClassIndex(structure.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (structure.attribute(structure.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); } if (getRetrieval() == BATCH || getRetrieval() == NONE) throw new IOException("Batch and incremental saving cannot be mixed."); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option."); } outW = new PrintWriter(getWriter()); if (writeMode == WAIT) { if (structure == null) { setWriteMode(CANCEL); if (inst != null) System.err.println("Structure(Header Information) has to be set in advance"); } else setWriteMode(STRUCTURE_READY); writeMode = getWriteMode(); } if (writeMode == CANCEL) { if (outW != null) outW.close(); cancel(); } if (writeMode == STRUCTURE_READY) { setWriteMode(WRITE); // write header: here names file for (int i = 0; i < structure.attribute(structure.classIndex()).numValues(); i++) { outW.write(structure.attribute(structure.classIndex()).value(i)); if (i < structure.attribute(structure.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < structure.numAttributes(); i++) { if (i != structure.classIndex()) { outW.write(structure.attribute(i).name() + ": "); if (structure.attribute(i).isNumeric() || structure.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = structure.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); writeMode = getWriteMode(); String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException("Cannot create data file, only names file created."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); } if (writeMode == WRITE) { if (structure == null) throw new IOException("No instances information available."); if (inst != null) { // write instance: here data file for (int j = 0; j < inst.numAttributes(); j++) { if (j != structure.classIndex()) { if (inst.isMissing(j)) { outW.write("?,"); } else if (structure.attribute(j).isNominal() || structure.attribute(j).isString()) { outW.write(structure.attribute(j).value((int) inst.value(j)) + ","); } else { outW.write("" + inst.value(j) + ","); } } } // write the class value if (inst.isMissing(structure.classIndex())) { outW.write("?"); } else { outW.write( structure .attribute(structure.classIndex()) .value((int) inst.value(structure.classIndex()))); } outW.write("\n"); // flushes every 100 instances m_incrementalCounter++; if (m_incrementalCounter > 100) { m_incrementalCounter = 0; outW.flush(); } } else { // close if (outW != null) { outW.flush(); outW.close(); } setFileExtension(".names"); m_incrementalCounter = 0; resetStructure(); outW = null; resetWriter(); } } }
/** * Writes a Batch of instances * * @throws IOException throws IOException if saving in batch mode is not possible */ public void writeBatch() throws IOException { Instances instances = getInstances(); if (instances == null) throw new IOException("No instances to save"); if (instances.classIndex() == -1) { instances.setClassIndex(instances.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (instances.attribute(instances.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); if (getRetrieval() == INCREMENTAL) throw new IOException("Batch and incremental saving cannot be mixed."); setRetrieval(BATCH); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option."); } setWriteMode(WRITE); // print names file setFileExtension(".names"); PrintWriter outW = new PrintWriter(getWriter()); for (int i = 0; i < instances.attribute(instances.classIndex()).numValues(); i++) { outW.write(instances.attribute(instances.classIndex()).value(i)); if (i < instances.attribute(instances.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < instances.numAttributes(); i++) { if (i != instances.classIndex()) { outW.write(instances.attribute(i).name() + ": "); if (instances.attribute(i).isNumeric() || instances.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = instances.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); // print data file String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException( "Cannot create data file, only names file created (Reason: " + ex.toString() + ")."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); // print data file for (int i = 0; i < instances.numInstances(); i++) { Instance temp = instances.instance(i); for (int j = 0; j < temp.numAttributes(); j++) { if (j != instances.classIndex()) { if (temp.isMissing(j)) { outW.write("?,"); } else if (instances.attribute(j).isNominal() || instances.attribute(j).isString()) { outW.write(instances.attribute(j).value((int) temp.value(j)) + ","); } else { outW.write("" + temp.value(j) + ","); } } } // write the class value if (temp.isMissing(instances.classIndex())) { outW.write("?"); } else { outW.write( instances .attribute(instances.classIndex()) .value((int) temp.value(instances.classIndex()))); } outW.write("\n"); } outW.flush(); outW.close(); setFileExtension(".names"); setWriteMode(WAIT); outW = null; resetWriter(); setWriteMode(CANCEL); }