/**
 * Select only instances with weights that contribute to the specified quantile
 * of the weight distribution.
 *
 * @param data the input instances
 * @param quantile the specified quantile, e.g. 0.9 to select 90% of the weight mass
 * @return the selected instances
 */
protected Instances selectWeightQuantile(Instances data, double quantile) {
  int numInstances = data.numInstances();
  Instances trainData = new Instances(data, numInstances);
  double[] weights = new double[numInstances];

  double sumOfWeights = 0;
  for (int i = 0; i < numInstances; i++) {
    weights[i] = data.instance(i).weight();
    sumOfWeights += weights[i];
  }
  double weightMassToSelect = sumOfWeights * quantile;
  int[] sortedIndices = Utils.sort(weights);

  // Select the instances, heaviest first, until the requested weight mass is
  // covered (instances tied at the boundary are kept together)
  sumOfWeights = 0;
  for (int i = numInstances - 1; i >= 0; i--) {
    Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
    trainData.add(instance);
    sumOfWeights += weights[sortedIndices[i]];
    if ((sumOfWeights > weightMassToSelect) && (i > 0)
        && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
      break;
    }
  }
  if (m_Debug) {
    System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
  }
  return trainData;
}
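// Usage sketch (illustrative, not from the original source): trimming the
// training data to the instances carrying 90% of the weight mass before
// fitting the next base model, in the style of Weka's boosting code. The
// m_WeightThreshold field (a percentage) and m_Classifier are assumptions.
Instances trainData = data;
if (m_WeightThreshold < 100) {
  trainData = selectWeightQuantile(data, (double) m_WeightThreshold / 100);
}
m_Classifier.buildClassifier(trainData);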
/**
 * Measures the joint error rate of classifiers h1 and h2: the number of
 * instances on which both make the same wrong prediction, divided by the
 * number of instances on which they make the same prediction.
 */
public double measureBothError(Classifier h1, Classifier h2, Instances test) {
  int m = test.numInstances();
  double value1, value2, value;
  int error = 0, total = 0;
  try {
    for (int i = 0; i < m; i++) {
      value = test.instance(i).classValue();
      value1 = h1.classifyInstance(test.instance(i));
      value2 = h2.classifyInstance(test.instance(i));
      // the two classifiers make the same decision
      if (value1 == value2) {
        total++;
        // ... and that shared decision is wrong
        if (value != value1) {
          error++;
        }
      }
    }
  } catch (Exception e) {
    System.out.println(e);
  }
  // joint error rate = (shared wrong decisions) / (shared decisions)
  return (error * 1.0) / total;
}
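// Usage sketch (illustrative): estimating the joint error of two hypotheses on
// a labelled validation set, as tri-training-style algorithms do when deciding
// whether to accept pseudo-labelled data. The data sets are assumptions; J48
// and REPTree are standard Weka classifiers. Note that the measure divides by
// the number of agreements, so it yields NaN if the two classifiers never
// agree on the validation set.
Classifier h1 = new J48();
Classifier h2 = new REPTree();
h1.buildClassifier(labeledTrain);
h2.buildClassifier(labeledTrain);
double jointError = measureBothError(h1, h2, validation);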
/**
 * Compare two datasets to see if they differ.
 *
 * @param data1 one set of instances
 * @param data2 the other set of instances
 * @throws Exception if the datasets differ
 */
protected void compareDatasets(Instances data1, Instances data2) throws Exception {
  if (m_CheckHeader) {
    if (!data2.equalHeaders(data1)) {
      throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1));
    }
  }
  if (!(data2.numInstances() == data1.numInstances())) {
    throw new Exception("number of instances has changed");
  }
  for (int i = 0; i < data2.numInstances(); i++) {
    Instance orig = data1.instance(i);
    Instance copy = data2.instance(i);
    for (int j = 0; j < orig.numAttributes(); j++) {
      if (orig.isMissing(j)) {
        if (!copy.isMissing(j)) {
          throw new Exception("instances have changed");
        }
      } else if (m_CompareValuesAsString) {
        if (!orig.toString(j).equals(copy.toString(j))) {
          throw new Exception("instances have changed");
        }
      } else if (Math.abs(orig.value(j) - copy.value(j)) > m_MaxDiffValues) {
        throw new Exception("instances have changed");
      }
      if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) {
        throw new Exception("instance weights have changed");
      }
    }
  }
}
@Override
protected Instances process(Instances instances) throws Exception {
  Instances result = new Instances(determineOutputFormat(instances), 0);

  Tagger tagger = new Tagger();
  tagger.loadModel("models/model.20120919");

  // reference to the content of the tweet
  Attribute attrCont = instances.attribute("content");

  for (int i = 0; i < instances.numInstances(); i++) {
    // copy the original attribute values
    double[] values = new double[result.numAttributes()];
    for (int n = 0; n < instances.numAttributes(); n++) {
      values[n] = instances.instance(i).value(n);
    }

    String content = instances.instance(i).stringValue(attrCont);
    List<String> words = MyUtils.cleanTokenize(content);
    List<String> posTags = MyUtils.getPOStags(words, tagger);

    // calculate frequencies of the different POS tags
    Map<String, Integer> posFreqs = MyUtils.calculateTermFreq(posTags);

    // add the POS frequencies as attribute values
    for (String posTag : posFreqs.keySet()) {
      int index = result.attribute("POS-" + posTag).index();
      values[index] = posFreqs.get(posTag);
    }

    Instance inst = new SparseInstance(1, values);
    result.add(inst);
  }
  return result;
}
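// Design note (an assumption, not part of the original source): Tagger.loadModel
// reads the POS model from disk, so reloading it on every call to process() is
// wasteful. A minimal sketch that loads the model lazily and reuses it:
private transient Tagger m_tagger;

protected Tagger getTagger() throws IOException {
  if (m_tagger == null) {
    m_tagger = new Tagger();
    m_tagger.loadModel("models/model.20120919");
  }
  return m_tagger;
}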
/**
 * Calculates the centroid pivot of a node based on the list of points that it
 * contains (the two lists of its children are provided).
 *
 * @param list1 The point index list of the first child.
 * @param list2 The point index list of the second child.
 * @param insts The insts object on which the tree is being built (for header information).
 * @return The centroid pivot of the node.
 */
public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
  int classIdx = m_Instances.classIndex();
  double[] attrVals = new double[insts.numAttributes()];

  Instance temp;
  for (int i = 0; i < list1.length(); i++) {
    temp = insts.instance(((ListNode) list1.get(i)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) {
        continue;
      }
      // accumulate under the attribute index, not the sparse position, so
      // sparse instances are handled correctly
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0; j < list2.length(); j++) {
    temp = insts.instance(((ListNode) list2.get(j)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) {
        continue;
      }
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0, numInsts = list1.length() + list2.length(); j < attrVals.length; j++) {
    attrVals[j] /= numInsts;
  }
  temp = new DenseInstance(1.0, attrVals);
  return temp;
}
public static ArrayList<Integer> getProfiles(Instances inst, List<Integer> marks)
    throws Exception {
  // remove instances with a missing value in any attribute
  for (int i = 0; i < inst.numAttributes(); i++) {
    inst.deleteWithMissing(i);
  }

  KDTree tree = new KDTree();
  tree.setMeasurePerformance(true);
  try {
    tree.setInstances(inst);
    EuclideanDistance df = new EuclideanDistance(inst);
    df.setDontNormalize(true);
    df.setAttributeIndices("2-last");
    tree.setDistanceFunction(df);
  } catch (Exception e) {
    e.printStackTrace();
  }

  Instances neighbors = null;
  Instances test = CFilter.createInstance(112121, (ArrayList<Integer>) marks);
  Instance p = test.firstInstance();
  try {
    neighbors = tree.kNearestNeighbours(p, 50);
  } catch (Exception e) {
    e.printStackTrace();
  }

  ArrayList<Integer> profiles = new ArrayList<Integer>();
  for (int i = 0; i < neighbors.numInstances(); i++) {
    System.out.println(neighbors.instance(i));
    profiles.add(Integer.valueOf(neighbors.instance(i).toString(0)));
  }

  // the tree's distance function can also be used to compute the distances to
  // the neighbours it returned, as the KDTree does internally
  DistanceFunction df = tree.getDistanceFunction();

  return profiles;
}
private static void writePredictedDistributions(Classifier c, Instances data, int idIndex, Writer out)
    throws Exception {
  // header
  out.write("id");
  for (int i = 0; i < data.numClasses(); i++) {
    out.write(",\"");
    out.write(data.classAttribute().value(i).replaceAll("[\"\\\\]", "_"));
    out.write("\"");
  }
  out.write("\n");

  // data
  for (int i = 0; i < data.numInstances(); i++) {
    final String id = data.instance(i).stringValue(idIndex);
    double[] distribution = c.distributionForInstance(data.instance(i));
    out.write(id);
    for (double probability : distribution) {
      out.write(",");
      // truncate tiny probabilities to zero to keep the output compact
      out.write(String.valueOf(probability > 1e-5 ? (float) probability : 0f));
    }
    out.write("\n");
  }
}
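// Usage sketch (illustrative; the classifier, data set, and file name are
// assumptions): writing one row of per-class probabilities per test instance,
// with the id attribute at index 0.
try (Writer out = new BufferedWriter(new FileWriter("predictions.csv"))) {
  writePredictedDistributions(classifier, testData, 0, out);
}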
/**
 * Gets the sum of all attribute values over all instances in the dataset.
 *
 * @param data set of instances to handle
 * @return sum of all the attribute values for all the instances in the dataset
 */
private double getTotalSum(Instances data) {
  double sum = 0.0;
  for (int i = 0; i < data.numInstances(); i++) {
    for (int v = 0; v < data.instance(i).numValues(); v++) {
      sum += data.instance(i).valueSparse(v);
    }
  }
  return sum;
}
/**
 * builds the kernel with the given data. Initializes the kernel cache. The
 * actual size of the cache in bytes is (64 * cacheSize).
 *
 * @param data the data to base the kernel on
 * @throws Exception if something goes wrong
 */
public void buildKernel(Instances data) throws Exception {
  // does the kernel handle the data?
  if (!getChecksTurnedOff()) {
    getCapabilities().testWithFail(data);
  }

  initVars(data);

  // precompute the self dot product of every instance
  for (int i = 0; i < data.numInstances(); i++) {
    m_kernelPrecalc[i] = dotProd(data.instance(i), data.instance(i));
  }
}
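// Usage sketch (illustrative; the data set name is an assumption): building a
// kernel over a training set and evaluating it between two training instances.
// RBFKernel is one of the standard Weka kernels whose buildKernel has this
// shape, using the precomputed self dot products above.
RBFKernel kernel = new RBFKernel();
kernel.setGamma(0.01);
kernel.buildKernel(trainingData);
double k01 = kernel.eval(0, 1, trainingData.instance(0));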
/** Queries the user enough to make a database query to retrieve experiment results. */
protected void setInstancesFromDBaseQuery() {
  try {
    if (m_InstanceQuery == null) {
      m_InstanceQuery = new InstanceQuery();
    }
    String dbaseURL = m_InstanceQuery.getDatabaseURL();
    dbaseURL = (String) JOptionPane.showInputDialog(this, "Enter the database URL",
        "Query Database", JOptionPane.PLAIN_MESSAGE, null, null, dbaseURL);
    if (dbaseURL == null) {
      m_FromLab.setText("Cancelled");
      return;
    }
    m_InstanceQuery.setDatabaseURL(dbaseURL);
    m_InstanceQuery.connectToDatabase();
    if (!m_InstanceQuery.experimentIndexExists()) {
      m_FromLab.setText("No experiment index");
      return;
    }

    m_FromLab.setText("Getting experiment index");
    Instances index =
        m_InstanceQuery.retrieveInstances("SELECT * FROM " + InstanceQuery.EXP_INDEX_TABLE);
    if (index.numInstances() == 0) {
      m_FromLab.setText("No experiments available");
      return;
    }

    m_FromLab.setText("Got experiment index");
    DefaultListModel lm = new DefaultListModel();
    for (int i = 0; i < index.numInstances(); i++) {
      lm.addElement(index.instance(i).toString());
    }
    JList jl = new JList(lm);
    ListSelectorDialog jd = new ListSelectorDialog(null, jl);
    int result = jd.showDialog();
    if (result != ListSelectorDialog.APPROVE_OPTION) {
      m_FromLab.setText("Cancelled");
      return;
    }

    Instance selInst = index.instance(jl.getSelectedIndex());
    Attribute tableAttr = index.attribute(InstanceQuery.EXP_RESULT_COL);
    String table = InstanceQuery.EXP_RESULT_PREFIX + selInst.toString(tableAttr);
    setInstancesFromDatabaseTable(table);
  } catch (Exception ex) {
    m_FromLab.setText("Problem reading database");
  }
}
/**
 * Calculate the squared error of a regression model on the training data. Note
 * that, despite the variable name, the value returned is the <i>sum</i> of
 * squared errors, not the mean; divide by the number of instances to obtain
 * the MSE.
 *
 * @param selectedAttributes an array of flags indicating which attributes are
 *          included in the regression model
 * @param coefficients an array of coefficients for the regression model
 * @return the sum of squared errors on the training data
 * @throws Exception if there is a missing class value in the training data
 */
private double calculateSE(boolean[] selectedAttributes, double[] coefficients) throws Exception {
  double mse = 0;
  for (int i = 0; i < m_TransformedData.numInstances(); i++) {
    double prediction =
        regressionPrediction(m_TransformedData.instance(i), selectedAttributes, coefficients);
    double error = prediction - m_TransformedData.instance(i).classValue();
    mse += error * error;
  }
  return mse;
}
public void classify(String filename) throws Exception {
  Instances unlabeledData = DataSource.read(filename);
  unlabeledData.setClassIndex(unlabeledData.numAttributes() - 1);

  Instances labeledData = new Instances(unlabeledData);
  for (int i = 0; i < unlabeledData.numInstances(); ++i) {
    double clsLabel = classifier.classifyInstance(unlabeledData.instance(i));
    labeledData.instance(i).setClassValue(clsLabel);
  }
  System.out.println(labeledData.toString());
}
/** Sets distribution associated with model. */
public void resetDistribution(Instances data) throws Exception {
  Instances insts = new Instances(data, data.numInstances());
  for (int i = 0; i < data.numInstances(); i++) {
    if (whichSubset(data.instance(i)) > -1) {
      insts.add(data.instance(i));
    }
  }
  Distribution newD = new Distribution(insts, this);
  newD.addInstWithUnknown(data, m_attIndex);
  m_distribution = newD;
}
/**
 * processes the instances using the HAAR algorithm
 *
 * @param instances the data to process
 * @return the modified data
 * @throws Exception in case the processing goes wrong
 */
protected Instances processHAAR(Instances instances) throws Exception {
  Instances result;
  int i;
  int n;
  int j;
  int clsIdx;
  double[] oldVal;
  double[] newVal;
  int level;
  int length;
  double[] clsVal;
  Attribute clsAtt;

  // remove the class attribute for the duration of the transform
  clsIdx = instances.classIndex();
  clsVal = null;
  clsAtt = null;
  if (clsIdx > -1) {
    clsVal = instances.attributeToDoubleArray(clsIdx);
    clsAtt = (Attribute) instances.classAttribute().copy();
    instances.setClassIndex(-1);
    instances.deleteAttributeAt(clsIdx);
  }
  result = new Instances(instances, 0);
  level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0));

  for (i = 0; i < instances.numInstances(); i++) {
    oldVal = instances.instance(i).toDoubleArray();
    newVal = new double[oldVal.length];

    for (n = level; n > 0; n--) {
      length = (int) StrictMath.pow(2, n - 1);
      for (j = 0; j < length; j++) {
        newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
        newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
      }
      System.arraycopy(newVal, 0, oldVal, 0, newVal.length);
    }

    // add the transformed instance
    result.add(new DenseInstance(1, newVal));
  }

  // add the class attribute again
  if (clsIdx > -1) {
    result.insertAttributeAt(clsAtt, clsIdx);
    result.setClassIndex(clsIdx);
    for (i = 0; i < clsVal.length; i++) {
      result.instance(i).setClassValue(clsVal[i]);
    }
  }

  return result;
}
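// Worked example (illustrative): each pass maps a pair (a, b) to the scaled
// sum (a + b)/sqrt(2) and difference (a - b)/sqrt(2). For an instance with
// values {9, 7, 3, 5}:
//   pass 1: {16/sqrt(2), 8/sqrt(2), 2/sqrt(2), -2/sqrt(2)}
//   pass 2 (on the first two entries only): {12, 4, 2/sqrt(2), -2/sqrt(2)}
// so the first coefficient is the scaled overall sum and the remaining ones
// encode progressively finer differences.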
// use the learned classifiers to get the conditional probabilities
protected double conMI(Instances D_j, Instances D_k, CNode[][] miNodes, int j, int k)
    throws Exception {

  int L = D_j.classIndex();
  int N = D_j.numInstances();
  double y[] = new double[L];
  double I = 0.0; // conditional mutual information between y_j and y_k
  double p_1, p_2; // p( y_j = 1 | x ), p( y_k = 1 | x )
  double p_12[] = {0.0, 0.0}; // p_12[0] = p( y_j = 1 | y_k = 0, x ), p_12[1] = p( y_j = 1 | y_k = 1, x )

  for (int i = 0; i < N; i++) {

    Arrays.fill(y, 0);
    p_1 = Math.max(miNodes[j][0].distribution((Instance) D_j.instance(i).copy(), y)[1],
        0.000001); // p( y_j = 1 | x )
    p_1 = Math.min(p_1, 0.999999);

    Arrays.fill(y, 0);
    p_2 = Math.max(miNodes[k][0].distribution((Instance) D_k.instance(i).copy(), y)[1],
        0.000001); // p( y_k = 1 | x )
    p_2 = Math.min(p_2, 0.999999);

    Arrays.fill(y, 0);
    p_12[0] = Math.max(miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
        0.000001); // p( y_j = 1 | y_k = 0, x )
    p_12[0] = Math.min(p_12[0], 0.999999);

    Arrays.fill(y, 0);
    Arrays.fill(y, k, k + 1, 1.0);
    p_12[1] = Math.max(miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
        0.000001); // p( y_j = 1 | y_k = 1, x )
    p_12[1] = Math.min(p_12[1], 0.999999);

    I += (1 - p_12[0]) * (1 - p_2) * Math.log((1 - p_12[0]) / (1 - p_1)); // I( y_j = 0 ; y_k = 0 )
    I += (1 - p_12[1]) * (p_2) * Math.log((1 - p_12[1]) / (1 - p_1)); // I( y_j = 0 ; y_k = 1 )
    I += (p_12[0]) * (1 - p_2) * Math.log((p_12[0]) / (p_1)); // I( y_j = 1 ; y_k = 0 )
    I += (p_12[1]) * (p_2) * Math.log((p_12[1]) / (p_1)); // I( y_j = 1 ; y_k = 1 )
  }

  I = I / N;
  return I;
}
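// The quantity accumulated above is an instance-averaged estimate of the
// conditional mutual information
//   I(y_j ; y_k | x) = sum over a,b in {0,1} of
//       p(y_j = a | y_k = b, x) * p(y_k = b | x)
//         * log( p(y_j = a | y_k = b, x) / p(y_j = a | x) ),
// with every probability clipped to [1e-6, 1 - 1e-6] to keep the logs finite.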
/**
 * Builds a pairwise relation matrix over the labelled instances: entry (i, j)
 * is 1 when both instances are labelled and share a class value, -1 when both
 * are labelled but differ, and 0 otherwise. Only the upper triangle (i < j) is
 * filled.
 */
public static double[][] labeled2relation(boolean[] labeled, Instances data) {
  double[][] res = new double[labeled.length][labeled.length];
  for (int i = 0; i < labeled.length; ++i) {
    for (int j = i + 1; j < labeled.length; ++j) {
      if (labeled[i] && labeled[j]) {
        res[i][j] = (data.instance(i).classValue() == data.instance(j).classValue()) ? 1 : -1;
      }
    }
  }
  return res;
}
@Override
public void train(Instances instance) {
  // split the training instances by class value; the original compared the
  // class attribute's *index* (a constant) instead of each instance's class
  // *value*, which would put every instance in the same bucket
  for (int i = 0; i < instance.numInstances(); i++) {
    if (instance.instance(i).classValue() == 0) {
      zeroIns.add(instance.instance(i));
    } else {
      oneIns.add(instance.instance(i));
    }
  }
}
public void testTypical() {
  m_Filter = getFilter("6,3");
  Instances result = useFilter();
  assertEquals(m_Instances.numAttributes() - 1, result.numAttributes());
  for (int i = 0; i < result.numInstances(); i++) {
    Instance orig = m_Instances.instance(i);
    if (orig.isMissing(5) || orig.isMissing(2)) {
      assertTrue("Instance " + (i + 1) + " should have been ?", result.instance(i).isMissing(4));
    } else {
      assertEquals(orig.value(5) - orig.value(2), result.instance(i).value(4), EXPR_DELTA);
    }
  }
}
/**
 * Calculates the radius of a node based on the list of points that it contains
 * (the two lists of its children are provided).
 *
 * @param list1 The point index list of first child.
 * @param list2 The point index list of second child.
 * @param pivot The centre/pivot of the node.
 * @param insts The instances on which the tree is being built (for header info).
 * @return The radius of the node.
 */
public double calcRadius(MyIdxList list1, MyIdxList list2, Instance pivot, Instances insts) {
  double radius = Double.NEGATIVE_INFINITY;

  for (int i = 0; i < list1.length(); i++) {
    double dist = m_DistanceFunction.distance(pivot, insts.instance(((ListNode) list1.get(i)).idx));
    if (dist > radius) {
      radius = dist;
    }
  }
  for (int j = 0; j < list2.length(); j++) {
    double dist = m_DistanceFunction.distance(pivot, insts.instance(((ListNode) list2.get(j)).idx));
    if (dist > radius) {
      radius = dist;
    }
  }
  return radius;
}
@Override
public Instances labelData(String data) throws Exception {
  Instances unlabeled = new Instances(new BufferedReader(new FileReader(data)));
  // set class attribute
  unlabeled.setClassIndex(unlabeled.numAttributes() - 1);

  // create copy
  Instances labeled = new Instances(unlabeled);
  for (int i = 0; i < unlabeled.numInstances(); i++) {
    Instance ui = unlabeled.instance(i);
    double clsLabel = this.classifier.classifyInstance(ui);
    labeled.instance(i).setClassValue(clsLabel);
    System.out.println(ui.toString() + " -> " + unlabeled.classAttribute().value((int) clsLabel));
  }
  return labeled;
}
public static Double runClassify(String trainFile, String testFile) {
  double predictOrder = 0.0;
  double trueOrder = 0.0;
  try {
    Instances train = DataSource.read(trainFile);
    Instances test = DataSource.read(testFile);
    train.setClassIndex(0);
    test.setClassIndex(0);

    // drop unused attributes, highest index first so the remaining indices
    // stay valid after each deletion
    train.deleteAttributeAt(8);
    test.deleteAttributeAt(8);
    train.deleteAttributeAt(6);
    test.deleteAttributeAt(6);
    train.deleteAttributeAt(5);
    test.deleteAttributeAt(5);
    train.deleteAttributeAt(4);
    test.deleteAttributeAt(4);

    // AdditiveRegression classifier = new AdditiveRegression();
    // NaiveBayes classifier = new NaiveBayes();
    RandomForest classifier = new RandomForest();
    // LibSVM classifier = new LibSVM();
    classifier.buildClassifier(train);

    Evaluation eval = new Evaluation(train);
    eval.evaluateModel(classifier, test);
    System.out.println(eval.toSummaryString("\nResults\n\n", true));
    // System.out.println(eval.toClassDetailsString());
    // System.out.println(eval.toMatrixString());

    int k = 892;
    for (int i = 0; i < test.numInstances(); i++) {
      predictOrder = classifier.classifyInstance(test.instance(i));
      trueOrder = test.instance(i).classValue();
      System.out.println((k++) + "," + (int) predictOrder);
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return predictOrder;
}
/**
 * wrap up various variables to save memory and do some housekeeping after
 * optimization has finished.
 *
 * @throws Exception if something goes wrong
 */
protected void wrapUp() throws Exception {
  m_target = null;

  m_nEvals = m_kernel.numEvals();
  m_nCacheHits = m_kernel.numCacheHits();

  if ((m_SVM.getKernel() instanceof PolyKernel)
      && ((PolyKernel) m_SVM.getKernel()).getExponent() == 1.0) {
    // convert the alphas to weights of a linear model
    double[] weights = new double[m_data.numAttributes()];
    for (int k = m_supportVectors.getNext(-1); k != -1; k = m_supportVectors.getNext(k)) {
      for (int j = 0; j < weights.length; j++) {
        if (j != m_classIndex) {
          weights[j] += (m_alpha[k] - m_alphaStar[k]) * m_data.instance(k).value(j);
        }
      }
    }
    m_weights = weights;

    // release memory
    m_alpha = null;
    m_alphaStar = null;
    m_kernel = null;
  }
  m_bModelBuilt = true;
}
/**
 * Compute the value of the objective function.
 *
 * @return the score
 * @throws Exception if something goes wrong
 */
protected double getScore() throws Exception {
  double res = 0;
  double t = 0, t2 = 0;
  double sumAlpha = 0.0;
  for (int i = 0; i < m_nInstances; i++) {
    sumAlpha += (m_alpha[i] - m_alphaStar[i]);
    for (int j = 0; j < m_nInstances; j++) {
      t += (m_alpha[i] - m_alphaStar[i]) * (m_alpha[j] - m_alphaStar[j])
          * m_kernel.eval(i, j, m_data.instance(i));
    }
    // epsilon-insensitive loss term; the original carried commented-out
    // variants of this line for the L1, L2 and Huber loss types
    t2 += m_target[i] * (m_alpha[i] - m_alphaStar[i]) - m_epsilon * (m_alpha[i] + m_alphaStar[i]);
  }
  res += -0.5 * t + t2;
  return res;
}
/**
 * SVMOutput of an instance in the training set, m_data. This uses the cache,
 * unlike SVMOutput(Instance).
 *
 * @param index index of the training instance in m_data
 * @return the SVM output
 * @throws Exception if something goes wrong
 */
protected double SVMOutput(int index) throws Exception {
  double result = -m_b;
  for (int i = m_supportVectors.getNext(-1); i != -1; i = m_supportVectors.getNext(i)) {
    result += (m_alpha[i] - m_alphaStar[i]) * m_kernel.eval(index, i, m_data.instance(index));
  }
  return result;
}
/**
 * Stratify the given data into the given number of bags based on the class
 * values. It differs from <code>Instances.stratify(int fold)</code> in that,
 * before stratification, it sorts the instances according to the class order
 * in the header file. It assumes no missing values in the class.
 *
 * @param data the given data
 * @param folds the given number of folds
 * @param rand the random object used to randomize the instances
 * @return the stratified instances
 */
public static final Instances stratify(Instances data, int folds, Random rand) {
  if (!data.classAttribute().isNominal()) {
    return data;
  }

  Instances result = new Instances(data, 0);
  Instances[] bagsByClasses = new Instances[data.numClasses()];
  for (int i = 0; i < bagsByClasses.length; i++) {
    bagsByClasses[i] = new Instances(data, 0);
  }

  // Sort by class
  for (int j = 0; j < data.numInstances(); j++) {
    Instance datum = data.instance(j);
    bagsByClasses[(int) datum.classValue()].add(datum);
  }

  // Randomize each class
  for (int j = 0; j < bagsByClasses.length; j++) {
    bagsByClasses[j].randomize(rand);
  }

  for (int k = 0; k < folds; k++) {
    int offset = k, bag = 0;
    oneFold:
    while (true) {
      while (offset >= bagsByClasses[bag].numInstances()) {
        offset -= bagsByClasses[bag].numInstances();
        if (++bag >= bagsByClasses.length) { // next bag
          break oneFold;
        }
      }
      result.add(bagsByClasses[bag].instance(offset));
      offset += folds;
    }
  }
  return result;
}
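// Usage sketch (illustrative; the data set name and seed are assumptions):
// stratifying before drawing cross-validation folds so each fold roughly
// preserves the class proportions. trainCV/testCV are Weka's Instances API.
Instances stratified = stratify(data, 10, new Random(42));
for (int f = 0; f < 10; f++) {
  Instances train = stratified.trainCV(10, f);
  Instances test = stratified.testCV(10, f);
  // build and evaluate a classifier on train/test here
}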
/**
 * Find all the instances in the dataset covered/not covered by the rule at the
 * given index; the corresponding simple statistics and predicted class
 * distributions are stored in the given double arrays, which can be obtained
 * by getSimpleStats() and getDistributions().<br>
 *
 * @param index the given index, assumed to be valid
 * @param insts the dataset to be covered by the rule
 * @param stats the given double array to hold stats, side-effected
 * @param dist the given array to hold class distributions, side-effected; if
 *          null, the distribution is not computed
 * @return the instances covered and not covered by the rule
 */
private Instances[] computeSimpleStats(int index, Instances insts, double[] stats, double[] dist) {
  Rule rule = (Rule) m_Ruleset.elementAt(index);

  Instances[] data = new Instances[2];
  data[0] = new Instances(insts, insts.numInstances());
  data[1] = new Instances(insts, insts.numInstances());

  for (int i = 0; i < insts.numInstances(); i++) {
    Instance datum = insts.instance(i);
    double weight = datum.weight();
    if (rule.covers(datum)) {
      data[0].add(datum); // Covered by this rule
      stats[0] += weight; // Coverage
      if ((int) datum.classValue() == (int) rule.getConsequent()) {
        stats[2] += weight; // True positives
      } else {
        stats[4] += weight; // False positives
      }
      if (dist != null) {
        dist[(int) datum.classValue()] += weight;
      }
    } else {
      data[1].add(datum); // Not covered by this rule
      stats[1] += weight;
      if ((int) datum.classValue() != (int) rule.getConsequent()) {
        stats[3] += weight; // True negatives
      } else {
        stats[5] += weight; // False negatives
      }
    }
  }
  return data;
}
/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
@Override
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  if (!m_firstBatchFinished) {
    Instances filtered;
    if (m_numOfCrossValidationFolds < 2) {
      filtered = cleanseTrain(getInputFormat());
    } else {
      filtered = cleanseCross(getInputFormat());
    }

    for (int i = 0; i < filtered.numInstances(); i++) {
      push(filtered.instance(i));
    }

    m_firstBatchFinished = true;
    flushInput();
  }
  m_NewBatch = true;
  return (numPendingOutput() != 0);
}
@Override
public void buildClassifier(Instances data) throws Exception {
  trainingData = data;
  Attribute classAttribute = data.classAttribute();
  prototypes = new ArrayList<>();

  classedData = new HashMap<String, ArrayList<Sequence>>();
  indexClassedDataInFullData = new HashMap<String, ArrayList<Integer>>();
  for (int c = 0; c < data.numClasses(); c++) {
    classedData.put(data.classAttribute().value(c), new ArrayList<Sequence>());
    indexClassedDataInFullData.put(data.classAttribute().value(c), new ArrayList<Integer>());
  }

  sequences = new Sequence[data.numInstances()];
  classMap = new String[sequences.length];
  for (int i = 0; i < sequences.length; i++) {
    Instance sample = data.instance(i);
    MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1];
    // skip the class attribute, whether it comes first or last
    int shift = (sample.classIndex() == 0) ? 1 : 0;
    for (int t = 0; t < sequence.length; t++) {
      sequence[t] = new MonoDoubleItemSet(sample.value(t + shift));
    }
    sequences[i] = new Sequence(sequence);
    String clas = sample.stringValue(classAttribute);
    classMap[i] = clas;
    classedData.get(clas).add(sequences[i]);
    indexClassedDataInFullData.get(clas).add(i);
  }

  buildSpecificClassifier(data);
}
/**
 * Generates the classifier.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
@Override
public void buildClassifier(Instances data) throws Exception {
  reset();

  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  m_data = new Instances(data, 0);
  data = new Instances(data);

  m_wordsPerClass = new double[data.numClasses()];
  m_probOfClass = new double[data.numClasses()];
  m_probOfWordGivenClass = new HashMap<Integer, LinkedHashMap<String, Count>>();

  double laplace = 1.0;
  for (int i = 0; i < data.numClasses(); i++) {
    LinkedHashMap<String, Count> dict =
        new LinkedHashMap<String, Count>(10000 / data.numClasses());
    m_probOfWordGivenClass.put(i, dict);

    // this needs to be updated for the Laplace correction every time we see a
    // new word (attribute)
    m_probOfClass[i] = laplace;
    m_wordsPerClass[i] = 0;
  }

  for (int i = 0; i < data.numInstances(); i++) {
    updateClassifier(data.instance(i));
  }
}
/**
 * Calculates the average of every column (every attribute except the last,
 * which is assumed to be the class).
 *
 * @param inst the instances to average over
 * @return the per-attribute averages
 */
public Double[] calculateAverage(Instances inst) {
  Double[] average = new Double[inst.numAttributes() - 1];
  int[] nonMissing = new int[inst.numAttributes() - 1];
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    average[i] = 0.0;
  }
  for (int i = 0; i < inst.numInstances(); i++) {
    Instance ins = inst.instance(i);
    for (int x = 0; x < ins.numAttributes() - 1; x++) {
      if (ins != null && !Double.isNaN(ins.value(x))) {
        average[x] += ins.value(x);
        nonMissing[x]++;
      }
    }
  }
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    // divide by the number of non-missing values rather than the total
    // instance count, so missing values do not drag the average towards zero
    if (nonMissing[i] > 0) {
      average[i] /= nonMissing[i];
    }
  }
  return average;
}