/**
 * Computes the clustering accuracy: each class is mapped to the cluster that
 * contains most of its instances, and the fraction of correctly assigned
 * instances is returned.
 *
 * @param odata the original data, with the class as the last attribute
 * @param clusters the cluster assignment for each instance
 * @return the clustering accuracy in [0, 1]
 */
public static double CA(Instances odata, int[] clusters) {
  double result = 0;
  double[] tmpdclass = odata.attributeToDoubleArray(odata.numAttributes() - 1);
  int[] oclass = new int[odata.numInstances()];
  for (int i = 0; i < tmpdclass.length; ++i) {
    oclass[i] = (int) tmpdclass[i];
  }
  // Determine the matrix dimensions from the largest class/cluster labels
  int[] tmpclass = oclass.clone();
  int[] tmpclusters = clusters.clone();
  Arrays.sort(tmpclusters);
  Arrays.sort(tmpclass);
  int[][] M = new int[tmpclass[tmpclass.length - 1] + 1][tmpclusters[tmpclusters.length - 1] + 1];
  for (int i = 0; i < clusters.length; ++i) {
    M[oclass[i]][clusters[i]]++;
  }
  // For each class, find the cluster that covers most of its instances
  // (the original adjacent-pair comparison did not compute the argmax and
  // could never select column 0)
  for (int i = 0; i < M.length; ++i) {
    int maxindex = 0;
    for (int j = 1; j < M[i].length; ++j) {
      if (M[i][j] > M[i][maxindex]) {
        maxindex = j;
      }
    }
    M[i][0] = maxindex; // reuse column 0 to store the best cluster index
  }
  for (int i = 0; i < oclass.length; ++i) {
    if (M[oclass[i]][0] == clusters[i]) {
      result++;
    }
  }
  return result / odata.numInstances();
}
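A possible usage sketch (not from the original source): build a clusterer on a class-free copy of the data, collect the per-instance assignments, and score them with CA(). SimpleKMeans and the cluster count are illustrative assumptions.

// Illustrative sketch: cluster accuracy of SimpleKMeans on data whose last
// attribute is the class (as CA() expects).
public static double clusterAccuracyExample(Instances odata) throws Exception {
  Instances noClass = new Instances(odata);
  noClass.setClassIndex(-1); // unset the class so its attribute can be deleted
  noClass.deleteAttributeAt(noClass.numAttributes() - 1);
  SimpleKMeans km = new SimpleKMeans();
  km.setNumClusters(3); // assumed cluster count
  km.buildClusterer(noClass);
  int[] clusters = new int[noClass.numInstances()];
  for (int i = 0; i < noClass.numInstances(); i++) {
    clusters[i] = km.clusterInstance(noClass.instance(i));
  }
  return CA(odata, clusters);
}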
/**
 * Finds all the instances in the dataset covered/not covered by the rule at
 * the given index; the corresponding simple statistics and predicted class
 * distributions are stored in the given double arrays, which can be obtained
 * by getSimpleStats() and getDistributions().<br>
 *
 * @param index the given index, assuming correct
 * @param insts the dataset to be covered by the rule
 * @param stats the given double array to hold stats, side-effected
 * @param dist the given array to hold class distributions, side-effected; if
 *     null, the distribution is not computed
 * @return the instances covered and not covered by the rule
 */
private Instances[] computeSimpleStats(
    int index, Instances insts, double[] stats, double[] dist) {
  Rule rule = (Rule) m_Ruleset.elementAt(index);
  Instances[] data = new Instances[2];
  data[0] = new Instances(insts, insts.numInstances());
  data[1] = new Instances(insts, insts.numInstances());
  for (int i = 0; i < insts.numInstances(); i++) {
    Instance datum = insts.instance(i);
    double weight = datum.weight();
    if (rule.covers(datum)) {
      data[0].add(datum); // Covered by this rule
      stats[0] += weight; // Coverage
      if ((int) datum.classValue() == (int) rule.getConsequent()) {
        stats[2] += weight; // True positives
      } else {
        stats[4] += weight; // False positives
      }
      if (dist != null) {
        dist[(int) datum.classValue()] += weight;
      }
    } else {
      data[1].add(datum); // Not covered by this rule
      stats[1] += weight;
      if ((int) datum.classValue() != (int) rule.getConsequent()) {
        stats[3] += weight; // True negatives
      } else {
        stats[5] += weight; // False negatives
      }
    }
  }
  return data;
}
/**
 * Select only the instances with weights that contribute to the specified
 * quantile of the weight distribution.
 *
 * @param data the input instances
 * @param quantile the specified quantile, e.g. 0.9 to select 90% of the weight mass
 * @return the selected instances
 */
protected Instances selectWeightQuantile(Instances data, double quantile) {
  int numInstances = data.numInstances();
  Instances trainData = new Instances(data, numInstances);
  double[] weights = new double[numInstances];
  double sumOfWeights = 0;
  for (int i = 0; i < numInstances; i++) {
    weights[i] = data.instance(i).weight();
    sumOfWeights += weights[i];
  }
  double weightMassToSelect = sumOfWeights * quantile;
  int[] sortedIndices = Utils.sort(weights);
  // Select the instances, starting from the heaviest
  sumOfWeights = 0;
  for (int i = numInstances - 1; i >= 0; i--) {
    Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
    trainData.add(instance);
    sumOfWeights += weights[sortedIndices[i]];
    if ((sumOfWeights > weightMassToSelect)
        && (i > 0)
        && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
      break;
    }
  }
  if (m_Debug) {
    System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
  }
  return trainData;
}
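A minimal usage sketch, assuming a weighted training set produced by a boosting iteration; the m_Classifier field is a hypothetical base classifier, not from the original source.

// Illustrative only: keep the most heavily weighted instances carrying
// roughly 90% of the total weight mass before building the base model.
Instances trimmed = selectWeightQuantile(train, 0.9);
m_Classifier.buildClassifier(trimmed); // hypothetical base classifier field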
protected void searchMedian(Instances instances) {
  medians = new double[instances.numAttributes()];
  imputations = new int[instances.numAttributes()];
  for (int j = 0; j < instances.numAttributes(); ++j) {
    int numPresentValues = 0;
    if (instances.attribute(j).isNumeric()) {
      double[] values = new double[instances.numInstances()];
      for (int i = 0; i < instances.numInstances(); ++i) {
        Instance current = instances.get(i);
        if (!Utils.isMissingValue(current.value(j))) {
          values[numPresentValues] = current.value(j);
          numPresentValues += 1;
        }
      }
      if (numPresentValues > 0) {
        double[] goodValues = Arrays.copyOf(values, numPresentValues);
        Median median = new Median();
        medians[j] = median.evaluate(goodValues);
      }
    }
  }
  for (int j = 0; j < instances.numAttributes(); ++j) {
    if (instances.attribute(j).isNumeric()) {
      Conversion.log(
          "OK",
          "Impute Numeric",
          "Attribute " + instances.attribute(j) + " - Median: " + medians[j]);
    }
  }
}
/**
 * Compare two datasets to see if they differ.
 *
 * @param data1 one set of instances
 * @param data2 the other set of instances
 * @throws Exception if the datasets differ
 */
protected void compareDatasets(Instances data1, Instances data2) throws Exception {
  if (m_CheckHeader) {
    if (!data2.equalHeaders(data1)) {
      throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1));
    }
  }
  if (!(data2.numInstances() == data1.numInstances())) {
    throw new Exception("number of instances has changed");
  }
  for (int i = 0; i < data2.numInstances(); i++) {
    Instance orig = data1.instance(i);
    Instance copy = data2.instance(i);
    for (int j = 0; j < orig.numAttributes(); j++) {
      if (orig.isMissing(j)) {
        if (!copy.isMissing(j)) {
          throw new Exception("instances have changed");
        }
      } else if (m_CompareValuesAsString) {
        if (!orig.toString(j).equals(copy.toString(j))) {
          throw new Exception("instances have changed");
        }
      } else {
        if (Math.abs(orig.value(j) - copy.value(j)) > m_MaxDiffValues) {
          throw new Exception("instances have changed");
        }
      }
      if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) {
        throw new Exception("instance weights have changed");
      }
    }
  }
}
/**
 * Splits the given set of instances into subsets.
 *
 * @param data the instances to be split
 * @return the subsets of instances
 * @exception Exception if something goes wrong
 */
public final Instances[] split(Instances data) throws Exception {
  Instances[] instances = new Instances[m_numSubsets];
  for (int j = 0; j < m_numSubsets; j++) {
    instances[j] = new Instances(data, data.numInstances());
  }
  for (int i = 0; i < data.numInstances(); i++) {
    Instance instance = data.instance(i);
    double[] weights = weights(instance);
    int subset = whichSubset(instance);
    if (subset > -1) {
      instances[subset].add(instance);
    } else {
      // Instance is distributed across all subsets, reweighted accordingly
      for (int j = 0; j < m_numSubsets; j++) {
        if (Utils.gr(weights[j], 0)) {
          double newWeight = weights[j] * instance.weight();
          instances[j].add(instance);
          instances[j].lastInstance().setWeight(newWeight);
        }
      }
    }
  }
  for (int j = 0; j < m_numSubsets; j++) {
    instances[j].compactify();
  }
  return instances;
}
/**
 * Builds a bootstrap sample: fills a list with instances drawn at random
 * (with replacement) from the training set.
 *
 * @param train the training instances
 * @return a list with as many randomly drawn instances as the training set
 */
private ArrayList<Instance> rellenarConInstancias(Instances train) {
  Random r = new Random();
  ArrayList<Instance> muestras = new ArrayList<Instance>();
  for (int i = 0; i < train.numInstances(); i++) {
    muestras.add(train.instance(r.nextInt(train.numInstances())));
  }
  return muestras;
}
/**
 * Partition the data into 2 parts, the first of which has (numFolds-1)/numFolds
 * of the data and the second 1/numFolds of the data.
 *
 * @param data the given data
 * @param numFolds the given number of folds
 * @return the partitioned instances
 */
public static final Instances[] partition(Instances data, int numFolds) {
  Instances[] rt = new Instances[2];
  int splits = data.numInstances() * (numFolds - 1) / numFolds;
  rt[0] = new Instances(data, 0, splits);
  rt[1] = new Instances(data, splits, data.numInstances() - splits);
  return rt;
}
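A quick sketch of the resulting split, under the assumption that the caller wants the usual grow/prune division used by rule learners; variable names are illustrative.

// Illustrative only: with numFolds = 3, parts[0] holds 2/3 of the data
// (growing set) and parts[1] the remaining 1/3 (pruning set).
Instances[] parts = partition(data, 3);
Instances grow = parts[0];
Instances prune = parts[1];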
/** Queries the user enough to make a database query to retrieve experiment results. */
protected void setInstancesFromDBaseQuery() {
  try {
    if (m_InstanceQuery == null) {
      m_InstanceQuery = new InstanceQuery();
    }
    String dbaseURL = m_InstanceQuery.getDatabaseURL();
    dbaseURL =
        (String)
            JOptionPane.showInputDialog(
                this,
                "Enter the database URL",
                "Query Database",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                dbaseURL);
    if (dbaseURL == null) {
      m_FromLab.setText("Cancelled");
      return;
    }
    m_InstanceQuery.setDatabaseURL(dbaseURL);
    m_InstanceQuery.connectToDatabase();
    if (!m_InstanceQuery.experimentIndexExists()) {
      m_FromLab.setText("No experiment index");
      return;
    }
    m_FromLab.setText("Getting experiment index");
    Instances index =
        m_InstanceQuery.retrieveInstances("SELECT * FROM " + InstanceQuery.EXP_INDEX_TABLE);
    if (index.numInstances() == 0) {
      m_FromLab.setText("No experiments available");
      return;
    }
    m_FromLab.setText("Got experiment index");
    DefaultListModel lm = new DefaultListModel();
    for (int i = 0; i < index.numInstances(); i++) {
      lm.addElement(index.instance(i).toString());
    }
    JList jl = new JList(lm);
    ListSelectorDialog jd = new ListSelectorDialog(null, jl);
    int result = jd.showDialog();
    if (result != ListSelectorDialog.APPROVE_OPTION) {
      m_FromLab.setText("Cancelled");
      return;
    }
    Instance selInst = index.instance(jl.getSelectedIndex());
    Attribute tableAttr = index.attribute(InstanceQuery.EXP_RESULT_COL);
    String table = InstanceQuery.EXP_RESULT_PREFIX + selInst.toString(tableAttr);
    setInstancesFromDatabaseTable(table);
  } catch (Exception ex) {
    m_FromLab.setText("Problem reading database");
  }
}
/**
 * Transposes the document-term matrix into a term-document matrix.
 *
 * @param data instances with document-term info
 * @return a term-document matrix transposed from the input dataset
 */
private Matrix getTransposedMatrix(Instances data) {
  double[][] temp = new double[data.numAttributes()][data.numInstances()];
  for (int i = 0; i < data.numInstances(); i++) {
    Instance inst = data.instance(i);
    for (int v = 0; v < inst.numValues(); v++) {
      temp[inst.index(v)][i] = inst.valueSparse(v);
    }
  }
  return new Matrix(temp);
}
/**
 * Transposes the data into a term-document matrix in which each document
 * (column) is normalized to sum to one.
 *
 * @param data instances with document-term info
 * @return the transposed, column-normalized matrix
 */
private Matrix getTransposedNormedMatrix(Instances data) {
  Matrix matrix = new Matrix(data.numAttributes(), data.numInstances());
  for (int i = 0; i < data.numInstances(); i++) {
    double[] vals = data.instance(i).toDoubleArray();
    double sum = Utils.sum(vals);
    for (int v = 0; v < vals.length; v++) {
      vals[v] /= sum;
      matrix.set(v, i, vals[v]);
    }
  }
  return matrix;
}
public int calculateAllWrong() {
  if (run_ids.size() < 2) {
    throw new RuntimeException("Too few runs to compare. Should be at least 2. ");
  }
  ArrayList<Attribute> attributes = new ArrayList<Attribute>();
  attributes.add(new Attribute("repeat"));
  attributes.add(new Attribute("fold"));
  attributes.add(new Attribute("rowid"));
  resultSet = new Instances("all-wrong", attributes, task_splits.numInstances());
  for (int i = 0; i < task_splits.numInstances(); ++i) {
    Instance current = task_splits.get(i);
    boolean test = current.stringValue(task_splits.attribute("type")).equals("TEST");
    if (!test) {
      continue;
    }
    Integer row_id = (int) current.value(task_splits.attribute("rowid"));
    Integer repeat = (int) current.value(task_splits.attribute("repeat"));
    Integer fold = (int) current.value(task_splits.attribute("fold"));
    Integer sample = 0;
    try {
      sample = (int) current.value(task_splits.attribute("sample"));
    } catch (Exception e) {
      // Tasks without a sample dimension have no "sample" attribute; keep 0.
    }
    String correctLabel = correct.get(row_id);
    Integer correctPredictions = 0;
    for (Integer run_id : run_ids) {
      if (predictions
          .get(run_id)
          .get(repeat)
          .get(fold)
          .get(sample)
          .get(row_id)
          .equals(correctLabel)) {
        correctPredictions += 1;
      }
    }
    if (correctPredictions == 0) {
      double[] instance = {repeat, fold, row_id};
      resultSet.add(new DenseInstance(1.0, instance));
    }
  }
  return resultSet.size();
}
/** Sets the distribution associated with the model. */
public void resetDistribution(Instances data) throws Exception {
  Instances insts = new Instances(data, data.numInstances());
  for (int i = 0; i < data.numInstances(); i++) {
    if (whichSubset(data.instance(i)) > -1) {
      insts.add(data.instance(i));
    }
  }
  Distribution newD = new Distribution(insts, this);
  newD.addInstWithUnknown(data, m_attIndex);
  m_distribution = newD;
}
/**
 * Turns the list of nearest neighbors into a probability distribution.
 *
 * @param neighbours the list of nearest neighboring instances
 * @param distances the distances of the neighbors
 * @return the probability distribution
 * @throws Exception if computation goes wrong or the data has no class attribute
 */
protected double[] makeDistribution(Instances neighbours, double[] distances) throws Exception {
  double total = 0, weight;
  double[] distribution = new double[m_NumClasses];
  // Set up a correction to the estimator
  if (m_ClassType == Attribute.NOMINAL) {
    for (int i = 0; i < m_NumClasses; i++) {
      distribution[i] = 1.0 / Math.max(1, m_Train.numInstances());
    }
    total = (double) m_NumClasses / Math.max(1, m_Train.numInstances());
  }
  for (int i = 0; i < neighbours.numInstances(); i++) {
    // Collect class counts
    Instance current = neighbours.instance(i);
    distances[i] = distances[i] * distances[i];
    distances[i] = Math.sqrt(distances[i] / m_NumAttributesUsed);
    switch (m_DistanceWeighting) {
      case WEIGHT_INVERSE:
        weight = 1.0 / (distances[i] + 0.001); // to avoid div by zero
        break;
      case WEIGHT_SIMILARITY:
        weight = 1.0 - distances[i];
        break;
      default: // WEIGHT_NONE
        weight = 1.0;
        break;
    }
    weight *= current.weight();
    try {
      switch (m_ClassType) {
        case Attribute.NOMINAL:
          distribution[(int) current.classValue()] += weight;
          break;
        case Attribute.NUMERIC:
          distribution[0] += current.classValue() * weight;
          break;
      }
    } catch (Exception ex) {
      throw new Error("Data has no class attribute!");
    }
    total += weight;
  }
  // Normalise the distribution
  if (total > 0) {
    Utils.normalize(distribution, total);
  }
  return distribution;
}
private static void writePredictionsTrecEval(
    double[] predictions, Instances data, int idIndex, int classIndex, Writer out)
    throws IOException {
  if (predictions.length != data.numInstances()) {
    throw new IllegalStateException(predictions.length + "!=" + data.numInstances());
  }
  for (int i = 0; i < predictions.length; i++) {
    final String id = data.instance(i).stringValue(idIndex);
    final String label = data.attribute(classIndex).value((int) predictions[i]);
    out.write(id);
    out.write(" ");
    out.write(label);
    out.write(" 1.0\n");
  }
}
/**
 * Set cutpoints for a single attribute using MDL.
 *
 * @param index the index of the attribute to set cutpoints for
 * @param data the data to work with
 */
protected void calculateCutPointsByMDL(int index, Instances data) {
  // Sort instances
  data.sort(data.attribute(index));
  // Find the first instance that is missing
  int firstMissing = data.numInstances();
  for (int i = 0; i < data.numInstances(); i++) {
    if (data.instance(i).isMissing(index)) {
      firstMissing = i;
      break;
    }
  }
  m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
}
/**
 * Checks a certain statistic.
 *
 * @param expr the filter expression
 * @param stats the value of the corresponding attribute statistic
 */
protected void checkStatistics(String expr, double stats) {
  m_Filter = getFilter(expr);
  Instances result = useFilter();
  assertEquals(m_Instances.numAttributes(), result.numAttributes());
  assertEquals(m_Instances.numInstances(), result.numInstances());
  // check statistics
  boolean equal = true;
  for (int i = 0; i < result.numInstances(); i++) {
    if (!Utils.eq(stats, result.instance(i).value(m_AttIndex))) {
      equal = false;
      break;
    }
  }
  if (!equal) {
    fail("Filter and Attribute statistics differ ('" + expr + "')!");
  }
}
/**
 * Gets the index of the instance with the closest threshold value to the
 * desired target.
 *
 * @param tcurve a set of instances that have been generated by this class
 * @param threshold the target threshold
 * @return the index of the instance that has threshold closest to the target,
 *     or -1 if this could not be found (i.e. no data, or bad threshold target)
 */
public static int getThresholdInstance(Instances tcurve, double threshold) {
  if (!RELATION_NAME.equals(tcurve.relationName())
      || (tcurve.numInstances() == 0)
      || (threshold < 0)
      || (threshold > 1.0)) {
    return -1;
  }
  if (tcurve.numInstances() == 1) {
    return 0;
  }
  double[] tvals = tcurve.attributeToDoubleArray(tcurve.numAttributes() - 1);
  int[] sorted = Utils.sort(tvals);
  return binarySearch(sorted, tvals, threshold);
}
private static void writePredictedDistributions(
    Classifier c, Instances data, int idIndex, Writer out) throws Exception {
  // header
  out.write("id");
  for (int i = 0; i < data.numClasses(); i++) {
    out.write(",\"");
    out.write(data.classAttribute().value(i).replaceAll("[\"\\\\]", "_"));
    out.write("\"");
  }
  out.write("\n");
  // data
  for (int i = 0; i < data.numInstances(); i++) {
    final String id = data.instance(i).stringValue(idIndex);
    double[] distribution = c.distributionForInstance(data.instance(i));
    out.write(id);
    for (double probability : distribution) {
      out.write(",");
      out.write(String.valueOf(probability > 1e-5 ? (float) probability : 0f));
    }
    out.write("\n");
  }
}
/**
 * Method that finds all large itemsets of the given size for the training
 * instances.
 *
 * @param index the size (number of items) of the itemsets to be found
 * @exception Exception if an attribute is numeric
 */
private void findLargeItemSets(int index) throws Exception {
  FastVector kMinusOneSets, kSets = new FastVector();
  Hashtable hashtable;
  int i = 0;
  // Find large itemsets
  // of length 1
  if (index == 1) {
    kSets = ItemSet.singletons(m_instances);
    ItemSet.upDateCounters(kSets, m_instances);
    kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE);
    if (kSets.size() == 0) {
      return;
    }
    m_Ls.addElement(kSets);
  }
  // of length > 1
  if (index > 1) {
    if (m_Ls.size() > 0) {
      kSets = (FastVector) m_Ls.lastElement();
    }
    m_Ls.removeAllElements();
    i = index - 2;
    kMinusOneSets = kSets;
    kSets = ItemSet.mergeAllItemSets(kMinusOneSets, i, m_instances.numInstances());
    hashtable = ItemSet.getHashtable(kMinusOneSets, kMinusOneSets.size());
    m_hashtables.addElement(hashtable);
    kSets = ItemSet.pruneItemSets(kSets, hashtable);
    ItemSet.upDateCounters(kSets, m_instances);
    kSets = ItemSet.deleteItemSets(kSets, m_premiseCount, Integer.MAX_VALUE);
    if (kSets.size() == 0) {
      return;
    }
    m_Ls.addElement(kSets);
  }
}
/**
 * Calculates the average of every column (the class attribute is excluded).
 *
 * @param inst the instances to average over
 * @return the per-attribute averages
 */
public Double[] calculateAverage(Instances inst) {
  Double[] average = new Double[inst.numAttributes() - 1];
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    average[i] = 0.0;
  }
  for (int i = 0; i < inst.numInstances(); i++) {
    for (int x = 0; x < inst.instance(i).numAttributes() - 1; x++) {
      Instance ins = inst.instance(i);
      if (ins != null && !Double.isNaN(ins.value(x))) {
        average[x] += ins.value(x);
      }
    }
  }
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    // Note: missing (NaN) values are skipped in the sum but still counted
    // in the denominator, so they effectively contribute zero.
    average[i] /= inst.numInstances();
  }
  return average;
}
/**
 * Stratify the given data into the given number of bags based on the class
 * values. It differs from <code>Instances.stratify(int fold)</code> in that
 * before stratification it sorts the instances according to the class order in
 * the header file. It assumes no missing values in the class.
 *
 * @param data the given data
 * @param folds the given number of folds
 * @param rand the random object used to randomize the instances
 * @return the stratified instances
 */
public static final Instances stratify(Instances data, int folds, Random rand) {
  if (!data.classAttribute().isNominal()) {
    return data;
  }
  Instances result = new Instances(data, 0);
  Instances[] bagsByClasses = new Instances[data.numClasses()];
  for (int i = 0; i < bagsByClasses.length; i++) {
    bagsByClasses[i] = new Instances(data, 0);
  }
  // Sort by class
  for (int j = 0; j < data.numInstances(); j++) {
    Instance datum = data.instance(j);
    bagsByClasses[(int) datum.classValue()].add(datum);
  }
  // Randomize each class
  for (int j = 0; j < bagsByClasses.length; j++) {
    bagsByClasses[j].randomize(rand);
  }
  for (int k = 0; k < folds; k++) {
    int offset = k, bag = 0;
    oneFold:
    while (true) {
      while (offset >= bagsByClasses[bag].numInstances()) {
        offset -= bagsByClasses[bag].numInstances();
        if (++bag >= bagsByClasses.length) { // Next bag
          break oneFold;
        }
      }
      result.add(bagsByClasses[bag].instance(offset));
      offset += folds;
    }
  }
  return result;
}
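A minimal sketch of calling the stratifier ahead of a cross-validation run; the fold count and seed are illustrative assumptions.

// Illustrative only: class-sorted stratification into 10 folds.
Instances stratified = stratify(data, 10, new Random(42));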
/**
 * Calculates the area under the precision-recall curve (AUPRC).
 *
 * @param tcurve a previously extracted threshold curve Instances.
 * @return the PRC area, or Double.NaN if you don't pass in a ThresholdCurve generated Instances.
 */
public static double getPRCArea(Instances tcurve) {
  final int n = tcurve.numInstances();
  if (!RELATION_NAME.equals(tcurve.relationName()) || (n == 0)) {
    return Double.NaN;
  }
  final int pInd = tcurve.attribute(PRECISION_NAME).index();
  final int rInd = tcurve.attribute(RECALL_NAME).index();
  final double[] pVals = tcurve.attributeToDoubleArray(pInd);
  final double[] rVals = tcurve.attributeToDoubleArray(rInd);
  double area = 0;
  double xlast = rVals[n - 1];
  // start from the first real p/r pair (not the artificial zero point)
  for (int i = n - 2; i >= 0; i--) {
    double recallDelta = rVals[i] - xlast;
    area += (pVals[i] * recallDelta);
    xlast = rVals[i];
  }
  if (area == 0) {
    return Utils.missingValue();
  }
  return area;
}
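A minimal sketch of feeding this method, assuming Weka's standard Evaluation/ThresholdCurve pipeline (eval.predictions() returns the buffered predictions; the container type differs between older and newer Weka versions).

// Illustrative only: AUPRC for class value 0 from cross-validated predictions.
Evaluation eval = new Evaluation(train);
eval.crossValidateModel(classifier, train, 10, new Random(1));
Instances curve = new ThresholdCurve().getCurve(eval.predictions(), 0);
double auprc = ThresholdCurve.getPRCArea(curve);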
public void testTypical() {
  Instances result = useFilter();
  // The filter adds five attributes; the number of instances shouldn't change
  assertEquals(m_Instances.numAttributes() + 5, result.numAttributes());
  assertEquals(m_Instances.numInstances(), result.numInstances());
  // Eibe can enhance this to check the binarizing is correct.
}
@Override
public void buildClassifier(Instances data) throws Exception {
  trainingData = data;
  Attribute classAttribute = data.classAttribute();
  prototypes = new ArrayList<>();
  classedData = new HashMap<String, ArrayList<Sequence>>();
  indexClassedDataInFullData = new HashMap<String, ArrayList<Integer>>();
  for (int c = 0; c < data.numClasses(); c++) {
    classedData.put(data.classAttribute().value(c), new ArrayList<Sequence>());
    indexClassedDataInFullData.put(data.classAttribute().value(c), new ArrayList<Integer>());
  }
  sequences = new Sequence[data.numInstances()];
  classMap = new String[sequences.length];
  for (int i = 0; i < sequences.length; i++) {
    Instance sample = data.instance(i);
    MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1];
    // skip the class attribute if it comes first
    int shift = (sample.classIndex() == 0) ? 1 : 0;
    for (int t = 0; t < sequence.length; t++) {
      sequence[t] = new MonoDoubleItemSet(sample.value(t + shift));
    }
    sequences[i] = new Sequence(sequence);
    String clas = sample.stringValue(classAttribute);
    classMap[i] = clas;
    classedData.get(clas).add(sequences[i]);
    indexClassedDataInFullData.get(clas).add(i);
  }
  buildSpecificClassifier(data);
}
/**
 * Analyses the decision points according to the context specified by the given
 * cluster decision analyzer. Furthermore, the analyzer is provided with some
 * visualization of the analysis result.
 *
 * @param cda the cluster decision analyzer holding the learning instances and
 *     receiving the result and evaluation visualizations
 */
public void analyse(ClusterDecisionAnalyzer cda) {
  clusterDecisionAnalyzer = cda;
  // create empty data set with attribute information
  Instances data = cda.getDataInfo();
  // in case no single learning instance can be provided (as the decision point
  // is never reached, or decision classes cannot be specified properly)
  // --> do not call the algorithm
  if (data.numInstances() == 0) {
    System.out.println("No learning instances available");
  } else {
    // actually solve the classification problem
    try {
      myClassifier.buildClassifier(data);
      // build up result visualization
      cda.setResultVisualization(createResultVisualization());
      cda.setEvaluationVisualization(createEvaluationVisualization(data));
    } catch (Exception ex) {
      ex.printStackTrace();
      cda.setResultVisualization(
          createMessagePanel("Error while solving the classification problem"));
    }
  }
}
/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
@Override
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }
  if (!m_firstBatchFinished) {
    Instances filtered;
    if (m_numOfCrossValidationFolds < 2) {
      filtered = cleanseTrain(getInputFormat());
    } else {
      filtered = cleanseCross(getInputFormat());
    }
    for (int i = 0; i < filtered.numInstances(); i++) {
      push(filtered.instance(i));
    }
    m_firstBatchFinished = true;
    flushInput();
  }
  m_NewBatch = true;
  return (numPendingOutput() != 0);
}
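For context, a sketch of the standard Weka filter pattern that ends up driving batchFinished(); the filter class name is hypothetical, the Filter.useFilter() call is the real API.

// Illustrative only: setInputFormat() defines the batch, Filter.useFilter()
// pushes all instances through and triggers batchFinished().
SuspectCleanseFilter filter = new SuspectCleanseFilter(); // hypothetical name
filter.setInputFormat(train);
Instances cleansed = Filter.useFilter(train, filter);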
/**
 * Calculates the area under the ROC curve as the Wilcoxon-Mann-Whitney
 * statistic.
 *
 * @param tcurve a previously extracted threshold curve Instances.
 * @return the ROC area, or Double.NaN if you don't pass in a ThresholdCurve generated Instances.
 */
public static double getROCArea(Instances tcurve) {
  final int n = tcurve.numInstances();
  if (!RELATION_NAME.equals(tcurve.relationName()) || (n == 0)) {
    return Double.NaN;
  }
  final int tpInd = tcurve.attribute(TRUE_POS_NAME).index();
  final int fpInd = tcurve.attribute(FALSE_POS_NAME).index();
  final double[] tpVals = tcurve.attributeToDoubleArray(tpInd);
  final double[] fpVals = tcurve.attributeToDoubleArray(fpInd);
  double area = 0.0, cumNeg = 0.0;
  final double totalPos = tpVals[0];
  final double totalNeg = fpVals[0];
  for (int i = 0; i < n; i++) {
    double cip, cin;
    if (i < n - 1) {
      cip = tpVals[i] - tpVals[i + 1];
      cin = fpVals[i] - fpVals[i + 1];
    } else {
      cip = tpVals[n - 1];
      cin = fpVals[n - 1];
    }
    area += cip * (cumNeg + (0.5 * cin));
    cumNeg += cin;
  }
  area /= (totalNeg * totalPos);
  return area;
}
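The same ThresholdCurve output feeds this method as well; a minimal sketch, reusing the Evaluation setup assumed in the PRC example above.

// Illustrative only: AUROC from the same kind of curve instances.
Instances rocCurve = new ThresholdCurve().getCurve(eval.predictions(), 0);
double auroc = ThresholdCurve.getROCArea(rocCurve);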
/**
 * Generates the classifier.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the classifier has not been generated successfully
 */
@Override
public void buildClassifier(Instances data) throws Exception {
  reset();
  // can classifier handle the data?
  getCapabilities().testWithFail(data);
  m_data = new Instances(data, 0);
  data = new Instances(data);
  m_wordsPerClass = new double[data.numClasses()];
  m_probOfClass = new double[data.numClasses()];
  m_probOfWordGivenClass = new HashMap<Integer, LinkedHashMap<String, Count>>();
  double laplace = 1.0;
  for (int i = 0; i < data.numClasses(); i++) {
    LinkedHashMap<String, Count> dict =
        new LinkedHashMap<String, Count>(10000 / data.numClasses());
    m_probOfWordGivenClass.put(i, dict);
    // this needs to be updated for the Laplace correction every time we see
    // a new word (attribute)
    m_probOfClass[i] = laplace;
    m_wordsPerClass[i] = 0;
  }
  for (int i = 0; i < data.numInstances(); i++) {
    updateClassifier(data.instance(i));
  }
}
@Override
protected Instances process(Instances instances) throws Exception {
  Instances result = new Instances(determineOutputFormat(instances), 0);
  Tagger tagger = new Tagger();
  tagger.loadModel("models/model.20120919");
  // reference to the content of the tweet
  Attribute attrCont = instances.attribute("content");
  for (int i = 0; i < instances.numInstances(); i++) {
    double[] values = new double[result.numAttributes()];
    for (int n = 0; n < instances.numAttributes(); n++) {
      values[n] = instances.instance(i).value(n);
    }
    String content = instances.instance(i).stringValue(attrCont);
    List<String> words = MyUtils.cleanTokenize(content);
    List<String> posTags = MyUtils.getPOStags(words, tagger);
    // calculate frequencies of the different POS tags
    Map<String, Integer> posFreqs = MyUtils.calculateTermFreq(posTags);
    // add POS values
    for (String posTag : posFreqs.keySet()) {
      int index = result.attribute("POS-" + posTag).index();
      values[index] = posFreqs.get(posTag);
    }
    result.add(new SparseInstance(1, values));
  }
  return result;
}