protected void searchMedian(Instances instances) {
  medians = new double[instances.numAttributes()];
  imputations = new int[instances.numAttributes()];
  for (int j = 0; j < instances.numAttributes(); ++j) {
    int numPresentValues = 0;
    if (instances.attribute(j).isNumeric()) {
      double[] values = new double[instances.numInstances()];
      for (int i = 0; i < instances.numInstances(); ++i) {
        Instance current = instances.get(i);
        if (!Utils.isMissingValue(current.value(j))) {
          values[numPresentValues] = current.value(j);
          numPresentValues += 1;
        }
      }
      if (numPresentValues > 0) {
        double[] goodValues = Arrays.copyOf(values, numPresentValues);
        Median median = new Median();
        medians[j] = median.evaluate(goodValues);
      }
    }
  }
  for (int j = 0; j < instances.numAttributes(); ++j) {
    if (instances.attribute(j).isNumeric()) {
      Conversion.log(
          "OK",
          "Impute Numeric",
          "Attribute " + instances.attribute(j) + " - Median: " + medians[j]);
    }
  }
}
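// A minimal sketch (not from the source) showing how the medians computed above could be
// applied: replace each missing numeric value with the attribute's median and record the
// count in the otherwise-unused `imputations` array. The helper name `imputeMedians` is
// hypothetical; `medians` and `imputations` are the fields filled in searchMedian.
protected void imputeMedians(Instances instances) {
  for (int i = 0; i < instances.numInstances(); ++i) {
    Instance current = instances.get(i);
    for (int j = 0; j < instances.numAttributes(); ++j) {
      if (instances.attribute(j).isNumeric() && Utils.isMissingValue(current.value(j))) {
        current.setValue(j, medians[j]); // fall back to the stored median
        imputations[j] += 1; // track how often each attribute was imputed
      }
    }
  }
}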
public static void run(String[] args) throws Exception {
  // args[0]: train arff path
  // args[1]: test arff path
  DataSource source = new DataSource(args[0]);
  Instances data = source.getDataSet();
  data.setClassIndex(data.numAttributes() - 1);

  NaiveBayes model = new NaiveBayes();
  model.buildClassifier(data);

  // Evaluation:
  Evaluation eval = new Evaluation(data);
  Instances testData = new DataSource(args[1]).getDataSet();
  testData.setClassIndex(testData.numAttributes() - 1);
  eval.evaluateModel(model, testData);
  System.out.println(model.toString());
  System.out.println(eval.toSummaryString("\nResults\n======\n", false));
  System.out.println("======\nConfusion Matrix:");
  double[][] confusionM = eval.confusionMatrix();
  for (int i = 0; i < confusionM.length; ++i) {
    for (int j = 0; j < confusionM[i].length; ++j) {
      System.out.format("%10s ", confusionM[i][j]);
    }
    System.out.print("\n");
  }
}
@Override
protected Instances process(Instances instances) throws Exception {
  Instances result = new Instances(determineOutputFormat(instances), 0);
  Tagger tagger = new Tagger();
  tagger.loadModel("models/model.20120919");

  // reference to the content of the tweet
  Attribute attrCont = instances.attribute("content");

  for (int i = 0; i < instances.numInstances(); i++) {
    // copy over the existing attribute values
    double[] values = new double[result.numAttributes()];
    for (int n = 0; n < instances.numAttributes(); n++) {
      values[n] = instances.instance(i).value(n);
    }
    String content = instances.instance(i).stringValue(attrCont);
    List<String> words = MyUtils.cleanTokenize(content);
    List<String> posTags = MyUtils.getPOStags(words, tagger);

    // calculate frequencies of the different POS tags
    Map<String, Integer> posFreqs = MyUtils.calculateTermFreq(posTags);

    // add the POS values
    for (String posTag : posFreqs.keySet()) {
      int index = result.attribute("POS-" + posTag).index();
      values[index] = posFreqs.get(posTag);
    }
    Instance inst = new SparseInstance(1, values);
    result.add(inst);
  }
  return result;
}
/**
 * Method for building this classifier.
 *
 * @param training the training instances
 * @param test the test instances
 * @throws Exception if something goes wrong
 */
public void buildClassifier(Instances training, Instances test) throws Exception {
  m_ClassifierBuilt = true;
  m_Random = new Random(m_Seed);
  m_Trainset = training;
  m_Testset = test;

  // set class index?
  if ((m_Trainset.classIndex() == -1) || (m_Testset.classIndex() == -1)) {
    m_Trainset.setClassIndex(m_Trainset.numAttributes() - 1);
    m_Testset.setClassIndex(m_Trainset.numAttributes() - 1);
  }

  // are the datasets correct?
  checkData();

  // any other data restrictions not met?
  checkRestrictions();

  // generate the sets
  generateSets();

  // perform the restarts/iterations
  build();

  m_Random = null;
}
/**
 * Signify that this batch of input to the filter is finished. If the filter requires all
 * instances prior to filtering, output() may now be called to retrieve the filtered
 * instances.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }
  if (m_attStats == null) {
    Instances input = getInputFormat();
    m_attStats = new AttributeStats[input.numAttributes()];
    for (int i = 0; i < input.numAttributes(); i++) {
      if (input.attribute(i).isNumeric() && (input.classIndex() != i)) {
        m_attStats[i] = input.attributeStats(i);
      }
    }

    // Convert pending input instances
    for (int i = 0; i < input.numInstances(); i++) {
      convertInstance(input.instance(i));
    }
  }

  // Free memory
  flushInput();
  m_NewBatch = true;
  return (numPendingOutput() != 0);
}
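// A hedged sketch of the convertInstance step invoked above, assuming the filter
// standardizes numeric, non-class attributes to zero mean and unit variance using the
// collected AttributeStats. The actual transformation depends on the concrete filter;
// this body is an assumption, not the original implementation.
private void convertInstance(Instance instance) throws Exception {
  double[] vals = instance.toDoubleArray();
  for (int i = 0; i < instance.numAttributes(); i++) {
    if (m_attStats[i] != null && !instance.isMissing(i)) {
      double mean = m_attStats[i].numericStats.mean;
      double stdDev = m_attStats[i].numericStats.stdDev;
      vals[i] = (stdDev > 0) ? (vals[i] - mean) / stdDev : 0.0; // standardize
    }
  }
  Instance converted = new DenseInstance(instance.weight(), vals);
  push(converted); // queue the converted instance for output
}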
public Instances transformInstances(MultiLabelInstances mlData) throws Exception {
  labelIndices = mlData.getLabelIndices();
  numOfLabels = mlData.getNumLabels();
  Instances data = mlData.getDataSet();

  Instances transformed = new Instances(mlData.getDataSet(), 0);

  // delete all labels
  transformed = RemoveAllLabels.transformInstances(transformed, labelIndices);

  // add the single-label class attribute
  ArrayList<String> classValues = new ArrayList<String>(numOfLabels);
  for (int x = 0; x < numOfLabels; x++) {
    classValues.add("Class" + (x + 1));
  }
  Attribute newClass = new Attribute("Class", classValues);
  transformed.insertAttributeAt(newClass, transformed.numAttributes());
  transformed.setClassIndex(transformed.numAttributes() - 1);

  for (int instanceIndex = 0; instanceIndex < data.numInstances(); instanceIndex++) {
    List<Instance> result = transformInstance(data.instance(instanceIndex));
    for (Instance instance : result) {
      transformed.add(instance);
    }
  }
  return transformed;
}
public static Instances getNewRandomData(Instances data, int rnd) throws Exception {
  Random rd = new Random(rnd);

  // randomly decide, per attribute, whether to keep it (the class attribute is always kept)
  boolean[] resB = new boolean[data.numAttributes()];
  for (int r = 0; r < data.numAttributes(); ++r) {
    resB[r] = rd.nextBoolean();
  }
  int cnt = 0;
  for (int i = 0; i < resB.length - 1; ++i) {
    if (resB[i]) {
      cnt++;
    }
  }

  // collect the indices of the attributes to remove
  int[] removeind = new int[resB.length - 1 - cnt];
  int j = 0;
  for (int i = 0; i < resB.length - 1; ++i) {
    if (!resB[i]) {
      removeind[j++] = i;
    }
  }

  Remove removeFilter = new Remove();
  removeFilter.setAttributeIndicesArray(removeind);
  removeFilter.setInvertSelection(false);
  removeFilter.setInputFormat(data);
  return Filter.useFilter(data, removeFilter);
}
/**
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
  oneAlgorithm oneAlg = new oneAlgorithm();
  oneAlg.category = xCategory.RSandFCBFalg;
  oneAlg.style = xStyle.fuzzySU;
  oneAlg.flag = false;
  oneAlg.alpha = 2.0;
  // String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/Data/wine.arff";
  // String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/Data/wdbc.arff";
  String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/Data/glass.arff";
  // String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/shen/wine-shen.arff";
  // String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/fuzzy/fuzzy-ex.arff";
  // String fn = "C:/Users/Eric/Desktop/2011秋冬/Code/Xreducer/data/derm.arff";
  oneFile onef = new oneFile(new File(fn));

  Instances dataset = new Instances(new FileReader(fn));
  dataset.setClassIndex(dataset.numAttributes() - 1);
  onef.ins = dataset.numInstances();
  onef.att = dataset.numAttributes();
  onef.cla = dataset.numClasses();

  RSandFCBFReduceMethod rs = new RSandFCBFReduceMethod(onef, oneAlg);
  boolean[] B = new boolean[rs.NumAttr];
  boolean[] rq = rs.getOneReduction(B);
  System.out.println(Arrays.toString(Utils.boolean2select(rq)));
}
/**
 * Test with the classifier.
 *
 * @param trainFileName
 * @param testFileName
 */
public static void classify(String trainFileName, String testFileName) {
  try {
    File inputFile = new File(fileName + trainFileName); // training corpus file
    ArffLoader atf = new ArffLoader();
    atf.setFile(inputFile);
    Instances instancesTrain = atf.getDataSet(); // read the training file

    inputFile = new File(fileName + testFileName); // test corpus file
    atf.setFile(inputFile);
    Instances instancesTest = atf.getDataSet(); // read the test file

    // set the class attribute
    instancesTest.setClassIndex(instancesTest.numAttributes() - 1);
    instancesTrain.setClassIndex(instancesTrain.numAttributes() - 1);

    classifier = (Classifier) Class.forName(CLASSIFIERNAME).newInstance();
    classifier.buildClassifier(instancesTrain);

    // first argument: a trained classifier; second: the dataset to evaluate it on
    Evaluation eval = new Evaluation(instancesTrain);
    eval.evaluateModel(classifier, instancesTest);
    System.out.println(eval.toClassDetailsString());
    System.out.println(eval.toSummaryString());
    System.out.println(eval.toMatrixString());
    // 1 - errorRate() is the overall accuracy, not precision
    System.out.println("accuracy is: " + (1 - eval.errorRate()));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Builds a regression model for the given data.
 *
 * @param data the training data to be used for generating the linear regression function
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {
  if (!m_checksTurnedOff) {
    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();
  }

  // Preprocess instances
  if (!m_checksTurnedOff) {
    m_TransformFilter = new NominalToBinary();
    m_TransformFilter.setInputFormat(data);
    data = Filter.useFilter(data, m_TransformFilter);
    m_MissingFilter = new ReplaceMissingValues();
    m_MissingFilter.setInputFormat(data);
    data = Filter.useFilter(data, m_MissingFilter);
    data.deleteWithMissingClass();
  } else {
    m_TransformFilter = null;
    m_MissingFilter = null;
  }

  m_ClassIndex = data.classIndex();
  m_TransformedData = data;

  // Turn all attributes on for a start
  m_SelectedAttributes = new boolean[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    if (i != m_ClassIndex) {
      m_SelectedAttributes[i] = true;
    }
  }
  m_Coefficients = null;

  // Compute means and standard deviations; deselect constant attributes
  m_Means = new double[data.numAttributes()];
  m_StdDevs = new double[data.numAttributes()];
  for (int j = 0; j < data.numAttributes(); j++) {
    if (j != data.classIndex()) {
      m_Means[j] = data.meanOrMode(j);
      m_StdDevs[j] = Math.sqrt(data.variance(j));
      if (m_StdDevs[j] == 0) {
        m_SelectedAttributes[j] = false;
      }
    }
  }
  m_ClassStdDev = Math.sqrt(data.variance(m_TransformedData.classIndex()));
  m_ClassMean = data.meanOrMode(m_TransformedData.classIndex());

  // Perform the regression
  findBestModel();

  // Save memory
  m_TransformedData = new Instances(data, 0);
}
/**
 * Convert an <code>Instance</code> to an array of values that matches the format of the
 * mining schema. First maps raw attribute values and then applies rules for missing
 * values, outliers etc.
 *
 * @param inst the <code>Instance</code> to convert
 * @param miningSchema the mining schema for the incoming instance attributes
 * @return an array of doubles that are values from the incoming Instances, correspond to
 *     the format of the mining schema and have had missing values, outliers etc. dealt with
 * @throws Exception if something goes wrong
 */
public double[] instanceToSchema(Instance inst, MiningSchema miningSchema) throws Exception {
  Instances miningSchemaI = miningSchema.getMiningSchemaAsInstances();

  // allocate enough space for both mining schema fields and any derived fields
  double[] result = new double[miningSchema.getFieldsAsInstances().numAttributes()];

  // Copy over the values
  for (int i = 0; i < miningSchemaI.numAttributes(); i++) {
    result[i] = inst.value(m_fieldsMap[i]);
    if (miningSchemaI.attribute(i).isNominal() || miningSchemaI.attribute(i).isString()) {
      // If not missing, look up the index of this incoming categorical value in
      // the mining schema
      if (!Utils.isMissingValue(inst.value(m_fieldsMap[i]))) {
        int[] valueMap = m_nominalValueMaps[i];
        int index = valueMap[(int) inst.value(m_fieldsMap[i])];
        String incomingAttValue =
            inst.attribute(m_fieldsMap[i]).value((int) inst.value(m_fieldsMap[i]));
        if (index >= 0) {
          result[i] = index;
        } else {
          // set this to "unknown" (-1) for nominal valued attributes
          result[i] = UNKNOWN_NOMINAL_VALUE;
          String warningString =
              "[MappingInfo] WARNING: Can't match nominal value " + incomingAttValue;
          if (m_log != null) {
            m_log.logMessage(warningString);
          } else {
            System.err.println(warningString);
          }
        }
      }
    }
  }

  // Now deal with missing values and outliers...
  miningSchema.applyMissingAndOutlierTreatments(result);

  // now fill in any derived values
  ArrayList<DerivedFieldMetaInfo> derivedFields = miningSchema.getDerivedFields();
  for (int i = 0; i < derivedFields.size(); i++) {
    DerivedFieldMetaInfo temp = derivedFields.get(i);
    double r = temp.getDerivedValue(result);
    result[i + miningSchemaI.numAttributes()] = r;
  }
  return result;
}
/** Sets the class attribute on each dataset and fills the classifier array. */
private void Init() {
  testIns.setClassIndex(testIns.numAttributes() - 1);
  labeledIns.setClassIndex(labeledIns.numAttributes() - 1);
  unlabeledIns.setClassIndex(unlabeledIns.numAttributes() - 1);
  class_Array[0] = classifier1;
  class_Array[1] = classifier2;
  class_Array[2] = classifier3;
}
private RunTrace traceToXML(int file_id, int task_id, int run_id) throws Exception {
  RunTrace trace = new RunTrace(run_id);
  URL traceURL = apiconnector.getOpenmlFileUrl(file_id, "Task_" + task_id + "_trace.arff");
  Instances traceDataset = new Instances(new BufferedReader(Input.getURL(traceURL)));
  List<Integer> parameterIndexes = new ArrayList<Integer>();

  if (traceDataset.attribute("repeat") == null
      || traceDataset.attribute("fold") == null
      || traceDataset.attribute("iteration") == null
      || traceDataset.attribute("evaluation") == null
      || traceDataset.attribute("selected") == null) {
    throw new Exception("trace file missing mandatory attributes. ");
  }

  for (int i = 0; i < traceDataset.numAttributes(); ++i) {
    if (traceDataset.attribute(i).name().startsWith("parameter_")) {
      parameterIndexes.add(i);
    }
  }
  if (parameterIndexes.isEmpty()) {
    throw new Exception(
        "trace file contains no fields with prefix 'parameter_' (i.e., parameters are not registered). ");
  }
  if (traceDataset.numAttributes() > 6 + parameterIndexes.size()) {
    throw new Exception(
        "trace file contains illegal attributes (only allow for repeat, fold, iteration, evaluation, selected, setup_string and parameter_*). ");
  }

  for (int i = 0; i < traceDataset.numInstances(); ++i) {
    Instance current = traceDataset.get(i);
    Integer repeat = (int) current.value(traceDataset.attribute("repeat").index());
    Integer fold = (int) current.value(traceDataset.attribute("fold").index());
    Integer iteration = (int) current.value(traceDataset.attribute("iteration").index());
    Double evaluation = current.value(traceDataset.attribute("evaluation").index());
    Boolean selected =
        current.stringValue(traceDataset.attribute("selected").index()).equals("true");

    Map<String, String> parameters = new HashMap<String, String>();
    for (int j = 0; j < parameterIndexes.size(); ++j) {
      int attIdx = parameterIndexes.get(j);
      if (traceDataset.attribute(attIdx).isNumeric()) {
        parameters.put(traceDataset.attribute(attIdx).name(), current.value(attIdx) + "");
      } else {
        parameters.put(traceDataset.attribute(attIdx).name(), current.stringValue(attIdx));
      }
    }
    String setup_string = new JSONObject(parameters).toString();

    trace.addIteration(
        new RunTrace.Trace_iteration(
            repeat, fold, iteration, setup_string, evaluation, selected));
  }
  return trace;
}
protected void initMinMax(Instances data) {
  m_Min = new double[data.numAttributes()];
  m_Max = new double[data.numAttributes()];
  for (int i = 0; i < data.numAttributes(); i++) {
    m_Min[i] = m_Max[i] = Double.NaN;
  }
  for (int i = 0; i < data.numInstances(); i++) {
    updateMinMax(data.instance(i));
  }
}
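// A plausible companion to initMinMax (an assumption; the original updateMinMax is not
// shown): widen the per-attribute bounds with each instance, seeding them on first
// contact since the arrays start out as NaN.
protected void updateMinMax(Instance instance) {
  for (int i = 0; i < instance.numAttributes(); i++) {
    if (instance.isMissing(i)) continue;
    double value = instance.value(i);
    if (Double.isNaN(m_Min[i])) {
      // first non-missing value for this attribute
      m_Min[i] = m_Max[i] = value;
    } else if (value < m_Min[i]) {
      m_Min[i] = value;
    } else if (value > m_Max[i]) {
      m_Max[i] = value;
    }
  }
}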
/**
 * Determines the output format based on the input format and returns this. In case the
 * output format cannot be returned immediately, i.e., immediateOutputFormat() returns
 * false, then this method will be called from batchFinished().
 *
 * @param inputFormat the input format to base the output format on
 * @return the output format
 * @throws Exception in case the determination goes wrong
 * @see #hasImmediateOutputFormat()
 * @see #batchFinished()
 */
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
  Instances data;
  Instances result;
  FastVector atts;
  FastVector values;
  HashSet hash;
  int i;
  int n;
  boolean isDate;
  Instance inst;
  Vector sorted;

  m_Cols.setUpper(inputFormat.numAttributes() - 1);
  data = new Instances(inputFormat);
  atts = new FastVector();
  for (i = 0; i < data.numAttributes(); i++) {
    if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) {
      atts.addElement(data.attribute(i));
      continue;
    }

    // date attribute?
    isDate = (data.attribute(i).type() == Attribute.DATE);

    // collect all distinct values of the attribute in the dataset
    hash = new HashSet();
    for (n = 0; n < data.numInstances(); n++) {
      inst = data.instance(n);
      if (inst.isMissing(i)) continue;
      if (isDate) hash.add(inst.stringValue(i));
      else hash.add(new Double(inst.value(i)));
    }

    // sort the values
    sorted = new Vector();
    for (Object o : hash) sorted.add(o);
    Collections.sort(sorted);

    // create a nominal attribute from the sorted values
    values = new FastVector();
    for (Object o : sorted) {
      if (isDate) values.addElement(o.toString());
      else values.addElement(Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS));
    }
    atts.addElement(new Attribute(data.attribute(i).name(), values));
  }

  result = new Instances(inputFormat.relationName(), atts, 0);
  result.setClassIndex(inputFormat.classIndex());
  return result;
}
/** Perform the sub task */
public void execute() {
  m_random = new Random(m_rowNumber * 11);
  m_dataGenerator.setSeed(m_rowNumber * 11);
  m_result = new RemoteResult(m_rowNumber, m_panelWidth);
  m_status.setTaskResult(m_result);
  m_status.setExecutionStatus(TaskStatusInfo.PROCESSING);
  try {
    if (m_trainingData == null) {
      throw new Exception("No training data set (BoundaryPanel)");
    }
    if (m_classifier == null) {
      throw new Exception("No classifier set (BoundaryPanel)");
    }
    if (m_dataGenerator == null) {
      throw new Exception("No data generator set (BoundaryPanel)");
    }
    // only touch m_trainingData after the null check above (the original computed this
    // first, risking a NullPointerException before the intended error message)
    m_numOfSamplesPerGenerator =
        (int) Math.pow(m_samplesBase, m_trainingData.numAttributes() - 3);
    if (m_trainingData.attribute(m_xAttribute).isNominal()
        || m_trainingData.attribute(m_yAttribute).isNominal()) {
      throw new Exception(
          "Visualization dimensions must be numeric (RemoteBoundaryVisualizerSubTask)");
    }

    m_attsToWeightOn = new boolean[m_trainingData.numAttributes()];
    m_attsToWeightOn[m_xAttribute] = true;
    m_attsToWeightOn[m_yAttribute] = true;

    // generate samples
    m_weightingAttsValues = new double[m_attsToWeightOn.length];
    m_vals = new double[m_trainingData.numAttributes()];
    m_predInst = new Instance(1.0, m_vals);
    m_predInst.setDataset(m_trainingData);

    System.err.println("Executing row number " + m_rowNumber);
    for (int j = 0; j < m_panelWidth; j++) {
      double[] preds = calculateRegionProbs(j, m_rowNumber);
      m_result.setLocationProbs(j, preds);
      m_result.setPercentCompleted((int) (100 * ((double) j / (double) m_panelWidth)));
    }
  } catch (Exception ex) {
    m_status.setExecutionStatus(TaskStatusInfo.FAILED);
    m_status.setStatusMessage("Row " + m_rowNumber + " failed.");
    System.err.print(ex);
    return;
  }

  // finished
  m_status.setExecutionStatus(TaskStatusInfo.FINISHED);
  m_status.setStatusMessage("Row " + m_rowNumber + " completed successfully.");
}
/**
 * Constructs an instance suitable for passing to the model for scoring.
 *
 * @param incoming the incoming instance
 * @return an instance with values mapped to be consistent with what the model is expecting
 */
protected Instance mapIncomingFieldsToModelFields(Instance incoming) {
  Instances modelHeader = m_model.getHeader();
  double[] vals = new double[modelHeader.numAttributes()];

  for (int i = 0; i < modelHeader.numAttributes(); i++) {
    if (m_attributeMap[i] < 0) {
      // missing or type mismatch
      vals[i] = Utils.missingValue();
      continue;
    }
    Attribute modelAtt = modelHeader.attribute(i);
    Attribute incomingAtt = incoming.dataset().attribute(m_attributeMap[i]);
    if (incoming.isMissing(incomingAtt.index())) {
      vals[i] = Utils.missingValue();
      continue;
    }
    if (modelAtt.isNumeric()) {
      vals[i] = incoming.value(m_attributeMap[i]);
    } else if (modelAtt.isNominal()) {
      String incomingVal = incoming.stringValue(m_attributeMap[i]);
      int modelIndex = modelAtt.indexOfValue(incomingVal);
      if (modelIndex < 0) {
        vals[i] = Utils.missingValue();
      } else {
        vals[i] = modelIndex;
      }
    } else if (modelAtt.isString()) {
      vals[i] = 0;
      modelAtt.setStringValue(incoming.stringValue(m_attributeMap[i]));
    }
  }

  if (modelHeader.classIndex() >= 0) {
    // set the class to a missing value
    vals[modelHeader.classIndex()] = Utils.missingValue();
  }

  Instance newInst;
  if (incoming instanceof SparseInstance) {
    newInst = new SparseInstance(incoming.weight(), vals);
  } else {
    newInst = new DenseInstance(incoming.weight(), vals);
  }
  newInst.setDataset(modelHeader);
  return newInst;
}
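// A plausible construction of the m_attributeMap used above (an assumption, not from the
// source): match each model attribute to the incoming attribute with the same name and a
// compatible type, storing -1 for anything unmatched so it is scored as missing. The
// helper name `buildAttributeMap` is hypothetical.
protected void buildAttributeMap(Instances modelHeader, Instances incomingStructure) {
  m_attributeMap = new int[modelHeader.numAttributes()];
  for (int i = 0; i < modelHeader.numAttributes(); i++) {
    Attribute modelAtt = modelHeader.attribute(i);
    Attribute incomingAtt = incomingStructure.attribute(modelAtt.name());
    if (incomingAtt == null || incomingAtt.type() != modelAtt.type()) {
      m_attributeMap[i] = -1; // missing or type mismatch (handled above)
    } else {
      m_attributeMap[i] = incomingAtt.index();
    }
  }
}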
/**
 * Determines the output format based on the input format and returns this. In case the
 * output format cannot be returned immediately, i.e., hasImmediateOutputFormat() returns
 * false, then this method will be called from batchFinished() after the call of
 * preprocess(Instances), in which, e.g., statistics for the actual processing step can be
 * gathered.
 *
 * @param inputFormat the input format to base the output format on
 * @return the output format
 * @throws Exception in case the determination goes wrong
 */
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
  Instances result;
  FastVector atts;
  int i;
  int numAtts;
  Vector<Integer> indices;
  Vector<Integer> subset;
  Random rand;
  int index;

  // determine the number of attributes to sample (the class attribute is excluded)
  numAtts = inputFormat.numAttributes();
  if (inputFormat.classIndex() > -1) numAtts--;
  if (m_NumAttributes < 1) {
    // interpret m_NumAttributes as a percentage
    numAtts = (int) Math.round((double) numAtts * m_NumAttributes);
  } else {
    // interpret m_NumAttributes as an absolute count
    if (m_NumAttributes < numAtts) numAtts = (int) m_NumAttributes;
  }
  if (getDebug()) System.out.println("# of atts: " + numAtts);

  // determine random indices
  indices = new Vector<Integer>();
  for (i = 0; i < inputFormat.numAttributes(); i++) {
    if (i == inputFormat.classIndex()) continue;
    indices.add(i);
  }
  subset = new Vector<Integer>();
  rand = new Random(m_Seed);
  for (i = 0; i < numAtts; i++) {
    index = rand.nextInt(indices.size());
    subset.add(indices.get(index));
    indices.remove(index);
  }
  Collections.sort(subset);
  if (inputFormat.classIndex() > -1) subset.add(inputFormat.classIndex());
  if (getDebug()) System.out.println("indices: " + subset);

  // generate the output format
  atts = new FastVector();
  m_Indices = new int[subset.size()];
  for (i = 0; i < subset.size(); i++) {
    atts.addElement(inputFormat.attribute(subset.get(i)));
    m_Indices[i] = subset.get(i);
  }
  result = new Instances(inputFormat.relationName(), atts, 0);
  if (inputFormat.classIndex() > -1) result.setClassIndex(result.numAttributes() - 1);

  return result;
}
public KDTreeBufferCPU(
    Instances dataset, Context ctx, int tree_depth, DenseInstanceBuffer buffer) {
  m_instance_data = buffer;
  m_tree_depth = tree_depth;
  m_leaf_node_ids = new int[buffer.rows()];
  // number of internal (branch) nodes in a complete binary tree of the given depth
  m_branch_nodes = (int) (Math.pow(2, tree_depth + 1) - 1) - (int) Math.pow(2, tree_depth);
  m_dataset = dataset;
  m_num_attributes = dataset.numAttributes();
  m_max_temp = new double[m_dataset.numAttributes() * m_branch_nodes];
  m_min_temp = new double[m_dataset.numAttributes() * m_branch_nodes];
  m_node_split_dim = new int[m_branch_nodes];
  m_node_split_value = new double[m_branch_nodes];
}
public static double CA(Instances odata, int[] clusters) {
  double result = 0;
  double[] tmpdclass = odata.attributeToDoubleArray(odata.numAttributes() - 1);
  int[] oclass = new int[odata.numInstances()];
  for (int i = 0; i < tmpdclass.length; ++i) {
    oclass[i] = (int) tmpdclass[i];
  }

  // build the contingency matrix M[class][cluster]
  int[] tmpclass = oclass.clone();
  int[] tmpclusters = clusters.clone();
  Arrays.sort(tmpclusters);
  Arrays.sort(tmpclass);
  int[][] M =
      new int[tmpclass[tmpclass.length - 1] + 1][tmpclusters[tmpclusters.length - 1] + 1];
  for (int i = 0; i < clusters.length; ++i) {
    M[oclass[i]][clusters[i]]++;
  }
  for (int i = 0; i < M.length; ++i) {
    System.out.println(Arrays.toString(M[i]));
  }

  // map each class to its most frequent cluster (a proper arg-max; the original
  // adjacent-pair comparison did not reliably find the maximum)
  for (int i = 0; i < M.length; ++i) {
    int maxindex = 0;
    for (int j = 1; j < M[i].length; ++j) {
      if (M[i][j] > M[i][maxindex]) maxindex = j;
    }
    M[i][0] = maxindex;
  }

  // count the instances whose cluster matches their class's majority cluster
  for (int i = 0; i < oclass.length; ++i) {
    if (M[oclass[i]][0] == clusters[i]) result++;
  }
  return result / (double) odata.numInstances();
}
/**
 * Generate artificial training examples.
 *
 * @param artSize size of examples set to create
 * @param data training data
 * @return the set of unlabeled artificial examples
 */
protected Instances generateArtificialData(int artSize, Instances data) {
  int numAttributes = data.numAttributes();
  Instances artData = new Instances(data, artSize);
  double[] att;
  Instance artInstance;

  for (int i = 0; i < artSize; i++) {
    att = new double[numAttributes];
    for (int j = 0; j < numAttributes; j++) {
      if (data.attribute(j).isNominal()) {
        // Select a nominal value based on its frequency of occurrence in the training data
        double[] stats = (double[]) m_AttributeStats.get(j);
        att[j] = (double) selectIndexProbabilistically(stats);
      } else if (data.attribute(j).isNumeric()) {
        // Generate a numeric value from the Gaussian distribution
        // defined by the mean and std dev of the attribute
        double[] stats = (double[]) m_AttributeStats.get(j);
        att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0];
      } else {
        System.err.println("Decorate can only handle numeric and nominal values.");
      }
    }
    artInstance = new Instance(1.0, att);
    artData.add(artInstance);
  }
  return artData;
}
/**
 * Compute and store statistics required for generating artificial data.
 *
 * @param data training instances
 * @exception Exception if statistics could not be calculated successfully
 */
protected void computeStats(Instances data) throws Exception {
  int numAttributes = data.numAttributes();
  m_AttributeStats = new Vector(numAttributes); // used to map attributes to their stats

  for (int j = 0; j < numAttributes; j++) {
    if (data.attribute(j).isNominal()) {
      // Compute the probability of occurrence of each distinct value
      int[] nomCounts = (data.attributeStats(j)).nominalCounts;
      double[] counts = new double[nomCounts.length];
      if (counts.length < 2) {
        throw new Exception("Nominal attribute has less than two distinct values!");
      }
      // Perform Laplace smoothing
      for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1;
      Utils.normalize(counts);
      double[] stats = new double[counts.length - 1];
      stats[0] = counts[0];
      // Calculate the cumulative probabilities
      for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i];
      m_AttributeStats.add(j, stats);
    } else if (data.attribute(j).isNumeric()) {
      // Get the mean and standard deviation from the training data
      double[] stats = new double[2];
      stats[0] = data.meanOrMode(j);
      stats[1] = Math.sqrt(data.variance(j));
      m_AttributeStats.add(j, stats);
    } else {
      System.err.println("Decorate can only handle numeric and nominal values.");
    }
  }
}
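// For reference, a sketch of the selectIndexProbabilistically helper consumed by
// generateArtificialData above (the body is an assumption based on how computeStats
// stores the cumulative probabilities): draw a uniform random number and return the
// first nominal index whose cumulative probability exceeds it; the omitted final entry
// of the cdf implies the last index.
protected int selectIndexProbabilistically(double[] cdf) {
  double rnd = m_Random.nextDouble();
  int index = 0;
  while (index < cdf.length && rnd > cdf[index]) {
    index++;
  }
  return index;
}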
public void testTypical() {
  Instances result = useFilter();
  // The filter adds five attributes; the number of instances shouldn't change
  assertEquals(m_Instances.numAttributes() + 5, result.numAttributes());
  assertEquals(m_Instances.numInstances(), result.numInstances());
  // Eibe can enhance this to check the binarizing is correct.
}
/** Trains the classifier. */
@Override
public void train() throws Exception {
  if (_train.classIndex() == -1) {
    _train.setClassIndex(_train.numAttributes() - 1);
  }
  _cl.buildClassifier(_train);

  // evaluate the classifier and print some statistics
  evaluate();
}
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance structure (any
 *     instances contained in the object are ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately
 * @throws UnsupportedAttributeTypeException if selected attributes are not numeric or
 *     nominal.
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  if ((instanceInfo.classIndex() > 0) && (!getFillWithMissing())) {
    throw new IllegalArgumentException(
        "TimeSeriesTranslate: Need to fill in missing values "
            + "using appropriate option when class index is set.");
  }
  super.setInputFormat(instanceInfo);

  // Create the output buffer
  Instances outputFormat = new Instances(instanceInfo, 0);
  for (int i = 0; i < instanceInfo.numAttributes(); i++) {
    if (i != instanceInfo.classIndex()) {
      if (m_SelectedCols.isInRange(i)) {
        if (outputFormat.attribute(i).isNominal() || outputFormat.attribute(i).isNumeric()) {
          outputFormat.renameAttribute(
              i,
              outputFormat.attribute(i).name()
                  + (m_InstanceRange < 0 ? '-' : '+')
                  + Math.abs(m_InstanceRange));
        } else {
          throw new UnsupportedAttributeTypeException(
              "Only numeric and nominal attributes may be manipulated in time series.");
        }
      }
    }
  }
  outputFormat.setClassIndex(instanceInfo.classIndex());
  setOutputFormat(outputFormat);
  return true;
}
/**
 * Calculates the centroid pivot of a node based on the list of points that it contains
 * (the two lists of its children are provided).
 *
 * @param list1 The point index list of first child.
 * @param list2 The point index list of second child.
 * @param insts The insts object on which the tree is being built (for header information).
 * @return The centroid pivot of the node.
 */
public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
  int classIdx = m_Instances.classIndex();
  double[] attrVals = new double[insts.numAttributes()];
  Instance temp;

  for (int i = 0; i < list1.length(); i++) {
    temp = insts.instance(((ListNode) list1.get(i)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) continue;
      // accumulate under the attribute index, not the sparse position, so that
      // sparse instances are summed correctly
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0; j < list2.length(); j++) {
    temp = insts.instance(((ListNode) list2.get(j)).idx);
    for (int k = 0; k < temp.numValues(); k++) {
      if (temp.index(k) == classIdx) continue;
      attrVals[temp.index(k)] += temp.valueSparse(k);
    }
  }
  for (int j = 0, numInsts = list1.length() + list2.length(); j < attrVals.length; j++) {
    attrVals[j] /= numInsts;
  }
  return new DenseInstance(1.0, attrVals);
}
public static void wekaAlgorithms(Instances data) throws Exception {
  classifier = new FilteredClassifier(); // wraps the base classifier with the filter
  classifier.setClassifier(new NaiveBayes());
  // classifier.setClassifier(new J48());
  // classifier.setClassifier(new RandomForest());
  // classifier.setClassifier(new ZeroR());
  // classifier.setClassifier(new IBk());
  data.setClassIndex(data.numAttributes() - 1);

  Evaluation eval = new Evaluation(data);
  int folds = 10;
  eval.crossValidateModel(classifier, data, folds, new Random(1));

  System.out.println("===== Evaluating on filtered (training) dataset =====");
  System.out.println(eval.toSummaryString());
  System.out.println(eval.toClassDetailsString());

  double[][] mat = eval.confusionMatrix();
  System.out.println("========= Confusion Matrix =========");
  for (int i = 0; i < mat.length; i++) {
    for (int j = 0; j < mat[i].length; j++) {
      System.out.print(mat[i][j] + " ");
    }
    System.out.println();
  }
}
/**
 * Tests the ThresholdCurve generation from the command line. The classifier is currently
 * hardcoded. Pipe in an arff file.
 *
 * @param args currently ignored
 */
public static void main(String[] args) {
  try {
    Instances inst = new Instances(new java.io.InputStreamReader(System.in));
    // alternative: System.out.println(ThresholdCurve.getNPointPrecision(inst, 11));

    inst.setClassIndex(inst.numAttributes() - 1);
    ThresholdCurve tc = new ThresholdCurve();
    EvaluationUtils eu = new EvaluationUtils();
    Classifier classifier = new weka.classifiers.functions.Logistic();
    FastVector predictions = new FastVector();
    for (int i = 0; i < 2; i++) { // Do two runs.
      eu.setSeed(i);
      predictions.appendElements(eu.getCVPredictions(classifier, inst, 10));
    }
    Instances result = tc.getCurve(predictions);
    System.out.println(result);
  } catch (Exception ex) {
    ex.printStackTrace();
  }
}
/**
 * Returns a vector with the names of the dataset's columns that are listed in "list". If a
 * column cannot be found or the list is empty, the names from the default list are
 * returned instead.
 *
 * @param list comma-separated list of attribute names
 * @param defaultList the default comma-separated list of attribute names
 * @param inst the instances to get the attribute names from
 * @return a vector containing attribute names
 */
protected Vector determineColumnNames(String list, String defaultList, Instances inst) {
  Vector result;
  Vector atts;
  StringTokenizer tok;
  int i;
  String item;

  // get the attribute names
  atts = new Vector();
  for (i = 0; i < inst.numAttributes(); i++) atts.add(inst.attribute(i).name().toLowerCase());

  // process the list
  result = new Vector();
  tok = new StringTokenizer(list, ",");
  while (tok.hasMoreTokens()) {
    item = tok.nextToken().toLowerCase();
    if (atts.contains(item)) {
      result.add(item);
    } else {
      result.clear();
      break;
    }
  }

  // do we have to fall back on the defaults?
  if (result.size() == 0) {
    tok = new StringTokenizer(defaultList, ",");
    while (tok.hasMoreTokens()) result.add(tok.nextToken().toLowerCase());
  }

  return result;
}
/**
 * Calculates the average of every column, ignoring NaN values. The last attribute (the
 * class) is skipped.
 *
 * @param inst the instances to average over
 * @return the per-column averages
 */
public Double[] calculateAverage(Instances inst) {
  Double[] average = new Double[inst.numAttributes() - 1];
  int[] counts = new int[inst.numAttributes() - 1];
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    average[i] = 0.0;
  }
  for (int i = 0; i < inst.numInstances(); i++) {
    Instance ins = inst.instance(i);
    for (int x = 0; x < ins.numAttributes() - 1; x++) {
      if (!Double.isNaN(ins.value(x))) {
        average[x] += ins.value(x);
        counts[x]++;
      }
    }
  }
  for (int i = 0; i < inst.numAttributes() - 1; i++) {
    // divide by the number of non-NaN values actually summed, not by numInstances(),
    // so columns with missing values are not biased towards zero
    average[i] = counts[i] > 0 ? average[i] / counts[i] : Double.NaN;
  }
  return average;
}