/** Computes average class values for each attribute and value */ private void computeAverageClassValues() { double totalCounts, sum; Instance instance; double[] counts; double[][] avgClassValues = new double[getInputFormat().numAttributes()][0]; m_Indices = new int[getInputFormat().numAttributes()][0]; for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (att.isNominal()) { avgClassValues[j] = new double[att.numValues()]; counts = new double[att.numValues()]; for (int i = 0; i < getInputFormat().numInstances(); i++) { instance = getInputFormat().instance(i); if (!instance.classIsMissing() && (!instance.isMissing(j))) { counts[(int) instance.value(j)] += instance.weight(); avgClassValues[j][(int) instance.value(j)] += instance.weight() * instance.classValue(); } } sum = Utils.sum(avgClassValues[j]); totalCounts = Utils.sum(counts); if (Utils.gr(totalCounts, 0)) { for (int k = 0; k < att.numValues(); k++) { if (Utils.gr(counts[k], 0)) { avgClassValues[j][k] /= counts[k]; } else { avgClassValues[j][k] = sum / totalCounts; } } } m_Indices[j] = Utils.sort(avgClassValues[j]); } } }
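// A self-contained numeric sketch of the averaging above: each nominal value gets the
// mean class value of the instances carrying it, and values that never occur fall back
// to the overall mean (sum / totalCounts). The counts and class sums are hypothetical.
public class AvgClassValueSketch {
  public static void main(String[] args) {
    double[] counts = {4, 0, 6};           // weight mass observed for each nominal value
    double[] classSums = {8.0, 0.0, 30.0}; // sum of weight * classValue for each value
    double sum = 8.0 + 0.0 + 30.0;         // Utils.sum(avgClassValues[j]) before dividing
    double totalCounts = 4 + 0 + 6;        // Utils.sum(counts)
    double[] avg = new double[counts.length];
    for (int k = 0; k < counts.length; k++) {
      avg[k] = counts[k] > 0 ? classSums[k] / counts[k] : sum / totalCounts;
    }
    // avg = [2.0, 3.8, 5.0]: the unseen middle value receives the global average 38 / 10
    System.out.println(java.util.Arrays.toString(avg));
  }
}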
/** * Set the output format. Converts the selected string attributes of the current input format into nominal attributes and calls * setOutputFormat(Instances) appropriately. */ private void setOutputFormat() { Instances newData; FastVector newAtts, newVals; // Compute new attributes newAtts = new FastVector(getInputFormat().numAttributes()); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (!m_AttIndices.isInRange(j) || !att.isString()) { // We don't have to copy the attribute because the // attribute index remains unchanged. newAtts.addElement(att); } else { // Compute list of attribute values newVals = new FastVector(att.numValues()); for (int i = 0; i < att.numValues(); i++) { newVals.addElement(att.value(i)); } newAtts.addElement(new Attribute(att.name(), newVals)); } } // Construct new header newData = new Instances(getInputFormat().relationName(), newAtts, 0); newData.setClassIndex(getInputFormat().classIndex()); setOutputFormat(newData); }
/** * Adds this tree recursively to the buffer. * * @param id the unique id for the method * @param buffer the buffer to add the source code to * @return the last ID being used * @throws Exception if something goes wrong */ protected int toSource(int id, StringBuffer buffer) throws Exception { int result; int i; int newID; StringBuffer[] subBuffers; buffer.append("\n"); buffer.append(" protected static double node" + id + "(Object[] i) {\n"); // leaf? if (m_Attribute == null) { result = id; if (Double.isNaN(m_ClassValue)) buffer.append(" return Double.NaN;"); else buffer.append(" return " + m_ClassValue + ";"); if (m_ClassAttribute != null) buffer.append(" // " + m_ClassAttribute.value((int) m_ClassValue)); buffer.append("\n"); buffer.append(" }\n"); } else { buffer.append(" // " + m_Attribute.name() + "\n"); // subtree calls subBuffers = new StringBuffer[m_Attribute.numValues()]; newID = id; for (i = 0; i < m_Attribute.numValues(); i++) { newID++; buffer.append(" "); if (i > 0) buffer.append("else "); buffer.append( "if (((String) i[" + m_Attribute.index() + "]).equals(\"" + m_Attribute.value(i) + "\"))\n"); buffer.append(" return node" + newID + "(i);\n"); subBuffers[i] = new StringBuffer(); newID = m_Successors[i].toSource(newID, subBuffers[i]); } buffer.append(" else\n"); buffer.append( " throw new IllegalArgumentException(\"Value '\" + i[" + m_Attribute.index() + "] + \"' is not allowed!\");\n"); buffer.append(" }\n"); // output subtree code for (i = 0; i < m_Attribute.numValues(); i++) { buffer.append(subBuffers[i].toString()); } subBuffers = null; result = newID; } return result; }
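// A sketch of the code the method above emits, assuming a hypothetical one-level tree that
// splits on a two-valued attribute "outlook" stored at input index 0, with leaves predicting
// the class values "yes" (0.0) and "no" (1.0). Wrapped in a class so the fragment compiles.
public class Id3SourceSketch {

  protected static double node0(Object[] i) {
    // outlook
    if (((String) i[0]).equals("sunny"))
      return node1(i);
    else if (((String) i[0]).equals("overcast"))
      return node2(i);
    else
      throw new IllegalArgumentException("Value '" + i[0] + "' is not allowed!");
  }

  protected static double node1(Object[] i) {
    return 0.0; // yes
  }

  protected static double node2(Object[] i) {
    return 1.0; // no
  }

  public static void main(String[] args) {
    System.out.println(node0(new Object[] {"sunny"})); // prints 0.0
  }
}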
/** * Convert a single instance over if the class is nominal. The converted instance is added to the * end of the output queue. * * @param instance the instance to convert */ private void convertInstanceNominal(Instance instance) { if (!m_needToTransform) { push(instance); return; } double[] vals = new double[outputFormatPeek().numAttributes()]; int attSoFar = 0; for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if ((!att.isNominal()) || (j == getInputFormat().classIndex())) { vals[attSoFar] = instance.value(j); attSoFar++; } else { if ((att.numValues() <= 2) && (!m_TransformAll)) { vals[attSoFar] = instance.value(j); attSoFar++; } else { if (instance.isMissing(j)) { for (int k = 0; k < att.numValues(); k++) { vals[attSoFar + k] = instance.value(j); } } else { for (int k = 0; k < att.numValues(); k++) { if (k == (int) instance.value(j)) { vals[attSoFar + k] = 1; } else { vals[attSoFar + k] = 0; } } } attSoFar += att.numValues(); } } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); }
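// A hedged usage sketch, assuming the conversion above belongs to one of Weka's
// NominalToBinary filters (e.g. weka.filters.unsupervised.attribute.NominalToBinary);
// "data.arff" is a hypothetical dataset containing multi-valued nominal attributes.
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.NominalToBinary;

public class NominalToBinarySketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("data.arff");
    data.setClassIndex(data.numAttributes() - 1);
    NominalToBinary ntb = new NominalToBinary();
    ntb.setInputFormat(data); // multi-valued nominal attributes are expanded into indicator attributes
    Instances binary = Filter.useFilter(data, ntb);
    System.out.println(binary.numAttributes() + " attributes after conversion");
  }
}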
/** * Sets up the structure for the plot instances. Sets m_PlotInstances to null if instances are not * saved for visualization. * * @see #getSaveForVisualization() */ protected void determineFormat() { FastVector hv; Attribute predictedClass; Attribute classAt; FastVector attVals; int i; if (!m_SaveForVisualization) { m_PlotInstances = null; return; } hv = new FastVector(); classAt = m_Instances.attribute(m_ClassIndex); if (classAt.isNominal()) { attVals = new FastVector(); for (i = 0; i < classAt.numValues(); i++) attVals.addElement(classAt.value(i)); predictedClass = new Attribute("predicted" + classAt.name(), attVals); } else { predictedClass = new Attribute("predicted" + classAt.name()); } for (i = 0; i < m_Instances.numAttributes(); i++) { if (i == m_Instances.classIndex()) hv.addElement(predictedClass); hv.addElement(m_Instances.attribute(i).copy()); } m_PlotInstances = new Instances(m_Instances.relationName() + "_predicted", hv, m_Instances.numInstances()); m_PlotInstances.setClassIndex(m_ClassIndex + 1); }
/** * Splits a dataset according to the values of a nominal attribute. * * @param data the data which is to be split * @param att the attribute to be used for splitting * @return the sets of instances produced by the split */ private Instances[] splitData(Instances data, Attribute att) { Instances[] splitData = new Instances[att.numValues()]; for (int j = 0; j < att.numValues(); j++) { splitData[j] = new Instances(data, data.numInstances()); } Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); splitData[(int) inst.value(att)].add(inst); } for (int i = 0; i < splitData.length; i++) { splitData[i].compactify(); } return splitData; }
/** * Computes the Laplace accuracy. * * @return the Laplace accuracy */ public double getLaplace() { double temp = matrizContigencia.getB(); int numClasses = classe.numValues(); if (temp != 0) laplace = (matrizContigencia.getH_B() + 1) / (temp + numClasses); else laplace = 0; return laplace; }
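// A minimal, self-contained sketch of the Laplace accuracy computed above, using
// hypothetical counts (40 correctly covered instances out of 50 covered, 2 classes);
// the variables mirror matrizContigencia.getH_B(), matrizContigencia.getB() and classe.numValues().
public class LaplaceSketch {
  public static void main(String[] args) {
    double coveredCorrect = 40; // matrizContigencia.getH_B()
    double covered = 50;        // matrizContigencia.getB()
    int numClasses = 2;         // classe.numValues()
    double laplace = (coveredCorrect + 1) / (covered + numClasses);
    System.out.println("Laplace accuracy = " + laplace); // 41 / 52, roughly 0.788
  }
}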
/** * Determines the output format based on the input format and returns this. * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances result; Attribute att; Attribute attSorted; FastVector atts; FastVector values; Vector<String> sorted; int i; int n; m_AttributeIndices.setUpper(inputFormat.numAttributes() - 1); // determine sorted indices atts = new FastVector(); m_NewOrder = new int[inputFormat.numAttributes()][]; for (i = 0; i < inputFormat.numAttributes(); i++) { att = inputFormat.attribute(i); if (!att.isNominal() || !m_AttributeIndices.isInRange(i)) { m_NewOrder[i] = new int[0]; atts.addElement(inputFormat.attribute(i).copy()); continue; } // sort labels sorted = new Vector<String>(); for (n = 0; n < att.numValues(); n++) sorted.add(att.value(n)); Collections.sort(sorted, m_Comparator); // determine new indices m_NewOrder[i] = new int[att.numValues()]; values = new FastVector(); for (n = 0; n < att.numValues(); n++) { m_NewOrder[i][n] = sorted.indexOf(att.value(n)); values.addElement(sorted.get(n)); } attSorted = new Attribute(att.name(), values); attSorted.setWeight(att.weight()); atts.addElement(attSorted); } // generate new header result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
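// A hedged usage sketch, assuming this method belongs to Weka's SortLabels filter
// (weka.filters.unsupervised.attribute.SortLabels); "unsorted.arff" is a hypothetical
// dataset whose nominal labels are not yet in the desired order.
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.SortLabels;

public class SortLabelsSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("unsorted.arff");
    SortLabels sort = new SortLabels();
    sort.setInputFormat(data); // determineOutputFormat(...) computes the sorted label order
    Instances sorted = Filter.useFilter(data, sort);
    System.out.println(sorted.attribute(0)); // labels of the first attribute now appear sorted
  }
}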
/** * Returns a description of the classifier. * * @return a description of the classifier as a string. */ @Override public String toString() { if (m_Instances == null) { return "Naive Bayes (simple): No model built yet."; } try { StringBuffer text = new StringBuffer("Naive Bayes (simple)"); int attIndex; for (int i = 0; i < m_Instances.numClasses(); i++) { text.append( "\n\nClass " + m_Instances.classAttribute().value(i) + ": P(C) = " + Utils.doubleToString(m_Priors[i], 10, 8) + "\n\n"); Enumeration<Attribute> enumAtts = m_Instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); text.append("Attribute " + attribute.name() + "\n"); if (attribute.isNominal()) { for (int j = 0; j < attribute.numValues(); j++) { text.append(attribute.value(j) + "\t"); } text.append("\n"); for (int j = 0; j < attribute.numValues(); j++) { text.append(Utils.doubleToString(m_Counts[i][attIndex][j], 10, 8) + "\t"); } } else { text.append("Mean: " + Utils.doubleToString(m_Means[i][attIndex], 10, 8) + "\t"); text.append("Standard Deviation: " + Utils.doubleToString(m_Devs[i][attIndex], 10, 8)); } text.append("\n\n"); attIndex++; } } return text.toString(); } catch (Exception e) { return "Can't print Naive Bayes classifier!"; } }
/** * Compute the number of all possible conditions that could appear in a rule of a given data. For * nominal attributes, it's the number of values that could appear; for numeric attributes, it's * the number of values * 2, i.e. <= and >= are counted as different possible conditions. * * @param data the given data * @return number of all conditions of the data */ public static double numAllConditions(Instances data) { double total = 0; Enumeration attEnum = data.enumerateAttributes(); while (attEnum.hasMoreElements()) { Attribute att = (Attribute) attEnum.nextElement(); if (att.isNominal()) total += (double) att.numValues(); else total += 2.0 * (double) data.numDistinctValues(att); } return total; }
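// A self-contained sketch of the counting rule above, for a hypothetical dataset with one
// 3-valued nominal attribute and one numeric attribute taking 10 distinct values.
public class NumConditionsSketch {
  public static void main(String[] args) {
    int nominalValues = 3;          // att.numValues() for the nominal attribute
    int distinctNumericValues = 10; // data.numDistinctValues(att) for the numeric attribute
    double total = nominalValues + 2.0 * distinctNumericValues; // <= and >= counted separately
    System.out.println("possible antecedent conditions: " + total); // 23.0
  }
}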
/** * Gives a string representation of the test in Prolog notation, starting from the comparison * symbol. * * @return a string representing the test in Prolog notation */ private String testPrologComparisonString() { Attribute att = m_Dataset.attribute(m_AttIndex); if (att.isNumeric()) { return ((m_Not ? ">= " : "< ") + Utils.doubleToString(m_Split, 3)); } else { if (att.numValues() != 2) return ((m_Not ? "!= " : "= ") + att.value((int) m_Split)); else return ("= " + (m_Not ? att.value((int) m_Split == 0 ? 1 : 0) : att.value((int) m_Split))); } }
/** * Method for building an Id3 tree. * * @param data the training data * @exception Exception if decision tree can't be built successfully */ private void makeTree(Instances data) throws Exception { // Check if no instances have reached this node. if (data.numInstances() == 0) { m_Attribute = null; m_ClassValue = Utils.missingValue(); m_Distribution = new double[data.numClasses()]; return; } // Compute attribute with maximum information gain. double[] infoGains = new double[data.numAttributes()]; Enumeration attEnum = data.enumerateAttributes(); while (attEnum.hasMoreElements()) { Attribute att = (Attribute) attEnum.nextElement(); infoGains[att.index()] = computeInfoGain(data, att); } m_Attribute = data.attribute(Utils.maxIndex(infoGains)); // Make leaf if information gain is zero. // Otherwise create successors. if (Utils.eq(infoGains[m_Attribute.index()], 0)) { m_Attribute = null; m_Distribution = new double[data.numClasses()]; Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); m_Distribution[(int) inst.classValue()]++; } Utils.normalize(m_Distribution); m_ClassValue = Utils.maxIndex(m_Distribution); m_ClassAttribute = data.classAttribute(); } else { Instances[] splitData = splitData(data, m_Attribute); m_Successors = new Id3[m_Attribute.numValues()]; for (int j = 0; j < m_Attribute.numValues(); j++) { m_Successors[j] = new Id3(); m_Successors[j].makeTree(splitData[j]); } } }
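// A hedged usage sketch, assuming the surrounding class is Weka's Id3 classifier
// (weka.classifiers.trees.Id3) and that "weather.nominal.arff" is an available
// all-nominal dataset; the file name is only an illustration.
import weka.classifiers.trees.Id3;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class Id3TrainSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("weather.nominal.arff");
    data.setClassIndex(data.numAttributes() - 1); // class is the last attribute
    Id3 tree = new Id3();
    tree.buildClassifier(data); // recursively calls makeTree(...) on the training data
    System.out.println(tree);   // prints the induced tree via toString()
  }
}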
/** * Set the output format. Swaps the desired nominal attribute values in the header and calls * setOutputFormat(Instances) appropriately. */ private void setOutputFormat() { Instances newData; ArrayList<Attribute> newAtts; ArrayList<String> newVals; // Compute new attributes newAtts = new ArrayList<Attribute>(getInputFormat().numAttributes()); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (j != m_AttIndex.getIndex()) { newAtts.add((Attribute) att.copy()); } else { // Compute list of attribute values newVals = new ArrayList<String>(att.numValues()); for (int i = 0; i < att.numValues(); i++) { if (i == m_FirstIndex.getIndex()) { newVals.add(att.value(m_SecondIndex.getIndex())); } else if (i == m_SecondIndex.getIndex()) { newVals.add(att.value(m_FirstIndex.getIndex())); } else { newVals.add(att.value(i)); } } Attribute newAtt = new Attribute(att.name(), newVals); newAtt.setWeight(att.weight()); newAtts.add(newAtt); } } // Construct new header newData = new Instances(getInputFormat().relationName(), newAtts, 0); newData.setClassIndex(getInputFormat().classIndex()); setOutputFormat(newData); }
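// A self-contained sketch of the label reordering performed above, using a hypothetical
// nominal attribute with labels {low, medium, high} and swapping the first and last labels
// (the positions that m_FirstIndex and m_SecondIndex would select).
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SwapValuesSketch {
  public static void main(String[] args) {
    List<String> values = Arrays.asList("low", "medium", "high");
    int first = 0, second = 2;
    List<String> swapped = new ArrayList<String>();
    for (int i = 0; i < values.size(); i++) {
      if (i == first) {
        swapped.add(values.get(second));
      } else if (i == second) {
        swapped.add(values.get(first));
      } else {
        swapped.add(values.get(i));
      }
    }
    System.out.println(swapped); // [high, medium, low]
  }
}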
/** * Computes information gain for an attribute. * * @param data the data for which info gain is to be computed * @param att the attribute * @return the information gain for the given attribute and data * @throws Exception if computation fails */ private double computeInfoGain(Instances data, Attribute att) throws Exception { double infoGain = computeEntropy(data); Instances[] splitData = splitData(data, att); for (int j = 0; j < att.numValues(); j++) { if (splitData[j].numInstances() > 0) { infoGain -= ((double) splitData[j].numInstances() / (double) data.numInstances()) * computeEntropy(splitData[j]); } } return infoGain; }
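// A self-contained numeric sketch of the computation above: information gain is the parent
// entropy minus the weighted entropy of the subsets produced by the split. The class counts
// (9 vs. 5 instances, split into three subsets) are hypothetical.
public class InfoGainSketch {
  static double entropy(double... classCounts) {
    double total = 0, e = 0;
    for (double c : classCounts) total += c;
    for (double c : classCounts) {
      if (c > 0) e -= (c / total) * (Math.log(c / total) / Math.log(2));
    }
    return e;
  }

  public static void main(String[] args) {
    double[] parent = {9, 5};                      // class counts before the split
    double[][] subsets = {{2, 3}, {4, 0}, {3, 2}}; // class counts per attribute value
    double parentTotal = 14;
    double gain = entropy(parent);
    for (double[] s : subsets) {
      gain -= ((s[0] + s[1]) / parentTotal) * entropy(s);
    }
    System.out.println("information gain = " + gain); // about 0.247
  }
}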
/** * Outputs a tree at a certain level. * * @param level the level at which the tree is to be printed * @return the tree as string at the given level */ private String toString(int level) { StringBuffer text = new StringBuffer(); if (m_Attribute == null) { if (Utils.isMissingValue(m_ClassValue)) { text.append(": null"); } else { text.append(": " + m_ClassAttribute.value((int) m_ClassValue)); } } else { for (int j = 0; j < m_Attribute.numValues(); j++) { text.append("\n"); for (int i = 0; i < level; i++) { text.append("| "); } text.append(m_Attribute.name() + " = " + m_Attribute.value(j)); text.append(m_Successors[j].toString(level + 1)); } } return text.toString(); }
/** * Used for training data: merges the feature instances with the label instances, sets the last attribute as the class, and records the class attribute values in the given meta object. * * @param instances the feature instances * @param instancesWithMeta the meta object that receives the merged instances and the comma-separated class values * @param labelInstances the instances holding the class labels * @return the merged instances with the class index set to the last attribute * @throws Exception if the instances cannot be merged */ public static Instances addNominalLabelsForClassificationToTrainingData( Instances instances, AttributeFilterMeta instancesWithMeta, Instances labelInstances) throws Exception { Instances finalCleaned = Instances.mergeInstances(instances, labelInstances); finalCleaned.setClassIndex(finalCleaned.numAttributes() - 1); Attribute classAt = finalCleaned.classAttribute(); int numOfAttValues = classAt.numValues(); String attValues = ""; for (int nai = 0; nai < numOfAttValues; nai++) { if (nai != 0) { attValues += ","; } attValues += classAt.value(nai); } instancesWithMeta.setClassAtrributeValues(attValues); instancesWithMeta.setInstances(finalCleaned); return finalCleaned; }
public void setDatasetKeyFromDialog() { ListSelectorDialog jd = new ListSelectorDialog(null, m_DatasetKeyList); // Open the dialog int result = jd.showDialog(); // =============== BEGIN EDIT melville =============== // Check if learning curves should be generated boolean noise = false; boolean learning = false; boolean fraction = false; int learning_key = -1; // If accepted, update the ttester if (result == ListSelectorDialog.APPROVE_OPTION) { int[] selected = m_DatasetKeyList.getSelectedIndices(); String selectedList = ""; Object[] selected_string = m_DatasetKeyList.getSelectedValues(); for (int i = 0; i < selected.length; i++) { if (((String) selected_string[i]).toLowerCase().equals("key_noise_levels")) { learning_key = i; learning = true; // fraction = true; noise = true; } else if (((String) selected_string[i]).toLowerCase().equals("key_fraction_instances")) { learning_key = i; learning = true; fraction = true; } else if (((String) selected_string[i]).toLowerCase().equals("key_total_instances") && !learning) { learning = true; learning_key = i; } else selectedList += "," + (selected[i] + 1); } m_TTester.setLearningCurve(learning); m_TTester.setFraction(fraction); if (learning) { // get points on the learning curve selectedList += "," + (selected[learning_key] + 1); Attribute attr; if (noise) { // override fraction attr = m_Instances.attribute("Key_Noise_levels"); } else if (fraction) { attr = m_Instances.attribute("Key_Fraction_instances"); } else { attr = m_Instances.attribute("Key_Total_instances"); } double[] pts = new double[attr.numValues()]; for (int k = 0; k < attr.numValues(); k++) { pts[k] = Double.parseDouble(attr.value(k)); } Arrays.sort(pts); m_TTester.setPoints(pts); } // ================ END EDIT melville ================ Range generatorRange = new Range(); if (selectedList.length() != 0) { try { generatorRange.setRanges(selectedList); } catch (Exception ex) { ex.printStackTrace(); System.err.println(ex.getMessage()); } } m_TTester.setDatasetKeyColumns(generatorRange); setTTester(); } }
/** * The procedure implementing the SMOTE algorithm. The output instances are pushed onto the output * queue for collection. * * @throws Exception if provided options cannot be executed on input instances */ protected void doSMOTE() throws Exception { int minIndex = 0; int min = Integer.MAX_VALUE; if (m_DetectMinorityClass) { // find minority class int[] classCounts = getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts; for (int i = 0; i < classCounts.length; i++) { if (classCounts[i] != 0 && classCounts[i] < min) { min = classCounts[i]; minIndex = i; } } } else { String classVal = getClassValue(); if (classVal.equalsIgnoreCase("first")) { minIndex = 1; } else if (classVal.equalsIgnoreCase("last")) { minIndex = getInputFormat().numClasses(); } else { minIndex = Integer.parseInt(classVal); } if (minIndex > getInputFormat().numClasses()) { throw new Exception("value index must be <= the number of classes"); } minIndex--; // make it an index } int nearestNeighbors; if (min <= getNearestNeighbors()) { nearestNeighbors = min - 1; } else { nearestNeighbors = getNearestNeighbors(); } if (nearestNeighbors < 1) throw new Exception("Cannot use 0 neighbors!"); // compose minority class dataset // also push all dataset instances Instances sample = getInputFormat().stringFreeStructure(); Enumeration instanceEnum = getInputFormat().enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); push((Instance) instance.copy()); if ((int) instance.classValue() == minIndex) { sample.add(instance); } } // compute Value Distance Metric matrices for nominal features Map vdmMap = new HashMap(); Enumeration attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { if (attr.isNominal() || attr.isString()) { double[][] vdm = new double[attr.numValues()][attr.numValues()]; vdmMap.put(attr, vdm); int[] featureValueCounts = new int[attr.numValues()]; int[][] featureValueCountsByClass = new int[getInputFormat().classAttribute().numValues()][attr.numValues()]; instanceEnum = getInputFormat().enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); int value = (int) instance.value(attr); int classValue = (int) instance.classValue(); featureValueCounts[value]++; featureValueCountsByClass[classValue][value]++; } for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) { for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) { double sum = 0; for (int classValueIndex = 0; classValueIndex < getInputFormat().numClasses(); classValueIndex++) { double c1i = featureValueCountsByClass[classValueIndex][valueIndex1]; double c2i = featureValueCountsByClass[classValueIndex][valueIndex2]; double c1 = featureValueCounts[valueIndex1]; double c2 = featureValueCounts[valueIndex2]; double term1 = c1i / c1; double term2 = c2i / c2; sum += Math.abs(term1 - term2); } vdm[valueIndex1][valueIndex2] = sum; } } } } } // use this random source for all required randomness Random rand = new Random(getRandomSeed()); // find the set of extra indices to use if the percentage is not evenly // divisible by 100 List extraIndices = new LinkedList(); double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0); int extraIndicesCount = (int) (percentageRemainder * sample.numInstances()); if (extraIndicesCount 
>= 1) { for (int i = 0; i < sample.numInstances(); i++) { extraIndices.add(i); } } Collections.shuffle(extraIndices, rand); extraIndices = extraIndices.subList(0, extraIndicesCount); Set extraIndexSet = new HashSet(extraIndices); // the main loop to handle computing nearest neighbors and generating SMOTE // examples from each instance in the original minority class data Instance[] nnArray = new Instance[nearestNeighbors]; for (int i = 0; i < sample.numInstances(); i++) { Instance instanceI = sample.instance(i); // find k nearest neighbors for each instance List distanceToInstance = new LinkedList(); for (int j = 0; j < sample.numInstances(); j++) { Instance instanceJ = sample.instance(j); if (i != j) { double distance = 0; attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { double iVal = instanceI.value(attr); double jVal = instanceJ.value(attr); if (attr.isNumeric()) { distance += Math.pow(iVal - jVal, 2); } else { distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal]; } } } distance = Math.pow(distance, .5); distanceToInstance.add(new Object[] {distance, instanceJ}); } } // sort the neighbors according to distance Collections.sort( distanceToInstance, new Comparator() { public int compare(Object o1, Object o2) { double distance1 = (Double) ((Object[]) o1)[0]; double distance2 = (Double) ((Object[]) o2)[0]; return Double.compare(distance1, distance2); } }); // populate the actual nearest neighbor instance array Iterator entryIterator = distanceToInstance.iterator(); int j = 0; while (entryIterator.hasNext() && j < nearestNeighbors) { nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1]; j++; } // create synthetic examples int n = (int) Math.floor(getPercentage() / 100); while (n > 0 || extraIndexSet.remove(i)) { double[] values = new double[sample.numAttributes()]; int nn = rand.nextInt(nearestNeighbors); attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { if (attr.isNumeric()) { double dif = nnArray[nn].value(attr) - instanceI.value(attr); double gap = rand.nextDouble(); values[attr.index()] = (instanceI.value(attr) + gap * dif); } else if (attr.isDate()) { double dif = nnArray[nn].value(attr) - instanceI.value(attr); double gap = rand.nextDouble(); values[attr.index()] = (long) (instanceI.value(attr) + gap * dif); } else { int[] valueCounts = new int[attr.numValues()]; int iVal = (int) instanceI.value(attr); valueCounts[iVal]++; for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) { int val = (int) nnArray[nnEx].value(attr); valueCounts[val]++; } int maxIndex = 0; int max = Integer.MIN_VALUE; for (int index = 0; index < attr.numValues(); index++) { if (valueCounts[index] > max) { max = valueCounts[index]; maxIndex = index; } } values[attr.index()] = maxIndex; } } } values[sample.classIndex()] = minIndex; Instance synthetic = new Instance(1.0, values); push(synthetic); n--; } } }
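// A hedged usage sketch, assuming the method above belongs to the SMOTE filter package
// (weka.filters.supervised.instance.SMOTE) and that "train.arff" is a hypothetical
// imbalanced two-class dataset; the setter names mirror the getters used above.
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.supervised.instance.SMOTE;

public class SmoteSketch {
  public static void main(String[] args) throws Exception {
    Instances train = DataSource.read("train.arff");
    train.setClassIndex(train.numAttributes() - 1);
    SMOTE smote = new SMOTE();
    smote.setPercentage(100.0);   // create 100% additional synthetic minority instances
    smote.setNearestNeighbors(5); // k used when picking a random neighbour for interpolation
    smote.setInputFormat(train);  // doSMOTE() runs when the batch is processed
    Instances balanced = Filter.useFilter(train, smote);
    System.out.println(balanced.numInstances() + " instances after SMOTE");
  }
}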
/** Set the output format if the class is numeric. */ private void setOutputFormatNumeric() { if (m_Indices == null) { setOutputFormat(null); return; } ArrayList<Attribute> newAtts; int newClassIndex; StringBuffer attributeName; Instances outputFormat; ArrayList<String> vals; // Compute new attributes m_needToTransform = false; for (int i = 0; i < getInputFormat().numAttributes(); i++) { Attribute att = getInputFormat().attribute(i); if (att.isNominal() && (att.numValues() > 2 || m_Numeric || m_TransformAll)) { m_needToTransform = true; break; } } if (!m_needToTransform) { setOutputFormat(getInputFormat()); return; } newClassIndex = getInputFormat().classIndex(); newAtts = new ArrayList<Attribute>(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if ((!att.isNominal()) || (j == getInputFormat().classIndex())) { newAtts.add((Attribute) att.copy()); } else { if (j < getInputFormat().classIndex()) { newClassIndex += att.numValues() - 2; } // Compute values for new attributes for (int k = 1; k < att.numValues(); k++) { attributeName = new StringBuffer(att.name() + "="); for (int l = k; l < att.numValues(); l++) { if (l > k) { attributeName.append(','); } attributeName.append(att.value(m_Indices[j][l])); } if (m_Numeric) { newAtts.add(new Attribute(attributeName.toString())); } else { vals = new ArrayList<String>(2); vals.add("f"); vals.add("t"); newAtts.add(new Attribute(attributeName.toString(), vals)); } } } } outputFormat = new Instances(getInputFormat().relationName(), newAtts, 0); outputFormat.setClassIndex(newClassIndex); setOutputFormat(outputFormat); }
/** * Generates the classifier. * * @param instances set of instances serving as training data * @exception Exception if the classifier has not been generated successfully */ @Override public void buildClassifier(Instances instances) throws Exception { int attIndex = 0; double sum; // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_Instances = new Instances(instances, 0); // Reserve space m_Counts = new double[instances.numClasses()][instances.numAttributes() - 1][0]; m_Means = new double[instances.numClasses()][instances.numAttributes() - 1]; m_Devs = new double[instances.numClasses()][instances.numAttributes() - 1]; m_Priors = new double[instances.numClasses()]; Enumeration<Attribute> enu = instances.enumerateAttributes(); while (enu.hasMoreElements()) { Attribute attribute = enu.nextElement(); if (attribute.isNominal()) { for (int j = 0; j < instances.numClasses(); j++) { m_Counts[j][attIndex] = new double[attribute.numValues()]; } } else { for (int j = 0; j < instances.numClasses(); j++) { m_Counts[j][attIndex] = new double[1]; } } attIndex++; } // Compute counts and sums Enumeration<Instance> enumInsts = instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { Instance instance = enumInsts.nextElement(); if (!instance.classIsMissing()) { Enumeration<Attribute> enumAtts = instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (!instance.isMissing(attribute)) { if (attribute.isNominal()) { m_Counts[(int) instance.classValue()][attIndex][(int) instance.value(attribute)]++; } else { m_Means[(int) instance.classValue()][attIndex] += instance.value(attribute); m_Counts[(int) instance.classValue()][attIndex][0]++; } } attIndex++; } m_Priors[(int) instance.classValue()]++; } } // Compute means Enumeration<Attribute> enumAtts = instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (attribute.isNumeric()) { for (int j = 0; j < instances.numClasses(); j++) { if (m_Counts[j][attIndex][0] < 2) { throw new Exception( "attribute " + attribute.name() + ": less than two values for class " + instances.classAttribute().value(j)); } m_Means[j][attIndex] /= m_Counts[j][attIndex][0]; } } attIndex++; } // Compute standard deviations enumInsts = instances.enumerateInstances(); while (enumInsts.hasMoreElements()) { Instance instance = enumInsts.nextElement(); if (!instance.classIsMissing()) { enumAtts = instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (!instance.isMissing(attribute)) { if (attribute.isNumeric()) { m_Devs[(int) instance.classValue()][attIndex] += (m_Means[(int) instance.classValue()][attIndex] - instance.value(attribute)) * (m_Means[(int) instance.classValue()][attIndex] - instance.value(attribute)); } } attIndex++; } } } enumAtts = instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (attribute.isNumeric()) { for (int j = 0; j < instances.numClasses(); j++) { if (m_Devs[j][attIndex] <= 0) { throw new Exception( "attribute " + attribute.name() + ": standard deviation is 0 for class " + instances.classAttribute().value(j)); } else { m_Devs[j][attIndex] /= m_Counts[j][attIndex][0] - 1; m_Devs[j][attIndex] = 
Math.sqrt(m_Devs[j][attIndex]); } } } attIndex++; } // Normalize counts enumAtts = instances.enumerateAttributes(); attIndex = 0; while (enumAtts.hasMoreElements()) { Attribute attribute = enumAtts.nextElement(); if (attribute.isNominal()) { for (int j = 0; j < instances.numClasses(); j++) { sum = Utils.sum(m_Counts[j][attIndex]); for (int i = 0; i < attribute.numValues(); i++) { m_Counts[j][attIndex][i] = (m_Counts[j][attIndex][i] + 1) / (sum + attribute.numValues()); } } } attIndex++; } // Normalize priors sum = Utils.sum(m_Priors); for (int j = 0; j < instances.numClasses(); j++) { m_Priors[j] = (m_Priors[j] + 1) / (sum + instances.numClasses()); } }
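// A self-contained sketch of the Laplace ("add one") smoothing applied in the
// "Normalize counts" step above, for one nominal attribute with three values inside a
// single class; the raw counts are hypothetical.
public class LaplaceCountSketch {
  public static void main(String[] args) {
    double[] counts = {3, 0, 7}; // m_Counts[classIndex][attIndex] before normalizing
    double sum = 3 + 0 + 7;      // Utils.sum(m_Counts[classIndex][attIndex])
    int numValues = counts.length;
    for (int i = 0; i < numValues; i++) {
      double p = (counts[i] + 1) / (sum + numValues);
      System.out.println("P(value " + i + " | class) = " + p);
    }
    // yields 4/13, 1/13 and 8/13: the probabilities sum to 1 and no value gets zero mass
  }
}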
public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log) throws Exception { m_log = log; // miningSchema.convertStringAttsToNominal(); Instances fieldsI = miningSchema.getMiningSchemaAsInstances(); m_fieldsMap = new int[fieldsI.numAttributes()]; m_nominalValueMaps = new int[fieldsI.numAttributes()][]; for (int i = 0; i < fieldsI.numAttributes(); i++) { String schemaAttName = fieldsI.attribute(i).name(); boolean found = false; for (int j = 0; j < dataSet.numAttributes(); j++) { if (dataSet.attribute(j).name().equals(schemaAttName)) { Attribute miningSchemaAtt = fieldsI.attribute(i); Attribute incomingAtt = dataSet.attribute(j); // check type match if (miningSchemaAtt.type() != incomingAtt.type()) { throw new Exception( "[MappingInfo] type mismatch for field " + schemaAttName + ". Mining schema type " + miningSchemaAtt.toString() + ". Incoming type " + incomingAtt.toString() + "."); } // check nominal values (number, names...) if (miningSchemaAtt.numValues() != incomingAtt.numValues()) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " does not have the same " + "number of values as the corresponding mining " + "schema attribute."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } } if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) { int[] valuesMap = new int[incomingAtt.numValues()]; for (int k = 0; k < incomingAtt.numValues(); k++) { String incomingNomVal = incomingAtt.value(k); int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal); if (indexInSchema < 0) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " has value " + incomingNomVal + " that doesn't occur in the mining schema."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } valuesMap[k] = UNKNOWN_NOMINAL_VALUE; } else { valuesMap[k] = indexInSchema; } } m_nominalValueMaps[i] = valuesMap; } /*if (miningSchemaAtt.isNominal()) { for (int k = 0; k < miningSchemaAtt.numValues(); k++) { if (!miningSchemaAtt.value(k).equals(incomingAtt.value(k))) { throw new Exception("[PMMLUtils] value " + k + " (" + miningSchemaAtt.value(k) + ") does not match " + "incoming value (" + incomingAtt.value(k) + ") for attribute " + miningSchemaAtt.name() + "."); } } }*/ found = true; m_fieldsMap[i] = j; } } if (!found) { throw new Exception( "[MappingInfo] Unable to find a match for mining schema " + "attribute " + schemaAttName + " in the " + "incoming instances!"); } } // check class attribute (if set) if (fieldsI.classIndex() >= 0) { if (dataSet.classIndex() < 0) { // first see if we can find a matching class String className = fieldsI.classAttribute().name(); Attribute classMatch = dataSet.attribute(className); if (classMatch == null) { throw new Exception( "[MappingInfo] Can't find match for target field " + className + "in incoming instances!"); } dataSet.setClass(classMatch); } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) { throw new Exception( "[MappingInfo] class attribute in mining schema does not match " + "class attribute in incoming instances!"); } } // Set up the textual description of the mapping fieldsMappingString(fieldsI, dataSet); }
/** * Writes a batch of instances. * * @throws IOException if saving in batch mode is not possible */ public void writeBatch() throws IOException { Instances instances = getInstances(); if (instances == null) throw new IOException("No instances to save"); if (instances.classIndex() == -1) { instances.setClassIndex(instances.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (instances.attribute(instances.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); if (getRetrieval() == INCREMENTAL) throw new IOException("Batch and incremental saving cannot be mixed."); setRetrieval(BATCH); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specify output files using the -o option."); } setWriteMode(WRITE); // print names file setFileExtension(".names"); PrintWriter outW = new PrintWriter(getWriter()); for (int i = 0; i < instances.attribute(instances.classIndex()).numValues(); i++) { outW.write(instances.attribute(instances.classIndex()).value(i)); if (i < instances.attribute(instances.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < instances.numAttributes(); i++) { if (i != instances.classIndex()) { outW.write(instances.attribute(i).name() + ": "); if (instances.attribute(i).isNumeric() || instances.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = instances.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); // print data file String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException( "Cannot create data file, only names file created (Reason: " + ex.toString() + ")."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); // print data file for (int i = 0; i < instances.numInstances(); i++) { Instance temp = instances.instance(i); for (int j = 0; j < temp.numAttributes(); j++) { if (j != instances.classIndex()) { if (temp.isMissing(j)) { outW.write("?,"); } else if (instances.attribute(j).isNominal() || instances.attribute(j).isString()) { outW.write(instances.attribute(j).value((int) temp.value(j)) + ","); } else { outW.write("" + temp.value(j) + ","); } } } // write the class value if (temp.isMissing(instances.classIndex())) { outW.write("?"); } else { outW.write( instances .attribute(instances.classIndex()) .value((int) temp.value(instances.classIndex()))); } outW.write("\n"); } outW.flush(); outW.close(); setFileExtension(".names"); setWriteMode(WAIT); outW = null; resetWriter(); setWriteMode(CANCEL); }
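// A hedged usage sketch, assuming the method above belongs to Weka's C45Saver
// (weka.core.converters.C45Saver); "iris.arff" and "iris.names" are hypothetical file
// names. Per the code above, writeBatch() derives the matching .data file automatically.
import java.io.File;
import weka.core.Instances;
import weka.core.converters.C45Saver;
import weka.core.converters.ConverterUtils.DataSource;

public class C45SaveSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("iris.arff");
    data.setClassIndex(data.numAttributes() - 1); // the class must be nominal for C4.5 output
    C45Saver saver = new C45Saver();
    saver.setInstances(data);
    saver.setFile(new File("iris.names")); // writeBatch() also creates iris.data
    saver.writeBatch();
  }
}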
/** * Saves an instance incrementally. Structure has to be set by using the setStructure() method or * setInstances() method. * * @param inst the instance to save * @throws IOException if an instance cannot be saved incrementally. */ public void writeIncremental(Instance inst) throws IOException { int writeMode = getWriteMode(); Instances structure = getInstances(); PrintWriter outW = null; if (structure != null) { if (structure.classIndex() == -1) { structure.setClassIndex(structure.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (structure.attribute(structure.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); } if (getRetrieval() == BATCH || getRetrieval() == NONE) throw new IOException("Batch and incremental saving cannot be mixed."); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specify output files using the -o option."); } outW = new PrintWriter(getWriter()); if (writeMode == WAIT) { if (structure == null) { setWriteMode(CANCEL); if (inst != null) System.err.println("Structure(Header Information) has to be set in advance"); } else setWriteMode(STRUCTURE_READY); writeMode = getWriteMode(); } if (writeMode == CANCEL) { if (outW != null) outW.close(); cancel(); } if (writeMode == STRUCTURE_READY) { setWriteMode(WRITE); // write header: here names file for (int i = 0; i < structure.attribute(structure.classIndex()).numValues(); i++) { outW.write(structure.attribute(structure.classIndex()).value(i)); if (i < structure.attribute(structure.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < structure.numAttributes(); i++) { if (i != structure.classIndex()) { outW.write(structure.attribute(i).name() + ": "); if (structure.attribute(i).isNumeric() || structure.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = structure.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); writeMode = getWriteMode(); String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException("Cannot create data file, only names file created."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); } if (writeMode == WRITE) { if (structure == null) throw new IOException("No instances information available."); if (inst != null) { // write instance: here data file for (int j = 0; j < inst.numAttributes(); j++) { if (j != structure.classIndex()) { if (inst.isMissing(j)) { outW.write("?,"); } else if (structure.attribute(j).isNominal() || structure.attribute(j).isString()) { outW.write(structure.attribute(j).value((int) inst.value(j)) + ","); } else { outW.write("" + inst.value(j) + ","); } } } // write the class value if (inst.isMissing(structure.classIndex())) { outW.write("?"); } else { outW.write( structure .attribute(structure.classIndex()) .value((int) inst.value(structure.classIndex()))); } outW.write("\n"); // flushes every 100
instances m_incrementalCounter++; if (m_incrementalCounter > 100) { m_incrementalCounter = 0; outW.flush(); } } else { // close if (outW != null) { outW.flush(); outW.close(); } setFileExtension(".names"); m_incrementalCounter = 0; resetStructure(); outW = null; resetWriter(); } } }
/** * Set the output format. Merges the two selected values of the chosen nominal attribute in the current input format and calls * setOutputFormat(Instances) appropriately. */ private void setOutputFormat() { Instances newData; FastVector newAtts, newVals; boolean firstEndsWithPrime = false, secondEndsWithPrime = false; StringBuffer text = new StringBuffer(); // Compute new attributes newAtts = new FastVector(getInputFormat().numAttributes()); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (j != m_AttIndex.getIndex()) { newAtts.addElement(att.copy()); } else { // Compute new value if (att.value(m_FirstIndex.getIndex()).endsWith("'")) { firstEndsWithPrime = true; } if (att.value(m_SecondIndex.getIndex()).endsWith("'")) { secondEndsWithPrime = true; } if (firstEndsWithPrime || secondEndsWithPrime) { text.append("'"); } if (firstEndsWithPrime) { text.append( ((String) att.value(m_FirstIndex.getIndex())) .substring(1, ((String) att.value(m_FirstIndex.getIndex())).length() - 1)); } else { text.append((String) att.value(m_FirstIndex.getIndex())); } text.append('_'); if (secondEndsWithPrime) { text.append( ((String) att.value(m_SecondIndex.getIndex())) .substring(1, ((String) att.value(m_SecondIndex.getIndex())).length() - 1)); } else { text.append((String) att.value(m_SecondIndex.getIndex())); } if (firstEndsWithPrime || secondEndsWithPrime) { text.append("'"); } // Compute list of attribute values newVals = new FastVector(att.numValues() - 1); for (int i = 0; i < att.numValues(); i++) { if (i == m_FirstIndex.getIndex()) { newVals.addElement(text.toString()); } else if (i != m_SecondIndex.getIndex()) { newVals.addElement(att.value(i)); } } Attribute newAtt = new Attribute(att.name(), newVals); newAtt.setWeight(getInputFormat().attribute(j).weight()); newAtts.addElement(newAtt); } } // Construct new header newData = new Instances(getInputFormat().relationName(), newAtts, 0); newData.setClassIndex(getInputFormat().classIndex()); setOutputFormat(newData); }
/** * Sets up the panel with a new set of instances, attempting to guess the correct settings for * various columns. * * @param newInstances the new set of results. */ public void setInstances(Instances newInstances) { m_Instances = newInstances; m_TTester.setInstances(m_Instances); m_FromLab.setText("Got " + m_Instances.numInstances() + " results"); // Temporarily remove the configuration listener m_RunCombo.removeActionListener(m_ConfigureListener); // Do other stuff m_DatasetKeyModel.removeAllElements(); m_RunModel.removeAllElements(); m_ResultKeyModel.removeAllElements(); m_CompareModel.removeAllElements(); int datasetCol = -1; int runCol = -1; String selectedList = ""; String selectedListDataset = ""; // =============== BEGIN EDIT melville =============== boolean noise = false; // keep track of whether noise levels eval is required boolean learning = false; // keep track of whether learning curve eval is required boolean fraction = false; // keep track of whether fractions of datasets are provided for learning // the key on which to base the learning curves (either total instances or fraction) int learning_key = -1; boolean classificationTask = false; // used to determine if regression measures should be selected // =============== END EDIT melville =============== for (int i = 0; i < m_Instances.numAttributes(); i++) { String name = m_Instances.attribute(i).name(); m_DatasetKeyModel.addElement(name); m_RunModel.addElement(name); m_ResultKeyModel.addElement(name); m_CompareModel.addElement(name); // =============== BEGIN EDIT melville =============== // If learning curves were generated then treat each // dataset + pt combination as a different dataset if (name.toLowerCase().equals("key_noise_levels")) { // noise overrides learning curves - but treat noise levels // like pts on learning curve learning_key = i; learning = true; noise = true; // fraction = true; } else if (name.toLowerCase().equals("key_fraction_instances") && !noise) { // fraction overrides total_instances learning_key = i; learning = true; fraction = true; } else if (name.toLowerCase().equals("key_total_instances") && !learning) { learning_key = i; learning = true; } else // =============== END EDIT melville =============== if (name.toLowerCase().equals("key_dataset")) { m_DatasetKeyList.addSelectionInterval(i, i); selectedListDataset += "," + (i + 1); } else if ((runCol == -1) && (name.toLowerCase().equals("key_run"))) { m_RunCombo.setSelectedIndex(i); runCol = i; } else if (name.toLowerCase().equals("key_scheme") || name.toLowerCase().equals("key_scheme_options") || name.toLowerCase().equals("key_scheme_version_id")) { m_ResultKeyList.addSelectionInterval(i, i); selectedList += "," + (i + 1); // =============== BEGIN EDIT mbilenko =============== // automatic selection of the correct measure for clustering experiments } else if (name.toLowerCase().indexOf("pairwise_f_measure") != -1) { m_CompareCombo.setSelectedIndex(i); m_ErrorCompareCol = i; } // automatic selection of the correct measure for deduping experiments else if (name.toLowerCase().equals("precision")) { m_CompareCombo.setSelectedIndex(i); // =============== END EDIT mbilenko =============== } else if (name.toLowerCase().indexOf("percent_correct") != -1) { m_CompareCombo.setSelectedIndex(i); classificationTask = true; } else if (!classificationTask && (name.toLowerCase().indexOf("root_mean_squared_error") != -1)) { // automatic selection of the correct measure for regression experiments m_CompareCombo.setSelectedIndex(i); } else if 
(name.toLowerCase().indexOf("percent_incorrect") != -1) { m_ErrorCompareCol = i; // remember index of error for computing error reductions } } // =============== BEGIN EDIT melville =============== if (learning) { m_DatasetKeyList.addSelectionInterval(learning_key, learning_key); selectedListDataset += "," + (learning_key + 1); m_CompareModel.addElement("%Error_reduction"); m_CompareModel.addElement("Top_20%_%Error_reduction"); } // =============== END EDIT melville =============== if (runCol == -1) { runCol = 0; } m_DatasetKeyBut.setEnabled(true); m_RunCombo.setEnabled(true); m_ResultKeyBut.setEnabled(true); m_CompareCombo.setEnabled(true); // Reconnect the configuration listener m_RunCombo.addActionListener(m_ConfigureListener); // Set up the TTester with the new data m_TTester.setRunColumn(runCol); Range generatorRange = new Range(); if (selectedList.length() != 0) { try { generatorRange.setRanges(selectedList); } catch (Exception ex) { ex.printStackTrace(); System.err.println(ex.getMessage()); } } m_TTester.setResultsetKeyColumns(generatorRange); generatorRange = new Range(); if (selectedListDataset.length() != 0) { try { generatorRange.setRanges(selectedListDataset); } catch (Exception ex) { ex.printStackTrace(); System.err.println(ex.getMessage()); } } m_TTester.setDatasetKeyColumns(generatorRange); // =============== BEGIN EDIT melville =============== m_TTester.setLearningCurve(learning); m_TTester.setFraction(fraction); if (learning) { // get points on the learning curve Attribute attr; if (noise) { // override fraction attr = m_Instances.attribute("Key_Noise_levels"); } else if (fraction) { attr = m_Instances.attribute("Key_Fraction_instances"); } else { attr = m_Instances.attribute("Key_Total_instances"); } double[] pts = new double[attr.numValues()]; for (int k = 0; k < attr.numValues(); k++) { pts[k] = Double.parseDouble(attr.value(k)); } // sort points Arrays.sort(pts); m_TTester.setPoints(pts); } // =============== END EDIT melville =============== m_SigTex.setEnabled(true); m_PrecTex.setEnabled(true); setTTester(); }
/** * Reads the next instance, cycling through the class sub-directories of the text directory. * * @param structure the dataset header the new instance is to be compatible with * @return the next instance, or null once all documents have been read * @throws IOException if reading a document fails */ public Instance getNextInstance(Instances structure) throws IOException { String directoryPath = getDirectory().getAbsolutePath(); Attribute classAtt = structure.classAttribute(); if (m_filesByClass == null) { m_filesByClass = new ArrayList<LinkedList<String>>(); for (int i = 0; i < classAtt.numValues(); i++) { File classDir = new File(directoryPath + File.separator + classAtt.value(i)); String[] files = classDir.list(); LinkedList<String> classDocs = new LinkedList<String>(); for (String cd : files) { File txt = new File(directoryPath + File.separator + classAtt.value(i) + File.separator + cd); if (txt.isFile()) { classDocs.add(cd); } } m_filesByClass.add(classDocs); } } // cycle through the classes int count = 0; LinkedList<String> classContents = m_filesByClass.get(m_lastClassDir); boolean found = (classContents.size() > 0); while (classContents.size() == 0) { m_lastClassDir++; count++; if (m_lastClassDir == structure.classAttribute().numValues()) { m_lastClassDir = 0; } classContents = m_filesByClass.get(m_lastClassDir); if (classContents.size() > 0) { found = true; // we have an instance we can create break; } if (count == structure.classAttribute().numValues()) { break; // must be finished } } if (found) { String nextDoc = classContents.poll(); File txt = new File( directoryPath + File.separator + classAtt.value(m_lastClassDir) + File.separator + nextDoc); BufferedReader is; if (m_charSet == null || m_charSet.length() == 0) { is = new BufferedReader(new InputStreamReader(new FileInputStream(txt))); } else { is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } double[] newInst = null; if (m_OutputFilename) newInst = new double[3]; else newInst = new double[2]; if (getRetainStringValues()) { newInst[0] = (double) structure.attribute(0).addStringValue(txtStr.toString()); } else { newInst[0] = 0; structure.attribute(0).setStringValue(txtStr.toString()); } if (m_OutputFilename) { if (getRetainStringValues()) { newInst[1] = (double) structure.attribute(1).addStringValue(txt.getAbsolutePath()); } else { newInst[1] = 0; structure.attribute(1).setStringValue(txt.getAbsolutePath()); } } newInst[structure.classIndex()] = (double) m_lastClassDir; Instance inst = new DenseInstance(1.0, newInst); inst.setDataset(structure); is.close(); m_lastClassDir++; if (m_lastClassDir == structure.classAttribute().numValues()) { m_lastClassDir = 0; } return inst; } else { return null; // done! } }
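// A hedged usage sketch of reading documents incrementally with Weka's TextDirectoryLoader;
// "text_example" stands for a hypothetical directory containing one sub-directory per class,
// each holding plain-text files.
import java.io.File;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.TextDirectoryLoader;

public class TextDirSketch {
  public static void main(String[] args) throws Exception {
    TextDirectoryLoader loader = new TextDirectoryLoader();
    loader.setDirectory(new File("text_example"));
    Instances structure = loader.getStructure(); // header only; classes come from directory names
    Instance doc;
    while ((doc = loader.getNextInstance(structure)) != null) {
      System.out.println(doc.classValue() + "\t" + doc.stringValue(0).length() + " chars");
    }
  }
}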