/** * Generate artificial training examples. * * @param artSize size of examples set to create * @param data training data * @return the set of unlabeled artificial examples */ protected Instances generateArtificialData(int artSize, Instances data) { int numAttributes = data.numAttributes(); Instances artData = new Instances(data, artSize); double[] att; Instance artInstance; for (int i = 0; i < artSize; i++) { att = new double[numAttributes]; for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Select nominal value based on the frequency of occurence in the training data double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (double) selectIndexProbabilistically(stats); } else if (data.attribute(j).isNumeric()) { // Generate numeric value from the Guassian distribution // defined by the mean and std dev of the attribute double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0]; } else System.err.println("Decorate can only handle numeric and nominal values."); } artInstance = new Instance(1.0, att); artData.add(artInstance); } return artData; }
/** * Compute and store statistics required for generating artificial data. * * @param data training instances * @exception Exception if statistics could not be calculated successfully */ protected void computeStats(Instances data) throws Exception { int numAttributes = data.numAttributes(); m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Compute the probability of occurence of each distinct value int[] nomCounts = (data.attributeStats(j)).nominalCounts; double[] counts = new double[nomCounts.length]; if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!"); // Perform Laplace smoothing for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1; Utils.normalize(counts); double[] stats = new double[counts.length - 1]; stats[0] = counts[0]; // Calculate cumulative probabilities for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i]; m_AttributeStats.add(j, stats); } else if (data.attribute(j).isNumeric()) { // Get mean and standard deviation from the training data double[] stats = new double[2]; stats[0] = data.meanOrMode(j); stats[1] = Math.sqrt(data.variance(j)); m_AttributeStats.add(j, stats); } else System.err.println("Decorate can only handle numeric and nominal values."); } }
/**
 * GetKs - return [K_1,K_2,...,K_L], where K_j is the number of values that label j can take. In
 * the multi-label case, K[j] = 2 for all j = 1,...,L.
 *
 * <p>The attributes at indices {@code 0 .. classIndex - 1} are treated as the labels.
 *
 * @param D a dataset
 * @return an array of the number of values that each label can take
 */
private static int[] getKs(Instances D) {
  final int labelCount = D.classIndex();
  final int[] valuesPerLabel = new int[labelCount];

  int label = 0;
  while (label < labelCount) {
    valuesPerLabel[label] = D.attribute(label).numValues();
    label++;
  }

  return valuesPerLabel;
}
/**
 * Returns a textual description of this clusterer: a banner followed by one line per cluster
 * centroid. Nominal attribute values are printed as their labels, all other values as numbers.
 *
 * @return a description of the clusterer as a string
 */
public String toString() {
  StringBuilder description = new StringBuilder();
  description.append("\n FarthestFirst\n==============\n");
  description.append("\nCluster centroids:\n");

  for (int cluster = 0; cluster < m_NumClusters; cluster++) {
    description.append("\nCluster ").append(cluster).append("\n\t");

    for (int a = 0; a < m_ClusterCentroids.numAttributes(); a++) {
      description.append(" ");
      if (m_ClusterCentroids.attribute(a).isNominal()) {
        // The stored double is the index of the nominal label.
        int labelIndex = (int) m_ClusterCentroids.instance(cluster).value(a);
        description.append(m_ClusterCentroids.attribute(a).value(labelIndex));
      } else {
        description.append(m_ClusterCentroids.instance(cluster).value(a));
      }
    }
  }

  description.append("\n\n");
  return description.toString();
}
/** Computes the difference between two given attribute values. */ protected double difference(int index, double val1, double val2) { switch (m_instances.attribute(index).type()) { case Attribute.NOMINAL: // If attribute is nominal if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) { return 1; } else { return 0; } case Attribute.NUMERIC: // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) { return 1; } else { double diff; if (Instance.isMissingValue(val2)) { diff = norm(val1, index); } else { diff = norm(val2, index); } if (diff < 0.5) { diff = 1.0 - diff; } return diff; } } else { return norm(val1, index) - norm(val2, index); } default: return 0; } }
/** * Tests a certain range of attributes of the given data, whether it can be processed by the * handler, given its capabilities. Classifiers implementing the <code> * MultiInstanceCapabilitiesHandler</code> interface are checked automatically for their * multi-instance Capabilities (if no bags, then only the bag-structure, otherwise only the first * bag). * * @param data the data to test * @param fromIndex the range of attributes - start (incl.) * @param toIndex the range of attributes - end (incl.) * @return true if all the tests succeeded * @see MultiInstanceCapabilitiesHandler * @see #m_InstancesTest * @see #m_MissingValuesTest * @see #m_MissingClassValuesTest * @see #m_MinimumNumberInstancesTest */ public boolean test(Instances data, int fromIndex, int toIndex) { int i; int n; int m; Attribute att; Instance inst; boolean testClass; Capabilities cap; boolean missing; Iterator iter; // shall we test the data? if (!m_InstancesTest) return true; // no Capabilities? -> warning if ((m_Capabilities.size() == 0) || ((m_Capabilities.size() == 1) && handles(Capability.NO_CLASS))) System.err.println(createMessage("No capabilities set!")); // any attributes? if (toIndex - fromIndex < 0) { m_FailReason = new WekaException(createMessage("No attributes!")); return false; } // do wee need to test the class attribute, i.e., is the class attribute // within the range of attributes? 
testClass = (data.classIndex() > -1) && (data.classIndex() >= fromIndex) && (data.classIndex() <= toIndex); // attributes for (i = fromIndex; i <= toIndex; i++) { att = data.attribute(i); // class is handled separately if (i == data.classIndex()) continue; // check attribute types if (!test(att)) return false; } // class if (!handles(Capability.NO_CLASS) && (data.classIndex() == -1)) { m_FailReason = new UnassignedClassException(createMessage("Class attribute not set!")); return false; } // special case: no class attribute can be handled if (handles(Capability.NO_CLASS) && (data.classIndex() > -1)) { cap = getClassCapabilities(); cap.disable(Capability.NO_CLASS); iter = cap.capabilities(); if (!iter.hasNext()) { m_FailReason = new WekaException(createMessage("Cannot handle any class attribute!")); return false; } } if (testClass && !handles(Capability.NO_CLASS)) { att = data.classAttribute(); if (!test(att, true)) return false; // special handling of RELATIONAL class // TODO: store additional Capabilities for this case // missing class labels if (m_MissingClassValuesTest) { if (!handles(Capability.MISSING_CLASS_VALUES)) { for (i = 0; i < data.numInstances(); i++) { if (data.instance(i).classIsMissing()) { m_FailReason = new WekaException(createMessage("Cannot handle missing class values!")); return false; } } } else { if (m_MinimumNumberInstancesTest) { int hasClass = 0; for (i = 0; i < data.numInstances(); i++) { if (!data.instance(i).classIsMissing()) hasClass++; } // not enough instances with class labels? 
if (hasClass < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances with class labels (required: " + getMinimumNumberInstances() + ", provided: " + hasClass + ")!")); return false; } } } } } // missing values if (m_MissingValuesTest) { if (!handles(Capability.MISSING_VALUES)) { missing = false; for (i = 0; i < data.numInstances(); i++) { inst = data.instance(i); if (inst instanceof SparseInstance) { for (m = 0; m < inst.numValues(); m++) { n = inst.index(m); // out of scope? if (n < fromIndex) continue; if (n > toIndex) break; // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } else { for (n = fromIndex; n <= toIndex; n++) { // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } if (missing) { m_FailReason = new NoSupportForMissingValuesException( createMessage("Cannot handle missing values!")); return false; } } } } // instances if (m_MinimumNumberInstancesTest) { if (data.numInstances() < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances (required: " + getMinimumNumberInstances() + ", provided: " + data.numInstances() + ")!")); return false; } } // Multi-Instance? -> check structure (regardless of attribute range!) if (handles(Capability.ONLY_MULTIINSTANCE)) { // number of attributes? if (data.numAttributes() != 3) { m_FailReason = new WekaException( createMessage("Incorrect Multi-Instance format, must be 'bag-id, bag, class'!")); return false; } // type of attributes and position of class? 
if (!data.attribute(0).isNominal() || !data.attribute(1).isRelationValued() || (data.classIndex() != data.numAttributes() - 1)) { m_FailReason = new WekaException( createMessage( "Incorrect Multi-Instance format, must be 'NOMINAL att, RELATIONAL att, CLASS att'!")); return false; } // check data immediately if (getOwner() instanceof MultiInstanceCapabilitiesHandler) { MultiInstanceCapabilitiesHandler handler = (MultiInstanceCapabilitiesHandler) getOwner(); cap = handler.getMultiInstanceCapabilities(); boolean result; if (data.numInstances() > 0) result = cap.test(data.attribute(1).relation(0)); else result = cap.test(data.attribute(1).relation()); if (!result) { m_FailReason = cap.m_FailReason; return false; } } } // passed all tests! return true; }
/** * returns a Capabilities object specific for this data. The minimum number of instances is not * set, the check for multi-instance data is optional. * * @param data the data to base the capabilities on * @param multi if true then the structure is checked, too * @return a data-specific capabilities object * @throws Exception in case an error occurrs, e.g., an unknown attribute type */ public static Capabilities forInstances(Instances data, boolean multi) throws Exception { Capabilities result; Capabilities multiInstance; int i; int n; int m; Instance inst; boolean missing; result = new Capabilities(null); // class if (data.classIndex() == -1) { result.enable(Capability.NO_CLASS); } else { switch (data.classAttribute().type()) { case Attribute.NOMINAL: if (data.classAttribute().numValues() == 1) result.enable(Capability.UNARY_CLASS); else if (data.classAttribute().numValues() == 2) result.enable(Capability.BINARY_CLASS); else result.enable(Capability.NOMINAL_CLASS); break; case Attribute.NUMERIC: result.enable(Capability.NUMERIC_CLASS); break; case Attribute.STRING: result.enable(Capability.STRING_CLASS); break; case Attribute.DATE: result.enable(Capability.DATE_CLASS); break; case Attribute.RELATIONAL: result.enable(Capability.RELATIONAL_CLASS); break; default: throw new UnsupportedAttributeTypeException( "Unknown class attribute type '" + data.classAttribute() + "'!"); } // missing class values for (i = 0; i < data.numInstances(); i++) { if (data.instance(i).classIsMissing()) { result.enable(Capability.MISSING_CLASS_VALUES); break; } } } // attributes for (i = 0; i < data.numAttributes(); i++) { // skip class if (i == data.classIndex()) continue; switch (data.attribute(i).type()) { case Attribute.NOMINAL: result.enable(Capability.UNARY_ATTRIBUTES); if (data.attribute(i).numValues() == 2) result.enable(Capability.BINARY_ATTRIBUTES); else if (data.attribute(i).numValues() > 2) result.enable(Capability.NOMINAL_ATTRIBUTES); break; case Attribute.NUMERIC: 
result.enable(Capability.NUMERIC_ATTRIBUTES); break; case Attribute.DATE: result.enable(Capability.DATE_ATTRIBUTES); break; case Attribute.STRING: result.enable(Capability.STRING_ATTRIBUTES); break; case Attribute.RELATIONAL: result.enable(Capability.RELATIONAL_ATTRIBUTES); break; default: throw new UnsupportedAttributeTypeException( "Unknown attribute type '" + data.attribute(i).type() + "'!"); } } // missing values missing = false; for (i = 0; i < data.numInstances(); i++) { inst = data.instance(i); if (inst instanceof SparseInstance) { for (m = 0; m < inst.numValues(); m++) { n = inst.index(m); // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } else { for (n = 0; n < data.numAttributes(); n++) { // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } if (missing) { result.enable(Capability.MISSING_VALUES); break; } } // multi-instance data? if (multi) { if ((data.numAttributes() == 3) && (data.attribute(0).isNominal()) // bag-id && (data.attribute(1).isRelationValued()) // bag && (data.classIndex() == data.numAttributes() - 1)) { multiInstance = new Capabilities(null); multiInstance.or(result.getClassCapabilities()); multiInstance.enable(Capability.NOMINAL_ATTRIBUTES); multiInstance.enable(Capability.RELATIONAL_ATTRIBUTES); multiInstance.enable(Capability.ONLY_MULTIINSTANCE); result.assign(multiInstance); } } return result; }
/**
 * Gets the string describing the attributes the split depends on, i.e. the left hand side of the
 * description of the split. Here that is the name of the attribute at {@code attIndex}.
 *
 * @param dataset the dataset that the split is based on
 * @return a string describing the attributes
 */
public String attributeString(Instances dataset) {
  final String splitAttributeName = dataset.attribute(attIndex).name();
  return splitAttributeName;
}
/** * ************************************************** Convert a table to a set of instances, with * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing * <b>attributes</b> (e.g. as is common with microarray data) */ public Instances tableColsToInstances(Table t, String relationName) { System.err.print("Converting table cols to instances..."); // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int r = 0; r < t.numRows; r++) { if (rowIsNumeric(t, r)) { isNominal.add(false); atts.addElement(new Attribute(t.rowNames[r])); allAttVals.add(null); // No enumeration of attribute values. } else { // It's nominal... determine the range of values and create a nominal attribute... isNominal.add(true); FastVector attVals = getRowValues(t, r); atts.addElement(new Attribute(t.rowNames[r], attVals)); // Save it for later allAttVals.add(attVals); } } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < t.numCols; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. // For each attribute fill in the numeric or attributeValue index... for (int r = 0; r < t.numRows; r++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[r] = Instance.missingValue(); else if (isNominal.get(r)) { vals[r] = allAttVals.get(r).indexOf(val); } else { vals[r] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... 
data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int c = 0; c < t.colNames.length; c++) { instanceNames.add(t.colNames[c]); newData.instance(c).setValue(attrIdx, t.colNames[c]); } data = newData; } System.err.println("done."); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing * <b>attributes</b> */ public Instances tableRowsToNominalInstances(Table t, String relationName) { System.err.print("Converting table rows to instances..."); // Set up attributes, which for rowInstances will be the colNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int c = 0; c < t.numCols; c++) { // It's nominal... determine the range of values isNominal.add(true); FastVector attVals = getColValues(t, c); atts.addElement(new Attribute(t.colNames[c], attVals)); // Save it for later allAttVals.add(attVals); } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); // Fill the instances with data... // For each instance... for (int r = 0; r < t.numRows; r++) { double[] vals = new double[data.numAttributes()]; // for each attribute for (int c = 0; c < t.numCols; c++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[c] = Instance.missingValue(); else if (isNominal.get(c)) { vals[c] = allAttVals.get(c).indexOf(val); } else { vals[c] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... 
instanceNames = new ArrayList<String>(); for (int r = 0; r < t.rowNames.length; r++) { instanceNames.add(t.rowNames[r]); newData.instance(r).setValue(attrIdx, t.rowNames[r]); } data = newData; } System.err.println("done."); return (data); }
/** * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable. */ public static Instances readNumeric(String fileName, String relationName, String delimiter) throws Exception { int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading. String[] attrNames = new String[numAttributes]; // Read the col headings and figure out the number of columns in the table.. BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304); String line = reader.readLine(); String[] instanceNames = parseColNames(line, delimiter); int numInstances = instanceNames.length; System.err.print("reading " + numAttributes + " x " + numInstances + " table.."); // Create an array to hold the data as we read it in... double dataArray[][] = new double[numAttributes][numInstances]; // Populate the matrix with values... String valToken = ""; try { int rowIdx = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimiter, -1); attrNames[rowIdx] = tokens[0].trim(); for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) { valToken = tokens[colIdx + 1]; double value; if (valToken.equals("null")) { value = Instance.missingValue(); } else if (valToken.equals("?")) { value = Instance.missingValue(); } else if (valToken.equals("NA")) { value = Instance.missingValue(); } else if (valToken.equals("")) { value = Instance.missingValue(); // }else value = DoubleParser.lightningParse(valToken); // faster double parser with // MANY assumptions } else value = Double.parseDouble(valToken); dataArray[rowIdx][colIdx] = value; } rowIdx++; } } catch (NumberFormatException e) { System.err.println(e.toString()); System.err.println("Parsing line: " + line); System.err.println("Parsing token: " + valToken); } // Set up attributes, which for colInstances will be the rowNames... 
FastVector atts = new FastVector(); for (int a = 0; a < numAttributes; a++) { atts.addElement(new Attribute(attrNames[a])); } // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); System.err.print("creating instances.."); // System.err.println("DEBUG: numAttributes "+numAttributes); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < numInstances; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. for (int r = 0; r < numAttributes; r++) { double val = dataArray[r][c]; vals[r] = val; } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); // System.err.println("DEBUG: data.relationNAme"+data.relationName()); System.err.print("add feature names.."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 for (int c = 0; c < numInstances; c++) { newData.instance(c).setValue(attrIdx, instanceNames[c]); } data = newData; // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); return (data); }
/**
 * Returns a description of the classifier.
 *
 * <p>The description lists the number of training instances and rules, how non-matching
 * instances are covered, the search method, the evaluation set-up and the selected feature set.
 * When {@code m_displayRules} is set, a table of all rules follows: one padded column per
 * non-class attribute plus the class attribute, and for each table entry either the label with
 * the largest class count (nominal class) or the first entry value divided by the second
 * (otherwise).
 *
 * @return a description of the classifier as a string.
 */
public String toString() {
  if (m_entries == null) {
    return "Decision Table: No model built yet.";
  }

  StringBuffer out = new StringBuffer();

  out.append(
      "Decision Table:"
          + "\n\nNumber of training instances: "
          + m_numInstances
          + "\nNumber of Rules : "
          + m_entries.size()
          + "\n");
  out.append(
      m_useIBk ? "Non matches covered by IB1.\n" : "Non matches covered by Majority class.\n");
  out.append(m_search.toString());
  out.append("Evaluation (for feature selection): CV ");
  out.append((m_CVFolds > 1) ? "(" + m_CVFolds + " fold) " : "(leave one out) ");
  out.append("\nFeature set: " + printFeatures());

  if (!m_displayRules) {
    return out.toString();
  }

  // Widest attribute name or nominal label decides the column width.
  int widest = 0;
  for (int a = 0; a < m_dtInstances.numAttributes(); a++) {
    widest = Math.max(widest, m_dtInstances.attribute(a).name().length());
    if (m_classIsNominal || (a != m_dtInstances.classIndex())) {
      Enumeration labels = m_dtInstances.attribute(a).enumerateValues();
      while (labels.hasMoreElements()) {
        String label = (String) labels.nextElement();
        widest = Math.max(widest, label.length());
      }
    }
  }

  out.append("\n\nRules:\n");

  // Header row: every non-class attribute padded to the column width, then the class name.
  StringBuffer header = new StringBuffer();
  for (int a = 0; a < m_dtInstances.numAttributes(); a++) {
    if (m_dtInstances.classIndex() != a) {
      String name = m_dtInstances.attribute(a).name();
      header.append(name);
      for (int pad = widest - name.length() + 1; pad > 0; pad--) {
        header.append(" ");
      }
    }
  }
  header.append(m_dtInstances.attribute(m_dtInstances.classIndex()).name() + "  ");

  // Horizontal rule, reused above the header, below it and after the last rule.
  StringBuffer rule = new StringBuffer();
  for (int i = 0; i < header.length() + 10; i++) {
    rule.append("=");
  }

  out.append(rule).append("\n");
  out.append(header).append("\n");
  out.append(rule).append("\n");

  Enumeration keys = m_entries.keys();
  while (keys.hasMoreElements()) {
    DecisionTableHashKey key = (DecisionTableHashKey) keys.nextElement();
    out.append(key.toString(m_dtInstances, widest));

    double[] classDist = (double[]) m_entries.get(key);
    if (m_classIsNominal) {
      int best = Utils.maxIndex(classDist);
      try {
        out.append(m_dtInstances.classAttribute().value(best) + "\n");
      } catch (Exception ee) {
        System.out.println(ee.getMessage());
      }
    } else {
      out.append((classDist[0] / classDist[1]) + "\n");
    }
  }

  out.append(rule).append("\n");
  out.append("\n");

  return out.toString();
}