/** * Compute and store statistics required for generating artificial data. * * @param data training instances * @exception Exception if statistics could not be calculated successfully */ protected void computeStats(Instances data) throws Exception { int numAttributes = data.numAttributes(); m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Compute the probability of occurence of each distinct value int[] nomCounts = (data.attributeStats(j)).nominalCounts; double[] counts = new double[nomCounts.length]; if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!"); // Perform Laplace smoothing for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1; Utils.normalize(counts); double[] stats = new double[counts.length - 1]; stats[0] = counts[0]; // Calculate cumulative probabilities for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i]; m_AttributeStats.add(j, stats); } else if (data.attribute(j).isNumeric()) { // Get mean and standard deviation from the training data double[] stats = new double[2]; stats[0] = data.meanOrMode(j); stats[1] = Math.sqrt(data.variance(j)); m_AttributeStats.add(j, stats); } else System.err.println("Decorate can only handle numeric and nominal values."); } }
/** * Generate artificial training examples. * * @param artSize size of examples set to create * @param data training data * @return the set of unlabeled artificial examples */ protected Instances generateArtificialData(int artSize, Instances data) { int numAttributes = data.numAttributes(); Instances artData = new Instances(data, artSize); double[] att; Instance artInstance; for (int i = 0; i < artSize; i++) { att = new double[numAttributes]; for (int j = 0; j < numAttributes; j++) { if (data.attribute(j).isNominal()) { // Select nominal value based on the frequency of occurence in the training data double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (double) selectIndexProbabilistically(stats); } else if (data.attribute(j).isNumeric()) { // Generate numeric value from the Guassian distribution // defined by the mean and std dev of the attribute double[] stats = (double[]) m_AttributeStats.get(j); att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0]; } else System.err.println("Decorate can only handle numeric and nominal values."); } artInstance = new Instance(1.0, att); artData.add(artInstance); } return artData; }
/** * Determines the output format based on the input format and returns this. In case the output * format cannot be returned immediately, i.e., immediateOutputFormat() returns false, then this * method will be called from batchFinished(). * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances data; Instances result; FastVector atts; FastVector values; HashSet hash; int i; int n; boolean isDate; Instance inst; Vector sorted; m_Cols.setUpper(inputFormat.numAttributes() - 1); data = new Instances(inputFormat); atts = new FastVector(); for (i = 0; i < data.numAttributes(); i++) { if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) { atts.addElement(data.attribute(i)); continue; } // date attribute? isDate = (data.attribute(i).type() == Attribute.DATE); // determine all available attribtues in dataset hash = new HashSet(); for (n = 0; n < data.numInstances(); n++) { inst = data.instance(n); if (inst.isMissing(i)) continue; if (isDate) hash.add(inst.stringValue(i)); else hash.add(new Double(inst.value(i))); } // sort values sorted = new Vector(); for (Object o : hash) sorted.add(o); Collections.sort(sorted); // create attribute from sorted values values = new FastVector(); for (Object o : sorted) { if (isDate) values.addElement(o.toString()); else values.addElement(Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS)); } atts.addElement(new Attribute(data.attribute(i).name(), values)); } result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
/**
 * GetKs - return [K_1,K_2,...,K_L] where each Y_j \in {1,...,K_j}. In the multi-label case, K[j]
 * = 2 for all j = 1,...,L.
 *
 * <p>L is taken to be the class index of {@code D}; the value counts of attributes {@code 0} to
 * {@code L - 1} are returned.
 *
 * @param D a dataset
 * @return an array of the number of values that each label can take
 */
private static int[] getKs(Instances D) {
  final int labelCount = D.classIndex();
  final int[] valueCounts = new int[labelCount];

  int label = 0;
  while (label < labelCount) {
    valueCounts[label] = D.attribute(label).numValues();
    label++;
  }

  return valueCounts;
}
/** * Processes the given data (may change the provided dataset) and returns the modified version. * This method is called in batchFinished(). * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected Instances process(Instances instances) throws Exception { Instances result; int i; int n; double[] values; String value; Instance inst; Instance newInst; // we need the complete input data! if (!isFirstBatchDone()) setOutputFormat(determineOutputFormat(getInputFormat())); result = new Instances(getOutputFormat()); for (i = 0; i < instances.numInstances(); i++) { inst = instances.instance(i); values = inst.toDoubleArray(); for (n = 0; n < values.length; n++) { if (!m_Cols.isInRange(n) || !instances.attribute(n).isNumeric() || inst.isMissing(n)) continue; // get index of value if (instances.attribute(n).type() == Attribute.DATE) value = inst.stringValue(n); else value = Utils.doubleToString(inst.value(n), MAX_DECIMALS); values[n] = result.attribute(n).indexOfValue(value); } // generate new instance if (inst instanceof SparseInstance) newInst = new SparseInstance(inst.weight(), values); else newInst = new DenseInstance(inst.weight(), values); // copy possible string, relational values newInst.setDataset(getOutputFormat()); copyValues(newInst, false, inst.dataset(), getOutputFormat()); result.add(newInst); } return result; }
/**
 * Returns a string describing this clusterer: a header followed by one line per cluster listing
 * the centroid's attribute values (the label for nominal attributes, the raw value otherwise).
 *
 * @return a description of the clusterer as a string
 */
public String toString() {
  StringBuilder out = new StringBuilder();
  out.append("\n FarthestFirst\n==============\n");
  out.append("\nCluster centroids:\n");

  for (int cluster = 0; cluster < m_NumClusters; cluster++) {
    out.append("\nCluster ").append(cluster).append("\n\t");

    for (int att = 0; att < m_ClusterCentroids.numAttributes(); att++) {
      out.append(" ");
      if (m_ClusterCentroids.attribute(att).isNominal()) {
        int labelIndex = (int) m_ClusterCentroids.instance(cluster).value(att);
        out.append(m_ClusterCentroids.attribute(att).value(labelIndex));
      } else {
        out.append(m_ClusterCentroids.instance(cluster).value(att));
      }
    }
  }

  out.append("\n\n");
  return out.toString();
}
/** * Initializes a gain ratio attribute evaluator. Discretizes all attributes that are numeric. * * @param data set of instances serving as training data * @throws Exception if the evaluator has not been generated successfully */ public void buildEvaluator(Instances data) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); m_trainInstances = data; m_classIndex = m_trainInstances.classIndex(); m_numAttribs = m_trainInstances.numAttributes(); m_numInstances = m_trainInstances.numInstances(); Discretize disTransform = new Discretize(); disTransform.setUseBetterEncoding(true); disTransform.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, disTransform); m_numClasses = m_trainInstances.attribute(m_classIndex).numValues(); }
/**
 * Computes the difference between two given attribute values.
 *
 * <p>Nominal attributes yield 1 when either value is missing or the values differ, otherwise 0.
 * Numeric attributes yield the (optionally normalized) signed difference; when a value is
 * missing the largest difference possible within the attribute's range is returned. Any other
 * attribute type yields 0.
 *
 * @param index the attribute index
 * @param val1 the first value
 * @param val2 the second value
 * @return the difference
 */
protected double difference(int index, double val1, double val2) {
  final int attributeType = m_Data.attribute(index).type();

  if (attributeType == Attribute.NOMINAL) {
    boolean differs =
        Utils.isMissingValue(val1) || Utils.isMissingValue(val2) || ((int) val1 != (int) val2);
    return differs ? 1 : 0;
  }

  if (attributeType != Attribute.NUMERIC) {
    return 0;
  }

  final boolean firstMissing = Utils.isMissingValue(val1);
  final boolean secondMissing = Utils.isMissingValue(val2);

  // Both values present: plain (or normalized) signed difference.
  if (!firstMissing && !secondMissing) {
    return m_DontNormalize ? (val1 - val2) : (norm(val1, index) - norm(val2, index));
  }

  // Both values missing: the full width of the range.
  if (firstMissing && secondMissing) {
    if (m_DontNormalize) {
      return (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN]);
    }
    return 1;
  }

  // Exactly one value missing: distance from the present value to the farther range end.
  final double present = secondMissing ? val1 : val2;
  if (m_DontNormalize) {
    double toMax = m_Ranges[index][R_MAX] - present;
    double toMin = present - m_Ranges[index][R_MIN];
    return (toMax > toMin) ? toMax : toMin;
  }

  double normalized = norm(present, index);
  return (normalized < 0.5) ? (1.0 - normalized) : normalized;
}
/** * Prints out the classifier. * * @return a description of the classifier as a string */ public String toString() { StringBuffer text = new StringBuffer(); text.append("SMOreg\n\n"); if (m_weights != null) { text.append("weights (not support vectors):\n"); // it's a linear machine for (int i = 0; i < m_data.numAttributes(); i++) { if (i != m_classIndex) { text.append( (m_weights[i] >= 0 ? " + " : " - ") + Utils.doubleToString(Math.abs(m_weights[i]), 12, 4) + " * "); if (m_SVM.getFilterType().getSelectedTag().getID() == SMOreg.FILTER_STANDARDIZE) { text.append("(standardized) "); } else if (m_SVM.getFilterType().getSelectedTag().getID() == SMOreg.FILTER_NORMALIZE) { text.append("(normalized) "); } text.append(m_data.attribute(i).name() + "\n"); } } } else { // non linear, print out all supportvectors text.append("Support vectors:\n"); for (int i = 0; i < m_nInstances; i++) { if (m_alpha[i] > 0) { text.append("+" + m_alpha[i] + " * k[" + i + "]\n"); } if (m_alphaStar[i] > 0) { text.append("-" + m_alphaStar[i] + " * k[" + i + "]\n"); } } } text.append((m_b <= 0 ? " + " : " - ") + Utils.doubleToString(Math.abs(m_b), 12, 4) + "\n\n"); text.append("\n\nNumber of kernel evaluations: " + m_nEvals); if (m_nCacheHits >= 0 && m_nEvals > 0) { double hitRatio = 1 - m_nEvals * 1.0 / (m_nCacheHits + m_nEvals); text.append(" (" + Utils.doubleToString(hitRatio * 100, 7, 3).trim() + "% cached)"); } return text.toString(); }
/** * Method for building an Id3 tree. * * @param data the training data * @exception Exception if decision tree can't be built successfully */ private void makeTree(Instances data) throws Exception { // Check if no instances have reached this node. if (data.numInstances() == 0) { m_Attribute = null; m_ClassValue = Utils.missingValue(); m_Distribution = new double[data.numClasses()]; return; } // Compute attribute with maximum information gain. double[] infoGains = new double[data.numAttributes()]; Enumeration attEnum = data.enumerateAttributes(); while (attEnum.hasMoreElements()) { Attribute att = (Attribute) attEnum.nextElement(); infoGains[att.index()] = computeInfoGain(data, att); } m_Attribute = data.attribute(Utils.maxIndex(infoGains)); // Make leaf if information gain is zero. // Otherwise create successors. if (Utils.eq(infoGains[m_Attribute.index()], 0)) { m_Attribute = null; m_Distribution = new double[data.numClasses()]; Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); m_Distribution[(int) inst.classValue()]++; } Utils.normalize(m_Distribution); m_ClassValue = Utils.maxIndex(m_Distribution); m_ClassAttribute = data.classAttribute(); } else { Instances[] splitData = splitData(data, m_Attribute); m_Successors = new Id3[m_Attribute.numValues()]; for (int j = 0; j < m_Attribute.numValues(); j++) { m_Successors[j] = new Id3(); m_Successors[j].makeTree(splitData[j]); } } }
/** * Generates an attribute evaluator. Has to initialise all fields of the evaluator that are not * being set via options. * * @param data set of instances serving as training data * @throws Exception if the evaluator has not been generated successfully */ public void buildEvaluator(Instances data) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); m_trainInstances = new Instances(data); m_trainInstances.deleteWithMissingClass(); m_numAttribs = m_trainInstances.numAttributes(); m_numInstances = m_trainInstances.numInstances(); // if the data has no decision feature, m_classIndex is negative m_classIndex = m_trainInstances.classIndex(); // supervised if (m_classIndex >= 0) { m_isNumeric = m_trainInstances.attribute(m_classIndex).isNumeric(); if (m_isNumeric) { m_DecisionSimilarity = m_Similarity; } else m_DecisionSimilarity = m_SimilarityEq; } m_Similarity.setInstances(m_trainInstances); m_DecisionSimilarity.setInstances(m_trainInstances); m_SimilarityEq.setInstances(m_trainInstances); m_composition = m_Similarity.getTNorm(); m_FuzzyMeasure.set( m_Similarity, m_DecisionSimilarity, m_TNorm, m_composition, m_Implicator, m_SNorm, m_numInstances, m_numAttribs, m_classIndex, m_trainInstances); }
/**
 * Returns a description of the search as a String: the attribute evaluator with its options, the
 * attribute ranking (1-based indices with names) and the merit of the best subset found.
 *
 * @return a description of the search
 */
public String toString() {
  StringBuilder out = new StringBuilder();

  out.append("\tRankSearch :\n");
  out.append("\tAttribute evaluator : ")
      .append(getAttributeEvaluator().getClass().getName())
      .append(" ");
  if (m_ASEval instanceof OptionHandler) {
    for (String option : ((OptionHandler) m_ASEval).getOptions()) {
      out.append(option).append(' ');
    }
  }
  out.append("\n");

  out.append("\tAttribute ranking : \n");
  // width needed to print the largest rank index
  int rankWidth = (int) (Math.log(m_Ranking.length) / Math.log(10) + 1);
  for (int pos = 0; pos < m_Ranking.length; pos++) {
    out.append("\t ")
        .append(Utils.doubleToString((double) (m_Ranking[pos] + 1), rankWidth, 0))
        .append(" ")
        .append(m_Instances.attribute(m_Ranking[pos]).name())
        .append('\n');
  }

  out.append("\tMerit of best subset found : ");

  // Derive field width and number of decimals from the magnitude of the merit.
  final double absMerit = Math.abs(m_bestMerit);
  final double fraction = m_bestMerit - (int) m_bestMerit;

  int fieldwidth = 3;
  if (absMerit > 0) {
    fieldwidth = (int) Math.abs((Math.log(absMerit) / Math.log(10))) + 2;
  }

  double precision;
  if (Math.abs(fraction) > 0) {
    precision = Math.abs((Math.log(Math.abs(fraction)) / Math.log(10))) + 3;
  } else {
    precision = 2;
  }
  final int decimals = (int) precision;

  out.append(Utils.doubleToString(absMerit, fieldwidth + decimals, decimals)).append("\n");
  return out.toString();
}
/** Computes the difference between two given attribute values. */ protected double difference(int index, double val1, double val2) { switch (m_instances.attribute(index).type()) { case Attribute.NOMINAL: // If attribute is nominal if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) { return 1; } else { return 0; } case Attribute.NUMERIC: // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) { return 1; } else { double diff; if (Instance.isMissingValue(val2)) { diff = norm(val1, index); } else { diff = norm(val2, index); } if (diff < 0.5) { diff = 1.0 - diff; } return diff; } } else { return norm(val1, index) - norm(val2, index); } default: return 0; } }
/**
 * Returns a description of the classifier: summary counts, how non-matches are covered, the
 * search description, the evaluation scheme, the selected feature set and, when rule display is
 * enabled, a table of all rules framed by lines of '=' characters.
 *
 * @return a description of the classifier as a string.
 */
public String toString() {
  if (m_entries == null) {
    return "Decision Table: No model built yet.";
  }

  StringBuilder text = new StringBuilder();
  text.append("Decision Table:")
      .append("\n\nNumber of training instances: ")
      .append(m_numInstances)
      .append("\nNumber of Rules : ")
      .append(m_entries.size())
      .append("\n");

  text.append(m_useIBk ? "Non matches covered by IB1.\n" : "Non matches covered by Majority class.\n");

  text.append(m_search.toString());

  text.append("Evaluation (for feature selection): CV ");
  if (m_CVFolds > 1) {
    text.append("(").append(m_CVFolds).append(" fold) ");
  } else {
    text.append("(leave one out) ");
  }
  text.append("\nFeature set: ").append(printFeatures());

  if (!m_displayRules) {
    return text.toString();
  }

  // find out the max column width over attribute names and attribute labels
  int maxColWidth = 0;
  for (int i = 0; i < m_dtInstances.numAttributes(); i++) {
    maxColWidth = Math.max(maxColWidth, m_dtInstances.attribute(i).name().length());

    if (m_classIsNominal || (i != m_dtInstances.classIndex())) {
      Enumeration labels = m_dtInstances.attribute(i).enumerateValues();
      while (labels.hasMoreElements()) {
        String label = (String) labels.nextElement();
        maxColWidth = Math.max(maxColWidth, label.length());
      }
    }
  }

  text.append("\n\nRules:\n");

  // header row: every non-class attribute padded to the column width, then the class attribute
  StringBuilder header = new StringBuilder();
  for (int i = 0; i < m_dtInstances.numAttributes(); i++) {
    if (m_dtInstances.classIndex() == i) {
      continue;
    }
    String name = m_dtInstances.attribute(i).name();
    header.append(name);
    int padding = maxColWidth - name.length() + 1;
    for (int p = 0; p < padding; p++) {
      header.append(" ");
    }
  }
  header.append(m_dtInstances.attribute(m_dtInstances.classIndex()).name()).append("  ");

  // separator line, reused above and below the header and after the rules
  StringBuilder rule = new StringBuilder();
  for (int i = 0; i < header.length() + 10; i++) {
    rule.append("=");
  }

  text.append(rule).append("\n");
  text.append(header).append("\n");
  text.append(rule).append("\n");

  Enumeration keys = m_entries.keys();
  while (keys.hasMoreElements()) {
    DecisionTableHashKey key = (DecisionTableHashKey) keys.nextElement();
    text.append(key.toString(m_dtInstances, maxColWidth));

    double[] classDist = (double[]) m_entries.get(key);
    if (m_classIsNominal) {
      int majority = Utils.maxIndex(classDist);
      try {
        text.append(m_dtInstances.classAttribute().value(majority) + "\n");
      } catch (Exception ee) {
        System.out.println(ee.getMessage());
      }
    } else {
      text.append((classDist[0] / classDist[1]) + "\n");
    }
  }

  text.append(rule).append("\n");
  text.append("\n");

  return text.toString();
}
/** * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable. */ public static Instances readNumeric(String fileName, String relationName, String delimiter) throws Exception { int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading. String[] attrNames = new String[numAttributes]; // Read the col headings and figure out the number of columns in the table.. BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304); String line = reader.readLine(); String[] instanceNames = parseColNames(line, delimiter); int numInstances = instanceNames.length; System.err.print("reading " + numAttributes + " x " + numInstances + " table.."); // Create an array to hold the data as we read it in... double dataArray[][] = new double[numAttributes][numInstances]; // Populate the matrix with values... String valToken = ""; try { int rowIdx = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimiter, -1); attrNames[rowIdx] = tokens[0].trim(); for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) { valToken = tokens[colIdx + 1]; double value; if (valToken.equals("null")) { value = Instance.missingValue(); } else if (valToken.equals("?")) { value = Instance.missingValue(); } else if (valToken.equals("NA")) { value = Instance.missingValue(); } else if (valToken.equals("")) { value = Instance.missingValue(); // }else value = DoubleParser.lightningParse(valToken); // faster double parser with // MANY assumptions } else value = Double.parseDouble(valToken); dataArray[rowIdx][colIdx] = value; } rowIdx++; } } catch (NumberFormatException e) { System.err.println(e.toString()); System.err.println("Parsing line: " + line); System.err.println("Parsing token: " + valToken); } // Set up attributes, which for colInstances will be the rowNames... 
FastVector atts = new FastVector(); for (int a = 0; a < numAttributes; a++) { atts.addElement(new Attribute(attrNames[a])); } // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); System.err.print("creating instances.."); // System.err.println("DEBUG: numAttributes "+numAttributes); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < numInstances; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. for (int r = 0; r < numAttributes; r++) { double val = dataArray[r][c]; vals[r] = val; } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); // System.err.println("DEBUG: data.relationNAme"+data.relationName()); System.err.print("add feature names.."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 for (int c = 0; c < numInstances; c++) { newData.instance(c).setValue(attrIdx, instanceNames[c]); } data = newData; // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing * <b>attributes</b> */ public Instances tableRowsToNominalInstances(Table t, String relationName) { System.err.print("Converting table rows to instances..."); // Set up attributes, which for rowInstances will be the colNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int c = 0; c < t.numCols; c++) { // It's nominal... determine the range of values isNominal.add(true); FastVector attVals = getColValues(t, c); atts.addElement(new Attribute(t.colNames[c], attVals)); // Save it for later allAttVals.add(attVals); } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); // Fill the instances with data... // For each instance... for (int r = 0; r < t.numRows; r++) { double[] vals = new double[data.numAttributes()]; // for each attribute for (int c = 0; c < t.numCols; c++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[c] = Instance.missingValue(); else if (isNominal.get(c)) { vals[c] = allAttVals.get(c).indexOf(val); } else { vals[c] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... 
instanceNames = new ArrayList<String>(); for (int r = 0; r < t.rowNames.length; r++) { instanceNames.add(t.rowNames[r]); newData.instance(r).setValue(attrIdx, t.rowNames[r]); } data = newData; } System.err.println("done."); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing * <b>attributes</b> (e.g. as is common with microarray data) */ public Instances tableColsToInstances(Table t, String relationName) { System.err.print("Converting table cols to instances..."); // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int r = 0; r < t.numRows; r++) { if (rowIsNumeric(t, r)) { isNominal.add(false); atts.addElement(new Attribute(t.rowNames[r])); allAttVals.add(null); // No enumeration of attribute values. } else { // It's nominal... determine the range of values and create a nominal attribute... isNominal.add(true); FastVector attVals = getRowValues(t, r); atts.addElement(new Attribute(t.rowNames[r], attVals)); // Save it for later allAttVals.add(attVals); } } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < t.numCols; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. // For each attribute fill in the numeric or attributeValue index... for (int r = 0; r < t.numRows; r++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[r] = Instance.missingValue(); else if (isNominal.get(r)) { vals[r] = allAttVals.get(r).indexOf(val); } else { vals[r] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... 
data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int c = 0; c < t.colNames.length; c++) { instanceNames.add(t.colNames[c]); newData.instance(c).setValue(attrIdx, t.colNames[c]); } data = newData; } System.err.println("done."); return (data); }
/** * Writes a Batch of instances * * @throws IOException throws IOException if saving in batch mode is not possible */ public void writeBatch() throws IOException { Instances instances = getInstances(); if (instances == null) throw new IOException("No instances to save"); if (instances.classIndex() == -1) { instances.setClassIndex(instances.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (instances.attribute(instances.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); if (getRetrieval() == INCREMENTAL) throw new IOException("Batch and incremental saving cannot be mixed."); setRetrieval(BATCH); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option."); } setWriteMode(WRITE); // print names file setFileExtension(".names"); PrintWriter outW = new PrintWriter(getWriter()); for (int i = 0; i < instances.attribute(instances.classIndex()).numValues(); i++) { outW.write(instances.attribute(instances.classIndex()).value(i)); if (i < instances.attribute(instances.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < instances.numAttributes(); i++) { if (i != instances.classIndex()) { outW.write(instances.attribute(i).name() + ": "); if (instances.attribute(i).isNumeric() || instances.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = instances.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); // print data file String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = 
new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException( "Cannot create data file, only names file created (Reason: " + ex.toString() + ")."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); // print data file for (int i = 0; i < instances.numInstances(); i++) { Instance temp = instances.instance(i); for (int j = 0; j < temp.numAttributes(); j++) { if (j != instances.classIndex()) { if (temp.isMissing(j)) { outW.write("?,"); } else if (instances.attribute(j).isNominal() || instances.attribute(j).isString()) { outW.write(instances.attribute(j).value((int) temp.value(j)) + ","); } else { outW.write("" + temp.value(j) + ","); } } } // write the class value if (temp.isMissing(instances.classIndex())) { outW.write("?"); } else { outW.write( instances .attribute(instances.classIndex()) .value((int) temp.value(instances.classIndex()))); } outW.write("\n"); } outW.flush(); outW.close(); setFileExtension(".names"); setWriteMode(WAIT); outW = null; resetWriter(); setWriteMode(CANCEL); }
/** * Saves an instances incrementally. Structure has to be set by using the setStructure() method or * setInstances() method. * * @param inst the instance to save * @throws IOException throws IOEXception if an instance cannot be saved incrementally. */ public void writeIncremental(Instance inst) throws IOException { int writeMode = getWriteMode(); Instances structure = getInstances(); PrintWriter outW = null; if (structure != null) { if (structure.classIndex() == -1) { structure.setClassIndex(structure.numAttributes() - 1); System.err.println("No class specified. Last attribute is used as class attribute."); } if (structure.attribute(structure.classIndex()).isNumeric()) throw new IOException("To save in C4.5 format the class attribute cannot be numeric."); } if (getRetrieval() == BATCH || getRetrieval() == NONE) throw new IOException("Batch and incremental saving cannot be mixed."); if (retrieveFile() == null || getWriter() == null) { throw new IOException( "C4.5 format requires two files. 
Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option."); } outW = new PrintWriter(getWriter()); if (writeMode == WAIT) { if (structure == null) { setWriteMode(CANCEL); if (inst != null) System.err.println("Structure(Header Information) has to be set in advance"); } else setWriteMode(STRUCTURE_READY); writeMode = getWriteMode(); } if (writeMode == CANCEL) { if (outW != null) outW.close(); cancel(); } if (writeMode == STRUCTURE_READY) { setWriteMode(WRITE); // write header: here names file for (int i = 0; i < structure.attribute(structure.classIndex()).numValues(); i++) { outW.write(structure.attribute(structure.classIndex()).value(i)); if (i < structure.attribute(structure.classIndex()).numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } for (int i = 0; i < structure.numAttributes(); i++) { if (i != structure.classIndex()) { outW.write(structure.attribute(i).name() + ": "); if (structure.attribute(i).isNumeric() || structure.attribute(i).isDate()) { outW.write("continuous.\n"); } else { Attribute temp = structure.attribute(i); for (int j = 0; j < temp.numValues(); j++) { outW.write(temp.value(j)); if (j < temp.numValues() - 1) { outW.write(","); } else { outW.write(".\n"); } } } } } outW.flush(); outW.close(); writeMode = getWriteMode(); String out = retrieveFile().getAbsolutePath(); setFileExtension(".data"); out = out.substring(0, out.lastIndexOf('.')) + getFileExtension(); File namesFile = new File(out); try { setFile(namesFile); } catch (Exception ex) { throw new IOException("Cannot create data file, only names file created."); } if (retrieveFile() == null || getWriter() == null) { throw new IOException("Cannot create data file, only names file created."); } outW = new PrintWriter(getWriter()); } if (writeMode == WRITE) { if (structure == null) throw new IOException("No instances information available."); if (inst != null) { // write instance: here data file for (int j = 0; j < 
inst.numAttributes(); j++) { if (j != structure.classIndex()) { if (inst.isMissing(j)) { outW.write("?,"); } else if (structure.attribute(j).isNominal() || structure.attribute(j).isString()) { outW.write(structure.attribute(j).value((int) inst.value(j)) + ","); } else { outW.write("" + inst.value(j) + ","); } } } // write the class value if (inst.isMissing(structure.classIndex())) { outW.write("?"); } else { outW.write( structure .attribute(structure.classIndex()) .value((int) inst.value(structure.classIndex()))); } outW.write("\n"); // flushes every 100 instances m_incrementalCounter++; if (m_incrementalCounter > 100) { m_incrementalCounter = 0; outW.flush(); } } else { // close if (outW != null) { outW.flush(); outW.close(); } setFileExtension(".names"); m_incrementalCounter = 0; resetStructure(); outW = null; resetWriter(); } } }
/** * Tests a certain range of attributes of the given data, whether it can be processed by the * handler, given its capabilities. Classifiers implementing the <code> * MultiInstanceCapabilitiesHandler</code> interface are checked automatically for their * multi-instance Capabilities (if no bags, then only the bag-structure, otherwise only the first * bag). * * @param data the data to test * @param fromIndex the range of attributes - start (incl.) * @param toIndex the range of attributes - end (incl.) * @return true if all the tests succeeded * @see MultiInstanceCapabilitiesHandler * @see #m_InstancesTest * @see #m_MissingValuesTest * @see #m_MissingClassValuesTest * @see #m_MinimumNumberInstancesTest */ public boolean test(Instances data, int fromIndex, int toIndex) { int i; int n; int m; Attribute att; Instance inst; boolean testClass; Capabilities cap; boolean missing; Iterator iter; // shall we test the data? if (!m_InstancesTest) return true; // no Capabilities? -> warning if ((m_Capabilities.size() == 0) || ((m_Capabilities.size() == 1) && handles(Capability.NO_CLASS))) System.err.println(createMessage("No capabilities set!")); // any attributes? if (toIndex - fromIndex < 0) { m_FailReason = new WekaException(createMessage("No attributes!")); return false; } // do wee need to test the class attribute, i.e., is the class attribute // within the range of attributes? 
testClass = (data.classIndex() > -1) && (data.classIndex() >= fromIndex) && (data.classIndex() <= toIndex); // attributes for (i = fromIndex; i <= toIndex; i++) { att = data.attribute(i); // class is handled separately if (i == data.classIndex()) continue; // check attribute types if (!test(att)) return false; } // class if (!handles(Capability.NO_CLASS) && (data.classIndex() == -1)) { m_FailReason = new UnassignedClassException(createMessage("Class attribute not set!")); return false; } // special case: no class attribute can be handled if (handles(Capability.NO_CLASS) && (data.classIndex() > -1)) { cap = getClassCapabilities(); cap.disable(Capability.NO_CLASS); iter = cap.capabilities(); if (!iter.hasNext()) { m_FailReason = new WekaException(createMessage("Cannot handle any class attribute!")); return false; } } if (testClass && !handles(Capability.NO_CLASS)) { att = data.classAttribute(); if (!test(att, true)) return false; // special handling of RELATIONAL class // TODO: store additional Capabilities for this case // missing class labels if (m_MissingClassValuesTest) { if (!handles(Capability.MISSING_CLASS_VALUES)) { for (i = 0; i < data.numInstances(); i++) { if (data.instance(i).classIsMissing()) { m_FailReason = new WekaException(createMessage("Cannot handle missing class values!")); return false; } } } else { if (m_MinimumNumberInstancesTest) { int hasClass = 0; for (i = 0; i < data.numInstances(); i++) { if (!data.instance(i).classIsMissing()) hasClass++; } // not enough instances with class labels? 
if (hasClass < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances with class labels (required: " + getMinimumNumberInstances() + ", provided: " + hasClass + ")!")); return false; } } } } } // missing values if (m_MissingValuesTest) { if (!handles(Capability.MISSING_VALUES)) { missing = false; for (i = 0; i < data.numInstances(); i++) { inst = data.instance(i); if (inst instanceof SparseInstance) { for (m = 0; m < inst.numValues(); m++) { n = inst.index(m); // out of scope? if (n < fromIndex) continue; if (n > toIndex) break; // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } else { for (n = fromIndex; n <= toIndex; n++) { // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } if (missing) { m_FailReason = new NoSupportForMissingValuesException( createMessage("Cannot handle missing values!")); return false; } } } } // instances if (m_MinimumNumberInstancesTest) { if (data.numInstances() < getMinimumNumberInstances()) { m_FailReason = new WekaException( createMessage( "Not enough training instances (required: " + getMinimumNumberInstances() + ", provided: " + data.numInstances() + ")!")); return false; } } // Multi-Instance? -> check structure (regardless of attribute range!) if (handles(Capability.ONLY_MULTIINSTANCE)) { // number of attributes? if (data.numAttributes() != 3) { m_FailReason = new WekaException( createMessage("Incorrect Multi-Instance format, must be 'bag-id, bag, class'!")); return false; } // type of attributes and position of class? 
if (!data.attribute(0).isNominal() || !data.attribute(1).isRelationValued() || (data.classIndex() != data.numAttributes() - 1)) { m_FailReason = new WekaException( createMessage( "Incorrect Multi-Instance format, must be 'NOMINAL att, RELATIONAL att, CLASS att'!")); return false; } // check data immediately if (getOwner() instanceof MultiInstanceCapabilitiesHandler) { MultiInstanceCapabilitiesHandler handler = (MultiInstanceCapabilitiesHandler) getOwner(); cap = handler.getMultiInstanceCapabilities(); boolean result; if (data.numInstances() > 0) result = cap.test(data.attribute(1).relation(0)); else result = cap.test(data.attribute(1).relation()); if (!result) { m_FailReason = cap.m_FailReason; return false; } } } // passed all tests! return true; }
/** * returns a Capabilities object specific for this data. The minimum number of instances is not * set, the check for multi-instance data is optional. * * @param data the data to base the capabilities on * @param multi if true then the structure is checked, too * @return a data-specific capabilities object * @throws Exception in case an error occurrs, e.g., an unknown attribute type */ public static Capabilities forInstances(Instances data, boolean multi) throws Exception { Capabilities result; Capabilities multiInstance; int i; int n; int m; Instance inst; boolean missing; result = new Capabilities(null); // class if (data.classIndex() == -1) { result.enable(Capability.NO_CLASS); } else { switch (data.classAttribute().type()) { case Attribute.NOMINAL: if (data.classAttribute().numValues() == 1) result.enable(Capability.UNARY_CLASS); else if (data.classAttribute().numValues() == 2) result.enable(Capability.BINARY_CLASS); else result.enable(Capability.NOMINAL_CLASS); break; case Attribute.NUMERIC: result.enable(Capability.NUMERIC_CLASS); break; case Attribute.STRING: result.enable(Capability.STRING_CLASS); break; case Attribute.DATE: result.enable(Capability.DATE_CLASS); break; case Attribute.RELATIONAL: result.enable(Capability.RELATIONAL_CLASS); break; default: throw new UnsupportedAttributeTypeException( "Unknown class attribute type '" + data.classAttribute() + "'!"); } // missing class values for (i = 0; i < data.numInstances(); i++) { if (data.instance(i).classIsMissing()) { result.enable(Capability.MISSING_CLASS_VALUES); break; } } } // attributes for (i = 0; i < data.numAttributes(); i++) { // skip class if (i == data.classIndex()) continue; switch (data.attribute(i).type()) { case Attribute.NOMINAL: result.enable(Capability.UNARY_ATTRIBUTES); if (data.attribute(i).numValues() == 2) result.enable(Capability.BINARY_ATTRIBUTES); else if (data.attribute(i).numValues() > 2) result.enable(Capability.NOMINAL_ATTRIBUTES); break; case Attribute.NUMERIC: 
result.enable(Capability.NUMERIC_ATTRIBUTES); break; case Attribute.DATE: result.enable(Capability.DATE_ATTRIBUTES); break; case Attribute.STRING: result.enable(Capability.STRING_ATTRIBUTES); break; case Attribute.RELATIONAL: result.enable(Capability.RELATIONAL_ATTRIBUTES); break; default: throw new UnsupportedAttributeTypeException( "Unknown attribute type '" + data.attribute(i).type() + "'!"); } } // missing values missing = false; for (i = 0; i < data.numInstances(); i++) { inst = data.instance(i); if (inst instanceof SparseInstance) { for (m = 0; m < inst.numValues(); m++) { n = inst.index(m); // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } else { for (n = 0; n < data.numAttributes(); n++) { // skip class if (n == inst.classIndex()) continue; if (inst.isMissing(n)) { missing = true; break; } } } if (missing) { result.enable(Capability.MISSING_VALUES); break; } } // multi-instance data? if (multi) { if ((data.numAttributes() == 3) && (data.attribute(0).isNominal()) // bag-id && (data.attribute(1).isRelationValued()) // bag && (data.classIndex() == data.numAttributes() - 1)) { multiInstance = new Capabilities(null); multiInstance.or(result.getClassCapabilities()); multiInstance.enable(Capability.NOMINAL_ATTRIBUTES); multiInstance.enable(Capability.RELATIONAL_ATTRIBUTES); multiInstance.enable(Capability.ONLY_MULTIINSTANCE); result.assign(multiInstance); } } return result; }
public void buildClassifier(Instances insts) throws Exception { // Compute mean of target value double yMean = insts.meanOrMode(insts.classIndex()); // Choose best attribute double minMsq = Double.MAX_VALUE; m_attribute = null; int chosen = -1; double chosenSlope = Double.NaN; double chosenIntercept = Double.NaN; for (int i = 0; i < insts.numAttributes(); i++) { if (i != insts.classIndex()) { if (!insts.attribute(i).isNumeric()) { throw new Exception("UnivariateLinearRegression: Only numeric attributes!"); } m_attribute = insts.attribute(i); // Compute slope and intercept double xMean = insts.meanOrMode(i); double sumWeightedXDiffSquared = 0; double sumWeightedYDiffSquared = 0; m_slope = 0; for (int j = 0; j < insts.numInstances(); j++) { Instance inst = insts.instance(j); if (!inst.isMissing(i) && !inst.classIsMissing()) { double xDiff = inst.value(i) - xMean; double yDiff = inst.classValue() - yMean; double weightedXDiff = inst.weight() * xDiff; double weightedYDiff = inst.weight() * yDiff; m_slope += weightedXDiff * yDiff; sumWeightedXDiffSquared += weightedXDiff * xDiff; sumWeightedYDiffSquared += weightedYDiff * yDiff; } } // Skip attribute if not useful if (sumWeightedXDiffSquared == 0) { continue; } double numerator = m_slope; m_slope /= sumWeightedXDiffSquared; m_intercept = yMean - m_slope * xMean; // Compute sum of squared errors double msq = sumWeightedYDiffSquared - m_slope * numerator; // Check whether this is the best attribute if (msq < minMsq) { minMsq = msq; chosen = i; chosenSlope = m_slope; chosenIntercept = m_intercept; } } } // Set parameters if (chosen == -1) { System.err.println("----- no useful attribute found"); m_attribute = null; m_slope = 0; m_intercept = yMean; } else { m_attribute = insts.attribute(chosen); m_slope = chosenSlope; m_intercept = chosenIntercept; } }
/**
 * Returns the left hand side of the description of the split, i.e. the name of the attribute the
 * split depends on.
 *
 * @param dataset the dataset that the split is based on
 * @return the name of the split attribute
 */
public String attributeString(Instances dataset) {
  final String splitAttributeName = dataset.attribute(attIndex).name();
  return splitAttributeName;
}
/**
 * Determines the support bound of the antecedents of this rule, one antecedent per round.
 *
 * <p>In every round each antecedent that is not finished yet is examined on a copy of the data
 * that only contains the instances covered by all <i>other</i> antecedents. For a numeric
 * antecedent the attribute values beyond its split point are tried as support bound and the one
 * with the highest purity (sum of the covered weight of the instances of the rule's consequent
 * class divided by the sum of the covered weight) is remembered. The antecedent with the highest
 * purity of the round gets its support bound set, provided the purity is not lower than the one
 * reached in earlier rounds, and is marked as finished. Antecedents that are not numeric, or for
 * which no instances are left, are marked as finished without change.
 *
 * <p>The method returns early if <code>m_Antds</code> is null, if the weight of the remaining
 * instances is not greater than zero, or if a round yields no best antecedent.
 *
 * @param thisClassifiersExtension the data used for determining the support bounds
 * @param allWeightsAreOne whether all instances have weight 1; enables an early termination of
 *     the search over the candidate bounds
 */
public void findAndSetSupportBoundForKnownAntecedents(
    Instances thisClassifiersExtension, boolean allWeightsAreOne) {
  if (m_Antds == null) return;

  // best purity reached in any of the previous rounds
  double maxPurity = Double.NEGATIVE_INFINITY;
  boolean[] finishedAntecedents = new boolean[m_Antds.size()];
  int numFinishedAntecedents = 0;

  // every round finishes at least one antecedent (or returns)
  while (numFinishedAntecedents < m_Antds.size()) {
    double maxPurityOfAllAntecedents = Double.NEGATIVE_INFINITY;
    int bestAntecedentsIndex = -1;
    double bestSupportBoundForAllAntecedents = Double.NaN;
    Instances ext = new Instances(thisClassifiersExtension, 0);

    for (int j = 0; j < m_Antds.size(); j++) {
      if (finishedAntecedents[j]) continue;

      // start from a fresh copy of the full data for each antecedent
      ext = new Instances(thisClassifiersExtension);

      /*
       * Remove instances which are not relevant, because they are not covered
       * by the _other_ antecedents.
       */
      for (int k = 0; k < m_Antds.size(); k++) {
        if (k == j) continue;
        Antd exclusionAntd = ((Antd) m_Antds.elementAt(k));
        for (int y = 0; y < ext.numInstances(); y++) {
          if (exclusionAntd.covers(ext.instance(y)) == 0) {
            // step back after the deletion so that the element moving up is not skipped
            ext.delete(y--);
          }
        }
      }

      if (ext.attribute(((Antd) m_Antds.elementAt(j)).att.index()).isNumeric()
          && ext.numInstances() > 0) {
        // work on a copy, the candidate bounds must not alter the antecedent of the rule
        NumericAntd currentAntd = (NumericAntd) ((NumericAntd) m_Antds.elementAt(j)).copy();
        currentAntd.fuzzyYet = true;
        ext.deleteWithMissing(currentAntd.att.index());
        double sumOfWeights = ext.sumOfWeights();
        if (!Utils.gr(sumOfWeights, 0.0)) return;
        ext.sort(currentAntd.att.index());

        double maxPurityForThisAntecedent = 0;
        double bestFoundSupportBound = Double.NaN;
        // accuracy and coverage sums of the most recently evaluated candidate
        double lastAccu = 0;
        double lastCover = 0;

        // Test all possible edge points
        if (currentAntd.value == 0) {
          // candidates are scanned in ascending order of the sorted attribute values
          for (int k = 1; k < ext.numInstances(); k++) {
            // break the loop if there is no gain (only works when all instances have weight 1)
            if ((lastAccu + (ext.numInstances() - k - 1))
                        / (lastCover + (ext.numInstances() - k - 1))
                    < maxPurityForThisAntecedent
                && allWeightsAreOne) {
              break;
            }
            // Bag 1
            // only values above the split point that differ from their predecessor are tried
            if (currentAntd.splitPoint < ext.instance(k).value(currentAntd.att.index())
                && ext.instance(k).value(currentAntd.att.index())
                    != ext.instance(k - 1).value(currentAntd.att.index())) {
              currentAntd.supportBound = ext.instance(k).value(currentAntd.att.index());
              double[] accuArray = new double[ext.numInstances()];
              double[] coverArray = new double[ext.numInstances()];
              for (int i = 0; i < ext.numInstances(); i++) {
                coverArray[i] = ext.instance(i).weight();
                double coverValue = currentAntd.covers(ext.instance(i));
                // the covered weight is capped at the weight of the instance
                if (coverArray[i] >= coverValue * ext.instance(i).weight()) {
                  coverArray[i] = coverValue * ext.instance(i).weight();
                  if (ext.instance(i).classValue() == m_Consequent) {
                    accuArray[i] = coverValue * ext.instance(i).weight();
                  }
                }
              }
              double purity = (Utils.sum(accuArray)) / (Utils.sum(coverArray));
              // ">=": among equally pure candidates the one evaluated last is kept
              if (purity >= maxPurityForThisAntecedent) {
                maxPurityForThisAntecedent = purity;
                bestFoundSupportBound = currentAntd.supportBound;
              }
              lastAccu = Utils.sum(accuArray);
              lastCover = Utils.sum(coverArray);
            }
          }
        } else {
          // candidates are scanned in descending order of the sorted attribute values
          for (int k = ext.numInstances() - 2; k >= 0; k--) {
            // break the loop if there is no gain (only works when all instances have weight 1)
            if ((lastAccu + (k)) / (lastCover + (k)) < maxPurityForThisAntecedent
                && allWeightsAreOne) {
              break;
            }
            // Bag 2
            // only values below the split point that differ from their successor are tried
            if (currentAntd.splitPoint > ext.instance(k).value(currentAntd.att.index())
                && ext.instance(k).value(currentAntd.att.index())
                    != ext.instance(k + 1).value(currentAntd.att.index())) {
              currentAntd.supportBound = ext.instance(k).value(currentAntd.att.index());
              double[] accuArray = new double[ext.numInstances()];
              double[] coverArray = new double[ext.numInstances()];
              for (int i = 0; i < ext.numInstances(); i++) {
                coverArray[i] = ext.instance(i).weight();
                double coverValue = currentAntd.covers(ext.instance(i));
                // the covered weight is capped at the weight of the instance
                if (coverArray[i] >= coverValue * ext.instance(i).weight()) {
                  coverArray[i] = coverValue * ext.instance(i).weight();
                  if (ext.instance(i).classValue() == m_Consequent) {
                    accuArray[i] = coverValue * ext.instance(i).weight();
                  }
                }
              }
              double purity = (Utils.sum(accuArray)) / (Utils.sum(coverArray));
              // ">=": among equally pure candidates the one evaluated last is kept
              if (purity >= maxPurityForThisAntecedent) {
                maxPurityForThisAntecedent = purity;
                bestFoundSupportBound = currentAntd.supportBound;
              }
              lastAccu = Utils.sum(accuArray);
              lastCover = Utils.sum(coverArray);
            }
          }
        }

        // remember the antecedent with the highest purity of this round
        if (maxPurityForThisAntecedent > maxPurityOfAllAntecedents) {
          bestAntecedentsIndex = j;
          bestSupportBoundForAllAntecedents = bestFoundSupportBound;
          maxPurityOfAllAntecedents = maxPurityForThisAntecedent;
        }
      } else {
        // Nominal Antd
        // (this branch is also taken for a numeric antecedent if no instances are left)
        finishedAntecedents[j] = true;
        numFinishedAntecedents++;
        continue;
      }
    }

    // no numeric antecedent was evaluated in this round
    if (bestAntecedentsIndex == -1) {
      return;
    }

    // only apply the bound if the purity does not fall below the one of earlier rounds
    if (maxPurity <= maxPurityOfAllAntecedents) {
      if (Double.isNaN(bestSupportBoundForAllAntecedents)) {
        // no candidate bound was found: the support bound is set to the split point
        ((NumericAntd) m_Antds.elementAt(bestAntecedentsIndex)).supportBound =
            ((NumericAntd) m_Antds.elementAt(bestAntecedentsIndex)).splitPoint;
      } else {
        ((NumericAntd) m_Antds.elementAt(bestAntecedentsIndex)).supportBound =
            bestSupportBoundForAllAntecedents;
        ((NumericAntd) m_Antds.elementAt(bestAntecedentsIndex)).fuzzyYet = true;
      }
      maxPurity = maxPurityOfAllAntecedents;
    }
    finishedAntecedents[bestAntecedentsIndex] = true;
    numFinishedAntecedents++;
  }
}
/** * This function fits the rule to the data which it overlaps. This way the rule can only * interpolate but not extrapolate. * * @param instances The data to which the rule shall be fitted */ public void fitAndSetCoreBound(Instances instances) { if (m_Antds == null) return; boolean[] antExistingForDimension = new boolean[instances.numAttributes() - 1]; for (int i = 0; i < m_Antds.size(); i++) { antExistingForDimension[((Antd) m_Antds.elementAt(i)).att.index()] = true; } FastVector newAntds = new FastVector(10); // for (int i=0; i < instances.numAttributes()-1; i++){ for (int iterator = 0; iterator < m_Antds.size(); iterator++) { int i = ((Antd) m_Antds.elementAt(iterator)).getAttr().index(); if (!antExistingForDimension[i]) continue; // Excluding non existant antecedents Instances instancesWithoutMissingValues = new Instances(instances); instancesWithoutMissingValues.deleteWithMissing(i); if (instancesWithoutMissingValues.attribute(i).isNumeric() && instancesWithoutMissingValues.numInstances() > 0) { boolean bag0AntdExists = false; boolean bag1AntdExists = false; for (int j = 0; j < m_Antds.size(); j++) { if (((Antd) m_Antds.elementAt(j)).att.index() == i) { if (((Antd) m_Antds.elementAt(j)).value == 0) { bag0AntdExists = true; } else { bag1AntdExists = true; } newAntds.addElement((Antd) m_Antds.elementAt(j)); } } double higherCore = Double.NaN; double lowerCore = Double.NaN; if (!bag0AntdExists) { if (Double.isNaN(higherCore)) higherCore = instancesWithoutMissingValues.kthSmallestValue( i, instancesWithoutMissingValues.numInstances()); NumericAntd antd; antd = new NumericAntd(instancesWithoutMissingValues.attribute(i)); antd.value = 0; antd.splitPoint = higherCore; newAntds.addElement(antd); } if (!bag1AntdExists) { if (Double.isNaN(lowerCore)) lowerCore = instancesWithoutMissingValues.kthSmallestValue(i, 1); NumericAntd antd; antd = new NumericAntd(instancesWithoutMissingValues.attribute(i)); antd.value = 1; antd.splitPoint = lowerCore; 
newAntds.addElement(antd); } } else { for (int j = 0; j < m_Antds.size(); j++) { if (((Antd) m_Antds.elementAt(j)).att.index() == i) { newAntds.addElement(m_Antds.elementAt(j)); } } } } m_Antds = newAntds; }
/** * evaluates an individual attribute by measuring the gain ratio of the class given the attribute. * * @param attribute the index of the attribute to be evaluated * @return the gain ratio * @throws Exception if the attribute could not be evaluated */ public double evaluateAttribute(int attribute) throws Exception { int i, j, ii, jj; int ni, nj; double sum = 0.0; ni = m_trainInstances.attribute(attribute).numValues() + 1; nj = m_numClasses + 1; double[] sumi, sumj; Instance inst; double temp = 0.0; sumi = new double[ni]; sumj = new double[nj]; double[][] counts = new double[ni][nj]; sumi = new double[ni]; sumj = new double[nj]; for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumj[j] = 0.0; counts[i][j] = 0.0; } } // Fill the contingency table for (i = 0; i < m_numInstances; i++) { inst = m_trainInstances.instance(i); if (inst.isMissing(attribute)) { ii = ni - 1; } else { ii = (int) inst.value(attribute); } if (inst.isMissing(m_classIndex)) { jj = nj - 1; } else { jj = (int) inst.value(m_classIndex); } counts[ii][jj]++; } // get the row totals for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumi[i] += counts[i][j]; sum += counts[i][j]; } } // get the column totals for (j = 0; j < nj; j++) { sumj[j] = 0.0; for (i = 0; i < ni; i++) { sumj[j] += counts[i][j]; } } // distribute missing counts if (m_missing_merge && (sumi[ni - 1] < m_numInstances) && (sumj[nj - 1] < m_numInstances)) { double[] i_copy = new double[sumi.length]; double[] j_copy = new double[sumj.length]; double[][] counts_copy = new double[sumi.length][sumj.length]; for (i = 0; i < ni; i++) { System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length); } System.arraycopy(sumi, 0, i_copy, 0, sumi.length); System.arraycopy(sumj, 0, j_copy, 0, sumj.length); double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]); // do the missing i's if (sumi[ni - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { if (counts[ni - 1][j] > 0.0) { for (i = 0; i < ni - 
1; i++) { temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]); counts[i][j] += temp; sumi[i] += temp; } counts[ni - 1][j] = 0.0; } } } sumi[ni - 1] = 0.0; // do the missing j's if (sumj[nj - 1] > 0.0) { for (i = 0; i < ni - 1; i++) { if (counts[i][nj - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]); counts[i][j] += temp; sumj[j] += temp; } counts[i][nj - 1] = 0.0; } } } sumj[nj - 1] = 0.0; // do the both missing if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) { for (i = 0; i < ni - 1; i++) { for (j = 0; j < nj - 1; j++) { temp = (counts_copy[i][j] / (sum - total_missing)) * counts_copy[ni - 1][nj - 1]; counts[i][j] += temp; sumi[i] += temp; sumj[j] += temp; } } counts[ni - 1][nj - 1] = 0.0; } } return ContingencyTables.gainRatio(counts); }