/**
 * Converts the tree under this node to a string
 *
 * @param treeLevel the depth of this node; the root of a tree should have treeLevel = 0
 * @param deviation the global deviation of the class column, used for evaluating relative errors
 * @return the converted string
 */
public final String treeToString(int treeLevel, double deviation) {
  int i;
  StringBuffer text = new StringBuffer();

  if (type == true) {
    text.append("\n");
    for (i = 1; i <= treeLevel; i++) {
      text.append(" ");
    }
    if (itemsets.getAttribute(splitAttr).name().charAt(0) != '[') {
      text.append(
          itemsets.getAttribute(splitAttr).name()
              + " <= "
              + M5.doubleToStringG(splitValue, 1, 3)
              + " ");
    } else {
      text.append(itemsets.getAttribute(splitAttr).name() + " false : ");
    }
    treeLevel++;
    text.append(leftNode.treeToString(treeLevel, deviation));
    treeLevel--;
    for (i = 1; i <= treeLevel; i++) {
      text.append(" ");
    }
    if (itemsets.getAttribute(splitAttr).name().charAt(0) != '[') {
      text.append(
          itemsets.getAttribute(splitAttr).name()
              + " > "
              + M5.doubleToStringG(splitValue, 1, 3)
              + " ");
    } else {
      text.append(itemsets.getAttribute(splitAttr).name() + " true : ");
    }
    treeLevel++;
    text.append(rightNode.treeToString(treeLevel, deviation));
    treeLevel--;
  } else { // LEAF
    text.append(" THEN LM" + lm + "\n");
    /*
    if (deviation > 0.0)
      text.append(" (" + itemsets.numItemsets() + "/"
          + M5.doubleToStringG((100. * errors.rootMeanSqrErr / deviation), 1, 3) + "%)\n");
    else
      text.append(" (" + itemsets.numItemsets() + ")\n");
    */
  }
  return text.toString();
}
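/**
 * Illustrative sketch, not part of the original code: prints the subtree rooted at this node,
 * assuming this node is used as the root (treeLevel = 0) and that the caller supplies the global
 * deviation of the class column, as documented for treeToString above.
 *
 * @param deviation the global deviation of the class column
 */
public final void printTreeSketch(double deviation) {
  // the root of a tree starts at treeLevel = 0, as stated in the treeToString Javadoc
  System.out.println(this.treeToString(0, deviation));
}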
/**
 * Converts the performance measures into a string
 *
 * @param measures contains both the unsmoothed and smoothed measures
 * @param inst the itemsets
 * @param lmNo converts the predictions of all linear models if lmNo = 0, or of the one linear
 *     model specified by lmNo
 * @param verbosity the verbosity level
 * @param str the type of evaluation, one of "t" for training, "T" for testing, "f" for fold
 *     training, "F" for fold testing, "x" for cross-validation
 * @return the converted string
 * @exception Exception if something goes wrong
 */
public final String measuresToString(
    Measures measures[], MyDataset inst, int lmNo, int verbosity, String str) throws Exception {
  StringBuffer text = new StringBuffer();
  double absDev, sd;

  absDev = M5.absDev(inst.getClassIndex(), inst);
  sd = M5.stdDev(inst.getClassIndex(), inst);

  text.append(" Without smoothing:\n\n");
  if ((verbosity >= 2 || lmNo != 0) && (str.equals("T") == true || str.equals("F") == true)) {
    text.append(predictionsToString(inst, lmNo, false));
  }
  text.append(measures[0].toString(absDev, sd, str, "u") + "\n\n");

  text.append(" With smoothing:\n\n");
  if ((verbosity >= 2 || lmNo != 0) && (str.equals("T") == true || str.equals("F") == true)) {
    text.append(this.predictionsToString(inst, lmNo, true));
  }
  text.append(measures[1].toString(absDev, sd, str, "s") + "\n\n");

  return text.toString();
}
/**
 * Computes the performance measures of a tree
 *
 * @param inst the itemsets
 * @param smooth if true, uses the smoothed models; otherwise uses the unsmoothed models
 * @return the performance measures
 * @exception Exception if something goes wrong
 */
public final Measures measures(MyDataset inst, boolean smooth) throws Exception {
  int i, numItemsets, count;
  double sd, y1[], y2[];
  Measures measures = new Measures();

  errors = this.errors(inst, smooth);

  numItemsets = errors.numItemsets - errors.missingItemsets;
  y1 = new double[numItemsets];
  y2 = new double[numItemsets];
  count = 0;
  for (i = 0; i <= inst.numItemsets() - 1; i++) {
    // skip itemsets with a missing class value; the arrays only hold the non-missing ones
    if (inst.itemset(i).classIsMissing() == false) {
      y1[count] = this.predict(inst.itemset(i), smooth);
      y2[count] = inst.itemset(i).getClassValue();
      count++;
    }
  }

  measures.correlation = M5.correlation(y1, y2, numItemsets);

  sd = M5.stdDev(inst.getClassIndex(), inst);
  if (sd > 0.0) {
    measures.meanAbsErr = errors.meanAbsErr;
    measures.meanSqrErr = errors.meanSqrErr;
    measures.type = 0;
  } else {
    if (numItemsets >= 1) {
      measures.type = 1;
      measures.meanAbsErr = errors.meanAbsErr;
      measures.meanSqrErr = errors.meanSqrErr;
    } else {
      measures.type = 2;
      measures.meanAbsErr = 0.0;
      measures.meanSqrErr = 0.0;
    }
  }

  return measures;
}
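/**
 * Illustrative sketch, not part of the original code: shows how measures and measuresToString are
 * meant to be combined, with index 0 holding the unsmoothed measures and index 1 the smoothed
 * ones. The verbosity of 1 and the "T" (testing) code follow the conventions documented in
 * measuresToString; lmNo = 0 reports on all linear models.
 *
 * @param test the test itemsets
 * @return the rendered evaluation report
 * @exception Exception if something goes wrong
 */
public final String testReportSketch(MyDataset test) throws Exception {
  Measures[] results = new Measures[2];
  results[0] = this.measures(test, false); // unsmoothed models
  results[1] = this.measures(test, true); // smoothed models
  return this.measuresToString(results, test, 0, 1, "T");
}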
/**
 * Converts the predictions by the tree under this node to a string
 *
 * @param inst the itemsets
 * @param lmNo the number of the associated linear model
 * @param smooth if true, uses the smoothed models; otherwise the unsmoothed ones
 * @return the converted string
 * @exception Exception if something goes wrong
 */
public final String predictionsToString(MyDataset inst, int lmNo, boolean smooth)
    throws Exception {
  int i, lmNum;
  double value;
  StringBuffer text = new StringBuffer();

  text.append(
      " Predicting test itemsets ("
          + inst.getAttribute(inst.getClassIndex()).name()
          + ", column "
          + (inst.getClassIndex() + 1)
          + ")\n\n");
  for (i = 0; i <= inst.numItemsets() - 1; i++) {
    lmNum = this.leafNum(inst.itemset(i));
    if (lmNo == 0 || lmNo == lmNum) {
      text.append(" Predicting " + i + " (LM" + lmNum + "): ");
      text.append(inst.itemset(i).toString() + "\n");
      value = this.predict(inst.itemset(i), smooth);
      if (inst.itemset(i).classIsMissing() == false) {
        text.append(
            " Actual value: "
                + M5.doubleToStringG(inst.itemset(i).getClassValue(), 9, 4)
                + " Prediction: "
                + M5.doubleToStringG(value, 9, 4)
                + " Abs. error: "
                + M5.doubleToStringG(Math.abs(inst.itemset(i).getClassValue() - value), 9, 4)
                + "\n\n");
      } else {
        text.append(
            " Actual value: missing Prediction: "
                + M5.doubleToStringG(value, 9, 4)
                + " Abs. Error: undefined\n\n");
      }
    }
  }

  return text.toString();
}
/**
 * Finds the appropriate order of the unsmoothed linear model at this node
 *
 * @exception Exception if something goes wrong
 */
public final void function() throws Exception {
  int n, jmin, flag = 0;
  double err1, err2, sdy;
  Results e1, e2;
  Function f1 = unsmoothed;
  Function f2;

  if (f1.terms[0] != 0) {
    sdy = M5.stdDev(itemsets.getClassIndex(), itemsets);
    this.regression(f1);
    valueNode = false;
    if (model != LINEAR_REGRESSION) {
      e1 = f1.errors(itemsets);
      err1 =
          e1.rootMeanSqrErr * this.factor(itemsets.numItemsets(), f1.terms[0] + 1, pruningFactor);
      flag = 0;
      while (flag == 0) {
        jmin = f1.insignificant(sdy, itemsets);
        if (jmin == -1) {
          flag = 1;
        } else {
          f2 = f1.remove(jmin);
          this.regression(f2);
          e2 = f2.errors(itemsets);
          err2 =
              e2.rootMeanSqrErr
                  * this.factor(itemsets.numItemsets(), f2.terms[0] + 1, pruningFactor);
          if (err2 > err1 && err2 > deviation * 0.00001) {
            flag = 1;
          } else { // compare estimated error with and without attr jmin
            f1 = f2;
            err1 = err2;
            if (f1.terms[0] == 0) {
              flag = 1;
            }
          }
        }
      }
    }
    unsmoothed = f1;
  }
  if (unsmoothed.terms[0] == 0) { // constant function without attributes
    this.valueNode();
  }
}
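/**
 * Illustrative sketch, not part of the original code, of the standard M5 error-estimate
 * multiplier that factor(n, v, pruningFactor) is assumed to compute above: with n itemsets and v
 * parameters, the training error is inflated by (n + pruningFactor * v) / (n - v), so that models
 * with more terms are penalised more heavily on small nodes.
 */
private static double pruningMultiplierSketch(int n, int v, double pruningFactor) {
  if (n <= v) {
    // degenerate case: no more itemsets than parameters, so the multiplier is unbounded
    return Double.MAX_VALUE;
  }
  return (n + pruningFactor * v) / ((double) (n - v));
}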
/**
 * Creates a new instance of an object given its class name and (optional) arguments to pass to
 * its setOptions method. If the object implements OptionHandler and the options parameter is
 * non-null, the object will have its options set. Example use:
 *
 * <p><code> <pre>
 * String classifierName = M5StaticUtils.getOption('W', options);
 * Classifier c = (Classifier)M5StaticUtils.forName(Classifier.class,
 *                                                  classifierName,
 *                                                  options);
 * setClassifier(c);
 * </pre></code>
 *
 * @param classType the class that the instantiated object should be assignable to -- an exception
 *     is thrown if this is not the case
 * @param className the fully qualified class name of the object
 * @param options an array of options suitable for passing to setOptions. May be null. Any options
 *     accepted by the object will be removed from the array.
 * @return the newly created object, ready for use.
 * @exception Exception if the class name is invalid, or if the class is not assignable to the
 *     desired class type, or the options supplied are not acceptable to the object
 */
public static Object forName(Class classType, String className, String[] options)
    throws Exception {
  Class c = null;
  try {
    c = Class.forName(className);
  } catch (Exception ex) {
    throw new Exception("Can't find class called: " + className);
  }
  if (!classType.isAssignableFrom(c)) {
    throw new Exception(classType.getName() + " is not assignable from " + className);
  }
  Object o = c.newInstance();
  if ((o instanceof M5) && (options != null)) {
    ((M5) o).setOptions(options);
    M5StaticUtils.checkForRemainingOptions(options);
  }
  return o;
}
/**
 * Recursively smoothens the unsmoothed linear model at this node with the unsmoothed linear
 * models at the nodes above this one
 *
 * @param current the node whose parent's unsmoothed linear model is used for the smoothing
 */
public final void smoothenFormula(M5TreeNode current) {
  int i = smoothed.terms[0],
      j = current.upNode.unsmoothed.terms[0],
      k,
      l,
      smoothingConstant = 15;
  Function function;

  function = Function.combine(smoothed, current.upNode.unsmoothed);
  function.coeffs[0] =
      M5.smoothenValue(
          smoothed.coeffs[0],
          current.upNode.unsmoothed.coeffs[0],
          current.itemsets.numItemsets(),
          smoothingConstant);
  for (k = function.terms[0]; k >= 1; k--) {
    if (i >= 1 && j >= 1) {
      if (function.terms[k] == smoothed.terms[i]
          && function.terms[k] == current.upNode.unsmoothed.terms[j]) {
        function.coeffs[k] =
            M5.smoothenValue(
                smoothed.coeffs[i],
                current.upNode.unsmoothed.coeffs[j],
                current.itemsets.numItemsets(),
                smoothingConstant);
        i--;
        j--;
      } else if (function.terms[k] == smoothed.terms[i]
          && function.terms[k] != current.upNode.unsmoothed.terms[j]) {
        function.coeffs[k] =
            M5.smoothenValue(
                smoothed.coeffs[i], 0.0, current.itemsets.numItemsets(), smoothingConstant);
        i--;
      } else if (function.terms[k] != smoothed.terms[i]
          && function.terms[k] == current.upNode.unsmoothed.terms[j]) {
        function.coeffs[k] =
            M5.smoothenValue(
                0.0,
                current.upNode.unsmoothed.coeffs[j],
                current.itemsets.numItemsets(),
                smoothingConstant);
        j--;
      } else {
        M5.errorMsg("wrong terms value in smoothing_formula().");
      }
    } else if (i < 1 && j < 1) {
      break;
    } else if (j >= 1) {
      for (l = k; l >= 1; l--) {
        function.coeffs[l] =
            M5.smoothenValue(
                0.0,
                current.upNode.unsmoothed.coeffs[j--],
                current.itemsets.numItemsets(),
                smoothingConstant);
      }
      break;
    } else {
      for (l = k; l >= 1; l--) {
        function.coeffs[l] =
            M5.smoothenValue(
                smoothed.coeffs[i--], 0.0, current.itemsets.numItemsets(), smoothingConstant);
      }
      break;
    }
  }
  smoothed = function;
  if (current.upNode.upNode != null) {
    this.smoothenFormula(current.upNode);
  }
}
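/**
 * Illustrative sketch, not part of the original code, of the standard M5 smoothing rule that
 * M5.smoothenValue is assumed to implement above: a coefficient p of the lower model is blended
 * with the corresponding coefficient q of the model above it as (n * p + k * q) / (n + k), where
 * n is the number of itemsets at the lower node and k is the smoothing constant (15 above).
 */
private static double smoothenValueSketch(double p, double q, int n, int k) {
  return (n * p + k * q) / ((double) (n + k));
}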
/**
 * Splits the node recursively, unless there are few itemsets or the itemsets have similar values
 * of the class attribute
 *
 * @param inst the itemsets
 * @exception Exception if something goes wrong
 */
public final void split(MyDataset inst) throws Exception {
  SplitInfo s, sMax;
  int j, partition;
  MyDataset leftInst, rightInst;

  itemsets = inst;
  if (itemsets.numItemsets() < SPLIT_NUM
      || M5.stdDev(itemsets.getClassIndex(), itemsets) < deviation * 0.05) {
    type = false;
  } else {
    sMax = new SplitInfo(0, itemsets.numItemsets() - 1, -1);
    s = new SplitInfo(0, itemsets.numItemsets() - 1, -1);
    for (j = 0; j < itemsets.numAttributes(); j++) {
      if (j != itemsets.getClassIndex()) {
        itemsets.sort(itemsets.getAttribute(j));
        s.attrSplit(j, itemsets);
        if ((Math.abs(s.maxImpurity - sMax.maxImpurity) > 1.e-6)
            && (s.maxImpurity > sMax.maxImpurity + 1.e-6)) {
          sMax = s.copy();
        }
      }
    }
    if (sMax.splitAttr < 0 || sMax.position < 1 || sMax.position > itemsets.numItemsets() - 1) {
      type = false;
    }
    if (type == true) {
      sf = sMax;
      splitAttr = sMax.splitAttr; // split attribute
      splitValue = sMax.splitValue; // split value
      unsmoothed = new Function(splitAttr); // unsmoothed function

      leftInst = new MyDataset(itemsets, itemsets.numItemsets());
      rightInst = new MyDataset(itemsets, itemsets.numItemsets());
      int nmissings = 0, missings[] = new int[itemsets.numItemsets()];
      for (int i = 0; i < itemsets.numItemsets(); i++) {
        if (!itemsets.isMissing(i, splitAttr)) {
          if (itemsets.itemset(i).getValue(splitAttr) <= splitValue) {
            leftInst.addItemset(itemsets.itemset(i));
          } else {
            rightInst.addItemset(itemsets.itemset(i));
          }
        } else {
          missings[nmissings] = i;
          nmissings++;
        }
      }

      // Missing values treatment
      if (nmissings > 0) {
        // Calculate the average class value
        double avgRight = 0.0, avgLeft = 0.0;
        if (itemsets.getAttribute(splitAttr).isEnumerate()) {
          avgRight = rightInst.averageClassValue();
          avgLeft = leftInst.averageClassValue();
        } else {
          if (rightInst.numItemsets() > 3) {
            rightInst.sort(splitAttr);
            int n = rightInst.numItemsets();
            double sum =
                rightInst.itemset(n - 1).getClassValue()
                    + rightInst.itemset(n - 2).getClassValue()
                    + rightInst.itemset(n - 3).getClassValue();
            avgRight = sum / ((double) n);
          } else {
            avgRight = rightInst.averageClassValue();
          }
          if (leftInst.numItemsets() > 3) {
            leftInst.sort(splitAttr);
            int n = leftInst.numItemsets();
            double sum =
                leftInst.itemset(0).getClassValue()
                    + leftInst.itemset(1).getClassValue()
                    + leftInst.itemset(2).getClassValue();
            avgLeft = sum / ((double) n);
          } else {
            avgLeft = leftInst.averageClassValue();
          }
        }
        double avgClassValue = (avgRight + avgLeft) / 2.0;

        // Give out the missing instances
        for (int i = 0; i < nmissings; i++) {
          if (itemsets.itemset(missings[i]).getClassValue() <= avgClassValue) {
            if (avgRight <= avgLeft) {
              rightInst.addItemset(itemsets.itemset(missings[i]));
            } else {
              leftInst.addItemset(itemsets.itemset(missings[i]));
            }
          } else {
            if (avgRight > avgLeft) {
              rightInst.addItemset(itemsets.itemset(missings[i]));
            } else {
              leftInst.addItemset(itemsets.itemset(missings[i]));
            }
          }
        }
      }
      leftInst.compactify();
      rightInst.compactify();

      leftNode = new M5TreeNode(leftInst, this);
      leftNode.split(leftInst); // split left node

      rightNode = new M5TreeNode(rightInst, this);
      rightNode.split(rightInst); // split right node

      // Give the missing values the average value for the splitting attribute
      if (nmissings > 0) {
        double avgAtt = itemsets.averageValue(splitAttr);
        for (int i = 0; i < nmissings; i++) {
          itemsets.itemset(missings[i]).setValue(splitAttr, avgAtt);
        }
      }

      this.valueNode(); // function of the constant value

      if (model != REGRESSION_TREE) {
        // passes up the attributes found under the left node
        unsmoothed = Function.combine(unsmoothed, leftNode.unsmoothed);
        // passes up the attributes found under the right node
        unsmoothed = Function.combine(unsmoothed, rightNode.unsmoothed);
      } else {
        unsmoothed = new Function();
      }
    }
  }

  if (type == false) { // a leaf node
    this.leafNode();
    errors = unsmoothed.errors(itemsets);
  }
}
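/**
 * Illustrative sketch, not part of the original code: grows a tree from a training set by
 * splitting a fresh root node. The two-argument constructor mirrors the calls used for the child
 * nodes in split above; passing null as the parent of the root is an assumption of this sketch.
 *
 * @param train the training itemsets
 * @return the root of the grown tree
 * @exception Exception if something goes wrong
 */
public static M5TreeNode growTreeSketch(MyDataset train) throws Exception {
  M5TreeNode root = new M5TreeNode(train, null);
  root.split(train);
  return root;
}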