/** * Takes string values referenced by an Instance and copies them from a source dataset to a * destination dataset. The instance references are updated to be valid for the destination * dataset. The instance may have the structure (i.e. number and attribute position) of either * dataset (this affects where references are obtained from). Only works if the number of string * attributes is the same in both indices (implicitly these string attributes should be * semantically same but just with shifted positions). * * @param instance the instance containing references to strings in the source dataset that will * have references updated to be valid for the destination dataset. * @param instSrcCompat true if the instance structure is the same as the source, or false if it * is the same as the destination (i.e. which of the string attribute indices contains the * correct locations for this instance). * @param srcDataset the dataset for which the current instance string references are valid (after * any position mapping if needed) * @param srcStrAtts an array containing the indices of string attributes in the source datset. * @param destDataset the dataset for which the current instance string references need to be * inserted (after any position mapping if needed) * @param destStrAtts an array containing the indices of string attributes in the destination * datset. */ protected void copyStringValues( M5Instance instance, boolean instSrcCompat, M5Instances srcDataset, int[] srcStrAtts, M5Instances destDataset, int[] destStrAtts) { if (srcDataset == destDataset) { return; } if (srcStrAtts.length != destStrAtts.length) { throw new IllegalArgumentException("Src and Dest string indices differ in length!!"); } for (int i = 0; i < srcStrAtts.length; i++) { int instIndex = instSrcCompat ? 
srcStrAtts[i] : destStrAtts[i]; M5Attribute src = srcDataset.attribute(srcStrAtts[i]); M5Attribute dest = destDataset.attribute(destStrAtts[i]); if (!instance.isMissing(instIndex)) { // System.err.println(instance.value(srcIndex) // + " " + src.numValues() // + " " + dest.numValues()); int valIndex = dest.addStringValue(src, (int) instance.value(instIndex)); // setValue here shouldn't be too slow here unless your dataset has // squillions of string attributes instance.setValue(instIndex, (double) valIndex); } } }
/**
 * Computes, for every nominal attribute of the input format, the weighted average class value of
 * each attribute value, and stores in {@code m_Indices} the result of sorting those averages.
 *
 * <p>Instances whose class or whose value for the attribute is missing are skipped. When the
 * total weight seen for an attribute is greater than zero (as judged by {@code
 * M5StaticUtils.gr}), a value with no weight of its own receives the attribute's overall weighted
 * average instead. Attributes that are not nominal keep a zero-length entry in {@code m_Indices}.
 *
 * @throws Exception declared by the signature; NOTE(review): nothing in this body throws
 *     explicitly, so this can only originate from the helpers it calls
 */
private void computeAverageClassValues() throws Exception {
  final int numAtts = getInputFormat().numAttributes();
  double[][] classMeans = new double[numAtts][0];
  m_Indices = new int[numAtts][0];

  for (int attIdx = 0; attIdx < numAtts; attIdx++) {
    M5Attribute att = getInputFormat().attribute(attIdx);
    if (!att.isNominal()) {
      continue;
    }

    final int numVals = att.numValues();
    double[] means = new double[numVals];
    double[] weights = new double[numVals];
    classMeans[attIdx] = means;

    // Accumulate weight and weight * class value per nominal value.
    final int numRows = getInputFormat().numInstances();
    for (int row = 0; row < numRows; row++) {
      M5Instance current = getInputFormat().instance(row);
      if (current.classIsMissing() || current.isMissing(attIdx)) {
        continue;
      }
      int valIdx = (int) current.value(attIdx);
      weights[valIdx] += current.weight();
      means[valIdx] += current.weight() * current.classValue();
    }

    // Totals must be taken before the per-value sums are turned into averages.
    double overallSum = M5StaticUtils.sum(means);
    double overallWeight = M5StaticUtils.sum(weights);
    if (M5StaticUtils.gr(overallWeight, 0)) {
      for (int v = 0; v < numVals; v++) {
        means[v] = M5StaticUtils.gr(weights[v], 0)
            ? means[v] / weights[v]
            : overallSum / overallWeight;
      }
    }

    m_Indices[attIdx] = M5StaticUtils.sort(means);
  }
}
/** Set the output format if the class is nominal. */ private void setOutputFormatNominal() { M5Vector newAtts; int newClassIndex; StringBuffer attributeName; M5Instances outputFormat; M5Vector vals; // Compute new attributes newClassIndex = getInputFormat().classIndex(); newAtts = new M5Vector(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { M5Attribute att = getInputFormat().attribute(j); if ((!att.isNominal()) || (j == getInputFormat().classIndex())) { newAtts.addElement(att.copy()); } else { if (att.numValues() <= 2) { if (m_Numeric) { newAtts.addElement(new M5Attribute(att.name())); } else { newAtts.addElement(att.copy()); } } else { if (j < getInputFormat().classIndex()) { newClassIndex += att.numValues() - 1; } // Compute values for new attributes for (int k = 0; k < att.numValues(); k++) { attributeName = new StringBuffer(att.name() + "="); attributeName.append(att.value(k)); if (m_Numeric) { newAtts.addElement(new M5Attribute(attributeName.toString())); } else { vals = new M5Vector(2); vals.addElement("f"); vals.addElement("t"); newAtts.addElement(new M5Attribute(attributeName.toString(), vals)); } } } } } outputFormat = new M5Instances(getInputFormat().relationName(), newAtts, 0); outputFormat.setClassIndex(newClassIndex); setOutputFormat(outputFormat); }
/**
 * Converts one instance for the case where the class is numeric and pushes the converted
 * instance onto the output queue.
 *
 * <p>Non-nominal attributes and the class attribute are copied as they are. Each other nominal
 * attribute with {@code n} values is expanded into {@code n - 1} outputs: walking the ordering in
 * {@code m_Indices} for that attribute, every position before the one holding the instance's
 * value is set to 1 and the remaining positions to 0. If the value is missing, all {@code n - 1}
 * outputs receive the instance's (missing) value.
 *
 * @param instance the instance to convert
 */
private void convertInstanceNumeric(M5Instance instance) {
  double[] converted = new double[outputFormatPeek().numAttributes()];
  final int numAtts = getInputFormat().numAttributes();
  int outPos = 0;

  for (int inPos = 0; inPos < numAtts; inPos++) {
    M5Attribute att = getInputFormat().attribute(inPos);

    if (!att.isNominal() || inPos == getInputFormat().classIndex()) {
      converted[outPos] = instance.value(inPos);
      outPos++;
      continue;
    }

    final int width = att.numValues() - 1;
    if (instance.isMissing(inPos)) {
      // Carry the missing marker into every derived output.
      for (int k = 0; k < width; k++) {
        converted[outPos + k] = instance.value(inPos);
      }
    } else {
      final int nominal = (int) instance.value(inPos);
      final int[] order = m_Indices[inPos];
      int k = 0;
      // Ones up to (but excluding) the value's position in the ordering ...
      for (; order[k] != nominal; k++) {
        converted[outPos + k] = 1;
      }
      // ... and zeros for whatever is left.
      for (; k < width; k++) {
        converted[outPos + k] = 0;
      }
    }
    outPos += width;
  }

  // Keep the sparse/dense representation of the incoming instance.
  M5Instance result =
      (instance instanceof M5SparseInstance)
          ? new M5SparseInstance(instance.weight(), converted)
          : new M5Instance(instance.weight(), converted);

  copyStringValues(
      result,
      false,
      instance.dataset(),
      getInputStringIndex(),
      getOutputFormat(),
      getOutputStringIndex());
  result.setDataset(getOutputFormat());
  push(result);
}