/** * Takes string values referenced by an Instance and copies them from a source dataset to a * destination dataset. The instance references are updated to be valid for the destination * dataset. The instance may have the structure (i.e. number and attribute position) of either * dataset (this affects where references are obtained from). Only works if the number of string * attributes is the same in both indices (implicitly these string attributes should be * semantically same but just with shifted positions). * * @param instance the instance containing references to strings in the source dataset that will * have references updated to be valid for the destination dataset. * @param instSrcCompat true if the instance structure is the same as the source, or false if it * is the same as the destination (i.e. which of the string attribute indices contains the * correct locations for this instance). * @param srcDataset the dataset for which the current instance string references are valid (after * any position mapping if needed) * @param srcStrAtts an array containing the indices of string attributes in the source datset. * @param destDataset the dataset for which the current instance string references need to be * inserted (after any position mapping if needed) * @param destStrAtts an array containing the indices of string attributes in the destination * datset. */ protected void copyStringValues( M5Instance instance, boolean instSrcCompat, M5Instances srcDataset, int[] srcStrAtts, M5Instances destDataset, int[] destStrAtts) { if (srcDataset == destDataset) { return; } if (srcStrAtts.length != destStrAtts.length) { throw new IllegalArgumentException("Src and Dest string indices differ in length!!"); } for (int i = 0; i < srcStrAtts.length; i++) { int instIndex = instSrcCompat ? 
srcStrAtts[i] : destStrAtts[i]; M5Attribute src = srcDataset.attribute(srcStrAtts[i]); M5Attribute dest = destDataset.attribute(destStrAtts[i]); if (!instance.isMissing(instIndex)) { // System.err.println(instance.value(srcIndex) // + " " + src.numValues() // + " " + dest.numValues()); int valIndex = dest.addStringValue(src, (int) instance.value(instIndex)); // setValue here shouldn't be too slow here unless your dataset has // squillions of string attributes instance.setValue(instIndex, (double) valIndex); } } }
/** * This will remove all buffered instances from the inputformat dataset. Use this method rather * than getInputFormat().delete(); */ protected void flushInput() { if (m_InputStringAtts.length > 0) { m_InputFormat = m_InputFormat.stringFreeStructure(); } else { // This more efficient than new Instances(m_InputFormat, 0); m_InputFormat.delete(); } }
/**
 * Sets the format of the input instances and resets the cached value ordering.
 *
 * @param instanceInfo an Instances object containing the input instance structure (any instances
 *     contained in the object are ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately, which is the case exactly when
 *     the class attribute is nominal
 * @exception Exception if the input format can't be set successfully, in particular when no
 *     class attribute has been assigned
 */
public boolean setInputFormat(M5Instances instanceInfo) throws Exception {
  superSetInputFormat(instanceInfo);

  boolean classAssigned = instanceInfo.classIndex() >= 0;
  if (!classAssigned) {
    throw new Exception("No class has been assigned to the instances");
  }

  setOutputFormat();
  m_Indices = null;
  return instanceInfo.classAttribute().isNominal();
}
/** * Gets an array containing the indices of all string attributes. * * @param insts the Instances to scan for string attributes. * @return an array containing the indices of string attributes in the input structure. Will be * zero-length if there are no string attributes */ protected int[] getStringIndices(M5Instances insts) { // Scan through getting the indices of String attributes int[] index = new int[insts.numAttributes()]; int indexSize = 0; for (int i = 0; i < insts.numAttributes(); i++) { if (insts.attribute(i).type() == M5Attribute.STRING) { index[indexSize++] = i; } } int[] result = new int[indexSize]; System.arraycopy(index, 0, result, 0, indexSize); return result; }
/** Set the output format if the class is numeric. */ private void setOutputFormatNumeric() { if (m_Indices == null) { setOutputFormat(null); return; } M5Vector newAtts; int newClassIndex; StringBuffer attributeName; M5Instances outputFormat; M5Vector vals; // Compute new attributes newClassIndex = getInputFormat().classIndex(); newAtts = new M5Vector(); for (int j = 0; j < getInputFormat().numAttributes(); j++) { M5Attribute att = getInputFormat().attribute(j); if ((!att.isNominal()) || (j == getInputFormat().classIndex())) { newAtts.addElement(att.copy()); } else { if (j < getInputFormat().classIndex()) { newClassIndex += att.numValues() - 2; } // Compute values for new attributes for (int k = 1; k < att.numValues(); k++) { attributeName = new StringBuffer(att.name() + "="); for (int l = k; l < att.numValues(); l++) { if (l > k) { attributeName.append(','); } attributeName.append(att.value(m_Indices[j][l])); } if (m_Numeric) { newAtts.addElement(new M5Attribute(attributeName.toString())); } else { vals = new M5Vector(2); vals.addElement("f"); vals.addElement("t"); newAtts.addElement(new M5Attribute(attributeName.toString(), vals)); } } } } outputFormat = new M5Instances(getInputFormat().relationName(), newAtts, 0); outputFormat.setClassIndex(newClassIndex); setOutputFormat(outputFormat); }
/**
 * Buffers the supplied input instance in the input-format dataset for later processing. Use this
 * method rather than getInputFormat().add(instance).
 *
 * <p>The instance's string values are first copied into the input-format dataset and the instance
 * is re-assigned to that dataset before being added. A null instance is ignored.
 *
 * @param instance the <code>Instance</code> to buffer.
 */
protected void bufferInput(M5Instance instance) {
  if (instance == null) {
    return;
  }
  copyStringValues(instance, m_InputFormat, m_InputStringAtts);
  instance.setDataset(m_InputFormat);
  m_InputFormat.add(instance);
}
/**
 * Base part of setting the input format: stores the string-free structure of the supplied
 * dataset, records where its string attributes are, and resets all output state (output format,
 * output queue, new-batch flag).
 *
 * @param instanceInfo the dataset whose structure becomes the input format.
 * @return always false
 * @exception Exception declared for callers; none is thrown explicitly here
 */
public boolean superSetInputFormat(M5Instances instanceInfo) throws Exception {
  // Input side: structure first, then the string attribute positions of the supplied dataset.
  m_InputFormat = instanceInfo.stringFreeStructure();
  m_InputStringAtts = getStringIndices(instanceInfo);

  // Output side: forget anything left over from a previous input format.
  m_NewBatch = true;
  m_OutputQueue = new Queue();
  m_OutputFormat = null;

  return false;
}
/** * Sets the format of output instances. The derived class should use this method once it has * determined the outputformat. The output queue is cleared. * * @param outputFormat the new output format */ protected void setOutputFormat(M5Instances outputFormat) { if (outputFormat != null) { m_OutputFormat = outputFormat.stringFreeStructure(); m_OutputStringAtts = getStringIndices(m_OutputFormat); // Rename the attribute String relationName = outputFormat.relationName() + "-" + this.getClass().getName(); if (this instanceof NominalToBinaryFilter) { String[] options = ((NominalToBinaryFilter) this).getOptions(); for (int i = 0; i < options.length; i++) { relationName += options[i].trim(); } } m_OutputFormat.setRelationName(relationName); } else { m_OutputFormat = null; } m_OutputQueue = new Queue(); }
/**
 * Copies the string values referenced by an instance into another dataset, using the dataset the
 * instance currently belongs to as the source. The instance must already be assigned to a
 * dataset, and that dataset must have the same number of attributes as the destination. Nothing
 * is done when there are no string attributes.
 *
 * @param inst the Instance containing the string values to copy.
 * @param destDataset the destination set of Instances
 * @param strAtts an array containing the indices of any string attributes in the dataset.
 * @exception IllegalArgumentException if the instance has no dataset, or if its dataset and the
 *     destination differ in number of attributes
 */
private void copyStringValues(M5Instance inst, M5Instances destDataset, int[] strAtts) {
  if (strAtts.length == 0) {
    return;
  }

  M5Instances owner = inst.dataset();
  if (owner == null) {
    throw new IllegalArgumentException("Instance has no dataset assigned!!");
  }
  if (owner.numAttributes() != destDataset.numAttributes()) {
    throw new IllegalArgumentException("Src and Dest differ in # of attributes!!");
  }

  // Same structure on both sides, so the same index array addresses source and destination.
  copyStringValues(inst, true, owner, strAtts, destDataset, strAtts);
}
/**
 * Pushes a whole dataset through a filter as one batch and gathers everything the filter emits.
 *
 * <p>Every instance of {@code data} is passed to the filter, the batch is marked as finished, and
 * the filter's pending output is then added to the dataset returned by the filter's
 * getOutputFormat().
 *
 * @param data the data to be filtered
 * @param filter the filter to be used
 * @return the filtered set of data
 * @exception Exception if the filter can't be used successfully
 */
public static M5Instances useFilter(M5Instances data, NominalToBinaryFilter filter)
    throws Exception {
  int fed = 0;
  while (fed < data.numInstances()) {
    filter.input(data.instance(fed));
    fed++;
  }
  filter.batchFinished();

  M5Instances filtered = filter.getOutputFormat();
  // Drain the output queue; output() yields null once nothing is left.
  for (M5Instance next = filter.output(); next != null; next = filter.output()) {
    filtered.add(next);
  }
  return filtered;
}
/** * Output an instance after filtering and remove from the output queue. * * @return the instance that has most recently been filtered (or null if the queue is empty). * @exception NullPointerException if no output structure has been defined */ public M5Instance output() { if (m_OutputFormat == null) { throw new NullPointerException("No output instance format defined"); } if (m_OutputQueue.empty()) { return null; } M5Instance result = (M5Instance) m_OutputQueue.pop(); // Clear out references to old strings occasionally if (m_OutputQueue.empty() && m_NewBatch) { if (m_OutputStringAtts.length > 0) { m_OutputFormat = m_OutputFormat.stringFreeStructure(); } } return result; }
/** * Method for testing filters ability to process multiple batches. * * @param options should contain the following arguments:<br> * -i (first) input file <br> * -o (first) output file <br> * -r (second) input file <br> * -s (second) output file <br> * -c class_index <br> * or -h for help on options * @exception Exception if something goes wrong or the user requests help on command options */ public static void batchFilterFile(NominalToBinaryFilter filter, String[] options) throws Exception { M5Instances firstData = null; M5Instances secondData = null; Reader firstInput = null; Reader secondInput = null; PrintWriter firstOutput = null; PrintWriter secondOutput = null; boolean helpRequest; try { helpRequest = M5StaticUtils.getFlag('h', options); String fileName = M5StaticUtils.getOption('i', options); if (fileName.length() != 0) { firstInput = new BufferedReader(new FileReader(fileName)); } else { throw new Exception("No first input file given.\n"); } fileName = M5StaticUtils.getOption('r', options); if (fileName.length() != 0) { secondInput = new BufferedReader(new FileReader(fileName)); } else { throw new Exception("No second input file given.\n"); } fileName = M5StaticUtils.getOption('o', options); if (fileName.length() != 0) { firstOutput = new PrintWriter(new FileOutputStream(fileName)); } else { firstOutput = new PrintWriter(System.out); } fileName = M5StaticUtils.getOption('s', options); if (fileName.length() != 0) { secondOutput = new PrintWriter(new FileOutputStream(fileName)); } else { secondOutput = new PrintWriter(System.out); } String classIndex = M5StaticUtils.getOption('c', options); if (filter instanceof NominalToBinaryFilter) { ((NominalToBinaryFilter) filter).setOptions(options); } M5StaticUtils.checkForRemainingOptions(options); if (helpRequest) { throw new Exception("Help requested.\n"); } firstData = new M5Instances(firstInput, 1); secondData = new M5Instances(secondInput, 1); if (!secondData.equalHeaders(firstData)) { throw new 
Exception("Input file formats differ.\n"); } if (classIndex.length() != 0) { if (classIndex.equals("first")) { firstData.setClassIndex(0); secondData.setClassIndex(0); } else if (classIndex.equals("last")) { firstData.setClassIndex(firstData.numAttributes() - 1); secondData.setClassIndex(secondData.numAttributes() - 1); } else { firstData.setClassIndex(Integer.parseInt(classIndex) - 1); secondData.setClassIndex(Integer.parseInt(classIndex) - 1); } } } catch (Exception ex) { String filterOptions = ""; // Output the error and also the valid options if (filter instanceof NominalToBinaryFilter) { filterOptions += "\nFilter options:\n\n"; Enumeration enume = ((NominalToBinaryFilter) filter).listOptions(); while (enume.hasMoreElements()) { Information option = (Information) enume.nextElement(); filterOptions += option.synopsis() + '\n' + option.description() + "\n"; } } String genericOptions = "\nGeneral options:\n\n" + "-h\n" + "\tGet help on available options.\n" + "-i <filename>\n" + "\tThe file containing first input instances.\n" + "-o <filename>\n" + "\tThe file first output instances will be written to.\n" + "-r <filename>\n" + "\tThe file containing second input instances.\n" + "-s <filename>\n" + "\tThe file second output instances will be written to.\n" + "-c <class index>\n" + "\tThe number of the attribute to use as the class.\n" + "\t\"first\" and \"last\" are also valid entries.\n" + "\tIf not supplied then no class is assigned.\n"; throw new Exception('\n' + ex.getMessage() + filterOptions + genericOptions); } boolean printedHeader = false; if (filter.setInputFormat(firstData)) { firstOutput.println(filter.getOutputFormat().toString()); printedHeader = true; } // Pass all the instances to the filter while (firstData.readInstance(firstInput)) { if (filter.input(firstData.instance(0))) { if (!printedHeader) { throw new Error("Filter didn't return true from setInputFormat() " + "earlier!"); } firstOutput.println(filter.output().toString()); } 
firstData.delete(0); } // Say that input has finished, and print any pending output instances if (filter.batchFinished()) { if (!printedHeader) { firstOutput.println(filter.getOutputFormat().toString()); } while (filter.numPendingOutput() > 0) { firstOutput.println(filter.output().toString()); } } if (firstOutput != null) { firstOutput.close(); } printedHeader = false; if (filter.isOutputFormatDefined()) { secondOutput.println(filter.getOutputFormat().toString()); printedHeader = true; } // Pass all the second instances to the filter while (secondData.readInstance(secondInput)) { if (filter.input(secondData.instance(0))) { if (!printedHeader) { throw new Error("Filter didn't return true from" + " isOutputFormatDefined() earlier!"); } secondOutput.println(filter.output().toString()); } secondData.delete(0); } // Say that input has finished, and print any pending output instances if (filter.batchFinished()) { if (!printedHeader) { secondOutput.println(filter.getOutputFormat().toString()); } while (filter.numPendingOutput() > 0) { secondOutput.println(filter.output().toString()); } } if (secondOutput != null) { secondOutput.close(); } }
/** * Method for testing filters. * * @param options should contain the following arguments: <br> * -i input_file <br> * -o output_file <br> * -c class_index <br> * or -h for help on options * @exception Exception if something goes wrong or the user requests help on command options */ public static void filterFile(NominalToBinaryFilter filter, String[] options) throws Exception { boolean debug = false; M5Instances data = null; Reader input = null; PrintWriter output = null; boolean helpRequest; try { helpRequest = M5StaticUtils.getFlag('h', options); if (M5StaticUtils.getFlag('d', options)) { debug = true; } String infileName = M5StaticUtils.getOption('i', options); String outfileName = M5StaticUtils.getOption('o', options); String classIndex = M5StaticUtils.getOption('c', options); if (filter instanceof NominalToBinaryFilter) { ((NominalToBinaryFilter) filter).setOptions(options); } M5StaticUtils.checkForRemainingOptions(options); if (helpRequest) { throw new Exception("Help requested.\n"); } if (infileName.length() != 0) { input = new BufferedReader(new FileReader(infileName)); } else { input = new BufferedReader(new InputStreamReader(System.in)); } if (outfileName.length() != 0) { output = new PrintWriter(new FileOutputStream(outfileName)); } else { output = new PrintWriter(System.out); } data = new M5Instances(input, 1); if (classIndex.length() != 0) { if (classIndex.equals("first")) { data.setClassIndex(0); } else if (classIndex.equals("last")) { data.setClassIndex(data.numAttributes() - 1); } else { data.setClassIndex(Integer.parseInt(classIndex) - 1); } } } catch (Exception ex) { String filterOptions = ""; // Output the error and also the valid options if (filter instanceof NominalToBinaryFilter) { filterOptions += "\nFilter options:\n\n"; Enumeration enuma = ((NominalToBinaryFilter) filter).listOptions(); while (enuma.hasMoreElements()) { Information option = (Information) enuma.nextElement(); filterOptions += option.synopsis() + '\n' + option.description() 
+ "\n"; } } String genericOptions = "\nGeneral options:\n\n" + "-h\n" + "\tGet help on available options.\n" + "\t(use -b -h for help on batch mode.)\n" + "-i <file>\n" + "\tThe name of the file containing input instances.\n" + "\tIf not supplied then instances will be read from stdin.\n" + "-o <file>\n" + "\tThe name of the file output instances will be written to.\n" + "\tIf not supplied then instances will be written to stdout.\n" + "-c <class index>\n" + "\tThe number of the attribute to use as the class.\n" + "\t\"first\" and \"last\" are also valid entries.\n" + "\tIf not supplied then no class is assigned.\n"; throw new Exception('\n' + ex.getMessage() + filterOptions + genericOptions); } if (debug) { System.err.println("Setting input format"); } boolean printedHeader = false; if (filter.setInputFormat(data)) { if (debug) { System.err.println("Getting output format"); } output.println(filter.getOutputFormat().toString()); printedHeader = true; } // Pass all the instances to the filter while (data.readInstance(input)) { if (debug) { System.err.println("Input instance to filter"); } if (filter.input(data.instance(0))) { if (debug) { System.err.println("Filter said collect immediately"); } if (!printedHeader) { throw new Error("Filter didn't return true from setInputFormat() " + "earlier!"); } if (debug) { System.err.println("Getting output instance"); } output.println(filter.output().toString()); } data.delete(0); } // Say that input has finished, and print any pending output instances if (debug) { System.err.println("Setting end of batch"); } if (filter.batchFinished()) { if (debug) { System.err.println("Filter said collect output"); } if (!printedHeader) { if (debug) { System.err.println("Getting output format"); } output.println(filter.getOutputFormat().toString()); } if (debug) { System.err.println("Getting output instance"); } while (filter.numPendingOutput() > 0) { output.println(filter.output().toString()); if (debug) { System.err.println("Getting 
output instance"); } } } if (debug) { System.err.println("Done"); } if (output != null) { output.close(); } }