/**
 * Verifies that no attribute index is claimed by two contradicting type ranges.
 *
 * @return an empty string if no attribute index lies in both the boolean and the nominal range,
 *     otherwise an error message naming the first conflicting index
 */
protected String checkIndices() {
  int attIndex = 0;
  while (attIndex < getNumAttributes()) {
    // the nominal range is only consulted when the boolean range already matched
    boolean conflicting = m_booleanCols.isInRange(attIndex) && m_nominalCols.isInRange(attIndex);
    if (conflicting) {
      return "Error in attribute type: Attribute " + attIndex + " is set boolean and nominal.";
    }
    attIndex++;
  }
  return "";
}
/** * Checks the current instance against what is known about the structure of the data set so far. * If there is a nominal value for an attribute that was beleived to be numeric then all * previously seen values for this attribute are stored in a Hashtable. * * @param current a <code>ArrayList</code> value * @exception Exception if an error occurs * <pre><jml> * private_normal_behavior * requires: current != null; * also * private_exceptional_behavior * requires: current == null * || (* unrecognized object type in current *); * signals: (Exception); * </jml></pre> */ private void checkStructure(ArrayList<Object> current) throws Exception { if (current == null) { throw new Exception("current shouldn't be null in checkStructure"); } // initialize ranges, if necessary if (m_FirstCheck) { m_NominalAttributes.setUpper(current.size() - 1); m_StringAttributes.setUpper(current.size() - 1); m_FirstCheck = false; } for (int i = 0; i < current.size(); i++) { Object ob = current.get(i); if ((ob instanceof String) || (m_NominalAttributes.isInRange(i)) || (m_StringAttributes.isInRange(i))) { if (ob.toString().compareTo(m_MissingValue) == 0) { // do nothing } else { Hashtable<Object, Integer> tempHash = m_cumulativeStructure.get(i); if (!tempHash.containsKey(ob)) { // may have found a nominal value in what was previously thought to // be a numeric variable. 
if (tempHash.size() == 0) { for (int j = 0; j < m_cumulativeInstances.size(); j++) { ArrayList tempUpdate = ((ArrayList) m_cumulativeInstances.get(j)); Object tempO = tempUpdate.get(i); if (tempO instanceof String) { // must have been a missing value } else { if (!tempHash.containsKey(tempO)) { tempHash.put( new Double(((Double) tempO).doubleValue()), new Integer(tempHash.size())); } } } } int newIndex = tempHash.size(); tempHash.put(ob, new Integer(newIndex)); } } } else if (ob instanceof Double) { Hashtable<Object, Integer> tempHash = m_cumulativeStructure.get(i); if (tempHash.size() != 0) { if (!tempHash.containsKey(ob)) { int newIndex = tempHash.size(); tempHash.put(new Double(((Double) ob).doubleValue()), new Integer(newIndex)); } } } else { throw new Exception("Wrong object type in checkStructure!"); } } }
/** * Set the output format. Takes the current average class values and m_InputFormat and calls * setOutputFormat(Instances) appropriately. */ private void setOutputFormat() { Instances newData; FastVector newAtts, newVals; // Compute new attributes newAtts = new FastVector(getInputFormat().numAttributes()); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (!m_AttIndices.isInRange(j) || !att.isString()) { // We don't have to copy the attribute because the // attribute index remains unchanged. newAtts.addElement(att); } else { // Compute list of attribute values newVals = new FastVector(att.numValues()); for (int i = 0; i < att.numValues(); i++) { newVals.addElement(att.value(i)); } newAtts.addElement(new Attribute(att.name(), newVals)); } } // Construct new header newData = new Instances(getInputFormat().relationName(), newAtts, 0); newData.setClassIndex(getInputFormat().classIndex()); setOutputFormat(newData); }
/** * Convert a single instance over. The converted instance is added to the end of the output queue. * * @param instance the instance to convert */ protected void convertInstance(Instance instance) { int index = 0; double[] vals = new double[outputFormatPeek().numAttributes()]; // Copy and convert the values for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (m_DiscretizeCols.isInRange(i) && getInputFormat().attribute(i).isNumeric()) { int j; double currentVal = instance.value(i); if (m_CutPoints[i] == null) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { vals[index] = 0; } index++; } else { if (!m_MakeBinary) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (currentVal <= m_CutPoints[i][j]) { break; } } vals[index] = j; } index++; } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else if (currentVal <= m_CutPoints[i][j]) { vals[index] = 0; } else { vals[index] = 1; } index++; } } } } else { vals[index] = instance.value(i); index++; } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } inst.setDataset(getOutputFormat()); copyValues(inst, false, instance.dataset(), getOutputFormat()); inst.setDataset(getOutputFormat()); push(inst); }
/**
 * Input an instance for filtering. The instance is processed and made available for output
 * immediately: for every selected attribute the current label is looked up in the rename map
 * and, if a replacement exists, the value is re-indexed against the output format.
 *
 * <p>Missing values are passed through unchanged. They are detected with
 * {@code Utils.isMissingValue}; a plain {@code ==} comparison against the missing-value marker
 * can never succeed because the marker is NaN, which would cause a missing value to be treated
 * as label index 0.
 *
 * @param instance the input instance
 * @return true if the filtered instance may now be collected with output(); false if the output
 *     format has no attributes
 * @throws IllegalStateException if no input structure has been defined.
 */
@Override
public boolean input(Instance instance) {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }
  if (getOutputFormat().numAttributes() == 0) {
    return false;
  }

  if (m_selectedAttributes.length == 0) {
    // nothing selected: pass the instance through untouched
    push(instance);
    return true;
  }

  double[] vals = new double[getOutputFormat().numAttributes()];
  for (int i = 0; i < instance.numAttributes(); i++) {
    double currentV = instance.value(i);
    // default: keep the incoming value
    vals[i] = currentV;

    if (!m_selectedCols.isInRange(i) || Utils.isMissingValue(currentV)) {
      continue;
    }

    String currentS = instance.attribute(i).value((int) currentV);
    String replace =
        m_ignoreCase ? m_renameMap.get(currentS.toLowerCase()) : m_renameMap.get(currentS);
    if (replace != null) {
      vals[i] = getOutputFormat().attribute(i).indexOfValue(replace);
    }
  }

  Instance inst;
  if (instance instanceof SparseInstance) {
    inst = new SparseInstance(instance.weight(), vals);
  } else {
    inst = new DenseInstance(instance.weight(), vals);
  }
  inst.setDataset(getOutputFormat());
  copyValues(inst, false, instance.dataset(), getOutputFormat());
  inst.setDataset(getOutputFormat());
  push(inst);

  return true;
}
/** * Determines the output format based on the input format and returns this. In case the output * format cannot be returned immediately, i.e., immediateOutputFormat() returns false, then this * method will be called from batchFinished(). * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances data; Instances result; FastVector atts; FastVector values; HashSet hash; int i; int n; boolean isDate; Instance inst; Vector sorted; m_Cols.setUpper(inputFormat.numAttributes() - 1); data = new Instances(inputFormat); atts = new FastVector(); for (i = 0; i < data.numAttributes(); i++) { if (!m_Cols.isInRange(i) || !data.attribute(i).isNumeric()) { atts.addElement(data.attribute(i)); continue; } // date attribute? isDate = (data.attribute(i).type() == Attribute.DATE); // determine all available attribtues in dataset hash = new HashSet(); for (n = 0; n < data.numInstances(); n++) { inst = data.instance(n); if (inst.isMissing(i)) continue; if (isDate) hash.add(inst.stringValue(i)); else hash.add(new Double(inst.value(i))); } // sort values sorted = new Vector(); for (Object o : hash) sorted.add(o); Collections.sort(sorted); // create attribute from sorted values values = new FastVector(); for (Object o : sorted) { if (isDate) values.addElement(o.toString()); else values.addElement(Utils.doubleToString(((Double) o).doubleValue(), MAX_DECIMALS)); } atts.addElement(new Attribute(data.attribute(i).name(), values)); } result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
/** Generate the cutpoints for each attribute */ protected void calculateCutPoints() { Instances copy = null; m_CutPoints = new double[getInputFormat().numAttributes()][]; for (int i = getInputFormat().numAttributes() - 1; i >= 0; i--) { if ((m_DiscretizeCols.isInRange(i)) && (getInputFormat().attribute(i).isNumeric())) { // Use copy to preserve order if (copy == null) { copy = new Instances(getInputFormat()); } calculateCutPointsByMDL(i, copy); } } }
/** * Determines the output format based on the input format and returns this. * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances result; Attribute att; Attribute attSorted; FastVector atts; FastVector values; Vector<String> sorted; int i; int n; m_AttributeIndices.setUpper(inputFormat.numAttributes() - 1); // determine sorted indices atts = new FastVector(); m_NewOrder = new int[inputFormat.numAttributes()][]; for (i = 0; i < inputFormat.numAttributes(); i++) { att = inputFormat.attribute(i); if (!att.isNominal() || !m_AttributeIndices.isInRange(i)) { m_NewOrder[i] = new int[0]; atts.addElement(inputFormat.attribute(i).copy()); continue; } // sort labels sorted = new Vector<String>(); for (n = 0; n < att.numValues(); n++) sorted.add(att.value(n)); Collections.sort(sorted, m_Comparator); // determine new indices m_NewOrder[i] = new int[att.numValues()]; values = new FastVector(); for (n = 0; n < att.numValues(); n++) { m_NewOrder[i][n] = sorted.indexOf(att.value(n)); values.addElement(sorted.get(n)); } attSorted = new Attribute(att.name(), values); attSorted.setWeight(att.weight()); atts.addElement(attSorted); } // generate new header result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
/** * Set the output format. Takes the current average class values and m_InputFormat and calls * setOutputFormat(Instances) appropriately. */ private void setOutputFormat() { Instances newData; FastVector newAtts; // Compute new attributes newAtts = new FastVector(getInputFormat().numAttributes()); for (int j = 0; j < getInputFormat().numAttributes(); j++) { Attribute att = getInputFormat().attribute(j); if (!att.isNominal() || !m_AttIndex.isInRange(j)) newAtts.addElement(att); else newAtts.addElement(new Attribute(att.name(), (FastVector) null)); } // Construct new header newData = new Instances(getInputFormat().relationName(), newAtts, 0); newData.setClassIndex(getInputFormat().classIndex()); setOutputFormat(newData); }
/** * Processes the given data (may change the provided dataset) and returns the modified version. * This method is called in batchFinished(). * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected Instances process(Instances instances) throws Exception { Instances result; int i; int n; double[] values; String value; Instance inst; Instance newInst; // we need the complete input data! if (!isFirstBatchDone()) setOutputFormat(determineOutputFormat(getInputFormat())); result = new Instances(getOutputFormat()); for (i = 0; i < instances.numInstances(); i++) { inst = instances.instance(i); values = inst.toDoubleArray(); for (n = 0; n < values.length; n++) { if (!m_Cols.isInRange(n) || !instances.attribute(n).isNumeric() || inst.isMissing(n)) continue; // get index of value if (instances.attribute(n).type() == Attribute.DATE) value = inst.stringValue(n); else value = Utils.doubleToString(inst.value(n), MAX_DECIMALS); values[n] = result.attribute(n).indexOfValue(value); } // generate new instance if (inst instanceof SparseInstance) newInst = new SparseInstance(inst.weight(), values); else newInst = new DenseInstance(inst.weight(), values); // copy possible string, relational values newInst.setDataset(getOutputFormat()); copyValues(newInst, false, inst.dataset(), getOutputFormat()); result.add(newInst); } return result; }
/** * processes the given instance (may change the provided instance) and returns the modified * version. * * @param instance the instance to process * @return the modified data * @throws Exception in case the processing goes wrong */ protected Instance process(Instance instance) throws Exception { Instance result; Attribute att; double[] values; int i; // adjust indices values = new double[instance.numAttributes()]; for (i = 0; i < instance.numAttributes(); i++) { att = instance.attribute(i); if (!att.isNominal() || !m_AttributeIndices.isInRange(i) || instance.isMissing(i)) values[i] = instance.value(i); else values[i] = m_NewOrder[i][(int) instance.value(i)]; } // create new instance result = new DenseInstance(instance.weight(), values); return result; }
/** * Determines the output format based on the input format and returns this. In case the output * format cannot be returned immediately, i.e., hasImmediateOutputFormat() returns false, then * this method will called from batchFinished() after the call of preprocess(Instances), in which, * e.g., statistics for the actual processing step can be gathered. * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances result; Attribute att; ArrayList<Attribute> atts; int i; m_AttributeIndices.setUpper(inputFormat.numAttributes() - 1); // generate new header atts = new ArrayList<Attribute>(); for (i = 0; i < inputFormat.numAttributes(); i++) { att = inputFormat.attribute(i); if (m_AttributeIndices.isInRange(i)) { if (m_ReplaceAll) atts.add(att.copy(att.name().replaceAll(m_Find, m_Replace))); else atts.add(att.copy(att.name().replaceFirst(m_Find, m_Replace))); } else { atts.add((Attribute) att.copy()); } } result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
/** * Input an instance for filtering. The instance is processed and made available for output * immediately. * * @param instance the input instance. * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input structure has been defined. */ public boolean input(Instance instance) { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isOutputFormatDefined()) { Instance newInstance = (Instance) instance.copy(); // make sure that we get the right indexes set for the converted // string attributes when operating on a second batch of instances for (int i = 0; i < newInstance.numAttributes(); i++) { if (newInstance.attribute(i).isString() && !newInstance.isMissing(i) && m_AttIndices.isInRange(i)) { Attribute outAtt = getOutputFormat().attribute(newInstance.attribute(i).name()); String inVal = newInstance.stringValue(i); int outIndex = outAtt.indexOfValue(inVal); if (outIndex < 0) { newInstance.setMissing(i); } else { newInstance.setValue(i, outIndex); } } } push(newInstance); return true; } bufferInput(instance); return false; }
/** * processes the given instance (may change the provided instance) and returns the modified * version. * * @param instance the instance to process * @return the modified data * @throws Exception in case the processing goes wrong */ @Override protected Instance process(Instance instance) throws Exception { Instance result; int i; double val; double factor; result = (Instance) instance.copy(); if (m_Decimals > -1) { factor = StrictMath.pow(10, m_Decimals); } else { factor = 1; } for (i = 0; i < result.numAttributes(); i++) { // only numeric attributes if (!result.attribute(i).isNumeric()) { continue; } // out of range? if (!m_Cols.isInRange(i)) { continue; } // skip class? if ((result.classIndex() == i) && (!m_IncludeClass)) { continue; } // too small? if (result.value(i) < m_MinThreshold) { if (getDebug()) { System.out.println("Too small: " + result.value(i) + " -> " + m_MinDefault); } result.setValue(i, m_MinDefault); } // too big? else if (result.value(i) > m_MaxThreshold) { if (getDebug()) { System.out.println("Too big: " + result.value(i) + " -> " + m_MaxDefault); } result.setValue(i, m_MaxDefault); } // too close? else if ((result.value(i) - m_CloseTo < m_CloseToTolerance) && (m_CloseTo - result.value(i) < m_CloseToTolerance) && (result.value(i) != m_CloseTo)) { if (getDebug()) { System.out.println("Too close: " + result.value(i) + " -> " + m_CloseToDefault); } result.setValue(i, m_CloseToDefault); } // decimals? if (m_Decimals > -1 && !result.isMissing(i)) { val = result.value(i); val = StrictMath.round(val * factor) / factor; result.setValue(i, val); } } return result; }
/** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance structure (any instances * contained in the object are ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the format couldn't be set successfully */ @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); int classIndex = instanceInfo.classIndex(); // setup the map if (m_renameVals != null && m_renameVals.length() > 0) { String[] vals = m_renameVals.split(","); for (String val : vals) { String[] parts = val.split(":"); if (parts.length != 2) { throw new WekaException("Invalid replacement string: " + val); } if (parts[0].length() == 0 || parts[1].length() == 0) { throw new WekaException("Invalid replacement string: " + val); } m_renameMap.put( m_ignoreCase ? parts[0].toLowerCase().trim() : parts[0].trim(), parts[1].trim()); } } // try selected atts as a numeric range first Range tempRange = new Range(); tempRange.setInvert(m_invert); if (m_selectedColsString == null) { m_selectedColsString = ""; } try { tempRange.setRanges(m_selectedColsString); tempRange.setUpper(instanceInfo.numAttributes() - 1); m_selectedAttributes = tempRange.getSelection(); m_selectedCols = tempRange; } catch (Exception r) { // OK, now try as named attributes StringBuffer indexes = new StringBuffer(); String[] attNames = m_selectedColsString.split(","); boolean first = true; for (String n : attNames) { n = n.trim(); Attribute found = instanceInfo.attribute(n); if (found == null) { throw new WekaException( "Unable to find attribute '" + n + "' in the incoming instances'"); } if (first) { indexes.append("" + (found.index() + 1)); first = false; } else { indexes.append("," + (found.index() + 1)); } } tempRange = new Range(); tempRange.setRanges(indexes.toString()); tempRange.setUpper(instanceInfo.numAttributes() - 1); m_selectedAttributes 
= tempRange.getSelection(); m_selectedCols = tempRange; } ArrayList<Attribute> attributes = new ArrayList<Attribute>(); for (int i = 0; i < instanceInfo.numAttributes(); i++) { if (m_selectedCols.isInRange(i)) { if (instanceInfo.attribute(i).isNominal()) { List<String> valsForAtt = new ArrayList<String>(); for (int j = 0; j < instanceInfo.attribute(i).numValues(); j++) { String origV = instanceInfo.attribute(i).value(j); String replace = m_ignoreCase ? m_renameMap.get(origV.toLowerCase()) : m_renameMap.get(origV); if (replace != null && !valsForAtt.contains(replace)) { valsForAtt.add(replace); } else { valsForAtt.add(origV); } } Attribute newAtt = new Attribute(instanceInfo.attribute(i).name(), valsForAtt); attributes.add(newAtt); } else { // ignore any selected attributes that are not nominal Attribute att = (Attribute) instanceInfo.attribute(i).copy(); attributes.add(att); } } else { Attribute att = (Attribute) instanceInfo.attribute(i).copy(); attributes.add(att); } } Instances outputFormat = new Instances(instanceInfo.relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); return true; }
private void readHeader() throws IOException { m_rowCount = 1; m_incrementalReader = null; m_current = new ArrayList<Object>(); openTempFiles(); m_rowBuffer = new ArrayList<String>(); String firstRow = m_sourceReader.readLine(); if (firstRow == null) { throw new IOException("No data in the file!"); } if (m_noHeaderRow) { m_rowBuffer.add(firstRow); } ArrayList<Attribute> attribNames = new ArrayList<Attribute>(); // now tokenize to determine attribute names (or create att names if // no header row StringReader sr = new StringReader(firstRow + "\n"); // System.out.print(firstRow + "\n"); m_st = new StreamTokenizer(sr); initTokenizer(m_st); m_st.ordinaryChar(m_FieldSeparator.charAt(0)); int attNum = 1; StreamTokenizerUtils.getFirstToken(m_st); if (m_st.ttype == StreamTokenizer.TT_EOF) { StreamTokenizerUtils.errms(m_st, "premature end of file"); } boolean first = true; boolean wasSep; while (m_st.ttype != StreamTokenizer.TT_EOL && m_st.ttype != StreamTokenizer.TT_EOF) { // Get next token if (!first) { StreamTokenizerUtils.getToken(m_st); } if (m_st.ttype == m_FieldSeparator.charAt(0) || m_st.ttype == StreamTokenizer.TT_EOL) { wasSep = true; } else { wasSep = false; String attName = null; if (m_noHeaderRow) { attName = "att" + attNum; attNum++; } else { attName = m_st.sval; } attribNames.add(new Attribute(attName, (java.util.List<String>) null)); } if (!wasSep) { StreamTokenizerUtils.getToken(m_st); } first = false; } String relationName; if (m_sourceFile != null) { relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$", ""); } else { relationName = "stream"; } m_structure = new Instances(relationName, attribNames, 0); m_NominalAttributes.setUpper(m_structure.numAttributes() - 1); m_StringAttributes.setUpper(m_structure.numAttributes() - 1); m_dateAttributes.setUpper(m_structure.numAttributes() - 1); m_numericAttributes.setUpper(m_structure.numAttributes() - 1); m_nominalVals = new HashMap<Integer, LinkedHashSet<String>>(); m_types = new 
TYPE[m_structure.numAttributes()]; for (int i = 0; i < m_structure.numAttributes(); i++) { if (m_NominalAttributes.isInRange(i)) { m_types[i] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); m_nominalVals.put(i, ts); } else if (m_StringAttributes.isInRange(i)) { m_types[i] = TYPE.STRING; } else if (m_dateAttributes.isInRange(i)) { m_types[i] = TYPE.DATE; } else if (m_numericAttributes.isInRange(i)) { m_types[i] = TYPE.NUMERIC; } else { m_types[i] = TYPE.UNDETERMINED; } } if (m_nominalLabelSpecs.size() > 0) { for (String spec : m_nominalLabelSpecs) { String[] attsAndLabels = spec.split(":"); if (attsAndLabels.length == 2) { String[] labels = attsAndLabels[1].split(","); try { // try as a range string first Range tempR = new Range(); tempR.setRanges(attsAndLabels[0].trim()); tempR.setUpper(m_structure.numAttributes() - 1); int[] rangeIndexes = tempR.getSelection(); for (int i = 0; i < rangeIndexes.length; i++) { m_types[rangeIndexes[i]] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); for (String lab : labels) { ts.add(lab); } m_nominalVals.put(rangeIndexes[i], ts); } } catch (IllegalArgumentException e) { // one or more named attributes? 
String[] attNames = attsAndLabels[0].split(","); for (String attN : attNames) { Attribute a = m_structure.attribute(attN.trim()); if (a != null) { int attIndex = a.index(); m_types[attIndex] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); for (String lab : labels) { ts.add(lab); } m_nominalVals.put(attIndex, ts); } } } } } } // Prevents the first row from getting lost in the // case where there is no header row and we're // running in batch mode if (m_noHeaderRow && getRetrieval() == BATCH) { StreamTokenizer tempT = new StreamTokenizer(new StringReader(firstRow)); initTokenizer(tempT); tempT.ordinaryChar(m_FieldSeparator.charAt(0)); String checked = getInstance(tempT); dumpRow(checked); } m_st = new StreamTokenizer(m_sourceReader); initTokenizer(m_st); m_st.ordinaryChar(m_FieldSeparator.charAt(0)); // try and determine a more accurate structure from the first batch readData(false || getRetrieval() == BATCH); makeStructure(); }
/** * Return the full data set. If the structure hasn't yet been determined by a call to getStructure * then method should do so before processing the rest of the data set. * * @return the structure of the data set as an empty set of Instances * @exception IOException if there is no source or parsing fails */ @Override public Instances getDataSet() throws IOException { if ((m_sourceFile == null) && (m_sourceReader == null)) { throw new IOException("No source has been specified"); } if (m_structure == null) { getStructure(); } if (m_st == null) { m_st = new StreamTokenizer(m_sourceReader); initTokenizer(m_st); } m_st.ordinaryChar(m_FieldSeparator.charAt(0)); m_cumulativeStructure = new ArrayList<Hashtable<Object, Integer>>(m_structure.numAttributes()); for (int i = 0; i < m_structure.numAttributes(); i++) { m_cumulativeStructure.add(new Hashtable<Object, Integer>()); } m_cumulativeInstances = new ArrayList<ArrayList<Object>>(); ArrayList<Object> current; while ((current = getInstance(m_st)) != null) { m_cumulativeInstances.add(current); } ArrayList<Attribute> atts = new ArrayList<Attribute>(m_structure.numAttributes()); for (int i = 0; i < m_structure.numAttributes(); i++) { String attname = m_structure.attribute(i).name(); Hashtable<Object, Integer> tempHash = m_cumulativeStructure.get(i); if (tempHash.size() == 0) { atts.add(new Attribute(attname)); } else { if (m_StringAttributes.isInRange(i)) { atts.add(new Attribute(attname, (ArrayList<String>) null)); } else { ArrayList<String> values = new ArrayList<String>(tempHash.size()); // add dummy objects in order to make the ArrayList's size == capacity for (int z = 0; z < tempHash.size(); z++) { values.add("dummy"); } Enumeration e = tempHash.keys(); while (e.hasMoreElements()) { Object ob = e.nextElement(); // if (ob instanceof Double) { int index = ((Integer) tempHash.get(ob)).intValue(); String s = ob.toString(); if (s.startsWith("'") || s.startsWith("\"")) s = s.substring(1, s.length() - 1); values.set(index, new 
String(s)); // } } atts.add(new Attribute(attname, values)); } } } // make the instances String relationName; if (m_sourceFile != null) relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$", ""); else relationName = "stream"; Instances dataSet = new Instances(relationName, atts, m_cumulativeInstances.size()); for (int i = 0; i < m_cumulativeInstances.size(); i++) { current = m_cumulativeInstances.get(i); double[] vals = new double[dataSet.numAttributes()]; for (int j = 0; j < current.size(); j++) { Object cval = current.get(j); if (cval instanceof String) { if (((String) cval).compareTo(m_MissingValue) == 0) { vals[j] = Utils.missingValue(); } else { if (dataSet.attribute(j).isString()) { vals[j] = dataSet.attribute(j).addStringValue((String) cval); } else if (dataSet.attribute(j).isNominal()) { // find correct index Hashtable<Object, Integer> lookup = m_cumulativeStructure.get(j); int index = ((Integer) lookup.get(cval)).intValue(); vals[j] = index; } else { throw new IllegalStateException( "Wrong attribute type at position " + (i + 1) + "!!!"); } } } else if (dataSet.attribute(j).isNominal()) { // find correct index Hashtable<Object, Integer> lookup = m_cumulativeStructure.get(j); int index = ((Integer) lookup.get(cval)).intValue(); vals[j] = index; } else if (dataSet.attribute(j).isString()) { vals[j] = dataSet.attribute(j).addStringValue("" + cval); } else { vals[j] = ((Double) cval).doubleValue(); } } dataSet.add(new DenseInstance(1.0, vals)); } m_structure = new Instances(dataSet, 0); setRetrieval(BATCH); m_cumulativeStructure = null; // conserve memory // close the stream m_sourceReader.close(); return dataSet; }
/** * Set the output format. Takes the currently defined cutpoints and m_InputFormat and calls * setOutputFormat(Instances) appropriately. */ protected void setOutputFormat() { if (m_CutPoints == null) { setOutputFormat(null); return; } ArrayList<Attribute> attributes = new ArrayList<Attribute>(getInputFormat().numAttributes()); int classIndex = getInputFormat().classIndex(); for (int i = 0, m = getInputFormat().numAttributes(); i < m; ++i) { if ((m_DiscretizeCols.isInRange(i)) && (getInputFormat().attribute(i).isNumeric())) { Set<String> cutPointsCheck = new HashSet<String>(); double[] cutPoints = m_CutPoints[i]; if (!m_MakeBinary) { ArrayList<String> attribValues; if (cutPoints == null) { attribValues = new ArrayList<String>(1); attribValues.add("'All'"); } else { attribValues = new ArrayList<String>(cutPoints.length + 1); if (m_UseBinNumbers) { for (int j = 0, n = cutPoints.length; j <= n; ++j) { attribValues.add("'B" + (j + 1) + "of" + (n + 1) + "'"); } } else { for (int j = 0, n = cutPoints.length; j <= n; ++j) { String newBinRangeString = binRangeString(cutPoints, j, m_BinRangePrecision); if (cutPointsCheck.contains(newBinRangeString)) { throw new IllegalArgumentException( "A duplicate bin range was detected. 
" + "Try increasing the bin range precision."); } attribValues.add("'" + newBinRangeString + "'"); } } } Attribute newAtt = new Attribute(getInputFormat().attribute(i).name(), attribValues); newAtt.setWeight(getInputFormat().attribute(i).weight()); attributes.add(newAtt); } else { if (cutPoints == null) { ArrayList<String> attribValues = new ArrayList<String>(1); attribValues.add("'All'"); Attribute newAtt = new Attribute(getInputFormat().attribute(i).name(), attribValues); newAtt.setWeight(getInputFormat().attribute(i).weight()); attributes.add(newAtt); } else { if (i < getInputFormat().classIndex()) { classIndex += cutPoints.length - 1; } for (int j = 0, n = cutPoints.length; j < n; ++j) { ArrayList<String> attribValues = new ArrayList<String>(2); if (m_UseBinNumbers) { attribValues.add("'B1of2'"); attribValues.add("'B2of2'"); } else { double[] binaryCutPoint = {cutPoints[j]}; String newBinRangeString1 = binRangeString(binaryCutPoint, 0, m_BinRangePrecision); String newBinRangeString2 = binRangeString(binaryCutPoint, 1, m_BinRangePrecision); if (newBinRangeString1.equals(newBinRangeString2)) { throw new IllegalArgumentException( "A duplicate bin range was detected. " + "Try increasing the bin range precision."); } attribValues.add("'" + newBinRangeString1 + "'"); attribValues.add("'" + newBinRangeString2 + "'"); } Attribute newAtt = new Attribute(getInputFormat().attribute(i).name() + "_" + (j + 1), attribValues); newAtt.setWeight(getInputFormat().attribute(i).weight()); attributes.add(newAtt); } } } } else { attributes.add((Attribute) getInputFormat().attribute(i).copy()); } } Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); }
/**
 * Convert a single instance over. The converted instance is added to the end of the output queue.
 *
 * <p>The expression is evaluated for every selected attribute that is numeric, not missing and
 * not the class attribute; results that are NaN or infinite are replaced by the missing value.
 * All other values are carried over unchanged, for sparse as well as dense instances.
 *
 * @param instance the instance to convert
 * @throws Exception if instance cannot be converted
 */
private void convertInstance(Instance instance) throws Exception {
  Instance inst = null;
  HashMap symbols = new HashMap(5);

  if (instance instanceof SparseInstance) {
    double[] newVals = new double[instance.numAttributes()];
    int[] newIndices = new int[instance.numAttributes()];
    double[] vals = instance.toDoubleArray();
    int ind = 0;

    for (int j = 0; j < instance.numAttributes(); j++) {
      // default: keep the original value (also for selected attributes that are
      // not transformed, so they are not silently dropped from the sparse instance)
      double value = vals[j];

      if (m_SelectCols.isInRange(j)
          && instance.attribute(j).isNumeric()
          && (!Utils.isMissingValue(vals[j]))
          && (getInputFormat().classIndex() != j)) {
        putAttributeSymbols(symbols, vals[j], j);
        value = eval(symbols);
        if (Double.isNaN(value) || Double.isInfinite(value)) {
          System.err.println("WARNING:Error in evaluating the expression: missing value set");
          value = Utils.missingValue();
        }
      }

      // only non-zero values are stored explicitly
      if (value != 0.0) {
        newVals[ind] = value;
        newIndices[ind] = j;
        ind++;
      }
    }

    // trim the buffers to the number of stored values
    double[] tempVals = new double[ind];
    int[] tempInd = new int[ind];
    System.arraycopy(newVals, 0, tempVals, 0, ind);
    System.arraycopy(newIndices, 0, tempInd, 0, ind);
    inst = new SparseInstance(instance.weight(), tempVals, tempInd, instance.numAttributes());
  } else {
    double[] vals = instance.toDoubleArray();

    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      if (m_SelectCols.isInRange(j)
          && instance.attribute(j).isNumeric()
          && (!Utils.isMissingValue(vals[j]))
          && (getInputFormat().classIndex() != j)) {
        putAttributeSymbols(symbols, vals[j], j);
        vals[j] = eval(symbols);
        if (Double.isNaN(vals[j]) || Double.isInfinite(vals[j])) {
          System.err.println("WARNING:Error in Evaluation the Expression: missing value set");
          vals[j] = Utils.missingValue();
        }
      }
    }
    inst = new DenseInstance(instance.weight(), vals);
  }

  inst.setDataset(instance.dataset());
  push(inst);
}

/**
 * Fills the symbol table with the current value ("A") and the numeric statistics of the given
 * attribute.
 *
 * @param symbols the symbol table to fill
 * @param value the current attribute value
 * @param attIndex the index of the attribute whose statistics are used
 */
@SuppressWarnings("unchecked") // the symbol table is a raw HashMap, as expected by eval()
private void putAttributeSymbols(HashMap symbols, double value, int attIndex) {
  symbols.put("A", Double.valueOf(value));
  symbols.put("MAX", Double.valueOf(m_attStats[attIndex].numericStats.max));
  symbols.put("MIN", Double.valueOf(m_attStats[attIndex].numericStats.min));
  symbols.put("MEAN", Double.valueOf(m_attStats[attIndex].numericStats.mean));
  symbols.put("SD", Double.valueOf(m_attStats[attIndex].numericStats.stdDev));
  symbols.put("COUNT", Double.valueOf(m_attStats[attIndex].numericStats.count));
  symbols.put("SUM", Double.valueOf(m_attStats[attIndex].numericStats.sum));
  symbols.put("SUMSQUARED", Double.valueOf(m_attStats[attIndex].numericStats.sumSq));
}