/**
 * Adds this tree recursively to the buffer.
 *
 * @param id the unique id for the method
 * @param buffer the buffer to add the source code to
 * @return the last ID being used
 * @throws Exception if something goes wrong
 */
protected int toSource(int id, StringBuffer buffer) throws Exception {
  int result;
  int i;
  int newID;
  StringBuffer[] subBuffers;

  buffer.append("\n");
  buffer.append("  protected static double node" + id + "(Object[] i) {\n");

  // leaf?
  if (m_Attribute == null) {
    result = id;
    if (Double.isNaN(m_ClassValue)) {
      buffer.append("    return Double.NaN;");
    } else {
      buffer.append("    return " + m_ClassValue + ";");
    }
    if (m_ClassAttribute != null) {
      buffer.append(" // " + m_ClassAttribute.value((int) m_ClassValue));
    }
    buffer.append("\n");
    buffer.append("  }\n");
  } else {
    buffer.append("    // " + m_Attribute.name() + "\n");

    // subtree calls
    subBuffers = new StringBuffer[m_Attribute.numValues()];
    newID = id;
    for (i = 0; i < m_Attribute.numValues(); i++) {
      newID++;
      buffer.append("    ");
      if (i > 0) {
        buffer.append("else ");
      }
      buffer.append("if (((String) i[" + m_Attribute.index() + "]).equals(\""
        + m_Attribute.value(i) + "\"))\n");
      buffer.append("      return node" + newID + "(i);\n");
      subBuffers[i] = new StringBuffer();
      newID = m_Successors[i].toSource(newID, subBuffers[i]);
    }
    buffer.append("    else\n");
    buffer.append("      throw new IllegalArgumentException(\"Value '\" + i["
      + m_Attribute.index() + "] + \"' is not allowed!\");\n");
    buffer.append("  }\n");

    // output subtree code
    for (i = 0; i < m_Attribute.numValues(); i++) {
      buffer.append(subBuffers[i].toString());
    }
    subBuffers = null;

    result = newID;
  }

  return result;
}
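// Usage sketch (not from the source above): one way the recursively generated
// node methods might be wrapped into a complete static classifier. The class
// name "Id3Source" and this helper method are illustrative assumptions; the
// real driver that calls toSource(int, StringBuffer) is not shown here.
protected String treeToSource() throws Exception {
  StringBuffer body = new StringBuffer();
  toSource(0, body);                            // emits node0(...) and all subtree methods

  StringBuffer source = new StringBuffer();
  source.append("class Id3Source {\n");
  source.append("  public static double classify(Object[] i) {\n");
  source.append("    return node0(i);\n");      // node0 is this (root) node's method
  source.append("  }\n");
  source.append(body);
  source.append("}\n");
  return source.toString();
}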
/**
 * Classifies a given instance using the univariate linear regression model:
 * prediction = intercept + slope * value of the selected attribute.
 *
 * @param inst the instance to classify
 * @return the predicted value
 * @throws Exception if the selected attribute is missing in the instance
 */
public double classifyInstance(Instance inst) throws Exception {
  if (m_attribute == null) {
    return m_intercept;
  } else {
    if (inst.isMissing(m_attribute.index())) {
      throw new Exception("UnivariateLinearRegression: No missing values!");
    }
    return m_intercept + m_slope * inst.value(m_attribute.index());
  }
}
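// Usage sketch (assumed context, not from the source): scoring a test instance
// with the fitted univariate model. "model" is a placeholder for a trained
// instance of this regressor and "test" for a dataset whose class attribute is
// numeric and set as the class.
static void printFirstPrediction(Classifier model, Instances test) throws Exception {
  Instance first = test.instance(0);
  double predicted = model.classifyInstance(first);
  System.out.println("predicted = " + predicted + ", actual = " + first.classValue());
}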
/**
 * Method for building an Id3 tree.
 *
 * @param data the training data
 * @exception Exception if decision tree can't be built successfully
 */
private void makeTree(Instances data) throws Exception {

  // Check if no instances have reached this node.
  if (data.numInstances() == 0) {
    m_Attribute = null;
    m_ClassValue = Utils.missingValue();
    m_Distribution = new double[data.numClasses()];
    return;
  }

  // Compute attribute with maximum information gain.
  double[] infoGains = new double[data.numAttributes()];
  Enumeration attEnum = data.enumerateAttributes();
  while (attEnum.hasMoreElements()) {
    Attribute att = (Attribute) attEnum.nextElement();
    infoGains[att.index()] = computeInfoGain(data, att);
  }
  m_Attribute = data.attribute(Utils.maxIndex(infoGains));

  // Make leaf if information gain is zero.
  // Otherwise create successors.
  if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
    m_Attribute = null;
    m_Distribution = new double[data.numClasses()];
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
      Instance inst = (Instance) instEnum.nextElement();
      m_Distribution[(int) inst.classValue()]++;
    }
    Utils.normalize(m_Distribution);
    m_ClassValue = Utils.maxIndex(m_Distribution);
    m_ClassAttribute = data.classAttribute();
  } else {
    Instances[] splitData = splitData(data, m_Attribute);
    m_Successors = new Id3[m_Attribute.numValues()];
    for (int j = 0; j < m_Attribute.numValues(); j++) {
      m_Successors[j] = new Id3();
      m_Successors[j].makeTree(splitData[j]);
    }
  }
}
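// computeInfoGain(...) is called above but not shown. A minimal sketch that is
// consistent with the standard ID3 definition (gain = entropy of the parent
// minus the weighted entropy of the splits); it reuses the splitData(...)
// helper referenced above and weka.core.Utils.log2.
private double computeInfoGain(Instances data, Attribute att) throws Exception {
  double infoGain = computeEntropy(data);
  Instances[] splitData = splitData(data, att);
  for (int j = 0; j < att.numValues(); j++) {
    if (splitData[j].numInstances() > 0) {
      infoGain -= ((double) splitData[j].numInstances() / data.numInstances())
        * computeEntropy(splitData[j]);
    }
  }
  return infoGain;
}

// Entropy of the class distribution of the given data, in bits.
private double computeEntropy(Instances data) throws Exception {
  double[] classCounts = new double[data.numClasses()];
  Enumeration instEnum = data.enumerateInstances();
  while (instEnum.hasMoreElements()) {
    Instance inst = (Instance) instEnum.nextElement();
    classCounts[(int) inst.classValue()]++;
  }
  double entropy = 0;
  for (int j = 0; j < data.numClasses(); j++) {
    if (classCounts[j] > 0) {
      entropy -= classCounts[j] * Utils.log2(classCounts[j]);
    }
  }
  entropy /= data.numInstances();
  return entropy + Utils.log2(data.numInstances());
}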
/**
 * Constructs an instance suitable for passing to the model for scoring
 *
 * @param incoming the incoming instance
 * @return an instance with values mapped to be consistent with what the model
 *         is expecting
 */
protected Instance mapIncomingFieldsToModelFields(Instance incoming) {
  Instances modelHeader = m_model.getHeader();
  double[] vals = new double[modelHeader.numAttributes()];

  for (int i = 0; i < modelHeader.numAttributes(); i++) {
    if (m_attributeMap[i] < 0) {
      // missing or type mismatch
      vals[i] = Utils.missingValue();
      continue;
    }

    Attribute modelAtt = modelHeader.attribute(i);
    Attribute incomingAtt = incoming.dataset().attribute(m_attributeMap[i]);

    if (incoming.isMissing(incomingAtt.index())) {
      vals[i] = Utils.missingValue();
      continue;
    }

    if (modelAtt.isNumeric()) {
      vals[i] = incoming.value(m_attributeMap[i]);
    } else if (modelAtt.isNominal()) {
      String incomingVal = incoming.stringValue(m_attributeMap[i]);
      int modelIndex = modelAtt.indexOfValue(incomingVal);

      if (modelIndex < 0) {
        vals[i] = Utils.missingValue();
      } else {
        vals[i] = modelIndex;
      }
    } else if (modelAtt.isString()) {
      vals[i] = 0;
      modelAtt.setStringValue(incoming.stringValue(m_attributeMap[i]));
    }
  }

  if (modelHeader.classIndex() >= 0) {
    // set class to missing value
    vals[modelHeader.classIndex()] = Utils.missingValue();
  }

  Instance newInst = null;
  if (incoming instanceof SparseInstance) {
    newInst = new SparseInstance(incoming.weight(), vals);
  } else {
    newInst = new DenseInstance(incoming.weight(), vals);
  }

  newInst.setDataset(modelHeader);

  return newInst;
}
/**
 * Builds a mapping between the header for the incoming data to be scored and
 * the header used to train the model. Uses attribute names to match between
 * the two. Also constructs a list of missing attributes and a list of type
 * mismatches.
 *
 * @param modelHeader the header of the data used to train the model
 * @param incomingHeader the header of the incoming data
 * @throws DistributedWekaException if more than 50% of the attributes expected
 *           by the model are missing or have a type mismatch with the incoming
 *           data
 */
protected void buildAttributeMap(Instances modelHeader, Instances incomingHeader)
  throws DistributedWekaException {
  m_attributeMap = new int[modelHeader.numAttributes()];

  int problemCount = 0;
  for (int i = 0; i < modelHeader.numAttributes(); i++) {
    Attribute modAtt = modelHeader.attribute(i);
    Attribute incomingAtt = incomingHeader.attribute(modAtt.name());

    if (incomingAtt == null) {
      // missing model attribute
      m_attributeMap[i] = -1;
      m_missingMismatch.put(modAtt.name(), "missing from incoming data");
      problemCount++;
    } else if (modAtt.type() != incomingAtt.type()) {
      // type mismatch
      m_attributeMap[i] = -1;
      m_missingMismatch.put(modAtt.name(), "type mismatch - model: "
        + Attribute.typeToString(modAtt) + " != incoming: "
        + Attribute.typeToString(incomingAtt));
      problemCount++;
    } else {
      m_attributeMap[i] = incomingAtt.index();
    }
  }

  // -1 for the class (if set)
  int adjustForClass = modelHeader.classIndex() >= 0 ? 1 : 0;
  if (problemCount > (modelHeader.numAttributes() - adjustForClass) / 2) {
    throw new DistributedWekaException("More than 50% of the attributes that "
      + "the model is expecting to see are either missing or have a type "
      + "mismatch in the incoming data.");
  }
}
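// Scoring sketch (assumed context): the attribute map must be built once
// against the model's training header before incoming rows can be remapped
// and scored. "scoreWithModel" is a hypothetical placeholder for however the
// wrapped model is actually invoked; only m_model.getHeader() is confirmed by
// the code above.
protected void scoreBatch(Instances incomingBatch) throws Exception {
  buildAttributeMap(m_model.getHeader(), incomingBatch);   // throws if >50% of atts are problematic
  for (int i = 0; i < incomingBatch.numInstances(); i++) {
    Instance mapped = mapIncomingFieldsToModelFields(incomingBatch.instance(i));
    double[] prediction = scoreWithModel(mapped);          // hypothetical helper
  }
}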
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance
 *          structure (any instances contained in the object are ignored -
 *          only the structure is required).
 * @return true if the outputFormat may be collected immediately
 * @throws Exception if the format couldn't be set successfully
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
  super.setInputFormat(instanceInfo);

  int classIndex = instanceInfo.classIndex();

  // setup the map
  if (m_renameVals != null && m_renameVals.length() > 0) {
    String[] vals = m_renameVals.split(",");
    for (String val : vals) {
      String[] parts = val.split(":");
      if (parts.length != 2) {
        throw new WekaException("Invalid replacement string: " + val);
      }
      if (parts[0].length() == 0 || parts[1].length() == 0) {
        throw new WekaException("Invalid replacement string: " + val);
      }
      m_renameMap.put(
        m_ignoreCase ? parts[0].toLowerCase().trim() : parts[0].trim(),
        parts[1].trim());
    }
  }

  // try selected atts as a numeric range first
  Range tempRange = new Range();
  tempRange.setInvert(m_invert);
  if (m_selectedColsString == null) {
    m_selectedColsString = "";
  }

  try {
    tempRange.setRanges(m_selectedColsString);
    tempRange.setUpper(instanceInfo.numAttributes() - 1);
    m_selectedAttributes = tempRange.getSelection();
    m_selectedCols = tempRange;
  } catch (Exception r) {
    // OK, now try as named attributes
    StringBuffer indexes = new StringBuffer();
    String[] attNames = m_selectedColsString.split(",");
    boolean first = true;
    for (String n : attNames) {
      n = n.trim();
      Attribute found = instanceInfo.attribute(n);
      if (found == null) {
        throw new WekaException("Unable to find attribute '" + n
          + "' in the incoming instances");
      }
      if (first) {
        indexes.append("" + (found.index() + 1));
        first = false;
      } else {
        indexes.append("," + (found.index() + 1));
      }
    }

    tempRange = new Range();
    tempRange.setRanges(indexes.toString());
    tempRange.setUpper(instanceInfo.numAttributes() - 1);
    m_selectedAttributes = tempRange.getSelection();
    m_selectedCols = tempRange;
  }

  ArrayList<Attribute> attributes = new ArrayList<Attribute>();
  for (int i = 0; i < instanceInfo.numAttributes(); i++) {
    if (m_selectedCols.isInRange(i)) {
      if (instanceInfo.attribute(i).isNominal()) {
        List<String> valsForAtt = new ArrayList<String>();
        for (int j = 0; j < instanceInfo.attribute(i).numValues(); j++) {
          String origV = instanceInfo.attribute(i).value(j);
          String replace = m_ignoreCase ? m_renameMap.get(origV.toLowerCase())
            : m_renameMap.get(origV);
          if (replace != null && !valsForAtt.contains(replace)) {
            valsForAtt.add(replace);
          } else {
            valsForAtt.add(origV);
          }
        }
        Attribute newAtt = new Attribute(instanceInfo.attribute(i).name(), valsForAtt);
        attributes.add(newAtt);
      } else {
        // ignore any selected attributes that are not nominal
        Attribute att = (Attribute) instanceInfo.attribute(i).copy();
        attributes.add(att);
      }
    } else {
      Attribute att = (Attribute) instanceInfo.attribute(i).copy();
      attributes.add(att);
    }
  }

  Instances outputFormat = new Instances(instanceInfo.relationName(), attributes, 0);
  outputFormat.setClassIndex(classIndex);
  setOutputFormat(outputFormat);

  return true;
}
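// Configuration sketch (hedged: the filter's class name and setters below are
// assumptions mirroring the fields parsed above, not confirmed API). The value
// replacement spec is a comma-separated list of "oldLabel:newLabel" pairs, and
// the selected columns may be a numeric range or a list of attribute names.
static Instances renameLabels(Instances train) throws Exception {
  RenameValuesFilter f = new RenameValuesFilter();   // placeholder class name
  f.setSelectedAttributes("colour,size");            // or a range such as "1-3,last"
  f.setValueReplacements("low:small,high:large");    // parsed into m_renameMap above
  f.setIgnoreCase(true);
  f.setInputFormat(train);                           // builds the output header as above
  return Filter.useFilter(train, f);
}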
@Override
public void init(Instances structure, Environment env) {
  super.init(structure, env);

  m_resolvedLhsName = m_lhsAttributeName;
  m_resolvedRhsOperand = m_rhsOperand;
  try {
    m_resolvedLhsName = m_env.substitute(m_resolvedLhsName);
    m_resolvedRhsOperand = m_env.substitute(m_resolvedRhsOperand);
  } catch (Exception ex) {
  }

  Attribute lhs = null;
  // try as an index or "special" label first
  if (m_resolvedLhsName.toLowerCase().startsWith("/first")) {
    lhs = structure.attribute(0);
  } else if (m_resolvedLhsName.toLowerCase().startsWith("/last")) {
    lhs = structure.attribute(structure.numAttributes() - 1);
  } else {
    // try as an index
    try {
      int indx = Integer.parseInt(m_resolvedLhsName);
      indx--;
      lhs = structure.attribute(indx);
    } catch (NumberFormatException ex) {
    }
  }

  if (lhs == null) {
    lhs = structure.attribute(m_resolvedLhsName);
  }
  if (lhs == null) {
    throw new IllegalArgumentException("Data does not contain attribute \""
      + m_resolvedLhsName + "\"");
  }
  m_lhsAttIndex = lhs.index();

  if (m_rhsIsAttribute) {
    Attribute rhs = null;

    // try as an index or "special" label first
    if (m_resolvedRhsOperand.toLowerCase().equals("/first")) {
      rhs = structure.attribute(0);
    } else if (m_resolvedRhsOperand.toLowerCase().equals("/last")) {
      rhs = structure.attribute(structure.numAttributes() - 1);
    } else {
      // try as an index
      try {
        int indx = Integer.parseInt(m_resolvedRhsOperand);
        indx--;
        rhs = structure.attribute(indx);
      } catch (NumberFormatException ex) {
      }
    }

    if (rhs == null) {
      rhs = structure.attribute(m_resolvedRhsOperand);
    }
    if (rhs == null) {
      throw new IllegalArgumentException("Data does not contain attribute \""
        + m_resolvedRhsOperand + "\"");
    }
    m_rhsAttIndex = rhs.index();
  } else if (m_operator != ExpressionType.CONTAINS
    && m_operator != ExpressionType.STARTSWITH
    && m_operator != ExpressionType.ENDSWITH
    && m_operator != ExpressionType.REGEX
    && m_operator != ExpressionType.ISMISSING) {
    // make sure the operand is parseable as a number (unless missing has
    // been specified - equals only)
    if (lhs.isNominal()) {
      m_numericOperand = lhs.indexOfValue(m_resolvedRhsOperand);

      if (m_numericOperand < 0) {
        throw new IllegalArgumentException("Unknown nominal value '"
          + m_resolvedRhsOperand + "' for attribute '" + lhs.name() + "'");
      }
    } else {
      try {
        m_numericOperand = Double.parseDouble(m_resolvedRhsOperand);
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("\"" + m_resolvedRhsOperand
          + "\" is not parseable as a number!");
      }
    }
  }

  if (m_operator == ExpressionType.REGEX) {
    m_regexPattern = Pattern.compile(m_resolvedRhsOperand);
  }
}
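// Illustrative operand forms accepted by the resolution logic above (a sketch;
// "step" and its setters are hypothetical, only the ExpressionType constants
// and the init signature are confirmed by the code). The LHS may be "/first",
// "/last", a 1-based index such as "3", or an attribute name; the RHS is either
// another attribute reference, a numeric constant, a nominal label of the LHS
// attribute, or a regular expression.
step.setLhsAttributeName("outlook");      // resolved by name via structure.attribute(...)
step.setRhsOperand("sunn.*");             // compiled with Pattern.compile for REGEX
step.setOperator(ExpressionType.REGEX);
step.init(structure, env);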
private void readHeader() throws IOException {
  m_rowCount = 1;
  m_incrementalReader = null;
  m_current = new ArrayList<Object>();
  openTempFiles();

  m_rowBuffer = new ArrayList<String>();

  String firstRow = m_sourceReader.readLine();
  if (firstRow == null) {
    throw new IOException("No data in the file!");
  }
  if (m_noHeaderRow) {
    m_rowBuffer.add(firstRow);
  }

  ArrayList<Attribute> attribNames = new ArrayList<Attribute>();

  // now tokenize to determine attribute names (or create att names if
  // no header row
  StringReader sr = new StringReader(firstRow + "\n");
  // System.out.print(firstRow + "\n");
  m_st = new StreamTokenizer(sr);
  initTokenizer(m_st);
  m_st.ordinaryChar(m_FieldSeparator.charAt(0));

  int attNum = 1;
  StreamTokenizerUtils.getFirstToken(m_st);
  if (m_st.ttype == StreamTokenizer.TT_EOF) {
    StreamTokenizerUtils.errms(m_st, "premature end of file");
  }
  boolean first = true;
  boolean wasSep;

  while (m_st.ttype != StreamTokenizer.TT_EOL
    && m_st.ttype != StreamTokenizer.TT_EOF) {
    // Get next token
    if (!first) {
      StreamTokenizerUtils.getToken(m_st);
    }

    if (m_st.ttype == m_FieldSeparator.charAt(0)
      || m_st.ttype == StreamTokenizer.TT_EOL) {
      wasSep = true;
    } else {
      wasSep = false;
      String attName = null;

      if (m_noHeaderRow) {
        attName = "att" + attNum;
        attNum++;
      } else {
        attName = m_st.sval;
      }

      attribNames.add(new Attribute(attName, (java.util.List<String>) null));
    }
    if (!wasSep) {
      StreamTokenizerUtils.getToken(m_st);
    }
    first = false;
  }

  String relationName;
  if (m_sourceFile != null) {
    relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$", "");
  } else {
    relationName = "stream";
  }
  m_structure = new Instances(relationName, attribNames, 0);
  m_NominalAttributes.setUpper(m_structure.numAttributes() - 1);
  m_StringAttributes.setUpper(m_structure.numAttributes() - 1);
  m_dateAttributes.setUpper(m_structure.numAttributes() - 1);
  m_numericAttributes.setUpper(m_structure.numAttributes() - 1);
  m_nominalVals = new HashMap<Integer, LinkedHashSet<String>>();

  m_types = new TYPE[m_structure.numAttributes()];
  for (int i = 0; i < m_structure.numAttributes(); i++) {
    if (m_NominalAttributes.isInRange(i)) {
      m_types[i] = TYPE.NOMINAL;
      LinkedHashSet<String> ts = new LinkedHashSet<String>();
      m_nominalVals.put(i, ts);
    } else if (m_StringAttributes.isInRange(i)) {
      m_types[i] = TYPE.STRING;
    } else if (m_dateAttributes.isInRange(i)) {
      m_types[i] = TYPE.DATE;
    } else if (m_numericAttributes.isInRange(i)) {
      m_types[i] = TYPE.NUMERIC;
    } else {
      m_types[i] = TYPE.UNDETERMINED;
    }
  }

  if (m_nominalLabelSpecs.size() > 0) {
    for (String spec : m_nominalLabelSpecs) {
      String[] attsAndLabels = spec.split(":");
      if (attsAndLabels.length == 2) {
        String[] labels = attsAndLabels[1].split(",");
        try {
          // try as a range string first
          Range tempR = new Range();
          tempR.setRanges(attsAndLabels[0].trim());
          tempR.setUpper(m_structure.numAttributes() - 1);

          int[] rangeIndexes = tempR.getSelection();
          for (int i = 0; i < rangeIndexes.length; i++) {
            m_types[rangeIndexes[i]] = TYPE.NOMINAL;
            LinkedHashSet<String> ts = new LinkedHashSet<String>();
            for (String lab : labels) {
              ts.add(lab);
            }
            m_nominalVals.put(rangeIndexes[i], ts);
          }
        } catch (IllegalArgumentException e) {
          // one or more named attributes?
          String[] attNames = attsAndLabels[0].split(",");
          for (String attN : attNames) {
            Attribute a = m_structure.attribute(attN.trim());
            if (a != null) {
              int attIndex = a.index();
              m_types[attIndex] = TYPE.NOMINAL;
              LinkedHashSet<String> ts = new LinkedHashSet<String>();
              for (String lab : labels) {
                ts.add(lab);
              }
              m_nominalVals.put(attIndex, ts);
            }
          }
        }
      }
    }
  }

  // Prevents the first row from getting lost in the
  // case where there is no header row and we're
  // running in batch mode
  if (m_noHeaderRow && getRetrieval() == BATCH) {
    StreamTokenizer tempT = new StreamTokenizer(new StringReader(firstRow));
    initTokenizer(tempT);
    tempT.ordinaryChar(m_FieldSeparator.charAt(0));
    String checked = getInstance(tempT);
    dumpRow(checked);
  }

  m_st = new StreamTokenizer(m_sourceReader);
  initTokenizer(m_st);
  m_st.ordinaryChar(m_FieldSeparator.charAt(0));

  // try and determine a more accurate structure from the first batch
  readData(false || getRetrieval() == BATCH);
  makeStructure();
}
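// Usage sketch (assumed context): driving the CSV loader whose header pass is
// shown above. setSource/getStructure/getDataSet come from Weka's loader API;
// the option setters are assumptions tied to the ranges used above
// (m_NominalAttributes, m_FieldSeparator and friends).
static Instances loadCsv(File csv) throws Exception {
  CSVLoader loader = new CSVLoader();
  loader.setSource(csv);
  loader.setFieldSeparator(",");                 // m_FieldSeparator above
  loader.setNominalAttributes("last");           // assumed setter for m_NominalAttributes
  Instances structure = loader.getStructure();   // triggers the header pass
  Instances data = loader.getDataSet();
  System.out.println(data.numInstances() + " rows, "
    + structure.numAttributes() + " columns");
  return data;
}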
/**
 * The procedure implementing the SMOTE algorithm. The output instances are
 * pushed onto the output queue for collection.
 *
 * @throws Exception if provided options cannot be executed on input instances
 */
protected void doSMOTE() throws Exception {
  int minIndex = 0;
  int min = Integer.MAX_VALUE;
  if (m_DetectMinorityClass) {
    // find minority class
    int[] classCounts =
      getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts;
    for (int i = 0; i < classCounts.length; i++) {
      if (classCounts[i] != 0 && classCounts[i] < min) {
        min = classCounts[i];
        minIndex = i;
      }
    }
  } else {
    String classVal = getClassValue();
    if (classVal.equalsIgnoreCase("first")) {
      minIndex = 1;
    } else if (classVal.equalsIgnoreCase("last")) {
      minIndex = getInputFormat().numClasses();
    } else {
      minIndex = Integer.parseInt(classVal);
    }
    if (minIndex > getInputFormat().numClasses()) {
      throw new Exception("value index must be <= the number of classes");
    }
    minIndex--; // make it an index
  }

  int nearestNeighbors;
  if (min <= getNearestNeighbors()) {
    nearestNeighbors = min - 1;
  } else {
    nearestNeighbors = getNearestNeighbors();
  }
  if (nearestNeighbors < 1) {
    throw new Exception("Cannot use 0 neighbors!");
  }

  // compose minority class dataset
  // also push all dataset instances
  Instances sample = getInputFormat().stringFreeStructure();
  Enumeration instanceEnum = getInputFormat().enumerateInstances();
  while (instanceEnum.hasMoreElements()) {
    Instance instance = (Instance) instanceEnum.nextElement();
    push((Instance) instance.copy());
    if ((int) instance.classValue() == minIndex) {
      sample.add(instance);
    }
  }

  // compute Value Distance Metric matrices for nominal features
  Map vdmMap = new HashMap();
  Enumeration attrEnum = getInputFormat().enumerateAttributes();
  while (attrEnum.hasMoreElements()) {
    Attribute attr = (Attribute) attrEnum.nextElement();
    if (!attr.equals(getInputFormat().classAttribute())) {
      if (attr.isNominal() || attr.isString()) {
        double[][] vdm = new double[attr.numValues()][attr.numValues()];
        vdmMap.put(attr, vdm);
        int[] featureValueCounts = new int[attr.numValues()];
        int[][] featureValueCountsByClass =
          new int[getInputFormat().classAttribute().numValues()][attr.numValues()];
        instanceEnum = getInputFormat().enumerateInstances();
        while (instanceEnum.hasMoreElements()) {
          Instance instance = (Instance) instanceEnum.nextElement();
          int value = (int) instance.value(attr);
          int classValue = (int) instance.classValue();
          featureValueCounts[value]++;
          featureValueCountsByClass[classValue][value]++;
        }
        for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) {
          for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) {
            double sum = 0;
            for (int classValueIndex = 0; classValueIndex < getInputFormat().numClasses(); classValueIndex++) {
              double c1i = featureValueCountsByClass[classValueIndex][valueIndex1];
              double c2i = featureValueCountsByClass[classValueIndex][valueIndex2];
              double c1 = featureValueCounts[valueIndex1];
              double c2 = featureValueCounts[valueIndex2];
              double term1 = c1i / c1;
              double term2 = c2i / c2;
              sum += Math.abs(term1 - term2);
            }
            vdm[valueIndex1][valueIndex2] = sum;
          }
        }
      }
    }
  }

  // use this random source for all required randomness
  Random rand = new Random(getRandomSeed());

  // find the set of extra indices to use if the percentage is not evenly
  // divisible by 100
  List extraIndices = new LinkedList();
  double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
  int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
  if (extraIndicesCount >= 1) {
    for (int i = 0; i < sample.numInstances(); i++) {
      extraIndices.add(i);
    }
  }
  Collections.shuffle(extraIndices, rand);
  extraIndices = extraIndices.subList(0, extraIndicesCount);
  Set extraIndexSet = new HashSet(extraIndices);

  // the main loop to handle computing nearest neighbors and generating SMOTE
  // examples from each instance in the original minority class data
  Instance[] nnArray = new Instance[nearestNeighbors];
  for (int i = 0; i < sample.numInstances(); i++) {
    Instance instanceI = sample.instance(i);

    // find k nearest neighbors for each instance
    List distanceToInstance = new LinkedList();
    for (int j = 0; j < sample.numInstances(); j++) {
      Instance instanceJ = sample.instance(j);
      if (i != j) {
        double distance = 0;
        attrEnum = getInputFormat().enumerateAttributes();
        while (attrEnum.hasMoreElements()) {
          Attribute attr = (Attribute) attrEnum.nextElement();
          if (!attr.equals(getInputFormat().classAttribute())) {
            double iVal = instanceI.value(attr);
            double jVal = instanceJ.value(attr);
            if (attr.isNumeric()) {
              distance += Math.pow(iVal - jVal, 2);
            } else {
              distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal];
            }
          }
        }
        distance = Math.pow(distance, .5);
        distanceToInstance.add(new Object[] {distance, instanceJ});
      }
    }

    // sort the neighbors according to distance
    Collections.sort(distanceToInstance, new Comparator() {
      public int compare(Object o1, Object o2) {
        double distance1 = (Double) ((Object[]) o1)[0];
        double distance2 = (Double) ((Object[]) o2)[0];
        return Double.compare(distance1, distance2);
      }
    });

    // populate the actual nearest neighbor instance array
    Iterator entryIterator = distanceToInstance.iterator();
    int j = 0;
    while (entryIterator.hasNext() && j < nearestNeighbors) {
      nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1];
      j++;
    }

    // create synthetic examples
    int n = (int) Math.floor(getPercentage() / 100);
    while (n > 0 || extraIndexSet.remove(i)) {
      double[] values = new double[sample.numAttributes()];
      int nn = rand.nextInt(nearestNeighbors);
      attrEnum = getInputFormat().enumerateAttributes();
      while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (!attr.equals(getInputFormat().classAttribute())) {
          if (attr.isNumeric()) {
            double dif = nnArray[nn].value(attr) - instanceI.value(attr);
            double gap = rand.nextDouble();
            values[attr.index()] = (instanceI.value(attr) + gap * dif);
          } else if (attr.isDate()) {
            double dif = nnArray[nn].value(attr) - instanceI.value(attr);
            double gap = rand.nextDouble();
            values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
          } else {
            int[] valueCounts = new int[attr.numValues()];
            int iVal = (int) instanceI.value(attr);
            valueCounts[iVal]++;
            for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
              int val = (int) nnArray[nnEx].value(attr);
              valueCounts[val]++;
            }
            int maxIndex = 0;
            int max = Integer.MIN_VALUE;
            for (int index = 0; index < attr.numValues(); index++) {
              if (valueCounts[index] > max) {
                max = valueCounts[index];
                maxIndex = index;
              }
            }
            values[attr.index()] = maxIndex;
          }
        }
      }
      values[sample.classIndex()] = minIndex;
      Instance synthetic = new DenseInstance(1.0, values);
      push(synthetic);
      n--;
    }
  }
}
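// Usage sketch (assumed context): applying the filter as a standard Weka filter.
// The setters mirror the getters used above (getPercentage, getNearestNeighbors,
// getRandomSeed) and are assumed to exist; 200% produces two synthetic examples
// per original minority-class instance.
static Instances oversample(Instances train) throws Exception {
  SMOTE smote = new SMOTE();
  smote.setPercentage(200.0);
  smote.setNearestNeighbors(5);
  smote.setRandomSeed(42);
  smote.setInputFormat(train);                   // train must have a nominal class set
  return Filter.useFilter(train, smote);
}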