/** * Searches the attribute subset space using a genetic algorithm. * * @param ASEval the attribute evaluator to guide the search * @param data the training instances. * @return an array (not necessarily ordered) of selected attribute indexes * @throws Exception if the search can't be completed */ @Override public int[] search(ASEvaluation ASEval, Instances data) throws Exception { m_best = null; m_generationReports = new StringBuffer(); if (!(ASEval instanceof SubsetEvaluator)) { throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator!"); } if (ASEval instanceof UnsupervisedSubsetEvaluator) { m_hasClass = false; } else { m_hasClass = true; m_classIndex = data.classIndex(); } SubsetEvaluator ASEvaluator = (SubsetEvaluator) ASEval; m_numAttribs = data.numAttributes(); m_startRange.setUpper(m_numAttribs - 1); if (!(getStartSet().equals(""))) { m_starting = m_startRange.getSelection(); } // initial random population m_lookupTable = new Hashtable<BitSet, GABitSet>(m_lookupTableSize); m_random = new Random(m_seed); m_population = new GABitSet[m_popSize]; // set up random initial population initPopulation(); evaluatePopulation(ASEvaluator); populationStatistics(); scalePopulation(); checkBest(); m_generationReports.append(populationReport(0)); boolean converged; for (int i = 1; i <= m_maxGenerations; i++) { generation(); evaluatePopulation(ASEvaluator); populationStatistics(); scalePopulation(); // find the best pop member and check for convergence converged = checkBest(); if ((i == m_maxGenerations) || ((i % m_reportFrequency) == 0) || (converged == true)) { m_generationReports.append(populationReport(i)); if (converged == true) { break; } } } return attributeList(m_best.getChromosome()); }
/** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance structure (any instances * contained in the object are ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the format couldn't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_Insert.setUpper(instanceInfo.numAttributes()); Instances outputFormat = new Instances(instanceInfo, 0); Attribute newAttribute = null; switch (m_AttributeType) { case Attribute.NUMERIC: newAttribute = new Attribute(m_Name); break; case Attribute.NOMINAL: newAttribute = new Attribute(m_Name, m_Labels); break; case Attribute.STRING: newAttribute = new Attribute(m_Name, (FastVector) null); break; case Attribute.DATE: newAttribute = new Attribute(m_Name, m_DateFormat); break; default: throw new IllegalArgumentException("Unknown attribute type in Add"); } if ((m_Insert.getIndex() < 0) || (m_Insert.getIndex() > getInputFormat().numAttributes())) { throw new IllegalArgumentException("Index out of range"); } outputFormat.insertAttributeAt(newAttribute, m_Insert.getIndex()); setOutputFormat(outputFormat); // all attributes, except index of added attribute // (otherwise the length of the input/output indices differ) Range atts = new Range(m_Insert.getSingleIndex()); atts.setInvert(true); atts.setUpper(outputFormat.numAttributes() - 1); initOutputLocators(outputFormat, atts.getSelection()); return true; }
/** * Searches the attribute subset space by best first search * * @param ASEval the attribute evaluator to guide the search * @param data the training instances. * @return an array (not necessarily ordered) of selected attribute indexes * @throws Exception if the search can't be completed */ public int[] search(ASEvaluation ASEval, Instances data) throws Exception { m_totalEvals = 0; if (!(ASEval instanceof SubsetEvaluator)) { throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator!"); } if (ASEval instanceof UnsupervisedSubsetEvaluator) { m_hasClass = false; } else { m_hasClass = true; m_classIndex = data.classIndex(); } SubsetEvaluator ASEvaluator = (SubsetEvaluator) ASEval; m_numAttribs = data.numAttributes(); int i, j; int best_size = 0; int size = 0; int done; int sd = m_searchDirection; BitSet best_group, temp_group; int stale; double best_merit; double merit; boolean z; boolean added; Link2 tl; Hashtable lookup = new Hashtable(m_cacheSize * m_numAttribs); int insertCount = 0; int cacheHits = 0; LinkedList2 bfList = new LinkedList2(m_maxStale); best_merit = -Double.MAX_VALUE; stale = 0; best_group = new BitSet(m_numAttribs); m_startRange.setUpper(m_numAttribs - 1); if (!(getStartSet().equals(""))) { m_starting = m_startRange.getSelection(); } // If a starting subset has been supplied, then initialise the bitset if (m_starting != null) { for (i = 0; i < m_starting.length; i++) { if ((m_starting[i]) != m_classIndex) { best_group.set(m_starting[i]); } } best_size = m_starting.length; m_totalEvals++; } else { if (m_searchDirection == SELECTION_BACKWARD) { setStartSet("1-last"); m_starting = new int[m_numAttribs]; // init initial subset to all attributes for (i = 0, j = 0; i < m_numAttribs; i++) { if (i != m_classIndex) { best_group.set(i); m_starting[j++] = i; } } best_size = m_numAttribs - 1; m_totalEvals++; } } // evaluate the initial subset best_merit = ASEvaluator.evaluateSubset(best_group); // add the initial group to the list and the hash table Object[] best = new Object[1]; best[0] = best_group.clone(); bfList.addToList(best, best_merit); BitSet tt = (BitSet) best_group.clone(); String hashC = tt.toString(); lookup.put(hashC, new Double(best_merit)); while (stale < m_maxStale) { added = false; if (m_searchDirection == SELECTION_BIDIRECTIONAL) { // bi-directional search done = 2; sd = SELECTION_FORWARD; } else { done = 1; } // finished search? if (bfList.size() == 0) { stale = m_maxStale; break; } // copy the attribute set at the head of the list tl = bfList.getLinkAt(0); temp_group = (BitSet) (tl.getData()[0]); temp_group = (BitSet) temp_group.clone(); // remove the head of the list bfList.removeLinkAt(0); // count the number of bits set (attributes) int kk; for (kk = 0, size = 0; kk < m_numAttribs; kk++) { if (temp_group.get(kk)) { size++; } } do { for (i = 0; i < m_numAttribs; i++) { if (sd == SELECTION_FORWARD) { z = ((i != m_classIndex) && (!temp_group.get(i))); } else { z = ((i != m_classIndex) && (temp_group.get(i))); } if (z) { // set the bit (attribute to add/delete) if (sd == SELECTION_FORWARD) { temp_group.set(i); size++; } else { temp_group.clear(i); size--; } /* if this subset has been seen before, then it is already in the list (or has been fully expanded) */ tt = (BitSet) temp_group.clone(); hashC = tt.toString(); if (lookup.containsKey(hashC) == false) { merit = ASEvaluator.evaluateSubset(temp_group); m_totalEvals++; // insert this one in the hashtable if (insertCount > m_cacheSize * m_numAttribs) { lookup = new Hashtable(m_cacheSize * m_numAttribs); insertCount = 0; } hashC = tt.toString(); lookup.put(hashC, new Double(merit)); insertCount++; } else { merit = ((Double) lookup.get(hashC)).doubleValue(); cacheHits++; } // insert this one in the list Object[] add = new Object[1]; add[0] = tt.clone(); bfList.addToList(add, merit); if (m_debug) { System.out.print("Group: "); printGroup(tt, m_numAttribs); System.out.println("Merit: " + merit); } // is this better than the best? if (sd == SELECTION_FORWARD) { z = ((merit - best_merit) > 0.00001); } else { if (merit == best_merit) { z = (size < best_size); } else { z = (merit > best_merit); } } if (z) { added = true; stale = 0; best_merit = merit; // best_size = (size + best_size); best_size = size; best_group = (BitSet) (temp_group.clone()); } // unset this addition(deletion) if (sd == SELECTION_FORWARD) { temp_group.clear(i); size--; } else { temp_group.set(i); size++; } } } if (done == 2) { sd = SELECTION_BACKWARD; } done--; } while (done > 0); /* if we haven't added a new attribute subset then full expansion of this node hasen't resulted in anything better */ if (!added) { stale++; } } m_bestMerit = best_merit; return attributeList(best_group); }
/** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance structure (any instances * contained in the object are ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the format couldn't be set successfully */ @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); int classIndex = instanceInfo.classIndex(); // setup the map if (m_renameVals != null && m_renameVals.length() > 0) { String[] vals = m_renameVals.split(","); for (String val : vals) { String[] parts = val.split(":"); if (parts.length != 2) { throw new WekaException("Invalid replacement string: " + val); } if (parts[0].length() == 0 || parts[1].length() == 0) { throw new WekaException("Invalid replacement string: " + val); } m_renameMap.put( m_ignoreCase ? parts[0].toLowerCase().trim() : parts[0].trim(), parts[1].trim()); } } // try selected atts as a numeric range first Range tempRange = new Range(); tempRange.setInvert(m_invert); if (m_selectedColsString == null) { m_selectedColsString = ""; } try { tempRange.setRanges(m_selectedColsString); tempRange.setUpper(instanceInfo.numAttributes() - 1); m_selectedAttributes = tempRange.getSelection(); m_selectedCols = tempRange; } catch (Exception r) { // OK, now try as named attributes StringBuffer indexes = new StringBuffer(); String[] attNames = m_selectedColsString.split(","); boolean first = true; for (String n : attNames) { n = n.trim(); Attribute found = instanceInfo.attribute(n); if (found == null) { throw new WekaException( "Unable to find attribute '" + n + "' in the incoming instances'"); } if (first) { indexes.append("" + (found.index() + 1)); first = false; } else { indexes.append("," + (found.index() + 1)); } } tempRange = new Range(); tempRange.setRanges(indexes.toString()); tempRange.setUpper(instanceInfo.numAttributes() - 1); m_selectedAttributes = tempRange.getSelection(); m_selectedCols = tempRange; } ArrayList<Attribute> attributes = new ArrayList<Attribute>(); for (int i = 0; i < instanceInfo.numAttributes(); i++) { if (m_selectedCols.isInRange(i)) { if (instanceInfo.attribute(i).isNominal()) { List<String> valsForAtt = new ArrayList<String>(); for (int j = 0; j < instanceInfo.attribute(i).numValues(); j++) { String origV = instanceInfo.attribute(i).value(j); String replace = m_ignoreCase ? m_renameMap.get(origV.toLowerCase()) : m_renameMap.get(origV); if (replace != null && !valsForAtt.contains(replace)) { valsForAtt.add(replace); } else { valsForAtt.add(origV); } } Attribute newAtt = new Attribute(instanceInfo.attribute(i).name(), valsForAtt); attributes.add(newAtt); } else { // ignore any selected attributes that are not nominal Attribute att = (Attribute) instanceInfo.attribute(i).copy(); attributes.add(att); } } else { Attribute att = (Attribute) instanceInfo.attribute(i).copy(); attributes.add(att); } } Instances outputFormat = new Instances(instanceInfo.relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); return true; }
/** * Kind of a dummy search algorithm. Calls a Attribute evaluator to evaluate each attribute not * included in the startSet and then sorts them to produce a ranked list of attributes. * * @param ASEval the attribute evaluator to guide the search * @param data the training instances. * @return an array (not necessarily ordered) of selected attribute indexes * @throws Exception if the search can't be completed */ public int[] search(ASEvaluation ASEval, Instances data) throws Exception { int i, j; if (!(ASEval instanceof AttributeEvaluator)) { throw new Exception(ASEval.getClass().getName() + " is not a" + "Attribute evaluator!"); } m_numAttribs = data.numAttributes(); if (ASEval instanceof UnsupervisedAttributeEvaluator) { m_hasClass = false; } else { m_classIndex = data.classIndex(); if (m_classIndex >= 0) { m_hasClass = true; } else { m_hasClass = false; } } // get the transformed data and check to see if the transformer // preserves a class index if (ASEval instanceof AttributeTransformer) { data = ((AttributeTransformer) ASEval).transformedHeader(); if (m_classIndex >= 0 && data.classIndex() >= 0) { m_classIndex = data.classIndex(); m_hasClass = true; } } m_startRange.setUpper(m_numAttribs - 1); if (!(getStartSet().equals(""))) { m_starting = m_startRange.getSelection(); } int sl = 0; if (m_starting != null) { sl = m_starting.length; } if ((m_starting != null) && (m_hasClass == true)) { // see if the supplied list contains the class index boolean ok = false; for (i = 0; i < sl; i++) { if (m_starting[i] == m_classIndex) { ok = true; break; } } if (ok == false) { sl++; } } else { if (m_hasClass == true) { sl++; } } m_attributeList = new int[m_numAttribs - sl]; m_attributeMerit = new double[m_numAttribs - sl]; // add in those attributes not in the starting (omit list) for (i = 0, j = 0; i < m_numAttribs; i++) { if (!inStarting(i)) { m_attributeList[j++] = i; } } AttributeEvaluator ASEvaluator = (AttributeEvaluator) ASEval; for (i = 0; i < m_attributeList.length; i++) { m_attributeMerit[i] = ASEvaluator.evaluateAttribute(m_attributeList[i]); } double[][] tempRanked = rankedAttributes(); int[] rankedAttributes = new int[m_attributeList.length]; for (i = 0; i < m_attributeList.length; i++) { rankedAttributes[i] = (int) tempRanked[i][0]; } return rankedAttributes; }
/** * Searches the attribute subset space by linear forward selection * * @param ASEval the attribute evaluator to guide the search * @param data the training instances. * @return an array (not necessarily ordered) of selected attribute indexes * @exception Exception if the search can't be completed */ public int[] search(ASEvaluation ASEval, Instances data) throws Exception { m_totalEvals = 0; if (!(ASEval instanceof SubsetEvaluator)) { throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator!"); } if (ASEval instanceof UnsupervisedSubsetEvaluator) { m_hasClass = false; } else { m_hasClass = true; m_classIndex = data.classIndex(); } ((ASEvaluation) ASEval).buildEvaluator(data); m_numAttribs = data.numAttributes(); if (m_numUsedAttributes > m_numAttribs) { System.out.println( "Decreasing number of top-ranked attributes to total number of attributes: " + data.numAttributes()); m_numUsedAttributes = m_numAttribs; } BitSet start_group = new BitSet(m_numAttribs); m_startRange.setUpper(m_numAttribs - 1); if (!(getStartSet().equals(""))) { m_starting = m_startRange.getSelection(); } // If a starting subset has been supplied, then initialise the bitset if (m_starting != null) { for (int i = 0; i < m_starting.length; i++) { if ((m_starting[i]) != m_classIndex) { start_group.set(m_starting[i]); } } } LFSMethods LFS = new LFSMethods(); int[] ranking; if (m_performRanking) { ranking = LFS.rankAttributes(data, (SubsetEvaluator) ASEval, m_verbose); } else { ranking = new int[m_numAttribs]; for (int i = 0; i < ranking.length; i++) { ranking[i] = i; } } if (m_forwardSearchMethod == SEARCH_METHOD_FORWARD) { LFS.forwardSearch( m_cacheSize, start_group, ranking, m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, m_maxStale, -1, data, (SubsetEvaluator) ASEval, m_verbose); } else if (m_forwardSearchMethod == SEARCH_METHOD_FLOATING) { LFS.floatingForwardSearch( m_cacheSize, start_group, ranking, m_numUsedAttributes, m_linearSelectionType == TYPE_FIXED_WIDTH, m_maxStale, data, (SubsetEvaluator) ASEval, m_verbose); } m_totalEvals = LFS.getNumEvalsTotal(); m_bestMerit = LFS.getBestMerit(); return attributeList(LFS.getBestGroup()); }
/** * Determines the output format based on the input format and returns this. In case the output * format cannot be returned immediately, i.e., hasImmediateOutputFormat() returns false, then * this method will called from batchFinished() after the call of preprocess(Instances), in which, * e.g., statistics for the actual processing step can be gathered. * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { FastVector atts; FastVector values; Instances result; int i; // attributes must be numeric m_Attributes.setUpper(inputFormat.numAttributes() - 1); m_AttributeIndices = m_Attributes.getSelection(); for (i = 0; i < m_AttributeIndices.length; i++) { // ignore class if (m_AttributeIndices[i] == inputFormat.classIndex()) { m_AttributeIndices[i] = NON_NUMERIC; continue; } // not numeric -> ignore it if (!inputFormat.attribute(m_AttributeIndices[i]).isNumeric()) m_AttributeIndices[i] = NON_NUMERIC; } // get old attributes atts = new FastVector(); for (i = 0; i < inputFormat.numAttributes(); i++) atts.addElement(inputFormat.attribute(i)); if (!getDetectionPerAttribute()) { m_OutlierAttributePosition = new int[1]; m_OutlierAttributePosition[0] = atts.size(); // add 2 new attributes values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement(new Attribute("Outlier", values)); values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement(new Attribute("ExtremeValue", values)); } else { m_OutlierAttributePosition = new int[m_AttributeIndices.length]; for (i = 0; i < m_AttributeIndices.length; i++) { if (m_AttributeIndices[i] == NON_NUMERIC) continue; m_OutlierAttributePosition[i] = atts.size(); // add new attributes values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement( new Attribute( inputFormat.attribute(m_AttributeIndices[i]).name() + "_Outlier", values)); values = new FastVector(); values.addElement("no"); values.addElement("yes"); atts.addElement( new Attribute( inputFormat.attribute(m_AttributeIndices[i]).name() + "_ExtremeValue", values)); if (getOutputOffsetMultiplier()) atts.addElement( new Attribute(inputFormat.attribute(m_AttributeIndices[i]).name() + "_Offset")); } } // generate header result = new Instances(inputFormat.relationName(), atts, 0); result.setClassIndex(inputFormat.classIndex()); return result; }
private void readHeader() throws IOException { m_rowCount = 1; m_incrementalReader = null; m_current = new ArrayList<Object>(); openTempFiles(); m_rowBuffer = new ArrayList<String>(); String firstRow = m_sourceReader.readLine(); if (firstRow == null) { throw new IOException("No data in the file!"); } if (m_noHeaderRow) { m_rowBuffer.add(firstRow); } ArrayList<Attribute> attribNames = new ArrayList<Attribute>(); // now tokenize to determine attribute names (or create att names if // no header row StringReader sr = new StringReader(firstRow + "\n"); // System.out.print(firstRow + "\n"); m_st = new StreamTokenizer(sr); initTokenizer(m_st); m_st.ordinaryChar(m_FieldSeparator.charAt(0)); int attNum = 1; StreamTokenizerUtils.getFirstToken(m_st); if (m_st.ttype == StreamTokenizer.TT_EOF) { StreamTokenizerUtils.errms(m_st, "premature end of file"); } boolean first = true; boolean wasSep; while (m_st.ttype != StreamTokenizer.TT_EOL && m_st.ttype != StreamTokenizer.TT_EOF) { // Get next token if (!first) { StreamTokenizerUtils.getToken(m_st); } if (m_st.ttype == m_FieldSeparator.charAt(0) || m_st.ttype == StreamTokenizer.TT_EOL) { wasSep = true; } else { wasSep = false; String attName = null; if (m_noHeaderRow) { attName = "att" + attNum; attNum++; } else { attName = m_st.sval; } attribNames.add(new Attribute(attName, (java.util.List<String>) null)); } if (!wasSep) { StreamTokenizerUtils.getToken(m_st); } first = false; } String relationName; if (m_sourceFile != null) { relationName = (m_sourceFile.getName()).replaceAll("\\.[cC][sS][vV]$", ""); } else { relationName = "stream"; } m_structure = new Instances(relationName, attribNames, 0); m_NominalAttributes.setUpper(m_structure.numAttributes() - 1); m_StringAttributes.setUpper(m_structure.numAttributes() - 1); m_dateAttributes.setUpper(m_structure.numAttributes() - 1); m_numericAttributes.setUpper(m_structure.numAttributes() - 1); m_nominalVals = new HashMap<Integer, LinkedHashSet<String>>(); m_types = new TYPE[m_structure.numAttributes()]; for (int i = 0; i < m_structure.numAttributes(); i++) { if (m_NominalAttributes.isInRange(i)) { m_types[i] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); m_nominalVals.put(i, ts); } else if (m_StringAttributes.isInRange(i)) { m_types[i] = TYPE.STRING; } else if (m_dateAttributes.isInRange(i)) { m_types[i] = TYPE.DATE; } else if (m_numericAttributes.isInRange(i)) { m_types[i] = TYPE.NUMERIC; } else { m_types[i] = TYPE.UNDETERMINED; } } if (m_nominalLabelSpecs.size() > 0) { for (String spec : m_nominalLabelSpecs) { String[] attsAndLabels = spec.split(":"); if (attsAndLabels.length == 2) { String[] labels = attsAndLabels[1].split(","); try { // try as a range string first Range tempR = new Range(); tempR.setRanges(attsAndLabels[0].trim()); tempR.setUpper(m_structure.numAttributes() - 1); int[] rangeIndexes = tempR.getSelection(); for (int i = 0; i < rangeIndexes.length; i++) { m_types[rangeIndexes[i]] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); for (String lab : labels) { ts.add(lab); } m_nominalVals.put(rangeIndexes[i], ts); } } catch (IllegalArgumentException e) { // one or more named attributes? String[] attNames = attsAndLabels[0].split(","); for (String attN : attNames) { Attribute a = m_structure.attribute(attN.trim()); if (a != null) { int attIndex = a.index(); m_types[attIndex] = TYPE.NOMINAL; LinkedHashSet<String> ts = new LinkedHashSet<String>(); for (String lab : labels) { ts.add(lab); } m_nominalVals.put(attIndex, ts); } } } } } } // Prevents the first row from getting lost in the // case where there is no header row and we're // running in batch mode if (m_noHeaderRow && getRetrieval() == BATCH) { StreamTokenizer tempT = new StreamTokenizer(new StringReader(firstRow)); initTokenizer(tempT); tempT.ordinaryChar(m_FieldSeparator.charAt(0)); String checked = getInstance(tempT); dumpRow(checked); } m_st = new StreamTokenizer(m_sourceReader); initTokenizer(m_st); m_st.ordinaryChar(m_FieldSeparator.charAt(0)); // try and determine a more accurate structure from the first batch readData(false || getRetrieval() == BATCH); makeStructure(); }