/**
 * Rebuilds the regression on the subset of the training data that {@code buildWeight} did not
 * give a zero weight.
 *
 * <p>If every instance ends up being dropped, a message is written to {@code System.err} and the
 * current regression is kept as the final model.
 *
 * @throws Exception if building fails
 */
private void buildRLSRegression() throws Exception {
  buildWeight();
  m_RLSData = new Instances(m_Data);

  // weightIdx walks the original (undeleted) positions, dataIdx the shrinking copy.
  int weightIdx = 0;
  int dataIdx = 0;
  while (dataIdx < m_RLSData.numInstances()) {
    if (m_weight[weightIdx] == 0) {
      // Deleting shifts the next instance into dataIdx, so the cursor stays where it is.
      m_RLSData.delete(dataIdx);
    } else {
      dataIdx++;
    }
    weightIdx++;
  }

  if (m_RLSData.numInstances() == 0) {
    System.err.println("rls regression unbuilt");
    m_ls = m_currentRegression;
    return;
  }

  m_ls = new LinearRegression();
  m_ls.setOptions(new String[] {"-S", "1"});
  m_ls.buildClassifier(m_RLSData);
  m_currentRegression = m_ls;
}
/** * Signify that this batch of input to the filter is finished. If the filter requires all * instances prior to filtering, output() may now be called to retrieve the filtered instances. * * @return true if there are instances pending output * @exception Exception if an error occurs * @exception IllegalStateException if no input structure has been defined */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_Means == null) { Instances input = getInputFormat(); m_Means = new double[input.numAttributes()]; m_StdDevs = new double[input.numAttributes()]; for (int i = 0; i < input.numAttributes(); i++) { if (input.attribute(i).isNumeric() && (input.classIndex() != i)) { m_Means[i] = input.meanOrMode(i); m_StdDevs[i] = Math.sqrt(input.variance(i)); } } // Convert pending input instances for (int i = 0; i < input.numInstances(); i++) { convertInstance(input.instance(i)); } } // Free memory flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); }
/**
 * Calculates the performance stats for the desired class and returns the results as a set of
 * Instances.
 *
 * <p>For every row of the underlying threshold curve two rows are emitted: {@code {0, fpRate,
 * threshold}} followed by {@code {1, 1 - tpRate, threshold}}.
 *
 * @param predictions the predictions to base the curve on
 * @param classIndex index of the class of interest.
 * @return datapoints as a set of instances, or {@code null} if there are no predictions or the
 *     class index is outside the first prediction's distribution
 */
public Instances getCurve(FastVector predictions, int classIndex) {
  if (predictions.size() == 0) {
    return null;
  }
  if (((NominalPrediction) predictions.elementAt(0)).distribution().length <= classIndex) {
    return null;
  }

  Instances thresholdData = new ThresholdCurve().getCurve(predictions, classIndex);
  Instances curve = makeHeader();

  int fpColumn = thresholdData.attribute(ThresholdCurve.FP_RATE_NAME).index();
  int tpColumn = thresholdData.attribute(ThresholdCurve.TP_RATE_NAME).index();
  int thresholdColumn = thresholdData.attribute(ThresholdCurve.THRESHOLD_NAME).index();

  for (int row = 0; row < thresholdData.numInstances(); row++) {
    Instance point = thresholdData.instance(row);
    double fpRate = point.value(fpColumn);
    double tpRate = point.value(tpColumn);
    double threshold = point.value(thresholdColumn);

    curve.add(new Instance(1.0, new double[] {0, fpRate, threshold}));
    curve.add(new Instance(1.0, new double[] {1, 1.0 - tpRate, threshold}));
  }
  return curve;
}
/**
 * Builds a weight function removing instances with an abnormally high scaled residual.
 *
 * <p>After refreshing the squared residuals, the scale factor is computed as {@code 1.4826 * (1 +
 * 5 / (n - p)) * sqrt(m_bestMedian)}, where {@code n} is the number of instances and {@code p}
 * the number of attributes of {@code m_Data}. An instance receives weight 1 when its residual
 * divided by the scale factor is below 2.5, and weight 0 otherwise.
 *
 * @throws ArithmeticException if the number of instances equals the number of attributes, in
 *     which case the correction term is undefined
 * @throws Exception if weight building fails
 */
private void buildWeight() throws Exception {
  findResiduals();

  int degreesOfFreedom = m_Data.numInstances() - m_Data.numAttributes();
  if (degreesOfFreedom == 0) {
    // The previous integer division failed here as well; keep that failure mode but say why.
    throw new ArithmeticException(
        "Cannot compute scale factor: number of instances equals number of attributes");
  }

  // 5.0 forces floating-point division. With the former integer division the correction term
  // 5 / (n - p) was truncated (to 0 whenever n - p > 5), so it was effectively ignored.
  m_scalefactor = 1.4826 * (1 + 5.0 / degreesOfFreedom) * Math.sqrt(m_bestMedian);

  m_weight = new double[m_Residuals.length];
  for (int i = 0; i < m_Residuals.length; i++) {
    // m_Residuals holds squared residuals, hence the square root before scaling.
    m_weight[i] = (Math.sqrt(m_Residuals[i]) / m_scalefactor < 2.5) ? 1.0 : 0.0;
  }
}
/**
 * Recomputes the squared residual of every training instance under the current regression and
 * accumulates their sum in {@code m_SSR}.
 *
 * @throws Exception if an error occurs
 */
private void findResiduals() throws Exception {
  m_SSR = 0;
  m_Residuals = new double[m_Data.numInstances()];

  for (int row = 0; row < m_Data.numInstances(); row++) {
    double predicted = m_currentRegression.classifyInstance(m_Data.instance(row));
    double actual = m_Data.instance(row).value(m_Data.classAttribute());
    double error = predicted - actual;
    double squaredError = error * error;

    m_Residuals[row] = squaredError;
    m_SSR += squaredError;
  }
}
/**
 * Determines the number of samples to use and stores it in {@code m_samples}.
 *
 * <p>For sample sizes of 7 or more the count is fixed at 3000. For smaller sample sizes the count
 * is {@code combinations(numInstances, m_samplesize)} when the dataset has fewer instances than
 * the table entry for that sample size, and {@code m_samplesize * 500} otherwise.
 *
 * @throws Exception if an error occurs
 */
private void getSamples() throws Exception {
  // Instance-count thresholds, indexed by (m_samplesize - 1).
  final int[] thresholds = {500, 50, 22, 17, 15, 14};

  if (m_samplesize >= 7) {
    m_samples = 3000;
  } else if (m_Data.numInstances() < thresholds[m_samplesize - 1]) {
    m_samples = combinations(m_Data.numInstances(), m_samplesize);
  } else {
    m_samples = m_samplesize * 500;
  }

  if (!m_debug) {
    return;
  }
  System.out.println("m_samplesize: " + m_samplesize);
  System.out.println("m_samples: " + m_samples);
  System.out.println("m_randomseed: " + m_randomseed);
}
/** * Creates a new <code>TestSetEvent</code> * * @param source the source of the event * @param testSet the test instances */ public TestSetEvent(Object source, Instances testSet) { super(source); m_testSet = testSet; if (m_testSet != null && m_testSet.numInstances() == 0) { m_structureOnly = true; } }
/**
 * Prepares the training data: runs it through a {@code NominalToBinary} filter, then through a
 * {@code ReplaceMissingValues} filter, and finally deletes instances with a missing class. The
 * result is stored in {@code m_Data}; both filters are kept in their member fields.
 *
 * @param data data to be cleaned up
 * @throws Exception if an error occurs
 */
private void cleanUpData(Instances data) throws Exception {
  m_Data = data;

  // Stage 1: NominalToBinary.
  m_TransformFilter = new NominalToBinary();
  m_TransformFilter.setInputFormat(data);
  Instances afterTransform = Filter.useFilter(data, m_TransformFilter);
  m_Data = afterTransform;

  // Stage 2: ReplaceMissingValues.
  m_MissingFilter = new ReplaceMissingValues();
  m_MissingFilter.setInputFormat(afterTransform);
  Instances afterMissing = Filter.useFilter(afterTransform, m_MissingFilter);
  m_Data = afterMissing;

  // Stage 3: drop instances with a missing class.
  afterMissing.deleteWithMissingClass();
}
/**
 * Returns a string suitable for passing to RemoveRange consisting of m_samplesize indices.
 *
 * <p>Each index is drawn as {@code (int) (m_random.nextDouble() * numInstances)}, redrawing
 * whenever the result is 0. The indices are comma separated and the string ends with a newline.
 * Duplicate indices are possible.
 *
 * <p>NOTE(review): the draw can never produce the value {@code numInstances} itself. If
 * RemoveRange indices are 1-based this means the last instance is never selected; confirm
 * before changing, since altering the draw would change seeded results.
 *
 * @param data dataset from which to take indices
 * @return string of indices suitable for passing to RemoveRange
 * @throws IllegalArgumentException if indices are requested from a dataset with fewer than two
 *     instances, for which no non-zero index can be drawn
 */
private String selectIndices(Instances data) {
  int numInstances = data.numInstances();
  if (m_samplesize > 0 && numInstances < 2) {
    // With 0 or 1 instances every draw is 0, so the rejection loop below would spin forever.
    throw new IllegalArgumentException(
        "Cannot select non-zero indices from a dataset with " + numInstances + " instance(s)");
  }

  StringBuilder text = new StringBuilder();
  for (int i = 0; i < m_samplesize; i++) {
    int x;
    do {
      x = (int) (m_random.nextDouble() * numInstances);
    } while (x == 0);
    text.append(x);
    text.append(i < m_samplesize - 1 ? "," : "\n");
  }
  return text.toString();
}
/** * Tests the CostCurve generation from the command line. The classifier is currently hardcoded. * Pipe in an arff file. * * @param args currently ignored */ public static void main(String[] args) { try { Instances inst = new Instances(new java.io.InputStreamReader(System.in)); inst.setClassIndex(inst.numAttributes() - 1); CostCurve cc = new CostCurve(); EvaluationUtils eu = new EvaluationUtils(); Classifier classifier = new features.classifiers.functions.Logistic(); FastVector predictions = new FastVector(); for (int i = 0; i < 2; i++) { // Do two runs. eu.setSeed(i); predictions.appendElements(eu.getCVPredictions(classifier, inst, 10)); // System.out.println("\n\n\n"); } Instances result = cc.getCurve(predictions); System.out.println(result); } catch (Exception ex) { ex.printStackTrace(); } }
/** * Build lms regression * * @param data training data * @throws Exception if an error occurs */ public void buildClassifier(Instances data) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(data); // remove instances with missing class data = new Instances(data); data.deleteWithMissingClass(); cleanUpData(data); getSamples(); findBestRegression(); buildRLSRegression(); } // buildClassifier
/**
 * Converts the header info of the given set of instances into a set of item sets (singletons).
 * The ordering of values in the header file determines the lexicographic order.
 *
 * <p>One item set is produced per (attribute, value) pair. Its item array has one slot per
 * attribute, all set to -1 except the slot of that attribute, which holds the value index.
 *
 * @param instances the set of instances whose header info is to be used
 * @return a set of item sets, each containing a single item
 * @throws Exception if singletons can't be generated successfully
 */
public static FastVector singletons(Instances instances) throws Exception {
  FastVector itemSets = new FastVector();
  int attrCount = instances.numAttributes();

  for (int attr = 0; attr < attrCount; attr++) {
    if (instances.attribute(attr).isNumeric()) {
      throw new Exception("Can't handle numeric attributes!");
    }
    for (int value = 0; value < instances.attribute(attr).numValues(); value++) {
      int[] items = new int[attrCount];
      java.util.Arrays.fill(items, -1);
      items[attr] = value;

      ItemSet singleton = new AprioriItemSet(instances.numInstances());
      singleton.m_items = items;
      itemSets.addElement(singleton);
    }
  }
  return itemSets;
}
/** * Accept a test set * * @param e a <code>TestSetEvent</code> value */ public void acceptTestSet(final TestSetEvent e) { if (e.isStructureOnly()) notifyTestListeners(e); if (m_trainingSet != null && m_trainingSet.equalHeaders(e.getTestSet()) && m_filterThread == null) { try { if (m_state == IDLE) { m_state = FILTERING_TEST; } m_testingSet = e.getTestSet(); // final String oldText = m_visual.getText(); m_filterThread = new Thread() { public void run() { try { if (m_testingSet != null) { m_visual.setAnimated(); // m_visual.setText("Filtering test data..."); if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptTestSet_StatusMessage_Text_First") + m_testingSet.relationName() + Messages.getInstance() .getString("Filter_AcceptTestSet_StatusMessage_Text_Second")); } Instances filteredTest = features.filters.Filter.useFilter(m_testingSet, m_Filter); // m_visual.setText(oldText); m_visual.setStatic(); TestSetEvent ne = new TestSetEvent(features.gui.beans.Filter.this, filteredTest); ne.m_setNumber = e.m_setNumber; ne.m_maxSetNumber = e.m_maxSetNumber; notifyTestListeners(ne); } } catch (Exception ex) { ex.printStackTrace(); if (m_log != null) { m_log.logMessage( Messages.getInstance() .getString("Filter_AcceptTestSet_LogMessage_Text_First") + statusMessagePrefix() + ex.getMessage()); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptTestSet_StatusMessage_Text_Third")); } Filter.this.stop(); } finally { // m_visual.setText(oldText); m_visual.setStatic(); m_state = IDLE; if (isInterrupted()) { m_trainingSet = null; if (m_log != null) { m_log.logMessage( Messages.getInstance() .getString("Filter_AcceptTestSet_LogMessage_Text_Second") + statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptTestSet_LogMessage_Text_Third")); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptTestSet_StatusMessage_Text_Fourth")); // 
m_log.statusMessage("OK"); } } else { if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptTestSet_StatusMessage_Text_Fifth")); } } block(false); } } }; m_filterThread.setPriority(Thread.MIN_PRIORITY); m_filterThread.start(); block(true); m_filterThread = null; m_state = IDLE; } catch (Exception ex) { ex.printStackTrace(); } } }
/** * Accept an instance for processing by StreamableFilters only * * @param e an <code>InstanceEvent</code> value */ public void acceptInstance(InstanceEvent e) { // to do! if (m_filterThread != null) { String messg = Messages.getInstance().getString("Filter_AcceptInstance_Mess_Text_First") + statusMessagePrefix() + Messages.getInstance().getString("Filter_AcceptInstance_Mess_Text_Second"); if (m_log != null) { m_log.logMessage(messg); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_First")); } else { System.err.println(messg); } return; } if (!(m_Filter instanceof StreamableFilter)) { stop(); // stop all processing if (m_log != null) { m_log.logMessage( Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_First") + statusMessagePrefix() + Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Second") + m_Filter.getClass().getName() + Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Third")); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Second")); } return; } if (e.getStatus() == InstanceEvent.FORMAT_AVAILABLE) { try { m_instanceCount = 0; // notifyInstanceListeners(e); // Instances dataset = e.getInstance().dataset(); Instances dataset = e.getStructure(); if (m_Filter instanceof SupervisedFilter) { // defualt to last column if no class is set if (dataset.classIndex() < 0) { dataset.setClassIndex(dataset.numAttributes() - 1); } } // initialize filter m_Filter.setInputFormat(dataset); // attempt to determine post-filtering // structure. If successful this can be passed on to instance // listeners as a new FORMAT_AVAILABLE event. 
m_structurePassedOn = false; try { if (m_Filter.isOutputFormatDefined()) { // System.err.println("Filter - passing on output format..."); // System.err.println(m_Filter.getOutputFormat()); m_ie.setStructure(m_Filter.getOutputFormat()); notifyInstanceListeners(m_ie); m_structurePassedOn = true; } } catch (Exception ex) { stop(); // stop all processing if (m_log != null) { m_log.logMessage( Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Fourth") + statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_LogMessage_Text_Fifth") + ex.getMessage()); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Third")); } else { System.err.println( Messages.getInstance().getString("Filter_AcceptInstance_Error_Text_First") + statusMessagePrefix() + Messages.getInstance().getString("Filter_AcceptInstance_Error_Text_Second")); } } } catch (Exception ex) { ex.printStackTrace(); } return; } if (e.getStatus() == InstanceEvent.BATCH_FINISHED) { // get the last instance (if available) try { if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Fourth")); } if (m_Filter.input(e.getInstance())) { Instance filteredInstance = m_Filter.output(); if (filteredInstance != null) { if (!m_structurePassedOn) { // pass on the new structure first m_ie.setStructure(new Instances(filteredInstance.dataset(), 0)); notifyInstanceListeners(m_ie); m_structurePassedOn = true; } m_ie.setInstance(filteredInstance); // if there are instances pending for output don't want to send // a batch finisehd at this point... 
// System.err.println("Filter - in batch finisehd..."); if (m_Filter.batchFinished() && m_Filter.numPendingOutput() > 0) { m_ie.setStatus(InstanceEvent.INSTANCE_AVAILABLE); } else { m_ie.setStatus(e.getStatus()); } notifyInstanceListeners(m_ie); } } if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Fourth_Alpha")); } } catch (Exception ex) { stop(); // stop all processing if (m_log != null) { m_log.logMessage( Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Sixth") + statusMessagePrefix() + ex.getMessage()); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Fifth")); } ex.printStackTrace(); } // check for any pending instances that we might need to pass on try { if (m_Filter.batchFinished() && m_Filter.numPendingOutput() > 0) { if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Sixth")); } Instance filteredInstance = m_Filter.output(); if (filteredInstance != null) { if (!m_structurePassedOn) { // pass on the new structure first m_ie.setStructure(new Instances(filteredInstance.dataset(), 0)); notifyInstanceListeners(m_ie); m_structurePassedOn = true; } m_ie.setInstance(filteredInstance); // TODO here is the problem I think m_ie.setStatus(InstanceEvent.INSTANCE_AVAILABLE); notifyInstanceListeners(m_ie); } while (m_Filter.numPendingOutput() > 0) { filteredInstance = m_Filter.output(); m_ie.setInstance(filteredInstance); // System.err.println("Filter - sending pending..."); if (m_Filter.numPendingOutput() == 0) { m_ie.setStatus(InstanceEvent.BATCH_FINISHED); } else { m_ie.setStatus(InstanceEvent.INSTANCE_AVAILABLE); } notifyInstanceListeners(m_ie); } if (m_log != null) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() 
.getString("Filter_AcceptInstance_StatusMessage_Text_Seventh")); } } } catch (Exception ex) { stop(); // stop all processing if (m_log != null) { m_log.logMessage( Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Seventh") + statusMessagePrefix() + ex.toString()); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Eighth")); } ex.printStackTrace(); } } else { // pass instance through the filter try { if (!m_Filter.input(e.getInstance())) { // System.err.println("Filter - inputing instance into filter..."); /* if (m_log != null) { m_log.logMessage("ERROR : filter not ready to output instance"); } */ // quietly return. Filter might be able to output some instances // once the batch is finished. return; } // collect output instance. Instance filteredInstance = m_Filter.output(); if (filteredInstance == null) { return; } m_instanceCount++; if (!m_structurePassedOn) { // pass on the new structure first m_ie.setStructure(new Instances(filteredInstance.dataset(), 0)); notifyInstanceListeners(m_ie); m_structurePassedOn = true; } m_ie.setInstance(filteredInstance); m_ie.setStatus(e.getStatus()); if (m_log != null && (m_instanceCount % 10000 == 0)) { m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Nineth") + m_instanceCount + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Tenth")); } notifyInstanceListeners(m_ie); } catch (Exception ex) { stop(); // stop all processing if (m_log != null) { m_log.logMessage( Messages.getInstance().getString("Filter_AcceptInstance_LogMessage_Text_Eighth") + statusMessagePrefix() + ex.toString()); m_log.statusMessage( statusMessagePrefix() + Messages.getInstance() .getString("Filter_AcceptInstance_StatusMessage_Text_Eleventh")); } ex.printStackTrace(); } } }
/** * Returns a string that describes the filter as source. The filter will be contained in a class * with the given name (there may be auxiliary classes), and will contain two methods with these * signatures: * * <pre><code> * // converts one row * public static Object[] filter(Object[] i); * // converts a full dataset (first dimension is row index) * public static Object[][] filter(Object[][] i); * </code></pre> * * where the array <code>i</code> contains elements that are either Double, String, with missing * values represented as null. The generated code is public domain and comes with no warranty. * * @param className the name that should be given to the source class. * @param data the dataset used for initializing the filter * @return the object source described by a string * @throws Exception if the source can't be computed */ public String toSource(String className, Instances data) throws Exception { StringBuffer result; boolean[] process; int i; result = new StringBuffer(); // determine what attributes were processed process = new boolean[data.numAttributes()]; for (i = 0; i < data.numAttributes(); i++) { process[i] = (data.attribute(i).isNumeric() && (i != data.classIndex())); } result.append("class " + className + " {\n"); result.append("\n"); result.append(" /** lists which attributes will be processed */\n"); result.append( " protected final static boolean[] PROCESS = new boolean[]{" + Utils.arrayToString(process) + "};\n"); result.append("\n"); result.append(" /** the computed means */\n"); result.append( " protected final static double[] MEANS = new double[]{" + Utils.arrayToString(m_Means) + "};\n"); result.append("\n"); result.append(" /** the computed standard deviations */\n"); result.append( " protected final static double[] STDEVS = new double[]{" + Utils.arrayToString(m_StdDevs) + "};\n"); result.append("\n"); result.append(" /**\n"); result.append(" * filters a single row\n"); result.append(" * \n"); result.append(" * @param i the row to 
process\n"); result.append(" * @return the processed row\n"); result.append(" */\n"); result.append(" public static Object[] filter(Object[] i) {\n"); result.append(" Object[] result;\n"); result.append("\n"); result.append(" result = new Object[i.length];\n"); result.append(" for (int n = 0; n < i.length; n++) {\n"); result.append(" if (PROCESS[n] && (i[n] != null)) {\n"); result.append(" if (STDEVS[n] > 0)\n"); result.append(" result[n] = (((Double) i[n]) - MEANS[n]) / STDEVS[n];\n"); result.append(" else\n"); result.append(" result[n] = ((Double) i[n]) - MEANS[n];\n"); result.append(" }\n"); result.append(" else {\n"); result.append(" result[n] = i[n];\n"); result.append(" }\n"); result.append(" }\n"); result.append("\n"); result.append(" return result;\n"); result.append(" }\n"); result.append("\n"); result.append(" /**\n"); result.append(" * filters multiple rows\n"); result.append(" * \n"); result.append(" * @param i the rows to process\n"); result.append(" * @return the processed rows\n"); result.append(" */\n"); result.append(" public static Object[][] filter(Object[][] i) {\n"); result.append(" Object[][] result;\n"); result.append("\n"); result.append(" result = new Object[i.length][];\n"); result.append(" for (int n = 0; n < i.length; n++) {\n"); result.append(" result[n] = filter(i[n]);\n"); result.append(" }\n"); result.append("\n"); result.append(" return result;\n"); result.append(" }\n"); result.append("}\n"); return result.toString(); }