/** * Sets the format of output instances. The derived class should use this method once it has * determined the outputformat. The output queue is cleared. * * @param outputFormat the new output format */ protected void setOutputFormat(Instances outputFormat) { if (outputFormat != null) { m_OutputFormat = outputFormat.stringFreeStructure(); initOutputLocators(m_OutputFormat, null); // Rename the relation String relationName = outputFormat.relationName() + "-" + this.getClass().getName(); if (this instanceof OptionHandler) { String[] options = ((OptionHandler) this).getOptions(); for (int i = 0; i < options.length; i++) { relationName += options[i].trim(); } } m_OutputFormat.setRelationName(relationName); } else { m_OutputFormat = null; } m_OutputQueue = new Queue(); }
/** * Takes an evaluation object from a task and aggregates it with the overall one. * * @param eval the evaluation object to aggregate * @param classifier the classifier used by the task * @param testData the testData from the task * @param plotInstances the ClassifierErrorsPlotInstances object from the task * @param setNum the set number processed by the task * @param maxSetNum the maximum number of sets in this batch */ protected synchronized void aggregateEvalTask( Evaluation eval, Classifier classifier, Instances testData, ClassifierErrorsPlotInstances plotInstances, int setNum, int maxSetNum) { m_eval.aggregate(eval); if (m_aggregatedPlotInstances == null) { m_aggregatedPlotInstances = new Instances(plotInstances.getPlotInstances()); m_aggregatedPlotShapes = plotInstances.getPlotShapes(); m_aggregatedPlotSizes = plotInstances.getPlotSizes(); } else { Instances temp = plotInstances.getPlotInstances(); for (int i = 0; i < temp.numInstances(); i++) { m_aggregatedPlotInstances.add(temp.get(i)); m_aggregatedPlotShapes.addElement(plotInstances.getPlotShapes().get(i)); m_aggregatedPlotSizes.addElement(plotInstances.getPlotSizes().get(i)); } } m_setsComplete++; // if (ce.getSetNumber() == ce.getMaxSetNumber()) { if (m_setsComplete == maxSetNum) { try { String textTitle = classifier.getClass().getName(); String textOptions = ""; if (classifier instanceof OptionHandler) { textOptions = Utils.joinOptions(((OptionHandler) classifier).getOptions()); } textTitle = textTitle.substring(textTitle.lastIndexOf('.') + 1, textTitle.length()); String resultT = "=== Evaluation result ===\n\n" + "Scheme: " + textTitle + "\n" + ((textOptions.length() > 0) ? "Options: " + textOptions + "\n" : "") + "Relation: " + testData.relationName() + "\n\n" + m_eval.toSummaryString(); if (testData.classAttribute().isNominal()) { resultT += "\n" + m_eval.toClassDetailsString() + "\n" + m_eval.toMatrixString(); } TextEvent te = new TextEvent(ClassifierPerformanceEvaluator.this, resultT, textTitle); notifyTextListeners(te); // set up visualizable errors if (m_visualizableErrorListeners.size() > 0) { PlotData2D errorD = new PlotData2D(m_aggregatedPlotInstances); errorD.setShapeSize(m_aggregatedPlotSizes); errorD.setShapeType(m_aggregatedPlotShapes); errorD.setPlotName(textTitle + " " + textOptions); /* PlotData2D errorD = m_PlotInstances.getPlotData( textTitle + " " + textOptions); */ VisualizableErrorEvent vel = new VisualizableErrorEvent(ClassifierPerformanceEvaluator.this, errorD); notifyVisualizableErrorListeners(vel); m_PlotInstances.cleanUp(); } if (testData.classAttribute().isNominal() && m_thresholdListeners.size() > 0) { ThresholdCurve tc = new ThresholdCurve(); Instances result = tc.getCurve(m_eval.predictions(), 0); result.setRelationName(testData.relationName()); PlotData2D pd = new PlotData2D(result); String htmlTitle = "<html><font size=-2>" + textTitle; String newOptions = ""; if (classifier instanceof OptionHandler) { String[] options = ((OptionHandler) classifier).getOptions(); if (options.length > 0) { for (int ii = 0; ii < options.length; ii++) { if (options[ii].length() == 0) { continue; } if (options[ii].charAt(0) == '-' && !(options[ii].charAt(1) >= '0' && options[ii].charAt(1) <= '9')) { newOptions += "<br>"; } newOptions += options[ii]; } } } htmlTitle += " " + newOptions + "<br>" + " (class: " + testData.classAttribute().value(0) + ")" + "</font></html>"; pd.setPlotName(textTitle + " (class: " + testData.classAttribute().value(0) + ")"); pd.setPlotNameHTML(htmlTitle); boolean[] connectPoints = new boolean[result.numInstances()]; for (int jj = 1; jj < connectPoints.length; jj++) { connectPoints[jj] = true; } pd.setConnectPoints(connectPoints); ThresholdDataEvent rde = new ThresholdDataEvent( ClassifierPerformanceEvaluator.this, pd, testData.classAttribute()); notifyThresholdListeners(rde); } if (m_logger != null) { m_logger.statusMessage(statusMessagePrefix() + "Finished."); } } catch (Exception ex) { if (m_logger != null) { m_logger.logMessage( "[ClassifierPerformanceEvaluator] " + statusMessagePrefix() + " problem constructing evaluation results. " + ex.getMessage()); } ex.printStackTrace(); } finally { m_visual.setStatic(); // save memory m_PlotInstances = null; m_setsComplete = 0; m_tasks = null; m_aggregatedPlotInstances = null; } } }
/** * ************************************************** Convert a table to a set of instances, with * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing * <b>attributes</b> */ public Instances tableRowsToNominalInstances(Table t, String relationName) { System.err.print("Converting table rows to instances..."); // Set up attributes, which for rowInstances will be the colNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int c = 0; c < t.numCols; c++) { // It's nominal... determine the range of values isNominal.add(true); FastVector attVals = getColValues(t, c); atts.addElement(new Attribute(t.colNames[c], attVals)); // Save it for later allAttVals.add(attVals); } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); // Fill the instances with data... // For each instance... for (int r = 0; r < t.numRows; r++) { double[] vals = new double[data.numAttributes()]; // for each attribute for (int c = 0; c < t.numCols; c++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[c] = Instance.missingValue(); else if (isNominal.get(c)) { vals[c] = allAttVals.get(c).indexOf(val); } else { vals[c] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int r = 0; r < t.rowNames.length; r++) { instanceNames.add(t.rowNames[r]); newData.instance(r).setValue(attrIdx, t.rowNames[r]); } data = newData; } System.err.println("done."); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing * <b>attributes</b> (e.g. as is common with microarray data) */ public Instances tableColsToInstances(Table t, String relationName) { System.err.print("Converting table cols to instances..."); // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int r = 0; r < t.numRows; r++) { if (rowIsNumeric(t, r)) { isNominal.add(false); atts.addElement(new Attribute(t.rowNames[r])); allAttVals.add(null); // No enumeration of attribute values. } else { // It's nominal... determine the range of values and create a nominal attribute... isNominal.add(true); FastVector attVals = getRowValues(t, r); atts.addElement(new Attribute(t.rowNames[r], attVals)); // Save it for later allAttVals.add(attVals); } } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < t.numCols; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. // For each attribute fill in the numeric or attributeValue index... for (int r = 0; r < t.numRows; r++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[r] = Instance.missingValue(); else if (isNominal.get(r)) { vals[r] = allAttVals.get(r).indexOf(val); } else { vals[r] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int c = 0; c < t.colNames.length; c++) { instanceNames.add(t.colNames[c]); newData.instance(c).setValue(attrIdx, t.colNames[c]); } data = newData; } System.err.println("done."); return (data); }
/** * If we know in advance that the table is numeric, can optimize a lot... For example, on 9803 x * 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable. */ public static Instances readNumeric(String fileName, String relationName, String delimiter) throws Exception { int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading. String[] attrNames = new String[numAttributes]; // Read the col headings and figure out the number of columns in the table.. BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304); String line = reader.readLine(); String[] instanceNames = parseColNames(line, delimiter); int numInstances = instanceNames.length; System.err.print("reading " + numAttributes + " x " + numInstances + " table.."); // Create an array to hold the data as we read it in... double dataArray[][] = new double[numAttributes][numInstances]; // Populate the matrix with values... String valToken = ""; try { int rowIdx = 0; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimiter, -1); attrNames[rowIdx] = tokens[0].trim(); for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) { valToken = tokens[colIdx + 1]; double value; if (valToken.equals("null")) { value = Instance.missingValue(); } else if (valToken.equals("?")) { value = Instance.missingValue(); } else if (valToken.equals("NA")) { value = Instance.missingValue(); } else if (valToken.equals("")) { value = Instance.missingValue(); // }else value = DoubleParser.lightningParse(valToken); // faster double parser with // MANY assumptions } else value = Double.parseDouble(valToken); dataArray[rowIdx][colIdx] = value; } rowIdx++; } } catch (NumberFormatException e) { System.err.println(e.toString()); System.err.println("Parsing line: " + line); System.err.println("Parsing token: " + valToken); } // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); for (int a = 0; a < numAttributes; a++) { atts.addElement(new Attribute(attrNames[a])); } // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); System.err.print("creating instances.."); // System.err.println("DEBUG: numAttributes "+numAttributes); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < numInstances; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. for (int r = 0; r < numAttributes; r++) { double val = dataArray[r][c]; vals[r] = val; } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); // System.err.println("DEBUG: data.relationNAme"+data.relationName()); System.err.print("add feature names.."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 for (int c = 0; c < numInstances; c++) { newData.instance(c).setValue(attrIdx, instanceNames[c]); } data = newData; // System.err.println("DEBUG: data.numInstances: "+data.numInstances()); // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes()); return (data); }