Exemple #1
0
  /**
   * Installs a new output format and discards anything waiting in the output queue. Derived
   * classes call this once they have worked out what their output looks like.
   *
   * <p>For a non-null format, a string-free copy of the structure is stored, the output locators
   * are re-initialised, and the relation is renamed to the original relation name followed by
   * "-", this object's class name and, if this object is an {@code OptionHandler}, each of its
   * current options (trimmed, with no separator). For a null format the stored format is cleared.
   *
   * @param outputFormat the new output format, or null to clear the current one
   */
  protected void setOutputFormat(Instances outputFormat) {

    if (outputFormat == null) {
      m_OutputFormat = null;
    } else {
      m_OutputFormat = outputFormat.stringFreeStructure();
      initOutputLocators(m_OutputFormat, null);

      // Build the new relation name: <old name>-<class name><option 1><option 2>...
      StringBuilder renamed = new StringBuilder();
      renamed.append(outputFormat.relationName());
      renamed.append("-");
      renamed.append(this.getClass().getName());
      if (this instanceof OptionHandler) {
        for (String option : ((OptionHandler) this).getOptions()) {
          renamed.append(option.trim());
        }
      }
      m_OutputFormat.setRelationName(renamed.toString());
    }

    // A fresh queue is installed in either case.
    m_OutputQueue = new Queue();
  }
  /**
   * Takes an evaluation object from a task and aggregates it with the overall one. Once the
   * number of aggregated sets reaches {@code maxSetNum}, the combined results are sent to the
   * registered listeners.
   *
   * <p>On every call the task's evaluation is merged into {@code m_eval} and the task's plot
   * instances, shapes and sizes are appended to the aggregated plot data. When the batch is
   * complete:
   *
   * <ul>
   *   <li>a textual summary (plus per-class details and confusion matrix for a nominal class) is
   *       sent to the text listeners;
   *   <li>if there are visualizable-error listeners, the aggregated error plot is sent to them;
   *   <li>if the class is nominal and there are threshold listeners, a threshold curve for the
   *       class value at index 0 is sent to them.
   * </ul>
   *
   * Any exception raised while building these results is logged and printed, not propagated. The
   * per-batch state is reset afterwards in all cases.
   *
   * @param eval the evaluation object to aggregate
   * @param classifier the classifier used by the task
   * @param testData the testData from the task
   * @param plotInstances the ClassifierErrorsPlotInstances object from the task
   * @param setNum the set number processed by the task (not consulted; completion is detected by
   *     counting calls)
   * @param maxSetNum the maximum number of sets in this batch
   */
  protected synchronized void aggregateEvalTask(
      Evaluation eval,
      Classifier classifier,
      Instances testData,
      ClassifierErrorsPlotInstances plotInstances,
      int setNum,
      int maxSetNum) {

    m_eval.aggregate(eval);

    if (m_aggregatedPlotInstances == null) {
      // First set of the batch: start the aggregate from this task's plot data.
      m_aggregatedPlotInstances = new Instances(plotInstances.getPlotInstances());
      m_aggregatedPlotShapes = plotInstances.getPlotShapes();
      m_aggregatedPlotSizes = plotInstances.getPlotSizes();
    } else {
      // Later sets: append instance by instance, keeping shapes and sizes aligned.
      Instances temp = plotInstances.getPlotInstances();
      for (int i = 0; i < temp.numInstances(); i++) {
        m_aggregatedPlotInstances.add(temp.get(i));
        m_aggregatedPlotShapes.addElement(plotInstances.getPlotShapes().get(i));
        m_aggregatedPlotSizes.addElement(plotInstances.getPlotSizes().get(i));
      }
    }
    m_setsComplete++;

    // Tasks may finish in any order, so completion is detected by counting, not by set number.
    if (m_setsComplete == maxSetNum) {
      try {
        String textTitle = classifier.getClass().getName();
        String textOptions = "";
        if (classifier instanceof OptionHandler) {
          textOptions = Utils.joinOptions(((OptionHandler) classifier).getOptions());
        }
        // Strip the package prefix from the class name.
        textTitle = textTitle.substring(textTitle.lastIndexOf('.') + 1);
        String resultT =
            "=== Evaluation result ===\n\n"
                + "Scheme: "
                + textTitle
                + "\n"
                + ((textOptions.length() > 0) ? "Options: " + textOptions + "\n" : "")
                + "Relation: "
                + testData.relationName()
                + "\n\n"
                + m_eval.toSummaryString();

        if (testData.classAttribute().isNominal()) {
          resultT += "\n" + m_eval.toClassDetailsString() + "\n" + m_eval.toMatrixString();
        }

        TextEvent te = new TextEvent(ClassifierPerformanceEvaluator.this, resultT, textTitle);
        notifyTextListeners(te);

        // set up visualizable errors
        if (m_visualizableErrorListeners.size() > 0) {
          PlotData2D errorD = new PlotData2D(m_aggregatedPlotInstances);
          errorD.setShapeSize(m_aggregatedPlotSizes);
          errorD.setShapeType(m_aggregatedPlotShapes);
          errorD.setPlotName(textTitle + " " + textOptions);

          VisualizableErrorEvent vel =
              new VisualizableErrorEvent(ClassifierPerformanceEvaluator.this, errorD);
          notifyVisualizableErrorListeners(vel);
          m_PlotInstances.cleanUp();
        }

        if (testData.classAttribute().isNominal() && m_thresholdListeners.size() > 0) {
          ThresholdCurve tc = new ThresholdCurve();
          Instances result = tc.getCurve(m_eval.predictions(), 0);
          result.setRelationName(testData.relationName());
          PlotData2D pd = new PlotData2D(result);
          String htmlTitle = "<html><font size=-2>" + textTitle;

          // Lay the options out for the HTML title: every option flag starts a new line, while
          // negative numbers (a '-' followed by a digit) stay on the line of their flag.
          StringBuilder newOptions = new StringBuilder();
          if (classifier instanceof OptionHandler) {
            String[] options = ((OptionHandler) classifier).getOptions();
            for (int ii = 0; ii < options.length; ii++) {
              String option = options[ii];
              if (option.length() == 0) {
                continue;
              }
              boolean startsWithDash = option.charAt(0) == '-';
              // Length guard: a lone "-" has no second character to inspect.
              boolean isNegativeNumber =
                  option.length() > 1 && option.charAt(1) >= '0' && option.charAt(1) <= '9';
              if (startsWithDash && !isNegativeNumber) {
                newOptions.append("<br>");
              }
              newOptions.append(option);
            }
          }

          htmlTitle +=
              " "
                  + newOptions
                  + "<br>"
                  + " (class: "
                  + testData.classAttribute().value(0)
                  + ")"
                  + "</font></html>";
          pd.setPlotName(textTitle + " (class: " + testData.classAttribute().value(0) + ")");
          pd.setPlotNameHTML(htmlTitle);

          // Connect every point to its predecessor; the first point has none.
          boolean[] connectPoints = new boolean[result.numInstances()];
          for (int jj = 1; jj < connectPoints.length; jj++) {
            connectPoints[jj] = true;
          }

          pd.setConnectPoints(connectPoints);

          ThresholdDataEvent rde =
              new ThresholdDataEvent(
                  ClassifierPerformanceEvaluator.this, pd, testData.classAttribute());
          notifyThresholdListeners(rde);
        }
        if (m_logger != null) {
          m_logger.statusMessage(statusMessagePrefix() + "Finished.");
        }

      } catch (Exception ex) {
        if (m_logger != null) {
          m_logger.logMessage(
              "[ClassifierPerformanceEvaluator] "
                  + statusMessagePrefix()
                  + " problem constructing evaluation results. "
                  + ex.getMessage());
        }
        ex.printStackTrace();
      } finally {
        m_visual.setStatic();
        // save memory and reset the per-batch state
        m_PlotInstances = null;
        m_setsComplete = 0;
        m_tasks = null;
        m_aggregatedPlotInstances = null;
      }
    }
  }
  /**
   * Converts a table to a set of instances, with <b>rows</b> representing individual
   * <b>instances</b> and <b>columns</b> representing <b>attributes</b>.
   *
   * <p>Every column becomes a nominal attribute whose values are supplied by {@code
   * getColValues}. A cell whose text is {@code "?"} is stored as a missing value; any other cell
   * is stored as the index of its text within the column's value list.
   *
   * <p>If {@code addInstanceNamesAsFeatures} is set, an "ID" attribute holding the row names is
   * inserted at index 0 and {@code instanceNames} is replaced with the list of row names.
   *
   * @param t the table to convert
   * @param relationName the relation name of the resulting data set
   * @return the instances built from the rows of {@code t}
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // Every column is nominal... determine the range of values
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        // Compare by content: table cells are not interned, so == would never match "?".
        if ("?".equals(val)) {
          vals[c] = Instance.missingValue();
        } else {
          vals[c] = allAttVals.get(c).indexOf(val);
        }
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * Converts a table to a set of instances, with <b>columns</b> representing individual
   * <b>instances</b> and <b>rows</b> representing <b>attributes</b> (e.g. as is common with
   * microarray data).
   *
   * <p>A row for which {@code rowIsNumeric} returns true becomes a numeric attribute; any other
   * row becomes a nominal attribute whose values are supplied by {@code getRowValues}. A cell
   * whose text is {@code "?"} is stored as a missing value.
   *
   * <p>If {@code addInstanceNamesAsFeatures} is set, an "ID" attribute holding the column names
   * is inserted at index 0 and {@code instanceNames} is replaced with the list of column names.
   *
   * @param t the table to convert
   * @param relationName the relation name of the resulting data set
   * @return the instances built from the columns of {@code t}
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // ***** CREATE INSTANCES *****
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      // Even nominal values are stored as doubles (the index of the value).
      double[] vals = new double[data.numAttributes()];

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        // Compare by content: table cells are not interned, so == would never match "?" and a
        // missing marker in a numeric row would reach Double.parseDouble and fail.
        if ("?".equals(val)) {
          vals[r] = Instance.missingValue();
        } else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble(val);
        }
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    // ***** ADD FEATURE NAMES *****
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /**
   * Reads an all-numeric table file directly into instances. If we know in advance that the
   * table is numeric, can optimize a lot... For example, on 9803 x 294 table,
   * TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine readFromTable.
   *
   * <p>Layout read by this method: the first line is a heading that {@code parseColNames} turns
   * into the instance names; every following line holds one attribute, with the attribute name
   * in the first field and one value per instance in the remaining fields. The tokens {@code
   * "null"}, {@code "?"}, {@code "NA"} and the empty string are stored as missing values. The
   * result has an "ID" attribute with the instance names inserted at index 0.
   *
   * <p>A value that cannot be parsed as a double is reported on standard error and reading stops
   * at that point; the instances are still built from whatever was read up to then.
   * NOTE(review): cells not yet read keep the array default of 0.0 and later attribute names
   * stay null, so consider propagating the error instead.
   *
   * @param fileName path of the table file to read
   * @param relationName the relation name of the resulting data set
   * @param delimiter field separator; passed to {@link String#split(String, int)}, so it is
   *     interpreted as a regular expression
   * @return the instances read from the file
   * @throws Exception if the file cannot be read
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    String[] instanceNames;
    int numInstances;
    double[][] dataArray;

    // 4194304 chars = 4M-char read buffer.
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    try {
      // Read the col headings and figure out the number of columns in the table..
      String line = reader.readLine();
      instanceNames = parseColNames(line, delimiter);
      numInstances = instanceNames.length;

      System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

      // Create an array to hold the data as we read it in...
      dataArray = new double[numAttributes][numInstances];

      // Populate the matrix with values...
      String valToken = "";
      try {
        int rowIdx = 0;
        while ((line = reader.readLine()) != null) {

          // Limit -1 keeps trailing empty fields so they are read as missing values.
          String[] tokens = line.split(delimiter, -1);
          attrNames[rowIdx] = tokens[0].trim();
          for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
            valToken = tokens[colIdx + 1];

            boolean isMissing =
                valToken.equals("null")
                    || valToken.equals("?")
                    || valToken.equals("NA")
                    || valToken.equals("");
            dataArray[rowIdx][colIdx] =
                isMissing ? Instance.missingValue() : Double.parseDouble(valToken);
          }
          rowIdx++;
        }
      } catch (NumberFormatException e) {
        System.err.println(e.toString());
        System.err.println("Parsing line: " + line);
        System.err.println("Parsing token: " + valToken);
      }
    } finally {
      // Always release the file handle, including when reading fails.
      reader.close();
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // ***** CREATE INSTANCES *****
    // The file is attribute-major, so each instance is one column of dataArray.
    for (int c = 0; c < numInstances; c++) {
      double[] vals = new double[data.numAttributes()];

      for (int r = 0; r < numAttributes; r++) {
        vals[r] = dataArray[r][c];
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names..");

    // ***** ADD FEATURE NAMES *****
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    return (data);
  }