Example #1
  /** Initializes the class indices of the data sets and fills the classifier array. */
  private void Init() {
    testIns.setClassIndex(testIns.numAttributes() - 1);
    labeledIns.setClassIndex(labeledIns.numAttributes() - 1);
    unlabeledIns.setClassIndex(unlabeledIns.numAttributes() - 1);

    class_Array[0] = classifier1;
    class_Array[1] = classifier2;
    class_Array[2] = classifier3;
  }
Example #2
  protected void initMinMax(Instances data) {
    m_Min = new double[data.numAttributes()];
    m_Max = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
      m_Min[i] = m_Max[i] = Double.NaN;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateMinMax(data.instance(i));
    }
  }
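The updateMinMax helper called above is not part of this snippet. A minimal sketch of what it presumably does, assuming m_Min and m_Max are the arrays initialized above and that missing values are skipped, is shown below; this is an assumption, not the original code.

  // Hedged sketch of the updateMinMax helper referenced above: widen the
  // per-attribute min/max ranges with one instance's values.
  protected void updateMinMax(Instance instance) {
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (instance.isMissing(j)) continue;   // ignore missing values
      double v = instance.value(j);
      if (Double.isNaN(m_Min[j])) {          // first non-missing value seen
        m_Min[j] = v;
        m_Max[j] = v;
      } else if (v < m_Min[j]) {
        m_Min[j] = v;
      } else if (v > m_Max[j]) {
        m_Max[j] = v;
      }
    }
  }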
Example #3
  /**
   * Generate artificial training examples.
   *
   * @param artSize size of examples set to create
   * @param data training data
   * @return the set of unlabeled artificial examples
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    int numAttributes = data.numAttributes();
    Instances artData = new Instances(data, artSize);
    double[] att;
    Instance artInstance;

    for (int i = 0; i < artSize; i++) {
      att = new double[numAttributes];
      for (int j = 0; j < numAttributes; j++) {
        if (data.attribute(j).isNominal()) {
          // Select nominal value based on the frequency of occurrence in the training data
          double[] stats = (double[]) m_AttributeStats.get(j);
          att[j] = (double) selectIndexProbabilistically(stats);
        } else if (data.attribute(j).isNumeric()) {
          // Generate numeric value from the Gaussian distribution
          // defined by the mean and std dev of the attribute
          double[] stats = (double[]) m_AttributeStats.get(j);
          att[j] = (m_Random.nextGaussian() * stats[1]) + stats[0];
        } else System.err.println("Decorate can only handle numeric and nominal values.");
      }
      artInstance = new Instance(1.0, att);
      artData.add(artInstance);
    }
    return artData;
  }
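The helper selectIndexProbabilistically is referenced but not shown. Given that computeStats (Example #4) stores cumulative probabilities, a plausible sketch is a roulette-wheel draw over that cumulative array; m_Random is assumed to be the same java.util.Random field used above.

  // Hedged sketch of selectIndexProbabilistically: draw a uniform random number
  // and return the first index whose cumulative probability exceeds it. The
  // stored array omits the final cumulative value of 1.0, so falling off the
  // end selects the last nominal value.
  protected int selectIndexProbabilistically(double[] cdf) {
    double rnd = m_Random.nextDouble();
    int index = 0;
    while (index < cdf.length && rnd > cdf[index]) {
      index++;
    }
    return index;
  }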
Example #4
  /**
   * Compute and store statistics required for generating artificial data.
   *
   * @param data training instances
   * @exception Exception if statistics could not be calculated successfully
   */
  protected void computeStats(Instances data) throws Exception {
    int numAttributes = data.numAttributes();
    m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats

    for (int j = 0; j < numAttributes; j++) {
      if (data.attribute(j).isNominal()) {
        // Compute the probability of occurrence of each distinct value
        int[] nomCounts = (data.attributeStats(j)).nominalCounts;
        double[] counts = new double[nomCounts.length];
        if (counts.length < 2)
          throw new Exception("Nominal attribute has less than two distinct values!");
        // Perform Laplace smoothing
        for (int i = 0; i < counts.length; i++) counts[i] = nomCounts[i] + 1;
        Utils.normalize(counts);
        double[] stats = new double[counts.length - 1];
        stats[0] = counts[0];
        // Calculate cumulative probabilities
        for (int i = 1; i < stats.length; i++) stats[i] = stats[i - 1] + counts[i];
        m_AttributeStats.add(j, stats);
      } else if (data.attribute(j).isNumeric()) {
        // Get mean and standard deviation from the training data
        double[] stats = new double[2];
        stats[0] = data.meanOrMode(j);
        stats[1] = Math.sqrt(data.variance(j));
        m_AttributeStats.add(j, stats);
      } else System.err.println("Decorate can only handle numeric and nominal values.");
    }
  }
Example #5
  /**
   * Calculates the distance between two instances
   *
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances, between 0 and 1
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues(); ) {
      if (p1 >= first.numValues()) {
        firstI = m_instances.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_instances.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
        p1++;
        continue;
      }
      if (secondI == m_instances.classIndex()) {
        p2++;
        continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        diff = difference(secondI, 0, second.valueSparse(p2));
        p2++;
      } else {
        diff = difference(firstI, first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / m_instances.numAttributes());
  }
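The per-attribute difference helper is not included in this example. A rough sketch of the usual pattern, assuming the m_Min/m_Max arrays from Example #2 are available for numeric normalization (an assumption, not the original code):

  // Hedged sketch of the difference helper used above: nominal attributes
  // contribute 0/1, numeric attributes a min-max normalized difference.
  protected double difference(int index, double val1, double val2) {
    switch (m_instances.attribute(index).type()) {
      case Attribute.NOMINAL:
        if (Instance.isMissingValue(val1)
            || Instance.isMissingValue(val2)
            || ((int) val1 != (int) val2)) {
          return 1;
        }
        return 0;
      case Attribute.NUMERIC:
        if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) {
          return 1; // simplistic treatment of missing numeric values
        }
        return norm(val1, index) - norm(val2, index);
      default:
        return 0;
    }
  }

  // min-max normalization into [0, 1], guarding against an empty range
  private double norm(double x, int i) {
    if (Double.isNaN(m_Min[i]) || m_Max[i] == m_Min[i]) return 0;
    return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
  }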
Example #6
  /**
   * Generates a clusterer by means of the spectral clustering algorithm.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been generated successfully
   */
  public void buildClusterer(Instances data) throws java.lang.Exception {
    m_Sequences = new Instances(data);
    int n = data.numInstances();
    int k = data.numAttributes();
    DoubleMatrix2D w;
    if (useSparseMatrix) w = DoubleFactory2D.sparse.make(n, n);
    else w = DoubleFactory2D.dense.make(n, n);
    double[][] v1 = new double[n][];
    for (int i = 0; i < n; i++) v1[i] = data.instance(i).toDoubleArray();
    v = DoubleFactory2D.dense.make(v1);
    double sigma_sq = sigma * sigma;
    // Sets up similarity matrix
    for (int i = 0; i < n; i++)
      for (int j = i; j < n; j++) {
        /*double dist = distnorm2(v.viewRow(i), v.viewRow(j));
        if((r == -1) || (dist < r)) {
          double sim = Math.exp(- (dist * dist) / (2 * sigma_sq));
          w.set(i, j, sim);
          w.set(j, i, sim);
        }*/
        /* String [] key = {data.instance(i).stringValue(0), data.instance(j).stringValue(0)};
        System.out.println(key[0]);
        System.out.println(key[1]);
        System.out.println(simScoreMap.containsKey(key));
        Double simValue = simScoreMap.get(key);*/

        double sim = sim_matrix[i][j];
        w.set(i, j, sim);
        w.set(j, i, sim);
      }

    // Partitions points
    int[][] p = partition(w, alpha_star);

    // Deploys results
    numOfClusters = p.length;
    cluster = new int[n];
    for (int i = 0; i < p.length; i++) for (int j = 0; j < p[i].length; j++) cluster[p[i][j]] = i;

    // System.out.println("Final partition:");
    // UtilsJS.printMatrix(p);
    // System.out.println("Cluster:\n");
    // UtilsJS.printArray(cluster);
    this.numOfClusters = cluster[Utils.maxIndex(cluster)] + 1;
    //  System.out.println("Num clusters:\t"+this.numOfClusters);
  }
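sim_matrix is assumed to be a precomputed n x n similarity matrix. The commented-out block above shows the usual Gaussian-kernel construction; as a hedged sketch, it could be filled in as follows, where distnorm2 is the distance helper referenced in that comment and the Colt matrix types match the ones already used.

  // Hedged sketch: build a Gaussian-kernel similarity matrix from the data rows,
  // mirroring the commented-out code above (r < 0 disables the radius cutoff).
  double[][] buildSimMatrix(DoubleMatrix2D v, double sigma, double r) {
    int n = v.rows();
    double[][] sim = new double[n][n];
    double sigma_sq = sigma * sigma;
    for (int i = 0; i < n; i++)
      for (int j = i; j < n; j++) {
        double dist = distnorm2(v.viewRow(i), v.viewRow(j)); // assumed distance helper
        if (r < 0 || dist < r) {
          double s = Math.exp(-(dist * dist) / (2 * sigma_sq));
          sim[i][j] = s;
          sim[j][i] = s;
        }
      }
    return sim;
  }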
Example #7
  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    StringBuffer temp = new StringBuffer();

    temp.append("\n FarthestFirst\n==============\n");

    temp.append("\nCluster centroids:\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster " + i + "\n\t");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
        if (m_ClusterCentroids.attribute(j).isNominal()) {
          temp.append(
              " "
                  + m_ClusterCentroids
                      .attribute(j)
                      .value((int) m_ClusterCentroids.instance(i).value(j)));
        } else {
          temp.append(" " + m_ClusterCentroids.instance(i).value(j));
        }
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }
Example #8
  /**
   * Tests a certain range of attributes of the given data, whether it can be processed by the
   * handler, given its capabilities. Classifiers implementing the <code>
   * MultiInstanceCapabilitiesHandler</code> interface are checked automatically for their
   * multi-instance Capabilities (if no bags, then only the bag-structure, otherwise only the first
   * bag).
   *
   * @param data the data to test
   * @param fromIndex the range of attributes - start (incl.)
   * @param toIndex the range of attributes - end (incl.)
   * @return true if all the tests succeeded
   * @see MultiInstanceCapabilitiesHandler
   * @see #m_InstancesTest
   * @see #m_MissingValuesTest
   * @see #m_MissingClassValuesTest
   * @see #m_MinimumNumberInstancesTest
   */
  public boolean test(Instances data, int fromIndex, int toIndex) {
    int i;
    int n;
    int m;
    Attribute att;
    Instance inst;
    boolean testClass;
    Capabilities cap;
    boolean missing;
    Iterator iter;

    // shall we test the data?
    if (!m_InstancesTest) return true;

    // no Capabilities? -> warning
    if ((m_Capabilities.size() == 0)
        || ((m_Capabilities.size() == 1) && handles(Capability.NO_CLASS)))
      System.err.println(createMessage("No capabilities set!"));

    // any attributes?
    if (toIndex - fromIndex < 0) {
      m_FailReason = new WekaException(createMessage("No attributes!"));
      return false;
    }

    // do we need to test the class attribute, i.e., is the class attribute
    // within the range of attributes?
    testClass =
        (data.classIndex() > -1)
            && (data.classIndex() >= fromIndex)
            && (data.classIndex() <= toIndex);

    // attributes
    for (i = fromIndex; i <= toIndex; i++) {
      att = data.attribute(i);

      // class is handled separately
      if (i == data.classIndex()) continue;

      // check attribute types
      if (!test(att)) return false;
    }

    // class
    if (!handles(Capability.NO_CLASS) && (data.classIndex() == -1)) {
      m_FailReason = new UnassignedClassException(createMessage("Class attribute not set!"));
      return false;
    }

    // special case: no class attribute can be handled
    if (handles(Capability.NO_CLASS) && (data.classIndex() > -1)) {
      cap = getClassCapabilities();
      cap.disable(Capability.NO_CLASS);
      iter = cap.capabilities();
      if (!iter.hasNext()) {
        m_FailReason = new WekaException(createMessage("Cannot handle any class attribute!"));
        return false;
      }
    }

    if (testClass && !handles(Capability.NO_CLASS)) {
      att = data.classAttribute();
      if (!test(att, true)) return false;

      // special handling of RELATIONAL class
      // TODO: store additional Capabilities for this case

      // missing class labels
      if (m_MissingClassValuesTest) {
        if (!handles(Capability.MISSING_CLASS_VALUES)) {
          for (i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).classIsMissing()) {
              m_FailReason =
                  new WekaException(createMessage("Cannot handle missing class values!"));
              return false;
            }
          }
        } else {
          if (m_MinimumNumberInstancesTest) {
            int hasClass = 0;

            for (i = 0; i < data.numInstances(); i++) {
              if (!data.instance(i).classIsMissing()) hasClass++;
            }

            // not enough instances with class labels?
            if (hasClass < getMinimumNumberInstances()) {
              m_FailReason =
                  new WekaException(
                      createMessage(
                          "Not enough training instances with class labels (required: "
                              + getMinimumNumberInstances()
                              + ", provided: "
                              + hasClass
                              + ")!"));
              return false;
            }
          }
        }
      }
    }

    // missing values
    if (m_MissingValuesTest) {
      if (!handles(Capability.MISSING_VALUES)) {
        missing = false;
        for (i = 0; i < data.numInstances(); i++) {
          inst = data.instance(i);

          if (inst instanceof SparseInstance) {
            for (m = 0; m < inst.numValues(); m++) {
              n = inst.index(m);

              // out of scope?
              if (n < fromIndex) continue;
              if (n > toIndex) break;

              // skip class
              if (n == inst.classIndex()) continue;

              if (inst.isMissing(n)) {
                missing = true;
                break;
              }
            }
          } else {
            for (n = fromIndex; n <= toIndex; n++) {
              // skip class
              if (n == inst.classIndex()) continue;

              if (inst.isMissing(n)) {
                missing = true;
                break;
              }
            }
          }

          if (missing) {
            m_FailReason =
                new NoSupportForMissingValuesException(
                    createMessage("Cannot handle missing values!"));
            return false;
          }
        }
      }
    }

    // instances
    if (m_MinimumNumberInstancesTest) {
      if (data.numInstances() < getMinimumNumberInstances()) {
        m_FailReason =
            new WekaException(
                createMessage(
                    "Not enough training instances (required: "
                        + getMinimumNumberInstances()
                        + ", provided: "
                        + data.numInstances()
                        + ")!"));
        return false;
      }
    }

    // Multi-Instance? -> check structure (regardless of attribute range!)
    if (handles(Capability.ONLY_MULTIINSTANCE)) {
      // number of attributes?
      if (data.numAttributes() != 3) {
        m_FailReason =
            new WekaException(
                createMessage("Incorrect Multi-Instance format, must be 'bag-id, bag, class'!"));
        return false;
      }

      // type of attributes and position of class?
      if (!data.attribute(0).isNominal()
          || !data.attribute(1).isRelationValued()
          || (data.classIndex() != data.numAttributes() - 1)) {
        m_FailReason =
            new WekaException(
                createMessage(
                    "Incorrect Multi-Instance format, must be 'NOMINAL att, RELATIONAL att, CLASS att'!"));
        return false;
      }

      // check data immediately
      if (getOwner() instanceof MultiInstanceCapabilitiesHandler) {
        MultiInstanceCapabilitiesHandler handler = (MultiInstanceCapabilitiesHandler) getOwner();
        cap = handler.getMultiInstanceCapabilities();
        boolean result;
        if (data.numInstances() > 0) result = cap.test(data.attribute(1).relation(0));
        else result = cap.test(data.attribute(1).relation());

        if (!result) {
          m_FailReason = cap.m_FailReason;
          return false;
        }
      }
    }

    // passed all tests!
    return true;
  }
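A minimal usage sketch for the range-based check: obtain a scheme's Capabilities and test a dataset (or a slice of its attributes) before training. The choice of J48 as the scheme is an illustrative assumption, not part of the original code.

  // Hedged usage sketch: reject unsuitable data before calling buildClassifier.
  public static void checkData(weka.core.Instances data) throws Exception {
    weka.core.Capabilities cap = new weka.classifiers.trees.J48().getCapabilities();
    if (!cap.test(data, 0, data.numAttributes() - 1)) { // both indices inclusive
      System.err.println("Data rejected: " + cap.getFailReason());
    }
    cap.testWithFail(data); // same check, but throws a descriptive exception
  }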
Example #9
 /**
  * Tests the given data, whether it can be processed by the handler, given its capabilities.
  * Classifiers implementing the <code>MultiInstanceCapabilitiesHandler</code> interface are
  * checked automatically for their multi-instance Capabilities (if no bags, then only the
  * bag-structure, otherwise only the first bag).
  *
  * @param data the data to test
  * @return true if all the tests succeeded
  * @see #test(Instances, int, int)
  */
 public boolean test(Instances data) {
   return test(data, 0, data.numAttributes() - 1);
 }
Example #10
  /**
   * returns a Capabilities object specific for this data. The minimum number of instances is not
   * set, the check for multi-instance data is optional.
   *
   * @param data the data to base the capabilities on
   * @param multi if true then the structure is checked, too
   * @return a data-specific capabilities object
   * @throws Exception in case an error occurs, e.g., an unknown attribute type
   */
  public static Capabilities forInstances(Instances data, boolean multi) throws Exception {
    Capabilities result;
    Capabilities multiInstance;
    int i;
    int n;
    int m;
    Instance inst;
    boolean missing;

    result = new Capabilities(null);

    // class
    if (data.classIndex() == -1) {
      result.enable(Capability.NO_CLASS);
    } else {
      switch (data.classAttribute().type()) {
        case Attribute.NOMINAL:
          if (data.classAttribute().numValues() == 1) result.enable(Capability.UNARY_CLASS);
          else if (data.classAttribute().numValues() == 2) result.enable(Capability.BINARY_CLASS);
          else result.enable(Capability.NOMINAL_CLASS);
          break;

        case Attribute.NUMERIC:
          result.enable(Capability.NUMERIC_CLASS);
          break;

        case Attribute.STRING:
          result.enable(Capability.STRING_CLASS);
          break;

        case Attribute.DATE:
          result.enable(Capability.DATE_CLASS);
          break;

        case Attribute.RELATIONAL:
          result.enable(Capability.RELATIONAL_CLASS);
          break;

        default:
          throw new UnsupportedAttributeTypeException(
              "Unknown class attribute type '" + data.classAttribute() + "'!");
      }

      // missing class values
      for (i = 0; i < data.numInstances(); i++) {
        if (data.instance(i).classIsMissing()) {
          result.enable(Capability.MISSING_CLASS_VALUES);
          break;
        }
      }
    }

    // attributes
    for (i = 0; i < data.numAttributes(); i++) {
      // skip class
      if (i == data.classIndex()) continue;

      switch (data.attribute(i).type()) {
        case Attribute.NOMINAL:
          result.enable(Capability.UNARY_ATTRIBUTES);
          if (data.attribute(i).numValues() == 2) result.enable(Capability.BINARY_ATTRIBUTES);
          else if (data.attribute(i).numValues() > 2) result.enable(Capability.NOMINAL_ATTRIBUTES);
          break;

        case Attribute.NUMERIC:
          result.enable(Capability.NUMERIC_ATTRIBUTES);
          break;

        case Attribute.DATE:
          result.enable(Capability.DATE_ATTRIBUTES);
          break;

        case Attribute.STRING:
          result.enable(Capability.STRING_ATTRIBUTES);
          break;

        case Attribute.RELATIONAL:
          result.enable(Capability.RELATIONAL_ATTRIBUTES);
          break;

        default:
          throw new UnsupportedAttributeTypeException(
              "Unknown attribute type '" + data.attribute(i).type() + "'!");
      }
    }

    // missing values
    missing = false;
    for (i = 0; i < data.numInstances(); i++) {
      inst = data.instance(i);

      if (inst instanceof SparseInstance) {
        for (m = 0; m < inst.numValues(); m++) {
          n = inst.index(m);

          // skip class
          if (n == inst.classIndex()) continue;

          if (inst.isMissing(n)) {
            missing = true;
            break;
          }
        }
      } else {
        for (n = 0; n < data.numAttributes(); n++) {
          // skip class
          if (n == inst.classIndex()) continue;

          if (inst.isMissing(n)) {
            missing = true;
            break;
          }
        }
      }

      if (missing) {
        result.enable(Capability.MISSING_VALUES);
        break;
      }
    }

    // multi-instance data?
    if (multi) {
      if ((data.numAttributes() == 3)
          && (data.attribute(0).isNominal()) // bag-id
          && (data.attribute(1).isRelationValued()) // bag
          && (data.classIndex() == data.numAttributes() - 1)) {
        multiInstance = new Capabilities(null);
        multiInstance.or(result.getClassCapabilities());
        multiInstance.enable(Capability.NOMINAL_ATTRIBUTES);
        multiInstance.enable(Capability.RELATIONAL_ATTRIBUTES);
        multiInstance.enable(Capability.ONLY_MULTIINSTANCE);
        result.assign(multiInstance);
      }
    }

    return result;
  }
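A short sketch of how forInstances is typically used: derive the capabilities a dataset demands and compare them with what a scheme offers. Treat the supports(...) comparison and the J48 scheme as assumptions made for illustration.

  // Hedged usage sketch: does the scheme cover everything this dataset requires?
  public static boolean schemeHandles(weka.core.Instances data) throws Exception {
    weka.core.Capabilities dataCaps = Capabilities.forInstances(data, true); // also inspect bag structure
    weka.core.Capabilities schemeCaps = new weka.classifiers.trees.J48().getCapabilities();
    return schemeCaps.supports(dataCaps);
  }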
Example #11
  /**
   * Converts a table to a set of instances, with <b>columns</b> representing individual
   * <b>instances</b> and <b>rows</b> representing <b>attributes</b> (e.g. as is common with
   * microarray data).
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
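The rowIsNumeric and getRowValues helpers are referenced but not shown. A plausible sketch of the first one, assuming missing values are marked with "?" as in the loop above (an assumption, not the original code):

  // Hedged sketch: a row is treated as numeric if every non-missing cell parses
  // as a double; "?" cells are ignored when deciding the type.
  protected boolean rowIsNumeric(Table t, int r) {
    for (int c = 0; c < t.numCols; c++) {
      String val = (String) t.matrix.getQuick(r, c);
      if (val.equals("?")) continue;
      try {
        Double.parseDouble(val);
      } catch (NumberFormatException e) {
        return false;
      }
    }
    return true;
  }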
Example #12
  /**
   * Converts a table to a set of instances, with <b>rows</b> representing individual
   * <b>instances</b> and <b>columns</b> representing <b>attributes</b>.
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Example #13
  /**
   * If we know in advance that the table is numeric, we can optimize a lot... For example, on a
   * 9803 x 294 table, TableFileLoader.readNumeric takes 6s compared to 12s for WekaMine
   * readFromTable.
   */
  public static Instances readNumeric(String fileName, String relationName, String delimiter)
      throws Exception {

    int numAttributes = FileUtils.fastCountLines(fileName) - 1; // -1 exclude heading.
    String[] attrNames = new String[numAttributes];

    // Read the col headings and figure out the number of columns in the table..
    BufferedReader reader = new BufferedReader(new FileReader(fileName), 4194304);
    String line = reader.readLine();
    String[] instanceNames = parseColNames(line, delimiter);
    int numInstances = instanceNames.length;

    System.err.print("reading " + numAttributes + " x " + numInstances + " table..");

    // Create an array to hold the data as we read it in...
    double dataArray[][] = new double[numAttributes][numInstances];

    // Populate the matrix with values...
    String valToken = "";
    try {
      int rowIdx = 0;
      while ((line = reader.readLine()) != null) {

        String[] tokens = line.split(delimiter, -1);
        attrNames[rowIdx] = tokens[0].trim();
        for (int colIdx = 0; colIdx < (tokens.length - 1); colIdx++) {
          valToken = tokens[colIdx + 1];
          double value;

          if (valToken.equals("null")) {
            value = Instance.missingValue();
          } else if (valToken.equals("?")) {
            value = Instance.missingValue();
          } else if (valToken.equals("NA")) {
            value = Instance.missingValue();
          } else if (valToken.equals("")) {
            value = Instance.missingValue();
            // }else value = DoubleParser.lightningParse(valToken); // faster double parser with
            // MANY assumptions
          } else value = Double.parseDouble(valToken);
          dataArray[rowIdx][colIdx] = value;
        }
        rowIdx++;
      }
    } catch (NumberFormatException e) {
      System.err.println(e.toString());
      System.err.println("Parsing line: " + line);
      System.err.println("Parsing token: " + valToken);
    }

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    for (int a = 0; a < numAttributes; a++) {
      atts.addElement(new Attribute(attrNames[a]));
    }

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    System.err.print("creating instances..");

    // System.err.println("DEBUG: numAttributes "+numAttributes);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < numInstances; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      for (int r = 0; r < numAttributes; r++) {
        double val = dataArray[r][c];
        vals[r] = val;
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());
    // System.err.println("DEBUG: data.relationNAme"+data.relationName());
    System.err.print("add feature names..");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    Instances newData = new Instances(data);
    newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
    int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

    for (int c = 0; c < numInstances; c++) {
      newData.instance(c).setValue(attrIdx, instanceNames[c]);
    }
    data = newData;

    // System.err.println("DEBUG: data.numInstances: "+data.numInstances());
    // System.err.println("DEBUG: data.numAttributes: "+data.numAttributes());

    return (data);
  }
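A usage sketch for readNumeric; the file name is hypothetical and assumes a tab-delimited matrix with instance names in the header row and attribute names in the first column.

  public static void main(String[] args) throws Exception {
    // Hedged usage sketch (hypothetical file name and delimiter).
    Instances data = TableFileLoader.readNumeric("expression_matrix.tab", "expression", "\t");
    System.err.println(data.numInstances() + " instances, " + data.numAttributes() + " attributes");
  }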
Example #14
  // Tri-training learning process
  public void training() {
    int length_L = 0;
    int up_int = 0;
    double temp = 0.0;

    // Loop until no classifier is updated in a full round
    while (update1 || update2 || update3) {
      update1 = false;
      update2 = false;
      update3 = false;
      for (int i = 0; i < 3; i++) {

        ins_Array[i] = new Instances(testIns, 0);

        ins_Array[i].setClassIndex(testIns.numAttributes() - 1);

        switch (i) {
          case 0:
            j = 1;
            k = 2;
            break;
          case 1:
            j = 0;
            k = 2;
            break;
          case 2:
            j = 0;
            k = 1;
            break;
        }

        // Get the combined error rate of the other two classifiers j and k, used to improve classifier i
        err_Array[i] = measureBothError(class_Array[j], class_Array[k], this.unlabeledIns);

        // If the error rate of classifiers j and k is lower than in the previous round,
        // use them to label samples for classifier i
        if (err_Array[i] < error[i]) {

          // Collect into ins_Array[i] the samples on which classifiers j and k make the same prediction
          this.updateL(class_Array[j], class_Array[k], ins_Array[i], this.unlabeledIns);

          length_L = ins_Array[i].numInstances();

          if (length[i] == 0) {

            //	System.out.println("err_array[i] =" + err_Array[i] + " err=" + error );
            length[i] = this.getDownInt(err_Array[i], error[i]);
            //	System.out.println("length[i] =" + length[i]);
          }

          if (length[i] < length_L) {
            if (err_Array[i] * length_L < error[i] * length[i]) {
              this.update[i] = true;
            } else if (length[i] > (err_Array[i] / (error[i] - err_Array[i]))) {
              up_int = this.getUpInt(err_Array[i], error[i], length[i]);
              //	System.out.println("err_array[i] =" + err_Array[i] + " err=" + error + "length:=" +
              // length[i]);
              //	System.out.println("up_int=" + up_int );
              this.SubSample(this.ins_Array[i], up_int);
              this.update[i] = true;
            }
          }
        }
      }

      // Update the classifiers
      for (int i = 0; i < 3; i++) {
        // If update[i] is true, rebuild classifier i
        if (this.update[i]) {
          try {
            this.class_Array[i].buildClassifier(Util.add(this.instan_Array[i], this.ins_Array[i]));
            temp = Util.errorRate(this.class_Array[i], this.testIns);

            // If the error rate after the update is higher than before, restore the classifier
            // to its previous state. This differs slightly from the algorithm in the paper,
            // which does not include this check.
            if (temp > err_Classifier[i]) {
              this.update[i] = false;
              this.class_Array[i].buildClassifier(this.instan_Array[i]);
            } else {
              // If the error rate decreased, update length[i] and error[i]
              length[i] = this.ins_Array[i].numInstances();
              error[i] = err_Array[i];
            }
          } catch (Exception e) {
            System.out.println(e);
          }
        }
      }
    }
  }
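The getDownInt and getUpInt helpers are not shown. One plausible reading, following the size estimates in the tri-training paper (Zhou & Li, 2005) and the threshold used in the loop above, is sketched below; the exact formulas are assumptions.

  // Hedged sketch: initial size assumed for the previously labeled set when none
  // exists yet, derived from err / (prevErr - err) as used in the condition above.
  private int getDownInt(double err, double prevErr) {
    return (int) Math.floor(err / (prevErr - err)) + 1;
  }

  // Hedged sketch: subsampling size chosen so that err * size stays below prevErr * prevSize.
  private int getUpInt(double err, double prevErr, int prevSize) {
    return (int) Math.ceil(prevErr * prevSize / err - 1);
  }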
Example #15
  /**
   * Returns a description of the classifier.
   *
   * @return a description of the classifier as a string.
   */
  public String toString() {

    if (m_entries == null) {
      return "Decision Table: No model built yet.";
    } else {
      StringBuffer text = new StringBuffer();

      text.append(
          "Decision Table:"
              + "\n\nNumber of training instances: "
              + m_numInstances
              + "\nNumber of Rules : "
              + m_entries.size()
              + "\n");

      if (m_useIBk) {
        text.append("Non matches covered by IB1.\n");
      } else {
        text.append("Non matches covered by Majority class.\n");
      }

      text.append(m_search.toString());
      /*text.append("Best first search for feature set,\nterminated after "+
      m_maxStale+" non improving subsets.\n"); */

      text.append("Evaluation (for feature selection): CV ");
      if (m_CVFolds > 1) {
        text.append("(" + m_CVFolds + " fold) ");
      } else {
        text.append("(leave one out) ");
      }
      text.append("\nFeature set: " + printFeatures());

      if (m_displayRules) {

        // find out the max column width
        int maxColWidth = 0;
        for (int i = 0; i < m_dtInstances.numAttributes(); i++) {
          if (m_dtInstances.attribute(i).name().length() > maxColWidth) {
            maxColWidth = m_dtInstances.attribute(i).name().length();
          }

          if (m_classIsNominal || (i != m_dtInstances.classIndex())) {
            Enumeration e = m_dtInstances.attribute(i).enumerateValues();
            while (e.hasMoreElements()) {
              String ss = (String) e.nextElement();
              if (ss.length() > maxColWidth) {
                maxColWidth = ss.length();
              }
            }
          }
        }

        text.append("\n\nRules:\n");
        StringBuffer tm = new StringBuffer();
        for (int i = 0; i < m_dtInstances.numAttributes(); i++) {
          if (m_dtInstances.classIndex() != i) {
            int d = maxColWidth - m_dtInstances.attribute(i).name().length();
            tm.append(m_dtInstances.attribute(i).name());
            for (int j = 0; j < d + 1; j++) {
              tm.append(" ");
            }
          }
        }
        tm.append(m_dtInstances.attribute(m_dtInstances.classIndex()).name() + "  ");

        for (int i = 0; i < tm.length() + 10; i++) {
          text.append("=");
        }
        text.append("\n");
        text.append(tm);
        text.append("\n");
        for (int i = 0; i < tm.length() + 10; i++) {
          text.append("=");
        }
        text.append("\n");

        Enumeration e = m_entries.keys();
        while (e.hasMoreElements()) {
          DecisionTableHashKey tt = (DecisionTableHashKey) e.nextElement();
          text.append(tt.toString(m_dtInstances, maxColWidth));
          double[] ClassDist = (double[]) m_entries.get(tt);

          if (m_classIsNominal) {
            int m = Utils.maxIndex(ClassDist);
            try {
              text.append(m_dtInstances.classAttribute().value(m) + "\n");
            } catch (Exception ee) {
              System.out.println(ee.getMessage());
            }
          } else {
            text.append((ClassDist[0] / ClassDist[1]) + "\n");
          }
        }

        for (int i = 0; i < tm.length() + 10; i++) {
          text.append("=");
        }
        text.append("\n");
        text.append("\n");
      }
      return text.toString();
    }
  }
Example #16
  /**
   * Generates the classifier.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    m_theInstances = new Instances(data);
    m_theInstances.deleteWithMissingClass();

    m_rr = new Random(1);

    if (m_theInstances.classAttribute().isNominal()) { // 	 Set up class priors
      m_classPriorCounts = new double[data.classAttribute().numValues()];
      Arrays.fill(m_classPriorCounts, 1.0);
      for (int i = 0; i < data.numInstances(); i++) {
        Instance curr = data.instance(i);
        m_classPriorCounts[(int) curr.classValue()] += curr.weight();
      }
      m_classPriors = m_classPriorCounts.clone();
      Utils.normalize(m_classPriors);
    }

    setUpEvaluator();

    if (m_theInstances.classAttribute().isNumeric()) {
      m_disTransform = new weka.filters.unsupervised.attribute.Discretize();
      m_classIsNominal = false;

      // use binned discretisation if the class is numeric
      ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setBins(10);
      ((weka.filters.unsupervised.attribute.Discretize) m_disTransform).setInvertSelection(true);

      // Discretize all attributes EXCEPT the class
      String rangeList = "";
      rangeList += (m_theInstances.classIndex() + 1);
      // System.out.println("The class col: "+m_theInstances.classIndex());

      ((weka.filters.unsupervised.attribute.Discretize) m_disTransform)
          .setAttributeIndices(rangeList);
    } else {
      m_disTransform = new weka.filters.supervised.attribute.Discretize();
      ((weka.filters.supervised.attribute.Discretize) m_disTransform).setUseBetterEncoding(true);
      m_classIsNominal = true;
    }

    m_disTransform.setInputFormat(m_theInstances);
    m_theInstances = Filter.useFilter(m_theInstances, m_disTransform);

    m_numAttributes = m_theInstances.numAttributes();
    m_numInstances = m_theInstances.numInstances();
    m_majority = m_theInstances.meanOrMode(m_theInstances.classAttribute());

    // Perform the search
    int[] selected = m_search.search(m_evaluator, m_theInstances);

    m_decisionFeatures = new int[selected.length + 1];
    System.arraycopy(selected, 0, m_decisionFeatures, 0, selected.length);
    m_decisionFeatures[m_decisionFeatures.length - 1] = m_theInstances.classIndex();

    // reduce instances to selected features
    m_delTransform = new Remove();
    m_delTransform.setInvertSelection(true);

    // set features to keep
    m_delTransform.setAttributeIndicesArray(m_decisionFeatures);
    m_delTransform.setInputFormat(m_theInstances);
    m_dtInstances = Filter.useFilter(m_theInstances, m_delTransform);

    // reset the number of attributes
    m_numAttributes = m_dtInstances.numAttributes();

    // create hash table
    m_entries = new Hashtable((int) (m_dtInstances.numInstances() * 1.5));

    // insert instances into the hash table
    for (int i = 0; i < m_numInstances; i++) {
      Instance inst = m_dtInstances.instance(i);
      insertIntoTable(inst, null);
    }

    // Replace the global table majority with nearest neighbour?
    if (m_useIBk) {
      m_ibk = new IBk();
      m_ibk.buildClassifier(m_theInstances);
    }

    // Save memory
    if (m_saveMemory) {
      m_theInstances = new Instances(m_theInstances, 0);
      m_dtInstances = new Instances(m_dtInstances, 0);
    }
    m_evaluation = null;
  }
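A minimal end-to-end sketch of training the classifier built above and printing its table; the ARFF file name is hypothetical, and setDisplayRules is assumed to be the setter behind m_displayRules.

  public static void main(String[] args) throws Exception {
    // Hedged usage sketch for DecisionTable (hypothetical data file).
    Instances data = weka.core.converters.ConverterUtils.DataSource.read("data.arff");
    data.setClassIndex(data.numAttributes() - 1);

    weka.classifiers.rules.DecisionTable dt = new weka.classifiers.rules.DecisionTable();
    dt.setDisplayRules(true); // include the rule listing in toString()
    dt.buildClassifier(data); // runs the buildClassifier shown in Example #16
    System.out.println(dt);   // prints the description from toString() (Example #15)
  }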