/**
 * Refreshes the attribute selection: sets the upper bound of {@code m_AttributeIndices} to the
 * index of the last attribute of {@code m_Data} and records in {@code m_ActiveIndices}, for every
 * attribute, whether it falls inside that range.
 */
 protected void initializeAttributeIndices() {
   final int attributeCount = m_Data.numAttributes();
   m_AttributeIndices.setUpper(attributeCount - 1);

   final boolean[] active = new boolean[attributeCount];
   m_ActiveIndices = active;
   int idx = 0;
   while (idx < active.length) {
     active[idx] = m_AttributeIndices.isInRange(idx);
     idx++;
   }
 }
Exemple #2
0
  /**
   * Declares the last attribute of the test, labeled and unlabeled datasets to be their class
   * attribute and stores the three classifiers in {@code class_Array}.
   */
  private void Init() {
    final int testClassPos = testIns.numAttributes() - 1;
    testIns.setClassIndex(testClassPos);

    final int labeledClassPos = labeledIns.numAttributes() - 1;
    labeledIns.setClassIndex(labeledClassPos);

    final int unlabeledClassPos = unlabeledIns.numAttributes() - 1;
    unlabeledIns.setClassIndex(unlabeledClassPos);

    // slots 0..2 hold the three classifiers in declaration order
    class_Array[0] = classifier1;
    class_Array[1] = classifier2;
    class_Array[2] = classifier3;
  }
Exemple #3
0
  /**
   * Allocates the per-attribute minimum and maximum arrays, marks every entry as unknown (NaN) and
   * then passes each instance of the dataset to {@code updateMinMax}.
   *
   * @param data the dataset whose attributes and instances are scanned
   */
  protected void initMinMax(Instances data) {
    final int attCount = data.numAttributes();
    m_Min = new double[attCount];
    m_Max = new double[attCount];

    int att = 0;
    while (att < attCount) {
      m_Max[att] = Double.NaN;
      m_Min[att] = Double.NaN;
      att++;
    }

    int row = 0;
    while (row < data.numInstances()) {
      updateMinMax(data.instance(row));
      row++;
    }
  }
  /**
   * Determines the output format based on the input format and returns this. Every numeric
   * attribute selected by {@code m_Cols} is replaced by a nominal attribute whose labels are the
   * sorted distinct (non-missing) values found in the input data; all other attributes are copied
   * over unchanged. In case the output format cannot be returned immediately, i.e.,
   * immediateOutputFormat() returns false, then this method will be called from batchFinished().
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   * @see #hasImmediateOutputFormat()
   * @see #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    m_Cols.setUpper(inputFormat.numAttributes() - 1);

    Instances source = new Instances(inputFormat);
    FastVector outputAtts = new FastVector();

    for (int attIdx = 0; attIdx < source.numAttributes(); attIdx++) {
      Attribute current = source.attribute(attIdx);

      // attributes outside the selection, or non-numeric ones, pass through untouched
      boolean convert = m_Cols.isInRange(attIdx) && current.isNumeric();
      if (!convert) {
        outputAtts.addElement(current);
        continue;
      }

      // dates are collected via their string representation, other numerics as Double
      boolean dateAtt = (current.type() == Attribute.DATE);

      // gather the distinct values present in the data
      HashSet distinct = new HashSet();
      for (int row = 0; row < source.numInstances(); row++) {
        Instance rowInst = source.instance(row);
        if (!rowInst.isMissing(attIdx)) {
          if (dateAtt) {
            distinct.add(rowInst.stringValue(attIdx));
          } else {
            distinct.add(Double.valueOf(rowInst.value(attIdx)));
          }
        }
      }

      // bring them into natural order
      Vector ordered = new Vector(distinct);
      Collections.sort(ordered);

      // turn the ordered values into nominal labels
      FastVector labels = new FastVector();
      for (Object element : ordered) {
        String label =
            dateAtt
                ? element.toString()
                : Utils.doubleToString(((Double) element).doubleValue(), MAX_DECIMALS);
        labels.addElement(label);
      }
      outputAtts.addElement(new Attribute(current.name(), labels));
    }

    Instances header = new Instances(inputFormat.relationName(), outputAtts, 0);
    header.setClassIndex(inputFormat.classIndex());
    return header;
  }
  /**
   * Computes the attribute ranges over every instance of {@code m_Data} and stores them in
   * {@code m_Ranges}. Without a dataset {@code m_Ranges} is reset to null; with an empty dataset
   * the ranges are set up by {@code initializeRangesEmpty}.
   *
   * @return the ranges (null if no dataset is set)
   */
  public double[][] initializeRanges() {
    if (m_Data == null) {
      m_Ranges = null;
      return m_Ranges;
    }

    final int attCount = m_Data.numAttributes();
    final double[][] computed = new double[attCount][3];

    if (m_Data.numInstances() > 0) {
      // the first instance seeds the ranges, all following ones widen them
      updateRangesFirst(m_Data.instance(0), attCount, computed);
      int row = 1;
      while (row < m_Data.numInstances()) {
        updateRanges(m_Data.instance(row), attCount, computed);
        row++;
      }
    } else {
      initializeRangesEmpty(attCount, computed);
    }

    m_Ranges = computed;
    return m_Ranges;
  }
Exemple #6
0
  /**
   * Creates artificial training examples from the statistics stored in {@code m_AttributeStats}.
   * Nominal values are drawn via {@code selectIndexProbabilistically}; numeric values are drawn
   * from a Gaussian using the stored mean ({@code stats[0]}) and standard deviation
   * ({@code stats[1]}). Attributes of any other type are reported on stderr and left at 0.
   *
   * @param artSize size of examples set to create
   * @param data training data
   * @return the set of unlabeled artificial examples
   */
  protected Instances generateArtificialData(int artSize, Instances data) {
    final int attCount = data.numAttributes();
    final Instances generated = new Instances(data, artSize);

    int produced = 0;
    while (produced < artSize) {
      final double[] values = new double[attCount];
      for (int a = 0; a < attCount; a++) {
        final boolean nominal = data.attribute(a).isNominal();
        if (!nominal && !data.attribute(a).isNumeric()) {
          System.err.println("Decorate can only handle numeric and nominal values.");
          continue;
        }

        final double[] stats = (double[]) m_AttributeStats.get(a);
        if (nominal) {
          // pick a label index according to its frequency in the training data
          values[a] = (double) selectIndexProbabilistically(stats);
        } else {
          // sample from the Gaussian described by the attribute's mean and std dev
          values[a] = (m_Random.nextGaussian() * stats[1]) + stats[0];
        }
      }
      generated.add(new Instance(1.0, values));
      produced++;
    }
    return generated;
  }
  /**
   * Housekeeping after optimization has finished: records the kernel statistics and, when the
   * kernel is a {@code PolyKernel} with exponent 1, folds the alphas into a weight vector and
   * drops the references that are no longer needed to save memory.
   *
   * @throws Exception if something goes wrong
   */
  protected void wrapUp() throws Exception {
    m_target = null;

    m_nEvals = m_kernel.numEvals();
    m_nCacheHits = m_kernel.numCacheHits();

    boolean linearKernel = false;
    if (m_SVM.getKernel() instanceof PolyKernel) {
      linearKernel = ((PolyKernel) m_SVM.getKernel()).getExponent() == 1.0;
    }

    if (linearKernel) {
      // accumulate (alpha - alpha*) * x over all support vectors, skipping the class attribute
      final double[] w = new double[m_data.numAttributes()];
      int sv = m_supportVectors.getNext(-1);
      while (sv != -1) {
        for (int att = 0; att < w.length; att++) {
          if (att == m_classIndex) {
            continue;
          }
          w[att] += (m_alpha[sv] - m_alphaStar[sv]) * m_data.instance(sv).value(att);
        }
        sv = m_supportVectors.getNext(sv);
      }
      m_weights = w;

      // the weight vector replaces these
      m_alpha = null;
      m_alphaStar = null;
      m_kernel = null;
    }

    m_bModelBuilt = true;
  }
Exemple #8
0
  /**
   * Compute and store statistics required for generating artificial data.
   *
   * <p>{@code m_AttributeStats} is rebuilt with exactly one slot per attribute, at the attribute's
   * index: for nominal attributes the Laplace-smoothed cumulative probabilities of all labels but
   * the last, for numeric attributes {@code {mean, standard deviation}}. Attributes of any other
   * type are reported on stderr and receive a {@code null} slot so that the following attributes
   * stay index-aligned.
   *
   * @param data training instances
   * @exception Exception if statistics could not be calculated successfully, e.g. a nominal
   *     attribute declares fewer than two values
   */
  protected void computeStats(Instances data) throws Exception {
    int numAttributes = data.numAttributes();
    m_AttributeStats = new Vector(numAttributes); // use to map attributes to their stats

    for (int j = 0; j < numAttributes; j++) {
      if (data.attribute(j).isNominal()) {
        // Compute the probability of occurrence of each distinct value
        int[] nomCounts = (data.attributeStats(j)).nominalCounts;
        double[] counts = new double[nomCounts.length];
        if (counts.length < 2) {
          throw new Exception("Nominal attribute has less than two distinct values!");
        }
        // Perform Laplace smoothing
        for (int i = 0; i < counts.length; i++) {
          counts[i] = nomCounts[i] + 1;
        }
        Utils.normalize(counts);
        // The last cumulative value is left out, hence one entry less than there are labels
        double[] stats = new double[counts.length - 1];
        stats[0] = counts[0];
        // Calculate cumulative probabilities
        for (int i = 1; i < stats.length; i++) {
          stats[i] = stats[i - 1] + counts[i];
        }
        m_AttributeStats.add(j, stats);
      } else if (data.attribute(j).isNumeric()) {
        // Get mean and standard deviation from the training data
        double[] stats = new double[2];
        stats[0] = data.meanOrMode(j);
        stats[1] = Math.sqrt(data.variance(j));
        m_AttributeStats.add(j, stats);
      } else {
        System.err.println("Decorate can only handle numeric and nominal values.");
        // Vector.add(index, element) throws once index exceeds size(); without a placeholder the
        // next supported attribute could not be stored at its own index.
        m_AttributeStats.add(j, null);
      }
    }
  }
Exemple #9
0
  /**
   * Builds the classifier: verifies that the base classifier handles weighted instances, checks
   * the capabilities, drops instances without a class value and hands the remaining training data
   * to the nearest-neighbour search. If the data consists of the class attribute only, a ZeroR
   * model is built instead.
   *
   * @param instances set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
    final boolean handlesWeights = m_Classifier instanceof WeightedInstancesHandler;
    if (!handlesWeights) {
      throw new IllegalArgumentException("Classifier must be a " + "WeightedInstancesHandler!");
    }

    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // work on a copy that contains only instances with a class value
    final Instances cleaned = new Instances(instances);
    cleaned.deleteWithMissingClass();

    // nothing but the class attribute: fall back to ZeroR
    if (cleaned.numAttributes() == 1) {
      System.err.println(
          "Cannot build model (only class attribute present in data!), "
              + "using ZeroR model instead!");
      m_ZeroR = new weka.classifiers.rules.ZeroR();
      m_ZeroR.buildClassifier(cleaned);
      return;
    }
    m_ZeroR = null;

    m_Train = new Instances(cleaned, 0, cleaned.numInstances());
    m_NNSearch.setInstances(m_Train);
  }
Exemple #10
0
  /**
   * Calculates the distance between two instances by walking over the sparse value lists of both
   * in parallel. A value that is stored in only one of the instances is compared against 0; the
   * class attribute is skipped.
   *
   * @param first the first instance
   * @param second the second instance
   * @return the square root of the summed squared differences divided by the number of attributes
   */
  protected double distance(Instance first, Instance second) {
    double sumSq = 0;
    int posA = 0;
    int posB = 0;

    while (posA < first.numValues() || posB < second.numValues()) {
      // an exhausted instance is represented by an index past the last attribute
      final int attA =
          (posA < first.numValues()) ? first.index(posA) : m_instances.numAttributes();
      final int attB =
          (posB < second.numValues()) ? second.index(posB) : m_instances.numAttributes();

      if (attA == m_instances.classIndex()) {
        posA++;
        continue;
      }
      if (attB == m_instances.classIndex()) {
        posB++;
        continue;
      }

      final double delta;
      if (attA < attB) {
        // only the first instance stores this attribute
        delta = difference(attA, first.valueSparse(posA), 0);
        posA++;
      } else if (attA > attB) {
        // only the second instance stores this attribute
        delta = difference(attB, 0, second.valueSparse(posB));
        posB++;
      } else {
        delta = difference(attA, first.valueSparse(posA), second.valueSparse(posB));
        posA++;
        posB++;
      }
      sumSq += delta * delta;
    }

    return Math.sqrt(sumSq / m_instances.numAttributes());
  }
Exemple #11
0
  /**
   * Parses a given list of options.
   *
   * <p>The input file is loaded as ARFF and becomes the data to save; the class index defaults to
   * the last attribute when {@code -c} is absent or given as {@code last}. If an output file is
   * given, its extension is replaced (or appended) so that it ends with the saver's file
   * extension.
   *
   * <p>
   * <!-- options-start -->
   * Valid options are:
   *
   * <p>
   *
   * <pre> -i &lt;the input file&gt;
   * The input file</pre>
   *
   * <pre> -o &lt;the output file&gt;
   * The output file</pre>
   *
   * <pre> -c &lt;the class index&gt;
   * The class index</pre>
   *
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported; an {@code IOException} (carrying the
   *     underlying failure as its cause) if the input cannot be loaded or the output file cannot
   *     be set, or if no input file is given
   */
  public void setOptions(String[] options) throws Exception {

    String outputString = Utils.getOption('o', options);
    String inputString = Utils.getOption('i', options);
    String indexString = Utils.getOption('c', options);

    ArffLoader loader = new ArffLoader();

    resetOptions();

    // parse index; -1 means "last attribute" and is also the default when -c is absent
    int index = -1;
    if (indexString.length() != 0) {
      if (indexString.equals("first")) {
        index = 0;
      } else if (indexString.equals("last")) {
        index = -1;
      } else {
        index = Integer.parseInt(indexString);
      }
    }

    if (inputString.length() == 0) {
      throw new IOException("No data set to save.");
    }
    try {
      File input = new File(inputString);
      loader.setFile(input);
      Instances inst = loader.getDataSet();
      if (index == -1) {
        inst.setClassIndex(inst.numAttributes() - 1);
      } else {
        inst.setClassIndex(index);
      }
      setInstances(inst);
    } catch (Exception ex) {
      // keep the original failure attached so the stack trace is not lost
      IOException failure =
          new IOException(
              "No data set loaded. Data set has to be arff format (Reason: "
                  + ex.toString()
                  + ").");
      failure.initCause(ex);
      throw failure;
    }

    if (outputString.length() != 0) {
      // add appropriate file extension
      if (!outputString.endsWith(getFileExtension())) {
        int lastDot = outputString.lastIndexOf('.');
        if (lastDot != -1) {
          outputString = outputString.substring(0, lastDot) + getFileExtension();
        } else {
          outputString = outputString + getFileExtension();
        }
      }
      try {
        File output = new File(outputString);
        setFile(output);
      } catch (Exception ex) {
        IOException failure = new IOException("Cannot create output file.");
        failure.initCause(ex);
        throw failure;
      }
    }

    // apply the class index to the instances the saver now holds
    if (index == -1) {
      index = getInstances().numAttributes() - 1;
    }
    getInstances().setClassIndex(index);
  }
Exemple #12
0
  /**
   * Sets the format of the input instances and immediately derives the output format, which is
   * the input structure with one additional attribute (of the configured type and name) inserted
   * at the configured position.
   *
   * @param instanceInfo an Instances object containing the input instance structure (any instances
   *     contained in the object are ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @throws Exception if the format couldn't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {
    super.setInputFormat(instanceInfo);

    m_Insert.setUpper(instanceInfo.numAttributes());
    Instances header = new Instances(instanceInfo, 0);

    // build the attribute to add, depending on the requested type
    Attribute added;
    if (m_AttributeType == Attribute.NUMERIC) {
      added = new Attribute(m_Name);
    } else if (m_AttributeType == Attribute.NOMINAL) {
      added = new Attribute(m_Name, m_Labels);
    } else if (m_AttributeType == Attribute.STRING) {
      added = new Attribute(m_Name, (FastVector) null);
    } else if (m_AttributeType == Attribute.DATE) {
      added = new Attribute(m_Name, m_DateFormat);
    } else {
      throw new IllegalArgumentException("Unknown attribute type in Add");
    }

    final int position = m_Insert.getIndex();
    if (position < 0 || position > getInputFormat().numAttributes()) {
      throw new IllegalArgumentException("Index out of range");
    }
    header.insertAttributeAt(added, position);
    setOutputFormat(header);

    // the locators cover every output attribute except the inserted one
    // (otherwise the length of the input/output indices differ)
    Range others = new Range(m_Insert.getSingleIndex());
    others.setInvert(true);
    others.setUpper(header.numAttributes() - 1);
    initOutputLocators(header, others.getSelection());

    return true;
  }
  /**
   * Initializes a gain ratio attribute evaluator. The training data is passed through a
   * {@code Discretize} filter (with better encoding enabled) before the number of class values is
   * read from it.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been generated successfully
   */
  public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainInstances = data;
    m_classIndex = data.classIndex();
    m_numAttribs = data.numAttributes();
    m_numInstances = data.numInstances();

    final Discretize discretizer = new Discretize();
    discretizer.setUseBetterEncoding(true);
    discretizer.setInputFormat(data);
    m_trainInstances = Filter.useFilter(data, discretizer);

    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
  }
  /**
   * Generates a clusterer by means of the spectral clustering algorithm. The pairwise
   * similarities are taken from {@code sim_matrix}, mirrored into a symmetric matrix and handed to
   * {@code partition}; the resulting partition is flattened into the per-instance {@code cluster}
   * assignment.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been generated successfully
   */
  public void buildClusterer(Instances data) throws java.lang.Exception {
    m_Sequences = new Instances(data);
    final int count = data.numInstances();

    final DoubleMatrix2D similarity =
        useSparseMatrix
            ? DoubleFactory2D.sparse.make(count, count)
            : DoubleFactory2D.dense.make(count, count);

    // keep a dense copy of the raw instance values
    final double[][] rows = new double[count][];
    for (int row = 0; row < count; row++) {
      rows[row] = data.instance(row).toDoubleArray();
    }
    v = DoubleFactory2D.dense.make(rows);

    // fill the similarity matrix symmetrically from the upper triangle of sim_matrix
    for (int a = 0; a < count; a++) {
      for (int b = a; b < count; b++) {
        final double s = sim_matrix[a][b];
        similarity.set(a, b, s);
        similarity.set(b, a, s);
      }
    }

    // partition the points
    final int[][] parts = partition(similarity, alpha_star);

    // map every point to the index of the part that contains it
    numOfClusters = parts.length;
    cluster = new int[count];
    for (int part = 0; part < parts.length; part++) {
      for (int member = 0; member < parts[part].length; member++) {
        cluster[parts[part][member]] = part;
      }
    }

    // highest assigned cluster id + 1
    this.numOfClusters = cluster[Utils.maxIndex(cluster)] + 1;
  }
  /**
   * Prints out the classifier: either the attribute weights (when a weight vector is available)
   * or the support vectors with their coefficients, followed by the offset term and the kernel
   * evaluation statistics.
   *
   * @return a description of the classifier as a string
   */
  public String toString() {
    StringBuilder out = new StringBuilder();
    out.append("SMOreg\n\n");

    if (m_weights == null) {
      // no weight vector: list every support vector with a positive coefficient
      out.append("Support vectors:\n");
      for (int i = 0; i < m_nInstances; i++) {
        if (m_alpha[i] > 0) {
          out.append("+").append(m_alpha[i]).append(" * k[").append(i).append("]\n");
        }
        if (m_alphaStar[i] > 0) {
          out.append("-").append(m_alphaStar[i]).append(" * k[").append(i).append("]\n");
        }
      }
    } else {
      // weight vector available: one term per non-class attribute
      out.append("weights (not support vectors):\n");
      for (int att = 0; att < m_data.numAttributes(); att++) {
        if (att == m_classIndex) {
          continue;
        }
        String sign = (m_weights[att] >= 0) ? " + " : " - ";
        out.append(sign)
            .append(Utils.doubleToString(Math.abs(m_weights[att]), 12, 4))
            .append(" * ");

        int filterId = m_SVM.getFilterType().getSelectedTag().getID();
        if (filterId == SMOreg.FILTER_STANDARDIZE) {
          out.append("(standardized) ");
        } else if (filterId == SMOreg.FILTER_NORMALIZE) {
          out.append("(normalized) ");
        }
        out.append(m_data.attribute(att).name()).append("\n");
      }
    }

    String offsetSign = (m_b <= 0) ? " + " : " - ";
    out.append(offsetSign).append(Utils.doubleToString(Math.abs(m_b), 12, 4)).append("\n\n");

    out.append("\n\nNumber of kernel evaluations: ").append(m_nEvals);
    if (m_nCacheHits >= 0 && m_nEvals > 0) {
      double hitRatio = 1 - m_nEvals * 1.0 / (m_nCacheHits + m_nEvals);
      out.append(" (")
          .append(Utils.doubleToString(hitRatio * 100, 7, 3).trim())
          .append("% cached)");
    }

    return out.toString();
  }
Exemple #16
0
  /**
   * Method for building an Id3 tree. The attribute with the highest information gain becomes the
   * split attribute and one successor is built per attribute value; when the best gain is zero
   * the node becomes a leaf holding the normalized class distribution.
   *
   * @param data the training data
   * @exception Exception if decision tree can't be built successfully
   */
  private void makeTree(Instances data) throws Exception {
    // no instances reached this node: leaf with unknown class
    if (data.numInstances() == 0) {
      m_Attribute = null;
      m_ClassValue = Utils.missingValue();
      m_Distribution = new double[data.numClasses()];
      return;
    }

    // information gain per attribute
    double[] gains = new double[data.numAttributes()];
    for (Enumeration atts = data.enumerateAttributes(); atts.hasMoreElements(); ) {
      Attribute candidate = (Attribute) atts.nextElement();
      gains[candidate.index()] = computeInfoGain(data, candidate);
    }
    m_Attribute = data.attribute(Utils.maxIndex(gains));

    // positive gain: split on the chosen attribute and recurse
    if (!Utils.eq(gains[m_Attribute.index()], 0)) {
      Instances[] subsets = splitData(data, m_Attribute);
      m_Successors = new Id3[m_Attribute.numValues()];
      for (int branch = 0; branch < m_Attribute.numValues(); branch++) {
        m_Successors[branch] = new Id3();
        m_Successors[branch].makeTree(subsets[branch]);
      }
      return;
    }

    // zero gain: turn this node into a leaf
    m_Attribute = null;
    m_Distribution = new double[data.numClasses()];
    for (Enumeration insts = data.enumerateInstances(); insts.hasMoreElements(); ) {
      Instance current = (Instance) insts.nextElement();
      m_Distribution[(int) current.classValue()]++;
    }
    Utils.normalize(m_Distribution);
    m_ClassValue = Utils.maxIndex(m_Distribution);
    m_ClassAttribute = data.classAttribute();
  }
  /**
   * Initializes the ranges of a subset of the instances of this dataset. Therefore m_Ranges is not
   * set.
   *
   * <p>If the dataset holds no instances or {@code instList} is empty, the ranges are set up by
   * {@code initializeRangesEmpty}; otherwise the first listed instance seeds the ranges and the
   * remaining listed instances update them.
   *
   * @param instList list of indexes of the subset
   * @return the ranges
   * @throws Exception if no dataset is set
   */
  public double[][] initializeRanges(int[] instList) throws Exception {
    if (m_Data == null) {
      throw new Exception("No instances supplied.");
    }

    int numAtt = m_Data.numAttributes();
    double[][] ranges = new double[numAtt][3];

    // An empty subset must be handled like an empty dataset: there is no instList[0] to seed
    // the ranges from.
    if (m_Data.numInstances() <= 0 || instList.length == 0) {
      initializeRangesEmpty(numAtt, ranges);
      return ranges;
    }

    // initialize ranges using the first instance
    updateRangesFirst(m_Data.instance(instList[0]), numAtt, ranges);
    // update ranges, starting from the second
    for (int i = 1; i < instList.length; i++) {
      updateRanges(m_Data.instance(instList[i]), numAtt, ranges);
    }
    return ranges;
  }
  /**
   * Generates an attribute evaluator. Has to initialise all fields of the evaluator that are not
   * being set via options: the training data (without instances lacking a class value), its
   * dimensions, the similarity relations and finally the fuzzy measure.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been generated successfully
   */
  public void buildEvaluator(Instances data) throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    final Instances training = new Instances(data);
    m_trainInstances = training;
    training.deleteWithMissingClass();

    m_numAttribs = training.numAttributes();
    m_numInstances = training.numInstances();

    // negative when the data has no decision feature
    m_classIndex = training.classIndex();

    // supervised: the decision similarity depends on the type of the class attribute
    if (m_classIndex >= 0) {
      m_isNumeric = training.attribute(m_classIndex).isNumeric();
      m_DecisionSimilarity = m_isNumeric ? m_Similarity : m_SimilarityEq;
    }

    m_Similarity.setInstances(training);
    m_DecisionSimilarity.setInstances(training);
    m_SimilarityEq.setInstances(training);
    m_composition = m_Similarity.getTNorm();

    m_FuzzyMeasure.set(
        m_Similarity,
        m_DecisionSimilarity,
        m_TNorm,
        m_composition,
        m_Implicator,
        m_SNorm,
        m_numInstances,
        m_numAttribs,
        m_classIndex,
        training);
  }
Exemple #19
0
  /**
   * Returns a textual description of this clusterer: a header followed by the attribute values of
   * every cluster centroid (nominal values are printed as labels, all others as numbers).
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    StringBuilder out = new StringBuilder();

    out.append("\n FarthestFirst\n==============\n");
    out.append("\nCluster centroids:\n");

    for (int c = 0; c < m_NumClusters; c++) {
      out.append("\nCluster ").append(c).append("\n\t");
      for (int a = 0; a < m_ClusterCentroids.numAttributes(); a++) {
        out.append(" ");
        if (m_ClusterCentroids.attribute(a).isNominal()) {
          int labelIdx = (int) m_ClusterCentroids.instance(c).value(a);
          out.append(m_ClusterCentroids.attribute(a).value(labelIdx));
        } else {
          out.append(m_ClusterCentroids.instance(c).value(a));
        }
      }
    }

    out.append("\n\n");
    return out.toString();
  }
  /**
   * Converts a table to a set of instances, with <b>columns</b> representing individual
   * <b>instances</b> and <b>rows</b> representing <b>attributes</b> (e.g. as is common with
   * microarray data).
   *
   * <p>Rows for which {@code rowIsNumeric} holds become numeric attributes; every other row
   * becomes a nominal attribute whose labels come from {@code getRowValues}. A cell whose text is
   * {@code "?"} is stored as a missing value. When {@code addInstanceNamesAsFeatures} is set, an
   * {@code ID} attribute holding the column name is inserted at position 0 and
   * {@code instanceNames} is refilled with the column names.
   *
   * @param t the table to convert
   * @param relationName the relation name of the resulting dataset
   * @return the dataset with one instance per table column
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // ---- create instances ----
    // For each instance (= table column)...
    for (int c = 0; c < t.numCols; c++) {
      // Even nominal values are stored as doubles (the index of the label).
      double[] vals = new double[data.numAttributes()];

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        // Compare by content: reference equality (==) only matches interned strings and would
        // let a "?" cell fall through to the nominal lookup or to parseDouble.
        if ("?".equals(val)) {
          vals[r] = Instance.missingValue();
        } else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble(val);
        }
      }
      // Add a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    // ---- add feature names ----
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return data;
  }
  /**
   * Calculates the distance between two instances, giving up early once the running distance
   * exceeds the given cut-off. The sparse value lists of both instances are walked in parallel;
   * the class attribute and attributes that are not active are skipped, and a value stored in
   * only one instance is compared against 0. Depending on the distance function class, post
   * processing of the distances by postProcessDistances(double []) may be required if this
   * function is used.
   *
   * @param first the first instance
   * @param second the second instance
   * @param cutOffValue If the distance being calculated becomes larger than cutOffValue then the
   *     rest of the calculation is discarded.
   * @param stats the performance stats object (may be null, in which case nothing is counted)
   * @return the distance between the two given instances or Double.POSITIVE_INFINITY if the
   *     distance being calculated becomes larger than cutOffValue.
   */
  @Override
  public double distance(
      Instance first, Instance second, double cutOffValue, PerformanceStats stats) {
    double dist = 0;
    final int countA = first.numValues();
    final int countB = second.numValues();
    final int attCount = m_Data.numAttributes();
    final int classIdx = m_Data.classIndex();

    validate();

    int posA = 0;
    int posB = 0;
    while (posA < countA || posB < countB) {
      // an exhausted instance is represented by an index past the last attribute
      final int attA = (posA < countA) ? first.index(posA) : attCount;
      final int attB = (posB < countB) ? second.index(posB) : attCount;

      // skip the class attribute and inactive attributes of the first instance
      if (attA == classIdx || (attA < attCount && !m_ActiveIndices[attA])) {
        posA++;
        continue;
      }
      // ... and of the second instance
      if (attB == classIdx || (attB < attCount && !m_ActiveIndices[attB])) {
        posB++;
        continue;
      }

      final double delta;
      if (attA < attB) {
        // only the first instance stores this attribute
        delta = difference(attA, first.valueSparse(posA), 0);
        posA++;
      } else if (attA > attB) {
        // only the second instance stores this attribute
        delta = difference(attB, 0, second.valueSparse(posB));
        posB++;
      } else {
        delta = difference(attA, first.valueSparse(posA), second.valueSparse(posB));
        posA++;
        posB++;
      }

      if (stats != null) {
        stats.incrCoordCount();
      }

      dist = updateDistance(dist, delta);
      if (dist > cutOffValue) {
        return Double.POSITIVE_INFINITY;
      }
    }

    return dist;
  }
Exemple #22
0
  /**
   * Writes a batch of instances as two files: a names file listing the class labels and the
   * attribute declarations, and a data file holding one comma-separated line per instance with
   * the class value last.
   *
   * <p>If no class attribute is set, the last attribute is used and a notice is printed on
   * stderr.
   *
   * @throws IOException if there are no instances, the class attribute is numeric, incremental
   *     saving has already been started, or the output files/writers are not available
   */
  public void writeBatch() throws IOException {

    Instances instances = getInstances();

    // preconditions
    if (instances == null) throw new IOException("No instances to save");
    if (instances.classIndex() == -1) {
      instances.setClassIndex(instances.numAttributes() - 1);
      System.err.println("No class specified. Last attribute is used as class attribute.");
    }
    if (instances.attribute(instances.classIndex()).isNumeric())
      throw new IOException("To save in C4.5 format the class attribute cannot be numeric.");
    if (getRetrieval() == INCREMENTAL)
      throw new IOException("Batch and incremental saving cannot be mixed.");

    setRetrieval(BATCH);
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException(
          "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option.");
    }
    setWriteMode(WRITE);
    // print names file
    setFileExtension(".names");
    // NOTE(review): neither PrintWriter in this method is closed if a write throws - consider
    // try/finally.
    PrintWriter outW = new PrintWriter(getWriter());
    // first line: the class labels, comma separated and terminated by a period
    for (int i = 0; i < instances.attribute(instances.classIndex()).numValues(); i++) {
      outW.write(instances.attribute(instances.classIndex()).value(i));
      if (i < instances.attribute(instances.classIndex()).numValues() - 1) {
        outW.write(",");
      } else {
        outW.write(".\n");
      }
    }
    // then one "name: ..." line per non-class attribute
    for (int i = 0; i < instances.numAttributes(); i++) {
      if (i != instances.classIndex()) {
        outW.write(instances.attribute(i).name() + ": ");
        if (instances.attribute(i).isNumeric() || instances.attribute(i).isDate()) {
          outW.write("continuous.\n");
        } else {
          // any other attribute type: list its values
          Attribute temp = instances.attribute(i);
          for (int j = 0; j < temp.numValues(); j++) {
            outW.write(temp.value(j));
            if (j < temp.numValues() - 1) {
              outW.write(",");
            } else {
              outW.write(".\n");
            }
          }
        }
      }
    }
    outW.flush();
    outW.close();

    // print data file
    // the data file path is the current file's path with its extension replaced by ".data"
    String out = retrieveFile().getAbsolutePath();
    setFileExtension(".data");
    out = out.substring(0, out.lastIndexOf('.')) + getFileExtension();
    // despite its name, this variable refers to the data file
    File namesFile = new File(out);
    try {
      setFile(namesFile);
    } catch (Exception ex) {
      throw new IOException(
          "Cannot create data file, only names file created (Reason: " + ex.toString() + ").");
    }
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException("Cannot create data file, only names file created.");
    }
    outW = new PrintWriter(getWriter());
    // print data file
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance temp = instances.instance(i);
      // all non-class values first, each followed by a comma
      for (int j = 0; j < temp.numAttributes(); j++) {
        if (j != instances.classIndex()) {
          if (temp.isMissing(j)) {
            outW.write("?,");
          } else if (instances.attribute(j).isNominal() || instances.attribute(j).isString()) {
            // nominal/string values are written as their label
            outW.write(instances.attribute(j).value((int) temp.value(j)) + ",");
          } else {
            outW.write("" + temp.value(j) + ",");
          }
        }
      }
      // write the class value
      if (temp.isMissing(instances.classIndex())) {
        outW.write("?");
      } else {
        outW.write(
            instances
                .attribute(instances.classIndex())
                .value((int) temp.value(instances.classIndex())));
      }
      outW.write("\n");
    }
    outW.flush();
    outW.close();
    // restore the ".names" extension and reset the writer state
    setFileExtension(".names");
    setWriteMode(WAIT);
    outW = null;
    resetWriter();
    setWriteMode(CANCEL);
  }
Exemple #23
0
  /**
   * Fits the rule to the data it overlaps, so that the rule can only interpolate but not
   * extrapolate. For each antecedent's attribute that is numeric and has at least one instance
   * with a non-missing value, a missing bound is added: an antecedent with value 0 whose split
   * point is the k-th smallest value with k = number of instances, and/or an antecedent with
   * value 1 whose split point is the 1st smallest value. Antecedents of other attributes are
   * carried over unchanged.
   *
   * @param instances the data to which the rule shall be fitted
   */
  public void fitAndSetCoreBound(Instances instances) {
    if (m_Antds == null) {
      return;
    }

    // Flag every attribute that is referenced by at least one antecedent.
    boolean[] hasAntecedent = new boolean[instances.numAttributes() - 1];
    for (int k = 0; k < m_Antds.size(); k++) {
      Antd existing = (Antd) m_Antds.elementAt(k);
      hasAntecedent[existing.att.index()] = true;
    }

    FastVector rebuilt = new FastVector(10);
    // NOTE(review): this runs once per antecedent rather than once per attribute, so an
    // attribute referenced by several antecedents is processed repeatedly - confirm intended.
    for (int pos = 0; pos < m_Antds.size(); pos++) {
      int attIndex = ((Antd) m_Antds.elementAt(pos)).getAttr().index();
      if (!hasAntecedent[attIndex]) {
        continue; // excluding non-existent antecedents
      }

      Instances complete = new Instances(instances);
      complete.deleteWithMissing(attIndex);

      boolean fittable = complete.attribute(attIndex).isNumeric() && complete.numInstances() > 0;
      if (!fittable) {
        // Nothing to fit: keep this attribute's antecedents exactly as they are.
        for (int k = 0; k < m_Antds.size(); k++) {
          Antd candidate = (Antd) m_Antds.elementAt(k);
          if (candidate.att.index() == attIndex) {
            rebuilt.addElement(candidate);
          }
        }
        continue;
      }

      // Copy the existing antecedents and remember which of the two sides are present.
      boolean valueZeroPresent = false;
      boolean otherValuePresent = false;
      for (int k = 0; k < m_Antds.size(); k++) {
        Antd candidate = (Antd) m_Antds.elementAt(k);
        if (candidate.att.index() != attIndex) {
          continue;
        }
        if (candidate.value == 0) {
          valueZeroPresent = true;
        } else {
          otherValuePresent = true;
        }
        rebuilt.addElement(candidate);
      }

      if (!valueZeroPresent) {
        double higherCore = complete.kthSmallestValue(attIndex, complete.numInstances());
        NumericAntd bound = new NumericAntd(complete.attribute(attIndex));
        bound.value = 0;
        bound.splitPoint = higherCore;
        rebuilt.addElement(bound);
      }

      if (!otherValuePresent) {
        double lowerCore = complete.kthSmallestValue(attIndex, 1);
        NumericAntd bound = new NumericAntd(complete.attribute(attIndex));
        bound.value = 1;
        bound.splitPoint = lowerCore;
        rebuilt.addElement(bound);
      }
    }
    m_Antds = rebuilt;
  }
Exemple #24
0
  /**
   * Saves an instance incrementally in C4.5 format. The structure has to be set beforehand by
   * using the setStructure() method or the setInstances() method.
   *
   * <p>Once the structure is ready, the header (class values followed by one line per non-class
   * attribute) is written to the current writer, the target file is switched to the ".data"
   * file, and instances are appended to it. Passing {@code null} as instance flushes and closes
   * the writer and resets the structure.
   *
   * @param inst the instance to save, or null to finish writing
   * @throws IOException if an instance cannot be saved incrementally, e.g. because the class
   *     attribute is numeric, batch and incremental saving are mixed, no output file is set, or
   *     the data file cannot be created
   */
  public void writeIncremental(Instance inst) throws IOException {

    int writeMode = getWriteMode();
    Instances structure = getInstances();

    if (structure != null) {
      if (structure.classIndex() == -1) {
        structure.setClassIndex(structure.numAttributes() - 1);
        System.err.println("No class specified. Last attribute is used as class attribute.");
      }
      if (structure.attribute(structure.classIndex()).isNumeric()) {
        throw new IOException("To save in C4.5 format the class attribute cannot be numeric.");
      }
    }
    if (getRetrieval() == BATCH || getRetrieval() == NONE) {
      throw new IOException("Batch and incremental saving cannot be mixed.");
    }
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException(
          "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option.");
    }

    PrintWriter outW = new PrintWriter(getWriter());

    if (writeMode == WAIT) {
      if (structure == null) {
        setWriteMode(CANCEL);
        if (inst != null) {
          System.err.println("Structure(Header Information) has to be set in advance");
        }
      } else {
        setWriteMode(STRUCTURE_READY);
      }
      writeMode = getWriteMode();
    }
    if (writeMode == CANCEL) {
      outW.close();
      cancel();
    }
    if (writeMode == STRUCTURE_READY) {
      setWriteMode(WRITE);
      // write header: here names file
      for (int i = 0; i < structure.attribute(structure.classIndex()).numValues(); i++) {
        outW.write(structure.attribute(structure.classIndex()).value(i));
        if (i < structure.attribute(structure.classIndex()).numValues() - 1) {
          outW.write(",");
        } else {
          outW.write(".\n");
        }
      }
      for (int i = 0; i < structure.numAttributes(); i++) {
        if (i != structure.classIndex()) {
          outW.write(structure.attribute(i).name() + ": ");
          if (structure.attribute(i).isNumeric() || structure.attribute(i).isDate()) {
            outW.write("continuous.\n");
          } else {
            Attribute temp = structure.attribute(i);
            for (int j = 0; j < temp.numValues(); j++) {
              outW.write(temp.value(j));
              if (j < temp.numValues() - 1) {
                outW.write(",");
              } else {
                outW.write(".\n");
              }
            }
          }
        }
      }
      outW.flush();
      outW.close();

      // pick up the mode set above so the instance is written in this same call
      writeMode = getWriteMode();

      // switch the destination from the header file to the matching ".data" file
      String out = retrieveFile().getAbsolutePath();
      setFileExtension(".data");
      out = out.substring(0, out.lastIndexOf('.')) + getFileExtension();
      File dataFile = new File(out);
      try {
        setFile(dataFile);
      } catch (Exception ex) {
        // Report why the data file could not be created instead of discarding the reason;
        // initCause keeps the original stack trace available to callers.
        IOException failure =
            new IOException(
                "Cannot create data file, only names file created (Reason: " + ex.toString() + ").");
        failure.initCause(ex);
        throw failure;
      }
      if (retrieveFile() == null || getWriter() == null) {
        throw new IOException("Cannot create data file, only names file created.");
      }
      outW = new PrintWriter(getWriter());
    }
    if (writeMode == WRITE) {
      if (structure == null) {
        throw new IOException("No instances information available.");
      }
      if (inst != null) {
        // write instance: here data file
        for (int j = 0; j < inst.numAttributes(); j++) {
          if (j != structure.classIndex()) {
            if (inst.isMissing(j)) {
              outW.write("?,");
            } else if (structure.attribute(j).isNominal() || structure.attribute(j).isString()) {
              outW.write(structure.attribute(j).value((int) inst.value(j)) + ",");
            } else {
              outW.write("" + inst.value(j) + ",");
            }
          }
        }
        // write the class value (always last on the line)
        if (inst.isMissing(structure.classIndex())) {
          outW.write("?");
        } else {
          outW.write(
              structure
                  .attribute(structure.classIndex())
                  .value((int) inst.value(structure.classIndex())));
        }
        outW.write("\n");
        // flushes every 100 instances
        m_incrementalCounter++;
        if (m_incrementalCounter > 100) {
          m_incrementalCounter = 0;
          outW.flush();
        }
      } else {
        // close: a null instance signals the end of the incremental run
        outW.flush();
        outW.close();
        setFileExtension(".names");
        m_incrementalCounter = 0;
        resetStructure();
        resetWriter();
      }
    }
  }
Exemple #25
0
  /**
   * Tests a certain range of attributes of the given data, whether it can be processed by the
   * handler, given its capabilities. Classifiers implementing the <code>
   * MultiInstanceCapabilitiesHandler</code> interface are checked automatically for their
   * multi-instance Capabilities (if no bags, then only the bag-structure, otherwise only the first
   * bag).
   *
   * <p>The checks run in a fixed order: attribute types, presence and type of the class
   * attribute, missing class values, missing attribute values, minimum number of instances and
   * finally the multi-instance structure. The first failing check ends the test; the checks
   * performed directly in this method store the reason in <code>m_FailReason</code> before
   * returning false.
   *
   * @param data the data to test
   * @param fromIndex the range of attributes - start (incl.)
   * @param toIndex the range of attributes - end (incl.)
   * @return true if all the tests succeeded (or if testing of instances is switched off)
   * @see MultiInstanceCapabilitiesHandler
   * @see #m_InstancesTest
   * @see #m_MissingValuesTest
   * @see #m_MissingClassValuesTest
   * @see #m_MinimumNumberInstancesTest
   */
  public boolean test(Instances data, int fromIndex, int toIndex) {
    int i;
    int n;
    int m;
    Attribute att;
    Instance inst;
    boolean testClass;
    Capabilities cap;
    boolean missing;
    Iterator iter;

    // shall we test the data?
    if (!m_InstancesTest) return true;

    // no Capabilities? -> warning (only printed, the test continues)
    if ((m_Capabilities.size() == 0)
        || ((m_Capabilities.size() == 1) && handles(Capability.NO_CLASS)))
      System.err.println(createMessage("No capabilities set!"));

    // any attributes? (an empty range has toIndex < fromIndex)
    if (toIndex - fromIndex < 0) {
      m_FailReason = new WekaException(createMessage("No attributes!"));
      return false;
    }

    // do we need to test the class attribute, i.e., is the class attribute
    // within the range of attributes?
    testClass =
        (data.classIndex() > -1)
            && (data.classIndex() >= fromIndex)
            && (data.classIndex() <= toIndex);

    // attributes
    for (i = fromIndex; i <= toIndex; i++) {
      att = data.attribute(i);

      // class is handled separately
      if (i == data.classIndex()) continue;

      // check attribute types
      if (!test(att)) return false;
    }

    // class
    if (!handles(Capability.NO_CLASS) && (data.classIndex() == -1)) {
      m_FailReason = new UnassignedClassException(createMessage("Class attribute not set!"));
      return false;
    }

    // special case: no class attribute can be handled
    if (handles(Capability.NO_CLASS) && (data.classIndex() > -1)) {
      // NOTE(review): presumably getClassCapabilities() returns a separate object, so disabling
      // NO_CLASS on it does not alter this instance - confirm
      cap = getClassCapabilities();
      cap.disable(Capability.NO_CLASS);
      iter = cap.capabilities();
      // nothing left besides NO_CLASS -> data with a class attribute cannot be handled
      if (!iter.hasNext()) {
        m_FailReason = new WekaException(createMessage("Cannot handle any class attribute!"));
        return false;
      }
    }

    if (testClass && !handles(Capability.NO_CLASS)) {
      att = data.classAttribute();
      if (!test(att, true)) return false;

      // special handling of RELATIONAL class
      // TODO: store additional Capabilities for this case

      // missing class labels
      if (m_MissingClassValuesTest) {
        if (!handles(Capability.MISSING_CLASS_VALUES)) {
          // a single instance without class label is enough to fail
          for (i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).classIsMissing()) {
              m_FailReason =
                  new WekaException(createMessage("Cannot handle missing class values!"));
              return false;
            }
          }
        } else {
          if (m_MinimumNumberInstancesTest) {
            // count only the instances that actually carry a class label
            int hasClass = 0;

            for (i = 0; i < data.numInstances(); i++) {
              if (!data.instance(i).classIsMissing()) hasClass++;
            }

            // not enough instances with class labels?
            if (hasClass < getMinimumNumberInstances()) {
              m_FailReason =
                  new WekaException(
                      createMessage(
                          "Not enough training instances with class labels (required: "
                              + getMinimumNumberInstances()
                              + ", provided: "
                              + hasClass
                              + ")!"));
              return false;
            }
          }
        }
      }
    }

    // missing values
    if (m_MissingValuesTest) {
      if (!handles(Capability.MISSING_VALUES)) {
        missing = false;
        for (i = 0; i < data.numInstances(); i++) {
          inst = data.instance(i);

          if (inst instanceof SparseInstance) {
            // iterate over the stored values only; inst.index(m) is used as the attribute index
            // NOTE(review): the early break assumes the indices are in ascending order - confirm
            for (m = 0; m < inst.numValues(); m++) {
              n = inst.index(m);

              // out of scope?
              if (n < fromIndex) continue;
              if (n > toIndex) break;

              // skip class
              if (n == inst.classIndex()) continue;

              if (inst.isMissing(n)) {
                missing = true;
                break;
              }
            }
          } else {
            for (n = fromIndex; n <= toIndex; n++) {
              // skip class
              if (n == inst.classIndex()) continue;

              if (inst.isMissing(n)) {
                missing = true;
                break;
              }
            }
          }

          // the inner loops only leave their own loop, so fail here for the whole test
          if (missing) {
            m_FailReason =
                new NoSupportForMissingValuesException(
                    createMessage("Cannot handle missing values!"));
            return false;
          }
        }
      }
    }

    // instances
    if (m_MinimumNumberInstancesTest) {
      if (data.numInstances() < getMinimumNumberInstances()) {
        m_FailReason =
            new WekaException(
                createMessage(
                    "Not enough training instances (required: "
                        + getMinimumNumberInstances()
                        + ", provided: "
                        + data.numInstances()
                        + ")!"));
        return false;
      }
    }

    // Multi-Instance? -> check structure (regardless of attribute range!)
    if (handles(Capability.ONLY_MULTIINSTANCE)) {
      // number of attributes?
      if (data.numAttributes() != 3) {
        m_FailReason =
            new WekaException(
                createMessage("Incorrect Multi-Instance format, must be 'bag-id, bag, class'!"));
        return false;
      }

      // type of attributes and position of class?
      if (!data.attribute(0).isNominal()
          || !data.attribute(1).isRelationValued()
          || (data.classIndex() != data.numAttributes() - 1)) {
        m_FailReason =
            new WekaException(
                createMessage(
                    "Incorrect Multi-Instance format, must be 'NOMINAL att, RELATIONAL att, CLASS att'!"));
        return false;
      }

      // check data immediately
      if (getOwner() instanceof MultiInstanceCapabilitiesHandler) {
        MultiInstanceCapabilitiesHandler handler = (MultiInstanceCapabilitiesHandler) getOwner();
        cap = handler.getMultiInstanceCapabilities();
        boolean result;
        // with data: test relation(0) of the bag attribute; without data: only its structure
        if (data.numInstances() > 0) result = cap.test(data.attribute(1).relation(0));
        else result = cap.test(data.attribute(1).relation());

        if (!result) {
          // propagate the reason found by the multi-instance capabilities
          m_FailReason = cap.m_FailReason;
          return false;
        }
      }
    }

    // passed all tests!
    return true;
  }
Exemple #26
0
 /**
  * Checks whether the handler, given its capabilities, can process the given data. All attributes
  * of the data are tested, i.e., the range from the first to the last attribute index.
  * Classifiers implementing the <code>MultiInstanceCapabilitiesHandler</code> interface are
  * checked automatically for their multi-instance Capabilities (if no bags, then only the
  * bag-structure, otherwise only the first bag).
  *
  * @param data the data to test
  * @return true if all the tests succeeded
  * @see #test(Instances, int, int)
  */
 public boolean test(Instances data) {
   final int lastIndex = data.numAttributes() - 1;
   return test(data, 0, lastIndex);
 }
Exemple #27
0
  /**
   * Derives a Capabilities object from the given data: the class capability matching the class
   * attribute (or NO_CLASS), one capability per attribute type that occurs, and the capabilities
   * for missing class/attribute values if such values are present. The minimum number of
   * instances is not set. If requested and the data has the multi-instance layout (nominal
   * bag-id, relational bag, class as last of three attributes), the result is replaced by the
   * multi-instance capabilities.
   *
   * @param data the data to base the capabilities on
   * @param multi if true then the structure is checked, too
   * @return a data-specific capabilities object
   * @throws Exception in case an error occurs, e.g., an unknown attribute type
   */
  public static Capabilities forInstances(Instances data, boolean multi) throws Exception {
    Capabilities required = new Capabilities(null);
    final int classIdx = data.classIndex();

    // class
    if (classIdx == -1) {
      required.enable(Capability.NO_CLASS);
    } else {
      Attribute classAtt = data.classAttribute();
      switch (classAtt.type()) {
        case Attribute.NOMINAL:
          // the number of labels decides between unary, binary and general nominal
          switch (classAtt.numValues()) {
            case 1:
              required.enable(Capability.UNARY_CLASS);
              break;
            case 2:
              required.enable(Capability.BINARY_CLASS);
              break;
            default:
              required.enable(Capability.NOMINAL_CLASS);
              break;
          }
          break;

        case Attribute.NUMERIC:
          required.enable(Capability.NUMERIC_CLASS);
          break;

        case Attribute.STRING:
          required.enable(Capability.STRING_CLASS);
          break;

        case Attribute.DATE:
          required.enable(Capability.DATE_CLASS);
          break;

        case Attribute.RELATIONAL:
          required.enable(Capability.RELATIONAL_CLASS);
          break;

        default:
          throw new UnsupportedAttributeTypeException(
              "Unknown class attribute type '" + classAtt + "'!");
      }

      // missing class values: one occurrence is sufficient
      for (int row = 0; row < data.numInstances(); row++) {
        if (data.instance(row).classIsMissing()) {
          required.enable(Capability.MISSING_CLASS_VALUES);
          break;
        }
      }
    }

    // attributes (the class attribute is skipped)
    for (int col = 0; col < data.numAttributes(); col++) {
      if (col == classIdx) {
        continue;
      }

      Attribute current = data.attribute(col);
      switch (current.type()) {
        case Attribute.NOMINAL:
          // unary is enabled for every nominal attribute, the others depend on the label count
          required.enable(Capability.UNARY_ATTRIBUTES);
          if (current.numValues() == 2) {
            required.enable(Capability.BINARY_ATTRIBUTES);
          } else if (current.numValues() > 2) {
            required.enable(Capability.NOMINAL_ATTRIBUTES);
          }
          break;

        case Attribute.NUMERIC:
          required.enable(Capability.NUMERIC_ATTRIBUTES);
          break;

        case Attribute.DATE:
          required.enable(Capability.DATE_ATTRIBUTES);
          break;

        case Attribute.STRING:
          required.enable(Capability.STRING_ATTRIBUTES);
          break;

        case Attribute.RELATIONAL:
          required.enable(Capability.RELATIONAL_ATTRIBUTES);
          break;

        default:
          throw new UnsupportedAttributeTypeException(
              "Unknown attribute type '" + current.type() + "'!");
      }
    }

    // missing values: scan instance by instance and stop at the first hit
    boolean foundMissing = false;
    for (int row = 0; row < data.numInstances() && !foundMissing; row++) {
      Instance inst = data.instance(row);

      if (inst instanceof SparseInstance) {
        // only the explicitly stored values need to be looked at
        for (int slot = 0; slot < inst.numValues(); slot++) {
          int attIndex = inst.index(slot);
          if (attIndex != inst.classIndex() && inst.isMissing(attIndex)) {
            foundMissing = true;
            break;
          }
        }
      } else {
        for (int attIndex = 0; attIndex < data.numAttributes(); attIndex++) {
          if (attIndex != inst.classIndex() && inst.isMissing(attIndex)) {
            foundMissing = true;
            break;
          }
        }
      }
    }
    if (foundMissing) {
      required.enable(Capability.MISSING_VALUES);
    }

    // multi-instance data?
    boolean multiInstanceLayout =
        multi
            && (data.numAttributes() == 3)
            && data.attribute(0).isNominal() // bag-id
            && data.attribute(1).isRelationValued() // bag
            && (data.classIndex() == data.numAttributes() - 1);
    if (multiInstanceLayout) {
      Capabilities multiInstance = new Capabilities(null);
      multiInstance.or(required.getClassCapabilities());
      multiInstance.enable(Capability.NOMINAL_ATTRIBUTES);
      multiInstance.enable(Capability.RELATIONAL_ATTRIBUTES);
      multiInstance.enable(Capability.ONLY_MULTIINSTANCE);
      required.assign(multiInstance);
    }

    return required;
  }
  /**
   * Fits a linear model on a single attribute: for every non-class attribute the slope and
   * intercept of the weighted least-squares line are computed, and the attribute with the
   * smallest sum of squared errors is kept. Instances with a missing attribute or class value
   * are ignored in the sums. If no attribute is useful, the model predicts the class mean.
   *
   * @param insts the training data
   * @throws Exception if a non-class attribute is not numeric
   */
  public void buildClassifier(Instances insts) throws Exception {

    final int classIdx = insts.classIndex();

    // mean of the target value
    final double targetMean = insts.meanOrMode(classIdx);

    // bookkeeping for the best attribute seen so far
    int bestIndex = -1;
    double bestError = Double.MAX_VALUE;
    double bestSlope = Double.NaN;
    double bestIntercept = Double.NaN;
    m_attribute = null;

    for (int att = 0; att < insts.numAttributes(); att++) {
      if (att == classIdx) {
        continue;
      }
      if (!insts.attribute(att).isNumeric()) {
        throw new Exception("UnivariateLinearRegression: Only numeric attributes!");
      }
      m_attribute = insts.attribute(att);

      // accumulate the weighted sums needed for slope and intercept
      double attMean = insts.meanOrMode(att);
      double sumXX = 0;
      double sumYY = 0;
      m_slope = 0;
      for (int row = 0; row < insts.numInstances(); row++) {
        Instance current = insts.instance(row);
        if (current.isMissing(att) || current.classIsMissing()) {
          continue;
        }
        double xDiff = current.value(att) - attMean;
        double yDiff = current.classValue() - targetMean;
        double weightedXDiff = current.weight() * xDiff;
        double weightedYDiff = current.weight() * yDiff;
        m_slope += weightedXDiff * yDiff;
        sumXX += weightedXDiff * xDiff;
        sumYY += weightedYDiff * yDiff;
      }

      // an attribute without any spread cannot explain the target
      if (sumXX == 0) {
        continue;
      }
      double sumXY = m_slope;
      m_slope /= sumXX;
      m_intercept = targetMean - m_slope * attMean;

      // sum of squared errors of the fitted line
      double error = sumYY - m_slope * sumXY;

      // remember the attribute if it beats the best one so far
      if (error < bestError) {
        bestError = error;
        bestIndex = att;
        bestSlope = m_slope;
        bestIntercept = m_intercept;
      }
    }

    // store the final parameters
    if (bestIndex == -1) {
      System.err.println("----- no useful attribute found");
      m_attribute = null;
      m_slope = 0;
      m_intercept = targetMean;
    } else {
      m_attribute = insts.attribute(bestIndex);
      m_slope = bestSlope;
      m_intercept = bestIntercept;
    }
  }
Exemple #29
0
  /**
   * Produces an attribute ranking with the configured attribute evaluator (or, if that is not an
   * attribute evaluator, with a ranking-generating GreedyStepwise search) and then evaluates
   * growing prefixes of the ranking with the supplied subset evaluator, keeping the prefix with
   * the highest merit.
   *
   * @param ASEval the subset evaluator to guide the search
   * @param data the training instances.
   * @return an array (not necessarily ordered) of selected attribute indexes
   * @throws Exception if the search can't be completed
   */
  public int[] search(ASEvaluation ASEval, Instances data) throws Exception {

    if (!(ASEval instanceof SubsetEvaluator)) {
      throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator!");
    }

    m_SubsetEval = ASEval;
    m_Instances = data;
    m_numAttribs = m_Instances.numAttributes();

    // unsupervised evaluators work without a class attribute
    boolean unsupervised =
        m_ASEval instanceof UnsupervisedAttributeEvaluator
            || m_ASEval instanceof UnsupervisedSubsetEvaluator;
    m_hasClass = !unsupervised;
    if (!unsupervised) {
      m_classIndex = m_Instances.classIndex();
    }

    if (m_ASEval instanceof AttributeEvaluator) {
      // generate the attribute ranking first
      Ranker ranker = new Ranker();
      m_ASEval.buildEvaluator(m_Instances);
      if (m_ASEval instanceof AttributeTransformer) {
        // continue on the transformed data and rebuild the subset evaluator for it
        AttributeTransformer transformer = (AttributeTransformer) m_ASEval;
        m_Instances = transformer.transformedData(m_Instances);
        ((ASEvaluation) m_SubsetEval).buildEvaluator(m_Instances);
      }
      m_Ranking = ranker.search(m_ASEval, m_Instances);
    } else {
      // a subset evaluator: let a greedy stepwise search produce the ranking
      GreedyStepwise stepwise = new GreedyStepwise();
      stepwise.setGenerateRanking(true);
      m_ASEval.buildEvaluator(m_Instances);
      stepwise.search(m_ASEval, m_Instances);
      double[][] ranked = stepwise.rankedAttributes();
      m_Ranking = new int[ranked.length];
      for (int r = 0; r < ranked.length; r++) {
        // the first column is used as the attribute index
        m_Ranking[r] = (int) ranked[r][0];
      }
    }

    // now evaluate the attribute ranking: each candidate is the ranking's prefix [0..last]
    double bestMerit = -Double.MAX_VALUE;
    BitSet bestSubset = null;
    for (int last = m_startPoint; last < m_Ranking.length; last += m_add) {
      BitSet candidate = new BitSet(m_numAttribs);
      for (int r = 0; r <= last; r++) {
        candidate.set(m_Ranking[r]);
      }
      SubsetEvaluator subsetEvaluator = (SubsetEvaluator) m_SubsetEval;
      double merit = subsetEvaluator.evaluateSubset(candidate);

      if (merit > bestMerit) {
        bestMerit = merit;
        bestSubset = candidate;
      }
    }
    m_bestMerit = bestMerit;
    return attributeList(bestSubset);
  }
Exemple #30
0
  /**
   * Grows one rule on the given data: antecedents are added greedily, each time choosing the
   * not-yet-used attribute whose antecedent has the largest positive information gain, and the
   * growing data is reduced to the instances covered by the chosen antecedent. Growing stops when
   * no data or unused attribute is left, the accuracy rate reaches 1, no antecedent with positive
   * gain exists, or the best antecedent's accuracy falls below the minimum.
   *
   * @param data the growing data used to build the rule
   * @throws Exception if the consequent is not set yet
   */
  public void grow(Instances data) throws Exception {
    if (m_Consequent == -1) {
      throw new Exception(" Consequent not set yet.");
    }

    Instances remaining = data;
    double totalWeight = remaining.sumOfWeights();
    if (!Utils.gr(totalWeight, 0.0)) {
      return;
    }

    /* Default accuracy rate of the growing data */
    double defAccu = computeDefAccu(remaining);
    double defAcRt = (defAccu + 1.0) / (totalWeight + 1.0);

    /* Which attributes have already been used (a new array starts out all false) */
    boolean[] used = new boolean[remaining.numAttributes()];
    int numUnused = used.length;

    // Antecedents that exist already block their attribute, unless it is numeric
    for (int a = 0; a < m_Antds.size(); a++) {
      Antd existing = (Antd) m_Antds.elementAt(a);
      if (!existing.getAttr().isNumeric()) {
        used[existing.getAttr().index()] = true;
        numUnused--;
      }
    }

    while (Utils.gr(remaining.numInstances(), 0.0) && (numUnused > 0) && Utils.sm(defAcRt, 1.0)) {

      // We require that infoGain be positive
      double maxInfoGain = 0.0;

      Antd bestAntd = null;
      Instances bestCover = null;
      Enumeration attributes = remaining.enumerateAttributes();

      /* Try one condition for every attribute not used yet */
      while (attributes.hasMoreElements()) {
        AttributeWeka att = (AttributeWeka) (attributes.nextElement());

        if (m_Debug) {
          System.err.println("\nOne condition: size = " + remaining.sumOfWeights());
        }

        Antd antd = att.isNumeric() ? new NumericAntd(att) : new NominalAntd(att);

        if (used[att.index()]) {
          continue;
        }

        /* The best information gain for this attribute is stored in the antecedent;
        the call returns the data covered by the antecedent */
        Instances coveredData = computeInfoGain(remaining, defAcRt, antd);
        if (coveredData == null) {
          continue;
        }

        double infoGain = antd.getMaxInfoGain();
        if (m_Debug) {
          System.err.println(
              "Test of \'"
                  + antd.toString()
                  + "\': infoGain = "
                  + infoGain
                  + " | Accuracy = "
                  + antd.getAccuRate()
                  + "="
                  + antd.getAccu()
                  + "/"
                  + antd.getCover()
                  + " def. accuracy: "
                  + defAcRt);
        }

        if (infoGain > maxInfoGain) {
          bestAntd = antd;
          bestCover = coveredData;
          maxInfoGain = infoGain;
        }
      }

      if (bestAntd == null) {
        break; // Cannot find antds
      }
      if (Utils.sm(bestAntd.getAccu(), m_MinNo)) {
        break; // Too low coverage
      }

      // Numeric attributes can be used more than once
      if (!bestAntd.getAttr().isNumeric()) {
        used[bestAntd.getAttr().index()] = true;
        numUnused--;
      }

      m_Antds.addElement(bestAntd);

      remaining = bestCover; // Grow data size is shrinking
      defAcRt = bestAntd.getAccuRate();
    }
  }