Example #1
0
  /** Computes average class values for each attribute and value */
  private void computeAverageClassValues() {

    double totalCounts, sum;
    Instance instance;
    double[] counts;

    double[][] avgClassValues = new double[getInputFormat().numAttributes()][0];
    m_Indices = new int[getInputFormat().numAttributes()][0];
    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if (att.isNominal()) {
        avgClassValues[j] = new double[att.numValues()];
        counts = new double[att.numValues()];
        for (int i = 0; i < getInputFormat().numInstances(); i++) {
          instance = getInputFormat().instance(i);
          if (!instance.classIsMissing() && (!instance.isMissing(j))) {
            counts[(int) instance.value(j)] += instance.weight();
            avgClassValues[j][(int) instance.value(j)] += instance.weight() * instance.classValue();
          }
        }
        sum = Utils.sum(avgClassValues[j]);
        totalCounts = Utils.sum(counts);
        if (Utils.gr(totalCounts, 0)) {
          for (int k = 0; k < att.numValues(); k++) {
            if (Utils.gr(counts[k], 0)) {
              avgClassValues[j][k] /= counts[k];
            } else {
              avgClassValues[j][k] = sum / totalCounts;
            }
          }
        }
        m_Indices[j] = Utils.sort(avgClassValues[j]);
      }
    }
  }
  /**
   * Set the output format. Takes the current average class values and m_InputFormat and calls
   * setOutputFormat(Instances) appropriately.
   */
  private void setOutputFormat() {

    Instances newData;
    FastVector newAtts, newVals;

    // Compute new attributes

    newAtts = new FastVector(getInputFormat().numAttributes());
    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if (!m_AttIndices.isInRange(j) || !att.isString()) {

        // We don't have to copy the attribute because the
        // attribute index remains unchanged.
        newAtts.addElement(att);
      } else {

        // Compute list of attribute values
        newVals = new FastVector(att.numValues());
        for (int i = 0; i < att.numValues(); i++) {
          newVals.addElement(att.value(i));
        }
        newAtts.addElement(new Attribute(att.name(), newVals));
      }
    }

    // Construct new header
    newData = new Instances(getInputFormat().relationName(), newAtts, 0);
    newData.setClassIndex(getInputFormat().classIndex());
    setOutputFormat(newData);
  }
Example #3
0
  /**
   * Adds this tree recursively to the buffer.
   *
   * @param id the unqiue id for the method
   * @param buffer the buffer to add the source code to
   * @return the last ID being used
   * @throws Exception if something goes wrong
   */
  protected int toSource(int id, StringBuffer buffer) throws Exception {
    int result;
    int i;
    int newID;
    StringBuffer[] subBuffers;

    buffer.append("\n");
    buffer.append("  protected static double node" + id + "(Object[] i) {\n");

    // leaf?
    if (m_Attribute == null) {
      result = id;
      if (Double.isNaN(m_ClassValue)) buffer.append("    return Double.NaN;");
      else buffer.append("    return " + m_ClassValue + ";");
      if (m_ClassAttribute != null)
        buffer.append(" // " + m_ClassAttribute.value((int) m_ClassValue));
      buffer.append("\n");
      buffer.append("  }\n");
    } else {
      buffer.append("    // " + m_Attribute.name() + "\n");

      // subtree calls
      subBuffers = new StringBuffer[m_Attribute.numValues()];
      newID = id;
      for (i = 0; i < m_Attribute.numValues(); i++) {
        newID++;

        buffer.append("    ");
        if (i > 0) buffer.append("else ");
        buffer.append(
            "if (((String) i["
                + m_Attribute.index()
                + "]).equals(\""
                + m_Attribute.value(i)
                + "\"))\n");
        buffer.append("      return node" + newID + "(i);\n");

        subBuffers[i] = new StringBuffer();
        newID = m_Successors[i].toSource(newID, subBuffers[i]);
      }
      buffer.append("    else\n");
      buffer.append(
          "      throw new IllegalArgumentException(\"Value '\" + i["
              + m_Attribute.index()
              + "] + \"' is not allowed!\");\n");
      buffer.append("  }\n");

      // output subtree code
      for (i = 0; i < m_Attribute.numValues(); i++) {
        buffer.append(subBuffers[i].toString());
      }
      subBuffers = null;

      result = newID;
    }

    return result;
  }
Example #4
0
  /**
   * Convert a single instance over if the class is nominal. The converted instance is added to the
   * end of the output queue.
   *
   * @param instance the instance to convert
   */
  private void convertInstanceNominal(Instance instance) {

    if (!m_needToTransform) {
      push(instance);
      return;
    }

    double[] vals = new double[outputFormatPeek().numAttributes()];
    int attSoFar = 0;

    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if ((!att.isNominal()) || (j == getInputFormat().classIndex())) {
        vals[attSoFar] = instance.value(j);
        attSoFar++;
      } else {
        if ((att.numValues() <= 2) && (!m_TransformAll)) {
          vals[attSoFar] = instance.value(j);
          attSoFar++;
        } else {
          if (instance.isMissing(j)) {
            for (int k = 0; k < att.numValues(); k++) {
              vals[attSoFar + k] = instance.value(j);
            }
          } else {
            for (int k = 0; k < att.numValues(); k++) {
              if (k == (int) instance.value(j)) {
                vals[attSoFar + k] = 1;
              } else {
                vals[attSoFar + k] = 0;
              }
            }
          }
          attSoFar += att.numValues();
        }
      }
    }
    Instance inst = null;
    if (instance instanceof SparseInstance) {
      inst = new SparseInstance(instance.weight(), vals);
    } else {
      inst = new DenseInstance(instance.weight(), vals);
    }
    inst.setDataset(getOutputFormat());
    copyValues(inst, false, instance.dataset(), getOutputFormat());
    inst.setDataset(getOutputFormat());
    push(inst);
  }
  /**
   * Sets up the structure for the plot instances. Sets m_PlotInstances to null if instances are not
   * saved for visualization.
   *
   * @see #getSaveForVisualization()
   */
  protected void determineFormat() {
    FastVector hv;
    Attribute predictedClass;
    Attribute classAt;
    FastVector attVals;
    int i;

    if (!m_SaveForVisualization) {
      m_PlotInstances = null;
      return;
    }

    hv = new FastVector();

    classAt = m_Instances.attribute(m_ClassIndex);
    if (classAt.isNominal()) {
      attVals = new FastVector();
      for (i = 0; i < classAt.numValues(); i++) attVals.addElement(classAt.value(i));
      predictedClass = new Attribute("predicted" + classAt.name(), attVals);
    } else {
      predictedClass = new Attribute("predicted" + classAt.name());
    }

    for (i = 0; i < m_Instances.numAttributes(); i++) {
      if (i == m_Instances.classIndex()) hv.addElement(predictedClass);
      hv.addElement(m_Instances.attribute(i).copy());
    }

    m_PlotInstances =
        new Instances(m_Instances.relationName() + "_predicted", hv, m_Instances.numInstances());
    m_PlotInstances.setClassIndex(m_ClassIndex + 1);
  }
Example #6
0
  /**
   * Splits a dataset according to the values of a nominal attribute.
   *
   * @param data the data which is to be split
   * @param att the attribute to be used for splitting
   * @return the sets of instances produced by the split
   */
  private Instances[] splitData(Instances data, Attribute att) {

    Instances[] splitData = new Instances[att.numValues()];
    for (int j = 0; j < att.numValues(); j++) {
      splitData[j] = new Instances(data, data.numInstances());
    }
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
      Instance inst = (Instance) instEnum.nextElement();
      splitData[(int) inst.value(att)].add(inst);
    }
    for (int i = 0; i < splitData.length; i++) {
      splitData[i].compactify();
    }
    return splitData;
  }
  /**
   * Cálculo da precisão de Laplace
   *
   * @return Precisão de Laplace
   */
  public double getLaplace() {
    double temp = matrizContigencia.getB();
    int numClasses = classe.numValues();
    if (temp != 0) laplace = (matrizContigencia.getH_B() + 1) / (temp + numClasses);
    else laplace = 0;

    return laplace;
  }
Example #8
0
  /**
   * Determines the output format based on the input format and returns this.
   *
   * @param inputFormat the input format to base the output format on
   * @return the output format
   * @throws Exception in case the determination goes wrong
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    Instances result;
    Attribute att;
    Attribute attSorted;
    FastVector atts;
    FastVector values;
    Vector<String> sorted;
    int i;
    int n;

    m_AttributeIndices.setUpper(inputFormat.numAttributes() - 1);

    // determine sorted indices
    atts = new FastVector();
    m_NewOrder = new int[inputFormat.numAttributes()][];
    for (i = 0; i < inputFormat.numAttributes(); i++) {
      att = inputFormat.attribute(i);
      if (!att.isNominal() || !m_AttributeIndices.isInRange(i)) {
        m_NewOrder[i] = new int[0];
        atts.addElement(inputFormat.attribute(i).copy());
        continue;
      }

      // sort labels
      sorted = new Vector<String>();
      for (n = 0; n < att.numValues(); n++) sorted.add(att.value(n));
      Collections.sort(sorted, m_Comparator);

      // determine new indices
      m_NewOrder[i] = new int[att.numValues()];
      values = new FastVector();
      for (n = 0; n < att.numValues(); n++) {
        m_NewOrder[i][n] = sorted.indexOf(att.value(n));
        values.addElement(sorted.get(n));
      }
      attSorted = new Attribute(att.name(), values);
      attSorted.setWeight(att.weight());
      atts.addElement(attSorted);
    }

    // generate new header
    result = new Instances(inputFormat.relationName(), atts, 0);
    result.setClassIndex(inputFormat.classIndex());

    return result;
  }
Example #9
0
  /**
   * Returns a description of the classifier.
   *
   * @return a description of the classifier as a string.
   */
  @Override
  public String toString() {

    if (m_Instances == null) {
      return "Naive Bayes (simple): No model built yet.";
    }
    try {
      StringBuffer text = new StringBuffer("Naive Bayes (simple)");
      int attIndex;

      for (int i = 0; i < m_Instances.numClasses(); i++) {
        text.append(
            "\n\nClass "
                + m_Instances.classAttribute().value(i)
                + ": P(C) = "
                + Utils.doubleToString(m_Priors[i], 10, 8)
                + "\n\n");
        Enumeration<Attribute> enumAtts = m_Instances.enumerateAttributes();
        attIndex = 0;
        while (enumAtts.hasMoreElements()) {
          Attribute attribute = enumAtts.nextElement();
          text.append("Attribute " + attribute.name() + "\n");
          if (attribute.isNominal()) {
            for (int j = 0; j < attribute.numValues(); j++) {
              text.append(attribute.value(j) + "\t");
            }
            text.append("\n");
            for (int j = 0; j < attribute.numValues(); j++) {
              text.append(Utils.doubleToString(m_Counts[i][attIndex][j], 10, 8) + "\t");
            }
          } else {
            text.append("Mean: " + Utils.doubleToString(m_Means[i][attIndex], 10, 8) + "\t");
            text.append("Standard Deviation: " + Utils.doubleToString(m_Devs[i][attIndex], 10, 8));
          }
          text.append("\n\n");
          attIndex++;
        }
      }

      return text.toString();
    } catch (Exception e) {
      return "Can't print Naive Bayes classifier!";
    }
  }
Example #10
0
 /**
  * Compute the number of all possible conditions that could appear in a rule of a given data. For
  * nominal attributes, it's the number of values that could appear; for numeric attributes, it's
  * the number of values * 2, i.e. <= and >= are counted as different possible conditions.
  *
  * @param data the given data
  * @return number of all conditions of the data
  */
 public static double numAllConditions(Instances data) {
   double total = 0;
   Enumeration attEnum = data.enumerateAttributes();
   while (attEnum.hasMoreElements()) {
     Attribute att = (Attribute) attEnum.nextElement();
     if (att.isNominal()) total += (double) att.numValues();
     else total += 2.0 * (double) data.numDistinctValues(att);
   }
   return total;
 }
Example #11
0
 /**
  * Gives a string representation of the test in Prolog notation, starting from the comparison
  * symbol.
  *
  * @return a string representing the test in Prolog notation
  */
 private String testPrologComparisonString() {
   Attribute att = m_Dataset.attribute(m_AttIndex);
   if (att.isNumeric()) {
     return ((m_Not ? ">= " : "< ") + Utils.doubleToString(m_Split, 3));
   } else {
     if (att.numValues() != 2) return ((m_Not ? "!= " : "= ") + att.value((int) m_Split));
     else
       return ("= " + (m_Not ? att.value((int) m_Split == 0 ? 1 : 0) : att.value((int) m_Split)));
   }
 }
Example #12
0
  /**
   * Method for building an Id3 tree.
   *
   * @param data the training data
   * @exception Exception if decision tree can't be built successfully
   */
  private void makeTree(Instances data) throws Exception {

    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
      m_Attribute = null;
      m_ClassValue = Utils.missingValue();
      m_Distribution = new double[data.numClasses()];
      return;
    }

    // Compute attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
      Attribute att = (Attribute) attEnum.nextElement();
      infoGains[att.index()] = computeInfoGain(data, att);
    }
    m_Attribute = data.attribute(Utils.maxIndex(infoGains));

    // Make leaf if information gain is zero.
    // Otherwise create successors.
    if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
      m_Attribute = null;
      m_Distribution = new double[data.numClasses()];
      Enumeration instEnum = data.enumerateInstances();
      while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        m_Distribution[(int) inst.classValue()]++;
      }
      Utils.normalize(m_Distribution);
      m_ClassValue = Utils.maxIndex(m_Distribution);
      m_ClassAttribute = data.classAttribute();
    } else {
      Instances[] splitData = splitData(data, m_Attribute);
      m_Successors = new Id3[m_Attribute.numValues()];
      for (int j = 0; j < m_Attribute.numValues(); j++) {
        m_Successors[j] = new Id3();
        m_Successors[j].makeTree(splitData[j]);
      }
    }
  }
Example #13
0
  /**
   * Set the output format. Swapss the desired nominal attribute values in the header and calls
   * setOutputFormat(Instances) appropriately.
   */
  private void setOutputFormat() {

    Instances newData;
    ArrayList<Attribute> newAtts;
    ArrayList<String> newVals;

    // Compute new attributes

    newAtts = new ArrayList<Attribute>(getInputFormat().numAttributes());
    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if (j != m_AttIndex.getIndex()) {
        newAtts.add((Attribute) att.copy());
      } else {

        // Compute list of attribute values

        newVals = new ArrayList<String>(att.numValues());
        for (int i = 0; i < att.numValues(); i++) {
          if (i == m_FirstIndex.getIndex()) {
            newVals.add(att.value(m_SecondIndex.getIndex()));
          } else if (i == m_SecondIndex.getIndex()) {
            newVals.add(att.value(m_FirstIndex.getIndex()));
          } else {
            newVals.add(att.value(i));
          }
        }
        Attribute newAtt = new Attribute(att.name(), newVals);
        newAtt.setWeight(att.weight());
        newAtts.add(newAtt);
      }
    }

    // Construct new header

    newData = new Instances(getInputFormat().relationName(), newAtts, 0);
    newData.setClassIndex(getInputFormat().classIndex());
    setOutputFormat(newData);
  }
Example #14
0
  /**
   * Computes information gain for an attribute.
   *
   * @param data the data for which info gain is to be computed
   * @param att the attribute
   * @return the information gain for the given attribute and data
   * @throws Exception if computation fails
   */
  private double computeInfoGain(Instances data, Attribute att) throws Exception {

    double infoGain = computeEntropy(data);
    Instances[] splitData = splitData(data, att);
    for (int j = 0; j < att.numValues(); j++) {
      if (splitData[j].numInstances() > 0) {
        infoGain -=
            ((double) splitData[j].numInstances() / (double) data.numInstances())
                * computeEntropy(splitData[j]);
      }
    }
    return infoGain;
  }
Example #15
0
  /**
   * Outputs a tree at a certain level.
   *
   * @param level the level at which the tree is to be printed
   * @return the tree as string at the given level
   */
  private String toString(int level) {

    StringBuffer text = new StringBuffer();

    if (m_Attribute == null) {
      if (Utils.isMissingValue(m_ClassValue)) {
        text.append(": null");
      } else {
        text.append(": " + m_ClassAttribute.value((int) m_ClassValue));
      }
    } else {
      for (int j = 0; j < m_Attribute.numValues(); j++) {
        text.append("\n");
        for (int i = 0; i < level; i++) {
          text.append("|  ");
        }
        text.append(m_Attribute.name() + " = " + m_Attribute.value(j));
        text.append(m_Successors[j].toString(level + 1));
      }
    }
    return text.toString();
  }
Example #16
0
  /**
   * use for training data
   *
   * @param instancesWithMeta
   * @param labelInstances
   * @return
   * @throws Exception
   */
  public static Instances addNominalLabelsForClassificationToTrainingData(
      Instances instances, AttributeFilterMeta instancesWithMeta, Instances labelInstances)
      throws Exception {

    Instances finalCleaned = Instances.mergeInstances(instances, labelInstances);
    finalCleaned.setClassIndex(finalCleaned.numAttributes() - 1);

    Attribute classAt = finalCleaned.classAttribute();
    int numOfAttValues = classAt.numValues();
    String attValues = "";
    for (int nai = 0; nai < numOfAttValues; nai++) {
      if (nai != 0) {
        attValues += ",";
      }
      attValues += classAt.value(nai);
    }
    instancesWithMeta.setClassAtrributeValues(attValues);

    instancesWithMeta.setInstances(finalCleaned);

    return finalCleaned;
  }
Example #17
0
  public void setDatasetKeyFromDialog() {

    ListSelectorDialog jd = new ListSelectorDialog(null, m_DatasetKeyList);

    // Open the dialog
    int result = jd.showDialog();
    // =============== BEGIN EDIT melville ===============
    // Check if learning curves should be generated
    boolean noise = false;
    boolean learning = false;
    boolean fraction = false;
    int learning_key = -1;

    // If accepted, update the ttester
    if (result == ListSelectorDialog.APPROVE_OPTION) {
      int[] selected = m_DatasetKeyList.getSelectedIndices();
      String selectedList = "";
      Object[] selected_string = m_DatasetKeyList.getSelectedValues();
      for (int i = 0; i < selected.length; i++) {
        if (((String) selected_string[i]).toLowerCase().equals("key_noise_levels")) {
          learning_key = i;
          learning = true;
          // fraction = true;
          noise = true;
        } else if (((String) selected_string[i]).toLowerCase().equals("key_fraction_instances")) {
          learning_key = i;
          learning = true;
          fraction = true;
        } else if (((String) selected_string[i]).toLowerCase().equals("key_total_instances")
            && !learning) {
          learning = true;
          learning_key = i;
        } else selectedList += "," + (selected[i] + 1);
      }

      m_TTester.setLearningCurve(learning);
      m_TTester.setFraction(fraction);
      if (learning) { // get points on the learning curve
        selectedList += "," + (selected[learning_key] + 1);
        Attribute attr;
        if (noise) { // override fraction
          attr = m_Instances.attribute("Key_Noise_levels");
        } else if (fraction) {
          attr = m_Instances.attribute("Key_Fraction_instances");
        } else {
          attr = m_Instances.attribute("Key_Total_instances");
        }
        double[] pts = new double[attr.numValues()];
        for (int k = 0; k < attr.numValues(); k++) {
          pts[k] = Double.parseDouble(attr.value(k));
        }
        Arrays.sort(pts);
        m_TTester.setPoints(pts);
      }
      // ================ END EDIT melville ================

      Range generatorRange = new Range();
      if (selectedList.length() != 0) {
        try {
          generatorRange.setRanges(selectedList);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println(ex.getMessage());
        }
      }
      m_TTester.setDatasetKeyColumns(generatorRange);
      setTTester();
    }
  }
Example #18
0
  /**
   * The procedure implementing the SMOTE algorithm. The output instances are pushed onto the output
   * queue for collection.
   *
   * @throws Exception if provided options cannot be executed on input instances
   */
  protected void doSMOTE() throws Exception {
    int minIndex = 0;
    int min = Integer.MAX_VALUE;
    if (m_DetectMinorityClass) {
      // find minority class
      int[] classCounts =
          getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts;
      for (int i = 0; i < classCounts.length; i++) {
        if (classCounts[i] != 0 && classCounts[i] < min) {
          min = classCounts[i];
          minIndex = i;
        }
      }
    } else {
      String classVal = getClassValue();
      if (classVal.equalsIgnoreCase("first")) {
        minIndex = 1;
      } else if (classVal.equalsIgnoreCase("last")) {
        minIndex = getInputFormat().numClasses();
      } else {
        minIndex = Integer.parseInt(classVal);
      }
      if (minIndex > getInputFormat().numClasses()) {
        throw new Exception("value index must be <= the number of classes");
      }
      minIndex--; // make it an index
    }

    int nearestNeighbors;
    if (min <= getNearestNeighbors()) {
      nearestNeighbors = min - 1;
    } else {
      nearestNeighbors = getNearestNeighbors();
    }
    if (nearestNeighbors < 1) throw new Exception("Cannot use 0 neighbors!");

    // compose minority class dataset
    // also push all dataset instances
    Instances sample = getInputFormat().stringFreeStructure();
    Enumeration instanceEnum = getInputFormat().enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
      Instance instance = (Instance) instanceEnum.nextElement();
      push((Instance) instance.copy());
      if ((int) instance.classValue() == minIndex) {
        sample.add(instance);
      }
    }

    // compute Value Distance Metric matrices for nominal features
    Map vdmMap = new HashMap();
    Enumeration attrEnum = getInputFormat().enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
      Attribute attr = (Attribute) attrEnum.nextElement();
      if (!attr.equals(getInputFormat().classAttribute())) {
        if (attr.isNominal() || attr.isString()) {
          double[][] vdm = new double[attr.numValues()][attr.numValues()];
          vdmMap.put(attr, vdm);
          int[] featureValueCounts = new int[attr.numValues()];
          int[][] featureValueCountsByClass =
              new int[getInputFormat().classAttribute().numValues()][attr.numValues()];
          instanceEnum = getInputFormat().enumerateInstances();
          while (instanceEnum.hasMoreElements()) {
            Instance instance = (Instance) instanceEnum.nextElement();
            int value = (int) instance.value(attr);
            int classValue = (int) instance.classValue();
            featureValueCounts[value]++;
            featureValueCountsByClass[classValue][value]++;
          }
          for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) {
            for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) {
              double sum = 0;
              for (int classValueIndex = 0;
                  classValueIndex < getInputFormat().numClasses();
                  classValueIndex++) {
                double c1i = featureValueCountsByClass[classValueIndex][valueIndex1];
                double c2i = featureValueCountsByClass[classValueIndex][valueIndex2];
                double c1 = featureValueCounts[valueIndex1];
                double c2 = featureValueCounts[valueIndex2];
                double term1 = c1i / c1;
                double term2 = c2i / c2;
                sum += Math.abs(term1 - term2);
              }
              vdm[valueIndex1][valueIndex2] = sum;
            }
          }
        }
      }
    }

    // use this random source for all required randomness
    Random rand = new Random(getRandomSeed());

    // find the set of extra indices to use if the percentage is not evenly
    // divisible by 100
    List extraIndices = new LinkedList();
    double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0);
    int extraIndicesCount = (int) (percentageRemainder * sample.numInstances());
    if (extraIndicesCount >= 1) {
      for (int i = 0; i < sample.numInstances(); i++) {
        extraIndices.add(i);
      }
    }
    Collections.shuffle(extraIndices, rand);
    extraIndices = extraIndices.subList(0, extraIndicesCount);
    Set extraIndexSet = new HashSet(extraIndices);

    // the main loop to handle computing nearest neighbors and generating SMOTE
    // examples from each instance in the original minority class data
    Instance[] nnArray = new Instance[nearestNeighbors];
    for (int i = 0; i < sample.numInstances(); i++) {
      Instance instanceI = sample.instance(i);
      // find k nearest neighbors for each instance
      List distanceToInstance = new LinkedList();
      for (int j = 0; j < sample.numInstances(); j++) {
        Instance instanceJ = sample.instance(j);
        if (i != j) {
          double distance = 0;
          attrEnum = getInputFormat().enumerateAttributes();
          while (attrEnum.hasMoreElements()) {
            Attribute attr = (Attribute) attrEnum.nextElement();
            if (!attr.equals(getInputFormat().classAttribute())) {
              double iVal = instanceI.value(attr);
              double jVal = instanceJ.value(attr);
              if (attr.isNumeric()) {
                distance += Math.pow(iVal - jVal, 2);
              } else {
                distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal];
              }
            }
          }
          distance = Math.pow(distance, .5);
          distanceToInstance.add(new Object[] {distance, instanceJ});
        }
      }

      // sort the neighbors according to distance
      Collections.sort(
          distanceToInstance,
          new Comparator() {
            public int compare(Object o1, Object o2) {
              double distance1 = (Double) ((Object[]) o1)[0];
              double distance2 = (Double) ((Object[]) o2)[0];
              return Double.compare(distance1, distance2);
            }
          });

      // populate the actual nearest neighbor instance array
      Iterator entryIterator = distanceToInstance.iterator();
      int j = 0;
      while (entryIterator.hasNext() && j < nearestNeighbors) {
        nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1];
        j++;
      }

      // create synthetic examples
      int n = (int) Math.floor(getPercentage() / 100);
      while (n > 0 || extraIndexSet.remove(i)) {
        double[] values = new double[sample.numAttributes()];
        int nn = rand.nextInt(nearestNeighbors);
        attrEnum = getInputFormat().enumerateAttributes();
        while (attrEnum.hasMoreElements()) {
          Attribute attr = (Attribute) attrEnum.nextElement();
          if (!attr.equals(getInputFormat().classAttribute())) {
            if (attr.isNumeric()) {
              double dif = nnArray[nn].value(attr) - instanceI.value(attr);
              double gap = rand.nextDouble();
              values[attr.index()] = (instanceI.value(attr) + gap * dif);
            } else if (attr.isDate()) {
              double dif = nnArray[nn].value(attr) - instanceI.value(attr);
              double gap = rand.nextDouble();
              values[attr.index()] = (long) (instanceI.value(attr) + gap * dif);
            } else {
              int[] valueCounts = new int[attr.numValues()];
              int iVal = (int) instanceI.value(attr);
              valueCounts[iVal]++;
              for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) {
                int val = (int) nnArray[nnEx].value(attr);
                valueCounts[val]++;
              }
              int maxIndex = 0;
              int max = Integer.MIN_VALUE;
              for (int index = 0; index < attr.numValues(); index++) {
                if (valueCounts[index] > max) {
                  max = valueCounts[index];
                  maxIndex = index;
                }
              }
              values[attr.index()] = maxIndex;
            }
          }
        }
        values[sample.classIndex()] = minIndex;
        Instance synthetic = new Instance(1.0, values);
        push(synthetic);
        n--;
      }
    }
  }
Example #19
0
  /** Set the output format if the class is numeric. */
  private void setOutputFormatNumeric() {

    if (m_Indices == null) {
      setOutputFormat(null);
      return;
    }
    ArrayList<Attribute> newAtts;
    int newClassIndex;
    StringBuffer attributeName;
    Instances outputFormat;
    ArrayList<String> vals;

    // Compute new attributes

    m_needToTransform = false;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      Attribute att = getInputFormat().attribute(i);
      if (att.isNominal() && (att.numValues() > 2 || m_Numeric || m_TransformAll)) {
        m_needToTransform = true;
        break;
      }
    }

    if (!m_needToTransform) {
      setOutputFormat(getInputFormat());
      return;
    }

    newClassIndex = getInputFormat().classIndex();
    newAtts = new ArrayList<Attribute>();
    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if ((!att.isNominal()) || (j == getInputFormat().classIndex())) {
        newAtts.add((Attribute) att.copy());
      } else {
        if (j < getInputFormat().classIndex()) {
          newClassIndex += att.numValues() - 2;
        }

        // Compute values for new attributes

        for (int k = 1; k < att.numValues(); k++) {
          attributeName = new StringBuffer(att.name() + "=");
          for (int l = k; l < att.numValues(); l++) {
            if (l > k) {
              attributeName.append(',');
            }
            attributeName.append(att.value(m_Indices[j][l]));
          }
          if (m_Numeric) {
            newAtts.add(new Attribute(attributeName.toString()));
          } else {
            vals = new ArrayList<String>(2);
            vals.add("f");
            vals.add("t");
            newAtts.add(new Attribute(attributeName.toString(), vals));
          }
        }
      }
    }
    outputFormat = new Instances(getInputFormat().relationName(), newAtts, 0);
    outputFormat.setClassIndex(newClassIndex);
    setOutputFormat(outputFormat);
  }
Example #20
0
  /**
   * Generates the classifier.
   *
   * @param instances set of instances serving as training data
   * @exception Exception if the classifier has not been generated successfully
   */
  @Override
  public void buildClassifier(Instances instances) throws Exception {

    int attIndex = 0;
    double sum;

    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();

    m_Instances = new Instances(instances, 0);

    // Reserve space
    m_Counts = new double[instances.numClasses()][instances.numAttributes() - 1][0];
    m_Means = new double[instances.numClasses()][instances.numAttributes() - 1];
    m_Devs = new double[instances.numClasses()][instances.numAttributes() - 1];
    m_Priors = new double[instances.numClasses()];
    Enumeration<Attribute> enu = instances.enumerateAttributes();
    while (enu.hasMoreElements()) {
      Attribute attribute = enu.nextElement();
      if (attribute.isNominal()) {
        for (int j = 0; j < instances.numClasses(); j++) {
          m_Counts[j][attIndex] = new double[attribute.numValues()];
        }
      } else {
        for (int j = 0; j < instances.numClasses(); j++) {
          m_Counts[j][attIndex] = new double[1];
        }
      }
      attIndex++;
    }

    // Compute counts and sums
    Enumeration<Instance> enumInsts = instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      Instance instance = enumInsts.nextElement();
      if (!instance.classIsMissing()) {
        Enumeration<Attribute> enumAtts = instances.enumerateAttributes();
        attIndex = 0;
        while (enumAtts.hasMoreElements()) {
          Attribute attribute = enumAtts.nextElement();
          if (!instance.isMissing(attribute)) {
            if (attribute.isNominal()) {
              m_Counts[(int) instance.classValue()][attIndex][(int) instance.value(attribute)]++;
            } else {
              m_Means[(int) instance.classValue()][attIndex] += instance.value(attribute);
              m_Counts[(int) instance.classValue()][attIndex][0]++;
            }
          }
          attIndex++;
        }
        m_Priors[(int) instance.classValue()]++;
      }
    }

    // Compute means
    Enumeration<Attribute> enumAtts = instances.enumerateAttributes();
    attIndex = 0;
    while (enumAtts.hasMoreElements()) {
      Attribute attribute = enumAtts.nextElement();
      if (attribute.isNumeric()) {
        for (int j = 0; j < instances.numClasses(); j++) {
          if (m_Counts[j][attIndex][0] < 2) {
            throw new Exception(
                "attribute "
                    + attribute.name()
                    + ": less than two values for class "
                    + instances.classAttribute().value(j));
          }
          m_Means[j][attIndex] /= m_Counts[j][attIndex][0];
        }
      }
      attIndex++;
    }

    // Compute standard deviations
    enumInsts = instances.enumerateInstances();
    while (enumInsts.hasMoreElements()) {
      Instance instance = enumInsts.nextElement();
      if (!instance.classIsMissing()) {
        enumAtts = instances.enumerateAttributes();
        attIndex = 0;
        while (enumAtts.hasMoreElements()) {
          Attribute attribute = enumAtts.nextElement();
          if (!instance.isMissing(attribute)) {
            if (attribute.isNumeric()) {
              m_Devs[(int) instance.classValue()][attIndex] +=
                  (m_Means[(int) instance.classValue()][attIndex] - instance.value(attribute))
                      * (m_Means[(int) instance.classValue()][attIndex]
                          - instance.value(attribute));
            }
          }
          attIndex++;
        }
      }
    }
    enumAtts = instances.enumerateAttributes();
    attIndex = 0;
    while (enumAtts.hasMoreElements()) {
      Attribute attribute = enumAtts.nextElement();
      if (attribute.isNumeric()) {
        for (int j = 0; j < instances.numClasses(); j++) {
          if (m_Devs[j][attIndex] <= 0) {
            throw new Exception(
                "attribute "
                    + attribute.name()
                    + ": standard deviation is 0 for class "
                    + instances.classAttribute().value(j));
          } else {
            m_Devs[j][attIndex] /= m_Counts[j][attIndex][0] - 1;
            m_Devs[j][attIndex] = Math.sqrt(m_Devs[j][attIndex]);
          }
        }
      }
      attIndex++;
    }

    // Normalize counts
    enumAtts = instances.enumerateAttributes();
    attIndex = 0;
    while (enumAtts.hasMoreElements()) {
      Attribute attribute = enumAtts.nextElement();
      if (attribute.isNominal()) {
        for (int j = 0; j < instances.numClasses(); j++) {
          sum = Utils.sum(m_Counts[j][attIndex]);
          for (int i = 0; i < attribute.numValues(); i++) {
            m_Counts[j][attIndex][i] =
                (m_Counts[j][attIndex][i] + 1) / (sum + attribute.numValues());
          }
        }
      }
      attIndex++;
    }

    // Normalize priors
    sum = Utils.sum(m_Priors);
    for (int j = 0; j < instances.numClasses(); j++) {
      m_Priors[j] = (m_Priors[j] + 1) / (sum + instances.numClasses());
    }
  }
Example #21
0
  public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log) throws Exception {
    m_log = log;
    // miningSchema.convertStringAttsToNominal();
    Instances fieldsI = miningSchema.getMiningSchemaAsInstances();

    m_fieldsMap = new int[fieldsI.numAttributes()];
    m_nominalValueMaps = new int[fieldsI.numAttributes()][];

    for (int i = 0; i < fieldsI.numAttributes(); i++) {
      String schemaAttName = fieldsI.attribute(i).name();
      boolean found = false;
      for (int j = 0; j < dataSet.numAttributes(); j++) {
        if (dataSet.attribute(j).name().equals(schemaAttName)) {
          Attribute miningSchemaAtt = fieldsI.attribute(i);
          Attribute incomingAtt = dataSet.attribute(j);
          // check type match
          if (miningSchemaAtt.type() != incomingAtt.type()) {
            throw new Exception(
                "[MappingInfo] type mismatch for field "
                    + schemaAttName
                    + ". Mining schema type "
                    + miningSchemaAtt.toString()
                    + ". Incoming type "
                    + incomingAtt.toString()
                    + ".");
          }

          // check nominal values (number, names...)
          if (miningSchemaAtt.numValues() != incomingAtt.numValues()) {
            String warningString =
                "[MappingInfo] WARNING: incoming nominal attribute "
                    + incomingAtt.name()
                    + " does not have the same "
                    + "number of values as the corresponding mining "
                    + "schema attribute.";
            if (m_log != null) {
              m_log.logMessage(warningString);
            } else {
              System.err.println(warningString);
            }
          }
          if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) {
            int[] valuesMap = new int[incomingAtt.numValues()];
            for (int k = 0; k < incomingAtt.numValues(); k++) {
              String incomingNomVal = incomingAtt.value(k);
              int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal);
              if (indexInSchema < 0) {
                String warningString =
                    "[MappingInfo] WARNING: incoming nominal attribute "
                        + incomingAtt.name()
                        + " has value "
                        + incomingNomVal
                        + " that doesn't occur in the mining schema.";
                if (m_log != null) {
                  m_log.logMessage(warningString);
                } else {
                  System.err.println(warningString);
                }
                valuesMap[k] = UNKNOWN_NOMINAL_VALUE;
              } else {
                valuesMap[k] = indexInSchema;
              }
            }
            m_nominalValueMaps[i] = valuesMap;
          }

          /*if (miningSchemaAtt.isNominal()) {
            for (int k = 0; k < miningSchemaAtt.numValues(); k++) {
              if (!miningSchemaAtt.value(k).equals(incomingAtt.value(k))) {
                throw new Exception("[PMMLUtils] value " + k + " (" +
                                    miningSchemaAtt.value(k) + ") does not match " +
                                    "incoming value (" + incomingAtt.value(k) +
                                    ") for attribute " + miningSchemaAtt.name() +
                                    ".");

              }
            }
          }*/
          found = true;
          m_fieldsMap[i] = j;
        }
      }
      if (!found) {
        throw new Exception(
            "[MappingInfo] Unable to find a match for mining schema "
                + "attribute "
                + schemaAttName
                + " in the "
                + "incoming instances!");
      }
    }

    // check class attribute (if set)
    if (fieldsI.classIndex() >= 0) {
      if (dataSet.classIndex() < 0) {
        // first see if we can find a matching class
        String className = fieldsI.classAttribute().name();
        Attribute classMatch = dataSet.attribute(className);
        if (classMatch == null) {
          throw new Exception(
              "[MappingInfo] Can't find match for target field "
                  + className
                  + "in incoming instances!");
        }
        dataSet.setClass(classMatch);
      } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) {
        throw new Exception(
            "[MappingInfo] class attribute in mining schema does not match "
                + "class attribute in incoming instances!");
      }
    }

    // Set up the textual description of the mapping
    fieldsMappingString(fieldsI, dataSet);
  }
Example #22
0
  /**
   * Writes a Batch of instances
   *
   * @throws IOException throws IOException if saving in batch mode is not possible
   */
  public void writeBatch() throws IOException {

    Instances instances = getInstances();

    if (instances == null) throw new IOException("No instances to save");
    if (instances.classIndex() == -1) {
      instances.setClassIndex(instances.numAttributes() - 1);
      System.err.println("No class specified. Last attribute is used as class attribute.");
    }
    if (instances.attribute(instances.classIndex()).isNumeric())
      throw new IOException("To save in C4.5 format the class attribute cannot be numeric.");
    if (getRetrieval() == INCREMENTAL)
      throw new IOException("Batch and incremental saving cannot be mixed.");

    setRetrieval(BATCH);
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException(
          "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option.");
    }
    setWriteMode(WRITE);
    // print names file
    setFileExtension(".names");
    PrintWriter outW = new PrintWriter(getWriter());
    for (int i = 0; i < instances.attribute(instances.classIndex()).numValues(); i++) {
      outW.write(instances.attribute(instances.classIndex()).value(i));
      if (i < instances.attribute(instances.classIndex()).numValues() - 1) {
        outW.write(",");
      } else {
        outW.write(".\n");
      }
    }
    for (int i = 0; i < instances.numAttributes(); i++) {
      if (i != instances.classIndex()) {
        outW.write(instances.attribute(i).name() + ": ");
        if (instances.attribute(i).isNumeric() || instances.attribute(i).isDate()) {
          outW.write("continuous.\n");
        } else {
          Attribute temp = instances.attribute(i);
          for (int j = 0; j < temp.numValues(); j++) {
            outW.write(temp.value(j));
            if (j < temp.numValues() - 1) {
              outW.write(",");
            } else {
              outW.write(".\n");
            }
          }
        }
      }
    }
    outW.flush();
    outW.close();

    // print data file
    String out = retrieveFile().getAbsolutePath();
    setFileExtension(".data");
    out = out.substring(0, out.lastIndexOf('.')) + getFileExtension();
    File namesFile = new File(out);
    try {
      setFile(namesFile);
    } catch (Exception ex) {
      throw new IOException(
          "Cannot create data file, only names file created (Reason: " + ex.toString() + ").");
    }
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException("Cannot create data file, only names file created.");
    }
    outW = new PrintWriter(getWriter());
    // print data file
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance temp = instances.instance(i);
      for (int j = 0; j < temp.numAttributes(); j++) {
        if (j != instances.classIndex()) {
          if (temp.isMissing(j)) {
            outW.write("?,");
          } else if (instances.attribute(j).isNominal() || instances.attribute(j).isString()) {
            outW.write(instances.attribute(j).value((int) temp.value(j)) + ",");
          } else {
            outW.write("" + temp.value(j) + ",");
          }
        }
      }
      // write the class value
      if (temp.isMissing(instances.classIndex())) {
        outW.write("?");
      } else {
        outW.write(
            instances
                .attribute(instances.classIndex())
                .value((int) temp.value(instances.classIndex())));
      }
      outW.write("\n");
    }
    outW.flush();
    outW.close();
    setFileExtension(".names");
    setWriteMode(WAIT);
    outW = null;
    resetWriter();
    setWriteMode(CANCEL);
  }
Example #23
0
  /**
   * Saves an instances incrementally. Structure has to be set by using the setStructure() method or
   * setInstances() method.
   *
   * @param inst the instance to save
   * @throws IOException throws IOEXception if an instance cannot be saved incrementally.
   */
  public void writeIncremental(Instance inst) throws IOException {

    int writeMode = getWriteMode();
    Instances structure = getInstances();
    PrintWriter outW = null;

    if (structure != null) {
      if (structure.classIndex() == -1) {
        structure.setClassIndex(structure.numAttributes() - 1);
        System.err.println("No class specified. Last attribute is used as class attribute.");
      }
      if (structure.attribute(structure.classIndex()).isNumeric())
        throw new IOException("To save in C4.5 format the class attribute cannot be numeric.");
    }
    if (getRetrieval() == BATCH || getRetrieval() == NONE)
      throw new IOException("Batch and incremental saving cannot be mixed.");
    if (retrieveFile() == null || getWriter() == null) {
      throw new IOException(
          "C4.5 format requires two files. Therefore no output to standard out can be generated.\nPlease specifiy output files using the -o option.");
    }

    outW = new PrintWriter(getWriter());

    if (writeMode == WAIT) {
      if (structure == null) {
        setWriteMode(CANCEL);
        if (inst != null)
          System.err.println("Structure(Header Information) has to be set in advance");
      } else setWriteMode(STRUCTURE_READY);
      writeMode = getWriteMode();
    }
    if (writeMode == CANCEL) {
      if (outW != null) outW.close();
      cancel();
    }
    if (writeMode == STRUCTURE_READY) {
      setWriteMode(WRITE);
      // write header: here names file
      for (int i = 0; i < structure.attribute(structure.classIndex()).numValues(); i++) {
        outW.write(structure.attribute(structure.classIndex()).value(i));
        if (i < structure.attribute(structure.classIndex()).numValues() - 1) {
          outW.write(",");
        } else {
          outW.write(".\n");
        }
      }
      for (int i = 0; i < structure.numAttributes(); i++) {
        if (i != structure.classIndex()) {
          outW.write(structure.attribute(i).name() + ": ");
          if (structure.attribute(i).isNumeric() || structure.attribute(i).isDate()) {
            outW.write("continuous.\n");
          } else {
            Attribute temp = structure.attribute(i);
            for (int j = 0; j < temp.numValues(); j++) {
              outW.write(temp.value(j));
              if (j < temp.numValues() - 1) {
                outW.write(",");
              } else {
                outW.write(".\n");
              }
            }
          }
        }
      }
      outW.flush();
      outW.close();

      writeMode = getWriteMode();

      String out = retrieveFile().getAbsolutePath();
      setFileExtension(".data");
      out = out.substring(0, out.lastIndexOf('.')) + getFileExtension();
      File namesFile = new File(out);
      try {
        setFile(namesFile);
      } catch (Exception ex) {
        throw new IOException("Cannot create data file, only names file created.");
      }
      if (retrieveFile() == null || getWriter() == null) {
        throw new IOException("Cannot create data file, only names file created.");
      }
      outW = new PrintWriter(getWriter());
    }
    if (writeMode == WRITE) {
      if (structure == null) throw new IOException("No instances information available.");
      if (inst != null) {
        // write instance: here data file
        for (int j = 0; j < inst.numAttributes(); j++) {
          if (j != structure.classIndex()) {
            if (inst.isMissing(j)) {
              outW.write("?,");
            } else if (structure.attribute(j).isNominal() || structure.attribute(j).isString()) {
              outW.write(structure.attribute(j).value((int) inst.value(j)) + ",");
            } else {
              outW.write("" + inst.value(j) + ",");
            }
          }
        }
        // write the class value
        if (inst.isMissing(structure.classIndex())) {
          outW.write("?");
        } else {
          outW.write(
              structure
                  .attribute(structure.classIndex())
                  .value((int) inst.value(structure.classIndex())));
        }
        outW.write("\n");
        // flushes every 100 instances
        m_incrementalCounter++;
        if (m_incrementalCounter > 100) {
          m_incrementalCounter = 0;
          outW.flush();
        }
      } else {
        // close
        if (outW != null) {
          outW.flush();
          outW.close();
        }
        setFileExtension(".names");
        m_incrementalCounter = 0;
        resetStructure();
        outW = null;
        resetWriter();
      }
    }
  }
Example #24
0
  /**
   * Set the output format. Takes the current average class values and m_InputFormat and calls
   * setOutputFormat(Instances) appropriately.
   */
  private void setOutputFormat() {

    Instances newData;
    FastVector newAtts, newVals;
    boolean firstEndsWithPrime = false, secondEndsWithPrime = false;
    StringBuffer text = new StringBuffer();

    // Compute new attributes

    newAtts = new FastVector(getInputFormat().numAttributes());
    for (int j = 0; j < getInputFormat().numAttributes(); j++) {
      Attribute att = getInputFormat().attribute(j);
      if (j != m_AttIndex.getIndex()) {
        newAtts.addElement(att.copy());
      } else {

        // Compute new value

        if (att.value(m_FirstIndex.getIndex()).endsWith("'")) {
          firstEndsWithPrime = true;
        }
        if (att.value(m_SecondIndex.getIndex()).endsWith("'")) {
          secondEndsWithPrime = true;
        }
        if (firstEndsWithPrime || secondEndsWithPrime) {
          text.append("'");
        }
        if (firstEndsWithPrime) {
          text.append(
              ((String) att.value(m_FirstIndex.getIndex()))
                  .substring(1, ((String) att.value(m_FirstIndex.getIndex())).length() - 1));
        } else {
          text.append((String) att.value(m_FirstIndex.getIndex()));
        }
        text.append('_');
        if (secondEndsWithPrime) {
          text.append(
              ((String) att.value(m_SecondIndex.getIndex()))
                  .substring(1, ((String) att.value(m_SecondIndex.getIndex())).length() - 1));
        } else {
          text.append((String) att.value(m_SecondIndex.getIndex()));
        }
        if (firstEndsWithPrime || secondEndsWithPrime) {
          text.append("'");
        }

        // Compute list of attribute values

        newVals = new FastVector(att.numValues() - 1);
        for (int i = 0; i < att.numValues(); i++) {
          if (i == m_FirstIndex.getIndex()) {
            newVals.addElement(text.toString());
          } else if (i != m_SecondIndex.getIndex()) {
            newVals.addElement(att.value(i));
          }
        }

        Attribute newAtt = new Attribute(att.name(), newVals);
        newAtt.setWeight(getInputFormat().attribute(j).weight());

        newAtts.addElement(newAtt);
      }
    }

    // Construct new header

    newData = new Instances(getInputFormat().relationName(), newAtts, 0);
    newData.setClassIndex(getInputFormat().classIndex());
    setOutputFormat(newData);
  }
Example #25
0
  /**
   * Sets up the panel with a new set of instances, attempting to guess the correct settings for
   * various columns.
   *
   * @param newInstances the new set of results.
   */
  public void setInstances(Instances newInstances) {

    m_Instances = newInstances;
    m_TTester.setInstances(m_Instances);
    m_FromLab.setText("Got " + m_Instances.numInstances() + " results");

    // Temporarily remove the configuration listener
    m_RunCombo.removeActionListener(m_ConfigureListener);

    // Do other stuff
    m_DatasetKeyModel.removeAllElements();
    m_RunModel.removeAllElements();
    m_ResultKeyModel.removeAllElements();
    m_CompareModel.removeAllElements();
    int datasetCol = -1;
    int runCol = -1;
    String selectedList = "";
    String selectedListDataset = "";
    // =============== BEGIN EDIT melville ===============
    boolean noise = false; // keep track of whether noise levels eval is required
    boolean learning = false; // keep track of whether learning curve eval is required
    boolean fraction =
        false; // keep track of whether fractions of datasets are provided for learning
    // the key on which to base the learning curves (either total instances or fraction)
    int learning_key = -1;
    boolean classificationTask =
        false; // used to determine if regression measures should be selected
    // =============== END EDIT melville ===============
    for (int i = 0; i < m_Instances.numAttributes(); i++) {
      String name = m_Instances.attribute(i).name();
      m_DatasetKeyModel.addElement(name);
      m_RunModel.addElement(name);
      m_ResultKeyModel.addElement(name);
      m_CompareModel.addElement(name);

      // =============== BEGIN EDIT melville ===============
      // If learning curves were generated then treat each
      // dataset + pt combination as a different dataset
      if (name.toLowerCase().equals("key_noise_levels")) {
        // noise overrides learning curves - but treat noise levels
        // like pts on learning curve
        learning_key = i;
        learning = true;
        noise = true;
        // fraction = true;
      } else if (name.toLowerCase().equals("key_fraction_instances") && !noise) {
        // fraction overrides total_instances
        learning_key = i;
        learning = true;
        fraction = true;
      } else if (name.toLowerCase().equals("key_total_instances") && !learning) {
        learning_key = i;
        learning = true;
      } else
      // =============== END EDIT melville ===============
      if (name.toLowerCase().equals("key_dataset")) {
        m_DatasetKeyList.addSelectionInterval(i, i);
        selectedListDataset += "," + (i + 1);
      } else if ((runCol == -1) && (name.toLowerCase().equals("key_run"))) {
        m_RunCombo.setSelectedIndex(i);
        runCol = i;
      } else if (name.toLowerCase().equals("key_scheme")
          || name.toLowerCase().equals("key_scheme_options")
          || name.toLowerCase().equals("key_scheme_version_id")) {
        m_ResultKeyList.addSelectionInterval(i, i);
        selectedList += "," + (i + 1);
        // =============== BEGIN EDIT mbilenko ===============
        // automatic selection of the correct measure for clustering experiments
      } else if (name.toLowerCase().indexOf("pairwise_f_measure") != -1) {
        m_CompareCombo.setSelectedIndex(i);
        m_ErrorCompareCol = i;
      }
      // automatic selection of the correct measure for deduping experiments
      else if (name.toLowerCase().equals("precision")) {
        m_CompareCombo.setSelectedIndex(i);
        // =============== END EDIT mbilenko ===============
      } else if (name.toLowerCase().indexOf("percent_correct") != -1) {
        m_CompareCombo.setSelectedIndex(i);
        classificationTask = true;
      } else if (!classificationTask
          && (name.toLowerCase().indexOf("root_mean_squared_error") != -1)) {
        // automatic selection of the correct measure for regression experiments
        m_CompareCombo.setSelectedIndex(i);
      } else if (name.toLowerCase().indexOf("percent_incorrect") != -1) {
        m_ErrorCompareCol = i;
        // remember index of error for computing error reductions
      }
    }
    // =============== BEGIN EDIT melville ===============
    if (learning) {
      m_DatasetKeyList.addSelectionInterval(learning_key, learning_key);
      selectedListDataset += "," + (learning_key + 1);
      m_CompareModel.addElement("%Error_reduction");
      m_CompareModel.addElement("Top_20%_%Error_reduction");
    }
    // =============== END EDIT melville ===============

    if (runCol == -1) {
      runCol = 0;
    }
    m_DatasetKeyBut.setEnabled(true);
    m_RunCombo.setEnabled(true);
    m_ResultKeyBut.setEnabled(true);
    m_CompareCombo.setEnabled(true);

    // Reconnect the configuration listener
    m_RunCombo.addActionListener(m_ConfigureListener);

    // Set up the TTester with the new data
    m_TTester.setRunColumn(runCol);
    Range generatorRange = new Range();
    if (selectedList.length() != 0) {
      try {
        generatorRange.setRanges(selectedList);
      } catch (Exception ex) {
        ex.printStackTrace();
        System.err.println(ex.getMessage());
      }
    }
    m_TTester.setResultsetKeyColumns(generatorRange);

    generatorRange = new Range();
    if (selectedListDataset.length() != 0) {
      try {
        generatorRange.setRanges(selectedListDataset);
      } catch (Exception ex) {
        ex.printStackTrace();
        System.err.println(ex.getMessage());
      }
    }
    m_TTester.setDatasetKeyColumns(generatorRange);
    // =============== BEGIN EDIT melville ===============
    m_TTester.setLearningCurve(learning);
    m_TTester.setFraction(fraction);
    if (learning) { // get points on the learning curve
      Attribute attr;
      if (noise) { // override fraction
        attr = m_Instances.attribute("Key_Noise_levels");
      } else if (fraction) {
        attr = m_Instances.attribute("Key_Fraction_instances");
      } else {
        attr = m_Instances.attribute("Key_Total_instances");
      }
      double[] pts = new double[attr.numValues()];
      for (int k = 0; k < attr.numValues(); k++) {
        pts[k] = Double.parseDouble(attr.value(k));
      }
      // sort points
      Arrays.sort(pts);
      m_TTester.setPoints(pts);
    }
    // =============== END EDIT melville ===============
    m_SigTex.setEnabled(true);
    m_PrecTex.setEnabled(true);

    setTTester();
  }
  /**
   * TextDirectoryLoader is unable to process a data set incrementally.
   *
   * @param structure ignored
   * @return never returns without throwing an exception
   * @throws IOException always. TextDirectoryLoader is unable to process a data set incrementally.
   */
  public Instance getNextInstance(Instances structure) throws IOException {
    // throw new IOException("TextDirectoryLoader can't read data sets incrementally.");

    String directoryPath = getDirectory().getAbsolutePath();
    Attribute classAtt = structure.classAttribute();
    if (m_filesByClass == null) {
      m_filesByClass = new ArrayList<LinkedList<String>>();
      for (int i = 0; i < classAtt.numValues(); i++) {
        File classDir = new File(directoryPath + File.separator + classAtt.value(i));
        String[] files = classDir.list();
        LinkedList<String> classDocs = new LinkedList<String>();
        for (String cd : files) {
          File txt =
              new File(directoryPath + File.separator + classAtt.value(i) + File.separator + cd);
          if (txt.isFile()) {
            classDocs.add(cd);
          }
        }
        m_filesByClass.add(classDocs);
      }
    }

    // cycle through the classes
    int count = 0;
    LinkedList<String> classContents = m_filesByClass.get(m_lastClassDir);
    boolean found = (classContents.size() > 0);
    while (classContents.size() == 0) {
      m_lastClassDir++;
      count++;
      if (m_lastClassDir == structure.classAttribute().numValues()) {
        m_lastClassDir = 0;
      }
      classContents = m_filesByClass.get(m_lastClassDir);
      if (classContents.size() > 0) {
        found = true; // we have an instance we can create
        break;
      }
      if (count == structure.classAttribute().numValues()) {
        break; // must be finished
      }
    }

    if (found) {
      String nextDoc = classContents.poll();
      File txt =
          new File(
              directoryPath
                  + File.separator
                  + classAtt.value(m_lastClassDir)
                  + File.separator
                  + nextDoc);

      BufferedReader is;
      if (m_charSet == null || m_charSet.length() == 0) {
        is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
      } else {
        is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
      }
      StringBuffer txtStr = new StringBuffer();
      int c;
      while ((c = is.read()) != -1) {
        txtStr.append((char) c);
      }

      double[] newInst = null;
      if (m_OutputFilename) newInst = new double[3];
      else newInst = new double[2];

      if (getRetainStringValues()) {
        newInst[0] = (double) structure.attribute(0).addStringValue(txtStr.toString());
      } else {
        newInst[0] = 0;
        structure.attribute(0).setStringValue(txtStr.toString());
      }
      if (m_OutputFilename) {
        if (getRetainStringValues()) {
          newInst[1] = (double) structure.attribute(1).addStringValue(txt.getAbsolutePath());
        } else {
          newInst[1] = 0;
          structure.attribute(1).setStringValue(txt.getAbsolutePath());
        }
      }
      newInst[structure.classIndex()] = (double) m_lastClassDir;
      Instance inst = new DenseInstance(1.0, newInst);
      inst.setDataset(structure);
      is.close();

      m_lastClassDir++;
      if (m_lastClassDir == structure.classAttribute().numValues()) {
        m_lastClassDir = 0;
      }

      return inst;
    } else {
      return null; // done!
    }
  }