예제 #1
0
  /**
   * Computes a clustering accuracy: every class is mapped to the cluster that contains most of its
   * instances, and the result is the fraction of instances that lie in the cluster mapped to their
   * own class.
   *
   * <p>The class label is read from the last attribute of {@code odata} and truncated to an int.
   * Class labels and cluster ids are used directly as array indices, so both must be non-negative.
   * The class-by-cluster count matrix is printed to standard output, one row per class.
   *
   * @param odata the labelled instances; the last attribute is taken as the class
   * @param clusters the cluster id assigned to each instance, in instance order
   * @return the fraction of instances that fall into their class's dominant cluster, or 0 when
   *     there are no instances or no cluster assignments to score
   */
  public static double CA(Instances odata, int[] clusters) {
    double[] classValues = odata.attributeToDoubleArray(odata.numAttributes() - 1);
    int[] oclass = new int[odata.numInstances()];
    for (int i = 0; i < classValues.length; ++i) {
      oclass[i] = (int) classValues[i];
    }

    // Nothing to score: avoid indexing into empty arrays and dividing by zero.
    if (oclass.length == 0 || clusters.length == 0) {
      return 0.0;
    }

    int maxClass = 0;
    for (int label : oclass) {
      maxClass = Math.max(maxClass, label);
    }
    int maxCluster = 0;
    for (int cluster : clusters) {
      maxCluster = Math.max(maxCluster, cluster);
    }

    // M[c][k] = number of instances of class c that were assigned to cluster k.
    int[][] M = new int[maxClass + 1][maxCluster + 1];
    for (int i = 0; i < clusters.length; ++i) {
      M[oclass[i]][clusters[i]]++;
    }
    for (int[] row : M) {
      System.out.println(Arrays.toString(row));
    }

    // True argmax per class (first maximum wins on ties), kept in its own array so the count
    // matrix is not overwritten.
    int[] dominantCluster = new int[M.length];
    for (int c = 0; c < M.length; ++c) {
      int best = 0;
      for (int k = 1; k < M[c].length; ++k) {
        if (M[c][k] > M[c][best]) {
          best = k;
        }
      }
      dominantCluster[c] = best;
    }

    int matches = 0;
    for (int i = 0; i < oclass.length; ++i) {
      if (dominantCluster[oclass[i]] == clusters[i]) {
        matches++;
      }
    }

    return (double) matches / (double) odata.numInstances();
  }
예제 #2
0
  /**
   * Separates the dataset into the instances that the rule at the given index covers and those it
   * does not, accumulating instance weights into the supplied arrays as it goes.
   *
   * <p>Slots of {@code stats} that are incremented: [0] weight covered, [1] weight not covered,
   * [2] covered with class equal to the rule consequent (true positives), [3] not covered with a
   * different class (true negatives), [4] covered with a different class (false positives), [5]
   * not covered with class equal to the consequent (false negatives).
   *
   * @param index the position of the rule in the ruleset, assumed valid
   * @param insts the dataset to test against the rule
   * @param stats the array receiving the weighted statistics, modified in place
   * @param dist the array receiving the weighted class distribution of the covered instances,
   *     modified in place; may be null, in which case no distribution is collected
   * @return a two-element array: covered instances first, uncovered instances second
   */
  private Instances[] computeSimpleStats(
      int index, Instances insts, double[] stats, double[] dist) {
    final Rule rule = (Rule) m_Ruleset.elementAt(index);
    final int total = insts.numInstances();

    final Instances covered = new Instances(insts, total);
    final Instances uncovered = new Instances(insts, total);

    for (int pos = 0; pos < total; pos++) {
      final Instance datum = insts.instance(pos);
      final double w = datum.weight();

      if (!rule.covers(datum)) {
        uncovered.add(datum);
        stats[1] += w;
        boolean sameClass = (int) datum.classValue() == (int) rule.getConsequent();
        // Same class but not covered is a false negative, otherwise a true negative.
        stats[sameClass ? 5 : 3] += w;
        continue;
      }

      covered.add(datum);
      stats[0] += w;
      boolean sameClass = (int) datum.classValue() == (int) rule.getConsequent();
      // Same class and covered is a true positive, otherwise a false positive.
      stats[sameClass ? 2 : 4] += w;
      if (dist != null) {
        dist[(int) datum.classValue()] += w;
      }
    }

    return new Instances[] {covered, uncovered};
  }
예제 #3
0
  /**
   * Keeps only the instances whose weights make up the requested share of the total weight.
   *
   * <p>Instances are visited from the end of the index array returned by {@code Utils.sort}
   * (heaviest first, assuming that sort is ascending) and copied into the result until the
   * accumulated weight exceeds {@code quantile} times the total. The cut is never made between two
   * instances of equal weight.
   *
   * @param data the input instances
   * @param quantile the share of the weight mass to keep, e.g. 0.9 for 90%
   * @return a new dataset holding copies of the selected instances
   */
  protected Instances selectWeightQuantile(Instances data, double quantile) {
    final int total = data.numInstances();
    final double[] weights = new double[total];

    double mass = 0;
    for (int pos = 0; pos < total; pos++) {
      weights[pos] = data.instance(pos).weight();
      mass += weights[pos];
    }
    final double targetMass = mass * quantile;
    final int[] order = Utils.sort(weights);

    final Instances selected = new Instances(data, total);
    double accumulated = 0;
    for (int rank = total - 1; rank >= 0; rank--) {
      final int current = order[rank];
      selected.add((Instance) data.instance(current).copy());
      accumulated += weights[current];

      if (accumulated <= targetMass || rank == 0) {
        continue;
      }
      // Enough mass collected: stop unless the next instance ties with this one.
      if (weights[current] != weights[order[rank - 1]]) {
        break;
      }
    }

    if (m_Debug) {
      System.err.println("Selected " + selected.numInstances() + " out of " + total);
    }
    return selected;
  }
  /**
   * Computes the median of the non-missing values of every numeric attribute and stores it in
   * {@code medians}; also re-allocates {@code imputations} with one slot per attribute. Attributes
   * that are not numeric, or that have no present value, keep a median of 0. Each numeric
   * attribute's median is logged once all medians have been computed.
   *
   * @param instances the dataset to scan
   */
  protected void searchMedian(Instances instances) {
    final int attributeCount = instances.numAttributes();
    medians = new double[attributeCount];
    imputations = new int[attributeCount];

    for (int att = 0; att < attributeCount; ++att) {
      if (!instances.attribute(att).isNumeric()) {
        continue;
      }
      // Collect the present values at the front of a buffer sized for the worst case.
      double[] buffer = new double[instances.numInstances()];
      int present = 0;
      for (int row = 0; row < instances.numInstances(); ++row) {
        Instance current = instances.get(row);
        if (!Utils.isMissingValue(current.value(att))) {
          buffer[present++] = current.value(att);
        }
      }
      if (present > 0) {
        medians[att] = new Median().evaluate(Arrays.copyOf(buffer, present));
      }
    }

    for (int att = 0; att < attributeCount; ++att) {
      if (!instances.attribute(att).isNumeric()) {
        continue;
      }
      Conversion.log(
          "OK",
          "Impute Numeric",
          "Attribute " + instances.attribute(att) + " - Median: " + medians[att]);
    }
  }
  /**
   * Verifies that two datasets are equivalent and throws as soon as a difference is found.
   *
   * <p>Checks, in order: the headers (only if {@code m_CheckHeader} is set), the number of
   * instances, and then for each instance every attribute value followed by the instance weight.
   * Values are compared either as strings or numerically within {@code m_MaxDiffValues},
   * depending on {@code m_CompareValuesAsString}; weights are compared within
   * {@code m_MaxDiffWeights}.
   *
   * @param data1 one set of instances
   * @param data2 the other set of instances
   * @throws Exception if the datasets differ
   */
  protected void compareDatasets(Instances data1, Instances data2) throws Exception {
    if (m_CheckHeader && !data2.equalHeaders(data1)) {
      throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1));
    }
    if (data2.numInstances() != data1.numInstances()) {
      throw new Exception("number of instances has changed");
    }

    for (int row = 0; row < data2.numInstances(); row++) {
      final Instance orig = data1.instance(row);
      final Instance copy = data2.instance(row);

      for (int att = 0; att < orig.numAttributes(); att++) {
        final boolean valueChanged;
        if (orig.isMissing(att)) {
          valueChanged = !copy.isMissing(att);
        } else if (m_CompareValuesAsString) {
          valueChanged = !orig.toString(att).equals(copy.toString(att));
        } else {
          valueChanged = Math.abs(orig.value(att) - copy.value(att)) > m_MaxDiffValues;
        }
        if (valueChanged) {
          throw new Exception("instances have changed");
        }

        // The weight is re-checked on every attribute pass, as in the original ordering.
        if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) {
          throw new Exception("instance weights have changed");
        }
      }
    }
  }
예제 #6
0
  /**
   * Distributes the given instances over {@code m_numSubsets} subsets.
   *
   * <p>An instance for which {@code whichSubset} returns a non-negative index goes to that subset
   * unchanged. Otherwise it is added to every subset whose entry in {@code weights(instance)} is
   * greater than zero, with its weight multiplied by that entry.
   *
   * @param data the instances to split
   * @return one dataset per subset, compactified
   * @exception Exception if something goes wrong
   */
  public final Instances[] split(Instances data) throws Exception {
    final int total = data.numInstances();
    final Instances[] subsets = new Instances[m_numSubsets];
    for (int s = 0; s < m_numSubsets; s++) {
      subsets[s] = new Instances(data, total);
    }

    for (int pos = 0; pos < total; pos++) {
      final Instance current = data.instance(pos);
      final double[] fractions = weights(current);
      final int target = whichSubset(current);

      if (target > -1) {
        subsets[target].add(current);
        continue;
      }

      // No single subset applies: spread the instance over all subsets with positive weight.
      for (int s = 0; s < m_numSubsets; s++) {
        if (!Utils.gr(fractions[s], 0)) {
          continue;
        }
        final double scaledWeight = fractions[s] * current.weight();
        subsets[s].add(current);
        subsets[s].lastInstance().setWeight(scaledWeight);
      }
    }

    for (int s = 0; s < m_numSubsets; s++) {
      subsets[s].compactify();
    }
    return subsets;
  }
예제 #7
0
 /**
  * Draws a sample with replacement from the training data: as many instances as the dataset
  * contains are picked uniformly at random, so the same instance may appear more than once.
  *
  * @param train the dataset to sample from
  * @return the sampled instances, in draw order
  */
 private ArrayList<Instance> rellenarConInstancias(Instances train) {
   final Random rng = new Random();
   final ArrayList<Instance> muestras = new ArrayList<Instance>();
   int drawn = 0;
   while (drawn < train.numInstances()) {
     int pick = rng.nextInt(train.numInstances());
     muestras.add(train.instance(pick));
     drawn++;
   }
   return muestras;
 }
예제 #8
0
  /**
   * Partitions the data into two consecutive parts: the first holds (numFolds-1)/numFolds of the
   * instances (rounded down) and the second holds the remaining instances.
   *
   * @param data the given data
   * @param numFolds the given number of folds; must not be zero
   * @return a two-element array with the larger leading part first and the trailing part second
   */
  public static final Instances[] partition(Instances data, int numFolds) {
    final int total = data.numInstances();
    // Multiply in long: total * (numFolds - 1) can exceed the int range for large datasets.
    final int splits = (int) ((long) total * (numFolds - 1) / numFolds);

    Instances[] rt = new Instances[2];
    rt[0] = new Instances(data, 0, splits);
    rt[1] = new Instances(data, splits, total - splits);

    return rt;
  }
예제 #9
0
  /**
   * Interactively loads experiment results from a database.
   *
   * <p>Asks for the database URL, connects, reads the experiment index table, lets the user pick
   * one entry from a list dialog and then loads the matching result table through
   * {@code setInstancesFromDatabaseTable}. Progress and every early exit are reported in
   * {@code m_FromLab}; any exception ends the operation with a generic message in that label.
   */
  protected void setInstancesFromDBaseQuery() {

    try {
      if (m_InstanceQuery == null) {
        m_InstanceQuery = new InstanceQuery();
      }

      // Pre-fill the prompt with the URL currently configured on the query object.
      final Object entered =
          JOptionPane.showInputDialog(
              this,
              "Enter the database URL",
              "Query Database",
              JOptionPane.PLAIN_MESSAGE,
              null,
              null,
              m_InstanceQuery.getDatabaseURL());
      final String dbaseURL = (String) entered;
      if (dbaseURL == null) {
        m_FromLab.setText("Cancelled");
        return;
      }

      m_InstanceQuery.setDatabaseURL(dbaseURL);
      m_InstanceQuery.connectToDatabase();
      if (!m_InstanceQuery.experimentIndexExists()) {
        m_FromLab.setText("No experiment index");
        return;
      }

      m_FromLab.setText("Getting experiment index");
      final Instances index =
          m_InstanceQuery.retrieveInstances("SELECT * FROM " + InstanceQuery.EXP_INDEX_TABLE);
      final int experimentCount = index.numInstances();
      if (experimentCount == 0) {
        m_FromLab.setText("No experiments available");
        return;
      }
      m_FromLab.setText("Got experiment index");

      // One list entry per experiment, shown as the instance's string form.
      final DefaultListModel listModel = new DefaultListModel();
      for (int row = 0; row < experimentCount; row++) {
        listModel.addElement(index.instance(row).toString());
      }
      final JList experimentList = new JList(listModel);
      final ListSelectorDialog selector = new ListSelectorDialog(null, experimentList);
      if (selector.showDialog() != ListSelectorDialog.APPROVE_OPTION) {
        m_FromLab.setText("Cancelled");
        return;
      }

      final Instance chosen = index.instance(experimentList.getSelectedIndex());
      final Attribute resultColumn = index.attribute(InstanceQuery.EXP_RESULT_COL);
      setInstancesFromDatabaseTable(
          InstanceQuery.EXP_RESULT_PREFIX + chosen.toString(resultColumn));
    } catch (Exception ex) {
      m_FromLab.setText("Problem reading database");
    }
  }
예제 #10
0
 /**
  * Builds the term-document matrix by transposing the document-term dataset: rows are
  * attributes, columns are instances. Only the values each instance stores explicitly are
  * copied; every other cell stays 0.
  *
  * @param data instances with document-term info
  * @return a term-document matrix transposed from the input dataset
  */
 private Matrix getTransposedMatrix(Instances data) {
   final int documents = data.numInstances();
   final double[][] cells = new double[data.numAttributes()][documents];

   for (int doc = 0; doc < documents; doc++) {
     final Instance row = data.instance(doc);
     for (int slot = 0; slot < row.numValues(); slot++) {
       final int term = row.index(slot);
       cells[term][doc] = row.valueSparse(slot);
     }
   }
   return new Matrix(cells);
 }
예제 #11
0
 /**
  * Builds a transposed matrix (rows are attributes, columns are instances) in which each
  * instance's values are divided by the sum of all of that instance's values. No special
  * handling is applied when that sum is zero.
  *
  * @param data the instances to transpose and normalise
  * @return the attribute-by-instance matrix of normalised values
  */
 private Matrix getTransposedNormedMatrix(Instances data) {
   final Matrix normed = new Matrix(data.numAttributes(), data.numInstances());
   int column = 0;
   while (column < data.numInstances()) {
     final double[] row = data.instance(column).toDoubleArray();
     final double rowTotal = Utils.sum(row);
     for (int att = 0; att < row.length; att++) {
       row[att] = row[att] / rowTotal;
       normed.set(att, column, row[att]);
     }
     column++;
   }
   return normed;
 }
예제 #12
0
  /**
   * Collects every TEST row of {@code task_splits} that none of the runs in {@code run_ids}
   * predicted correctly and stores them, as (repeat, fold, rowid) triples, in {@code resultSet}.
   *
   * <p>The "sample" attribute is optional: when {@code task_splits} does not have it, sample 0 is
   * used for the prediction lookup.
   *
   * @return the number of rows that all runs got wrong
   * @throws RuntimeException if fewer than two runs are available for comparison
   */
  public int calculateAllWrong() {
    if (run_ids.size() < 2) {
      throw new RuntimeException("Too few runs to compare. Should be at least 2. ");
    }

    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    attributes.add(new Attribute("repeat"));
    attributes.add(new Attribute("fold"));
    attributes.add(new Attribute("rowid"));
    resultSet = new Instances("all-wrong", attributes, task_splits.numInstances());

    // Resolve the attributes once instead of on every row.
    final Attribute typeAttribute = task_splits.attribute("type");
    final Attribute rowIdAttribute = task_splits.attribute("rowid");
    final Attribute repeatAttribute = task_splits.attribute("repeat");
    final Attribute foldAttribute = task_splits.attribute("fold");
    // Null when the splits carry no "sample" column; checked explicitly below rather than by
    // catching the exception a lookup through a null attribute would raise.
    final Attribute sampleAttribute = task_splits.attribute("sample");

    for (int i = 0; i < task_splits.numInstances(); ++i) {
      Instance current = task_splits.get(i);
      if (!current.stringValue(typeAttribute).equals("TEST")) {
        continue;
      }

      Integer row_id = (int) current.value(rowIdAttribute);
      Integer repeat = (int) current.value(repeatAttribute);
      Integer fold = (int) current.value(foldAttribute);
      Integer sample = (sampleAttribute == null) ? 0 : (int) current.value(sampleAttribute);

      String correctLabel = correct.get(row_id);
      int correctPredictions = 0;

      for (Integer run_id : run_ids) {
        if (predictions
            .get(run_id)
            .get(repeat)
            .get(fold)
            .get(sample)
            .get(row_id)
            .equals(correctLabel)) {
          correctPredictions += 1;
        }
      }

      if (correctPredictions == 0) {
        double[] instance = {repeat, fold, row_id};
        resultSet.add(new DenseInstance(1.0, instance));
      }
    }
    return resultSet.size();
  }
예제 #13
0
  /**
   * Rebuilds the distribution associated with this model from the given data.
   *
   * <p>Instances for which {@code whichSubset} returns a non-negative index seed the new
   * distribution; the full dataset is then passed to {@code addInstWithUnknown} for the split
   * attribute before the result replaces {@code m_distribution}.
   *
   * @param data the instances to derive the distribution from
   * @throws Exception if the subset lookup or the distribution construction fails
   */
  public void resetDistribution(Instances data) throws Exception {
    final int total = data.numInstances();
    final Instances assigned = new Instances(data, total);

    for (int pos = 0; pos < total; pos++) {
      final Instance candidate = data.instance(pos);
      if (whichSubset(candidate) > -1) {
        assigned.add(candidate);
      }
    }

    final Distribution rebuilt = new Distribution(assigned, this);
    rebuilt.addInstWithUnknown(data, m_attIndex);
    m_distribution = rebuilt;
  }
예제 #14
0
  /**
   * Turn the list of nearest neighbors into a probability distribution.
   *
   * <p>For a nominal class every class starts with a small prior of 1 / max(1, number of training
   * instances). Each neighbour then contributes its instance weight multiplied by a factor chosen
   * by {@code m_DistanceWeighting}. For a numeric class, slot 0 accumulates the weighted class
   * values. The result is normalised by the total weight when that total is positive.
   *
   * <p>Note that {@code distances} is modified in place: each entry is replaced by
   * sqrt(d * d / m_NumAttributesUsed).
   *
   * @param neighbours the list of nearest neighboring instances
   * @param distances the distances of the neighbors; overwritten with the rescaled distances
   * @return the probability distribution
   * @throws Exception if computation goes wrong or has no class attribute
   */
  protected double[] makeDistribution(Instances neighbours, double[] distances) throws Exception {

    double total = 0, weight;
    double[] distribution = new double[m_NumClasses];

    // Set up a correction to the estimator
    if (m_ClassType == Attribute.NOMINAL) {
      for (int i = 0; i < m_NumClasses; i++) {
        distribution[i] = 1.0 / Math.max(1, m_Train.numInstances());
      }
      total = (double) m_NumClasses / Math.max(1, m_Train.numInstances());
    }

    for (int i = 0; i < neighbours.numInstances(); i++) {
      // Collect class counts
      Instance current = neighbours.instance(i);
      distances[i] = distances[i] * distances[i];
      distances[i] = Math.sqrt(distances[i] / m_NumAttributesUsed);
      switch (m_DistanceWeighting) {
        case WEIGHT_INVERSE:
          weight = 1.0 / (distances[i] + 0.001); // to avoid div by zero
          break;
        case WEIGHT_SIMILARITY:
          weight = 1.0 - distances[i];
          break;
        default: // WEIGHT_NONE:
          weight = 1.0;
          break;
      }
      weight *= current.weight();
      try {
        switch (m_ClassType) {
          case Attribute.NOMINAL:
            distribution[(int) current.classValue()] += weight;
            break;
          case Attribute.NUMERIC:
            distribution[0] += current.classValue() * weight;
            break;
        }
      } catch (Exception ex) {
        // Keep the original failure attached: this catch is broad, so the real cause may be
        // something other than a missing class attribute.
        throw new Error("Data has no class attribute!", ex);
      }
      total += weight;
    }

    // Normalise distribution
    if (total > 0) {
      Utils.normalize(distribution, total);
    }
    return distribution;
  }
예제 #15
0
 /**
  * Writes one line per prediction in the form {@code <id> <label> 1.0}, where the id is the
  * string value of the id attribute and the label is the nominal value of the class attribute
  * selected by the prediction truncated to an int.
  *
  * @param predictions the predicted class indices, one per instance
  * @param data the instances the predictions belong to
  * @param idIndex the index of the string attribute holding the instance id
  * @param classIndex the index of the attribute whose values name the labels
  * @param out the writer receiving the output
  * @throws IOException if writing fails
  * @throws IllegalStateException if the number of predictions and instances differ
  */
 private static void writePredictionsTrecEval(
     double[] predictions, Instances data, int idIndex, int classIndex, Writer out)
     throws IOException {
   final int expected = data.numInstances();
   if (predictions.length != expected) {
     throw new IllegalStateException(predictions.length + "!=" + expected);
   }

   int row = 0;
   for (double prediction : predictions) {
     final String id = data.instance(row).stringValue(idIndex);
     final String label = data.attribute(classIndex).value((int) prediction);
     out.write(id);
     out.write(" ");
     out.write(label);
     out.write(" 1.0\n");
     row++;
   }
 }
예제 #16
0
  /**
   * Computes the cut points of one attribute with the MDL criterion and stores them in
   * {@code m_CutPoints[index]}.
   *
   * <p>The data is sorted on the attribute (in place), and only the leading run of instances up
   * to the first one with a missing value for the attribute is handed to
   * {@code cutPointsForSubset}.
   *
   * @param index the index of the attribute to set cutpoints for
   * @param data the data to work with; reordered by this call
   */
  protected void calculateCutPointsByMDL(int index, Instances data) {
    data.sort(data.attribute(index));

    // Advance to the first instance whose value is missing, or to the end if there is none.
    final int total = data.numInstances();
    int firstMissing = 0;
    while (firstMissing < total && !data.instance(firstMissing).isMissing(index)) {
      firstMissing++;
    }

    m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
  }
 /**
  * Runs the filter built from the given expression and checks that the dataset dimensions are
  * unchanged and that every instance carries the expected value at {@code m_AttIndex}.
  *
  * @param expr the filter expression
  * @param stats the value of the corresponding attribute statistics
  */
 protected void checkStatistics(String expr, double stats) {
   m_Filter = getFilter(expr);
   final Instances filtered = useFilter();

   assertEquals(m_Instances.numAttributes(), filtered.numAttributes());
   assertEquals(m_Instances.numInstances(), filtered.numInstances());

   // Report a failure on the first instance whose value deviates from the expected statistic.
   for (int row = 0; row < filtered.numInstances(); row++) {
     if (Utils.eq(stats, filtered.instance(row).value(m_AttIndex))) {
       continue;
     }
     fail("Filter and Attribute statistics differ ('" + expr + "')!");
     return;
   }
 }
예제 #18
0
  /**
   * Finds the instance of a threshold curve whose threshold is closest to the requested target.
   *
   * <p>Thresholds are read from the last attribute of {@code tcurve}; the lookup itself is
   * delegated to {@code binarySearch} over the sorted index array.
   *
   * @param tcurve a set of instances that have been generated by this class
   * @param threshold the target threshold
   * @return the index of the instance with the closest threshold, or -1 if the relation name
   *     does not match, there are no instances, or the target is below 0 or above 1
   */
  public static int getThresholdInstance(Instances tcurve, double threshold) {
    if (!RELATION_NAME.equals(tcurve.relationName())) {
      return -1;
    }
    final int size = tcurve.numInstances();
    if (size == 0) {
      return -1;
    }
    if (threshold < 0 || threshold > 1.0) {
      return -1;
    }
    if (size == 1) {
      // Only one candidate, so it is trivially the closest.
      return 0;
    }

    final double[] thresholds = tcurve.attributeToDoubleArray(tcurve.numAttributes() - 1);
    final int[] order = Utils.sort(thresholds);
    return binarySearch(order, thresholds, threshold);
  }
예제 #19
0
  /**
   * Writes the predicted class distribution of every instance as comma-separated text.
   *
   * <p>The header line is {@code id} followed by each class label in double quotes, with double
   * quotes and backslashes in the label replaced by underscores. Each data line holds the
   * instance id followed by one probability per class; probabilities not above 1e-5 are written
   * as 0, the others are written with float precision.
   *
   * @param c the classifier producing the distributions
   * @param data the instances to predict
   * @param idIndex the index of the string attribute holding the instance id
   * @param out the writer receiving the output
   * @throws Exception if prediction or writing fails
   */
  private static void writePredictedDistributions(
      Classifier c, Instances data, int idIndex, Writer out) throws Exception {
    // header
    out.write("id");
    final int classCount = data.numClasses();
    for (int cls = 0; cls < classCount; cls++) {
      final String safeLabel = data.classAttribute().value(cls).replaceAll("[\"\\\\]", "_");
      out.write(",\"");
      out.write(safeLabel);
      out.write("\"");
    }
    out.write("\n");

    // data
    final int rows = data.numInstances();
    for (int row = 0; row < rows; row++) {
      final Instance current = data.instance(row);
      final String id = current.stringValue(idIndex);
      final double[] distribution = c.distributionForInstance(data.instance(row));

      out.write(id);
      for (int cls = 0; cls < distribution.length; cls++) {
        final double probability = distribution[cls];
        final float shown = probability > 1e-5 ? (float) probability : 0f;
        out.write(",");
        out.write(String.valueOf(shown));
      }
      out.write("\n");
    }
  }
예제 #20
0
  /**
   * Finds the large item sets of one size and leaves them as the only new entry of {@code m_Ls}.
   *
   * <p>For {@code index == 1} the candidates are the singletons of {@code m_instances}. For
   * {@code index > 1} the candidates are merged from the last entry of {@code m_Ls} (which is
   * then cleared), pruned against a hashtable of those previous sets, and that hashtable is
   * appended to {@code m_hashtables}. In both cases the counters are updated against
   * {@code m_instances}, the sets are filtered through {@code ItemSet.deleteItemSets} with
   * {@code m_premiseCount}, and a non-empty result is appended to {@code m_Ls}. Values of
   * {@code index} below 1 do nothing.
   *
   * @param index the size of the item sets to find
   * @exception Exception if an attribute is numeric
   */
  private void findLargeItemSets(int index) throws Exception {
    if (index < 1) {
      return;
    }

    FastVector candidates;
    if (index == 1) {
      candidates = ItemSet.singletons(m_instances);
    } else {
      // Grow the sets found in the previous round; start from nothing if there were none.
      final FastVector previous =
          (m_Ls.size() > 0) ? (FastVector) m_Ls.lastElement() : new FastVector();
      m_Ls.removeAllElements();

      candidates = ItemSet.mergeAllItemSets(previous, index - 2, m_instances.numInstances());
      final Hashtable previousLookup = ItemSet.getHashtable(previous, previous.size());
      m_hashtables.addElement(previousLookup);
      candidates = ItemSet.pruneItemSets(candidates, previousLookup);
    }

    ItemSet.upDateCounters(candidates, m_instances);
    candidates = ItemSet.deleteItemSets(candidates, m_premiseCount, Integer.MAX_VALUE);
    if (candidates.size() != 0) {
      m_Ls.addElement(candidates);
    }
  }
예제 #21
0
 /**
  * Calculates the average of every column except the last one.
  *
  * <p>NaN values are left out of the sums, but each sum is still divided by the total number of
  * instances.
  *
  * @param inst the dataset whose columns are averaged
  * @return one average per attribute, excluding the last attribute
  */
 public Double[] calculateAverage(Instances inst) {
   final int columns = inst.numAttributes() - 1;
   final Double[] average = new Double[columns];
   for (int col = 0; col < columns; col++) {
     average[col] = 0.0;
   }

   for (int row = 0; row < inst.numInstances(); row++) {
     final Instance ins = inst.instance(row);
     final int width = ins.numAttributes() - 1;
     for (int col = 0; col < width; col++) {
       final double value = ins.value(col);
       if (Double.isNaN(value)) {
         continue;
       }
       average[col] += value;
     }
   }

   for (int col = 0; col < columns; col++) {
     average[col] /= inst.numInstances();
   }
   return average;
 }
예제 #22
0
  /**
   * Stratify the given data into the given number of bags based on the class values. It differs
   * from the <code>Instances.stratify(int fold)</code> in that before stratification it sorts the
   * instances according to the class order in the header file. It assumes no missing values in the
   * class.
   *
   * <p>The instances are first grouped into one bag per class value and each bag is shuffled with
   * {@code rand}. The bags are then treated as one concatenated sequence, and for each fold k the
   * instances at positions k, k + folds, k + 2 * folds, ... of that sequence are appended to the
   * result. If the class attribute is not nominal, the input is returned as is.
   *
   * @param data the given data
   * @param folds the given number of folds
   * @param rand the random object used to randomize the instances
   * @return the stratified instances, or {@code data} itself when the class is not nominal
   */
  public static final Instances stratify(Instances data, int folds, Random rand) {
    if (!data.classAttribute().isNominal()) return data;

    Instances result = new Instances(data, 0);
    Instances[] bagsByClasses = new Instances[data.numClasses()];

    for (int i = 0; i < bagsByClasses.length; i++) bagsByClasses[i] = new Instances(data, 0);

    // Group by class: each instance goes into the bag indexed by its class value
    for (int j = 0; j < data.numInstances(); j++) {
      Instance datum = data.instance(j);
      bagsByClasses[(int) datum.classValue()].add(datum);
    }

    // Randomize each class
    for (int j = 0; j < bagsByClasses.length; j++) bagsByClasses[j].randomize(rand);

    // Fold k takes every folds-th instance of the concatenated bags, starting at position k
    for (int k = 0; k < folds; k++) {
      // offset is the position inside the current bag; bag is the index of the current bag
      int offset = k, bag = 0;
      oneFold:
      while (true) {
        // Carry the offset over into the following bag(s) once it runs past the current one
        while (offset >= bagsByClasses[bag].numInstances()) {
          offset -= bagsByClasses[bag].numInstances();
          if (++bag >= bagsByClasses.length) // Moved past the last bag: this fold is complete
          break oneFold;
        }

        result.add(bagsByClasses[bag].instance(offset));
        offset += folds;
      }
    }

    return result;
  }
예제 #23
0
  /**
   * Calculates the area under the precision-recall curve (AUPRC).
   *
   * <p>The curve is walked from its last point towards its first; each step adds the precision of
   * the point reached multiplied by the change in recall from the previous point.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @return the PRC area; Double.NaN if the relation name does not match or there are no
   *     instances; the missing value if the computed area is exactly 0
   */
  public static double getPRCArea(Instances tcurve) {
    final int n = tcurve.numInstances();
    if (!RELATION_NAME.equals(tcurve.relationName()) || (n == 0)) {
      return Double.NaN;
    }

    final int precisionIndex = tcurve.attribute(PRECISION_NAME).index();
    final int recallIndex = tcurve.attribute(RECALL_NAME).index();
    final double[] precision = tcurve.attributeToDoubleArray(precisionIndex);
    final double[] recall = tcurve.attributeToDoubleArray(recallIndex);

    // The last point only provides the starting recall; accumulation begins one point before it.
    double area = 0;
    for (int prev = n - 1; prev > 0; prev--) {
      final int here = prev - 1;
      area += (precision[here] * (recall[here] - recall[prev]));
    }

    return (area == 0) ? Utils.missingValue() : area;
  }
 /**
  * Runs the filter with its typical configuration and checks the dimensions of the output: five
  * attributes more than the input and the same number of instances.
  */
 public void testTypical() {
   final Instances filtered = useFilter();

   final int expectedAttributes = m_Instances.numAttributes() + 5;
   final int expectedInstances = m_Instances.numInstances();

   assertEquals(expectedAttributes, filtered.numAttributes());
   assertEquals(expectedInstances, filtered.numInstances());
   // TODO: also verify that the binarized values themselves are correct.
 }
예제 #25
0
  /**
   * Converts the training data into sequences, groups them by class label and then delegates to
   * {@code buildSpecificClassifier}.
   *
   * <p>Every instance becomes a {@code Sequence} made of all its attribute values except the
   * class attribute, in attribute order. The class attribute may sit at any position. Sequences
   * and their positions in the full dataset are recorded per class label in {@code classedData}
   * and {@code indexClassedDataInFullData}; {@code classMap} holds the label of each instance.
   *
   * @param data the training instances, with the class attribute set
   * @throws Exception if the specific classifier cannot be built
   */
  @Override
  public void buildClassifier(Instances data) throws Exception {
    trainingData = data;
    Attribute classAttribute = data.classAttribute();
    prototypes = new ArrayList<>();

    classedData = new HashMap<String, ArrayList<Sequence>>();
    indexClassedDataInFullData = new HashMap<String, ArrayList<Integer>>();
    for (int c = 0; c < data.numClasses(); c++) {
      String label = classAttribute.value(c);
      classedData.put(label, new ArrayList<Sequence>());
      indexClassedDataInFullData.put(label, new ArrayList<Integer>());
    }

    sequences = new Sequence[data.numInstances()];
    classMap = new String[sequences.length];
    for (int i = 0; i < sequences.length; i++) {
      Instance sample = data.instance(i);
      int classIndex = sample.classIndex();
      MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1];
      for (int t = 0; t < sequence.length; t++) {
        // Skip over the class attribute wherever it is, not only when it is first or last.
        int attributeIndex = (classIndex >= 0 && t >= classIndex) ? t + 1 : t;
        sequence[t] = new MonoDoubleItemSet(sample.value(attributeIndex));
      }
      sequences[i] = new Sequence(sequence);
      String clas = sample.stringValue(classAttribute);
      classMap[i] = clas;
      classedData.get(clas).add(sequences[i]);
      indexClassedDataInFullData.get(clas).add(i);
    }
    buildSpecificClassifier(data);
  }
예제 #26
0
  /**
   * Solves the classification problem described by the given analyzer and attaches the result
   * and evaluation visualizations to it.
   *
   * <p>When the analyzer provides no learning instances, a message is printed and the classifier
   * is not called. When building or visualizing fails, the stack trace is printed and an error
   * panel is set as the result visualization.
   *
   * @param cda the analyzer that supplies the learning data and receives the visualizations
   */
  public void analyse(ClusterDecisionAnalyzer cda) {
    clusterDecisionAnalyzer = cda;

    // Data set with attribute information, as supplied by the analyzer
    final Instances data = cda.getDataInfo();

    // Without learning instances there is nothing to classify, so skip the algorithm.
    if (data.numInstances() == 0) {
      System.out.println("No learning instances available");
      return;
    }

    try {
      myClassifier.buildClassifier(data);
      cda.setResultVisualization(createResultVisualization());
      cda.setEvaluationVisualization(createEvaluationVisualization(data));
    } catch (Exception ex) {
      ex.printStackTrace();
      cda.setResultVisualization(
          createMessagePanel("Error while solving the classification problem"));
    }
  }
예제 #27
0
  /**
   * Signals that the current batch of input is complete.
   *
   * <p>On the first batch the buffered input is cleansed, with {@code cleanseTrain} when fewer
   * than two cross-validation folds are configured and with {@code cleanseCross} otherwise. The
   * surviving instances are pushed to the output queue and the input buffer is flushed. Later
   * batches only reset the new-batch flag.
   *
   * @return true if there are instances pending output
   * @throws IllegalStateException if no input structure has been defined
   */
  @Override
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (!m_firstBatchFinished) {
      final Instances cleansed =
          (m_numOfCrossValidationFolds < 2)
              ? cleanseTrain(getInputFormat())
              : cleanseCross(getInputFormat());

      int next = 0;
      while (next < cleansed.numInstances()) {
        push(cleansed.instance(next));
        next++;
      }

      m_firstBatchFinished = true;
      flushInput();
    }

    m_NewBatch = true;
    return numPendingOutput() != 0;
  }
예제 #28
0
  /**
   * Calculates the area under the ROC curve as the Wilcoxon-Mann-Whitney statistic.
   *
   * <p>The first curve point supplies the total positive and negative counts. Each point
   * contributes the drop in true positives to the next point, multiplied by the negatives
   * accumulated so far plus half of the drop in false positives. The last point contributes its
   * own remaining counts. The sum is divided by the product of the two totals.
   *
   * @param tcurve a previously extracted threshold curve Instances.
   * @return the ROC area, or Double.NaN if you don't pass in a ThresholdCurve generated Instances.
   */
  public static double getROCArea(Instances tcurve) {
    final int n = tcurve.numInstances();
    if (!RELATION_NAME.equals(tcurve.relationName()) || (n == 0)) {
      return Double.NaN;
    }

    final int truePosIndex = tcurve.attribute(TRUE_POS_NAME).index();
    final int falsePosIndex = tcurve.attribute(FALSE_POS_NAME).index();
    final double[] truePos = tcurve.attributeToDoubleArray(truePosIndex);
    final double[] falsePos = tcurve.attributeToDoubleArray(falsePosIndex);

    final double totalPos = truePos[0];
    final double totalNeg = falsePos[0];

    double area = 0.0;
    double negativesSoFar = 0.0;
    for (int i = 0; i < n; i++) {
      final boolean lastPoint = !(i < n - 1);
      final double posStep = lastPoint ? truePos[n - 1] : truePos[i] - truePos[i + 1];
      final double negStep = lastPoint ? falsePos[n - 1] : falsePos[i] - falsePos[i + 1];

      area += posStep * (negativesSoFar + (0.5 * negStep));
      negativesSoFar += negStep;
    }

    return area / (totalNeg * totalPos);
  }
  /**
   * Generates the classifier.
   *
   * <p>Resets the model, checks the data against the declared capabilities, keeps an empty copy
   * of the header in {@code m_data}, initialises the per-class counters (class probability
   * starting at the Laplace value 1.0, word count at 0, one empty dictionary per class) and then
   * feeds a copy of the training data through {@code updateClassifier} one instance at a time.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  @Override
  public void buildClassifier(Instances data) throws Exception {
    reset();

    // Reject data this classifier cannot handle.
    getCapabilities().testWithFail(data);

    m_data = new Instances(data, 0);
    final Instances training = new Instances(data);
    final int classCount = training.numClasses();

    m_wordsPerClass = new double[classCount];
    m_probOfClass = new double[classCount];
    m_probOfWordGivenClass = new HashMap<Integer, LinkedHashMap<String, Count>>();

    final double laplace = 1.0;
    for (int cls = 0; cls < classCount; cls++) {
      m_probOfWordGivenClass.put(cls, new LinkedHashMap<String, Count>(10000 / classCount));
      m_probOfClass[cls] = laplace;

      // Starts empty; the Laplace correction is applied as new words (attributes) are seen.
      m_wordsPerClass[cls] = 0;
    }

    final int rows = training.numInstances();
    for (int row = 0; row < rows; row++) {
      updateClassifier(training.instance(row));
    }
  }
  /**
   * Appends part-of-speech tag frequencies to every instance.
   *
   * <p>For each input instance the existing attribute values are copied, the "content" string is
   * tokenized and tagged, and the count of each tag is written into the output attribute named
   * {@code "POS-" + tag}. The tagger model is loaded from {@code models/model.20120919} on every
   * call.
   *
   * @param instances the instances to process; must contain a "content" attribute
   * @return a new dataset in the output format with one sparse instance per input instance
   * @throws Exception if the output format, the tagger model or the tagging fails
   */
  @Override
  protected Instances process(Instances instances) throws Exception {
    final Instances result = new Instances(determineOutputFormat(instances), 0);

    final Tagger tagger = new Tagger();
    tagger.loadModel("models/model.20120919");

    // Attribute holding the text of the tweet
    final Attribute attrCont = instances.attribute("content");

    final int rows = instances.numInstances();
    for (int row = 0; row < rows; row++) {
      final double[] values = new double[result.numAttributes()];
      for (int n = 0; n < instances.numAttributes(); n++) {
        values[n] = instances.instance(row).value(n);
      }

      final String content = instances.instance(row).stringValue(attrCont);
      final List<String> tokens = MyUtils.cleanTokenize(content);
      final List<String> tags = MyUtils.getPOStags(tokens, tagger);

      // Frequency of each POS tag in this tweet, written to its "POS-" column
      final Map<String, Integer> tagCounts = MyUtils.calculateTermFreq(tags);
      for (Map.Entry<String, Integer> entry : tagCounts.entrySet()) {
        final int column = result.attribute("POS-" + entry.getKey()).index();
        values[column] = entry.getValue();
      }

      result.add(new SparseInstance(1, values));
    }
    return result;
  }