Example #1
  /**
   * Select only instances with weights that contribute to the specified quantile of the weight
   * distribution
   *
   * @param data the input instances
   * @param quantile the specified quantile, e.g. 0.9 to select 90% of the weight mass
   * @return the selected instances
   */
  protected Instances selectWeightQuantile(Instances data, double quantile) {

    int numInstances = data.numInstances();
    Instances trainData = new Instances(data, numInstances);
    double[] weights = new double[numInstances];

    double sumOfWeights = 0;
    for (int i = 0; i < numInstances; i++) {
      weights[i] = data.instance(i).weight();
      sumOfWeights += weights[i];
    }
    double weightMassToSelect = sumOfWeights * quantile;
    int[] sortedIndices = Utils.sort(weights);

    // Select the instances
    sumOfWeights = 0;
    for (int i = numInstances - 1; i >= 0; i--) {
      Instance instance = (Instance) data.instance(sortedIndices[i]).copy();
      trainData.add(instance);
      sumOfWeights += weights[sortedIndices[i]];
      if ((sumOfWeights > weightMassToSelect)
          && (i > 0)
          && (weights[sortedIndices[i]] != weights[sortedIndices[i - 1]])) {
        break;
      }
    }
    if (m_Debug) {
      System.err.println("Selected " + trainData.numInstances() + " out of " + numInstances);
    }
    return trainData;
  }
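
A minimal usage sketch (hypothetical call site, not from the source): inside the same class, a boosting-style iteration could train its weak learner on only the instances carrying 90% of the weight mass.

  // assumes `data` carries boosting weights and `weakLearner` is a Classifier field
  Instances trainSubset = selectWeightQuantile(data, 0.9);
  weakLearner.buildClassifier(trainSubset);
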
Example #2
  // Compute the joint classification error rate of classifiers h1 and h2.
  public double measureBothError(Classifier h1, Classifier h2, Instances test) {
    int m = test.numInstances();
    double value1, value2, value;
    int error = 0, total = 0;
    try {
      for (int i = 0; i < m; i++) {
        value = test.instance(i).classValue();
        value1 = h1.classifyInstance(test.instance(i));
        value2 = h2.classifyInstance(test.instance(i));

        // both classifiers made the same decision
        if (value1 == value2) {
          // number of instances on which both classifiers agree
          total++;

          // both classifiers made the same wrong decision
          if (value != value1) {
            // number of instances on which both classifiers agree and are wrong
            error++;
          }
        }
      }
    } catch (Exception e) {
      System.out.println(e);
    }
    // System.out.println("m:=" + m);
    // System.out.println("error:=" + error +"; total:=" + total);

    // joint error rate = (# instances where both are wrong in the same way)
    //                    / (# instances on which both agree); assumes total > 0
    return (error * 1.0) / total;
  }
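
A hedged usage sketch (classifier choices and data set names below are assumptions): in a co-training-style loop, this joint error estimates how often the two hypotheses make the same mistake.

  // assumed setup: two base learners trained beforehand
  Classifier h1 = new weka.classifiers.trees.J48();
  Classifier h2 = new weka.classifiers.bayes.NaiveBayes();
  h1.buildClassifier(train1);
  h2.buildClassifier(train2);
  double jointError = measureBothError(h1, h2, test);
  // the caller must ensure the two classifiers agree on at least one instance,
  // otherwise the division by `total` above is 0/0
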
Example #3
  /**
   * Compare two datasets to see if they differ.
   *
   * @param data1 one set of instances
   * @param data2 the other set of instances
   * @throws Exception if the datasets differ
   */
  protected void compareDatasets(Instances data1, Instances data2) throws Exception {

    if (m_CheckHeader) {
      if (!data2.equalHeaders(data1)) {
        throw new Exception("header has been modified\n" + data2.equalHeadersMsg(data1));
      }
    }
    if (!(data2.numInstances() == data1.numInstances())) {
      throw new Exception("number of instances has changed");
    }
    for (int i = 0; i < data2.numInstances(); i++) {
      Instance orig = data1.instance(i);
      Instance copy = data2.instance(i);
      for (int j = 0; j < orig.numAttributes(); j++) {
        if (orig.isMissing(j)) {
          if (!copy.isMissing(j)) {
            throw new Exception("instances have changed");
          }
        } else {
          if (m_CompareValuesAsString) {
            if (!orig.toString(j).equals(copy.toString(j))) {
              throw new Exception("instances have changed");
            }
          } else {
            if (Math.abs(orig.value(j) - copy.value(j)) > m_MaxDiffValues) {
              throw new Exception("instances have changed");
            }
          }
        }
        if (Math.abs(orig.weight() - copy.weight()) > m_MaxDiffWeights) {
          throw new Exception("instance weights have changed");
        }
      }
    }
  }
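
A sketch of how such a check is typically driven (the harness is an assumption; only Filter.useFilter is standard Weka API): copy the input before filtering, then verify the filter left its input untouched.

  Instances before = new Instances(data); // copy of the input
  filter.setInputFormat(data);
  Instances out = weka.filters.Filter.useFilter(data, filter);
  compareDatasets(before, data); // throws if the filter mutated its input
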
Example #4
  @Override
  protected Instances process(Instances instances) throws Exception {
    Instances result = new Instances(determineOutputFormat(instances), 0);

    Tagger tagger = new Tagger();
    tagger.loadModel("models/model.20120919");

    // reference to the content of the tweet
    Attribute attrCont = instances.attribute("content");

    for (int i = 0; i < instances.numInstances(); i++) {
      double[] values = new double[result.numAttributes()];
      for (int n = 0; n < instances.numAttributes(); n++)
        values[n] = instances.instance(i).value(n);

      String content = instances.instance(i).stringValue(attrCont);
      List<String> words = MyUtils.cleanTokenize(content);
      List<String> posTags = MyUtils.getPOStags(words, tagger);

      // calculate frequencies of different POS tags
      Map<String, Integer> posFreqs = MyUtils.calculateTermFreq(posTags);

      // add POS values
      for (String posTag : posFreqs.keySet()) {
        int index = result.attribute("POS-" + posTag).index();
        values[index] = posFreqs.get(posTag);
      }

      Instance inst = new SparseInstance(1, values);
      result.add(inst);
    }
    return result;
  }
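
A usage sketch under the assumption that this process method belongs to a SimpleBatchFilter subclass; the class name PosTagFilter is invented for illustration.

  Instances tweets = weka.core.converters.ConverterUtils.DataSource.read("tweets.arff");
  PosTagFilter filter = new PosTagFilter(); // hypothetical name of the enclosing filter
  filter.setInputFormat(tweets);
  Instances withPosCounts = weka.filters.Filter.useFilter(tweets, filter);
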
Example #5
  /**
   * Calculates the centroid pivot of a node based on the list of points that it contains (the two
   * lists of its children are provided).
   *
   * @param list1 The point index list of first child.
   * @param list2 The point index list of second child.
   * @param insts The insts object on which the tree is being built (for header information).
   * @return The centroid pivot of the node.
   */
  public Instance calcPivot(MyIdxList list1, MyIdxList list2, Instances insts) {
    int classIdx = m_Instances.classIndex();
    double[] attrVals = new double[insts.numAttributes()];

    Instance temp;
    for (int i = 0; i < list1.length(); i++) {
      temp = insts.instance(((ListNode) list1.get(i)).idx);
      for (int k = 0; k < temp.numValues(); k++) {
        if (temp.index(k) == classIdx) continue;
        attrVals[k] += temp.valueSparse(k);
      }
    }
    for (int j = 0; j < list2.length(); j++) {
      temp = insts.instance(((ListNode) list2.get(j)).idx);
      for (int k = 0; k < temp.numValues(); k++) {
        if (temp.index(k) == classIdx) continue;
        attrVals[k] += temp.valueSparse(k);
      }
    }
    for (int j = 0, numInsts = list1.length() + list2.length(); j < attrVals.length; j++) {
      attrVals[j] /= numInsts;
    }
    temp = new DenseInstance(1.0, attrVals);
    return temp;
  }
Example #6
  public static ArrayList<Integer> getProfiles(Instances inst, List<Integer> marks)
      throws Exception {

    //		Instances inst = Utils.prepareProfileMatcherData(schoolNo, grade, term, subjects);

    //		ReplaceMissingValues rmv = new ReplaceMissingValues();
    //		rmv.setInputFormat(inst);
    //		inst = Filter.useFilter(inst, rmv);

    for (int i = 0; i < inst.numAttributes(); i++) {
      inst.deleteWithMissing(i);
    }

    KDTree tree = new KDTree();
    tree.setMeasurePerformance(true);

    try {
      tree.setInstances(inst);

      EuclideanDistance df = new EuclideanDistance(inst);
      df.setDontNormalize(true);
      df.setAttributeIndices("2-last");

      tree.setDistanceFunction(df);

    } catch (Exception e) {
      e.printStackTrace();
    }

    Instances neighbors = null;

    Instances test = CFilter.createInstance(112121, (ArrayList<Integer>) marks);

    Instance p = test.firstInstance();

    try {
      neighbors = tree.kNearestNeighbours(p, 50);
    } catch (Exception e) {
      e.printStackTrace();
    }
    //		System.out.println(tree.getPerformanceStats().getTotalPointsVisited());

    //		System.out.println(nn1 + " is the nearest neighbor for " + p);
    //		System.out.println(nn2 + " is the second nearest neighbor for " + p);

    ArrayList<Integer> profiles = new ArrayList<Integer>();
    for (int i = 0; i < neighbors.numInstances(); i++) {
      System.out.println(neighbors.instance(i));
      profiles.add(Integer.valueOf(neighbors.instance(i).toString(0)));
    }

    // Now we can also easily compute the distances as the KDTree does it

    DistanceFunction df = tree.getDistanceFunction();
    //		System.out.println("The distance between" + nn1 + " and " + p + " is " + df.distance(nn1,
    // p));
    //		System.out.println("The distance between" + nn2 + " and " + p + " is " + df.distance(nn2,
    // p));
    return profiles;
  }
Example #7
  private static void writePredictedDistributions(
      Classifier c, Instances data, int idIndex, Writer out) throws Exception {
    // header
    out.write("id");
    for (int i = 0; i < data.numClasses(); i++) {
      out.write(",\"");
      out.write(data.classAttribute().value(i).replaceAll("[\"\\\\]", "_"));
      out.write("\"");
    }
    out.write("\n");

    // data
    for (int i = 0; i < data.numInstances(); i++) {
      final String id = data.instance(i).stringValue(idIndex);
      double[] distribution = c.distributionForInstance(data.instance(i));

      // final String label = data.attribute(classIndex).value();
      out.write(id);
      for (double probability : distribution) {
        out.write(",");
        out.write(String.valueOf(probability > 1e-5 ? (float) probability : 0f));
      }
      out.write("\n");
    }
  }
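
A hedged call-site sketch (the method is private static, so this would live in the same class; the file name and id column are assumptions):

  Classifier cls = new weka.classifiers.trees.RandomForest();
  cls.buildClassifier(train);
  try (java.io.Writer out = new java.io.BufferedWriter(new java.io.FileWriter("dist.csv"))) {
    writePredictedDistributions(cls, test, 0, out); // attribute 0 holds the row id
  }
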
Example #8
 /**
  * Get the sum of all attribute values in the dataset
  *
  * @param data set of instances to handle
  * @return sum of all the attribute values for all the instances in the dataset
  */
 private double getTotalSum(Instances data) {
   double sum = 0.0;
   for (int i = 0; i < data.numInstances(); i++) {
     for (int v = 0; v < data.instance(i).numValues(); v++) {
       sum += data.instance(i).valueSparse(v);
     }
   }
   return sum;
 }
Example #9
  /**
   * builds the kernel with the given data. Initializes the kernel cache. The actual size of the
   * cache in bytes is (64 * cacheSize).
   *
   * @param data the data to base the kernel on
   * @throws Exception if something goes wrong
   */
  public void buildKernel(Instances data) throws Exception {
    // does kernel handle the data?
    if (!getChecksTurnedOff()) getCapabilities().testWithFail(data);

    initVars(data);

    for (int i = 0; i < data.numInstances(); i++)
      m_kernelPrecalc[i] = dotProd(data.instance(i), data.instance(i));
  }
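
A short sketch of driving a kernel directly through this API (PolyKernel and its eval signature are standard Weka; the data set is an assumption):

  PolyKernel kernel = new PolyKernel();
  kernel.buildKernel(data); // precomputes the self dot products above
  double k01 = kernel.eval(0, 1, data.instance(0)); // K(x_0, x_1), served from the cache
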
Example #10
  /** Queries the user enough to make a database query to retrieve experiment results. */
  protected void setInstancesFromDBaseQuery() {

    try {
      if (m_InstanceQuery == null) {
        m_InstanceQuery = new InstanceQuery();
      }
      String dbaseURL = m_InstanceQuery.getDatabaseURL();
      dbaseURL =
          (String)
              JOptionPane.showInputDialog(
                  this,
                  "Enter the database URL",
                  "Query Database",
                  JOptionPane.PLAIN_MESSAGE,
                  null,
                  null,
                  dbaseURL);
      if (dbaseURL == null) {
        m_FromLab.setText("Cancelled");
        return;
      }
      m_InstanceQuery.setDatabaseURL(dbaseURL);
      m_InstanceQuery.connectToDatabase();
      if (!m_InstanceQuery.experimentIndexExists()) {
        m_FromLab.setText("No experiment index");
        return;
      }
      m_FromLab.setText("Getting experiment index");
      Instances index =
          m_InstanceQuery.retrieveInstances("SELECT * FROM " + InstanceQuery.EXP_INDEX_TABLE);
      if (index.numInstances() == 0) {
        m_FromLab.setText("No experiments available");
        return;
      }
      m_FromLab.setText("Got experiment index");

      DefaultListModel lm = new DefaultListModel();
      for (int i = 0; i < index.numInstances(); i++) {
        lm.addElement(index.instance(i).toString());
      }
      JList jl = new JList(lm);
      ListSelectorDialog jd = new ListSelectorDialog(null, jl);
      int result = jd.showDialog();
      if (result != ListSelectorDialog.APPROVE_OPTION) {
        m_FromLab.setText("Cancelled");
        return;
      }
      Instance selInst = index.instance(jl.getSelectedIndex());
      Attribute tableAttr = index.attribute(InstanceQuery.EXP_RESULT_COL);
      String table = InstanceQuery.EXP_RESULT_PREFIX + selInst.toString(tableAttr);

      setInstancesFromDatabaseTable(table);
    } catch (Exception ex) {
      m_FromLab.setText("Problem reading database");
    }
  }
Example #11
  /**
   * Calculate the squared error of a regression model on the training data
   *
   * @param selectedAttributes an array of flags indicating which attributes are included in the
   *     regression model
   * @param coefficients an array of coefficients for the regression model
   * @return the sum of squared errors on the training data
   * @throws Exception if there is a missing class value in the training data
   */
  private double calculateSE(boolean[] selectedAttributes, double[] coefficients) throws Exception {

    double mse = 0;
    for (int i = 0; i < m_TransformedData.numInstances(); i++) {
      double prediction =
          regressionPrediction(m_TransformedData.instance(i), selectedAttributes, coefficients);
      double error = prediction - m_TransformedData.instance(i).classValue();
      mse += error * error;
    }
    return mse;
  }
Example #12
  public void classify(String filename) throws Exception {
    Instances unLabeledData = DataSource.read(filename);
    unLabeledData.setClassIndex(unLabeledData.numAttributes() - 1);
    Instances labeledData = new Instances(unLabeledData);

    for (int i = 0; i < unLabeledData.numInstances(); ++i) {
      double clsLabel = classifier.classifyInstance(unLabeledData.instance(i));
      labeledData.instance(i).setClassValue(clsLabel);
    }
    System.out.println(labeledData.toString());
  }
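
The method only prints the labeled copy; a sketch of persisting it instead with Weka's standard ArffSaver (assumes the method is adapted to return labeledData rather than print it):

  weka.core.converters.ArffSaver saver = new weka.core.converters.ArffSaver();
  saver.setInstances(labeledData);
  saver.setFile(new java.io.File("labeled.arff")); // throws IOException
  saver.writeBatch();
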
Example #13
  /** Sets distribution associated with model. */
  public void resetDistribution(Instances data) throws Exception {

    Instances insts = new Instances(data, data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
      if (whichSubset(data.instance(i)) > -1) {
        insts.add(data.instance(i));
      }
    }
    Distribution newD = new Distribution(insts, this);
    newD.addInstWithUnknown(data, m_attIndex);
    m_distribution = newD;
  }
Example #14
File: Wavelet.java  Project: dachylong/weka
  /**
   * processes the instances using the HAAR algorithm
   *
   * @param instances the data to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  protected Instances processHAAR(Instances instances) throws Exception {
    Instances result;
    int i;
    int n;
    int j;
    int clsIdx;
    double[] oldVal;
    double[] newVal;
    int level;
    int length;
    double[] clsVal;
    Attribute clsAtt;

    clsIdx = instances.classIndex();
    clsVal = null;
    clsAtt = null;
    if (clsIdx > -1) {
      clsVal = instances.attributeToDoubleArray(clsIdx);
      clsAtt = (Attribute) instances.classAttribute().copy();
      instances.setClassIndex(-1);
      instances.deleteAttributeAt(clsIdx);
    }
    result = new Instances(instances, 0);
    level = (int) StrictMath.ceil(StrictMath.log(instances.numAttributes()) / StrictMath.log(2.0));

    for (i = 0; i < instances.numInstances(); i++) {
      oldVal = instances.instance(i).toDoubleArray();
      newVal = new double[oldVal.length];

      for (n = level; n > 0; n--) {
        length = (int) StrictMath.pow(2, n - 1);

        for (j = 0; j < length; j++) {
          newVal[j] = (oldVal[j * 2] + oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
          newVal[j + length] = (oldVal[j * 2] - oldVal[j * 2 + 1]) / StrictMath.sqrt(2);
        }

        System.arraycopy(newVal, 0, oldVal, 0, newVal.length);
      }

      // add new transformed instance
      result.add(new DenseInstance(1, newVal));
    }

    // add class again
    if (clsIdx > -1) {
      result.insertAttributeAt(clsAtt, clsIdx);
      result.setClassIndex(clsIdx);
      for (i = 0; i < clsVal.length; i++) result.instance(i).setClassValue(clsVal[i]);
    }

    return result;
  }
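
Given the file header (Wavelet.java from the weka project), the transform can presumably be applied through the standard Filter API; a sketch, assuming the filter's options select the HAAR algorithm:

  Wavelet wavelet = new Wavelet();
  wavelet.setInputFormat(data);
  Instances transformed = weka.filters.Filter.useFilter(data, wavelet);
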
Example #15
  // use the learned classifiers to get conditional probability
  protected double conMI(Instances D_j, Instances D_k, CNode[][] miNodes, int j, int k)
      throws Exception {

    int L = D_j.classIndex();
    int N = D_j.numInstances();
    double y[] = new double[L];
    double I = 0.0; // conditional mutual information for y_j and y_k
    double p_1, p_2; // p( y_j = 1 | x ), p( y_k = 1 | x )
    double p_12[] = {
      0.0, 0.0
    }; // p_12[0] = p( y_j = 1 | y_k = 0, x ) and p_12[1] = p( y_j = 1 | y_k = 1, x )

    for (int i = 0; i < N; i++) {
      Arrays.fill(y, 0);
      p_1 =
          Math.max(
              miNodes[j][0].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | x )
      p_1 = Math.min(p_1, 0.999999);
      p_1 = Math.max(p_1, 0.000001);
      Arrays.fill(y, 0);
      p_2 =
          Math.max(
              miNodes[k][0].distribution((Instance) D_k.instance(i).copy(), y)[1],
              0.000001); // p( y_k = 1 | x )
      p_2 = Math.min(p_2, 0.999999);
      p_2 = Math.max(p_2, 0.000001);
      Arrays.fill(y, 0);
      p_12[0] =
          Math.max(
              miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | y_k = 0, x )
      p_12[0] = Math.min(p_12[0], 0.999999);
      p_12[0] = Math.max(p_12[0], 0.000001);
      Arrays.fill(y, 0);
      Arrays.fill(y, k, k + 1, 1.0);
      p_12[1] =
          Math.max(
              miNodes[j][k - j].distribution((Instance) D_j.instance(i).copy(), y)[1],
              0.000001); // p( y_j = 1 | y_k = 1, x )
      p_12[1] = Math.min(p_12[1], 0.999999);
      p_12[1] = Math.max(p_12[1], 0.000001);

      I +=
          (1 - p_12[0]) * (1 - p_2) * Math.log((1 - p_12[0]) / (1 - p_1)); // I( y_j = 0 ; y_k = 0 )
      I += (1 - p_12[1]) * (p_2) * Math.log((1 - p_12[1]) / (1 - p_1)); // I( y_j = 0 ; y_k = 1 )
      I += (p_12[0]) * (1 - p_2) * Math.log((p_12[0]) / (p_1)); // I( y_j = 1 ; y_k = 0 )
      I += (p_12[1]) * (p_2) * Math.log((p_12[1]) / (p_1)); // I( y_j = 1 ; y_k = 1 )
    }
    I = I / N;
    return I;
  }
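
For reference, the quantity estimated above (read off the code, not a formula quoted from the source) is the conditional mutual information between labels y_j and y_k given the input, averaged over the N training instances:

  I = \frac{1}{N} \sum_{i=1}^{N} \sum_{a,b \in \{0,1\}} p(y_j = a \mid y_k = b, x_i)\, p(y_k = b \mid x_i) \log \frac{p(y_j = a \mid y_k = b, x_i)}{p(y_j = a \mid x_i)}
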
Example #16
 public static double[][] labeled2relation(boolean[] labeled, Instances data) {
   double[][] res = new double[labeled.length][labeled.length];
   for (int i = 0; i < labeled.length; ++i) {
     for (int j = i + 1; j < labeled.length; ++j) {
       if (labeled[i] && labeled[j]) {
         if (data.instance(i).classValue() == data.instance(j).classValue()) res[i][j] = 1;
         else res[i][j] = -1;
       }
     }
   }
   return res;
 }
Example #17
  @Override
  public void train(Instances instances) {

    // split the training instances by class value (assumes a binary 0/1 class)
    for (int i = 0; i < instances.numInstances(); i++) {
      if (instances.instance(i).classValue() == 0) {
        zeroIns.add(instances.instance(i));
      } else {
        oneIns.add(instances.instance(i));
      }
    }
  }
Example #18
 public void testTypical() {
   m_Filter = getFilter("6,3");
   Instances result = useFilter();
   assertEquals(m_Instances.numAttributes() - 1, result.numAttributes());
   for (int i = 0; i < result.numInstances(); i++) {
     Instance orig = m_Instances.instance(i);
     if (orig.isMissing(5) || orig.isMissing(2)) {
       assertTrue("Instance " + (i + 1) + " should have been ?", result.instance(i).isMissing(4));
     } else {
       assertEquals(orig.value(5) - orig.value(2), result.instance(i).value(4), EXPR_DELTA);
     }
   }
 }
Example #19
  /**
   * Calculates the radius of a node based on the list of points that it contains (the two lists of
   * its children are provided).
   *
   * @param list1 The point index list of first child.
   * @param list2 The point index list of second child.
   * @param pivot The centre/pivot of the node.
   * @param insts The instances on which the tree is being built (for header info).
   * @return The radius of the node.
   */
  public double calcRadius(MyIdxList list1, MyIdxList list2, Instance pivot, Instances insts) {
    double radius = Double.NEGATIVE_INFINITY;

    for (int i = 0; i < list1.length(); i++) {
      double dist =
          m_DistanceFunction.distance(pivot, insts.instance(((ListNode) list1.get(i)).idx));
      if (dist > radius) radius = dist;
    }
    for (int j = 0; j < list2.length(); j++) {
      double dist =
          m_DistanceFunction.distance(pivot, insts.instance(((ListNode) list2.get(j)).idx));
      if (dist > radius) radius = dist;
    }
    return radius;
  }
Example #20
 @Override
 public Instances labelData(String data) throws Exception {
   Instances unlabeled = new Instances(new BufferedReader(new FileReader(data)));
   // set class attribute
   unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
   // create copy
   Instances labeled = new Instances(unlabeled);
   for (int i = 0; i < unlabeled.numInstances(); i++) {
     Instance ui = unlabeled.instance(i);
     double clsLabel = this.classifier.classifyInstance(ui);
     labeled.instance(i).setClassValue(clsLabel);
     System.out.println(ui.toString() + " -> " + unlabeled.classAttribute().value((int) clsLabel));
   }
   return labeled;
 }
Example #21
  public static Double runClassify(String trainFile, String testFile) {
    double predictOrder = 0.0;
    double trueOrder = 0.0;
    try {
      String trainWekaFileName = trainFile;
      String testWekaFileName = testFile;

      Instances train = DataSource.read(trainWekaFileName);
      Instances test = DataSource.read(testWekaFileName);

      train.setClassIndex(0);
      test.setClassIndex(0);

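      // delete in descending index order so earlier deletions don't shift the later indices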
      train.deleteAttributeAt(8);
      test.deleteAttributeAt(8);
      train.deleteAttributeAt(6);
      test.deleteAttributeAt(6);
      train.deleteAttributeAt(5);
      test.deleteAttributeAt(5);
      train.deleteAttributeAt(4);
      test.deleteAttributeAt(4);

      // AdditiveRegression classifier = new AdditiveRegression();

      // NaiveBayes classifier = new NaiveBayes();

      RandomForest classifier = new RandomForest();
      // LibSVM classifier = new LibSVM();

      classifier.buildClassifier(train);
      Evaluation eval = new Evaluation(train);
      eval.evaluateModel(classifier, test);

      System.out.println(eval.toSummaryString("\nResults\n\n", true));
      // System.out.println(eval.toClassDetailsString());
      // System.out.println(eval.toMatrixString());
      int k = 892;
      for (int i = 0; i < test.numInstances(); i++) {
        predictOrder = classifier.classifyInstance(test.instance(i));
        trueOrder = test.instance(i).classValue();
        System.out.println((k++) + "," + (int) predictOrder);
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
    return predictOrder;
  }
Example #22
  /**
   * wrap up various variables to save memory and do some housekeeping after optimization has
   * finished.
   *
   * @throws Exception if something goes wrong
   */
  protected void wrapUp() throws Exception {
    m_target = null;

    m_nEvals = m_kernel.numEvals();
    m_nCacheHits = m_kernel.numCacheHits();

    if ((m_SVM.getKernel() instanceof PolyKernel)
        && ((PolyKernel) m_SVM.getKernel()).getExponent() == 1.0) {
      // convert alpha's to weights
      double[] weights = new double[m_data.numAttributes()];
      for (int k = m_supportVectors.getNext(-1); k != -1; k = m_supportVectors.getNext(k)) {
        for (int j = 0; j < weights.length; j++) {
          if (j != m_classIndex) {
            weights[j] += (m_alpha[k] - m_alphaStar[k]) * m_data.instance(k).value(j);
          }
        }
      }
      m_weights = weights;

      // release memory
      m_alpha = null;
      m_alphaStar = null;
      m_kernel = null;
    }
    m_bModelBuilt = true;
  }
Example #23
 /**
  * Compute the value of the objective function.
  *
  * @return the score
  * @throws Exception if something goes wrong
  */
 protected double getScore() throws Exception {
   double res = 0;
   double t = 0, t2 = 0;
   double sumAlpha = 0.0;
   for (int i = 0; i < m_nInstances; i++) {
     sumAlpha += (m_alpha[i] - m_alphaStar[i]);
     for (int j = 0; j < m_nInstances; j++) {
       t +=
           (m_alpha[i] - m_alphaStar[i])
               * (m_alpha[j] - m_alphaStar[j])
               * m_kernel.eval(i, j, m_data.instance(i));
     }
     //    switch(m_nLossType) {
     //    case L1:
     //    t2 += m_data.instance(i).classValue() * (m_alpha[i] - m_alpha_[i]);
     //    break;
     //    case L2:
     //    t2 += m_data.instance(i).classValue() * (m_alpha[i] - m_alpha_[i]) - (0.5/m_SVM.getC())
     // * (m_alpha[i]*m_alpha[i] + m_alpha_[i]*m_alpha_[i]);
     //    break;
     //    case HUBER:
     //    t2 += m_data.instance(i).classValue() * (m_alpha[i] - m_alpha_[i]) -
     // (0.5*m_SVM.getEpsilon()/m_SVM.getC()) * (m_alpha[i]*m_alpha[i] + m_alpha_[i]*m_alpha_[i]);
     //    break;
     //    case EPSILON:
     // t2 += m_data.instance(i).classValue() * (m_alpha[i] - m_alphaStar[i]) - m_epsilon *
     // (m_alpha[i] + m_alphaStar[i]);
     t2 += m_target[i] * (m_alpha[i] - m_alphaStar[i]) - m_epsilon * (m_alpha[i] + m_alphaStar[i]);
     //    break;
     //    }
   }
   res += -0.5 * t + t2;
   return res;
 }
Example #24
 /**
  * SVMOutput of an instance in the training set, m_data. This uses the cache, unlike
  * SVMOutput(Instance).
  *
  * @param index index of the training instance in m_data
  * @return the SVM output
  * @throws Exception if something goes wrong
  */
 protected double SVMOutput(int index) throws Exception {
   double result = -m_b;
   for (int i = m_supportVectors.getNext(-1); i != -1; i = m_supportVectors.getNext(i)) {
     result += (m_alpha[i] - m_alphaStar[i]) * m_kernel.eval(index, i, m_data.instance(index));
   }
   return result;
 }
Example #25
  /**
   * Stratify the given data into the given number of bags based on the class values. It differs
   * from <code>Instances.stratify(int fold)</code> in that, before stratification, it sorts the
   * instances according to the class order in the header file. It assumes no missing values in the
   * class.
   *
   * @param data the given data
   * @param folds the given number of folds
   * @param rand the random object used to randomize the instances
   * @return the stratified instances
   */
  public static final Instances stratify(Instances data, int folds, Random rand) {
    if (!data.classAttribute().isNominal()) return data;

    Instances result = new Instances(data, 0);
    Instances[] bagsByClasses = new Instances[data.numClasses()];

    for (int i = 0; i < bagsByClasses.length; i++) bagsByClasses[i] = new Instances(data, 0);

    // Sort by class
    for (int j = 0; j < data.numInstances(); j++) {
      Instance datum = data.instance(j);
      bagsByClasses[(int) datum.classValue()].add(datum);
    }

    // Randomize each class
    for (int j = 0; j < bagsByClasses.length; j++) bagsByClasses[j].randomize(rand);

    for (int k = 0; k < folds; k++) {
      int offset = k, bag = 0;
      oneFold:
      while (true) {
        while (offset >= bagsByClasses[bag].numInstances()) {
          offset -= bagsByClasses[bag].numInstances();
          if (++bag >= bagsByClasses.length) // Next bag
          break oneFold;
        }

        result.add(bagsByClasses[bag].instance(offset));
        offset += folds;
      }
    }

    return result;
  }
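
A usage sketch (the enclosing utility class is not named in the source, so the static call is shown unqualified):

  java.util.Random rand = new java.util.Random(42);
  Instances stratified = stratify(data, 10, rand);
  // fold 0 of a 10-fold cross-validation over the stratified order
  Instances train = stratified.trainCV(10, 0);
  Instances test = stratified.testCV(10, 0);
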
Example #26
  /**
   * Find all the instances in the dataset covered/not covered by the rule at the given index; the
   * corresponding simple statistics and predicted class distributions are stored in the given
   * arrays, which can be obtained by getSimpleStats() and getDistributions().<br>
   *
   * @param index the given index, assuming correct
   * @param insts the dataset to be covered by the rule
   * @param stats the given double array to hold stats, side-effected
   * @param dist the given array to hold class distributions, side-effected; if null, the
   *     distribution is not necessary
   * @return the instances covered and not covered by the rule
   */
  private Instances[] computeSimpleStats(
      int index, Instances insts, double[] stats, double[] dist) {
    Rule rule = (Rule) m_Ruleset.elementAt(index);

    Instances[] data = new Instances[2];
    data[0] = new Instances(insts, insts.numInstances());
    data[1] = new Instances(insts, insts.numInstances());

    for (int i = 0; i < insts.numInstances(); i++) {
      Instance datum = insts.instance(i);
      double weight = datum.weight();
      if (rule.covers(datum)) {
        data[0].add(datum); // Covered by this rule
        stats[0] += weight; // Coverage
        if ((int) datum.classValue() == (int) rule.getConsequent())
          stats[2] += weight; // True positives
        else stats[4] += weight; // False positives
        if (dist != null) dist[(int) datum.classValue()] += weight;
      } else {
        data[1].add(datum); // Not covered by this rule
        stats[1] += weight;
        if ((int) datum.classValue() != (int) rule.getConsequent())
          stats[3] += weight; // True negatives
        else stats[5] += weight; // False negatives
      }
    }

    return data;
  }
Example #27
  /**
   * Signify that this batch of input to the filter is finished.
   *
   * @return true if there are instances pending output
   * @throws IllegalStateException if no input structure has been defined
   */
  @Override
  public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (!m_firstBatchFinished) {

      Instances filtered;
      if (m_numOfCrossValidationFolds < 2) {
        filtered = cleanseTrain(getInputFormat());
      } else {
        filtered = cleanseCross(getInputFormat());
      }

      for (int i = 0; i < filtered.numInstances(); i++) {
        push(filtered.instance(i));
      }

      m_firstBatchFinished = true;
      flushInput();
    }
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }
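
Callers normally never invoke batchFinished() directly; Filter.useFilter drives it (standard Weka idiom, variable names are assumptions):

  filter.setInputFormat(data); // defines the input structure checked above
  Instances cleansed = weka.filters.Filter.useFilter(data, filter); // feeds input() and calls batchFinished()
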
Example #28
  @Override
  public void buildClassifier(Instances data) throws Exception {
    trainingData = data;
    Attribute classAttribute = data.classAttribute();
    prototypes = new ArrayList<>();

    classedData = new HashMap<String, ArrayList<Sequence>>();
    indexClassedDataInFullData = new HashMap<String, ArrayList<Integer>>();
    for (int c = 0; c < data.numClasses(); c++) {
      classedData.put(data.classAttribute().value(c), new ArrayList<Sequence>());
      indexClassedDataInFullData.put(data.classAttribute().value(c), new ArrayList<Integer>());
    }

    sequences = new Sequence[data.numInstances()];
    classMap = new String[sequences.length];
    for (int i = 0; i < sequences.length; i++) {
      Instance sample = data.instance(i);
      MonoDoubleItemSet[] sequence = new MonoDoubleItemSet[sample.numAttributes() - 1];
      int shift = (sample.classIndex() == 0) ? 1 : 0;
      for (int t = 0; t < sequence.length; t++) {
        sequence[t] = new MonoDoubleItemSet(sample.value(t + shift));
      }
      sequences[i] = new Sequence(sequence);
      String clas = sample.stringValue(classAttribute);
      classMap[i] = clas;
      classedData.get(clas).add(sequences[i]);
      indexClassedDataInFullData.get(clas).add(i);
      //			System.out.println("Element "+i+" of train is classed "+clas+" and went to element
      // "+(indexClassedDataInFullData.get(clas).size()-1));
    }
    buildSpecificClassifier(data);
  }
Example #29
  /**
   * Generates the classifier.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the classifier has not been generated successfully
   */
  @Override
  public void buildClassifier(Instances data) throws Exception {
    reset();

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    m_data = new Instances(data, 0);
    data = new Instances(data);

    m_wordsPerClass = new double[data.numClasses()];
    m_probOfClass = new double[data.numClasses()];
    m_probOfWordGivenClass = new HashMap<Integer, LinkedHashMap<String, Count>>();

    double laplace = 1.0;
    for (int i = 0; i < data.numClasses(); i++) {
      LinkedHashMap<String, Count> dict =
          new LinkedHashMap<String, Count>(10000 / data.numClasses());
      m_probOfWordGivenClass.put(i, dict);
      m_probOfClass[i] = laplace;

      // this needs to be updated for laplace correction every time we see a new
      // word (attribute)
      m_wordsPerClass[i] = 0;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateClassifier(data.instance(i));
    }
  }
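
A hedged evaluation sketch (assumes the enclosing class is a standard weka.classifiers.Classifier; `nb` and the data set are placeholders):

  Evaluation eval = new Evaluation(trainDocs);
  eval.crossValidateModel(nb, trainDocs, 10, new java.util.Random(1));
  System.out.println(eval.toSummaryString());
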
Example #30
 /**
  * Calculate the average of every column (the last attribute, assumed to be the class, is skipped)
  *
  * @param inst the instances to average over
  * @return the per-column averages, excluding the last attribute
  */
 public Double[] calculateAverage(Instances inst) {
   Double[] average = new Double[inst.numAttributes() - 1];
   for (int i = 0; i < inst.numAttributes() - 1; i++) {
     average[i] = 0.0;
   }
   for (int i = 0; i < inst.numInstances(); i++) {
     for (int x = 0; x < inst.instance(i).numAttributes() - 1; x++) {
       Instance ins = inst.instance(i);
       if (ins != null && !Double.isNaN(ins.value(x))) average[x] += ins.value(x);
     }
   }
   for (int i = 0; i < inst.numAttributes() - 1; i++) {
     average[i] /= inst.numInstances();
   }
   return average;
 }