Example #1
  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    Instances isTrainingSet = createSet(4);
    Instance instance1 = createInstance(new double[] {1, 0.7, 0.1, 0.7}, "S1", isTrainingSet);
    Instance instance2 = createInstance(new double[] {0.1, 0.2, 1, 0.3}, "S2", isTrainingSet);
    Instance instance22 = createInstance(new double[] {0, 0, 0, 0}, "S3", isTrainingSet);
    isTrainingSet.add(instance1);
    isTrainingSet.add(instance2);
    isTrainingSet.add(instance22);
    Instances isTestingSet = createSet(4);
    Instance instance3 = createInstance(new double[] {1, 0.7, 0.1, 0.7}, "S1", isTestingSet);
    Instance instance4 = createInstance(new double[] {0.1, 0.2, 1, 0.3}, "S2", isTestingSet);
    isTestingSet.add(instance3);
    isTestingSet.add(instance4);

    // Create a Bayes network classifier
    Classifier cModel = (Classifier) new BayesNet(); // M5P
    cModel.buildClassifier(isTrainingSet);

    // Test the model
    Evaluation eTest = new Evaluation(isTrainingSet);
    eTest.evaluateModel(cModel, isTestingSet);

    // Print the result à la Weka explorer:
    String strSummary = eTest.toSummaryString();
    System.out.println(strSummary);

    // Get the likelihood of each class
    // fDistribution[i] is the predicted probability of the i-th class value
    double[] fDistribution = cModel.distributionForInstance(instance4);
    for (int i = 0; i < fDistribution.length; i++) {
      System.out.println(fDistribution[i]);
    }
  }
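
The createSet and createInstance helpers are not shown in this example and are not part of Weka. A minimal sketch of what they might look like, assuming numeric attributes and that the string argument ("S1", "S2", ...) is the nominal class label:

  // Assumed imports: java.util.ArrayList, java.util.Arrays,
  // weka.core.Attribute, weka.core.DenseInstance, weka.core.Instance, weka.core.Instances

  private static Instances createSet(int numAttrs) {
    ArrayList<Attribute> attrs = new ArrayList<>();
    for (int i = 0; i < numAttrs; i++) {
      attrs.add(new Attribute("att" + (i + 1))); // numeric attribute
    }
    attrs.add(new Attribute("class", Arrays.asList("S1", "S2", "S3"))); // assumed labels
    Instances set = new Instances("dataset", attrs, 0);
    set.setClassIndex(set.numAttributes() - 1);
    return set;
  }

  private static Instance createInstance(double[] values, String classLabel, Instances dataset) {
    Instance inst = new DenseInstance(dataset.numAttributes()); // all values missing initially
    inst.setDataset(dataset);
    for (int i = 0; i < values.length; i++) {
      inst.setValue(i, values[i]);
    }
    inst.setClassValue(classLabel);
    return inst;
  }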
Example #2
  private static void writePredictedDistributions(
      Classifier c, Instances data, int idIndex, Writer out) throws Exception {
    // header
    out.write("id");
    for (int i = 0; i < data.numClasses(); i++) {
      out.write(",\"");
      out.write(data.classAttribute().value(i).replaceAll("[\"\\\\]", "_"));
      out.write("\"");
    }
    out.write("\n");

    // data
    for (int i = 0; i < data.numInstances(); i++) {
      final String id = data.instance(i).stringValue(idIndex);
      double[] distribution = c.distributionForInstance(data.instance(i));

      // final String label = data.attribute(classIndex).value();
      out.write(id);
      for (double probability : distribution) {
        out.write(",");
        out.write(String.valueOf(probability > 1e-5 ? (float) probability : 0f));
      }
      out.write("\n");
    }
  }
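
For context, a hedged usage sketch of this helper (file names and the id attribute position are illustrative assumptions; the classifier is assumed to be trained already):

  // Assumed imports: java.io.*, weka.classifiers.Classifier, weka.core.Instances

  public static void writePredictions(Classifier trained) throws Exception {
    Instances data = new Instances(new BufferedReader(new FileReader("test.arff")));
    data.setClassIndex(data.numAttributes() - 1);
    try (Writer out = new BufferedWriter(new FileWriter("predictions.csv"))) {
      writePredictedDistributions(trained, data, 0 /* id attribute index */, out);
    }
  }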
Example #3
  public int SelectRow_KLDivergenceMisclassified(
      Instances pool, Classifier myEstimator, int desiredAttr) {

    // for each instance with unbought desiredAttr and label = desiredLabel
    // measure KL-divergence (relative entropy between two prob distributions):
    //  KL(P||Q) = sum_i  p_i log (p_i/q_i)
    // with respect to Q = Uniform this reduces (up to the constant log n) to
    //  KL(P||U) = sum_i p_i log(p_i)
    // choose the row with minimum divergence (i.e. closest to uniform)

    int numInstances = pool.numInstances();
    double[] KLDivs = new double[numInstances];
    boolean[] isValidInstance = new boolean[numInstances];
    boolean misclassified = false;
    double[] probs = null;
    Instance inst;

    for (int i = 0; i < numInstances; i++) {
      inst = pool.instance(i);
      try {
        if (inst.classValue() != myEstimator.classifyInstance(inst)) misclassified = true;
        else misclassified = false;
      } catch (Exception e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
      }
      if (inst.isMissing(desiredAttr) && misclassified) {
        try {
          probs = myEstimator.distributionForInstance(inst);
        } catch (Exception e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
        for (int j = 0; j < probs.length; j++) KLDivs[i] += MyXLogX(probs[j]);
        isValidInstance[i] = true;
      } else {
        KLDivs[i] = Double.MAX_VALUE;
        isValidInstance[i] = false;
      }
    }

    double leastDivergence = KLDivs[Utils.minIndex(KLDivs)];
    int numLeastDivs = 0;
    for (int i = 0; i < numInstances; i++)
      if (isValidInstance[i] && KLDivs[i] == leastDivergence) numLeastDivs++;
    int randomInstance = r.nextInt(numLeastDivs);
    int index = 0;
    for (int i = 0; i < numInstances; i++) {
      if (isValidInstance[i] && KLDivs[i] == leastDivergence) {
        if (index == randomInstance) return i;
        else index++;
      }
    }
    return -1;
  }
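
MyXLogX and the random generator r are members of the surrounding class that the example does not show. Plausible sketches, using the usual 0 log 0 = 0 convention (note KL(P||U) = sum_i p_i log(p_i) + log n, so the constant log n can be dropped when ranking rows):

  // Assumed import: java.util.Random

  private final Random r = new Random(); // breaks ties among equally good rows

  // x * log(x), with 0 log 0 taken as 0, as used in entropy computations
  private static double MyXLogX(double x) {
    return (x > 0.0) ? x * Math.log(x) : 0.0;
  }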
Example #4
  public void batchPredict() {
    // load all test set
    String modelFile =
        "data\\AcquireValueShopper\\decisionTable_bayes_trees.model".replace("\\", File.separator);
    String pathTest = "data/AcquireValueShopper/test_new.csv";
    String pathPredict = "data/AcquireValueShopper/submission.csv";

    Scanner scanner;
    String line = "";
    String[] partsOfLine = null;
    String id = "";
    PrintWriter output;
    Map<String, String> testSet = new HashMap<String, String>();
    try {
      scanner = new Scanner(new File(pathTest));
      while (scanner.hasNext()) {
        line = scanner.nextLine().trim();
        partsOfLine = line.split(",");
        id = partsOfLine[0];
        testSet.put(id, line);
      }
      scanner.close();
    } catch (FileNotFoundException e1) {
      // TODO Auto-generated catch block
      e1.printStackTrace();
    }
    double[] returnProb;
    double prob = 0.0;
    // predict
    try {
      // load model
      Classifier classifier = (Classifier) SerializationHelper.read(modelFile);

      output = new PrintWriter(pathPredict);
      output.append("id,repeatProbability" + "\n");
      Iterator<String> idIterator = testSet.keySet().iterator();
      while (idIterator.hasNext()) {
        id = idIterator.next();
        line = testSet.get(id);
        Instances instances = buildInstance(line);
        Instance instance = instances.instance(0);
        returnProb = classifier.distributionForInstance(instance);
        prob = returnProb[1];
        // prob = classifier.classifyInstance(instance);
        output.append(id + "," + prob + "\n");
      }
      output.close();
    } catch (FileNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
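
buildInstance is project-specific and not shown. A hedged sketch, assuming a header template m_header (an Instances object with the training attributes, class included) and that the CSV columns after the id are numeric:

  // Assumed imports: weka.core.DenseInstance, weka.core.Instances, weka.core.Utils

  private Instances buildInstance(String line) {
    String[] parts = line.split(",");
    Instances one = new Instances(m_header, 1); // m_header: assumed attribute template
    double[] vals = new double[m_header.numAttributes()];
    for (int i = 0; i < vals.length; i++) {
      // assumed layout: column 0 holds the id, the remaining columns are numeric features
      vals[i] = (i + 1 < parts.length) ? Double.parseDouble(parts[i + 1]) : Utils.missingValue();
    }
    one.add(new DenseInstance(1.0, vals));
    return one;
  }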
Example #5
  int SelectRow_ErrorMargin(Instances pool, Classifier myEstimator, int desiredAttr) {

    // for each instance with unbought desiredAttr and label = desiredLabel
    // measure Prob(i,L(i)) the class probability of the true label, choose the one minimizing it.
    // i.e. the most erroneous instance

    int numInstances = pool.numInstances();
    double[] classProb = new double[numInstances];
    boolean[] isValidInstance = new boolean[numInstances];
    double[] probs = null;
    Instance inst;

    for (int i = 0; i < numInstances; i++) {
      inst = pool.instance(i);
      if (inst.isMissing(desiredAttr)) {
        try {
          probs = myEstimator.distributionForInstance(inst);
          classProb[i] = probs[(int) inst.classValue()];
          isValidInstance[i] = true;

        } catch (Exception e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }

      } else {
        classProb[i] = Double.POSITIVE_INFINITY;
        isValidInstance[i] = false;
      }
    }

    double leastCorrect = classProb[Utils.minIndex(classProb)];
    int numLeastCorrect = 0;
    for (int i = 0; i < numInstances; i++) {
      if (isValidInstance[i] && classProb[i] == leastCorrect) numLeastCorrect++;
    }

    int randomInstance = r.nextInt(numLeastCorrect);
    int index = 0;

    for (int i = 0; i < numInstances; i++) {
      if (isValidInstance[i] && classProb[i] == leastCorrect) {
        if (index == randomInstance) return i;
        else index++;
      }
    }
    return -1;
  }
Example #6
  /**
   * Calculates the class membership probabilities for the given test instance
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if there is a problem generating the prediction
   */
  public double[] distributionForInstance(Instance instance) throws Exception {

    // default model?
    if (m_ZeroR != null) {
      return m_ZeroR.distributionForInstance(instance);
    }

    // Definition of local variables
    double[] probs = new double[m_NumClasses];
    double prob;
    double mutualInfoSum;

    // store instance's att values in an int array
    int[] attIndex = new int[m_NumAttributes];
    for (int att = 0; att < m_NumAttributes; att++) {
      if (att == m_ClassIndex) attIndex[att] = -1;
      else attIndex[att] = m_StartAttIndex[att] + (int) instance.value(att);
    }

    // calculate probabilities for each possible class value
    for (int classVal = 0; classVal < m_NumClasses; classVal++) {
      probs[classVal] = 0;
      prob = 1;
      mutualInfoSum = 0.0;
      for (int parent = 0; parent < m_NumAttributes; parent++) {
        if (attIndex[parent] == -1) continue;
        prob =
            (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[parent]]
                    + 1.0 / (m_NumClasses * m_NumAttValues[parent]))
                / (m_NumInstances + 1.0);
        for (int son = 0; son < m_NumAttributes; son++) {
          if (attIndex[son] == -1 || son == parent) continue;
          prob *=
              (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[son]]
                      + 1.0 / m_NumAttValues[son])
                  / (m_ClassAttAttCounts[classVal][attIndex[parent]][attIndex[parent]] + 1.0);
        }
        mutualInfoSum += m_mutualInformation[parent];
        probs[classVal] += m_mutualInformation[parent] * prob;
      }
      probs[classVal] /= mutualInfoSum;
    }
    if (!Double.isNaN(Utils.sum(probs))) Utils.normalize(probs);
    return probs;
  }
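
Read as a formula, the nested loops implement a mutual-information-weighted average of one-dependence estimates (a hedged reconstruction; \hat{P} stands for the smoothed frequency estimates taken from the count arrays, and I(A_p;C) for m_mutualInformation[parent]):

  P(c \mid a_1,\dots,a_n) \;\propto\;
    \frac{\sum_{p} I(A_p;C)\, \hat{P}(c,a_p) \prod_{s \neq p} \hat{P}(a_s \mid c,a_p)}
         {\sum_{p} I(A_p;C)}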
Example #7
  public int SelectRow_L2Norm(
      Instances pool, Classifier myEstimator, int desiredAttr, int desiredLabel) {

    // for each instance with unbought desiredAttr and label = desiredLabel
    // measure distance from uniform
    // choose (row) that is closest to uniform as your instance to buy from

    double leastDistance = Double.MAX_VALUE;
    int leastIndex = -1;
    Instance inst;
    int n = pool.numClasses();
    double[] uniform;
    double[] probs;
    uniform = new double[n];
    for (int i = 0; i < n; i++) uniform[i] = 1.0 / (double) n;

    for (int i = 0; i < pool.numInstances(); i++) {
      inst = pool.instance(i);
      // System.out.println("currentlabel="+(int)inst.classValue()+"
      // isMissing="+inst.isMissing(desiredAttr));
      if ((int) inst.classValue() == desiredLabel && inst.isMissing(desiredAttr)) {
        // valid instance
        // measure the distance from uniform:
        // sqrt{ sum_i (a_i - b_i)^2 }
        probs = new double[n];
        try {
          probs = myEstimator.distributionForInstance(inst);
        } catch (Exception e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
        double distance = 0.0;
        for (int j = 0; j < n; j++) distance += (probs[j] - uniform[j]) * (probs[j] - uniform[j]);
        distance = Math.sqrt(distance);
        // System.out.println("current distance="+distance);
        if (distance < leastDistance) {
          leastDistance = distance;
          leastIndex = i;
        }
      }
    }
    return leastIndex;
  }
Example #8
  /**
   * Calculates the class membership probabilities for the given test instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if instance could not be classified successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {

    double[] sums = new double[instance.numClasses()];
    for (int i = -1; i < m_NumIterationsPerformed; i++) {
      double prob = 1, shrinkage = m_Shrinkage;
      if (i == -1) {
        prob = m_ZeroR.distributionForInstance(instance)[0];
        shrinkage = 1.0;
      } else {
        prob = m_Classifiers[i].distributionForInstance(instance)[0];

        // Make sure that probabilities are never 0 or 1 using ad-hoc smoothing
        prob = (m_SumOfWeights * prob + 1) / (m_SumOfWeights + 2);
      }
      sums[0] += shrinkage * 0.5 * (Math.log(prob) - Math.log(1 - prob));
    }
    sums[1] = -sums[0];
    return Utils.logs2probs(sums);
  }
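
Utils.logs2probs turns the accumulated log-odds back into probabilities. Conceptually it is a max-stabilized softmax, roughly as below (a sketch, not Weka's exact implementation):

  static double[] logs2probsSketch(double[] logs) {
    double max = logs[0];
    for (double l : logs) max = Math.max(max, l); // subtract the max for numerical stability
    double[] probs = new double[logs.length];
    double sum = 0.0;
    for (int i = 0; i < logs.length; i++) {
      probs[i] = Math.exp(logs[i] - max);
      sum += probs[i];
    }
    for (int i = 0; i < probs.length; i++) {
      probs[i] /= sum; // normalize so the probabilities sum to one
    }
    return probs;
  }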
Example #9
  /**
   * Calculates the class membership probabilities for the given test instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @exception Exception if distribution can't be computed successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
    if (instance.classAttribute().isNumeric()) {
      throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!");
    }
    double[] sums = new double[instance.numClasses()], newProbs;
    Classifier curr;

    for (int i = 0; i < m_Committee.size(); i++) {
      curr = (Classifier) m_Committee.get(i);
      newProbs = curr.distributionForInstance(instance);
      for (int j = 0; j < newProbs.length; j++) sums[j] += newProbs[j];
    }
    if (Utils.eq(Utils.sum(sums), 0)) {
      return sums;
    } else {
      Utils.normalize(sums);
      return sums;
    }
  }
Example #10
  /**
   * Sets the weights for the next iteration.
   *
   * @param training the training instances
   * @throws Exception if something goes wrong
   */
  protected void setWeights(Instances training, int iteration) throws Exception {

    for (Instance instance : training) {
      double reweight = 1;
      double prob = 1, shrinkage = m_Shrinkage;

      if (iteration == -1) {
        prob = m_ZeroR.distributionForInstance(instance)[0];
        shrinkage = 1.0;
      } else {
        prob = m_Classifiers[iteration].distributionForInstance(instance)[0];

        // Make sure that probabilities are never 0 or 1 using ad-hoc smoothing
        prob = (m_SumOfWeights * prob + 1) / (m_SumOfWeights + 2);
      }

      if (instance.classValue() == 1) {
        reweight = shrinkage * 0.5 * (Math.log(prob) - Math.log(1 - prob));
      } else {
        reweight = shrinkage * 0.5 * (Math.log(1 - prob) - Math.log(prob));
      }
      instance.setWeight(instance.weight() * Math.exp(reweight));
    }
  }
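
Read as a formula (hedged; here p is the model's predicted probability of the first class): with F = shrinkage \cdot \tfrac{1}{2} \log\frac{p}{1-p}, the update is

  w \leftarrow w\, e^{F} \ \text{(class-1 instances)}, \qquad
  w \leftarrow w\, e^{-F} \ \text{(class-0 instances)},

so instances the current model predicts badly receive larger weight in the next iteration.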
Example #11
 @Override
 public double[] distributionForInstance(Instance inst) throws Exception {
   return m_model.distributionForInstance(inst);
 }
Example #12
 /**
  * Return the argmax on #distribution(Instance, double[]).
  *
  * @return argmax_{k in 0,1,...} p( y_j = k | x , y_pred )
  */
 public double classify(Instance x, double ypred[]) throws Exception {
   Instance x_ = transform(x, ypred);
   return Utils.maxIndex(h.distributionForInstance(x_));
 }
Example #13
 /**
  * Same as #distribution(Instance, double[]), but the Instance is pre-transformed with ypred
  * inside.
  */
 public double[] distributionT(Instance x_) throws Exception {
   return h.distributionForInstance(x_);
 }
Example #14
 /**
  * The distribution of this node, given input x.
  *
  * @return p( y_j = k | x , y_pred ) for k in {0,1}
  */
 public double[] distribution(Instance x, double ypred[]) throws Exception {
   Instance x_ = transform(x, ypred);
   return h.distributionForInstance(x_);
 }
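
How the three node methods fit together (a hedged illustration; h and transform belong to the surrounding chain-node class, and the names are taken from the snippets above):

  double predictNode(Instance x, double[] ypred) throws Exception {
    Instance x_ = transform(x, ypred); // augment x with the chain's earlier predictions
    double[] p = distributionT(x_);    // p( y_j = k | x , y_pred ), via the node's classifier h
    return Utils.maxIndex(p);          // the same argmax that classify(x, ypred) returns
  }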
Example #15
  /**
   * Calculates the class membership probabilities for the given test instance.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @throws Exception if distribution can't be computed successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {

    // default model?
    if (m_ZeroR != null) {
      return m_ZeroR.distributionForInstance(instance);
    }

    if (m_Train.numInstances() == 0) {
      throw new Exception("No training instances!");
    }

    m_NNSearch.addInstanceInfo(instance);

    int k = m_Train.numInstances();
    if ((!m_UseAllK && (m_kNN < k)) /*&&
       !(m_WeightKernel==INVERSE ||
         m_WeightKernel==GAUSS)*/) {
      k = m_kNN;
    }

    Instances neighbours = m_NNSearch.kNearestNeighbours(instance, k);
    double distances[] = m_NNSearch.getDistances();

    if (m_Debug) {
      System.out.println("Test Instance: " + instance);
      System.out.println(
          "For "
              + k
              + " kept "
              + neighbours.numInstances()
              + " out of "
              + m_Train.numInstances()
              + " instances.");
    }

    // If LinearNN has skipped so many instances that fewer than k neighbours remain.
    if (k > distances.length) k = distances.length;

    if (m_Debug) {
      System.out.println("Instance Distances");
      for (int i = 0; i < distances.length; i++) {
        System.out.println("" + distances[i]);
      }
    }

    // Determine the bandwidth
    double bandwidth = distances[k - 1];

    // Check for bandwidth zero
    if (bandwidth <= 0) {
      // if the kth distance is zero then give all instances the same weight
      for (int i = 0; i < distances.length; i++) distances[i] = 1;
    } else {
      // Rescale the distances by the bandwidth
      for (int i = 0; i < distances.length; i++) distances[i] = distances[i] / bandwidth;
    }

    // Pass the distances through a weighting kernel
    for (int i = 0; i < distances.length; i++) {
      switch (m_WeightKernel) {
        case LINEAR:
          distances[i] = 1.0001 - distances[i];
          break;
        case EPANECHNIKOV:
          distances[i] = 3 / 4D * (1.0001 - distances[i] * distances[i]);
          break;
        case TRICUBE:
          distances[i] = Math.pow((1.0001 - Math.pow(distances[i], 3)), 3);
          break;
        case CONSTANT:
          // System.err.println("using constant kernel");
          distances[i] = 1;
          break;
        case INVERSE:
          distances[i] = 1.0 / (1.0 + distances[i]);
          break;
        case GAUSS:
          distances[i] = Math.exp(-distances[i] * distances[i]);
          break;
      }
    }

    if (m_Debug) {
      System.out.println("Instance Weights");
      for (int i = 0; i < distances.length; i++) {
        System.out.println("" + distances[i]);
      }
    }

    // Set the weights on the training data
    double sumOfWeights = 0, newSumOfWeights = 0;
    for (int i = 0; i < distances.length; i++) {
      double weight = distances[i];
      Instance inst = (Instance) neighbours.instance(i);
      sumOfWeights += inst.weight();
      newSumOfWeights += inst.weight() * weight;
      inst.setWeight(inst.weight() * weight);
      // weightedTrain.add(newInst);
    }

    // Rescale weights
    for (int i = 0; i < neighbours.numInstances(); i++) {
      Instance inst = neighbours.instance(i);
      inst.setWeight(inst.weight() * sumOfWeights / newSumOfWeights);
    }

    // Create a weighted classifier
    m_Classifier.buildClassifier(neighbours);

    if (m_Debug) {
      System.out.println("Classifying test instance: " + instance);
      System.out.println("Built base classifier:\n" + m_Classifier.toString());
    }

    // Return the classifier's predictions
    return m_Classifier.distributionForInstance(instance);
  }
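
Summarizing the weighting step above (hedged): each distance d_i is rescaled by the bandwidth h = d_{(k)} and mapped to a weight

  w_i = 1.0001 - d_i \ (\text{linear}), \quad
  \tfrac{3}{4}(1.0001 - d_i^2) \ (\text{Epanechnikov}), \quad
  (1.0001 - d_i^3)^3 \ (\text{tricube}), \quad
  1 \ (\text{constant}), \quad
  \frac{1}{1 + d_i} \ (\text{inverse}), \quad
  e^{-d_i^2} \ (\text{Gauss}),

where the 1.0001 keeps the k-th neighbour's weight strictly positive.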
Example #16
  private double[] calculateRegionProbs(int j, int i) throws Exception {
    double[] sumOfProbsForRegion = new double[m_trainingData.classAttribute().numValues()];

    for (int u = 0; u < m_numOfSamplesPerRegion; u++) {

      double[] sumOfProbsForLocation = new double[m_trainingData.classAttribute().numValues()];

      m_weightingAttsValues[m_xAttribute] = getRandomX(j);
      m_weightingAttsValues[m_yAttribute] = getRandomY(m_panelHeight - i - 1);

      m_dataGenerator.setWeightingValues(m_weightingAttsValues);

      double[] weights = m_dataGenerator.getWeights();
      double sumOfWeights = Utils.sum(weights);
      int[] indices = Utils.sort(weights);

      // Prune 1% of weight mass
      int[] newIndices = new int[indices.length];
      double sumSoFar = 0;
      double criticalMass = 0.99 * sumOfWeights;
      int index = weights.length - 1;
      int counter = 0;
      for (int z = weights.length - 1; z >= 0; z--) {
        newIndices[index--] = indices[z];
        sumSoFar += weights[indices[z]];
        counter++;
        if (sumSoFar > criticalMass) {
          break;
        }
      }
      indices = new int[counter];
      System.arraycopy(newIndices, index + 1, indices, 0, counter);

      for (int z = 0; z < m_numOfSamplesPerGenerator; z++) {

        m_dataGenerator.setWeightingValues(m_weightingAttsValues);
        double[][] values = m_dataGenerator.generateInstances(indices);

        for (int q = 0; q < values.length; q++) {
          if (values[q] != null) {
            System.arraycopy(values[q], 0, m_vals, 0, m_vals.length);
            m_vals[m_xAttribute] = m_weightingAttsValues[m_xAttribute];
            m_vals[m_yAttribute] = m_weightingAttsValues[m_yAttribute];

            // classify the instance (m_predInst is assumed to share the m_vals
            // array, so it sees the values copied in above)
            m_dist = m_classifier.distributionForInstance(m_predInst);

            for (int k = 0; k < sumOfProbsForLocation.length; k++) {
              sumOfProbsForLocation[k] += (m_dist[k] * weights[q]);
            }
          }
        }
      }

      for (int k = 0; k < sumOfProbsForRegion.length; k++) {
        sumOfProbsForRegion[k] += (sumOfProbsForLocation[k] * sumOfWeights);
      }
    }

    // average
    Utils.normalize(sumOfProbsForRegion);

    // cache
    double[] tempDist = new double[sumOfProbsForRegion.length];
    System.arraycopy(sumOfProbsForRegion, 0, tempDist, 0, sumOfProbsForRegion.length);

    return tempDist;
  }
Example #17
 /**
  * Calculates the class membership probabilities for the given test instance.
  *
  * @param instance the instance to be classified
  * @return predicted class probability distribution
  * @throws Exception if class is numeric
  */
 public double[] distributionForInstance(Instance instance) throws Exception {
   if (m_GroovyObject != null) return m_GroovyObject.distributionForInstance(instance);
   else return new double[instance.numClasses()];
 }