Example #1
  public Classification classify(Instance instance) {
    // The instance's data must be a FeatureVector built over the same
    // alphabet as the pipe this tree was trained with.
    FeatureVector fv = (FeatureVector) instance.getData();
    assert (instancePipe == null || fv.getAlphabet() == this.instancePipe.getDataAlphabet());

    // Descend the trained tree to the leaf matching this feature vector and
    // return that leaf's label distribution as the classification.
    Node leaf = getLeaf(m_root, fv);
    return new Classification(instance, this, leaf.getGainRatio().getBaseLabelDistribution());
  }
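A returned Classification can be inspected through its Labeling regardless of which classifier produced it. Below is a minimal, hedged sketch; the helper class name is made up, and only the public Classification/Labeling API is assumed:

import cc.mallet.classify.Classification;
import cc.mallet.types.Labeling;

// Hedged sketch: reading a Classification such as the one returned above.
// For the decision tree, the labeling is the leaf's label distribution.
final class ClassificationPrinter {
  static void print(Classification result) {
    Labeling labeling = result.getLabeling();
    System.out.println("best label: " + labeling.getBestLabel()
        + " (score " + labeling.getBestValue() + ")");
    // Full distribution over the label alphabet, best-scoring first.
    for (int rank = 0; rank < labeling.numLocations(); rank++) {
      System.out.println(labeling.getLabelAtRank(rank) + "\t" + labeling.getValueAtRank(rank));
    }
  }
}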
Example #2
 public void addToken(FeatureVector fv) {
   int[] indices = fv.getIndices();
   Alphabet dictionary = fv.getAlphabet();
   int indicesLength = fv.numLocations();
   for (int i = 0; i < indicesLength; i++) {
     String key = dictionary.lookupObject(indices[i]).toString();
     // log.info( key );
     // When contextOnly is set, count only features drawn from neighboring
     // tokens (names ending in a /-2 .. /+2 offset); otherwise count every feature.
     if (!contextOnly
         || key.endsWith("/+1")
         || key.endsWith("/-1")
         || key.endsWith("/-2")
         || key.endsWith("/+2")) {
       map.increment(key);
     }
   }
 }
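For context, here is a hedged sketch of how a FeatureVector with such offset-suffixed feature names might be assembled and handed to addToken. The feature names are invented for illustration (in the original code the vector would come out of a MALLET pipe), and the fragment is assumed to live inside the class that defines addToken, map and contextOnly:

import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;

// Hedged sketch: a binary FeatureVector whose feature names carry the
// /-2 .. /+2 context suffixes that addToken(...) filters on.
Alphabet dict = new Alphabet();
int[] indices = new int[] {
    dict.lookupIndex("WORD=bank"),      // current-token feature (dropped when contextOnly)
    dict.lookupIndex("WORD=river/-1"),  // feature from the token one position to the left
    dict.lookupIndex("WORD=of/+1")      // feature from the token one position to the right
};
FeatureVector fv = new FeatureVector(dict, indices);  // binary vector, all values 1.0
addToken(fv);  // with contextOnly == true, only the /-1 and /+1 keys are counted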
Example #3
  /**
   * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
   * instance's FeatureVector must match the alphabet of the pipe used to train the classifier.
   *
   * @param instance to be classified. Data field must be a FeatureVector
   * @return Classification containing the labeling of the instance
   */
  public Classification classify(Instance instance) {
    // Note that the current size of the label alphabet can be larger
    // than it was at the time of training.  We are careful to handle
    // such labels correctly here: we assume the log prior probability
    // of those classes is minus infinity.
    int numClasses = getLabelAlphabet().size();
    double[] scores = new double[numClasses];
    FeatureVector fv = (FeatureVector) instance.getData();
    // Make sure the feature vector's feature dictionary matches
    // what we are expecting from our data pipe (and thus our notion
    // of feature probabilities).
    assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
    int fvisize = fv.numLocations();

    prior.addLogProbabilities(scores);

    // Set the scores according to the feature weights and per-class probabilities
    for (int fvi = 0; fvi < fvisize; fvi++) {
      int fi = fv.indexAtLocation(fvi);
      for (int ci = 0; ci < numClasses; ci++) {
        // Guard against the data or label alphabet having grown since training; this can
        // happen when classifying an instance with a never-before-seen feature.  Ignore these.
        if (ci >= p.length || fi >= p[ci].size()) continue;

        scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
      }
    }

    // Get the scores in the range near zero, where exp() is more accurate
    double maxScore = Double.NEGATIVE_INFINITY;
    for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci];
    for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore;

    // Exponentiate and normalize
    double sum = 0;
    for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci]));
    for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum;

    // Create and return a Classification object
    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
  }
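An end-to-end sketch of how this classify(...) method is typically reached, assuming the standard MALLET text pipes; the pipe composition, class labels, and document strings are illustrative assumptions, not part of the example above:

import cc.mallet.classify.Classification;
import cc.mallet.classify.NaiveBayes;
import cc.mallet.classify.NaiveBayesTrainer;
import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.FeatureSequence2FeatureVector;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public class NaiveBayesClassifyDemo {
  public static void main(String[] args) {
    // Text -> tokens -> feature indices -> sparse FeatureVector,
    // plus Target2Label so class names become Labels.
    Pipe pipe = new SerialPipes(new Pipe[] {
        new Target2Label(),
        new CharSequence2TokenSequence(),
        new TokenSequence2FeatureSequence(),
        new FeatureSequence2FeatureVector()
    });

    InstanceList training = new InstanceList(pipe);
    training.addThruPipe(new Instance("cheap pills buy now", "spam", "doc1", null));
    training.addThruPipe(new Instance("meeting agenda for monday", "ham", "doc2", null));

    NaiveBayes nb = new NaiveBayesTrainer().train(training);

    // Run the test text through the *same* pipe so the feature alphabet
    // matches, as the assert in classify(...) requires.
    InstanceList testing = new InstanceList(pipe);
    testing.addThruPipe(new Instance("buy cheap pills", "spam", "doc3", null));

    Classification c = nb.classify(testing.get(0));
    System.out.println(c.getLabeling().getBestLabel());
  }
}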