コード例 #1
0
  public void count() {

    TIntIntHashMap docCounts = new TIntIntHashMap();

    int index = 0;

    if (instances.size() == 0) {
      logger.info("Instance list is empty");
      return;
    }

    if (instances.get(0).getData() instanceof FeatureSequence) {

      for (Instance instance : instances) {
        FeatureSequence features = (FeatureSequence) instance.getData();

        for (int i = 0; i < features.getLength(); i++) {
          docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
        }

        int[] keys = docCounts.keys();
        for (int i = 0; i < keys.length - 1; i++) {
          int feature = keys[i];
          featureCounts[feature] += docCounts.get(feature);
          documentFrequencies[feature]++;
        }

        docCounts = new TIntIntHashMap();

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else if (instances.get(0).getData() instanceof FeatureVector) {

      for (Instance instance : instances) {
        FeatureVector features = (FeatureVector) instance.getData();

        for (int location = 0; location < features.numLocations(); location++) {
          int feature = features.indexAtLocation(location);
          double value = features.valueAtLocation(location);

          documentFrequencies[feature]++;
          featureCounts[feature] += value;
        }

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else {
      logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
    }
  }
コード例 #2
0
ファイル: NaiveBayes.java プロジェクト: Balkanlii/nlp
  private double dataLogProbability(Instance instance, int labelIndex) {
    FeatureVector fv = (FeatureVector) instance.getData();
    int fvisize = fv.numLocations();
    double logProb = 0;

    for (int fvi = 0; fvi < fvisize; fvi++)
      logProb += fv.valueAtLocation(fvi) * p[labelIndex].logProbability(fv.indexAtLocation(fvi));
    return logProb;
  }
コード例 #3
0
ファイル: TokenTextCharNGrams.java プロジェクト: zabak/digmap
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < (slen - size) + 1; k++)
         t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
     }
   }
   return carrier;
 }
コード例 #4
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < slen - size; k++)
         t.setFeatureValue(
             s.substring(k, k + size), 1.0); // original was substring(k, size), changed by Fuchun
     }
   }
   return carrier;
 }
コード例 #5
0
  public Instance pipe(Instance carrier) {
    Sequence data = (Sequence) carrier.getData();
    Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size())
      throw new IllegalArgumentException(
          "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
              + "data.length = "
              + data.size()
              + ", target.length = "
              + target.size());

    int N = data.size();

    if (data instanceof TokenSequence) {
      throw new UnsupportedOperationException("Not yet implemented.");
    } else if (data instanceof FeatureVectorSequence) {

      FeatureVectorSequence fvs = (FeatureVectorSequence) data;
      Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;

      for (int i = 0; i < N; i++) {
        Object label = target.get(i);
        writer.print(label);

        FeatureVector fv = fvs.getFeatureVector(i);
        for (int loc = 0; loc < fv.numLocations(); loc++) {
          writer.print(' ');
          String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
          double value = fv.valueAtLocation(loc);
          // if (!Maths.almostEquals(value, 1.0)) {
          //    throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector
          // not binary at time slice "+i+" fv:"+fv);
          // }
          writer.print(fname + String.valueOf(value));
        }
        writer.println();
      }
    } else {
      throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    writer.println();
    // writer.print(getDataAlphabet());

    return carrier;
  }
コード例 #6
0
ファイル: NaiveBayes.java プロジェクト: Balkanlii/nlp
  /**
   * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
   * featureVector of the instance must match the alphabe of the pipe used to train the classifier.
   *
   * @param instance to be classified. Data field must be a FeatureVector
   * @return Classification containing the labeling of the instance
   */
  public Classification classify(Instance instance) {
    // Note that the current size of the label alphabet can be larger
    // than it was at the time of training.  We are careful here
    // to correctly handle those labels here. For example,
    // we assume the log prior probability of those classes is
    // minus infinity.
    int numClasses = getLabelAlphabet().size();
    double[] scores = new double[numClasses];
    FeatureVector fv = (FeatureVector) instance.getData();
    // Make sure the feature vector's feature dictionary matches
    // what we are expecting from our data pipe (and thus our notion
    // of feature probabilities.
    assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
    int fvisize = fv.numLocations();

    prior.addLogProbabilities(scores);

    // Set the scores according to the feature weights and per-class probabilities
    for (int fvi = 0; fvi < fvisize; fvi++) {
      int fi = fv.indexAtLocation(fvi);
      for (int ci = 0; ci < numClasses; ci++) {
        // guard against dataAlphabet or target alphabet growing; can happen if classifying
        // a never before seen feature.  Ignore these.
        if (ci >= p.length || fi >= p[ci].size()) continue;

        scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
      }
    }

    // Get the scores in the range near zero, where exp() is more accurate
    double maxScore = Double.NEGATIVE_INFINITY;
    for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci];
    for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore;

    // Exponentiate and normalize
    double sum = 0;
    for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci]));
    for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum;

    // Create and return a Classification object
    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
  }
コード例 #7
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   TokenSequence targets =
       carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
   TokenSequence source =
       carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
   StringBuffer sb = new StringBuffer();
   if (prefix != null) sb.append(prefix);
   sb.append("name: " + carrier.getName() + "\n");
   for (int i = 0; i < ts.size(); i++) {
     if (source != null) {
       sb.append(source.get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof TokenSequence) {
       sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof FeatureSequence) {
       sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
       sb.append(' ');
     }
     PropertyList pl = ts.get(i).getFeatures();
     if (pl != null) {
       PropertyList.Iterator iter = pl.iterator();
       while (iter.hasNext()) {
         iter.next();
         double v = iter.getNumericValue();
         if (v == 1.0) sb.append(iter.getKey());
         else sb.append(iter.getKey() + '=' + v);
         sb.append(' ');
       }
     }
     sb.append('\n');
   }
   System.out.print(sb.toString());
   return carrier;
 }
コード例 #8
0
 public Instance pipe(Instance carrier) {
   carrier.setData(new TokenIterator((TokenSequence) carrier.getData()));
   return carrier;
 }
コード例 #9
0
  public void test(
      Transducer transducer,
      InstanceList data,
      String description,
      PrintStream viterbiOutputStream) {
    int[] ntrue = new int[segmentTags.length];
    int[] npred = new int[segmentTags.length];
    int[] ncorr = new int[segmentTags.length];

    LabelAlphabet dict = (LabelAlphabet) transducer.getInputPipe().getTargetAlphabet();

    for (int i = 0; i < data.size(); i++) {
      Instance instance = data.getInstance(i);
      Sequence input = (Sequence) instance.getData();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = transducer.viterbiPath(input).output();
      assert (predOutput.size() == trueOutput.size());

      List trueSegs = new ArrayList();
      List predSegs = new ArrayList();

      addSegs(trueSegs, trueOutput);
      addSegs(predSegs, predOutput);

      //      System.out.println("FieldF1Evaluator instance "+instance.getName ());
      //      printSegs(dict, trueSegs, "True");
      //      printSegs(dict, predSegs, "Pred");

      for (Iterator it = predSegs.iterator(); it.hasNext(); ) {
        Segment seg = (Segment) it.next();
        npred[seg.tag]++;
        if (trueSegs.contains(seg)) {
          ncorr[seg.tag]++;
        }
      }

      for (Iterator it = trueSegs.iterator(); it.hasNext(); ) {
        Segment seg = (Segment) it.next();
        ntrue[seg.tag]++;
      }
    }

    DecimalFormat f = new DecimalFormat("0.####");
    logger.info(description + " per-field F1");
    for (int tag = 0; tag < segmentTags.length; tag++) {
      double precision = ((double) ncorr[tag]) / npred[tag];
      double recall = ((double) ncorr[tag]) / ntrue[tag];
      double f1 = (2 * precision * recall) / (precision + recall);
      Label name = dict.lookupLabel(segmentTags[tag]);
      logger.info(
          " segments "
              + name
              + "  true = "
              + ntrue[tag]
              + "  pred = "
              + npred[tag]
              + "  correct = "
              + ncorr[tag]);
      logger.info(
          " precision="
              + f.format(precision)
              + " recall="
              + f.format(recall)
              + " f1="
              + f.format(f1));
    }
  }