public void count() {

    TIntIntHashMap docCounts = new TIntIntHashMap();

    int index = 0;

    if (instances.size() == 0) {
        logger.info("Instance list is empty");
        return;
    }

    if (instances.get(0).getData() instanceof FeatureSequence) {

        for (Instance instance : instances) {
            FeatureSequence features = (FeatureSequence) instance.getData();

            // Tally how many times each feature occurs in this document
            for (int i = 0; i < features.getLength(); i++) {
                docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
            }

            // Fold this document's tallies into the global counts: featureCounts
            // accumulates total occurrences, documentFrequencies counts the number
            // of documents that contain each feature at least once.
            int[] keys = docCounts.keys();
            for (int i = 0; i < keys.length; i++) {
                int feature = keys[i];
                featureCounts[feature] += docCounts.get(feature);
                documentFrequencies[feature]++;
            }

            docCounts = new TIntIntHashMap();

            index++;
            if (index % 1000 == 0) {
                System.err.println(index);
            }
        }
    } else if (instances.get(0).getData() instanceof FeatureVector) {

        for (Instance instance : instances) {
            FeatureVector features = (FeatureVector) instance.getData();

            for (int location = 0; location < features.numLocations(); location++) {
                int feature = features.indexAtLocation(location);
                double value = features.valueAtLocation(location);

                documentFrequencies[feature]++;
                featureCounts[feature] += value;
            }

            index++;
            if (index % 1000 == 0) {
                System.err.println(index);
            }
        }
    } else {
        logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
    }
}
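The distinction the method above maintains is that featureCounts accumulates total occurrences while documentFrequencies counts the documents containing a feature. A minimal standalone sketch of that bookkeeping, using toy data and plain HashMaps instead of the Trove map and the class fields above:

import java.util.HashMap;
import java.util.Map;

public final class CountSketch {
    public static void main(String[] args) {
        String[][] documents = { {"a", "a", "b"}, {"b", "c"} };
        Map<String, Integer> featureCounts = new HashMap<>();
        Map<String, Integer> documentFrequencies = new HashMap<>();

        for (String[] doc : documents) {
            // Per-document tally, analogous to docCounts above
            Map<String, Integer> docCounts = new HashMap<>();
            for (String token : doc) {
                docCounts.merge(token, 1, Integer::sum);
            }
            // Fold into the global tallies
            for (Map.Entry<String, Integer> e : docCounts.entrySet()) {
                featureCounts.merge(e.getKey(), e.getValue(), Integer::sum);
                documentFrequencies.merge(e.getKey(), 1, Integer::sum);
            }
        }
        System.out.println(featureCounts);        // a=2, b=2, c=1 (order may vary)
        System.out.println(documentFrequencies);  // a=1, b=2, c=1 (order may vary)
    }
}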
private double dataLogProbability(Instance instance, int labelIndex) {
    FeatureVector fv = (FeatureVector) instance.getData();
    int fvisize = fv.numLocations();

    double logProb = 0;
    for (int fvi = 0; fvi < fvisize; fvi++) {
        logProb += fv.valueAtLocation(fvi) * p[labelIndex].logProbability(fv.indexAtLocation(fvi));
    }
    return logProb;
}
public Instance pipe(Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();
    for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        String s = t.getText();
        if (distinguishBorders) {
            s = startBorderChar + s + endBorderChar;
        }
        int slen = s.length();
        for (int j = 0; j < gramSizes.length; j++) {
            int size = gramSizes[j];
            // Add every character n-gram of this size as a binary feature,
            // prefixed to keep it distinct in the feature alphabet
            for (int k = 0; k < (slen - size) + 1; k++) {
                t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
            }
        }
    }
    return carrier;
}
public Instance pipe(Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();
    for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        String s = t.getText();
        if (distinguishBorders) {
            s = startBorderChar + s + endBorderChar;
        }
        int slen = s.length();
        for (int j = 0; j < gramSizes.length; j++) {
            int size = gramSizes[j];
            // Add every character n-gram of this size, including the final one ending
            // at the last character of the (possibly border-extended) token text
            for (int k = 0; k <= slen - size; k++) {
                t.setFeatureValue(s.substring(k, k + size), 1.0); // original was substring(k, size), changed by Fuchun
            }
        }
    }
    return carrier;
}
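A minimal standalone sketch of what the character n-gram loops in the two pipes above emit; the token text, gram size, and border characters are illustrative assumptions, not values taken from the pipes:

import java.util.ArrayList;
import java.util.List;

public final class CharNgramSketch {
    // Collect all substrings of the given length, matching the loop bound k <= s.length() - size
    static List<String> ngrams(String s, int size) {
        List<String> out = new ArrayList<>();
        for (int k = 0; k <= s.length() - size; k++) {
            out.add(s.substring(k, k + size));
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(ngrams("cat", 2));   // [ca, at]
        // With distinguishBorders and '<' / '>' as hypothetical border characters:
        System.out.println(ngrams("<cat>", 2)); // [<c, ca, at, t>]
    }
}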
public Instance pipe(Instance carrier) {
    Sequence data = (Sequence) carrier.getData();
    Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size()) {
        throw new IllegalArgumentException(
            "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
                + "data.length = " + data.size() + ", target.length = " + target.size());
    }

    int N = data.size();

    if (data instanceof TokenSequence) {
        throw new UnsupportedOperationException("Not yet implemented.");
    } else if (data instanceof FeatureVectorSequence) {

        FeatureVectorSequence fvs = (FeatureVectorSequence) data;
        Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;

        for (int i = 0; i < N; i++) {
            Object label = target.get(i);
            writer.print(label);

            FeatureVector fv = fvs.getFeatureVector(i);
            for (int loc = 0; loc < fv.numLocations(); loc++) {
                writer.print(' ');
                String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
                double value = fv.valueAtLocation(loc);
                // if (!Maths.almostEquals(value, 1.0)) {
                //     throw new IllegalArgumentException("Printing to SimpleTagger format: FeatureVector not binary at time slice " + i + " fv: " + fv);
                // }
                writer.print(fname + String.valueOf(value));
            }
            writer.println();
        }
    } else {
        throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    writer.println();

    // writer.print(getDataAlphabet());

    return carrier;
}
/**
 * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
 * FeatureVector in the instance must match the alphabet of the pipe used to train the classifier.
 *
 * @param instance to be classified. Data field must be a FeatureVector.
 * @return Classification containing the labeling of the instance
 */
public Classification classify(Instance instance) {
    // Note that the current size of the label alphabet can be larger
    // than it was at the time of training. We are careful to handle
    // those labels correctly: for example, we assume the log prior
    // probability of those classes is minus infinity.
    int numClasses = getLabelAlphabet().size();
    double[] scores = new double[numClasses];
    FeatureVector fv = (FeatureVector) instance.getData();

    // Make sure the feature vector's feature dictionary matches
    // what we are expecting from our data pipe (and thus our notion
    // of feature probabilities).
    assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
    int fvisize = fv.numLocations();

    prior.addLogProbabilities(scores);

    // Set the scores according to the feature weights and per-class probabilities
    for (int fvi = 0; fvi < fvisize; fvi++) {
        int fi = fv.indexAtLocation(fvi);
        for (int ci = 0; ci < numClasses; ci++) {
            // Guard against the data or target alphabet growing; this can happen when
            // classifying a never-before-seen feature. Ignore these.
            if (ci >= p.length || fi >= p[ci].size()) continue;

            scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
        }
    }

    // Get the scores in the range near zero, where exp() is more accurate
    double maxScore = Double.NEGATIVE_INFINITY;
    for (int ci = 0; ci < numClasses; ci++) {
        if (scores[ci] > maxScore) maxScore = scores[ci];
    }
    for (int ci = 0; ci < numClasses; ci++) {
        scores[ci] -= maxScore;
    }

    // Exponentiate and normalize
    double sum = 0;
    for (int ci = 0; ci < numClasses; ci++) {
        sum += (scores[ci] = Math.exp(scores[ci]));
    }
    for (int ci = 0; ci < numClasses; ci++) {
        scores[ci] /= sum;
    }

    // Create and return a Classification object
    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
}
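A hedged usage sketch of the classify() method above, assuming the standard MALLET cc.mallet.classify API; the class name, variable names, and pre-built InstanceLists are illustrative, not taken from this code:

import cc.mallet.classify.Classification;
import cc.mallet.classify.NaiveBayes;
import cc.mallet.classify.NaiveBayesTrainer;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

public final class NaiveBayesUsageSketch {
    public static void classifyAll(InstanceList training, InstanceList testing) {
        // Train a NaiveBayes classifier, then score each test instance with classify()
        NaiveBayes classifier = new NaiveBayesTrainer().train(training);
        for (Instance instance : testing) {
            Classification c = classifier.classify(instance);
            System.out.println(instance.getName() + " -> " + c.getLabeling().getBestLabel());
        }
    }
}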
public Instance pipe(Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();
    TokenSequence targets =
        carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
    TokenSequence source =
        carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;

    StringBuffer sb = new StringBuffer();
    if (prefix != null) {
        sb.append(prefix);
    }
    sb.append("name: " + carrier.getName() + "\n");

    for (int i = 0; i < ts.size(); i++) {
        if (source != null) {
            sb.append(source.get(i).getText());
            sb.append(' ');
        }
        if (carrier.getTarget() instanceof TokenSequence) {
            sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
            sb.append(' ');
        }
        if (carrier.getTarget() instanceof FeatureSequence) {
            sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
            sb.append(' ');
        }

        // Print each feature on the token; omit the "=value" suffix for binary features
        PropertyList pl = ts.get(i).getFeatures();
        if (pl != null) {
            PropertyList.Iterator iter = pl.iterator();
            while (iter.hasNext()) {
                iter.next();
                double v = iter.getNumericValue();
                if (v == 1.0) {
                    sb.append(iter.getKey());
                } else {
                    sb.append(iter.getKey() + '=' + v);
                }
                sb.append(' ');
            }
        }
        sb.append('\n');
    }
    System.out.print(sb.toString());
    return carrier;
}
public Instance pipe(Instance carrier) {
    carrier.setData(new TokenIterator((TokenSequence) carrier.getData()));
    return carrier;
}
public void test(
    Transducer transducer, InstanceList data, String description, PrintStream viterbiOutputStream) {

    int[] ntrue = new int[segmentTags.length];
    int[] npred = new int[segmentTags.length];
    int[] ncorr = new int[segmentTags.length];

    LabelAlphabet dict = (LabelAlphabet) transducer.getInputPipe().getTargetAlphabet();

    for (int i = 0; i < data.size(); i++) {
        Instance instance = data.getInstance(i);
        Sequence input = (Sequence) instance.getData();
        Sequence trueOutput = (Sequence) instance.getTarget();
        assert (input.size() == trueOutput.size());

        Sequence predOutput = transducer.viterbiPath(input).output();
        assert (predOutput.size() == trueOutput.size());

        // Collect the labeled segments from the true and predicted tag sequences
        List trueSegs = new ArrayList();
        List predSegs = new ArrayList();
        addSegs(trueSegs, trueOutput);
        addSegs(predSegs, predOutput);

        // System.out.println("FieldF1Evaluator instance " + instance.getName());
        // printSegs(dict, trueSegs, "True");
        // printSegs(dict, predSegs, "Pred");

        // A predicted segment counts as correct only if an identical segment
        // (same tag and boundaries) appears in the true segmentation
        for (Iterator it = predSegs.iterator(); it.hasNext(); ) {
            Segment seg = (Segment) it.next();
            npred[seg.tag]++;
            if (trueSegs.contains(seg)) {
                ncorr[seg.tag]++;
            }
        }
        for (Iterator it = trueSegs.iterator(); it.hasNext(); ) {
            Segment seg = (Segment) it.next();
            ntrue[seg.tag]++;
        }
    }

    DecimalFormat f = new DecimalFormat("0.####");
    logger.info(description + " per-field F1");

    for (int tag = 0; tag < segmentTags.length; tag++) {
        double precision = ((double) ncorr[tag]) / npred[tag];
        double recall = ((double) ncorr[tag]) / ntrue[tag];
        double f1 = (2 * precision * recall) / (precision + recall);
        Label name = dict.lookupLabel(segmentTags[tag]);

        logger.info(
            " segments " + name + " true = " + ntrue[tag]
                + " pred = " + npred[tag] + " correct = " + ncorr[tag]);
        logger.info(
            " precision=" + f.format(precision)
                + " recall=" + f.format(recall)
                + " f1=" + f.format(f1));
    }
}
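A minimal standalone sketch of the per-tag arithmetic used above; the counts are made-up illustration values, not output of the evaluator:

public final class F1Sketch {
    public static void main(String[] args) {
        int ntrue = 10, npred = 8, ncorr = 6;
        double precision = (double) ncorr / npred;                    // 0.75
        double recall = (double) ncorr / ntrue;                       // 0.6
        double f1 = (2 * precision * recall) / (precision + recall);  // ~0.6667
        System.out.println("precision=" + precision + " recall=" + recall + " f1=" + f1);
    }
}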