public boolean train(InstanceList ilist, InstanceList validation, InstanceList testing,
    TransducerEvaluator eval) {
  assert (ilist.size() > 0);
  if (emissionEstimator == null) {
    emissionEstimator = new Multinomial.LaplaceEstimator[numStates()];
    transitionEstimator = new Multinomial.LaplaceEstimator[numStates()];
    emissionMultinomial = new Multinomial[numStates()];
    transitionMultinomial = new Multinomial[numStates()];
    Alphabet transitionAlphabet = new Alphabet();
    for (int i = 0; i < numStates(); i++)
      transitionAlphabet.lookupIndex(((State) states.get(i)).getName(), true);
    for (int i = 0; i < numStates(); i++) {
      emissionEstimator[i] = new Multinomial.LaplaceEstimator(inputAlphabet);
      transitionEstimator[i] = new Multinomial.LaplaceEstimator(transitionAlphabet);
      emissionMultinomial[i] =
          new Multinomial(getUniformArray(inputAlphabet.size()), inputAlphabet);
      transitionMultinomial[i] =
          new Multinomial(getUniformArray(transitionAlphabet.size()), transitionAlphabet);
    }
    initialEstimator = new Multinomial.LaplaceEstimator(transitionAlphabet);
  }
  // Accumulate emission, transition, and initial-state counts from the training data.
  for (Instance instance : ilist) {
    FeatureSequence input = (FeatureSequence) instance.getData();
    FeatureSequence output = (FeatureSequence) instance.getTarget();
    new SumLatticeDefault(this, input, output, new Incrementor());
  }
  // Normalize the accumulated counts into probability distributions.
  initialMultinomial = initialEstimator.estimate();
  for (int i = 0; i < numStates(); i++) {
    emissionMultinomial[i] = emissionEstimator[i].estimate();
    transitionMultinomial[i] = transitionEstimator[i].estimate();
    getState(i).setInitialWeight(initialMultinomial.logProbability(getState(i).getName()));
  }
  return true;
}
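// Usage sketch (hedged): assumes this train() lives in a MALLET-style HMM whose
// instances carry FeatureSequence data and FeatureSequence targets, and which
// exposes a constructor HMM(Pipe, Pipe) and addFullyConnectedStatesForLabels()
// as in MALLET's CRF/HMM API; treat the exact names as assumptions.
public static void trainHmmExample(InstanceList trainingData) {
  HMM hmm = new HMM(trainingData.getPipe(), null);
  hmm.addFullyConnectedStatesForLabels();
  // The body above never touches validation, testing, or eval, so nulls suffice here.
  hmm.train(trainingData, null, null, null);
}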
public Instance pipe(Instance carrier) {
  if (!(carrier.getTarget() instanceof String)) {
    throw new IllegalArgumentException("Target must be of type String");
  }
  String featuresLine = (String) carrier.getTarget();
  String[] features = featuresLine.split(",?\\s+");
  double[] values = new double[features.length];
  Arrays.fill(values, 1.0);
  for (int i = 0; i < features.length; i++) {
    // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423"
    if (features[i].indexOf("=") != -1) {
      String[] keyValuePair = features[i].split("=");
      features[i] = keyValuePair[0];
      values[i] = Double.parseDouble(keyValuePair[1]);
    }
    // Ensure that the feature has a spot in the alphabet.
    getTargetAlphabet().lookupIndex(features[i], true);
  }
  FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values);
  carrier.setTarget(target);
  return carrier;
}
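// Usage sketch (hedged): assumes the pipe above belongs to a MALLET Pipe subclass
// that owns the target alphabet; `targetPipe` is an illustrative parameter name.
// The target string "POS=0.7 NEG" becomes a FeatureVector with POS=0.7 and NEG=1.0.
public static void targetStringExample(Pipe targetPipe) {
  Instance inst = new Instance("some text", "POS=0.7 NEG", "doc1", null);
  Instance piped = targetPipe.pipe(inst);
  System.out.println(piped.getTarget()); // FeatureVector over the target alphabet
}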
/**
 * Converts a sentence-based instance into a token-based instance list. This is needed for the
 * ME-version of JET (JetMeClassifier).
 *
 * @param METrainerDummyPipe the pipe the resulting token instances are associated with
 * @param inst just the features for one sentence to be transformed
 * @return an instance list with one instance per token of the input sentence
 */
public static InstanceList convertFeatsforClassifier(
    final Pipe METrainerDummyPipe, final Instance inst) {
  final InstanceList iList = new InstanceList(METrainerDummyPipe);
  final FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();
  final LabelSequence ls = (LabelSequence) inst.getTarget();
  final LabelAlphabet ldict = (LabelAlphabet) ls.getAlphabet();
  final Object source = inst.getSource();
  final Object name = inst.getName();
  if (ls.size() != fvs.size()) {
    System.err.println(
        "failed making token instances: size of label sequence != size of feature vector sequence: "
            + ls.size() + " - " + fvs.size());
    System.exit(-1);
  }
  for (int j = 0; j < fvs.size(); j++) {
    final Instance tokenInst =
        new Instance(fvs.getFeatureVector(j), ldict.lookupLabel(ls.get(j)), name, source);
    iList.add(tokenInst);
  }
  return iList;
}
public void split() {
  if (m_ilist == null) throw new IllegalStateException("Frozen. Cannot split.");
  int numLeftChildren = 0;
  boolean[] toLeftChild = new boolean[m_instIndices.length];
  for (int i = 0; i < m_instIndices.length; i++) {
    Instance instance = m_ilist.get(m_instIndices[i]);
    FeatureVector fv = (FeatureVector) instance.getData();
    if (fv.value(m_gainRatio.getMaxValuedIndex()) <= m_gainRatio.getMaxValuedThreshold()) {
      toLeftChild[i] = true;
      numLeftChildren++;
    } else {
      toLeftChild[i] = false;
    }
  }
  logger.info("leftChild.size=" + numLeftChildren
      + " rightChild.size=" + (m_instIndices.length - numLeftChildren));
  int[] leftIndices = new int[numLeftChildren];
  int[] rightIndices = new int[m_instIndices.length - numLeftChildren];
  int li = 0, ri = 0;
  for (int i = 0; i < m_instIndices.length; i++) {
    if (toLeftChild[i]) leftIndices[li++] = m_instIndices[i];
    else rightIndices[ri++] = m_instIndices[i];
  }
  m_leftChild = new Node(m_ilist, this, m_minNumInsts, leftIndices);
  m_rightChild = new Node(m_ilist, this, m_minNumInsts, rightIndices);
}
@Override
public Instance pipe(Instance carrier) {
  Arg1RankInstance instance = (Arg1RankInstance) carrier;
  Document document = (Document) instance.getData();
  List<Pair<Integer, Integer>> candidates = instance.getCandidates();
  int connStart = instance.getConnStart();
  int connEnd = instance.getConnEnd();
  int arg2Line = instance.getArg2Line();
  int arg2HeadPos = instance.getArg2HeadPos();

  FeatureVector[] fvs = new FeatureVector[candidates.size()];
  for (int i = 0; i < candidates.size(); i++) {
    Pair<Integer, Integer> candidate = candidates.get(i);
    PropertyList pl = null;
    pl = addBaselineFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
    pl = addConstituentFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
    pl = addDependencyFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
    // pl = addLexicoSyntacticFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart,
    //     connEnd);
    fvs[i] = new FeatureVector(getDataAlphabet(), pl, true, true);
  }

  // Set the target label to the index of the true Arg1 candidate.
  LabelAlphabet ldict = (LabelAlphabet) getTargetAlphabet();
  carrier.setTarget(ldict.lookupLabel(String.valueOf(instance.getTrueArg1Candidate())));
  carrier.setData(new FeatureVectorSequence(fvs));
  return carrier;
}
public static InstanceList scale(InstanceList trainingList, double lower, double upper) {
  InstanceList ret = copy(trainingList);
  Alphabet featDict = ret.getDataAlphabet();
  double[] feat_max = new double[featDict.size()];
  double[] feat_min = new double[featDict.size()];
  for (int i = 0; i < feat_max.length; i++) {
    feat_max[i] = -Double.MAX_VALUE;
    feat_min[i] = Double.MAX_VALUE;
  }
  // First pass: record each feature's observed minimum and maximum.
  for (int i = 0; i < ret.size(); i++) {
    Instance inst = ret.get(i);
    FeatureVector fv = (FeatureVector) inst.getData();
    for (int loc = 0; loc < fv.numLocations(); loc++) {
      int featId = fv.indexAtLocation(loc);
      double value = fv.valueAtLocation(loc);
      feat_max[featId] = Math.max(value, feat_max[featId]);
      feat_min[featId] = Math.min(value, feat_min[featId]);
    }
  }
  // Second pass: rescale every value into [lower, upper].
  for (int i = 0; i < ret.size(); i++) {
    Instance inst = ret.get(i);
    FeatureVector fv = (FeatureVector) inst.getData();
    for (int loc = 0; loc < fv.numLocations(); loc++) {
      int featId = fv.indexAtLocation(loc);
      double value = fv.valueAtLocation(loc);
      double maxValue = feat_max[featId];
      double minValue = feat_min[featId];
      double newValue;
      if (maxValue == minValue) {
        newValue = value; // constant feature: leave unchanged
      } else if (value == minValue) {
        newValue = lower;
      } else if (value == maxValue) {
        newValue = upper;
      } else {
        newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
      }
      fv.setValueAtLocation(loc, newValue);
    }
  }
  return ret;
}
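// Usage sketch (hedged): assumes this sketch lives in the same class as scale() and
// copy(), and that the InstanceList carries FeatureVector data (e.g. built via
// MALLET's TokenSequence2FeatureSequence + FeatureSequence2FeatureVector pipes).
public static void scaleExample(InstanceList rawInstances) {
  // Map every feature into [0, 1]; [-1, 1] is the common libsvm-style alternative.
  InstanceList scaled = scale(rawInstances, 0.0, 1.0);
  System.out.println("scaled " + scaled.size() + " instances");
}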
public void count() {
  TIntIntHashMap docCounts = new TIntIntHashMap();
  int index = 0;
  if (instances.size() == 0) {
    logger.info("Instance list is empty");
    return;
  }
  if (instances.get(0).getData() instanceof FeatureSequence) {
    for (Instance instance : instances) {
      FeatureSequence features = (FeatureSequence) instance.getData();
      for (int i = 0; i < features.getLength(); i++) {
        docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
      }
      // Fold this document's counts into the global feature and
      // document-frequency totals, covering every key.
      int[] keys = docCounts.keys();
      for (int i = 0; i < keys.length; i++) {
        int feature = keys[i];
        featureCounts[feature] += docCounts.get(feature);
        documentFrequencies[feature]++;
      }
      docCounts = new TIntIntHashMap();
      index++;
      if (index % 1000 == 0) {
        System.err.println(index);
      }
    }
  } else if (instances.get(0).getData() instanceof FeatureVector) {
    for (Instance instance : instances) {
      FeatureVector features = (FeatureVector) instance.getData();
      for (int location = 0; location < features.numLocations(); location++) {
        int feature = features.indexAtLocation(location);
        double value = features.valueAtLocation(location);
        documentFrequencies[feature]++;
        featureCounts[feature] += value;
      }
      index++;
      if (index % 1000 == 0) {
        System.err.println(index);
      }
    }
  } else {
    logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
  }
}
public Instance pipe(Instance carrier) {
  if (carrier.getData() instanceof File) {
    try {
      // Read the whole file into a single string.
      File file = (File) carrier.getData();
      @SuppressWarnings("resource")
      String txt = new LineReader(new FileInputStream(file)).getText("\n");
      // Update instance values.
      carrier.setData(new TokenSequence(addRegexes(txt)));
      carrier.setSource(txt + " [file:" + file.getName() + "]");
    } catch (java.io.IOException e) {
      throw new IllegalArgumentException("IOException " + e);
    }
  } else if (carrier.getData() instanceof String) {
    String txt = (String) carrier.getData();
    carrier.setData(new TokenSequence(addRegexes(txt)));
    carrier.setSource(txt);
  } else {
    throw new IllegalArgumentException(
        "Data must be a File or a String, got " + carrier.getData());
  }
  return carrier;
}
public SVM train(InstanceList trainingList) {
  // Collect the usable rows first: getSvmNodes() may return null, and a null
  // row left inside svm_problem would crash svm_train.
  List<svm_node[]> inputs = new ArrayList<svm_node[]>();
  List<Double> labels = new ArrayList<Double>();
  for (int i = 0; i < trainingList.size(); i++) {
    Instance instance = trainingList.get(i);
    svm_node[] input = SVM.getSvmNodes(instance);
    if (input == null) {
      continue;
    }
    int labelIndex = ((Label) instance.getTarget()).getIndex();
    inputs.add(input);
    labels.add((double) labelIndex);
  }

  svm_problem problem = new svm_problem();
  problem.l = inputs.size();
  problem.x = inputs.toArray(new svm_node[problem.l][]);
  problem.y = new double[problem.l];
  for (int i = 0; i < problem.l; i++) problem.y[i] = labels.get(i);

  int max_index = trainingList.getDataAlphabet().size();
  if (param.gamma == 0 && max_index > 0) {
    param.gamma = 1.0 / max_index;
  }

  // Optional per-class weighting (disabled):
  // int numLabels = trainingList.getTargetAlphabet().size();
  // int[] weight_label = new int[numLabels];
  // double[] weight = trainingList.targetLabelDistribution().getValues();
  // double minValue = Double.MAX_VALUE;
  // for (int i = 0; i < weight.length; i++) {
  //   if (minValue > weight[i]) minValue = weight[i];
  // }
  // for (int i = 0; i < weight.length; i++) {
  //   weight_label[i] = i;
  //   weight[i] = weight[i] / minValue;
  // }
  // param.weight_label = weight_label;
  // param.weight = weight;

  String error_msg = svm.svm_check_parameter(problem, param);
  if (error_msg != null) {
    System.err.print("Error: " + error_msg + "\n");
    System.exit(1);
  }

  svm_model model = svm.svm_train(problem, param);
  classifier = new SVM(model, trainingList.getPipe());
  return classifier;
}
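// Usage sketch (hedged): `param` (a libsvm svm_parameter) must be configured before
// train() runs. SVMClassifierTrainer is a hypothetical name for the surrounding
// wrapper class, and this assumes SVM extends MALLET's Classifier.
public static void svmExample(InstanceList training) {
  SVMClassifierTrainer trainer = new SVMClassifierTrainer(); // hypothetical wrapper
  SVM svm = trainer.train(training);
  Classification c = svm.classify(training.get(0));
  System.out.println("predicted: " + c.getLabeling().getBestLabel());
}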
public Instance pipe(Instance carrier) {
  if (carrier.getData() instanceof String) {
    String data = (String) carrier.getData();
    // Strip HTML markup, keeping only the visible text.
    String cleanedText = Jsoup.parse(data).text();
    carrier.setData(cleanedText);
  } else {
    throw new IllegalArgumentException(
        "This pipe expects a String, found a " + carrier.getData().getClass());
  }
  return carrier;
}
private boolean[][] labelConnectionsIn(InstanceList trainingSet) {
  int numLabels = outputAlphabet.size();
  boolean[][] connections = new boolean[numLabels][numLabels];
  for (Instance instance : trainingSet) {
    FeatureSequence output = (FeatureSequence) instance.getTarget();
    for (int j = 1; j < output.size(); j++) {
      int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
      int destIndex = outputAlphabet.lookupIndex(output.get(j));
      assert (sourceIndex >= 0 && destIndex >= 0);
      connections[sourceIndex][destIndex] = true;
    }
  }
  return connections;
}
public InstanceList readArray(String[] cleanTexts) {
  StringArrayIterator iterator = new StringArrayIterator(cleanTexts);
  // Construct a new instance list, passing it the pipe we want to use to
  // process instances.
  InstanceList instances = new InstanceList(pipe);
  // Process each instance provided by the iterator first: the list is empty
  // until addThruPipe() has run, so names and targets are set afterwards.
  instances.addThruPipe(iterator);
  int index = 0;
  for (Instance inst : instances) {
    inst.setName(name_id.get(index));
    inst.setTarget("english");
    index++;
  }
  return instances;
}
/**
 * Process each sentence to add the feature if necessary.
 *
 * @param carrier Instance to be processed.
 * @return Instance with new features.
 */
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    char[] text = t.getText().toCharArray();
    int numDigit = 0;
    for (int k = 0; k < text.length; k++) {
      if (Character.isDigit(text[k])) {
        numDigit++;
      }
    }
    if (numDigit == 1) {
      t.setFeatureValue("SingleDigit", 1.0);
    } else if (numDigit == 2) {
      t.setFeatureValue("TwoDigit", 1.0);
    } else if (numDigit == 3) {
      t.setFeatureValue("ThreeDigit", 1.0);
    } else if (numDigit >= 4) {
      t.setFeatureValue("MoreDigit", 1.0);
    }
  }
  return carrier;
}
public Classification classify(Instance instance) {
  FeatureVector fv = (FeatureVector) instance.getData();
  assert (instancePipe == null || fv.getAlphabet() == this.instancePipe.getDataAlphabet());
  Node leaf = getLeaf(m_root, fv);
  return new Classification(instance, this, leaf.getGainRatio().getBaseLabelDistribution());
}
public Instance pipe(Instance carrier) {
  Sequence data = (Sequence) carrier.getData();
  Sequence target = (Sequence) carrier.getTarget();
  if (data.size() != target.size())
    throw new IllegalArgumentException(
        "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
            + "data.length = " + data.size() + ", target.length = " + target.size());
  int N = data.size();
  if (data instanceof TokenSequence) {
    throw new UnsupportedOperationException("Not yet implemented.");
  } else if (data instanceof FeatureVectorSequence) {
    FeatureVectorSequence fvs = (FeatureVectorSequence) data;
    Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;
    for (int i = 0; i < N; i++) {
      Object label = target.get(i);
      writer.print(label);
      FeatureVector fv = fvs.getFeatureVector(i);
      for (int loc = 0; loc < fv.numLocations(); loc++) {
        writer.print(' ');
        String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
        double value = fv.valueAtLocation(loc);
        // if (!Maths.almostEquals(value, 1.0)) {
        //   throw new IllegalArgumentException("Printing to SimpleTagger format: FeatureVector "
        //       + "not binary at time slice " + i + " fv: " + fv);
        // }
        writer.print(fname + String.valueOf(value));
      }
      writer.println();
    }
  } else {
    throw new IllegalArgumentException(
        "Don't know how to print data of type " + data.getClass().getName());
  }
  writer.println();
  // writer.print(getDataAlphabet());
  return carrier;
}
public void addInstance(Instance instance) {
  Sequence input = (Sequence) instance.getData();
  for (int j = 0; j < input.size(); j++) {
    FeatureVector fv = (FeatureVector) input.get(j);
    addToken(fv);
    // log.info("token:" + j + " " + fv.toString(true));
  }
}
public Instance pipe(Instance carrier) {
  try {
    if (carrier.getData() instanceof URI) carrier.setData(pipe((URI) carrier.getData()));
    else if (carrier.getData() instanceof File) carrier.setData(pipe((File) carrier.getData()));
    else if (carrier.getData() instanceof Reader) carrier.setData(pipe((Reader) carrier.getData()));
    else if (carrier.getData() instanceof CharSequence) {
      // No conversion necessary.
    } else {
      throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass());
    }
  } catch (java.io.IOException e) {
    throw new IllegalArgumentException("IOException " + e);
  }
  // System.out.println(carrier.getData().toString());
  return carrier;
}
public static InstanceList copy(InstanceList instances) {
  InstanceList ret = (InstanceList) instances.clone();
  // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet();
  Alphabet featDict = ret.getDataAlphabet();
  // Deep-copy each instance's FeatureVector so the copy can be modified
  // without touching the original list.
  for (int i = 0; i < ret.size(); i++) {
    Instance instance = ret.get(i);
    Instance clone = (Instance) instance.clone();
    FeatureVector fv = (FeatureVector) clone.getData();
    int[] indices = fv.getIndices();
    double[] values = fv.getValues();
    int[] newIndices = new int[indices.length];
    System.arraycopy(indices, 0, newIndices, 0, indices.length);
    double[] newValues = new double[indices.length];
    System.arraycopy(values, 0, newValues, 0, indices.length);
    FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues);
    Instance newInstance =
        new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource());
    ret.set(i, newInstance);
  }
  return ret;
}
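// Design note: scale(...) above relies on this deep copy. Because
// FeatureVector.setValueAtLocation mutates vectors in place, copying the
// index/value arrays is what keeps the caller's original InstanceList intact.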
private double dataLogProbability(Instance instance, int labelIndex) {
  FeatureVector fv = (FeatureVector) instance.getData();
  int fvisize = fv.numLocations();
  double logProb = 0;
  // log P(data | label) = sum over observed features of value * log p(feature | label)
  for (int fvi = 0; fvi < fvisize; fvi++)
    logProb += fv.valueAtLocation(fvi) * p[labelIndex].logProbability(fv.indexAtLocation(fvi));
  return logProb;
}
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  // xxx This doesn't seem so efficient. Perhaps have TokenSequence
  // use a LinkedList, and remove Tokens from it? -?
  // But a LinkedList implementation of TokenSequence would be quite inefficient. -AKM
  TokenSequence ret = new TokenSequence();
  Token prevToken = null;
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    if (!stoplist.contains(caseSensitive ? t.getText() : t.getText().toLowerCase())) {
      // xxx Should we instead make and add a copy of the Token?
      ret.add(t);
      prevToken = t;
    } else if (markDeletions && prevToken != null)
      prevToken.setProperty(FeatureSequenceWithBigrams.deletionMark, t.getText());
  }
  carrier.setData(ret);
  return carrier;
}
@Override
public Instance pipe(Instance carrier) {
  if (synonymMap == null) {
    readSynonymFile();
  }
  TokenSequence in = (TokenSequence) carrier.getData();
  for (Token token : in) {
    if (synonymMap.containsKey(token.getText())) {
      token.setText(synonymMap.get(token.getText()));
    }
  }
  return carrier;
}
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  TokenSequence source =
      carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
  StringBuilder sb = new StringBuilder();
  if (prefix != null) sb.append(prefix);
  sb.append("name: " + carrier.getName() + "\n");
  for (int i = 0; i < ts.size(); i++) {
    if (source != null) {
      sb.append(source.get(i).getText());
      sb.append(' ');
    }
    if (carrier.getTarget() instanceof TokenSequence) {
      sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
      sb.append(' ');
    }
    if (carrier.getTarget() instanceof FeatureSequence) {
      sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
      sb.append(' ');
    }
    PropertyList pl = ts.get(i).getFeatures();
    if (pl != null) {
      PropertyList.Iterator iter = pl.iterator();
      while (iter.hasNext()) {
        iter.next();
        double v = iter.getNumericValue();
        if (v == 1.0) sb.append(iter.getKey());
        else sb.append(iter.getKey() + '=' + v);
        sb.append(' ');
      }
    }
    sb.append('\n');
  }
  System.out.print(sb.toString());
  return carrier;
}
public double dataLogLikelihood(InstanceList ilist) {
  double logLikelihood = 0;
  for (int ii = 0; ii < ilist.size(); ii++) {
    double instanceWeight = ilist.getInstanceWeight(ii);
    Instance inst = ilist.get(ii);
    Labeling labeling = inst.getLabeling();
    if (labeling != null)
      logLikelihood += instanceWeight * dataLogProbability(inst, labeling.getBestIndex());
    else {
      Labeling predicted = this.classify(inst).getLabeling();
      // System.err.println ("label = \n"+labeling);
      // System.err.println ("predicted = \n"+predicted);
      for (int lpos = 0; lpos < predicted.numLocations(); lpos++) {
        int li = predicted.indexAtLocation(lpos);
        double labelWeight = predicted.valueAtLocation(lpos);
        // System.err.print (", "+labelWeight);
        if (labelWeight == 0) continue;
        logLikelihood += instanceWeight * labelWeight * dataLogProbability(inst, li);
      }
    }
  }
  return logLikelihood;
}
@Test
public void testLoadRareWords() throws UnsupportedEncodingException, FileNotFoundException {
  String dataset_fn = "src/main/resources/datasets/SmallTexts.txt";
  InstanceList nonPrunedInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 0);
  System.out.println(LDAUtils.instancesToString(nonPrunedInstances));
  System.out.println("Non pruned Alphabet size: " + nonPrunedInstances.getDataAlphabet().size());
  System.out.println("No. instances: " + nonPrunedInstances.size());

  InstanceList originalInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 2);
  System.out.println("Alphabet size: " + originalInstances.getDataAlphabet().size());
  System.out.println(LDAUtils.instancesToString(originalInstances));
  System.out.println("No. instances: " + originalInstances.size());

  int[] wordCounts = {0, 3, 3, 0, 0};
  int idx = 0;
  for (Instance instance : originalInstances) {
    FeatureSequence fs = (FeatureSequence) instance.getData();
    // Asserting on fs.getFeatures().length would fail here: even though the
    // feature sequence is "empty", the underlying array is 2 long.
    // assertEquals(wordCounts[idx++], fs.getFeatures().length);
    assertEquals(wordCounts[idx++], fs.size());
  }
}
public double labelLogLikelihood(InstanceList ilist) {
  double logLikelihood = 0;
  for (int ii = 0; ii < ilist.size(); ii++) {
    double instanceWeight = ilist.getInstanceWeight(ii);
    Instance inst = ilist.get(ii);
    Labeling labeling = inst.getLabeling();
    if (labeling == null) continue;
    Labeling predicted = this.classify(inst).getLabeling();
    // System.err.println ("label = \n"+labeling);
    // System.err.println ("predicted = \n"+predicted);
    if (labeling.numLocations() == 1) {
      logLikelihood += instanceWeight * Math.log(predicted.value(labeling.getBestIndex()));
    } else {
      for (int lpos = 0; lpos < labeling.numLocations(); lpos++) {
        int li = labeling.indexAtLocation(lpos);
        double labelWeight = labeling.valueAtLocation(lpos);
        // System.err.print (", "+labelWeight);
        if (labelWeight == 0) continue;
        logLikelihood += instanceWeight * labelWeight * Math.log(predicted.value(li));
      }
    }
  }
  return logLikelihood;
}
public Instance pipe(Instance carrier) {
  Object inputData = carrier.getData();
  Alphabet features = getDataAlphabet();
  LabelAlphabet labels;
  LabelSequence target = null;
  String[][] tokens;
  if (inputData instanceof String) tokens = parseSentence((String) inputData);
  else if (inputData instanceof String[][]) tokens = (String[][]) inputData;
  else throw new IllegalArgumentException("Not a String or String[][]; got " + inputData);
  FeatureVector[] fvs = new FeatureVector[tokens.length];
  if (isTargetProcessing()) {
    labels = (LabelAlphabet) getTargetAlphabet();
    target = new LabelSequence(labels, tokens.length);
  }
  for (int l = 0; l < tokens.length; l++) {
    int nFeatures;
    if (isTargetProcessing()) {
      if (tokens[l].length < 1)
        throw new IllegalStateException(
            "Missing label at line " + l + " instance " + carrier.getName());
      // The last token on each line is the label; the rest are features.
      nFeatures = tokens[l].length - 1;
      target.add(tokens[l][nFeatures]);
    } else nFeatures = tokens[l].length;
    int[] featureIndices = new int[nFeatures];
    for (int f = 0; f < nFeatures; f++) featureIndices[f] = features.lookupIndex(tokens[l][f]);
    fvs[l] = featureInductionOption.value
        ? new AugmentableFeatureVector(features, featureIndices, null, featureIndices.length)
        : new FeatureVector(features, featureIndices);
  }
  carrier.setData(new FeatureVectorSequence(fvs));
  if (isTargetProcessing()) carrier.setTarget(target);
  else carrier.setTarget(new LabelSequence(getTargetAlphabet()));
  return carrier;
}
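// Usage sketch (hedged): the body above follows MALLET's SimpleTagger input
// convention, where each line of a sentence is "FEATURE ... FEATURE LABEL".
// `sentencePipe` stands for an instance of the surrounding pipe class.
public static void simpleTaggerFormatExample(Pipe sentencePipe) {
  String sentence = "CAPITALIZED SUFFIX=ion B-NP\nlowercase I-NP";
  Instance piped = sentencePipe.pipe(new Instance(sentence, null, "sent0", null));
  FeatureVectorSequence data = (FeatureVectorSequence) piped.getData();
  System.out.println("tokens: " + data.size()); // 2, one FeatureVector per line
}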
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    String s = t.getText();
    if (distinguishBorders) s = startBorderChar + s + endBorderChar;
    int slen = s.length();
    for (int j = 0; j < gramSizes.length; j++) {
      int size = gramSizes[j];
      // Emit every character n-gram of this size, including the one ending
      // at the last character.
      for (int k = 0; k < (slen - size) + 1; k++)
        t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
    }
  }
  return carrier;
}
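// Worked example (illustrative values): with gramSizes = {2}, prefix = "CHARBIGRAM=",
// and distinguishBorders off, the token "cat" yields exactly two features,
// "CHARBIGRAM=ca" and "CHARBIGRAM=at", each with value 1.0.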
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    String s = t.getText();
    if (distinguishBorders) s = startBorderChar + s + endBorderChar;
    int slen = s.length();
    for (int j = 0; j < gramSizes.length; j++) {
      int size = gramSizes[j];
      // NOTE: the bound k < slen - size skips the n-gram ending at the last
      // character; the variant above uses (slen - size) + 1 and keeps it.
      for (int k = 0; k < slen - size; k++)
        t.setFeatureValue(s.substring(k, k + size), 1.0);
      // original was substring(k, size), changed by Fuchun
    }
  }
  return carrier;
}
/**
 * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
 * featureVector of the instance must match the alphabet of the pipe used to train the classifier.
 *
 * @param instance to be classified. Data field must be a FeatureVector.
 * @return Classification containing the labeling of the instance
 */
public Classification classify(Instance instance) {
  // Note that the current size of the label alphabet can be larger
  // than it was at the time of training. We are careful to handle
  // those labels correctly here. For example, we assume the log prior
  // probability of those classes is minus infinity.
  int numClasses = getLabelAlphabet().size();
  double[] scores = new double[numClasses];
  FeatureVector fv = (FeatureVector) instance.getData();
  // Make sure the feature vector's feature dictionary matches
  // what we are expecting from our data pipe (and thus our notion
  // of feature probabilities).
  assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
  int fvisize = fv.numLocations();
  prior.addLogProbabilities(scores);
  // Set the scores according to the feature weights and per-class probabilities.
  for (int fvi = 0; fvi < fvisize; fvi++) {
    int fi = fv.indexAtLocation(fvi);
    for (int ci = 0; ci < numClasses; ci++) {
      // Guard against the data alphabet or target alphabet growing; this can
      // happen when classifying a never-before-seen feature. Ignore these.
      if (ci >= p.length || fi >= p[ci].size()) continue;
      scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
    }
  }
  // Get the scores in the range near zero, where exp() is more accurate.
  double maxScore = Double.NEGATIVE_INFINITY;
  for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci];
  for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore;
  // Exponentiate and normalize.
  double sum = 0;
  for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci]));
  for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum;
  // Create and return a Classification object.
  return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
}
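// Usage sketch (hedged): the body above matches MALLET's NaiveBayes.classify, so the
// standard MALLET training loop applies; NaiveBayesTrainer and Classification are
// real MALLET classes, while the instances themselves are assumed to exist.
public static void naiveBayesExample(InstanceList training, Instance unseen) {
  NaiveBayesTrainer trainer = new NaiveBayesTrainer();
  NaiveBayes nb = trainer.train(training);
  Classification c = nb.classify(unseen);
  System.out.println("best label: " + c.getLabeling().getBestLabel());
}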
public Instance pipe(Instance carrier) {
  try {
    if (carrier.getData() instanceof URI) {
      carrier.setData(pipe((URI) carrier.getData()));
    } else if (carrier.getData() instanceof File) {
      carrier.setData(pipe((File) carrier.getData()));
    } else if (carrier.getData() instanceof Reader) {
      carrier.setData(pipe((Reader) carrier.getData()));
    } else if (carrier.getData() instanceof CharSequence) {
      // No conversion necessary.
    } else {
      throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass());
    }
    if (this.labelsInText) {
      String str = ((CharSequence) carrier.getData()).toString();
      if (str.startsWith("[")) {
        // Remove the initial '[' and split the labels from the text at the ']'.
        String[] labelsBoundary = str.substring(1).split("]", 2);
        // String[] labelStrs = labelsBoundary[0].trim().split("[ \\t]");
        carrier.setData(labelsBoundary[1].trim());
        carrier.setTarget(labelsBoundary[0].trim()); // homer to do
      }
    }
  } catch (java.io.IOException e) {
    throw new IllegalArgumentException("IOException " + e);
  }
  // System.out.println(carrier.getData().toString());
  return carrier;
}