public Instance pipe(Instance carrier) { if (!(carrier.getTarget() instanceof String)) { throw new IllegalArgumentException("Target must be of type String"); } String featuresLine = (String) carrier.getTarget(); String[] features = featuresLine.split(",?\\s+"); double[] values = new double[features.length]; Arrays.fill(values, 1.0); for (int i = 0; i < features.length; i++) { // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423" // \ if (features[i].indexOf("=") != -1) { String[] keyValuePair = features[i].split("="); features[i] = keyValuePair[0]; values[i] = Double.parseDouble(keyValuePair[1]); } // ensure that the feature has a spot in the alphabet // \ getTargetAlphabet().lookupIndex(features[i], true); } FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values); carrier.setTarget(target); return carrier; }
public static InstanceList copy(InstanceList instances) { InstanceList ret = (InstanceList) instances.clone(); // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet(); Alphabet featDict = ret.getDataAlphabet(); for (int i = 0; i < ret.size(); i++) { Instance instance = ret.get(i); Instance clone = (Instance) instance.clone(); FeatureVector fv = (FeatureVector) clone.getData(); int[] indices = fv.getIndices(); double[] values = fv.getValues(); int[] newIndices = new int[indices.length]; System.arraycopy(indices, 0, newIndices, 0, indices.length); double[] newValues = new double[indices.length]; System.arraycopy(values, 0, newValues, 0, indices.length); FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues); Instance newInstance = new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource()); ret.set(i, newInstance); } return ret; }
/**
 * Estimates the initial, emission and transition multinomials from the given training list.
 *
 * <p>On the first call (when no emission estimators exist yet) one Laplace estimator and one
 * uniform multinomial are created per state, plus a single initial-state estimator. Each
 * training instance is then run through a {@code SumLatticeDefault} with an {@code Incrementor}
 * (presumably the step that feeds counts into the estimators; confirm against Incrementor).
 * Finally the multinomials are re-estimated and each state's initial weight is set to the log
 * probability of its name under the initial multinomial.
 *
 * @param ilist training instances; data and target must both be {@code FeatureSequence}s
 * @param validation not used by this implementation
 * @param testing not used by this implementation
 * @param eval not used by this implementation
 * @return always {@code true}
 */
public boolean train(
    InstanceList ilist,
    InstanceList validation,
    InstanceList testing,
    TransducerEvaluator eval) {
  assert (ilist.size() > 0);

  if (emissionEstimator == null) {
    emissionEstimator = new Multinomial.LaplaceEstimator[numStates()];
    transitionEstimator = new Multinomial.LaplaceEstimator[numStates()];
    emissionMultinomial = new Multinomial[numStates()];
    transitionMultinomial = new Multinomial[numStates()];

    // The transition alphabet must be complete before any uniform array is sized from it.
    Alphabet stateNames = new Alphabet();
    for (int s = 0; s < numStates(); s++) {
      State state = (State) states.get(s);
      stateNames.lookupIndex(state.getName(), true);
    }

    for (int s = 0; s < numStates(); s++) {
      emissionEstimator[s] = new Multinomial.LaplaceEstimator(inputAlphabet);
      transitionEstimator[s] = new Multinomial.LaplaceEstimator(stateNames);
      emissionMultinomial[s] =
          new Multinomial(getUniformArray(inputAlphabet.size()), inputAlphabet);
      transitionMultinomial[s] =
          new Multinomial(getUniformArray(stateNames.size()), stateNames);
    }

    initialEstimator = new Multinomial.LaplaceEstimator(stateNames);
  }

  for (Instance example : ilist) {
    FeatureSequence observed = (FeatureSequence) example.getData();
    FeatureSequence labels = (FeatureSequence) example.getTarget();
    // Constructed only for its effect through the Incrementor; the lattice itself is discarded.
    new SumLatticeDefault(this, observed, labels, new Incrementor());
  }

  initialMultinomial = initialEstimator.estimate();
  for (int s = 0; s < numStates(); s++) {
    emissionMultinomial[s] = emissionEstimator[s].estimate();
    transitionMultinomial[s] = transitionEstimator[s].estimate();
    double initialLogProbability = initialMultinomial.logProbability(getState(s).getName());
    getState(s).setInitialWeight(initialLogProbability);
  }

  return true;
}
/**
 * Converts one sentence-based instance into a token-based instance list. This is needed for the
 * ME version of JET (JetMeClassifier).
 *
 * <p>Each position of the sentence's feature vector sequence becomes its own instance, labelled
 * with the label at the same position and carrying the sentence's name and source. If the label
 * sequence and the feature vector sequence differ in length, an error is printed to standard
 * error and the JVM is terminated with exit status -1.
 *
 * @param METrainerDummyPipe the pipe the resulting instance list is created with
 * @param inst the features of one sentence; its data must be a {@code FeatureVectorSequence}
 *     and its target a {@code LabelSequence}
 * @return an instance list with one instance per token of the sentence
 */
public static InstanceList convertFeatsforClassifier(
    final Pipe METrainerDummyPipe, final Instance inst) {
  final InstanceList tokenInstances = new InstanceList(METrainerDummyPipe);

  final FeatureVectorSequence tokenFeatures = (FeatureVectorSequence) inst.getData();
  final LabelSequence tokenLabels = (LabelSequence) inst.getTarget();
  final LabelAlphabet labelAlphabet = (LabelAlphabet) tokenLabels.getAlphabet();

  final Object sentenceSource = inst.getSource();
  final Object sentenceName = inst.getName();

  if (tokenLabels.size() != tokenFeatures.size()) {
    System.err.println(
        "failed making token instances: size of labelsequence != size of featue vector sequence: "
            + tokenLabels.size()
            + " - "
            + tokenFeatures.size());
    System.exit(-1);
  }

  int token = 0;
  while (token < tokenFeatures.size()) {
    tokenInstances.add(
        new Instance(
            tokenFeatures.getFeatureVector(token),
            labelAlphabet.lookupLabel(tokenLabels.get(token)),
            sentenceName,
            sentenceSource));
    token++;
  }

  return tokenInstances;
}
public SVM train(InstanceList trainingList) { svm_problem problem = new svm_problem(); problem.l = trainingList.size(); problem.x = new svm_node[problem.l][]; problem.y = new double[problem.l]; for (int i = 0; i < trainingList.size(); i++) { Instance instance = trainingList.get(i); svm_node[] input = SVM.getSvmNodes(instance); if (input == null) { continue; } int labelIndex = ((Label) instance.getTarget()).getIndex(); problem.x[i] = input; problem.y[i] = labelIndex; } int max_index = trainingList.getDataAlphabet().size(); if (param.gamma == 0 && max_index > 0) { param.gamma = 1.0 / max_index; } // int numLabels = trainingList.getTargetAlphabet().size(); // int[] weight_label = new int[numLabels]; // double[] weight = trainingList.targetLabelDistribution().getValues(); // double minValue = Double.MAX_VALUE; // // for (int i = 0; i < weight.length; i++) { // if (minValue > weight[i]) { // minValue = weight[i]; // } // } // // for (int i = 0; i < weight.length; i++) { // weight_label[i] = i; // weight[i] = weight[i] / minValue; // } // // param.weight_label = weight_label; // param.weight = weight; String error_msg = svm.svm_check_parameter(problem, param); if (error_msg != null) { System.err.print("Error: " + error_msg + "\n"); System.exit(1); } svm_model model = svm.svm_train(problem, param); classifier = new SVM(model, trainingList.getPipe()); return classifier; }
/**
 * Records which label-to-label transitions occur in the training set.
 *
 * @param trainingSet instances whose targets are {@code FeatureSequence}s of labels
 * @return a square matrix, sized by the output alphabet, where {@code [a][b]} is {@code true}
 *     if label index {@code b} directly follows label index {@code a} in at least one target
 */
private boolean[][] labelConnectionsIn(InstanceList trainingSet) {
  final int labelCount = outputAlphabet.size();
  final boolean[][] observed = new boolean[labelCount][labelCount];

  for (Instance example : trainingSet) {
    FeatureSequence labels = (FeatureSequence) example.getTarget();

    // Walk adjacent pairs (previous, current); sequences shorter than 2 contribute nothing.
    int position = 1;
    while (position < labels.size()) {
      int from = outputAlphabet.lookupIndex(labels.get(position - 1));
      int to = outputAlphabet.lookupIndex(labels.get(position));
      assert (from >= 0 && to >= 0);
      observed[from][to] = true;
      position++;
    }
  }

  return observed;
}
public Instance pipe(Instance carrier) { Sequence data = (Sequence) carrier.getData(); Sequence target = (Sequence) carrier.getTarget(); if (data.size() != target.size()) throw new IllegalArgumentException( "Trying to print into SimpleTagger format, where data and target lengths do not match\n" + "data.length = " + data.size() + ", target.length = " + target.size()); int N = data.size(); if (data instanceof TokenSequence) { throw new UnsupportedOperationException("Not yet implemented."); } else if (data instanceof FeatureVectorSequence) { FeatureVectorSequence fvs = (FeatureVectorSequence) data; Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null; for (int i = 0; i < N; i++) { Object label = target.get(i); writer.print(label); FeatureVector fv = fvs.getFeatureVector(i); for (int loc = 0; loc < fv.numLocations(); loc++) { writer.print(' '); String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString(); double value = fv.valueAtLocation(loc); // if (!Maths.almostEquals(value, 1.0)) { // throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector // not binary at time slice "+i+" fv:"+fv); // } writer.print(fname + String.valueOf(value)); } writer.println(); } } else { throw new IllegalArgumentException("Don't know how to print data of type " + data); } writer.println(); // writer.print(getDataAlphabet()); return carrier; }
/**
 * Prints a human-readable dump of a token-sequence instance to standard output.
 *
 * <p>The output starts with the optional {@code prefix} and a {@code "name: ..."} line. Then one
 * line is written per token, containing in this order: the source token text (if the source is a
 * {@code TokenSequence}), the target at that position (if the target is a {@code TokenSequence}
 * or a {@code FeatureSequence}), and every feature of the token. A feature is written as its key
 * alone when its value is 1.0, otherwise as {@code key=value}.
 *
 * @param carrier instance whose data is a {@code TokenSequence}
 * @return the unchanged carrier
 */
public Instance pipe(Instance carrier) {
  TokenSequence tokens = (TokenSequence) carrier.getData();

  Object rawTarget = carrier.getTarget();
  TokenSequence targetTokens =
      (rawTarget instanceof TokenSequence) ? (TokenSequence) rawTarget : null;
  FeatureSequence targetFeatures =
      (rawTarget instanceof FeatureSequence) ? (FeatureSequence) rawTarget : null;

  Object rawSource = carrier.getSource();
  TokenSequence sourceTokens =
      (rawSource instanceof TokenSequence) ? (TokenSequence) rawSource : null;

  StringBuilder out = new StringBuilder();
  if (prefix != null) {
    out.append(prefix);
  }
  out.append("name: ").append(carrier.getName()).append("\n");

  for (int position = 0; position < tokens.size(); position++) {
    if (sourceTokens != null) {
      out.append(sourceTokens.get(position).getText()).append(' ');
    }
    if (targetTokens != null) {
      out.append(targetTokens.get(position).getText()).append(' ');
    }
    if (targetFeatures != null) {
      out.append(targetFeatures.getObjectAtPosition(position).toString()).append(' ');
    }

    PropertyList features = tokens.get(position).getFeatures();
    if (features != null) {
      PropertyList.Iterator cursor = features.iterator();
      while (cursor.hasNext()) {
        cursor.next();
        double value = cursor.getNumericValue();
        // A value of exactly 1.0 is treated as "present" and printed as the bare key.
        out.append(cursor.getKey());
        if (value != 1.0) {
          out.append('=').append(value);
        }
        out.append(' ');
      }
    }
    out.append('\n');
  }

  System.out.print(out.toString());
  return carrier;
}