@SuppressWarnings({"unchecked"}) @Override protected void fillFeatures( Pair<Mention, ClusteredMention> input, Counter<Feature> inFeatures, Boolean output, Counter<Feature> outFeatures) { // --Input Features for (Object o : ACTIVE_FEATURES) { if (o instanceof Class) { // (case: singleton feature) Option<Double> count = new Option<Double>(1.0); Feature feat = feature((Class) o, input, count); if (count.get() > 0.0) { inFeatures.incrementCount(feat, count.get()); } } else if (o instanceof Pair) { // (case: pair of features) Pair<Class, Class> pair = (Pair<Class, Class>) o; Option<Double> countA = new Option<Double>(1.0); Option<Double> countB = new Option<Double>(1.0); Feature featA = feature(pair.getFirst(), input, countA); Feature featB = feature(pair.getSecond(), input, countB); if (countA.get() * countB.get() > 0.0) { inFeatures.incrementCount( new Feature.PairFeature(featA, featB), countA.get() * countB.get()); } } } // --Output Features if (output != null) { outFeatures.incrementCount(new Feature.CoreferentIndicator(output), 1.0); } }
public List<ClusteredMention> runCoreference(Document doc) {
  // --Overhead
  startTrack("Testing " + doc.id);
  // (variables)
  List<ClusteredMention> rtn = new ArrayList<ClusteredMention>(doc.getMentions().size());
  List<Mention> mentions = doc.getMentions();
  int singletons = 0;

  // --Run Classifier
  for (int i = 0; i < mentions.size(); i++) {
    // (variables)
    Mention onPrix = mentions.get(i);
    int coreferentWith = -1;
    // (find the closest preceding mention classified as coreferent)
    for (int j = i - 1; j >= 0; j--) {
      ClusteredMention cand = rtn.get(j);
      boolean coreferent = classifier.classOf(
          new RVFDatum<Boolean, Feature>(extractor.extractFeatures(Pair.make(onPrix, cand))));
      if (coreferent) {
        coreferentWith = j;
        break;
      }
    }
    if (coreferentWith < 0) {
      singletons += 1;
      rtn.add(onPrix.markSingleton());
    } else {
      // log("Mention " + onPrix + " coreferent with " + mentions.get(coreferentWith));
      rtn.add(onPrix.markCoreferent(rtn.get(coreferentWith)));
    }
  }
  // log("" + singletons + " singletons");

  // --Return
  endTrack("Testing " + doc.id);
  return rtn;
}
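// A hedged usage sketch: train, then decode each test document. The harness
// class name (ClassifierBased) and the variables here are assumptions; only
// train(...) and runCoreference(...) come from the code above.
void demoCoreference(
    ClassifierBased system,
    Collection<Pair<Document, List<Entity>>> trainingData,
    List<Document> testDocuments) {
  system.train(trainingData);
  for (Document doc : testDocuments) {
    List<ClusteredMention> predicted = system.runCoreference(doc);
  }
}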
private <E> Feature feature(
    Class<E> clazz, Pair<Mention, ClusteredMention> input, Option<Double> count) {
  // --Variables
  Mention onPrix = input.getFirst(); // the first mention (m_i in the handout)
  Mention candidate = input.getSecond().mention; // the second mention (m_j in the handout)
  Entity candidateCluster = input.getSecond().entity; // the cluster containing the second mention

  // --Features
  if (clazz.equals(Feature.ExactMatch.class)) {
    // (exact string match)
    return new Feature.ExactMatch(onPrix.gloss().equals(candidate.gloss()));
  } else if (clazz.equals(Feature.SentenceDist.class)) {
    // (distance between the two mentions, in sentences)
    return new Feature.SentenceDist(
        Math.abs(
            onPrix.doc.indexOfSentence(onPrix.sentence)
                - candidate.doc.indexOfSentence(candidate.sentence)));
  } else if (clazz.equals(Feature.MentionDist.class)) {
    // (distance between the two mentions, in mentions)
    return new Feature.MentionDist(
        Math.abs(
            onPrix.doc.indexOfMention(onPrix) - candidate.doc.indexOfMention(candidate)));
  } else if (clazz.equals(Feature.EitherHeadWordPronoun.class)) {
    return new Feature.EitherHeadWordPronoun(
        Pronoun.isSomePronoun(onPrix.gloss()) || Pronoun.isSomePronoun(candidate.gloss()));
  } else if (clazz.equals(Feature.CandidateNERTag.class)) {
    return new Feature.CandidateNERTag(candidate.headToken().nerTag());
  } else if (clazz.equals(Feature.CandidateSpeaker.class)) {
    return new Feature.CandidateSpeaker(candidate.headToken().speaker());
  } else if (clazz.equals(Feature.FixedSpeaker.class)) {
    return new Feature.FixedSpeaker(onPrix.headToken().speaker());
  } else if (clazz.equals(Feature.HeadWordMatch.class)) {
    return new Feature.HeadWordMatch(onPrix.headWord().equals(candidate.headWord()));
  } else if (clazz.equals(Feature.HeadWordLemmaMatch.class)) {
    return new Feature.HeadWordLemmaMatch(
        onPrix.headToken().lemma().equals(candidate.headToken().lemma()));
  } else if (clazz.equals(Feature.FixedNERTag.class)) {
    return new Feature.FixedNERTag(onPrix.headToken().nerTag());
  } else if (clazz.equals(Feature.SpeakerMatch.class)) {
    return new Feature.SpeakerMatch(
        candidate.headToken().speaker().equals(onPrix.headToken().speaker()));
  } else if (clazz.equals(Feature.NERTagMatch.class)) {
    return new Feature.NERTagMatch(
        candidate.headToken().nerTag().equals(onPrix.headToken().nerTag()));
  } else if (clazz.equals(Feature.CandidatePOSTag.class)) {
    return new Feature.CandidatePOSTag(candidate.headToken().posTag());
  } else if (clazz.equals(Feature.FixedPOSTag.class)) {
    return new Feature.FixedPOSTag(onPrix.headToken().posTag());
  } else if (clazz.equals(Feature.GenderMatch.class)) {
    // (true unless both mentions have a known gender and the genders differ)
    Pair<Boolean, Boolean> match = Util.haveGenderAndAreSameGender(onPrix, candidate);
    boolean finalMatch = (!match.getFirst() || match.getSecond());
    return new Feature.GenderMatch(finalMatch);
  } else if (clazz.equals(Feature.NumberMatch.class)) {
    // (true unless both mentions have a known number and the numbers differ)
    Pair<Boolean, Boolean> match = Util.haveNumberAndAreSameNumber(onPrix, candidate);
    boolean finalMatch = (!match.getFirst() || match.getSecond());
    return new Feature.NumberMatch(finalMatch);
  }
  // } else if (clazz.equals(Feature.NewFeature.class)) {
  /*
   * TODO: Add features to return for specific classes. Implement calculating values of features here.
   */
  else {
    throw new IllegalArgumentException("Unregistered feature: " + clazz);
  }
}
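// A hedged sketch of extending the feature chain above, in the slot the TODO
// marks. Feature.DeterminerMatch is a hypothetical feature class, not part of
// the original code:
// } else if (clazz.equals(Feature.DeterminerMatch.class)) {
//   // (true when both mentions begin with the same token, e.g. "the")
//   return new Feature.DeterminerMatch(
//       onPrix.gloss().split(" ")[0].equalsIgnoreCase(candidate.gloss().split(" ")[0]));
// }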
public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
  startTrack("Training");
  // --Variables
  RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
  LinearClassifierFactory<Boolean, Feature> fact =
      new LinearClassifierFactory<Boolean, Feature>();

  // --Feature Extraction
  startTrack("Feature Extraction");
  for (Pair<Document, List<Entity>> datum : trainingData) {
    // (document variables)
    Document doc = datum.getFirst();
    List<Entity> goldClusters = datum.getSecond();
    List<Mention> mentions = doc.getMentions();
    Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
    startTrack("Document " + doc.id);
    // (for each mention...)
    for (int i = 0; i < mentions.size(); i++) {
      // (get the mention and its cluster)
      Mention onPrix = mentions.get(i);
      Entity source = goldEntities.get(onPrix);
      if (source == null) {
        throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
      }
      // (for each previous mention...)
      int oldSize = dataset.size();
      for (int j = i - 1; j >= 0; j--) {
        // (get previous mention and its cluster)
        Mention cand = mentions.get(j);
        Entity target = goldEntities.get(cand);
        if (target == null) {
          throw new IllegalArgumentException("Mention has no gold entity: " + cand);
        }
        // (extract features)
        Counter<Feature> feats =
            extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
        // (add datum)
        dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
        // (stop at the closest coreferent antecedent)
        if (target == source) {
          break;
        }
      }
      // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
    }
    endTrack("Document " + doc.id);
  }
  endTrack("Feature Extraction");

  // --Train Classifier
  startTrack("Minimizer");
  this.classifier = fact.trainClassifier(dataset);
  endTrack("Minimizer");

  // --Dump Weights
  startTrack("Features");
  // (get labels to print)
  Set<Boolean> labels = new HashSet<Boolean>();
  labels.add(true);
  // (print features)
  for (Triple<Feature, Boolean, Double> featureInfo :
      this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
    Feature feature = featureInfo.first();
    Boolean label = featureInfo.second();
    Double magnitude = featureInfo.third();
    // log(FORCE, new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
  }
  endTrack("Features");
  endTrack("Training");
}
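// Worked example of the datum-generation scheme above: with mentions
// [m0, m1, m2, m3] where only m0 and m3 are coreferent, processing m3 adds
// negative datums (m3, m2) and (m3, m1) and then the positive datum (m3, m0),
// at which point the inner loop breaks. A mention whose entity has no earlier
// mention contributes only negatives. Closer antecedents therefore dominate
// the training distribution, mirroring the right-to-left search in
// runCoreference.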
public void train(List<SentencePair> trainingPairs) {
  sourceTargetCounts = new CounterMap<String, String>();
  sourceTargetDistortions =
      new CounterMap<Pair<Integer, Integer>, Pair<Integer, Integer>>();
  // Initialize t(s|t) and q(j|i,l,m) uniformly; append the NULL word to each
  // target sentence so source words can align to nothing.
  for (SentencePair pair : trainingPairs) {
    List<String> sourceSentence = pair.getSourceWords();
    List<String> targetSentence = pair.getTargetWords();
    targetSentence.add(WordAligner.NULL_WORD);
    int m = sourceSentence.size();
    int l = targetSentence.size();
    for (int i = 0; i < m; i++) {
      String sourceWord = sourceSentence.get(i);
      for (int j = 0; j < l; j++) {
        String targetWord = targetSentence.get(j);
        sourceTargetCounts.setCount(sourceWord, targetWord, 1.0);
        Pair<Integer, Integer> lmPair = new Pair<Integer, Integer>(l, m);
        Pair<Integer, Integer> jiPair = new Pair<Integer, Integer>(j, i);
        sourceTargetDistortions.setCount(jiPair, lmPair, 1.0);
      }
    }
  }

  // Use Model 1 to initialize the translation parameters t(s|t)
  double delta = Double.POSITIVE_INFINITY;
  for (int i = 0; i < MAX_ITERS && delta > CONVERGENCE; i++) {
    CounterMap<String, String> tempSourceTargetCounts = new CounterMap<String, String>();
    Counter<String> targetCounts = new Counter<String>();
    delta = 0.0;
    for (SentencePair pair : trainingPairs) {
      List<String> sourceSentence = pair.getSourceWords();
      List<String> targetSentence = pair.getTargetWords();
      // E-step: normalizer sum_j t(s|t_j) for each source word
      Counter<String> sourceTotals = new Counter<String>();
      for (String sourceWord : sourceSentence) {
        for (String targetWord : targetSentence) {
          sourceTotals.incrementCount(
              sourceWord, sourceTargetCounts.getCount(sourceWord, targetWord));
        }
      }
      // E-step: accumulate expected counts t(s|t) / normalizer
      for (String sourceWord : sourceSentence) {
        for (String targetWord : targetSentence) {
          double transProb = sourceTargetCounts.getCount(sourceWord, targetWord);
          double sourceTotal = sourceTotals.getCount(sourceWord);
          tempSourceTargetCounts.incrementCount(sourceWord, targetWord, transProb / sourceTotal);
          targetCounts.incrementCount(targetWord, transProb / sourceTotal);
        }
      }
    }
    // M-step: update t(s|t) values
    for (String sourceWord : tempSourceTargetCounts.keySet()) {
      for (String targetWord : tempSourceTargetCounts.getCounter(sourceWord).keySet()) {
        double oldProb = sourceTargetCounts.getCount(sourceWord, targetWord);
        double newProb =
            tempSourceTargetCounts.getCount(sourceWord, targetWord)
                / targetCounts.getCount(targetWord);
        sourceTargetCounts.setCount(sourceWord, targetWord, newProb);
        delta += Math.pow(oldProb - newProb, 2.0);
      }
    }
    delta /= sourceTargetCounts.totalSize();
  }

  // EM for IBM Model 2: jointly re-estimate t(s|t) and the distortions q(j|i,l,m)
  delta = Double.POSITIVE_INFINITY;
  for (int iter = 0; iter < MAX_ITERS && delta > CONVERGENCE; iter++) {
    CounterMap<String, String> tempSourceTargetCounts = new CounterMap<String, String>();
    CounterMap<Pair<Integer, Integer>, Pair<Integer, Integer>> tempSourceTargetDistortions =
        new CounterMap<Pair<Integer, Integer>, Pair<Integer, Integer>>();
    Counter<String> targetCounts = new Counter<String>();
    CounterMap<Pair<Integer, Integer>, Integer> targetDistorts =
        new CounterMap<Pair<Integer, Integer>, Integer>();
    delta = 0.0;
    for (SentencePair pair : trainingPairs) {
      List<String> sourceSentence = pair.getSourceWords();
      List<String> targetSentence = pair.getTargetWords();
      CounterMap<Pair<Integer, Integer>, Integer> distortSourceTotals =
          new CounterMap<Pair<Integer, Integer>, Integer>();
      Pair<Integer, Integer> lmPair =
          new Pair<Integer, Integer>(targetSentence.size(), sourceSentence.size());
      // E-step: normalizer sum_j q(j|i,l,m) t(s_i|t_j) for each source position i
      for (int i = 0; i < sourceSentence.size(); i++) {
        String sourceWord = sourceSentence.get(i);
        for (int j = 0; j < targetSentence.size(); j++) {
          String targetWord = targetSentence.get(j);
          Pair<Integer, Integer> jiPair = new Pair<Integer, Integer>(j, i);
          double currTransProb = sourceTargetCounts.getCount(sourceWord, targetWord);
          double currAlignProb = sourceTargetDistortions.getCount(jiPair, lmPair);
          distortSourceTotals.incrementCount(lmPair, i, currTransProb * currAlignProb);
        }
      }
      // E-step: accumulate expected counts q(j|i,l,m) t(s_i|t_j) / normalizer
      for (int i = 0; i < sourceSentence.size(); i++) {
        String sourceWord = sourceSentence.get(i);
        double distortTransSourceTotal = distortSourceTotals.getCount(lmPair, i);
        for (int j = 0; j < targetSentence.size(); j++) {
          String targetWord = targetSentence.get(j);
          Pair<Integer, Integer> jiPair = new Pair<Integer, Integer>(j, i);
          double transProb = sourceTargetCounts.getCount(sourceWord, targetWord);
          double distortProb = sourceTargetDistortions.getCount(jiPair, lmPair);
          double update = (transProb * distortProb) / distortTransSourceTotal;
          tempSourceTargetCounts.incrementCount(sourceWord, targetWord, update);
          tempSourceTargetDistortions.incrementCount(jiPair, lmPair, update);
          targetCounts.incrementCount(targetWord, update);
          targetDistorts.incrementCount(lmPair, i, update);
        }
      }
    }
    // M-step: update t(s|t) values
    double delta_trans = 0.0;
    for (String sourceWord : tempSourceTargetCounts.keySet()) {
      for (String targetWord : tempSourceTargetCounts.getCounter(sourceWord).keySet()) {
        double oldProb = sourceTargetCounts.getCount(sourceWord, targetWord);
        double newProb =
            tempSourceTargetCounts.getCount(sourceWord, targetWord)
                / targetCounts.getCount(targetWord);
        sourceTargetCounts.setCount(sourceWord, targetWord, newProb);
        delta_trans += Math.pow(oldProb - newProb, 2.0);
      }
    }
    // M-step: update q(j|i,l,m) values
    double delta_dist = 0.0;
    for (Pair<Integer, Integer> jiPair : tempSourceTargetDistortions.keySet()) {
      for (Pair<Integer, Integer> lmPair :
          tempSourceTargetDistortions.getCounter(jiPair).keySet()) {
        double oldProb = sourceTargetDistortions.getCount(jiPair, lmPair);
        double tempAlignProb = tempSourceTargetDistortions.getCount(jiPair, lmPair);
        double tempTargetDist = targetDistorts.getCount(lmPair, jiPair.getSecond());
        double newProb = tempAlignProb / tempTargetDist;
        sourceTargetDistortions.setCount(jiPair, lmPair, newProb);
        delta_dist += Math.pow(oldProb - newProb, 2.0);
      }
    }
    delta =
        (delta_trans / sourceTargetCounts.totalSize()
                + delta_dist / sourceTargetDistortions.totalSize())
            / 2.0;
  }
}
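// A hedged decoding sketch (not shown in the code above): given a trained
// model and a test sentence pair, each source position i is aligned to the
// target position j maximizing q(j|i,l,m) * t(s_i|t_j). Only sourceTargetCounts
// and sourceTargetDistortions come from the code above; the surrounding
// variables (sourceSentence, targetSentence, lmPair) are assumed to be set up
// as in train().
// for (int i = 0; i < sourceSentence.size(); i++) {
//   int bestJ = 0;
//   double bestScore = Double.NEGATIVE_INFINITY;
//   for (int j = 0; j < targetSentence.size(); j++) {
//     double score =
//         sourceTargetCounts.getCount(sourceSentence.get(i), targetSentence.get(j))
//             * sourceTargetDistortions.getCount(new Pair<Integer, Integer>(j, i), lmPair);
//     if (score > bestScore) {
//       bestScore = score;
//       bestJ = j;
//     }
//   }
//   // align source position i to target position bestJ, skipping the NULL slot
// }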