public List<ClusteredMention> runCoreference(Document doc) { // --Overhead startTrack("Testing " + doc.id); // (variables) List<ClusteredMention> rtn = new ArrayList<ClusteredMention>(doc.getMentions().size()); List<Mention> mentions = doc.getMentions(); int singletons = 0; // --Run Classifier for (int i = 0; i < mentions.size(); i++) { // (variables) Mention onPrix = mentions.get(i); int coreferentWith = -1; // (get mention it is coreferent with) for (int j = i - 1; j >= 0; j--) { ClusteredMention cand = rtn.get(j); boolean coreferent = classifier.classOf( new RVFDatum<Boolean, Feature>(extractor.extractFeatures(Pair.make(onPrix, cand)))); if (coreferent) { coreferentWith = j; break; } } if (coreferentWith < 0) { singletons += 1; rtn.add(onPrix.markSingleton()); } else { // log("Mention " + onPrix + " coreferent with " + mentions.get(coreferentWith)); rtn.add(onPrix.markCoreferent(rtn.get(coreferentWith))); } } // log("" + singletons + " singletons"); // --Return endTrack("Testing " + doc.id); return rtn; }
public void train(Collection<Pair<Document, List<Entity>>> trainingData) { startTrack("Training"); // --Variables RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>(); LinearClassifierFactory<Boolean, Feature> fact = new LinearClassifierFactory<Boolean, Feature>(); // --Feature Extraction startTrack("Feature Extraction"); for (Pair<Document, List<Entity>> datum : trainingData) { // (document variables) Document doc = datum.getFirst(); List<Entity> goldClusters = datum.getSecond(); List<Mention> mentions = doc.getMentions(); Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters); startTrack("Document " + doc.id); // (for each mention...) for (int i = 0; i < mentions.size(); i++) { // (get the mention and its cluster) Mention onPrix = mentions.get(i); Entity source = goldEntities.get(onPrix); if (source == null) { throw new IllegalArgumentException("Mention has no gold entity: " + onPrix); } // (for each previous mention...) int oldSize = dataset.size(); for (int j = i - 1; j >= 0; j--) { // (get previous mention and its cluster) Mention cand = mentions.get(j); Entity target = goldEntities.get(cand); if (target == null) { throw new IllegalArgumentException("Mention has no gold entity: " + cand); } // (extract features) Counter<Feature> feats = extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target))); // (add datum) dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source)); // (stop if if (target == source) { break; } } // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize); } endTrack("Document " + doc.id); } endTrack("Feature Extraction"); // --Train Classifier startTrack("Minimizer"); this.classifier = fact.trainClassifier(dataset); endTrack("Minimizer"); // --Dump Weights startTrack("Features"); // (get labels to print) Set<Boolean> labels = new HashSet<Boolean>(); labels.add(true); // (print features) for (Triple<Feature, Boolean, Double> featureInfo : this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) { Feature feature = featureInfo.first(); Boolean label = featureInfo.second(); Double magnitude = featureInfo.third(); // log(FORCE,new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature); } end_Track("Features"); endTrack("Training"); }