/** * Generate the training features from the CoNLL input file. * * @return Dataset of feature vectors * @throws Exception */ public GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception { GeneralDataset<String, String> dataset = new Dataset<>(); Dictionaries dict = new Dictionaries(props); MentionExtractor mentionExtractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict)); Document document; while ((document = mentionExtractor.nextDoc()) != null) { setTokenIndices(document); document.extractGoldCorefClusters(); Map<Integer, CorefCluster> entities = document.goldCorefClusters; // Generate features for coreferent mentions with class label 1 for (CorefCluster entity : entities.values()) { for (Mention mention : entity.getCorefMentions()) { // Ignore verbal mentions if (mention.headWord.tag().startsWith("V")) continue; IndexedWord head = mention.dependency.getNodeByIndexSafe(mention.headWord.index()); if (head == null) continue; ArrayList<String> feats = mention.getSingletonFeatures(dict); dataset.add(new BasicDatum<>(feats, "1")); } } // Generate features for singletons with class label 0 ArrayList<CoreLabel> gold_heads = new ArrayList<>(); for (Mention gold_men : document.allGoldMentions.values()) { gold_heads.add(gold_men.headWord); } for (Mention predicted_men : document.allPredictedMentions.values()) { SemanticGraph dep = predicted_men.dependency; IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index()); if (head == null) continue; // Ignore verbal mentions if (predicted_men.headWord.tag().startsWith("V")) continue; // If the mention is in the gold set, it is not a singleton and thus ignore if (gold_heads.contains(predicted_men.headWord)) continue; dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0")); } } dataset.summaryStatistics(); return dataset; }
public CorefChain(CorefCluster c, Map<Mention, IntTuple> positions) { chainID = c.clusterID; // Collect mentions mentions = new ArrayList<>(); mentionMap = Generics.newHashMap(); CorefMention represents = null; for (Mention m : c.getCorefMentions()) { CorefMention men = new CorefMention(m, positions.get(m)); mentions.add(men); } Collections.sort(mentions, new CorefMentionComparator()); // Find representative mention for (CorefMention men : mentions) { IntPair position = new IntPair(men.sentNum, men.headIndex); if (!mentionMap.containsKey(position)) mentionMap.put(position, Generics.<CorefMention>newHashSet()); mentionMap.get(position).add(men); if (men.moreRepresentativeThan(represents)) { represents = men; } } representative = represents; }