コード例 #1
0
  /**
   * Clusters the mentions of a document by linking each mention to the nearest
   * earlier mention that the classifier labels as coreferent.
   *
   * <p>Mentions are processed in the order returned by {@code doc.getMentions()}.
   * A mention with no coreferent predecessor is marked as a singleton.
   *
   * @param doc the document whose mentions are to be clustered
   * @return one {@code ClusteredMention} per mention, in mention order
   */
  public List<ClusteredMention> runCoreference(Document doc) {
    // --Overhead
    final String trackLabel = "Testing " + doc.id;
    startTrack(trackLabel);
    final List<Mention> mentions = doc.getMentions();
    final List<ClusteredMention> clustered = new ArrayList<ClusteredMention>(mentions.size());
    // Only consumed by the (disabled) debug log at the end of the method.
    int singletonCount = 0;

    // --Run Classifier
    for (int current = 0; current < mentions.size(); current++) {
      final Mention onPrix = mentions.get(current);
      final ClusteredMention antecedent = findNearestCoreferent(onPrix, clustered);
      if (antecedent != null) {
        clustered.add(onPrix.markCoreferent(antecedent));
      } else {
        singletonCount++;
        clustered.add(onPrix.markSingleton());
      }
    }
    // log("" + singletonCount + " singletons");

    // --Return
    endTrack(trackLabel);
    return clustered;
  }

  /**
   * Scans already-clustered mentions from the most recent backwards and returns the
   * first one the classifier labels as coreferent with {@code onPrix}, or {@code null}
   * when none is.
   */
  private ClusteredMention findNearestCoreferent(
      Mention onPrix, List<ClusteredMention> earlier) {
    int idx = earlier.size();
    while (--idx >= 0) {
      final ClusteredMention cand = earlier.get(idx);
      final boolean coreferent =
          classifier.classOf(
              new RVFDatum<Boolean, Feature>(extractor.extractFeatures(Pair.make(onPrix, cand))));
      if (coreferent) {
        return cand;
      }
    }
    return null;
  }
コード例 #2
0
        /**
         * Computes the value of one feature for a (mention, candidate antecedent) pair.
         *
         * @param clazz the feature class to compute; must be one of the classes handled below
         * @param input the pair of the mention being resolved (first) and the already-clustered
         *     candidate antecedent (second)
         * @param count not read by any of the features implemented here
         * @return the feature instance carrying the computed value
         * @throws IllegalArgumentException if {@code clazz} is not a registered feature class
         */
        private <E> Feature feature(
            Class<E> clazz, Pair<Mention, ClusteredMention> input, Option<Double> count) {

          // --Variables
          Mention onPrix =
              input.getFirst(); // the first mention (referred to as m_i in the handout)
          Mention candidate =
              input.getSecond().mention; // the second mention (referred to as m_j in the handout)
          Entity candidateCluster =
              input.getSecond().entity; // the cluster containing the second mention

          // --Features
          if (clazz.equals(Feature.ExactMatch.class)) {
            // (exact string match)
            return new Feature.ExactMatch(onPrix.gloss().equals(candidate.gloss()));
          } else if (clazz.equals(Feature.SentenceDist.class)) {
            // (distance measured in sentences between the two mentions)
            return new Feature.SentenceDist(
                Math.abs(
                    onPrix.doc.indexOfSentence(onPrix.sentence)
                        - candidate.doc.indexOfSentence(candidate.sentence)));
          } else if (clazz.equals(Feature.MentionDist.class)) {
            // (distance measured in mentions between the two mentions)
            return new Feature.MentionDist(
                Math.abs(
                    onPrix.doc.indexOfMention(onPrix) - candidate.doc.indexOfMention(candidate)));
          } else if (clazz.equals(Feature.EitherHeadWordPronoun.class)) {
            // NOTE(review): tests the full gloss rather than the head word, despite the
            // feature name -- confirm this is intended.
            return new Feature.EitherHeadWordPronoun(
                Pronoun.isSomePronoun(onPrix.gloss()) || Pronoun.isSomePronoun(candidate.gloss()));
          } else if (clazz.equals(Feature.CandidateNERTag.class)) {
            return new Feature.CandidateNERTag(candidate.headToken().nerTag());
          } else if (clazz.equals(Feature.CandidateSpeaker.class)) {
            return new Feature.CandidateSpeaker(candidate.headToken().speaker());
          } else if (clazz.equals(Feature.FixedSpeaker.class)) {
            return new Feature.FixedSpeaker(onPrix.headToken().speaker());
          } else if (clazz.equals(Feature.HeadWordMatch.class)) {
            // (compare head word to head word; comparing the Mention itself never matches)
            return new Feature.HeadWordMatch(onPrix.headWord().equals(candidate.headWord()));
          } else if (clazz.equals(Feature.HeadWordLemmaMatch.class)) {
            return new Feature.HeadWordLemmaMatch(
                onPrix.headToken().lemma().equals(candidate.headToken().lemma()));
          } else if (clazz.equals(Feature.FixedNERTag.class)) {
            return new Feature.FixedNERTag(onPrix.headToken().nerTag());
          } else if (clazz.equals(Feature.SpeakerMatch.class)) {
            return new Feature.SpeakerMatch(
                candidate.headToken().speaker().equals(onPrix.headToken().speaker()));
          } else if (clazz.equals(Feature.NERTagMatch.class)) {
            return new Feature.NERTagMatch(
                candidate.headToken().nerTag().equals(onPrix.headToken().nerTag()));
          } else if (clazz.equals(Feature.CandidatePOSTag.class)) {
            return new Feature.CandidatePOSTag(candidate.headToken().posTag());
          } else if (clazz.equals(Feature.FixedPOSTag.class)) {
            return new Feature.FixedPOSTag(onPrix.headToken().posTag());
          } else if (clazz.equals(Feature.GenderMatch.class)) {
            // (a match unless the first flag is set and the second is not)
            Pair<Boolean, Boolean> match = Util.haveGenderAndAreSameGender(onPrix, candidate);
            boolean finalMatch = (!match.getFirst() || match.getSecond());
            return new Feature.GenderMatch(finalMatch);
          } else if (clazz.equals(Feature.NumberMatch.class)) {
            // (same rule as GenderMatch, applied to grammatical number)
            Pair<Boolean, Boolean> match = Util.haveNumberAndAreSameNumber(onPrix, candidate);
            boolean finalMatch = (!match.getFirst() || match.getSecond());
            return new Feature.NumberMatch(finalMatch);
          }
          //			} else if(clazz.equals(Feature.NewFeature.class) {
          /*
           * TODO: Add features to return for specific classes. Implement calculating values of features here.
           */

          else {
            throw new IllegalArgumentException("Unregistered feature: " + clazz);
          }
        }
コード例 #3
0
 /**
  * Trains the pairwise coreference classifier from gold-annotated documents.
  *
  * <p>For every mention, earlier mentions are visited from nearest to farthest. Each
  * (mention, earlier mention) pair becomes one datum, labelled {@code true} when both
  * map to the same gold {@code Entity} instance (reference equality). The scan for a
  * mention stops after its first positive datum has been added.
  *
  * @param trainingData documents paired with their gold entity clusters
  * @throws IllegalArgumentException if a mention is absent from the gold mention-to-entity map
  */
 public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
   startTrack("Training");
   // --Variables
   RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
   LinearClassifierFactory<Boolean, Feature> fact =
       new LinearClassifierFactory<Boolean, Feature>();
   // --Feature Extraction
   startTrack("Feature Extraction");
   for (Pair<Document, List<Entity>> datum : trainingData) {
     // (document variables)
     Document doc = datum.getFirst();
     List<Entity> goldClusters = datum.getSecond();
     List<Mention> mentions = doc.getMentions();
     Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
     startTrack("Document " + doc.id);
     // (for each mention...)
     for (int i = 0; i < mentions.size(); i++) {
       // (get the mention and its cluster)
       Mention onPrix = mentions.get(i);
       Entity source = goldEntities.get(onPrix);
       if (source == null) {
         throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
       }
       // (for each previous mention...)
       // (oldSize is only consumed by the disabled logf below)
       int oldSize = dataset.size();
       for (int j = i - 1; j >= 0; j--) {
         // (get previous mention and its cluster)
         Mention cand = mentions.get(j);
         Entity target = goldEntities.get(cand);
         if (target == null) {
           throw new IllegalArgumentException("Mention has no gold entity: " + cand);
         }
         // (extract features)
         Counter<Feature> feats =
             extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
         // (add datum)
         dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
         // (stop once the nearest coreferent antecedent has been added)
         if (target == source) {
           break;
         }
       }
       // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
     }
     endTrack("Document " + doc.id);
   }
   endTrack("Feature Extraction");
   // --Train Classifier
   startTrack("Minimizer");
   this.classifier = fact.trainClassifier(dataset);
   endTrack("Minimizer");
   // --Dump Weights
   startTrack("Features");
   // (get labels to print)
   Set<Boolean> labels = new HashSet<Boolean>();
   labels.add(true);
   // (print features; the actual log call is currently disabled)
   for (Triple<Feature, Boolean, Double> featureInfo :
       this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
     Feature feature = featureInfo.first();
     Boolean label = featureInfo.second();
     Double magnitude = featureInfo.third();
     // log(FORCE,new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
   }
   // (close the track with the same endTrack helper used for every other track)
   endTrack("Features");
   endTrack("Training");
 }