private void findSpeakersInConversation(Dictionaries dict) {
   for (List<Mention> l : predictedOrderedMentionsBySentence) {
     for (Mention m : l) {
       if (m.predicateNominatives == null) continue;
       for (Mention a : m.predicateNominatives) {
         if (a.spanToString().toLowerCase().equals("i")) {
           speakers.put(
               m.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
               Integer.toString(m.mentionID));
         }
       }
     }
   }
   List<CoreMap> paragraph = new ArrayList<CoreMap>();
   int paragraphUtterIndex = 0;
   String nextParagraphSpeaker = "";
   int paragraphOffset = 0;
   for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     int currentUtter =
         sent.get(CoreAnnotations.TokensAnnotation.class)
             .get(0)
             .get(CoreAnnotations.UtteranceAnnotation.class);
     if (paragraphUtterIndex != currentUtter) {
       nextParagraphSpeaker =
           findParagraphSpeaker(
               paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
       paragraphUtterIndex = currentUtter;
       paragraphOffset += paragraph.size();
       paragraph = new ArrayList<CoreMap>();
     }
     paragraph.add(sent);
   }
   findParagraphSpeaker(
       paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
 }
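  // findSpeakersInConversation example (a sketch): in an utterance like
  // "I am Watson", the mention "Watson" carries "I" as a predicate nominative,
  // so "Watson" is recorded in the speakers map as the speaker of that utterance.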
  public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
    List<CoreMap> sentences =
        conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();
    for (int i = 0; i < sentences.size(); i++) {
      allGoldMentions.add(new ArrayList<Mention>());
    }
    int maxCorefClusterId = -1;
    for (String corefIdStr : corefChainMap.keySet()) {
      int id = Integer.parseInt(corefIdStr);
      if (id > maxCorefClusterId) {
        maxCorefClusterId = id;
      }
    }
    int newMentionID = maxCorefClusterId + 1;
    for (String corefIdStr : corefChainMap.keySet()) {
      int id = Integer.parseInt(corefIdStr);
      int clusterMentionCnt = 0;
      for (CoreMap m : corefChainMap.get(corefIdStr)) {
        clusterMentionCnt++;
        Mention mention = new Mention();

        mention.goldCorefClusterID = id;
        if (clusterMentionCnt == 1) {
          // First mention in cluster
          mention.mentionID = id;
          mention.originalRef = -1;
        } else {
          mention.mentionID = newMentionID;
          mention.originalRef = id;
          newMentionID++;
        }
        if (maxID < mention.mentionID) maxID = mention.mentionID;
        int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
        CoreMap sent = sentences.get(sentIndex);
        mention.startIndex =
            m.get(CoreAnnotations.TokenBeginAnnotation.class)
                - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
        mention.endIndex =
            m.get(CoreAnnotations.TokenEndAnnotation.class)
                - sent.get(CoreAnnotations.TokenBeginAnnotation.class);

        // will be set by arrange
        mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);

        // Mention dependency is collapsed dependency for sentence
        mention.dependency =
            sentences
                .get(sentIndex)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);

        allGoldMentions.get(sentIndex).add(mention);
      }
    }
    return allGoldMentions;
  }
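  // extractGoldMentions ID scheme (a sketch): for a chain keyed "7" with three
  // mentions, the first mention gets mentionID 7 and originalRef -1; the other
  // two get fresh IDs starting at maxCorefClusterId + 1, each with originalRef 7.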
  /**
   * Generate the training features from the CoNLL input file.
   *
   * @return Dataset of feature vectors
   * @throws Exception if document reading or mention extraction fails
   */
  public GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {

    GeneralDataset<String, String> dataset = new Dataset<>();

    Dictionaries dict = new Dictionaries(props);
    MentionExtractor mentionExtractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict));

    Document document;
    while ((document = mentionExtractor.nextDoc()) != null) {
      setTokenIndices(document);
      document.extractGoldCorefClusters();
      Map<Integer, CorefCluster> entities = document.goldCorefClusters;

      // Generate features for coreferent mentions with class label 1
      for (CorefCluster entity : entities.values()) {
        for (Mention mention : entity.getCorefMentions()) {
          // Ignore verbal mentions
          if (mention.headWord.tag().startsWith("V")) continue;

          IndexedWord head = mention.dependency.getNodeByIndexSafe(mention.headWord.index());
          if (head == null) continue;
          ArrayList<String> feats = mention.getSingletonFeatures(dict);
          dataset.add(new BasicDatum<>(feats, "1"));
        }
      }

      // Generate features for singletons with class label 0
      ArrayList<CoreLabel> gold_heads = new ArrayList<>();
      for (Mention gold_men : document.allGoldMentions.values()) {
        gold_heads.add(gold_men.headWord);
      }
      for (Mention predicted_men : document.allPredictedMentions.values()) {
        SemanticGraph dep = predicted_men.dependency;
        IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
        if (head == null) continue;

        // Ignore verbal mentions
        if (predicted_men.headWord.tag().startsWith("V")) continue;
        // If the mention is in the gold set, it is not a singleton and thus ignore
        if (gold_heads.contains(predicted_men.headWord)) continue;

        dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
      }
    }

    dataset.summaryStatistics();
    return dataset;
  }
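  // Minimal usage sketch (assumes edu.stanford.nlp.classify.LogisticClassifierFactory
  // and LogisticClassifier are on the classpath; Properties setup elided):
  //
  //   GeneralDataset<String, String> train = generateFeatureVectors(props);
  //   LogisticClassifierFactory<String, String> factory = new LogisticClassifierFactory<>();
  //   LogisticClassifier<String, String> singletonClassifier = factory.trainClassifier(train);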
  /** Check whether one mention is the speaker of the other mention. */
  public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {

    if (!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase())
        || ant.number == Number.PLURAL
        || ant.sentNum != m.sentNum) return false;

    int countQuotationMark = 0;
    for (int i = Math.min(m.headIndex, ant.headIndex) + 1;
        i < Math.max(m.headIndex, ant.headIndex);
        i++) {
      String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (word.equals("``") || word.equals("''")) countQuotationMark++;
    }
    if (countQuotationMark != 1) return false;

    IndexedWord w =
        m.dependency.getNodeByWordPattern(
            m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
    if (w == null) return false;

    for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
      if (parent.first().getShortName().equals("nsubj")
          && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
        return true;
      }
    }
    return false;
  }
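  // isSpeaker example (a sketch): in the sentence
  //   `` I 'll go , '' Mary said .
  // the first-person pronoun and the mention "Mary" are separated by exactly one
  // quotation mark, and "Mary" is the nsubj of the report verb "said", so
  // isSpeaker(mary, i, dict) returns true.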
  /** Mark twin mentions: All mention boundaries should be matched */
  private void findTwinMentionsStrict() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
      List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
      List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

      // For CoNLL training there are some documents with gold mentions that share
      // the same position offsets. See
      // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
      // (Packwood - Roth)
      CollectionValuedMap<IntPair, Mention> goldMentionPositions =
          new CollectionValuedMap<IntPair, Mention>();
      for (Mention g : golds) {
        IntPair ip = new IntPair(g.startIndex, g.endIndex);
        if (goldMentionPositions.containsKey(ip)) {
          StringBuilder existingMentions = new StringBuilder();
          for (Mention eg : goldMentionPositions.get(ip)) {
            if (existingMentions.length() > 0) {
              existingMentions.append(",");
            }
            existingMentions.append(eg.mentionID);
          }
          SieveCoreferenceSystem.logger.warning(
              "WARNING: gold mentions with the same offsets: "
                  + ip
                  + " mentions="
                  + g.mentionID
                  + ","
                  + existingMentions
                  + ", "
                  + g.spanToString());
        }
        // assert(!goldMentionPositions.containsKey(ip));
        goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
      }
      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if (goldMentionPositions.containsKey(pos)) {
          Collection<Mention> cm = goldMentionPositions.get(pos);
          Mention g = cm.iterator().next();
          cm.remove(g);
          p.mentionID = g.mentionID;
          p.twinless = false;
          g.twinless = false;
        }
      }
      // temporary hack: bump the IDs of twinless mentions so they are easy to spot
      for (Mention p : predicts) {
        if (p.twinless) p.mentionID += 10000;
      }
    }
  }
  public void setKeywordsWikiMiner() {
    try {
      WikipediaAnnotator annotator = new WikipediaAnnotator();
      long annstartTime = System.currentTimeMillis();

      HashMap<String, Label.Sense[]> ment2ent = annotator.annotate(document);

      long annendTime = System.currentTimeMillis();

      long diff1 = (annendTime - annstartTime);

      System.out.println("Time taken by annotater : " + diff1 + " milliseconds");

      for (String key : ment2ent.keySet()) {

        // sunny: check whether the sense is in the Freebase dataset,
        // and add it only if it is found.

        //				Vector<Label.Sense> updatedsenses = new Vector();
        //				for(Label.Sense s : ment2ent.get(key)){
        //					String entity = s.getTitle().replace(" ", "_");
        //					String freebaseid =
        // WikiToFreebaseIDMap.getInstance().getFreeBaseID("\"/wikipedia/en_title/" + entity +
        // "\"");
        //
        //					if(freebaseid != null){
        //						updatedsenses.add(s);
        //					}
        //				}
        //
        //				Label.Sense[] sensearray = new Label.Sense[updatedsenses.size()];
        //
        //				updatedsenses.toArray(sensearray);

        LabelSense senses = new LabelSense(ment2ent.get(key));
        // LabelSense senses = new LabelSense(sensearray);
        Mention mention = new Mention();
        // System.out.println("key from ment2ent : " + key);
        String ment = key.split("_")[0];

        // System.out.println("ment from ment2ent : " + ment);
        int off = Integer.parseInt(key.split("_")[1]);
        mention.key = ment;
        mention.name = ment;
        mention.length = ment.length();
        mention.offset = off;
        mention.context = getContext(off, mention.length, contextSize);
        mention.contextAroundMention = getContext(off, mention.length, 10);

        mention.senses = senses;
        keywords.add(mention);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
 public void setGroundMention(ArrayList<XMLTagInfo> groundtruth) throws Exception {
   groundMention.clear();
   keywords.clear();
   for (int i = 0; i < groundtruth.size(); i++) {
     int off = groundtruth.get(i).offset;
     int len = groundtruth.get(i).length;
     groundMention.add(document.substring(off, off + len));
     Mention m = new Mention();
     // set the span fields first; they are needed to build the context windows below
     m.name = document.substring(off, off + len);
     m.length = len;
     m.offset = off;
     int context_lo = Math.max(0, off - contextSize);
     int context_hi = Math.min(document.length() - 1, off + contextSize);
     String contextString = document.substring(context_lo, context_hi);
     m.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "").toLowerCase();
     int con_lo = Math.max(0, off - 10);
     int con_hi = Math.min(document.length() - 1, off + 10);
     m.contextAroundMention =
         document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();
     m.contextAroundMention += " " + m.name.replaceAll("[^0-9a-z\\sA-Z]", " ").toLowerCase();
     // String.replaceAll returns a new string; the result must be assigned back
     m.context = m.context.replaceAll("\\sand", "").replaceAll("\\snot", "");
     m.contextAroundMention = m.contextAroundMention.replaceAll("\\sand", "").replaceAll("\\snot", "");
     keywords.add(m);
   }
 }
  /** Mark twin mentions: heads of the mentions are matched */
  private void findTwinMentionsRelaxed() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
      List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
      List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

      Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
      Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
      for (Mention g : golds) {
        goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
        if (!goldMentionHeadPositions.containsKey(g.headIndex)) {
          goldMentionHeadPositions.put(g.headIndex, new LinkedList<Mention>());
        }
        goldMentionHeadPositions.get(g.headIndex).add(g);
      }

      List<Mention> remains = new ArrayList<Mention>();
      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if (goldMentionPositions.containsKey(pos)) {
          Mention g = goldMentionPositions.get(pos);
          p.mentionID = g.mentionID;
          p.twinless = false;
          g.twinless = false;
          goldMentionHeadPositions.get(g.headIndex).remove(g);
          if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
            goldMentionHeadPositions.remove(g.headIndex);
          }
        } else remains.add(p);
      }
      for (Mention r : remains) {
        if (goldMentionHeadPositions.containsKey(r.headIndex)) {
          Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
          r.mentionID = g.mentionID;
          r.twinless = false;
          g.twinless = false;
          if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
            goldMentionHeadPositions.remove(g.headIndex);
          }
        }
      }
    }
  }
 /** When there is no mentionID information (without gold annotation), assign mention IDs */
 protected void assignOriginalID() {
   List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
   boolean hasOriginalID = true;
   for (List<Mention> l : orderedMentionsBySentence) {
     if (l.size() == 0) continue;
     for (Mention m : l) {
       if (m.mentionID == -1) {
         hasOriginalID = false;
       }
     }
   }
   if (!hasOriginalID) {
     int id = 0;
     for (List<Mention> l : orderedMentionsBySentence) {
       for (Mention m : l) {
         m.mentionID = id++;
       }
     }
   }
 }
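  /**
   * Closest-first decoding: for each mention, candidate antecedents are scanned
   * from nearest to farthest, and the mention is linked to the first one the
   * pairwise classifier accepts; if none is accepted it starts a singleton cluster.
   */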
  public List<ClusteredMention> runCoreference(Document doc) {
    // --Overhead
    startTrack("Testing " + doc.id);
    // (variables)
    List<ClusteredMention> rtn = new ArrayList<ClusteredMention>(doc.getMentions().size());
    List<Mention> mentions = doc.getMentions();
    int singletons = 0;
    // --Run Classifier
    for (int i = 0; i < mentions.size(); i++) {
      // (variables)
      Mention onPrix = mentions.get(i);
      int coreferentWith = -1;
      // (get mention it is coreferent with)
      for (int j = i - 1; j >= 0; j--) {

        ClusteredMention cand = rtn.get(j);

        boolean coreferent =
            classifier.classOf(
                new RVFDatum<Boolean, Feature>(extractor.extractFeatures(Pair.make(onPrix, cand))));

        if (coreferent) {
          coreferentWith = j;
          break;
        }
      }

      if (coreferentWith < 0) {
        singletons += 1;
        rtn.add(onPrix.markSingleton());
      } else {
        // log("Mention " + onPrix + " coreferent with " + mentions.get(coreferentWith));
        rtn.add(onPrix.markCoreferent(rtn.get(coreferentWith)));
      }
    }
    // log("" + singletons + " singletons");
    // --Return
    endTrack("Testing " + doc.id);
    return rtn;
  }
 /** Set paragraph index */
 private void setParagraphAnnotation() {
   int paragraphIndex = 0;
   int previousOffset = -10;
   for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
       if (w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
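         // a character gap of more than two between consecutive tokens
         // (e.g., a blank line) is treated as a paragraph break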
         if (w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset + 2)
           paragraphIndex++;
         w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
         previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
       } else {
         w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
       }
     }
   }
   for (List<Mention> l : predictedOrderedMentionsBySentence) {
     for (Mention m : l) {
       m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
     }
   }
   numParagraph = paragraphIndex;
 }
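 /**
  * Copies the per-sentence mention lists field by field; note that originalSpan
  * and dependency remain shared (shallow) references.
  */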
 public static List<List<Mention>> makeCopy(List<List<Mention>> mentions) {
   List<List<Mention>> copy = new ArrayList<List<Mention>>(mentions.size());
   for (List<Mention> sm : mentions) {
     List<Mention> sm2 = new ArrayList<Mention>(sm.size());
     for (Mention m : sm) {
       Mention m2 = new Mention();
       m2.goldCorefClusterID = m.goldCorefClusterID;
       m2.mentionID = m.mentionID;
       m2.startIndex = m.startIndex;
       m2.endIndex = m.endIndex;
       m2.originalSpan = m.originalSpan;
       m2.dependency = m.dependency;
       sm2.add(m2);
     }
     copy.add(sm2);
   }
   return copy;
 }
    public CorefMention(Mention m, IntTuple pos) {
      mentionType = m.mentionType;
      number = m.number;
      gender = m.gender;
      animacy = m.animacy;
      startIndex = m.startIndex + 1;
      endIndex = m.endIndex + 1;
      headIndex = m.headIndex + 1;
      corefClusterID = m.corefClusterID;
      sentNum = m.sentNum + 1;
      mentionID = m.mentionID;
      mentionSpan = m.spanToString();

      // index starts from 1
      position = new IntTuple(2);
      position.set(0, pos.get(0) + 1);
      position.set(1, pos.get(1) + 1);

      m.headWord.set(CorefCoreAnnotations.CorefClusterIdAnnotation.class, corefClusterID);
    }
  /** Initialize positions and corefClusters (put each mention in its own CorefCluster). */
  private void initializeCorefCluster() {
    for (int i = 0; i < predictedOrderedMentionsBySentence.size(); i++) {
      for (int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j++) {
        Mention m = predictedOrderedMentionsBySentence.get(i).get(j);
        if (allPredictedMentions.containsKey(m.mentionID)) {
          SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID);
          Mention m1 = allPredictedMentions.get(m.mentionID);
          SieveCoreferenceSystem.logger.warning(
              "OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]");
          SieveCoreferenceSystem.logger.warning(
              "NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]");
          //          SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED",
          // predictedOrderedMentionsBySentence);
          //          SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED",
          // goldOrderedMentionsBySentence);
        }
        assert (!allPredictedMentions.containsKey(m.mentionID));
        allPredictedMentions.put(m.mentionID, m);

        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(m, pos);
        m.sentNum = i;

        assert (!corefClusters.containsKey(m.mentionID));
        corefClusters.put(
            m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Arrays.asList(m))));
        m.corefClusterID = m.mentionID;

        IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, i);
        headPosition.set(1, m.headIndex);
        mentionheadPositions.put(headPosition, m);
      }
    }
  }
  public void consolidateMentions(int maxLength) {

    if (!Config.Server && thesaurus == null) {
      thesaurus = new Wikisaurus();
    }

    ClientWikisauras obj = new ClientWikisauras();

    //		LuceneIndexWrapper luceneIndex = new LuceneIndexWrapper(
    //				props.getCompleteIndex(), props.getRedirectIndex(),
    //				props.getInlinkIndex(), props.getDisambIndex(),
    //				props.getAnchorIndex());

    ArrayList<Mention> mentions = new ArrayList<Mention>();
    mentions.addAll(keywords);
    // System.out.println("consolidating mentions size:" + mentions.size());
    keywords = new ArrayList<Mention>();
    Integer[] token_type = new Integer[mentions.size()];
    for (int i = 0; i < token_type.length; i++) token_type[i] = 0;

    int curr_offset = 0;
    String curr_mention = "";
    for (int i = 0; i < mentions.size(); i++) {
      if (token_type[i] != 0) {
        // i++;
        continue;
      }
      curr_offset = mentions.get(i).offset;
      curr_mention = mentions.get(i).name;
      // System.out.println("offset: " + curr_offset + " curr_mention: " + curr_mention + " context:
      // " + mentions.get(i).context);

      String[] allWords = new String[maxLength];
      Integer[] allOffset = new Integer[maxLength];
      String currWord = curr_mention;
      Integer currWordEnd = curr_offset + curr_mention.length() + 1;

      allWords[0] = currWord;
      allOffset[0] = curr_offset;
      int k = 1;
      for (; k < maxLength; k++) {
        currWordEnd = document.indexOf(" ", currWordEnd + 1);
        if (currWordEnd == -1) currWordEnd = document.length();
        if (curr_offset < 0 || curr_offset >= document.length()) {
          k--;
          break;
        }
        currWord = document.substring(curr_offset, currWordEnd);
        allWords[k] = currWord;
        allOffset[k] = currWordEnd;
        if (currWordEnd >= document.length()) break;
      }
      if (k == maxLength) k--;

      for (; k >= 0; k--) {
        LabelSense senses = null;
        // System.out.println("allwords[" + k + "] : " + allWords[k]);
        try {
          if (Config.Server) senses = obj.getSenses(allWords[k]);
          else {
            // String possibleMention = WordUtils.capitalize(allWords[k]);
            Label.Sense[] temp = thesaurus.getSenses(allWords[k]);

            //						List<String> qwords = Arrays.asList(allWords[k].split(" "));
            //
            //						boolean nostopword = true;
            //						for(String item : qwords){
            //							if(Stopwords.isStopword(item)){
            //								nostopword = false;
            //							}
            //						}

            if (temp != null) senses = new LabelSense(temp);

            //						Vector<String> sensewmc = new Vector();
            //						Vector<Double> sensewmp = new Vector();

            // hard-coded search for the word in the Freebase dataset; too slow
            // for long text, hence commented out
            //						if(!Stopwords.isStopword(allWords[k].toLowerCase())){
            //							Vector<String> freebaseTitles =
            // WikiToFreebaseIDMap.getInstance().getAllWikiTitles(allWords[k].toLowerCase());
            //							//String title = "/wikipedia/en_title/" + allWords[k].replace(" ", "_");
            //							//java.util.regex.Pattern pa =
            // java.util.regex.Pattern.compile(title.toLowerCase());
            //							//java.util.regex.Matcher ma = pa.matcher("");
            //
            //							if(freebaseTitles != null){
            //								for(String fbTitle : freebaseTitles){
            //									fbTitle = fbTitle.replace("/wikipedia/en_title/", "");
            //									fbTitle = fbTitle.replace("\"", "");
            //									fbTitle = fbTitle.replace("_", " ");
            //									System.out.println("fbTitle : " + fbTitle);
            //									sensewmc.add(fbTitle);
            //									sensewmp.add(new Double(1.0/(freebaseTitles.size())));
            //									//sensewmp.add(new Double(0));
            //								}
            //							}
            //						}

            //						if((temp != null) || (sensewmc.size() > 0)){
            //							senses = new LabelSense();
            //
            //							int scount= 0,total = 0;
            //
            //							if(temp != null){
            //								total = temp.length + sensewmc.size();
            //								senses.wikiMinerCandidate = new String[temp.length + sensewmc.size()];
            //								senses.wikiMinerProbability = new double[temp.length + sensewmc.size()];
            //
            //								for(;scount<temp.length;++scount){
            //									senses.wikiMinerCandidate[scount] = temp[scount].getTitle();
            //									senses.wikiMinerProbability[scount] = temp[scount].getPriorProbability();
            //								}
            //								scount = temp.length;
            //							}
            //							else{
            //								total = sensewmc.size();
            //								senses.wikiMinerCandidate = new String[sensewmc.size()];
            //								senses.wikiMinerProbability = new double[sensewmc.size()];
            //							}
            //
            //							for(int cnt = 0;scount < total;++scount,++cnt){
            //								senses.wikiMinerCandidate[scount] = sensewmc.elementAt(cnt);
            //								senses.wikiMinerProbability[scount] = sensewmp.elementAt(cnt);
            //							}
            //						}

            // else if((k >= 1 && k <= 2) || ((k == 0) &&
            // (!Stopwords.isStopword(allWords[k].split("_")[0]))))
            //						else if((k <= 2) && (nostopword == true))
            //						{
            //							String myquery = allWords[k];
            //
            //							String query = luceneIndex.buildPhraseSearchQuery(myquery,null);
            //
            //							System.out.println("query : " + query);
            //
            //							if (query != null) {
            //								luceneIndex.searchStringInIndex(query, 2);
            //
            //								System.out.println("found : " + luceneIndex.hits.scoreDocs.length);
            //
            //								Vector<String> sensewmc = new Vector();
            //								Vector<Double> sensewmp = new Vector();
            //
            //								for (int licount = 0;licount < luceneIndex.hits.scoreDocs.length;++licount) {
            //									Document doc =
            // luceneIndex.searcher.doc(luceneIndex.hits.scoreDocs[licount].doc); // get the next
            // document
            //
            //									String pagetitle = doc.get("page_title");
            //									String disamb = doc.get("title_disamb");
            //									if (!((disamb == null) || disamb.equals(""))){
            //										pagetitle = pagetitle + " (" + disamb + ")";
            //									}
            //
            //									System.out.println("lucene hit: " + pagetitle + " score : " +
            // luceneIndex.hits.scoreDocs[licount].score);
            //
            //									//if(luceneIndex.hits.scoreDocs[licount].score < 0.5)
            //									//	continue;
            //
            //									sensewmc.add(pagetitle);
            //									sensewmp.add(new Double(luceneIndex.hits.scoreDocs[licount].score));
            //
            //									if(sensewmc.size() == 3)
            //										break;
            //								}
            //
            //								if(sensewmc.size() > 0){
            //									senses = new LabelSense();
            //
            //									if(sensewmc.size() == 3){
            //										senses.wikiMinerCandidate = new String[3];
            //										senses.wikiMinerProbability = new double[3];
            //									}
            //									else{
            //										senses.wikiMinerCandidate = new String[sensewmc.size()];
            //										senses.wikiMinerProbability = new double[sensewmc.size()];
            //									}
            //								}
            //
            //								for(int scount=0;scount<sensewmc.size();++scount){
            //									senses.wikiMinerCandidate[scount] = sensewmc.elementAt(scount);
            //									senses.wikiMinerProbability[scount] = sensewmp.elementAt(scount);
            //								}
            //							}
            //						}
          }
        } catch (Exception e) {
          e.printStackTrace();
          System.exit(1);
        }
        if (null != senses) {

          //					Vector<String> updatedsensewmc = new Vector();
          //					Vector<Double> updatedsensewmp = new Vector();
          //					for(int x=0;x<senses.wikiMinerCandidate.length;++x){
          //						System.out.println("senses: " + senses.wikiMinerCandidate[x]);
          //						String entity = senses.wikiMinerCandidate[x].replace(" ", "_");
          //						String freebaseid =
          // WikiToFreebaseIDMap.getInstance().getFreeBaseID("\"/wikipedia/en_title/" + entity +
          // "\"");
          //						if(freebaseid != null){
          //							updatedsensewmc.add(senses.wikiMinerCandidate[x]);
          //							updatedsensewmp.add(senses.wikiMinerProbability[x]);
          //						}
          //					}
          //
          //					LabelSense lsense = new LabelSense();
          //
          //					lsense.wikiMinerCandidate = new String[updatedsensewmc.size()];
          //					lsense.wikiMinerProbability = new double[updatedsensewmc.size()];
          //
          //					for(int scount=0;scount<updatedsensewmc.size();++scount){
          //						lsense.wikiMinerCandidate[scount] = updatedsensewmc.elementAt(scount);
          //						lsense.wikiMinerProbability[scount] = updatedsensewmp.elementAt(scount);
          //					}

          Mention new_mention = new Mention();
          new_mention.name = allWords[k];
          new_mention.length = new_mention.name.length();
          new_mention.offset = curr_offset;
          new_mention.context = getContext(curr_offset, new_mention.length, contextSize);
          new_mention.contextAroundMention = getContext(curr_offset, new_mention.length, 10);
          if (k == 0) new_mention.key = mentions.get(i).key;
          // new_mention.senses = lsense;
          new_mention.senses = senses;

          System.out.println("wikiminer candidate for : " + new_mention.name);
          for (int ic = 0; ic < senses.wikiMinerCandidate.length; ++ic) {
            System.out.println(
                "\t" + senses.wikiMinerCandidate[ic] + "  " + senses.wikiMinerProbability[ic]);
          }

          keywords.add(new_mention);
          // System.out.println("new_mention offset + length : " + new_mention.offset + " " +
          // new_mention.length);
          if (!isArticleToken(curr_mention)) {
            for (int j = i;
                j < mentions.size()
                    && mentions.get(j).offset < (new_mention.offset + new_mention.length);
                j++) token_type[j] = 1;
          } else {
            token_type[i] = 2;
          }
          break;
        }
      }

      if (token_type[i] == 0 && !isArticleToken(curr_mention) && isValidToken(curr_mention)) {
        keywords.add(mentions.get(i));
      }
    }
  }
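  // consolidateMentions strategy (a sketch): starting at a keyword, spans of up
  // to maxLength whitespace-delimited words are built (allWords[0..k]) and tried
  // longest-first against the thesaurus; the longest span with senses becomes the
  // new mention, and the keywords it covers are marked consumed (token_type = 1).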
  public void setKeywords(boolean stem) throws Exception {
    tagged_document = tagger.tagString(document);
    ArrayList<String> tokens = new ArrayList<String>();

    // System.out.println("tagged document : " + tagged_document);

    StringTokenizer str = new StringTokenizer(tagged_document);

    while (str.hasMoreTokens()) {
      String token = str.nextToken();
      if (token == null || "".equals(token) || " ".equals(token)) continue;
      // keep non-stopwords and any token tagged as a noun, adjective, or extra tag;
      // adding once avoids the duplicates the earlier double-add produced
      if (!Stopwords.isStopword(token.split("_")[0])
          || noun_tags.contains(token.split("_")[1])
          || adj_tags.contains(token.split("_")[1])
          || extra_tags.contains(token.split("_")[1])) {
        tokens.add(token);
      }
    }

    // if the previous token was a noun, an n-gram (noun clause) could be added
    String prev_tag = null;

    int curr_offset = 0, currbyte = 0;
    for (int i = 0; i < tokens.size(); i++) {
      // System.out.print(" "+tokens.get(i));
      if (tokens.get(i) == null) continue;
      Matcher matcher = pattern.matcher(tokens.get(i));
      matcher.find();
      String word = matcher.group(1);
      String tag = matcher.group(2);

      // System.out.println("word: " + word + " tag: " + tag);

      if (word == null || "".equals(word)) {
        prev_tag = null;
        continue;
      }
      String token = word.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
      if ("".equals(token) || "/".equals(token)) {
        prev_tag = null;
        continue;
      }
      if (!(noun_tags.contains(tag) || adj_tags.contains(tag) || extra_tags.contains(tag))) {
        prev_tag = null;
        continue;
      }
      Mention mention = new Mention();

      if (tag.equals("JJ")) {
        String temp = TestJAWS.getNounForm(token);
        if (temp != null && !"".equals(temp)) {
          mention.key = temp;
          prev_tag = null;
        } else {
          mention.key = token;
          prev_tag = null;
        }
      } else {
        mention.key = token;
      }

      mention.name = word;
      mention.length = word.length();
      curr_offset = document.indexOf(word, curr_offset);

      mention.offset = curr_offset;
      mention.context = getContext(curr_offset, mention.length, contextSize);
      mention.contextAroundMention = getContext(curr_offset, mention.length, 10);
      // StringTokenizer str1 = new StringTokenizer(contextString);
      // while(str1.hasMoreTokens()){
      // String w=str1.nextToken();
      // if (w == null || "".equals(w) || " ".equals(w)) continue;
      // mention.context.add(w);
      // }
      // parseContext(mention);
      // System.out.println("mention.name : " + mention.name + " offset : " + mention.offset);
      keywords.add(mention);
    }
    // System.out.println("Keywords: "+getMentionNames());
    consolidateMentions(6);
    // consolidateMentions(4);
  }
 // for collective training, since ground-truth mentions are already available
 public void setKeywordsTraining(
     HashMap<String, ArrayList<XMLTagInfo>> groundMapWiki,
     HashMap<String, ArrayList<XMLTagInfo>> groundMapManual,
     String file) {
   ArrayList<XMLTagInfo> mapForTrainFile = groundMapWiki.get(file);
   for (int i = 0; i < mapForTrainFile.size(); i++) {
     Mention mention = new Mention();
     mention.key = mapForTrainFile.get(i).mention;
     mention.name = mapForTrainFile.get(i).mention;
     mention.length = mapForTrainFile.get(i).length;
     mention.offset = mapForTrainFile.get(i).offset;
     if (null == mention.name)
       mention.name = document.substring(mention.offset, mention.offset + mention.length);
     if (mention.offset < document.length() - 1) {
       int context_lo = Math.max(0, (mention.offset) - contextSize);
       int context_hi = Math.min(document.length() - 1, (mention.offset) + contextSize);
       String contextString = document.substring(context_lo, context_hi);
       mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
       int con_lo = Math.max(0, mention.offset - 10);
       int con_hi = Math.min(document.length() - 1, mention.offset + 10);
       mention.contextAroundMention =
           document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ");
       mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " ");
       keywords.add(mention);
     }
   }
   if (groundMapManual != null) {
     ArrayList<XMLTagInfo> mapForTrainFile1 = groundMapManual.get(file);
     for (int i = 0; i < mapForTrainFile1.size(); i++) {
       Mention mention = new Mention();
       mention.key = mapForTrainFile1.get(i).mention;
       mention.name = mapForTrainFile1.get(i).mention;
       mention.length = mapForTrainFile1.get(i).mention.length();
       mention.offset = mapForTrainFile1.get(i).offset;
       int context_lo = Math.max(0, mention.offset - contextSize);
       int context_hi = Math.min(document.length() - 1, mention.offset + contextSize);
       String contextString = document.substring(context_lo, context_hi);
       mention.context = contextString.replaceAll("[^0-9a-z\\sA-Z/\\-]", "");
       int con_lo = Math.max(0, mention.offset - 10);
       int con_hi = Math.min(document.length() - 1, mention.offset + 10);
       mention.contextAroundMention =
           document.substring(con_lo, con_hi).replaceAll("[^0-9a-z\\sA-Z]", " ");
       mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-z\\sA-Z]", " ");
       keywords.add(mention);
     }
   }
 }
  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern =
        Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern =
        Pattern.compile(
            "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset)) return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern =
        Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
    else currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
      String sentenceString = sentenceMatcher.group(2);
      List<CoreLabel> words =
          tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

      // FIXING TOKENIZATION PROBLEMS
      for (int i = 0; i < words.size(); i++) {
        CoreLabel w = words.get(i);
        if (i > 0 && w.word().equals("$")) {
          if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
            continue;
          words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
          words.remove(i);
          i--;
        } else if (w.word().equals("\\/")) {
          if (words.get(i - 1).word().equals("</COREF>")) continue;
          w.set(
              CoreAnnotations.TextAnnotation.class,
              words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
          words.remove(i + 1);
          words.remove(i - 1);
        }
      }
      // END FIXING TOKENIZATION PROBLEMS

      List<CoreLabel> sentence = new ArrayList<CoreLabel>();
      // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently
      // open
      Stack<Mention> stack = new Stack<Mention>();
      List<Mention> mentions = new ArrayList<Mention>();

      allWords.add(sentence);
      allGoldMentions.add(mentions);

      for (CoreLabel word : words) {
        String w = word.get(CoreAnnotations.TextAnnotation.class);
        // found regular token: WORD/POS
        if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
          int i = w.lastIndexOf("\\/");
          String w1 = w.substring(0, i);
          // we do NOT set POS info here. We take the POS tags from the parser!
          word.set(CoreAnnotations.TextAnnotation.class, w1);
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
        else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
          Pattern nerPattern = Pattern.compile("<(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          ner = m.group(1);
        }
        // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
        else if (w.startsWith("</") && !w.startsWith("</COREF")) {
          Pattern nerPattern = Pattern.compile("</(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          String ner1 = m.group(1);
          if (ner != null && !ner.equals(ner1))
            throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
          ner = null;
        }
        // found the start SGML tag for a coref mention
        else if (w.startsWith("<COREF")) {
          Mention mention = new Mention();
          // position of this mention in the sentence
          mention.startIndex = sentence.size();

          // extract GOLD info about this coref chain. needed for eval
          Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
          Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

          Matcher m = idPattern.matcher(w);
          m.find();
          mention.mentionID = Integer.parseInt(m.group(1));

          m = refPattern.matcher(w);
          if (m.find()) {
            mention.originalRef = Integer.parseInt(m.group(1));
          }

          // open mention. keep track of all open mentions using the stack
          stack.push(mention);
        }
        // found the end SGML tag for a coref mention
        else if (w.equals("</COREF>")) {
          Mention mention = stack.pop();
          mention.endIndex = sentence.size();

          // this is a closed mention. add it to the final list of mentions
          // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
          // mention.originalRef);
          mentions.add(mention);
        } else {
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
      }
      StringBuilder textContent = new StringBuilder();
      for (int i = 0; i < sentence.size(); i++) {
        CoreLabel w = sentence.get(i);
        w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
        w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
        if (i > 0) textContent.append(" ");
        textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
      }
      CoreMap sentCoreMap = new Annotation(textContent.toString());
      allSentences.add(sentCoreMap);
      sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        idMention.put(m.mentionID, m);
      }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        if (m.goldCorefClusterID == -1) {
          if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
          else {
            int ref = m.originalRef;
            while (true) {
              Mention m2 = idMention.get(ref);
              if (m2.goldCorefClusterID != -1) {
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else if (m2.originalRef == -1) {
                m2.goldCorefClusterID = m2.mentionID;
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else {
                ref = m2.originalRef;
              }
            }
          }
        }
      }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
      throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
      List<CoreLabel> annotatedSent =
          allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> unannotatedSent = allWords.get(i);
      List<Mention> mentionInSent = allGoldMentions.get(i);
      for (Mention m : mentionInSent) {
        m.dependency =
            allSentences
                .get(i)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      }
      if (annotatedSent.size() != unannotatedSent.size()) {
        throw new IllegalStateException("annotatedSent != unannotatedSent");
      }
      for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
        CoreLabel annotatedWord = annotatedSent.get(j);
        CoreLabel unannotatedWord = unannotatedSent.get(j);
        if (!annotatedWord
            .get(CoreAnnotations.TextAnnotation.class)
            .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
          throw new IllegalStateException("annotatedWord != unannotatedWord");
        }
      }
      allWords.set(i, annotatedSent);
      allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // extract predicted mentions
    if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
    else
      allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
  }
 public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
   startTrack("Training");
   // --Variables
   RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
   LinearClassifierFactory<Boolean, Feature> fact =
       new LinearClassifierFactory<Boolean, Feature>();
   // --Feature Extraction
   startTrack("Feature Extraction");
   for (Pair<Document, List<Entity>> datum : trainingData) {
     // (document variables)
     Document doc = datum.getFirst();
     List<Entity> goldClusters = datum.getSecond();
     List<Mention> mentions = doc.getMentions();
     Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
     startTrack("Document " + doc.id);
     // (for each mention...)
     for (int i = 0; i < mentions.size(); i++) {
       // (get the mention and its cluster)
       Mention onPrix = mentions.get(i);
       Entity source = goldEntities.get(onPrix);
       if (source == null) {
         throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
       }
       // (for each previous mention...)
       int oldSize = dataset.size();
       for (int j = i - 1; j >= 0; j--) {
         // (get previous mention and its cluster)
         Mention cand = mentions.get(j);
         Entity target = goldEntities.get(cand);
         if (target == null) {
           throw new IllegalArgumentException("Mention has no gold entity: " + cand);
         }
         // (extract features)
         Counter<Feature> feats =
             extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
         // (add datum)
         dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
          // (stop once the correct antecedent has been reached)
         if (target == source) {
           break;
         }
       }
       // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
     }
     endTrack("Document " + doc.id);
   }
   endTrack("Feature Extraction");
   // --Train Classifier
   startTrack("Minimizer");
   this.classifier = fact.trainClassifier(dataset);
   endTrack("Minimizer");
   // --Dump Weights
   startTrack("Features");
   // (get labels to print)
   Set<Boolean> labels = new HashSet<Boolean>();
   labels.add(true);
   // (print features)
   for (Triple<Feature, Boolean, Double> featureInfo :
       this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
     Feature feature = featureInfo.first();
     Boolean label = featureInfo.second();
     Double magnitude = featureInfo.third();
     // log(FORCE,new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
   }
   end_Track("Features");
   endTrack("Training");
 }
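  // End-to-end usage sketch (hypothetical driver; trainingData/testDocs loading elided):
  //
  //   train(trainingData);                              // fit the pairwise classifier
  //   for (Document doc : testDocs) {                   // then decode each test document
  //     List<ClusteredMention> clusters = runCoreference(doc);
  //   }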
  /** Extract gold coref link information */
  protected void extractGoldLinks() {
    //    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
      for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
        Mention m = goldOrderedMentionsBySentence.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positions.put(id, pos);
        antecedents.put(id, new ArrayList<IntTuple>());
      }
    }

    //    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positions.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positions.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation
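          // (if the annotated antecedent appears after this mention in textual
          //  order, swap the originalRef pointers so links always point backward)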
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) break;
            dst = positions.get(m.originalRef);
          }
          if (m.originalRef < 0) continue;

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
              if (k == dst.get(0) && l < dst.get(1)) continue;
              if (k == src.get(0) && l > src.get(1)) break;
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, l);
              if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
                antecedents.get(id).add(missed);
                links.add(new Pair<IntTuple, IntTuple>(src, missed));
              }
            }
          }

          links.add(new Pair<IntTuple, IntTuple>(src, dst));

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          List<IntTuple> ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            links.add(new Pair<IntTuple, IntTuple>(src, ant));
          }
        }
      }
    }
    goldLinks = links;
  }
        private <E> Feature feature(
            Class<E> clazz, Pair<Mention, ClusteredMention> input, Option<Double> count) {

          // --Variables
          Mention onPrix =
              input.getFirst(); // the first mention (referred to as m_i in the handout)
          Mention candidate =
              input.getSecond().mention; // the second mention (referred to as m_j in the handout)
          Entity candidateCluster =
              input.getSecond().entity; // the cluster containing the second mention

          // --Features
          if (clazz.equals(Feature.ExactMatch.class)) {
            // (exact string match)
            return new Feature.ExactMatch(onPrix.gloss().equals(candidate.gloss()));
          } else if (clazz.equals(Feature.SentenceDist.class)) {
            return new Feature.SentenceDist(
                Math.abs(
                    onPrix.doc.indexOfMention(onPrix) - candidate.doc.indexOfMention(candidate)));
          } else if (clazz.equals(Feature.MentionDist.class)) {
            return new Feature.MentionDist(
                Math.abs(
                    onPrix.doc.indexOfSentence(onPrix.sentence)
                        - candidate.doc.indexOfSentence(candidate.sentence)));
          } else if (clazz.equals(Feature.EitherHeadWordPronoun.class)) {
            return new Feature.EitherHeadWordPronoun(
                Pronoun.isSomePronoun(onPrix.gloss()) || Pronoun.isSomePronoun(candidate.gloss()));
          } else if (clazz.equals(Feature.CandidateNERTag.class)) {
            return new Feature.CandidateNERTag(candidate.headToken().nerTag());
          } else if (clazz.equals(Feature.CandidateSpeaker.class)) {
            return new Feature.CandidateSpeaker(candidate.headToken().speaker());
          } else if (clazz.equals(Feature.FixedSpeaker.class)) {
            return new Feature.FixedSpeaker(onPrix.headToken().speaker());
          } else if (clazz.equals(Feature.HeadWordMatch.class)) {
            return new Feature.HeadWordMatch(onPrix.headWord().equals(candidate.headWord()));
          } else if (clazz.equals(Feature.HeadWordLemmaMatch.class)) {
            return new Feature.HeadWordLemmaMatch(
                onPrix.headToken().lemma().equals(candidate.headToken().lemma()));
          } else if (clazz.equals(Feature.FixedNERTag.class)) {
            return new Feature.FixedNERTag(onPrix.headToken().nerTag());
          } else if (clazz.equals(Feature.SpeakerMatch.class)) {
            return new Feature.SpeakerMatch(
                candidate.headToken().speaker().equals(onPrix.headToken().speaker()));
          } else if (clazz.equals(Feature.NERTagMatch.class)) {
            return new Feature.NERTagMatch(
                candidate.headToken().nerTag().equals(onPrix.headToken().nerTag()));
          } else if (clazz.equals(Feature.CandidatePOSTag.class)) {
            return new Feature.CandidatePOSTag(candidate.headToken().posTag());
          } else if (clazz.equals(Feature.FixedPOSTag.class)) {
            return new Feature.FixedPOSTag(onPrix.headToken().posTag());
          } else if (clazz.equals(Feature.GenderMatch.class)) {
            Pair<Boolean, Boolean> match = Util.haveGenderAndAreSameGender(onPrix, candidate);
            boolean finalMatch = (!match.getFirst() || match.getSecond());
            return new Feature.GenderMatch(finalMatch);
          } else if (clazz.equals(Feature.NumberMatch.class)) {
            Pair<Boolean, Boolean> match = Util.haveNumberAndAreSameNumber(onPrix, candidate);
            boolean finalMatch = (!match.getFirst() || match.getSecond());
            return new Feature.NumberMatch(finalMatch);
          }
          //			} else if(clazz.equals(Feature.NewFeature.class) {
          /*
           * TODO: Add features to return for specific classes. Implement calculating values of features here.
           */

          else {
            throw new IllegalArgumentException("Unregistered feature: " + clazz);
          }
        }
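        // Sketch of extending feature(...) with a new branch (Feature.NewFeature
        // and computeValue are hypothetical names):
        //
        //   } else if (clazz.equals(Feature.NewFeature.class)) {
        //     return new Feature.NewFeature(computeValue(onPrix, candidate));
        //   }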
  /** Process discourse information */
  protected void processDiscourse(Dictionaries dict) {
    docType = findDocType(dict);
    markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false);
    findSpeakers(dict);

    // find 'speaker mention' for each mention
    for (Mention m : allPredictedMentions.values()) {
      int utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
      String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
      if (speaker != null) {
        // Populate speaker info
        SpeakerInfo speakerInfo = speakerInfoMap.get(speaker);
        if (speakerInfo == null) {
          speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker));
          // span indicates this is the speaker
          if (Rules.mentionMatchesSpeaker(m, speakerInfo, true)) {
            m.speakerInfo = speakerInfo;
          }
        }

        if (NumberMatchingRegex.isDecimalInteger(speaker)) {
          try {
            int speakerMentionID = Integer.parseInt(speaker);
            if (utter != 0) {
              // Add pairs of mention id and the mention id of the speaker
              speakerPairs.add(new Pair<Integer, Integer>(m.mentionID, speakerMentionID));
              //              speakerPairs.add(new Pair<Integer, Integer>(speakerMentionID,
              // m.mentionID));
            }
          } catch (Exception e) {
            // no mention found for the speaker
            // nothing to do
          }
        }
      }
      // set generic 'you' : e.g., you know in conversation
      if (docType != DocType.ARTICLE
          && m.person == Person.YOU
          && m.endIndex < m.sentenceWords.size() - 1
          && m.sentenceWords
              .get(m.endIndex)
              .get(CoreAnnotations.TextAnnotation.class)
              .equalsIgnoreCase("know")) {
        m.generic = true;
      }
    }
    // now that we have identified the speakers, first pass to check if mentions should cluster with
    // the speakers
    for (Mention m : allPredictedMentions.values()) {
      if (m.speakerInfo == null) {
        for (SpeakerInfo speakerInfo : speakerInfoMap.values()) {
          if (speakerInfo.hasRealSpeakerName()) {
            // do loose match - assumes there aren't many speakers...
            if (Rules.mentionMatchesSpeaker(m, speakerInfo, false)) {
              m.speakerInfo = speakerInfo;
              break;
            }
          }
        }
      }
    }
  }