/** Mark twin mentions: All mention boundaries should be matched */
  private void findTwinMentionsStrict() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
      List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
      List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

      // For CoNLL training there are some documents with gold mentions with the same position
      // offsets
      // See
      // /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
      //  (Packwood - Roth)
      CollectionValuedMap<IntPair, Mention> goldMentionPositions =
          new CollectionValuedMap<IntPair, Mention>();
      for (Mention g : golds) {
        IntPair ip = new IntPair(g.startIndex, g.endIndex);
        if (goldMentionPositions.containsKey(ip)) {
          StringBuilder existingMentions = new StringBuilder();
          for (Mention eg : goldMentionPositions.get(ip)) {
            if (existingMentions.length() > 0) {
              existingMentions.append(",");
            }
            existingMentions.append(eg.mentionID);
          }
          SieveCoreferenceSystem.logger.warning(
              "WARNING: gold mentions with the same offsets: "
                  + ip
                  + " mentions="
                  + g.mentionID
                  + ","
                  + existingMentions
                  + ", "
                  + g.spanToString());
        }
        // assert(!goldMentionPositions.containsKey(ip));
        goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
      }
      for (Mention p : predicts) {
        IntPair pos = new IntPair(p.startIndex, p.endIndex);
        if (goldMentionPositions.containsKey(pos)) {
          Collection<Mention> cm = goldMentionPositions.get(pos);
          Mention g = cm.iterator().next();
          cm.remove(g);
          p.mentionID = g.mentionID;
          p.twinless = false;
          g.twinless = false;
        }
      }
      // temp: for making easy to recognize twinless mention
      for (Mention p : predicts) {
        if (p.twinless) p.mentionID += 10000;
      }
    }
  }
  public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
    List<CoreMap> sentences =
        conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();
    for (int i = 0; i < sentences.size(); i++) {
      allGoldMentions.add(new ArrayList<Mention>());
    }
    int maxCorefClusterId = -1;
    for (String corefIdStr : corefChainMap.keySet()) {
      int id = Integer.parseInt(corefIdStr);
      if (id > maxCorefClusterId) {
        maxCorefClusterId = id;
      }
    }
    int newMentionID = maxCorefClusterId + 1;
    for (String corefIdStr : corefChainMap.keySet()) {
      int id = Integer.parseInt(corefIdStr);
      int clusterMentionCnt = 0;
      for (CoreMap m : corefChainMap.get(corefIdStr)) {
        clusterMentionCnt++;
        Mention mention = new Mention();

        mention.goldCorefClusterID = id;
        if (clusterMentionCnt == 1) {
          // First mention in cluster
          mention.mentionID = id;
          mention.originalRef = -1;
        } else {
          mention.mentionID = newMentionID;
          mention.originalRef = id;
          newMentionID++;
        }
        if (maxID < mention.mentionID) maxID = mention.mentionID;
        int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
        CoreMap sent = sentences.get(sentIndex);
        mention.startIndex =
            m.get(CoreAnnotations.TokenBeginAnnotation.class)
                - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
        mention.endIndex =
            m.get(CoreAnnotations.TokenEndAnnotation.class)
                - sent.get(CoreAnnotations.TokenBeginAnnotation.class);

        // will be set by arrange
        mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);

        // Mention dependency is collapsed dependency for sentence
        mention.dependency =
            sentences
                .get(sentIndex)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);

        allGoldMentions.get(sentIndex).add(mention);
      }
    }
    return allGoldMentions;
  }
Example #3
0
  /**
   * Load a collection of parse trees from the file of given name. Each tree may optionally be
   * encased in parens to allow for Penn Treebank style trees. This methods implements the <code>
   * FileProcessor</code> interface.
   *
   * @param file file to load a tree from
   */
  public void processFile(File file) {
    TreeReader tr = null;

    // SRL stuff
    CollectionValuedMap<Integer, String> srlMap = null;
    if (this.srlMap != null) {
      // there must be a better way ...
      String filename = file.getAbsolutePath();
      for (String suffix : this.srlMap.keySet()) {
        if (filename.endsWith(suffix)) {
          srlMap = this.srlMap.get(suffix);
          break;
        }
      }
      if (srlMap == null) {
        System.err.println("could not find SRL entries for file: " + file);
      }
    }

    try {
      // maybe print file name to stdout to get some feedback
      if (PRINT_FILENAMES) {
        System.err.println(file);
      }
      // could throw an IO exception if can't open for reading
      tr =
          treeReaderFactory()
              .newTreeReader(
                  new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding())));
      int sentIndex = 0;
      Tree pt;
      while ((pt = tr.readTree()) != null) {
        if (pt.label() instanceof HasIndex) { // so we can trace where this tree came from
          HasIndex hi = (HasIndex) pt.label();
          hi.setDocID(file.getName());
          hi.setSentIndex(sentIndex);
        }
        if (srlMap == null) {
          parseTrees.add(pt);
        } else {
          Collection<String> srls = srlMap.get(sentIndex);
          //           pt.pennPrint();
          //           System.err.println(srls);
          parseTrees.add(pt);
          if (srls.isEmpty()) {
            //            parseTrees.add(pt);
          } else {
            for (String srl : srls) {
              //              Tree t = pt.deepCopy();
              String[] bits = srl.split("\\s+");
              int verbIndex = Integer.parseInt(bits[0]);
              String lemma = bits[2].split("\\.")[0];
              //              Tree verb = Trees.getTerminal(t, verbIndex);
              Tree verb = Trees.getTerminal(pt, verbIndex);
              //              ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL);
              ((CoreLabel) verb.label()).set(CoreAnnotations.CoNLLPredicateAnnotation.class, true);
              for (int i = 4; i < bits.length; i++) {
                String arg = bits[i];
                String[] bits1;
                if (arg.indexOf("ARGM") >= 0) {
                  bits1 = arg.split("-");
                } else {
                  bits1 = arg.split("-");
                }
                String locs = bits1[0];
                String argType = bits1[1];
                if (argType.equals("rel")) {
                  continue;
                }
                for (String loc : locs.split("[*,]")) {
                  bits1 = loc.split(":");
                  int term = Integer.parseInt(bits1[0]);
                  int height = Integer.parseInt(bits1[1]);
                  //                  Tree t1 = Trees.getPreTerminal(t, term);
                  Tree t1 = Trees.getPreTerminal(pt, term);
                  for (int j = 0; j < height; j++) {
                    //                    t1 = t1.parent(t);
                    t1 = t1.parent(pt);
                  }
                  Map<Integer, String> roleMap =
                      ((CoreLabel) t1.label()).get(CoreAnnotations.CoNLLSRLAnnotation.class);
                  if (roleMap == null) {
                    roleMap = new HashMap<Integer, String>();
                    ((CoreLabel) t1.label()).set(CoreAnnotations.CoNLLSRLAnnotation.class, roleMap);
                  }
                  roleMap.put(verbIndex, argType);
                  //                  ((CoreLabel)t1.label()).set(SRLIDAnnotation.class,
                  // SRL_ID.ARG);
                }
              }
              //               for (Tree t1 : t) {
              //                 if (t1.isLeaf()) { continue; }
              //                 CoreLabel fl = (CoreLabel)t1.label();
              //                 if (fl.value() == null) { continue; }
              //                 if (!fl.has(SRLIDAnnotation.class)) {
              //                   boolean allNone = true;
              //                   for (Tree t2 : t1) {
              //                     SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class);
              //                     if (s == SRL_ID.ARG || s == SRL_ID.REL) {
              //                       allNone = false;
              //                       break;
              //                     }
              //                   }
              //                   if (allNone) {
              //                     fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO);
              //                   } else {
              //                     fl.set(SRLIDAnnotation.class, SRL_ID.NO);
              //                   }
              //                 }
              //               }
              //              parseTrees.add(t);
            }
          }
        }

        sentIndex++;
      }
    } catch (IOException e) {
      System.err.println("loadTree IO Exception: " + e + " in file " + file);
    } finally {
      try {
        if (tr != null) {
          tr.close(); // important: closes file even if error!
        }
      } catch (IOException e) {
        // do nothin'
      }
    }
  }