Example #1
  public static final String doCorefResolution(Annotation annotation) {

    Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> resolved = new ArrayList<>();
    for (CoreMap sentence : sentences) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel token : tokens) {
        Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
        CorefChain chain = corefs.get(corefClustId);
        if (chain == null) {
          resolved.add(token.word());
        } else {
          // Substitute the chain's representative mention for this token,
          // unless the token is itself part of the representative mention.
          CorefMention reprMent = chain.getRepresentativeMention();
          int sentIdx = reprMent.sentNum - 1; // sentNum is 1-based
          CoreMap corefSentence = sentences.get(sentIdx);
          List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
          if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
            for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
              CoreLabel matchedLabel = corefSentenceTokens.get(i - 1); // token indices are 1-based
              resolved.add(matchedLabel.word());
            }
          } else {
            resolved.add(token.word());
          }
        }
      }
    }
    StringBuilder resolvedStr = new StringBuilder();
    for (String str : resolved) {
      resolvedStr.append(str).append(' ');
    }
    System.out.println(resolvedStr);

    return resolvedStr.toString();
  }
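
A minimal sketch of the pipeline setup doCorefResolution expects; the annotator list mirrors the dcoref example further below, and the input sentence is illustrative:

    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation annotation = new Annotation("John bought a car. He is happy with it.");
    pipeline.annotate(annotation);
    String resolvedText = doCorefResolution(annotation);
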
  public static void main(String[] args) throws IOException {

    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example,
       this demo file shows two ways to process the output, for teaching
       purposes.  For the file, it shows both how to run NER on a String
       and how to run it on a whole file.  For the hard-coded String,
       it shows how to run it on a single sentence, and how to do this
       and produce an inline XML output format.
    */
    if (args.length > 1) {
      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

    } else {
      String s1 = "Good afternoon Rajat Raina, how are you today?";
      String s2 = "I go to school at Stanford University, which is located in California.";
      System.out.println(classifier.classifyToString(s1));
      System.out.println(classifier.classifyWithInlineXML(s2));
      System.out.println(classifier.classifyToString(s2, "xml", true));
      int i = 0;
      for (List<CoreLabel> lcl : classifier.classify(s2)) {
        for (CoreLabel cl : lcl) {
          System.out.println(i++ + ":");
          System.out.println(cl);
        }
      }
    }
  }
Example #3
  private LinkedHashMap<LinkedHashMap<Integer, String>, String> identifyNER(String text) {

    LinkedHashMap<LinkedHashMap<Integer, String>, String> map = new LinkedHashMap<>();

    String serializedClassifier =
        "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";

    CRFClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions(serializedClassifier);
    List<List<CoreLabel>> classify = classifier.classify(text);
    for (List<CoreLabel> coreLabels : classify) {
      for (CoreLabel coreLabel : coreLabels) {

        String word = coreLabel.word();
        int index = coreLabel.index();
        String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
        if (!"O".equals(category)) {
          // Record the entity token as (index -> word), keyed to its NER category.
          LinkedHashMap<Integer, String> entry = new LinkedHashMap<>();
          entry.put(index, word);
          map.put(entry, category);
          System.out.println(word + ":" + category);
        }
      }
    }
    return map;
  }
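
A hypothetical call site for identifyNER, iterating the returned map; the input sentence and variable names are illustrative:

    LinkedHashMap<LinkedHashMap<Integer, String>, String> entities =
        identifyNER("Stanford University is located in California.");
    for (Map.Entry<LinkedHashMap<Integer, String>, String> e : entities.entrySet()) {
      // key: token index -> token text; value: NER category
      System.out.println(e.getKey() + " -> " + e.getValue());
    }
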
Example #4
 @Test
 public void testCorp() {
   // We test a 2x2 design: {strict, regular} x {no following context, following context}
   for (int sent = 0; sent < 4; sent++) {
     PTBTokenizer<CoreLabel> ptbTokenizer =
         new PTBTokenizer<>(
             new StringReader(corpInputs[sent / 2]),
             new CoreLabelTokenFactory(),
             (sent % 2 == 0) ? "strictTreebank3" : "");
     int i = 0;
     while (ptbTokenizer.hasNext()) {
       CoreLabel w = ptbTokenizer.next();
       try {
         assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
       } catch (ArrayIndexOutOfBoundsException aioobe) {
         // the assertion below outside the loop will fail
       }
       i++;
     }
     if (i != corpGold[sent % 2].length) {
       System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
       List<CoreLabel> tokens =
           new PTBTokenizer<>(
                   new StringReader(corpInputs[sent / 2]),
                   new CoreLabelTokenFactory(),
                   (sent % 2 == 0) ? "strictTreebank3" : "")
               .tokenize();
       System.out.println("Guess: " + SentenceUtils.listToString(tokens));
       System.out.flush();
     }
     assertEquals("PTBTokenizer num tokens problem", i, corpGold[sent % 2].length);
   }
 }
 public static void saveCoNLL(
     PrintStream os, List<List<CoreLabel>> sentences, boolean alreadyBIO) {
   os.println("-DOCSTART- -X- O\n");
   for (List<CoreLabel> sent : sentences) {
     String prev = null;
     for (CoreLabel word : sent) {
       String w = word.word().replaceAll("[ \t\n]+", "_");
       String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
       String l = word.get(CoreAnnotations.AnswerAnnotation.class);
       String nl = l;
       if (!alreadyBIO && !l.equals("O")) {
         if (prev != null && l.equals(prev)) nl = "I-" + l;
         else nl = "B-" + l;
       }
       String line = w + ' ' + t + ' ' + nl;
       String[] toks = line.split("[ \t\n]+");
       if (toks.length != 3) {
         throw new RuntimeException("INVALID LINE: \"" + line + '"');
       }
       os.printf("%s %s %s\n", w, t, nl);
       prev = l;
     }
     os.println();
   }
 }
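
A hedged driver for saveCoNLL, with illustrative file names. Note that the method reads both PartOfSpeechAnnotation and AnswerAnnotation from each token; an NER classifier fills only the latter, so a POS tagger is assumed to have run as well:

    List<List<CoreLabel>> sentences = classifier.classify(IOUtils.slurpFile("input.txt"));
    try (PrintStream os = new PrintStream(new FileOutputStream("output.conll"))) {
      saveCoNLL(os, sentences, false);
    }
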
  /**
   * Get the text value of this entity. The headTokenSpan MUST be set before calling this method!
   */
  public String getValue() {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    // int lastEnd = -1;
    StringBuilder sb = new StringBuilder();
    for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i++) {
      CoreLabel token = tokens.get(i);

      // we are not guaranteed to have CharacterOffsets so we can't use them...
      /*
      Integer start = token.get(CharacterOffsetBeginAnnotation.class);
      Integer end = token.get(CharacterOffsetEndAnnotation.class);

      if (start != null && end != null) {
        if (lastEnd != -1 && !start.equals(lastEnd)) {
          sb.append(StringUtils.repeat(" ", start - lastEnd));
          lastEnd = end;
        }
      } else {
        if (lastEnd != -1) sb.append(" ");
        lastEnd = 0;
      }
        */
      if (i > headTokenSpan.start()) sb.append(" ");

      sb.append(token.word());
    }

    return sb.toString();
  }
 @Override
 public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
   for (CoreLabel wi : doc) {
     String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
     String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
     out.println(wi.word() + "\t" + goldAnswer + "\t" + answer);
   }
   out.println();
 }
 public String getExtentString() {
   List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
   StringBuilder sb = new StringBuilder();
   for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i++) {
     CoreLabel token = tokens.get(i);
     if (i > extentTokenSpan.start()) sb.append(" ");
     sb.append(token.word());
   }
   return sb.toString();
 }
  @Override
  protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
    Collection<String> features = super.featuresCpC(cInfo, loc);

    CoreLabel c = cInfo.get(loc);

    // "Wrapper" feature: identity of first and last two chars of the current word.
    // This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive
    // pronouns if the word starts with al-.
    if (c.word().length() > 3) {
      String start = c.word().substring(0, 2);
      String end = c.word().substring(c.word().length() - 2);
      if (c.index() == 2) {
        features.add(start + "_" + end + "-begin-wrap");
      }
      if (c.index() == c.word().length() - 1) {
        features.add(start + "_" + end + "-end-wrap");
      }
    }

    return features;
  }
  public static void saveCoNLLFiles(
      String dir, Annotation dataset, boolean useSubTypes, boolean alreadyBIO) throws IOException {
    List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    String docid = null;
    PrintStream os = null;
    for (CoreMap sentence : sentences) {
      String myDocid = sentence.get(CoreAnnotations.DocIDAnnotation.class);
      if (docid == null || !myDocid.equals(docid)) {
        if (os != null) {
          os.close();
        }
        docid = myDocid;
        os = new PrintStream(new FileOutputStream(dir + File.separator + docid + ".conll"));
      }
      List<CoreLabel> labeledSentence =
          AnnotationUtils.sentenceEntityMentionsToCoreLabels(
              sentence, true, null, null, useSubTypes, alreadyBIO);
      assert (labeledSentence != null);

      String prev = null;
      for (CoreLabel word : labeledSentence) {
        String w = word.word().replaceAll("[ \t\n]+", "_");
        String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String l = word.get(CoreAnnotations.AnswerAnnotation.class);
        String nl = l;
        if (!alreadyBIO && !l.equals("O")) {
          if (prev != null && l.equals(prev)) nl = "I-" + l;
          else nl = "B-" + l;
        }
        String line = w + ' ' + t + ' ' + nl;
        String[] toks = line.split("[ \t\n]+");
        if (toks.length != 3) {
          throw new RuntimeException("INVALID LINE: \"" + line + '"');
        }
        os.printf("%s %s %s\n", w, t, nl);
        prev = l;
      }
      os.println();
    }
    if (os != null) {
      os.close();
    }
  }
Example #11
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) return null;

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    PTBTokenizer<CoreLabel> ptbt =
        new PTBTokenizer<>(textInput, new CoreLabelTokenFactory(), "");

    while (ptbt.hasNext()) {
      CoreLabel label = ptbt.next();
      if (label.value().length() > 2) {
        System.err.println(label.toString());
        Tuple termText = tupleFactory.newTuple(label.word());
        bagOfTokens.add(termText);
      }
    }
    return bagOfTokens;
  }
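
exec is a Pig UDF; below is a hypothetical direct invocation from Java for testing (run inside a method declared to throw IOException). The enclosing UDF class name is assumed, and Tuple/TupleFactory/DataBag come from org.apache.pig.data:

    Tuple input = TupleFactory.getInstance().newTuple(1);
    input.set(0, "Stanford University is located in California.");
    DataBag tokens = new TokenizeTextUDF().exec(input); // TokenizeTextUDF: assumed class name
    for (Tuple t : tokens) {
      System.out.println(t.get(0));
    }
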
Example #12
  private void test() throws IOException, ClassCastException, ClassNotFoundException {

    String serializedClassifier =
        "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier(serializedClassifier);

    // read some text in the text variable
    String text = "What is the timezone in San Pedro de Atacama?";

    // String text = "He ate the apple";
    List<List<CoreLabel>> out = classifier.classify(text);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(
            word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }
  }
  /**
   * transformTree does all language-specific tree transformations. Any parameterizations should be
   * inside the specific TreebankLangParserParams class.
   */
  @Override
  public Tree transformTree(Tree t, Tree root) {
    if (t == null || t.isLeaf()) {
      return t;
    }

    String parentStr;
    String grandParentStr;
    Tree parent;
    Tree grandParent;
    if (root == null || t.equals(root)) {
      parent = null;
      parentStr = "";
    } else {
      parent = t.parent(root);
      parentStr = parent.label().value();
    }
    if (parent == null || parent.equals(root)) {
      grandParent = null;
      grandParentStr = "";
    } else {
      grandParent = parent.parent(root);
      grandParentStr = grandParent.label().value();
    }

    String baseParentStr = ctlp.basicCategory(parentStr);
    String baseGrandParentStr = ctlp.basicCategory(grandParentStr);

    CoreLabel lab = (CoreLabel) t.label();
    String word = lab.word();
    String tag = lab.tag();
    String baseTag = ctlp.basicCategory(tag);
    String category = lab.value();
    String baseCategory = ctlp.basicCategory(category);

    if (t.isPreTerminal()) { // it's a POS tag
      List<String> leftAunts =
          listBasicCategories(SisterAnnotationStats.leftSisterLabels(parent, grandParent));
      List<String> rightAunts =
          listBasicCategories(SisterAnnotationStats.rightSisterLabels(parent, grandParent));

      // Chinese-specific punctuation splits
      if (chineseSplitPunct && baseTag.equals("PU")) {
        if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word)) {
          tag = tag + "-DOU";
          // System.out.println("Punct: Split dou hao"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseCommaAcceptFilter().accept(word)) {
          tag = tag + "-COMMA";
          // System.out.println("Punct: Split comma"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseColonAcceptFilter().accept(word)) {
          tag = tag + "-COLON";
          // System.out.println("Punct: Split colon"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseQuoteMarkAcceptFilter().accept(word)) {
          if (chineseSplitPunctLR) {
            if (ChineseTreebankLanguagePack.chineseLeftQuoteMarkAcceptFilter().accept(word)) {
              tag += "-LQUOTE";
            } else {
              tag += "-RQUOTE";
            }
          } else {
            tag = tag + "-QUOTE";
          }
          // System.out.println("Punct: Split quote"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseEndSentenceAcceptFilter().accept(word)) {
          tag = tag + "-ENDSENT";
          // System.out.println("Punct: Split end sent"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseParenthesisAcceptFilter().accept(word)) {
          if (chineseSplitPunctLR) {
            if (ChineseTreebankLanguagePack.chineseLeftParenthesisAcceptFilter().accept(word)) {
              tag += "-LPAREN";
            } else {
              tag += "-RPAREN";
            }
          } else {
            tag += "-PAREN";
            // printlnErr("Just used -PAREN annotation");
            // printlnErr(word);
            // throw new RuntimeException();
          }
          // System.out.println("Punct: Split paren"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseDashAcceptFilter().accept(word)) {
          tag = tag + "-DASH";
          // System.out.println("Punct: Split dash"); // debugging
        } else if (ChineseTreebankLanguagePack.chineseOtherAcceptFilter().accept(word)) {
          tag = tag + "-OTHER";
        } else {
          printlnErr("Unknown punct (you should add it to CTLP): " + tag + " |" + word + "|");
        }
      } else if (chineseSplitDouHao) { // only split DouHao
        if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().accept(word)
            && baseTag.equals("PU")) {
          tag = tag + "-DOU";
        }
      }

      // Chinese-specific POS tag splits (non-punctuation)

      if (tagWordSize) {
        int l = word.length();
        tag += "-" + l + "CHARS";
      }

      if (mergeNNVV && baseTag.equals("NN")) {
        tag = "VV";
      }

      if ((chineseSelectiveTagPA || chineseVerySelectiveTagPA)
          && (baseTag.equals("CC") || baseTag.equals("P"))) {
        tag += "-" + baseParentStr;
      }
      if (chineseSelectiveTagPA && (baseTag.equals("VV"))) {
        tag += "-" + baseParentStr;
      }

      if (markMultiNtag && tag.startsWith("N")) {
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("N") && parent.children()[i] != t) {
            tag += "=N";
            // System.out.println("Found multi=N rewrite");
          }
        }
      }

      if (markVVsisterIP && baseTag.equals("VV")) {
        boolean seenIP = false;
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("IP")) {
            seenIP = true;
          }
        }
        if (seenIP) {
          tag += "-IP";
          // System.out.println("Found VV with IP sister"); // testing
        }
      }

      if (markPsisterIP && baseTag.equals("P")) {
        boolean seenIP = false;
        for (int i = 0; i < parent.numChildren(); i++) {
          if (parent.children()[i].label().value().startsWith("IP")) {
            seenIP = true;
          }
        }
        if (seenIP) {
          tag += "-IP";
        }
      }

      if (markADgrandchildOfIP && baseTag.equals("AD") && baseGrandParentStr.equals("IP")) {
        tag += "~IP";
        // System.out.println("Found AD with IP grandparent"); // testing
      }

      if (gpaAD && baseTag.equals("AD")) {
        tag += "~" + baseGrandParentStr;
        // System.out.println("Found AD with grandparent " + grandParentStr); // testing
      }

      if (markPostverbalP && leftAunts.contains("VV") && baseTag.equals("P")) {
        // System.out.println("Found post-verbal P");
        tag += "^=lVV";
      }

      // end Chinese-specific tag splits

      Label label = new CategoryWordTag(tag, word, tag);
      t.setLabel(label);
    } else {
      // it's a phrasal category
      Tree[] kids = t.children();

      // Chinese-specific category splits
      List<String> leftSis = listBasicCategories(SisterAnnotationStats.leftSisterLabels(t, parent));
      List<String> rightSis =
          listBasicCategories(SisterAnnotationStats.rightSisterLabels(t, parent));

      if (paRootDtr && baseParentStr.equals("ROOT")) {
        category += "^ROOT";
      }

      if (markIPsisterBA && baseCategory.equals("IP")) {
        if (leftSis.contains("BA")) {
          category += "=BA";
          // System.out.println("Found IP sister of BA");
        }
      }

      if (dominatesV && hasV(t.preTerminalYield())) {
        // mark categories containing a verb
        category += "-v";
      }

      if (markIPsisterVVorP && baseCategory.equals("IP")) {
        // todo: cdm: is just looking for "P" here selective enough??
        if (leftSis.contains("VV") || leftSis.contains("P")) {
          category += "=VVP";
        }
      }

      if (markIPsisDEC && baseCategory.equals("IP")) {
        if (rightSis.contains("DEC")) {
          category += "=DEC";
          // System.out.println("Found prenominal IP");
        }
      }

      if (baseCategory.equals("VP")) {
        // cdm 2008: this used to just check that it startsWith("VP"), but
        // I think that was bad because it also matched VPT verb compounds
        if (chineseSplitVP == 3) {
          boolean hasCC = false;
          boolean hasPU = false;
          boolean hasLexV = false;
          for (Tree kid : kids) {
            if (kid.label().value().startsWith("CC")) {
              hasCC = true;
            } else if (kid.label().value().startsWith("PU")) {
              hasPU = true;
            } else if (StringUtils.lookingAt(
                kid.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
              hasLexV = true;
            }
          }
          if (hasCC || (hasPU && !hasLexV)) {
            category += "-CRD";
            // System.out.println("Found coordinate VP"); // testing
          } else if (hasLexV) {
            category += "-COMP";
            // System.out.println("Found complementing VP"); // testing
          } else {
            category += "-ADJT";
            // System.out.println("Found adjoining VP"); // testing
          }
        } else if (chineseSplitVP >= 1) {
          boolean hasBA = false;
          for (Tree kid : kids) {
            if (kid.label().value().startsWith("BA")) {
              hasBA = true;
            } else if (chineseSplitVP == 2 && tlp.basicCategory(kid.label().value()).equals("VP")) {
              for (Tree kidkid : kid.children()) {
                if (kidkid.label().value().startsWith("BA")) {
                  hasBA = true;
                }
              }
            }
          }
          if (hasBA) {
            category += "-BA";
          }
        }
      }

      if (markVPadjunct && baseParentStr.equals("VP")) {
        // cdm 2008: This used to use startsWith("VP") but changed to baseCat
        Tree[] sisters = parent.children();
        boolean hasVPsister = false;
        boolean hasCC = false;
        boolean hasPU = false;
        boolean hasLexV = false;
        for (Tree sister : sisters) {
          if (tlp.basicCategory(sister.label().value()).equals("VP")) {
            hasVPsister = true;
          }
          if (sister.label().value().startsWith("CC")) {
            hasCC = true;
          }
          if (sister.label().value().startsWith("PU")) {
            hasPU = true;
          }
          if (StringUtils.lookingAt(sister.label().value(), "(V[ACEV]|VCD|VCP|VNV|VPT|VRD|VSB)")) {
            hasLexV = true;
          }
        }
        if (hasVPsister && !(hasCC || hasPU || hasLexV)) {
          category += "-VPADJ";
          // System.out.println("Found adjunct of VP"); // testing
        }
      }

      if (markNPmodNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.contains("NP")) {
          category += "=MODIFIERNP";
          // System.out.println("Found NP modifier of NP"); // testing
        }
      }

      if (markModifiedNP && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.isEmpty()
            && (leftSis.contains("ADJP")
                || leftSis.contains("NP")
                || leftSis.contains("DNP")
                || leftSis.contains("QP")
                || leftSis.contains("CP")
                || leftSis.contains("PP"))) {
          category += "=MODIFIEDNP";
          // System.out.println("Found modified NP"); // testing
        }
      }

      if (markNPconj && baseCategory.equals("NP") && baseParentStr.equals("NP")) {
        if (rightSis.contains("CC")
            || rightSis.contains("PU")
            || leftSis.contains("CC")
            || leftSis.contains("PU")) {
          category += "=CONJ";
          // System.out.println("Found NP conjunct"); // testing
        }
      }

      if (markIPconj && baseCategory.equals("IP") && baseParentStr.equals("IP")) {
        Tree[] sisters = parent.children();
        boolean hasCommaSis = false;
        boolean hasIPSis = false;
        for (Tree sister : sisters) {
          if (ctlp.basicCategory(sister.label().value()).equals("PU")
              && ChineseTreebankLanguagePack.chineseCommaAcceptFilter()
                  .accept(sister.children()[0].label().toString())) {
            hasCommaSis = true;
            // System.out.println("Found CommaSis"); // testing
          }
          if (ctlp.basicCategory(sister.label().value()).equals("IP") && sister != t) {
            hasIPSis = true;
          }
        }
        if (hasCommaSis && hasIPSis) {
          category += "-CONJ";
          // System.out.println("Found IP conjunct"); // testing
        }
      }

      if (unaryIP && baseCategory.equals("IP") && t.numChildren() == 1) {
        category += "-U";
        // System.out.println("Found unary IP"); //testing
      }
      if (unaryCP && baseCategory.equals("CP") && t.numChildren() == 1) {
        category += "-U";
        // System.out.println("Found unary CP"); //testing
      }

      if (splitBaseNP && baseCategory.equals("NP")) {
        if (t.isPrePreTerminal()) {
          category = category + "-B";
        }
      }

      // if (Test.verbose) printlnErr(baseCategory + " " + leftSis.toString()); //debugging

      if (markPostverbalPP && leftSis.contains("VV") && baseCategory.equals("PP")) {
        // System.out.println("Found post-verbal PP");
        category += "=lVV";
      }

      if ((markADgrandchildOfIP || gpaAD)
          && listBasicCategories(SisterAnnotationStats.kidLabels(t)).contains("AD")) {
        category += "^ADVP";
      }

      if (markCC) {
        // was: for (int i = 0; i < kids.length; i++) {
        // This second version takes an idea from Collins: don't count
        // marginal conjunctions which don't conjoin 2 things.
        for (int i = 1; i < kids.length - 1; i++) {
          String cat2 = kids[i].label().value();
          if (cat2.startsWith("CC")) {
            category += "-CC";
          }
        }
      }

      Label label = new CategoryWordTag(category, word, tag);
      t.setLabel(label);
    }
    return t;
  }
  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern =
        Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern =
        Pattern.compile(
            "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset)) return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern =
        Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
    else currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
      String sentenceString = sentenceMatcher.group(2);
      List<CoreLabel> words =
          tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

      // FIXING TOKENIZATION PROBLEMS
      for (int i = 0; i < words.size(); i++) {
        CoreLabel w = words.get(i);
        if (i > 0 && w.word().equals("$")) {
          if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
            continue;
          words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
          words.remove(i);
          i--;
        } else if (w.word().equals("\\/")) {
          if (words.get(i - 1).word().equals("</COREF>")) continue;
          w.set(
              CoreAnnotations.TextAnnotation.class,
              words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
          words.remove(i + 1);
          words.remove(i - 1);
        }
      }
      // END FIXING TOKENIZATION PROBLEMS

      List<CoreLabel> sentence = new ArrayList<CoreLabel>();
      // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently
      // open
      Stack<Mention> stack = new Stack<Mention>();
      List<Mention> mentions = new ArrayList<Mention>();

      allWords.add(sentence);
      allGoldMentions.add(mentions);

      for (CoreLabel word : words) {
        String w = word.get(CoreAnnotations.TextAnnotation.class);
        // found regular token: WORD/POS
        if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
          int i = w.lastIndexOf("\\/");
          String w1 = w.substring(0, i);
          // we do NOT set POS info here. We take the POS tags from the parser!
          word.set(CoreAnnotations.TextAnnotation.class, w1);
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
        else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
          Pattern nerPattern = Pattern.compile("<(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          ner = m.group(1);
        }
        // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
        else if (w.startsWith("</") && !w.startsWith("</COREF")) {
          Pattern nerPattern = Pattern.compile("</(.*?)>");
          Matcher m = nerPattern.matcher(w);
          m.find();
          String ner1 = m.group(1);
          if (ner != null && !ner.equals(ner1))
            throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
          ner = null;
        }
        // found the start SGML tag for a coref mention
        else if (w.startsWith("<COREF")) {
          Mention mention = new Mention();
          // position of this mention in the sentence
          mention.startIndex = sentence.size();

          // extract GOLD info about this coref chain. needed for eval
          Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
          Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

          Matcher m = idPattern.matcher(w);
          m.find();
          mention.mentionID = Integer.parseInt(m.group(1));

          m = refPattern.matcher(w);
          if (m.find()) {
            mention.originalRef = Integer.parseInt(m.group(1));
          }

          // open mention. keep track of all open mentions using the stack
          stack.push(mention);
        }
        // found the end SGML tag for a coref mention
        else if (w.equals("</COREF>")) {
          Mention mention = stack.pop();
          mention.endIndex = sentence.size();

          // this is a closed mention. add it to the final list of mentions
          // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
          // mention.originalRef);
          mentions.add(mention);
        } else {
          word.remove(CoreAnnotations.OriginalTextAnnotation.class);
          if (Constants.USE_GOLD_NE) {
            if (ner != null) {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
            } else {
              word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
            }
          }
          sentence.add(word);
        }
      }
      StringBuilder textContent = new StringBuilder();
      for (int i = 0; i < sentence.size(); i++) {
        CoreLabel w = sentence.get(i);
        w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
        w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
        if (i > 0) textContent.append(" ");
        textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
      }
      CoreMap sentCoreMap = new Annotation(textContent.toString());
      allSentences.add(sentCoreMap);
      sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        idMention.put(m.mentionID, m);
      }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
      for (Mention m : goldMentions) {
        if (m.goldCorefClusterID == -1) {
          if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
          else {
            int ref = m.originalRef;
            while (true) {
              Mention m2 = idMention.get(ref);
              if (m2.goldCorefClusterID != -1) {
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else if (m2.originalRef == -1) {
                m2.goldCorefClusterID = m2.mentionID;
                m.goldCorefClusterID = m2.goldCorefClusterID;
                break;
              } else {
                ref = m2.originalRef;
              }
            }
          }
        }
      }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
      throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
      List<CoreLabel> annotatedSent =
          allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> unannotatedSent = allWords.get(i);
      List<Mention> mentionInSent = allGoldMentions.get(i);
      for (Mention m : mentionInSent) {
        m.dependency =
            allSentences
                .get(i)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      }
      if (annotatedSent.size() != unannotatedSent.size()) {
        throw new IllegalStateException("annotatedSent != unannotatedSent");
      }
      for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
        CoreLabel annotatedWord = annotatedSent.get(j);
        CoreLabel unannotatedWord = unannotatedSent.get(j);
        if (!annotatedWord
            .get(CoreAnnotations.TextAnnotation.class)
            .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
          throw new IllegalStateException("annotatedWord != unannotatedWord");
        }
      }
      allWords.set(i, annotatedSent);
      allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // extract predicted mentions
    if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
    else
      allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
  }
  private static int parseFullDocument(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {

    /* initialize */
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put(
        "dcoref.sievePasses",
        "MarkRole,"
            + "DiscourseMatch,"
            + "ExactStringMatch,"
            + "RelaxedExactStringMatch,"
            + "PreciseConstructs,"
            + "StrictHeadMatch1,"
            + "StrictHeadMatch2,"
            + "StrictHeadMatch3,"
            + "StrictHeadMatch4,"
            + "RelaxedHeadMatch");
    StanfordCoreNLP stanfordpipeline = new StanfordCoreNLP(props);
    ExternalProcesses glove = new ExternalProcesses(options.glovedir);

    /* read full text */
    int senCount = 0;
    String str;
    StringBuffer text = new StringBuffer();
    while ((str = in.readLine()) != null) {
      text.append(str);
      text.append("\n");
    }

    /* document-level preprocessing */
    Annotation document = new Annotation(text.toString());
    stanfordpipeline.annotate(document);

    Map<String, Double[]> word2vecs = glove.createvecs(document);

    Corpus c = new Corpus("tmp");

    /* sentence-level preprocessing */
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      StringBuffer posOutput = new StringBuffer();

      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        if (posOutput.length() > 0) {
          posOutput.append(" ");
        }
        posOutput.append(token.word());
        posOutput.append("_");
        posOutput.append(token.tag());
      }

      String parse =
          ExternalProcesses.runProcess(
              "nc " + options.mstserver.replaceAll(":", " "), posOutput.toString());
      parse = parse.replaceAll("-\t-", "_\t_\n@#").replaceAll("@#\t", "").replaceAll("@#", "");

      String[] lines = parse.split("\n");
      String[] words = new String[lines.length + 1];
      String[] lemmas = new String[lines.length + 1];
      String[] tags = new String[lines.length + 1];
      String[] morphs = new String[lines.length + 1];
      int[] heads = new int[lines.length];
      String[] deprels = new String[lines.length];

      for (int i = 1; i < words.length; i++) {
        String[] parts = lines[i - 1].split("\t");
        words[i] = sentence.get(TokensAnnotation.class).get(i - 1).word();
        tags[i] = sentence.get(TokensAnnotation.class).get(i - 1).tag();
        lemmas[i] = sentence.get(TokensAnnotation.class).get(i - 1).lemma();
        morphs[i] = "_";
        heads[i - 1] = Integer.parseInt(parts[6]);
        deprels[i - 1] = parts[7];
      }
      Sentence sen = new Sentence(words, lemmas, tags, morphs);
      sen.setHeadsAndDeprels(heads, deprels);

      /* add labeled predicates from SEMAFOR */
      String json =
          ExternalProcesses.runProcess("nc " + options.semaforserver.replaceAll(":", " "), parse);
      Pattern pred_frame =
          Pattern.compile(
              "\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");
      Matcher m = pred_frame.matcher(json);
      while (m.find()) {
        String frame = m.group(1);
        int index = Integer.parseInt(m.group(2));
        System.out.println(index + "\t" + frame);

        sen.makePredicate(index + 1);
        ((Predicate) sen.get(index + 1)).setSense(frame);
      }

      for (Word w : sen)
        if (word2vecs.containsKey(w.getForm().toLowerCase()))
          w.setRep(word2vecs.get(w.getForm().toLowerCase()));

      new CorpusSentence(sen, c);
    }

    /* add coref output to corpus */
    Map<Integer, CorefChain> coref = document.get(CorefChainAnnotation.class);
    int num = 1;
    for (Map.Entry<Integer, CorefChain> entry : coref.entrySet()) {
      CorefChain cc = entry.getValue();
      // skip singleton mentions
      if (cc.getMentionsInTextualOrder().size() == 1) continue;

      for (CorefMention m : cc.getMentionsInTextualOrder()) {
        c.addMention(c.get(m.sentNum - 1), m.headIndex, num);
      }
      num++;
    }

    for (Sentence sen : c) {
      pipeline.srl.parseSentence(sen);
      senCount++;
      if (senCount % 100 == 0) System.out.println("Processing sentence " + senCount);
      writer.write(sen);
    }
    return senCount;
  }
  private static Element toInputXML(CoreMap document) {
    // construct GUTime format XML
    Element doc = new Element("DOC");
    doc.appendChild("\n");
    // populate the date element
    Calendar dateCalendar = document.get(CoreAnnotations.CalendarAnnotation.class);
    if (dateCalendar != null) {
      Element date = new Element("date");
      date.appendChild(String.format("%TF", dateCalendar));
      doc.appendChild(date);
      doc.appendChild("\n");
    } else {
      String s = document.get(CoreAnnotations.DocDateAnnotation.class);
      if (s != null) {
        Element date = new Element("date");
        date.appendChild(s);
        doc.appendChild(date);
        doc.appendChild("\n");
      }
    }
    Element textElem = new Element("text");
    doc.appendChild(textElem);
    doc.appendChild("\n");

    // populate the text element
    String text = document.get(CoreAnnotations.TextAnnotation.class);
    int offset = 0;
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      int sentBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int sentEnd = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);

      // add text before the first token
      textElem.appendChild(text.substring(offset, sentBegin));
      offset = sentBegin;

      // add one "s" element per sentence
      Element s = new Element("s");
      textElem.appendChild(s);
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        int tokenBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int tokenEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        s.appendChild(text.substring(offset, tokenBegin));
        offset = tokenBegin;

        // add one "lex" element per token
        Element lex = new Element("lex");
        s.appendChild(lex);
        String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (posTag != null) {
          lex.addAttribute(new Attribute("pos", posTag));
        }
        assert token.word().equals(text.substring(offset, tokenEnd));
        lex.appendChild(text.substring(offset, tokenEnd));
        offset = tokenEnd;
      }

      // add text after the last token
      textElem.appendChild(text.substring(offset, sentEnd));
      offset = sentEnd;
    }

    // add text after the last sentence
    textElem.appendChild(text.substring(offset, text.length()));

    // return the document
    return doc;
  }
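
The Element and Attribute classes above match the nu.xom (XOM) API; a short sketch of serializing the generated DOC element, assuming XOM is on the classpath and document is an annotated CoreMap:

    nu.xom.Element gutimeDoc = toInputXML(document);
    String xml = new nu.xom.Document(gutimeDoc).toXML();
    System.out.println(xml);
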
  public void process(String inFilepath, String outFilepath, String nerOutFile) {

    try {
      StringBuilder inText = new StringBuilder();
      StringBuilder outText = new StringBuilder();
      StringBuilder nerText = new StringBuilder();

      // read some text in the inText variable from input file
      BufferedReader reader = new BufferedReader(new FileReader(inFilepath));
      String line = null;
      while ((line = reader.readLine()) != null) {
        if (line.trim().length() == 0) continue;
        inText.append(line + "\n");
      }
      reader.close();

      // create an empty Annotation just with the given text
      Annotation document = new Annotation(inText.toString());

      // run all Annotators on this text
      pipeline.annotate(document);

      // these are all the sentences in this document
      // a CoreMap is essentially a Map that uses class objects as keys and has values with custom
      // types
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);

      for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          totalWords++;
          String pos = token.tag();
          if (tagFilter.contains(pos)) {
            remainWords++;
            String lemma = token.lemma();
            outText.append(lemma + " ");
            if (nerFilter.contains(token.ner())) {
              nerText.append(token.word() + " ");
            }
          }
        }
      }

      // write the processed text to output file
      FileWriter fw = FileUtil.open(outFilepath);
      fw.append(outText);
      FileUtil.close(fw);

      if (nerOutFile != null) {
        FileWriter fw2 = FileUtil.open(nerOutFile);
        fw2.append(nerText);
        FileUtil.close(fw2);
      }

    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
Example #18
  public static void main(String[] args) throws Exception {

    // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
    String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";
    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");

      List<Triple<String, Integer, Integer>> list =
          classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        // print entity/or non-entity - their nearby tokens
        System.out.println(
            item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter =
          classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {
        "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
        "I go to school at Stanford University, which is located in California."
      };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      // ***sentence-by-sentence
      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      // ***print: entities + Classes + remaining text in the text
      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining
        // text in a document
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      System.out.print("character offsets");
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String, Integer, Integer> trip : triples) {
          System.out.printf(
              "%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third(), j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");
    }
  }