示例#1
0
  /**
   * Rewrites the annotated text so that tokens belonging to a coreference chain are replaced by
   * the words of that chain's representative mention.
   *
   * <p>Tokens without a chain, and tokens whose index falls inside the representative mention's
   * index range, are emitted unchanged. The resulting text is also echoed to standard output.
   *
   * @param annotation an annotation carrying sentences, tokens and (optionally) coreference chains
   * @return the resolved words in order, each followed by a single space
   */
  public static final String doCorefResolution(Annotation annotation) {

    Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> resolved = new ArrayList<String>();
    for (CoreMap sentence : sentences) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel token : tokens) {
        Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
        // A missing chain map is treated the same as "this token has no chain".
        CorefChain chain = (corefs == null) ? null : corefs.get(corefClustId);
        if (chain == null) {
          resolved.add(token.word());
          continue;
        }

        CorefMention reprMent = chain.getRepresentativeMention();
        // sentNum is 1-based here, hence the -1 when indexing into the sentence list.
        CoreMap corefSentence = sentences.get(reprMent.sentNum - 1);
        List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);

        // NOTE(review): the token index is compared with the mention's indices without checking
        // that both live in the same sentence -- confirm this is intended.
        boolean outsideMention =
            token.index() < reprMent.startIndex || token.index() > reprMent.endIndex;
        if (outsideMention) {
          // Substitute the words of the representative mention (token indices are 1-based).
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
            resolved.add(matchedLabel.word());
          }
        } else {
          resolved.add(token.word());
        }
      }
    }

    // StringBuilder avoids re-copying the accumulated text for every word.
    StringBuilder joined = new StringBuilder();
    for (String str : resolved) {
      joined.append(str).append(" ");
    }
    String resolvedStr = joined.toString();

    System.out.println();
    System.out.println(resolvedStr);

    return resolvedStr;
  }
  /**
   * Speaker extraction.
   *
   * <p>When gold speaker tags or marked discourse are in use, the speaker already stored on each
   * token is recorded for that token's utterance. Otherwise speakers are detected according to
   * the document type and the result is written back onto the tokens.
   */
  private void findSpeakers(Dictionaries dict) {
    Boolean markedDiscourseFlag =
        annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    boolean useMarkedDiscourse = markedDiscourseFlag != null && markedDiscourseFlag;

    if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
      // Speakers are given: copy them from the tokens into the utterance -> speaker map.
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
          int utterance = token.get(CoreAnnotations.UtteranceAnnotation.class);
          speakers.put(utterance, token.get(CoreAnnotations.SpeakerAnnotation.class));
        }
      }
      return;
    }

    // Speakers are not given: detect them with the strategy matching the document type.
    if (docType == DocType.CONVERSATION) {
      findSpeakersInConversation(dict);
    } else if (docType == DocType.ARTICLE) {
      findSpeakersInArticle(dict);
    }

    // set speaker info to annotation
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterance = token.get(CoreAnnotations.UtteranceAnnotation.class);
        if (!speakers.containsKey(utterance)) {
          continue;
        }
        token.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterance));
      }
    }
  }
  /**
   * Attaches FIGER type information to every sentence of the document.
   *
   * <p>Works in two passes: first the linked entity ids of all sentences are gathered (the
   * literal id {@code "null"} is skipped) and resolved with a single {@code bigQuery} call; then
   * each sentence receives a list of (types, start, end) triples, one per linked span. Spans
   * whose id is {@code "null"} get a {@code null} type set.
   *
   * @param d the document annotation whose sentences are updated in place
   * @throws SQLException presumably raised by the type lookup -- verify against {@code bigQuery}
   */
  public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {

    List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);

    // Pass 1: collect the ids of all linked entities.
    Set<String> linkedIds = new HashSet<String>();
    for (CoreMap sentence : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> links =
          sentence.get(NamedEntityLinkingAnnotation.class);
      for (Triple<Pair<Integer, Integer>, String, Float> link : links) {
        String entityId = link.second;
        if (entityId.equals("null")) {
          continue;
        }
        linkedIds.add(entityId);
      }
    }
    Map<String, Set<String>> typesById = bigQuery(linkedIds);

    // Pass 2: add type onto sentences
    for (CoreMap sentence : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> links =
          sentence.get(NamedEntityLinkingAnnotation.class);
      List<Triple<Set<String>, Integer, Integer>> figerSpans = new ArrayList<>();
      for (Triple<Pair<Integer, Integer>, String, Float> link : links) {
        Integer spanStart = link.first.first;
        Integer spanEnd = link.first.second;
        Set<String> types =
            link.second.equals("null")
                ? null
                : typesById.get(GuidMidConversion.convertBackward(link.second));
        Triple<Set<String>, Integer, Integer> figerSpan = new Triple<>(types, spanStart, spanEnd);
        figerSpans.add(figerSpan);
      }
      sentence.set(FigerAnnotation.class, figerSpans);
    }
  }
示例#4
0
  /**
   * Fills in the predicate of every stored sentence that does not have one yet.
   *
   * <p>Each such sentence is parsed and its predicate is set to the lemma of the first root of
   * the dependency graph; if the text splits into several sentences, the last one wins. A running
   * counter is printed per processed sentence, and failures are printed and skipped.
   */
  public static void main(String[] args) {
    SentenceDAO sentenceDAO = new SentenceDAOImpl();
    List<Sentence> sentences = sentenceDAO.findAll();

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    int processed = 0;
    for (Sentence sentence : sentences) {
      if (sentence.getPredicate() != null) {
        continue; // predicate already known
      }
      try {
        System.out.println(processed++);
        Annotation annotation = new Annotation(sentence.getContent());
        pipeline.annotate(annotation);
        for (CoreMap parsed : annotation.get(SentencesAnnotation.class)) {
          SemanticGraph graph = parsed.get(CollapsedCCProcessedDependenciesAnnotation.class);
          sentence.setPredicate(graph.getFirstRoot().lemma());
        }
        sentenceDAO.save(sentence);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
 /**
  * Assigns speakers for a conversation-style document.
  *
  * <p>First, any mention that has "i" (case-insensitively lowered) as a predicate nominative is
  * recorded as the speaker of its head word's utterance. Then the sentences are grouped into
  * runs sharing the same utterance index of their first token, and {@code findParagraphSpeaker}
  * is invoked once per run.
  */
 private void findSpeakersInConversation(Dictionaries dict) {
   for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
     for (Mention mention : sentenceMentions) {
       if (mention.predicateNominatives != null) {
         for (Mention nominative : mention.predicateNominatives) {
           String span = nominative.spanToString().toLowerCase();
           if (!span.equals("i")) {
             continue;
           }
           speakers.put(
               mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
               Integer.toString(mention.mentionID));
         }
       }
     }
   }

   List<CoreMap> currentParagraph = new ArrayList<CoreMap>();
   int currentParagraphUtter = 0;
   String carriedSpeaker = "";
   int sentencesBefore = 0;
   for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     int sentenceUtter =
         sentence
             .get(CoreAnnotations.TokensAnnotation.class)
             .get(0)
             .get(CoreAnnotations.UtteranceAnnotation.class);
     boolean sameParagraph = (sentenceUtter == currentParagraphUtter);
     if (!sameParagraph) {
       // The utterance changed: close the paragraph collected so far and start a new one.
       carriedSpeaker =
           findParagraphSpeaker(
               currentParagraph, currentParagraphUtter, carriedSpeaker, sentencesBefore, dict);
       currentParagraphUtter = sentenceUtter;
       sentencesBefore += currentParagraph.size();
       currentParagraph = new ArrayList<CoreMap>();
     }
     currentParagraph.add(sentence);
   }
   // Handle the trailing paragraph, which no utterance change has closed.
   findParagraphSpeaker(
       currentParagraph, currentParagraphUtter, carriedSpeaker, sentencesBefore, dict);
 }
  /**
   * Assigns speakers for an article-style document.
   *
   * <p>Scans all tokens in order. A token with a non-zero utterance index while no quotation is
   * open marks the start of a quotation; the next token with utterance index 0 marks its end, at
   * which point {@code findQuotationSpeaker} is called with the (sentence, token) positions of
   * both boundaries.
   */
  private void findSpeakersInArticle(Dictionaries dict) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    Pair<Integer, Integer> quoteStart = new Pair<Integer, Integer>();
    Pair<Integer, Integer> quoteEnd = new Pair<Integer, Integer>();
    boolean insideQuotation = false;
    int quoteUtter = -1;

    for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
      List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
      for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
        int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
        boolean quoted = utterIndex != 0;
        if (quoted == insideQuotation) {
          continue; // no quotation boundary at this token
        }

        if (quoted) {
          // Entering a quotation: remember where it starts and which utterance it is.
          quoteUtter = utterIndex;
          insideQuotation = true;
          quoteStart.setFirst(sentIdx);
          quoteStart.setSecond(tokIdx);
        } else {
          // Leaving a quotation: resolve its speaker.
          insideQuotation = false;
          quoteEnd.setFirst(sentIdx);
          quoteEnd.setSecond(tokIdx);
          findQuotationSpeaker(quoteUtter, sentences, quoteStart, quoteEnd, dict);
        }
      }
    }
  }
示例#7
0
  /**
   * Lemmatizes a whole document.
   *
   * <p>The shared pipeline is loaded on first use.
   *
   * @param documentText the raw text to process
   * @return the lemma of every token, in document order
   */
  public static List<String> lemmatizeDocument(String documentText) {

    if (pipeline == null) {
      loadModels();
    }

    // Wrap the raw text and let the pipeline annotate it.
    Annotation annotated = new Annotation(documentText);
    pipeline.annotate(annotated);

    // Walk sentence by sentence, token by token, collecting lemmas.
    List<String> collected = new LinkedList<>();
    for (CoreMap sentence : annotated.get(SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        collected.add(token.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }
    return collected;
  }
示例#8
0
  /**
   * Lemmatizes the given text and returns the lemmas joined into one string.
   *
   * <p>The shared pipeline is loaded on first use. If processing fails, the failure is reported
   * on standard error and the lemmas collected up to that point are returned.
   *
   * @param t the text to lemmatize
   * @return the lemmas of all tokens in order, joined with single spaces and trimmed; empty if
   *     nothing was collected
   */
  public static String lemmatize(String t) {

    if (pipeline == null) {
      loadModels();
    }

    // StringBuilder avoids re-copying the accumulated text for every token.
    StringBuilder lemma = new StringBuilder();

    try {
      // create an empty Annotation just with the given text
      Annotation document = new Annotation(t);

      // run all Annotators on this text
      pipeline.annotate(document);

      // Iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        // Iterate over all tokens in a sentence
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemma.append(' ').append(token.get(CoreAnnotations.LemmaAnnotation.class));
        }
      }
    } catch (Exception e) {
      // Best effort: keep going with what we have, but include the cause in the report.
      System.err.println("Stanford Lemmatizer error exception Word: " + t + " (" + e + ")");
    }

    return lemma.toString().trim();
  }
 /**
  * Finds maximal runs of consecutive tokens whose part-of-speech tag starts with "NN".
  *
  * <p>Runs never cross sentence boundaries.
  *
  * @param text the text to annotate
  * @return a map from the begin position of each run's first token to the end position of its
  *     last token
  */
 public Map<Integer, Integer> getGeneSpans(String text) {
   Annotation document = new Annotation(text);
   pipeline.annotate(document);

   Map<Integer, Integer> spans = new HashMap<Integer, Integer>();
   for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
     // First and last token of the run currently being extended; both null when no run is open.
     CoreLabel runFirst = null;
     CoreLabel runLast = null;
     for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
       boolean nounLike = token.get(PartOfSpeechAnnotation.class).startsWith("NN");
       if (nounLike) {
         if (runFirst == null) {
           runFirst = token;
         }
         runLast = token;
       } else if (runFirst != null) {
         // A non-matching token closes the open run.
         spans.put(runFirst.beginPosition(), runLast.endPosition());
         runFirst = null;
         runLast = null;
       }
     }
     // A run reaching the end of the sentence is closed here.
     if (runFirst != null) {
       spans.put(runFirst.beginPosition(), runLast.endPosition());
     }
   }
   return spans;
 }
  /**
   * Annotates the input text and collects the anaphor/referent pairs found for every sentence.
   *
   * <p>Each sentence is built from its position, its parse tree and the noun phrases extracted
   * from that tree; the per-sentence finder is then given the sentence together with the full
   * sentence list.
   *
   * @param text the input whose {@code toString()} is annotated
   * @return the pairs of all sentences, concatenated in sentence order
   */
  private static List<AnaphorWithReferent> parseText(InputText text) {
    Annotation annotatedText = new Annotation(text.toString());
    Container.getStanfordCoreNLP().annotate(annotatedText);

    // One parse tree per sentence, in document order.
    List<Tree> trees =
        annotatedText.get(CoreAnnotations.SentencesAnnotation.class).stream()
            .map(sentence -> sentence.get(TreeCoreAnnotations.TreeAnnotation.class))
            .collect(Collectors.toList());

    // The sentence id is its position in the tree list.
    List<Sentence> allSentences =
        IntStream.range(0, trees.size())
            .mapToObj(
                id -> {
                  Tree tree = trees.get(id);
                  return new Sentence(
                      id, tree, Container.getNPsFromParseTreeExtractor().extract(tree));
                })
            .collect(Collectors.toList());

    return allSentences.stream()
        .flatMap(
            sentence ->
                Container.getAllAnaphorWithReferentPerSentenceFinder()
                    .find(sentence, allSentences)
                    .stream())
        .collect(Collectors.toList());
  }
  /**
   * Runs the pipeline over the text and extracts word, POS tag and NER tag for every token.
   *
   * @param text the text to analyze
   * @return one {@code NLPInfo} per sentence, or {@code null} when no sentences were found
   */
  public List<NLPInfo> analyze(String text) {

    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    boolean nothingFound = sentences == null || sentences.isEmpty();
    if (nothingFound) {
      return null;
    }

    List<NLPInfo> results = new ArrayList<NLPInfo>();
    for (CoreMap sentence : sentences) {
      NLPInfo sentenceInfo = new NLPInfo();
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        NLPToken entry = new NLPToken();
        entry.setWord(token.get(CoreAnnotations.TextAnnotation.class));
        entry.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        entry.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
        sentenceInfo.appendToken(entry);
      }
      results.add(sentenceInfo);
    }
    return results;
  }
示例#12
0
  /**
   * Pairs each token tagged with one of {@code NN_TAGS} with the nearest token tagged with one of
   * {@code JJ_TAGS} (presumably nouns and adjectives respectively).
   *
   * <p>For every such token the search widens one step at a time, checking the left neighbour
   * before the right one, up to {@code searchRange} steps; positions beyond the sentence are
   * clamped to its first/last token. If any token lemma of the sentence is in {@code NEGATIONS},
   * the matched lemma is prefixed with {@code NOT_PREFIX}. Pairs are stored through
   * {@code addPair}.
   *
   * <p>Side effect: the static {@code MAX_STEPS} is overwritten with {@code searchRange}.
   *
   * @param pipeline the pipeline used to annotate {@code text}
   * @param text the text to process
   * @param searchRange the maximum distance, in tokens, to look on either side
   * @return the collected word pairs
   */
  public static ArrayList<String[]> extractNounPhrases(
      StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    MAX_STEPS = searchRange;

    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
      int lastIdx = labels.size() - 1;

      // Check negation (every token is inspected, even after a hit)
      boolean hasNegation = false;
      for (CoreLabel candidate : labels) {
        hasNegation |= NEGATIONS.contains(candidate.lemma().toLowerCase());
      }

      for (int idx = 0; idx <= lastIdx; idx++) {
        CoreLabel label = labels.get(idx);
        if (!NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
          continue;
        }

        // Widen the window until a matching neighbour is found; left wins over right.
        CoreLabel match = null;
        for (int step = 1; step <= MAX_STEPS; step++) {
          CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
          if (JJ_TAGS.contains(leftLabel.tag())) {
            match = leftLabel;
            break;
          }
          CoreLabel rightLabel = labels.get(Math.min(idx + step, lastIdx));
          if (JJ_TAGS.contains(rightLabel.tag())) {
            match = rightLabel;
            break;
          }
        }
        if (match == null) {
          continue;
        }

        if (hasNegation) {
          addPair(
              wordPairs,
              NOT_PREFIX + match.get(LemmaAnnotation.class),
              label.get(LemmaAnnotation.class));
        } else {
          addPair(wordPairs, match.get(LemmaAnnotation.class), label.get(LemmaAnnotation.class));
        }
      }
    }
    return wordPairs;
  }
示例#13
0
  /**
   * Generates a question for a CVT binary from its first two descriptions and adds it to the
   * result set.
   *
   * <p>Both descriptions are normalized and annotated; the question generator receives the
   * descriptions, the first child of each first-sentence parse tree, and the POS tags of each
   * annotation. Nothing is added when the generator returns {@code null}.
   *
   * @param fgInfo generation info whose {@code bInfo.descriptions} supplies the two descriptions
   * @param res the set that receives the generated question
   */
  private void handleCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
    String firstDescription = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
    String secondDescription = normalizeFbDescription(fgInfo.bInfo.descriptions.get(1));
    Annotation firstAnnotation = getAnnotation(firstDescription);
    Annotation secondAnnotation = getAnnotation(secondDescription);

    String generated =
        generateCvtQuestion(
            fgInfo,
            firstDescription,
            secondDescription,
            firstAnnotation
                .get(SentencesAnnotation.class)
                .get(0)
                .get(TreeAnnotation.class)
                .firstChild(),
            secondAnnotation
                .get(SentencesAnnotation.class)
                .get(0)
                .get(TreeAnnotation.class)
                .firstChild(),
            getPosTagsFromAnnotation(firstAnnotation),
            getPosTagsFromAnnotation(secondAnnotation));
    if (generated == null) {
      return;
    }
    res.add(generated);
  }
示例#14
0
 /**
  * Interactive loop: reads lines from standard input, runs the "sutime" time annotator over each
  * and prints the resulting timex annotations. Ends when standard input is exhausted.
  */
 public static void main(String[] args) throws IOException {
   SUTimePipeline pipeline = new SUTimePipeline();
   Annotator timeAnnotator = pipeline.getTimeAnnotator("sutime", new Properties());
   BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));

   System.out.print("> ");
   String line = stdin.readLine();
   while (line != null) {
     Annotation processed = pipeline.process(line, null, timeAnnotator);
     System.out.println(processed.get(TimeAnnotations.TimexAnnotations.class));
     System.out.print("> ");
     line = stdin.readLine();
   }
 }
示例#15
0
  /**
   * Annotates the {@code "cleansed_text"} field of the given document and reads back the
   * per-token, per-sentence and document-level annotations.
   *
   * <p>The values read are only held in locals; nothing is returned or stored by this method.
   *
   * @param doc the database object providing the text
   * @param pipeline the pipeline used for annotation
   */
  public static void describe(DBObject doc, StanfordCoreNLP pipeline) {
    // Wrap the text to be processed.
    Annotation document = new Annotation((String) doc.get("cleansed_text"));
    BasicDBObject m_doc = new BasicDBObject(); // not used further in this method
    pipeline.annotate(document);

    // A CoreMap is a map keyed by annotation classes; one CoreMap per sentence.
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      // A CoreLabel is a CoreMap with token-specific accessors.
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        String tokenText = token.get(TextAnnotation.class);
        String posTag = token.get(PartOfSpeechAnnotation.class);
        String nerLabel = token.get(NamedEntityTagAnnotation.class);
      }

      // Sentence-level structures: parse tree and dependency graph.
      Tree parseTree = sentence.get(TreeAnnotation.class);
      SemanticGraph dependencyGraph =
          sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
    }

    // Coreference chains: each chain groups mentions that refer to the same thing.
    // Both sentence and token offsets start at 1!
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
  }
示例#16
0
 /**
  * Generates a question for a non-CVT binary from its first description and adds it to the
  * result set.
  *
  * <p>The description is normalized and annotated; the generator receives the description, its
  * POS tags, the first child of the first sentence's parse tree, and whether the binary's
  * formula is reversed. Nothing is added when the generator returns {@code null}.
  *
  * @param fgInfo generation info whose {@code bInfo} supplies the description and formula
  * @param res the set that receives the generated question
  */
 private void handleNonCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
   String normalized = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
   Annotation annotated = getAnnotation(normalized);

   String generated =
       generateNonCvtQuestion(
           fgInfo,
           normalized,
           getPosTagsFromAnnotation(annotated),
           annotated.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
           fbFormulasInfo.isReversed(fgInfo.bInfo.formula));
   if (generated == null) {
     return;
   }
   res.add(generated);
 }
  /**
   * Finds the position of the sentence in the given document that achieves the best ROUGE-N score
   * with respect to the reference summaries.
   *
   * @param task the document and the corresponding models
   * @return the position of the best sentence in the document
   * @throws NumberFormatException if the selected sentence's position annotation is missing or is
   *     not a valid integer
   */
  public int getBestSentencePos(Task task) {
    Document document = task.getDocument();
    Annotation documentAnnotation = annotationProvider.getAnnotation(document.getContent());

    RougeN rouge = rougeFactory.make(task.getModels(), annotationProvider);
    BestSentenceSelector sentenceSelector = new BestSentenceSelector(rouge);
    Annotation bestAnnotation = sentenceSelector.select(documentAnnotation);

    // The selector's result is read through its first (index 0) sentence.
    CoreMap sentence = bestAnnotation.get(SentencesAnnotation.class).get(0);
    String bestPos = sentence.get(SentencePositionAnnotation.class);

    // parseInt yields the primitive directly; valueOf would box only to be unboxed again.
    return Integer.parseInt(bestPos);
  }
示例#18
0
  /**
   * Builds sentence vectors for every email of every thread and updates the shared dictionary.
   *
   * <p>For each email, the text of its sentences with zero quotation times is concatenated,
   * lower-cased and annotated. Every resulting sentence is lemmatized and its words counted;
   * sentences with at least one counted word increment {@code totalSentenceNumber}, extend
   * {@code dictionaryIndex} with unseen words, bump {@code dictionaryDocumentCount} for each
   * distinct word, and are added to the thread's vector.
   *
   * @param threads the threads to process
   */
  private void parseThread(ArrayList<Thread> threads) {
    for (Thread thread : threads) {
      ThreadVector threadVector = new ThreadVector(thread);
      allThreads.add(threadVector);

      for (Email email : thread.getEmails()) {
        // Keep only the email's own content, i.e. sentences that are not quoted.
        StringBuilder ownText = new StringBuilder();
        for (Sentence emailSentence : email.getSentences()) {
          if (emailSentence.getQuotationTimes() != 0) {
            continue;
          }
          ownText.append(emailSentence.getText()).append(" ");
        }

        Annotation document = new Annotation(ownText.toString().toLowerCase());
        this.pipeline.annotate(document);

        for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
          List<String> lemmas = new LinkedList<String>();
          for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            lemmas.add(token.get(LemmaAnnotation.class));
          }

          HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
          if (wordCount.isEmpty()) {
            continue; // no valid words in this sentence
          }

          totalSentenceNumber++;
          for (String word : wordCount.keySet()) {
            if (dictionaryIndex.containsKey(word)) {
              dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
            } else {
              // First sighting: the next free index becomes the word's id.
              dictionaryIndex.put(word, dictionaryIndex.size());
              dictionaryDocumentCount.put(word, 1);
            }
          }
          threadVector.addSentenceVectors(new SentenceVector(sentence.toString(), wordCount));
        }
      }
    }
  }
示例#19
0
  /**
   * Lemmatizes the raw input using the document-level token list.
   *
   * @param rawInput the text to annotate
   * @return the lemma of every token, in order
   */
  public static Collection<String> lemmatize(String rawInput) {

    // TODO: set an initial capacity in other places too
    Collection<String> result = Lists.newArrayListWithCapacity(30);

    Annotation annotated = new Annotation(rawInput);
    coreNlp.annotate(annotated);

    for (CoreLabel token : annotated.get(TokensAnnotation.class)) {
      result.add(token.get(LemmaAnnotation.class));
    }
    return result;
  }
示例#20
0
  /**
   * Builds a {@code DependencyParse} from the collapsed, CC-processed dependencies of the text.
   *
   * <p>The shared pipeline is loaded on first use. For every sentence the head node is set to
   * the index of the graph's first root (so the last sentence's root is the one that remains),
   * and for every edge both endpoints are added as nodes (governor first) followed by the edge
   * itself, labelled with the relation's short name.
   *
   * @param text the text to parse
   * @return the populated parse
   */
  public static DependencyParse parse(String text) {

    if (pipeline == null) {
      loadModels();
    }

    DependencyParse parse = new DependencyParse();

    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      SemanticGraph graph = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);

      IndexedWord root = graph.getFirstRoot();
      parse.setHeadNode(root.index());

      for (SemanticGraphEdge edge : graph.edgeListSorted()) {
        IndexedWord dependent = edge.getDependent();
        IndexedWord governor = edge.getGovernor();

        parse.addNode(
            governor.index(),
            governor.originalText(),
            governor.tag(),
            governor.beginPosition(),
            governor.endPosition());
        parse.addNode(
            dependent.index(),
            dependent.originalText(),
            dependent.tag(),
            dependent.beginPosition(),
            dependent.endPosition());

        parse.addEdge(dependent.index(), governor.index(), edge.getRelation().getShortName());
      }
    }

    return parse;
  }
  /**
   * Reads the given file in the requested input format and returns one annotation per sentence.
   *
   * <p>{@code TEXT}: the file is annotated with {@code tokenizer} and every sentence is wrapped
   * in its own annotation. {@code TREES}: trees are read from the file (with gold labels and
   * unknown roots filtered out when {@code filterUnknown} is set, otherwise through a treebank)
   * and each tree becomes a single-sentence annotation carrying that tree.
   *
   * @param tokenizer the pipeline used for {@code TEXT} input
   * @param inputFormat the format of the file
   * @param filename the file to read
   * @param filterUnknown for {@code TREES} input, whether to filter trees with unknown roots
   * @return one annotation per sentence or tree
   * @throws IllegalArgumentException if the input format is not handled
   */
  public static List<Annotation> getAnnotations(
      StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
    switch (inputFormat) {
      case TEXT:
        {
          Annotation whole = new Annotation(IOUtils.slurpFileNoExceptions(filename));
          tokenizer.annotate(whole);

          List<Annotation> perSentence = Generics.newArrayList();
          for (CoreMap sentence : whole.get(CoreAnnotations.SentencesAnnotation.class)) {
            // Re-wrap the sentence so that it stands alone as a one-sentence document.
            Annotation single = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
            single.set(
                CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
            perSentence.add(single);
          }
          return perSentence;
        }
      case TREES:
        {
          List<Tree> trees;
          if (!filterUnknown) {
            trees = Generics.newArrayList();
            MemoryTreebank treebank = new MemoryTreebank("utf-8");
            treebank.loadPath(filename, null);
            for (Tree loaded : treebank) {
              trees.add(loaded);
            }
          } else {
            trees = SentimentUtils.readTreesWithGoldLabels(filename);
            trees = SentimentUtils.filterUnknownRoots(trees);
          }

          List<Annotation> perTree = Generics.newArrayList();
          for (Tree tree : trees) {
            // The sentence text is rebuilt from the tree's yield.
            CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
            sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
            List<CoreMap> wrapped = Collections.singletonList(sentence);

            Annotation single = new Annotation("");
            single.set(CoreAnnotations.SentencesAnnotation.class, wrapped);
            perTree.add(single);
          }
          return perTree;
        }
      default:
        throw new IllegalArgumentException("Unknown format " + inputFormat);
    }
  }
示例#22
0
  /**
   * Prepares the sentences' parse trees according to the constituency setting and then runs the
   * {@code corenlp} pipeline over the annotation.
   *
   * <p>When constituency trees are not used, any tree is removed from each sentence; when they
   * are used, an existing tree is passed through the tree lemmatizer.
   *
   * @param anno the annotation to complete
   */
  private void addMissingAnnotation(Annotation anno) {
    boolean useConstituency = CorefProperties.useConstituencyTree(props);
    final boolean LEMMATIZE = true;

    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      boolean hasTree = sentence.containsKey(TreeCoreAnnotations.TreeAnnotation.class);
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

      if (useConstituency) {
        if (LEMMATIZE && hasTree) {
          treeLemmatizer.transformTree(tree); // TODO don't need?
        }
      } else {
        // TODO: temp for dev: make sure we don't use constituency tree
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      }
    }
    corenlp.annotate(anno);
  }
示例#23
0
  /**
   * Writes the sentences of the dataset to one {@code <docid>.conll} file per document.
   *
   * <p>A new file is opened whenever the document id of a sentence differs from the previous
   * one. Each token is written as {@code word POS label} on its own line (whitespace inside the
   * word is replaced by {@code _}) and every sentence is followed by an empty line. Unless
   * {@code alreadyBIO} is set, non-"O" labels are prefixed with {@code B-}, or with {@code I-}
   * when the previous token of the sentence carried the same label.
   *
   * @param dir the output directory
   * @param dataset the annotation whose sentences are written
   * @param useSubTypes passed through to the entity-mention-to-label conversion
   * @param alreadyBIO whether the labels already carry BIO prefixes
   * @throws IOException if an output file cannot be opened
   * @throws RuntimeException if a produced line does not consist of exactly three fields
   */
  public static void saveCoNLLFiles(
      String dir, Annotation dataset, boolean useSubTypes, boolean alreadyBIO) throws IOException {
    List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

    String docid = null;
    PrintStream os = null;
    try {
      for (CoreMap sentence : sentences) {
        String myDocid = sentence.get(CoreAnnotations.DocIDAnnotation.class);
        if (docid == null || !myDocid.equals(docid)) {
          // Document changed: finish the previous file before starting the next one.
          if (os != null) {
            os.close();
          }
          docid = myDocid;
          os = new PrintStream(new FileOutputStream(dir + File.separator + docid + ".conll"));
        }
        List<CoreLabel> labeledSentence =
            AnnotationUtils.sentenceEntityMentionsToCoreLabels(
                sentence, true, null, null, useSubTypes, alreadyBIO);
        assert (labeledSentence != null);

        String prev = null;
        for (CoreLabel word : labeledSentence) {
          String w = word.word().replaceAll("[ \t\n]+", "_");
          String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          String l = word.get(CoreAnnotations.AnswerAnnotation.class);
          String nl = l;
          if (!alreadyBIO && !l.equals("O")) {
            if (prev != null && l.equals(prev)) nl = "I-" + l;
            else nl = "B-" + l;
          }
          // Sanity check: the three fields must not contain whitespace of their own.
          String line = w + ' ' + t + ' ' + nl;
          String[] toks = line.split("[ \t\n]+");
          if (toks.length != 3) {
            throw new RuntimeException("INVALID LINE: \"" + line + '"');
          }
          os.printf("%s %s %s\n", w, t, nl);
          prev = l;
        }
        os.println();
      }
    } finally {
      // Runs on success and on failure alike, so the last opened file is never leaked.
      if (os != null) {
        os.close();
      }
    }
  }
示例#24
0
  /**
   * Smoke test: runs a full pipeline (tokenize through dcoref) over one sample sentence and
   * prints every vertex of each sentence's collapsed, CC-processed dependency graph.
   *
   * <p>Failures are reported on standard error; nothing is thrown to the caller.
   */
  private void testParseTree() {
    try {
      Properties props = new Properties();
      props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
      StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

      String text = "Give me a list of all bandleaders that play trumpet.";

      // Wrap the text and run all annotators on it.
      Annotation document = new Annotation(text);
      pipeline.annotate(document);

      // A CoreMap is a map keyed by annotation classes; one CoreMap per sentence.
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
        for (IndexedWord vertex : dependencies.vertexSet()) {
          System.out.println(vertex.toString());
        }
      }
    } catch (Exception e) {
      // Report the failure instead of hiding it; this method has no other way to signal errors.
      e.printStackTrace();
    }
  }
  /**
   * Adds a gender annotation to every token of the given annotation.
   *
   * <p>Each sentence's tokens are run through the classifier, after which the token's answer
   * annotation is copied into its gender annotation. In verbose mode progress is printed to
   * standard error and timed.
   *
   * @param annotation the annotation to enrich; must already contain sentences
   * @throws RuntimeException if the annotation has no sentences annotation
   */
  public void annotate(Annotation annotation) {
    if (verbose) {
      timer.start();
      System.err.print("Adding gender annotation...");
    }

    boolean hasSentences = annotation.containsKey(SentencesAnnotation.class);
    if (!hasSentences) {
      throw new RuntimeException("Unable to find sentences in " + annotation);
    }

    for (CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
      List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
      classifier.classify(sentenceTokens);

      // The classifier's verdict is read from the answer annotation of each token.
      for (CoreLabel token : sentenceTokens) {
        token.set(GenderAnnotation.class, token.get(AnswerAnnotation.class));
      }
    }

    if (verbose) {
      timer.stop("done.");
    }
  }
示例#26
0
文件: Pivot.java 项目: chmr123/phd
  /*
   * Returns the lemma of the last token the pipeline finds in the given text,
   * or null when no tokens were produced.
   */
  private String lemmatize(String text) {
    // Wrap the text and run all annotators on it.
    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    // Every token overwrites the previous result, so the last token's lemma is what remains.
    String lastLemma = null;
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        lastLemma = token.get(LemmaAnnotation.class);
      }
    }
    return lastLemma;
  }
示例#27
0
  /**
   * Find document type: Conversation or article.
   *
   * <p>The document is an article if no token has a non-zero utterance index, or if a token with
   * utterance index 0 appears after one with a non-zero index; otherwise it is a conversation.
   * While scanning, {@code maxUtter} is raised to the largest utterance index seen.
   */
  private DocType findDocType(Dictionaries dict) {
    boolean speakerChange = false;
    Set<Integer> discourseWithIorYou = Generics.newHashSet();

    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = token.get(CoreAnnotations.UtteranceAnnotation.class);
        speakerChange = speakerChange || utterIndex != 0;
        if (speakerChange && utterIndex == 0) {
          // Falling back to index 0 after a non-zero one: treated as an article.
          return DocType.ARTICLE;
        }

        String lowered = token.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
        boolean firstOrSecondPerson =
            dict.firstPersonPronouns.contains(lowered)
                || dict.secondPersonPronouns.contains(lowered);
        if (firstOrSecondPerson) {
          discourseWithIorYou.add(utterIndex);
        }

        if (maxUtter < utterIndex) {
          maxUtter = utterIndex;
        }
      }
    }

    // in conversation, utter index keep increasing.
    return speakerChange ? DocType.CONVERSATION : DocType.ARTICLE;
  }
 /**
  * Runs argument identification, sentential instance generation and relation extraction over
  * every sentence of every document in the corpus, keeping the extractions whose relation is
  * contained in {@code targetRelations}.
  *
  * @param c corpus whose document iterator is consumed
  * @param ai identifies the arguments of each sentence
  * @param sig pairs the identified arguments into sentential instances
  * @param de scores each sentential instance and provides the feature mapping
  * @return the collected extractions after {@code EvaluationUtils.getUniqueList} has been applied
  * @throws SQLException declared by the signature; presumably raised by corpus access (confirm)
  * @throws IOException declared by the signature; presumably raised by corpus access (confirm)
  */
 private static List<Extraction> getExtractions(
     Corpus c, ArgumentIdentification ai, SententialInstanceGeneration sig, DocumentExtractor de)
     throws SQLException, IOException {
   List<Extraction> extrs = new ArrayList<Extraction>();
   Iterator<Annotation> docs = c.getDocumentIterator();
   // Maps feature ids to readable feature names for the per-extraction feature score list.
   Map<Integer, String> ftID2ftMap = ModelUtils.getFeatureIDToFeatureMap(de.getMapping());
   while (docs.hasNext()) {
     Annotation doc = docs.next();
     for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
       // argument identification
       List<Argument> arguments = ai.identifyArguments(doc, sentence);
       // sentential instance generation
       List<Pair<Argument, Argument>> sententialInstances =
           sig.generateSententialInstances(arguments, sentence);
       for (Pair<Argument, Argument> p : sententialInstances) {
         Pair<Triple<String, Double, Double>, Map<Integer, Double>> extrResult =
             de.extractFromSententialInstanceWithFeatureScores(p.first, p.second, sentence, doc);
         if (extrResult == null) {
           continue; // the extractor produced nothing for this argument pair
         }
         Triple<String, Double, Double> scoreTriple = extrResult.first;
         String rel = scoreTriple.first;
         if (!targetRelations.contains(rel)) {
           continue; // only relations of interest are reported
         }
         Map<Integer, Double> featureScores = extrResult.second;
         String docName = sentence.get(SentDocName.class);
         String senText = sentence.get(CoreAnnotations.TextAnnotation.class);
         Integer sentNum = sentence.get(SentGlobalID.class);
         Extraction e =
             new Extraction(p.first, p.second, docName, rel, sentNum, scoreTriple.third, senText);
         e.setFeatureScoreList(EvaluationUtils.getFeatureScoreList(featureScores, ftID2ftMap));
         extrs.add(e);
       }
     }
   }
   return EvaluationUtils.getUniqueList(extrs);
 }
  /**
   * Writes every gold mention span to the logger via {@code logger.finer}, one "RECALL ERROR"
   * entry per span: the sentence, then each token value inside the span, then the sentence's
   * parse tree.
   *
   * @param goldMentions gold mentions grouped by sentence; its size determines how many sentences
   *     are visited
   * @param predictedMentions not used by this method
   * @param doc annotated document providing the sentences, tokens and trees
   * @throws IOException declared by the signature; nothing in this body throws it directly
   */
  private static void recallErrors(
      List<List<Mention>> goldMentions, List<List<Mention>> predictedMentions, Annotation doc)
      throws IOException {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    for (int sentIdx = 0, total = goldMentions.size(); sentIdx < total; sentIdx++) {
      CoreMap sentence = sentences.get(sentIdx);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      Tree parse = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

      for (Pair<Integer, Integer> span : extractSpans(goldMentions.get(sentIdx))) {
        logger.finer("RECALL ERROR\n");
        logger.finer(sentence + "\n");
        // The span is treated as half-open: first is included, second is excluded.
        int pos = span.first;
        final int end = span.second;
        while (pos < end) {
          logger.finer(tokens.get(pos).value() + " ");
          pos++;
        }
        logger.finer("\n" + parse + "\n");
      }
    }
  }
示例#30
0
 /**
  * Stores a paragraph index on every token and copies it onto each predicted mention.
  *
  * <p>A token starts a new paragraph when its begin offset lies more than two characters past the
  * end offset of the previous token that had offsets. Tokens without a begin offset receive
  * {@code -1}. Afterwards each mention takes the paragraph of its head word and
  * {@code numParagraph} is set to the last paragraph index assigned.
  */
 private void setParagraphAnnotation() {
   int currentParagraph = 0;
   int lastEndOffset = -10;
   for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
       if (!token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
         // No character offsets on this token: mark it as belonging to no paragraph.
         token.set(CoreAnnotations.ParagraphAnnotation.class, -1);
         continue;
       }
       boolean startsNewParagraph =
           token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > lastEndOffset + 2;
       if (startsNewParagraph) {
         currentParagraph++;
       }
       token.set(CoreAnnotations.ParagraphAnnotation.class, currentParagraph);
       lastEndOffset = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
     }
   }
   // Propagate the paragraph of each mention's head word to the mention itself.
   for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
     for (Mention mention : sentenceMentions) {
       mention.paragraph = mention.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
     }
   }
   numParagraph = currentParagraph;
 }