/**
 * Annotates {@code text} with the shared pipeline and converts every sentence into an
 * {@code NLPInfo} holding one {@code NLPToken} (word, POS tag, NER tag) per token.
 *
 * @param text raw text to analyse
 * @return one {@code NLPInfo} per detected sentence, or {@code null} when the pipeline produced
 *     no sentences
 */
public List<NLPInfo> analyze(String text) {
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);

    List<CoreMap> sentenceMaps = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    boolean nothingFound = sentenceMaps == null || sentenceMaps.isEmpty();
    if (nothingFound) {
      // Callers get null (not an empty list) when there is nothing to report.
      return null;
    }

    List<NLPInfo> result = new ArrayList<NLPInfo>();
    for (CoreMap sentenceMap : sentenceMaps) {
      NLPInfo sentenceInfo = new NLPInfo();
      List<CoreLabel> labels = sentenceMap.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel label : labels) {
        NLPToken converted = new NLPToken();
        converted.setWord(label.get(CoreAnnotations.TextAnnotation.class));
        converted.setTag(label.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        converted.setNer(label.get(CoreAnnotations.NamedEntityTagAnnotation.class));
        sentenceInfo.appendToken(converted);
      }
      result.add(sentenceInfo);
    }
    return result;
  }
 /**
  * Finds the spans covered by runs of consecutive noun tokens.
  *
  * <p>The text is annotated with the shared pipeline. Within each sentence, every maximal run of
  * adjacent tokens whose part-of-speech tag starts with {@code "NN"} produces one entry that maps
  * the begin position of the run's first token to the end position of its last token. Runs never
  * cross sentence boundaries.
  *
  * @param text raw text to annotate
  * @return map from span begin position to span end position; empty when no sentences or no noun
  *     runs are found
  */
 public Map<Integer, Integer> getGeneSpans(String text) {
   Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
   Annotation document = new Annotation(text);
   pipeline.annotate(document);
   List<CoreMap> sentences = document.get(SentencesAnnotation.class);
   if (sentences == null) {
     // No sentence annotation available: nothing to report rather than a NullPointerException.
     return begin2end;
   }
   for (CoreMap sentence : sentences) {
     // State of the noun run currently being extended within this sentence.
     boolean inRun = false;
     int runBegin = 0;
     int runEnd = 0;
     for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
       String pos = token.get(PartOfSpeechAnnotation.class);
       // A token without a POS tag is treated as a non-noun and therefore ends the run.
       if (pos != null && pos.startsWith("NN")) {
         if (!inRun) {
           inRun = true;
           runBegin = token.beginPosition();
         }
         runEnd = token.endPosition();
       } else if (inRun) {
         begin2end.put(runBegin, runEnd);
         inRun = false;
       }
     }
     // Flush a run that extends to the end of the sentence.
     if (inRun) {
       begin2end.put(runBegin, runEnd);
     }
   }
   return begin2end;
 }
Example #3
0
  /**
   * Returns the lemmas of all tokens in {@code t}, joined by single spaces.
   *
   * <p>If {@code pipeline} has not been set yet, {@code loadModels()} is called first. Any
   * exception raised while annotating or reading the annotations is reported on
   * {@code System.err}; the lemmas gathered up to that point are still returned.
   *
   * @param t text to lemmatize
   * @return space-separated lemmas in token order, trimmed; empty when nothing was collected
   */
  public static String lemmatize(String t) {

    if (pipeline == null) {
      loadModels();
    }

    // A builder avoids re-copying the growing result for every token.
    StringBuilder lemmas = new StringBuilder();

    try {
      Annotation document = new Annotation(t);
      pipeline.annotate(document);

      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemmas.append(' ').append(token.get(CoreAnnotations.LemmaAnnotation.class));
        }
      }
    } catch (Exception e) {
      System.err.println("Stanford Lemmatizer error exception Word: " + t);
    }

    return lemmas.toString().trim();
  }
Example #4
0
  /**
   * Rebuilds the text of {@code annotation} with coreferent tokens replaced by the words of their
   * chain's representative mention.
   *
   * <p>For every token: if its coref cluster id maps to no chain, or its index lies within
   * {@code [startIndex, endIndex]} of the representative mention, the token's own word is kept;
   * otherwise the words of the representative mention (tokens {@code startIndex} up to, but not
   * including, {@code endIndex}, taken from the mention's sentence) are emitted instead. The
   * result is also printed to {@code System.out}, preceded by an empty line.
   *
   * @param annotation annotated document carrying coref chains and sentences
   * @return the emitted words, each followed by a single space
   */
  public static final String doCorefResolution(Annotation annotation) {

    Map<Integer, CorefChain> chainsById = annotation.get(CorefChainAnnotation.class);
    List<CoreMap> allSentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> output = new ArrayList<String>();

    for (CoreMap current : allSentences) {
      for (CoreLabel word : current.get(CoreAnnotations.TokensAnnotation.class)) {
        Integer clusterId = word.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
        CorefChain cluster = chainsById.get(clusterId);
        if (cluster == null) {
          output.add(word.word());
          continue;
        }

        // Sentence numbers and token indices of mentions are 1-based, hence the "- 1" lookups.
        CorefMention representative = cluster.getRepresentativeMention();
        List<CoreLabel> representativeSentenceTokens =
            allSentences.get(representative.sentNum - 1).get(TokensAnnotation.class);

        boolean withinRepresentativeRange =
            word.index() >= representative.startIndex && word.index() <= representative.endIndex;
        if (withinRepresentativeRange) {
          output.add(word.word());
          continue;
        }

        for (int pos = representative.startIndex; pos < representative.endIndex; pos++) {
          output.add(representativeSentenceTokens.get(pos - 1).word());
        }
      }
    }

    StringBuilder joined = new StringBuilder();
    System.out.println();
    for (String piece : output) {
      joined.append(piece).append(" ");
    }
    String resolvedStr = joined.toString();
    System.out.println(resolvedStr);

    return resolvedStr;
  }
Example #5
0
  /**
   * Fills in the missing predicate of every stored sentence.
   *
   * <p>Each sentence without a predicate is parsed; its predicate is set to the lemma of the first
   * root of the collapsed, CC-processed dependency graph (the last parsed sub-sentence wins) and
   * the sentence is saved. A running counter is printed per processed sentence, and failures are
   * printed without stopping the loop.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    SentenceDAO dao = new SentenceDAOImpl();
    List<Sentence> stored = dao.findAll();

    Properties config = new Properties();
    config.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
    StanfordCoreNLP nlp = new StanfordCoreNLP(config);

    int processed = 0;
    for (Sentence entry : stored) {
      if (entry.getPredicate() != null) {
        continue; // already has a predicate
      }
      try {
        System.out.println(processed++);
        Annotation parsed = new Annotation(entry.getContent());
        nlp.annotate(parsed);
        for (CoreMap part : parsed.get(SentencesAnnotation.class)) {
          SemanticGraph dependencies = part.get(CollapsedCCProcessedDependenciesAnnotation.class);
          entry.setPredicate(dependencies.getFirstRoot().lemma());
        }
        dao.save(entry);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
Example #6
0
  /**
   * Builds the {@code Document} used for coreference from an input document. Mention detection
   * and document preprocessing both happen here.
   *
   * @param input the input document; may be {@code null}
   * @return the preprocessed document, or {@code null} when {@code input} is {@code null}
   * @throws Exception declared broadly; the concrete failure causes are not visible in this method
   */
  public Document makeDocument(InputDoc input) throws Exception {
    if (input == null) {
      return null;
    }

    Annotation docAnnotation = input.annotation;
    if (needMissingAnnotations) {
      addMissingAnnotation(docAnnotation);
    }

    String markedDiscourseSetting = props.getProperty("hcoref.useMarkedDiscourse", "false");
    if (Boolean.parseBoolean(markedDiscourseSetting)) {
      docAnnotation.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }

    // For Chinese CoNLL input, nested NPs sharing a headword are removed unless the document id
    // contains "nw" (newswire).
    boolean chineseConllInput =
        input.conllDoc != null && CorefProperties.getLanguage(props) == Locale.CHINESE;
    if (chineseConllInput) {
      boolean idContainsNw = input.conllDoc.documentID.contains("nw");
      CorefProperties.setRemoveNested(props, !idContainsNw);
    }

    // Mention detection supplies start/end index, span and headword; the remaining mention
    // attributes are filled in by the preprocessing step below.
    List<List<Mention>> detectedMentions = md.findMentions(docAnnotation, dict, props);
    Document result = new Document(input, detectedMentions);

    if (input.goldMentions != null) {
      findGoldMentionHeads(result);
    }

    // Initialisation (id assignment), mention processing and speaker extraction.
    Preprocessor.preprocess(result, dict, singletonPredictor, headFinder);

    return result;
  }
  /**
   * Speaker extraction.
   *
   * <p>When gold speaker tags or marked discourse are in use, the speaker recorded on each token
   * is copied into {@code speakers}, keyed by the token's utterance index. Otherwise speakers are
   * inferred according to the document type and then written back onto every token whose
   * utterance index has a known speaker.
   */
  private void findSpeakers(Dictionaries dict) {
    Boolean markedDiscourseFlag =
        annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    boolean speakersAlreadyTagged =
        Constants.USE_GOLD_SPEAKER_TAGS || (markedDiscourseFlag != null && markedDiscourseFlag);

    if (speakersAlreadyTagged) {
      for (CoreMap sentenceMap : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel word : sentenceMap.get(CoreAnnotations.TokensAnnotation.class)) {
          int utterance = word.get(CoreAnnotations.UtteranceAnnotation.class);
          speakers.put(utterance, word.get(CoreAnnotations.SpeakerAnnotation.class));
        }
      }
      return;
    }

    if (docType == DocType.CONVERSATION) {
      findSpeakersInConversation(dict);
    } else if (docType == DocType.ARTICLE) {
      findSpeakersInArticle(dict);
    }

    // Propagate the inferred speakers back onto the tokens.
    for (CoreMap sentenceMap : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel word : sentenceMap.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterance = word.get(CoreAnnotations.UtteranceAnnotation.class);
        if (!speakers.containsKey(utterance)) {
          continue;
        }
        word.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterance));
      }
    }
  }
  /**
   * Parses {@code text} and collects the anaphor/referent pairs found in all of its sentences.
   *
   * <p>The text is annotated, each sentence's parse tree is wrapped in a {@code Sentence}
   * (numbered from 0, together with the NPs extracted from the tree), and the per-sentence finder
   * is run on every sentence with the full sentence list as context.
   *
   * @param text input text; its {@code toString()} form is what gets annotated
   * @return the pairs found, in sentence order
   */
  private static List<AnaphorWithReferent> parseText(InputText text) {
    Annotation annotated = new Annotation(text.toString());
    Container.getStanfordCoreNLP().annotate(annotated);

    List<Tree> parseTrees =
        annotated.get(CoreAnnotations.SentencesAnnotation.class).stream()
            .map(sentenceMap -> sentenceMap.get(TreeCoreAnnotations.TreeAnnotation.class))
            .collect(Collectors.toList());

    List<Sentence> parsedSentences =
        IntStream.range(0, parseTrees.size())
            .mapToObj(
                id -> {
                  Tree tree = parseTrees.get(id);
                  return new Sentence(
                      id, tree, Container.getNPsFromParseTreeExtractor().extract(tree));
                })
            .collect(Collectors.toList());

    return parsedSentences.stream()
        .flatMap(
            current ->
                Container.getAllAnaphorWithReferentPerSentenceFinder()
                    .find(current, parsedSentences)
                    .stream())
        .collect(Collectors.toList());
  }
Example #9
0
  /**
   * Returns the lemma of every token in {@code documentText}, in document order.
   *
   * <p>If {@code pipeline} has not been set yet, {@code loadModels()} is called first.
   *
   * @param documentText text to annotate
   * @return one entry per token, taken from the token's lemma annotation
   */
  public static List<String> lemmatizeDocument(String documentText) {
    if (pipeline == null) {
      loadModels();
    }

    List<String> collected = new LinkedList<>();

    Annotation annotated = new Annotation(documentText);
    pipeline.annotate(annotated);

    for (CoreMap sentenceMap : annotated.get(SentencesAnnotation.class)) {
      List<CoreLabel> words = sentenceMap.get(TokensAnnotation.class);
      for (CoreLabel word : words) {
        collected.add(word.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }

    return collected;
  }
Example #10
0
  /**
   * Pairs each noun in {@code text} with the nearest adjective found within {@code searchRange}
   * tokens.
   *
   * <p>For every token whose POS tag is in {@code NN_TAGS}, the neighbours at distance 1, 2, ...
   * are inspected, left before right at each distance, with positions clamped to the sentence
   * bounds. The first neighbour tagged with one of {@code JJ_TAGS} is paired with the noun via
   * {@code addPair}, using lemmas. If any token lemma of the sentence is in {@code NEGATIONS},
   * the adjective lemma is prefixed with {@code NOT_PREFIX}.
   *
   * <p>Side effect: the static {@code MAX_STEPS} is overwritten with {@code searchRange}.
   *
   * @param pipeline pipeline used to annotate the text
   * @param text raw text
   * @param searchRange maximum distance, in tokens, to look for an adjective
   * @return the list the pairs were added to
   */
  public static ArrayList<String[]> extractNounPhrases(
      StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> pairs = new ArrayList<String[]>();
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);
    List<CoreMap> sentenceMaps = annotated.get(SentencesAnnotation.class);

    MAX_STEPS = searchRange;

    for (CoreMap sentenceMap : sentenceMaps) {
      List<CoreLabel> tokens = sentenceMap.get(TokensAnnotation.class);
      int lastIndex = tokens.size() - 1;

      // Every token is inspected (no early exit); one negation word marks the whole sentence.
      boolean negated = false;
      for (CoreLabel token : tokens) {
        negated |= NEGATIONS.contains(token.lemma().toLowerCase());
      }

      for (int nounIdx = 0; nounIdx <= lastIndex; nounIdx++) {
        CoreLabel noun = tokens.get(nounIdx);
        if (!NN_TAGS.contains(noun.get(PartOfSpeechAnnotation.class))) {
          continue;
        }

        for (int distance = 1; distance <= MAX_STEPS; distance++) {
          // Left neighbour takes precedence over the right one at the same distance.
          CoreLabel adjective = null;
          CoreLabel before = tokens.get(Math.max(0, nounIdx - distance));
          if (JJ_TAGS.contains(before.tag())) {
            adjective = before;
          } else {
            CoreLabel after = tokens.get(Math.min(nounIdx + distance, lastIndex));
            if (JJ_TAGS.contains(after.tag())) {
              adjective = after;
            }
          }

          if (adjective != null) {
            String adjectiveLemma = adjective.get(LemmaAnnotation.class);
            addPair(
                pairs,
                negated ? NOT_PREFIX + adjectiveLemma : adjectiveLemma,
                noun.get(LemmaAnnotation.class));
            break;
          }
        }
      }
    }
    return pairs;
  }
Example #11
0
 /**
  * Runs {@code pipeline} over a single sentence by wrapping it in a temporary {@code Annotation}.
  *
  * <p>The wrapper's text is the sentence's tokens joined by single spaces, its token list is the
  * sentence's token list, and its sentence list contains only {@code sentence}. The wrapper is
  * not returned.
  *
  * @param sentence the sentence to annotate; must carry a tokens annotation
  * @param pipeline the pipeline to run
  */
 public static void annotate(CoreMap sentence, AnnotationPipeline pipeline) {
   String joinedTokens =
       StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), " ");
   Annotation wrapper = new Annotation(joinedTokens);
   wrapper.set(
       CoreAnnotations.TokensAnnotation.class,
       sentence.get(CoreAnnotations.TokensAnnotation.class));
   wrapper.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
   pipeline.annotate(wrapper);
 }
Example #12
0
  /**
   * Annotates {@code sentence} with the main pipeline and then with {@code timeAnnotator}.
   *
   * @param sentence text to annotate
   * @param dateString document date to attach before annotation; skipped when {@code null} or
   *     empty
   * @param timeAnnotator annotator run after the main pipeline
   * @return the resulting annotation
   */
  public Annotation process(String sentence, String dateString, Annotator timeAnnotator) {
    log.info("Processing text \"" + sentence + "\" with dateString = " + dateString);

    Annotation annotated = new Annotation(sentence);
    boolean hasDate = dateString != null && !dateString.isEmpty();
    if (hasDate) {
      annotated.set(CoreAnnotations.DocDateAnnotation.class, dateString);
    }

    pipeline.annotate(annotated);
    timeAnnotator.annotate(annotated);
    return annotated;
  }
Example #13
0
 /**
  * Interactive loop: reads lines from standard input, annotates each one with the SUTime
  * pipeline (no document date) and prints the resulting timex annotations. Ends at end of input.
  *
  * @param args unused
  * @throws IOException if reading from standard input fails
  */
 public static void main(String[] args) throws IOException {
   SUTimePipeline pipeline = new SUTimePipeline();
   Annotator timeAnnotator = pipeline.getTimeAnnotator("sutime", new Properties());
   BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));

   System.out.print("> ");
   String line = stdin.readLine();
   while (line != null) {
     Annotation annotated = pipeline.process(line, null, timeAnnotator);
     System.out.println(annotated.get(TimeAnnotations.TimexAnnotations.class));
     System.out.print("> ");
     line = stdin.readLine();
   }
 }
Example #14
0
 /**
  * Generates a question for a non-CVT binary and adds it to {@code res} when one is produced.
  *
  * <p>Only the first description of the binary is used. It is normalized and annotated; its POS
  * tags and the first child of the first sentence's parse tree are handed to the question
  * generator.
  *
  * @param fgInfo generation info whose {@code bInfo} supplies the description and formula
  * @param res receives the generated question, if any
  */
 private void handleNonCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
   String normalized = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
   Annotation parsed = getAnnotation(normalized);

   String generated =
       generateNonCvtQuestion(
           fgInfo,
           normalized,
           getPosTagsFromAnnotation(parsed),
           parsed.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
           fbFormulasInfo.isReversed(fgInfo.bInfo.formula));

   if (generated == null) {
     return;
   }
   res.add(generated);
 }
  /**
   * Finds the position of the sentence in the given document that achieves the best ROUGE-N score
   * with respect to the reference summaries.
   *
   * <p>The document content is annotated, a {@code BestSentenceSelector} built on a ROUGE-N scorer
   * for the task's models picks an annotation, and the position is read from the sentence
   * position annotation of that annotation's first sentence.
   *
   * @param task the document and the corresponding models
   * @return the position of the best sentence in the document
   * @throws NumberFormatException if the stored sentence position is missing or is not a valid
   *     integer
   */
  public int getBestSentencePos(Task task) {
    Document document = task.getDocument();
    Annotation documentAnnotation = annotationProvider.getAnnotation(document.getContent());

    RougeN rouge = rougeFactory.make(task.getModels(), annotationProvider);
    BestSentenceSelector sentenceSelector = new BestSentenceSelector(rouge);
    Annotation bestAnnotation = sentenceSelector.select(documentAnnotation);
    CoreMap sentence = bestAnnotation.get(SentencesAnnotation.class).get(0);
    String bestPos = sentence.get(SentencePositionAnnotation.class);

    // parseInt yields the primitive directly; valueOf would box and immediately unbox.
    return Integer.parseInt(bestPos);
  }
Example #16
0
  /**
   * Builds sentence vectors for every email thread and updates the dictionary statistics.
   *
   * <p>For each email, only sentences with a quotation count of 0 are used. Their lower-cased
   * text is annotated, and each resulting sentence is reduced to lemma counts via
   * {@code countWordsInSentence}. Sentences with at least one counted word increment
   * {@code totalSentenceNumber}, register unseen words in {@code dictionaryIndex}, bump
   * {@code dictionaryDocumentCount} once per distinct word, and are added to the thread's vector.
   *
   * @param threads threads to process; a {@code ThreadVector} per thread is appended to
   *     {@code allThreads}
   */
  private void parseThread(ArrayList<Thread> threads) {
    for (Thread thread : threads) {
      ThreadVector threadVector = new ThreadVector(thread);
      allThreads.add(threadVector);

      for (Email email : thread.getEmails()) {
        // Collect the email's own text, skipping quoted material.
        StringBuilder ownText = new StringBuilder();
        for (Sentence line : email.getSentences()) {
          if (line.getQuotationTimes() != 0) {
            continue;
          }
          ownText.append(line.getText() + " ");
        }

        Annotation annotated = new Annotation(ownText.toString().toLowerCase());
        this.pipeline.annotate(annotated);

        for (CoreMap sentenceMap : annotated.get(SentencesAnnotation.class)) {
          List<String> lemmas = new LinkedList<String>();
          for (CoreLabel word : sentenceMap.get(TokensAnnotation.class)) {
            lemmas.add(word.get(LemmaAnnotation.class));
          }

          HashMap<String, Integer> counts = countWordsInSentence(lemmas);
          if (counts.isEmpty()) {
            continue; // no valid words in this sentence
          }

          totalSentenceNumber++;
          for (String term : counts.keySet()) {
            if (dictionaryIndex.containsKey(term)) {
              dictionaryDocumentCount.put(term, dictionaryDocumentCount.get(term) + 1);
            } else {
              // New word: its index is the current dictionary size.
              dictionaryIndex.put(term, dictionaryIndex.size());
              dictionaryDocumentCount.put(term, 1);
            }
          }
          threadVector.addSentenceVectors(new SentenceVector(sentenceMap.toString(), counts));
        }
      }
    }
  }
Example #17
0
  /**
   * Returns the lemma of every token in {@code rawInput}, read from the document-level token
   * list.
   *
   * @param rawInput text to annotate with {@code coreNlp}
   * @return lemmas in token order
   */
  public static Collection<String> lemmatize(String rawInput) {
    Annotation annotated = new Annotation(rawInput);

    // 30 is only the initial capacity requested for the result list.
    Collection<String> result = Lists.newArrayListWithCapacity(30);

    coreNlp.annotate(annotated);
    for (CoreLabel token : annotated.get(TokensAnnotation.class)) {
      result.add(token.get(LemmaAnnotation.class));
    }
    return result;
  }
Example #18
0
  /**
   * Builds a {@code DependencyParse} from the collapsed, CC-processed dependencies of
   * {@code text}.
   *
   * <p>If {@code pipeline} has not been set yet, {@code loadModels()} is called first. For every
   * sentence the head node is set to the index of the graph's first root (so the last sentence
   * determines the final head), and every edge contributes its governor node, its dependent node
   * and an edge labelled with the relation's short name.
   *
   * @param text text to parse
   * @return the populated parse
   */
  public static DependencyParse parse(String text) {
    if (pipeline == null) {
      loadModels();
    }

    DependencyParse result = new DependencyParse();
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);

    for (CoreMap sentenceMap : annotated.get(SentencesAnnotation.class)) {
      SemanticGraph graph = sentenceMap.get(CollapsedCCProcessedDependenciesAnnotation.class);
      result.setHeadNode(graph.getFirstRoot().index());

      for (SemanticGraphEdge edge : graph.edgeListSorted()) {
        IndexedWord dependent = edge.getDependent();
        String dependentText = dependent.originalText();
        int dependentIndex = dependent.index();
        String dependentTag = dependent.tag();
        int dependentBegin = dependent.beginPosition();
        int dependentEnd = dependent.endPosition();

        IndexedWord governor = edge.getGovernor();
        String governorText = governor.originalText();
        int governorIndex = governor.index();
        String governorTag = governor.tag();
        int governorBegin = governor.beginPosition();
        int governorEnd = governor.endPosition();

        result.addNode(governorIndex, governorText, governorTag, governorBegin, governorEnd);
        result.addNode(dependentIndex, dependentText, dependentTag, dependentBegin, dependentEnd);
        result.addEdge(dependentIndex, governorIndex, edge.getRelation().getShortName());
      }
    }

    return result;
  }
 /**
  * Infers speakers for a conversation-type document.
  *
  * <p>First, every mention that has a predicate nominative whose span is "i" (case-insensitive
  * via lower-casing) is recorded as the speaker of its head word's utterance. Then sentences are
  * grouped into paragraphs of equal utterance index (taken from each sentence's first token) and
  * {@code findParagraphSpeaker} is invoked per paragraph, carrying its returned value on to the
  * next paragraph.
  */
 private void findSpeakersInConversation(Dictionaries dict) {
   for (List<Mention> sentenceMentions : predictedOrderedMentionsBySentence) {
     for (Mention mention : sentenceMentions) {
       if (mention.predicateNominatives == null) {
         continue;
       }
       for (Mention nominative : mention.predicateNominatives) {
         if (!nominative.spanToString().toLowerCase().equals("i")) {
           continue;
         }
         speakers.put(
             mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
             Integer.toString(mention.mentionID));
       }
     }
   }

   List<CoreMap> currentParagraph = new ArrayList<CoreMap>();
   int currentParagraphUtterance = 0;
   String carriedSpeaker = "";
   int sentencesBefore = 0;
   for (CoreMap sentenceMap : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     int sentenceUtterance =
         sentenceMap
             .get(CoreAnnotations.TokensAnnotation.class)
             .get(0)
             .get(CoreAnnotations.UtteranceAnnotation.class);
     if (sentenceUtterance != currentParagraphUtterance) {
       // Utterance changed: close the paragraph collected so far and start a new one.
       carriedSpeaker =
           findParagraphSpeaker(
               currentParagraph, currentParagraphUtterance, carriedSpeaker, sentencesBefore, dict);
       currentParagraphUtterance = sentenceUtterance;
       sentencesBefore += currentParagraph.size();
       currentParagraph = new ArrayList<CoreMap>();
     }
     currentParagraph.add(sentenceMap);
   }
   // Handle the trailing paragraph.
   findParagraphSpeaker(
       currentParagraph, currentParagraphUtterance, carriedSpeaker, sentencesBefore, dict);
 }
  /**
   * Infers speakers for an article-type document by locating quotations.
   *
   * <p>Tokens are scanned in order. A token with a non-zero utterance index while outside a
   * quotation opens one (its sentence/token position is stored as the begin); a token with
   * utterance index 0 while inside a quotation closes it (position stored as the end) and
   * triggers {@code findQuotationSpeaker}. A quotation still open at the end of the document is
   * not processed.
   */
  private void findSpeakersInArticle(Dictionaries dict) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    Pair<Integer, Integer> quoteStart = new Pair<Integer, Integer>();
    Pair<Integer, Integer> quoteEnd = new Pair<Integer, Integer>();
    boolean insideQuotation = false;
    int quotedUtterance = -1;

    for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
      List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
      for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
        int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
        boolean quotedToken = utterIndex != 0;

        if (quotedToken == insideQuotation) {
          continue; // no transition at this token
        }

        if (quotedToken) {
          quotedUtterance = utterIndex;
          insideQuotation = true;
          quoteStart.setFirst(sentIdx);
          quoteStart.setSecond(tokIdx);
        } else {
          insideQuotation = false;
          quoteEnd.setFirst(sentIdx);
          quoteEnd.setSecond(tokIdx);
          findQuotationSpeaker(quotedUtterance, sentences, quoteStart, quoteEnd, dict);
        }
      }
    }
  }
  /**
   * Attaches a FIGER annotation to every sentence of {@code d}, derived from its named entity
   * linking annotation.
   *
   * <p>All linked entity ids other than the literal string {@code "null"} are gathered and
   * resolved in one {@code bigQuery} call. Each sentence then receives one triple per link:
   * the type set (or {@code null} for unlinked entries) plus the link's start and end.
   * Types are looked up under {@code GuidMidConversion.convertBackward(id)}.
   *
   * @param d document whose sentences carry a named entity linking annotation
   * @throws SQLException declared by this method; presumably raised by {@code bigQuery}
   */
  public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {

    List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);

    // Pass 1: gather every linked id so the types can be fetched in a single query.
    Set<String> linkedIds = new HashSet<String>();
    for (CoreMap sentenceMap : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> links =
          sentenceMap.get(NamedEntityLinkingAnnotation.class);
      for (Triple<Pair<Integer, Integer>, String, Float> link : links) {
        String linkedId = link.second;
        if (linkedId.equals("null")) {
          continue;
        }
        linkedIds.add(linkedId);
      }
    }
    Map<String, Set<String>> typesById = bigQuery(linkedIds);

    // Pass 2: attach the types to each sentence.
    for (CoreMap sentenceMap : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> links =
          sentenceMap.get(NamedEntityLinkingAnnotation.class);
      List<Triple<Set<String>, Integer, Integer>> figerEntries = new ArrayList<>();
      for (Triple<Pair<Integer, Integer>, String, Float> link : links) {
        Integer spanStart = link.first.first;
        Integer spanEnd = link.first.second;
        Set<String> entityTypes =
            link.second.equals("null")
                ? null
                : typesById.get(GuidMidConversion.convertBackward(link.second));
        figerEntries.add(new Triple<>(entityTypes, spanStart, spanEnd));
      }
      sentenceMap.set(FigerAnnotation.class, figerEntries);
    }
  }
Example #22
0
  /**
   * Generates a question for a CVT binary and adds it to {@code res} when one is produced.
   *
   * <p>The first two descriptions of the binary are normalized and annotated. For each, the first
   * child of the first sentence's parse tree and the POS tags are handed to the question
   * generator.
   *
   * @param fgInfo generation info whose {@code bInfo} supplies the two descriptions
   * @param res receives the generated question, if any
   */
  private void handleCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
    String firstDescription = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
    String secondDescription = normalizeFbDescription(fgInfo.bInfo.descriptions.get(1));
    Annotation firstParsed = getAnnotation(firstDescription);
    Annotation secondParsed = getAnnotation(secondDescription);

    String generated =
        generateCvtQuestion(
            fgInfo,
            firstDescription,
            secondDescription,
            firstParsed
                .get(SentencesAnnotation.class)
                .get(0)
                .get(TreeAnnotation.class)
                .firstChild(),
            secondParsed
                .get(SentencesAnnotation.class)
                .get(0)
                .get(TreeAnnotation.class)
                .firstChild(),
            getPosTagsFromAnnotation(firstParsed),
            getPosTagsFromAnnotation(secondParsed));

    if (generated == null) {
      return;
    }
    res.add(generated);
  }
Example #23
0
  /**
   * Smoke test: parses a fixed example sentence with a full pipeline and prints every vertex of
   * each sentence's collapsed, CC-processed dependency graph to {@code System.out}.
   *
   * <p>Failures are reported on {@code System.err} instead of being silently discarded; the
   * method itself never throws.
   */
  private void testParseTree() {
    try {
      Properties props = new Properties();
      props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
      StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

      String text = "Give me a list of all bandleaders that play trumpet.";

      Annotation document = new Annotation(text);
      pipeline.annotate(document);

      List<CoreMap> sentences = document.get(SentencesAnnotation.class);

      for (CoreMap sentence : sentences) {
        SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);

        Set<IndexedWord> vertices = dependencies.vertexSet();
        for (IndexedWord vertex : vertices) {
          System.out.println(vertex.toString());
        }
      }

    } catch (Exception e) {
      // Best-effort smoke test: keep going, but make the failure visible.
      System.err.println("testParseTree failed: " + e);
      e.printStackTrace();
    }
  }
  /**
   * Adds a gender annotation to every token of {@code annotation}.
   *
   * <p>Each sentence's tokens are passed to {@code classifier.classify}; afterwards every token's
   * answer annotation is copied into its gender annotation. In verbose mode, progress and timing
   * are reported.
   *
   * @param annotation document that must already contain sentences
   * @throws RuntimeException if {@code annotation} has no sentences annotation
   */
  public void annotate(Annotation annotation) {
    if (verbose) {
      timer.start();
      System.err.print("Adding gender annotation...");
    }

    boolean hasSentences = annotation.containsKey(SentencesAnnotation.class);
    if (!hasSentences) {
      throw new RuntimeException("Unable to find sentences in " + annotation);
    }

    for (CoreMap sentenceMap : annotation.get(SentencesAnnotation.class)) {
      List<CoreLabel> words = sentenceMap.get(TokensAnnotation.class);
      classifier.classify(words);

      for (CoreLabel word : words) {
        word.set(GenderAnnotation.class, word.get(AnswerAnnotation.class));
      }
    }

    if (verbose) {
      timer.stop("done.");
    }
  }
Example #25
0
  /**
   * Annotates the {@code "cleansed_text"} field of {@code doc} and reads the resulting
   * annotations.
   *
   * <p>NOTE(review): every value read here (token text/POS/NER, parse tree, dependency graph,
   * coreference chains) is stored in a local and then dropped, and the {@code BasicDBObject}
   * created at the top is never filled or returned. The method looks unfinished.
   *
   * @param doc database object whose {@code "cleansed_text"} entry is cast to {@code String}
   * @param pipeline pipeline used to annotate the text
   */
  public static void describe(DBObject doc, StanfordCoreNLP pipeline) {
    Annotation annotated = new Annotation((String) doc.get("cleansed_text"));
    BasicDBObject description = new BasicDBObject();
    pipeline.annotate(annotated);

    for (CoreMap sentenceMap : annotated.get(SentencesAnnotation.class)) {
      // Per-token information: surface text, POS tag and NER label.
      for (CoreLabel word : sentenceMap.get(TokensAnnotation.class)) {
        String surface = word.get(TextAnnotation.class);
        String posTag = word.get(PartOfSpeechAnnotation.class);
        String nerLabel = word.get(NamedEntityTagAnnotation.class);
      }

      // Per-sentence information: parse tree and dependency graph.
      Tree parseTree = sentenceMap.get(TreeAnnotation.class);
      SemanticGraph dependencyGraph =
          sentenceMap.get(CollapsedCCProcessedDependenciesAnnotation.class);
    }

    // Document-level coreference chains, keyed by chain id.
    Map<Integer, CorefChain> corefChains = annotated.get(CorefChainAnnotation.class);
  }
Example #26
0
  /**
   * Returns the lemma of the last token of {@code text}.
   *
   * <p>All tokens of all sentences are visited and each one overwrites the result, so for
   * multi-word input only the final token's lemma survives.
   *
   * @param text the term to lemmatize
   * @return the last token's lemma, or {@code null} when the text yields no tokens
   */
  private String lemmatize(String text) {
    Annotation annotated = new Annotation(text);
    pipeline.annotate(annotated);

    String lastLemma = null;
    for (CoreMap sentenceMap : annotated.get(SentencesAnnotation.class)) {
      for (CoreLabel word : sentenceMap.get(TokensAnnotation.class)) {
        lastLemma = word.get(LemmaAnnotation.class);
      }
    }
    return lastLemma;
  }
 /**
  * Runs the extraction pipeline over every document of a corpus and collects the extractions
  * whose relation is one of {@code targetRelations}.
  *
  * <p>Per sentence: arguments are identified, sentential instances (argument pairs) are
  * generated, and each pair is scored by the document extractor. Pairs without a result, or with
  * a relation outside {@code targetRelations}, are skipped. Each kept extraction also carries its
  * feature score list.
  *
  * @param c corpus providing the documents
  * @param ai argument identification step
  * @param sig sentential instance generation step
  * @param de extractor providing relation scores and the feature mapping
  * @return the collected extractions after {@code EvaluationUtils.getUniqueList}
  * @throws SQLException declared by this method; raised by the steps it calls
  * @throws IOException declared by this method; raised by the steps it calls
  */
 private static List<Extraction> getExtractions(
     Corpus c, ArgumentIdentification ai, SententialInstanceGeneration sig, DocumentExtractor de)
     throws SQLException, IOException {
   List<Extraction> extrs = new ArrayList<Extraction>();
   Iterator<Annotation> docs = c.getDocumentIterator();
   Map<Integer, String> ftID2ftMap = ModelUtils.getFeatureIDToFeatureMap(de.getMapping());
   while (docs.hasNext()) {
     Annotation doc = docs.next();
     List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
     for (CoreMap sentence : sentences) {
       // argument identification
       List<Argument> arguments = ai.identifyArguments(doc, sentence);
       // sentential instance generation
       List<Pair<Argument, Argument>> sententialInstances =
           sig.generateSententialInstances(arguments, sentence);
       for (Pair<Argument, Argument> p : sententialInstances) {
         Pair<Triple<String, Double, Double>, Map<Integer, Double>> extrResult =
             de.extractFromSententialInstanceWithFeatureScores(p.first, p.second, sentence, doc);
         if (extrResult == null) {
           continue;
         }
         Triple<String, Double, Double> extrScoreTriple = extrResult.first;
         Map<Integer, Double> featureScores = extrResult.second;
         String rel = extrScoreTriple.first;
         if (!targetRelations.contains(rel)) {
           continue;
         }
         String docName = sentence.get(SentDocName.class);
         String senText = sentence.get(CoreAnnotations.TextAnnotation.class);
         Integer sentNum = sentence.get(SentGlobalID.class);
         Extraction e =
             new Extraction(
                 p.first, p.second, docName, rel, sentNum, extrScoreTriple.third, senText);
         e.setFeatureScoreList(EvaluationUtils.getFeatureScoreList(featureScores, ftID2ftMap));
         extrs.add(e);
       }
     }
   }
   return EvaluationUtils.getUniqueList(extrs);
 }
Example #28
0
  /**
   * Determines the head noun of the lexicalized form of {@code uri}.
   *
   * <p>A single token is returned as is. Multiple tokens are joined with spaces and parsed; the
   * head subtree of the first sentence's parse tree is determined with {@code headFinder}, and
   * the label value of that subtree's last leaf is returned.
   *
   * @param uri the URI to lexicalize
   * @return the head noun
   */
  private String getHeadNoun(String uri) {
    String[] parts = lexicalize(uri);

    if (parts.length <= 1) {
      return parts[0];
    }

    String phrase = Joiner.on(" ").join(parts);
    Annotation annotated = new Annotation(phrase);
    pipeline.annotate(annotated);

    Tree parseTree = annotated.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class);

    // The last leaf of the head subtree is assumed to be the head noun.
    List<Tree> headLeaves = headFinder.determineHead(parseTree).getLeaves();
    return headLeaves.get(headLeaves.size() - 1).label().value();
  }
  /**
   * Splits {@code text} into sentences and tokenizes each one.
   *
   * <p>A pipeline with only the {@code tokenize} and {@code ssplit} annotators is created on
   * every call. Each sentence is returned as its token texts joined by single spaces.
   *
   * @param text raw text to segment
   * @return one space-separated token string per sentence, in document order
   */
  public String[] wordsSegment(String text) {
    List<String> listSens = new ArrayList<String>();

    // Only tokenization and sentence splitting are needed here.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation(text);
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      List<String> listWord = new ArrayList<String>();
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        listWord.add(token.get(TextAnnotation.class));
      }
      listSens.add(StringUtils.join(listWord, " "));
    }
    return listSens.toArray(new String[listSens.size()]);
  }
  /**
   * Replaces each incoming event with one new event per regex match found in its text.
   *
   * <p>While holding this object's monitor, every event of the chunk is handled as follows: the
   * result of the second attribute expression executor is converted to a string and run through
   * {@code pipeline}; {@code regexPattern} is matched against the tokens of each resulting
   * sentence; for each match a copy of the event is populated with the match data and inserted
   * before the current position; finally {@code remove()} is called on the chunk for the current
   * event. After the whole chunk has been handled it is passed to {@code nextProcessor}.
   *
   * @param streamEventChunk the events to process; modified in place
   * @param nextProcessor receives the chunk once all events have been handled
   * @param streamEventCloner used to copy the source event for every match
   * @param complexEventPopulater used to write the match data into each copy
   */
  @Override
  protected void process(
      ComplexEventChunk<StreamEvent> streamEventChunk,
      Processor nextProcessor,
      StreamEventCloner streamEventCloner,
      ComplexEventPopulater complexEventPopulater) {
    synchronized (this) {
      while (streamEventChunk.hasNext()) {
        StreamEvent streamEvent = streamEventChunk.next();
        if (logger.isDebugEnabled()) {
          logger.debug(
              String.format(
                  "Event received. Regex:%s Event:%s", regexPattern.pattern(), streamEvent));
        }

        // The text to analyse comes from the executor at index 1.
        Annotation document =
            pipeline.process(attributeExpressionExecutors[1].execute(streamEvent).toString());

        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
          TokenSequenceMatcher matcher =
              regexPattern.getMatcher(sentence.get(CoreAnnotations.TokensAnnotation.class));
          while (matcher.find()) {
            // Slot 0 holds the whole match; slots 1..attributeCount-1 hold the numbered groups.
            Object[] data = new Object[attributeCount];
            data[0] = matcher.group();
            for (int i = 1; i < attributeCount; i++) {
              data[i] = matcher.group(i);
            }
            StreamEvent newStreamEvent = streamEventCloner.copyStreamEvent(streamEvent);
            complexEventPopulater.populateComplexEvent(newStreamEvent, data);
            streamEventChunk.insertBeforeCurrent(newStreamEvent);
          }
        }
        // Called once per source event, after its match events were inserted.
        streamEventChunk.remove();
      }
    }
    // Forwarding happens outside the synchronized block.
    nextProcessor.process(streamEventChunk);
  }