public List<CoreMap> getAPIElementSentences(boolean parse) {
    List<CoreMap> sentences = section.sentences;

    // getAllSentences(parse);
    List<CoreMap> apiSentences = new ArrayList<CoreMap>();

    // Normalize the API element name, e.g. "Foo.bar()" -> "foo-bar"
    String formattedAPI =
        apiElement
            .getAPIElementName()
            .replaceAll("\\(", "")
            .replaceAll("\\)", "")
            .replaceAll("\\.", "-")
            .toLowerCase();
    if (sentences == null) {
      System.out.println("WARNING: In getAPIElementSentences, section has no sentences");
      return apiSentences; // avoid a NullPointerException in the loop below
    }
    for (CoreMap sent : sentences) {
      if (sent.toString().toLowerCase().contains("clt_" + formattedAPI)) {
        apiSentences.add(sent);
      }
    }

    if (apiSentences.isEmpty())
      System.out.println(
          "WARNING: In getAPIElementSentences "
              + apiElement.getAPIElementName()
              + ","
              + section.getSubTitle());
    return apiSentences;
  }
Example #2
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    Timing tim = new Timing();
    AnnotationPipeline ap = new AnnotationPipeline();
    boolean verbose = false;
    ap.addAnnotator(new TokenizerAnnotator(verbose, "en"));
    ap.addAnnotator(new WordsToSentencesAnnotator(verbose));
    // ap.addAnnotator(new NERCombinerAnnotator(verbose));
    // ap.addAnnotator(new OldNERAnnotator(verbose));
    // ap.addAnnotator(new NERMergingAnnotator(verbose));
    ap.addAnnotator(new ParserAnnotator(verbose, -1));
    // Other annotators that could be enabled:
    // ap.addAnnotator(new UpdateSentenceFromParseAnnotator(verbose));
    // ap.addAnnotator(new NumberAnnotator(verbose));
    // ap.addAnnotator(new QuantifiableEntityNormalizingAnnotator(verbose));
    // ap.addAnnotator(new StemmerAnnotator(verbose));
    // ap.addAnnotator(new MorphaAnnotator(verbose));
    //    ap.addAnnotator(new SRLAnnotator());

    String text =
        "USAir said in the filings that Mr. Icahn first contacted Mr. Colodny last September to discuss the benefits of combining TWA and USAir -- either by TWA's acquisition of USAir, or USAir's acquisition of TWA.";
    Annotation a = new Annotation(text);
    ap.annotate(a);
    System.out.println(a.get(CoreAnnotations.TokensAnnotation.class));
    for (CoreMap sentence : a.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    if (TIME) {
      System.out.println(ap.timingInformation());
      System.err.println("Total time for AnnotationPipeline: " + tim.toSecondsString() + " sec.");
    }
  }
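Note: `Timing` comes from edu.stanford.nlp.util, and `TIME` is a static flag assumed to be declared elsewhere in the class, e.g.:

  private static final boolean TIME = true; // assumed declaration; gates the timing printout above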
 public Map<Integer, Integer> getGeneSpans(String text) {
   Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
   Annotation document = new Annotation(text);
   pipeline.annotate(document);
   List<CoreMap> sentences = document.get(SentencesAnnotation.class);
   for (CoreMap sentence : sentences) {
     // Collect maximal runs of consecutive NN* tokens as candidate gene spans
     List<CoreLabel> candidate = new ArrayList<CoreLabel>();
     for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
       String pos = token.get(PartOfSpeechAnnotation.class);
       if (pos.startsWith("NN")) {
         candidate.add(token);
       } else if (candidate.size() > 0) {
         int begin = candidate.get(0).beginPosition();
         int end = candidate.get(candidate.size() - 1).endPosition();
         begin2end.put(begin, end);
         candidate.clear();
       }
     }
     // Flush a noun run that extends to the end of the sentence
     if (candidate.size() > 0) {
       int begin = candidate.get(0).beginPosition();
       int end = candidate.get(candidate.size() - 1).endPosition();
       begin2end.put(begin, end);
       candidate.clear();
     }
   }
   return begin2end;
 }
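The method relies on a `pipeline` field that is not shown. A minimal sketch of how that field might be initialized and the method called, from within the same class (the annotator list and example input are assumptions, not part of the original):

  // Assumed initialization of the pipeline field; POS tags drive the NN* test above
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos");
  pipeline = new StanfordCoreNLP(props);

  // Example call: keys are begin character offsets of noun runs, values are end offsets
  Map<Integer, Integer> spans = getGeneSpans("The BRCA1 protein repairs damaged DNA.");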
Example #4
File: App.java Project: tqtruonga5/thesis
  public static void main(String[] args) {
    SentenceDAO sentenceDAO = new SentenceDAOImpl();
    List<Sentence> sentences = sentenceDAO.findAll();
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    int i = 0;
    for (Sentence sentence : sentences) {
      if (sentence.getPredicate() == null) {
        try {
          System.out.println(i++);
          String text = sentence.getContent();
          Annotation annotation = new Annotation(text);
          pipeline.annotate(annotation);
          for (CoreMap core : annotation.get(SentencesAnnotation.class)) {
            SemanticGraph graph = core.get(CollapsedCCProcessedDependenciesAnnotation.class);
            // Use the lemma of the dependency root (typically the main verb) as the predicate
            sentence.setPredicate(graph.getFirstRoot().lemma());
          }
          sentenceDAO.save(sentence);
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }

    // System.out.println(sentence.getWords());

  }
Example #5
  public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {

    List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);
    // First pass: collect the ids of all linked entities in the document
    Set<String> entityIds = new HashSet<String>();
    for (CoreMap sen : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
          sen.get(NamedEntityLinkingAnnotation.class);
      for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
        String id = t.second;
        if (!id.equals("null")) {
          entityIds.add(id);
        }
      }
    }
    // Look up the FIGER types for all entity ids in a single query
    Map<String, Set<String>> idTypeMap = bigQuery(entityIds);
    // Second pass: attach the types to each sentence
    for (CoreMap sen : sentences) {
      List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
          sen.get(NamedEntityLinkingAnnotation.class);
      List<Triple<Set<String>, Integer, Integer>> figerData = new ArrayList<>();
      for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
        Integer start = t.first.first;
        Integer end = t.first.second;
        Set<String> types = null;
        if (!t.second.equals("null")) {
          types = idTypeMap.get(GuidMidConversion.convertBackward(t.second));
        }
        Triple<Set<String>, Integer, Integer> figerTrip = new Triple<>(types, start, end);
        figerData.add(figerTrip);
      }
      sen.set(FigerAnnotation.class, figerData);
    }
  }
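`NamedEntityLinkingAnnotation` and `FigerAnnotation` are project-specific CoreMap keys (as are `bigQuery` and `GuidMidConversion`). Such keys are declared by implementing CoreAnnotation; a hypothetical sketch of the first one:

  // Hypothetical declaration; FigerAnnotation would follow the same pattern
  public static class NamedEntityLinkingAnnotation
      implements CoreAnnotation<List<Triple<Pair<Integer, Integer>, String, Float>>> {
    @Override
    public Class<List<Triple<Pair<Integer, Integer>, String, Float>>> getType() {
      return ErasureUtils.uncheckedCast(List.class);
    }
  }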
Example #6
  public static List<String> lemmatizeDocument(String documentText) {

    if (pipeline == null) {
      loadModels();
    }

    List<String> lemmas = new LinkedList<>();

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(documentText);

    // run all Annotators on this text
    pipeline.annotate(document);

    // Iterate over all of the sentences found
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // Iterate over all tokens in a sentence
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        // Retrieve and add the lemma for each word into the
        // list of lemmas
        lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }

    return lemmas;
  }
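`pipeline` and `loadModels()` are not shown in the original; a plausible minimal version (an assumption, not the author's actual code) is:

  private static StanfordCoreNLP pipeline;

  private static void loadModels() {
    Properties props = new Properties();
    // the lemma annotator requires tokenize, ssplit, and pos upstream
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    pipeline = new StanfordCoreNLP(props);
  }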
Example #7
  /**
   * Lemmatizes a single piece of text.
   *
   * @param t the text to lemmatize
   * @return the whitespace-separated lemmas of the tokens in {@code t}
   */
  public static String lemmatize(String t) {

    if (pipeline == null) {
      loadModels();
    }

    StringBuilder lemma = new StringBuilder();

    try {
      // create an empty Annotation just with the given text
      Annotation document = new Annotation(t);

      // run all Annotators on this text
      pipeline.annotate(document);

      // Iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        // Iterate over all tokens in a sentence and collect each lemma
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemma.append(' ').append(token.get(CoreAnnotations.LemmaAnnotation.class));
        }
      }
    } catch (Exception e) {
      System.err.println("Stanford Lemmatizer error on input: " + t);
    }

    return lemma.toString().trim();
  }
Example #8
  public static void fillInParseAnnotations(
      boolean verbose, boolean buildGraphs, CoreMap sentence, Tree tree) {
    // make sure all tree nodes are CoreLabels
    // TODO: why isn't this always true? something fishy is going on
    ParserAnnotatorUtils.convertToCoreLabels(tree);

    // index nodes, i.e., add start and end token positions to all nodes;
    // this is needed by other annotators downstream, e.g., the NFLAnnotator
    tree.indexSpans(0);

    sentence.set(TreeAnnotation.class, tree);
    if (verbose) {
      System.err.println("Tree is:");
      tree.pennPrint(System.err);
    }

    if (buildGraphs) {
      // generate the dependency graph
      SemanticGraph deps = generateCollapsedDependencies(tree);
      SemanticGraph uncollapsedDeps = generateUncollapsedDependencies(tree);
      SemanticGraph ccDeps = generateCCProcessedDependencies(tree);
      if (verbose) {
        System.err.println("SDs:");
        System.err.println(deps.toString("plain"));
      }
      sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
      sentence.set(
          SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
    }

    setMissingTags(sentence, tree);
  }
Example #9
  public List<NLPInfo> analyze(String text) {

    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences == null || sentences.isEmpty()) {
      return null;
    }

    List<NLPInfo> res = new ArrayList<NLPInfo>();
    for (CoreMap sentence : sentences) {
      NLPInfo info = new NLPInfo();
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        NLPToken tokenInfo = new NLPToken();
        tokenInfo.setWord(token.get(CoreAnnotations.TextAnnotation.class));
        tokenInfo.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        tokenInfo.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
        info.appendToken(tokenInfo);
      }
      res.add(info);
    }
    return res;
  }
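Here too the `pipeline` field is assumed; since each token is read for its text, POS tag, and NER label, it would need at least this setup (an assumption, not shown in the original):

  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);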
Example #10
 private void findSpeakersInConversation(Dictionaries dict) {
   for (List<Mention> l : predictedOrderedMentionsBySentence) {
     for (Mention m : l) {
       if (m.predicateNominatives == null) continue;
       for (Mention a : m.predicateNominatives) {
         if (a.spanToString().toLowerCase().equals("i")) {
           speakers.put(
               m.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
               Integer.toString(m.mentionID));
         }
       }
     }
   }
   List<CoreMap> paragraph = new ArrayList<CoreMap>();
   int paragraphUtterIndex = 0;
   String nextParagraphSpeaker = "";
   int paragraphOffset = 0;
   for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
     int currentUtter =
         sent.get(CoreAnnotations.TokensAnnotation.class)
             .get(0)
             .get(CoreAnnotations.UtteranceAnnotation.class);
     if (paragraphUtterIndex != currentUtter) {
       nextParagraphSpeaker =
           findParagraphSpeaker(
               paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
       paragraphUtterIndex = currentUtter;
       paragraphOffset += paragraph.size();
       paragraph = new ArrayList<CoreMap>();
     }
     paragraph.add(sent);
   }
   findParagraphSpeaker(
       paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
 }
Example #11
  private String findNextParagraphSpeaker(
      List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
    CoreMap lastSent = paragraph.get(paragraph.size() - 1);
    String speaker = "";
    for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
          || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
        String word = w.get(CoreAnnotations.TextAnnotation.class);
        SemanticGraph dependency =
            lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord t = dependency.getNodeByWordPattern(word);

        for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
          if (child.first().getShortName().equals("nsubj")) {
            int subjectIndex = child.second().index(); // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
            headPosition.set(1, subjectIndex - 1);
            if (mentionheadPositions.containsKey(headPosition)
                && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
      }
    }
    return speaker;
  }
Example #12
  /** Speaker extraction */
  private void findSpeakers(Dictionaries dict) {
    Boolean useMarkedDiscourseBoolean =
        annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
    boolean useMarkedDiscourse =
        (useMarkedDiscourseBoolean != null) ? useMarkedDiscourseBoolean : false;
    if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
      for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
          int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
          speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class));
        }
      }
    } else {
      if (docType == DocType.CONVERSATION) findSpeakersInConversation(dict);
      else if (docType == DocType.ARTICLE) findSpeakersInArticle(dict);

      // set speaker info to annotation
      for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
          int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
          if (speakers.containsKey(utterIndex)) {
            w.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex));
          }
        }
      }
    }
  }
Example #13
  /** Set the UtteranceAnnotation for tokens inside quotations; a default UtteranceAnnotation of 0 is assumed. */
  private void markQuotations(List<CoreMap> results, boolean normalQuotationType) {
    boolean insideQuotation = false;
    for (CoreMap m : results) {
      for (CoreLabel l : m.get(CoreAnnotations.TokensAnnotation.class)) {
        String w = l.get(CoreAnnotations.TextAnnotation.class);

        boolean noSpeakerInfo =
            !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
                || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
                || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");

        if (w.equals("``") || (!insideQuotation && normalQuotationType && w.equals("\""))) {
          insideQuotation = true;
          maxUtter++;
          continue;
        } else if (w.equals("''") || (insideQuotation && normalQuotationType && w.equals("\""))) {
          insideQuotation = false;
        }
        if (insideQuotation) {
          l.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter);
        }
        if (noSpeakerInfo) {
          l.set(
              CoreAnnotations.SpeakerAnnotation.class,
              "PER" + l.get(CoreAnnotations.UtteranceAnnotation.class));
        }
      }
    }
    if (maxUtter == 0 && !normalQuotationType) markQuotations(results, true);
  }
Example #14
  public static String doCorefResolution(Annotation annotation) {

    Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    List<String> resolved = new ArrayList<String>();
    for (CoreMap sentence : sentences) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel token : tokens) {
        Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
        CorefChain chain = corefs.get(corefClustId);
        if (chain == null) resolved.add(token.word()); // token is not in any coref chain
        else {
          int sentINdx = chain.getRepresentativeMention().sentNum - 1;
          CoreMap corefSentence = sentences.get(sentINdx);
          List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
          CorefMention reprMent = chain.getRepresentativeMention();
          if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
            for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
              CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
              resolved.add(matchedLabel.word());
            }
          } else resolved.add(token.word());
        }
      }
    }
    StringBuilder resolvedStr = new StringBuilder();
    for (String str : resolved) {
      resolvedStr.append(str).append(' ');
    }
    System.out.println(resolvedStr);

    return resolvedStr.toString();
  }
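A sketch of the setup needed before calling doCorefResolution; the annotator list follows the standard dcoref recipe, and the sample text and output are illustrative assumptions:

  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation = new Annotation("John drove to Judy's house. He made her dinner.");
  pipeline.annotate(annotation);
  // replaces each pronoun with its representative mention,
  // e.g. "John drove to Judy 's house . John made Judy dinner ."
  System.out.println(doCorefResolution(annotation));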
Example #15
  public static ArrayList<String[]> extractNounPhrases(
      StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    MAX_STEPS = searchRange;

    for (CoreMap sentence : sentences) {
      List<CoreLabel> labels = sentence.get(TokensAnnotation.class);

      // Check whether the sentence contains a negation word
      boolean hasNegation = false;
      for (CoreLabel label : labels) {
        if (NEGATIONS.contains(label.lemma().toLowerCase())) {
          hasNegation = true;
          break; // one negation is enough; no need to scan further
        }
      }

      for (int idx = 0; idx < labels.size(); idx++) {
        CoreLabel label = labels.get(idx);
        if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
          for (int step = 1; step <= MAX_STEPS; step++) {
            CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
            if (JJ_TAGS.contains(leftLabel.tag())) {
              if (hasNegation)
                addPair(
                    wordPairs,
                    NOT_PREFIX + leftLabel.get(LemmaAnnotation.class),
                    label.get(LemmaAnnotation.class));
              else
                addPair(
                    wordPairs,
                    leftLabel.get(LemmaAnnotation.class),
                    label.get(LemmaAnnotation.class));
              break;
            }
            CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
            if (JJ_TAGS.contains(rightLabel.tag())) {
              if (hasNegation)
                addPair(
                    wordPairs,
                    NOT_PREFIX + rightLabel.get(LemmaAnnotation.class),
                    label.get(LemmaAnnotation.class));
              else
                addPair(
                    wordPairs,
                    rightLabel.get(LemmaAnnotation.class),
                    label.get(LemmaAnnotation.class));

              break;
            }
          }
        }
      }
    }
    return wordPairs;
  }
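The constants and the addPair helper used above are not shown; plausible definitions (hypothetical values, for illustration only) might be:

  private static final Set<String> NN_TAGS =
      new HashSet<String>(Arrays.asList("NN", "NNS", "NNP", "NNPS"));
  private static final Set<String> JJ_TAGS =
      new HashSet<String>(Arrays.asList("JJ", "JJR", "JJS"));
  private static final Set<String> NEGATIONS =
      new HashSet<String>(Arrays.asList("not", "no", "never", "n't"));
  private static final String NOT_PREFIX = "not_";
  private static int MAX_STEPS = 3; // overwritten by the searchRange argument

  private static void addPair(ArrayList<String[]> pairs, String adj, String noun) {
    pairs.add(new String[] {adj, noun});
  }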
 public Object aggregate(Class key, List<? extends CoreMap> in) {
   // "last" aggregator: return the value stored on the final element
   if (in == null || in.isEmpty()) return null;
   return in.get(in.size() - 1).get(key);
 }
 public SUTime.Temporal apply(CoreMap chunk) {
   if (tokenPattern != null) {
     return apply(chunk.get(CoreAnnotations.NumerizedTokensAnnotation.class));
     //          return apply(chunk.get(CoreAnnotations.TokensAnnotation.class));
   } else {
     return apply(chunk.get(CoreAnnotations.TextAnnotation.class));
   }
 }
 public Object aggregate(Class key, List<? extends CoreMap> in) {
   // "first" aggregator: return the value stored on the initial element
   if (in == null || in.isEmpty()) return null;
   return in.get(0).get(key);
 }
Example #19
 /**
  * Set a running index for each token in the document.
  *
  * @param doc the document whose tokens are indexed
  */
 public static void setTokenIndices(Document doc) {
   int tokenIndex = 0;
   for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
     for (CoreLabel token : sent.get(TokensAnnotation.class)) {
       token.set(TokenBeginAnnotation.class, tokenIndex++);
     }
   }
 }
Example #20
File: Util.java Project: Eagles2F/CoreNLP
 /**
  * Re-annotates a single sentence in place by wrapping it in a one-sentence Annotation and
  * running the given pipeline over it.
  *
  * @param sentence the sentence to annotate
  * @param pipeline the pipeline to run over the sentence
  */
 public static void annotate(CoreMap sentence, AnnotationPipeline pipeline) {
   Annotation ann =
       new Annotation(StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), " "));
   ann.set(
       CoreAnnotations.TokensAnnotation.class,
       sentence.get(CoreAnnotations.TokensAnnotation.class));
   ann.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
   pipeline.annotate(ann);
 }
      @Override
      public int compare(CoreMap sent1, CoreMap sent2) {
        String d1 = sent1.get(CoreAnnotations.DocIDAnnotation.class);
        String d2 = sent2.get(CoreAnnotations.DocIDAnnotation.class);
        if (d1 != null && d2 != null && !d1.equals(d2)) return d1.compareTo(d2);

        String t1 = sent1.get(CoreAnnotations.TextAnnotation.class);
        String t2 = sent2.get(CoreAnnotations.TextAnnotation.class);
        return t1.compareTo(t2);
      }
 /**
  * Given a set of sentences with annotations from an information extractor class, and the same
  * sentences with gold-standard annotations, print results on how the information extraction
  * performed.
  */
 public String printResults(CoreMap goldStandard, CoreMap extractorOutput) {
   StringWriter sw = new StringWriter();
   PrintWriter pw = new PrintWriter(sw, true);
   List<CoreMap> mutableGold = new ArrayList<CoreMap>();
   mutableGold.addAll(goldStandard.get(CoreAnnotations.SentencesAnnotation.class));
   List<CoreMap> mutableOutput = new ArrayList<CoreMap>();
   mutableOutput.addAll(extractorOutput.get(CoreAnnotations.SentencesAnnotation.class));
   printResults(pw, mutableGold, mutableOutput);
   return sw.getBuffer().toString();
 }
  /**
   * Finds the position of the sentence in the given document that achieves the best ROUGE-N scores
   * w.r.t. to the reference summaries.
   *
   * @param task the document and the corresponding models
   * @return the position of the best sentence in the document
   */
  public int getBestSentencePos(Task task) {
    Document document = task.getDocument();
    Annotation documentAnnotation = annotationProvider.getAnnotation(document.getContent());

    RougeN rouge = rougeFactory.make(task.getModels(), annotationProvider);
    BestSentenceSelector sentenceSelector = new BestSentenceSelector(rouge);
    Annotation bestAnnotation = sentenceSelector.select(documentAnnotation);
    CoreMap sentence = bestAnnotation.get(SentencesAnnotation.class).get(0);
    String bestPos = sentence.get(SentencePositionAnnotation.class);

    return Integer.parseInt(bestPos);
  }
 public Object aggregate(Class key, List<? extends CoreMap> in) {
   if (in == null) return null;
   // Concatenating aggregator: merge all list-valued entries into one list
   List<T> res = new ArrayList<T>();
   for (CoreMap cm : in) {
     Object obj = cm.get(key);
     if (obj != null) {
       if (obj instanceof List) {
         res.addAll((List<T>) obj);
       }
     }
   }
   return res;
 }
 private List<CoreMap> toCoreMaps(
     CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) {
   if (timeExpressions == null) return null;
   List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
   for (TimeExpression te : timeExpressions) {
     CoreMap cm = te.getAnnotation();
     SUTime.Temporal temporal = te.getTemporal();
     if (temporal != null) {
       String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
       String text = cm.get(CoreAnnotations.TextAnnotation.class);
       if (origText != null) {
         // Make sure the text is from original (and not from concatenated tokens)
         ChunkAnnotationUtils.annotateChunkText(cm, annotation);
         text = cm.get(CoreAnnotations.TextAnnotation.class);
       }
       Map<String, String> timexAttributes;
       try {
         timexAttributes = temporal.getTimexAttributes(timeIndex);
         if (options.includeRange) {
           SUTime.Temporal rangeTemporal = temporal.getRange();
           if (rangeTemporal != null) {
             timexAttributes.put("range", rangeTemporal.toString());
           }
         }
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to get attributes from " + text + ", timeIndex " + timeIndex,
             e);
         continue;
       }
       Timex timex;
       try {
         timex = Timex.fromMap(text, timexAttributes);
       } catch (Exception e) {
         logger.log(
             Level.WARNING,
             "Failed to process " + text + " with attributes " + timexAttributes,
             e);
         continue;
       }
       cm.set(TimexAnnotation.class, timex);
       if (timex != null) {
         coreMaps.add(cm);
       } else {
         logger.warning("No timex expression for: " + text);
       }
     }
   }
   return coreMaps;
 }
Example #26
  private void parseThread(ArrayList<Thread> threads) {
    for (Thread t : threads) {
      ThreadVector tv = new ThreadVector(t);
      allThreads.add(tv);
      for (Email e : t.getEmails()) {
        StringBuilder sb = new StringBuilder();
        for (Sentence s : e.getSentences()) {
          // keep only the email's own content, not quoted text
          if (s.getQuotationTimes() == 0) {
            sb.append(s.getText()).append(' ');
          }
        }
        String content = sb.toString().toLowerCase();

        // create an empty Annotation just with the given text
        Annotation document = new Annotation(content);

        // run all Annotators on this text
        this.pipeline.annotate(document);

        // Iterate over all of the sentences found
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
          List<String> lemmas = new LinkedList<String>();
          // Iterate over all tokens in a sentence
          for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // Retrieve and add the lemma for each word into the
            // list of lemmas
            lemmas.add(token.get(LemmaAnnotation.class));
          }

          HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
          // if it has valid words
          if (wordCount.size() > 0) {
            totalSentenceNumber++;
            for (String word : wordCount.keySet()) {
              if (!dictionaryIndex.containsKey(word)) {
                dictionaryIndex.put(word, dictionaryIndex.size());
                dictionaryDocumentCount.put(word, 1);
              } else {
                dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
              }
            }
            SentenceVector sv = new SentenceVector(sentence.toString(), wordCount);
            tv.addSentenceVectors(sv);
          }
        }
      }
    }
  }
 public Object aggregate(Class key, List<? extends CoreMap> in) {
   if (in == null) return null;
   // Joining aggregator: concatenate all values, separated by the delimiter
   StringBuilder sb = new StringBuilder();
   for (CoreMap cm : in) {
     Object obj = cm.get(key);
     if (obj != null) {
       if (sb.length() > 0) {
         sb.append(delimiter);
       }
       sb.append(obj);
     }
   }
   return sb.toString();
 }
 public SUTime.Temporal apply(CoreMap chunk) {
   if (tokenPattern != null) {
     if (chunk.containsKey(TimeExpression.ChildrenAnnotation.class)) {
       return apply(chunk.get(TimeExpression.ChildrenAnnotation.class));
     } else {
       return apply(chunk.get(CoreAnnotations.NumerizedTokensAnnotation.class));
       //            return apply(chunk.get(CoreAnnotations.TokensAnnotation.class));
     }
   } else if (stringPattern != null) {
     return apply(chunk.get(CoreAnnotations.TextAnnotation.class));
   } else {
     return extract(null);
   }
 }
Example #29
  @Override
  public void print(Annotation doc, OutputStream target, Options options) throws IOException {
    PrintWriter writer = new PrintWriter(target);

    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      SemanticGraph sg =
          sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      if (sg != null) {
        writer.print(conllUWriter.printSemanticGraph(sg));
      }
    }
    writer.flush();
  }
  public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) {
    List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
    annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);

    // TODO: docDate may not have century....
    SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr);

    List<? extends MatchedExpression> matchedExpressions =
        expressionExtractor.extractExpressions(annotation);
    List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size());
    for (MatchedExpression expr : matchedExpressions) {
      if (expr instanceof TimeExpression) {
        timeExpressions.add((TimeExpression) expr);
      } else {
        timeExpressions.add(new TimeExpression(expr));
      }
    }

    // Add back nested time expressions for ranges....
    // For now only one level of nesting...
    if (options.includeNested) {
      List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
      for (TimeExpression te : timeExpressions) {
        if (te.isIncludeNested()) {
          List<? extends CoreMap> children =
              te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
          if (children != null) {
            for (CoreMap child : children) {
              TimeExpression childTe = child.get(TimeExpression.Annotation.class);
              if (childTe != null) {
                nestedTimeExpressions.add(childTe);
              }
            }
          }
        }
      }
      timeExpressions.addAll(nestedTimeExpressions);
    }
    Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    timeExpressions = filterInvalidTimeExpressions(timeExpressions);

    // Some resolving is done even if docDate is null...
    if (/*docDate != null && */ timeExpressions != null) {
      resolveTimeExpressions(annotation, timeExpressions, docDate);
    }
    // Annotate timex
    return timeExpressions;
  }
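extractTimeExpressions is internal to SUTime; from client code the usual entry point is a TimeAnnotator added to a pipeline, roughly as in the standard SUTime demo (sample text and document date here are assumptions):

  Properties props = new Properties();
  AnnotationPipeline pipeline = new AnnotationPipeline();
  pipeline.addAnnotator(new TokenizerAnnotator(false));
  pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  pipeline.addAnnotator(new POSTaggerAnnotator(false));
  pipeline.addAnnotator(new TimeAnnotator("sutime", props));

  Annotation annotation = new Annotation("She left three days before July 4, 2015.");
  annotation.set(CoreAnnotations.DocDateAnnotation.class, "2015-07-01");
  pipeline.annotate(annotation);
  // each CoreMap covers one time expression and carries a TimexAnnotation
  for (CoreMap cm : annotation.get(TimeAnnotations.TimexAnnotations.class)) {
    System.out.println(cm + " -> " + cm.get(TimeAnnotations.TimexAnnotation.class));
  }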