public Map<Integer, Integer> getGeneSpans(String text) {
  Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    List<CoreLabel> candidate = new ArrayList<CoreLabel>();
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      String pos = token.get(PartOfSpeechAnnotation.class);
      if (pos.startsWith("NN")) {
        candidate.add(token);
      } else if (!candidate.isEmpty()) {
        int begin = candidate.get(0).beginPosition();
        int end = candidate.get(candidate.size() - 1).endPosition();
        begin2end.put(begin, end);
        candidate.clear();
      }
    }
    // Flush a candidate noun run that extends to the end of the sentence
    if (!candidate.isEmpty()) {
      int begin = candidate.get(0).beginPosition();
      int end = candidate.get(candidate.size() - 1).endPosition();
      begin2end.put(begin, end);
      candidate.clear();
    }
  }
  return begin2end;
}
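// A hedged usage sketch for getGeneSpans. The `pipeline` field it relies on is not
// shown in the snippet; noun-run detection only needs the tokenize, ssplit, and pos
// annotators, so an assumed setup follows. All names here are illustrative.
public Map<Integer, Integer> demoGeneSpans() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos");
  this.pipeline = new StanfordCoreNLP(props);
  // Maps the begin character offset of each maximal run of NN* tokens to its end offset
  return getGeneSpans("BRCA1 mutations increase cancer risk.");
}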
/**
 * Get the text value of this entity. The headTokenSpan MUST be set before calling this method!
 */
public String getValue() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder sb = new StringBuilder();
  for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i++) {
    CoreLabel token = tokens.get(i);
    // We are not guaranteed to have CharacterOffsets, so we can't use them to
    // reconstruct the original spacing; join tokens with a single space instead.
    if (i > headTokenSpan.start()) sb.append(" ");
    sb.append(token.word());
  }
  return sb.toString();
}
private LinkedHashMap<LinkedHashMap<Integer, String>, String> identifyNER(String text) {
  LinkedHashMap<LinkedHashMap<Integer, String>, String> map = new LinkedHashMap<>();
  String serializedClassifier = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
  CRFClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);
  List<List<CoreLabel>> classify = classifier.classify(text);
  for (List<CoreLabel> coreLabels : classify) {
    for (CoreLabel coreLabel : coreLabels) {
      String word = coreLabel.word();
      int index = coreLabel.index();
      String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
      if (!"O".equals(category)) {
        // Record each entity token as (index -> word) keyed against its NER category
        LinkedHashMap<Integer, String> entry = new LinkedHashMap<>();
        entry.put(index, word);
        map.put(entry, category);
        System.out.println(word + ":" + category);
      }
    }
  }
  return map;
}
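// Design note (hedged): getClassifierNoExceptions reloads the serialized CRF model on
// every call to identifyNER, which is expensive. A sketch of caching it in a field
// instead; the field and method names are assumptions, not from the source.
private CRFClassifier<CoreLabel> cachedNerClassifier;

private CRFClassifier<CoreLabel> nerClassifier() {
  if (cachedNerClassifier == null) {
    cachedNerClassifier = CRFClassifier.getClassifierNoExceptions(
        "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz");
  }
  return cachedNerClassifier;
}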
public static List<String> lemmatizeDocument(String documentText) {
  if (pipeline == null) {
    loadModels();
  }
  List<String> lemmas = new LinkedList<>();
  // Create an empty Annotation just with the given text
  Annotation document = new Annotation(documentText);
  // Run all Annotators on this text
  pipeline.annotate(document);
  // Iterate over all of the sentences found
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // Iterate over all tokens in a sentence and collect the lemma of each word
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
    }
  }
  return lemmas;
}
/**
 * Lemmatize a single string.
 *
 * @param t the text to lemmatize
 * @return the lemmatized text, with lemmas separated by single spaces
 */
public static String lemmatize(String t) {
  if (pipeline == null) {
    loadModels();
  }
  StringBuilder lemma = new StringBuilder();
  try {
    // Create an empty Annotation just with the given text and run all Annotators
    Annotation document = new Annotation(t);
    pipeline.annotate(document);
    // Iterate over all sentences and collect the lemma of each token
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        lemma.append(" ").append(token.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }
  } catch (Exception e) {
    System.err.println("Stanford Lemmatizer error exception Word: " + t);
  }
  return lemma.toString().trim();
}
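// A minimal sketch of the loadModels() helper assumed by the two lemmatizer methods
// above. The annotator list is an assumption: CoreNLP lemmatization requires the
// tokenize, ssplit, pos, and lemma annotators.
private static StanfordCoreNLP pipeline;

private static void loadModels() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
  pipeline = new StanfordCoreNLP(props);
}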
// TODO: roll check into tokens regex pattern?
// That allows for better matching because unmatched sequences will be eliminated at match time
private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) {
  if (validPosPattern != null) {
    // Need to check POS tag too...
    switch (posMatchType) {
      case MATCH_ONE_TOKEN_PHRASE_ONLY:
        if (tokens.size() > 1) return true;
        // fall through
      case MATCH_AT_LEAST_ONE_TOKEN:
        for (int i = start; i < end; i++) {
          CoreLabel token = tokens.get(i);
          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          if (pos != null && validPosPattern.matcher(pos).matches()) {
            return true;
          }
        }
        return false;
      case MATCH_ALL_TOKENS:
        // Checked elsewhere
        return true;
      default:
        // Don't know this match type....
        return true;
    }
  }
  return true;
}
public List<NLPInfo> analyze(String text) {
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null || sentences.isEmpty()) {
    return null;
  }
  List<NLPInfo> res = new ArrayList<NLPInfo>();
  for (CoreMap sentence : sentences) {
    NLPInfo info = new NLPInfo();
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      NLPToken tokenInfo = new NLPToken();
      tokenInfo.setWord(token.get(CoreAnnotations.TextAnnotation.class));
      tokenInfo.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      tokenInfo.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      info.appendToken(tokenInfo);
    }
    res.add(info);
  }
  return res;
}
private String findNextParagraphSpeaker(
    List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
  CoreMap lastSent = paragraph.get(paragraph.size() - 1);
  String speaker = "";
  for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
    if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
        || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      SemanticGraph dependency =
          lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      IndexedWord t = dependency.getNodeByWordPattern(word);
      if (t == null) continue; // the reporting verb may not surface as a node in the graph
      for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
        if (child.first().getShortName().equals("nsubj")) {
          int subjectIndex = child.second().index(); // start from 1
          IntTuple headPosition = new IntTuple(2);
          headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
          headPosition.set(1, subjectIndex - 1);
          if (mentionheadPositions.containsKey(headPosition)
              && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
            speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
          }
        }
      }
    }
  }
  return speaker;
}
/**
 * Profile: 29% in FactorTable.getValue(), 28% in CRFCliqueTree.getCalibratedCliqueTree(),
 * 12.6% waiting for threads.
 *
 * <p>Single threaded: 15000 ms - 26000 ms. Multi threaded: 4500 ms - 7000 ms.
 *
 * <p>With 8 cpus: 3.3x - 3.7x speedup, around 800% utilization.
 */
public static void benchmarkCRF() {
  Properties props = new Properties();
  props.setProperty("macro", "true"); // use a generic CRF configuration
  props.setProperty("useIfInteger", "true");
  props.setProperty("featureFactory", "edu.stanford.nlp.benchmarks.BenchmarkFeatureFactory");
  props.setProperty("saveFeatureIndexToDisk", "false");
  CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);

  // Build 100 synthetic sentences of 20 tokens each with noisy alternating labels
  Random r = new Random(42);
  List<List<CoreLabel>> data = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    List<CoreLabel> sentence = new ArrayList<>();
    for (int j = 0; j < 20; j++) {
      CoreLabel l = new CoreLabel();
      l.setWord("j:" + j);
      boolean tag = j % 2 == 0 ^ (r.nextDouble() > 0.7);
      l.set(CoreAnnotations.AnswerAnnotation.class, "target:" + tag);
      sentence.add(l);
    }
    data.add(sentence);
  }

  long msStart = System.currentTimeMillis();
  crf.train(data);
  long delay = System.currentTimeMillis() - msStart;
  System.out.println("Training took " + delay + " ms");
}
@Test
public void testCorp() {
  // We test a 2x2 design: {strict, regular} x {no following context, following context}
  for (int sent = 0; sent < 4; sent++) {
    PTBTokenizer<CoreLabel> ptbTokenizer =
        new PTBTokenizer<>(
            new StringReader(corpInputs[sent / 2]),
            new CoreLabelTokenFactory(),
            (sent % 2 == 0) ? "strictTreebank3" : "");
    int i = 0;
    while (ptbTokenizer.hasNext()) {
      CoreLabel w = ptbTokenizer.next();
      try {
        assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
      } catch (ArrayIndexOutOfBoundsException aioobe) {
        // the assertion below outside the loop will fail
      }
      i++;
    }
    if (i != corpGold[sent % 2].length) {
      System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
      List<CoreLabel> tokens =
          new PTBTokenizer<>(
                  new StringReader(corpInputs[sent / 2]),
                  new CoreLabelTokenFactory(),
                  (sent % 2 == 0) ? "strictTreebank3" : "")
              .tokenize();
      System.out.println("Guess: " + SentenceUtils.listToString(tokens));
      System.out.flush();
    }
    assertEquals("PTBTokenizer num tokens problem", i, corpGold[sent % 2].length);
  }
}
public static void saveCoNLL(
    PrintStream os, List<List<CoreLabel>> sentences, boolean alreadyBIO) {
  os.println("-DOCSTART- -X- O\n");
  for (List<CoreLabel> sent : sentences) {
    String prev = null;
    for (CoreLabel word : sent) {
      String w = word.word().replaceAll("[ \t\n]+", "_");
      String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String l = word.get(CoreAnnotations.AnswerAnnotation.class);
      String nl = l;
      if (!alreadyBIO && !l.equals("O")) {
        if (prev != null && l.equals(prev)) nl = "I-" + l;
        else nl = "B-" + l;
      }
      String line = w + ' ' + t + ' ' + nl;
      String[] toks = line.split("[ \t\n]+");
      if (toks.length != 3) {
        throw new RuntimeException("INVALID LINE: \"" + line + '"');
      }
      os.printf("%s %s %s\n", w, t, nl);
      prev = l;
    }
    os.println();
  }
}
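// A hedged example of driving saveCoNLL: build one two-token sentence by hand and
// write it to standard out. The words and label values are illustrative only.
public static void demoSaveCoNLL() {
  CoreLabel w1 = new CoreLabel();
  w1.setWord("Stanford");
  w1.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
  w1.set(CoreAnnotations.AnswerAnnotation.class, "ORG");
  CoreLabel w2 = new CoreLabel();
  w2.setWord("University");
  w2.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
  w2.set(CoreAnnotations.AnswerAnnotation.class, "ORG");
  List<List<CoreLabel>> sentences = Collections.singletonList(Arrays.asList(w1, w2));
  saveCoNLL(System.out, sentences, false); // emits B-ORG for w1, then I-ORG for w2
}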
private static List<String> getTokenStrs(List<CoreLabel> tokens) {
  List<String> mainTokenStrs = new ArrayList<String>(tokens.size());
  for (CoreLabel token : tokens) {
    String text = token.get(CoreAnnotations.TextAnnotation.class);
    mainTokenStrs.add(text);
  }
  return mainTokenStrs;
}
/**
 * Set index for each token and sentence in the document.
 *
 * @param doc the document whose tokens are assigned document-wide indices
 */
public static void setTokenIndices(Document doc) {
  int tokenIndex = 0;
  for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel token : sent.get(TokensAnnotation.class)) {
      token.set(TokenBeginAnnotation.class, tokenIndex++);
    }
  }
}
@Override
public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
  for (CoreLabel wi : doc) {
    String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
    String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
    out.println(wi.word() + "\t" + goldAnswer + "\t" + answer);
  }
  out.println();
}
public static void fillEntity(List<Entity> entities, List<CoreLabel> tokens) {
  for (Entity entity : entities) {
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel token = tokens.get(i);
      if (entity.offset == token.beginPosition()) entity.start = i;
      if (entity.offsetEnd == token.endPosition()) entity.end = i;
    }
  }
}
private static String toString(final List<CoreLabel> lineage) {
  StringBuilder sb = new StringBuilder();
  for (CoreLabel cl : lineage) {
    sb.append(cl.value());
    sb.append(" <-- ");
  }
  return sb.toString();
}
private static List<String> getMainTokenStrs(List<CoreLabel> tokens) {
  List<String> mainTokenStrs = new ArrayList<String>(tokens.size());
  for (CoreLabel token : tokens) {
    String text = token.get(CoreAnnotations.TextAnnotation.class);
    if (!text.isEmpty() && (text.length() >= 4 || Character.isUpperCase(text.charAt(0)))) {
      mainTokenStrs.add(text);
    }
  }
  return mainTokenStrs;
}
public String getExtentString() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder sb = new StringBuilder();
  for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i++) {
    CoreLabel token = tokens.get(i);
    if (i > extentTokenSpan.start()) sb.append(" ");
    sb.append(token.word());
  }
  return sb.toString();
}
public static void stanfordNLP() {
  CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
  PTBTokenizer<CoreLabel> ptb =
      new PTBTokenizer<>(new StringReader(paragraph), ctf, "invertible=true");
  while (ptb.hasNext()) {
    CoreLabel cl = ptb.next();
    System.out.print(
        cl.originalText() + " [" + cl.beginPosition() + "-" + cl.endPosition() + "];");
  }
  System.out.println();
}
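// The `paragraph` field read by stanfordNLP() is not shown in the snippet; an assumed
// declaration for trying out the tokenizer (the text is illustrative only):
private static final String paragraph =
    "Dr. Smith's results, e.g., the p-values, weren't reproducible.";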
private void verifyWord(CoreLabel expected, CoreLabel result) {
  for (Class annotation : tokenAnnotations) {
    if (expected.get(annotation) == null
        && result.get(annotation) != null
        && "".equals(result.get(annotation))) {
      // allow "" in place of null
      continue;
    }
    assertEquals(
        "Different for class " + annotation, expected.get(annotation), result.get(annotation));
  }
}
/**
 * Converts the tree labels to CoreLabels. We need this because we store additional info in the
 * CoreLabel, like token span.
 *
 * @param tree the tree whose labels are converted in place
 */
public static void convertToCoreLabels(Tree tree) {
  Label l = tree.label();
  if (!(l instanceof CoreLabel)) {
    CoreLabel cl = new CoreLabel();
    cl.setValue(l.value());
    tree.setLabel(cl);
  }
  for (Tree kid : tree.children()) {
    convertToCoreLabels(kid);
  }
}
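// A hedged usage sketch: Tree.valueOf parses a Penn Treebank bracketing whose labels
// are not CoreLabels until converted. The bracketing below is illustrative only.
public static void demoConvertToCoreLabels() {
  Tree tree = Tree.valueOf("(ROOT (S (NP (NN dog)) (VP (VBZ barks))))");
  convertToCoreLabels(tree);
  // Every node label is now a CoreLabel and can carry extra annotations like token spans
}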
private static void taggedLeafLabels(Tree t, List<CoreLabel> l) {
  if (t.isPreTerminal()) {
    CoreLabel fl = (CoreLabel) t.getChild(0).label();
    fl.set(TagLabelAnnotation.class, t.label());
    l.add(fl);
  } else {
    Tree[] kids = t.children();
    for (int j = 0, n = kids.length; j < n; j++) {
      taggedLeafLabels(kids[j], l);
    }
  }
}
private static int reIndexLeaves(Tree t, int startIndex) {
  if (t.isLeaf()) {
    CoreLabel afl = (CoreLabel) t.label();
    afl.setIndex(startIndex);
    startIndex++;
  } else {
    for (Tree child : t.children()) {
      startIndex = reIndexLeaves(child, startIndex);
    }
  }
  return startIndex;
}
public static String doCorefResolution(Annotation annotation) {
  Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<String> resolved = new ArrayList<String>();
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
      CorefChain chain = corefs.get(corefClustId);
      if (chain == null) {
        resolved.add(token.word());
      } else {
        // Substitute the chain's representative mention, unless this token is
        // already part of the representative mention itself
        CorefMention reprMent = chain.getRepresentativeMention();
        int sentIndex = reprMent.sentNum - 1;
        CoreMap corefSentence = sentences.get(sentIndex);
        List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
        if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            resolved.add(corefSentenceTokens.get(i - 1).word());
          }
        } else {
          resolved.add(token.word());
        }
      }
    }
  }
  StringBuilder resolvedStr = new StringBuilder();
  for (String str : resolved) {
    resolvedStr.append(str).append(" ");
  }
  System.out.println();
  System.out.println(resolvedStr);
  return resolvedStr.toString();
}
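// A hedged usage sketch for doCorefResolution. The annotator list and the dcoref
// choice are assumptions; coreference in CoreNLP also requires the upstream
// tokenize, ssplit, pos, lemma, ner, and parse annotators.
public static void demoCoref() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation = new Annotation("John drove to Judy's house. He made her dinner.");
  pipeline.annotate(annotation);
  // Pronouns should be replaced by their representative mentions,
  // e.g. "He" -> "John" and "her" -> "Judy" in the output
  String resolved = doCorefResolution(annotation);
}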
/**
 * This should be called after the classifier has been trained and parseAndTrain has been called
 * to accumulate the test set.
 *
 * <p>Tallies the per-label true positives, false positives, and false negatives (the raw counts
 * from which precision, recall, and F1 can be computed) and prints each match decision.
 */
public void runTestSet(List<List<CoreLabel>> testSet) {
  Counter<String> tp = new DefaultCounter<>();
  Counter<String> fp = new DefaultCounter<>();
  Counter<String> fn = new DefaultCounter<>();
  Counter<String> actual = new DefaultCounter<>();
  for (List<CoreLabel> labels : testSet) {
    // Create new labels without the answer annotation so the classifier must predict it
    List<CoreLabel> unannotatedLabels = new ArrayList<>();
    for (CoreLabel label : labels) {
      CoreLabel newLabel = new CoreLabel();
      newLabel.set(annotationForWord, label.get(annotationForWord));
      newLabel.set(PartOfSpeechAnnotation.class, label.get(PartOfSpeechAnnotation.class));
      unannotatedLabels.add(newLabel);
    }
    List<CoreLabel> annotatedLabels = this.classifier.classify(unannotatedLabels);
    int ind = 0;
    for (CoreLabel expectedLabel : labels) {
      CoreLabel annotatedLabel = annotatedLabels.get(ind);
      String answer = annotatedLabel.get(AnswerAnnotation.class);
      String expectedAnswer = expectedLabel.get(AnswerAnnotation.class);
      actual.incrementCount(expectedAnswer);
      // Match only non-background symbols
      if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(expectedAnswer)
          && expectedAnswer.equals(answer)) {
        // true positives
        tp.incrementCount(answer);
        System.out.println("True Positive:" + annotatedLabel);
      } else if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(answer)) {
        // false positives
        fp.incrementCount(answer);
        System.out.println("False Positive:" + annotatedLabel);
      } else if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(expectedAnswer)) {
        // false negatives
        fn.incrementCount(expectedAnswer);
        System.out.println("False Negative:" + expectedLabel);
      }
      // else true negatives
      ind++;
    }
  }
  actual.remove(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
}
private void parseThread(ArrayList<Thread> threads) {
  for (Thread t : threads) {
    ThreadVector tv = new ThreadVector(t);
    allThreads.add(tv);
    for (Email e : t.getEmails()) {
      // Collect only the original (non-quoted) content of this email
      StringBuilder sb = new StringBuilder();
      for (Sentence s : e.getSentences()) {
        if (s.getQuotationTimes() == 0) {
          sb.append(s.getText()).append(" ");
        }
      }
      String content = sb.toString().toLowerCase();
      // Create an empty Annotation just with the given text and run all Annotators
      Annotation document = new Annotation(content);
      this.pipeline.annotate(document);
      // Iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        // Collect the lemma of each token in the sentence
        List<String> lemmas = new LinkedList<String>();
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemmas.add(token.get(LemmaAnnotation.class));
        }
        HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
        // If the sentence has valid words, index them and record the sentence vector
        if (wordCount.size() > 0) {
          totalSentenceNumber++;
          for (String word : wordCount.keySet()) {
            if (!dictionaryIndex.containsKey(word)) {
              dictionaryIndex.put(word, dictionaryIndex.size());
              dictionaryDocumentCount.put(word, 1);
            } else {
              dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
            }
          }
          SentenceVector sv = new SentenceVector(sentence.toString(), wordCount);
          tv.addSentenceVectors(sv);
        }
      }
    }
  }
}
public static Collection<String> lemmatize(String rawInput) {
  // TODO: set an initial capacity like this in the other places too
  Collection<String> lemmas = Lists.newArrayListWithCapacity(30);
  Annotation rawInputAnnotation = new Annotation(rawInput);
  coreNlp.annotate(rawInputAnnotation);
  List<CoreLabel> allTokens = rawInputAnnotation.get(TokensAnnotation.class);
  for (CoreLabel eachToken : allTokens) {
    lemmas.add(eachToken.get(LemmaAnnotation.class));
  }
  return lemmas;
}
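// A minimal sketch of the `coreNlp` field assumed by the method above; the annotator
// list is an assumption, since lemmatization needs tokenize, ssplit, pos, and lemma.
// (Lists is Guava's com.google.common.collect.Lists.)
private static final StanfordCoreNLP coreNlp;

static {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
  coreNlp = new StanfordCoreNLP(props);
}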
/**
 * Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).
 *
 * @param conll The CoNLL formatted tree.
 * @return A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
 *     and to tokens in the sentence.
 */
protected Pair<SemanticGraph, List<CoreLabel>> mkTree(String conll) {
  List<CoreLabel> sentence = new ArrayList<>();
  SemanticGraph tree = new SemanticGraph();
  // First pass: create a vertex for each token (and register the root)
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int index = Integer.parseInt(fields[0]);
    String word = fields[1];
    CoreLabel label = IETestUtils.mkWord(word, index);
    sentence.add(label);
    if (fields[2].equals("0")) {
      tree.addRoot(new IndexedWord(label));
    } else {
      tree.addVertex(new IndexedWord(label));
    }
    if (fields.length > 4) {
      label.setTag(fields[4]);
    }
    if (fields.length > 5) {
      label.setNER(fields[5]);
    }
    if (fields.length > 6) {
      label.setLemma(fields[6]);
    }
  }
  // Second pass: add the dependency edges between the registered vertices
  int i = 0;
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int parent = Integer.parseInt(fields[2]);
    String reln = fields[3];
    if (parent > 0) {
      tree.addEdge(
          new IndexedWord(sentence.get(parent - 1)),
          new IndexedWord(sentence.get(i)),
          new GrammaticalRelation(Language.UniversalEnglish, reln, null, null),
          1.0,
          false);
    }
    i += 1;
  }
  return Pair.makePair(tree, sentence);
}
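// A hedged usage sketch for mkTree. The CoNLL fragment is illustrative: column 0 is
// the 1-based token index, column 1 the word, column 2 the head index (0 for the
// root), and column 3 the relation name.
protected void demoMkTree() {
  Pair<SemanticGraph, List<CoreLabel>> parse =
      mkTree("1 cats 2 nsubj\n" + "2 purr 0 root\n");
  System.out.println(parse.first); // the dependency graph rooted at "purr"
  System.out.println(parse.second); // the token list [cats, purr]
}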
/**
 * Determine if the given tree contains a leaf which matches the part-of-speech and lexical
 * criteria.
 *
 * @param pos Regular expression to match part of speech (may be null, in which case any POS is
 *     allowed)
 * @param word Regular expression to match word (may be null, in which case any word is allowed)
 */
public static boolean shouldPrintTree(Tree tree, Pattern pos, Pattern word) {
  for (Tree t : tree) {
    if (t.isPreTerminal()) {
      CoreLabel label = (CoreLabel) t.label();
      String tpos = label.value();
      Tree wordNode = t.firstChild();
      CoreLabel wordLabel = (CoreLabel) wordNode.label();
      String tword = wordLabel.value();
      if ((pos == null || pos.matcher(tpos).find())
          && (word == null || word.matcher(tword).find())) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Sets the labels on the tree (except the leaves) to be the integer value of the sentiment
 * prediction. Makes it easy to print out with Tree.toString()
 */
static void setSentimentLabels(Tree tree) {
  if (tree.isLeaf()) {
    return;
  }
  for (Tree child : tree.children()) {
    setSentimentLabels(child);
  }
  Label label = tree.label();
  if (!(label instanceof CoreLabel)) {
    throw new IllegalArgumentException("Required a tree with CoreLabels");
  }
  CoreLabel cl = (CoreLabel) label;
  cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree)));
}
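// A hedged usage sketch for setSentimentLabels, assuming a pipeline with the
// sentiment annotator (which itself requires tokenize, ssplit, and parse). The
// input sentence is illustrative only.
static void demoSentimentLabels() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation = new Annotation("This movie was surprisingly good.");
  pipeline.annotate(annotation);
  for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
    setSentimentLabels(tree);
    System.out.println(tree); // inner nodes now carry 0-4 sentiment class values
  }
}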