Example #1
  public static ArrayList<String[]> extractNounPhrases(
      StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    MAX_STEPS = searchRange; // class field bounding the left/right search window

    for (CoreMap sentence : sentences) {
      List<CoreLabel> labels = sentence.get(TokensAnnotation.class);

      // Check negation: a single negation word marks the whole sentence as negated
      boolean hasNegation = false;
      for (CoreLabel label : labels) {
        if (NEGATIONS.contains(label.lemma().toLowerCase())) {
          hasNegation = true;
          break;
        }
      }
      String prefix = hasNegation ? NOT_PREFIX : "";

      for (int idx = 0; idx < labels.size(); idx++) {
        CoreLabel label = labels.get(idx);
        if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
          // search outward from the noun, alternating left/right, for the nearest adjective
          for (int step = 1; step <= MAX_STEPS; step++) {
            CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
            if (JJ_TAGS.contains(leftLabel.tag())) {
              addPair(
                  wordPairs,
                  prefix + leftLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
              break;
            }
            CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
            if (JJ_TAGS.contains(rightLabel.tag())) {
              addPair(
                  wordPairs,
                  prefix + rightLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
              break;
            }
          }
        }
      }
    }
    return wordPairs;
  }
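
A minimal driver sketch for Example #1. The host class name NounPhraseExtractor, the sample sentence, and the search range are assumptions; NEGATIONS, NN_TAGS, JJ_TAGS, NOT_PREFIX, MAX_STEPS, and addPair are expected as members of that class. The pipeline must load the pos and lemma annotators, since the method reads PartOfSpeechAnnotation and LemmaAnnotation.

  import java.util.ArrayList;
  import java.util.Properties;
  import edu.stanford.nlp.pipeline.StanfordCoreNLP;

  public class ExtractDemo {
    public static void main(String[] args) {
      Properties props = new Properties();
      // "pos" and "lemma" are required by extractNounPhrases
      props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
      StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

      // NounPhraseExtractor is an assumed name for the class holding Example #1
      ArrayList<String[]> pairs = NounPhraseExtractor.extractNounPhrases(
          pipeline, "The quick brown fox is not slow.", 3);
      for (String[] pair : pairs) {
        System.out.println(pair[0] + " -> " + pair[1]); // adjective -> noun
      }
    }
  }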
Example #2
  public String[] wordsSegment(String text) {
    List<String> listSens = new ArrayList<String>();
    // creates a StanfordCoreNLP object with tokenization and sentence
    // splitting only (no POS tagging, lemmatization, NER, or parsing)
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // traversing the words in the current sentence
      // a CoreLabel is a CoreMap with additional token-specific methods
      List<String> listWord = new ArrayList<String>();
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        // this is the text of the token
        String word = token.get(TextAnnotation.class);
        listWord.add(word);
        // the POS tag would require adding "pos" to the annotators list:
        // String pos = token.get(PartOfSpeechAnnotation.class);
      }
      listSens.add(StringUtils.join(listWord, " "));
    }
    return listSens.toArray(new String[0]);
  }
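
Example #2 builds its own tokenize/ssplit pipeline internally, so calling it is a one-liner. A usage sketch; Segmenter is an assumed name for the class that holds wordsSegment:

  public class SegmenterDemo {
    public static void main(String[] args) {
      Segmenter seg = new Segmenter(); // assumed host class
      String[] sentences = seg.wordsSegment(
          "Stanford CoreNLP is robust. It splits this text into two sentences.");
      for (String s : sentences) {
        System.out.println(s); // one line per sentence, tokens separated by spaces
      }
    }
  }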
Example #3
  public void process(String inFilepath, String outFilepath, String nerOutFile) {
    // relies on class state: a configured pipeline (must load pos, lemma, and ner),
    // the tagFilter/nerFilter sets, and the totalWords/remainWords counters

    try {
      StringBuilder inText = new StringBuilder();
      StringBuilder outText = new StringBuilder();
      StringBuilder nerText = new StringBuilder();

      // read some text in the inText variable from input file
      // read the input file into inText, skipping blank lines;
      // try-with-resources closes the reader even if an exception is thrown
      try (BufferedReader reader = new BufferedReader(new FileReader(inFilepath))) {
        String line;
        while ((line = reader.readLine()) != null) {
          if (line.trim().length() == 0) continue;
          inText.append(line).append("\n");
        }
      }

      // create an empty Annotation just with the given text
      Annotation document = new Annotation(inText.toString());

      // run all Annotators on this text
      pipeline.annotate(document);

      // these are all the sentences in this document
      // a CoreMap is essentially a Map that uses class objects as keys and has values with custom
      // types
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);

      for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          totalWords++;
          String pos = token.tag();
          if (tagFilter.contains(pos)) {
            remainWords++;
            String lemma = token.lemma();
            outText.append(lemma + " ");
            if (nerFilter.contains(token.ner())) {
              nerText.append(token.word() + " ");
            }
          }
        }
      }

      // write the processed text to output file
      FileWriter fw = FileUtil.open(outFilepath);
      fw.append(outText);
      FileUtil.close(fw);

      if (nerOutFile != null) {
        FileWriter fw2 = FileUtil.open(nerOutFile);
        fw2.append(nerText);
        FileUtil.close(fw2);
      }

    } catch (IOException e) {
      // FileNotFoundException is a subclass of IOException, so one catch suffices
      e.printStackTrace();
    }
  }
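
Example #3 leans on class state and a FileUtil helper that are not shown. A plausible minimal context, with field names taken from the snippet but all initial values and the FileUtil shape assumed:

  import java.io.FileWriter;
  import java.io.IOException;
  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.Properties;
  import java.util.Set;
  import edu.stanford.nlp.pipeline.StanfordCoreNLP;

  public class CorpusPreprocessor {
    // must include pos, lemma, and ner: the loop reads tag(), lemma(), and ner()
    private final StanfordCoreNLP pipeline;
    // POS tags to keep (values assumed: content words only)
    private final Set<String> tagFilter =
        new HashSet<>(Arrays.asList("NN", "NNS", "NNP", "VB", "JJ"));
    // NER labels to keep (values assumed)
    private final Set<String> nerFilter =
        new HashSet<>(Arrays.asList("PERSON", "LOCATION", "ORGANIZATION"));
    private int totalWords = 0;
    private int remainWords = 0;

    public CorpusPreprocessor() {
      Properties props = new Properties();
      props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
      this.pipeline = new StanfordCoreNLP(props);
    }

    // assumed shape of the FileUtil helper used in process()
    static final class FileUtil {
      static FileWriter open(String path) throws IOException {
        return new FileWriter(path);
      }
      static void close(FileWriter fw) throws IOException {
        fw.flush();
        fw.close();
      }
    }
  }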