Example #1
0
  /**
   * Splits {@code text} into sentences and appends each one as a {@link Phrase}
   * to {@code phrases}.
   *
   * <p>Tokenizes with PTBTokenizer (CoreLabels keep character offsets), groups the
   * tokens into sentences, then recovers each sentence's original surface text by
   * substring-ing {@code text} between sentence boundaries — this preserves the
   * original spacing inside each sentence rather than re-joining tokens.
   */
  public void createPhrases() {
    //// Tokenize
    List<CoreLabel> tokens = new ArrayList<>();
    PTBTokenizer<CoreLabel> tokenizer =
        new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), "");
    while (tokenizer.hasNext()) {
      tokens.add(tokenizer.next());
    }
    //// Split sentences from tokens
    List<List<CoreLabel>> sentences = new WordToSentenceProcessor<CoreLabel>().process(tokens);
    //// Join back together using the character offset of each sentence's last token
    int start = 0;
    for (List<CoreLabel> sentence : sentences) {
      int end = sentence.get(sentence.size() - 1).endPosition();
      phrases.add(new Phrase(text.substring(start, end).trim()));
      start = end;
    }
  }
Example #2
0
 /**
  * Demo: tokenizes {@code paragraph} with an invertible PTBTokenizer and prints
  * every token together with its character span, e.g. {@code word [0-4];}.
  * The {@code invertible=true} option preserves the original text and offsets.
  */
 public static void standfordNLP() {
   PTBTokenizer<CoreLabel> tokenizer =
       new PTBTokenizer<>(
           new StringReader(paragraph), new CoreLabelTokenFactory(), "invertible=true");
   while (tokenizer.hasNext()) {
     CoreLabel token = tokenizer.next();
     StringBuilder piece = new StringBuilder();
     piece
         .append(token.originalText())
         .append(" [")
         .append(token.beginPosition())
         .append('-')
         .append(token.endPosition())
         .append("];");
     System.out.print(piece);
   }
   System.out.println();
 }
  /**
   * Lexical similarity of {@code s2} against {@code s1}: the fraction of
   * distinct word stems in {@code s2} that also occur in {@code s1}.
   * Returns 0 when {@code s2} yields no stems.
   */
  private static float lexSimilarity(String s1, String s2) {
    Set<String> stemsInFirst = collectStems(s1);
    Set<String> stemsInSecond = collectStems(s2);

    // Count stems of the second string that are shared with the first.
    float commonStems = 0;
    for (String stem : stemsInSecond) {
      if (stemsInFirst.contains(stem)) {
        commonStems++;
      }
    }

    int secondSize = stemsInSecond.size();
    return secondSize > 0 ? commonStems / (float) secondSize : 0;
  }

  /**
   * Splits {@code text} on sentence punctuation, tokenizes each piece, and
   * returns the set of stems produced by the shared stemmer {@code m}.
   */
  private static Set<String> collectStems(String text) {
    Set<String> stems = new HashSet<String>();
    for (String piece : text.split("[.?!]")) {
      PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(piece));
      while (tokenizer.hasNext()) {
        stems.add(m.stem(tokenizer.next()).word());
      }
    }
    return stems;
  }
 /**
  * Tokenizes a date string into the {@code tokens} field.
  *
  * <p>Normalizes hyphens into spaced {@code " - "} separators and strips commas,
  * then runs the PTB tokenizer over the result. A {@code null} input leaves
  * {@code tokens} empty.
  *
  * @param inputDate raw date string, may be {@code null}
  */
 private void tokenizeDate(String inputDate) {
   tokens = new ArrayList<String>();
   if (inputDate == null) {
     System.out.println("Null input date");
     // Bug fix: the original only logged and then dereferenced the null
     // input via pat.matcher(inputDate), throwing an NPE. Bail out instead.
     return;
   }
   Pattern pat = Pattern.compile("[-]");
   Matcher m = pat.matcher(inputDate);
   // "1-2" -> "1 - 2" so the hyphen becomes its own token.
   String str = m.replaceAll(" - ");
   str = str.replaceAll(",", " ");
   PTBTokenizer<Word> tokenizer =
       PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
   while (tokenizer.hasNext()) {
     Word nextToken = tokenizer.next();
     tokens.add(nextToken.toString());
   }
   if (DEBUG) {
     System.out.println("tokens:" + tokens);
   }
 }
Example #5
0
  /**
   * Pig UDF entry point: tokenizes the first field of {@code input} and returns
   * a bag containing one tuple per token longer than two characters.
   *
   * @param input tuple whose field 0 is the text to tokenize
   * @return bag of single-field token tuples, or {@code null} when the input
   *     tuple is null, empty, or has a null first field
   * @throws IOException propagated from tuple field access
   */
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
      return null;
    }

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    // Parameterized (was raw-typed) so no unchecked cast is needed below.
    PTBTokenizer<CoreLabel> ptbt =
        new PTBTokenizer<>(textInput, new CoreLabelTokenFactory(), "");

    while (ptbt.hasNext()) {
      CoreLabel label = ptbt.next();
      // Drop very short tokens (punctuation, one/two-letter words).
      if (label.value().length() > 2) {
        System.err.println(label.toString());
        Tuple termText = tupleFactory.newTuple(label.word());
        bagOfTokens.add(termText);
      }
    }
    return bagOfTokens;
  }
 /**
  * Tokenizes {@code filename}, converts PTB bracket escape codes back to their
  * literal characters, and writes one clean sentence per line to
  * {@code filename + ".tokenized.SPL"}.
  *
  * <p>Sentences are delimited by {@code .}, {@code ?}, or {@code !}; only
  * sentences passing {@code isCleanSentence()} are emitted. Note that trailing
  * tokens after the last sentence terminator are discarded (original behavior).
  *
  * @throws IOException if the input file cannot be read or the output created
  */
 public static void main(String[] args) throws IOException {
   // try-with-resources closes (and flushes) both streams even when
   // tokenization throws; the original leaked them on exception.
   try (PrintWriter pw = new PrintWriter(filename + ".tokenized.SPL");
       FileReader reader = new FileReader(filename)) {
     // Parameterized (was raw-typed), removing the need for @SuppressWarnings.
     PTBTokenizer<CoreLabel> tokenizer =
         new PTBTokenizer<>(reader, new CoreLabelTokenFactory(), "");
     Sentence s = new Sentence();
     while (tokenizer.hasNext()) {
       CoreLabel label = tokenizer.next();
       String token = normalizeBrackets(label.toString("value"));
       s.words.add(token);
       // Sentence-final punctuation closes the current sentence.
       if (token.equals(".") || token.equals("?") || token.equals("!")) {
         if (s.isCleanSentence()) {
           pw.println(s.toString());
         }
         s = new Sentence();
       }
     }
   }
 }

 /** Maps PTB bracket escape codes (-LRB-, -RRB-, ...) back to literal brackets. */
 private static String normalizeBrackets(String token) {
   switch (token) {
     case "-LRB-":
       return "(";
     case "-RRB-":
       return ")";
     case "-LSB-":
       return "[";
     case "-RSB-":
       return "]";
     case "-LCB-":
       return "{";
     case "-RCB-":
       return "}";
     default:
       return token;
   }
 }