Example #1
  public static void prepareWord2VecCorpus(
      Properties properties, Parameters parameters, String output) throws Exception {
    File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));
    File fOutputFile = new File(output);
    Tokenizer tokenizer = new Tokenizer(true, ' ');
    MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
    Morphology morphology = new Morphology();

    // try-with-resources ensures the writer is closed even if an exception is thrown
    try (OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(fOutputFile), "utf-8")) {
      for (File abstractFile : fAbstractDir.listFiles()) {
        Abstract ab = (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());

        for (ADESentence sentence : ab.sentences) {
          // tokenize, POS-tag, and stem each sentence before writing it out
          List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
          tagger.tagCoreLabels(tokens);
          for (CoreLabel token : tokens) {
            morphology.stem(token);
          }
          // one preprocessed sentence per line, tokens separated by spaces
          for (CoreLabel token : tokens) {
            osw.write(NNADE.wordPreprocess(token, parameters) + " ");
          }
          osw.write("\n");
        }
      }
    }

    System.out.println("Generated word2vec corpus.");
    System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
    System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
  }
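
For orientation, a minimal self-contained sketch of the tag-and-stem step used above, with the project-specific Abstract/ADESentence/NNADE plumbing removed. The class name, sample word, and hand-set POS tag are illustrative only; Stanford CoreNLP's CoreLabel and Morphology are assumed to be on the classpath.

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Morphology;

public class StemSketch {
  public static void main(String[] args) {
    Morphology morphology = new Morphology();

    // build a token the way the tokenizer/tagger above would: word plus POS tag
    CoreLabel token = new CoreLabel();
    token.setWord("reported");
    token.setTag("VBD");  // tag set by hand here instead of running MaxentTagger

    // Morphology.stem(CoreLabel) stores the lemma in the token's LemmaAnnotation
    morphology.stem(token);
    System.out.println(token.word() + " -> " + token.get(CoreAnnotations.LemmaAnnotation.class));
    // typically prints: reported -> report
  }
}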
  public static TaggedWord verbToGerund(TaggedWord verb) {
    Morphology wordMorpher = new Morphology();
    String stem = wordMorpher.stem(verb.word());
    // drop a trailing vowel (e.g. "make" -> "mak") before appending "-ing";
    // "do" is kept intact so it becomes "doing" rather than "ding"
    if (!stem.equals("do")) {
      stem = stem.replaceAll("[aeiou]?$", "");
    }
    return new TaggedWord(stem + "ing", "VBG");
  }
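
A quick, illustrative call of verbToGerund; the sample word is not from the original source, and the method is assumed to be in scope (e.g. defined in the same class):

// assumes verbToGerund(...) above is accessible from this code
TaggedWord gerund = verbToGerund(new TaggedWord("makes", "VBZ"));
System.out.println(gerund.word() + "/" + gerund.tag());  // typically prints: making/VBG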
  /** Returns the fraction of word stems in s2 that also occur in s1. */
  private static float lexSimilarity(String s1, String s2) {
    // split each input into sentences on end-of-sentence punctuation
    String[] split1 = s1.split("[.?!]");
    String[] split2 = s2.split("[.?!]");
    Set<String> stemsInFirst = new HashSet<String>();
    Set<String> stemsInSecond = new HashSet<String>();

    // m is a Morphology instance declared elsewhere in the enclosing class
    for (int i = 0; i < split1.length; i++) {
      PTBTokenizer<Word> tokenizer1 = PTBTokenizer.newPTBTokenizer(new StringReader(split1[i]));
      while (tokenizer1.hasNext()) {
        Word w = tokenizer1.next();
        stemsInFirst.add(m.stem(w).word());
      }
    }

    for (int j = 0; j < split2.length; j++) {
      PTBTokenizer<Word> tokenizer2 = PTBTokenizer.newPTBTokenizer(new StringReader(split2[j]));
      while (tokenizer2.hasNext()) {
        Word w = tokenizer2.next();
        stemsInSecond.add(m.stem(w).word());
      }
    }

    // count how many stems from the second string also appear in the first
    float commonStems = 0;
    for (String curStem : stemsInSecond) {
      if (stemsInFirst.contains(curStem)) commonStems++;
    }
    int secondSize = stemsInSecond.size();
    if (secondSize > 0) return commonStems / (float) secondSize;
    else return 0;
  }
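
Because lexSimilarity relies on a Morphology field m defined elsewhere in its class, here is a self-contained sketch of the same stem-overlap idea; the class name, helper method, and sample sentences are illustrative, not from the original:

import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.process.PTBTokenizer;

public class StemOverlapSketch {
  /** Tokenizes one sentence with PTBTokenizer and collects the stems of its tokens. */
  static Set<String> stemSet(String sentence, Morphology morphology) {
    Set<String> stems = new HashSet<>();
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
    while (tokenizer.hasNext()) {
      stems.add(morphology.stem(tokenizer.next()).word());
    }
    return stems;
  }

  public static void main(String[] args) {
    Morphology morphology = new Morphology();
    Set<String> a = stemSet("The dogs were barking.", morphology);
    Set<String> b = stemSet("A dog barks loudly.", morphology);
    // fraction of stems in b that also occur in a, as in lexSimilarity above
    long common = b.stream().filter(a::contains).count();
    System.out.println((float) common / b.size());
  }
}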
  private static void addLemma(
      Morphology morpha,
      Class<? extends CoreAnnotation<String>> ann,
      CoreMap map,
      String word,
      String tag) {
    if (tag.length() > 0) {
      // phrasalVerb(...) is a companion helper in the same class; it returns a lemmatized
      // form for particle verbs written with an underscore (e.g. "picked_up"), or null otherwise
      String phrasalVerb = phrasalVerb(morpha, word, tag);
      if (phrasalVerb == null) {
        map.set(ann, morpha.lemma(word, tag));
      } else {
        map.set(ann, phrasalVerb);
      }
    } else {
      // no POS tag available: fall back to the bare stemmer
      map.set(ann, morpha.stem(word));
    }
  }
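
addLemma is a private helper inside CoreNLP's Morphology (phrasalVerb is its sibling helper); application code usually reaches the same behaviour through the public lemma/stem methods, roughly as in this illustrative sketch:

import edu.stanford.nlp.process.Morphology;

public class LemmaSketch {
  public static void main(String[] args) {
    Morphology morpha = new Morphology();
    // with a POS tag, lemma(word, tag) is used (the tag.length() > 0 branch above)
    System.out.println(morpha.lemma("ran", "VBD"));  // typically prints: run
    // without a tag, only the bare stemmer is available (the else branch above)
    System.out.println(morpha.stem("cars"));         // typically prints: car
  }
}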
  public static TaggedWord verbToBaseTense(TaggedWord verb) {
    Morphology wordMorpher = new Morphology();
    // stem the verb and retag it as a base-form verb (VB)
    return new TaggedWord(wordMorpher.stem(verb.word()), "VB");
  }