Ejemplo n.º 1
0
  /**
   * Writes a plain-text corpus file (one sentence per line, tokens separated by single spaces)
   * from the serialized abstracts found in the corpus directory.
   *
   * <p>Every file in the directory named by the {@code corpusDir} property is deserialized into an
   * {@code Abstract}. Each sentence is tokenized, tagged with the model named by the {@code
   * pos_tagger} property, stemmed, and every token is passed through {@code NNADE.wordPreprocess}
   * before being written to {@code output} in UTF-8. A summary is printed to standard output
   * once the corpus has been written.
   *
   * @param properties configuration providing the {@code corpusDir} and {@code pos_tagger} entries
   * @param parameters preprocessing parameters forwarded to {@code NNADE.wordPreprocess}
   * @param output path of the corpus file to create (an existing file is overwritten)
   * @throws IllegalArgumentException if {@code corpusDir} does not denote a listable directory
   * @throws Exception if loading the tagger, deserializing an abstract, or writing the file fails
   */
  public static void prepareWord2VecCorpus(
      Properties properties, Parameters parameters, String output) throws Exception {
    File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));
    // File.listFiles() returns null (not an empty array) when the path is not a directory or
    // cannot be read; fail with a clear message before creating the output file.
    File[] abstractFiles = fAbstractDir.listFiles();
    if (abstractFiles == null) {
      throw new IllegalArgumentException(
          "corpusDir is not a readable directory: " + fAbstractDir.getAbsolutePath());
    }

    File fOutputFile = new File(output);
    Tokenizer tokenizer = new Tokenizer(true, ' ');
    MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
    Morphology morphology = new Morphology();

    // try-with-resources guarantees the writer is flushed and closed even when deserialization,
    // tagging or writing throws part-way through the corpus.
    try (OutputStreamWriter osw =
        new OutputStreamWriter(new FileOutputStream(fOutputFile), "utf-8")) {
      for (File abstractFile : abstractFiles) {
        Abstract ab =
            (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());

        for (ADESentence sentence : ab.sentences) {
          List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
          tagger.tagCoreLabels(tokens);
          for (CoreLabel token : tokens) {
            morphology.stem(token);
          }
          for (CoreLabel token : tokens) {
            String temp = NNADE.wordPreprocess(token, parameters);
            osw.write(temp + " ");
          }
          osw.write("\n");
        }
      }
    }

    System.out.println("Generate a word2vec corpus.");
    System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
    System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
  }
Ejemplo n.º 2
0
 /**
  * Builds the gerund ("-ing" form) of a verb, tagged {@code VBG}.
  *
  * <p>The verb's word is first reduced with {@code Morphology.stem}; the suffix is then attached
  * using basic English spelling rules:
  *
  * <ul>
  *   <li>a final "ie" becomes "y" (die, dying);
  *   <li>a final "e" is dropped (make, making) unless it follows "e", "o" or "y" (see, seeing;
  *       canoe, canoeing; dye, dyeing) or the stem has only two letters (be, being);
  *   <li>every other ending is kept, including a final a/i/o/u (go, going; ski, skiing).
  * </ul>
  *
  * Consonant doubling (run, running) is not attempted.
  *
  * @param verb the verb whose {@code word()} is converted
  * @return a new {@code TaggedWord} holding the gerund with tag {@code "VBG"}
  */
 public static TaggedWord verbToGerund(TaggedWord verb) {
   Morphology wordMorpher = new Morphology();
   String stem = wordMorpher.stem(verb.word());
   int length = stem.length();
   if (stem.endsWith("ie")) {
     stem = stem.substring(0, length - 2) + "y";
   } else if (length > 2
       && stem.charAt(length - 1) == 'e'
       && "eoy".indexOf(stem.charAt(length - 2)) < 0) {
     // Only a silent final "e" is removed; stripping any trailing vowel turned "go" into "ging".
     stem = stem.substring(0, length - 1);
   }
   return new TaggedWord(stem + "ing", "VBG");
 }
Ejemplo n.º 3
0
 /**
  * Computes a lemma for {@code word} and stores it in {@code map} under the key {@code ann}.
  *
  * <p>With an empty tag the value is {@code morpha.stem(word)}. Otherwise the phrasal-verb form
  * returned by {@code phrasalVerb} is used when it is non-null, and {@code morpha.lemma(word,
  * tag)} is used when it is null.
  *
  * @param morpha morphology instance used for stemming/lemmatizing
  * @param ann annotation key under which the lemma is stored
  * @param map target map receiving the lemma
  * @param word surface form to lemmatize
  * @param tag part-of-speech tag of {@code word}; may be empty
  */
 private static void addLemma(
     Morphology morpha,
     Class<? extends CoreAnnotation<String>> ann,
     CoreMap map,
     String word,
     String tag) {
   final String lemma;
   if (tag.isEmpty()) {
     lemma = morpha.stem(word);
   } else {
     String phrasal = phrasalVerb(morpha, word, tag);
     lemma = (phrasal != null) ? phrasal : morpha.lemma(word, tag);
   }
   map.set(ann, lemma);
 }
Ejemplo n.º 4
0
  /**
   * Measures lexical overlap as the share of distinct stems of {@code s2} that also occur among
   * the stems of {@code s1}.
   *
   * @param s1 text whose stems form the reference set
   * @param s2 text whose stems are looked up in the reference set
   * @return (number of distinct stems of {@code s2} found in {@code s1}) divided by (number of
   *     distinct stems of {@code s2}), or 0 when {@code s2} yields no stems
   */
  private static float lexSimilarity(String s1, String s2) {
    String[] firstFragments = s1.split("[.?!]");
    String[] secondFragments = s2.split("[.?!]");
    Set<String> firstStems = collectStems(firstFragments);
    Set<String> secondStems = collectStems(secondFragments);

    if (secondStems.isEmpty()) {
      return 0;
    }
    int shared = 0;
    for (String stem : secondStems) {
      if (firstStems.contains(stem)) {
        shared++;
      }
    }
    return (float) shared / secondStems.size();
  }

  /** Tokenizes each fragment with a PTB tokenizer and gathers the distinct stems of all tokens. */
  private static Set<String> collectStems(String[] fragments) {
    Set<String> stems = new HashSet<String>();
    for (String fragment : fragments) {
      PTBTokenizer<Word> tokens = PTBTokenizer.newPTBTokenizer(new StringReader(fragment));
      while (tokens.hasNext()) {
        Word token = tokens.next();
        stems.add(m.stem(token).word());
      }
    }
    return stems;
  }
Ejemplo n.º 5
0
 /**
  * Populates word, lemma and tag from a delimited string.
  *
  * <ul>
  *   <li>No divider present: the whole string becomes the word; lemma and tag are set to {@code
  *       null}.
  *   <li>One divider ({@code word<divider>tag}): word and tag are taken from the string and the
  *       lemma is computed with {@code Morphology.lemmaStatic(word, tag)}.
  *   <li>Two or more dividers ({@code word<divider>lemma<divider>tag}): the text before the first
  *       divider is the word, the text after the last divider is the tag, and everything in
  *       between is the lemma.
  * </ul>
  *
  * @param labelStr the string to parse
  * @param divider the separator between the fields; may be longer than one character
  */
 public void setFromString(String labelStr, String divider) {
   int first = labelStr.indexOf(divider);
   if (first < 0) {
     // Must be tested before comparing first and last occurrence: with no divider both are -1,
     // which would otherwise be mistaken for the single-divider case and make substring throw.
     setWord(labelStr);
     setLemma(null);
     setTag(null);
     return;
   }

   int second = labelStr.lastIndexOf(divider);
   int dividerLength = divider.length();
   String word = labelStr.substring(0, first);
   if (first == second) {
     String tag = labelStr.substring(first + dividerLength);
     setWord(word);
     setTag(tag);
     setLemma(Morphology.lemmaStatic(word, tag));
   } else {
     // Clamp so that overlapping occurrences of a multi-character divider yield an empty lemma
     // instead of an out-of-range substring.
     int lemmaStart = Math.min(first + dividerLength, second);
     setWord(word);
     setLemma(labelStr.substring(lemmaStart, second));
     setTag(labelStr.substring(second + dividerLength));
   }
 }
Ejemplo n.º 6
0
  /**
   * Lemmatizes a phrasal verb written as {@code verb_particle}.
   *
   * <p>A result is produced only when the tag starts with {@code "VB"}, splitting the word on
   * underscores gives exactly two parts, and the second part is contained in {@code particles}.
   * The first part is then lemmatized with the given tag and re-joined with the particle.
   *
   * @param morpha morphology instance used to lemmatize the verb part
   * @param word the token to inspect; must not be null
   * @param tag the token's part-of-speech tag; must not be null
   * @return the lemmatized phrasal verb, or {@code null} if the token is not a phrasal verb
   */
  private static String phrasalVerb(Morphology morpha, String word, String tag) {
    assert (word != null);
    assert (tag != null);

    // Only verbs that contain an underscore are candidates.
    boolean candidate = tag.startsWith("VB") && word.contains("_");
    if (!candidate) {
      return null;
    }

    String[] parts = word.split("_");
    if (parts.length == 2 && particles.contains(parts[1])) {
      return morpha.lemma(parts[0], tag) + '_' + parts[1];
    }
    return null;
  }
  /**
   * Normalizes a tagged sentence before stop-word removal.
   *
   * <p>Tokens whose word is one of the listed punctuation/special tokens are dropped. Every other
   * word is lemmatized with {@code Morphology.lemmaStatic} and has its hyphens replaced by spaces,
   * except when its tag starts with {@code "NNP"}, in which case the word is kept as is. The
   * contractions {@code n't}, {@code 's}, {@code 'll}, {@code 'm}, {@code m} and {@code im} are
   * then expanded to {@code not}, {@code is}, {@code will} and {@code am}. Tags are carried over
   * unchanged. The resulting list is finally passed through {@code StopWordRemoval}.
   *
   * @param taggedWords the tagged tokens to normalize
   * @return the list returned by {@code StopWordRemoval} for the normalized tokens
   */
  public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
    String[] skippedTokens = {
      ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"
    };
    HashMap<String, String> skipLookup = new HashMap<String, String>();
    for (String token : skippedTokens) {
      skipLookup.put(token, token);
    }

    ArrayList<TaggedWord> normalized = new ArrayList<TaggedWord>();
    for (TaggedWord tagged : taggedWords) {
      String word = tagged.word();
      String posTag = tagged.tag();
      if (skipLookup.containsKey(word)) {
        continue;
      }

      // Words tagged NNP* keep their surface form; everything else is lemmatized.
      if (!posTag.startsWith("NNP")) {
        word = Morphology.lemmaStatic(word, posTag, true).replace('-', ' ');
      }

      String expanded;
      if (word.equals("n't")) {
        expanded = "not";
      } else if (word.equals("'s")) {
        expanded = "is";
      } else if (word.equals("'ll")) {
        expanded = "will";
      } else if (word.equals("'m") || word.equals("m") || word.equals("im")) {
        expanded = "am";
      } else {
        expanded = word;
      }
      normalized.add(new TaggedWord(expanded, posTag));
    }
    return StopWordRemoval(normalized);
  }
Ejemplo n.º 8
0
 /**
  * Reduces a verb to the form returned by {@code Morphology.stem} and tags it {@code VB}.
  *
  * @param verb the verb whose {@code word()} is stemmed
  * @return a new {@code TaggedWord} holding the stemmed word with tag {@code "VB"}
  */
 public static TaggedWord verbToBaseTense(TaggedWord verb) {
   String baseForm = new Morphology().stem(verb.word());
   return new TaggedWord(baseForm, "VB");
 }
Ejemplo n.º 9
0
 /**
  * Creates a {@code WordLemmaTag} from a word Label and a tag Label. The lemma is derived by
  * running {@code Morphology.stemStatic} on the word/tag pair.
  *
  * @param word Label forwarded to the single-Label constructor of this class
  * @param tag Label whose {@code value()} becomes the tag of this object
  */
 public WordLemmaTag(Label word, Label tag) {
   this(word);
   final WordTag wordWithTag = new WordTag(word, tag);
   final String derivedLemma = Morphology.stemStatic(wordWithTag).word();
   this.lemma = derivedLemma;
   setTag(tag.value());
 }
Ejemplo n.º 10
0
 /**
  * Create a new {@code WordLemmaTag}.
  *
  * @param word This word is set as the word of this Label
  * @param tag The {@code value()} of this Label is set as the tag of this Label
  */
 public WordLemmaTag(String word, String tag) {
   WordTag wT = new WordTag(word, tag);
   this.word = word;
   this.lemma = Morphology.stemStatic(wT).word();
   setTag(tag);
 }