public static void prepareWord2VecCorpus(Properties properties, Parameters parameters, String output) throws Exception {
    File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));
    File fOutputFile = new File(output);
    Tokenizer tokenizer = new Tokenizer(true, ' ');
    MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
    Morphology morphology = new Morphology();

    OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(fOutputFile), "utf-8");
    for (File abstractFile : fAbstractDir.listFiles()) {
        Abstract ab = (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());
        for (ADESentence sentence : ab.sentences) {
            // Tokenize, POS-tag, and stem each sentence before preprocessing.
            List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
            tagger.tagCoreLabels(tokens);
            for (int i = 0; i < tokens.size(); i++)
                morphology.stem(tokens.get(i));
            // Write the preprocessed tokens space-separated, one sentence per line.
            for (CoreLabel token : tokens) {
                String temp = NNADE.wordPreprocess(token, parameters);
                osw.write(temp + " ");
            }
            osw.write("\n");
        }
    }
    osw.close();

    System.out.println("Generated the word2vec corpus.");
    System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
    System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
}
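// A minimal driver sketch for prepareWord2VecCorpus. The property keys ("corpusDir",
// "pos_tagger") come from the method above; the file paths are illustrative placeholders,
// and Parameters is assumed here to have a default constructor (the real project may differ).
public static void buildCorpusExample() throws Exception {
    Properties props = new Properties();
    props.setProperty("corpusDir", "/data/ade/abstracts");                        // serialized Abstract files
    props.setProperty("pos_tagger", "models/english-left3words-distsim.tagger");  // POS tagger model (placeholder path)
    Parameters params = new Parameters();  // assumed default constructor
    prepareWord2VecCorpus(props, params, "word2vec_corpus.txt");
}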
public static TaggedWord verbToGerund(TaggedWord verb) {
    Morphology wordMorpher = new Morphology();
    String stem = wordMorpher.stem(verb.word());
    // Heuristic: drop a trailing vowel from the stem before appending "-ing"
    // (e.g. "make" -> "making"); "do" keeps its vowel so it becomes "doing".
    if (!stem.equals("do")) {
        stem = stem.replaceAll("[aeiou]?$", "");
    }
    return new TaggedWord(stem + "ing", "VBG");
}
private static float lexSimilarity(String s1, String s2) {
    String[] split1 = s1.split("[.?!]");
    String[] split2 = s2.split("[.?!]");
    Set<String> stemsInFirst = new HashSet<String>();
    Set<String> stemsInSecond = new HashSet<String>();

    // Collect the set of word stems occurring in each text
    // (m is the class's shared Morphology instance).
    for (int i = 0; i < split1.length; i++) {
        PTBTokenizer<Word> tokenizer1 = PTBTokenizer.newPTBTokenizer(new StringReader(split1[i]));
        while (tokenizer1.hasNext()) {
            Word w = tokenizer1.next();
            String stem = m.stem(w).word();
            stemsInFirst.add(stem);
        }
    }
    for (int j = 0; j < split2.length; j++) {
        PTBTokenizer<Word> tokenizer2 = PTBTokenizer.newPTBTokenizer(new StringReader(split2[j]));
        while (tokenizer2.hasNext()) {
            Word w = tokenizer2.next();
            String stem = m.stem(w).word();
            stemsInSecond.add(stem);
        }
    }

    // Count how many of the second text's stems also occur in the first.
    float commonStems = 0;
    for (String curStem : stemsInSecond) {
        if (stemsInFirst.contains(curStem)) {
            commonStems++;
        }
    }

    // Return the overlap as a fraction of the second text's stem set.
    int secondSize = stemsInSecond.size();
    if (secondSize > 0) {
        return commonStems / (float) secondSize;
    } else {
        return 0;
    }
}
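// A hypothetical call to lexSimilarity. It assumes m is a static Morphology field of the
// enclosing class (as the method above requires); the two input texts are illustrative only.
public static void lexSimilarityExample() {
    float sim = lexSimilarity("The committee approved the proposal.",
            "The proposal was approved yesterday.");
    // sim is the fraction of the second text's stems that also appear in the first (0.0 to 1.0).
    System.out.println(sim);
}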
private static void addLemma(Morphology morpha, Class<? extends CoreAnnotation<String>> ann,
                             CoreMap map, String word, String tag) {
    if (tag.length() > 0) {
        // With a POS tag available, first check for a phrasal-verb reading;
        // if there is none, use the tag-aware lemmatizer.
        String phrasalVerb = phrasalVerb(morpha, word, tag);
        if (phrasalVerb == null) {
            map.set(ann, morpha.lemma(word, tag));
        } else {
            map.set(ann, phrasalVerb);
        }
    } else {
        // Without a POS tag, fall back to stemming the bare word form.
        map.set(ann, morpha.stem(word));
    }
}
public static TaggedWord verbToBaseTense(TaggedWord verb) {
    // Reduce the verb to its stem and retag it as a base-form verb (VB).
    Morphology wordMorpher = new Morphology();
    return new TaggedWord(wordMorpher.stem(verb.word()), "VB");
}
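// A small usage sketch for the verb-form helpers above; the sample token is illustrative
// and the helpers are assumed to be in scope (e.g. defined in the same class).
public static void verbFormExample() {
    TaggedWord verb = new TaggedWord("make", "VB");
    System.out.println(verbToBaseTense(verb));  // make/VB
    System.out.println(verbToGerund(verb));     // making/VBG (trailing vowel dropped before "-ing")
}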