/**
 * Writes a plain-text corpus for word2vec training: one output line per sentence, with every
 * preprocessed token followed by a single space.
 *
 * <p>Each file in the directory named by the {@code corpusDir} property is deserialized as an
 * {@code Abstract}; its sentences are tokenized, POS-tagged with the model named by the
 * {@code pos_tagger} property, stemmed, and passed through {@code NNADE.wordPreprocess} before
 * being written as UTF-8.
 *
 * @param properties supplies the {@code corpusDir} and {@code pos_tagger} settings
 * @param parameters forwarded to {@code NNADE.wordPreprocess}; its {@code wordPreprocess} and
 *     {@code embeddingSize} values are echoed to standard output at the end
 * @param output path of the corpus file to create (an existing file is overwritten)
 * @throws IllegalArgumentException if {@code corpusDir} cannot be listed as a directory
 * @throws Exception if reading an abstract or writing the corpus fails
 */
public static void prepareWord2VecCorpus(
    Properties properties, Parameters parameters, String output) throws Exception {
  File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));
  // listFiles() returns null (not an empty array) for a missing path or a non-directory.
  // Check before the output file is opened so a bad configuration does not truncate it.
  File[] abstractFiles = fAbstractDir.listFiles();
  if (abstractFiles == null) {
    throw new IllegalArgumentException(
        "corpusDir is not a readable directory: " + fAbstractDir.getAbsolutePath());
  }
  File fOutputFile = new File(output);

  Tokenizer tokenizer = new Tokenizer(true, ' ');
  MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
  Morphology morphology = new Morphology();

  OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(fOutputFile), "utf-8");
  try {
    for (File abstractFile : abstractFiles) {
      Abstract ab = (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());
      for (ADESentence sentence : ab.sentences) {
        List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
        tagger.tagCoreLabels(tokens);
        for (CoreLabel token : tokens) {
          morphology.stem(token);
        }
        for (CoreLabel token : tokens) {
          osw.write(NNADE.wordPreprocess(token, parameters) + " ");
        }
        osw.write("\n");
      }
    }
  } finally {
    // Always release the file handle, including when an abstract fails to load midway.
    osw.close();
  }

  System.out.println("Generate a word2vec corpus.");
  System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
  System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
}
/**
 * Builds the "-ing" form of a verb and tags it {@code VBG}.
 *
 * <p>The verb's word is first reduced with {@code Morphology.stem}; the suffix is then attached
 * using these spelling rules:
 *
 * <ul>
 *   <li>a final "ie" becomes "y" (die -> dying);
 *   <li>a final silent "e" is dropped (make -> making), except after "e", "o" or "y"
 *       (see -> seeing, canoe -> canoeing, dye -> dyeing) and in two-letter words (be -> being);
 *   <li>anything else is kept unchanged (go -> going, ski -> skiing, do -> doing).
 * </ul>
 *
 * <p>Final-consonant doubling (run -> running) is not attempted.
 *
 * @param verb the verb to convert; only its {@code word()} is read
 * @return a new {@code TaggedWord} holding the gerund with tag {@code "VBG"}
 */
public static TaggedWord verbToGerund(TaggedWord verb) {
  Morphology wordMorpher = new Morphology();
  String stem = wordMorpher.stem(verb.word());

  String base;
  if (stem.endsWith("ie")) {
    base = stem.substring(0, stem.length() - 2) + "y";
  } else if (stem.length() > 2
      && stem.endsWith("e")
      && !stem.endsWith("ee")
      && !stem.endsWith("oe")
      && !stem.endsWith("ye")) {
    base = stem.substring(0, stem.length() - 1);
  } else {
    // Stripping every trailing vowel here would turn "go" into "ging" and "ski" into "sking".
    base = stem;
  }
  return new TaggedWord(base + "ing", "VBG");
}
/**
 * Stores a lemma for {@code word} in {@code map} under the annotation key {@code ann}.
 *
 * <p>With an empty tag the value is {@code morpha.stem(word)}. Otherwise the phrasal-verb form is
 * used when {@code phrasalVerb} yields one, and {@code morpha.lemma(word, tag)} when it does not.
 *
 * @param morpha morphology instance used for stemming and lemmatizing
 * @param ann annotation key under which the lemma is stored
 * @param map destination of the lemma
 * @param word surface form to lemmatize
 * @param tag part-of-speech tag; may be empty but not null
 */
private static void addLemma(
    Morphology morpha,
    Class<? extends CoreAnnotation<String>> ann,
    CoreMap map,
    String word,
    String tag) {
  if (tag.isEmpty()) {
    // No tag available: fall back to tag-free stemming.
    map.set(ann, morpha.stem(word));
    return;
  }
  String phrasal = phrasalVerb(morpha, word, tag);
  map.set(ann, phrasal != null ? phrasal : morpha.lemma(word, tag));
}
private static float lexSimilarity(String s1, String s2) { String[] split1 = s1.split("[.?!]"); String[] split2 = s2.split("[.?!]"); Set<String> stemsInFirst = new HashSet<String>(); Set<String> stemsInSecond = new HashSet<String>(); for (int i = 0; i < split1.length; i++) { PTBTokenizer<Word> tokenizer1 = PTBTokenizer.newPTBTokenizer(new StringReader(split1[i])); while (tokenizer1.hasNext()) { Word w = tokenizer1.next(); String stem = m.stem(w).word(); stemsInFirst.add(stem); } } for (int j = 0; j < split2.length; j++) { PTBTokenizer<Word> tokenizer2 = PTBTokenizer.newPTBTokenizer(new StringReader(split2[j])); while (tokenizer2.hasNext()) { Word w = tokenizer2.next(); String stem = m.stem(w).word(); stemsInSecond.add(stem); } } Iterator<String> i = stemsInSecond.iterator(); float commonStems = 0; while (i.hasNext()) { String curStem = i.next(); // System.out.println(curStem); if (stemsInFirst.contains(curStem)) commonStems++; } int secondSize = stemsInSecond.size(); if (secondSize > 0) return commonStems / (float) (secondSize); else return 0; }
/**
 * Populates word, lemma and tag from a divider-separated string.
 *
 * <ul>
 *   <li>No divider: the whole string becomes the word; lemma and tag are set to {@code null}.
 *   <li>One divider ({@code word<divider>tag}): the lemma is computed with
 *       {@code Morphology.lemmaStatic(word, tag)}.
 *   <li>Two or more dividers: the text before the first divider is the word, the text after the
 *       last divider is the tag, and everything in between is the lemma.
 * </ul>
 *
 * @param labelStr the string to parse
 * @param divider the separator between the fields; may be longer than one character
 */
public void setFromString(String labelStr, String divider) {
  int first = labelStr.indexOf(divider);
  int second = labelStr.lastIndexOf(divider);
  // Skip the full divider, not just one character, when stepping past it.
  int dividerLength = divider.length();

  if (first < 0) {
    // Must be tested before first == second: both are -1 when the divider is absent, and
    // substring(0, -1) would throw StringIndexOutOfBoundsException.
    setWord(labelStr);
    setLemma(null);
    setTag(null);
  } else if (first == second) {
    String wordPart = labelStr.substring(0, first);
    String tagPart = labelStr.substring(first + dividerLength);
    setWord(wordPart);
    setTag(tagPart);
    setLemma(Morphology.lemmaStatic(wordPart, tagPart));
  } else {
    setWord(labelStr.substring(0, first));
    setLemma(labelStr.substring(first + dividerLength, second));
    setTag(labelStr.substring(second + dividerLength));
  }
}
/** * If a token is a phrasal verb with an underscore between a verb and a particle, return the * phrasal verb lemmatized. If not, return null */ private static String phrasalVerb(Morphology morpha, String word, String tag) { // must be a verb and contain an underscore assert (word != null); assert (tag != null); if (!tag.startsWith("VB") || !word.contains("_")) return null; // check whether the last part is a particle String[] verb = word.split("_"); if (verb.length != 2) return null; String particle = verb[1]; if (particles.contains(particle)) { String base = verb[0]; String lemma = morpha.lemma(base, tag); return lemma + '_' + particle; } return null; }
public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) { ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>(); String[] punctuationsAndSpecialCharacters = { ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&" }; // , "/", "\\", "<", ">", "#", "&", "*", "(", ")", "{", "}", "[", "]", "~", "|"}; HashMap<String, String> punctuationMarks = new HashMap<String, String>(); for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) { punctuationMarks.put( punctuationsAndSpecialCharacters[i], punctuationsAndSpecialCharacters[i]); } for (int i = 0; i < taggedWords.size(); i++) { String word = taggedWords.get(i).word(); String posTag = taggedWords.get(i).tag(); if (!punctuationMarks.containsKey(word)) { if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) { word = Morphology.lemmaStatic(word, posTag, true); word = word.replace('-', ' '); } String newWord, newPosTag; if (word.equals("n't")) newWord = "not"; else if (word.equals("'s")) newWord = "is"; else if (word.equals("'ll")) newWord = "will"; else if (word.equals("'m") || word.equals("m")) newWord = "am"; else if (word.equals("im")) newWord = "am"; else newWord = word; newPosTag = posTag; newList.add(new TaggedWord(newWord, newPosTag)); } } newList = StopWordRemoval(newList); return newList; }
/**
 * Produces the base form of a verb, tagged {@code VB}.
 *
 * @param verb the verb to convert; only its {@code word()} is read
 * @return a new {@code TaggedWord} holding {@code Morphology.stem} of the word, tagged
 *     {@code "VB"}
 */
public static TaggedWord verbToBaseTense(TaggedWord verb) {
  String baseForm = new Morphology().stem(verb.word());
  return new TaggedWord(baseForm, "VB");
}
/**
 * Creates a {@code WordLemmaTag} from two Labels. The word Label is handed to the
 * single-argument constructor of this class, and the lemma is derived by running
 * {@code Morphology.stemStatic} on a {@code WordTag} built from both Labels.
 *
 * @param word Label forwarded to {@code this(word)}
 * @param tag Label whose {@code value()} becomes the tag
 */
public WordLemmaTag(Label word, Label tag) {
  this(word);
  this.lemma = Morphology.stemStatic(new WordTag(word, tag)).word();
  setTag(tag.value());
}
/**
 * Creates a {@code WordLemmaTag} from a word and its tag. The lemma is derived by running
 * {@code Morphology.stemStatic} on a {@code WordTag} built from the two strings.
 *
 * @param word stored as the word of this Label
 * @param tag stored as the tag of this Label
 */
public WordLemmaTag(String word, String tag) {
  this.word = word;
  this.lemma = Morphology.stemStatic(new WordTag(word, tag)).word();
  setTag(tag);
}