/**
 * Writes a plain-text corpus intended for word2vec training.
 *
 * <p>Every file found in the directory named by the {@code corpusDir} property is deserialized
 * as an {@code Abstract}. Each of its sentences is tokenized, POS-tagged with the model named by
 * the {@code pos_tagger} property, passed through {@code Morphology.stem}, and then written as
 * one output line: every token is run through {@code NNADE.wordPreprocess} and followed by a
 * single space.
 *
 * @param properties configuration supplying the {@code corpusDir} and {@code pos_tagger} entries
 * @param parameters forwarded to {@code NNADE.wordPreprocess}; its {@code wordPreprocess} and
 *     {@code embeddingSize} fields are echoed to stdout when the corpus is complete
 * @param output path of the UTF-8 corpus file to create
 * @throws IllegalArgumentException if {@code corpusDir} cannot be listed as a directory
 * @throws Exception on any I/O, deserialization or tagging failure
 */
public static void prepareWord2VecCorpus(
        Properties properties, Parameters parameters, String output) throws Exception {
    File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));

    // listFiles() yields null (not an empty array) when the path is not a directory or cannot
    // be read. Check up front so we fail with a clear message before creating the output file.
    File[] abstractFiles = fAbstractDir.listFiles();
    if (abstractFiles == null) {
        throw new IllegalArgumentException(
                "corpusDir is not a readable directory: " + fAbstractDir.getPath());
    }

    File fOutputFile = new File(output);
    Tokenizer tokenizer = new Tokenizer(true, ' ');
    MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
    Morphology morphology = new Morphology();

    // try-with-resources guarantees the writer is flushed and closed even when reading,
    // tagging or writing one of the abstracts throws.
    try (OutputStreamWriter osw =
            new OutputStreamWriter(new FileOutputStream(fOutputFile), "utf-8")) {
        for (File abstractFile : abstractFiles) {
            Abstract ab = (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());
            for (ADESentence sentence : ab.sentences) {
                List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
                tagger.tagCoreLabels(tokens);
                for (CoreLabel token : tokens) {
                    morphology.stem(token);
                }
                for (CoreLabel token : tokens) {
                    String temp = NNADE.wordPreprocess(token, parameters);
                    osw.write(temp + " ");
                }
                // One sentence per line.
                osw.write("\n");
            }
        }
    }

    System.out.println("Generate a word2vec corpus.");
    System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
    System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
}
static void findClosestWord(String w) throws Exception { NNADE nnade = (NNADE) ObjectSerializer.readObjectFromFile("E:\\ade\\nnade.ser0"); ArrayList<String> words = new ArrayList<>(); ArrayList<Matrix> vectors = new ArrayList<>(); Matrix wVector = null; for (String word : nnade.knownWords) { int index = nnade.wordIDs.get(word); if (index == 0 || index == 1) { System.out.println(word); continue; } if (w.equals(word)) { wVector = new Matrix(1, 50, nnade.E[index]); continue; } words.add(word); vectors.add(new Matrix(1, 50, nnade.E[index])); } // normalize /*for(int i=0;i<vectors.size();i++) Normalizer.doVectorNormalizing(vectors.get(i));*/ int max = 100; ArrayList<Matrix> mat = new ArrayList<>(); ArrayList<String> word = new ArrayList<>(); for (int i = 0; i < vectors.size(); i++) { double currentDist = Matrix.distanceEuclidean(vectors.get(i), wVector); boolean inserted = false; for (int j = 0; j < mat.size(); j++) { double oldDist = Matrix.distanceEuclidean(mat.get(j), wVector); if (currentDist < oldDist) { mat.add(j, vectors.get(i)); word.add(j, words.get(i)); inserted = true; break; } } if (inserted == false) { mat.add(vectors.get(i)); word.add(words.get(i)); } if (mat.size() > max) { mat.remove(mat.size() - 1); word.remove(word.size() - 1); } } // dump result for (String t : word) { System.out.print(t + " "); } }
static void kmeansWord(Tool tool, Parameters parameters, List<Abstract> abs) throws Exception { NNADE nnade = (NNADE) ObjectSerializer.readObjectFromFile("E:\\ade\\nnade.ser0"); HashSet<String> results = new HashSet<String>( Arrays.asList( new String[] { "induce", "clozapine", "mtx", "heparin", "colchicine", "methotrexate", "thrombocytopenia", "glaucoma", "myopathy", "thyrotoxicosis" })); /*HashSet<String> results = new HashSet<String>(); HashSet<String> triggers = new HashSet<String>(Arrays.asList(new String[] {"induce", "associate", "relate", "cause", "develop", "produce", "after", "follow", "result"})); nnade.nn.preCompute(); for(Abstract ab:abs) { for(ADESentence gold:ab.sentences) { List<CoreLabel> tokens = nnade.prepareNLPInfo(tool, gold); ADESentence predicted = null; predicted = nnade.decode(tokens, tool); for(RelationEntity relation:predicted.relaitons) { Entity former = relation.getFormer(); if(former.start!=former.end) continue; Entity latter = relation.getLatter(); if(latter.start!=latter.end) continue; for(int i=former.end+1;i<=latter.start-1;i++) { if(triggers.contains(tokens.get(i).lemma().toLowerCase())) { results.add(tokens.get(former.start).lemma().toLowerCase()); results.add(tokens.get(latter.start).lemma().toLowerCase()); results.add(tokens.get(i).lemma().toLowerCase()); } } } } }*/ ArrayList<String> words = new ArrayList<>(); ArrayList<Matrix> vectors = new ArrayList<>(); for (String word : results) { int index = nnade.wordIDs.get(word); if (index == 0 || index == 1) { System.out.println(word); continue; } words.add(word); vectors.add(new Matrix(1, 50, nnade.E[index])); } int k = 2; // normalize for (int i = 0; i < vectors.size(); i++) Normalizer.doVectorNormalizing(vectors.get(i)); // do Buckshot Buckshot bs = new Buckshot(k, vectors); ArrayList<Matrix> centroids = bs.doBuckshot(); // do KMeans KMeans mk = new KMeans(k, vectors, centroids, 1000); mk.getResults(); OutputStreamWriter osw1 = new OutputStreamWriter(new 
FileOutputStream("E:\\ade\\kmeans.txt"), "utf-8"); // dump result, i denotes class, j denotes word for (int i = 0; i < k; i++) { osw1.write("##the " + i + " class :\n"); int line = 0; for (int j = 0; j < mk.vectors2classes.length; j++) { if (i == mk.vectors2classes[j]) { osw1.write(words.get(j) + " "); line++; if (line == 10) { osw1.write("\n"); line = 0; } } } osw1.write("\n\n"); } osw1.close(); int max = 10; ArrayList<ArrayList<Matrix>> outMat = new ArrayList<ArrayList<Matrix>>(); ArrayList<ArrayList<String>> outWord = new ArrayList<ArrayList<String>>(); for (int i = 0; i < k; i++) { outMat.add(new ArrayList<Matrix>()); outWord.add(new ArrayList<String>()); } for (int i = 0; i < mk.vectors2classes.length; i++) { ArrayList<Matrix> mat = outMat.get(mk.vectors2classes[i]); ArrayList<String> word = outWord.get(mk.vectors2classes[i]); double currentDist = mk.similarity(vectors.get(i), mk.centroids.get(mk.vectors2classes[i])); boolean inserted = false; for (int j = 0; j < mat.size(); j++) { double oldDist = mk.similarity(mat.get(j), mk.centroids.get(mk.vectors2classes[i])); if (currentDist < oldDist) { mat.add(j, vectors.get(i)); word.add(j, words.get(i)); inserted = true; break; } } if (inserted == false) { mat.add(vectors.get(i)); word.add(words.get(i)); } if (mat.size() > max) { mat.remove(mat.size() - 1); word.remove(word.size() - 1); } } // dump result, i denotes class, j denotes word for (int i = 0; i < outWord.size(); i++) { ArrayList<String> word = outWord.get(i); System.out.print("##the " + (i + 1) + " class :\n"); for (String w : word) { System.out.print(w + " "); } System.out.println(); } }