/**
 * Builds {@code contextPairs}: for every encoded word index, the set of all
 * context word indices observed around its occurrences in
 * {@code sentenceCollection}, as produced by {@code getWordContextPair}.
 */
private void setAllContexts() {
  this.contextPairs = new HashMap<>();
  // Pre-register every known word index with an empty context set so that
  // words with no observed contexts still appear in the map.
  for (int wordIndex : encodedVocab.values()) {
    contextPairs.put(wordIndex, new HashSet<>());
  }
  for (List<String> sentence : sentenceCollection) {
    for (int wordPosition = 0; wordPosition < sentence.size(); wordPosition++) {
      Pair<Integer, Set<Integer>> wordPlusContext = getWordContextPair(sentence, wordPosition);
      int wordIndex = wordPlusContext.getFirst();
      // computeIfAbsent guards against word indices that were never
      // registered in encodedVocab (the original would NPE on addAll here).
      contextPairs
          .computeIfAbsent(wordIndex, k -> new HashSet<>())
          .addAll(wordPlusContext.getSecond());
    }
  }
}
public static void main(String[] args) throws Exception { // Parse command line flags and arguments. Map<String, String> argMap = CommandLineUtils.simpleCommandLineParser(args); // Read commandline parameters. String embeddingPath = ""; if (!argMap.containsKey("-embeddings")) { System.out.println("-embeddings flag required."); System.exit(0); } else { embeddingPath = argMap.get("-embeddings"); } String wordSimPath = ""; if (!argMap.containsKey("-wordsim")) { System.out.println("-wordsim flag required."); System.exit(0); } else { wordSimPath = argMap.get("-wordsim"); } // Read in the labeled similarities and generate the target vocabulary. System.out.println("Loading wordsim353 ..."); List<Pair<Pair<String, String>, Float>> wordSimPairs = readWordSimPairs(wordSimPath); Set<String> targetVocab = getWordSimVocab(wordSimPath); // It is likely that you will want to generate your embeddings // elsewhere. But this supports the option to generate the embeddings // and evaluate them in a single loop. HashMap<String, float[]> embeddings; if (argMap.containsKey("-trainandeval")) { // Get some training data. String dataPath = ""; if (!argMap.containsKey("-trainingdata")) { System.out.println("-trainingdata flag required with -trainandeval"); System.exit(0); } else { dataPath = argMap.get("-trainingdata"); } // Since this simple approach does not do dimensionality reduction // on the co-occurrence vectors, we instead control the size of the // vectors by only counting co-occurrence with core WordNet senses. 
String wordNetPath = ""; if (!argMap.containsKey("-wordnetdata")) { System.out.println("-wordnetdata flag required with -trainandeval"); System.exit(0); } else { wordNetPath = argMap.get("-wordnetdata"); } // HashMap<String, Integer> contentWordVocab = getWordNetVocab(wordNetPath); System.out.println("Training embeddings on " + dataPath + " ..."); // embeddings = getEmbeddings(dataPath, contentWordVocab, targetVocab); int kSamples = 5; int dimensions = 100; int contextSize = 2; WordSim skipgram = new WordSim(dataPath, kSamples, dimensions, contextSize); embeddings = skipgram.getEmbeddings(targetVocab); // Keep only the words that are needed. System.out.println("Writing embeddings to " + embeddingPath + " ..."); // embeddings = reduceVocabulary(embeddings, targetVocab); // writeEmbeddings(embeddings, embeddingPath, contentVocab.size()); writeEmbeddings(embeddings, embeddingPath, dimensions); } else { // Read in embeddings. System.out.println("Loading embeddings ..."); embeddings = readEmbeddings(embeddingPath); // Keep only the words that are needed. System.out.println( "Writing reduced vocabulary embeddings to " + embeddingPath + ".reduced ..."); embeddings = reduceVocabulary(embeddings, targetVocab); writeEmbeddings( embeddings, embeddingPath + ".reduced", embeddings.values().iterator().next().length); } reduceVocabulary(embeddings, targetVocab); double score = spearmansScore(wordSimPairs, embeddings); System.out.println("Score is " + score); }