/**
 * Calculate Spearman's rho on the wordSim353 dataset (or any other dataset with similar
 * formatting).
 *
 * @param wordSimPairs word pairs with their human-annotated similarity scores
 * @param wordEmbeddings map from words to their embedding vectors
 * @return Spearman's rank correlation between predicted and annotated similarities
 * @throws Exception
 */
private static double spearmansScore(
    List<Pair<Pair<String, String>, Float>> wordSimPairs,
    HashMap<String, float[]> wordEmbeddings)
    throws Exception {

  final double[] predictions = new double[wordSimPairs.size()];
  final double[] labels = new double[wordSimPairs.size()];
  int pairNum = 0;

  for (Pair<Pair<String, String>, Float> wordPair : wordSimPairs) {
    // Find the cosine of the word embeddings.
    String word1 = wordPair.getFirst().getFirst();
    String word2 = wordPair.getFirst().getSecond();
    if (wordEmbeddings.containsKey(word1) && wordEmbeddings.containsKey(word2)) {
      predictions[pairNum] =
          cosineSimilarity(wordEmbeddings.get(word1), wordEmbeddings.get(word2));
    } else {
      // Unmodelled words have 0.5 similarity.
      predictions[pairNum] = 0.5;
    }
    labels[pairNum] = wordPair.getSecond();
    pairNum++;
  }

  NaturalRanking ranking = new NaturalRanking(NaNStrategy.REMOVED);
  SpearmansCorrelation spearman = new SpearmansCorrelation(ranking);

  return spearman.correlation(predictions, labels);
}
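// cosineSimilarity is called above but not shown in this excerpt. A minimal sketch of what
// such a helper might look like (the actual helper in the full source may differ; the
// signature is inferred from the call site in spearmansScore):
private static double cosineSimilarity(float[] a, float[] b) {
  double dot = 0.0;
  double normA = 0.0;
  double normB = 0.0;
  for (int i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // Guard against zero vectors to avoid dividing by zero.
  if (normA == 0.0 || normB == 0.0) return 0.0;
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}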
/*
 * Reduce the embeddings vocabulary to only the words that will be needed
 * for the word similarity task.
 */
private static HashMap<String, float[]> reduceVocabulary(
    HashMap<String, float[]> embeddings, Set<String> targetVocab) {
  HashMap<String, float[]> prunedEmbeddings = new HashMap<String, float[]>();
  for (String word : targetVocab) {
    if (embeddings.containsKey(word)) {
      prunedEmbeddings.put(word, embeddings.get(word));
    }
  }
  return prunedEmbeddings;
}
/**
 * Read the core WordNet senses and map each to a unique integer. Used by the simple model below.
 */
private static HashMap<String, Integer> getWordNetVocab(String coreWordNetPath) throws Exception {
  HashMap<String, Integer> vocab = new HashMap<String, Integer>();
  BufferedReader reader = new BufferedReader(new FileReader(coreWordNetPath));
  String line = "";
  while ((line = reader.readLine()) != null) {
    String[] parts = line.split(" ");
    String word = parts[2].replace("[", "").replace("]", "");
    vocab.put(word, vocab.size());
  }
  reader.close();
  return vocab;
}
private HashMap<String, float[]> convertEmbeddings(Set<String> targetVocab) {
  // For every string in the vocabulary, get the corresponding column of the
  // output matrix W2 and map the word to an array of floats.
  HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
  for (String word : targetVocab) {
    int wordIndex = encodedVocab.get(word);
    double[] wordEmbedding = W2.getColumn(wordIndex);
    float[] wordEmbeddingFloat = new float[wordEmbedding.length];
    for (int i = 0; i < wordEmbedding.length; i++) {
      wordEmbeddingFloat[i] = (float) wordEmbedding[i];
    }
    embeddingMatrix.put(word, wordEmbeddingFloat);
  }
  return embeddingMatrix;
}
/**
 * A dumb vector space model that counts each word's co-occurrences with a predefined set of
 * content words and uses these co-occurrence vectors directly as word representations. The
 * context in which a word occurs is the set of content words in an entire sentence.
 *
 * <p>N.B. Most people would probably not consider this an embedding model, since the words have
 * not been embedded in a lower dimensional subspace. However, it is a good starting point.
 *
 * <p>Since this approach does not share any information between representations of different
 * words, we can filter the training data to only include sentences that contain words of
 * interest. In other approaches this may not be a good idea.
 *
 * @param dataPath path to the training sentences
 * @param contentVocab content words whose co-occurrence counts form the vector dimensions
 * @param targetVocab words for which representations are built
 * @return map from each target word to its co-occurrence count vector
 */
private static HashMap<String, float[]> getEmbeddings(
    String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

  HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
  for (String target_word : targetVocab) {
    embeddingMatrix.put(target_word, new float[contentVocab.size()]);
  }

  Collection<List<String>> sentenceCollection =
      SentenceCollection.Reader.readSentenceCollection(dataPath);

  for (List<String> sentence : sentenceCollection) {
    // Target words that actually appear in this sentence.
    Set<String> sw = new HashSet<String>(sentence);
    sw.retainAll(targetVocab);
    for (String word : sentence) {
      if (!contentVocab.containsKey(word)) continue;
      int contentWordId = contentVocab.get(word);
      for (String targetWord : sw) {
        embeddingMatrix.get(targetWord)[contentWordId] =
            embeddingMatrix.get(targetWord)[contentWordId] + 1;
      }
    }
  }

  return embeddingMatrix;
}
private Pair<Integer, Set<Integer>> getWordContextPair(
    List<String> sentence, int wordPosition) {

  String centerWord = sentence.get(wordPosition);
  int centerWordIndex = encodedVocab.get(centerWord);
  Set<Integer> contextWordSet = new HashSet<Integer>();

  // Collect the vocabulary indices of the words in a symmetric window of
  // contextSize words on either side of the center word.
  for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
    if (i < 0) continue; // Ignore contexts prior to start of sentence
    if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
    if (i == wordPosition) continue; // Ignore the center word itself

    String contextWord = sentence.get(i);
    int contextWordIndex = encodedVocab.get(contextWord);
    contextWordSet.add(contextWordIndex);
  }
  return Pair.create(centerWordIndex, contextWordSet);
}
/**
 * Write embeddings to a file.
 *
 * @param embeddings map from words to their embedding vectors
 * @param path output file path
 * @param embeddingDim expected dimensionality of every embedding
 * @throws Exception
 */
private static void writeEmbeddings(
    HashMap<String, float[]> embeddings, String path, int embeddingDim) throws Exception {
  BufferedWriter writer = new BufferedWriter(new FileWriter(path));
  writer.write(embeddings.size() + " " + embeddingDim + "\n");
  for (Map.Entry<String, float[]> wordEmbedding : embeddings.entrySet()) {
    String word = wordEmbedding.getKey();
    String embeddingString =
        Arrays.toString(wordEmbedding.getValue())
            .replace(", ", " ")
            .replace("[", "")
            .replace("]", "");
    if (wordEmbedding.getValue().length != embeddingDim) {
      System.out.println("The embedding for " + word + " is not " + embeddingDim + "D.");
      System.exit(0);
    }
    writer.write(word + " " + embeddingString + "\n");
  }
  writer.close();
}
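// The file format produced above (and consumed by readEmbeddings below) follows the
// plain-text word2vec convention: a header line with the vocabulary size and embedding
// dimension, then one word per line followed by its vector components. For example, a
// hypothetical 2-word, 3-dimensional file would look like:
//
//   2 3
//   cat 0.12 -0.40 0.33
//   dog 0.10 -0.38 0.35
//
// (The words and numbers above are illustrative only.)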
/**
 * Read the embedding parameters from a file.
 *
 * @param path path to the embedding file
 * @return map from words to their embedding vectors
 * @throws Exception
 */
private static HashMap<String, float[]> readEmbeddings(String path) throws Exception {
  HashMap<String, float[]> embeddings = new HashMap<String, float[]>();
  BufferedReader reader = new BufferedReader(new FileReader(path));
  String line = "";

  // Read the first line that contains the number of words and the
  // embedding dimension.
  line = reader.readLine().trim();
  String[] parts = line.split("\\s{1,}");
  if (parts.length < 2) {
    System.out.println(
        "Format of embedding file wrong. "
            + "First line should contain the number of words "
            + "and the embedding dimension.");
    System.exit(0);
  }
  int vocab_size = Integer.parseInt(parts[0]);
  int embedding_dim = Integer.parseInt(parts[1]);

  // Read the embeddings.
  int count_lines = 0;
  while ((line = reader.readLine()) != null) {
    // Stop if the file contains more embedding lines than the header promised.
    if (count_lines >= vocab_size) {
      System.out.println("Embedding file has more words than the provided vocab size.");
      System.exit(0);
    }
    parts = line.split("\\s{1,}");
    String word = parts[0];
    float[] emb = new float[embedding_dim];
    for (int e_dim = 0; e_dim < embedding_dim; ++e_dim) {
      emb[e_dim] = Float.parseFloat(parts[e_dim + 1]);
    }
    embeddings.put(word, emb);
    ++count_lines;
  }
  System.out.println("Read " + count_lines + " embeddings of dimension: " + embedding_dim);
  reader.close();
  return embeddings;
}
/**
 * @author jacqueline If boolean sampleUnigram = true, we use noiseSampler from
 *     randomContextGeneration to model the unigram probability distribution raised to a
 *     specified power, default 3/4. Otherwise, use the overloaded
 *     negativeSampleContexts(int wordIndex) method to draw from a uniform probability
 *     distribution.
 */
private Set<Integer> negativeSampleContexts(
    int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
  Set<Integer> negativeContexts = new HashSet<Integer>();
  Set<Integer> positiveContexts = contextPairs.get(wordIndex);
  // Keep sampling until kSamples negatives have been drawn, rejecting any word that is a
  // true (positive) context of this word or has already been drawn.
  while (negativeContexts.size() < kSamples) {
    String possibleContext = weightedRandomSample.sample();
    int contextIndex = encodedVocab.get(possibleContext);
    if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
      negativeContexts.add(contextIndex);
    }
  }
  return negativeContexts;
}
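// The EnumeratedDistribution used above is built elsewhere (randomContextGeneration is not
// shown in this excerpt). A minimal sketch of how such a noise distribution might be
// constructed from raw unigram counts with Apache Commons Math, raising counts to a power
// such as 3/4. The method name buildNoiseSampler and the wordCounts parameter are
// assumptions for illustration; the fully qualified Pair avoids clashing with the Pair
// class used elsewhere in this file.
private EnumeratedDistribution<String> buildNoiseSampler(
    Map<String, Integer> wordCounts, double power) {
  List<org.apache.commons.math3.util.Pair<String, Double>> pmf =
      new ArrayList<org.apache.commons.math3.util.Pair<String, Double>>();
  for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
    // Raise each unigram count to the given power; EnumeratedDistribution normalizes
    // the weights into a probability distribution.
    double weight = Math.pow(entry.getValue(), power);
    pmf.add(new org.apache.commons.math3.util.Pair<String, Double>(entry.getKey(), weight));
  }
  return new EnumeratedDistribution<String>(pmf);
}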
private void setAllContexts() {
  this.contextPairs = new HashMap<Integer, Set<Integer>>();
  for (int wordIndex : encodedVocab.values()) {
    contextPairs.put(wordIndex, new HashSet<Integer>());
  }
  for (List<String> sentence : sentenceCollection) {
    for (int wordPosition = 0; wordPosition < sentence.size(); wordPosition++) {
      Pair<Integer, Set<Integer>> wordPlusContext = getWordContextPair(sentence, wordPosition);
      int wordIndex = wordPlusContext.getFirst();
      (contextPairs.get(wordIndex)).addAll(wordPlusContext.getSecond());
    }
  }
}
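/**
 * Entry point: evaluates (and optionally trains) word embeddings against wordSim353.
 *
 * <p>Example invocations, assuming this class is run as the main class (the class name
 * MainClass and all file paths below are placeholders, not real files):
 *
 * <pre>
 * # Evaluate an existing embedding file:
 * java MainClass -embeddings vectors.txt -wordsim combined.tab
 *
 * # Train skip-gram embeddings and evaluate them in one run:
 * java MainClass -embeddings vectors.txt -wordsim combined.tab -trainandeval \
 *     -trainingdata training_sentences.txt -wordnetdata coreWordNet.txt
 * </pre>
 */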
public static void main(String[] args) throws Exception {

  // Parse command line flags and arguments.
  Map<String, String> argMap = CommandLineUtils.simpleCommandLineParser(args);

  // Read commandline parameters.
  String embeddingPath = "";
  if (!argMap.containsKey("-embeddings")) {
    System.out.println("-embeddings flag required.");
    System.exit(0);
  } else {
    embeddingPath = argMap.get("-embeddings");
  }

  String wordSimPath = "";
  if (!argMap.containsKey("-wordsim")) {
    System.out.println("-wordsim flag required.");
    System.exit(0);
  } else {
    wordSimPath = argMap.get("-wordsim");
  }

  // Read in the labeled similarities and generate the target vocabulary.
  System.out.println("Loading wordsim353 ...");
  List<Pair<Pair<String, String>, Float>> wordSimPairs = readWordSimPairs(wordSimPath);
  Set<String> targetVocab = getWordSimVocab(wordSimPath);

  // It is likely that you will want to generate your embeddings
  // elsewhere. But this supports the option to generate the embeddings
  // and evaluate them in a single loop.
  HashMap<String, float[]> embeddings;
  if (argMap.containsKey("-trainandeval")) {
    // Get some training data.
    String dataPath = "";
    if (!argMap.containsKey("-trainingdata")) {
      System.out.println("-trainingdata flag required with -trainandeval");
      System.exit(0);
    } else {
      dataPath = argMap.get("-trainingdata");
    }

    // Since this simple approach does not do dimensionality reduction
    // on the co-occurrence vectors, we instead control the size of the
    // vectors by only counting co-occurrence with core WordNet senses.
    String wordNetPath = "";
    if (!argMap.containsKey("-wordnetdata")) {
      System.out.println("-wordnetdata flag required with -trainandeval");
      System.exit(0);
    } else {
      wordNetPath = argMap.get("-wordnetdata");
    }
    // HashMap<String, Integer> contentWordVocab = getWordNetVocab(wordNetPath);

    System.out.println("Training embeddings on " + dataPath + " ...");

    // embeddings = getEmbeddings(dataPath, contentWordVocab, targetVocab);
    int kSamples = 5;
    int dimensions = 100;
    int contextSize = 2;

    WordSim skipgram = new WordSim(dataPath, kSamples, dimensions, contextSize);
    embeddings = skipgram.getEmbeddings(targetVocab);

    // Keep only the words that are needed.
    System.out.println("Writing embeddings to " + embeddingPath + " ...");

    // embeddings = reduceVocabulary(embeddings, targetVocab);
    // writeEmbeddings(embeddings, embeddingPath, contentVocab.size());
    writeEmbeddings(embeddings, embeddingPath, dimensions);
  } else {
    // Read in embeddings.
    System.out.println("Loading embeddings ...");
    embeddings = readEmbeddings(embeddingPath);

    // Keep only the words that are needed.
    System.out.println(
        "Writing reduced vocabulary embeddings to " + embeddingPath + ".reduced ...");
    embeddings = reduceVocabulary(embeddings, targetVocab);
    writeEmbeddings(
        embeddings, embeddingPath + ".reduced", embeddings.values().iterator().next().length);
  }

  // Prune the embeddings to the target vocabulary before scoring.
  embeddings = reduceVocabulary(embeddings, targetVocab);

  double score = spearmansScore(wordSimPairs, embeddings);
  System.out.println("Score is " + score);
}
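// readWordSimPairs and getWordSimVocab are called in main but are not shown in this excerpt.
// A minimal sketch of what a reader like readWordSimPairs might look like, assuming the
// wordSim353 file is tab-separated with a header line and rows of the form
// "word1<TAB>word2<TAB>score" (the exact file format is an assumption, hence the Sketch name):
private static List<Pair<Pair<String, String>, Float>> readWordSimPairsSketch(String path)
    throws Exception {
  List<Pair<Pair<String, String>, Float>> pairs =
      new ArrayList<Pair<Pair<String, String>, Float>>();
  BufferedReader reader = new BufferedReader(new FileReader(path));
  reader.readLine(); // Skip the header line.
  String line;
  while ((line = reader.readLine()) != null) {
    String[] parts = line.trim().split("\\t");
    if (parts.length < 3) continue; // Skip malformed rows.
    Pair<String, String> words = Pair.create(parts[0], parts[1]);
    pairs.add(Pair.create(words, Float.parseFloat(parts[2])));
  }
  reader.close();
  return pairs;
}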
/**
 * @author jacqueline
 *     <p>Take a vocabulary, return a HashMap that maps each word in the vocabulary to a unique
 *     integer. This integer is the index of the non-zero value in the one-hot vector of size V.
 */
private void encodeVocabulary() {
  encodedVocab = new HashMap<String, Integer>();
  for (String word : vocabulary) {
    encodedVocab.put(word, encodedVocab.size());
  }
}
private Set<Integer> negativeSampleContexts(
    String word, EnumeratedDistribution<String> weightedRandomSample) {
  int wordIndex = encodedVocab.get(word);
  return negativeSampleContexts(wordIndex, weightedRandomSample);
}
private Set<Integer> negativeSampleContexts(String word) {
  int wordIndex = encodedVocab.get(word);
  return negativeSampleContexts(wordIndex);
}
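// The uniform-distribution overload negativeSampleContexts(int wordIndex) referenced above is
// not shown in this excerpt. A minimal sketch of what it might look like, assuming negatives
// are drawn uniformly over vocabulary indices with java.util.Random (the Sketch suffix marks
// this as an illustration, not the actual implementation):
private Set<Integer> negativeSampleContextsUniformSketch(int wordIndex) {
  Set<Integer> negativeContexts = new HashSet<Integer>();
  Set<Integer> positiveContexts = contextPairs.get(wordIndex);
  Random rng = new Random();
  while (negativeContexts.size() < kSamples) {
    // encodeVocabulary assigns indices densely from 0 to V-1, so a uniform draw over
    // [0, V) is a uniform draw over the vocabulary.
    int candidate = rng.nextInt(encodedVocab.size());
    if (!positiveContexts.contains(candidate) && !negativeContexts.contains(candidate)) {
      negativeContexts.add(candidate);
    }
  }
  return negativeContexts;
}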