/**
 * Get all of the words in the evaluation dataset.
 *
 * @param path path to the comma-separated word-similarity file
 * @return the set of all words that appear in the evaluation pairs
 * @throws Exception if the file cannot be read
 */
private static Set<String> getWordSimVocab(String path) throws Exception {
    Set<String> vocab = new HashSet<String>();
    BufferedReader reader = new BufferedReader(new FileReader(path));
    // Read the first line, which contains the column keys.
    String line = reader.readLine();
    String[] keys = line.split(",");
    if (keys.length != 3) {
        System.out.println(
            "There should be two words per line "
                + "and a single score for each of these word "
                + "pairs. We just saw, "
                + line);
        System.exit(0);
    }
    while ((line = reader.readLine()) != null) {
        String[] parts = line.split(",");
        if (parts.length != 3) {
            System.out.println(
                "WordSim line: " + line + " should contain two words and a score.");
            System.exit(0);
        }
        String word1 = parts[0];
        String word2 = parts[1];
        vocab.add(word1);
        vocab.add(word2);
    }
    reader.close();
    return vocab;
}
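/*
 * A hypothetical companion sketch (not in the original): it reads the same CSV format that
 * getWordSimVocab validates (a header line, then one "word1,word2,score" triple per line) and
 * keeps the gold scores, which an evaluation step could compare against model similarities.
 * The method name readGoldScores and the List<String> key type are assumptions for
 * illustration; it assumes the usual java.util imports plus java.util.Arrays.
 */
private static Map<List<String>, Double> readGoldScores(String path) throws Exception {
    Map<List<String>, Double> gold = new HashMap<List<String>, Double>();
    BufferedReader reader = new BufferedReader(new FileReader(path));
    reader.readLine(); // skip the header line, as in getWordSimVocab
    String line;
    while ((line = reader.readLine()) != null) {
        String[] parts = line.split(",");
        gold.put(Arrays.asList(parts[0], parts[1]), Double.parseDouble(parts[2]));
    }
    reader.close();
    return gold;
}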
/**
 * A dumb vector space model that counts each word's co-occurrences with a predefined set of
 * content words and uses these co-occurrence vectors directly as word representations. The
 * context in which a word occurs is the set of content words in an entire sentence.
 *
 * <p>N.B. Most people would probably not consider this an embedding model, since the words have
 * not been embedded in a lower dimensional subspace. However, it is a good starting point.
 *
 * <p>Since this approach does not share any information between representations of different
 * words, we can filter the training data to only include sentences that contain words of
 * interest. In other approaches this may not be a good idea.
 *
 * @param dataPath path to the training sentences
 * @param contentVocab content words, mapped to their dimension in the count vectors
 * @param targetVocab words we want representations for
 * @return a map from each target word to its co-occurrence count vector
 */
private static HashMap<String, float[]> getEmbeddings(
        String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

    HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
    for (String targetWord : targetVocab) {
        embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
    }

    Collection<List<String>> sentenceCollection =
        SentenceCollection.Reader.readSentenceCollection(dataPath);

    for (List<String> sentence : sentenceCollection) {
        // Target words that occur in this sentence.
        Set<String> targetsInSentence = new HashSet<String>(sentence);
        targetsInSentence.retainAll(targetVocab);
        for (String word : sentence) {
            if (!contentVocab.containsKey(word)) continue;
            int contentWordId = contentVocab.get(word);
            for (String targetWord : targetsInSentence) {
                embeddingMatrix.get(targetWord)[contentWordId] += 1;
            }
        }
    }
    return embeddingMatrix;
}
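/*
 * A minimal sketch (not in the original) of how these count vectors could be compared: cosine
 * similarity between two rows of the embedding matrix, the standard way word-similarity
 * benchmarks score a vector space model. The method name cosineSimilarity is an assumption for
 * illustration.
 */
private static double cosineSimilarity(float[] a, float[] b) {
    double dot = 0.0, normA = 0.0, normB = 0.0;
    for (int i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    if (normA == 0.0 || normB == 0.0) return 0.0; // guard against all-zero count vectors
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}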
/**
 * Extracts the vocabulary index of the center word and the vocabulary indices of the context
 * words within a window of contextSize tokens on either side of wordPosition.
 */
private Pair<Integer, Set<Integer>> getWordContextPair(
        List<String> sentence, int wordPosition) {

    String centerWord = sentence.get(wordPosition);
    int centerWordIndex = encodedVocab.get(centerWord);
    Set<Integer> contextWordSet = new HashSet<Integer>();

    // Use <= so the window is symmetric: contextSize positions on each side of the center.
    for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
        if (i < 0) continue; // Ignore contexts prior to start of sentence
        if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
        if (i == wordPosition) continue; // Ignore the center word itself (compare sentence positions, not vocabulary indices)
        String contextWord = sentence.get(i);
        int contextWordIndex = encodedVocab.get(contextWord);
        contextWordSet.add(contextWordIndex);
    }
    return Pair.create(centerWordIndex, contextWordSet);
}
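/*
 * Worked example (illustration only): for the sentence ["the", "cat", "sat", "on", "the",
 * "mat"] with wordPosition = 2 ("sat") and contextSize = 2, the window covers positions 0..4
 * minus the center, so the context tokens are "the", "cat", "on", "the". Because the contexts
 * are collected in a Set of vocabulary indices, the duplicate "the" collapses, and the returned
 * pair is (encodedVocab.get("sat"), {indices of "the", "cat", "on"}).
 */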
/** Draws kSamples negative context indices uniformly at random from the vocabulary. */
private Set<Integer> negativeSampleContexts(int wordIndex) {
    Set<Integer> negativeContexts = new HashSet<Integer>();
    Set<Integer> positiveContexts = contextPairs.get(wordIndex);
    while (negativeContexts.size() < kSamples) {
        int contextIndex = (int) (Math.random() * V);
        // Reject candidates that are true contexts or have already been sampled. Note the
        // negation on the second test: without it nothing is ever added and the loop never ends.
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
            negativeContexts.add(contextIndex);
        }
    }
    return negativeContexts;
}
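/*
 * Design note (not in the original): this rejection loop assumes kSamples is small relative to
 * the vocabulary size V. If a word's positive-context set plus kSamples ever approached V, the
 * uniform sampler would reject most draws and the loop could take many iterations to terminate.
 */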
/**
 * If boolean sampleUnigram = true, we use noiseSampler from randomContextGeneration to model the
 * unigram probability distribution raised to a specified power, default 3/4. Otherwise, use the
 * overloaded negativeSampleContexts(int wordIndex) method to draw from a uniform probability
 * distribution.
 *
 * @author jacqueline
 */
private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
    Set<Integer> negativeContexts = new HashSet<Integer>();
    Set<Integer> positiveContexts = contextPairs.get(wordIndex);
    while (negativeContexts.size() < kSamples) {
        String possibleContext = weightedRandomSample.sample();
        int contextIndex = encodedVocab.get(possibleContext);
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
            negativeContexts.add(contextIndex);
        }
    }
    return negativeContexts;
}
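/*
 * A minimal sketch (not in the original) of how randomContextGeneration might build the
 * noiseSampler sampled from above: unigram counts raised to `power` (default 3/4), handed to
 * Apache Commons Math's EnumeratedDistribution, whose constructor normalizes the weights into a
 * probability distribution. The local count map and this method name are assumptions for
 * illustration; the fully qualified Pair avoids clashing with the Pair class used elsewhere.
 */
private void randomContextGenerationSketch() {
    Map<String, Integer> unigramCounts = new HashMap<String, Integer>();
    for (List<String> sentence : sentenceCollection) {
        for (String word : sentence) {
            Integer count = unigramCounts.get(word);
            unigramCounts.put(word, count == null ? 1 : count + 1);
        }
    }
    List<org.apache.commons.math3.util.Pair<String, Double>> weights =
        new ArrayList<org.apache.commons.math3.util.Pair<String, Double>>();
    for (Map.Entry<String, Integer> entry : unigramCounts.entrySet()) {
        weights.add(new org.apache.commons.math3.util.Pair<String, Double>(
            entry.getKey(), Math.pow(entry.getValue(), power)));
    }
    noiseSampler = new EnumeratedDistribution<String>(weights);
}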
public WordSim(
        String dataPath,
        int kSamples,
        int dimensions,
        int contextSize,
        double power,
        double alpha,
        double min_eta,
        double sigma,
        int epochs,
        boolean skipGram,
        boolean negativeSampling,
        boolean sampleUnigram,
        double learningDecay) {

    this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);
    this.kSamples = kSamples;
    this.dimensions = dimensions;
    this.contextSize = contextSize;
    this.power = power;
    this.alpha = alpha;
    this.min_eta = min_eta;
    this.sigma = sigma;
    this.epochs = epochs;
    this.skipGram = skipGram;
    this.negativeSampling = negativeSampling;
    this.sampleUnigram = sampleUnigram;
    this.learningDecay = learningDecay;

    this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
    encodeVocabulary(); // create one-hot encoding index for all words in vocabulary
    this.V = vocabulary.size(); // cardinality of vocabulary
    setAllContexts(); // create HashMap of all observed positive contexts for each word
    if (sampleUnigram) {
        randomContextGeneration(); // create weighted random sampler for noise distribution
    } else {
        noiseSampler = null;
    }
}
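/*
 * A hypothetical usage sketch (not in the original): constructing a skip-gram model with
 * negative sampling and unigram^(3/4) noise. Every value below, the data path, and the
 * parameter interpretations in the comments are illustrative assumptions inferred from the
 * parameter names, not defaults taken from the original code.
 */
public static void main(String[] args) {
    WordSim model =
        new WordSim(
            "data/training_sentences", // dataPath (illustrative path)
            5,     // kSamples: negative samples drawn per positive context
            100,   // dimensions of the learned embeddings
            2,     // contextSize: window of 2 words on each side of the center
            0.75,  // power applied to the unigram noise distribution
            0.025, // alpha: initial learning rate
            1e-4,  // min_eta: floor for the decayed learning rate
            0.01,  // sigma: scale for random weight initialization
            5,     // epochs over the training sentences
            true,  // skipGram
            true,  // negativeSampling
            true,  // sampleUnigram: use the weighted noise sampler
            0.9);  // learningDecay
}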