/**
 * Get all of the words in the evaluation dataset.
 *
 * @param path path to the word-similarity CSV file
 * @return the set of words appearing in the evaluation word pairs
 * @throws Exception if the file cannot be read
 */
private static Set<String> getWordSimVocab(String path) throws Exception {
    Set<String> vocab = new HashSet<String>();
    BufferedReader reader = new BufferedReader(new FileReader(path));

    // Read the first line, which contains the column keys.
    String line = reader.readLine();
    String[] keys = line.split(",");
    if (keys.length != 3) {
        System.out.println(
            "There should be two words per line "
                + "and a single score for each of these word "
                + "pairs. We just saw: " + line);
        System.exit(0);
    }

    while ((line = reader.readLine()) != null) {
        String[] parts = line.split(",");
        if (parts.length != 3) {
            System.out.println(
                "WordSim line: " + line + " should contain two words and a score.");
            System.exit(0);
        }
        String word1 = parts[0];
        String word2 = parts[1];
        vocab.add(word1);
        vocab.add(word2);
    }
    reader.close();
    return vocab;
}
private Set<Integer> negativeSampleContexts(int wordIndex) {
    Set<Integer> negativeContexts = new HashSet<Integer>();
    Set<Integer> positiveContexts = contextPairs.get(wordIndex);
    while (negativeContexts.size() < kSamples) {
        // Draw a candidate context index uniformly at random from the vocabulary.
        int contextIndex = (int) (Math.random() * V);
        // Keep it only if it is not a true context and has not already been sampled.
        if (!positiveContexts.contains(contextIndex)
                && !negativeContexts.contains(contextIndex)) {
            negativeContexts.add(contextIndex);
        }
    }
    return negativeContexts;
}
/**
 * If boolean sampleUnigram = true, we use noiseSampler from randomContextGeneration to model
 * the unigram probability distribution raised to a specified power, 3/4 by default. Otherwise,
 * the overloaded negativeSampleContexts(int wordIndex) method is used to draw from the uniform
 * probability distribution.
 *
 * @author jacqueline
 */
private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
    Set<Integer> negativeContexts = new HashSet<Integer>();
    Set<Integer> positiveContexts = contextPairs.get(wordIndex);
    while (negativeContexts.size() < kSamples) {
        // Draw a candidate context word from the weighted (unigram^power) distribution.
        String possibleContext = weightedRandomSample.sample();
        int contextIndex = encodedVocab.get(possibleContext);
        if (!positiveContexts.contains(contextIndex)
                && !negativeContexts.contains(contextIndex)) {
            negativeContexts.add(contextIndex);
        }
    }
    return negativeContexts;
}
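/*
 * Sketch (not part of the original class): one possible way to construct the weighted sampler
 * passed to negativeSampleContexts above. It assumes org.apache.commons.math3's
 * EnumeratedDistribution and Pair (which the surrounding code appears to use) plus a
 * hypothetical wordCounts map from word to raw unigram count. Each word is weighted by
 * count^power (3/4 by default); EnumeratedDistribution normalizes the weights internally.
 */
private EnumeratedDistribution<String> buildNoiseSampler(
        Map<String, Integer> wordCounts, double power) {
    List<Pair<String, Double>> pmf = new ArrayList<Pair<String, Double>>();
    for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
        // Weight each word by its unigram count raised to the given power.
        pmf.add(new Pair<String, Double>(entry.getKey(), Math.pow(entry.getValue(), power)));
    }
    return new EnumeratedDistribution<String>(pmf);
}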
private Pair<Integer, Set<Integer>> getWordContextPair(
        List<String> sentence, int wordPosition) {

    String centerWord = sentence.get(wordPosition);
    int centerWordIndex = encodedVocab.get(centerWord);
    Set<Integer> contextWordSet = new HashSet<Integer>();

    for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
        if (i < 0) continue; // Ignore contexts prior to start of sentence
        if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
        if (i == wordPosition) continue; // Ignore the center word itself

        String contextWord = sentence.get(i);
        int contextWordIndex = encodedVocab.get(contextWord);
        contextWordSet.add(contextWordIndex);
    }
    return Pair.create(centerWordIndex, contextWordSet);
}
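/*
 * Sketch (not in the original source): how getWordContextPair might be used to populate the
 * contextPairs map consulted by the negative-sampling methods above. The sentences parameter,
 * the buildContextPairs name, and the merging strategy are assumptions; the original training
 * loop may differ. contextPairs is assumed to be a Map<Integer, Set<Integer>> from center-word
 * index to the set of observed context-word indices.
 */
private void buildContextPairs(List<List<String>> sentences) {
    for (List<String> sentence : sentences) {
        for (int position = 0; position < sentence.size(); position++) {
            Pair<Integer, Set<Integer>> pair = getWordContextPair(sentence, position);
            int centerWordIndex = pair.getFirst();
            if (!contextPairs.containsKey(centerWordIndex)) {
                contextPairs.put(centerWordIndex, new HashSet<Integer>());
            }
            // Accumulate every observed context for this center word across the corpus.
            contextPairs.get(centerWordIndex).addAll(pair.getSecond());
        }
    }
}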