/**
 * Draws {@code kSamples} distinct negative context indices for the given word by
 * sampling uniformly at random over the vocabulary, rejecting any candidate that is
 * an observed (positive) context for the word or has already been drawn.
 *
 * @param wordIndex encoding index of the target word
 * @return a set of exactly {@code kSamples} distinct negative context indices
 */
private Set<Integer> negativeSampleContexts(int wordIndex) {
  Set<Integer> negativeContexts = new HashSet<Integer>();
  Set<Integer> positiveContexts = contextPairs.get(wordIndex);
  while (negativeContexts.size() < kSamples) {
    int contextIndex = (int) (Math.random() * V);
    // BUG FIX: the original tested negativeContexts.contains(contextIndex) without
    // negation; since the set starts empty the condition could never hold, so no
    // element was ever added and the while-loop never terminated. The overload taking
    // an EnumeratedDistribution uses the correct negated form.
    if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
      negativeContexts.add(contextIndex);
    }
  }
  return negativeContexts;
}
/**
 * Draws {@code kSamples} distinct negative context indices for the given word from
 * the supplied weighted sampler. When {@code sampleUnigram} is true, the caller
 * passes the noiseSampler built by randomContextGeneration, which models the unigram
 * distribution raised to the configured power (default 3/4). To draw from a uniform
 * distribution instead, use the overloaded negativeSampleContexts(int wordIndex).
 *
 * @author jacqueline
 * @param wordIndex encoding index of the target word
 * @param weightedRandomSample noise distribution over vocabulary words to sample from
 * @return a set of exactly {@code kSamples} distinct negative context indices
 */
private Set<Integer> negativeSampleContexts(
    int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
  Set<Integer> sampled = new HashSet<Integer>();
  Set<Integer> observed = contextPairs.get(wordIndex);
  while (sampled.size() < kSamples) {
    // Draw a candidate word from the noise distribution and map it to its index.
    int candidate = encodedVocab.get(weightedRandomSample.sample());
    if (observed.contains(candidate) || sampled.contains(candidate)) {
      continue; // reject positive contexts and duplicates
    }
    sampled.add(candidate);
  }
  return sampled;
}
/**
 * Constructs a WordSim trainer: records all hyperparameters and variant flags,
 * loads the sentence collection from disk, indexes the vocabulary, precomputes the
 * observed positive contexts for every word, and optionally builds the weighted
 * noise sampler for unigram-based negative sampling.
 *
 * @param dataPath path of the sentence collection to read
 * @param kSamples number of negative samples drawn per word
 * @param dimensions dimensionality of the word vectors (assumed — confirm usage)
 * @param contextSize size of the context window (assumed — confirm usage)
 * @param power exponent applied to the unigram distribution for noise sampling
 * @param alpha initial learning rate (assumed — confirm usage)
 * @param min_eta lower bound on the learning rate (assumed — confirm usage)
 * @param sigma presumably a scale for random initialization — confirm usage
 * @param epochs number of training passes
 * @param skipGram true to train skip-gram rather than the alternative model
 * @param negativeSampling true to train with negative sampling
 * @param sampleUnigram true to sample noise from the weighted unigram distribution
 *     built by randomContextGeneration; false leaves noiseSampler null and callers
 *     fall back to uniform sampling
 * @param learningDecay decay factor for the learning rate (assumed — confirm usage)
 */
public WordSim(
    String dataPath,
    int kSamples,
    int dimensions,
    int contextSize,
    double power,
    double alpha,
    double min_eta,
    double sigma,
    int epochs,
    boolean skipGram,
    boolean negativeSampling,
    boolean sampleUnigram,
    double learningDecay) {
  // Hyperparameters.
  this.kSamples = kSamples;
  this.dimensions = dimensions;
  this.contextSize = contextSize;
  this.power = power;
  this.alpha = alpha;
  this.min_eta = min_eta;
  this.sigma = sigma;
  this.epochs = epochs;
  this.learningDecay = learningDecay;
  // Model-variant flags.
  this.skipGram = skipGram;
  this.negativeSampling = negativeSampling;
  this.sampleUnigram = sampleUnigram;
  // Load the data and derive the vocabulary structures from it.
  this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);
  this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
  encodeVocabulary(); // build the one-hot encoding index for every vocabulary word
  this.V = vocabulary.size(); // cardinality of the vocabulary
  setAllContexts(); // map each word to its observed positive contexts
  if (sampleUnigram) {
    randomContextGeneration(); // build the weighted random sampler (noise distribution)
  } else {
    noiseSampler = null;
  }
}