/**
 * A dumb vector space model that counts each word's co-occurrences with a predefined set of
 * content words and uses these co-occurrence vectors directly as word representations. The
 * context in which a word occurs is the set of content words in an entire sentence.
 *
 * <p>N.B. Most people would probably not consider this an embedding model, since the words have
 * not been embedded in a lower dimensional subspace. However, it is a good starting point.
 *
 * <p>Since this approach does not share any information between representations of different
 * words, we can filter the training data to only include sentences that contain words of
 * interest. In other approaches this may not be a good idea.
 *
 * @param dataPath path to the sentence collection read via {@code SentenceCollection.Reader}
 * @param contentVocab maps each content word to its dimension (index) in the count vectors
 * @param targetVocab the set of words to build representations for
 * @return a map from each target word to a count vector of length {@code contentVocab.size()};
 *     entry {@code i} is the number of content-word tokens with id {@code i} seen in sentences
 *     containing the target word (the target is credited once per sentence it appears in,
 *     regardless of how many times it occurs there)
 */
private static HashMap<String, float[]> getEmbeddings(
    String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {
  HashMap<String, float[]> embeddingMatrix = new HashMap<>();
  for (String targetWord : targetVocab) {
    embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
  }
  Collection<List<String>> sentenceCollection =
      SentenceCollection.Reader.readSentenceCollection(dataPath);
  for (List<String> sentence : sentenceCollection) {
    // Target words present in this sentence (duplicates collapse via the set).
    Set<String> targetsInSentence = new HashSet<>(sentence);
    targetsInSentence.retainAll(targetVocab);
    if (targetsInSentence.isEmpty()) {
      continue; // no vectors to update for this sentence
    }
    for (String word : sentence) {
      // Single lookup replaces the original containsKey-then-get pair.
      Integer contentWordId = contentVocab.get(word);
      if (contentWordId == null) {
        continue; // not a content word
      }
      for (String targetWord : targetsInSentence) {
        // Hoisted: one map lookup per increment instead of two.
        embeddingMatrix.get(targetWord)[contentWordId]++;
      }
    }
  }
  return embeddingMatrix;
}
/**
 * Constructs a word-similarity model: loads the sentence collection from {@code dataPath},
 * stores all training hyperparameters, extracts and encodes the vocabulary, precomputes the
 * observed positive contexts for every word, and (optionally) builds a weighted sampler for
 * the noise distribution.
 *
 * @param dataPath path to the sentence collection used as training data
 * @param kSamples presumably the number of negative samples per positive example — confirm
 * @param dimensions dimensionality of the learned word vectors
 * @param contextSize size of the context window around each word
 * @param power NOTE(review): likely the exponent applied to unigram counts for the noise
 *     distribution (word2vec uses 0.75) — confirm against randomContextGeneration()
 * @param alpha learning-rate-related hyperparameter — TODO confirm exact role
 * @param min_eta lower bound on the learning rate as it decays — TODO confirm
 * @param sigma presumably the std-dev used when initializing vectors — confirm
 * @param epochs number of passes over the training data
 * @param skipGram if true use the skip-gram objective, otherwise CBOW-style — confirm
 * @param negativeSampling if true train with negative sampling
 * @param sampleUnigram if true, noise words are drawn from a (weighted) unigram distribution;
 *     if false, {@code noiseSampler} is left null
 * @param learningDecay decay factor applied to the learning rate — TODO confirm schedule
 */
public WordSim(
    String dataPath,
    int kSamples,
    int dimensions,
    int contextSize,
    double power,
    double alpha,
    double min_eta,
    double sigma,
    int epochs,
    boolean skipGram,
    boolean negativeSampling,
    boolean sampleUnigram,
    double learningDecay) {
  this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);
  this.kSamples = kSamples;
  this.dimensions = dimensions;
  this.contextSize = contextSize;
  this.power = power;
  this.alpha = alpha;
  this.min_eta = min_eta;
  this.sigma = sigma;
  this.epochs = epochs;
  this.skipGram = skipGram;
  this.negativeSampling = negativeSampling;
  this.sampleUnigram = sampleUnigram;
  this.learningDecay = learningDecay;
  // Initialization order matters: vocabulary must exist before it is encoded and before
  // contexts are collected.
  this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
  encodeVocabulary(); // create one-hot encoding index for all words in vocabulary
  this.V = vocabulary.size(); // cardinality of vocabulary
  setAllContexts(); // create HashMap for all observed positive contexts for each word
  if (sampleUnigram)
    randomContextGeneration(); // create weighted random sampler for noise distribution
  else noiseSampler = null;
}