/**
 * A dumb vector space model that counts each word's co-occurrences with a predefined set of
 * content words and uses these co-occurrence vectors directly as word representations. The
 * context in which a word occurs is the set of content words in an entire sentence.
 *
 * <p>N.B. Most people would probably not consider this an embedding model, since the words have
 * not been embedded in a lower dimensional subspace. However, it is a good starting point.
 *
 * <p>Since this approach does not share any information between representations of different
 * words, we can filter the training data to only include sentences that contain words of
 * interest. In other approaches this may not be a good idea.
 *
 * @param dataPath path of the sentence collection read via {@code SentenceCollection.Reader}
 * @param contentVocab maps each content word to its dimension (array index) in the count vectors
 * @param targetVocab the set of words for which co-occurrence vectors are built
 * @return a map from each target word to a count vector of length {@code contentVocab.size()},
 *     where entry {@code i} is the number of co-occurrences with the content word whose id is
 *     {@code i}
 */
private static HashMap<String, float[]> getEmbeddings(
    String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {
  HashMap<String, float[]> embeddingMatrix = new HashMap<>();
  for (String targetWord : targetVocab) {
    embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
  }
  Collection<List<String>> sentenceCollection =
      SentenceCollection.Reader.readSentenceCollection(dataPath);
  for (List<String> sentence : sentenceCollection) {
    // Target words present in this sentence; each gets credit for every content-word token.
    Set<String> targetsInSentence = new HashSet<>(sentence);
    targetsInSentence.retainAll(targetVocab);
    if (targetsInSentence.isEmpty()) {
      continue; // No target words here — the inner loops would do no work.
    }
    for (String word : sentence) {
      // Single map lookup instead of containsKey followed by get.
      Integer contentWordId = contentVocab.get(word);
      if (contentWordId == null) {
        continue; // Not a content word; contributes nothing to any vector.
      }
      int id = contentWordId;
      for (String targetWord : targetsInSentence) {
        // Hoisted vector reference: one get() per increment instead of two.
        float[] vector = embeddingMatrix.get(targetWord);
        vector[id] += 1;
      }
    }
  }
  return embeddingMatrix;
}