/**
 * A dumb vector space model that counts each word's co-occurrences with a predefined set of
 * content words and uses these co-occurrence vectors directly as word representations. The
 * context in which a word occurs is the set of content words in an entire sentence.
 *
 * <p>N.B. Most people would probably not consider this an embedding model, since the words have
 * not been embedded in a lower dimensional subspace. However, it is a good starting point.
 *
 * <p>Since this approach does not share any information between representations of different
 * words, we can filter the training data to only include sentences that contain words of
 * interest. In other approaches this may not be a good idea.
 *
 * @param dataPath path to the sentence collection used as training data
 * @param contentVocab maps each content word to its dimension (index) in the count vectors
 * @param targetVocab the set of words to build representations for
 * @return a map from each target word to its raw co-occurrence count vector
 */
private static HashMap<String, float[]> getEmbeddings(
    String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

  HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
  for (String targetWord : targetVocab) {
    embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
  }

  Collection<List<String>> sentenceCollection =
      SentenceCollection.Reader.readSentenceCollection(dataPath);

  for (List<String> sentence : sentenceCollection) {
    // Target words present in this sentence: every content word in the sentence
    // co-occurs with each of them.
    Set<String> targetsInSentence = new HashSet<String>(sentence);
    targetsInSentence.retainAll(targetVocab);
    // Skip sentences containing no target word -- safe here because representations
    // are independent across words (see the comment above).
    if (targetsInSentence.isEmpty()) continue;

    for (String word : sentence) {
      // Single lookup instead of containsKey + get.
      Integer contentWordId = contentVocab.get(word);
      if (contentWordId == null) continue; // not a content word
      for (String targetWord : targetsInSentence) {
        embeddingMatrix.get(targetWord)[contentWordId] += 1;
      }
    }
  }
  return embeddingMatrix;
}
/**
 * Read the core WordNet senses and map each to a unique integer. Used by the simple model below.
 *
 * <p>Each input line is space-separated; the third token is taken as the word, with any square
 * brackets stripped. Words are assigned ids in file order.
 *
 * @param coreWordNetPath path to the core WordNet senses file
 * @return a map from each word to a unique integer id
 * @throws Exception if the file cannot be read
 */
private static HashMap<String, Integer> getWordNetVocab(String coreWordNetPath) throws Exception {
  HashMap<String, Integer> vocab = new HashMap<String, Integer>();
  // try-with-resources guarantees the reader is closed even when reading or parsing
  // throws (the original leaked the reader on any exception).
  try (BufferedReader reader = new BufferedReader(new FileReader(coreWordNetPath))) {
    String line;
    while ((line = reader.readLine()) != null) {
      String[] parts = line.split(" ");
      String word = parts[2].replace("[", "").replace("]", "");
      vocab.put(word, vocab.size());
    }
  }
  return vocab;
}
/**
 * Write embeddings to a file in the usual text format: a header line with the vocabulary size
 * and dimensionality, then one {@code word v1 v2 ... vD} line per word.
 *
 * @param embeddings map from word to embedding vector
 * @param path output file path
 * @param embeddingDim expected dimensionality of every embedding
 * @throws IllegalArgumentException if any embedding's length differs from {@code embeddingDim}
 * @throws Exception if the file cannot be written
 */
private static void writeEmbeddings(
    HashMap<String, float[]> embeddings, String path, int embeddingDim) throws Exception {
  // try-with-resources closes the writer on every path. The original leaked the writer
  // and, worse, called System.exit(0) -- a *success* exit code -- on a dimensionality
  // error; we throw instead so callers can handle it.
  try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
    writer.write(embeddings.size() + " " + embeddingDim + "\n");
    for (Map.Entry<String, float[]> wordEmbedding : embeddings.entrySet()) {
      String word = wordEmbedding.getKey();
      float[] vector = wordEmbedding.getValue();
      // Validate before doing any formatting work.
      if (vector.length != embeddingDim) {
        throw new IllegalArgumentException(
            "The embedding for " + word + " is not " + embeddingDim + "D.");
      }
      // "[1.0, 2.0]" -> "1.0 2.0"
      String embeddingString =
          Arrays.toString(vector).replace(", ", " ").replace("[", "").replace("]", "");
      writer.write(word + " " + embeddingString + "\n");
    }
  }
}
/**
 * Map each word in the vocabulary to a unique integer: the index of the non-zero entry in its
 * one-hot vector of size V.
 *
 * @author jacqueline
 */
private void encodeVocabulary() {
  encodedVocab = new HashMap<String, Integer>();
  // Each word's id is the map's size at insertion time, yielding consecutive ids from 0.
  vocabulary.forEach(word -> encodedVocab.put(word, encodedVocab.size()));
}