Exemplo n.º 1
0
  /**
   * Builds raw co-occurrence count vectors for a set of target words.
   *
   * <p>Every target word gets a vector with one slot per content word. For each sentence, every
   * occurrence of a content word adds one to the matching slot of each distinct target word that
   * appears anywhere in the same sentence; the whole sentence is the context window. A content word
   * that occurs several times in a sentence is therefore counted several times, and a target word
   * that is also a content word counts its own occurrences.
   *
   * <p>N.B. The vectors are not projected into a lower dimensional subspace, so most people would
   * not call this an embedding model. It is meant as a simple starting point.
   *
   * <p>Representations of different words share no information here, so sentences without any
   * target word contribute nothing. Other approaches may not have that property.
   *
   * @param dataPath location handed to {@code SentenceCollection.Reader.readSentenceCollection}
   * @param contentVocab maps each content word to its slot index in the count vectors
   * @param targetVocab the words for which count vectors are produced
   * @return a map from every target word to its count vector of length {@code contentVocab.size()}
   */
  private static HashMap<String, float[]> getEmbeddings(
      String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

    final int dimension = contentVocab.size();
    HashMap<String, float[]> counts = new HashMap<String, float[]>();
    for (String target : targetVocab) {
      counts.put(target, new float[dimension]);
    }

    Collection<List<String>> sentences = SentenceCollection.Reader.readSentenceCollection(dataPath);

    for (List<String> tokens : sentences) {
      // Distinct target words present in this sentence.
      Set<String> targetsHere = new HashSet<String>(tokens);
      targetsHere.retainAll(targetVocab);

      for (String token : tokens) {
        if (contentVocab.containsKey(token)) {
          int slot = contentVocab.get(token);
          for (String target : targetsHere) {
            counts.get(target)[slot] += 1;
          }
        }
      }
    }

    return counts;
  }
Exemplo n.º 2
0
 /**
  * Reads the core WordNet file and maps each distinct word to a unique integer id. Used by the
  * simple model below.
  *
  * <p>Each non-blank line is split on single spaces; the third token, with square brackets
  * removed, is taken as the word. Ids are assigned in order of first appearance, starting at 0. A
  * word that appears on several lines keeps the id of its first occurrence, so the ids are
  * distinct and cover exactly {@code 0 .. size-1}. Blank lines are skipped.
  *
  * @param coreWordNetPath path of the core WordNet file to read
  * @return a map from word to its id
  * @throws IllegalArgumentException if a non-blank line has fewer than three tokens
  * @throws Exception if the file cannot be opened or read
  */
 private static HashMap<String, Integer> getWordNetVocab(String coreWordNetPath) throws Exception {
   HashMap<String, Integer> vocab = new HashMap<String, Integer>();
   // try-with-resources closes the reader even when reading or parsing fails.
   try (BufferedReader reader = new BufferedReader(new FileReader(coreWordNetPath))) {
     String line;
     while ((line = reader.readLine()) != null) {
       if (line.trim().isEmpty()) {
         continue; // Tolerate blank lines such as a trailing empty line.
       }
       String[] parts = line.split(" ");
       if (parts.length < 3) {
         throw new IllegalArgumentException(
             "Expected at least 3 space-separated fields but got: \"" + line + "\"");
       }
       String word = parts[2].replace("[", "").replace("]", "");
       // Only assign an id on first sight. Re-putting a known word would overwrite its id with the
       // current size without growing the map, so the next new word would get the same id.
       if (!vocab.containsKey(word)) {
         vocab.put(word, vocab.size());
       }
     }
   }
   return vocab;
 }
Exemplo n.º 3
0
 /**
  * Writes embeddings to a text file.
  *
  * <p>The first line is {@code "<number of words> <embeddingDim>"}. Each following line is a word,
  * a single space, and the vector components separated by single spaces. Lines end with
  * {@code "\n"}.
  *
  * <p>All vectors are checked before the file is opened, so a dimension mismatch never leaves a
  * partially written file behind.
  *
  * @param embeddings map from word to its embedding vector
  * @param path path of the file to create or overwrite
  * @param embeddingDim the length every embedding vector must have
  * @throws IllegalArgumentException if any vector is {@code null} or its length differs from
  *     {@code embeddingDim}
  * @throws Exception if the file cannot be opened or written
  */
 private static void writeEmbeddings(
     HashMap<String, float[]> embeddings, String path, int embeddingDim) throws Exception {
   // Validate first: failing midway would otherwise leave a truncated file on disk.
   for (Map.Entry<String, float[]> wordEmbedding : embeddings.entrySet()) {
     float[] vector = wordEmbedding.getValue();
     if (vector == null || vector.length != embeddingDim) {
       throw new IllegalArgumentException(
           "The embedding for " + wordEmbedding.getKey() + " is not " + embeddingDim + "D.");
     }
   }

   // try-with-resources flushes and closes the writer even if a write fails.
   try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
     writer.write(embeddings.size() + " " + embeddingDim + "\n");
     for (Map.Entry<String, float[]> wordEmbedding : embeddings.entrySet()) {
       float[] vector = wordEmbedding.getValue();
       StringBuilder line = new StringBuilder().append(wordEmbedding.getKey()).append(' ');
       for (int i = 0; i < vector.length; i++) {
         if (i > 0) {
           line.append(' ');
         }
         line.append(vector[i]);
       }
       writer.write(line.append('\n').toString());
     }
   }
 }
Exemplo n.º 4
0
 /**
  * Rebuilds {@code encodedVocab} from {@code vocabulary}: the field is replaced with a fresh map
  * and every word is stored with the map's size at the moment of insertion as its integer value.
  * That integer is intended as the index of the non-zero entry in the word's one-hot vector of
  * size V.
  *
  * @author jacqueline
  */
 private void encodeVocabulary() {
   encodedVocab = new HashMap<String, Integer>();
   for (String entry : vocabulary) {
     int nextIndex = encodedVocab.size();
     encodedVocab.put(entry, nextIndex);
   }
 }