Code Example #1
  /**
   * Calculate Spearman's rho on the wordSim353 dataset (or any other dataset with similar
   * formatting).
   *
   * @param wordSimPairs word pairs with their human-annotated similarity scores
   * @param wordEmbeddings map from each word to its embedding vector
   * @return Spearman's rank correlation between predicted and labeled similarities
   * @throws Exception if the score cannot be computed
   */
  private static double spearmansScore(
      List<Pair<Pair<String, String>, Float>> wordSimPairs, HashMap<String, float[]> wordEmbeddings)
      throws Exception {

    final double[] predictions = new double[wordSimPairs.size()];
    final double[] labels = new double[wordSimPairs.size()];
    int pairNum = 0;
    for (Pair<Pair<String, String>, Float> wordPair : wordSimPairs) {
      // Compute the cosine similarity of the two word embeddings.
      String word1 = wordPair.getFirst().getFirst();
      String word2 = wordPair.getFirst().getSecond();
      if (wordEmbeddings.containsKey(word1) && wordEmbeddings.containsKey(word2)) {
        predictions[pairNum] =
            cosineSimilarity(wordEmbeddings.get(word1), wordEmbeddings.get(word2));
      } else {
        // Unmodelled words have 0.5 similarity.
        predictions[pairNum] = 0.5;
      }
      labels[pairNum] = wordPair.getSecond();
      pairNum++;
    }

    NaturalRanking ranking = new NaturalRanking(NaNStrategy.REMOVED);
    SpearmansCorrelation spearman = new SpearmansCorrelation(ranking);

    return spearman.correlation(predictions, labels);
  }
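
The spearmansScore method above calls a cosineSimilarity helper that does not appear in this section. Below is a minimal sketch of such a helper; its name and signature are inferred from the call site, so treat it as an assumption rather than the original implementation.

  /**
   * Cosine similarity between two vectors: dot(a, b) / (||a|| * ||b||).
   * Hypothetical helper, reconstructed from the call in spearmansScore.
   */
  private static double cosineSimilarity(float[] a, float[] b) {
    double dot = 0.0;
    double normA = 0.0;
    double normB = 0.0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    // Guard against zero vectors, which would otherwise divide by zero.
    if (normA == 0.0 || normB == 0.0) return 0.0;
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }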
Code Example #2
 /**
  * Reduce the embeddings vocabulary to only the words that will be needed
  * for the word similarity task.
  */
 private static HashMap<String, float[]> reduceVocabulary(
     HashMap<String, float[]> embeddings, Set<String> targetVocab) {
   HashMap<String, float[]> prunedEmbeddings = new HashMap<String, float[]>();
   for (String word : targetVocab) {
     if (embeddings.containsKey(word)) {
       prunedEmbeddings.put(word, embeddings.get(word));
     }
   }
   return prunedEmbeddings;
 }
Code Example #3
 /**
  * Read the core WordNet senses and map each to a unique integer. Used by the simple model below.
  */
 private static HashMap<String, Integer> getWordNetVocab(String coreWordNetPath) throws Exception {
   HashMap<String, Integer> vocab = new HashMap<String, Integer>();
   BufferedReader reader = new BufferedReader(new FileReader(coreWordNetPath));
   String line = "";
   while ((line = reader.readLine()) != null) {
     // The word is the third whitespace-separated field, possibly wrapped in brackets.
     String[] parts = line.split(" ");
     String word = parts[2].replace("[", "").replace("]", "");
     // Skip duplicates: re-inserting an existing word would give it the current
     // map size as its index, leaving two words mapped to the same integer.
     if (!vocab.containsKey(word)) {
       vocab.put(word, vocab.size());
     }
   }
   reader.close();
   return vocab;
 }
Code Example #4
    private HashMap<String, float[]> convertEmbeddings(Set<String> targetVocab) {
      // For every word in the vocabulary, look up the corresponding column of the
      // output matrix W2 and store it as the word's embedding, converted to floats.
      HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();

      for (String word : targetVocab) {
        int wordIndex = encodedVocab.get(word);
        double[] wordEmbedding = W2.getColumn(wordIndex);
        float[] wordEmbeddingFloat = new float[wordEmbedding.length];
        for (int i = 0; i < wordEmbedding.length; i++) {
          wordEmbeddingFloat[i] = (float) wordEmbedding[i];
        }
        embeddingMatrix.put(word, wordEmbeddingFloat);
      }
      return embeddingMatrix;
    }
Code Example #5
  /**
   * A dumb vector space model that counts each word's co-occurrences with a predefined set of
   * content words and uses these co-occurrence vectors directly as word representations. The
   * context in which a word occurs is the set of content words in the entire sentence.
   *
   * <p>N.B. Most people would probably not consider this an embedding model, since the words have
   * not been embedded in a lower dimensional subspace. However, it is a good starting point.
   *
   * <p>Since this approach does not share any information between representations of different
   * words, we can filter the training data to only include sentences that contain words of
   * interest. In other approaches this may not be a good idea.
   *
   * @param dataPath path to the training sentences
   * @param contentVocab content words to count co-occurrences with, mapped to vector indices
   * @param targetVocab words for which representations are built
   * @return map from each target word to its co-occurrence vector
   */
  private static HashMap<String, float[]> getEmbeddings(
      String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

    HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
    for (String targetWord : targetVocab) {
      embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
    }

    Collection<List<String>> sentenceCollection =
        SentenceCollection.Reader.readSentenceCollection(dataPath);

    for (List<String> sentence : sentenceCollection) {
      // Target words that occur in this sentence.
      Set<String> targetsInSentence = new HashSet<String>(sentence);
      targetsInSentence.retainAll(targetVocab);
      for (String word : sentence) {
        if (!contentVocab.containsKey(word)) continue;
        int contentWordId = contentVocab.get(word);
        // Each content word counts once towards every target word in the sentence.
        for (String targetWord : targetsInSentence) {
          embeddingMatrix.get(targetWord)[contentWordId] += 1;
        }
      }
    }

    return embeddingMatrix;
  }
Code Example #6
    private Pair<Integer, Set<Integer>> getWordContextPair(
        List<String> sentence, int wordPosition) {

      String centerWord = sentence.get(wordPosition);
      int centerWordIndex = encodedVocab.get(centerWord);
      Set<Integer> contextWordSet = new HashSet<Integer>();

      // Symmetric window of contextSize positions on each side of the center word.
      for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
        if (i < 0) continue; // Ignore contexts prior to start of sentence
        if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
        if (i == wordPosition) continue; // Ignore the center word itself

        String contextWord = sentence.get(i);
        int contextWordIndex = encodedVocab.get(contextWord);
        contextWordSet.add(contextWordIndex);
      }
      return Pair.create(centerWordIndex, contextWordSet);
    }
Code Example #7
 /**
  * Write embeddings to a file.
  *
  * @param embeddings map from each word to its embedding vector
  * @param path output file path
  * @param embeddingDim expected dimensionality of every embedding
  * @throws Exception if the file cannot be written
  */
 private static void writeEmbeddings(
     HashMap<String, float[]> embeddings, String path, int embeddingDim) throws Exception {
   BufferedWriter writer = new BufferedWriter(new FileWriter(path));
   writer.write(embeddings.size() + " " + embeddingDim + "\n");
   for (Map.Entry<String, float[]> wordEmbedding : embeddings.entrySet()) {
     String word = wordEmbedding.getKey();
     String embeddingString =
         Arrays.toString(wordEmbedding.getValue())
             .replace(", ", " ")
             .replace("[", "")
             .replace("]", "");
     if (wordEmbedding.getValue().length != embeddingDim) {
       System.out.println("The embedding for " + word + " is not " + embeddingDim + "D.");
       System.exit(0);
     }
     writer.write(word + " " + embeddingString + "\n");
   }
   writer.close();
 }
Code Example #8
  /**
   * Read the embedding parameters from a file.
   *
   * @param path path to the embedding file
   * @return map from each word to its embedding vector
   * @throws Exception if the file cannot be read
   */
  private static HashMap<String, float[]> readEmbeddings(String path) throws Exception {
    HashMap<String, float[]> embeddings = new HashMap<String, float[]>();
    BufferedReader reader = new BufferedReader(new FileReader(path));
    String line = "";

    // Read the first line that contains the number of words and the
    // embedding dimension.
    line = reader.readLine().trim();

    String[] parts = line.split("\\s+");
    if (parts.length < 2) {
      System.out.println(
          "Format of embedding file is wrong. "
              + "First line should contain the number of words "
              + "and the embedding dimension.");
      System.exit(0);
    }
    int vocab_size = Integer.parseInt(parts[0]);
    int embedding_dim = Integer.parseInt(parts[1]);

    // Read the embeddings.
    int count_lines = 0;
    while ((line = reader.readLine()) != null) {
      if (count_lines >= vocab_size) {
        System.out.println("Embedding file has more words than the provided vocab size.");
        System.exit(0);
      }
      parts = line.split("\\s+");
      String word = parts[0];
      float[] emb = new float[embedding_dim];
      for (int e_dim = 0; e_dim < embedding_dim; ++e_dim) {
        emb[e_dim] = Float.parseFloat(parts[e_dim + 1]);
      }
      embeddings.put(word, emb);
      ++count_lines;
    }
    System.out.println("Read " + count_lines + " embeddings of dimension: " + embedding_dim);
    reader.close();
    return embeddings;
  }
Code Example #9
    /**
     * Draw negative sample contexts for a word. If boolean sampleUnigram = true, we use
     * noiseSampler from randomContextGeneration to model the unigram probability distribution
     * raised to a specified power, default 3/4. Otherwise, use the overloaded
     * negativeSampleContexts(int wordIndex) method to draw from a uniform probability
     * distribution.
     *
     * @author jacqueline
     */
    private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      // Keep sampling until we have kSamples distinct contexts that are not
      // true (positive) contexts of this word.
      while (negativeContexts.size() < kSamples) {
        String possibleContext = weightedRandomSample.sample();
        int contextIndex = encodedVocab.get(possibleContext);
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
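
The javadoc above refers to a noiseSampler that models the unigram distribution raised to the 3/4 power. The snippet below is a sketch of how such a sampler could be built, assuming Apache Commons Math's org.apache.commons.math3.distribution.EnumeratedDistribution (the type already used in the method signature) and a hypothetical wordCounts map; the buildNoiseSampler name is also an assumption. EnumeratedDistribution normalizes its weights internally, so raw counts raised to the power suffice.

    // Sketch: build the weighted sampler used for negative sampling. The
    // commons-math Pair is fully qualified to avoid clashing with the Pair
    // class used elsewhere in these examples.
    private EnumeratedDistribution<String> buildNoiseSampler(
        Map<String, Integer> wordCounts, double power) {
      List<org.apache.commons.math3.util.Pair<String, Double>> pmf =
          new ArrayList<org.apache.commons.math3.util.Pair<String, Double>>();
      for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
        // Weight each word by count^power (default power 3/4 per the javadoc above).
        pmf.add(
            new org.apache.commons.math3.util.Pair<String, Double>(
                entry.getKey(), Math.pow(entry.getValue(), power)));
      }
      return new EnumeratedDistribution<String>(pmf);
    }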
Code Example #10
    /** Build the map from each word index to the set of all context indices it occurs with. */
    private void setAllContexts() {

      this.contextPairs = new HashMap<Integer, Set<Integer>>();
      for (int wordIndex : encodedVocab.values()) {
        contextPairs.put(wordIndex, new HashSet<Integer>());
      }

      for (List<String> sentence : sentenceCollection) {
        for (int wordPosition = 0; wordPosition < sentence.size(); wordPosition++) {
          Pair<Integer, Set<Integer>> wordPlusContext = getWordContextPair(sentence, wordPosition);
          int wordIndex = wordPlusContext.getFirst();
          (contextPairs.get(wordIndex)).addAll(wordPlusContext.getSecond());
        }
      }
    }
Code Example #11
  public static void main(String[] args) throws Exception {
    // Parse command line flags and arguments.
    Map<String, String> argMap = CommandLineUtils.simpleCommandLineParser(args);

    // Read commandline parameters.
    String embeddingPath = "";
    if (!argMap.containsKey("-embeddings")) {
      System.out.println("-embeddings flag required.");
      System.exit(0);
    } else {
      embeddingPath = argMap.get("-embeddings");
    }

    String wordSimPath = "";
    if (!argMap.containsKey("-wordsim")) {
      System.out.println("-wordsim flag required.");
      System.exit(0);
    } else {
      wordSimPath = argMap.get("-wordsim");
    }

    // Read in the labeled similarities and generate the target vocabulary.
    System.out.println("Loading wordsim353 ...");
    List<Pair<Pair<String, String>, Float>> wordSimPairs = readWordSimPairs(wordSimPath);
    Set<String> targetVocab = getWordSimVocab(wordSimPath);

    // It is likely that you will want to generate your embeddings
    // elsewhere. But this supports the option to generate the embeddings
    // and evaluate them in a single run.
    HashMap<String, float[]> embeddings;
    if (argMap.containsKey("-trainandeval")) {
      // Get some training data.
      String dataPath = "";
      if (!argMap.containsKey("-trainingdata")) {
        System.out.println("-trainingdata flag required with -trainandeval");
        System.exit(0);
      } else {
        dataPath = argMap.get("-trainingdata");
      }

      // Since this simple approach does not do dimensionality reduction
      // on the co-occurrence vectors, we instead control the size of the
      // vectors by only counting co-occurrence with core WordNet senses.
      String wordNetPath = "";
      if (!argMap.containsKey("-wordnetdata")) {
        System.out.println("-wordnetdata flag required with -trainandeval");
        System.exit(0);
      } else {
        wordNetPath = argMap.get("-wordnetdata");
      }
      // HashMap<String, Integer> contentWordVocab = getWordNetVocab(wordNetPath);

      System.out.println("Training embeddings on " + dataPath + " ...");
      // embeddings = getEmbeddings(dataPath, contentWordVocab, targetVocab);
      int kSamples = 5;
      int dimensions = 100;
      int contextSize = 2;

      WordSim skipgram = new WordSim(dataPath, kSamples, dimensions, contextSize);
      embeddings = skipgram.getEmbeddings(targetVocab);

      // Keep only the words that are needed.
      System.out.println("Writing embeddings to " + embeddingPath + " ...");
      // embeddings = reduceVocabulary(embeddings, targetVocab);
      // writeEmbeddings(embeddings, embeddingPath, contentWordVocab.size());
      writeEmbeddings(embeddings, embeddingPath, dimensions);
    } else {
      // Read in embeddings.
      System.out.println("Loading embeddings ...");
      embeddings = readEmbeddings(embeddingPath);

      // Keep only the words that are needed.
      System.out.println(
          "Writing reduced vocabulary embeddings to " + embeddingPath + ".reduced ...");
      embeddings = reduceVocabulary(embeddings, targetVocab);
      writeEmbeddings(
          embeddings, embeddingPath + ".reduced", embeddings.values().iterator().next().length);
    }

    // Keep only the words that are needed for scoring (a no-op if the
    // embeddings were already reduced above).
    embeddings = reduceVocabulary(embeddings, targetVocab);

    double score = spearmansScore(wordSimPairs, embeddings);
    System.out.println("Score is " + score);
  }
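
For reference, invocations assembled from the flags parsed above might look like the following; the main class and file names are placeholders:

  java WordSimMain -embeddings vectors.txt -wordsim wordSim353.csv
  java WordSimMain -embeddings vectors.txt -wordsim wordSim353.csv -trainandeval -trainingdata sentences.txt -wordnetdata coreWordNet.txt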
Code Example #12
 /**
  * Take a vocabulary and return a HashMap that maps each word in the vocabulary to a unique
  * integer. This integer is the index of the non-zero value in the word's one-hot vector of
  * size V.
  *
  * @author jacqueline
  */
 private void encodeVocabulary() {
   encodedVocab = new HashMap<String, Integer>();
   for (String word : vocabulary) {
     encodedVocab.put(word, encodedVocab.size());
   }
 }
Code Example #13
 private Set<Integer> negativeSampleContexts(
     String word, EnumeratedDistribution<String> weightedRandomSample) {
   int wordIndex = encodedVocab.get(word);
   return negativeSampleContexts(wordIndex, weightedRandomSample);
 }
Code Example #14
 private Set<Integer> negativeSampleContexts(String word) {
   int wordIndex = encodedVocab.get(word);
   return negativeSampleContexts(wordIndex);
 }