Example #1
  /**
   * Get all of the words in the evaluation dataset.
   *
   * @param path path to the comma-separated evaluation file
   * @return the set of all words that appear in the word-pair lines
   * @throws Exception if the file cannot be opened or read
   */
  private static Set<String> getWordSimVocab(String path) throws Exception {
    Set<String> vocab = new HashSet<String>();
    BufferedReader reader = new BufferedReader(new FileReader(path));
    // Read the header line, which contains the column keys.
    String line = reader.readLine();
    String[] keys = line.split(",");

    if (keys.length != 3) {
      System.out.println(
          "There should be two words per line "
              + "and a single score for each of these word "
              + "pairs. We just saw, "
              + line);
      System.exit(0);
    }
    while ((line = reader.readLine()) != null) {
      String[] parts = line.split(",");
      if (parts.length != 3) {
        System.out.println("WordSim line: " + line + " should contain two words and a score.");
        System.exit(0);
      }
      String word1 = parts[0];
      String word2 = parts[1];
      vocab.add(word1);
      vocab.add(word2);
    }
    reader.close();
    return vocab;
  }
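For reference, a minimal usage sketch. The file name and header below are hypothetical; the method only assumes a header row with three column keys followed by word1,word2,score lines.

  // Hypothetical evaluation file wordsim.csv:
  //   Word 1,Word 2,Score
  //   tiger,cat,7.35
  //   book,paper,7.46
  Set<String> vocab = getWordSimVocab("wordsim.csv");
  System.out.println("Evaluation vocabulary size: " + vocab.size());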
Example #2
    private Set<Integer> negativeSampleContexts(int wordIndex) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      // Keep drawing candidates uniformly from the V vocabulary indices until
      // kSamples negatives have been collected.
      while (negativeContexts.size() < kSamples) {
        int contextIndex = (int) (Math.random() * V);
        // Reject true contexts of this word and indices already sampled.
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
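This overload rejection-samples from a uniform distribution over the V vocabulary indices, discarding true contexts of the word and duplicates, so it terminates quickly as long as kSamples is small relative to V. The overload in Example #3 replaces the uniform draw with a weighted sampler over the unigram distribution raised to a specified power (3/4 by default).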
Example #3
    /**
     * If boolean sampleUnigram = true, we use noiseSampler from randomContextGeneration to model
     * the unigram probability distribution raised to a specified power (3/4 by default).
     * Otherwise, the overloaded negativeSampleContexts(int wordIndex) method is used to draw from
     * a uniform probability distribution.
     *
     * @author jacqueline
     */
    private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      while (negativeContexts.size() < kSamples) {
        // Sample a candidate word from the weighted distribution, then look up its index.
        String possibleContext = weightedRandomSample.sample();
        int contextIndex = encodedVocab.get(possibleContext);
        // Reject true contexts of this word and indices already sampled.
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
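A minimal sketch of how such a weighted sampler could be built with Apache Commons Math (the wordCounts map and the 0.75 exponent are assumptions based on the Javadoc above; the actual noiseSampler is constructed in randomContextGeneration):

    // Uses org.apache.commons.math3.distribution.EnumeratedDistribution and
    // org.apache.commons.math3.util.Pair. Weights are unigram counts raised to 3/4;
    // EnumeratedDistribution normalizes the weights internally.
    List<Pair<String, Double>> pmf = new ArrayList<>();
    for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
      pmf.add(new Pair<>(entry.getKey(), Math.pow(entry.getValue(), 0.75)));
    }
    EnumeratedDistribution<String> noiseSampler = new EnumeratedDistribution<>(pmf);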
Example #4
    private Pair<Integer, Set<Integer>> getWordContextPair(
        List<String> sentence, int wordPosition) {

      String centerWord = sentence.get(wordPosition);
      int centerWordIndex = encodedVocab.get(centerWord);
      Set<Integer> contextWordSet = new HashSet<Integer>();

      // Collect a symmetric window of contextSize positions on each side of the center word.
      for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
        if (i < 0) continue; // Ignore contexts prior to start of sentence
        if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
        if (i == wordPosition) continue; // Skip the center word itself (i is a sentence position, not a vocab index)

        String contextWord = sentence.get(i);
        int contextWordIndex = encodedVocab.get(contextWord);
        contextWordSet.add(contextWordIndex);
      }
      return Pair.create(centerWordIndex, contextWordSet);
    }
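A minimal usage sketch (the sentence tokens are hypothetical; encodedVocab and contextSize are assumed instance fields, e.g. contextSize = 2):

    List<String> sentence = Arrays.asList("the", "quick", "brown", "fox", "jumps");
    Pair<Integer, Set<Integer>> pair = getWordContextPair(sentence, 2);
    int centerWordIndex = pair.getFirst();          // encoded index of "brown"
    Set<Integer> contextIndices = pair.getSecond(); // encoded indices of the window words around "brown"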