Example #1
  /**
   * Get all of the words in the evaluation dataset.
   *
   * @param path path to a comma-separated file of word pairs and their similarity scores
   * @return the set of all words appearing in the dataset
   * @throws Exception if the file cannot be opened or read
   */
  private static Set<String> getWordSimVocab(String path) throws Exception {
    Set<String> vocab = new HashSet<String>();
    // try-with-resources guarantees the reader is closed on every path.
    try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
      // Read the first line, which contains the column keys.
      String line = reader.readLine();
      String[] keys = line.split(",");
      if (keys.length != 3) {
        System.out.println(
            "There should be two words per line "
                + "and a single score for each of these word "
                + "pairs. We just saw: "
                + line);
        System.exit(1); // exit with a non-zero status to signal the error
      }
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split(",");
        if (parts.length != 3) {
          System.out.println("WordSim line: " + line + " should contain two words and a score.");
          System.exit(1);
        }
        vocab.add(parts[0]);
        vocab.add(parts[1]);
      }
    }
    return vocab;
  }
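A minimal sketch of the CSV layout this method expects; the header names and rows below are illustrative assumptions, not taken from the source:

  word1,word2,score
  tiger,cat,7.35
  book,paper,7.46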
Example #2
  /**
   * A dumb vector space model that counts each word's co-occurrences with a predefined set of
   * content words and uses these co-occurrence vectors directly as word representations. The
   * context in which a word occurs is the set of content words in an entire sentence.
   *
   * <p>N.B. Most people would probably not consider this an embedding model, since the words have
   * not been embedded in a lower-dimensional subspace. However, it is a good starting point.
   *
   * <p>Since this approach does not share any information between representations of different
   * words, we can filter the training data to only include sentences that contain words of
   * interest. In other approaches this may not be a good idea.
   *
   * @param dataPath path to the training sentence collection
   * @param contentVocab map from each content word to its column index in the count vectors
   * @param targetVocab the set of words to build representations for
   * @return a map from each target word to its co-occurrence count vector
   */
  private static HashMap<String, float[]> getEmbeddings(
      String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

    HashMap<String, float[]> embeddingMatrix = new HashMap<String, float[]>();
    for (String targetWord : targetVocab) {
      embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
    }

    Collection<List<String>> sentenceCollection =
        SentenceCollection.Reader.readSentenceCollection(dataPath);

    for (List<String> sentence : sentenceCollection) {
      // Target words that occur in this sentence.
      Set<String> targetsInSentence = new HashSet<String>(sentence);
      targetsInSentence.retainAll(targetVocab);
      for (String word : sentence) {
        if (!contentVocab.containsKey(word)) continue;
        int contentWordId = contentVocab.get(word);
        for (String targetWord : targetsInSentence) {
          embeddingMatrix.get(targetWord)[contentWordId] += 1;
        }
      }
    }

    return embeddingMatrix;
  }
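Since the Javadoc frames these raw co-occurrence vectors as word representations, the natural way to use the returned map is to compare two vectors. A minimal sketch, assuming cosine similarity as the comparison; the helper below is ours, not part of the source class:

  // Hypothetical helper (not in the source): cosine similarity between two
  // co-occurrence vectors returned by getEmbeddings.
  private static double cosineSimilarity(float[] a, float[] b) {
    double dot = 0.0, normA = 0.0, normB = 0.0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    if (normA == 0.0 || normB == 0.0) return 0.0; // a word never seen in training has no signal
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }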
Example #3
    private Pair<Integer, Set<Integer>> getWordContextPair(
        List<String> sentence, int wordPosition) {

      String centerWord = sentence.get(wordPosition);
      int centerWordIndex = encodedVocab.get(centerWord);
      Set<Integer> contextWordSet = new HashSet<Integer>();

      // Visit the symmetric window of contextSize positions on each side.
      for (int i = wordPosition - contextSize; i <= wordPosition + contextSize; i++) {
        if (i < 0) continue; // Ignore contexts prior to start of sentence
        if (i >= sentence.size()) break; // Ignore contexts after end of current sentence
        if (i == wordPosition) continue; // Ignore the center word itself

        String contextWord = sentence.get(i);
        int contextWordIndex = encodedVocab.get(contextWord);
        contextWordSet.add(contextWordIndex);
      }
      return Pair.create(centerWordIndex, contextWordSet);
    }
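A worked example of the window logic: with contextSize = 2 and wordPosition = 1 in the sentence [the, quick, brown, fox], the loop visits positions -1 through 3, skips -1 (before the sentence start) and position 1 (the center word quick), and returns the encoded indices of the, brown, and fox as the context set.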
Example #4
    private Set<Integer> negativeSampleContexts(int wordIndex) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      // Rejection-sample uniformly from the vocabulary until we have
      // kSamples distinct contexts that are not positive contexts.
      while (negativeContexts.size() < kSamples) {
        int contextIndex = (int) (Math.random() * V);
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
Example #5
    /**
     * If boolean sampleUnigram = true, we use noiseSampler from randomContextGeneration to model
     * the unigram probability distribution raised to a specified power (default 3/4). Otherwise,
     * use the overloaded negativeSampleContexts(int wordIndex) method to draw from a uniform
     * probability distribution.
     *
     * @author jacqueline
     */
    private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      while (negativeContexts.size() < kSamples) {
        String possibleContext = weightedRandomSample.sample();
        int contextIndex = encodedVocab.get(possibleContext);
        if (!positiveContexts.contains(contextIndex) && !negativeContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
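The Javadoc above refers to a noiseSampler built by randomContextGeneration, whose body is not among these examples. Below is a minimal sketch of how such a sampler could be assembled, assuming Apache Commons Math's EnumeratedDistribution (which normalizes the supplied weights) and the fields shown in Example #6 (sentenceCollection, power, noiseSampler); the body is our assumption, not the source's implementation:

    // Sketch only: weight each vocabulary word by count^power and let
    // EnumeratedDistribution normalize the weights into a distribution.
    // org.apache.commons.math3.util.Pair is fully qualified to avoid
    // clashing with the Pair type used in Example #3.
    private void randomContextGeneration() {
      Map<String, Integer> counts = new HashMap<String, Integer>();
      for (List<String> sentence : sentenceCollection) {
        for (String word : sentence) {
          counts.merge(word, 1, Integer::sum);
        }
      }
      List<org.apache.commons.math3.util.Pair<String, Double>> pmf =
          new ArrayList<org.apache.commons.math3.util.Pair<String, Double>>();
      for (Map.Entry<String, Integer> entry : counts.entrySet()) {
        pmf.add(new org.apache.commons.math3.util.Pair<String, Double>(
            entry.getKey(), Math.pow(entry.getValue(), power)));
      }
      noiseSampler = new EnumeratedDistribution<String>(pmf);
    }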
Example #6
    public WordSim(
        String dataPath,
        int kSamples,
        int dimensions,
        int contextSize,
        double power,
        double alpha,
        double min_eta,
        double sigma,
        int epochs,
        boolean skipGram,
        boolean negativeSampling,
        boolean sampleUnigram,
        double learningDecay) {
      this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);
      this.kSamples = kSamples;
      this.dimensions = dimensions;
      this.contextSize = contextSize;
      this.power = power;
      this.alpha = alpha;
      this.min_eta = min_eta;
      this.sigma = sigma;
      this.epochs = epochs;
      this.skipGram = skipGram;
      this.negativeSampling = negativeSampling;
      this.sampleUnigram = sampleUnigram;
      this.learningDecay = learningDecay;

      this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
      encodeVocabulary(); // create one-hot encoding index for all words in vocabulary
      this.V = vocabulary.size(); // cardinality of vocabulary
      setAllContexts(); // create HashMap for all observed positive contexts for each word
      if (sampleUnigram) {
        randomContextGeneration(); // create weighted random sampler for noise distribution
      } else {
        noiseSampler = null;
      }
    }
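A usage sketch for this constructor; the path and every hyperparameter value below are illustrative assumptions (parameter meanings inferred from the field names), not settings from the source:

    // Illustrative construction only; all values are assumptions.
    WordSim model =
        new WordSim(
            "data/sentences.txt", // dataPath (hypothetical)
            10, // kSamples: negative samples per positive pair
            100, // dimensions: embedding dimensionality
            2, // contextSize: window radius on each side of the center word
            0.75, // power: exponent for the unigram noise distribution
            0.025, // alpha: initial learning rate (assumed meaning)
            1.0e-4, // min_eta: learning-rate floor (assumed meaning)
            0.1, // sigma: scale of random initialization (assumed meaning)
            5, // epochs: passes over the sentence collection
            true, // skipGram
            true, // negativeSampling
            true, // sampleUnigram
            0.9); // learningDecay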