Example #1
  /**
   * A dumb vector space model that counts each word's co-occurrences with a predefined set of
   * content words and uses these co-occurrence vectors directly as word representations. The
   * context in which a word occurs is the set of content words in the entire sentence.
   *
   * <p>N.B. Most people would probably not consider this an embedding model, since the words have
   * not been embedded in a lower-dimensional subspace. However, it is a good starting point.
   *
   * <p>Since this approach does not share any information between representations of different
   * words, we can filter the training data to include only sentences that contain words of
   * interest. In other approaches this may not be a good idea.
   *
   * @param dataPath path to the sentence collection to read training sentences from
   * @param contentVocab map from each content word to its column index in the count vectors
   * @param targetVocab set of target words to build representations for
   * @return map from each target word to its vector of co-occurrence counts
   */
  private static HashMap<String, float[]> getEmbeddings(
      String dataPath, HashMap<String, Integer> contentVocab, Set<String> targetVocab) {

    // One count vector per target word, with one slot per content word.
    HashMap<String, float[]> embeddingMatrix = new HashMap<>();
    for (String targetWord : targetVocab) {
      embeddingMatrix.put(targetWord, new float[contentVocab.size()]);
    }

    Collection<List<String>> sentenceCollection =
        SentenceCollection.Reader.readSentenceCollection(dataPath);

    for (List<String> sentence : sentenceCollection) {
      // Target words that occur in this sentence; only their vectors get updated.
      Set<String> targetsInSentence = new HashSet<>(sentence);
      targetsInSentence.retainAll(targetVocab);
      for (String word : sentence) {
        if (!contentVocab.containsKey(word)) continue;
        int contentWordId = contentVocab.get(word);
        for (String targetWord : targetsInSentence) {
          embeddingMatrix.get(targetWord)[contentWordId] += 1;
        }
      }
    }

    return embeddingMatrix;
  }
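
A minimal usage sketch for the method above follows. The data path and both vocabularies are hypothetical, and the snippet assumes it sits in the same class as getEmbeddings (the method is private) with java.util.{Arrays, HashMap, HashSet, Set} imported; only getEmbeddings itself comes from the example.

  public static void main(String[] args) {
    // Hypothetical content vocabulary: each content word owns one column of the count vectors.
    HashMap<String, Integer> contentVocab = new HashMap<>();
    contentVocab.put("eat", 0);
    contentVocab.put("drink", 1);
    contentVocab.put("run", 2);

    // Hypothetical target words to build representations for.
    Set<String> targetVocab = new HashSet<>(Arrays.asList("coffee", "marathon"));

    HashMap<String, float[]> embeddings =
        getEmbeddings("data/sentences.txt", contentVocab, targetVocab);

    // The vectors hold raw co-occurrence counts, so "coffee" should score higher
    // in the "drink" column than in the "run" column.
    float[] coffee = embeddings.get("coffee");
    System.out.printf("coffee: eat=%.0f drink=%.0f run=%.0f%n", coffee[0], coffee[1], coffee[2]);
  }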
Example #2
    public WordSim(
        String dataPath,
        int kSamples,
        int dimensions,
        int contextSize,
        double power,
        double alpha,
        double min_eta,
        double sigma,
        int epochs,
        boolean skipGram,
        boolean negativeSampling,
        boolean sampleUnigram,
        double learningDecay) {
      this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);
      this.kSamples = kSamples;
      this.dimensions = dimensions;
      this.contextSize = contextSize;
      this.power = power;
      this.alpha = alpha;
      this.min_eta = min_eta;
      this.sigma = sigma;
      this.epochs = epochs;
      this.skipGram = skipGram;
      this.negativeSampling = negativeSampling;
      this.sampleUnigram = sampleUnigram;
      this.learningDecay = learningDecay;

      this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
      encodeVocabulary(); // create one-hot encoding index for all words in vocabulary
      this.V = vocabulary.size(); // cardinality of vocabulary
      setAllContexts(); // create HashMap for all observed positive contexts for each word
      if (sampleUnigram) {
        randomContextGeneration(); // create weighted random sampler for noise distribution
      } else {
        noiseSampler = null;
      }
    }
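
For orientation, a hedged construction sketch follows. Every argument value is an assumption chosen to resemble common skip-gram-with-negative-sampling settings (for instance, power = 0.75 mirrors the word2vec convention of drawing noise words from the unigram distribution raised to the 0.75 power); none of the values are taken from the original source.

    WordSim model = new WordSim(
        "data/sentences.txt", // dataPath: hypothetical training corpus
        5,     // kSamples: negative samples per positive example (assumed)
        100,   // dimensions: embedding dimensionality (assumed)
        2,     // contextSize: context window radius (assumed)
        0.75,  // power: exponent on unigram counts for the noise distribution (assumed)
        0.025, // alpha: initial learning rate (assumed)
        1e-4,  // min_eta: floor on the decayed learning rate (assumed)
        0.01,  // sigma: scale of the random weight initialization (assumed)
        5,     // epochs: passes over the sentence collection (assumed)
        true,  // skipGram: use the skip-gram objective (assumed)
        true,  // negativeSampling: train with negative sampling (assumed)
        true,  // sampleUnigram: build the weighted unigram noise sampler
        0.9);  // learningDecay: learning-rate decay factor (assumed)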