Ejemplo n.º 1
0
    /**
     * Draws {@code kSamples} distinct negative context indices for a word from a uniform
     * distribution over {@code [0, V)}.
     *
     * <p>A candidate index is accepted when it is not one of the positive contexts recorded for
     * {@code wordIndex} in {@code contextPairs}. Sampling repeats until {@code kSamples} distinct
     * indices have been collected, so the loop does not terminate if fewer than {@code kSamples}
     * acceptable indices exist.
     *
     * @param wordIndex index of the word whose positive contexts must be excluded
     * @return a set of {@code kSamples} distinct indices, none of which is a positive context of
     *     {@code wordIndex}
     */
    private Set<Integer> negativeSampleContexts(int wordIndex) {
      Set<Integer> negativeContexts = new HashSet<Integer>();
      Set<Integer> positiveContexts = contextPairs.get(wordIndex);

      while (negativeContexts.size() < kSamples) {
        int contextIndex = (int) (Math.random() * V);
        // Only positive contexts need filtering: Set.add already ignores duplicates. The previous
        // condition required the index to be present already, so nothing was ever added and the
        // loop could not finish.
        if (!positiveContexts.contains(contextIndex)) {
          negativeContexts.add(contextIndex);
        }
      }
      return negativeContexts;
    }
Ejemplo n.º 2
0
    /**
     * Draws {@code kSamples} distinct negative context indices for a word using a weighted sampler.
     *
     * <p>Per the original note: when {@code sampleUnigram} is true, the noise sampler built by
     * {@code randomContextGeneration} models the unigram probability distribution raised to the
     * specified power (default 3/4) and is used with this method; otherwise the overloaded
     * {@code negativeSampleContexts(int wordIndex)} draws from a uniform distribution instead.
     *
     * <p>Each sampled word is translated to its index through {@code encodedVocab} and kept only
     * if it is not a positive context of {@code wordIndex}. Sampling repeats until
     * {@code kSamples} distinct indices have been gathered.
     *
     * @param wordIndex index of the word whose positive contexts must be excluded
     * @param weightedRandomSample sampler that yields candidate context words
     * @return a set of {@code kSamples} distinct indices, none of which is a positive context of
     *     {@code wordIndex}
     * @author jacqueline
     */
    private Set<Integer> negativeSampleContexts(
        int wordIndex, EnumeratedDistribution<String> weightedRandomSample) {
      final Set<Integer> observed = contextPairs.get(wordIndex);
      final Set<Integer> drawn = new HashSet<Integer>();

      while (drawn.size() < kSamples) {
        int candidate = encodedVocab.get(weightedRandomSample.sample());
        if (observed.contains(candidate)) {
          continue; // a true context of this word cannot serve as a negative sample
        }
        // Adding to a set is a no-op for an index that was already drawn.
        drawn.add(candidate);
      }
      return drawn;
    }
Ejemplo n.º 3
0
    /**
     * Builds a {@code WordSim} instance: reads the sentence collection, stores the supplied
     * settings, and prepares the vocabulary, positive contexts and (optionally) the noise sampler.
     *
     * @param dataPath path handed to {@code SentenceCollection.Reader.readSentenceCollection}
     * @param kSamples number of negative contexts drawn per word
     * @param dimensions stored as-is (presumably the embedding size; confirm against usage)
     * @param contextSize stored as-is (presumably the context window size; confirm against usage)
     * @param power stored as-is; the negative-sampling Javadoc in this file describes the unigram
     *     distribution as being raised to a power
     * @param alpha stored as-is
     * @param min_eta stored as-is
     * @param sigma stored as-is
     * @param epochs stored as-is (presumably the number of training passes; confirm against usage)
     * @param skipGram stored as-is
     * @param negativeSampling stored as-is
     * @param sampleUnigram when true, {@code randomContextGeneration()} is invoked to set up the
     *     weighted noise sampler; when false, {@code noiseSampler} is set to {@code null}
     * @param learningDecay stored as-is
     */
    public WordSim(
        String dataPath,
        int kSamples,
        int dimensions,
        int contextSize,
        double power,
        double alpha,
        double min_eta,
        double sigma,
        int epochs,
        boolean skipGram,
        boolean negativeSampling,
        boolean sampleUnigram,
        double learningDecay) {
      // Corpus first, then the plain configuration values.
      this.sentenceCollection = SentenceCollection.Reader.readSentenceCollection(dataPath);

      this.kSamples = kSamples;
      this.dimensions = dimensions;
      this.contextSize = contextSize;
      this.power = power;
      this.alpha = alpha;
      this.min_eta = min_eta;
      this.sigma = sigma;
      this.epochs = epochs;
      this.skipGram = skipGram;
      this.negativeSampling = negativeSampling;
      this.sampleUnigram = sampleUnigram;
      this.learningDecay = learningDecay;

      // Derived state. The call order below is kept deliberately: the helpers may depend on
      // fields initialised by the preceding steps.
      this.vocabulary = LanguageModelTester.extractVocabulary(sentenceCollection);
      // create one-hot encoding index for all words in vocabulary
      encodeVocabulary();
      // cardinality of vocabulary
      this.V = vocabulary.size();
      // create HashMap for all observed positive contexts for each word
      setAllContexts();

      if (sampleUnigram) {
        // create weighted random sampler for noise distribution
        randomContextGeneration();
      } else {
        noiseSampler = null;
      }
    }