Example #1
    private void setAllContexts() {
      // Map each word index in the vocabulary to the set of word indices
      // observed in its context windows.
      this.contextPairs = new HashMap<>();
      for (int wordIndex : encodedVocab.values()) {
        contextPairs.put(wordIndex, new HashSet<>());
      }

      // Scan every position of every sentence and accumulate the context
      // word indices around each center word.
      for (List<String> sentence : sentenceCollection) {
        for (int wordPosition = 0; wordPosition < sentence.size(); wordPosition++) {
          Pair<Integer, Set<Integer>> wordPlusContext = getWordContextPair(sentence, wordPosition);
          int wordIndex = wordPlusContext.getFirst();
          contextPairs.get(wordIndex).addAll(wordPlusContext.getSecond());
        }
      }
    }
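
The helper getWordContextPair is not defined in this snippet. A minimal sketch of what it might look like, assuming the surrounding class has an encodedVocab map (Map<String, Integer>), an int contextSize field, and the same Pair class used above:

    // Hypothetical sketch: return the encoded index of the word at
    // wordPosition together with the encoded indices of the words inside
    // a +/- contextSize window around it.
    private Pair<Integer, Set<Integer>> getWordContextPair(List<String> sentence, int wordPosition) {
      int wordIndex = encodedVocab.get(sentence.get(wordPosition));
      Set<Integer> context = new HashSet<>();
      int start = Math.max(0, wordPosition - contextSize);
      int end = Math.min(sentence.size() - 1, wordPosition + contextSize);
      for (int i = start; i <= end; i++) {
        if (i != wordPosition) {
          context.add(encodedVocab.get(sentence.get(i)));
        }
      }
      return new Pair<>(wordIndex, context);
    }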
Example #2
  public static void main(String[] args) throws Exception {
    // Parse command line flags and arguments.
    Map<String, String> argMap = CommandLineUtils.simpleCommandLineParser(args);

    // Read commandline parameters.
    String embeddingPath = "";
    if (!argMap.containsKey("-embeddings")) {
      System.out.println("-embeddings flag required.");
      System.exit(1);
    } else {
      embeddingPath = argMap.get("-embeddings");
    }

    String wordSimPath = "";
    if (!argMap.containsKey("-wordsim")) {
      System.out.println("-wordsim flag required.");
      System.exit(1);
    } else {
      wordSimPath = argMap.get("-wordsim");
    }

    // Read in the labeled similarities and generate the target vocabulary.
    System.out.println("Loading wordsim353 ...");
    List<Pair<Pair<String, String>, Float>> wordSimPairs = readWordSimPairs(wordSimPath);
    Set<String> targetVocab = getWordSimVocab(wordSimPath);

    // It is likely that you will want to generate your embeddings
    // elsewhere. But this supports the option to generate the embeddings
    // and evaluate them in a single run.
    HashMap<String, float[]> embeddings;
    if (argMap.containsKey("-trainandeval")) {
      // Get some training data.
      String dataPath = "";
      if (!argMap.containsKey("-trainingdata")) {
        System.out.println("-trainingdata flag required with -trainandeval");
        System.exit(1);
      } else {
        dataPath = argMap.get("-trainingdata");
      }

      // Since this simple approach does not do dimensionality reduction
      // on the co-occurrence vectors, we instead control the size of the
      // vectors by only counting co-occurrence with core WordNet senses.
      String wordNetPath = "";
      if (!argMap.containsKey("-wordnetdata")) {
        System.out.println("-wordnetdata flag required with -trainandeval");
        System.exit(1);
      } else {
        wordNetPath = argMap.get("-wordnetdata");
      }
      // HashMap<String, Integer> contentWordVocab = getWordNetVocab(wordNetPath);

      System.out.println("Training embeddings on " + dataPath + " ...");
      // embeddings = getEmbeddings(dataPath, contentWordVocab, targetVocab);
      int kSamples = 5;     // negative samples drawn per positive (center, context) pair
      int dimensions = 100; // length of each embedding vector
      int contextSize = 2;  // context window radius on each side of the center word

      WordSim skipgram = new WordSim(dataPath, kSamples, dimensions, contextSize);
      embeddings = skipgram.getEmbeddings(targetVocab);

      // Keep only the words that are needed.
      System.out.println("Writing embeddings to " + embeddingPath + " ...");
      // embeddings = reduceVocabulary(embeddings, targetVocab);
      // writeEmbeddings(embeddings, embeddingPath, contentVocab.size());
      writeEmbeddings(embeddings, embeddingPath, dimensions);
    } else {
      // Read in embeddings.
      System.out.println("Loading embeddings ...");
      embeddings = readEmbeddings(embeddingPath);

      // Keep only the words that are needed.
      System.out.println(
          "Writing reduced vocabulary embeddings to " + embeddingPath + ".reduced ...");
      embeddings = reduceVocabulary(embeddings, targetVocab);
      writeEmbeddings(
          embeddings, embeddingPath + ".reduced", embeddings.values().iterator().next().length);
    }

    // Keep only the words needed for evaluation.
    embeddings = reduceVocabulary(embeddings, targetVocab);

    double score = spearmansScore(wordSimPairs, embeddings);
    System.out.println("Score is " + score);
  }
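
For reference, spearmansScore ranks the model's similarity scores against the human wordsim353 ratings and computes Spearman's rho = 1 - 6 * sum(d_i^2) / (n * (n^2 - 1)). A minimal sketch under the assumption that pair similarity is cosine similarity and that ranks have no ties (the real helper may average tied ranks and handle missing words differently; needs java.util.*):

  static double spearmansScore(
      List<Pair<Pair<String, String>, Float>> wordSimPairs,
      Map<String, float[]> embeddings) {
    List<Double> predicted = new ArrayList<>();
    List<Double> gold = new ArrayList<>();
    for (Pair<Pair<String, String>, Float> pair : wordSimPairs) {
      float[] v1 = embeddings.get(pair.getFirst().getFirst());
      float[] v2 = embeddings.get(pair.getFirst().getSecond());
      if (v1 == null || v2 == null) continue; // skip out-of-vocabulary pairs
      predicted.add(cosine(v1, v2));
      gold.add((double) pair.getSecond());
    }
    double[] predRanks = ranks(predicted);
    double[] goldRanks = ranks(gold);
    double n = predRanks.length;
    double sumSquaredDiff = 0;
    for (int i = 0; i < predRanks.length; i++) {
      double d = predRanks[i] - goldRanks[i];
      sumSquaredDiff += d * d;
    }
    return 1 - 6 * sumSquaredDiff / (n * (n * n - 1));
  }

  static double cosine(float[] a, float[] b) {
    double dot = 0, normA = 0, normB = 0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  // 1-based ascending ranks; ties get arbitrary consecutive ranks here.
  static double[] ranks(List<Double> values) {
    Integer[] order = new Integer[values.size()];
    for (int i = 0; i < order.length; i++) order[i] = i;
    Arrays.sort(order, (a, b) -> Double.compare(values.get(a), values.get(b)));
    double[] result = new double[values.size()];
    for (int i = 0; i < order.length; i++) result[order[i]] = i + 1;
    return result;
  }

With that in place, an evaluation-only run would look like java <MainClass> -embeddings vectors.txt -wordsim wordsim353.csv, where the class and file names are placeholders.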