public InMemoryLookupCache(boolean addUnk) { if (addUnk) { VocabWord word = new VocabWord(1.0, Word2Vec.UNK); word.setIndex(0); addToken(word); addWordToIndex(0, Word2Vec.UNK); putVocabWord(Word2Vec.UNK); } }
private static void addTokenToVocabCache(InMemoryLookupCache vocab, String stringToken) { // Making string token into actual token if not already an actual token (vocabWord) VocabWord actualToken; if (vocab.hasToken(stringToken)) { actualToken = vocab.tokenFor(stringToken); } else { actualToken = new VocabWord(1, stringToken); } // Set the index of the actual token (vocabWord) // Put vocabWord into vocabs in InMemoryVocabCache boolean vocabContainsWord = vocab.containsWord(stringToken); if (!vocabContainsWord) { vocab.addToken(actualToken); int idx = vocab.numWords(); actualToken.setIndex(idx); vocab.putVocabWord(stringToken); } }
/**
 * Builds an in-memory lookup cache from a stream holding one word per line.
 *
 * <p>Empty lines are skipped. Every other line is registered as a word, with
 * indices assigned consecutively from {@code 0} in the order the lines are
 * read. For each such line the cache's word count is incremented, a
 * {@link VocabWord} (constructed with {@code 1.0} as its first argument) is
 * added as a token, the word is added to the index, and it is promoted to a
 * vocab word.
 *
 * <p>The stream is decoded with the JVM's default charset because no charset
 * is passed to the {@link InputStreamReader}. This method does not close
 * {@code from}.
 *
 * @param from the input stream to read from
 * @return the in memory lookup cache
 */
public static InMemoryLookupCache load(InputStream from) {
    LineIterator lines = IOUtils.lineIterator(new InputStreamReader(from));
    InMemoryLookupCache cache = new InMemoryLookupCache();

    int nextIndex = 0;
    while (lines.hasNext()) {
        String entry = lines.nextLine();
        if (!entry.isEmpty()) {
            cache.incrementWordCount(entry);

            VocabWord token = new VocabWord(1.0, entry);
            token.setIndex(nextIndex);

            cache.addToken(token);
            cache.addWordToIndex(nextIndex, entry);
            cache.putVocabWord(entry);

            // Only lines that were actually registered consume an index.
            nextIndex++;
        }
    }
    return cache;
}