/**
 * Adds a word at the given index, registering it as both a token and a
 * vocab word if it is not already present.
 *
 * @param index the index to assign to the word
 * @param word  the word to add
 * @throws IllegalArgumentException if {@code word} is null or empty
 */
@Override
public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty())
        throw new IllegalArgumentException("Word can't be empty or null");

    // First sighting: create the token and seed its frequency.
    if (!tokens.containsKey(word)) {
        VocabWord token = new VocabWord(1.0, word);
        tokens.put(word, token);
        wordFrequencies.incrementCount(word, 1.0);
    }

    /*
        If we're speaking about adding any word to index directly,
        it means it's going to be vocab word, not token
     */
    if (!vocabs.containsKey(word)) {
        VocabWord vw = tokenFor(word);
        vw.setIndex(index);
        vocabs.put(word, vw);
        // FIX: removed a duplicate vw.setIndex(index) that immediately
        // repeated the call above — it was a pure no-op.
    }

    // The token may have been created elsewhere without a frequency entry,
    // so this check is not redundant with the branch above.
    if (!wordFrequencies.containsKey(word))
        wordFrequencies.incrementCount(word, 1);

    wordIndex.add(word, index);
}
/**
 * Hash code combining the cache's word index, frequency counters,
 * vocab/token maps, occurrence counter and document count; consistent
 * with {@link #equals(Object)}.
 */
@Override
public int hashCode() {
    int h = (wordIndex == null) ? 0 : wordIndex.hashCode();
    h = 31 * h + ((wordFrequencies == null) ? 0 : wordFrequencies.hashCode());
    h = 31 * h + ((docFrequencies == null) ? 0 : docFrequencies.hashCode());
    h = 31 * h + ((vocabs == null) ? 0 : vocabs.hashCode());
    h = 31 * h + ((tokens == null) ? 0 : tokens.hashCode());
    h = 31 * h + ((totalWordOccurrences == null) ? 0 : totalWordOccurrences.hashCode());
    h = 31 * h + numDocs;
    return h;
}
/**
 * Builds a 1 x vocab-size row vector of scores for the words of the given
 * file that appear in the current vocab.
 *
 * @param file the document to score
 * @return a row vector with one column per vocab word; columns for vocab
 *         words absent from the document are left at 0
 * @throws IllegalStateException if the file cannot be read
 */
public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    LineIterator iter = null;
    try {
        iter = FileUtils.lineIterator(file);
        while (iter.hasNext()) {
            Tokenizer t = tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
            while (t.hasMoreTokens()) {
                docWords.incrementCount(t.nextToken(), 1.0);
            }
        }
    } catch (IOException e) {
        throw new IllegalStateException("Unable to read file", e);
    } finally {
        // FIX: close in finally so the underlying reader is released even
        // when reading/tokenizing throws (was leaked on exception before).
        LineIterator.closeQuietly(iter);
    }

    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());
    for (int i = 0; i < currVocab.size(); i++) {
        if (docWords.getCount(currVocab.get(i).toString()) > 0) {
            ret.put(i, wordScores.getCount(currVocab.get(i).toString()));
        }
    }
    return ret;
}
/** * Creates an index of the top size words based on tf-idf metrics * * @param size the number of words in the vocab * @return the index of the words * @throws IOException */ public Index createVocab(int size) { Index vocab = new Index(); // bootstrapping calcWordFrequencies(); // term frequency has every word for (String word : tf.keySet()) { double tfVal = MathUtils.tf((int) documentWordFrequencies.getCount(word)); double idfVal = MathUtils.idf(numFiles, idf.getCount(word)); double tfidfVal = MathUtils.tfidf(tfVal, idfVal); java.util.regex.Matcher m = punct.matcher(word); if (!stopWords.contains(word) && !m.matches()) tfidf.setCount(word, tfidfVal); } Counter<String> aggregate = tfidf; // keep top size keys via tfidf rankings aggregate.keepTopNKeys(size - 1); log.info("Created vocab of size " + aggregate.size()); wordScores = aggregate; // add words that made it via rankings for (String word : aggregate.keySet()) { if (vocab.indexOf(word) < 0) vocab.add(word); } // cache the vocab currVocab = vocab; return vocab; }
/**
 * Transforms text into a 1 x vocab-size row vector of word counts.
 *
 * @param text the text to transform
 * @return a row vector whose columns correspond to vocab entries; columns
 *         for out-of-vocab tokens remain 0
 */
@Override
public DoubleMatrix transform(String text) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (int i = 0; i < tokens.size(); i++) {
        String token = tokens.get(i);
        // FIX: reuse the looked-up index instead of calling
        // vocab.indexOf(...) a second time for the containment test.
        int idx = vocab.indexOf(token);
        if (idx >= 0)
            input.put(idx, wordCounts.getCount(token));
    }
    return input;
}
/**
 * Consumes every sentence from the iterator, counting non-stop-word
 * tokens and adding unseen ones to the vocab.
 */
private void process() {
    while (sentenceIter.hasNext()) {
        Tokenizer tokenizer = tokenizerFactory.create(sentenceIter.nextSentence());
        for (String token : tokenizer.getTokens()) {
            // Skip stop words entirely — neither counted nor indexed.
            if (stopWords.contains(token))
                continue;
            wordCounts.incrementCount(token, 1.0);
            if (vocab.indexOf(token) < 0)
                vocab.add(token);
        }
    }
}
/**
 * Vectorizes the passed in text treating it as one document.
 *
 * @param text  the text to vectorize
 * @param label the label of the text
 * @return a dataset pairing the word-weight row vector (relative to impl;
 *         could be word counts or tfidf scores) with the label's outcome
 *         vector
 */
@Override
public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (int i = 0; i < tokens.size(); i++) {
        String token = tokens.get(i);
        // FIX: reuse the looked-up index instead of calling
        // vocab.indexOf(...) twice per token.
        int idx = vocab.indexOf(token);
        if (idx >= 0)
            input.put(idx, wordCounts.getCount(token));
    }
    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
}
/** * Loads the google binary model Credit to: * https://github.com/NLPchina/Word2VEC_java/blob/master/src/com/ansj/vec/Word2VEC.java * * @param path path to model * @throws IOException */ public static Word2Vec loadGoogleModel(String path) throws IOException { DataInputStream dis = null; BufferedInputStream bis = null; double len = 0; float vector = 0; Word2Vec ret = new Word2Vec(); Index wordIndex = new Index(); FloatMatrix wordVectors = null; try { bis = new BufferedInputStream( path.endsWith(".gz") ? new GZIPInputStream(new FileInputStream(path)) : new FileInputStream(path)); dis = new DataInputStream(bis); Map<String, FloatMatrix> wordMap = new HashMap<>(); // number of words int words = Integer.parseInt(readString(dis)); // word vector size int size = Integer.parseInt(readString(dis)); wordVectors = new FloatMatrix(words, size); String word; float[] vectors = null; for (int i = 0; i < words; i++) { word = readString(dis); log.info("Loaded " + word); vectors = new float[size]; len = 0; for (int j = 0; j < size; j++) { vector = readFloat(dis); len += vector * vector; vectors[j] = vector; } len = Math.sqrt(len); for (int j = 0; j < size; j++) { vectors[j] /= len; } wordIndex.add(word); wordVectors.putRow(i, new FloatMatrix(vectors)); } } finally { bis.close(); dis.close(); } ret.setWordIndex(wordIndex); ret.setSyn0(wordVectors); return ret; }
/**
 * Adds a word at the given index, seeding a frequency entry on first
 * sight.
 *
 * @param index the index to assign to the word
 * @param word  the word to add
 * @throws IllegalArgumentException if {@code word} is null or empty
 */
@Override
public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty())
        throw new IllegalArgumentException("Word can't be empty or null");
    boolean alreadyCounted = wordFrequencies.containsKey(word);
    if (!alreadyCounted)
        wordFrequencies.incrementCount(word, 1);
    wordIndex.add(word, index);
}
/** @param word */ @Override public synchronized void putVocabWord(String word) { if (word == null || word.isEmpty()) throw new IllegalArgumentException("Word can't be empty or null"); // STOP and UNK are not added as tokens if (word.equals("STOP") || word.equals("UNK")) return; VocabWord token = tokenFor(word); if (token == null) throw new IllegalStateException("Word " + word + " not found as token in vocab"); int ind = token.getIndex(); addWordToIndex(ind, word); if (!hasToken(word)) throw new IllegalStateException("Unable to add token " + word + " when not already a token"); vocabs.put(word, token); wordIndex.add(word, token.getIndex()); }
/**
 * Equality based on document count, word index, frequency counters and
 * vocab contents; consistent with {@link #hashCode()}.
 *
 * @param o the object to compare against
 * @return true if both caches hold equal state
 */
@Override
public boolean equals(Object o) {
    if (this == o)
        return true;
    if (o == null || getClass() != o.getClass())
        return false;

    InMemoryLookupCache that = (InMemoryLookupCache) o;

    if (numDocs != that.numDocs)
        return false;
    if (wordIndex != null ? !wordIndex.equals(that.wordIndex) : that.wordIndex != null)
        return false;
    if (wordFrequencies != null ? !wordFrequencies.equals(that.wordFrequencies)
            : that.wordFrequencies != null)
        return false;
    if (docFrequencies != null ? !docFrequencies.equals(that.docFrequencies)
            : that.docFrequencies != null)
        return false;
    // FIX: the original ended with `if (...) return true; return true;`,
    // so unequal vocabularies still compared equal. The result must follow
    // the vocab comparison.
    return vocabWords().equals(that.vocabWords());
}
/**
 * Returns the index of a given word.
 *
 * @param word the word to look up
 * @return the index of the word, or -1 if not found
 */
@Override
public synchronized int indexOf(String word) {
    final int position = wordIndex.indexOf(word);
    return position;
}
/**
 * Returns the word contained at the given index or null.
 *
 * @param index the index of the word to get
 * @return the word at the given index
 */
@Override
public synchronized String wordAtIndex(int index) {
    Object stored = wordIndex.get(index);
    return (String) stored;
}
/**
 * Returns the index of a given word.
 *
 * @param word the word to look up
 * @return the index of the word, or -1 if not found
 */
@Override
public int indexOf(String word) {
    final int idx = wordIndex.indexOf(word);
    return idx;
}
/**
 * Returns the word contained at the given index or null.
 *
 * @param index the index of the word to get
 * @return the word at the given index
 */
@Override
public String wordAtIndex(int index) {
    Object entry = wordIndex.get(index);
    return (String) entry;
}
/**
 * Adds a word at the given index, creating frequency and vocab entries on
 * first sight.
 *
 * NOTE(review): unlike the other addWordToIndex implementations, this one
 * does not reject null/empty words — adding that guard would introduce a
 * new throw on existing callers, so it is only flagged here.
 *
 * @param index the index to assign to the word
 * @param word  the word to add
 */
@Override
public void addWordToIndex(int index, String word) {
    if (!wordFrequencies.containsKey(word))
        wordFrequencies.incrementCount(word, 1);
    if (!vocabs.containsKey(word))
        vocabs.put(word, new VocabWord(1, vectorLength));
    // FIX: pass the supplied index through — the original called
    // wordIndex.add(word), silently discarding the index parameter,
    // inconsistent with the sibling implementations that use
    // wordIndex.add(word, index).
    wordIndex.add(word, index);
}