/**
 * Transforms the given text into a bag-of-words row vector over the current vocab.
 * Each column holds the word-count weight of the corresponding vocab word; tokens
 * not present in the vocab are silently ignored.
 *
 * @param text the text to transform
 * @return a 1 x vocab.size() matrix of word weights
 */
@Override
public DoubleMatrix transform(String text) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (String token : tokens) {
        // Look each token up once; the original called vocab.indexOf twice per token.
        int idx = vocab.indexOf(token);
        if (idx >= 0)
            input.put(idx, wordCounts.getCount(token));
    }
    return input;
}
/**
 * Vectorizes the passed in text treating it as one document.
 *
 * @param text the text to vectorize
 * @param label the label of the text
 * @return a dataset with a applyTransformToDestination of weights(relative to impl; could be word
 * counts or tfidf scores)
 */
@Override
public DataSet vectorize(String text, String label) {
    // The feature row is exactly the bag-of-words transform of the text; the
    // original duplicated transform()'s body here verbatim.
    DoubleMatrix input = transform(text);
    // One-hot outcome vector for the label's position in the label list.
    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
}
/**
 * Creates an index of the top size words based on tf-idf metrics.
 *
 * Scores every word seen so far by tf-idf (skipping stop words and pure-punctuation
 * tokens), keeps the highest-ranked keys, and builds an Index from them. The result
 * is also cached in {@code currVocab} and the scores in {@code wordScores}.
 *
 * @param size the number of words in the vocab
 * @return the index of the words
 * @throws IOException
 */
public Index createVocab(int size) {
    Index vocab = new Index();
    // bootstrapping: populate the frequency counters before scoring
    calcWordFrequencies();
    // term frequency has every word
    for (String word : tf.keySet()) {
        // NOTE(review): tf is taken from documentWordFrequencies while iterating the
        // keys of the tf counter — confirm these two counters are meant to differ.
        double tfVal = MathUtils.tf((int) documentWordFrequencies.getCount(word));
        double idfVal = MathUtils.idf(numFiles, idf.getCount(word));
        double tfidfVal = MathUtils.tfidf(tfVal, idfVal);
        java.util.regex.Matcher m = punct.matcher(word);
        // drop stop words and tokens that are entirely punctuation
        if (!stopWords.contains(word) && !m.matches())
            tfidf.setCount(word, tfidfVal);
    }
    Counter<String> aggregate = tfidf;
    // keep top size keys via tfidf rankings
    // NOTE(review): keepTopNKeys(size - 1) retains one fewer than `size` entries —
    // confirm whether a slot is deliberately reserved (e.g. for an UNK token).
    aggregate.keepTopNKeys(size - 1);
    log.info("Created vocab of size " + aggregate.size());
    wordScores = aggregate;
    // add words that made it via rankings
    for (String word : aggregate.keySet()) {
        if (vocab.indexOf(word) < 0)
            vocab.add(word);
    }
    // cache the vocab
    currVocab = vocab;
    return vocab;
}
/**
 * Drains the sentence iterator, tokenizing each sentence and recording every
 * non-stop-word token: its count is incremented in {@code wordCounts} and it is
 * appended to {@code vocab} the first time it is seen.
 */
private void process() {
    while (sentenceIter.hasNext()) {
        List<String> sentenceTokens =
                tokenizerFactory.create(sentenceIter.nextSentence()).getTokens();
        for (String word : sentenceTokens) {
            // skip stop words entirely — neither counted nor added to the vocab
            if (stopWords.contains(word)) {
                continue;
            }
            wordCounts.incrementCount(word, 1.0);
            if (vocab.indexOf(word) < 0) {
                vocab.add(word);
            }
        }
    }
}
/**
 * Returns the index of a given word.
 *
 * Synchronized so concurrent lookups see a consistent view of {@code wordIndex}.
 *
 * @param word the word to look up
 * @return the index of the given word, or -1 if not found
 */
@Override
public synchronized int indexOf(String word) {
    return wordIndex.indexOf(word);
}
/**
 * Returns the index of a given word.
 *
 * NOTE(review): unlike the synchronized variant of this method elsewhere in the
 * codebase, this lookup is unsynchronized — confirm {@code wordIndex} is either
 * thread-safe or only accessed from a single thread.
 *
 * @param word the word to look up
 * @return the index of the given word, or -1 if not found
 */
@Override
public int indexOf(String word) {
    return wordIndex.indexOf(word);
}