public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    try {
        LineIterator iter = FileUtils.lineIterator(file);
        while (iter.hasNext()) {
            Tokenizer t = tokenizerFactory.create(new InputHomogenization(iter.nextLine()).transform());
            while (t.hasMoreTokens()) {
                docWords.incrementCount(t.nextToken(), 1.0);
            }
        }
        iter.close();
    } catch (IOException e) {
        throw new IllegalStateException("Unable to read file", e);
    }

    // One row vector over the current vocabulary; each entry holds the
    // precomputed score for a vocab word that actually occurs in the document.
    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());
    for (int i = 0; i < currVocab.size(); i++) {
        String word = currVocab.get(i).toString();
        if (docWords.getCount(word) > 0) {
            ret.put(i, wordScores.getCount(word));
        }
    }
    return ret;
}
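/*
 * A minimal usage sketch (an assumption, not from the original source): once the
 * vectorizer has been fitted so that currVocab and wordScores are populated, two
 * documents can be compared through their score vectors. The method name and the
 * idea of cosine similarity are illustrative additions, not part of the original code.
 */
private double documentSimilarity(File a, File b) {
    DoubleMatrix va = getScoreMatrix(a);
    DoubleMatrix vb = getScoreMatrix(b);
    double norms = va.norm2() * vb.norm2();
    // Cosine similarity of the two score vectors; 0 if either document
    // contains no vocabulary words.
    return norms == 0 ? 0.0 : va.dot(vb) / norms;
}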
/**
 * Transforms the given text into a word-count vector over the learned vocabulary.
 *
 * @param text the text to transform
 * @return a 1 x vocab-size row vector with an entry for each vocabulary word found in the text
 */
@Override
public DoubleMatrix transform(String text) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (int i = 0; i < tokens.size(); i++) {
        int idx = vocab.indexOf(tokens.get(i));
        // Skip tokens that never made it into the vocabulary.
        if (idx >= 0)
            input.put(idx, wordCounts.getCount(tokens.get(i)));
    }
    return input;
}
private void process() {
    while (sentenceIter.hasNext()) {
        Tokenizer tokenizer = tokenizerFactory.create(sentenceIter.nextSentence());
        List<String> tokens = tokenizer.getTokens();
        for (String token : tokens) {
            if (!stopWords.contains(token)) {
                // Track corpus-wide counts and grow the vocabulary as new words appear.
                wordCounts.incrementCount(token, 1.0);
                if (vocab.indexOf(token) < 0)
                    vocab.add(token);
            }
        }
    }
}
/**
 * Vectorizes the passed-in text, treating it as one document.
 *
 * @param text  the text to vectorize
 * @param label the label of the text
 * @return a data set pairing a vector of per-word weights (relative to the implementation;
 *         could be word counts or tf-idf scores) with a one-hot label vector
 */
@Override
public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (int i = 0; i < tokens.size(); i++) {
        int idx = vocab.indexOf(tokens.get(i));
        // Only vocabulary words contribute to the feature vector.
        if (idx >= 0)
            input.put(idx, wordCounts.getCount(tokens.get(i)));
    }
    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
}
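/*
 * Usage sketch (an assumption, not from the original source): building one labeled
 * training example once the vocabulary and label list are known. The text and label
 * literals are hypothetical, and the label must already be present in the labels
 * list for the one-hot index to be valid.
 */
private DataSet labeledExample() {
    return vectorize("the quick brown fox jumped over the lazy dog", "animals");
}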
protected void addForDoc(File doc) {
    Set<String> encountered = new HashSet<String>();
    SentenceIterator iter = new LineSentenceIterator(doc);
    while (iter.hasNext()) {
        String line = iter.nextSentence();
        if (line == null)
            continue;
        Tokenizer tokenizer = tokenizerFactory.create(new InputHomogenization(line).transform());
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (validWord(token)) {
                // Per-document frequency, corpus-wide term frequency, and (via the
                // encountered set) document frequency for idf: each word is counted
                // at most once per document toward idf.
                documentWordFrequencies.incrementCount(token, doc.getAbsolutePath(), 1.0);
                tf.incrementCount(token, 1.0);
                if (!encountered.contains(token)) {
                    idf.incrementCount(token, 1.0);
                    encountered.add(token);
                }
            }
        }
    }
    // Close the iterator once the whole document has been processed.
    iter.finish();
}
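/*
 * Sketch of how the tf and idf counters maintained above are typically combined
 * into a per-word tf-idf score (an assumption about the scoring step, not code
 * from the original source; numDocs stands for the total number of documents seen).
 */
private double tfidf(String word, double numDocs) {
    double termFrequency = tf.getCount(word);   // total occurrences across the corpus
    double docFrequency = idf.getCount(word);   // number of documents containing the word
    if (docFrequency <= 0)
        return 0.0;
    return termFrequency * Math.log(numDocs / docFrequency);
}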