/**
 * Converts raw text into a single-row vector laid out over the vocabulary cache.
 *
 * <p>The text is tokenized with {@code tokenizerFactory}. For every token for which
 * {@code cache.indexOf(token)} returns a non-negative index, the value of
 * {@code cache.wordFrequency(token)} is written at that index. Tokens with a negative
 * index are skipped; a token that occurs more than once simply rewrites the same cell.
 *
 * @param text the text to transform
 * @return an array created as {@code Nd4j.create(1, cache.numWords())} holding the cached
 *     frequency of each known token found in {@code text}
 */
@Override
public INDArray transform(String text) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    INDArray input = Nd4j.create(1, cache.numWords());
    for (String token : tokens) {
        // Look the token up once and reuse the index for both the check and the write.
        int idx = cache.indexOf(token);
        if (idx >= 0) {
            input.putScalar(idx, cache.wordFrequency(token));
        }
    }
    return input;
}
/**
 * Vectorizes the passed-in text, treating it as one document, and pairs it with its label.
 *
 * <p>The text is tokenized with {@code tokenizerFactory}. For every token for which
 * {@code cache.indexOf(token)} returns a non-negative index, the value of
 * {@code cache.wordFrequency(token)} is written at that index of the feature row; tokens
 * with a negative index are skipped. The label is encoded with
 * {@code FeatureUtil.toOutcomeVector} using the label's position in {@code labels} and
 * {@code labels.size()} as the number of outcomes.
 *
 * @param text the text to vectorize
 * @param label the label of the text
 * @return a dataset whose features are the per-token weights (here the frequencies stored
 *     in the cache) and whose labels are the outcome vector for {@code label}
 */
@Override
public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    INDArray input = Nd4j.create(1, cache.numWords());
    for (String token : tokens) {
        // Look the token up once and reuse the index for both the check and the write.
        int idx = cache.indexOf(token);
        if (idx >= 0) {
            input.putScalar(idx, cache.wordFrequency(token));
        }
    }

    // NOTE(review): the result of labels.indexOf(label) is passed on unchecked; presumably
    // it is negative for a label that is not in labels -- confirm how toOutcomeVector
    // behaves in that case.
    INDArray labelMatrix = FeatureUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
}