/**
 * Creates a vocab based on tf-idf.
 *
 * <p>The stop-word list is not supplied by the caller; it is taken from
 * {@link StopWords#getStopWords()}. Construction also creates an actor system
 * named {@code "WordFrequencySystem"}.
 *
 * @param rootDir the root directory to train on
 * @param tokenizerFactory the tokenizer factory to use
 */
public VocabCreator(File rootDir, TokenizerFactory tokenizerFactory) {
    // Stop words are loaded first so that the actor system is only created
    // once the stop-word lookup has completed.
    stopWords = StopWords.getStopWords();

    this.rootDir = rootDir;
    this.tokenizerFactory = tokenizerFactory;

    system = ActorSystem.create("WordFrequencySystem");
}
/**
 * Converts a document into a bag of words.
 *
 * <p>Besides storing the supplied arguments, construction starts with an empty
 * vocabulary index and an empty word counter, and takes the stop-word list from
 * {@link StopWords#getStopWords()}.
 *
 * @param sentenceIterator the sentence iterator to use; this handles segmenting
 *     the document into whole segments
 * @param tokenizerFactory the tokenizer to use
 * @param labels the possible labels for each document
 * @param vocabSize the max size of vocab
 */
public BagOfWordsVectorizer(
        LabelAwareSentenceIterator sentenceIterator,
        TokenizerFactory tokenizerFactory,
        List<String> labels,
        int vocabSize) {
    // Caller-provided configuration.
    this.sentenceIter = sentenceIterator;
    this.tokenizerFactory = tokenizerFactory;
    this.labels = labels;
    this.vocabSize = vocabSize;

    // Fresh internal state.
    this.vocab = new Index();
    this.wordCounts = new Counter<>();
    this.stopWords = StopWords.getStopWords();
}