Exemplo n.º 1
0
 /**
  * Creates a vocab creator that trains on the contents of a root directory.
  *
  * <p>The original documentation describes the resulting vocab as tf-idf based; no weighting is
  * computed in this constructor, which only initialises state. Stop words are not supplied by the
  * caller: they are obtained from {@code StopWords.getStopWords()}. An actor system named
  * {@code "WordFrequencySystem"} is also created here.
  *
  * <p>NOTE(review): nothing in this constructor shuts the actor system down; confirm it is
  * terminated elsewhere.
  *
  * @param rootDir the root directory to train on
  * @param tokenizerFactory the tokenizer factory to use
  */
 public VocabCreator(File rootDir, TokenizerFactory tokenizerFactory) {
   // The superclass no-arg constructor is invoked implicitly.

   // Caller-supplied configuration.
   this.rootDir = rootDir;
   this.tokenizerFactory = tokenizerFactory;

   // Internally created collaborators.
   this.stopWords = StopWords.getStopWords();
   this.system = ActorSystem.create("WordFrequencySystem");
 }
 /**
  * Creates a vectorizer that converts documents into a bag of words.
  *
  * <p>Besides storing the arguments, this constructor starts with a fresh {@code Index} as the
  * vocab and a fresh {@code Counter} for word counts, and takes its stop words from
  * {@code StopWords.getStopWords()}.
  *
  * @param sentenceIterator the sentence iterator to use; it handles segmenting the document into
  *     whole segments
  * @param tokenizerFactory the tokenizer to use
  * @param labels the possible labels for each document
  * @param vocabSize the max size of vocab
  */
 public BagOfWordsVectorizer(
     LabelAwareSentenceIterator sentenceIterator,
     TokenizerFactory tokenizerFactory,
     List<String> labels,
     int vocabSize) {
   // Caller-supplied configuration.
   this.sentenceIter = sentenceIterator;
   this.tokenizerFactory = tokenizerFactory;
   this.labels = labels;
   this.vocabSize = vocabSize;

   // Internally created state.
   this.vocab = new Index();
   this.wordCounts = new Counter<>();
   this.stopWords = StopWords.getStopWords();
 }