Example #1
0
  /**
   * Builds a 1 x vocab-size row vector of per-word scores for the given file.
   *
   * <p>Every line of the file is homogenized and tokenized; for each vocabulary word that
   * occurs at least once in the document, the corresponding column is set to that word's
   * score from {@code wordScores}. Columns for absent words stay 0.
   *
   * @param file the document to score
   * @return a 1 x {@code currVocab.size()} score vector
   * @throws IllegalStateException if the file cannot be read
   */
  public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    LineIterator iter = null;
    try {
      iter = FileUtils.lineIterator(file);
      while (iter.hasNext()) {
        Tokenizer t =
            tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
        while (t.hasMoreTokens()) {
          docWords.incrementCount(t.nextToken(), 1.0);
        }
      }
    } catch (IOException e) {
      throw new IllegalStateException("Unable to read file", e);
    } finally {
      // Close in a finally block so the iterator is released even when reading fails
      // (the original leaked it on IOException). closeQuietly tolerates a null iterator.
      LineIterator.closeQuietly(iter);
    }
    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());

    for (int i = 0; i < currVocab.size(); i++) {
      // Cache the string form: it was previously recomputed for the lookup and the score.
      String word = currVocab.get(i).toString();
      if (docWords.getCount(word) > 0) {
        ret.put(i, wordScores.getCount(word));
      }
    }

    return ret;
  }
 /**
  * Transforms the given text into a 1 x vocab-size row vector of word counts.
  *
  * <p>Each token present in the vocabulary sets its column to the token's count from
  * {@code wordCounts}; out-of-vocabulary tokens are ignored.
  *
  * @param text the text to transform
  * @return a 1 x {@code vocab.size()} matrix of word counts
  */
 @Override
 public DoubleMatrix transform(String text) {
   Tokenizer tokenizer = tokenizerFactory.create(text);
   List<String> tokens = tokenizer.getTokens();
   DoubleMatrix input = new DoubleMatrix(1, vocab.size());
   for (String token : tokens) {
     // Look the token up once; the original performed the indexOf lookup twice
     // (once for the local, once in the condition), an O(n) scan each time.
     int idx = vocab.indexOf(token);
     if (idx >= 0) {
       input.put(idx, wordCounts.getCount(token));
     }
   }
   return input;
 }
 /**
  * Consumes every sentence from {@code sentenceIter}, tallying token frequencies into
  * {@code wordCounts} and growing {@code vocab} with each newly seen non-stop-word token.
  */
 private void process() {
   while (sentenceIter.hasNext()) {
     String sentence = sentenceIter.nextSentence();
     for (String token : tokenizerFactory.create(sentence).getTokens()) {
       // Stop words contribute neither counts nor vocabulary entries.
       if (stopWords.contains(token)) {
         continue;
       }
       wordCounts.incrementCount(token, 1.0);
       if (!vocab.contains(token)) {
         vocab.add(token);
       }
     }
   }
 }
  /**
   * Vectorizes the passed in text treating it as one document.
   *
   * <p>Tokens found in the vocabulary set their column of the input row vector to the
   * token's count from {@code wordCounts}; out-of-vocabulary tokens are skipped. The label
   * is converted to a one-hot outcome vector over {@code labels}.
   *
   * @param text the text to vectorize
   * @param label the label of the text
   * @return a dataset with a applyTransformToDestination of weights(relative to impl; could be word
   *     counts or tfidf scores)
   */
  @Override
  public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (String token : tokens) {
      // Single indexOf lookup per token; the original scanned the vocab list twice.
      int idx = vocab.indexOf(token);
      if (idx >= 0) {
        input.put(idx, wordCounts.getCount(token));
      }
    }

    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
  }
Example #5
0
  /**
   * Accumulates term statistics for one document: per-document word frequencies, global
   * term frequency (tf), and document frequency (idf — incremented at most once per word
   * per document).
   *
   * @param doc the document file, read line by line as sentences
   */
  protected void addForDoc(File doc) {
    // Words already counted for this document, so idf is bumped once per word per doc.
    Set<String> encountered = new HashSet<String>();
    SentenceIterator iter = new LineSentenceIterator(doc);
    while (iter.hasNext()) {
      String line = iter.nextSentence();
      if (line == null) continue;
      Tokenizer tokenizer = tokenizerFactory.create(new InputHomogenization(line).transform());
      while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        // Dropped the unused punct.matcher(token) local; validWord is the sole filter.
        if (validWord(token)) {
          documentWordFrequencies.incrementCount(token, doc.getAbsolutePath(), 1.0);
          tf.incrementCount(token, 1.0);
          // Set.add returns false for duplicates, replacing the contains-then-add pair.
          if (encountered.add(token)) {
            idf.incrementCount(token, 1.0);
          }
        }
      }
    }
    // Release the iterator only after the whole document is consumed; the original called
    // finish() inside the loop, tearing the iterator down after the first sentence.
    iter.finish();
  }