Java SnowballProgram.getCurrent примеры использования

Язык программирования: Java

Пространство имен/Пакет: org.tartarus.snowball

Класс/Тип: SnowballProgram

Метод/Функция: getCurrent

Примеров на hotexamples.com: 3

Java SnowballProgram.getCurrent — найдено 3 примера. Это лучшие примеры кода на Java для org.tartarus.snowball.SnowballProgram.getCurrent, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

setCurrent(4)

getCurrent(3)

copy_from(2)

slice_check(2)

slice_from(2)

insert(1)

replace_s(1)

stem(1)

Пример #1

Показать файл

Файл: FTSUtils.java Проект: Softmotions/ncms

  /**
   * Stems each of the given words with the stemmer returned by {@code selectStemmer} for the
   * supplied locales.
   *
   * <p>If no stemmer is available the words are returned as they are, without any length
   * filtering. Otherwise only stems whose length is at least {@code minimalTermChars} are kept,
   * in the iteration order of {@code words}.
   *
   * @param locale preferred locale, passed to {@code selectStemmer}
   * @param fallback fallback locale, passed to {@code selectStemmer}
   * @param words the words to stem
   * @param minimalTermChars minimal length a stem must have to be included in the result
   * @return the retained stems, or the original words when no stemmer was selected
   */
  public static String[] stemWords(
      Locale locale, Locale fallback, Collection<String> words, int minimalTermChars) {

    final SnowballProgram stemmer = selectStemmer(locale, fallback);
    if (stemmer == null) {
      // No stemmer for these locales: hand the input back untouched.
      return words.toArray(new String[words.size()]);
    }

    final List<String> stems = new ArrayList<>(words.size());
    for (final String word : words) {
      stemmer.setCurrent(word);
      stemmer.stem();
      final String stem = stemmer.getCurrent();
      if (stem.length() < minimalTermChars) {
        // Too short to be a useful term.
        continue;
      }
      stems.add(stem);
    }
    return stems.toArray(new String[0]);
  }

Пример #2

Показать файл

Файл: SnowballStemmer.java Проект: renaud/dkpro-core

  /**
   * Adds a Stem annotation covering the same span as {@code fs}. Its value is the stemmed form of
   * the text obtained from {@code fs} through the configured feature path.
   *
   * <p>Blank values are skipped entirely. If the type of {@code fs} has a feature with base name
   * {@code "stem"} whose range subsumes the Stem type, that feature is set to the new annotation.
   *
   * @param jcas the JCas the annotation is created in and indexed into
   * @param fs the AnnotationFS whose begin/end offsets and feature-path value are used
   * @throws AnalysisEngineProcessException if the {@code stem} method of the snowball stemmer
   *     cannot be invoked.
   */
  private void createStemAnnotation(JCas jcas, AnnotationFS fs)
      throws AnalysisEngineProcessException {
    String text = fp.getValue(fs);
    if (StringUtils.isBlank(text)) {
      // Stemming blank text is pointless (and used to raise an exception), so do nothing.
      return;
    }

    if (lowerCase) {
      // FIXME: should use the locale/language defined in the CAS.
      text = text.toLowerCase(Locale.US);
    }

    Stem stem = new Stem(jcas, fs.getBegin(), fs.getEnd());
    SnowballProgram stemmer = getSnowballProgram(jcas);
    stemmer.setCurrent(text);

    try {
      // The patched snowball shipped with Lucene declares stem() on SnowballProgram, but
      // another snowball on the classpath might be picked up instead. Invoking the method
      // reflectively works with either one. -- REC, 2011-04-17
      MethodUtils.invokeMethod(stemmer, "stem", null);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }

    stem.setValue(stemmer.getCurrent());
    stem.addToIndexes(jcas);

    // Try setting the "stem" feature on the source annotation (e.g. Tokens).
    Feature stemFeature = fs.getType().getFeatureByBaseName("stem");
    if (stemFeature == null || stemFeature.getRange() == null) {
      return;
    }
    if (jcas.getTypeSystem().subsumes(stemFeature.getRange(), stem.getType())) {
      fs.setFeatureValue(stemFeature, stem);
    }
  }

Пример #3

Показать файл

Файл: Document.java Проект: danicoool/documentclassifier

  /**
   * Builds the histogram of the document represented by this instance. If enabled in the
   * application preferences, stop words are dropped and the remaining terms are stemmed before
   * the frequency of each term is computed. Purely numeric tokens are always dropped.
   *
   * <p>Terms found in the title are counted with weight 2, terms found in the text with weight 1.
   * The weighted counts are then divided by their total so that the values sum to 1.
   *
   * @return The histogram of the terms of the document, represented by an unmodifiable map that
   *     has keys (the terms) of type {@link String}, and values (their relative frequencies) of
   *     type {@link Double}.
   * @throws Exception if the stemmer class cannot be loaded or instantiated, the stop-word file
   *     cannot be read, or the reflective call to the stemmer fails.
   */
  private Map<String, Double> createHistogram() throws Exception {

    DocumentClassifierApp application = DocumentClassifierApp.getApplication();
    boolean isStemming = application.isStemming(false);
    boolean isRemovalStopWords = application.isRemovalStopWords(false);

    // The stemmer and its "stem" method are always resolved, even when the user has disabled
    // stemming in the Preferences panel, so that both variables are definitely assigned before
    // the token loop below.
    Class<?> stemClass = Class.forName(application.getStemmer(false));
    SnowballProgram stemmer = (SnowballProgram) stemClass.getDeclaredConstructor().newInstance();
    Method stemMethod = stemClass.getMethod("stem");

    Object[] emptyArgs = new Object[0];
    String specialCharacters = " \t\n\r\f,;.:!'\"()?[]=-@";

    Map<String, Double> documentHistogram = new HashMap<String, Double>();

    // Build the set of stop words by reading the configured file. If the user has not enabled
    // stop-word removal the set simply stays empty.
    Set<String> stopWordsList = new HashSet<String>();
    File stopWordsListFile =
        new File(DocumentClassifierApp.getApplication().getStopWordsList(false));
    if (isRemovalStopWords) {
      // try-with-resources closes the reader even when readLine() throws.
      try (BufferedReader stopWordsReader =
          new BufferedReader(new FileReader(stopWordsListFile))) {
        String row;
        while ((row = stopWordsReader.readLine()) != null) {
          // The stop word is whatever precedes the first '|'. The previous row.split("|")
          // treated '|' as a regular-expression alternation and split the row into single
          // characters, so only the first letter of each word was ever stored.
          int separator = row.indexOf('|');
          String word = separator >= 0 ? row.substring(0, separator) : row;
          // Rows whose first field starts with a blank carry no stop word and are skipped.
          if (word.startsWith(" ")) {
            continue;
          }
          word = word.trim();
          if (!word.isEmpty()) {
            stopWordsList.add(word);
          }
        }
      }
    }

    // Pre-processing of the text. Title and text are kept in one array so the same operations
    // can be applied to both, differing only in the weight given to their terms.
    String[] titleText = new String[2];
    titleText[0] = title;
    titleText[1] = text;
    for (int j = 0; j <= 1; j++) {
      // Title terms (j == 0) weigh twice as much as body terms because they are more directly
      // related to the subject and context of the document.
      int weight = (j == 0) ? 2 : 1;

      titleText[j] = titleText[j].toLowerCase();
      StringTokenizer tokenizer = new StringTokenizer(titleText[j], specialCharacters);
      while (tokenizer.hasMoreTokens()) {
        String currentToken = tokenizer.nextToken();

        // "Removal" of stop words (if enabled) and of numbers (always) just means the term is
        // not counted: it is never added to the histogram map.
        if (stopWordsList.contains(currentToken) || currentToken.matches("\\d+")) {
          continue;
        }

        if (isStemming) {
          // Replace the term with the root produced by the stemmer.
          stemmer.setCurrent(currentToken);
          stemMethod.invoke(stemmer, emptyArgs);
          currentToken = stemmer.getCurrent();
        }

        // Add the weight of this occurrence to the (possibly stemmed) term's running count.
        Double frequency = documentHistogram.get(currentToken);
        if (frequency == null) {
          frequency = 0.0;
        }
        documentHistogram.put(currentToken, frequency + weight);
      }
    }

    // The histogram is complete: compute the document's length after pre-processing (the sum
    // of all weighted counts) ...
    double normalizationFactor = 0;
    for (double count : documentHistogram.values()) {
      normalizationFactor += count;
    }
    // ... and turn absolute frequencies into relative ones. An empty histogram skips this loop,
    // so no division by zero can occur.
    for (Map.Entry<String, Double> entry : documentHistogram.entrySet()) {
      entry.setValue(entry.getValue() / normalizationFactor);
    }

    // Once created, the histogram must not be modified (accidentally or intentionally) by other
    // classes that read it.
    return Collections.unmodifiableMap(documentHistogram);
  }