Пример #1
0
  /**
   * Stems every word of a collection with the stemmer that {@code selectStemmer} picks for the
   * given locales.
   *
   * <p>If no stemmer is available ({@code selectStemmer} returns {@code null}) the words are
   * returned as they are, without any length filtering. Otherwise the words are stemmed in
   * iteration order and only stems of at least {@code minimalTermChars} characters are kept.
   *
   * @param locale the locale handed to {@code selectStemmer}
   * @param fallback the fallback locale handed to {@code selectStemmer}
   * @param words the words to stem
   * @param minimalTermChars the minimal length a stem must have to be part of the result
   * @return the original words when no stemmer was selected, otherwise the retained stems
   */
  public static String[] stemWords(
      Locale locale, Locale fallback, Collection<String> words, int minimalTermChars) {
    final SnowballProgram stemmer = selectStemmer(locale, fallback);
    if (stemmer == null) {
      // No stemmer available: hand the input back untouched.
      return words.toArray(new String[words.size()]);
    }

    final List<String> stems = new ArrayList<>(words.size());
    for (String word : words) {
      stemmer.setCurrent(word);
      stemmer.stem();
      final String stem = stemmer.getCurrent();
      if (stem.length() < minimalTermChars) {
        // Too short after stemming, drop it.
        continue;
      }
      stems.add(stem);
    }
    return stems.toArray(new String[0]);
  }
Пример #2
0
  /**
   * Adds a {@code Stem} annotation that covers the same span as {@code fs}. Its value is the
   * stem of the text obtained by applying the feature path to {@code fs}. Nothing is created when
   * that text is blank.
   *
   * <p>If the type of {@code fs} has a feature with base name {@code stem} whose range subsumes
   * the type of the new annotation, that feature is set to the new annotation as well.
   *
   * @param jcas the JCas the annotation is added to
   * @param fs the AnnotationFS for which the Stem annotation is created
   * @throws AnalysisEngineProcessException if the {@code stem} method of the snowball stemmer
   *     cannot be invoked
   */
  private void createStemAnnotation(JCas jcas, AnnotationFS fs)
      throws AnalysisEngineProcessException {
    String text = fp.getValue(fs);
    if (StringUtils.isBlank(text)) {
      // Stemming blank text makes no sense (and used to raise an exception).
      return;
    }
    if (lowerCase) {
      // Fixme - should use locale/language defined in CAS.
      text = text.toLowerCase(Locale.US);
    }

    Stem stem = new Stem(jcas, fs.getBegin(), fs.getEnd());
    SnowballProgram stemmer = getSnowballProgram(jcas);
    stemmer.setCurrent(text);

    try {
      // Invoked reflectively on purpose: the patched snowball shipped with Lucene declares
      // stem() on SnowballProgram, but a different snowball on the classpath may be picked up
      // instead. -- REC, 2011-04-17
      MethodUtils.invokeMethod(stemmer, "stem", null);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }

    stem.setValue(stemmer.getCurrent());
    stem.addToIndexes(jcas);

    // Also link the stem from the source annotation when its type offers a compatible
    // "stem" feature (as Tokens do).
    Feature stemFeature = fs.getType().getFeatureByBaseName("stem");
    boolean canHoldStem =
        stemFeature != null
            && stemFeature.getRange() != null
            && jcas.getTypeSystem().subsumes(stemFeature.getRange(), stem.getType());
    if (canHoldStem) {
      fs.setFeatureValue(stemFeature, stem);
    }
  }
Пример #3
0
  /**
   * Creates the histogram of the document represented by this instance. The title and the text
   * are lower-cased and tokenized; purely numeric tokens are always skipped, stopwords are skipped
   * when stopword removal is enabled, and the remaining terms are stemmed when stemming is
   * enabled. Terms found in the title count twice as much as terms found in the text. Finally the
   * weighted counts are divided by their total so that they become relative frequencies.
   *
   * @return an unmodifiable map from each term to its relative frequency
   * @throws Exception if the configured stemmer class cannot be loaded or instantiated, if its
   *     {@code stem} method cannot be found or invoked, or if the stopwords file cannot be read
   */
  private Map<String, Double> createHistogram() throws Exception {
    DocumentClassifierApp application = DocumentClassifierApp.getApplication();
    boolean isStemming = application.isStemming(false);
    boolean isRemovalStopWords = application.isRemovalStopWords(false);

    // The stemmer is created even when stemming is disabled: the locals below must be definitely
    // assigned for the code to compile, and a wrong stemmer class name is reported either way.
    Class<?> stemClass = Class.forName(application.getStemmer(false));
    SnowballProgram stemmer = (SnowballProgram) stemClass.getDeclaredConstructor().newInstance();
    Method stemMethod = stemClass.getMethod("stem");

    String specialCharacters = " \t\n\r\f,;.:!'\"()?[]=-@";

    // When stopword removal is disabled the set simply stays empty, and the stopwords file is
    // not touched at all.
    Set<String> stopWords = new HashSet<String>();
    if (isRemovalStopWords) {
      File stopWordsFile =
          new File(DocumentClassifierApp.getApplication().getStopWordsList(false));
      stopWords = loadStopWords(stopWordsFile);
    }

    // Title and text go through the same pipeline; only the weight of their terms differs.
    Map<String, Double> histogram = new HashMap<String, Double>();
    String[] titleText = {title, text};
    for (int j = 0; j < titleText.length; j++) {
      // Title terms (index 0) weigh double because they relate more directly to the topic.
      int weight = (j == 0) ? 2 : 1;
      StringTokenizer tokenizer =
          new StringTokenizer(titleText[j].toLowerCase(), specialCharacters);
      while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        // "Removal" just means the term never enters the histogram.
        if (stopWords.contains(token) || token.matches("\\d+")) {
          continue;
        }
        if (isStemming) {
          // Replace the term with the root computed by the stemmer.
          stemmer.setCurrent(token);
          stemMethod.invoke(stemmer);
          token = stemmer.getCurrent();
        }
        Double frequency = histogram.get(token);
        if (frequency == null) {
          frequency = 0.0;
        }
        histogram.put(token, frequency + weight);
      }
    }

    // Length of the document after pre-processing: the sum of all weighted counts.
    double normalizationFactor = 0;
    for (double count : histogram.values()) {
      normalizationFactor += count;
    }
    // Turn absolute counts into relative frequencies in place.
    for (Map.Entry<String, Double> entry : histogram.entrySet()) {
      entry.setValue(entry.getValue() / normalizationFactor);
    }

    // Readers of the histogram must not be able to change it.
    return Collections.unmodifiableMap(histogram);
  }

  /**
   * Reads the stopwords from a file that holds one entry per row, where anything from the first
   * {@code |} character on is ignored. Rows whose word part starts with a blank, and rows whose
   * word part is empty after trimming, contribute nothing.
   *
   * @param stopWordsFile the file to read
   * @return the set of stopwords found in the file
   * @throws Exception if the file cannot be opened or read
   */
  private Set<String> loadStopWords(File stopWordsFile) throws Exception {
    Set<String> stopWords = new HashSet<String>();
    // try-with-resources closes the reader even when reading fails half-way.
    try (BufferedReader reader = new BufferedReader(new FileReader(stopWordsFile))) {
      String row;
      while ((row = reader.readLine()) != null) {
        // Cut at the first literal '|'. String.split("|") must not be used here: "|" is a
        // regular expression (an alternation of two empty patterns) and splits the row into
        // single characters.
        int separator = row.indexOf('|');
        String word = (separator < 0) ? row : row.substring(0, separator);
        if (word.startsWith(" ")) {
          continue;
        }
        word = word.trim();
        if (!word.isEmpty()) {
          stopWords.add(word);
        }
      }
    }
    return stopWords;
  }