/**
 * Stems each of the given words with the stemmer chosen by {@code selectStemmer(locale, fallback)}.
 *
 * <p>If no stemmer is selected, the words are returned as they are (no stemming and no length
 * filtering). Otherwise every word is stemmed and its stem is kept only when it is at least
 * {@code minimalTermChars} characters long.
 *
 * @param locale the locale passed first to {@code selectStemmer}
 * @param fallback the locale passed second to {@code selectStemmer}
 * @param words the words to stem
 * @param minimalTermChars minimum length a stem must have to be part of the result
 * @return the input words as an array when no stemmer is available, otherwise the retained stems
 *     in iteration order of {@code words}
 */
public static String[] stemWords(
    Locale locale, Locale fallback, Collection<String> words, int minimalTermChars) {
  final SnowballProgram stemmer = selectStemmer(locale, fallback);
  if (stemmer == null) {
    // Nothing to stem with: hand the words back untouched.
    return words.toArray(new String[words.size()]);
  }

  final List<String> stems = new ArrayList<>(words.size());
  for (final String word : words) {
    stemmer.setCurrent(word);
    stemmer.stem();
    final String stem = stemmer.getCurrent();
    if (stem.length() < minimalTermChars) {
      // Too short after stemming: drop it.
      continue;
    }
    stems.add(stem);
  }
  return stems.toArray(new String[stems.size()]);
}
/** * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the * stemmed value derived by applying the featurepath. * * @param jcas the JCas * @param fs the AnnotationFS where the Stem annotation is created * @throws AnalysisEngineProcessException if the {@code stem} method from the snowball stemmer * cannot be invoked. */ private void createStemAnnotation(JCas jcas, AnnotationFS fs) throws AnalysisEngineProcessException { // Check for blank text, it makes no sense to add a stem then (and raised an exception) String value = fp.getValue(fs); if (!StringUtils.isBlank(value)) { if (lowerCase) { // Fixme - should use locale/language defined in CAS. value = value.toLowerCase(Locale.US); } Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); SnowballProgram programm = getSnowballProgram(jcas); programm.setCurrent(value); try { // The patched snowball from Lucene has this as a method on SnowballProgram // but if we have some other snowball also in the classpath, Java might // choose to use the other. So to be safe, we use a reflection here. // -- REC, 2011-04-17 MethodUtils.invokeMethod(programm, "stem", null); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } stemAnnot.setValue(programm.getCurrent()); stemAnnot.addToIndexes(jcas); // Try setting the "stem" feature on Tokens. Feature feat = fs.getType().getFeatureByBaseName("stem"); if (feat != null && feat.getRange() != null && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { fs.setFeatureValue(feat, stemAnnot); } } }
/** * This method creates the histogram of the document represented by this instance; if necessary * the text is pre-processed (removing stopwords and stemming the remaining terms) before * calculating the frequency of each one of its terms. * * @return The histogram of the terms of the document, represented by a map that has keys (the * terms) of type {@link String}, and values (their relative frequencies) of type {@link * Double}. */ private Map<String, Double> createHistogram() throws Exception { DocumentClassifierApp application = DocumentClassifierApp.getApplication(); boolean isStemming = application.isStemming(false); boolean isRemovalStopWords = application.isRemovalStopWords(false); /** * Even if the user has chosen (through the Preferences panel) not to apply stemming to document * terms, it is necessary to create the classes that do it, otherwise the compiler will generate * an error. */ Class stemClass = Class.forName(application.getStemmer(false)); SnowballProgram stemmer = (SnowballProgram) stemClass.newInstance(); @SuppressWarnings("unchecked") Method stemMethod = stemClass.getMethod("stem", new Class[0]); Object[] emptyArgs = new Object[0]; String specialCharacters = " \t\n\r\f,;.:!'\"()?[]=-@"; Map<String, Double> documentHistogram = new HashMap<String, Double>(); String currentToken; Double frequency; int weight; String row; /** I build the set of stopwords, by reading the appropriate file. */ Set<String> stopWordsList = new HashSet<String>(); BufferedReader stopWordsBR; File stopWordsListFile = new File(DocumentClassifierApp.getApplication().getStopWordsList(false)); String[] fields; if (isRemovalStopWords) { /** * If the user has chosen not to enable the removal of stopwords, no element is added to the * set, which remains empty. 
*/ stopWordsBR = new BufferedReader(new FileReader(stopWordsListFile)); while ((row = stopWordsBR.readLine()) != null) { if (!row.isEmpty()) { fields = row.split("|"); if (!fields[0].startsWith(" ")) { stopWordsList.add(fields[0].trim()); } } } stopWordsBR.close(); } /** * Pre-processing of the text. The title and text of the document are represented as two * strings, belonging to an array, such that I will be able to apply the same operations to both * of them, but weighting the terms in a different way depending if they belong to the title or * the text. */ String[] titleText = new String[2]; titleText[0] = title; titleText[1] = text; for (int j = 0; j <= 1; j++) { if (j == 0) { /** * If I'm reading the title of the document-->Its terms have a double weight than the terms * of the text, because they are more directly related to the argument and context of the * document, than the terms of the text. */ weight = 2; } else { weight = 1; } titleText[j] = titleText[j].toLowerCase(); StringTokenizer ST = new StringTokenizer(titleText[j], specialCharacters); while (ST.hasMoreTokens()) { currentToken = ST.nextToken(); /** * Removal of stopwords (if enabled by the user) and of numbers (in any case). The word * 'removal' is inappropriate, because what is really done is simply to not take into * consideration a term if it is present in the stopwords list, or if it represents a * number. In these two cases the term is not added to the map which represents the * histogram of the document. */ if (!stopWordsList.contains(currentToken) && !currentToken.matches("\\d+")) { if (isStemming) { /** * Stemming of the current term: The stemmer creates a new term containing the root of * the one given in input. 
*/ stemmer.setCurrent(currentToken); stemMethod.invoke(stemmer, emptyArgs); currentToken = stemmer.getCurrent(); } /** * The frequency of the current term (eventually stemmed to its root) is read from the * document's histogram, and updated (depending on the weight assigned to the current * term). */ frequency = documentHistogram.get(currentToken); if (frequency == null) { frequency = 0.0; } documentHistogram.put(currentToken, frequency + weight); } } } /** * The histogram has been completed-->Now it is necessary to normalize its frequencies to the * length of the document, making them relative. */ double normalizationFactor = 0; for (double I : documentHistogram.values()) { // Calculation of the document's length after pre-processing normalizationFactor += I; } // Normalization of frequencies (from absolute to relative) for (String Token : documentHistogram.keySet()) { documentHistogram.put(Token, documentHistogram.get(Token) / normalizationFactor); } /** * I make sure that, once created, the histogram of this document can't be modified anymore * (accidentally or intentionally) by other classes that read it. */ return Collections.unmodifiableMap(documentHistogram); }