public String identifyClassesOfPhrases(
     String rubricPhrase, ArrayList<String> topScoringTokens, MaxentTagger posTagger)
     throws ClassNotFoundException, IOException {
   GenerateEquivalenceClasses genClass = new GenerateEquivalenceClasses();
   String outputPhrase = "";
   StringTokenizer st = new StringTokenizer(rubricPhrase);
   while (st.hasMoreTokens()) {
      // 'token' is the unigram from the sentence segment in the rubric text
      String token = st.nextToken();
      // skip tokens that were already replaced with the frequent-word placeholder pattern
      if (!token.contains("(\\\\w{0-4}\\\\s)")) {
       String temptoken = token;
        // stem the token and make the removed suffix an optional group, e.g. "sort(ed)?"
        PorterStemmer s = new PorterStemmer();
        String[] stemText = s.getStemmedTextAndSuffix(temptoken, s);
        if (!stemText[1].isEmpty()) temptoken = stemText[0] + "(" + stemText[1] + ")?";
        else temptoken = stemText[0];
       // get the class of words for this token
       ArrayList tokenClass = genClass.getClassOfWords(temptoken, topScoringTokens, s, posTagger);
       outputPhrase = outputPhrase + " " + tokenClass;
     } else {
       outputPhrase = outputPhrase + " " + token;
     }
   } // iterating over each of the tokens in the rubric phrase
   System.out.println("outphrase: " + outputPhrase);
   return outputPhrase.trim();
 }
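A minimal usage sketch for the method above, assuming the host class is named PhraseClassifier (a hypothetical name) and that this project's classes plus the Stanford POS tagger are on the classpath; the model path and sample strings are placeholders:

import java.util.ArrayList;
import java.util.Arrays;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class PhraseClassDemo {
  public static void main(String[] args) throws Exception {
    // placeholder model path; use whichever tagger model the project ships with
    MaxentTagger tagger = new MaxentTagger("path/to/english-left3words-distsim.tagger");
    ArrayList<String> topScoring = new ArrayList<>(
        Arrays.asList("the method returns a sorted list", "it outputs the values in order"));
    PhraseClassifier pc = new PhraseClassifier(); // hypothetical host class
    String pattern = pc.identifyClassesOfPhrases("returns sorted list", topScoring, tagger);
    System.out.println(pattern); // one equivalence class per rubric token
  }
}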
  public ArrayList identifyClassesOfWords(
      ArrayList<String> rubricTokens,
      ArrayList<String> topScoringTokens,
      ArrayList finalListOfTokenClasses,
      MaxentTagger posTagger)
      throws ClassNotFoundException, IOException {
    GenerateEquivalenceClasses genClass = new GenerateEquivalenceClasses();
    ArrayList<ArrayList> tokenClasses = new ArrayList<ArrayList>();
    for (int i = 0; i < rubricTokens.size(); i++) {
       // 'token' is the unigram from the sentence segment in the rubric text
      String token = rubricTokens.get(i);
      String temptoken = token;

       // stem the token and make the removed suffix an optional group, e.g. "sort(ed)?"
       PorterStemmer s = new PorterStemmer();
       String[] stemText = s.getStemmedTextAndSuffix(temptoken, s);
       if (!stemText[1].isEmpty()) temptoken = stemText[0] + "(" + stemText[1] + ")?";
       else temptoken = stemText[0];
      // get the class of words for this token
      ArrayList classOfWords = genClass.getClassOfWords(temptoken, topScoringTokens, s, posTagger);
      // sorting the arraylist before adding it to make it easy to compare unordered lists
      Collections.sort(classOfWords);
      if (!tokenClasses.contains(classOfWords)) {
        System.out.println("Adding: " + classOfWords);
        tokenClasses.add(classOfWords);
      }
    }

    String concatListOfTokens = "";
    // select the top token-classes
    // restrict them to 5-grams
    int grams = 0;
    for (ArrayList tokClass : tokenClasses) {
      if (!finalListOfTokenClasses.contains(tokClass)) {
         if (tokClass.size() > 1) { // add multi-word token classes individually
          finalListOfTokenClasses.add(tokClass);
        }
        if (grams < 5) { // building the longer array of token classes, more specific regex
          concatListOfTokens =
              concatListOfTokens + " @@ " + tokClass; // add the tokens to the final list
          grams++;
         } else if (grams == 5) { // flush the accumulated 5-gram pattern
           if (!finalListOfTokenClasses.contains(concatListOfTokens.trim())) {
             System.out.println("concatListOfTokens: " + concatListOfTokens);
             finalListOfTokenClasses.add(concatListOfTokens.trim());
           }
           // reset so the remaining token classes in this rubric segment can be
           // concatenated into the next group (the class that triggered the flush
           // is not itself concatenated)
           grams = 0;
           concatListOfTokens = "";
         }
      }
    }
    return finalListOfTokenClasses;
  }
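To make the 5-gram concatenation above concrete, here is a tiny self-contained sketch of just the joining step; the sample token classes are invented:

import java.util.Arrays;
import java.util.List;

public class ConcatDemo {
  public static void main(String[] args) {
    List<List<String>> tokenClasses = Arrays.asList(
        Arrays.asList("sort(ed)?", "order(ed)?"),
        Arrays.asList("list", "array"));
    String concat = "";
    for (List<String> tokClass : tokenClasses) {
      concat = concat + " @@ " + tokClass; // same " @@ " separator as above
    }
    System.out.println(concat.trim());
    // prints: @@ [sort(ed)?, order(ed)?] @@ [list, array]
  }
}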
 public ArrayList getClassOfWords(
     String token, ArrayList<String> topscorers, PorterStemmer s, MaxentTagger posTagger)
     throws ClassNotFoundException, IOException {
   // compare token with words in the top scoring responses
   StringTokenizer sttop;
   ArrayList tokenClass = new ArrayList<String>(); // initializing the class
   // adding root token
   tokenClass.add(token);
   WordnetBasedSimilarity wn = new WordnetBasedSimilarity();
   String[] tokenSyns = null;
    // TODO: look up the WordNet values for the rubric token once, so they are not
    // recomputed for every topscorer token
   for (int i = 0; i < topscorers.size(); i++) {
     if (topscorers.get(i) != null) {
       sttop = new StringTokenizer(topscorers.get(i));
       // compare the rubric token with each top score token
       while (sttop.hasMoreTokens()) {
          // 'toptoken' is the token from the top-scoring response
          String toptoken = sttop.nextToken();
         // compare rubric token and the top scoring tokens.
         WordNetMatch wordnetMatch = wn.compareStrings(token, toptoken, posTagger);
         double match = wordnetMatch.matchValue;
         tokenSyns = wordnetMatch.synonyms;
          // if the match is above the threshold; EXACT matches need not be added,
          // since the root token already covers them
          if (match >= threshold && match != WordnetBasedSimilarity.EXACT) {
            // stem toptoken and make its suffix optional before adding it to the class
            String[] stemText = s.getStemmedTextAndSuffix(toptoken, s);
            if (!stemText[1].isEmpty()) toptoken = stemText[0] + "(" + stemText[1] + ")?";
            else toptoken = stemText[0];
           if (!tokenClass.contains(toptoken)) tokenClass.add(toptoken.trim());
         }
       }
     }
   }
   System.out.println("Synonyms of token: " + token);
   if (tokenSyns != null) {
     for (int i = 0; i < tokenSyns.length; i++) {
       System.out.println(tokenSyns[i]);
        // stem the synonym and make its suffix optional before adding it to the list
        String[] stemText = s.getStemmedTextAndSuffix(tokenSyns[i], s);
        String toptoken;
        if (!stemText[1].isEmpty()) toptoken = stemText[0] + "(" + stemText[1] + ")?";
        else toptoken = stemText[0];
       if (!tokenClass.contains(toptoken)) tokenClass.add(toptoken);
     }
   }
   System.out.println("Eq. classes for token:" + token);
   System.out.println(tokenClass.toString());
   return tokenClass;
 }
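The stem + "(suffix)?" construction used throughout these methods yields a small regular expression that matches both the stem and the original word; a standalone illustration with invented words:

import java.util.regex.Pattern;

public class SuffixRegexDemo {
  public static void main(String[] args) {
    // a stem "sort" with removed suffix "ed" becomes the pattern sort(ed)?
    String pattern = "sort" + "(" + "ed" + ")?";
    System.out.println(Pattern.matches(pattern, "sort"));   // true
    System.out.println(Pattern.matches(pattern, "sorted")); // true
    System.out.println(Pattern.matches(pattern, "sorts"));  // false
  }
}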
Example #4
  /**
   * Test program for demonstrating the Stemmer. It stems each word supplied on the command line
   * and writes the result to standard out. Usage: PorterStemmer word [word ...]
   *
   * @param args command-line args
   */
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();

    for (int i = 0; i < args.length; i++) {
      for (int j = 0; j < args[i].length(); j++) {
        s.add(args[i].charAt(j));
      }
      s.stem();
      System.out.println(s.toString());
      s.reset();
    }
  }
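For example, running the demo as java PorterStemmer caresses ponies prints caress and poni, the stems Porter's algorithm produces for those words.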
  @SuppressWarnings({"unchecked", "rawtypes"})
  public List<String> getBestMatchingScore(String indexfile, String queryfile, String num)
      throws IOException, ClassNotFoundException {

    int count = Integer.parseInt(num);

    FileInputStream fileIn = new FileInputStream(IntegerConstants.cur_dir + indexfile);
    ObjectInputStream in = new ObjectInputStream(fileIn);
    inv_index = (TreeMap) in.readObject();
    System.out.println(inv_index);

    tokenCount = (TreeMap) in.readObject();

    System.out.println(tokenCount);
    // Total number of words in all the documents
    int totalTokenCount = 0;

    // Calculate the total number of tokens in the collection
    for (Iterator i = tokenCount.entrySet().iterator(); i.hasNext(); ) {
      Map.Entry next = (Map.Entry) i.next();
      totalTokenCount = totalTokenCount + (Integer) next.getValue();
    }
    // average document length
    Double avdl = totalTokenCount * 1.0 / tokenCount.size();

    // Reading a file cur_dir + "\\" +
    File qf = new File(IntegerConstants.cur_dir + queryfile);
    BufferedReader bf = new BufferedReader(new FileReader(qf));

    String querytext;

    int queryid = 1;
    while ((querytext = bf.readLine()) != null) {

      String[] querywords = querytext.split(" ");

      // Step1: Retrieve all inverted lists corresponding to terms in a query.
      for (String word1 : querywords) {

        PorterStemmer ps = new PorterStemmer();
        String word = ps.stem(word1);

        word = word.trim();
        if (!word.equals("") && inv_index.containsKey(word)) {
          query_index.put(word, inv_index.get(word));
        }
      }

      // Step2: Compute BM25 scores for documents in the lists.
      for (Iterator iterator1 = query_index.entrySet().iterator(); iterator1.hasNext(); ) {
        // next contains list of files for the query word and their occurrences in each file
        Map.Entry next = (Map.Entry) iterator1.next();
        TreeMap indexes = (TreeMap) next.getValue();
        for (Iterator iterator2 = indexes.entrySet().iterator(); iterator2.hasNext(); ) {
          Map.Entry next2 = (Map.Entry) iterator2.next();
          // fi: frequency of the query term in this document
          int fi = (Integer) next2.getValue();
          // N: total number of documents in the collection
          int N = tokenCount.size();
          // ni: number of documents the query term occurs in
          int ni = indexes.size();
          Double qfi = 0.0;

          // qfi: frequency of the current term in the query; stem each query word the
          // same way the index keys were built before comparing
          String term = (String) next.getKey();
          for (int i = 0; i < querywords.length; i++) {
            PorterStemmer qps = new PorterStemmer();
            if (qps.stem(querywords[i]).trim().equals(term)) {
              qfi++;
            }
          }

          // Computing K (document-length normalization)
          Double K =
              IntegerConstants.k1
                  * ((1 - IntegerConstants.b)
                      + IntegerConstants.b * ((Integer) tokenCount.get(next2.getKey()) / avdl));
          Double first_term =
              (Math.log(
                  ((IntegerConstants.ri + 0.5) / (IntegerConstants.R - IntegerConstants.ri + 0.5))
                      / ((ni - IntegerConstants.ri + 0.5)
                          / (N - ni - IntegerConstants.R + IntegerConstants.ri + 0.5))));
          Double second_term = ((IntegerConstants.k1 + 1) * fi / (K + fi));
          Double third_term = ((IntegerConstants.k2 + 1) * qfi / (IntegerConstants.k2 + qfi));
          Double total = first_term * second_term * third_term;

          if (documentScore.containsKey((String) next2.getKey())) {
            Double valueToPut = total + documentScore.get((String) next2.getKey());
            documentScore.put((String) next2.getKey(), valueToPut);
          } else {
            documentScore.put((String) next2.getKey(), total);
          }
        }
      }
      // sort the document scores in descending order
      DescendingOrder comp = new DescendingOrder((TreeMap) documentScore);
      TreeMap<String, Double> rankedScores = new TreeMap<String, Double>(comp);
      rankedScores.putAll(documentScore);

      int rank = 1;
      // emit at most 'count' results (the third parameter of this method)
      for (Iterator itr = rankedScores.entrySet().iterator(); itr.hasNext() && rank <= count; ) {
        Map.Entry nx = (Map.Entry) itr.next();
        // add the file name to the result list, preserving rank order
        queryResults.add(nx.getKey().toString());
        rank++;
      }
      queryid++;
      documentScore.clear();
      query_index.clear();
    }

    in.close();
    fileIn.close();
    bf.close();
    return queryResults;
  }
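As a cross-check of the scoring loop above, here is a minimal self-contained BM25 computation for a single term and document; the constants mirror common defaults (k1 = 1.2, k2 = 100, b = 0.75, and R = ri = 0 with no relevance feedback) and are assumptions, not this project's IntegerConstants:

public class Bm25Demo {
  public static void main(String[] args) {
    double k1 = 1.2, k2 = 100, b = 0.75; // common BM25 defaults (assumption)
    double R = 0, ri = 0;                // no relevance information available
    int N = 1000;                        // documents in the collection
    int ni = 50;                         // documents containing the term
    int fi = 3;                          // term frequency in this document
    int qfi = 1;                         // term frequency in the query
    double dl = 120, avdl = 100;         // this document's length and the average
    double K = k1 * ((1 - b) + b * (dl / avdl));
    double idf = Math.log(((ri + 0.5) / (R - ri + 0.5))
        / ((ni - ri + 0.5) / (N - ni - R + ri + 0.5)));
    double score = idf * ((k1 + 1) * fi / (K + fi)) * ((k2 + 1) * qfi / (k2 + qfi));
    System.out.println("BM25 score: " + score); // ~4.42 for these values
  }
}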