public String identifyClassesOfPhrases(
    String rubricPhrase, ArrayList<String> topScoringTokens, MaxentTagger posTagger)
    throws ClassNotFoundException, IOException {
  GenerateEquivalenceClasses genClass = new GenerateEquivalenceClasses();
  WordnetBasedSimilarity wn = new WordnetBasedSimilarity();
  String outputPhrase = "";
  StringTokenizer st = new StringTokenizer(rubricPhrase);
  while (st.hasMoreTokens()) {
    // 'token' is the unigram from the sentence segment in the rubric text
    String token = st.nextToken();
    if (!token.contains("(\\\\w{0-4}\\\\s)")) { // if the token is not a frequent word
      String temptoken = token;
      // replace the common suffix with (suffix)? so that it becomes optional in the regex
      PorterStemmer s = new PorterStemmer();
      String[] stemText = s.getStemmedTextAndSuffix(temptoken, s);
      if (!stemText[1].isEmpty()) temptoken = stemText[0] + "(" + stemText[1] + ")?";
      else temptoken = stemText[0];
      // get the class of words for this token
      ArrayList tokenClass = genClass.getClassOfWords(temptoken, topScoringTokens, s, posTagger);
      outputPhrase = outputPhrase + " " + tokenClass;
    } else {
      outputPhrase = outputPhrase + " " + token;
    }
  } // iterating over each of the tokens in the rubric phrase
  System.out.println("outphrase: " + outputPhrase);
  return outputPhrase.trim();
}
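// For context, a minimal sketch of how identifyClassesOfPhrases might be driven. The enclosing
// class (GenerateEquivalenceClasses is assumed here, since the method instantiates it for
// getClassOfWords), the tagger model path, and the sample inputs are illustrative assumptions,
// not taken from this code.
import java.util.ArrayList;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class PhraseClassDemo {
  public static void main(String[] args) throws Exception {
    // any Stanford POS tagger model file would do here; this path is an assumption
    MaxentTagger posTagger = new MaxentTagger("models/english-left3words-distsim.tagger");
    // hypothetical top-scoring student responses used to expand each rubric token
    ArrayList<String> topScoringTokens = new ArrayList<String>();
    topScoringTokens.add("the loop repeats the body until the condition fails");
    GenerateEquivalenceClasses gen = new GenerateEquivalenceClasses();
    String pattern = gen.identifyClassesOfPhrases("loop terminates", topScoringTokens, posTagger);
    System.out.println(pattern); // prints one bracketed token class per rubric token
  }
}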
public ArrayList identifyClassesOfWords(
    ArrayList<String> rubricTokens,
    ArrayList<String> topScoringTokens,
    ArrayList finalListOfTokenClasses,
    MaxentTagger posTagger)
    throws ClassNotFoundException, IOException {
  GenerateEquivalenceClasses genClass = new GenerateEquivalenceClasses();
  ArrayList<ArrayList> tokenClasses = new ArrayList<ArrayList>();
  WordnetBasedSimilarity wn = new WordnetBasedSimilarity();
  for (int i = 0; i < rubricTokens.size(); i++) {
    // 'token' is the unigram from the sentence segment in the rubric text
    String token = rubricTokens.get(i);
    String temptoken = token;
    // replace the common suffix with (suffix)? so that it becomes optional in the regex
    PorterStemmer s = new PorterStemmer();
    String[] stemText = s.getStemmedTextAndSuffix(temptoken, s);
    if (!stemText[1].isEmpty()) temptoken = stemText[0] + "(" + stemText[1] + ")?";
    else temptoken = stemText[0];
    // get the class of words for this token
    ArrayList classOfWords = genClass.getClassOfWords(temptoken, topScoringTokens, s, posTagger);
    // sort the list before adding it so that unordered lists are easy to compare
    Collections.sort(classOfWords);
    if (!tokenClasses.contains(classOfWords)) {
      System.out.println("Adding: " + classOfWords);
      tokenClasses.add(classOfWords);
    }
  }
  String concatListOfTokens = "";
  // select the top token classes and restrict the concatenated pattern to 5-grams
  int grams = 0;
  for (ArrayList tokClass : tokenClasses) {
    if (!finalListOfTokenClasses.contains(tokClass)) {
      if (tokClass.size() > 1) {
        // add each token class individually
        finalListOfTokenClasses.add(tokClass);
      }
      if (grams < 5) {
        // build the longer array of token classes, which forms a more specific regex
        concatListOfTokens = concatListOfTokens + " @@ " + tokClass;
        grams++;
      } else if (grams == 5) {
        if (!finalListOfTokenClasses.contains(concatListOfTokens.trim()) && grams > 1) {
          System.out.println("concatListOfTokens: " + concatListOfTokens);
          finalListOfTokenClasses.add(concatListOfTokens.trim());
        }
        // reset grams so that the remaining token classes in this rubric segment
        // can be concatenated together
        grams = 0;
        concatListOfTokens = "";
      }
    }
  }
  return finalListOfTokenClasses;
}
public ArrayList getClassOfWords(
    String token, ArrayList<String> topscorers, PorterStemmer s, MaxentTagger posTagger)
    throws ClassNotFoundException, IOException {
  // compare the token with words in the top-scoring responses
  StringTokenizer sttop;
  ArrayList tokenClass = new ArrayList<String>(); // initializing the class
  // add the root token
  tokenClass.add(token);
  WordnetBasedSimilarity wn = new WordnetBasedSimilarity();
  String[] tokenSyns = null;
  // TODO: call the WordNet code once for the rubric token's values, so that it does not have to
  // be recomputed for every topscorer token
  for (int i = 0; i < topscorers.size(); i++) {
    if (topscorers.get(i) != null) {
      sttop = new StringTokenizer(topscorers.get(i));
      // compare the rubric token with each top-scoring token
      while (sttop.hasMoreTokens()) {
        // 'toptoken' is the token from the top-scoring response
        String toptoken = sttop.nextToken();
        // compare the rubric token and the top-scoring token
        WordNetMatch wordnetMatch = wn.compareStrings(token, toptoken, posTagger);
        double match = wordnetMatch.matchValue;
        tokenSyns = wordnetMatch.synonyms;
        // if the tokens' match is above the threshold, add the matched word to the class;
        // EXACT matches do not need to be added
        if (match >= threshold && match != WordnetBasedSimilarity.EXACT) {
          // stem toptoken before it is added to the list
          String[] stemText = s.getStemmedTextAndSuffix(toptoken, s);
          if (!stemText[1].isEmpty()) toptoken = stemText[0] + "(" + stemText[1] + ")?";
          else toptoken = stemText[0];
          if (!tokenClass.contains(toptoken)) tokenClass.add(toptoken.trim());
        }
      }
    }
  }
  System.out.println("Synonyms of token: " + token);
  if (tokenSyns != null) {
    for (int i = 0; i < tokenSyns.length; i++) {
      System.out.println(tokenSyns[i]);
      // stem the synonym before it is added to the list
      String[] stemText = s.getStemmedTextAndSuffix(tokenSyns[i], s);
      String toptoken;
      if (!stemText[1].isEmpty()) toptoken = stemText[0] + "(" + stemText[1] + ")?";
      else toptoken = stemText[0];
      if (!tokenClass.contains(toptoken)) tokenClass.add(toptoken);
    }
  }
  System.out.println("Eq. classes for token: " + token);
  System.out.println(tokenClass.toString());
  return tokenClass;
}
/**
 * Test program for demonstrating the Stemmer. It stems each command-line argument and writes the
 * result to standard out. Usage: Stemmer word ...
 *
 * @param args command-line args
 */
public static void main(String[] args) {
  PorterStemmer s = new PorterStemmer();
  for (int i = 0; i < args.length; i++) {
    // feed the word to the stemmer one character at a time
    for (int j = 0; j < args[i].length(); j++) {
      s.add(args[i].charAt(j));
    }
    s.stem();
    System.out.println(s.toString());
    s.reset();
  }
}
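// The "(suffix)?" pattern used throughout the equivalence-class code above comes from
// getStemmedTextAndSuffix, which splits a word into its Porter stem and the suffix that was
// stripped. A minimal sketch of that usage; the sample word and the exact stem/suffix split it
// produces are illustrative assumptions.
public class SuffixPatternDemo {
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();
    // element 0 is the stemmed text, element 1 the suffix that was removed
    String[] stemText = s.getStemmedTextAndSuffix("processing", s);
    String pattern;
    if (!stemText[1].isEmpty()) {
      // make the suffix optional so the pattern matches both the stem and the full word
      pattern = stemText[0] + "(" + stemText[1] + ")?";
    } else {
      pattern = stemText[0];
    }
    System.out.println(pattern); // e.g. "process(ing)?" if the stemmer strips "ing"
  }
}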
@SuppressWarnings({"unchecked", "rawtypes"})
public List<String> getBestMatchingScore(String indexfile, String queryfile, String num)
    throws IOException, ClassNotFoundException {
  int count = Integer.parseInt(num);
  FileInputStream fileIn = new FileInputStream(IntegerConstants.cur_dir + indexfile);
  ObjectInputStream in = new ObjectInputStream(fileIn);
  inv_index = (TreeMap) in.readObject();
  System.out.println(inv_index);
  tokenCount = (TreeMap) in.readObject();
  System.out.println(tokenCount);

  // Total number of words in all the documents
  int totalTokenCount = 0;
  // Calculate the total number of tokens in the collection
  for (Iterator i = tokenCount.entrySet().iterator(); i.hasNext(); ) {
    Map.Entry next = (Map.Entry) i.next();
    totalTokenCount = totalTokenCount + (Integer) next.getValue();
  }
  // average document length
  Double avdl = totalTokenCount * 1.0 / tokenCount.size();

  // Reading the query file
  File qf = new File(IntegerConstants.cur_dir + queryfile);
  BufferedReader bf = new BufferedReader(new FileReader(qf));
  String querytext;
  int queryid = 1;
  while ((querytext = bf.readLine()) != null) {
    String[] querywords = querytext.split(" ");
    // Step 1: Retrieve all inverted lists corresponding to terms in the query.
    for (String word1 : querywords) {
      PorterStemmer ps = new PorterStemmer();
      String word = ps.stem(word1);
      word = word.trim();
      if (!word.equals("") && inv_index.containsKey(word)) {
        query_index.put(word, inv_index.get(word));
      }
    }
    // Step 2: Compute BM25 scores for documents in the lists.
    for (Iterator iterator1 = query_index.entrySet().iterator(); iterator1.hasNext(); ) {
      // next holds the list of files for the query word and the word's occurrences in each file
      Map.Entry next = (Map.Entry) iterator1.next();
      TreeMap indexes = (TreeMap) next.getValue();
      for (Iterator iterator2 = indexes.entrySet().iterator(); iterator2.hasNext(); ) {
        Map.Entry next2 = (Map.Entry) iterator2.next();
        // number of occurrences of the query word in the document
        int fi = (Integer) next2.getValue();
        // total number of documents
        int N = tokenCount.size();
        // number of files the query word occurred in
        int ni = indexes.size();
        // qfi: frequency of the word in the query string, counted by matching each query word
        // against the query text
        Double qfi = 0.0;
        for (int i = 0; i < querywords.length; i++) {
          if (querytext.contains(querywords[i])) {
            qfi++;
          }
        }
        // Computing the K value
        Double K =
            IntegerConstants.k1
                * ((1 - IntegerConstants.b)
                    + IntegerConstants.b * ((Integer) tokenCount.get(next2.getKey()) / avdl));
        Double first_term =
            Math.log(
                ((IntegerConstants.ri + 0.5) / (IntegerConstants.R - IntegerConstants.ri + 0.5))
                    / ((ni - IntegerConstants.ri + 0.5)
                        / (N - ni - IntegerConstants.R + IntegerConstants.ri + 0.5)));
        Double second_term = ((IntegerConstants.k1 + 1) * fi / (K + fi));
        Double third_term = ((IntegerConstants.k2 + 1) * qfi / (IntegerConstants.k2 + qfi));
        Double total = first_term * second_term * third_term;
        if (documentScore.containsKey((String) next2.getKey())) {
          Double valueToPut = total + documentScore.get((String) next2.getKey());
          documentScore.put((String) next2.getKey(), valueToPut);
        } else {
          documentScore.put((String) next2.getKey(), total);
        }
      }
    }
    // order the document scores in descending order of rank
    DescendingOrder comp = new DescendingOrder((TreeMap) documentScore);
    TreeMap<String, Double> list_asc = new TreeMap<String, Double>(comp);
    list_asc.putAll(documentScore);
    int rank = 1;
    // iterate up to the number of results requested by the third parameter of the method
    for (Iterator itr = list_asc.entrySet().iterator(); itr.hasNext() && rank <= count; ) {
      Map.Entry nx = (Map.Entry) itr.next();
      // add the files to the list, preserving the insertion order
      queryResults.add(nx.getKey().toString());
      rank++;
    }
    queryid++;
    documentScore.clear();
    query_index.clear();
  }
  in.close();
  fileIn.close();
  bf.close();
  return queryResults;
}
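// A minimal sketch of how getBestMatchingScore might be invoked to fetch the top-ranked documents
// for each query in a query file. The enclosing class name (BM25Search here), the index and query
// file names, and the result count are assumptions; the real files are expected under
// IntegerConstants.cur_dir.
import java.util.List;

public class SearchDemo {
  public static void main(String[] args) throws Exception {
    BM25Search searcher = new BM25Search(); // stand-in for whichever class declares the method
    List<String> topDocs = searcher.getBestMatchingScore("index.ser", "queries.txt", "10");
    for (String doc : topDocs) {
      System.out.println(doc); // document identifiers, highest BM25 score first per query
    }
  }
}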