public List<Integer> getResultWithAND(String query) throws Exception { String[] terms = query.split(" "); List<Integer> results = new LinkedList<Integer>(); for (String term : terms) { String searchTerm = getStemmedQuery(term); int termid = getTermId(searchTerm); List<Integer> docs = new LinkedList<Integer>(); if (termid > 0) { TreeMap<Integer, List<Integer>> docpos = null; if (!termdocposindex.containsKey(termid)) { getTermDocPosIndex(termid); docpos = termdocposindex.get(termid); } else { docpos = termdocposindex.get(termid); } Iterator docIter = docpos.keySet().iterator(); Object doc = null; while (docIter.hasNext()) { doc = docIter.next(); List<Integer> lpos1 = docpos.get(doc); // list of docs docs.add((Integer) doc); } } if (results.isEmpty()) results.addAll(docs); else results.retainAll(docs); } return results; }
public TreeMap<Double, Integer> calculateTopTenQuery() throws IOException { // calculating relevance and irrelevance docs ZipInputStream zis; FileInputStream fis = null; ZipFile zipFile = null; ZipEntry ze; fis = new FileInputStream(indexFile); zipFile = new ZipFile(indexFile); zis = new ZipInputStream(fis); TreeMap<Double, Integer> results = new TreeMap<Double, Integer>(); TreeMap<Integer, List<Integer>> docTermIndex = new TreeMap<Integer, List<Integer>>(); List<Integer> termids = new LinkedList<Integer>(); if (relFlag) { String[] reldocs = relevance.split(","); for (String reldoc : reldocs) { int docid = getDocId(reldoc); // Editing starts here while ((ze = zis.getNextEntry()) != null) { String indexData = ""; String[] splittedString = null; InputStream is = zipFile.getInputStream(ze); InputStreamReader r = new InputStreamReader(is); BufferedReader br = new BufferedReader(r); indexData = br.readLine(); splittedString = indexData.split(";"); for (int i = 0; i < splittedString.length; i++) { if (splittedString[i].contains(":" + docid + ":")) { String[] subToken = splittedString[i].toString().split(":"); int termid = Integer.parseInt(subToken[0]); int sDocId = Integer.parseInt(subToken[1]); if (!termids.contains(termid)) { termids.add(termid); // docTermIndex.put(sDocId, termids); // calculating the terms' vector scores here } } } } for (int termid : termids) { System.out.println(termid); getTermDocPosIndex(termid); } // Editing ends here // Last editing here!! Iterator iterator = termdocposindex.keySet().iterator(); Object obj; List<Integer> lterms = new LinkedList<Integer>(); TreeMap<Integer, List<Integer>> tdocs = new TreeMap<Integer, List<Integer>>(); while (iterator.hasNext()) { obj = iterator.next(); tdocs = termdocposindex.get(obj); Iterator iterator2 = tdocs.keySet().iterator(); Object obj2; while (iterator2.hasNext()) { obj2 = iterator2.next(); if ((Integer) obj2 == docid) { lterms.add((Integer) obj2); System.out.println(obj.toString()); } } } // System.out.println(docid); } } else if (irrelFlag) { String[] irreldocs = irrelevance.split(","); for (String irreldoc : irreldocs) { int docid = getDocId(irreldoc); // System.out.println(docid); } } return results; }
public TreeMap<Double, List<Integer>> getCosineScore(String query) throws IOException { TreeMap<Double, List<Integer>> rankedResults = new TreeMap<Double, List<Integer>>(); String[] terms = null; if (query.contains("-")) { terms = query.split(" "); String refinedQuery = ""; for (String term : terms) { if (term.contains("-")) { String[] splittedTerms = term.split("-"); term = ""; for (String splittedTerm : splittedTerms) { term += splittedTerm + " "; } } refinedQuery += term + " "; } query = refinedQuery; terms = query.split(" "); } else { terms = query.split(" "); } double start = System.currentTimeMillis(); List<Integer> results = new LinkedList<Integer>(); HashMap<Integer, Double> docScores = new HashMap<Integer, Double>(); TreeMap<Integer, List<Integer>> docpos = null; TreeMap<Integer, Double> listOfDocScores = new TreeMap<Integer, Double>(); double tfCorpus = 0; for (String term : terms) { String searchTerm = getStemmedQuery(term.trim()); int termid = getTermId(searchTerm); List<Integer> docs = new LinkedList<Integer>(); if (termid > 0) { if (!termdocposindex.containsKey(termid)) { getTermDocPosIndex(termid); docpos = termdocposindex.get(termid); } else { docpos = termdocposindex.get(termid); } Iterator docIter = docpos.keySet().iterator(); Object doc = null; while (docIter.hasNext()) { doc = docIter.next(); docs.add((Integer) doc); } // select only documents contain the query if (results.isEmpty()) results.addAll(docs); else { results.retainAll(docs); } } // calculating tf in document for (int result : results) { double tfDocument = 0; List<Integer> lPositions = docpos.get(result); tfDocument = lPositions.size(); tfCorpus += tfDocument; listOfDocScores.put(result, tfDocument); } } // calculating tf in corpus Iterator iteratorKey = listOfDocScores.keySet().iterator(); Object docKey; double tfDocument = 0; while (iteratorKey.hasNext()) { docKey = iteratorKey.next(); tfDocument = listOfDocScores.get(docKey); if (!docScores.containsKey(docKey)) { docScores.put((Integer) docKey, tfDocument / tfCorpus); } } // rank the results ascending for (Integer docid : docScores.keySet()) { if (rankedResults.containsKey(docScores.get(docid))) { List<Integer> l = rankedResults.get(docScores.get(docid)); if (l.contains(docid)) { rankedResults.put(docScores.get(docid), l); } } else { List<Integer> l = new LinkedList<Integer>(); l.add(docid); rankedResults.put(docScores.get(docid), l); } } double end = (System.currentTimeMillis() - start) / 1000; System.out.println("Match documents found in " + end + " seconds."); return rankedResults; }