예제 #1
0
파일: Search.java 프로젝트: krahman/JSearch
  /**
   * Returns the ids of the documents that contain every term of the query (boolean AND).
   *
   * <p>The query is split on single spaces; each non-blank token is stemmed with
   * {@code getStemmedQuery} and resolved with {@code getTermId}. Posting maps are taken from
   * {@code termdocposindex}, loading them through {@code getTermDocPosIndex} when they are not
   * cached yet. The first term seeds the result and every further term intersects it.
   *
   * <p>A term that cannot be resolved (term id {@code <= 0}) or has no posting map matches no
   * document, so the whole conjunction becomes empty.
   *
   * @param query space-separated query terms
   * @return document ids matching all terms, in ascending id order of the first term's posting
   *     map; empty when no document matches
   * @throws Exception propagated from the stemming / index lookup helpers
   */
  public List<Integer> getResultWithAND(String query) throws Exception {
    List<Integer> results = new LinkedList<Integer>();
    // Explicit "first term seen" flag: using results.isEmpty() as the sentinel would let a later
    // term re-populate the list after the intersection had already become empty.
    boolean seeded = false;
    for (String term : query.split(" ")) {
      if (term.isEmpty()) {
        // Produced by consecutive or leading spaces; not a real query term.
        continue;
      }
      int termid = getTermId(getStemmedQuery(term));
      TreeMap<Integer, List<Integer>> docpos = null;
      if (termid > 0) {
        if (!termdocposindex.containsKey(termid)) {
          getTermDocPosIndex(termid);
        }
        docpos = termdocposindex.get(termid);
      }

      if (docpos == null) {
        // Unknown term: nothing can satisfy the conjunction any more.
        results.clear();
      } else if (!seeded) {
        results.addAll(docpos.keySet());
      } else {
        // keySet() of a TreeMap gives O(log n) membership checks during the intersection.
        results.retainAll(docpos.keySet());
      }
      seeded = true;
    }
    return results;
  }
예제 #2
0
파일: Search.java 프로젝트: krahman/JSearch
  /**
   * Work-in-progress relevance feedback pass over the zipped index.
   *
   * <p>When {@code relFlag} is set, every document named in the comma-separated
   * {@code relevance} string is resolved with {@code getDocId}; the index file is scanned for
   * records mentioning that document, the term ids found are printed and loaded through
   * {@code getTermDocPosIndex}, and finally every cached term whose posting map contains the
   * document is printed. When only {@code irrelFlag} is set, the documents named in
   * {@code irrelevance} are resolved and nothing else happens yet.
   *
   * <p>The index file is opened only when it is actually scanned, and every stream is closed
   * again before returning.
   *
   * @return the score-to-document map; no scores are computed yet, so it is always empty
   * @throws IOException if the index file cannot be opened or read
   */
  public TreeMap<Double, Integer> calculateTopTenQuery() throws IOException {
    TreeMap<Double, Integer> results = new TreeMap<Double, Integer>();
    // Shared across all relevant documents, so term ids accumulate from one document to the next.
    List<Integer> termids = new LinkedList<Integer>();
    if (relFlag) {
      for (String reldoc : relevance.split(",")) {
        int docid = getDocId(reldoc);

        // Re-open the archive for each document: a single ZipInputStream is exhausted after the
        // first pass and would yield no entries for the remaining documents.
        collectTermIdsOfDoc(docid, termids);
        for (int termid : termids) {
          System.out.println(termid);
          getTermDocPosIndex(termid);
        }

        // Print every cached term whose posting map has an entry for this document.
        for (Object cachedTerm : termdocposindex.keySet()) {
          TreeMap<Integer, List<Integer>> tdocs = termdocposindex.get(cachedTerm);
          if (tdocs != null && tdocs.containsKey(docid)) {
            System.out.println(cachedTerm.toString());
          }
        }
      }
    } else if (irrelFlag) {
      for (String irreldoc : irrelevance.split(",")) {
        // Resolved but not used yet; kept so the lookup still happens as before.
        getDocId(irreldoc);
      }
    }
    return results;
  }

  /**
   * Scans the first line of every entry in the zipped index and appends to {@code termids} the
   * id of each term whose record mentions {@code docid}; ids already present are not repeated.
   */
  private void collectTermIdsOfDoc(int docid, List<Integer> termids) throws IOException {
    String docMarker = ":" + docid + ":";
    ZipFile zipFile = new ZipFile(indexFile);
    try {
      ZipInputStream zis = new ZipInputStream(new FileInputStream(indexFile));
      try {
        ZipEntry ze;
        while ((ze = zis.getNextEntry()) != null) {
          String indexData;
          BufferedReader br =
              new BufferedReader(new InputStreamReader(zipFile.getInputStream(ze)));
          try {
            indexData = br.readLine();
          } finally {
            br.close();
          }
          if (indexData == null) {
            // Empty entry: nothing to parse.
            continue;
          }
          for (String record : indexData.split(";")) {
            // NOTE(review): this is a substring match on ":docid:", so it matches the id in any
            // colon-delimited field after the first — confirm against the index record format.
            if (record.contains(docMarker)) {
              int termid = Integer.parseInt(record.split(":")[0]);
              if (!termids.contains(termid)) {
                termids.add(termid);
              }
            }
          }
        }
      } finally {
        zis.close();
      }
    } finally {
      zipFile.close();
    }
  }
예제 #3
0
파일: Search.java 프로젝트: krahman/JSearch
  /**
   * Scores the documents matching the query and groups them by score.
   *
   * <p>Hyphens in the query are treated as spaces, so a hyphenated word is searched as its
   * parts. Each non-blank term is trimmed, stemmed with {@code getStemmedQuery} and resolved
   * with {@code getTermId}; terms that cannot be resolved (id {@code <= 0}) or have no posting
   * map are ignored. The first resolved term seeds the candidate documents and every further
   * resolved term intersects them.
   *
   * <p>Despite the method name, the score computed here is a normalised term frequency: for each
   * candidate the number of positions of the most recently processed term in that document,
   * divided by the sum of all term frequencies accumulated while processing the query.
   *
   * <p>The elapsed time is printed to standard output.
   *
   * @param query space-separated query terms, possibly hyphenated
   * @return map from score (ascending) to the ids of all documents with that score
   * @throws IOException propagated from the index lookup helpers
   */
  public TreeMap<Double, List<Integer>> getCosineScore(String query) throws IOException {
    TreeMap<Double, List<Integer>> rankedResults = new TreeMap<Double, List<Integer>>();
    String[] terms = query.replace('-', ' ').split(" ");

    double start = System.currentTimeMillis();
    List<Integer> results = new LinkedList<Integer>();
    TreeMap<Integer, Double> listOfDocScores = new TreeMap<Integer, Double>();
    double tfCorpus = 0;
    // Explicit "first resolved term seen" flag: results.isEmpty() cannot distinguish "nothing
    // processed yet" from "intersection is empty".
    boolean seeded = false;
    for (String term : terms) {
      String trimmed = term.trim();
      if (trimmed.isEmpty()) {
        // Blank tokens come from repeated spaces or replaced hyphens.
        continue;
      }
      int termid = getTermId(getStemmedQuery(trimmed));
      if (termid <= 0) {
        continue;
      }
      if (!termdocposindex.containsKey(termid)) {
        getTermDocPosIndex(termid);
      }
      TreeMap<Integer, List<Integer>> docpos = termdocposindex.get(termid);
      if (docpos == null) {
        continue;
      }

      // select only documents that contain the query terms seen so far
      if (!seeded) {
        results.addAll(docpos.keySet());
        seeded = true;
      } else {
        results.retainAll(docpos.keySet());
      }

      // Term frequency in each remaining document. This runs only for resolved terms, so a
      // skipped term can never re-count the previous term's postings.
      for (Integer docid : results) {
        double tfDocument = docpos.get(docid).size();
        tfCorpus += tfDocument;
        listOfDocScores.put(docid, tfDocument);
      }
    }

    // NOTE(review): documents removed from the intersection by a later term keep the score
    // entry written for an earlier term and therefore still appear in the output — confirm
    // whether they should be filtered out here.

    // Group documents by normalised score; TreeMap keeps the scores in ascending order.
    for (Integer docid : listOfDocScores.keySet()) {
      double score = listOfDocScores.get(docid) / tfCorpus;
      List<Integer> sameScore = rankedResults.get(score);
      if (sameScore == null) {
        sameScore = new LinkedList<Integer>();
        rankedResults.put(score, sameScore);
      }
      // Documents with equal scores share one list instead of being dropped.
      sameScore.add(docid);
    }
    double end = (System.currentTimeMillis() - start) / 1000;
    System.out.println("Match documents found in " + end + " seconds.");
    return rankedResults;
  }