예제 #1
0
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            if (result_map.containsKey(relative_path)) {
              result_map.get(relative_path).add(term + termDocs.freq());
            } else {
              result_map.put(relative_path, new ArrayList<String>());
            }
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
    }

    return result_map;
  }
 /**
  * Gets the global term frequency of a term, i.e. how may times it occurs in the whole corpus
  *
  * @param term whose frequency you want
  * @return Global term frequency of term, or 1 if unavailable.
  */
 private int getGlobalTermFreq(Term term) {
   int tf = 0;
   try {
     TermDocs tDocs = this.indexReader.termDocs(term);
     if (tDocs == null) {
       logger.info("Couldn't get term frequency for term " + term.text());
       return 1;
     }
     while (tDocs.next()) {
       tf += tDocs.freq();
     }
   } catch (IOException e) {
     logger.info("Couldn't get term frequency for term " + term.text());
     return 1;
   }
   return tf;
 }
예제 #3
0
    public MyTerm(Term originTrem, TermDocs termDocs, int maxDocNum) throws IOException {
      super();

      this.originTrem = originTrem;
      this.termDocs = termDocs;
      this.totalFreq = 0;
      while (this.termDocs.next()) {
        int docNum = termDocs.doc();
        int freq = termDocs.freq();
        this.termMap.put(docNum, freq);
        this.totalFreq += freq;
      }
      this.vector = new int[maxDocNum];
      for (int i = 0; i < maxDocNum; i++) {
        this.vector[i] = 0;
      }
      for (int k : this.termMap.keySet()) {
        this.vector[k] = (int) this.termMap.get(k);
      }
    }
  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      if (collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName());
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      IndexReader reader = IndexReader.open(directory);

      TermEnum terms = reader.terms();

      int termFreq = 0;
      int termCount = 0;
      Term beforeTerm = null;
      // init term count
      fieldTermCount.clear();
      for (CrescentCollectionField field : collection.getFields())
        fieldTermCount.put(field.getName(), 0);
      topRankingQueue.clear();

      while (terms.next()) {
        Term currTerm = terms.term();
        if (beforeTerm == null) {
          beforeTerm = currTerm;
        }

        if (beforeTerm.field() == currTerm.field()) {
          termCount++;
        } else {
          fieldTermCount.put(beforeTerm.field(), termCount);
          termCount = 1;
          beforeTerm = currTerm;
        }

        TermDocs termDocs = reader.termDocs(currTerm);

        while (termDocs.next()) {
          if (currTerm.field().equals(topRankingField)) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        }
        termFreq++;
      }
      if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount);

      terms.close();
      result.put("numOfTerm", termFreq);
      result.put("numOfDoc", reader.numDocs());
      result.put("hasDel", reader.hasDeletions());
      result.put("isOptimize", reader.isOptimized());
      result.put("indexVersion", reader.getVersion());
      result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }
예제 #5
0
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us try printing only term #100000-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of the a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }