// public SortingReader(IndexReader oldReader, int[] oldToNew) { // TODO MC
    public SortingReader(IndexReader oldReader, DocScore[] newToOld) { // TODO MC
      super(oldReader);
      this.newToOld = newToOld;

      this.oldToNew = new int[oldReader.maxDoc()];
      int newDoc = 0;
      while (newDoc < newToOld.length) {
        int oldDoc = newToOld[newDoc].doc;
        oldToNew[oldDoc] = newDoc;
        newDoc++;
      }
    }
Пример #2
0
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us try printing only term #100000-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of the a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }
  // private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
  private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    // byte[] boosts = reader.norms("site"); TODO MC
    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String includeExtensions[] = includes.split(value);
    Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
    for (int i = 0; i < includeExtensions.length; i++) {
      validExtensions.put(includeExtensions[i], true);
      System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
      float score;
      if (reader.isDeleted(oldDoc)) {
        // score = 0.0f;
        score = -1f; // TODO MC
      } else {
        // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
        /* TODO MC */
        docMeta = searcher.doc(oldDoc);
        if (validExtensions.get(docMeta.get("subType"))
            == null) { // searched extensions will have higher scores
          score = -0.5f;
        } else {
          score = Integer.parseInt(docMeta.get("inlinks"));
          /*
          if (score==0) {
          	score=0.001f; // TODO MC - to not erase
          }
          */
        }
        /* TODO MC */
        // System.out.println("Score for old document "+oldDoc+" is "+score+" and type
        // "+docMeta.get("subType")); // TODO MC debug remove
      }
      DocScore docScore = new DocScore();
      docScore.doc = oldDoc;
      docScore.score = score;
      newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
      DocScore docScore = newToOld[newDoc];
      //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
      oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    */

    /* TODO MC *
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    	DocScore docScore = newToOld[newDoc];
    	System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove
    }
    * TODO MC */

    // return oldToNew; TODO MC
    return newToOld; // TODO MC
  }