Java IndexReader Beispiele, org.apache.lucene.store.IndexReader Java Beispiele

Beispiel #1

0

Datei anzeigen

Datei: TestRollingUpdates.java Projekt: PATRIC3/p3_solr

  public void testUpdateSameDoc() throws Exception {
    final Directory dir = newDirectory();

    final LineFileDocs docs = new LineFileDocs(random());
    for (int r = 0; r < 3; r++) {
      final IndexWriter w =
          new IndexWriter(
              dir, newIndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(2));
      final int numUpdates = atLeast(20);
      int numThreads = TestUtil.nextInt(random(), 2, 6);
      IndexingThread[] threads = new IndexingThread[numThreads];
      for (int i = 0; i < numThreads; i++) {
        threads[i] = new IndexingThread(docs, w, numUpdates);
        threads[i].start();
      }

      for (int i = 0; i < numThreads; i++) {
        threads[i].join();
      }

      w.close();
    }

    IndexReader open = DirectoryReader.open(dir);
    assertEquals(1, open.numDocs());
    open.close();
    docs.close();
    dir.close();
  }

Beispiel #2

0

Datei anzeigen

Datei: InspectIndex.java Projekt: laosiaudi/CMU-11642-Project

  /*
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
   */
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    /*
     *  Iterate over the fields in this document.
     */
    Fields fields = reader.getTermVectors(docid);
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      String fieldName = fieldIterator.next();
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
      termVectorDisplay(terms);
    }
    ;
  }

Beispiel #3

0

Datei anzeigen

Datei: InspectIndex.java Projekt: laosiaudi/CMU-11642-Project

  /*
   *  listPostings displays the first n postings for a term in a
   *  field in an index (specified by reader).  Set n to MAX_VALUE
   *  to display all postings.
   */
  static void listPostings(IndexReader reader, String termString, String field, Integer n)
      throws IOException {

    System.out.println("\nPostings:  " + termString + " " + field);

    /*
     *  Prepare to access the index.
     */
    BytesRef termBytes = new BytesRef(termString);
    Term term = new Term(field, termBytes);
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    /*
     *  Lookup the collection term frequency (ctf).
     */
    long df = reader.docFreq(term);
    System.out.println("\tdf:  " + df);

    long ctf = reader.totalTermFreq(term);
    System.out.println("\tctf:  " + ctf);

    if (df < 1) return;

    /*
     *  Lookup the inverted list.
     */
    DocsAndPositionsEnum postings =
        MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

    /*
     *  Iterate through the first n postings.
     */
    long count = 0;

    while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {

      System.out.println("\tdocid: " + postings.docID());
      int tf = postings.freq();
      System.out.println("\ttf: " + tf);
      System.out.print("\tPositions: ");

      for (int j = 0; j < tf; j++) {
        int pos = postings.nextPosition();
        System.out.print(pos + " ");
      }

      System.out.println("");

      count++;
    }
    ;

    return;
  }

Beispiel #4

0

Datei anzeigen

Datei: TestSearchForDuplicates.java Projekt: zuoyebushiwo/lucene-solr-lucene_solr_4_10_4

  private void doTest(Random random, PrintWriter out, boolean useCompoundFiles, int MAX_DOCS)
      throws Exception {
    Directory directory = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random);
    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
    final MergePolicy mp = conf.getMergePolicy();
    mp.setNoCFSRatio(useCompoundFiles ? 1.0 : 0.0);
    IndexWriter writer = new IndexWriter(directory, conf);
    if (VERBOSE) {
      System.out.println("TEST: now build index MAX_DOCS=" + MAX_DOCS);
    }

    for (int j = 0; j < MAX_DOCS; j++) {
      Document d = new Document();
      d.add(newTextField(PRIORITY_FIELD, HIGH_PRIORITY, Field.Store.YES));
      d.add(newTextField(ID_FIELD, Integer.toString(j), Field.Store.YES));
      writer.addDocument(d);
    }
    writer.close();

    // try a search without OR
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher searcher = newSearcher(reader);

    Query query = new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY));
    out.println("Query: " + query.toString(PRIORITY_FIELD));
    if (VERBOSE) {
      System.out.println("TEST: search query=" + query);
    }

    final Sort sort = new Sort(SortField.FIELD_SCORE, new SortField(ID_FIELD, SortField.Type.INT));

    ScoreDoc[] hits = searcher.search(query, null, MAX_DOCS, sort).scoreDocs;
    printHits(out, hits, searcher);
    checkHits(hits, MAX_DOCS, searcher);

    // try a new search with OR
    searcher = newSearcher(reader);
    hits = null;

    BooleanQuery booleanQuery = new BooleanQuery();
    booleanQuery.add(
        new TermQuery(new Term(PRIORITY_FIELD, HIGH_PRIORITY)), BooleanClause.Occur.SHOULD);
    booleanQuery.add(
        new TermQuery(new Term(PRIORITY_FIELD, MED_PRIORITY)), BooleanClause.Occur.SHOULD);
    out.println("Query: " + booleanQuery.toString(PRIORITY_FIELD));

    hits = searcher.search(booleanQuery, null, MAX_DOCS, sort).scoreDocs;
    printHits(out, hits, searcher);
    checkHits(hits, MAX_DOCS, searcher);

    reader.close();
    directory.close();
  }

Beispiel #5

0

Datei anzeigen

Datei: InspectIndex.java Projekt: laosiaudi/CMU-11642-Project

  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }

Beispiel #6

0

Datei anzeigen

Datei: TestMultiFields.java Projekt: joseerlang/Lucene-solr

 public void testTermDocsEnum() throws Exception {
   Directory dir = newDirectory();
   IndexWriter w =
       new IndexWriter(
           dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
   Document d = new Document();
   d.add(newStringField("f", "j", Field.Store.NO));
   w.addDocument(d);
   w.commit();
   w.addDocument(d);
   IndexReader r = w.getReader();
   w.close();
   DocsEnum de = MultiFields.getTermDocsEnum(r, null, "f", new BytesRef("j"));
   assertEquals(0, de.nextDoc());
   assertEquals(1, de.nextDoc());
   assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
   r.close();
   dir.close();
 }

Beispiel #7

0

Datei anzeigen

Datei: TestMultiFields.java Projekt: joseerlang/Lucene-solr

 public void testSeparateEnums() throws Exception {
   Directory dir = newDirectory();
   IndexWriter w =
       new IndexWriter(
           dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
   Document d = new Document();
   d.add(newStringField("f", "j", Field.Store.NO));
   w.addDocument(d);
   w.commit();
   w.addDocument(d);
   IndexReader r = w.getReader();
   w.close();
   DocsEnum d1 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
   DocsEnum d2 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
   assertEquals(0, d1.nextDoc());
   assertEquals(0, d2.nextDoc());
   r.close();
   dir.close();
 }

Beispiel #8

0

Datei anzeigen

Datei: IndexSorterArquivoWeb.java Projekt: arquivo/pwa-technologies

    // public SortingReader(IndexReader oldReader, int[] oldToNew) { // TODO MC
    public SortingReader(IndexReader oldReader, DocScore[] newToOld) { // TODO MC
      super(oldReader);
      this.newToOld = newToOld;

      this.oldToNew = new int[oldReader.maxDoc()];
      int newDoc = 0;
      while (newDoc < newToOld.length) {
        int oldDoc = newToOld[newDoc].doc;
        oldToNew[oldDoc] = newDoc;
        newDoc++;
      }
    }

Beispiel #9

0

Datei anzeigen

Datei: IndexSorterArquivoWeb.java Projekt: arquivo/pwa-technologies

  public void sort(File directory) throws IOException {
    LOG.info("IndexSorter: starting.");
    Date start = new Date();
    int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
    IndexReader reader = IndexReader.open(new File(directory, "index"));
    Searcher searcher =
        new IndexSearcher(new File(directory, "index").getAbsolutePath()); // TODO MC

    SortingReader sorter = new SortingReader(reader, newToOld(reader, searcher)); // TODO MC
    IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"), null, true);
    writer.setTermIndexInterval(termIndexInterval);
    writer.setUseCompoundFile(false);
    writer.addIndexes(new IndexReader[] {sorter});
    writer.close();
    Date end = new Date();
    LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime()) + " total milliseconds");
  }

Beispiel #10

0

Datei anzeigen

Datei: TestMultiFields.java Projekt: joseerlang/Lucene-solr

  public void testRandom() throws Exception {

    int num = atLeast(2);
    for (int iter = 0; iter < num; iter++) {
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter);
      }

      Directory dir = newDirectory();

      IndexWriter w =
          new IndexWriter(
              dir,
              newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                  .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
      _TestUtil.keepFullyDeletedSegments(w);

      Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
      Set<Integer> deleted = new HashSet<Integer>();
      List<BytesRef> terms = new ArrayList<BytesRef>();

      int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
      Document doc = new Document();
      Field f = newStringField("field", "", Field.Store.NO);
      doc.add(f);
      Field id = newStringField("id", "", Field.Store.NO);
      doc.add(id);

      boolean onlyUniqueTerms = random().nextBoolean();
      if (VERBOSE) {
        System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
      }
      Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
      for (int i = 0; i < numDocs; i++) {

        if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
          // re-use existing term
          BytesRef term = terms.get(random().nextInt(terms.size()));
          docs.get(term).add(i);
          f.setStringValue(term.utf8ToString());
        } else {
          String s = _TestUtil.randomUnicodeString(random(), 10);
          BytesRef term = new BytesRef(s);
          if (!docs.containsKey(term)) {
            docs.put(term, new ArrayList<Integer>());
          }
          docs.get(term).add(i);
          terms.add(term);
          uniqueTerms.add(term);
          f.setStringValue(s);
        }
        id.setStringValue("" + i);
        w.addDocument(doc);
        if (random().nextInt(4) == 1) {
          w.commit();
        }
        if (i > 0 && random().nextInt(20) == 1) {
          int delID = random().nextInt(i);
          deleted.add(delID);
          w.deleteDocuments(new Term("id", "" + delID));
          if (VERBOSE) {
            System.out.println("TEST: delete " + delID);
          }
        }
      }

      if (VERBOSE) {
        List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
        Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
        System.out.println("TEST: terms in UTF16 order:");
        for (BytesRef b : termsList) {
          System.out.println("  " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
          for (int docID : docs.get(b)) {
            if (deleted.contains(docID)) {
              System.out.println("    " + docID + " (deleted)");
            } else {
              System.out.println("    " + docID);
            }
          }
        }
      }

      IndexReader reader = w.getReader();
      w.close();
      if (VERBOSE) {
        System.out.println("TEST: reader=" + reader);
      }

      Bits liveDocs = MultiFields.getLiveDocs(reader);
      for (int delDoc : deleted) {
        assertFalse(liveDocs.get(delDoc));
      }

      for (int i = 0; i < 100; i++) {
        BytesRef term = terms.get(random().nextInt(terms.size()));
        if (VERBOSE) {
          System.out.println(
              "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
        }

        DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
        assertNotNull(docsEnum);

        for (int docID : docs.get(term)) {
          if (!deleted.contains(docID)) {
            assertEquals(docID, docsEnum.nextDoc());
          }
        }
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
      }

      reader.close();
      dir.close();
    }
  }

Beispiel #11

0

Datei anzeigen

Datei: TestRollingUpdates.java Projekt: PATRIC3/p3_solr

  @Test
  public void testRollingUpdates() throws Exception {
    Random random = new Random(random().nextLong());
    final BaseDirectoryWrapper dir = newDirectory();
    // test checks for no unref'ed files with the IW helper method, which isn't aware of "tried to
    // delete files"
    if (dir instanceof MockDirectoryWrapper) {
      ((MockDirectoryWrapper) dir).setEnableVirusScanner(false);
    }

    final LineFileDocs docs = new LineFileDocs(random, true);

    // provider.register(new MemoryCodec());
    if (random().nextBoolean()) {
      Codec.setDefault(
          TestUtil.alwaysPostingsFormat(
              new MemoryPostingsFormat(random().nextBoolean(), random.nextFloat())));
    }

    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

    final IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    final int SIZE = atLeast(20);
    int id = 0;
    IndexReader r = null;
    IndexSearcher s = null;
    final int numUpdates =
        (int)
            (SIZE * (2 + (TEST_NIGHTLY ? 200 * random().nextDouble() : 5 * random().nextDouble())));
    if (VERBOSE) {
      System.out.println("TEST: numUpdates=" + numUpdates);
    }
    int updateCount = 0;
    // TODO: sometimes update ids not in order...
    for (int docIter = 0; docIter < numUpdates; docIter++) {
      final Document doc = docs.nextDoc();
      final String myID = Integer.toString(id);
      if (id == SIZE - 1) {
        id = 0;
      } else {
        id++;
      }
      if (VERBOSE) {
        System.out.println("  docIter=" + docIter + " id=" + id);
      }
      ((Field) doc.getField("docid")).setStringValue(myID);

      Term idTerm = new Term("docid", myID);

      final boolean doUpdate;
      if (s != null && updateCount < SIZE) {
        TopDocs hits = s.search(new TermQuery(idTerm), 1);
        assertEquals(1, hits.totalHits);
        doUpdate = !w.tryDeleteDocument(r, hits.scoreDocs[0].doc);
        if (VERBOSE) {
          if (doUpdate) {
            System.out.println("  tryDeleteDocument failed");
          } else {
            System.out.println("  tryDeleteDocument succeeded");
          }
        }
      } else {
        doUpdate = true;
        if (VERBOSE) {
          System.out.println("  no searcher: doUpdate=true");
        }
      }

      updateCount++;

      if (doUpdate) {
        if (random().nextBoolean()) {
          w.updateDocument(idTerm, doc);
        } else {
          // It's OK to not be atomic for this test (no separate thread reopening readers):
          w.deleteDocuments(new TermQuery(idTerm));
          w.addDocument(doc);
        }
      } else {
        w.addDocument(doc);
      }

      if (docIter >= SIZE && random().nextInt(50) == 17) {
        if (r != null) {
          r.close();
        }

        final boolean applyDeletions = random().nextBoolean();

        if (VERBOSE) {
          System.out.println("TEST: reopen applyDeletions=" + applyDeletions);
        }

        r = w.getReader(applyDeletions);
        if (applyDeletions) {
          s = newSearcher(r);
        } else {
          s = null;
        }
        assertTrue(
            "applyDeletions=" + applyDeletions + " r.numDocs()=" + r.numDocs() + " vs SIZE=" + SIZE,
            !applyDeletions || r.numDocs() == SIZE);
        updateCount = 0;
      }
    }

    if (r != null) {
      r.close();
    }

    w.commit();
    assertEquals(SIZE, w.numDocs());

    w.close();

    TestIndexWriter.assertNoUnreferencedFiles(dir, "leftover files after rolling updates");

    docs.close();

    // LUCENE-4455:
    SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
    long totalBytes = 0;
    for (SegmentCommitInfo sipc : infos) {
      totalBytes += sipc.sizeInBytes();
    }
    long totalBytes2 = 0;

    for (String fileName : dir.listAll()) {
      if (IndexFileNames.CODEC_FILE_PATTERN.matcher(fileName).matches()) {
        totalBytes2 += dir.fileLength(fileName);
      }
    }
    assertEquals(totalBytes2, totalBytes);
    dir.close();
  }

Beispiel #12

0

Datei anzeigen

Datei: InspectIndex.java Projekt: laosiaudi/CMU-11642-Project

  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }

Beispiel #13

0

Datei anzeigen

Datei: SearchFilesDefault.java Projekt: udaiarora/IR

  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us try printing only term #100000-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of the a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }

Beispiel #14

0

Datei anzeigen

Datei: IndexSorterArquivoWeb.java Projekt: arquivo/pwa-technologies

  // private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
  private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    // byte[] boosts = reader.norms("site"); TODO MC
    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String includeExtensions[] = includes.split(value);
    Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
    for (int i = 0; i < includeExtensions.length; i++) {
      validExtensions.put(includeExtensions[i], true);
      System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
      float score;
      if (reader.isDeleted(oldDoc)) {
        // score = 0.0f;
        score = -1f; // TODO MC
      } else {
        // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
        /* TODO MC */
        docMeta = searcher.doc(oldDoc);
        if (validExtensions.get(docMeta.get("subType"))
            == null) { // searched extensions will have higher scores
          score = -0.5f;
        } else {
          score = Integer.parseInt(docMeta.get("inlinks"));
          /*
          if (score==0) {
          	score=0.001f; // TODO MC - to not erase
          }
          */
        }
        /* TODO MC */
        // System.out.println("Score for old document "+oldDoc+" is "+score+" and type
        // "+docMeta.get("subType")); // TODO MC debug remove
      }
      DocScore docScore = new DocScore();
      docScore.doc = oldDoc;
      docScore.score = score;
      newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
      DocScore docScore = newToOld[newDoc];
      //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
      oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    */

    /* TODO MC *
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    	DocScore docScore = newToOld[newDoc];
    	System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove
    }
    * TODO MC */

    // return oldToNew; TODO MC
    return newToOld; // TODO MC
  }