示例#1
0
  /**
   * Builds a per-document listing of the terms indexed in the {@code content} field.
   *
   * <p>Every term of the index is enumerated; for each term whose field is {@code content}, every
   * document containing it is visited. Documents are keyed by their stored {@code relative_path}
   * field, and each entry is the term text immediately followed by its in-document frequency (no
   * separator).
   *
   * <p>An {@link IOException} is reported with {@code printStackTrace()} and whatever was
   * collected up to that point is returned.
   *
   * @param directory the index directory to read
   * @return map from {@code relative_path} to its term/frequency entries; never {@code null}
   */
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      try {
        TermEnum termEnum = reader.terms();
        try {
          while (termEnum.next()) {
            if (!"content".equals(termEnum.term().field())) {
              continue;
            }
            // Use the term text directly: parsing Term.toString() at the last ':' would
            // truncate any term that itself contains a colon.
            String term = termEnum.term().text();
            TermDocs termDocs = reader.termDocs(termEnum.term());
            try {
              while (termDocs.next()) {
                Document doc = reader.document(termDocs.doc());
                String relative_path = doc.get("relative_path");

                List<String> entries = result_map.get(relative_path);
                if (entries == null) {
                  entries = new ArrayList<String>();
                  result_map.put(relative_path, entries);
                }
                // Always record the entry, including the first one seen for a path.
                entries.add(term + termDocs.freq());
              }
            } finally {
              termDocs.close();
            }
          }
        } finally {
          termEnum.close();
        }
      } finally {
        reader.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return result_map;
  }
示例#2
0
 /**
  * Looks up the first document reported for {@code term}.
  *
  * @param reader the index reader to query
  * @param term the term whose postings are inspected
  * @return the first document number returned for the term, or {@code NO_DOC} if there is none
  * @throws IOException if reading the postings fails
  */
 public static int docId(IndexReader reader, Term term) throws IOException {
   final TermDocs postings = reader.termDocs(term);
   try {
     return postings.next() ? postings.doc() : NO_DOC;
   } finally {
     // Release the postings enumerator on every exit path.
     postings.close();
   }
 }
示例#3
0
 /**
  * Returns the {@code TermDocs} for sub-reader {@code i}, creating and caching it in
  * {@code readerTermDocs} on first use, and positions it before returning.
  *
  * <p>When {@code smi} is set, the enumerator is positioned with {@code smi.termEnum};
  * otherwise it is positioned with {@code term}.
  *
  * @param i index into {@code readers} / {@code readerTermDocs}
  * @return the positioned enumerator for sub-reader {@code i}
  * @throws IOException if creating or seeking the enumerator fails
  */
 private TermDocs termDocs(int i) throws IOException {
   TermDocs segmentDocs = readerTermDocs[i];
   if (segmentDocs == null) {
     segmentDocs = termDocs(readers[i]);
     readerTermDocs[i] = segmentDocs;
   }

   if (smi == null) {
     segmentDocs.seek(term);
     return segmentDocs;
   }

   // The merge info must describe exactly this sub-reader and the current term.
   assert (smi.ord == i);
   assert (smi.termEnum.term().equals(term));
   segmentDocs.seek(smi.termEnum);
   return segmentDocs;
 }
示例#4
0
 /**
  * Optimized bulk read.
  *
  * <p>Delegates to {@code current.read(docs, freqs)}, moving on to the next sub-reader whenever
  * the current one returns nothing, and adds {@code base} to every document number it returns.
  *
  * @param docs receives document numbers (already offset by {@code base})
  * @param freqs receives the matching frequencies
  * @return the number of entries filled, or 0 when no sub-reader has anything left
  * @throws IOException if reading from a sub-reader fails
  */
 public int read(final int[] docs, final int[] freqs) throws IOException {
   for (; ; ) {
     // Advance until there is a sub-reader enumerator to read from.
     while (current == null) {
       if (pointer >= readers.length) {
         return 0;
       }
       if (tenum != null) {
         smi = tenum.matchingSegments[matchingSegmentPos++];
         if (smi == null) {
           // No further matching segment: mark this enumerator as exhausted.
           pointer = readers.length;
           return 0;
         }
         pointer = smi.ord;
       }
       base = starts[pointer];
       current = termDocs(pointer++);
     }

     final int count = current.read(docs, freqs);
     if (count != 0) {
       // Shift the returned document numbers by this sub-reader's base.
       final int offset = base;
       for (int k = 0; k < count; k++) {
         docs[k] += offset;
       }
       return count;
     }

     // Nothing left in this sub-reader; try the next one.
     current = null;
   }
 }
  /**
   * Tests term, position and document enumeration through the {@code TestReader} wrapper.
   *
   * <p>Indexes three small documents, then checks that every term exposed by the wrapped reader
   * contains the letter 'e', that positions for "one" are only reported for odd document numbers,
   * and that {@code termDocs(null)} walks documents 0..2 with a frequency of 1 each.
   *
   * @throws Exception on error
   */
  public void testFilterIndexReader() throws Exception {
    Directory directory = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));

    Document d1 = new Document();
    d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d1);

    Document d2 = new Document();
    d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d2);

    Document d3 = new Document();
    d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d3);

    writer.close();

    IndexReader reader = new TestReader(IndexReader.open(directory, true));
    TermEnum terms = reader.terms();
    while (terms.next()) {
      assertTrue(terms.term().text().indexOf('e') != -1);
    }
    terms.close();

    TermPositions positions = reader.termPositions(new Term("default", "one"));
    while (positions.next()) {
      assertTrue((positions.doc() % 2) == 1);
    }
    // Release the positions enumerator like the other enumerators in this test.
    positions.close();

    int NUM_DOCS = 3;

    // A null term asks for an enumeration over all documents.
    TermDocs td = reader.termDocs(null);
    for (int i = 0; i < NUM_DOCS; i++) {
      assertTrue(td.next());
      assertEquals(i, td.doc());
      assertEquals(1, td.freq());
    }
    td.close();
    reader.close();
    directory.close();
  }
示例#6
0
    /**
     * Builds the per-document frequency data for one term.
     *
     * <p>Consumes {@code termDocs} completely, recording each (document number, frequency) pair
     * in {@code termMap} and summing the frequencies into {@code totalFreq}. Then creates
     * {@code vector}, an array of length {@code maxDocNum} holding the frequency at each recorded
     * document number and 0 elsewhere.
     *
     * @param originTrem the term being described
     * @param termDocs postings enumerator for the term; it is read to the end here
     * @param maxDocNum length of the frequency vector
     * @throws IOException if reading the postings fails
     */
    public MyTerm(Term originTrem, TermDocs termDocs, int maxDocNum) throws IOException {
      super();

      this.originTrem = originTrem;
      this.termDocs = termDocs;
      this.totalFreq = 0;

      while (this.termDocs.next()) {
        final int docId = termDocs.doc();
        final int occurrences = termDocs.freq();
        this.termMap.put(docId, occurrences);
        this.totalFreq += occurrences;
      }

      // A freshly allocated int[] is already zero-filled, so only recorded documents are written.
      this.vector = new int[maxDocNum];
      for (int docId : this.termMap.keySet()) {
        this.vector[docId] = (int) this.termMap.get(docId);
      }
    }
示例#7
0
 /**
  * Skips to the first entry whose document number is at or beyond {@code target}, moving on to
  * the next sub-reader whenever the current one cannot satisfy the request.
  *
  * <p>A possible future optimization could skip entire segments.
  *
  * @param target document number to skip to; {@code base} is subtracted before delegating
  * @return {@code true} if such an entry was found, {@code false} when all sub-readers are
  *     exhausted
  * @throws IOException if a sub-reader fails while skipping
  */
 public boolean skipTo(int target) throws IOException {
   for (; ; ) {
     if (current != null && current.skipTo(target - base)) {
       return true;
     } else if (pointer < readers.length) {
       if (tenum != null) {
         // Update the smi field (as next() and read() do) rather than a shadowing local:
         // termDocs(int) reads the field, so a local here would leave it looking at stale
         // segment info from an earlier call.
         smi = tenum.matchingSegments[matchingSegmentPos++];
         if (smi == null) {
           // No further matching segment: mark this enumerator as exhausted.
           pointer = readers.length;
           return false;
         }
         pointer = smi.ord;
       }
       base = starts[pointer];
       current = termDocs(pointer++);
     } else {
       return false;
     }
   }
 }
示例#8
0
 /**
  * Advances to the next entry, moving on to the next sub-reader whenever the current one has
  * nothing more to return.
  *
  * @return {@code true} if an entry is available, {@code false} when all sub-readers are
  *     exhausted
  * @throws IOException if a sub-reader fails while advancing
  */
 public boolean next() throws IOException {
   while (true) {
     if (current != null && current.next()) {
       return true;
     }
     if (pointer >= readers.length) {
       return false;
     }

     if (tenum != null) {
       smi = tenum.matchingSegments[matchingSegmentPos++];
       if (smi == null) {
         // No further matching segment: mark this enumerator as exhausted.
         pointer = readers.length;
         return false;
       }
       pointer = smi.ord;
     }

     base = starts[pointer];
     current = termDocs(pointer++);
   }
 }
  /**
   * Exercises {@code TermDocs.next()} and {@code TermDocs.skipTo()} on a single-segment index
   * holding 10 "aaa" documents, 16 "bbb" documents and 50 "ccc" documents, each with the term
   * repeated four times.
   *
   * @param indexDivisor value passed to {@code setTermInfosIndexDivisor} on the reader
   * @throws IOException if the index cannot be written or read
   */
  public void testSkipTo(int indexDivisor) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer =
        new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

    Term ta = new Term("content", "aaa");
    Term tb = new Term("content", "bbb");
    Term tc = new Term("content", "ccc");

    for (int n = 0; n < 10; n++) {
      addDoc(writer, "aaa aaa aaa aaa");
    }
    for (int n = 0; n < 16; n++) {
      addDoc(writer, "bbb bbb bbb bbb");
    }
    for (int n = 0; n < 50; n++) {
      addDoc(writer, "ccc ccc ccc ccc");
    }

    // assure that we deal with a single segment
    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    reader.setTermInfosIndexDivisor(indexDivisor);
    assertEquals(indexDivisor, reader.getTermInfosIndexDivisor());

    TermDocs tdocs = reader.termDocs();

    // --- "aaa": without optimization (assumption skipInterval == 16) ---

    // with next
    tdocs.seek(ta);
    checkNext(tdocs, 0, 4);
    checkNext(tdocs, 1, 4);
    checkSkipTo(tdocs, 0, 2);
    checkSkipTo(tdocs, 4, 4);
    checkSkipTo(tdocs, 9, 9);
    assertFalse(tdocs.skipTo(10));

    // without next
    tdocs.seek(ta);
    checkSkipTo(tdocs, 0, 0);
    checkSkipTo(tdocs, 4, 4);
    checkSkipTo(tdocs, 9, 9);
    assertFalse(tdocs.skipTo(10));

    // --- "bbb": exactly skipInterval documents and therefore with optimization ---

    // with next
    tdocs.seek(tb);
    checkNext(tdocs, 10, 4);
    checkNext(tdocs, 11, 4);
    checkSkipTo(tdocs, 5, 12);
    checkSkipTo(tdocs, 15, 15);
    checkSkipTo(tdocs, 24, 24);
    checkSkipTo(tdocs, 25, 25);
    assertFalse(tdocs.skipTo(26));

    // without next
    tdocs.seek(tb);
    checkSkipTo(tdocs, 5, 10);
    checkSkipTo(tdocs, 15, 15);
    checkSkipTo(tdocs, 24, 24);
    checkSkipTo(tdocs, 25, 25);
    assertFalse(tdocs.skipTo(26));

    // --- "ccc": much more than skipInterval documents and therefore with optimization ---

    // with next
    tdocs.seek(tc);
    checkNext(tdocs, 26, 4);
    checkNext(tdocs, 27, 4);
    checkSkipTo(tdocs, 5, 28);
    checkSkipTo(tdocs, 40, 40);
    checkSkipTo(tdocs, 57, 57);
    checkSkipTo(tdocs, 74, 74);
    checkSkipTo(tdocs, 75, 75);
    assertFalse(tdocs.skipTo(76));

    // without next
    tdocs.seek(tc);
    checkSkipTo(tdocs, 5, 26);
    checkSkipTo(tdocs, 40, 40);
    checkSkipTo(tdocs, 57, 57);
    checkSkipTo(tdocs, 74, 74);
    checkSkipTo(tdocs, 75, 75);
    assertFalse(tdocs.skipTo(76));

    tdocs.close();
    reader.close();
    dir.close();
  }

  /** Asserts that {@code next()} succeeds and lands on the given document with the given freq. */
  private void checkNext(TermDocs tdocs, int expectedDoc, int expectedFreq) throws IOException {
    assertTrue(tdocs.next());
    assertEquals(expectedDoc, tdocs.doc());
    assertEquals(expectedFreq, tdocs.freq());
  }

  /** Asserts that {@code skipTo(target)} succeeds and lands on the given document. */
  private void checkSkipTo(TermDocs tdocs, int target, int expectedDoc) throws IOException {
    assertTrue(tdocs.skipTo(target));
    assertEquals(expectedDoc, tdocs.doc());
  }
示例#10
0
 /**
  * Returns the frequency reported by {@code current}.
  *
  * @return the value of {@code current.freq()}
  */
 public int freq() {
   final int currentFreq = current.freq();
   return currentFreq;
 }
示例#11
0
 /**
  * Returns the document number reported by {@code current}, offset by {@code base}.
  *
  * @return {@code base + current.doc()}
  */
 public int doc() {
   final int localDoc = current.doc();
   return base + localDoc;
 }
示例#12
0
  /**
   * Walks through the basic index-reading API against the index stored in the {@code index}
   * directory, then answers interactive queries read from standard input until {@code quit} is
   * entered or the input ends.
   *
   * @param args unused
   * @throws Exception if the index cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));
    try {
      // You can figure out the number of documents using the maxDoc() function
      System.out.println("The number of documents in this index is: " + r.maxDoc());

      int i = 0;
      // You can find out all the terms that have been indexed using the terms() function
      TermEnum t = r.terms();
      try {
        while (t.next()) {
          // Since there are so many terms, let us try printing only term #100000-#100010
          if (i >= 100000) System.out.println("[" + i + "] " + t.term().text());
          if (++i > 100010) break;
        }
      } finally {
        t.close();
      }

      // You can create your own query terms by calling the Term constructor, with the field
      // 'contents'
      // In the following example, the query term is 'brute'
      Term te = new Term("contents", "brute");

      // You can also quickly find out the number of documents that have term t
      System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

      // You can use the inverted index to find out all the documents that contain the term 'brute'
      //  by using the termDocs function
      TermDocs td = r.termDocs(te);
      try {
        while (td.next()) {
          System.out.println(
              "Document number ["
                  + td.doc()
                  + "] contains the term 'brute' "
                  + td.freq()
                  + " time(s).");
        }
      } finally {
        td.close();
      }

      // You can find the URL of the a specific document number using the document() function
      // For example, the URL for document number 14191 is:
      int exampleDoc = 14191;
      // Only fetch the example document if it exists in this index and has not been deleted.
      if (exampleDoc < r.maxDoc() && !r.isDeleted(exampleDoc)) {
        Document d = r.document(exampleDoc);
        String url =
            d.getFieldable("path")
                .stringValue(); // the 'path' field of the Document object holds the URL
        System.out.println(url.replace("%%", "/"));
      } else {
        System.err.println("Document number " + exampleDoc + " is not available in this index.");
      }

      // -------- Now let us use all of the functions above to make something useful --------
      // The following bit of code is a worked out example of how to get a bunch of documents
      // in response to a query and show them (without ranking them according to TF/IDF)
      Scanner sc = new Scanner(System.in);
      System.out.print("query> ");
      // hasNextLine() ends the loop cleanly when standard input is closed.
      while (sc.hasNextLine()) {
        String str = sc.nextLine();
        if (str.equals("quit")) {
          break;
        }
        String[] terms = str.split("\\s+");
        for (String word : terms) {
          if (word.length() == 0) {
            continue; // blank input or leading whitespace produces an empty token
          }
          Term term = new Term("contents", word);
          TermDocs tdocs = r.termDocs(term);
          try {
            while (tdocs.next()) {
              String d_url =
                  r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
              System.out.println("[" + tdocs.doc() + "] " + d_url);
            }
          } finally {
            tdocs.close();
          }
        }
        System.out.print("query> ");
      }
    } finally {
      r.close();
    }
  }
示例#13
0
  /**
   * Dumps the terms of one field from one or more indexes.
   *
   * <p>Options, which must precede the field name:
   *
   * <ul>
   *   <li>{@code -c}, {@code --count}: prefix each term with the number of documents containing it
   *   <li>{@code -v value}, {@code --value value}: list only the documents containing this term
   *       text (the historical misspelling {@code --vaue} is still accepted)
   *   <li>{@code -a}, {@code --all}: with {@code -v}, print every field of each matching document
   *       instead of just its {@code url} field
   *   <li>{@code -h}, {@code --help}: print usage and exit
   * </ul>
   *
   * <p>Indexes that cannot be opened are reported on standard error and skipped.
   *
   * @param args options, then the field name, then one or more index directories
   * @throws Exception if reading an opened index fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
      return;
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
        return;
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--value".equals(arg) || "--vaue".equals(arg)) {
        if (i + 1 >= args.length) {
          // The option needs an argument; fail with usage instead of an index error.
          System.err.println("TermDumper [-c|-v value] field <index...>");
          System.exit(1);
          return;
        }
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    if (i >= args.length) {
      // Only options were given; the field name is missing.
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
      return;
    }
    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      try {
        TermDocs termDocs = reader.termDocs();
        try {
          // Position the enumeration at the first term of the requested field.
          TermEnum termEnum = reader.terms(new Term(field));
          try {
            do {
              Term term = termEnum.term();

              // Stop once the enumeration runs out or moves past the requested field.
              if (term == null || !field.equals(term.field())) break;

              if (value == null) {
                if (count) {
                  termDocs.seek(termEnum);

                  int c = 0;
                  while (termDocs.next()) {
                    c++;
                  }

                  System.out.print(c + " ");
                }
                System.out.println(term.text());
              } else if (value.equals(term.text())) {
                termDocs.seek(termEnum);

                while (termDocs.next()) {
                  if (all) {
                    Document d = reader.document(termDocs.doc());
                    System.out.println(termDocs.doc());
                    for (Object o : d.getFields()) {
                      Field f = (Field) o;
                      System.out.println(f.name() + " " + d.get(f.name()));
                    }
                  } else {
                    System.out.println(
                        termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
                  }
                }
              }
            } while (termEnum.next());
          } finally {
            termEnum.close();
          }
        } finally {
          termDocs.close();
        }
      } finally {
        // Each reader was opened by this method, so it is closed here once dumped.
        reader.close();
      }
    }
  }