private void remove(Class entity, Serializable id) {
   log.trace("remove from Lucene index: " + entity + "#" + id);
   DocumentBuilder builder = workspace.getDocumentBuilder(entity);
   Term term = builder.getTerm(id);
   IndexReader reader = workspace.getIndexReader(entity);
   TermDocs termDocs = null;
   try {
     // TODO is there a faster way?
     // TODO include TermDocs into the workspace?
     termDocs = reader.termDocs(term);
     String entityName = entity.getName();
     while (termDocs.next()) {
       int docIndex = termDocs.doc();
       if (entityName.equals(reader.document(docIndex).get(DocumentBuilder.CLASS_FIELDNAME))) {
         // remove only the one of the right class
         // loop all to remove all the matches (defensive code)
         reader.deleteDocument(docIndex);
       }
     }
   } catch (Exception e) {
     throw new HibernateException("Unable to remove from Lucene index: " + entity + "#" + id, e);
   } finally {
      if (termDocs != null) {
        try {
          termDocs.close();
        } catch (IOException e) {
          log.warn("Unable to close termDocs properly", e);
        }
      }
   }
 }
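The per-document class check above exists because several entity types can share one physical index. When an index holds a single type, the removal collapses to one call; a minimal sketch (not the original method), assuming the same builder and reader as above:

  // Simpler variant, valid only when a single entity type lives in this index:
  // IndexReader.deleteDocuments(Term) removes every document matching the term,
  // with no per-document class check, and returns the number deleted.
  int deleted = reader.deleteDocuments(builder.getTerm(id));
  log.trace("removed " + deleted + " matching documents");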
 MatchNoneScorer(IndexReader reader, Similarity similarity, Weight w, byte[] norms)
     throws IOException {
   super(similarity, w);
   this.termDocs = reader.termDocs(null);
   score = w.getValue();
   this.norms = norms;
 }
    public Scorer scorer(IndexReader reader) throws IOException {
      TermDocs termDocs = reader.termDocs(term);

      if (termDocs == null) return null;

      return new MyTermScorer(this, termDocs, similarity, reader.norms(term.field()));
    }
  @Override
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    StockAction action = getStockAction(); // retrieve the service
    long lastUpdateTime = action.getLastUpdateTime();
    if (lastUpdateTime != this.lastUpdateTime) {
      cache.clear(); // clear outdated cache
    }
    DocIdSet cached = cache.get(reader); // check if in cache already
    if (cached != null) return cached;
    // not in cache, build info
    final BitSet bitSet = getAllPositiveBitSet(reader.maxDoc()); // by default, all documents pass

    Term clazzTerm = new Term(DocumentBuilder.CLASS_FIELDNAME, Item.class.getName());
    if (reader.docFreq(clazzTerm) == 0) { // no need to filter
      // index does not contain Item objects
      // no-op
    } else {
      // for each item out of stock, find the corresponding document id by item id
      // and switch off the corresponding bit
      for (String ean : action.getEanOfItemsOutOfStock()) { // invoke external service
        Term term = new Term("ean", ean);
        TermDocs termDocs = reader.termDocs(term); // find document by ean
        try {
          while (termDocs.next()) {
            bitSet.clear(termDocs.doc());
          }
        } finally {
          termDocs.close(); // release the enumerator before the next term
        }
      }
    }
    DocIdSet docIdSet = new DocIdBitSet(bitSet); // build DocIdSet from BitSet
    cache.put(reader, docIdSet); // put results in the cache
    this.lastUpdateTime = lastUpdateTime; // update timestamp
    return docIdSet;
  }
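The filter relies on a getAllPositiveBitSet helper that is not shown here. A minimal sketch of what it presumably does, using plain java.util.BitSet:

  // Hypothetical helper: a BitSet sized to the index with every bit set,
  // so every document passes until a bit is explicitly cleared.
  private static BitSet getAllPositiveBitSet(int maxDoc) {
    BitSet bitSet = new BitSet(maxDoc);
    bitSet.set(0, maxDoc); // sets bits [0, maxDoc)
    return bitSet;
  }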
Example #5
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    IndexReader reader = null;
    try {
      reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            if (!result_map.containsKey(relative_path)) {
              result_map.put(relative_path, new ArrayList<String>());
            }
            // record the term together with its frequency in this document
            result_map.get(relative_path).add(term + termDocs.freq());
          }
          termDocs.close();
        }
      }
      termEnum.close();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }

    return result_map;
  }
Example #6
 public static int docId(IndexReader reader, Term term) throws IOException {
   TermDocs termDocs = reader.termDocs(term);
   try {
     if (termDocs.next()) {
       return termDocs.doc();
     }
     return NO_DOC;
   } finally {
     termDocs.close();
   }
 }
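A hypothetical caller, assuming a unique key field so that at most one document matches the term:

  // Look up the single document carrying a primary-key term.
  int doc = docId(reader, new Term("id", "42"));
  if (doc != NO_DOC) {
    Document document = reader.document(doc); // fetch its stored fields
  }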
  public void getIndexInfo(String indexdir, int freqThreshold) {
    IndexReader reader = null;

    try {
      Directory dir = FSDirectory.open(new File(indexdir));
      System.out.println(dir);
      reader = IndexReader.open(dir);

      System.out.println("document num:" + reader.numDocs());
      System.out.println("======================");

      TermEnum terms = reader.terms();
      sortedTermQueue.clear();
      maxDocNum = reader.maxDoc();
      linkMap.clear();
      termList.clear();
      while (terms.next()) {
        TermDocs termDocs = reader.termDocs(terms.term());
        MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum);
        if (temp.totalFreq < freqThreshold) {
          continue;
        } /*
           * if(temp.originTrem.text().length()==1){ continue; }
           */
        linkMap.put(temp.originTrem.text(), temp);
        sortedTermQueue.add(temp);
        termList.add(temp);
      }
      System.out.println("total Size:" + sortedTermQueue.size());
      System.out.println("mapsize:" + linkMap.keySet().size());
      // System.exit(0);
      int num = 0;
      this.maxFreq = sortedTermQueue.peek().totalFreq;
      while (!sortedTermQueue.isEmpty()) {
        num++;
        System.out.println(num + ":" + sortedTermQueue.poll());
      }
      System.out.println("read index info done");
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
    @SuppressWarnings({"StringEquality"})
    @Override
    public void run() {
      TermDocs termDocs = null;
      TermEnum termEnum = null;
      try {
        BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15);
        termDocs = reader.termDocs();
        termEnum = reader.terms(new Term(field));
        do {
          Term term = termEnum.term();
          if (term == null || term.field() != field) break;

          // LUCENE MONITOR: 4.0, move to use bytes!
          UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text());
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            // when traversing, make sure to ignore deleted docs, so the key->docId will be correct
            if (!reader.isDeleted(termDocs.doc())) {
              filter.add(utf8Result.result, 0, utf8Result.length);
            }
          }
        } while (termEnum.next());
        ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
        if (fieldCache != null) {
          if (fieldCache.containsKey(field)) {
            BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter);
            filterEntry.loading.set(false);
            fieldCache.put(field, filterEntry);
          }
        }
      } catch (Exception e) {
        logger.warn("failed to load bloom filter for [{}]", e, field);
      } finally {
        try {
          if (termDocs != null) {
            termDocs.close();
          }
        } catch (IOException e) {
          // ignore
        }
        try {
          if (termEnum != null) {
            termEnum.close();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    }
  /**
   * Tests the FilterIndexReader implementation
   *
   * @throws Exception on error
   */
  public void testFilterIndexReader() throws Exception {
    Directory directory = newDirectory();
    IndexWriter writer =
        new IndexWriter(
            directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)));

    Document d1 = new Document();
    d1.add(newField("default", "one two", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d1);

    Document d2 = new Document();
    d2.add(newField("default", "one three", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d2);

    Document d3 = new Document();
    d3.add(newField("default", "two four", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(d3);

    writer.close();

    IndexReader reader = new TestReader(IndexReader.open(directory, true));
    TermEnum terms = reader.terms();
    while (terms.next()) {
      assertTrue(terms.term().text().indexOf('e') != -1);
    }
    terms.close();

    TermPositions positions = reader.termPositions(new Term("default", "one"));
    while (positions.next()) {
      assertTrue((positions.doc() % 2) == 1);
    }
    positions.close();

    int NUM_DOCS = 3;

    TermDocs td = reader.termDocs(null);
    for (int i = 0; i < NUM_DOCS; i++) {
      assertTrue(td.next());
      assertEquals(i, td.doc());
      assertEquals(1, td.freq());
    }
    td.close();
    reader.close();
    directory.close();
  }
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    TermEnum enumerator = this.query.getEnum(reader);
    try {
      if (enumerator.term() == null) {
        return DocIdSet.EMPTY_DOCIDSET;
      }
      OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
      int[] docs = new int[32];
      int[] freqs = new int[32];
      TermDocs termDocs = reader.termDocs();
      int termCount;
      try {
        termCount = 0;
        do {
          Term term = enumerator.term();
          if (term == null) break;
          termCount++;
          termDocs.seek(term);
          while (true) {
            int count = termDocs.read(docs, freqs);
            if (count == 0) break;
            for (int i = 0; i < count; i++) {
              bitSet.set(docs[i]);
            }
          }
        } while (enumerator.next());
        this.query.incTotalNumberOfTerms(termCount);
      } finally {
        termDocs.close();
      }
      return bitSet;
    } finally {
      enumerator.close();
    }
  }
      @Override
      public DocIdSetIterator iterator() throws IOException {
        final TermDocs td = reader.termDocs(term);
        if (td == null) {
          return EmptyDocIdSet.getInstance().iterator();
        }
        return new DocIdSetIterator() {

          private int _doc = -1;

          @Override
          public int advance(int target) throws IOException {
            if (td.skipTo(target)) {
              _doc = td.doc();
            } else {
              td.close();
              _doc = DocIdSetIterator.NO_MORE_DOCS;
            }
            return _doc;
          }

          @Override
          public int docID() {
            return _doc;
          }

          @Override
          public int nextDoc() throws IOException {
            if (td.next()) {
              _doc = td.doc();
            } else {
              td.close();
              _doc = DocIdSetIterator.NO_MORE_DOCS;
            }
            return _doc;
          }
        };
      }
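A hedged usage sketch for an iterator like the one above, following the standard DocIdSetIterator contract; docIdSet stands in for whatever DocIdSet exposes this iterator():

  DocIdSetIterator it = docIdSet.iterator();
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    // consume doc; this iterator closes its TermDocs once exhausted
  }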
Example #12
  /**
   * Returns a BitSet with true for documents which should be permitted in searchCallback results,
   * and false for those that should not.
   */
  public BitSet bits(IndexReader reader) throws IOException {
    long start = System.currentTimeMillis();

    BitSet bits = new BitSet(reader.maxDoc());
    TermEnum enumerator =
        (null != lowerTerm
            ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));

    try {

      if (enumerator.term() == null) {
        return bits;
      }

      boolean checkLower = false;
      if (!includeLower) { // make adjustments to set to exclusive
        checkLower = true;
      }

      TermDocs termDocs = reader.termDocs();
      try {

        do {
          Term term = enumerator.term();
          if (term != null && term.field().equals(fieldName)) {
            if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
              checkLower = false;
              if (upperTerm != null) {
                int compare = upperTerm.compareTo(term.text());
                /* if beyond the upper term, or is exclusive and
                 * this is equal to the upper term, break out */
                if ((compare < 0) || (!includeUpper && compare == 0)) {
                  break;
                }
              }
              /* we have a good term, find the docs */

              termDocs.seek(enumerator.term());
              while (termDocs.next()) {
                bits.set(termDocs.doc());
              }
            }
          } else {
            break;
          }
        } while (enumerator.next());

      } finally {
        termDocs.close();
      }
    } finally {
      enumerator.close();
    }

    long end = System.currentTimeMillis();
    log.info("BoundaryBox Time Taken: " + (end - start));
    return bits;
  }
 public MatchAllDocIdSetIterator(IndexReader reader) throws IOException {
   _termDocs = reader.termDocs(null);
   _docid = -1;
 }
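The matching nextDoc() is not part of this fragment; a minimal sketch of how it presumably advances, given that termDocs(null) enumerates every non-deleted document:

  // Assumed implementation, not shown in the source fragment.
  @Override
  public int nextDoc() throws IOException {
    _docid = _termDocs.next() ? _termDocs.doc() : DocIdSetIterator.NO_MORE_DOCS;
    return _docid;
  }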
  public void testSkipTo(int indexDivisor) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer =
        new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);

    Term ta = new Term("content", "aaa");
    for (int i = 0; i < 10; i++) addDoc(writer, "aaa aaa aaa aaa");

    Term tb = new Term("content", "bbb");
    for (int i = 0; i < 16; i++) addDoc(writer, "bbb bbb bbb bbb");

    Term tc = new Term("content", "ccc");
    for (int i = 0; i < 50; i++) addDoc(writer, "ccc ccc ccc ccc");

    // assure that we deal with a single segment
    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    reader.setTermInfosIndexDivisor(indexDivisor);
    assertEquals(indexDivisor, reader.getTermInfosIndexDivisor());

    TermDocs tdocs = reader.termDocs();

    // without optimization (assumption skipInterval == 16)

    // with next
    tdocs.seek(ta);
    assertTrue(tdocs.next());
    assertEquals(0, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.next());
    assertEquals(1, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.skipTo(0));
    assertEquals(2, tdocs.doc());
    assertTrue(tdocs.skipTo(4));
    assertEquals(4, tdocs.doc());
    assertTrue(tdocs.skipTo(9));
    assertEquals(9, tdocs.doc());
    assertFalse(tdocs.skipTo(10));

    // without next
    tdocs.seek(ta);
    assertTrue(tdocs.skipTo(0));
    assertEquals(0, tdocs.doc());
    assertTrue(tdocs.skipTo(4));
    assertEquals(4, tdocs.doc());
    assertTrue(tdocs.skipTo(9));
    assertEquals(9, tdocs.doc());
    assertFalse(tdocs.skipTo(10));

    // exactly skipInterval documents and therefore with optimization

    // with next
    tdocs.seek(tb);
    assertTrue(tdocs.next());
    assertEquals(10, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.next());
    assertEquals(11, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.skipTo(5));
    assertEquals(12, tdocs.doc());
    assertTrue(tdocs.skipTo(15));
    assertEquals(15, tdocs.doc());
    assertTrue(tdocs.skipTo(24));
    assertEquals(24, tdocs.doc());
    assertTrue(tdocs.skipTo(25));
    assertEquals(25, tdocs.doc());
    assertFalse(tdocs.skipTo(26));

    // without next
    tdocs.seek(tb);
    assertTrue(tdocs.skipTo(5));
    assertEquals(10, tdocs.doc());
    assertTrue(tdocs.skipTo(15));
    assertEquals(15, tdocs.doc());
    assertTrue(tdocs.skipTo(24));
    assertEquals(24, tdocs.doc());
    assertTrue(tdocs.skipTo(25));
    assertEquals(25, tdocs.doc());
    assertFalse(tdocs.skipTo(26));

    // much more than skipInterval documents and therefore with optimization

    // with next
    tdocs.seek(tc);
    assertTrue(tdocs.next());
    assertEquals(26, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.next());
    assertEquals(27, tdocs.doc());
    assertEquals(4, tdocs.freq());
    assertTrue(tdocs.skipTo(5));
    assertEquals(28, tdocs.doc());
    assertTrue(tdocs.skipTo(40));
    assertEquals(40, tdocs.doc());
    assertTrue(tdocs.skipTo(57));
    assertEquals(57, tdocs.doc());
    assertTrue(tdocs.skipTo(74));
    assertEquals(74, tdocs.doc());
    assertTrue(tdocs.skipTo(75));
    assertEquals(75, tdocs.doc());
    assertFalse(tdocs.skipTo(76));

    // without next
    tdocs.seek(tc);
    assertTrue(tdocs.skipTo(5));
    assertEquals(26, tdocs.doc());
    assertTrue(tdocs.skipTo(40));
    assertEquals(40, tdocs.doc());
    assertTrue(tdocs.skipTo(57));
    assertEquals(57, tdocs.doc());
    assertTrue(tdocs.skipTo(74));
    assertEquals(74, tdocs.doc());
    assertTrue(tdocs.skipTo(75));
    assertEquals(75, tdocs.doc());
    assertFalse(tdocs.skipTo(76));

    tdocs.close();
    reader.close();
    dir.close();
  }
Example #15
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--vaue".equals(arg)) {
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms(new Term(field));

      try {
        do {
          Term term = termEnum.term();

          if (term == null || !field.equals(term.field())) break;

          if (value == null) {
            if (count) {
              termDocs.seek(termEnum);

              int c = 0;
              for (; termDocs.next(); c++) ;

              System.out.print(c + " ");
            }
            System.out.println(term.text());
          } else if (value.equals(term.text())) {
            termDocs.seek(termEnum);

            while (termDocs.next()) {
              if (all) {
                Document d = reader.document(termDocs.doc());
                System.out.println(termDocs.doc());
                for (Object o : d.getFields()) {
                  Field f = (Field) o;
                  System.out.println(f.name() + " " + d.get(f.name()));
                }
              } else {
                System.out.println(
                    termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
              }
            }
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
    }
  }
  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      if (collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName());
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      IndexReader reader = IndexReader.open(directory);

      TermEnum terms = reader.terms();

      int termFreq = 0;
      int termCount = 0;
      Term beforeTerm = null;
      // init term count
      fieldTermCount.clear();
      for (CrescentCollectionField field : collection.getFields())
        fieldTermCount.put(field.getName(), 0);
      topRankingQueue.clear();

      while (terms.next()) {
        Term currTerm = terms.term();
        if (beforeTerm == null) {
          beforeTerm = currTerm;
        }

        if (beforeTerm.field() == currTerm.field()) {
          termCount++;
        } else {
          fieldTermCount.put(beforeTerm.field(), termCount);
          termCount = 1;
          beforeTerm = currTerm;
        }

        TermDocs termDocs = reader.termDocs(currTerm);

        while (termDocs.next()) {
          if (currTerm.field().equals(topRankingField)) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        }
        termDocs.close();
        termFreq++;
      }
      if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount);

      terms.close();
      reader.close();
      result.put("numOfTerm", termFreq);
      result.put("numOfDoc", reader.numDocs());
      result.put("hasDel", reader.hasDeletions());
      result.put("isOptimize", reader.isOptimized());
      result.put("indexVersion", reader.getVersion());
      result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }
  /**
   * loads multi-value facet data. This method uses a workarea to prepare loading.
   *
   * @param fieldName
   * @param reader
   * @param listFactory
   * @param workArea
   * @throws IOException
   */
  public void load(
      String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea)
      throws IOException {
    long t0 = System.currentTimeMillis();
    int maxdoc = reader.maxDoc();
    BufferedLoader loader = getBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    TermValueList<T> list =
        (listFactory == null
            ? (TermValueList<T>) new TermStringList()
            : listFactory.createTermList());
    IntArrayList minIDList = new IntArrayList();
    IntArrayList maxIDList = new IntArrayList();
    IntArrayList freqList = new IntArrayList();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = getNegativeValueCount(reader, fieldName.intern());
    int t = 0; // current term number
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);
    t++;

    _overflow = false;
    try {
      tdoc = reader.termDocs();
      tenum = reader.terms(new Term(fieldName, ""));
      if (tenum != null) {
        do {
          Term term = tenum.term();
          if (term == null || !fieldName.equals(term.field())) break;

          String val = term.text();

          if (val != null) {
            list.add(val);

            tdoc.seek(tenum);
            // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account
            // the num of deletedDocs
            int df = 0;
            int minID = -1;
            int maxID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (tdoc.next()) {
              df++;
              int docid = tdoc.doc();
              if (!loader.add(docid, valId)) logOverflow(fieldName);
              minID = docid;
              bitset.fastSet(docid);
              while (tdoc.next()) {
                df++;
                docid = tdoc.doc();

                if (!loader.add(docid, valId)) logOverflow(fieldName);
                bitset.fastSet(docid);
              }
              maxID = docid;
            }
            freqList.add(df);
            minIDList.add(minID);
            maxIDList.add(maxID);
          }

          t++;
        } while (tenum.next());
      }
    } finally {
      try {
        if (tdoc != null) {
          tdoc.close();
        }
      } finally {
        if (tenum != null) {
          tenum.close();
        }
      }
    }

    list.seal();

    try {
      _nestedArray.load(maxdoc + 1, loader);
    } catch (IOException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("failed to load due to " + e.toString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.toIntArray();
    this.minIDs = minIDList.toIntArray();
    this.maxIDs = maxIDList.toIntArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) {
      ++doc;
    }
    if (doc <= maxdoc) {
      this.minIDs[0] = doc;
      doc = maxdoc;
      while (doc > 0 && !_nestedArray.contains(doc, 0, true)) {
        --doc;
      }
      if (doc > 0) {
        this.maxIDs[0] = doc;
      }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality();
  }
 /** @see LuceneIndexReader#termDocs(Term) */
 public TermDocs termDocs(Term term) throws IOException {
   return indexReader.termDocs(term);
 }
Example #19
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us try printing only term #100000-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }
  private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
    float[] vals = new float[reader.maxDoc()];
    if (ffs.defVal != 0) {
      Arrays.fill(vals, ffs.defVal);
    }
    InputStream is;
    String fname = "external_" + ffs.field.getName();
    try {
      is = VersionedFile.getLatestFile(ffs.dataDir, fname);
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error opening external value source file: " + e);
      return vals;
    }

    BufferedReader r = new BufferedReader(new InputStreamReader(is));

    String idName = StringHelper.intern(ffs.keyField.getName());
    FieldType idType = ffs.keyField.getType();
    boolean sorted = true; // assume sorted until we discover it's not

    // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
    // because of this, simply ask the reader for a new termEnum rather than
    // trying to use skipTo()

    List<String> notFound = new ArrayList<String>();
    int notFoundCount = 0;
    int otherErrors = 0;

    TermDocs termDocs = null;
    Term protoTerm = new Term(idName, "");
    TermEnum termEnum = null;
    // Number of times to try termEnum.next() before resorting to skip
    int numTimesNext = 10;

    char delimiter = '=';
    String termVal;
    boolean hasNext = true;
    String prevKey = "";

    String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";

    try {
      termDocs = reader.termDocs();
      termEnum = reader.terms(protoTerm);
      Term t = termEnum.term();
      if (t != null && t.field() == idName) { // intern'd comparison
        termVal = t.text();
      } else {
        termVal = lastVal;
      }

      for (String line; (line = r.readLine()) != null; ) {
        int delimIndex = line.indexOf(delimiter);
        if (delimIndex < 0) continue;

        int endIndex = line.length();
        /* EOLs should already be removed for BufferedReader.readLine()
        for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) {
          char ch = line.charAt(endIndex-1);
          if (ch!='\n' && ch!='\r') break;
        }
        */
        String key = line.substring(0, delimIndex);
        String val = line.substring(delimIndex + 1, endIndex);

        String internalKey = idType.toInternal(key);
        float fval;
        try {
          fval = Float.parseFloat(val);
        } catch (Exception e) {
          if (++otherErrors <= 10) {
            SolrCore.log.error(
                "Error loading external value source " + fname + ": "
                    + e
                    + (otherErrors < 10 ? "" : "\tSkipping future errors for this file."));
          }
          continue; // go to next line in file.. leave values as default.
        }

        if (sorted) {
          // make sure this key is greater than the previous key
          sorted = internalKey.compareTo(prevKey) >= 0;
          prevKey = internalKey;

          if (sorted) {
            int countNext = 0;
            for (; ; ) {
              int cmp = internalKey.compareTo(termVal);
              if (cmp == 0) {
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                  vals[termDocs.doc()] = fval;
                }
                break;
              } else if (cmp < 0) {
                // term enum has already advanced past current key... we didn't find it.
                if (notFoundCount < 10) { // collect first 10 not found for logging
                  notFound.add(key);
                }
                notFoundCount++;
                break;
              } else {
                // termEnum is less than our current key, so skip ahead

                // try next() a few times to see if we hit or pass the target.
                // Lucene's termEnum.skipTo() is currently unoptimized (it just does next())
                // so the best thing is to simply ask the reader for a new termEnum(target)
                // if we really need to skip.
                if (++countNext > numTimesNext) {
                  termEnum = reader.terms(protoTerm.createTerm(internalKey));
                  t = termEnum.term();
                } else {
                  hasNext = termEnum.next();
                  t = hasNext ? termEnum.term() : null;
                }

                if (t != null && t.field() == idName) { // intern'd comparison
                  termVal = t.text();
                } else {
                  termVal = lastVal;
                }
              }
            } // end for(;;)
          }
        }

        if (!sorted) {
          termEnum = reader.terms(protoTerm.createTerm(internalKey));
          t = termEnum.term();
          if (t != null
              && t.field() == idName // intern'd comparison
              && internalKey.equals(t.text())) {
            termDocs.seek(termEnum);
            while (termDocs.next()) {
              vals[termDocs.doc()] = fval;
            }
          } else {
            if (notFoundCount < 10) { // collect first 10 not found for logging
              notFound.add(key);
            }
            notFoundCount++;
          }
        }
      }
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error loading external value source: " + e);
    } finally {
      // swallow exceptions on close so we don't override any
      // exceptions that happened in the loop
      if (termDocs != null)
        try {
          termDocs.close();
        } catch (Exception e) {
        }
      if (termEnum != null)
        try {
          termEnum.close();
        } catch (Exception e) {
        }
      try {
        r.close();
      } catch (Exception e) {
      }
    }

    SolrCore.log.info(
        "Loaded external value source "
            + fname
            + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound));

    return vals;
  }
 protected TermDocs termDocs(IndexReader reader) throws IOException {
   return term == null ? reader.termDocs(null, this.buffer) : reader.termDocs(this.buffer);
 }
 public TermDocs getTermDocs() throws IOException {
   if (termDocs == null) termDocs = reader.termDocs(t, 102400);
   else termDocs.seek(t);
   return termDocs;
 }
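A hypothetical caller of the cached accessor above; reusing one TermDocs via seek(t) avoids allocating a new enumerator for every lookup:

  TermDocs td = getTermDocs(); // positioned at the current term t
  while (td.next()) {
    handle(td.doc(), td.freq()); // handle(...) is a placeholder
  }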