Example #1
 private static final int[] computeMultivaluedTD(
     ReaderAbstract reader,
     String fieldName,
     FieldCacheIndex stringIndex,
     DocIdInterface docIdInterface)
     throws IOException, SearchLibException {
   int[] countIndex = new int[stringIndex.lookup.length];
   int indexPos = 0;
   if (docIdInterface.getSize() == 0) return countIndex;
   int[] docs = new int[100];
   int[] freqs = new int[100];
   BitSetInterface bitset = docIdInterface.getBitSet();
   Term oTerm = new Term(fieldName);
   for (String term : stringIndex.lookup) {
     if (term != null) {
       Term t = oTerm.createTerm(term);
       TermDocs termDocs = reader.getTermDocs(t);
       int l;
       while ((l = termDocs.read(docs, freqs)) > 0)
        for (int i = 0; i < l; i++)
          if (freqs[i] > 0 && bitset.get(docs[i])) countIndex[indexPos]++;
       termDocs.close();
     }
     indexPos++;
   }
   return countIndex;
 }
Example #2

  @Override
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    StockAction action = getStockAction(); // retrieve the service
    long lastUpdateTime = action.getLastUpdateTime();
    if (lastUpdateTime != this.lastUpdateTime) {
      cache.clear(); // clear outdated cache
    }
    DocIdSet cached = cache.get(reader); // check if in cache already
    if (cached != null) return cached;
    // not in cache, build info
    final BitSet bitSet = getAllPositiveBitSet(reader.maxDoc()); // by default, all documents pass

    Term clazzTerm = new Term(DocumentBuilder.CLASS_FIELDNAME, Item.class.getName());
    if (reader.docFreq(clazzTerm) == 0) { // no need to filter
      // index does not contain Item objects
      // no-op
    } else {
      // for each item out of stock, find the corresponding document id by item id
      // and switch off the corresponding bit
      for (String ean : action.getEanOfItemsOutOfStock()) { // invoke external service
        Term term = new Term("ean", ean);
        TermDocs termDocs = reader.termDocs(term); // find document by ean
        while (termDocs.next()) {
          bitSet.clear(termDocs.doc());
        }
      }
    }
    DocIdSet docIdSet = new DocIdBitSet(bitSet); // build DocIdSet from BitSet
    cache.put(reader, docIdSet); // put results in the cache
    this.lastUpdateTime = lastUpdateTime; // update timestamp
    return docIdSet;
  }
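A minimal usage sketch, assuming the method above belongs to a Filter subclass; the class name StockFilter and the query are invented for illustration:

  // usage sketch only; StockFilter and the query term are assumptions
  IndexSearcher searcher = new IndexSearcher(reader);
  TopDocs hits = searcher.search(
      new TermQuery(new Term("title", "ssd")), new StockFilter(), 10);
  // only documents whose items are in stock survive the filter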
Example #3

 private void remove(Class entity, Serializable id) {
   log.trace("remove from Lucene index: " + entity + "#" + id);
   DocumentBuilder builder = workspace.getDocumentBuilder(entity);
   Term term = builder.getTerm(id);
   IndexReader reader = workspace.getIndexReader(entity);
   TermDocs termDocs = null;
   try {
     // TODO is there a faster way?
     // TODO include TermDocs into the workspace?
     termDocs = reader.termDocs(term);
     String entityName = entity.getName();
     while (termDocs.next()) {
       int docIndex = termDocs.doc();
       if (entityName.equals(reader.document(docIndex).get(DocumentBuilder.CLASS_FIELDNAME))) {
         // remove only the one of the right class
         // loop all to remove all the matches (defensive code)
         reader.deleteDocument(docIndex);
       }
     }
   } catch (Exception e) {
     throw new HibernateException("Unable to remove from Lucene index: " + entity + "#" + id, e);
   } finally {
     if (termDocs != null)
       try {
         termDocs.close();
       } catch (IOException e) {
         log.warn("Unable to close termDocs properly", e);
       }
   }
 }
Example #4
  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring("content:".length());
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            if (!result_map.containsKey(relative_path)) {
              result_map.put(relative_path, new ArrayList<String>());
            }
            result_map.get(relative_path).add(term + termDocs.freq());
          }
          termDocs.close(); // done with this term's postings
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

    return result_map;
  }
Example #5
File: Model.java Project: marayl/doogal
 private final DocumentTable more(Term term) throws IOException {
   final TermDocs docs = state.getIndexReader().termDocs(term);
   try {
     if (!docs.next()) throw new EvalException("no such document");
     final MoreLikeThis mlt = new MoreLikeThis(state.getIndexReader());
     mlt.setFieldNames(FIELDS);
     mlt.setMinDocFreq(1);
     mlt.setMinTermFreq(1);
     final Query query = mlt.like(docs.doc());
     return new SearchTable(view, state, query);
   } finally {
     docs.close(); // release the posting cursor even if no document matched
   }
 }
Example #6
 public static int docId(IndexReader reader, Term term) throws IOException {
   TermDocs termDocs = reader.termDocs(term);
   try {
     if (termDocs.next()) {
       return termDocs.doc();
     }
     return NO_DOC;
   } finally {
     termDocs.close();
   }
 }
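A possible caller for the helper above; the index path, field, and value are invented, and NO_DOC is the sentinel the method returns when the term matches nothing:

  IndexReader reader = IndexReader.open(FSDirectory.open(new File("index")));
  try {
    int doc = docId(reader, new Term("id", "42"));
    if (doc != NO_DOC) {
      System.out.println(reader.document(doc).get("title")); // "title" is made up
    }
  } finally {
    reader.close();
  }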
Example #7
    @SuppressWarnings({"StringEquality"})
    @Override
    public void run() {
      TermDocs termDocs = null;
      TermEnum termEnum = null;
      try {
        BloomFilter filter = BloomFilterFactory.getFilter(reader.numDocs(), 15);
        termDocs = reader.termDocs();
        termEnum = reader.terms(new Term(field));
        do {
          Term term = termEnum.term();
          if (term == null || term.field() != field) break;

          // LUCENE MONITOR: 4.0, move to use bytes!
          UnicodeUtil.UTF8Result utf8Result = Unicode.fromStringAsUtf8(term.text());
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            // when traversing, make sure to ignore deleted docs, so the key->docId will be correct
            if (!reader.isDeleted(termDocs.doc())) {
              filter.add(utf8Result.result, 0, utf8Result.length);
            }
          }
        } while (termEnum.next());
        ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
        if (fieldCache != null) {
          if (fieldCache.containsKey(field)) {
            BloomFilterEntry filterEntry = new BloomFilterEntry(reader.numDocs(), filter);
            filterEntry.loading.set(false);
            fieldCache.put(field, filterEntry);
          }
        }
      } catch (Exception e) {
        logger.warn("failed to load bloom filter for [{}]", e, field);
      } finally {
        try {
          if (termDocs != null) {
            termDocs.close();
          }
        } catch (IOException e) {
          // ignore
        }
        try {
          if (termEnum != null) {
            termEnum.close();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    }
Example #8

 /**
  * Gets the global term frequency of a term, i.e. how many times it occurs in the whole corpus
  *
  * @param term whose frequency you want
  * @return Global term frequency of term, or 1 if unavailable.
  */
 private int getGlobalTermFreq(Term term) {
   int tf = 0;
   TermDocs tDocs = null;
   try {
     tDocs = this.indexReader.termDocs(term);
     if (tDocs == null) {
       logger.info("Couldn't get term frequency for term " + term.text());
       return 1;
     }
     while (tDocs.next()) {
       tf += tDocs.freq();
     }
   } catch (IOException e) {
     logger.info("Couldn't get term frequency for term " + term.text());
     return 1;
   } finally {
     if (tDocs != null) {
       try {
         tDocs.close(); // avoid leaking the TermDocs cursor
       } catch (IOException e) {
         // ignore
       }
     }
   }
   return tf;
 }
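For illustration, a caller could contrast this collection frequency with the document frequency the reader already tracks; the field and term text here are made up:

  Term term = new Term("contents", "lucene");
  int cf = getGlobalTermFreq(term);    // total occurrences across the corpus
  int df = indexReader.docFreq(term);  // number of documents containing it
  logger.info("cf=" + cf + ", df=" + df);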
Example #9
    public MyTerm(Term originTerm, TermDocs termDocs, int maxDocNum) throws IOException {
      super();

      this.originTerm = originTerm;
      this.termDocs = termDocs;
      this.totalFreq = 0;
      while (this.termDocs.next()) {
        int docNum = termDocs.doc();
        int freq = termDocs.freq();
        this.termMap.put(docNum, freq);
        this.totalFreq += freq;
      }
      this.vector = new int[maxDocNum]; // int arrays are zero-initialized by default
      for (int k : this.termMap.keySet()) {
        this.vector[k] = (int) this.termMap.get(k);
      }
    }
Example #10

 public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
   TermEnum enumerator = this.query.getEnum(reader);
   try {
     if (enumerator.term() == null) {
       return DocIdSet.EMPTY_DOCIDSET;
     }
     OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
     int[] docs = new int[32];
     int[] freqs = new int[32];
     TermDocs termDocs = reader.termDocs();
     try {
       int termCount = 0;
       do {
         Term term = enumerator.term();
         if (term == null) break;
         termCount++;
         termDocs.seek(term);
         while (true) {
           int count = termDocs.read(docs, freqs);
           if (count == 0) break;
           for (int i = 0; i < count; i++) {
             bitSet.set(docs[i]);
           }
         }
       } while (enumerator.next());
       this.query.incTotalNumberOfTerms(termCount);
     } finally {
       termDocs.close();
     }
     return bitSet;
   } finally {
     enumerator.close();
   }
 }
Example #11

 private static final int[] toDocArray(ReaderLocal reader, DocumentsRequest request)
     throws IOException {
   SchemaField schemaField = null;
   Schema schema = request.getConfig().getSchema();
   String field = request.getField();
   if (!StringUtils.isEmpty(field)) {
     schemaField = schema.getField(field);
     if (schemaField == null) throw new IOException("Field not found: " + field);
   } else {
     schemaField = schema.getFieldList().getUniqueField();
     if (schemaField == null) throw new IOException("No unique field");
   }
   int higher = -1;
   RoaringBitmap bitSet = new RoaringBitmap();
   String fieldName = schemaField.getName();
   for (String uniqueKey : request.getUniqueKeyList()) {
      TermDocs termDocs = reader.getTermDocs(new Term(fieldName, uniqueKey));
      if (termDocs != null) {
        while (termDocs.next()) {
          int doc = termDocs.doc();
          if (doc > higher) higher = doc;
          bitSet.add(doc);
        }
        termDocs.close(); // close inside the null check to avoid an NPE
      }
   }
   if (request.isReverse()) bitSet.flip(0, higher + 1);
   IntBufferedArrayInterface intBufferArray =
       IntBufferedArrayFactory.INSTANCE.newInstance(bitSet.getCardinality());
   IntIterator iterator = bitSet.getIntIterator();
   while (iterator.hasNext()) {
     int docId = iterator.next();
     if (!reader.isDeletedNoLock(docId)) intBufferArray.add(docId);
   }
   return intBufferArray.getFinalArray();
 }
Example #12
  /**
   * Returns a BitSet with true for documents which should be permitted in searchCallback results,
   * and false for those that should not.
   */
  public BitSet bits(IndexReader reader) throws IOException {
    long start = System.currentTimeMillis();

    BitSet bits = new BitSet(reader.maxDoc());
    TermEnum enumerator =
        (null != lowerTerm
            ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));

    try {

      if (enumerator.term() == null) {
        return bits;
      }

      // if the lower bound is exclusive, skip past terms equal to it
      boolean checkLower = !includeLower;

      TermDocs termDocs = reader.termDocs();
      try {

        do {
          Term term = enumerator.term();
          if (term != null && term.field().equals(fieldName)) {
            if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
              checkLower = false;
              if (upperTerm != null) {
                int compare = upperTerm.compareTo(term.text());
                /* if beyond the upper term, or is exclusive and
                 * this is equal to the upper term, break out */
                if ((compare < 0) || (!includeUpper && compare == 0)) {
                  break;
                }
              }
              /* we have a good term, find the docs */

              termDocs.seek(enumerator.term());
              while (termDocs.next()) {
                bits.set(termDocs.doc());
              }
            }
          } else {
            break;
          }
        } while (enumerator.next());

      } finally {
        termDocs.close();
      }
    } finally {
      enumerator.close();
    }

    long end = System.currentTimeMillis();
    log.info("BoundaryBox Time Taken: " + (end - start));
    return bits;
  }
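A hedged usage sketch for the method above; the enclosing class name and constructor are assumptions suggested by the "BoundaryBox" log message:

  // hypothetical: a range filter over an indexed "lat" field
  BoundaryBoxFilter filter = new BoundaryBoxFilter("lat", "30.0", "40.0", true, true);
  BitSet visible = filter.bits(reader); // true bits mark documents allowed in results
  System.out.println(visible.cardinality() + " documents pass the filter");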
Example #13

 @Override
 public int nextDoc() throws IOException {
   return _docid = _termDocs.next() ? _termDocs.doc() : NO_MORE_DOCS;
 }
Example #14
File: TermDumper.java Project: rlugojr/tnh
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--vaue".equals(arg)) {
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms(new Term(field));

      try {
        do {
          Term term = termEnum.term();

          if (term == null || !field.equals(term.field())) break;

          if (value == null) {
            if (count) {
              termDocs.seek(termEnum);

              int c = 0;
              for (; termDocs.next(); c++) ;

              System.out.print(c + " ");
            }
            System.out.println(term.text());
          } else if (value.equals(term.text())) {
            termDocs.seek(termEnum);

            while (termDocs.next()) {
              if (all) {
                Document d = reader.document(termDocs.doc());
                System.out.println(termDocs.doc());
                for (Object o : d.getFields()) {
                  Field f = (Field) o;
                  System.out.println(f.name() + " " + d.get(f.name()));
                }
              } else {
                System.out.println(
                    termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
              }
            }
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
    }
  }
Example #15

  private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
    float[] vals = new float[reader.maxDoc()];
    if (ffs.defVal != 0) {
      Arrays.fill(vals, ffs.defVal);
    }
    InputStream is;
    String fname = "external_" + ffs.field.getName();
    try {
      is = VersionedFile.getLatestFile(ffs.dataDir, fname);
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error opening external value source file: " + e);
      return vals;
    }

    BufferedReader r = new BufferedReader(new InputStreamReader(is));

    String idName = StringHelper.intern(ffs.keyField.getName());
    FieldType idType = ffs.keyField.getType();
    boolean sorted = true; // assume sorted until we discover it's not

    // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
    // because of this, simply ask the reader for a new termEnum rather than
    // trying to use skipTo()

    List<String> notFound = new ArrayList<String>();
    int notFoundCount = 0;
    int otherErrors = 0;

    TermDocs termDocs = null;
    Term protoTerm = new Term(idName, "");
    TermEnum termEnum = null;
    // Number of times to try termEnum.next() before resorting to skip
    int numTimesNext = 10;

    char delimiter = '=';
    String termVal;
    boolean hasNext = true;
    String prevKey = "";

    String lastVal = "\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF\uFFFF";

    try {
      termDocs = reader.termDocs();
      termEnum = reader.terms(protoTerm);
      Term t = termEnum.term();
      if (t != null && t.field() == idName) { // intern'd comparison
        termVal = t.text();
      } else {
        termVal = lastVal;
      }

      for (String line; (line = r.readLine()) != null; ) {
        int delimIndex = line.indexOf(delimiter);
        if (delimIndex < 0) continue;

        int endIndex = line.length();
        /* EOLs should already be removed for BufferedReader.readLine()
        for(int endIndex = line.length();endIndex>delimIndex+1; endIndex--) {
          char ch = line.charAt(endIndex-1);
          if (ch!='\n' && ch!='\r') break;
        }
        */
        String key = line.substring(0, delimIndex);
        String val = line.substring(delimIndex + 1, endIndex);

        String internalKey = idType.toInternal(key);
        float fval;
        try {
          fval = Float.parseFloat(val);
        } catch (Exception e) {
          if (++otherErrors <= 10) {
            SolrCore.log.error(
                "Error loading external value source + fileName + "
                    + e
                    + (otherErrors < 10 ? "" : "\tSkipping future errors for this file."));
          }
          continue; // go to next line in file.. leave values as default.
        }

        if (sorted) {
          // make sure this key is greater than the previous key
          sorted = internalKey.compareTo(prevKey) >= 0;
          prevKey = internalKey;

          if (sorted) {
            int countNext = 0;
            for (; ; ) {
              int cmp = internalKey.compareTo(termVal);
              if (cmp == 0) {
                termDocs.seek(termEnum);
                while (termDocs.next()) {
                  vals[termDocs.doc()] = fval;
                }
                break;
              } else if (cmp < 0) {
                // term enum has already advanced past current key... we didn't find it.
                if (notFoundCount < 10) { // collect first 10 not found for logging
                  notFound.add(key);
                }
                notFoundCount++;
                break;
              } else {
                // termEnum is less than our current key, so skip ahead

                // try next() a few times to see if we hit or pass the target.
                // Lucene's termEnum.skipTo() is currently unoptimized (it just does next())
                // so the best thing is to simply ask the reader for a new termEnum(target)
                // if we really need to skip.
                if (++countNext > numTimesNext) {
                  termEnum = reader.terms(protoTerm.createTerm(internalKey));
                  t = termEnum.term();
                } else {
                  hasNext = termEnum.next();
                  t = hasNext ? termEnum.term() : null;
                }

                if (t != null && t.field() == idName) { // intern'd comparison
                  termVal = t.text();
                } else {
                  termVal = lastVal;
                }
              }
            } // end for(;;)
          }
        }

        if (!sorted) {
          termEnum = reader.terms(protoTerm.createTerm(internalKey));
          t = termEnum.term();
          if (t != null
              && t.field() == idName // intern'd comparison
              && internalKey.equals(t.text())) {
            termDocs.seek(termEnum);
            while (termDocs.next()) {
              vals[termDocs.doc()] = fval;
            }
          } else {
            if (notFoundCount < 10) { // collect first 10 not found for logging
              notFound.add(key);
            }
            notFoundCount++;
          }
        }
      }
    } catch (IOException e) {
      // log, use defaults
      SolrCore.log.error("Error loading external value source: " + e);
    } finally {
      // swallow exceptions on close so we don't override any
      // exceptions that happened in the loop
      if (termDocs != null)
        try {
          termDocs.close();
        } catch (Exception e) {
        }
      if (termEnum != null)
        try {
          termEnum.close();
        } catch (Exception e) {
        }
      try {
        r.close();
      } catch (Exception e) {
      }
    }

    SolrCore.log.info(
        "Loaded external value source "
            + fname
            + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound));

    return vals;
  }
Example #16
 @Override
 public int advance(int target) throws IOException {
   return doc = termDocs.skipTo(target) ? termDocs.doc() : NO_MORE_DOCS;
 }
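The nextDoc() fragment in Example #13 and the advance() fragment above look like two methods of the same TermDocs-backed iterator. A self-contained sketch of such an adapter, with class and field names assumed, could be:

  import java.io.IOException;
  import org.apache.lucene.index.TermDocs;
  import org.apache.lucene.search.DocIdSetIterator;

  // sketch only: adapts a TermDocs posting cursor to the DocIdSetIterator contract
  class TermDocsIterator extends DocIdSetIterator {
    private final TermDocs termDocs;
    private int doc = -1;

    TermDocsIterator(TermDocs termDocs) {
      this.termDocs = termDocs;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      // step to the next posting, or report exhaustion
      return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS;
    }

    @Override
    public int advance(int target) throws IOException {
      // skipTo() lands on the first document >= target, if any
      return doc = termDocs.skipTo(target) ? termDocs.doc() : NO_MORE_DOCS;
    }
  }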
Example #17
  /**
   * Loads multi-value facet data. This method uses a work area to prepare loading.
   *
   * @param fieldName
   * @param reader
   * @param listFactory
   * @param workArea
   * @throws IOException
   */
  public void load(
      String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea)
      throws IOException {
    long t0 = System.currentTimeMillis();
    int maxdoc = reader.maxDoc();
    BufferedLoader loader = getBufferedLoader(maxdoc, workArea);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    TermValueList<T> list =
        (listFactory == null
            ? (TermValueList<T>) new TermStringList()
            : listFactory.createTermList());
    IntArrayList minIDList = new IntArrayList();
    IntArrayList maxIDList = new IntArrayList();
    IntArrayList freqList = new IntArrayList();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = getNegativeValueCount(reader, fieldName.intern());
    int t = 0; // current term number
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);
    t++;

    _overflow = false;
    try {
      tdoc = reader.termDocs();
      tenum = reader.terms(new Term(fieldName, ""));
      if (tenum != null) {
        do {
          Term term = tenum.term();
          if (term == null || !fieldName.equals(term.field())) break;

          String val = term.text();

          if (val != null) {
            list.add(val);

            tdoc.seek(tenum);
            // freqList.add(tenum.docFreq()); // removed because the df doesn't take into account
            // the num of deletedDocs
            int df = 0;
            int minID = -1;
            int maxID = -1;
            int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
            if (tdoc.next()) {
              df++;
              int docid = tdoc.doc();
              if (!loader.add(docid, valId)) logOverflow(fieldName);
              minID = docid;
              bitset.fastSet(docid);
              while (tdoc.next()) {
                df++;
                docid = tdoc.doc();

                if (!loader.add(docid, valId)) logOverflow(fieldName);
                bitset.fastSet(docid);
              }
              maxID = docid;
            }
            freqList.add(df);
            minIDList.add(minID);
            maxIDList.add(maxID);
          }

          t++;
        } while (tenum.next());
      }
    } finally {
      try {
        if (tdoc != null) {
          tdoc.close();
        }
      } finally {
        if (tenum != null) {
          tenum.close();
        }
      }
    }

    list.seal();

    try {
      _nestedArray.load(maxdoc + 1, loader);
    } catch (IOException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("failed to load due to " + e.toString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.toIntArray();
    this.minIDs = minIDList.toIntArray();
    this.maxIDs = maxIDList.toIntArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) {
      ++doc;
    }
    if (doc <= maxdoc) {
      this.minIDs[0] = doc;
      doc = maxdoc;
      while (doc > 0 && !_nestedArray.contains(doc, 0, true)) {
        --doc;
      }
      if (doc > 0) {
        this.maxIDs[0] = doc;
      }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality();
  }
Example #18

  @Override
  public boolean reload(String collectionName, String topRankingField) {
    if (collectionName == null) {
      return false;
    }

    CrescentCollectionHandler collectionHandler =
        SpringApplicationContext.getBean(
            "crescentCollectionHandler", CrescentCollectionHandler.class);

    CrescentCollection collection =
        collectionHandler.getCrescentCollections().getCrescentCollection(collectionName);

    if (collection == null) {
      logger.debug("doesn't Collection Info => {}", collectionName);
      init(View.Overview);
      return false;
    }

    if (topRankingField == null) {
      if (collection.getDefaultSearchFields().get(0) != null) {
        topRankingField = collection.getDefaultSearchFields().get(0).getName();
      } else {
        logger.debug("doesn't defaultSearchField => {}", collectionName);
        init(View.Overview);
        return false;
      }
    }

    List<String> fieldName = new ArrayList<String>();
    for (CrescentCollectionField field : collection.getFields()) fieldName.add(field.getName());
    TopRankingQueue topRankingQueue =
        new TopRankingQueue(DEFAULT_TOPRANKING_TERM, new RankingTermComparator());

    try {
      Directory directory = FSDirectory.open(new File(collection.getIndexingDirectory()));
      IndexReader reader = IndexReader.open(directory);

      TermEnum terms = reader.terms();

      int termFreq = 0;
      int termCount = 0;
      Term beforeTerm = null;
      // init term count
      fieldTermCount.clear();
      for (CrescentCollectionField field : collection.getFields())
        fieldTermCount.put(field.getName(), 0);
      topRankingQueue.clear();

      while (terms.next()) {
        Term currTerm = terms.term();
        if (beforeTerm == null) {
          beforeTerm = currTerm;
        }

        if (beforeTerm.field() == currTerm.field()) {
          termCount++;
        } else {
          fieldTermCount.put(beforeTerm.field(), termCount);
          termCount = 1;
          beforeTerm = currTerm;
        }

        TermDocs termDocs = reader.termDocs(currTerm);

        while (termDocs.next()) {
          if (currTerm.field().equals(topRankingField)) {
            RankingTerm e = new RankingTerm(currTerm.text(), currTerm.field(), termDocs.freq());
            topRankingQueue.add(e);
          }
        }
        termDocs.close(); // release the cursor before moving to the next term
        termFreq++;
      }
      if (beforeTerm != null) fieldTermCount.put(beforeTerm.field(), termCount);

      terms.close();
      result.put("numOfTerm", termFreq);
      result.put("numOfDoc", reader.numDocs());
      result.put("hasDel", reader.hasDeletions());
      result.put("isOptimize", reader.isOptimized());
      result.put("indexVersion", reader.getVersion());
      result.put("lastModify", new Date(IndexReader.lastModified(directory)));
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    if (topRankingQueue.size() != 0) {
      topRankingTerms = topRankingQueue.toArray();
      Arrays.sort(topRankingTerms);
    }
    result.put("collectionName", collectionName);
    result.put("indexName", collection.getIndexingDirectory());
    result.put("numOfField", collection.getFields().size());
    result.put("termCount", fieldTermCount);
    result.put("topRanking", topRankingTerms);
    result.put("fieldName", fieldName);

    return true;
  }
Example #19
  public static void main(String[] args) throws Exception {
    // the IndexReader object is the main handle that will give you
    // all the documents, terms and inverted index
    IndexReader r = IndexReader.open(FSDirectory.open(new File("index")));

    // You can figure out the number of documents using the maxDoc() function
    System.out.println("The number of documents in this index is: " + r.maxDoc());

    int i = 0;
    // You can find out all the terms that have been indexed using the terms() function
    TermEnum t = r.terms();
    while (t.next()) {
      // Since there are so many terms, let us try printing only term #100000-#100010
      if (i > 100000) System.out.println("[" + i + "] " + t.term().text());
      if (++i > 100010) break;
    }

    // You can create your own query terms by calling the Term constructor, with the field
    // 'contents'
    // In the following example, the query term is 'brute'
    Term te = new Term("contents", "brute");

    // You can also quickly find out the number of documents that have term t
    System.out.println("Number of documents with the word 'brute' is: " + r.docFreq(te));

    // You can use the inverted index to find out all the documents that contain the term 'brute'
    //  by using the termDocs function
    TermDocs td = r.termDocs(te);
    while (td.next()) {
      System.out.println(
          "Document number ["
              + td.doc()
              + "] contains the term 'brute' "
              + td.freq()
              + " time(s).");
    }

    // You can find the URL of a specific document number using the document() function
    // For example, the URL for document number 14191 is:
    Document d = r.document(14191);
    String url =
        d.getFieldable("path")
            .stringValue(); // the 'path' field of the Document object holds the URL
    System.out.println(url.replace("%%", "/"));

    // -------- Now let us use all of the functions above to make something useful --------
    // The following bit of code is a worked out example of how to get a bunch of documents
    // in response to a query and show them (without ranking them according to TF/IDF)
    Scanner sc = new Scanner(System.in);
    String str = "";
    System.out.print("query> ");
    while (!(str = sc.nextLine()).equals("quit")) {
      String[] terms = str.split("\\s+");
      for (String word : terms) {
        Term term = new Term("contents", word);
        TermDocs tdocs = r.termDocs(term);
        while (tdocs.next()) {
          String d_url =
              r.document(tdocs.doc()).getFieldable("path").stringValue().replace("%%", "/");
          System.out.println("[" + tdocs.doc() + "] " + d_url);
        }
      }
      System.out.print("query> ");
    }
  }
Example #20
    public void load(String latFieldName, String lonFieldName, BoboIndexReader reader)
        throws IOException {
      if (reader == null) throw new NullPointerException("reader object is null");
      if (latFieldName == null) throw new NullPointerException("latitude Field Name is null");
      if (lonFieldName == null) throw new NullPointerException("longitude Field Name is null");

      String latField = latFieldName.intern();
      String lonField = lonFieldName.intern();
      int maxDoc = reader.maxDoc();

      BigFloatArray xVals = this._xValArray;
      BigFloatArray yVals = this._yValArray;
      BigFloatArray zVals = this._zValArray;

      if (xVals == null) xVals = newInstance(maxDoc);
      else xVals.ensureCapacity(maxDoc);
      if (yVals == null) yVals = newInstance(maxDoc);
      else yVals.ensureCapacity(maxDoc);
      if (zVals == null) zVals = newInstance(maxDoc);
      else zVals.ensureCapacity(maxDoc);

      this._xValArray = xVals;
      this._yValArray = yVals;
      this._zValArray = zVals;

      Term latTerm = new Term(latFieldName, "");
      TermDocs termDocs = reader.termDocs(latTerm);
      TermEnum termEnum = reader.terms(latTerm);

      float docLat, docLon;
      int termCount = 1;
      String lonValue = null;
      int length = maxDoc + 1;
      int doc;
      termDocs.next();
      try {
        do {
          Term term = termEnum.term();
          if (term == null || term.field() != latField) continue; // latField is interned

          if (termCount > xVals.capacity())
            throw new IOException("Maximum number of values cannot exceed: " + xVals.capacity());
          if (termCount >= length)
            throw new RuntimeException(
                "There are more terms than documents in field "
                    + latFieldName
                    + " or "
                    + lonFieldName
                    + ", but its impossible to sort on tokenized fields");

          // pull the termDocs to point to the document for the current term in the termEnum
          termDocs.seek(termEnum);
          while (termDocs.next()) {
            doc = termDocs.doc();

            // read the latitude value in the current document
            docLat = Float.parseFloat(term.text().trim());
            // read the longitude value in the current document
            Document docVal = reader.document(doc, null);
            lonValue = docVal.get(lonFieldName);
            if (lonValue == null) continue;
            else docLon = Float.parseFloat(lonValue);

            // convert the lat, lon values to x,y,z coordinates
            float[] coords = GeoMatchUtil.geoMatchCoordsFromDegrees(docLat, docLon);
            _xValArray.add(doc, coords[0]);
            _yValArray.add(doc, coords[1]);
            _zValArray.add(doc, coords[2]);
          }
        } while (termEnum.next());
      } catch (Exception e) {
        // TODO: get rid of this catch phrase
        e.printStackTrace();
      } finally {
        if (termDocs != null) termDocs.close();
        if (termEnum != null) termEnum.close();
      }
    }
Example #21
 public TermDocs getTermDocs() throws IOException {
   if (termDocs == null) termDocs = reader.termDocs(t, 102400);
   else termDocs.seek(t);
   return termDocs;
 }
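A usage sketch for the lazy accessor above, assuming the enclosing class owns the reader, the current term t, and the cached termDocs, and closes termDocs once when it is done:

  TermDocs td = getTermDocs(); // first call opens the cursor; later calls just re-seek
  while (td.next()) {
    System.out.println(td.doc() + ":" + td.freq());
  }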