Esempio n. 1
0
  /**
   * Remove a stale file (uidIter.term().text()) from the index database (and the xref file)
   *
   * @throws java.io.IOException if an error occurs
   */
  private void removeFile() throws IOException {
    String path = Util.uid2url(uidIter.term().utf8ToString());

    for (IndexChangedListener listener : listeners) {
      listener.fileRemove(path);
    }
    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
    writer.prepareCommit();
    writer.commit();

    File xrefFile;
    if (RuntimeEnvironment.getInstance().isCompressXref()) {
      xrefFile = new File(xrefDir, path + ".gz");
    } else {
      xrefFile = new File(xrefDir, path);
    }
    File parent = xrefFile.getParentFile();

    if (!xrefFile.delete() && xrefFile.exists()) {
      log.log(Level.INFO, "Failed to remove obsolete xref-file: {0}", xrefFile.getAbsolutePath());
    }

    // Remove the parent directory if it's empty
    if (parent.delete()) {
      log.log(Level.FINE, "Removed empty xref dir:{0}", parent.getAbsolutePath());
    }
    setDirty();
    for (IndexChangedListener listener : listeners) {
      listener.fileRemoved(path);
    }
  }
Esempio n. 2
0
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Esempio n. 3
0
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
  public void testThreeBlocks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "m" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "mo" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    if (VERBOSE) {
      while (te.next() != null) {
        System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
      }
    }

    assertTrue(te.seekExact(new BytesRef("mo")));
    assertEquals(27, te.ord());

    te.seekExact(90);
    assertEquals(new BytesRef("s"), te.term());

    testEnum(te, terms);

    r.close();
    w.close();
    dir.close();
  }
  private void getPrefixTerms(
      ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
    // into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf
    // individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
      Terms _terms = leaf.reader().terms(field);
      if (_terms == null) {
        continue;
      }

      TermsEnum termsEnum = _terms.iterator();
      TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
      if (TermsEnum.SeekStatus.END == seekStatus) {
        continue;
      }

      for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
        if (!StringHelper.startsWith(term, prefix.bytes())) {
          break;
        }

        terms.add(new Term(field, BytesRef.deepCopyOf(term)));
        if (terms.size() >= maxExpansions) {
          return;
        }
      }
    }
  }
  public void testSeekCeilNotFound() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    // Get empty string in there!
    doc.add(newStringField("field", "", Field.Store.NO));
    w.addDocument(doc);

    for (int i = 0; i < 36; i++) {
      doc = new Document();
      String term = "" + (char) (97 + i);
      String term2 = "a" + (char) (97 + i);
      doc.add(newTextField("field", term + " " + term2, Field.Store.NO));
      w.addDocument(doc);
    }

    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
    assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22})));
    assertEquals("a", te.term().utf8ToString());
    assertEquals(1L, te.ord());
    r.close();
    w.close();
    dir.close();
  }
Esempio n. 7
0
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += termsEnum.docFreq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
          System.out.print(currDoc.nextPosition() + " ");

        System.out.println();
      }
      ;
    }
    ;
  }
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
  protected void compareTermVectors(Terms terms, Terms memTerms, String field_name)
      throws IOException {

    TermsEnum termEnum = terms.iterator();
    TermsEnum memTermEnum = memTerms.iterator();

    while (termEnum.next() != null) {
      assertNotNull(memTermEnum.next());
      assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq()));

      PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS);
      PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS);
      String currentTerm = termEnum.term().utf8ToString();

      assertThat(
          "Token mismatch for field: " + field_name,
          currentTerm,
          equalTo(memTermEnum.term().utf8ToString()));

      docsPosEnum.nextDoc();
      memDocsPosEnum.nextDoc();

      int freq = docsPosEnum.freq();
      assertThat(freq, equalTo(memDocsPosEnum.freq()));
      for (int i = 0; i < freq; i++) {
        String failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
        int memPos = memDocsPosEnum.nextPosition();
        int pos = docsPosEnum.nextPosition();
        assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
        assertThat(
            "Start offset test failed" + failDesc,
            memDocsPosEnum.startOffset(),
            equalTo(docsPosEnum.startOffset()));
        assertThat(
            "End offset test failed" + failDesc,
            memDocsPosEnum.endOffset(),
            equalTo(docsPosEnum.endOffset()));
        assertThat(
            "Missing payload test failed" + failDesc,
            docsPosEnum.getPayload(),
            equalTo(docsPosEnum.getPayload()));
      }
    }
    assertNull("Still some tokens not processed", memTermEnum.next());
  }
Esempio n. 11
0
 private BytesRef setTerm() throws IOException {
   term = termsEnum.term();
   // System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix ==
   // null ? "null" : prefix.utf8ToString()));
   if (prefix != null && !StringHelper.startsWith(term, prefix)) {
     term = null;
   }
   return term;
 }
Esempio n. 12
0
  /**
   * We will implement this according to the Lucene specification the formula used: sum ( IDF(qi) *
   * (df(qi,D) * (k+1)) / (df(qi,D) + k * (1-b + b*|D| / avgFL)) IDF and avgFL computation are
   * described above.
   *
   * @param doc
   * @param terms
   * @param context
   * @return
   */
  @Override
  public float extract(Document doc, Terms terms, RerankerContext context) {
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

    TermsEnum termsEnum = null;
    try {
      termsEnum = terms.iterator();
    } catch (IOException e) {
      LOG.warn("Error computing BM25, unable to retrieve terms enum");
      return 0.0f;
    }

    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // Compute by iterating
    long docSize = 0L;

    // NOTE df cannot be retrieved just from the term vector,
    // the term vector here is only a partial term vector that treats this as if we only have 1
    // document in the index
    Map<String, Integer> docFreqMap = null;
    try {
      docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
      LOG.warn("Unable to retrieve document frequencies.");
      docFreqMap = new HashMap<>();
    }

    Map<String, Long> termFreqMap = new HashMap<>();
    try {
      while (termsEnum.next() != null) {
        String termString = termsEnum.term().utf8ToString();
        docSize += termsEnum.totalTermFreq();
        if (queryTokens.contains(termString)) {
          termFreqMap.put(termString, termsEnum.totalTermFreq());
        }
      }
    } catch (IOException e) {
      LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }

    float score = 0.0f;
    // Iterate over the query tokens
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    for (String token : queryTokens) {
      long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
      double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
      double numerator = (this.k1 + 1) * termFreq;
      double docLengthFactor = this.b * (docSize / avgFL);
      double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
      score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }

    return score;
  }
 SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
     throws IOException {
   super(counts, total - counts[0], counts[0], endFacetOrd + 1);
   this.tenum = tenum;
   this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
   if (mergePos < maxTermPos) {
     assert tenum != null;
     tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
     mergeTerm = tenum.term();
   }
 }
  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "a b c", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    // Test next()
    assertEquals(new BytesRef("a"), te.next());
    assertEquals(0L, te.ord());
    assertEquals(new BytesRef("b"), te.next());
    assertEquals(1L, te.ord());
    assertEquals(new BytesRef("c"), te.next());
    assertEquals(2L, te.ord());
    assertNull(te.next());

    // Test seekExact by term
    assertTrue(te.seekExact(new BytesRef("b")));
    assertEquals(1, te.ord());
    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(0, te.ord());
    assertTrue(te.seekExact(new BytesRef("c")));
    assertEquals(2, te.ord());

    // Test seekExact by ord
    te.seekExact(1);
    assertEquals(new BytesRef("b"), te.term());
    te.seekExact(0);
    assertEquals(new BytesRef("a"), te.term());
    te.seekExact(2);
    assertEquals(new BytesRef("c"), te.term());

    r.close();
    w.close();
    dir.close();
  }
  private void testEnum(TermsEnum te, List<String> terms) throws IOException {
    Collections.sort(terms);
    for (int i = terms.size() - 1; i >= 0; i--) {
      if (VERBOSE) {
        System.out.println("TEST: seek to ord=" + i);
      }
      te.seekExact(i);
      assertEquals(i, te.ord());
      assertEquals(terms.get(i), te.term().utf8ToString());
    }

    int iters = atLeast(1000);
    for (int iter = 0; iter < iters; iter++) {
      int ord = random().nextInt(terms.size());
      if (random().nextBoolean()) {
        te.seekExact(ord);
        assertEquals(terms.get(ord), te.term().utf8ToString());
      } else {
        te.seekExact(new BytesRef(terms.get(ord)));
        assertEquals(ord, te.ord());
      }
    }
  }
 SegmentResult(
     int[] counts,
     int total,
     int missingCountIndex,
     TermsEnum tenum,
     int startFacetOrd,
     int endFacetOrd)
     throws IOException {
   super(
       counts,
       total - counts[missingCountIndex],
       counts[missingCountIndex],
       endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd);
   this.tenum = tenum;
   this.mergePos = startFacetOrd;
   if (tenum != null) {
     tenum.seekExact(mergePos);
     mergeTerm = tenum.term();
   }
 }
  public void testSeveralNonRootBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        Document doc = new Document();
        String term = "" + (char) (97 + i) + (char) (97 + j);
        terms.add(term);
        if (VERBOSE) {
          System.out.println("term=" + term);
        }
        doc.add(newTextField("body", term, Field.Store.NO));
        w.addDocument(doc);
      }
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);

    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        String term = "" + (char) (97 + i) + (char) (97 + j);
        if (VERBOSE) {
          System.out.println("TEST: check term=" + term);
        }
        assertEquals(term, te.next().utf8ToString());
        assertEquals(30 * i + j, te.ord());
      }
    }

    testEnum(te, terms);

    te.seekExact(0);
    assertEquals("aa", te.term().utf8ToString());

    r.close();
    w.close();
    dir.close();
  }
Esempio n. 18
0
  @Override
  public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
      throws IOException {
    int prefixLength = prefix.length();
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null) {
      Matcher matcher = pattern.matcher("");
      try {
        TermsEnum termsEnum = terms.iterator(null);

        TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
        BytesRef text;
        if (status == TermsEnum.SeekStatus.FOUND) {
          text = prefixRef;
        } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
          text = termsEnum.term();
        } else {
          text = null;
        }

        while (text != null) {
          if (text != null && StringHelper.startsWith(text, prefixRef)) {
            String textString = text.utf8ToString();
            matcher.reset(textString.substring(prefixLength));
            if (matcher.matches()) {
              mtv.visitMatchingTerm(new Term(fieldName, textString));
            }
          } else {
            break;
          }
          text = termsEnum.next();
        }
      } finally {
        matcher.reset();
      }
    }
  }
  public void testFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    for (int i = 0; i < 128; i++) {
      Document doc = new Document();
      String term = "" + (char) i;
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
      }
      doc.add(newStringField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    if (VERBOSE) {
      BytesRef term;
      while ((term = te.next()) != null) {
        System.out.println("  " + te.ord() + ": " + term.utf8ToString());
      }
    }

    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(97, te.ord());

    te.seekExact(98);
    assertEquals(new BytesRef("b"), te.term());

    assertTrue(te.seekExact(new BytesRef("z")));
    assertEquals(122, te.ord());

    r.close();
    w.close();
    dir.close();
  }
Esempio n. 20
0
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index =
        new int
            [maxDoc]; // immediate term numbers, or the index into the byte[] representing the last
    // number
    final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    final byte[][] bytes =
        new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");

          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff) == 1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos << 8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass="******" process docID=" + doc);
            int val = index[doc];
            if ((val & 0xff) == 1) {
              int len = val >>> 8;
              // System.out.println("    ptr pos=" + pos);
              index[doc] = (pos << 8) | 1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException(
                    "Too many values for UnInvertedField faceting on field " + field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /**
                 * * we don't have to worry about the array getting too large since the "pos" param
                 * will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { //
                 * overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new
                 * SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen
                 * <= pos + len) newlen<<=1; // doubling strategy } **
                 */
                while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }
  private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
    TermsEnum leftEnum = null;
    TermsEnum rightEnum = null;

    // just an upper bound
    int numTests = atLeast(20);
    Random random = random();

    // collect this number of terms from the left side
    HashSet<BytesRef> tests = new HashSet<BytesRef>();
    int numPasses = 0;
    while (numPasses < 10 && tests.size() < numTests) {
      leftEnum = leftTerms.iterator(leftEnum);
      BytesRef term = null;
      while ((term = leftEnum.next()) != null) {
        int code = random.nextInt(10);
        if (code == 0) {
          // the term
          tests.add(BytesRef.deepCopyOf(term));
        } else if (code == 1) {
          // truncated subsequence of term
          term = BytesRef.deepCopyOf(term);
          if (term.length > 0) {
            // truncate it
            term.length = random.nextInt(term.length);
          }
        } else if (code == 2) {
          // term, but ensure a non-zero offset
          byte newbytes[] = new byte[term.length + 5];
          System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
          tests.add(new BytesRef(newbytes, 5, term.length));
        }
      }
      numPasses++;
    }

    ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests);
    Collections.shuffle(shuffledTests, random);

    for (BytesRef b : shuffledTests) {
      leftEnum = leftTerms.iterator(leftEnum);
      rightEnum = rightTerms.iterator(rightEnum);

      assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
      assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));

      SeekStatus leftStatus;
      SeekStatus rightStatus;

      leftStatus = leftEnum.seekCeil(b);
      rightStatus = rightEnum.seekCeil(b);
      assertEquals(leftStatus, rightStatus);
      if (leftStatus != SeekStatus.END) {
        assertEquals(leftEnum.term(), rightEnum.term());
      }

      leftStatus = leftEnum.seekCeil(b);
      rightStatus = rightEnum.seekCeil(b);
      assertEquals(leftStatus, rightStatus);
      if (leftStatus != SeekStatus.END) {
        assertEquals(leftEnum.term(), rightEnum.term());
      }
    }
  }
Esempio n. 22
0
  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions())
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      // String[] docTerms = new String[(int) terms.size()];
      // final List<BytesRef> termsList = new ArrayList<BytesRef>();
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum docPosEnum = null;
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docId() will always return 0 in this case
          // if (docPosEnum.docID() != doc)
          //	throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc
          // + ")");
          for (int i = 0; i < docPosEnum.freq(); i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1)
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + docPosEnum.freq()
                      + ")");
            if (position >= start && position <= end) {
              if (concordanceWords[position - start] == null)
                concordanceWords[position - start] = termsEnum.term().utf8ToString();
              else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
              numFound++;
            }
          }
          if (numFound == concordanceWords.length) return concordanceWords;
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
Esempio n. 23
0
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number. If this document is deleted, but the index is not optimized yet,
  *     the reconstruction process may still yield the reconstructed field content even from
  *     deleted documents.
  * @return reconstructed document
  * @throws Exception
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
   if (docNum < 0 || docNum > reader.maxDoc()) {
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
           TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
           if (offsets.length == 0) {
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
       GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        lowerBytes = new BytesRef();
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);
        }
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
Esempio n. 25
0
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
  private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException {
    Fields memFields = memIndexReader.fields();
    for (String field : MultiFields.getFields(other)) {
      Terms memTerms = memFields.terms(field);
      Terms iwTerms = memIndexReader.terms(field);
      if (iwTerms == null) {
        assertNull(memTerms);
      } else {
        NumericDocValues normValues = MultiDocValues.getNormValues(other, field);
        NumericDocValues memNormValues = memIndexReader.getNormValues(field);
        if (normValues != null) {
          // mem idx always computes norms on the fly
          assertNotNull(memNormValues);
          assertEquals(normValues.get(0), memNormValues.get(0));
        }

        assertNotNull(memTerms);
        assertEquals(iwTerms.getDocCount(), memTerms.getDocCount());
        assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq());
        assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq());
        TermsEnum iwTermsIter = iwTerms.iterator();
        TermsEnum memTermsIter = memTerms.iterator();
        if (iwTerms.hasPositions()) {
          final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets();

          while (iwTermsIter.next() != null) {
            assertNotNull(memTermsIter.next());
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
              for (int i = 0; i < iwDocsAndPos.freq(); i++) {
                assertEquals(
                    "term: " + iwTermsIter.term().utf8ToString(),
                    iwDocsAndPos.nextPosition(),
                    memDocsAndPos.nextPosition());
                if (offsets) {
                  assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
                  assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
                }

                if (iwTerms.hasPayloads()) {
                  assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
                }
              }
            }
          }
        } else {
          while (iwTermsIter.next() != null) {
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
            }
          }
        }
      }
    }
  }
Esempio n. 27
0
 /** Returns the term ({@link BytesRef}) corresponding to the provided ordinal. */
 public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
   termsEnum.seekExact(ord);
   return termsEnum.term();
 }
Esempio n. 28
0
  /**
   * Update the content of this index database
   *
   * @throws IOException if an error occurs
   * @throws HistoryException if an error occurs when accessing the history
   */
  public void update() throws IOException, HistoryException {
    synchronized (lock) {
      if (running) {
        throw new IOException("Indexer already running!");
      }
      running = true;
      interrupted = false;
    }

    String ctgs = RuntimeEnvironment.getInstance().getCtags();
    if (ctgs != null) {
      ctags = new Ctags();
      ctags.setBinary(ctgs);
    }
    if (ctags == null) {
      log.severe("Unable to run ctags! searching definitions will not work!");
    }

    if (ctags != null) {
      String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile();
      if (filename != null) {
        ctags.setCTagsExtraOptionsFile(filename);
      }
    }

    try {
      Analyzer analyzer = AnalyzerGuru.getAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      // iwc.setRAMBufferSizeMB(256.0);  //TODO check what is the sweet spot
      writer = new IndexWriter(indexDirectory, iwc);
      writer.commit(); // to make sure index exists on the disk
      // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit());

      if (directories.isEmpty()) {
        if (project == null) {
          directories.add("");
        } else {
          directories.add(project.getPath());
        }
      }

      for (String dir : directories) {
        File sourceRoot;
        if ("".equals(dir)) {
          sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile();
        } else {
          sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir);
        }

        HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot);

        String startuid = Util.path2uid(dir, "");
        IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index
        Terms terms = null;
        int numDocs = reader.numDocs();
        if (numDocs > 0) {
          Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0);
          terms = uFields.terms(QueryBuilder.U);
        }

        try {
          if (numDocs > 0) {
            uidIter = terms.iterator(null);
            TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid
            if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) {
              uidIter = null;
            }
          }
          // TODO below should be optional, since it traverses the tree once more to get total
          // count! :(
          int file_cnt = 0;
          if (RuntimeEnvironment.getInstance().isPrintProgress()) {
            log.log(Level.INFO, "Counting files in {0} ...", dir);
            file_cnt = indexDown(sourceRoot, dir, true, 0, 0);
            if (log.isLoggable(Level.INFO)) {
              log.log(
                  Level.INFO, "Need to process: {0} files for {1}", new Object[] {file_cnt, dir});
            }
          }

          indexDown(sourceRoot, dir, false, 0, file_cnt);

          while (uidIter != null
              && uidIter.term() != null
              && uidIter.term().utf8ToString().startsWith(startuid)) {
            removeFile();
            uidIter.next();
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (writer != null) {
        try {
          writer.prepareCommit();
          writer.commit();
          writer.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing writer", e);
        }
      }

      if (ctags != null) {
        try {
          ctags.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing ctags process", e);
        }
      }

      synchronized (lock) {
        running = false;
      }
    }

    if (!isInterrupted() && isDirty()) {
      if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) {
        optimize();
      }
      createSpellingSuggestions();
      RuntimeEnvironment env = RuntimeEnvironment.getInstance();
      File timestamp = new File(env.getDataRootFile(), "timestamp");
      if (timestamp.exists()) {
        if (!timestamp.setLastModified(System.currentTimeMillis())) {
          log.log(
              Level.WARNING,
              "Failed to set last modified time on ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      } else {
        if (!timestamp.createNewFile()) {
          log.log(
              Level.WARNING,
              "Failed to create file ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      }
    }
  }
Esempio n. 29
0
  /**
   * Generate indexes recursively
   *
   * @param dir the root indexDirectory to generate indexes for
   * @param path the path
   * @param count_only if true will just traverse the source root and count files
   * @param cur_count current count during the traversal of the tree
   * @param est_total estimate total files to process
   */
  private int indexDown(File dir, String parent, boolean count_only, int cur_count, int est_total)
      throws IOException {
    int lcur_count = cur_count;
    if (isInterrupted()) {
      return lcur_count;
    }

    if (!accept(dir)) {
      return lcur_count;
    }

    File[] files = dir.listFiles();
    if (files == null) {
      log.log(Level.SEVERE, "Failed to get file listing for: {0}", dir.getAbsolutePath());
      return lcur_count;
    }
    Arrays.sort(
        files,
        new Comparator<File>() {
          @Override
          public int compare(File p1, File p2) {
            return p1.getName().compareTo(p2.getName());
          }
        });

    for (File file : files) {
      if (accept(dir, file)) {
        String path = parent + '/' + file.getName();

        if (file.isDirectory()) {
          lcur_count = indexDown(file, path, count_only, lcur_count, est_total);
        } else {
          lcur_count++;
          if (count_only) {
            continue;
          }

          if (RuntimeEnvironment.getInstance().isPrintProgress()
              && est_total > 0
              && log.isLoggable(Level.INFO)) {
            log.log(
                Level.INFO,
                "Progress: {0} ({1}%)",
                new Object[] {lcur_count, (lcur_count * 100.0f / est_total)});
          }

          if (uidIter != null) {
            String uid =
                Util.path2uid(
                    path,
                    DateTools.timeToString(
                        file.lastModified(),
                        DateTools.Resolution.MILLISECOND)); // construct uid for doc
            BytesRef buid = new BytesRef(uid);
            while (uidIter.term() != null
                && uidIter.term().compareTo(emptyBR) != 0
                && uidIter.term().compareTo(buid) < 0) {
              removeFile();
              uidIter.next();
            }

            if (uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
              uidIter.next(); // keep matching docs
              continue;
            }
          }
          try {
            addFile(file, path);
          } catch (Exception e) {
            log.log(Level.WARNING, "Failed to add file " + file.getAbsolutePath(), e);
          }
        }
      }
    }

    return lcur_count;
  }
  protected void validateResponse(
      TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
      throws IOException {
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields =
        testConfig.selectedFields == null
            ? null
            : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
      Terms esTerms = esTermVectorFields.terms(field.name);
      if (selectedFields != null && !selectedFields.contains(field.name)) {
        assertNull(esTerms);
        continue;
      }

      assertNotNull(esTerms);

      Terms luceneTerms = luceneFields.terms(field.name);
      TermsEnum esTermEnum = esTerms.iterator(null);
      TermsEnum luceneTermEnum = luceneTerms.iterator(null);

      while (esTermEnum.next() != null) {
        assertNotNull(luceneTermEnum.next());

        assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
        DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
        DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
        if (luceneDocsPosEnum == null) {
          // test we expect that...
          assertFalse(field.storedOffset);
          assertFalse(field.storedPayloads);
          assertFalse(field.storedPositions);
          continue;
        }

        String currentTerm = esTermEnum.term().utf8ToString();

        assertThat(
            "Token mismatch for field: " + field.name,
            currentTerm,
            equalTo(luceneTermEnum.term().utf8ToString()));

        esDocsPosEnum.nextDoc();
        luceneDocsPosEnum.nextDoc();

        int freq = esDocsPosEnum.freq();
        assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
        for (int i = 0; i < freq; i++) {
          String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
          int lucenePos = luceneDocsPosEnum.nextPosition();
          int esPos = esDocsPosEnum.nextPosition();
          if (field.storedPositions && testConfig.requestPositions) {
            assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
          } else {
            assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
          }
          if (field.storedOffset && testConfig.requestOffsets) {
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.startOffset(),
                equalTo(esDocsPosEnum.startOffset()));
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.endOffset(),
                equalTo(esDocsPosEnum.endOffset()));
          } else {
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
          }
          if (field.storedPayloads && testConfig.requestPayloads) {
            assertThat(
                "Payload test failed" + failDesc,
                luceneDocsPosEnum.getPayload(),
                equalTo(esDocsPosEnum.getPayload()));
          } else {
            assertThat(
                "Missing payload test failed" + failDesc,
                esDocsPosEnum.getPayload(),
                equalTo(null));
          }
        }
      }

      assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
  }