Example #1
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      if (terms == null) {
        return; // empty index, or no definitions field
      }
      iter = terms.iterator(null); // init term iterator
      BytesRef term = iter.next(); // next() positions the enum on the first term
      while (term != null) {
        if (iter.docFreq() > 16 && term.utf8ToString().length() > freq) {
          log.warning(term.utf8ToString());
        }
        term = iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
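For reference, a minimal, self-contained sketch of the safe term-iteration idiom used above (assuming Lucene 4.x; the helper name, directory handle, and field name are placeholders): advance with next() and stop at null, rather than polling term() on an unpositioned enum.

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

  // Hypothetical helper: list each term of one field with its document frequency.
  static void dumpTerms(Directory dir, String field) throws IOException {
    IndexReader reader = DirectoryReader.open(dir);
    try {
      Terms terms = MultiFields.getTerms(reader, field); // null if the field is absent
      if (terms != null) {
        TermsEnum te = terms.iterator(null);
        BytesRef term;
        while ((term = te.next()) != null) { // next() positions the enum
          System.out.println(term.utf8ToString() + " df=" + te.docFreq());
        }
      }
    } finally {
      reader.close();
    }
  }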
Example #2
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      if (terms == null) {
        return; // empty index
      }
      iter = terms.iterator(null); // init uid iterator
      BytesRef term = iter.next(); // next() positions the enum on the first term
      while (term != null) {
        log.fine(Util.uid2url(term.utf8ToString()));
        term = iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Example #3
  /**
   * Remove a stale file (uidIter.term().utf8ToString()) from the index database (and the xref file)
   *
   * @throws java.io.IOException if an error occurs
   */
  private void removeFile() throws IOException {
    String path = Util.uid2url(uidIter.term().utf8ToString());

    for (IndexChangedListener listener : listeners) {
      listener.fileRemove(path);
    }
    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
    writer.prepareCommit();
    writer.commit();

    File xrefFile;
    if (RuntimeEnvironment.getInstance().isCompressXref()) {
      xrefFile = new File(xrefDir, path + ".gz");
    } else {
      xrefFile = new File(xrefDir, path);
    }
    File parent = xrefFile.getParentFile();

    if (!xrefFile.delete() && xrefFile.exists()) {
      log.log(Level.INFO, "Failed to remove obsolete xref-file: {0}", xrefFile.getAbsolutePath());
    }

    // Remove the parent directory if it's empty
    if (parent.delete()) {
      log.log(Level.FINE, "Removed empty xref dir:{0}", parent.getAbsolutePath());
    }
    setDirty();
    for (IndexChangedListener listener : listeners) {
      listener.fileRemoved(path);
    }
  }
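A minimal sketch of the uid-keyed deletion at the core of removeFile(), assuming an open IndexWriter named writer and the same QueryBuilder.U uid field; the uid value here is hypothetical:

    // Hypothetical uid; real uids encode path plus timestamp via Util.path2uid().
    BytesRef uid = new BytesRef("/src/foo.c");
    writer.deleteDocuments(new Term(QueryBuilder.U, uid)); // delete by exact uid term
    writer.prepareCommit(); // two-phase commit, mirroring removeFile()
    writer.commit();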
Example #4
  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) {
      System.out.println("    The field is not stored.");
    } else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        // Assumes the vector was indexed with positions; docsAndPositions()
        // returns null when they are absent.
        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) {
          System.out.print(currDoc.nextPosition() + " ");
        }
        System.out.println();
      }
    }
  }
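Hypothetical usage of the helper above, assuming docID names a document whose "body" field was indexed with term vectors (positions included):

    Fields tvFields = reader.getTermVectors(docID); // null if no vectors were stored
    if (tvFields != null) {
      termVectorDisplay(tvFields.terms("body"));
    }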
Example #5
  private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception {
    SegmentReader reader =
        new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));

    for (int i = 0; i < reader.numDocs(); i++) {
      out.println(reader.document(i));
    }

    Fields fields = reader.fields();
    for (String field : fields) {
      Terms terms = fields.terms(field);
      assertNotNull(terms);
      TermsEnum tis = terms.iterator(null);
      while (tis.next() != null) {

        out.print("  term=" + field + ":" + tis.term());
        out.println("    DF=" + tis.docFreq());

        DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);

        while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          out.print(" doc=" + positions.docID());
          out.print(" TF=" + positions.freq());
          out.print(" pos=");
          out.print(positions.nextPosition());
          for (int j = 1; j < positions.freq(); j++) {
            out.print("," + positions.nextPosition());
          }
          out.println("");
        }
      }
    }
    reader.close();
  }
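A short sketch of the liveDocs filtering printSegment() relies on, assuming a positioned 4.x TermsEnum tis and the same SegmentReader: passing reader.getLiveDocs() into a postings call makes the enum skip deleted documents.

    Bits liveDocs = reader.getLiveDocs(); // null when the segment has no deletions
    DocsEnum docs = tis.docs(liveDocs, null);
    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // only live documents are surfaced here
    }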
Example #6
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1)) {
      System.out.println("    The term dictionary is empty.");
    } else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this field, printing each term
       *  together with its document frequency and collection frequency.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
    }
  }
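Hypothetical usage, assuming a Lucene 4.x index on disk under ./index and an indexed field named "title" (both placeholders):

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("index")));
    listTermDictionary(reader, "title");
    reader.close();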
Example #7
  /** tests intersect: TODO start at a random term! */
  public void testIntersect() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      CompiledAutomaton ca =
          new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
      TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
      Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
      TreeSet<BytesRef> found = new TreeSet<BytesRef>();
      while (te.next() != null) {
        found.add(BytesRef.deepCopyOf(te.term()));
      }

      Automaton actual = BasicAutomata.makeStringUnion(found);
      assertTrue(BasicOperations.sameLanguage(expected, actual));
    }
  }
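A minimal sketch of the intersect() idiom the test exercises (Lucene 4.x): the terms dictionary itself enumerates only terms accepted by the compiled automaton, instead of filtering every term after the fact. The regex and field name are placeholders.

    Automaton a = new RegExp("fo.*", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(a); // one-arg ctor determines finiteness itself
    TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null); // null: start at first term
    BytesRef t;
    while ((t = te.next()) != null) {
      System.out.println(t.utf8ToString()); // only terms matching fo.*
    }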
Example #8
  /** mixes up seek and next for all terms */
  public void testSeekingAndNexting() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);

      for (BytesRef term : terms) {
        int c = random().nextInt(3);
        if (c == 0) {
          assertEquals(term, te.next());
        } else if (c == 1) {
          assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
          assertEquals(term, te.term());
        } else {
          assertTrue(te.seekExact(term, random().nextBoolean()));
        }
      }
    }
  }
Example #9
  /** seeks to every term accepted by some automata */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
      ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (BasicOperations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term, random().nextBoolean()));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
            assertEquals(term, te.term());
          }
        }
      }
    }
  }
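For reference, the seekCeil() contract both seeking tests depend on, sketched against a 4.x TermsEnum te (some 4.x minors also accept a useCache flag, as seen above):

    TermsEnum.SeekStatus st = te.seekCeil(new BytesRef("foo"));
    if (st == TermsEnum.SeekStatus.FOUND) {
      // te.term() is exactly "foo"
    } else if (st == TermsEnum.SeekStatus.NOT_FOUND) {
      // te.term() is the smallest indexed term greater than "foo"
    } else {
      // SeekStatus.END: no term at or after "foo"; the enum is unpositioned
    }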
Example #10
  /**
   * Generate indexes recursively
   *
   * @param dir the root indexDirectory to generate indexes for
   * @param parent the path of dir relative to the source root
   * @param count_only if true, just traverse the source root and count files
   * @param cur_count current count during the traversal of the tree
   * @param est_total estimated total number of files to process
   * @return the updated file count
   * @throws IOException if an I/O error occurs while traversing the tree
   */
  private int indexDown(File dir, String parent, boolean count_only, int cur_count, int est_total)
      throws IOException {
    int lcur_count = cur_count;
    if (isInterrupted()) {
      return lcur_count;
    }

    if (!accept(dir)) {
      return lcur_count;
    }

    File[] files = dir.listFiles();
    if (files == null) {
      log.log(Level.SEVERE, "Failed to get file listing for: {0}", dir.getAbsolutePath());
      return lcur_count;
    }
    Arrays.sort(
        files,
        new Comparator<File>() {
          @Override
          public int compare(File p1, File p2) {
            return p1.getName().compareTo(p2.getName());
          }
        });

    for (File file : files) {
      if (accept(dir, file)) {
        String path = parent + '/' + file.getName();

        if (file.isDirectory()) {
          lcur_count = indexDown(file, path, count_only, lcur_count, est_total);
        } else {
          lcur_count++;
          if (count_only) {
            continue;
          }

          if (RuntimeEnvironment.getInstance().isPrintProgress()
              && est_total > 0
              && log.isLoggable(Level.INFO)) {
            log.log(
                Level.INFO,
                "Progress: {0} ({1}%)",
                new Object[] {lcur_count, (lcur_count * 100.0f / est_total)});
          }

          if (uidIter != null) {
            String uid =
                Util.path2uid(
                    path,
                    DateTools.timeToString(
                        file.lastModified(),
                        DateTools.Resolution.MILLISECOND)); // construct uid for doc
            BytesRef buid = new BytesRef(uid);
            while (uidIter.term() != null
                && uidIter.term().compareTo(emptyBR) != 0
                && uidIter.term().compareTo(buid) < 0) {
              removeFile();
              uidIter.next();
            }

            if (uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
              uidIter.next(); // keep matching docs
              continue;
            }
          }
          try {
            addFile(file, path);
          } catch (Exception e) {
            log.log(Level.WARNING, "Failed to add file " + file.getAbsolutePath(), e);
          }
        }
      }
    }

    return lcur_count;
  }
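The uidIter loop above is a merge of two sorted sequences: uids already in the index against files on disk. A self-contained sketch of the same pattern, with plain strings standing in for uids and paths (the helper name is hypothetical):

import java.util.Iterator;
import java.util.List;

  // Entries only in `indexed` are stale, exact matches are unchanged,
  // everything else on disk is new or modified.
  static void mergeSorted(List<String> indexed, List<String> onDisk) {
    Iterator<String> it = indexed.iterator();
    String uid = it.hasNext() ? it.next() : null;
    for (String path : onDisk) { // both lists are sorted
      while (uid != null && uid.compareTo(path) < 0) {
        System.out.println("remove " + uid); // stale index entry
        uid = it.hasNext() ? it.next() : null;
      }
      if (uid != null && uid.equals(path)) {
        uid = it.hasNext() ? it.next() : null; // unchanged: keep it
      } else {
        System.out.println("add " + path); // new or modified file
      }
    }
    while (uid != null) { // whatever remains in the index is stale
      System.out.println("remove " + uid);
      uid = it.hasNext() ? it.next() : null;
    }
  }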
Example #11
  /**
   * Update the content of this index database
   *
   * @throws IOException if an error occurs
   * @throws HistoryException if an error occurs when accessing the history
   */
  public void update() throws IOException, HistoryException {
    synchronized (lock) {
      if (running) {
        throw new IOException("Indexer already running!");
      }
      running = true;
      interrupted = false;
    }

    String ctgs = RuntimeEnvironment.getInstance().getCtags();
    if (ctgs != null) {
      ctags = new Ctags();
      ctags.setBinary(ctgs);
    }
    if (ctags == null) {
      log.severe("Unable to run ctags! searching definitions will not work!");
    }

    if (ctags != null) {
      String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile();
      if (filename != null) {
        ctags.setCTagsExtraOptionsFile(filename);
      }
    }

    try {
      Analyzer analyzer = AnalyzerGuru.getAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      // iwc.setRAMBufferSizeMB(256.0);  //TODO check what is the sweet spot
      writer = new IndexWriter(indexDirectory, iwc);
      writer.commit(); // to make sure index exists on the disk
      // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit());

      if (directories.isEmpty()) {
        if (project == null) {
          directories.add("");
        } else {
          directories.add(project.getPath());
        }
      }

      for (String dir : directories) {
        File sourceRoot;
        if ("".equals(dir)) {
          sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile();
        } else {
          sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir);
        }

        HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot);

        String startuid = Util.path2uid(dir, "");
        IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index
        Terms terms = null;
        int numDocs = reader.numDocs();
        if (numDocs > 0) {
          Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0);
          terms = uFields.terms(QueryBuilder.U);
        }

        try {
          if (numDocs > 0) {
            uidIter = terms.iterator(null);
            TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid
            if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) {
              uidIter = null;
            }
          }
          // TODO below should be optional, since it traverses the tree once more to get total
          // count! :(
          int file_cnt = 0;
          if (RuntimeEnvironment.getInstance().isPrintProgress()) {
            log.log(Level.INFO, "Counting files in {0} ...", dir);
            file_cnt = indexDown(sourceRoot, dir, true, 0, 0);
            if (log.isLoggable(Level.INFO)) {
              log.log(
                  Level.INFO, "Need to process: {0} files for {1}", new Object[] {file_cnt, dir});
            }
          }

          indexDown(sourceRoot, dir, false, 0, file_cnt);

          while (uidIter != null
              && uidIter.term() != null
              && uidIter.term().utf8ToString().startsWith(startuid)) {
            removeFile();
            uidIter.next();
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (writer != null) {
        try {
          writer.prepareCommit();
          writer.commit();
          writer.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing writer", e);
        }
      }

      if (ctags != null) {
        try {
          ctags.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing ctags process", e);
        }
      }

      synchronized (lock) {
        running = false;
      }
    }

    if (!isInterrupted() && isDirty()) {
      if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) {
        optimize();
      }
      createSpellingSuggestions();
      RuntimeEnvironment env = RuntimeEnvironment.getInstance();
      File timestamp = new File(env.getDataRootFile(), "timestamp");
      if (timestamp.exists()) {
        if (!timestamp.setLastModified(System.currentTimeMillis())) {
          log.log(
              Level.WARNING,
              "Failed to set last modified time on ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      } else {
        if (!timestamp.createNewFile()) {
          log.log(
              Level.WARNING,
              "Failed to create file ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      }
    }
  }
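A minimal sketch of the writer lifecycle update() follows, assuming Lucene 4.x (the Version constant and the analyzer/directory variables are placeholders):

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter w = new IndexWriter(indexDirectory, iwc);
    try {
      // ... add, update, and delete documents ...
      w.prepareCommit(); // two-phase commit, as update() does
      w.commit();
    } finally {
      w.close();
    }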
Example #12
  private void verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
      throws Exception {

    final DocTermOrds dto =
        new DocTermOrds(
            r,
            r.getLiveDocs(),
            "field",
            prefixRef,
            Integer.MAX_VALUE,
            _TestUtil.nextInt(random(), 2, 10));

    final FieldCache.Ints docIDToID = FieldCache.DEFAULT.getInts(r, "id", false);
    /*
      for(int docID=0;docID<subR.maxDoc();docID++) {
      System.out.println("  docID=" + docID + " id=" + docIDToID[docID]);
      }
    */

    if (VERBOSE) {
      System.out.println(
          "TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
      System.out.println("TEST: all TERMS:");
      TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(null);
      int ord = 0;
      while (allTE.next() != null) {
        System.out.println("  ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
      }
    }

    // final TermsEnum te = subR.fields().terms("field").iterator();
    final TermsEnum te = dto.getOrdTermsEnum(r);
    if (dto.numTerms() == 0) {
      if (prefixRef == null) {
        assertNull(MultiFields.getTerms(r, "field"));
      } else {
        Terms terms = MultiFields.getTerms(r, "field");
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef, false);
          if (result != TermsEnum.SeekStatus.END) {
            assertFalse(
                "term="
                    + termsEnum.term().utf8ToString()
                    + " matches prefix="
                    + prefixRef.utf8ToString(),
                StringHelper.startsWith(termsEnum.term(), prefixRef));
          } else {
            // ok
          }
        } else {
          // ok
        }
      }
      return;
    }

    if (VERBOSE) {
      System.out.println("TEST: TERMS:");
      te.seekExact(0);
      while (true) {
        System.out.println("  ord=" + te.ord() + " term=" + te.term().utf8ToString());
        if (te.next() == null) {
          break;
        }
      }
    }

    SortedSetDocValues iter = dto.iterator(r);
    for (int docID = 0; docID < r.maxDoc(); docID++) {
      if (VERBOSE) {
        System.out.println(
            "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")");
      }
      iter.setDocument(docID);
      final int[] answers = idToOrds[docIDToID.get(docID)];
      int upto = 0;
      long ord;
      while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        te.seekExact(ord);
        final BytesRef expected = termsArray[answers[upto++]];
        if (VERBOSE) {
          System.out.println(
              "  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
        }
        assertEquals(
            "expected="
                + expected.utf8ToString()
                + " actual="
                + te.term().utf8ToString()
                + " ord="
                + ord,
            expected,
            te.term());
      }
      assertEquals(answers.length, upto);
    }
  }
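A sketch of the 4.x SortedSetDocValues contract the per-document loop above relies on, assuming a populated instance ssdv:

    ssdv.setDocument(docID); // select the document before reading ords
    long ord;
    while ((ord = ssdv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      BytesRef value = new BytesRef();
      ssdv.lookupOrd(ord, value); // the 4.x API fills the caller-supplied BytesRef
      System.out.println(value.utf8ToString());
    }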
Example #13
  public void testMerge() throws IOException {
    final Codec codec = Codec.getDefault();
    final SegmentInfo si =
        new SegmentInfo(
            mergedDir,
            Version.LATEST,
            mergedSegment,
            -1,
            false,
            codec,
            Collections.emptyMap(),
            StringHelper.randomId(),
            new HashMap<>());

    SegmentMerger merger =
        new SegmentMerger(
            Arrays.<CodecReader>asList(reader1, reader2),
            si,
            InfoStream.getDefault(),
            mergedDir,
            new FieldInfos.FieldNumbers(),
            newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
    MergeState mergeState = merger.merge();
    int docsMerged = mergeState.segmentInfo.maxDoc();
    assertTrue(docsMerged == 2);
    // Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader =
        new SegmentReader(
            new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L),
            newIOContext(random()));
    assertTrue(mergedReader != null);
    assertTrue(mergedReader.numDocs() == 2);
    Document newDoc1 = mergedReader.document(0);
    assertTrue(newDoc1 != null);
    // There are 2 unstored fields on the document
    assertTrue(
        DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - DocHelper.unstored.size());
    Document newDoc2 = mergedReader.document(1);
    assertTrue(newDoc2 != null);
    assertTrue(
        DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - DocHelper.unstored.size());

    PostingsEnum termDocs =
        TestUtil.docs(
            random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
    assertTrue(termDocs != null);
    assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    int tvCount = 0;
    for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
      if (fieldInfo.hasVectors()) {
        tvCount++;
      }
    }

    // System.out.println("stored size: " + stored.size());
    assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

    Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
    assertNotNull(vector);
    assertEquals(3, vector.size());
    TermsEnum termsEnum = vector.iterator();

    int i = 0;
    while (termsEnum.next() != null) {
      String term = termsEnum.term().utf8ToString();
      int freq = (int) termsEnum.totalTermFreq();
      // System.out.println("Term: " + term + " Freq: " + freq);
      assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
      assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
      i++;
    }

    TestSegmentReader.checkNorms(mergedReader);
    mergedReader.close();
  }
Example #14
  public void testSortedTermsEnum() throws IOException {
    Directory directory = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwconfig.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);

    Document doc = new Document();
    doc.add(new StringField("field", "hello", Field.Store.NO));
    iwriter.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("field", "world", Field.Store.NO));
    iwriter.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("field", "beer", Field.Store.NO));
    iwriter.addDocument(doc);
    iwriter.forceMerge(1);

    DirectoryReader ireader = iwriter.getReader();
    iwriter.close();

    AtomicReader ar = getOnlySegmentReader(ireader);
    SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field");
    assertEquals(3, dv.getValueCount());

    TermsEnum termsEnum = dv.termsEnum();

    // next()
    assertEquals("beer", termsEnum.next().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertEquals("hello", termsEnum.next().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals("world", termsEnum.next().utf8ToString());
    assertEquals(2, termsEnum.ord());

    // seekCeil()
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));

    // seekExact()
    assertTrue(termsEnum.seekExact(new BytesRef("beer"), true));
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    assertTrue(termsEnum.seekExact(new BytesRef("hello"), true));
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    assertTrue(termsEnum.seekExact(new BytesRef("world"), true));
    assertEquals("world", termsEnum.term().utf8ToString());
    assertEquals(2, termsEnum.ord());
    assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true));

    // seek(ord)
    termsEnum.seekExact(0);
    assertEquals("beer", termsEnum.term().utf8ToString());
    assertEquals(0, termsEnum.ord());
    termsEnum.seekExact(1);
    assertEquals("hello", termsEnum.term().utf8ToString());
    assertEquals(1, termsEnum.ord());
    termsEnum.seekExact(2);
    assertEquals("world", termsEnum.term().utf8ToString());
    assertEquals(2, termsEnum.ord());
    ireader.close();
    directory.close();
  }
Example #15
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else if (raw) {
        // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
        // perhaps we detect if the FieldType is non-character and expect hex if so?
        lowerBytes = new BytesRef(lowerStr);
      } else {
        lowerBytes = new BytesRef();
        ft.readableToIndexed(lowerStr, lowerBytes);
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the
          // lower term actually matches
          if (!lowerIncl && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
Example #16
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number. If this document is deleted, but the index is not optimized yet,
  *     the reconstruction process may still yield the reconstructed field content even from
  *     deleted documents.
  * @return reconstructed document
  * @throws Exception if the document number is out of range or the document is deleted
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
   if (docNum < 0 || docNum >= reader.maxDoc()) {
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
           TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
           if (offsets.length == 0) {
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
       GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }
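A sketch of the advance() contract the left-over-fields loop above depends on, assuming a 4.x DocsAndPositionsEnum dpe and a target document docNum:

    int doc = dpe.advance(docNum); // jumps to the first doc >= docNum
    if (doc == docNum) { // advance() may overshoot, or return NO_MORE_DOCS
      for (int k = 0; k < dpe.freq(); k++) {
        int pos = dpe.nextPosition(); // positions are returned in increasing order
      }
    }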
Example #17
  protected void validateResponse(
      TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
      throws IOException {
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields =
        testConfig.selectedFields == null
            ? null
            : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
      Terms esTerms = esTermVectorFields.terms(field.name);
      if (selectedFields != null && !selectedFields.contains(field.name)) {
        assertNull(esTerms);
        continue;
      }

      assertNotNull(esTerms);

      Terms luceneTerms = luceneFields.terms(field.name);
      TermsEnum esTermEnum = esTerms.iterator(null);
      TermsEnum luceneTermEnum = luceneTerms.iterator(null);

      while (esTermEnum.next() != null) {
        assertNotNull(luceneTermEnum.next());

        assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
        DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
        DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
        if (luceneDocsPosEnum == null) {
          // test we expect that...
          assertFalse(field.storedOffset);
          assertFalse(field.storedPayloads);
          assertFalse(field.storedPositions);
          continue;
        }

        String currentTerm = esTermEnum.term().utf8ToString();

        assertThat(
            "Token mismatch for field: " + field.name,
            currentTerm,
            equalTo(luceneTermEnum.term().utf8ToString()));

        esDocsPosEnum.nextDoc();
        luceneDocsPosEnum.nextDoc();

        int freq = esDocsPosEnum.freq();
        assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
        for (int i = 0; i < freq; i++) {
          String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
          int lucenePos = luceneDocsPosEnum.nextPosition();
          int esPos = esDocsPosEnum.nextPosition();
          if (field.storedPositions && testConfig.requestPositions) {
            assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
          } else {
            assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
          }
          if (field.storedOffset && testConfig.requestOffsets) {
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.startOffset(),
                equalTo(esDocsPosEnum.startOffset()));
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.endOffset(),
                equalTo(esDocsPosEnum.endOffset()));
          } else {
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
          }
          if (field.storedPayloads && testConfig.requestPayloads) {
            assertThat(
                "Payload test failed" + failDesc,
                luceneDocsPosEnum.getPayload(),
                equalTo(esDocsPosEnum.getPayload()));
          } else {
            assertThat(
                "Missing payload test failed" + failDesc,
                esDocsPosEnum.getPayload(),
                equalTo(null));
          }
        }
      }

      assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
  }