Example #1
0
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Example #2
0
  private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception {
    SegmentReader reader =
        new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));

    for (int i = 0; i < reader.numDocs(); i++) out.println(reader.document(i));

    Fields fields = reader.fields();
    for (String field : fields) {
      Terms terms = fields.terms(field);
      assertNotNull(terms);
      TermsEnum tis = terms.iterator(null);
      while (tis.next() != null) {

        out.print("  term=" + field + ":" + tis.term());
        out.println("    DF=" + tis.docFreq());

        DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);

        while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          out.print(" doc=" + positions.docID());
          out.print(" TF=" + positions.freq());
          out.print(" pos=");
          out.print(positions.nextPosition());
          for (int j = 1; j < positions.freq(); j++) out.print("," + positions.nextPosition());
          out.println("");
        }
      }
    }
    reader.close();
  }
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
Example #4
0
 public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field)
     throws Exception {
   BytesRef term;
   while ((term = termsEnum.next()) != null) {
     BytesRef r = new BytesRef();
     r.copyBytes(term);
     tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq()));
   }
 }
Example #5
0
 @Override
 public final int docFreq(Term term) throws IOException {
   final Terms terms = terms(term.field());
   if (terms == null) {
     return 0;
   }
   final TermsEnum termsEnum = terms.iterator();
   if (termsEnum.seekExact(term.bytes())) {
     return termsEnum.docFreq();
   } else {
     return 0;
   }
 }
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        lowerBytes = new BytesRef();
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);
        }
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
  public void verifyEquals(DirectoryReader r1, DirectoryReader r2, String idField)
      throws Throwable {
    if (VERBOSE) {
      System.out.println("\nr1 docs:");
      printDocs(r1);
      System.out.println("\nr2 docs:");
      printDocs(r2);
    }
    if (r1.numDocs() != r2.numDocs()) {
      assert false : "r1.numDocs()=" + r1.numDocs() + " vs r2.numDocs()=" + r2.numDocs();
    }
    boolean hasDeletes = !(r1.maxDoc() == r2.maxDoc() && r1.numDocs() == r1.maxDoc());

    int[] r2r1 = new int[r2.maxDoc()]; // r2 id to r1 id mapping

    // create mapping from id2 space to id2 based on idField
    final Fields f1 = MultiFields.getFields(r1);
    if (f1 == null) {
      // make sure r2 is empty
      assertNull(MultiFields.getFields(r2));
      return;
    }
    final Terms terms1 = f1.terms(idField);
    if (terms1 == null) {
      assertTrue(
          MultiFields.getFields(r2) == null || MultiFields.getFields(r2).terms(idField) == null);
      return;
    }
    final TermsEnum termsEnum = terms1.iterator(null);

    final Bits liveDocs1 = MultiFields.getLiveDocs(r1);
    final Bits liveDocs2 = MultiFields.getLiveDocs(r2);

    Fields fields = MultiFields.getFields(r2);
    if (fields == null) {
      // make sure r1 is in fact empty (eg has only all
      // deleted docs):
      Bits liveDocs = MultiFields.getLiveDocs(r1);
      DocsEnum docs = null;
      while (termsEnum.next() != null) {
        docs = TestUtil.docs(random(), termsEnum, liveDocs, docs, DocsEnum.FLAG_NONE);
        while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          fail("r1 is not empty but r2 is");
        }
      }
      return;
    }
    Terms terms2 = fields.terms(idField);
    TermsEnum termsEnum2 = terms2.iterator(null);

    DocsEnum termDocs1 = null;
    DocsEnum termDocs2 = null;

    while (true) {
      BytesRef term = termsEnum.next();
      // System.out.println("TEST: match id term=" + term);
      if (term == null) {
        break;
      }

      termDocs1 = TestUtil.docs(random(), termsEnum, liveDocs1, termDocs1, DocsEnum.FLAG_NONE);
      if (termsEnum2.seekExact(term)) {
        termDocs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, termDocs2, DocsEnum.FLAG_NONE);
      } else {
        termDocs2 = null;
      }

      if (termDocs1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        // This doc is deleted and wasn't replaced
        assertTrue(termDocs2 == null || termDocs2.nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
        continue;
      }

      int id1 = termDocs1.docID();
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs1.nextDoc());

      assertTrue(termDocs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      int id2 = termDocs2.docID();
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs2.nextDoc());

      r2r1[id2] = id1;

      // verify stored fields are equivalent
      try {
        verifyEquals(r1.document(id1), r2.document(id2));
      } catch (Throwable t) {
        System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
        System.out.println("  d1=" + r1.document(id1));
        System.out.println("  d2=" + r2.document(id2));
        throw t;
      }

      try {
        // verify term vectors are equivalent
        verifyEquals(r1.getTermVectors(id1), r2.getTermVectors(id2));
      } catch (Throwable e) {
        System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
        Fields tv1 = r1.getTermVectors(id1);
        System.out.println("  d1=" + tv1);
        if (tv1 != null) {
          DocsAndPositionsEnum dpEnum = null;
          DocsEnum dEnum = null;
          for (String field : tv1) {
            System.out.println("    " + field + ":");
            Terms terms3 = tv1.terms(field);
            assertNotNull(terms3);
            TermsEnum termsEnum3 = terms3.iterator(null);
            BytesRef term2;
            while ((term2 = termsEnum3.next()) != null) {
              System.out.println(
                  "      " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
              dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
              if (dpEnum != null) {
                assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                final int freq = dpEnum.freq();
                System.out.println("        doc=" + dpEnum.docID() + " freq=" + freq);
                for (int posUpto = 0; posUpto < freq; posUpto++) {
                  System.out.println("          pos=" + dpEnum.nextPosition());
                }
              } else {
                dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS);
                assertNotNull(dEnum);
                assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                final int freq = dEnum.freq();
                System.out.println("        doc=" + dEnum.docID() + " freq=" + freq);
              }
            }
          }
        }

        Fields tv2 = r2.getTermVectors(id2);
        System.out.println("  d2=" + tv2);
        if (tv2 != null) {
          DocsAndPositionsEnum dpEnum = null;
          DocsEnum dEnum = null;
          for (String field : tv2) {
            System.out.println("    " + field + ":");
            Terms terms3 = tv2.terms(field);
            assertNotNull(terms3);
            TermsEnum termsEnum3 = terms3.iterator(null);
            BytesRef term2;
            while ((term2 = termsEnum3.next()) != null) {
              System.out.println(
                  "      " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
              dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
              if (dpEnum != null) {
                assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                final int freq = dpEnum.freq();
                System.out.println("        doc=" + dpEnum.docID() + " freq=" + freq);
                for (int posUpto = 0; posUpto < freq; posUpto++) {
                  System.out.println("          pos=" + dpEnum.nextPosition());
                }
              } else {
                dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS);
                assertNotNull(dEnum);
                assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                final int freq = dEnum.freq();
                System.out.println("        doc=" + dEnum.docID() + " freq=" + freq);
              }
            }
          }
        }

        throw e;
      }
    }

    // System.out.println("TEST: done match id");

    // Verify postings
    // System.out.println("TEST: create te1");
    final Fields fields1 = MultiFields.getFields(r1);
    final Iterator<String> fields1Enum = fields1.iterator();
    final Fields fields2 = MultiFields.getFields(r2);
    final Iterator<String> fields2Enum = fields2.iterator();

    String field1 = null, field2 = null;
    TermsEnum termsEnum1 = null;
    termsEnum2 = null;
    DocsEnum docs1 = null, docs2 = null;

    // pack both doc and freq into single element for easy sorting
    long[] info1 = new long[r1.numDocs()];
    long[] info2 = new long[r2.numDocs()];

    for (; ; ) {
      BytesRef term1 = null, term2 = null;

      // iterate until we get some docs
      int len1;
      for (; ; ) {
        len1 = 0;
        if (termsEnum1 == null) {
          if (!fields1Enum.hasNext()) {
            break;
          }
          field1 = fields1Enum.next();
          Terms terms = fields1.terms(field1);
          if (terms == null) {
            continue;
          }
          termsEnum1 = terms.iterator(null);
        }
        term1 = termsEnum1.next();
        if (term1 == null) {
          // no more terms in this field
          termsEnum1 = null;
          continue;
        }

        // System.out.println("TEST: term1=" + term1);
        docs1 = TestUtil.docs(random(), termsEnum1, liveDocs1, docs1, DocsEnum.FLAG_FREQS);
        while (docs1.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int d = docs1.docID();
          int f = docs1.freq();
          info1[len1] = (((long) d) << 32) | f;
          len1++;
        }
        if (len1 > 0) break;
      }

      // iterate until we get some docs
      int len2;
      for (; ; ) {
        len2 = 0;
        if (termsEnum2 == null) {
          if (!fields2Enum.hasNext()) {
            break;
          }
          field2 = fields2Enum.next();
          Terms terms = fields2.terms(field2);
          if (terms == null) {
            continue;
          }
          termsEnum2 = terms.iterator(null);
        }
        term2 = termsEnum2.next();
        if (term2 == null) {
          // no more terms in this field
          termsEnum2 = null;
          continue;
        }

        // System.out.println("TEST: term1=" + term1);
        docs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, docs2, DocsEnum.FLAG_FREQS);
        while (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int d = r2r1[docs2.docID()];
          int f = docs2.freq();
          info2[len2] = (((long) d) << 32) | f;
          len2++;
        }
        if (len2 > 0) break;
      }

      assertEquals(len1, len2);
      if (len1 == 0) break; // no more terms

      assertEquals(field1, field2);
      assertTrue(term1.bytesEquals(term2));

      if (!hasDeletes) assertEquals(termsEnum1.docFreq(), termsEnum2.docFreq());

      assertEquals("len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes, term1, term2);

      // sort info2 to get it into ascending docid
      Arrays.sort(info2, 0, len2);

      // now compare
      for (int i = 0; i < len1; i++) {
        assertEquals(
            "i="
                + i
                + " len="
                + len1
                + " d1="
                + (info1[i] >>> 32)
                + " f1="
                + (info1[i] & Integer.MAX_VALUE)
                + " d2="
                + (info2[i] >>> 32)
                + " f2="
                + (info2[i] & Integer.MAX_VALUE)
                + " field="
                + field1
                + " term="
                + term1.utf8ToString(),
            info1[i],
            info2[i]);
      }
    }
  }