private void buildTermStatistics(XContentBuilder builder, TermsEnum termIter) throws IOException { // write term statistics. At this point we do not naturally have a // boolean that says if these values actually were requested. // However, we can assume that they were not if the statistic values are // <= 0. assert (((termIter.docFreq() > 0) && (termIter.totalTermFreq() > 0)) || ((termIter.docFreq() == -1) && (termIter.totalTermFreq() == -1))); int docFreq = termIter.docFreq(); if (docFreq > 0) { builder.field(FieldStrings.DOC_FREQ, docFreq); builder.field(FieldStrings.TTF, termIter.totalTermFreq()); } }
/** * Add term frequencies for a single document to a frequency map. * * @param reader the index * @param doc doc id * @param luceneName the index field from which to use the term vector * @param freq where to add to the token frequencies */ public static void getFrequenciesFromTermVector( IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) { try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms"); } TermsEnum termsEnum = terms.iterator(); // Verzamel concordantiewoorden uit term vector PostingsEnum postingsEnum = null; while (termsEnum.next() != null) { postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS); String term = termsEnum.term().utf8ToString(); Integer n = freq.get(term); if (n == null) { n = 0; } while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { n += termsEnum.docFreq(); } freq.put(term, n); } } catch (Exception e) { throw ExUtil.wrapRuntimeException(e); } }
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms = null; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); } iter = terms.iterator(null); // init uid iterator while (iter.term() != null) { // if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { log.warning(iter.term().utf8ToString()); } iter.next(); /*} else { break; }*/ } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing index reader", e); } } } }
/* * listTermDictionary displays the term dictionary for a field. */ static void listTermDictionary(IndexReader reader, String fieldName) throws IOException { System.out.println("\nTerm Dictionary: field " + fieldName); /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */ Terms terms = MultiFields.getTerms(reader, fieldName); if ((terms == null) || (terms.size() == -1)) System.out.println(" The term dictionary is empty."); else { System.out.println(" Vocabulary size: " + terms.size() + " terms"); TermsEnum ithTerm = terms.iterator(null); /* * Iterate over the terms in this document. * Information about a term's occurrences (tf and * positions) is accessed via the indexing API, which * returns inverted lists that describe (only) the * current document. */ while (ithTerm.next() != null) { System.out.format( " %-30s %d %d\n", ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq()); } ; } ; }
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field) throws Exception { BytesRef term; while ((term = termsEnum.next()) != null) { BytesRef r = new BytesRef(); r.copyBytes(term); tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq())); } }
protected void fill(String field, TermsEnum termsEnum) throws IOException { while (true) { BytesRef term = termsEnum.next(); if (term != null) { insertWithOverflow(new TermStats(field, term, termsEnum.docFreq())); } else { break; } } }
public void collectTermContext( IndexReader reader, List<LeafReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms) throws IOException { TermsEnum termsEnum = null; for (LeafReaderContext context : leaves) { final Fields fields = context.reader().fields(); for (int i = 0; i < queryTerms.length; i++) { Term term = queryTerms[i]; TermContext termContext = contextArray[i]; final Terms terms = fields.terms(term.field()); if (terms == null) { // field does not exist continue; } termsEnum = terms.iterator(); assert termsEnum != null; if (termsEnum == TermsEnum.EMPTY) continue; if (termsEnum.seekExact(term.bytes())) { if (termContext == null) { contextArray[i] = new TermContext( reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { termContext.register( termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } } }
public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException { DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead int maxDoc = searcher.getIndexReader().maxDoc(); int smallSetSize = smallSetSize(maxDoc); String field = term.field(); BytesRef termVal = term.bytes(); int maxCount = 0; int firstReader = -1; List<LeafReaderContext> leaves = reader.leaves(); PostingsEnum[] postList = new PostingsEnum [leaves .size()]; // use array for slightly higher scanning cost, but fewer memory // allocations for (LeafReaderContext ctx : leaves) { assert leaves.get(ctx.ord) == ctx; LeafReader r = ctx.reader(); Fields f = r.fields(); Terms t = f.terms(field); if (t == null) continue; // field is missing TermsEnum te = t.iterator(); if (te.seekExact(termVal)) { maxCount += te.docFreq(); postList[ctx.ord] = te.postings(null, PostingsEnum.NONE); if (firstReader < 0) firstReader = ctx.ord; } } if (maxCount == 0) { return DocSet.EMPTY; } if (maxCount <= smallSetSize) { return createSmallSet(leaves, postList, maxCount, firstReader); } return createBigSet(leaves, postList, maxDoc, firstReader); }
@Override public int docFreq() throws IOException { return termsEnum.docFreq(); }
/** Call this only once (if you subclass!) */ protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) { throw new IllegalStateException( "Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); } // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); final long startTime = System.nanoTime(); prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix); final int maxDoc = reader.maxDoc(); final int[] index = new int [maxDoc]; // immediate term numbers, or the index into the byte[] representing the last // number final int[] lastTerm = new int[maxDoc]; // last term we saw for this document final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) final Terms terms = reader.terms(field); if (terms == null) { // No terms return; } final TermsEnum te = terms.iterator(); final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); // System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // For our "term index wrapper" final List<BytesRef> indexedTerms = new ArrayList<>(); final PagedBytes indexedTermsBytes = new PagedBytes(15); // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. byte[] tempArr = new byte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in its byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. This requires going over the field first to find the most // frequent terms ahead of time. int termNum = 0; postingsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ; ) { final BytesRef t = te.term(); if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) { break; } // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); visitTerm(te, termNum); if ((termNum & indexIntervalMask) == 0) { // Index this term sizeOfIndexedStrings += t.length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.add(indexedTerm); } final int df = te.docFreq(); if (df <= maxTermDocFreq) { postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ; ) { int doc = postingsEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } // System.out.println(" chunk=" + chunk + " docs"); actualDF++; termInstances++; // System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val >>> 8; int ilen = vIntSize(delta); byte[] arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit // boundary. // TODO: figure out what array lengths we can round up to w/o actually using more // memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment byte[] newarr = new byte[newLen]; System.arraycopy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = writeInt(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } // System.out.println(" ipos=" + ipos); int endPos = writeInt(delta, tempArr, ipos); // System.out.println(" endpos=" + endPos); if (endPos <= 4) { // System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (byte) val; val >>>= 8; } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new byte[12]; } } } setActualDocFreq(termNum, actualDF); } termNum++; if (te.next() == null) { break; } } numTermsInField = termNum; long midPoint = System.nanoTime(); if (termInstances == 0) { // we didn't invert anything // lower memory consumption. tnums = null; } else { this.index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { byte[] target = tnums[pass]; int pos = 0; // end in target; if (target != null) { pos = target.length; } else { target = new byte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { // System.out.println(" pass="******" process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val >>> 8; // System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw new IllegalStateException( "Too many values for UnInvertedField faceting on field " + field); } byte[] arr = bytes[doc]; /* for(byte b : arr) { //System.out.println(" b=" + Integer.toHexString((int) b)); } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.length <= pos + len) { int newlen = target.length; /** * * we don't have to worry about the array getting too large since the "pos" param * will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { // * overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new * SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen * <= pos + len) newlen<<=1; // doubling strategy } ** */ while (newlen <= pos + len) newlen <<= 1; // doubling strategy byte[] newtarget = new byte[newlen]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } System.arraycopy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.length) { byte[] newtarget = new byte[pos]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } tnums[pass] = target; if ((pass << 16) > maxDoc) break; } } indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); long endTime = System.nanoTime(); total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS); phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS); }
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(TermsParams.TERMS, false)) return; String[] fields = params.getParams(TermsParams.TERMS_FIELD); NamedList<Object> termsResult = new SimpleOrderedMap<>(); rb.rsp.add("terms", termsResult); if (fields == null || fields.length == 0) return; int limit = params.getInt(TermsParams.TERMS_LIMIT, 10); if (limit < 0) { limit = Integer.MAX_VALUE; } String lowerStr = params.get(TermsParams.TERMS_LOWER); String upperStr = params.get(TermsParams.TERMS_UPPER); boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false); boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true); boolean sort = !TermsParams.TERMS_SORT_INDEX.equals( params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT)); int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); if (freqmax < 0) { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); String regexp = params.get(TermsParams.TERMS_REGEXP_STR); Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; boolean raw = params.getBool(TermsParams.TERMS_RAW, false); final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader(); Fields lfields = indexReader.fields(); for (String field : fields) { NamedList<Integer> fieldTerms = new NamedList<>(); termsResult.add(field, fieldTerms); Terms terms = lfields == null ? null : lfields.terms(field); if (terms == null) { // no terms for this field continue; } FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); // prefix must currently be text BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix); BytesRef upperBytes = null; if (upperStr != null) { upperBytes = new BytesRef(); ft.readableToIndexed(upperStr, upperBytes); } BytesRef lowerBytes; if (lowerStr == null) { // If no lower bound was specified, use the prefix lowerBytes = prefixBytes; } else { lowerBytes = new BytesRef(); if (raw) { // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists // perhaps we detect if the FieldType is non-character and expect hex if so? lowerBytes = new BytesRef(lowerStr); } else { lowerBytes = new BytesRef(); ft.readableToIndexed(lowerStr, lowerBytes); } } TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; if (lowerBytes != null) { if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); // Only advance the enum if we are excluding the lower bound and the lower Term actually // matches if (lowerIncl == false && term.equals(lowerBytes)) { term = termsEnum.next(); } } } else { // position termsEnum on first term term = termsEnum.next(); } int i = 0; BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null); CharsRef external = new CharsRef(); while (term != null && (i < limit || sort)) { boolean externalized = false; // did we fill in "external" yet for this term? // stop if the prefix doesn't match if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break; if (pattern != null) { // indexed text or external text? // TODO: support "raw" mode? ft.indexedToReadable(term, external); externalized = true; if (!pattern.matcher(external).matches()) { term = termsEnum.next(); continue; } } if (upperBytes != null) { int upperCmp = term.compareTo(upperBytes); // if we are past the upper term, or equal to it (when don't include upper) then stop. if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break; } // This is a good term in the range. Check if mincount/maxcount conditions are satisfied. int docFreq = termsEnum.docFreq(); if (docFreq >= freqmin && docFreq <= freqmax) { // add the term to the list if (sort) { queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq)); } else { // TODO: handle raw somehow if (!externalized) { ft.indexedToReadable(term, external); } fieldTerms.add(external.toString(), docFreq); i++; } } term = termsEnum.next(); } if (sort) { for (CountPair<BytesRef, Integer> item : queue) { if (i >= limit) break; ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); i++; } } } }
/** * Returns a list of terms in the specified field along with the corresponding count of documents * in the set that match that constraint. This method uses the FilterCache to get the intersection * count between <code>docs</code> and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts( SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, SolrParams params) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); LeafReader r = searcher.getLeafReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef prefixTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); prefixTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (prefixTermBytes != null) { if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } PostingsEnum postingsEnum = null; CharsRefBuilder charsRef = new CharsRefBuilder(); if (docs.size() >= mincount) { while (term != null) { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; } c = searcher.numDocs(docs, deState); postingsEnum = deState.postingsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it // matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class // impl) // TODO: would passing deleted docs lead to better efficiency over checking the // fastForRandomSet? postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); c = 0; if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
@Test public void luceneNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription( TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "text*.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir); for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); // Bits liveDocs = MultiFields.getLiveDocs(index); // DocsEnum docs = termsEnum.docs(liveDocs, null); // int docId; // while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) { // index.g // } BytesRef text = null; while ((text = termsEnum.next()) != null) { // System.out.println(text.utf8ToString() + " - " + // termsEnum.totalTermFreq()); // System.out.println(termsEnum.docFreq()); if (text.utf8ToString().equals("this")) { assertEquals(2, termsEnum.docFreq()); assertEquals(3, termsEnum.totalTermFreq()); } i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(35, i); }
private void checkAllInfo( int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException { TermVectorsRequestBuilder resp = client() .prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(true) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .setSelectedFields(); assertThat(resp.request().fieldStatistics(), equalTo(true)); TermVectorsResponse response = resp.execute().actionGet(); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); Terms terms = fields.terms("field"); assertThat(terms.size(), equalTo(8l)); assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, Matchers.notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, Matchers.notNullValue()); if (string.equals("the")) { assertThat( "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); } else { assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); } PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); assertThat(iterator.docFreq(), equalTo(numDocs)); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } assertThat(iterator.next(), Matchers.nullValue()); XContentBuilder xBuilder = XContentFactory.jsonBuilder(); xBuilder.startObject(); response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS); xBuilder.endObject(); BytesStream bytesStream = xBuilder.bytesStream(); String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", ""); ; String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; assertThat(utf8, equalTo(expectedString)); }
/** checks term-level statistics */ public void assertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum) throws Exception { assertEquals(leftTermsEnum.docFreq(), rightTermsEnum.docFreq()); if (leftTermsEnum.totalTermFreq() != -1 && rightTermsEnum.totalTermFreq() != -1) { assertEquals(leftTermsEnum.totalTermFreq(), rightTermsEnum.totalTermFreq()); } }
/** * checks the terms enum sequentially if deep is false, it does a 'shallow' test that doesnt go * down to the docsenums */ public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep) throws Exception { BytesRef term; Bits randomBits = new RandomBits(MAXDOC, random().nextDouble(), random()); DocsAndPositionsEnum leftPositions = null; DocsAndPositionsEnum rightPositions = null; DocsEnum leftDocs = null; DocsEnum rightDocs = null; while ((term = leftTermsEnum.next()) != null) { assertEquals(term, rightTermsEnum.next()); assertTermStats(leftTermsEnum, rightTermsEnum); if (deep) { // with payloads + off assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions)); assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions), rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions), rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions)); // with payloads only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions( null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.docsAndPositions( null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions( randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.docsAndPositions( randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions( null, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.docsAndPositions( null, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions( randomBits, leftPositions, DocsAndPositionsEnum.FLAG_PAYLOADS), rightPositions = rightTermsEnum.docsAndPositions( randomBits, rightPositions, DocsAndPositionsEnum.FLAG_PAYLOADS)); // with offsets only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions( null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.docsAndPositions( null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions( randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.docsAndPositions( randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions( null, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.docsAndPositions( null, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions( randomBits, leftPositions, DocsAndPositionsEnum.FLAG_OFFSETS), rightPositions = rightTermsEnum.docsAndPositions( randomBits, rightPositions, DocsAndPositionsEnum.FLAG_OFFSETS)); // with positions only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions(null, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.docsAndPositions(null, rightPositions, DocsEnum.FLAG_NONE)); assertPositionsSkipping( leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.docsAndPositions(randomBits, leftPositions, DocsEnum.FLAG_NONE), rightPositions = rightTermsEnum.docsAndPositions(randomBits, rightPositions, DocsEnum.FLAG_NONE)); // with freqs: assertDocsEnum( leftDocs = leftTermsEnum.docs(null, leftDocs), rightDocs = rightTermsEnum.docs(null, rightDocs)); assertDocsEnum( leftDocs = leftTermsEnum.docs(randomBits, leftDocs), rightDocs = rightTermsEnum.docs(randomBits, rightDocs)); // w/o freqs: assertDocsEnum( leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE)); assertDocsEnum( leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); // with freqs: assertDocsSkipping( leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.docs(null, leftDocs), rightDocs = rightTermsEnum.docs(null, rightDocs)); assertDocsSkipping( leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.docs(randomBits, leftDocs), rightDocs = rightTermsEnum.docs(randomBits, rightDocs)); // w/o freqs: assertDocsSkipping( leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.docs(null, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.docs(null, rightDocs, DocsEnum.FLAG_NONE)); assertDocsSkipping( leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.docs(randomBits, leftDocs, DocsEnum.FLAG_NONE), rightDocs = rightTermsEnum.docs(randomBits, rightDocs, DocsEnum.FLAG_NONE)); } } assertNull(rightTermsEnum.next()); }