public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException {
  ShapeFieldCache<T> idx = sidx.get(reader);
  if (idx != null) {
    return idx;
  }
  long startTime = System.currentTimeMillis();

  log.fine("Building Cache [" + reader.maxDoc() + "]");
  idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize);
  int count = 0;
  DocsEnum docs = null;
  Terms terms = reader.terms(shapeField);
  TermsEnum te = null;
  if (terms != null) {
    te = terms.iterator(te);
    BytesRef term = te.next();
    while (term != null) {
      T shape = readShape(term);
      if (shape != null) {
        docs = te.docs(null, docs, DocsEnum.FLAG_NONE);
        int docid = docs.nextDoc(); // primitive int; boxing every doc id into an Integer is wasteful
        while (docid != DocIdSetIterator.NO_MORE_DOCS) {
          idx.add(docid, shape);
          docid = docs.nextDoc();
          count++;
        }
      }
      term = te.next();
    }
  }
  sidx.put(reader, idx);
  long elapsed = System.currentTimeMillis() - startTime;
  log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);
  return idx;
}
/*
 * Utility function to display a term vector.
 */
static void termVectorDisplay(Terms terms) throws IOException {
  if ((terms == null) || (terms.size() == -1)) {
    System.out.println("    The field is not stored.");
  } else {
    /*
     * The terms for this field are stored.
     */
    System.out.println("    Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this document.
     * Information about a term's occurrences (tf and
     * positions) is accessed via the indexing API, which
     * returns inverted lists that describe (only) the
     * current document.
     */
    while (ithTerm.next() != null) {
      System.out.format("      %10d %-20s %d ",
          ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

      DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
      currDoc.nextDoc();
      for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) {
        System.out.print(currDoc.nextPosition() + " ");
      }
      System.out.println();
    }
  }
}
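/*
 * Usage sketch (not from the original sources): fetch a per-document term vector with the
 * Lucene 4.x reader API and feed it to termVectorDisplay() above. The docID/fieldName
 * parameters are placeholders; the field must have been indexed with term vectors and
 * positions, since termVectorDisplay() reads positions unconditionally.
 */
static void showDocVector(IndexReader reader, int docID, String fieldName) throws IOException {
  Fields vectors = reader.getTermVectors(docID); // null if the document has no term vectors
  if (vectors != null) {
    Terms vector = vectors.terms(fieldName);
    termVectorDisplay(vector); // termVectorDisplay() itself handles a null/empty Terms
  }
}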
/*
 * listTermDictionary displays the term dictionary for a field.
 */
static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {
  System.out.println("\nTerm Dictionary:  field " + fieldName);

  /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */
  Terms terms = MultiFields.getTerms(reader, fieldName);
  if ((terms == null) || (terms.size() == -1)) {
    System.out.println("    The term dictionary is empty.");
  } else {
    System.out.println("    Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this dictionary. Unlike the
     * term-vector case, the enum covers the whole field; a
     * term's document frequency and total occurrence count
     * are available directly from the TermsEnum.
     */
    while (ithTerm.next() != null) {
      System.out.format("      %-30s %d %d\n",
          ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
    }
  }
}
/**
 * Prepare a document reconstructor.
 *
 * @param reader IndexReader to read from.
 * @param fieldNames if non-null and not empty, data will be collected only from these fields,
 *     otherwise data will be collected from all fields
 * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
 * @throws Exception
 */
public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
  if (reader == null) {
    throw new Exception("IndexReader cannot be null.");
  }
  this.reader = reader;
  if (fieldNames == null || fieldNames.length == 0) {
    // collect fieldNames
    this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
  } else {
    this.fieldNames = fieldNames;
  }
  if (numTerms == -1) {
    Fields fields = MultiFields.getFields(reader);
    numTerms = 0;
    FieldsEnum fe = fields.iterator();
    String fld = null;
    while ((fld = fe.next()) != null) {
      TermsEnum te = fe.terms();
      while (te.next() != null) {
        numTerms++;
      }
    }
  }
  this.numTerms = numTerms; // assign unconditionally, also when the caller supplied the count
  deleted = MultiFields.getDeletedDocs(reader);
}
public void listTokens(int freq) throws IOException {
  IndexReader ireader = null;
  TermsEnum iter = null;
  Terms terms = null;

  try {
    ireader = DirectoryReader.open(indexDirectory);
    int numDocs = ireader.numDocs();
    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.DEFS);
    }
    if (terms == null) {
      // empty index or missing field: nothing to list
      return;
    }
    iter = terms.iterator(null); // init uid iterator
    // A fresh TermsEnum is unpositioned; advance it before reading term().
    while (iter.next() != null) {
      // if (iter.term().field().startsWith("f")) {
      if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
        log.warning(iter.term().utf8ToString());
      }
      /*} else {
        break;
      }*/
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
/**
 * List all of the files in this index database
 *
 * @throws IOException If an IO error occurs while reading from the database
 */
public void listFiles() throws IOException {
  IndexReader ireader = null;
  TermsEnum iter;
  Terms terms = null;

  try {
    ireader = DirectoryReader.open(indexDirectory); // open existing index
    int numDocs = ireader.numDocs();
    if (numDocs > 0) {
      Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
      terms = uFields.terms(QueryBuilder.U);
    }
    if (terms == null) {
      // empty index: nothing to list
      return;
    }
    iter = terms.iterator(null); // init uid iterator
    // Advance before reading: a fresh TermsEnum is not positioned on a term.
    while (iter.next() != null) {
      log.fine(Util.uid2url(iter.term().utf8ToString()));
    }
  } finally {
    if (ireader != null) {
      try {
        ireader.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing index reader", e);
      }
    }
  }
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName)
    throws IOException {
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    if (isSkipTerm(fieldName, term)) {
      continue;
    }

    final PostingsEnum docs = termsEnum.postings(null, null);
    int freq = 0;
    while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      freq += docs.freq();
    }

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception {
  SegmentReader reader =
      new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));

  for (int i = 0; i < reader.numDocs(); i++) {
    out.println(reader.document(i));
  }

  Fields fields = reader.fields();
  for (String field : fields) {
    Terms terms = fields.terms(field);
    assertNotNull(terms);
    TermsEnum tis = terms.iterator(null);
    while (tis.next() != null) {
      out.print("  term=" + field + ":" + tis.term());
      out.println("    DF=" + tis.docFreq());

      DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);

      while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        out.print(" doc=" + positions.docID());
        out.print(" TF=" + positions.freq());
        out.print(" pos=");
        out.print(positions.nextPosition());
        for (int j = 1; j < positions.freq(); j++) {
          out.print("," + positions.nextPosition());
        }
        out.println("");
      }
    }
  }
  reader.close();
}
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field)
    throws Exception {
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    BytesRef r = new BytesRef();
    r.copyBytes(term); // deep-copy: the enum may reuse the BytesRef it returns
    tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq()));
  }
}
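/*
 * Usage sketch (not from the original sources): drive fillQueue() above over every field of
 * a reader and drain the queue into an array. TermStatsQueue is assumed to be the
 * PriorityQueue subclass from Lucene's HighFreqTerms with a capacity-only constructor;
 * treat that constructor and the numTerms parameter as assumptions, not a fixed API.
 */
static TermStats[] topTermsByDocFreq(IndexReader reader, int numTerms) throws Exception {
  TermStatsQueue tiq = new TermStatsQueue(numTerms); // assumed capacity-only constructor
  Fields fields = MultiFields.getFields(reader);
  if (fields != null) {
    for (String field : fields) {
      Terms terms = fields.terms(field);
      if (terms != null) {
        fillQueue(terms.iterator(null), tiq, field);
      }
    }
  }
  int count = tiq.size();
  TermStats[] result = new TermStats[count];
  // pop() returns the smallest entry first, so fill the array from the back
  for (int i = count - 1; i >= 0; i--) {
    result[i] = tiq.pop();
  }
  return result;
}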
/** mixes up seek and next for all terms */
public void testSeekingAndNexting() throws Exception {
  for (int i = 0; i < numIterations; i++) {
    TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);

    for (BytesRef term : terms) {
      int c = random().nextInt(3);
      if (c == 0) {
        assertEquals(term, te.next());
      } else if (c == 1) {
        assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
        assertEquals(term, te.term());
      } else {
        assertTrue(te.seekExact(term, random().nextBoolean()));
      }
    }
  }
}
/** tests intersect: TODO start at a random term! */
public void testIntersect() throws Exception {
  for (int i = 0; i < numIterations; i++) {
    String reg = AutomatonTestUtil.randomRegexp(random());
    Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
    CompiledAutomaton ca =
        new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
    TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
    Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
    TreeSet<BytesRef> found = new TreeSet<BytesRef>();
    while (te.next() != null) {
      found.add(BytesRef.deepCopyOf(te.term()));
    }

    Automaton actual = BasicAutomata.makeStringUnion(found);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
}
private void verifyCount(IndexReader ir) throws Exception {
  Fields fields = MultiFields.getFields(ir);
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    int docCount = terms.getDocCount();
    FixedBitSet visited = new FixedBitSet(ir.maxDoc());
    TermsEnum te = terms.iterator();
    while (te.next() != null) {
      PostingsEnum de = TestUtil.docs(random(), te, null, PostingsEnum.NONE);
      while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        visited.set(de.docID());
      }
    }
    assertEquals(visited.cardinality(), docCount);
  }
}
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(TermsParams.TERMS, false)) return;

  String[] fields = params.getParams(TermsParams.TERMS_FIELD);

  NamedList<Object> termsResult = new SimpleOrderedMap<>();
  rb.rsp.add("terms", termsResult);

  if (fields == null || fields.length == 0) return;

  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }

  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort =
      !TermsParams.TERMS_SORT_INDEX.equals(
          params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

  final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
  Fields lfields = indexReader.fields();

  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
      // no terms for this field
      continue;
    }

    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null) ft = new StrField();

    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

    BytesRef upperBytes = null;
    if (upperStr != null) {
      upperBytes = new BytesRef();
      ft.readableToIndexed(upperStr, upperBytes);
    }

    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else if (raw) {
      // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
      // perhaps we detect if the FieldType is non-character and expect hex if so?
      lowerBytes = new BytesRef(lowerStr);
    } else {
      lowerBytes = new BytesRef();
      ft.readableToIndexed(lowerStr, lowerBytes);
    }

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;

    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower Term actually
        // matches
        if (lowerIncl == false && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }

    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
    CharsRef external = new CharsRef();
    while (term != null && (i < limit || sort)) {
      boolean externalized = false; // did we fill in "external" yet for this term?

      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external).matches()) {
          term = termsEnum.next();
          continue;
        }
      }

      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // if we are past the upper term, or equal to it (when don't include upper) then stop.
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
      }

      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }

      term = termsEnum.next();
    }

    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit) break;
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
@Override
public DoubleArrayAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
  AtomicReader reader = context.reader();

  Terms terms = reader.terms(getFieldNames().indexName());
  if (terms == null) {
    return DoubleArrayAtomicFieldData.EMPTY;
  }

  // TODO: how can we guess the number of terms? numerics end up creating more terms per value...
  final TDoubleArrayList values = new TDoubleArrayList();
  ArrayList<int[]> ordinals = new ArrayList<int[]>();
  int[] idx = new int[reader.maxDoc()];
  ordinals.add(new int[reader.maxDoc()]);

  values.add(0); // slot 0 is reserved as the "no value" sentinel
  int termOrd = 1; // current term number

  TermsEnum termsEnum = terms.iterator(null);
  try {
    DocsEnum docsEnum = null;
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      values.add(FieldCache.NUMERIC_UTILS_DOUBLE_PARSER.parseDouble(term));
      docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
      for (int docId = docsEnum.nextDoc();
          docId != DocsEnum.NO_MORE_DOCS;
          docId = docsEnum.nextDoc()) {
        int[] ordinal;
        if (idx[docId] >= ordinals.size()) {
          ordinal = new int[reader.maxDoc()];
          ordinals.add(ordinal);
        } else {
          ordinal = ordinals.get(idx[docId]);
        }
        ordinal[docId] = termOrd;
        idx[docId]++;
      }
      termOrd++;
    }
  } catch (RuntimeException e) {
    if (e.getClass().getName().endsWith("StopFillCacheException")) {
      // all is well, in case numeric parsers are used.
    } else {
      throw e;
    }
  }
  if (ordinals.size() == 1) {
    int[] nativeOrdinals = ordinals.get(0);
    FixedBitSet set = new FixedBitSet(reader.maxDoc());
    double[] sValues = new double[reader.maxDoc()];
    boolean allHaveValue = true;
    for (int i = 0; i < nativeOrdinals.length; i++) {
      int nativeOrdinal = nativeOrdinals[i];
      if (nativeOrdinal == 0) {
        allHaveValue = false;
      } else {
        set.set(i);
        sValues[i] = values.get(nativeOrdinal);
      }
    }
    if (allHaveValue) {
      return new DoubleArrayAtomicFieldData.Single(sValues, reader.maxDoc());
    } else {
      return new DoubleArrayAtomicFieldData.SingleFixedSet(sValues, reader.maxDoc(), set);
    }
  } else {
    int[][] nativeOrdinals = new int[ordinals.size()][];
    for (int i = 0; i < nativeOrdinals.length; i++) {
      nativeOrdinals[i] = ordinals.get(i);
    }
    return new DoubleArrayAtomicFieldData.WithOrdinals(
        values.toArray(new double[values.size()]),
        reader.maxDoc(),
        Ordinals.Factories.createFromFlatOrdinals(
            nativeOrdinals, termOrd, fieldDataType.getSettings()));
  }
}
protected void validateResponse(TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
    throws IOException {
  TestDoc testDoc = testConfig.doc;
  HashSet<String> selectedFields =
      testConfig.selectedFields == null
          ? null
          : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
  Fields esTermVectorFields = esResponse.getFields();
  for (TestFieldSetting field : testDoc.fieldSettings) {
    Terms esTerms = esTermVectorFields.terms(field.name);
    if (selectedFields != null && !selectedFields.contains(field.name)) {
      assertNull(esTerms);
      continue;
    }

    assertNotNull(esTerms);

    Terms luceneTerms = luceneFields.terms(field.name);
    TermsEnum esTermEnum = esTerms.iterator(null);
    TermsEnum luceneTermEnum = luceneTerms.iterator(null);

    while (esTermEnum.next() != null) {
      assertNotNull(luceneTermEnum.next());

      assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
      DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
      DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
      if (luceneDocsPosEnum == null) {
        // test we expect that...
        assertFalse(field.storedOffset);
        assertFalse(field.storedPayloads);
        assertFalse(field.storedPositions);
        continue;
      }

      String currentTerm = esTermEnum.term().utf8ToString();

      assertThat(
          "Token mismatch for field: " + field.name,
          currentTerm,
          equalTo(luceneTermEnum.term().utf8ToString()));

      esDocsPosEnum.nextDoc();
      luceneDocsPosEnum.nextDoc();

      int freq = esDocsPosEnum.freq();
      assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
      for (int i = 0; i < freq; i++) {
        String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
        int lucenePos = luceneDocsPosEnum.nextPosition();
        int esPos = esDocsPosEnum.nextPosition();
        if (field.storedPositions && testConfig.requestPositions) {
          assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
        } else {
          assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
        }
        if (field.storedOffset && testConfig.requestOffsets) {
          assertThat(
              "Offset test failed" + failDesc,
              luceneDocsPosEnum.startOffset(),
              equalTo(esDocsPosEnum.startOffset()));
          assertThat(
              "Offset test failed" + failDesc,
              luceneDocsPosEnum.endOffset(),
              equalTo(esDocsPosEnum.endOffset()));
        } else {
          assertThat(
              "Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
          assertThat(
              "Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
        }
        if (field.storedPayloads && testConfig.requestPayloads) {
          assertThat(
              "Payload test failed" + failDesc,
              luceneDocsPosEnum.getPayload(),
              equalTo(esDocsPosEnum.getPayload()));
        } else {
          assertThat(
              "Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
        }
      }
    }
    assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
  }
}
/**
 * Generate indexes recursively
 *
 * @param dir the root indexDirectory to generate indexes for
 * @param parent the path of the parent directory, relative to the source root
 * @param count_only if true will just traverse the source root and count files
 * @param cur_count current count during the traversal of the tree
 * @param est_total estimate total files to process
 */
private int indexDown(File dir, String parent, boolean count_only, int cur_count, int est_total)
    throws IOException {
  int lcur_count = cur_count;
  if (isInterrupted()) {
    return lcur_count;
  }

  if (!accept(dir)) {
    return lcur_count;
  }

  File[] files = dir.listFiles();
  if (files == null) {
    log.log(Level.SEVERE, "Failed to get file listing for: {0}", dir.getAbsolutePath());
    return lcur_count;
  }
  Arrays.sort(files, new Comparator<File>() {
    @Override
    public int compare(File p1, File p2) {
      return p1.getName().compareTo(p2.getName());
    }
  });

  for (File file : files) {
    if (accept(dir, file)) {
      String path = parent + '/' + file.getName();

      if (file.isDirectory()) {
        lcur_count = indexDown(file, path, count_only, lcur_count, est_total);
      } else {
        lcur_count++;
        if (count_only) {
          continue;
        }

        if (RuntimeEnvironment.getInstance().isPrintProgress()
            && est_total > 0
            && log.isLoggable(Level.INFO)) {
          log.log(Level.INFO, "Progress: {0} ({1}%)",
              new Object[] {lcur_count, (lcur_count * 100.0f / est_total)});
        }

        if (uidIter != null) {
          String uid = Util.path2uid(path,
              DateTools.timeToString(
                  file.lastModified(), DateTools.Resolution.MILLISECOND)); // construct uid for doc
          BytesRef buid = new BytesRef(uid);
          while (uidIter.term() != null
              && uidIter.term().compareTo(emptyBR) != 0
              && uidIter.term().compareTo(buid) < 0) {
            removeFile();
            uidIter.next();
          }

          if (uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
            uidIter.next(); // keep matching docs
            continue;
          }
        }

        try {
          addFile(file, path);
        } catch (Exception e) {
          log.log(Level.WARNING, "Failed to add file " + file.getAbsolutePath(), e);
        }
      }
    }
  }

  return lcur_count;
}
private void verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
    throws Exception {
  final DocTermOrds dto =
      new DocTermOrds(
          r,
          r.getLiveDocs(),
          "field",
          prefixRef,
          Integer.MAX_VALUE,
          _TestUtil.nextInt(random(), 2, 10));

  final FieldCache.Ints docIDToID = FieldCache.DEFAULT.getInts(r, "id", false);
  /*
  for(int docID=0;docID<subR.maxDoc();docID++) {
    System.out.println("  docID=" + docID + " id=" + docIDToID[docID]);
  }
  */

  if (VERBOSE) {
    System.out.println(
        "TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
    System.out.println("TEST: all TERMS:");
    TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(null);
    int ord = 0;
    while (allTE.next() != null) {
      System.out.println("  ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
    }
  }

  // final TermsEnum te = subR.fields().terms("field").iterator();
  final TermsEnum te = dto.getOrdTermsEnum(r);
  if (dto.numTerms() == 0) {
    if (prefixRef == null) {
      assertNull(MultiFields.getTerms(r, "field"));
    } else {
      Terms terms = MultiFields.getTerms(r, "field");
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef, false);
        if (result != TermsEnum.SeekStatus.END) {
          assertFalse(
              "term=" + termsEnum.term().utf8ToString()
                  + " matches prefix=" + prefixRef.utf8ToString(),
              StringHelper.startsWith(termsEnum.term(), prefixRef));
        } else {
          // ok
        }
      } else {
        // ok
      }
    }
    return;
  }

  if (VERBOSE) {
    System.out.println("TEST: TERMS:");
    te.seekExact(0);
    while (true) {
      System.out.println("  ord=" + te.ord() + " term=" + te.term().utf8ToString());
      if (te.next() == null) {
        break;
      }
    }
  }

  SortedSetDocValues iter = dto.iterator(r);
  for (int docID = 0; docID < r.maxDoc(); docID++) {
    if (VERBOSE) {
      System.out.println(
          "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.get(docID) + ")");
    }
    iter.setDocument(docID);
    final int[] answers = idToOrds[docIDToID.get(docID)];
    int upto = 0;
    long ord;
    while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      te.seekExact(ord);
      final BytesRef expected = termsArray[answers[upto++]];
      if (VERBOSE) {
        System.out.println(
            "  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
      }
      assertEquals(
          "expected=" + expected.utf8ToString()
              + " actual=" + te.term().utf8ToString()
              + " ord=" + ord,
          expected,
          te.term());
    }

    assertEquals(answers.length, upto);
  }
}
/**
 * Update the content of this index database
 *
 * @throws IOException if an error occurs
 * @throws HistoryException if an error occurs when accessing the history
 */
public void update() throws IOException, HistoryException {
  synchronized (lock) {
    if (running) {
      throw new IOException("Indexer already running!");
    }
    running = true;
    interrupted = false;
  }

  String ctgs = RuntimeEnvironment.getInstance().getCtags();
  if (ctgs != null) {
    ctags = new Ctags();
    ctags.setBinary(ctgs);
  }
  if (ctags == null) {
    log.severe("Unable to run ctags! searching definitions will not work!");
  }

  if (ctags != null) {
    String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile();
    if (filename != null) {
      ctags.setCTagsExtraOptionsFile(filename);
    }
  }

  try {
    Analyzer analyzer = AnalyzerGuru.getAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // iwc.setRAMBufferSizeMB(256.0); //TODO check what is the sweet spot
    writer = new IndexWriter(indexDirectory, iwc);
    writer.commit(); // to make sure index exists on the disk
    // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit());

    if (directories.isEmpty()) {
      if (project == null) {
        directories.add("");
      } else {
        directories.add(project.getPath());
      }
    }

    for (String dir : directories) {
      File sourceRoot;
      if ("".equals(dir)) {
        sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile();
      } else {
        sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir);
      }

      HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot);

      String startuid = Util.path2uid(dir, "");
      IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index
      Terms terms = null;
      int numDocs = reader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      try {
        if (numDocs > 0) {
          uidIter = terms.iterator(null);
          TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid
          if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) {
            uidIter = null;
          }
        }
        // TODO below should be optional, since it traverses the tree once more to get total
        // count! :(
        int file_cnt = 0;
        if (RuntimeEnvironment.getInstance().isPrintProgress()) {
          log.log(Level.INFO, "Counting files in {0} ...", dir);
          file_cnt = indexDown(sourceRoot, dir, true, 0, 0);
          if (log.isLoggable(Level.INFO)) {
            log.log(Level.INFO, "Need to process: {0} files for {1}",
                new Object[] {file_cnt, dir});
          }
        }

        indexDown(sourceRoot, dir, false, 0, file_cnt);

        while (uidIter != null
            && uidIter.term() != null
            && uidIter.term().utf8ToString().startsWith(startuid)) {
          removeFile();
          uidIter.next();
        }
      } finally {
        reader.close();
      }
    }
  } finally {
    if (writer != null) {
      try {
        writer.prepareCommit();
        writer.commit();
        writer.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing writer", e);
      }
    }

    if (ctags != null) {
      try {
        ctags.close();
      } catch (IOException e) {
        log.log(Level.WARNING, "An error occurred while closing ctags process", e);
      }
    }

    synchronized (lock) {
      running = false;
    }
  }

  if (!isInterrupted() && isDirty()) {
    if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) {
      optimize();
    }
    createSpellingSuggestions();
    RuntimeEnvironment env = RuntimeEnvironment.getInstance();
    File timestamp = new File(env.getDataRootFile(), "timestamp");
    if (timestamp.exists()) {
      if (!timestamp.setLastModified(System.currentTimeMillis())) {
        log.log(
            Level.WARNING,
            "Failed to set last modified time on ''{0}'', used for timestamping the index database.",
            timestamp.getAbsolutePath());
      }
    } else {
      if (!timestamp.createNewFile()) {
        log.log(
            Level.WARNING,
            "Failed to create file ''{0}'', used for timestamping the index database.",
            timestamp.getAbsolutePath());
      }
    }
  }
}
public void testSortedTermsEnum() throws IOException {
  Directory directory = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
  iwconfig.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);

  Document doc = new Document();
  doc.add(new StringField("field", "hello", Field.Store.NO));
  iwriter.addDocument(doc);

  doc = new Document();
  doc.add(new StringField("field", "world", Field.Store.NO));
  iwriter.addDocument(doc);

  doc = new Document();
  doc.add(new StringField("field", "beer", Field.Store.NO));
  iwriter.addDocument(doc);
  iwriter.forceMerge(1);

  DirectoryReader ireader = iwriter.getReader();
  iwriter.close();

  AtomicReader ar = getOnlySegmentReader(ireader);
  SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field");
  assertEquals(3, dv.getValueCount());

  TermsEnum termsEnum = dv.termsEnum();

  // next()
  assertEquals("beer", termsEnum.next().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertEquals("hello", termsEnum.next().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertEquals("world", termsEnum.next().utf8ToString());
  assertEquals(2, termsEnum.ord());

  // seekCeil()
  assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!")));
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer")));
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz")));

  // seekExact()
  assertTrue(termsEnum.seekExact(new BytesRef("beer"), true));
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  assertTrue(termsEnum.seekExact(new BytesRef("hello"), true));
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  assertTrue(termsEnum.seekExact(new BytesRef("world"), true));
  assertEquals("world", termsEnum.term().utf8ToString());
  assertEquals(2, termsEnum.ord());
  assertFalse(termsEnum.seekExact(new BytesRef("bogus"), true));

  // seek(ord)
  termsEnum.seekExact(0);
  assertEquals("beer", termsEnum.term().utf8ToString());
  assertEquals(0, termsEnum.ord());
  termsEnum.seekExact(1);
  assertEquals("hello", termsEnum.term().utf8ToString());
  assertEquals(1, termsEnum.ord());
  termsEnum.seekExact(2);
  assertEquals("world", termsEnum.term().utf8ToString());
  assertEquals(2, termsEnum.ord());

  ireader.close();
  directory.close();
}
public SparseInstances readIndex(String indexPath, String destFile, int threshold)
    throws Exception {
  if (indexPath == null || destFile == null) {
    System.out.println("error: indexPath or destFile is null\n");
    return null;
  }
  DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
  Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);
  int capacity = (int) terms.size();
  HashMap<String, Integer> wordDict = new HashMap<>(capacity);
  capacity = capacity > 65535 ? 65535 : capacity;
  SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
  TermsEnum termsEnum = terms.iterator();
  int index = 0;
  BytesRef term = null;
  String strTerm = null;
  while ((term = termsEnum.next()) != null) {
    strTerm = term.utf8ToString(); // decode as UTF-8; BytesRef.toString() would print raw bytes
    if (termsEnum.totalTermFreq() < threshold) {
      continue;
    }
    if (strTerm.isEmpty()) {
      continue;
    }
    if (wordDict.get(strTerm) != null) {
      continue;
    }
    instData.addAttribute(strTerm);
    wordDict.put(strTerm, index); // record the attribute index so the lookup below can find it
    index++;
  }
  int numAtt = instData.numAttributes();
  int numInst = instData.numInstances();
  Integer attIndex = null;
  String id = null;
  int termIndex = 0;
  for (int docIndex = 0; docIndex < numInst; docIndex++) {
    id = reader.document(docIndex).getField(idKey).stringValue();
    Terms docTerms = reader.getTermVector(docIndex, reviewKey);
    if (docTerms == null) {
      continue;
    }
    int[] indices = new int[(int) docTerms.size()];
    double[] attValues = new double[(int) docTerms.size()];
    termsEnum = docTerms.iterator();
    termIndex = 0;
    while ((term = termsEnum.next()) != null) {
      strTerm = term.utf8ToString();
      attIndex = wordDict.get(strTerm);
      if (attIndex == null) {
        continue;
      }
      indices[termIndex] = attIndex.intValue();
      attValues[termIndex] = termsEnum.totalTermFreq();
      termIndex++; // advance so each matched term gets its own slot
    }
    ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
    instData.addInstance(instance);
  }
  return instData; // return the populated data set rather than null
}
public static void verifyEquals(Fields d1, Fields d2) throws IOException {
  if (d1 == null) {
    assertTrue(d2 == null || d2.size() == 0);
    return;
  }
  assertTrue(d2 != null);

  Iterator<String> fieldsEnum2 = d2.iterator();

  for (String field1 : d1) {
    String field2 = fieldsEnum2.next();
    assertEquals(field1, field2);

    Terms terms1 = d1.terms(field1);
    assertNotNull(terms1);
    TermsEnum termsEnum1 = terms1.iterator(null);

    Terms terms2 = d2.terms(field2);
    assertNotNull(terms2);
    TermsEnum termsEnum2 = terms2.iterator(null);

    DocsAndPositionsEnum dpEnum1 = null;
    DocsAndPositionsEnum dpEnum2 = null;
    DocsEnum dEnum1 = null;
    DocsEnum dEnum2 = null;

    BytesRef term1;
    while ((term1 = termsEnum1.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertEquals(term1, term2);
      assertEquals(termsEnum1.totalTermFreq(), termsEnum2.totalTermFreq());

      dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1);
      dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2);
      if (dpEnum1 != null) {
        assertNotNull(dpEnum2);
        int docID1 = dpEnum1.nextDoc();
        dpEnum2.nextDoc();
        // docIDs are not supposed to be equal
        // int docID2 = dpEnum2.nextDoc();
        // assertEquals(docID1, docID2);
        assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

        int freq1 = dpEnum1.freq();
        int freq2 = dpEnum2.freq();
        assertEquals(freq1, freq2);
        OffsetAttribute offsetAtt1 =
            dpEnum1.attributes().hasAttribute(OffsetAttribute.class)
                ? dpEnum1.attributes().getAttribute(OffsetAttribute.class)
                : null;
        OffsetAttribute offsetAtt2 =
            dpEnum2.attributes().hasAttribute(OffsetAttribute.class)
                ? dpEnum2.attributes().getAttribute(OffsetAttribute.class)
                : null;

        if (offsetAtt1 != null) {
          assertNotNull(offsetAtt2);
        } else {
          assertNull(offsetAtt2);
        }

        for (int posUpto = 0; posUpto < freq1; posUpto++) {
          int pos1 = dpEnum1.nextPosition();
          int pos2 = dpEnum2.nextPosition();
          assertEquals(pos1, pos2);
          if (offsetAtt1 != null) {
            assertEquals(offsetAtt1.startOffset(), offsetAtt2.startOffset());
            assertEquals(offsetAtt1.endOffset(), offsetAtt2.endOffset());
          }
        }
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc());
      } else {
        dEnum1 = TestUtil.docs(random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
        dEnum2 = TestUtil.docs(random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
        assertNotNull(dEnum1);
        assertNotNull(dEnum2);
        int docID1 = dEnum1.nextDoc();
        dEnum2.nextDoc();
        // docIDs are not supposed to be equal
        // int docID2 = dEnum2.nextDoc();
        // assertEquals(docID1, docID2);
        assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
        int freq1 = dEnum1.freq();
        int freq2 = dEnum2.freq();
        assertEquals(freq1, freq2);
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum1.nextDoc());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum2.nextDoc());
      }
    }
    assertNull(termsEnum2.next());
  }
  assertFalse(fieldsEnum2.hasNext());
}
public void testMerge() throws IOException {
  final Codec codec = Codec.getDefault();
  final SegmentInfo si =
      new SegmentInfo(
          mergedDir,
          Version.LATEST,
          mergedSegment,
          -1,
          false,
          codec,
          Collections.emptyMap(),
          StringHelper.randomId(),
          new HashMap<>());

  SegmentMerger merger =
      new SegmentMerger(
          Arrays.<CodecReader>asList(reader1, reader2),
          si,
          InfoStream.getDefault(),
          mergedDir,
          new FieldInfos.FieldNumbers(),
          newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
  MergeState mergeState = merger.merge();
  int docsMerged = mergeState.segmentInfo.maxDoc();
  assertTrue(docsMerged == 2);
  // Should be able to open a new SegmentReader against the new directory
  SegmentReader mergedReader =
      new SegmentReader(
          new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L),
          newIOContext(random()));
  assertTrue(mergedReader != null);
  assertTrue(mergedReader.numDocs() == 2);
  Document newDoc1 = mergedReader.document(0);
  assertTrue(newDoc1 != null);
  // There are 2 unstored fields on the document
  assertTrue(
      DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - DocHelper.unstored.size());
  Document newDoc2 = mergedReader.document(1);
  assertTrue(newDoc2 != null);
  assertTrue(
      DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - DocHelper.unstored.size());

  PostingsEnum termDocs =
      TestUtil.docs(
          random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
  assertTrue(termDocs != null);
  assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  int tvCount = 0;
  for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
    if (fieldInfo.hasVectors()) {
      tvCount++;
    }
  }

  // System.out.println("stored size: " + stored.size());
  assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

  Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
  assertNotNull(vector);
  assertEquals(3, vector.size());
  TermsEnum termsEnum = vector.iterator();
  int i = 0;
  while (termsEnum.next() != null) {
    String term = termsEnum.term().utf8ToString();
    int freq = (int) termsEnum.totalTermFreq();
    // System.out.println("Term: " + term + " Freq: " + freq);
    assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
    assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
    i++;
  }

  TestSegmentReader.checkNorms(mergedReader);
  mergedReader.close();
}
/**
 * Reconstruct document fields.
 *
 * @param docNum document number. If this document is deleted, but the index is not optimized yet,
 *     the reconstruction process may still yield the reconstructed field content even from
 *     deleted documents.
 * @return reconstructed document
 * @throws Exception
 */
public Reconstructed reconstruct(int docNum) throws Exception {
  if (docNum < 0 || docNum >= reader.maxDoc()) { // maxDoc() itself is already out of range
    throw new Exception("Document number outside of valid range.");
  }
  Reconstructed res = new Reconstructed();
  if (deleted != null && deleted.get(docNum)) {
    throw new Exception("Document is deleted.");
  } else {
    Document doc = reader.document(docNum);
    for (int i = 0; i < fieldNames.length; i++) {
      Field[] fs = doc.getFields(fieldNames[i]);
      if (fs != null && fs.length > 0) {
        res.getStoredFields().put(fieldNames[i], fs);
      }
    }
  }
  // collect values from unstored fields
  HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
  // try to use term vectors if available
  progress.maxValue = fieldNames.length;
  progress.curValue = 0;
  progress.minValue = 0;
  for (int i = 0; i < fieldNames.length; i++) {
    TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
    if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
      TermPositionVector tpv = (TermPositionVector) tvf;
      progress.message = "Reading term vectors ...";
      progress.curValue = i;
      setChanged();
      notifyObservers(progress);
      BytesRef[] tv = tpv.getTerms();
      for (int k = 0; k < tv.length; k++) {
        // do we have positions?
        int[] posArr = tpv.getTermPositions(k);
        if (posArr == null) {
          // only offsets
          TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
          if (offsets.length == 0) {
            continue;
          }
          // convert offsets into positions
          posArr = convertOffsets(offsets);
        }
        GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
        if (gsa == null) {
          gsa = new GrowableStringArray();
          res.getReconstructedFields().put(fieldNames[i], gsa);
        }
        for (int m = 0; m < posArr.length; m++) {
          gsa.append(posArr[m], "|", tv[k].utf8ToString());
        }
      }
      fields.remove(fieldNames[i]); // got what we wanted
    }
  }
  // this loop collects data only from left-over fields
  // not yet collected through term vectors
  progress.maxValue = fields.size();
  progress.curValue = 0;
  progress.minValue = 0;
  for (String fld : fields) {
    progress.message = "Collecting terms in " + fld + " ...";
    progress.curValue++;
    setChanged();
    notifyObservers(progress);
    Terms terms = MultiFields.getTerms(reader, fld);
    if (terms == null) {
      // no terms in this field
      continue;
    }
    TermsEnum te = terms.iterator();
    while (te.next() != null) {
      DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
      if (dpe == null) {
        // no position info for this field
        break;
      }
      int num = dpe.advance(docNum);
      if (num != docNum) {
        // either greater than docNum or NO_MORE_DOCS
        continue; // no data for this term in this doc
      }
      String term = te.term().utf8ToString();
      GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
      if (gsa == null) {
        gsa = new GrowableStringArray();
        res.getReconstructedFields().put(fld, gsa);
      }
      for (int k = 0; k < dpe.freq(); k++) {
        int pos = dpe.nextPosition();
        gsa.append(pos, "|", term);
      }
    }
  }
  progress.message = "Done.";
  progress.curValue = 100;
  setChanged();
  notifyObservers(progress);
  return res;
}
public void verifyEquals(DirectoryReader r1, DirectoryReader r2, String idField) throws Throwable {
  if (VERBOSE) {
    System.out.println("\nr1 docs:");
    printDocs(r1);
    System.out.println("\nr2 docs:");
    printDocs(r2);
  }
  if (r1.numDocs() != r2.numDocs()) {
    assert false : "r1.numDocs()=" + r1.numDocs() + " vs r2.numDocs()=" + r2.numDocs();
  }
  boolean hasDeletes = !(r1.maxDoc() == r2.maxDoc() && r1.numDocs() == r1.maxDoc());

  int[] r2r1 = new int[r2.maxDoc()]; // r2 id to r1 id mapping

  // create mapping from r2's doc id space to r1's, based on idField
  final Fields f1 = MultiFields.getFields(r1);
  if (f1 == null) {
    // make sure r2 is empty
    assertNull(MultiFields.getFields(r2));
    return;
  }
  final Terms terms1 = f1.terms(idField);
  if (terms1 == null) {
    assertTrue(
        MultiFields.getFields(r2) == null || MultiFields.getFields(r2).terms(idField) == null);
    return;
  }
  final TermsEnum termsEnum = terms1.iterator(null);

  final Bits liveDocs1 = MultiFields.getLiveDocs(r1);
  final Bits liveDocs2 = MultiFields.getLiveDocs(r2);

  Fields fields = MultiFields.getFields(r2);
  if (fields == null) {
    // make sure r1 is in fact empty (eg has only all
    // deleted docs):
    Bits liveDocs = MultiFields.getLiveDocs(r1);
    DocsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, liveDocs, docs, DocsEnum.FLAG_NONE);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        fail("r1 is not empty but r2 is");
      }
    }
    return;
  }
  Terms terms2 = fields.terms(idField);
  TermsEnum termsEnum2 = terms2.iterator(null);

  DocsEnum termDocs1 = null;
  DocsEnum termDocs2 = null;

  while (true) {
    BytesRef term = termsEnum.next();
    // System.out.println("TEST: match id term=" + term);
    if (term == null) {
      break;
    }

    termDocs1 = TestUtil.docs(random(), termsEnum, liveDocs1, termDocs1, DocsEnum.FLAG_NONE);
    if (termsEnum2.seekExact(term)) {
      termDocs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, termDocs2, DocsEnum.FLAG_NONE);
    } else {
      termDocs2 = null;
    }

    if (termDocs1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
      // This doc is deleted and wasn't replaced
      assertTrue(termDocs2 == null || termDocs2.nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
      continue;
    }

    int id1 = termDocs1.docID();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs1.nextDoc());

    assertTrue(termDocs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int id2 = termDocs2.docID();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs2.nextDoc());

    r2r1[id2] = id1;

    // verify stored fields are equivalent
    try {
      verifyEquals(r1.document(id1), r2.document(id2));
    } catch (Throwable t) {
      System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
      System.out.println("  d1=" + r1.document(id1));
      System.out.println("  d2=" + r2.document(id2));
      throw t;
    }

    try {
      // verify term vectors are equivalent
      verifyEquals(r1.getTermVectors(id1), r2.getTermVectors(id2));
    } catch (Throwable e) {
      System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
      Fields tv1 = r1.getTermVectors(id1);
      System.out.println("  d1=" + tv1);
      if (tv1 != null) {
        DocsAndPositionsEnum dpEnum = null;
        DocsEnum dEnum = null;
        for (String field : tv1) {
          System.out.println("    " + field + ":");
          Terms terms3 = tv1.terms(field);
          assertNotNull(terms3);
          TermsEnum termsEnum3 = terms3.iterator(null);
          BytesRef term2;
          while ((term2 = termsEnum3.next()) != null) {
            System.out.println(
                "      " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
            dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
            if (dpEnum != null) {
              assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
              final int freq = dpEnum.freq();
              System.out.println("        doc=" + dpEnum.docID() + " freq=" + freq);
              for (int posUpto = 0; posUpto < freq; posUpto++) {
                System.out.println("          pos=" + dpEnum.nextPosition());
              }
            } else {
              dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS);
              assertNotNull(dEnum);
              assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
              final int freq = dEnum.freq();
              System.out.println("        doc=" + dEnum.docID() + " freq=" + freq);
            }
          }
        }
      }

      Fields tv2 = r2.getTermVectors(id2);
      System.out.println("  d2=" + tv2);
      if (tv2 != null) {
        DocsAndPositionsEnum dpEnum = null;
        DocsEnum dEnum = null;
        for (String field : tv2) {
          System.out.println("    " + field + ":");
          Terms terms3 = tv2.terms(field);
          assertNotNull(terms3);
          TermsEnum termsEnum3 = terms3.iterator(null);
          BytesRef term2;
          while ((term2 = termsEnum3.next()) != null) {
            System.out.println(
                "      " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq());
            dpEnum = termsEnum3.docsAndPositions(null, dpEnum);
            if (dpEnum != null) {
              assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
              final int freq = dpEnum.freq();
              System.out.println("        doc=" + dpEnum.docID() + " freq=" + freq);
              for (int posUpto = 0; posUpto < freq; posUpto++) {
                System.out.println("          pos=" + dpEnum.nextPosition());
              }
            } else {
              dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS);
              assertNotNull(dEnum);
              assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
              final int freq = dEnum.freq();
              System.out.println("        doc=" + dEnum.docID() + " freq=" + freq);
            }
          }
        }
      }

      throw e;
    }
  }

  // System.out.println("TEST: done match id");

  // Verify postings
  // System.out.println("TEST: create te1");
  final Fields fields1 = MultiFields.getFields(r1);
  final Iterator<String> fields1Enum = fields1.iterator();
  final Fields fields2 = MultiFields.getFields(r2);
  final Iterator<String> fields2Enum = fields2.iterator();

  String field1 = null, field2 = null;
  TermsEnum termsEnum1 = null;
  termsEnum2 = null;
  DocsEnum docs1 = null, docs2 = null;

  // pack both doc and freq into single element for easy sorting
  long[] info1 = new long[r1.numDocs()];
  long[] info2 = new long[r2.numDocs()];

  for (; ; ) {
    BytesRef term1 = null, term2 = null;

    // iterate until we get some docs
    int len1;
    for (; ; ) {
      len1 = 0;
      if (termsEnum1 == null) {
        if (!fields1Enum.hasNext()) {
          break;
        }
        field1 = fields1Enum.next();
        Terms terms = fields1.terms(field1);
        if (terms == null) {
          continue;
        }
        termsEnum1 = terms.iterator(null);
      }
      term1 = termsEnum1.next();
      if (term1 == null) {
        // no more terms in this field
        termsEnum1 = null;
        continue;
      }

      // System.out.println("TEST: term1=" + term1);
      docs1 = TestUtil.docs(random(), termsEnum1, liveDocs1, docs1, DocsEnum.FLAG_FREQS);
      while (docs1.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int d = docs1.docID();
        int f = docs1.freq();
        info1[len1] = (((long) d) << 32) | f;
        len1++;
      }
      if (len1 > 0) break;
    }

    // iterate until we get some docs
    int len2;
    for (; ; ) {
      len2 = 0;
      if (termsEnum2 == null) {
        if (!fields2Enum.hasNext()) {
          break;
        }
        field2 = fields2Enum.next();
        Terms terms = fields2.terms(field2);
        if (terms == null) {
          continue;
        }
        termsEnum2 = terms.iterator(null);
      }
      term2 = termsEnum2.next();
      if (term2 == null) {
        // no more terms in this field
        termsEnum2 = null;
        continue;
      }

      // System.out.println("TEST: term2=" + term2);
      docs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, docs2, DocsEnum.FLAG_FREQS);
      while (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int d = r2r1[docs2.docID()];
        int f = docs2.freq();
        info2[len2] = (((long) d) << 32) | f;
        len2++;
      }
      if (len2 > 0) break;
    }

    assertEquals(len1, len2);
    if (len1 == 0) break; // no more terms

    assertEquals(field1, field2);
    assertTrue(term1.bytesEquals(term2));

    if (!hasDeletes) assertEquals(termsEnum1.docFreq(), termsEnum2.docFreq());

    assertEquals("len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes, term1, term2);

    // sort info2 to get it into ascending docid
    Arrays.sort(info2, 0, len2);

    // now compare
    for (int i = 0; i < len1; i++) {
      assertEquals(
          "i=" + i + " len=" + len1
              + " d1=" + (info1[i] >>> 32) + " f1=" + (info1[i] & Integer.MAX_VALUE)
              + " d2=" + (info2[i] >>> 32) + " f2=" + (info2[i] & Integer.MAX_VALUE)
              + " field=" + field1 + " term=" + term1.utf8ToString(),
          info1[i],
          info2[i]);
    }
  }
}