/* * Utility function to display a term vector. */ static void termVectorDisplay(Terms terms) throws IOException { if ((terms == null) || (terms.size() == -1)) System.out.println(" The field is not stored."); else { /* * The terms for this field are stored. */ System.out.println(" Vocabulary size: " + terms.size() + " terms"); TermsEnum ithTerm = terms.iterator(null); /* * Iterate over the terms in this document. * Information about a term's occurrences (tf and * positions) is accessed via the indexing API, which * returns inverted lists that describe (only) the * current document. */ while (ithTerm.next() != null) { System.out.format( " %10d %-20s %d ", ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq()); DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null); currDoc.nextDoc(); for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) System.out.print(currDoc.nextPosition() + " "); System.out.println(); } ; } ; }
/* * listPostings displays the first n postings for a term in a * field in an index (specified by reader). Set n to MAX_VALUE * to display all postings. */ static void listPostings(IndexReader reader, String termString, String field, Integer n) throws IOException { System.out.println("\nPostings: " + termString + " " + field); /* * Prepare to access the index. */ BytesRef termBytes = new BytesRef(termString); Term term = new Term(field, termBytes); Bits liveDocs = MultiFields.getLiveDocs(reader); /* * Lookup the collection term frequency (ctf). */ long df = reader.docFreq(term); System.out.println("\tdf: " + df); long ctf = reader.totalTermFreq(term); System.out.println("\tctf: " + ctf); if (df < 1) return; /* * Lookup the inverted list. */ DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes); /* * Iterate through the first n postings. */ long count = 0; while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) { System.out.println("\tdocid: " + postings.docID()); int tf = postings.freq(); System.out.println("\ttf: " + tf); System.out.print("\tPositions: "); for (int j = 0; j < tf; j++) { int pos = postings.nextPosition(); System.out.print(pos + " "); } System.out.println(""); count++; } ; return; }
public static void verifyEquals(Fields d1, Fields d2) throws IOException { if (d1 == null) { assertTrue(d2 == null || d2.size() == 0); return; } assertTrue(d2 != null); Iterator<String> fieldsEnum2 = d2.iterator(); for (String field1 : d1) { String field2 = fieldsEnum2.next(); assertEquals(field1, field2); Terms terms1 = d1.terms(field1); assertNotNull(terms1); TermsEnum termsEnum1 = terms1.iterator(null); Terms terms2 = d2.terms(field2); assertNotNull(terms2); TermsEnum termsEnum2 = terms2.iterator(null); DocsAndPositionsEnum dpEnum1 = null; DocsAndPositionsEnum dpEnum2 = null; DocsEnum dEnum1 = null; DocsEnum dEnum2 = null; BytesRef term1; while ((term1 = termsEnum1.next()) != null) { BytesRef term2 = termsEnum2.next(); assertEquals(term1, term2); assertEquals(termsEnum1.totalTermFreq(), termsEnum2.totalTermFreq()); dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1); dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2); if (dpEnum1 != null) { assertNotNull(dpEnum2); int docID1 = dpEnum1.nextDoc(); dpEnum2.nextDoc(); // docIDs are not supposed to be equal // int docID2 = dpEnum2.nextDoc(); // assertEquals(docID1, docID2); assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS); int freq1 = dpEnum1.freq(); int freq2 = dpEnum2.freq(); assertEquals(freq1, freq2); OffsetAttribute offsetAtt1 = dpEnum1.attributes().hasAttribute(OffsetAttribute.class) ? dpEnum1.attributes().getAttribute(OffsetAttribute.class) : null; OffsetAttribute offsetAtt2 = dpEnum2.attributes().hasAttribute(OffsetAttribute.class) ? dpEnum2.attributes().getAttribute(OffsetAttribute.class) : null; if (offsetAtt1 != null) { assertNotNull(offsetAtt2); } else { assertNull(offsetAtt2); } for (int posUpto = 0; posUpto < freq1; posUpto++) { int pos1 = dpEnum1.nextPosition(); int pos2 = dpEnum2.nextPosition(); assertEquals(pos1, pos2); if (offsetAtt1 != null) { assertEquals(offsetAtt1.startOffset(), offsetAtt2.startOffset()); assertEquals(offsetAtt1.endOffset(), offsetAtt2.endOffset()); } } assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc()); assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc()); } else { dEnum1 = TestUtil.docs(random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS); dEnum2 = TestUtil.docs(random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS); assertNotNull(dEnum1); assertNotNull(dEnum2); int docID1 = dEnum1.nextDoc(); dEnum2.nextDoc(); // docIDs are not supposed to be equal // int docID2 = dEnum2.nextDoc(); // assertEquals(docID1, docID2); assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS); int freq1 = dEnum1.freq(); int freq2 = dEnum2.freq(); assertEquals(freq1, freq2); assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum1.nextDoc()); assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum2.nextDoc()); } } assertNull(termsEnum2.next()); } assertFalse(fieldsEnum2.hasNext()); }
public void verifyEquals(DirectoryReader r1, DirectoryReader r2, String idField) throws Throwable { if (VERBOSE) { System.out.println("\nr1 docs:"); printDocs(r1); System.out.println("\nr2 docs:"); printDocs(r2); } if (r1.numDocs() != r2.numDocs()) { assert false : "r1.numDocs()=" + r1.numDocs() + " vs r2.numDocs()=" + r2.numDocs(); } boolean hasDeletes = !(r1.maxDoc() == r2.maxDoc() && r1.numDocs() == r1.maxDoc()); int[] r2r1 = new int[r2.maxDoc()]; // r2 id to r1 id mapping // create mapping from id2 space to id2 based on idField final Fields f1 = MultiFields.getFields(r1); if (f1 == null) { // make sure r2 is empty assertNull(MultiFields.getFields(r2)); return; } final Terms terms1 = f1.terms(idField); if (terms1 == null) { assertTrue( MultiFields.getFields(r2) == null || MultiFields.getFields(r2).terms(idField) == null); return; } final TermsEnum termsEnum = terms1.iterator(null); final Bits liveDocs1 = MultiFields.getLiveDocs(r1); final Bits liveDocs2 = MultiFields.getLiveDocs(r2); Fields fields = MultiFields.getFields(r2); if (fields == null) { // make sure r1 is in fact empty (eg has only all // deleted docs): Bits liveDocs = MultiFields.getLiveDocs(r1); DocsEnum docs = null; while (termsEnum.next() != null) { docs = TestUtil.docs(random(), termsEnum, liveDocs, docs, DocsEnum.FLAG_NONE); while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { fail("r1 is not empty but r2 is"); } } return; } Terms terms2 = fields.terms(idField); TermsEnum termsEnum2 = terms2.iterator(null); DocsEnum termDocs1 = null; DocsEnum termDocs2 = null; while (true) { BytesRef term = termsEnum.next(); // System.out.println("TEST: match id term=" + term); if (term == null) { break; } termDocs1 = TestUtil.docs(random(), termsEnum, liveDocs1, termDocs1, DocsEnum.FLAG_NONE); if (termsEnum2.seekExact(term)) { termDocs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, termDocs2, DocsEnum.FLAG_NONE); } else { termDocs2 = null; } if (termDocs1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { // This doc is deleted and wasn't replaced assertTrue(termDocs2 == null || termDocs2.nextDoc() == DocIdSetIterator.NO_MORE_DOCS); continue; } int id1 = termDocs1.docID(); assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs1.nextDoc()); assertTrue(termDocs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); int id2 = termDocs2.docID(); assertEquals(DocIdSetIterator.NO_MORE_DOCS, termDocs2.nextDoc()); r2r1[id2] = id1; // verify stored fields are equivalent try { verifyEquals(r1.document(id1), r2.document(id2)); } catch (Throwable t) { System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term); System.out.println(" d1=" + r1.document(id1)); System.out.println(" d2=" + r2.document(id2)); throw t; } try { // verify term vectors are equivalent verifyEquals(r1.getTermVectors(id1), r2.getTermVectors(id2)); } catch (Throwable e) { System.out.println("FAILED id=" + term + " id1=" + id1 + " id2=" + id2); Fields tv1 = r1.getTermVectors(id1); System.out.println(" d1=" + tv1); if (tv1 != null) { DocsAndPositionsEnum dpEnum = null; DocsEnum dEnum = null; for (String field : tv1) { System.out.println(" " + field + ":"); Terms terms3 = tv1.terms(field); assertNotNull(terms3); TermsEnum termsEnum3 = terms3.iterator(null); BytesRef term2; while ((term2 = termsEnum3.next()) != null) { System.out.println( " " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq()); dpEnum = termsEnum3.docsAndPositions(null, dpEnum); if (dpEnum != null) { assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); final int freq = dpEnum.freq(); System.out.println(" doc=" + dpEnum.docID() + " freq=" + freq); for (int posUpto = 0; posUpto < freq; posUpto++) { System.out.println(" pos=" + dpEnum.nextPosition()); } } else { dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS); assertNotNull(dEnum); assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); final int freq = dEnum.freq(); System.out.println(" doc=" + dEnum.docID() + " freq=" + freq); } } } } Fields tv2 = r2.getTermVectors(id2); System.out.println(" d2=" + tv2); if (tv2 != null) { DocsAndPositionsEnum dpEnum = null; DocsEnum dEnum = null; for (String field : tv2) { System.out.println(" " + field + ":"); Terms terms3 = tv2.terms(field); assertNotNull(terms3); TermsEnum termsEnum3 = terms3.iterator(null); BytesRef term2; while ((term2 = termsEnum3.next()) != null) { System.out.println( " " + term2.utf8ToString() + ": freq=" + termsEnum3.totalTermFreq()); dpEnum = termsEnum3.docsAndPositions(null, dpEnum); if (dpEnum != null) { assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); final int freq = dpEnum.freq(); System.out.println(" doc=" + dpEnum.docID() + " freq=" + freq); for (int posUpto = 0; posUpto < freq; posUpto++) { System.out.println(" pos=" + dpEnum.nextPosition()); } } else { dEnum = TestUtil.docs(random(), termsEnum3, null, dEnum, DocsEnum.FLAG_FREQS); assertNotNull(dEnum); assertTrue(dEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); final int freq = dEnum.freq(); System.out.println(" doc=" + dEnum.docID() + " freq=" + freq); } } } } throw e; } } // System.out.println("TEST: done match id"); // Verify postings // System.out.println("TEST: create te1"); final Fields fields1 = MultiFields.getFields(r1); final Iterator<String> fields1Enum = fields1.iterator(); final Fields fields2 = MultiFields.getFields(r2); final Iterator<String> fields2Enum = fields2.iterator(); String field1 = null, field2 = null; TermsEnum termsEnum1 = null; termsEnum2 = null; DocsEnum docs1 = null, docs2 = null; // pack both doc and freq into single element for easy sorting long[] info1 = new long[r1.numDocs()]; long[] info2 = new long[r2.numDocs()]; for (; ; ) { BytesRef term1 = null, term2 = null; // iterate until we get some docs int len1; for (; ; ) { len1 = 0; if (termsEnum1 == null) { if (!fields1Enum.hasNext()) { break; } field1 = fields1Enum.next(); Terms terms = fields1.terms(field1); if (terms == null) { continue; } termsEnum1 = terms.iterator(null); } term1 = termsEnum1.next(); if (term1 == null) { // no more terms in this field termsEnum1 = null; continue; } // System.out.println("TEST: term1=" + term1); docs1 = TestUtil.docs(random(), termsEnum1, liveDocs1, docs1, DocsEnum.FLAG_FREQS); while (docs1.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int d = docs1.docID(); int f = docs1.freq(); info1[len1] = (((long) d) << 32) | f; len1++; } if (len1 > 0) break; } // iterate until we get some docs int len2; for (; ; ) { len2 = 0; if (termsEnum2 == null) { if (!fields2Enum.hasNext()) { break; } field2 = fields2Enum.next(); Terms terms = fields2.terms(field2); if (terms == null) { continue; } termsEnum2 = terms.iterator(null); } term2 = termsEnum2.next(); if (term2 == null) { // no more terms in this field termsEnum2 = null; continue; } // System.out.println("TEST: term1=" + term1); docs2 = TestUtil.docs(random(), termsEnum2, liveDocs2, docs2, DocsEnum.FLAG_FREQS); while (docs2.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { int d = r2r1[docs2.docID()]; int f = docs2.freq(); info2[len2] = (((long) d) << 32) | f; len2++; } if (len2 > 0) break; } assertEquals(len1, len2); if (len1 == 0) break; // no more terms assertEquals(field1, field2); assertTrue(term1.bytesEquals(term2)); if (!hasDeletes) assertEquals(termsEnum1.docFreq(), termsEnum2.docFreq()); assertEquals("len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes, term1, term2); // sort info2 to get it into ascending docid Arrays.sort(info2, 0, len2); // now compare for (int i = 0; i < len1; i++) { assertEquals( "i=" + i + " len=" + len1 + " d1=" + (info1[i] >>> 32) + " f1=" + (info1[i] & Integer.MAX_VALUE) + " d2=" + (info2[i] >>> 32) + " f2=" + (info2[i] & Integer.MAX_VALUE) + " field=" + field1 + " term=" + term1.utf8ToString(), info1[i], info2[i]); } } }