/**
 * Selects the global ordinals whose term falls into this instance's hash partition.
 *
 * <p>Each term is hashed with {@code StringHelper.murmurhash3_x86_32} using
 * {@code HASH_PARTITIONING_SEED}. The ordinal is accepted when that hash, reduced with
 * {@link Math#floorMod} by {@code incNumPartitions}, equals {@code incZeroBasedPartition}.
 *
 * @param globalOrdinals the ordinals whose terms are enumerated
 * @return a bit set sized to {@code globalOrdinals.getValueCount()} with one bit set per
 *     accepted ordinal
 * @throws IOException propagated from the terms enumeration
 */
@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
  final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());
  final TermsEnum allTerms = globalOrdinals.termsEnum();
  for (BytesRef current = allTerms.next(); current != null; current = allTerms.next()) {
    final int hash = StringHelper.murmurhash3_x86_32(current, HASH_PARTITIONING_SEED);
    // floorMod keeps the partition index non-negative even for negative hashes.
    if (Math.floorMod(hash, incNumPartitions) == incZeroBasedPartition) {
      accepted.set(allTerms.ord());
    }
  }
  return accepted;
}
/**
 * Indexes 36 one-character terms, 36 terms prefixed with "m" and 36 terms prefixed with "mo",
 * force-merges to one segment, then checks term/ord lookups in both directions.
 */
public void testThreeBlocks() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  List<String> terms = new ArrayList<>();

  // One pass per prefix; each pass adds one document per suffix character.
  final String[] prefixes = {"", "m", "mo"};
  for (String prefix : prefixes) {
    for (int i = 0; i < 36; i++) {
      String term = prefix + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      Document doc = new Document();
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
  }
  w.forceMerge(1);

  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  if (VERBOSE) {
    while (te.next() != null) {
      System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
    }
  }

  // Term -> ord.
  assertTrue(te.seekExact(new BytesRef("mo")));
  assertEquals(27, te.ord());

  // Ord -> term.
  te.seekExact(90);
  assertEquals(new BytesRef("s"), te.term());

  testEnum(te, terms);

  r.close();
  w.close();
  dir.close();
}
/**
 * Utility function to display a term vector.
 *
 * <p>Prints the vocabulary size followed by one line per term containing the term's ordinal,
 * its text, its total term frequency and, when positions are available, each position.
 *
 * @param terms the terms to display; when {@code null} or when {@code terms.size()} is -1, a
 *     "not stored" message is printed instead
 * @throws IOException propagated from the term and postings enumeration
 */
static void termVectorDisplay(Terms terms) throws IOException {
  if ((terms == null) || (terms.size() == -1)) {
    System.out.println(" The field is not stored.");
    return;
  }

  /*
   * The terms for this field are stored.
   */
  System.out.println(" Vocabulary size: " + terms.size() + " terms");

  /*
   * Iterate over the terms in this document.
   * Information about a term's occurrences (tf and
   * positions) is accessed via the indexing API, which
   * returns inverted lists that describe (only) the
   * current document.
   */
  TermsEnum ithTerm = terms.iterator(null);
  while (ithTerm.next() != null) {
    // Read once: the value is used both for the summary and as the position count.
    final long termFreq = ithTerm.totalTermFreq();

    System.out.format(
        " %10d %-20s %d ", ithTerm.ord(), ithTerm.term().utf8ToString(), termFreq);

    // docsAndPositions returns null when positions were not indexed; in that case only the
    // summary columns are printed instead of failing with a NullPointerException.
    DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
    if (currDoc != null) {
      currDoc.nextDoc();
      for (int jthPosition = 0; jthPosition < termFreq; jthPosition++) {
        System.out.print(currDoc.nextPosition() + " ");
      }
    }

    System.out.println();
  }
}
public void testSeekCeilNotFound() throws Exception { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); Document doc = new Document(); // Get empty string in there! doc.add(newStringField("field", "", Field.Store.NO)); w.addDocument(doc); for (int i = 0; i < 36; i++) { doc = new Document(); String term = "" + (char) (97 + i); String term2 = "a" + (char) (97 + i); doc.add(newTextField("field", term + " " + term2, Field.Store.NO)); w.addDocument(doc); } w.forceMerge(1); IndexReader r = w.getReader(); TermsEnum te = MultiFields.getTerms(r, "field").iterator(null); assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22}))); assertEquals("a", te.term().utf8ToString()); assertEquals(1L, te.ord()); r.close(); w.close(); dir.close(); }
/**
 * Resolves {@code key} to an ordinal by calling {@code seekCeil} on the terms enum.
 *
 * @param key the term to look up
 * @return the enum's ordinal when the seek reports FOUND; {@code -ord - 1} for the enum's
 *     ordinal when it reports NOT_FOUND; {@code -numTerms() - 1} otherwise (END)
 * @throws RuntimeException wrapping any {@link IOException} raised by the terms enum
 */
@Override
public long lookupTerm(BytesRef key) {
  try {
    final long result;
    switch (te.seekCeil(key)) {
      case FOUND: {
        final long exactOrd = te.ord();
        assert exactOrd >= 0;
        result = exactOrd;
        break;
      }
      case NOT_FOUND: {
        final long ceilOrd = te.ord();
        assert ceilOrd >= 0;
        result = -ceilOrd - 1;
        break;
      }
      default: /* END */
        result = -numTerms() - 1;
        break;
    }
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Indexes 36 one-character terms and 128 terms of the form "m" + char, force-merges, then
 * checks that iteration yields consecutive ords starting at 0 before delegating to
 * {@code testEnum}.
 */
public void testNonRootFloorBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  List<String> terms = new ArrayList<>();

  for (int i = 0; i < 36; i++) {
    String term = String.valueOf((char) (97 + i));
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    Document doc = new Document();
    doc.add(newTextField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }

  for (int i = 0; i < 128; i++) {
    String term = "m" + (char) i;
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    Document doc = new Document();
    doc.add(newStringField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);

  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  // Walk every term and verify the reported ord counts up from zero.
  int expectedOrd = 0;
  for (BytesRef term = te.next(); term != null; term = te.next()) {
    if (VERBOSE) {
      System.out.println("TEST: " + te.ord() + ": " + term.utf8ToString());
    }
    assertEquals(expectedOrd, te.ord());
    expectedOrd++;
  }

  testEnum(te, terms);

  r.close();
  w.close();
  dir.close();
}
/** Computes which global ordinals are accepted by this IncludeExclude instance. */ @Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException { LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); TermsEnum globalTermsEnum; Terms globalTerms = new DocValuesTerms(globalOrdinals); // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can // avoid i/o and just set bits. globalTermsEnum = compiled.getTermsEnum(globalTerms); for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { acceptedGlobalOrdinals.set(globalTermsEnum.ord()); } return acceptedGlobalOrdinals; }
/**
 * Sorts {@code terms} in place and checks that {@code te} maps between each term and its index
 * in the sorted list: first every ord from last to first, then at least 1000 random lookups
 * that seek either by ord or by term.
 *
 * @param te the enum under test
 * @param terms the indexed terms; sorted in place by this method
 * @throws IOException propagated from the terms enum
 */
private void testEnum(TermsEnum te, List<String> terms) throws IOException {
  Collections.sort(terms);

  // Sequential pass, from the highest ord down to zero.
  int ord = terms.size();
  while (--ord >= 0) {
    if (VERBOSE) {
      System.out.println("TEST: seek to ord=" + ord);
    }
    te.seekExact(ord);
    assertEquals(ord, te.ord());
    assertEquals(terms.get(ord), te.term().utf8ToString());
  }

  // Random pass: seek one way, verify the other.
  final int iters = atLeast(1000);
  for (int iter = 0; iter < iters; iter++) {
    final int target = random().nextInt(terms.size());
    final String expectedTerm = terms.get(target);
    if (random().nextBoolean()) {
      te.seekExact(target);
      assertEquals(expectedTerm, te.term().utf8ToString());
    } else {
      te.seekExact(new BytesRef(expectedTerm));
      assertEquals(target, te.ord());
    }
  }
}
/**
 * Indexes 128 one-character terms (chars 0 through 127), force-merges, then checks that "a",
 * "b" and "z" map to ords 97, 98 and 122.
 */
public void testFloorBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);

  for (int i = 0; i < 128; i++) {
    String term = String.valueOf((char) i);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    Document doc = new Document();
    doc.add(newStringField("field", term, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);

  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  if (VERBOSE) {
    for (BytesRef term = te.next(); term != null; term = te.next()) {
      System.out.println(" " + te.ord() + ": " + term.utf8ToString());
    }
  }

  // Term -> ord.
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(97, te.ord());

  // Ord -> term.
  te.seekExact(98);
  assertEquals(new BytesRef("b"), te.term());

  assertTrue(te.seekExact(new BytesRef("z")));
  assertEquals(122, te.ord());

  r.close();
  w.close();
  dir.close();
}
/**
 * Indexes 30 x 30 two-character terms into field "body", force-merges, then checks that
 * iteration returns them in generation order with ords 0 through 899 before delegating to
 * {@code testEnum}.
 */
public void testSeveralNonRootBlocks() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter w = new IndexWriter(dir, iwc);
  List<String> terms = new ArrayList<>();

  for (int first = 0; first < 30; first++) {
    for (int second = 0; second < 30; second++) {
      String term = "" + (char) (97 + first) + (char) (97 + second);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("term=" + term);
      }
      Document doc = new Document();
      doc.add(newTextField("body", term, Field.Store.NO));
      w.addDocument(doc);
    }
  }
  w.forceMerge(1);

  IndexReader r = DirectoryReader.open(w, true);
  TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);

  // Flattened form of the i/j loops: the expected ord is 30 * i + j.
  for (int expectedOrd = 0; expectedOrd < 30 * 30; expectedOrd++) {
    String term = "" + (char) (97 + expectedOrd / 30) + (char) (97 + expectedOrd % 30);
    if (VERBOSE) {
      System.out.println("TEST: check term=" + term);
    }
    assertEquals(term, te.next().utf8ToString());
    assertEquals(expectedOrd, te.ord());
  }

  testEnum(te, terms);

  te.seekExact(0);
  assertEquals("aa", te.term().utf8ToString());

  r.close();
  w.close();
  dir.close();
}
public void testBasic() throws Exception { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); Document doc = new Document(); doc.add(newTextField("field", "a b c", Field.Store.NO)); w.addDocument(doc); IndexReader r = w.getReader(); TermsEnum te = MultiFields.getTerms(r, "field").iterator(null); // Test next() assertEquals(new BytesRef("a"), te.next()); assertEquals(0L, te.ord()); assertEquals(new BytesRef("b"), te.next()); assertEquals(1L, te.ord()); assertEquals(new BytesRef("c"), te.next()); assertEquals(2L, te.ord()); assertNull(te.next()); // Test seekExact by term assertTrue(te.seekExact(new BytesRef("b"))); assertEquals(1, te.ord()); assertTrue(te.seekExact(new BytesRef("a"))); assertEquals(0, te.ord()); assertTrue(te.seekExact(new BytesRef("c"))); assertEquals(2, te.ord()); // Test seekExact by ord te.seekExact(1); assertEquals(new BytesRef("b"), te.term()); te.seekExact(0); assertEquals(new BytesRef("a"), te.term()); te.seekExact(2); assertEquals(new BytesRef("c"), te.term()); r.close(); w.close(); dir.close(); }
@Override protected void doSetNextReader(LeafReaderContext context) throws IOException { if (segmentFacetCounts != null) { segmentResults.add(createSegmentResult()); } groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField); facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField); facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount(); if (facetFieldNumTerms == 0) { facetOrdTermsEnum = null; } else { facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum(); } // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet // field segmentFacetCounts = new int[facetFieldNumTerms + 1]; segmentTotalCount = 0; segmentGroupedFacetHits.clear(); for (GroupedFacetHit groupedFacetHit : groupedFacetHits) { int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue); if (groupedFacetHit.groupValue != null && groupOrd < 0) { continue; } int facetOrd; if (groupedFacetHit.facetValue != null) { if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) { continue; } facetOrd = (int) facetOrdTermsEnum.ord(); } else { facetOrd = facetFieldNumTerms; } // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not // containing facet field int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd; segmentGroupedFacetHits.put(segmentGroupedFacetsIndex); } if (facetPrefix != null) { TermsEnum.SeekStatus seekStatus; if (facetOrdTermsEnum != null) { seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix); } else { seekStatus = TermsEnum.SeekStatus.END; } if (seekStatus != TermsEnum.SeekStatus.END) { startFacetOrd = (int) facetOrdTermsEnum.ord(); } else { startFacetOrd = 0; endFacetOrd = 0; return; } BytesRefBuilder facetEndPrefix = new BytesRefBuilder(); facetEndPrefix.append(facetPrefix); facetEndPrefix.append(UnicodeUtil.BIG_TERM); seekStatus = 
facetOrdTermsEnum.seekCeil(facetEndPrefix.get()); if (seekStatus != TermsEnum.SeekStatus.END) { endFacetOrd = (int) facetOrdTermsEnum.ord(); } else { endFacetOrd = facetFieldNumTerms; // Don't include null... } } else { startFacetOrd = 0; endFacetOrd = facetFieldNumTerms + 1; } }
/**
 * Indexes 36 one-character terms and 36 terms prefixed with "m", force-merges, then checks
 * term/ord lookups: two fixed probes, every ord from last to first, and at least 1000 random
 * seeks by ord or by term.
 */
public void testTwoBlocks() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  List<String> terms = new ArrayList<>();

  // One pass per prefix; each pass adds one document per suffix character.
  final String[] prefixes = {"", "m"};
  for (String prefix : prefixes) {
    for (int i = 0; i < 36; i++) {
      String term = prefix + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      Document doc = new Document();
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
  }

  if (VERBOSE) {
    System.out.println("TEST: now forceMerge");
  }
  w.forceMerge(1);

  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

  // Term -> ord.
  assertTrue(te.seekExact(new BytesRef("mo")));
  assertEquals(27, te.ord());

  // Ord -> term.
  te.seekExact(54);
  assertEquals(new BytesRef("s"), te.term());

  Collections.sort(terms);

  // Sequential pass, from the highest ord down to zero.
  int descending = terms.size();
  while (--descending >= 0) {
    te.seekExact(descending);
    assertEquals(descending, te.ord());
    assertEquals(terms.get(descending), te.term().utf8ToString());
  }

  // Random pass: whichever way the seek is issued, both ord and term must match.
  final int iters = atLeast(1000);
  for (int iter = 0; iter < iters; iter++) {
    final int ord = random().nextInt(terms.size());
    final BytesRef term = new BytesRef(terms.get(ord));
    final boolean seekByOrd = random().nextBoolean();
    if (seekByOrd) {
      if (VERBOSE) {
        System.out.println("TEST: iter=" + iter + " seek to ord=" + ord + " of " + terms.size());
      }
      te.seekExact(ord);
    } else {
      if (VERBOSE) {
        System.out.println(
            "TEST: iter="
                + iter
                + " seek to term="
                + terms.get(ord)
                + " ord="
                + ord
                + " of "
                + terms.size());
      }
      te.seekExact(term);
    }
    assertEquals(ord, te.ord());
    assertEquals(term, te.term());
  }

  r.close();
  w.close();
  dir.close();
}