@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
  final long numOrds = globalOrdinals.getValueCount();
  final LongBitSet acceptedGlobalOrdinals = new LongBitSet(numOrds);
  final TermsEnum termEnum = globalOrdinals.termsEnum();

  BytesRef term = termEnum.next();
  while (term != null) {
    if (Math.floorMod(
            StringHelper.murmurhash3_x86_32(term, HASH_PARTITIONING_SEED), incNumPartitions)
        == incZeroBasedPartition) {
      acceptedGlobalOrdinals.set(termEnum.ord());
    }
    term = termEnum.next();
  }
  return acceptedGlobalOrdinals;
}
@Override
public boolean accept(BytesRef value) {
  return Math.floorMod(
          StringHelper.murmurhash3_x86_32(value, HASH_PARTITIONING_SEED), incNumPartitions)
      == incZeroBasedPartition;
}
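Both overloads above apply the same rule: a term belongs to partition p when floorMod(murmurhash3(term), numPartitions) == p. Below is a minimal, self-contained sketch of that check; the class name, seed constant, and fields are illustrative, not the production constants from the snippets above.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

/** Minimal sketch: stable hash partitioning of terms, assuming an arbitrary seed. */
class TermPartitioner {
  private static final int SEED = 0x9747b28c; // illustrative seed, not the production constant
  private final int numPartitions;
  private final int partition; // zero-based partition this instance accepts

  TermPartitioner(int numPartitions, int partition) {
    this.numPartitions = numPartitions;
    this.partition = partition;
  }

  /** True if the term falls into this instance's partition. */
  boolean accept(BytesRef term) {
    // floorMod keeps the result non-negative even when the hash is negative
    return Math.floorMod(StringHelper.murmurhash3_x86_32(term, SEED), numPartitions) == partition;
  }
}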
@Override public int nextPosition() throws IOException { final int pos; if (readPositions) { SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + POS.length, scratch.length - POS.length, scratchUTF16_2); pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); } else { pos = -1; } if (readOffsets) { SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + START_OFFSET.length, scratch.length - START_OFFSET.length, scratchUTF16_2); startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + END_OFFSET.length, scratch.length - END_OFFSET.length, scratchUTF16_2); endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); } final long fp = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, PAYLOAD)) { final int len = scratch.length - PAYLOAD.length; if (scratch2.bytes.length < len) { scratch2.grow(len); } System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len); scratch2.length = len; payload = scratch2; } else { payload = null; in.seek(fp); } return pos; }
@Override
public int nextDoc() throws IOException {
  if (docID == NO_MORE_DOCS) {
    return docID;
  }
  boolean first = true;
  int termFreq = 0;
  while (true) {
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    if (StringHelper.startsWith(scratch, DOC)) {
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart);
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      termFreq = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16);
      termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip termFreq++;
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      assert StringHelper.startsWith(scratch, TERM)
              || StringHelper.startsWith(scratch, FIELD)
              || StringHelper.startsWith(scratch, END)
          : "scratch=" + scratch.utf8ToString();
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart);
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
private void getPrefixTerms(
    ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
  // into one terms instance, which is very expensive. Therefore I think it is better to iterate
  // over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }

      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}
/**
 * The termCompare method in FuzzyTermEnum uses Levenshtein distance to calculate the distance
 * between the given term and the comparing term.
 *
 * <p>If the minSimilarity is >= 1.0, this uses the maxEdits as the comparison. Otherwise,
 * this method uses the following logic to calculate similarity.
 *
 * <pre>
 * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
 * </pre>
 *
 * where distance is the Levenshtein distance for the two words.
 */
@Override
protected final AcceptStatus accept(BytesRef term) {
  if (StringHelper.startsWith(term, prefixBytesRef)) {
    utf32.copyUTF8Bytes(term);
    final int distance =
        calcDistance(utf32.ints(), realPrefixLength, utf32.length() - realPrefixLength);

    // Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
    if (distance == Integer.MIN_VALUE) {
      return AcceptStatus.NO;
    }
    // no need to calc similarity, if raw is true and distance > maxEdits
    if (raw == true && distance > maxEdits) {
      return AcceptStatus.NO;
    }
    final float similarity =
        calcSimilarity(distance, (utf32.length() - realPrefixLength), text.length);

    // if raw is true, then distance must also be <= maxEdits by now
    // given the previous if statement
    if (raw == true || (raw == false && similarity > minSimilarity)) {
      boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
      return AcceptStatus.YES;
    } else {
      return AcceptStatus.NO;
    }
  } else {
    return AcceptStatus.END;
  }
}
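As a small illustration of the similarity formula quoted in the Javadoc above (not the enum's actual helper; the method name and values are illustrative): with no common prefix, "kitten" (length 6) versus "sitting" (length 7) have Levenshtein distance 3, giving similarity 1 - 3 / min(6, 7) = 0.5.

/** Sketch of the similarity formula from the Javadoc above. */
static float similarity(int distance, int prefixLength, int textLen, int targetLen) {
  return 1.0f - ((float) distance / (float) (prefixLength + Math.min(textLen, targetLen)));
}

// Example: Levenshtein("kitten", "sitting") == 3, no prefix stripped:
// similarity(3, 0, 6, 7) == 0.5f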
private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor) throws IOException { readLine(); assert StringHelper.startsWith(scratch.get(), VALUE); if (type == TYPE_STRING) { byte[] bytes = new byte[scratch.length() - VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length); visitor.stringField(fieldInfo, bytes); } else if (type == TYPE_BINARY) { byte[] copy = new byte[scratch.length() - VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length); visitor.binaryField(fieldInfo, copy); } else if (type == TYPE_INT) { scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length); visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString())); } else if (type == TYPE_LONG) { scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length); visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString())); } else if (type == TYPE_FLOAT) { scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length); visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString())); } else if (type == TYPE_DOUBLE) { scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length); visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString())); } }
/** Test attributes map */
public void testAttributes() throws Exception {
  Directory dir = newDirectory();
  Codec codec = getCodec();
  byte id[] = StringHelper.randomId();
  Map<String, String> attributes = new HashMap<>();
  attributes.put("key1", "value1");
  attributes.put("key2", "value2");
  SegmentInfo info =
      new SegmentInfo(
          dir,
          getVersions()[0],
          "_123",
          1,
          false,
          codec,
          Collections.emptyMap(),
          id,
          attributes,
          null);
  info.setFiles(Collections.<String>emptySet());
  codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
  SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
  assertEquals(attributes, info2.getAttributes());

  // attributes map should be immutable
  expectThrows(
      UnsupportedOperationException.class,
      () -> {
        info2.getAttributes().put("bogus", "bogus");
      });

  dir.close();
}
@Override
public int nextDoc() throws IOException {
  boolean first = true;
  in.seek(nextDocStart);
  long posStart = 0;
  while (true) {
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    // System.out.println("NEXT DOC: " + scratch.utf8ToString());
    if (StringHelper.startsWith(scratch, DOC)) {
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart;
        in.seek(posStart);
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      tf = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16);
      tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      posStart = in.getFilePointer();
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      assert StringHelper.startsWith(scratch, TERM)
          || StringHelper.startsWith(scratch, FIELD)
          || StringHelper.startsWith(scratch, END);
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart;
        in.seek(posStart);
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
@Override public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException { in.seek(offsets[n]); while (true) { readLine(); if (StringHelper.startsWith(scratch.get(), FIELD) == false) { break; } int fieldNumber = parseIntAt(FIELD.length); FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); readLine(); assert StringHelper.startsWith(scratch.get(), NAME); readLine(); assert StringHelper.startsWith(scratch.get(), TYPE); final BytesRef type; if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) { type = TYPE_STRING; } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) { type = TYPE_BINARY; } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) { type = TYPE_INT; } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) { type = TYPE_LONG; } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) { type = TYPE_FLOAT; } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) { type = TYPE_DOUBLE; } else { throw new RuntimeException("unknown field type"); } switch (visitor.needsField(fieldInfo)) { case YES: readField(type, fieldInfo, visitor); break; case NO: readLine(); assert StringHelper.startsWith(scratch.get(), VALUE); break; case STOP: return; } } }
/** initialize native resources */
public static void initializeNatives(
    Path tmpFile, boolean mlockAll, boolean seccomp, boolean ctrlHandler) {
  final ESLogger logger = Loggers.getLogger(Bootstrap.class);

  // check if the user is running as root, and bail
  if (Natives.definitelyRunningAsRoot()) {
    if (Boolean.parseBoolean(System.getProperty("es.insecure.allow.root"))) {
      logger.warn("running as ROOT user. this is a bad idea!");
    } else {
      throw new RuntimeException("don't run elasticsearch as root.");
    }
  }

  // enable secure computing mode
  if (seccomp) {
    Natives.trySeccomp(tmpFile);
  }

  // mlockall if requested
  if (mlockAll) {
    if (Constants.WINDOWS) {
      Natives.tryVirtualLock();
    } else {
      Natives.tryMlockall();
    }
  }

  // listener for windows close event
  if (ctrlHandler) {
    Natives.addConsoleCtrlHandler(
        new ConsoleCtrlHandler() {
          @Override
          public boolean handle(int code) {
            if (CTRL_CLOSE_EVENT == code) {
              logger.info("running graceful exit on windows");
              try {
                Bootstrap.stop();
              } catch (IOException e) {
                throw new ElasticsearchException("failed to stop node", e);
              }
              return true;
            }
            return false;
          }
        });
  }

  // force remainder of JNA to be loaded (if available).
  try {
    JNAKernel32Library.getInstance();
  } catch (Throwable ignored) {
    // we've already logged this.
  }

  // init lucene random seed. it will use /dev/urandom where available:
  StringHelper.randomId();
}
private BytesRef setTerm() throws IOException {
  term = termsEnum.term();
  // System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix ==
  // null ? "null" : prefix.utf8ToString()));
  if (prefix != null && !StringHelper.startsWith(term, prefix)) {
    term = null;
  }
  return term;
}
private void writeField() throws IOException {
  // remember where this field is written
  currentField.tvfPointer = tvf.getFilePointer();
  // System.out.println("Field Pointer: " + currentField.tvfPointer);

  final int size = terms.size();
  tvf.writeVInt(size);

  boolean storePositions = currentField.storePositions;
  boolean storeOffsets = currentField.storeOffsets;
  byte bits = 0x0;
  if (storePositions) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
  if (storeOffsets) bits |= STORE_OFFSET_WITH_TERMVECTOR;
  tvf.writeByte(bits);

  String lastTermText = "";
  for (int i = 0; i < size; i++) {
    TVTerm term = (TVTerm) terms.elementAt(i);
    int start = StringHelper.stringDifference(lastTermText, term.termText);
    int length = term.termText.length() - start;
    tvf.writeVInt(start); // write shared prefix length
    tvf.writeVInt(length); // write delta length
    tvf.writeChars(term.termText, start, length); // write delta chars
    tvf.writeVInt(term.freq);
    lastTermText = term.termText;

    if (storePositions) {
      if (term.positions == null)
        throw new IllegalStateException("Trying to write positions that are null!");

      // use delta encoding for positions
      int position = 0;
      for (int j = 0; j < term.freq; j++) {
        tvf.writeVInt(term.positions[j] - position);
        position = term.positions[j];
      }
    }

    if (storeOffsets) {
      if (term.offsets == null)
        throw new IllegalStateException("Trying to write offsets that are null!");

      // use delta encoding for offsets
      int position = 0;
      for (int j = 0; j < term.freq; j++) {
        tvf.writeVInt(term.offsets[j].getStartOffset() - position);
        tvf.writeVInt(
            term.offsets[j].getEndOffset()
                - term.offsets[j].getStartOffset()); // Save the diff between the two.
        position = term.offsets[j].getEndOffset();
      }
    }
  }
}
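The shared-prefix ("front coding") idea used above can be shown in isolation: each term is stored as the length of the prefix it shares with the previous term plus the remaining suffix. The class, record, and method names below are hypothetical and are not the .tvf on-disk format; they only illustrate the role StringHelper.stringDifference plays.

import java.util.ArrayList;
import java.util.List;

/** Hypothetical front-coding sketch: encode a sorted term list as (sharedPrefixLen, suffix). */
final class FrontCodingSketch {
  record Entry(int sharedPrefix, String suffix) {}

  static List<Entry> encode(List<String> sortedTerms) {
    List<Entry> out = new ArrayList<>();
    String last = "";
    for (String term : sortedTerms) {
      // same role as StringHelper.stringDifference: position of the first differing char
      int shared = commonPrefixLength(last, term);
      out.add(new Entry(shared, term.substring(shared)));
      last = term;
    }
    return out;
  }

  static int commonPrefixLength(String a, String b) {
    int i = 0, max = Math.min(a.length(), b.length());
    while (i < max && a.charAt(i) == b.charAt(i)) i++;
    return i;
  }
}
// encode(List.of("apple", "applet", "apply")) -> [(0,"apple"), (5,"t"), (4,"y")]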
@Override
public boolean isPrefixOf(Cell c) {
  NRCell otherCell = (NRCell) c;
  assert term != otherCell.term;
  // trick to re-use bytesref; provided that we re-instate it
  int myLastLen = term.length;
  term.length = termLenByLevel[getLevel()];
  int otherLastLen = otherCell.term.length;
  otherCell.term.length = termLenByLevel[otherCell.getLevel()];
  boolean answer = StringHelper.startsWith(otherCell.term, term);
  term.length = myLastLen;
  otherCell.term.length = otherLastLen;
  return answer;
}
// we don't actually write a .fdx-like index, instead we read the
// stored fields file in entirety up-front and save the offsets
// so we can seek to the documents later.
private void readIndex(int size) throws IOException {
  ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
  offsets = new long[size];
  int upto = 0;
  while (!scratch.get().equals(END)) {
    SimpleTextUtil.readLine(input, scratch);
    if (StringHelper.startsWith(scratch.get(), DOC)) {
      offsets[upto] = input.getFilePointer();
      upto++;
    }
  }
  SimpleTextUtil.checkFooter(input);
  assert upto == offsets.length;
}
/** * Returns true if the term matches the automaton. Also stashes away the term to assist with smart * enumeration. */ @Override protected AcceptStatus accept(final BytesRef term) { if (commonSuffixRef == null || StringHelper.endsWith(term, commonSuffixRef)) { if (runAutomaton.run(term.bytes, term.offset, term.length)) return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; else return (linear && termComp.compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } else { return (linear && termComp.compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } }
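For context on the accept() method above, here is a hedged sketch of the compiled-automaton check it relies on: a regular expression is compiled once to a ByteRunAutomaton, and each candidate term's UTF-8 bytes are run through it. The pattern, term, and class name are illustrative only.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

/** Illustrative only: compile a regexp once, then test candidate terms byte-by-byte. */
public class RunAutomatonSketch {
  public static void main(String[] args) {
    ByteRunAutomaton runAutomaton = new ByteRunAutomaton(new RegExp("fo.*").toAutomaton());
    BytesRef term = new BytesRef("foobar");
    // Same call shape as runAutomaton.run(term.bytes, term.offset, term.length) above.
    System.out.println(runAutomaton.run(term.bytes, term.offset, term.length)); // prints true
  }
}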
@Override public String next() throws IOException { while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END)) { current = null; return null; } if (StringHelper.startsWith(scratch, FIELD)) { return current = new String( scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8"); } } }
/** * Test segment infos read that hits exception on close make sure we get our exception back, no * file handle leaks, etc. */ public void testExceptionOnCloseInput() throws Exception { Failure fail = new Failure() { @Override public void eval(MockDirectoryWrapper dir) throws IOException { for (StackTraceElement e : Thread.currentThread().getStackTrace()) { if (doFail && "close".equals(e.getMethodName())) { throw new FakeIOException(); } } } }; MockDirectoryWrapper dir = newMockDirectory(); dir.failOn(fail); Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo( dir, getVersions()[0], "_123", 1, false, codec, Collections.<String, String>emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.<String>emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); fail.setDoFail(); expectThrows( FakeIOException.class, () -> { codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); }); fail.clearDoFail(); dir.close(); }
/** Test files map */ public void testFiles() throws Exception { Directory dir = newDirectory(); Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo( dir, getVersions()[0], "_123", 1, false, codec, Collections.<String, String>emptyMap(), id, new HashMap<>(), null); info.setFiles(Collections.<String>emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(info.files(), info2.files()); dir.close(); }
/** Test sort */ public void testSort() throws IOException { assumeTrue("test requires a codec that can read/write index sort", supportsIndexSort()); final int iters = atLeast(5); for (int i = 0; i < iters; ++i) { Sort sort; if (i == 0) { sort = null; } else { final int numSortFields = TestUtil.nextInt(random(), 1, 3); SortField[] sortFields = new SortField[numSortFields]; for (int j = 0; j < numSortFields; ++j) { sortFields[j] = randomIndexSortField(); } sort = new Sort(sortFields); } Directory dir = newDirectory(); Codec codec = getCodec(); byte id[] = StringHelper.randomId(); SegmentInfo info = new SegmentInfo( dir, getVersions()[0], "_123", 1, false, codec, Collections.<String, String>emptyMap(), id, new HashMap<>(), sort); info.setFiles(Collections.<String>emptySet()); codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT); assertEquals(sort, info2.getIndexSort()); dir.close(); } }
@Override public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv) throws IOException { int prefixLength = prefix.length(); Terms terms = MultiFields.getTerms(reader, fieldName); if (terms != null) { Matcher matcher = pattern.matcher(""); try { TermsEnum termsEnum = terms.iterator(null); TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef); BytesRef text; if (status == TermsEnum.SeekStatus.FOUND) { text = prefixRef; } else if (status == TermsEnum.SeekStatus.NOT_FOUND) { text = termsEnum.term(); } else { text = null; } while (text != null) { if (text != null && StringHelper.startsWith(text, prefixRef)) { String textString = text.utf8ToString(); matcher.reset(textString.substring(prefixLength)); if (matcher.matches()) { mtv.visitMatchingTerm(new Term(fieldName, textString)); } } else { break; } text = termsEnum.next(); } } finally { matcher.reset(); } } }
/** Tests SI writer adds itself to files... */
public void testAddsSelfToFiles() throws Exception {
  Directory dir = newDirectory();
  Codec codec = getCodec();
  byte id[] = StringHelper.randomId();
  SegmentInfo info =
      new SegmentInfo(
          dir,
          getVersions()[0],
          "_123",
          1,
          false,
          codec,
          Collections.<String, String>emptyMap(),
          id,
          new HashMap<>(),
          null);
  Set<String> originalFiles = Collections.singleton("_123.a");
  info.setFiles(originalFiles);
  codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);

  Set<String> modifiedFiles = info.files();
  assertTrue(modifiedFiles.containsAll(originalFiles));
  assertTrue(
      "did you forget to add yourself to files()", modifiedFiles.size() > originalFiles.size());

  SegmentInfo info2 = codec.segmentInfoFormat().read(dir, "_123", id, IOContext.DEFAULT);
  assertEquals(info.files(), info2.files());

  // files set should be immutable
  expectThrows(
      UnsupportedOperationException.class,
      () -> {
        info2.files().add("bogus");
      });

  dir.close();
}
/** * Returns a list of terms in the specified field along with the corresponding count of documents * in the set that match that constraint. This method uses the FilterCache to get the intersection * count between <code>docs</code> and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts( SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, String contains, boolean ignoreCase, SolrParams params) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); LeafReader r = searcher.getLeafReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef prefixTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); prefixTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (prefixTermBytes != null) { if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } PostingsEnum postingsEnum = null; CharsRefBuilder charsRef = new CharsRefBuilder(); if (docs.size() >= mincount) { while (term != null) { if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break; if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) { int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). 
if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; } c = searcher.numDocs(docs, deState); postingsEnum = deState.postingsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it // matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class // impl) // TODO: would passing deleted docs lead to better efficiency over checking the // fastForRandomSet? postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); c = 0; if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
public void testMerge() throws IOException {
  final Codec codec = Codec.getDefault();
  final SegmentInfo si =
      new SegmentInfo(
          mergedDir,
          Version.LATEST,
          mergedSegment,
          -1,
          false,
          codec,
          Collections.emptyMap(),
          StringHelper.randomId(),
          new HashMap<>());

  SegmentMerger merger =
      new SegmentMerger(
          Arrays.<CodecReader>asList(reader1, reader2),
          si,
          InfoStream.getDefault(),
          mergedDir,
          new FieldInfos.FieldNumbers(),
          newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
  MergeState mergeState = merger.merge();
  int docsMerged = mergeState.segmentInfo.maxDoc();
  assertTrue(docsMerged == 2);

  // Should be able to open a new SegmentReader against the new directory
  SegmentReader mergedReader =
      new SegmentReader(
          new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L), newIOContext(random()));
  assertTrue(mergedReader != null);
  assertTrue(mergedReader.numDocs() == 2);

  Document newDoc1 = mergedReader.document(0);
  assertTrue(newDoc1 != null);
  // There are 2 unstored fields on the document
  assertTrue(
      DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - DocHelper.unstored.size());
  Document newDoc2 = mergedReader.document(1);
  assertTrue(newDoc2 != null);
  assertTrue(
      DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - DocHelper.unstored.size());

  PostingsEnum termDocs =
      TestUtil.docs(
          random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
  assertTrue(termDocs != null);
  assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  int tvCount = 0;
  for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
    if (fieldInfo.hasVectors()) {
      tvCount++;
    }
  }

  // System.out.println("stored size: " + stored.size());
  assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

  Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
  assertNotNull(vector);
  assertEquals(3, vector.size());
  TermsEnum termsEnum = vector.iterator();
  int i = 0;
  while (termsEnum.next() != null) {
    String term = termsEnum.term().utf8ToString();
    int freq = (int) termsEnum.totalTermFreq();
    // System.out.println("Term: " + term + " Freq: " + freq);
    assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
    assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
    i++;
  }

  TestSegmentReader.checkNorms(mergedReader);
  mergedReader.close();
}
public JSONWriter(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp) { super(writer, req, rsp); namedListStyle = StringHelper.intern(req.getParams().get(JSON_NL_STYLE, JSON_NL_FLAT)); wrapperFunction = req.getParams().get(JSON_WRAPPER_FUNCTION); }
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (params.getBool(TermsParams.TERMS, false)) { String lowerStr = params.get(TermsParams.TERMS_LOWER, null); String[] fields = params.getParams(TermsParams.TERMS_FIELD); if (fields != null && fields.length > 0) { NamedList terms = new SimpleOrderedMap(); rb.rsp.add("terms", terms); int limit = params.getInt(TermsParams.TERMS_LIMIT, 10); if (limit < 0) { limit = Integer.MAX_VALUE; } String upperStr = params.get(TermsParams.TERMS_UPPER); boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false); boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true); boolean sort = !TermsParams.TERMS_SORT_INDEX.equals( params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT)); int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); // initialize freqmin int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); // initialize freqmax if (freqmax < 0) { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); String regexp = params.get(TermsParams.TERMS_REGEXP_STR); Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; boolean raw = params.getBool(TermsParams.TERMS_RAW, false); for (int j = 0; j < fields.length; j++) { String field = StringHelper.intern(fields[j]); FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); // If no lower bound was specified, use the prefix String lower = lowerStr == null ? prefix : (raw ? lowerStr : ft.toInternal(lowerStr)); if (lower == null) lower = ""; String upper = upperStr == null ? null : (raw ? upperStr : ft.toInternal(upperStr)); Term lowerTerm = new Term(field, lower); Term upperTerm = upper == null ? null : new Term(field, upper); TermEnum termEnum = rb.req .getSearcher() .getReader() .terms(lowerTerm); // this will be positioned ready to go int i = 0; BoundedTreeSet<CountPair<String, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<String, Integer>>(limit) : null); NamedList fieldTerms = new NamedList(); terms.add(field, fieldTerms); Term lowerTestTerm = termEnum.term(); // Only advance the enum if we are excluding the lower bound and the lower Term actually // matches if (lowerTestTerm != null && lowerIncl == false && lowerTestTerm.field() == field // intern'd comparison && lowerTestTerm.text().equals(lower)) { termEnum.next(); } while (i < limit || sort) { Term theTerm = termEnum.term(); // check for a different field, or the end of the index. if (theTerm == null || field != theTerm.field()) // intern'd comparison break; String indexedText = theTerm.text(); // stop if the prefix doesn't match if (prefix != null && !indexedText.startsWith(prefix)) break; if (pattern != null && !pattern.matcher(indexedText).matches()) { termEnum.next(); continue; } if (upperTerm != null) { int upperCmp = theTerm.compareTo(upperTerm); // if we are past the upper term, or equal to it (when don't include upper) then stop. if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break; } // This is a good term in the range. Check if mincount/maxcount conditions are // satisfied. int docFreq = termEnum.docFreq(); if (docFreq >= freqmin && docFreq <= freqmax) { // add the term to the list String label = raw ? 
indexedText : ft.indexedToReadable(indexedText); if (sort) { queue.add(new CountPair<String, Integer>(label, docFreq)); } else { fieldTerms.add(label, docFreq); i++; } } termEnum.next(); } termEnum.close(); if (sort) { for (CountPair<String, Integer> item : queue) { if (i < limit) { fieldTerms.add(item.key, item.val); i++; } else { break; } } } } } else { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "No terms.fl parameter specified"); } } }
@Override public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context) throws IOException { String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION); final IndexInput in = dir.openInput(dataFile, context); BytesRefBuilder scratch = new BytesRefBuilder(); // first get to TOC: DecimalFormat df = new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT)); long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1; in.seek(pos); SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch.get(), TABLEPOS); long tablePos = -1; try { tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue(); } catch (ParseException e) { throw new CorruptIndexException( "can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in); } // seek to TOC and read it in.seek(tablePos); SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch.get(), TABLE); int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE)); final String fileNames[] = new String[numEntries]; final long startOffsets[] = new long[numEntries]; final long endOffsets[] = new long[numEntries]; for (int i = 0; i < numEntries; i++) { SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch.get(), TABLENAME); fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME)); if (i > 0) { // files must be unique and in sorted order assert fileNames[i].compareTo(fileNames[i - 1]) > 0; } SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch.get(), TABLESTART); startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART)); SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch.get(), TABLEEND); endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND)); } return new Directory() { private int getIndex(String name) throws IOException { int index = Arrays.binarySearch(fileNames, name); if (index < 0) { throw new FileNotFoundException( "No sub-file found (fileName=" + name + " files: " + Arrays.toString(fileNames) + ")"); } return index; } @Override public String[] listAll() throws IOException { ensureOpen(); return fileNames.clone(); } @Override public long fileLength(String name) throws IOException { ensureOpen(); int index = getIndex(name); return endOffsets[index] - startOffsets[index]; } @Override public IndexInput openInput(String name, IOContext context) throws IOException { ensureOpen(); int index = getIndex(name); return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]); } @Override public void close() throws IOException { in.close(); } // write methods: disabled @Override public IndexOutput createOutput(String name, IOContext context) { throw new UnsupportedOperationException(); } @Override public void sync(Collection<String> names) { throw new UnsupportedOperationException(); } @Override public void deleteFile(String name) { throw new UnsupportedOperationException(); } @Override public void renameFile(String source, String dest) { throw new UnsupportedOperationException(); } @Override public Lock makeLock(String name) { throw new UnsupportedOperationException(); } }; }
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(TermsParams.TERMS, false)) return; String[] fields = params.getParams(TermsParams.TERMS_FIELD); NamedList<Object> termsResult = new SimpleOrderedMap<>(); rb.rsp.add("terms", termsResult); if (fields == null || fields.length == 0) return; int limit = params.getInt(TermsParams.TERMS_LIMIT, 10); if (limit < 0) { limit = Integer.MAX_VALUE; } String lowerStr = params.get(TermsParams.TERMS_LOWER); String upperStr = params.get(TermsParams.TERMS_UPPER); boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false); boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true); boolean sort = !TermsParams.TERMS_SORT_INDEX.equals( params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT)); int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); if (freqmax < 0) { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); String regexp = params.get(TermsParams.TERMS_REGEXP_STR); Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; boolean raw = params.getBool(TermsParams.TERMS_RAW, false); final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader(); Fields lfields = indexReader.fields(); for (String field : fields) { NamedList<Integer> fieldTerms = new NamedList<>(); termsResult.add(field, fieldTerms); Terms terms = lfields == null ? null : lfields.terms(field); if (terms == null) { // no terms for this field continue; } FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); // prefix must currently be text BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix); BytesRef upperBytes = null; if (upperStr != null) { upperBytes = new BytesRef(); ft.readableToIndexed(upperStr, upperBytes); } BytesRef lowerBytes; if (lowerStr == null) { // If no lower bound was specified, use the prefix lowerBytes = prefixBytes; } else { lowerBytes = new BytesRef(); if (raw) { // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists // perhaps we detect if the FieldType is non-character and expect hex if so? lowerBytes = new BytesRef(lowerStr); } else { lowerBytes = new BytesRef(); ft.readableToIndexed(lowerStr, lowerBytes); } } TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; if (lowerBytes != null) { if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); // Only advance the enum if we are excluding the lower bound and the lower Term actually // matches if (lowerIncl == false && term.equals(lowerBytes)) { term = termsEnum.next(); } } } else { // position termsEnum on first term term = termsEnum.next(); } int i = 0; BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null); CharsRef external = new CharsRef(); while (term != null && (i < limit || sort)) { boolean externalized = false; // did we fill in "external" yet for this term? // stop if the prefix doesn't match if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break; if (pattern != null) { // indexed text or external text? // TODO: support "raw" mode? 
ft.indexedToReadable(term, external); externalized = true; if (!pattern.matcher(external).matches()) { term = termsEnum.next(); continue; } } if (upperBytes != null) { int upperCmp = term.compareTo(upperBytes); // if we are past the upper term, or equal to it (when don't include upper) then stop. if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break; } // This is a good term in the range. Check if mincount/maxcount conditions are satisfied. int docFreq = termsEnum.docFreq(); if (docFreq >= freqmin && docFreq <= freqmax) { // add the term to the list if (sort) { queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq)); } else { // TODO: handle raw somehow if (!externalized) { ft.indexedToReadable(term, external); } fieldTerms.add(external.toString(), docFreq); i++; } } term = termsEnum.next(); } if (sort) { for (CountPair<BytesRef, Integer> item : queue) { if (i >= limit) break; ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); i++; } } } }
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
/** Call this only once (if you subclass!) */ protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) { throw new IllegalStateException( "Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); } // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); final long startTime = System.nanoTime(); prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix); final int maxDoc = reader.maxDoc(); final int[] index = new int [maxDoc]; // immediate term numbers, or the index into the byte[] representing the last // number final int[] lastTerm = new int[maxDoc]; // last term we saw for this document final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) final Terms terms = reader.terms(field); if (terms == null) { // No terms return; } final TermsEnum te = terms.iterator(); final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); // System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // For our "term index wrapper" final List<BytesRef> indexedTerms = new ArrayList<>(); final PagedBytes indexedTermsBytes = new PagedBytes(15); // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. byte[] tempArr = new byte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in its byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. This requires going over the field first to find the most // frequent terms ahead of time. 
int termNum = 0; postingsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (; ; ) { final BytesRef t = te.term(); if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) { break; } // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); visitTerm(te, termNum); if ((termNum & indexIntervalMask) == 0) { // Index this term sizeOfIndexedStrings += t.length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.add(indexedTerm); } final int df = te.docFreq(); if (df <= maxTermDocFreq) { postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE); // dF, but takes deletions into account int actualDF = 0; for (; ; ) { int doc = postingsEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } // System.out.println(" chunk=" + chunk + " docs"); actualDF++; termInstances++; // System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val >>> 8; int ilen = vIntSize(delta); byte[] arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit // boundary. // TODO: figure out what array lengths we can round up to w/o actually using more // memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment byte[] newarr = new byte[newLen]; System.arraycopy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = writeInt(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } // System.out.println(" ipos=" + ipos); int endPos = writeInt(delta, tempArr, ipos); // System.out.println(" endpos=" + endPos); if (endPos <= 4) { // System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (byte) val; val >>>= 8; } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new byte[12]; } } } setActualDocFreq(termNum, actualDF); } termNum++; if (te.next() == null) { break; } } numTermsInField = termNum; long midPoint = System.nanoTime(); if (termInstances == 0) { // we didn't invert anything // lower memory consumption. 
tnums = null; } else { this.index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { byte[] target = tnums[pass]; int pos = 0; // end in target; if (target != null) { pos = target.length; } else { target = new byte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { // System.out.println(" pass="******" process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val >>> 8; // System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw new IllegalStateException( "Too many values for UnInvertedField faceting on field " + field); } byte[] arr = bytes[doc]; /* for(byte b : arr) { //System.out.println(" b=" + Integer.toHexString((int) b)); } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.length <= pos + len) { int newlen = target.length; /** * * we don't have to worry about the array getting too large since the "pos" param * will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { // * overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new * SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen * <= pos + len) newlen<<=1; // doubling strategy } ** */ while (newlen <= pos + len) newlen <<= 1; // doubling strategy byte[] newtarget = new byte[newlen]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } System.arraycopy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.length) { byte[] newtarget = new byte[pos]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } tnums[pass] = target; if ((pass << 16) > maxDoc) break; } } indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); long endTime = System.nanoTime(); total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS); phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS); }
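The comments in the uninvert pass above describe storing each document's term numbers as delta-encoded vInts, with a small offset reserved for sentinel values. Below is a minimal standalone sketch of that idea, assuming standard low-7-bits-first vInt encoding; the OFFSET constant, class, and method names are illustrative and are not the actual fields or byte layout of the class above.

import java.io.ByteArrayOutputStream;

/** Hypothetical sketch: delta + vInt encoding of an ascending list of term numbers. */
final class DeltaVIntSketch {
  private static final int OFFSET = 2; // reserve small values, as the TNUM_OFFSET comment describes

  static byte[] encode(int[] ascendingTermNums) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int last = 0;
    for (int termNum : ascendingTermNums) {
      int delta = termNum - last + OFFSET; // always >= OFFSET, so 0 and 1 stay reserved
      last = termNum;
      // vInt: 7 data bits per byte, high bit set on every byte except the last
      while ((delta & ~0x7F) != 0) {
        out.write((delta & 0x7F) | 0x80);
        delta >>>= 7;
      }
      out.write(delta);
    }
    out.write(0); // reserved terminator for the list
    return out.toByteArray();
  }
}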