/*
 * Go through all the term positions and try to move to the next document.
 * Any failure means we have no more.
 *
 * This can be used at initialisation and when moving away from an existing
 * match.
 *
 * This will set min, max, more and rootDoc.
 */
private void doNextOnAll() throws IOException {
    // Do the terms
    int current;
    boolean first = true;
    for (int i = 0, l = positions.length; i < l; i++) {
        if (positions[i].getCachingTermPositions() != null) {
            if (positions[i].getCachingTermPositions().next()) {
                current = positions[i].getCachingTermPositions().doc();
                adjustMinMax(current, first);
                first = false;
            } else {
                more = false;
                return;
            }
        }
    }

    // Do the root term - it must always exist, as the path could well have multiple entries.
    // If an entry in the index does not have a root terminal it is broken.
    if (root.next()) {
        rootDoc = root.doc();
    } else {
        more = false;
        return;
    }
    if (root.doc() < max) {
        if (root.skipTo(max)) {
            rootDoc = root.doc();
        } else {
            more = false;
            return;
        }
    }
}
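/*
 * A minimal sketch of the adjustMinMax helper used above (and by skipToMax
 * below); its real implementation is not shown here, so this is an assumption
 * based on how min, max and first are used. It tracks the lowest and highest
 * document ids seen across the term position enumerators, so the caller can
 * tell when they have all converged on the same document.
 */
private void adjustMinMax(int doc, boolean first) {
    if (first) {
        // First value seen - both bounds start here
        min = doc;
        max = doc;
    } else if (doc < min) {
        min = doc;
    } else if (doc > max) {
        max = doc;
    }
}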
protected void processPayload(Similarity similarity) throws IOException {
    if (positions.isPayloadAvailable()) {
        payload = positions.getPayload(payload, 0);
        payloadScore = function.currentScore(doc, term.field(),
                spans.start(), spans.end(), payloadsSeen, payloadScore,
                similarity.scorePayload(doc, term.field(), spans.start(),
                        spans.end(), payload, 0, positions.getPayloadLength()));
        payloadsSeen++;
    } else {
        // zero out the payload?
    }
}
/*
 * Try to skip all those term positions at documents less than the current
 * max up to that value. This is quite likely to fail and leave us with
 * (min != max), but that is OK; we try again.
 *
 * It is possible that max increases as we process terms; this is OK. We
 * just failed to skip to a given value of max and start doing the next.
 */
private void skipToMax() throws IOException {
    // Do the terms
    int current;
    for (int i = 0, l = positions.length; i < l; i++) {
        if (i == 0) {
            min = max;
        }
        if (positions[i].getCachingTermPositions() != null) {
            if (positions[i].getCachingTermPositions().doc() < max) {
                if (positions[i].getCachingTermPositions().skipTo(max)) {
                    current = positions[i].getCachingTermPositions().doc();
                    adjustMinMax(current, false);
                } else {
                    more = false;
                    return;
                }
            }
        }
    }

    // Do the root
    if (root.doc() < max) {
        if (root.skipTo(max)) {
            rootDoc = root.doc();
        } else {
            more = false;
            return;
        }
    }
}
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.search.Scorer#next()
 */
public boolean next() throws IOException {
    // If there is no filtering
    if (allContainers()) {
        // containers and roots must be in sync or the index is broken
        while (more) {
            if (containers.next() && root.next()) {
                if (check(0, root.nextPosition())) {
                    return true;
                }
            } else {
                doClose();
                more = false;
                return false;
            }
        }
    }
    if (!more) {
        // One of the search terms has no more documents
        return false;
    }
    if (max == 0) {
        // We need to initialise.
        // Just do a next on all terms and check if the first doc matches.
        doNextOnAll();
        if (found()) {
            return true;
        }
        // drop through to the normal find sequence
    }
    return findNext();
}
private void doClose() throws IOException {
    if (root != null) {
        root.close();
    }
    if (containers != null) {
        containers.close();
    }
    if (positions != null) {
        for (StructuredFieldPosition position : positions) {
            CachingTermPositions ctp = position.getCachingTermPositions();
            if (ctp != null) {
                ctp.close();
            }
        }
    }
}
/**
 * Check if we have found a match.
 *
 * @return boolean
 * @throws IOException
 */
private boolean found() throws IOException {
    // No predicate test if there are no positions
    if (positions.length == 0) {
        return true;
    }

    // no more documents - no match
    if (!more) {
        return false;
    }

    // min and max must point to the same document
    if (min != max) {
        return false;
    }

    if (rootDoc != max) {
        return false;
    }

    // We have duplicate entries - support should be improved, but it is not used at the moment.
    // This should work akin to the leaf scorer.
    // It would compact the index.

    // The match must be in a known term range
    int count = root.freq();
    int start = 0;
    int end = -1;
    for (int i = 0; i < count; i++) {
        if (i == 0) {
            // First starts at zero
            start = 0;
            end = root.nextPosition();
        } else {
            start = end + 1;
            end = root.nextPosition();
        }
        if (check(start, end)) {
            return true;
        }
    }

    // We had checks to do and they all failed.
    return false;
}
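/*
 * A minimal sketch of the check(start, end) predicate used by found(),
 * next() and skipTo(); the real matching logic lives in the
 * StructuredFieldPosition implementations, which are not shown, so the
 * matches() contract assumed here is hypothetical: it returns the position
 * after a successful match, or a negative value when the structured field
 * position cannot be matched within the range.
 */
private boolean check(int start, int end) throws IOException {
    int offset = start;
    for (StructuredFieldPosition position : positions) {
        offset = position.matches(start, end, offset); // hypothetical contract
        if (offset < 0) {
            // this path element could not be matched in the range
            return false;
        }
    }
    // every structured field position matched, in order, within the range
    return true;
}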
public void seek(TermEnum terms) throws IOException {
    original.seek(terms);
    docFreq = terms.docFreq();
    pointer = -1;
    if (docFreq > postingMaps.length) { // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
            newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
    }

    out.reset();

    int i = 0;
    while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer();     // save pointer to buffer

        final int tf = original.freq();        // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) {         // delta encode positions
            int p = original.nextPosition();
            out.writeVInt(p - prevPosition);
            prevPosition = p;
        }
    }
    out.flush();
    docFreq = i; // allow for deletions

    Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
    // HeapSorter.sort(postingMaps, docFreq); // TODO MC - due to the lack of space

    // NOTE: this might be substantially faster if RAMInputStream were public
    // and supported a reset() operation.
    in = tempDir.openInput(TEMP_FILE);
}
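/*
 * A minimal sketch of the PostingMap entries sorted above; the real class is
 * not shown, so the fields are inferred from how seek() uses them. Each entry
 * pairs a remapped document id with the offset of that document's buffered
 * postings, and orders by the new doc id so the postings can be replayed in
 * remapped order.
 */
private static class PostingMap implements Comparable<PostingMap> {
    int newDoc;  // the remapped document id
    long offset; // file pointer into the buffered tf/position data

    public int compareTo(PostingMap other) {
        return this.newDoc - other.newDoc; // sort by mapped doc id
    }
}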
/**
 * Intersects all doc/position pairs at the given offset with this match list. Modifies this
 * list in place as an optimization.
 *
 * @param termPositions the term positions enumerator
 * @param offset the offset of the given term in the phrase
 * @throws java.io.IOException if IO problems occur within Lucene
 */
void intersect(final TermPositions termPositions, final int offset) throws IOException {
    int currentDoc = -1;
    int resultCount = 0;
    for (int i = 0; i < this.count; i++) {
        int docId = this.docIds[i];
        while (currentDoc < docId) {
            if (termPositions.next()) {
                currentDoc = termPositions.doc();
            } else {
                this.count = resultCount;
                return;
            }
        }
        if (currentDoc == docId) {
            PhraseFilterIntList positions = this.positions[i];
            if (positions.intersect(termPositions, offset)) {
                this.docIds[resultCount] = docId;
                this.positions[resultCount++] = positions;
            }
        }
    }
    this.count = resultCount;
}
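/*
 * A minimal sketch of the position-level intersect delegated to above;
 * PhraseFilterIntList is not shown, so the values/count fields and the exact
 * semantics are assumptions. For a phrase match, a stored position p
 * (anchored at the first term) survives only if the term being intersected
 * also occurs at p + offset in the same document.
 */
boolean intersect(final TermPositions termPositions, final int offset) throws IOException {
    int resultCount = 0;
    final int freq = termPositions.freq();

    // read the current term's positions, shifted back to the phrase anchor
    final int[] shifted = new int[freq];
    for (int i = 0; i < freq; i++) {
        shifted[i] = termPositions.nextPosition() - offset;
    }

    // keep only the stored anchor positions the shifted list also contains
    for (int i = 0; i < this.count; i++) {
        for (int j = 0; j < freq; j++) {
            if (this.values[i] == shifted[j]) {
                this.values[resultCount++] = this.values[i];
                break;
            }
        }
    }
    this.count = resultCount;
    return resultCount > 0;
}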
@Override
public void load() throws Exception {
    TermPositions tp = null;
    byte[] payloadBuffer = new byte[4]; // four bytes for an int
    try {
        tp = _reader.termPositions(_sizeTerm);
        if (tp == null)
            return;
        while (tp.next()) {
            if (tp.freq() > 0) {
                tp.nextPosition();
                tp.getPayload(payloadBuffer, 0);
                int len = bytesToInt(payloadBuffer);
                allocate(tp.doc(), Math.min(len, _maxItems), true);
            }
        }
    } finally {
        if (tp != null)
            tp.close();
    }
}
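/*
 * A minimal sketch of the bytesToInt helper called above; its definition is
 * not shown, so the little-endian byte order used here is an assumption and
 * must match whatever encoding wrote the payload.
 */
private static int bytesToInt(byte[] bytes) {
    return (bytes[0] & 0xFF)
            | ((bytes[1] & 0xFF) << 8)
            | ((bytes[2] & 0xFF) << 16)
            | ((bytes[3] & 0xFF) << 24);
}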
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.search.Scorer#skipTo(int)
 */
public boolean skipTo(int target) throws IOException {
    if (allContainers()) {
        containers.skipTo(target);
        root.skipTo(containers.doc()); // must match
        if (check(0, root.nextPosition())) {
            return true;
        }
        while (more) {
            if (containers.next() && root.next()) {
                if (check(0, root.nextPosition())) {
                    return true;
                }
            } else {
                more = false;
                return false;
            }
        }
    }

    max = target;
    return findNext();
}
ReaderData(IndexReader reader) throws IOException {
    this.reader = reader;
    long minUID = Long.MAX_VALUE;
    long maxUID = Long.MIN_VALUE;
    uidMap = new Long2IntRBTreeMap();
    uidMap.defaultReturnValue(-1);
    int maxDoc = reader.maxDoc();
    if (maxDoc == 0) {
        _minUID = Long.MIN_VALUE;
        _maxUID = Long.MIN_VALUE;
        return;
    }
    TermPositions tp = null;
    byte[] payloadBuffer = new byte[8]; // eight bytes for a long
    try {
        tp = reader.termPositions(ZoieSegmentReader.UID_TERM);
        while (tp.next()) {
            int doc = tp.doc();
            assert doc < maxDoc;
            tp.nextPosition();
            tp.getPayload(payloadBuffer, 0);
            long uid = ZoieSegmentReader.bytesToLong(payloadBuffer);
            if (uid < minUID)
                minUID = uid;
            if (uid > maxUID)
                maxUID = uid;
            uidMap.put(uid, doc);
        }
    } finally {
        if (tp != null) {
            tp.close();
        }
    }
    _minUID = minUID;
    _maxUID = maxUID;
}
private void dumpTerms() throws IOException {
    outputBanner("Terms (in Term.compareTo() order)");

    TermEnum terms = mIndexReader.terms();
    int order = 0;
    while (terms.next()) {
        order++;
        Term term = terms.term();
        String field = term.field();
        String text = term.text();
        if (!wantThisTerm(field, text)) {
            continue;
        }

        outputLn(order + " " + field + ": " + text);

        /*
         * For each term, print the
         * <document, frequency, <position>*> tuples for the term.
         *
         * document:  document in which the Term appears
         * frequency: number of times the Term appears in the document
         * position:  position for each appearance in the document
         *
         * e.g. doc.add(new Field("field", "one two three two four five",
         *                        Field.Store.YES, Field.Index.ANALYZED));
         * then the tuple for Term("field", "two") in this document would be:
         * 88, 2, <2, 4>
         * where
         * 88 is the document number
         * 2 is the frequency this term appears in the document
         * <2, 4> are the positions for each appearance in the document
         */

        // by TermPositions
        outputLn("    document, frequency, <position>*");

        // keep track of docs that appear in all terms that are filtered in.
        Set<Integer> docNums = null;
        if (hasFilters()) {
            docNums = new HashSet<Integer>();
        }

        TermPositions termPos = mIndexReader.termPositions(term);
        while (termPos.next()) {
            int docNum = termPos.doc();
            int freq = termPos.freq();

            if (docNums != null) {
                docNums.add(docNum);
            }

            output("    " + docNum + ", " + freq + ", <");
            boolean first = true;
            for (int f = 0; f < freq; f++) {
                int positionInDoc = termPos.nextPosition();
                if (!first) {
                    output(" ");
                } else {
                    first = false;
                }
                output(positionInDoc + "");
            }
            outputLn(">");
        }
        termPos.close();

        if (docNums != null) {
            computeDocsIntersection(docNums);
        }

        outputLn();

        if (order % 1000 == 0) {
            mConsole.debug("Dumped " + order + " terms");
        }
    }
    terms.close();
}
/*
 * (non-Javadoc)
 *
 * @see org.apache.lucene.search.Scorer#doc()
 */
public int doc() {
    if (allContainers()) {
        return containers.doc();
    }
    return max;
}
public void close() throws IOException {
    original.close();
}