/** * Check if we have found a match * * @return boolean * @throws IOException */ private boolean found() throws IOException { // No predicate test if there are no positions if (positions.length == 0) { return true; } // no more documents - no match if (!more) { return false; } // min and max must point to the same document if (min != max) { return false; } if (rootDoc != max) { return false; } // We have duplicate entries - suport should be improved but it is not used at the moment // This shuld work akin to the leaf scorer // It would compact the index // The match must be in a known term range int count = root.freq(); int start = 0; int end = -1; for (int i = 0; i < count; i++) { if (i == 0) { // First starts at zero start = 0; end = root.nextPosition(); } else { start = end + 1; end = root.nextPosition(); } if (check(start, end)) { return true; } } // We had checks to do and they all failed. return false; }
public void seek(TermEnum terms) throws IOException { original.seek(terms); docFreq = terms.docFreq(); pointer = -1; if (docFreq > postingMaps.length) { // grow postingsMap PostingMap[] newMap = new PostingMap[docFreq]; System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length); for (int i = postingMaps.length; i < docFreq; i++) { newMap[i] = new PostingMap(); } postingMaps = newMap; } out.reset(); int i = 0; while (original.next()) { PostingMap map = postingMaps[i++]; map.newDoc = oldToNew[original.doc()]; // remap the newDoc id map.offset = out.getFilePointer(); // save pointer to buffer final int tf = original.freq(); // buffer tf & positions out.writeVInt(tf); int prevPosition = 0; for (int j = tf; j > 0; j--) { // delta encode positions int p = original.nextPosition(); out.writeVInt(p - prevPosition); prevPosition = p; } } out.flush(); docFreq = i; // allow for deletions Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids // HeapSorter.sort(postingMaps,docFreq); // TODO MC - due to the lack of space // NOTE: this might be substantially faster if RAMInputStream were public // and supported a reset() operation. in = tempDir.openInput(TEMP_FILE); }
@Override public void load() throws Exception { TermPositions tp = null; byte[] payloadBuffer = new byte[4]; // four bytes for an int try { tp = _reader.termPositions(_sizeTerm); if (tp == null) return; while (tp.next()) { if (tp.freq() > 0) { tp.nextPosition(); tp.getPayload(payloadBuffer, 0); int len = bytesToInt(payloadBuffer); allocate(tp.doc(), Math.min(len, _maxItems), true); } } } finally { if (tp != null) tp.close(); } }
private void dumpTerms() throws IOException { outputBanner("Terms (in Term.compareTo() order)"); TermEnum terms = mIndexReader.terms(); int order = 0; while (terms.next()) { order++; Term term = terms.term(); String field = term.field(); String text = term.text(); if (!wantThisTerm(field, text)) { continue; } outputLn(order + " " + field + ": " + text); /* * for each term, print the * <document, frequency, <position>* > tuples for a term. * * document: document in which the Term appears * frequency: number of time the Term appears in the document * position: position for each appearance in the document * * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED)); * then the tuple for Term("field", "two") in this document would be like: * 88, 2, <2, 4> * where * 88 is the document number * 2 is the frequency this term appear in the document * <2, 4> are the positions for each appearance in the document */ // by TermPositions outputLn(" document, frequency, <position>*"); // keep track of docs that appear in all terms that are filtered in. Set<Integer> docNums = null; if (hasFilters()) { docNums = new HashSet<Integer>(); } TermPositions termPos = mIndexReader.termPositions(term); while (termPos.next()) { int docNum = termPos.doc(); int freq = termPos.freq(); if (docNums != null) { docNums.add(docNum); } output(" " + docNum + ", " + freq + ", <"); boolean first = true; for (int f = 0; f < freq; f++) { int positionInDoc = termPos.nextPosition(); if (!first) { output(" "); } else { first = false; } output(positionInDoc + ""); } outputLn(">"); } termPos.close(); if (docNums != null) { computeDocsIntersection(docNums); } outputLn(); if (order % 1000 == 0) { mConsole.debug("Dumped " + order + " terms"); } } terms.close(); }