private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        currentPositions = ArrayUtil.grow(currentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }
    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        // guard against null: initValues() stores null when a position carries no payload
        if (curTerms.hasPayloads() && currentPayloads[i] != null && currentPayloads[i].length() > 0) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
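The three helpers above form one pattern: initMemory() sizes the buffers, initValues() drains a PostingsEnum into them, and buildValues() serializes the result. Below is a minimal, self-contained sketch of the same pattern against the plain Lucene API; the class name, field name, and printing are illustrative, not from the original source.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermVectorBufferDemo {
    public static void dump(IndexReader reader, int docId, String field) throws IOException {
        Terms terms = reader.getTermVector(docId, field); // null if no term vector was stored
        if (terms == null || !terms.hasPositions()) {
            return; // nothing position-based to buffer
        }
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum postings = null;
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            postings = termsEnum.postings(postings, PostingsEnum.ALL);
            postings.nextDoc(); // a term vector is a one-document index, so this is doc 0
            int freq = postings.freq();
            int[] positions = new int[freq]; // sized once per term, like initMemory()
            for (int i = 0; i < freq; i++) {
                positions[i] = postings.nextPosition();
                // offsets (and payloads) are only valid right after nextPosition()
                if (terms.hasOffsets()) {
                    System.out.printf("%s pos=%d offsets=[%d,%d]%n", term.utf8ToString(),
                        positions[i], postings.startOffset(), postings.endOffset());
                }
            }
        }
    }
}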
/**
 * Get all words between the specified start and end positions from the term vector.
 *
 * <p>NOTE: this may return an array smaller than the size requested if the document ends before
 * the requested end position.
 *
 * @param reader the index
 * @param doc doc id
 * @param luceneName the index field from which to use the term vector
 * @param start start position (first word we want to request)
 * @param end end position (last word we want to request)
 * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
 *     (debug)
 * @return the words found, in order
 */
public static String[] getWordsFromTermVector(
        IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {
    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once
    try {
        org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
        if (terms == null) {
            throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
        }
        if (!terms.hasPositions())
            throw new IllegalArgumentException(
                "Field " + luceneName + " has no character position information");
        TermsEnum termsEnum = terms.iterator();
        // Collect concordance words from the term vector
        PostingsEnum docPosEnum = null;
        int numFound = 0;
        String[] concordanceWords = new String[end - start + 1];
        while (termsEnum.next() != null) {
            docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
            while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // NOTE: docID() will always return 0 here; a term vector is a
                // single-document "index", so no doc id check is needed.
                for (int i = 0; i < docPosEnum.freq(); i++) {
                    int position = docPosEnum.nextPosition();
                    if (position == -1)
                        throw new RuntimeException(
                            "Unexpected missing position (i=" + i + ", docPosEnum.freq() = "
                                + docPosEnum.freq() + ")");
                    if (position >= start && position <= end) {
                        if (concordanceWords[position - start] == null)
                            concordanceWords[position - start] = termsEnum.term().utf8ToString();
                        else
                            concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
                        numFound++;
                    }
                }
                if (numFound == concordanceWords.length)
                    return concordanceWords;
            }
        }
        if (numFound < concordanceWords.length && !partialOk) {
            // If we simply ran into the end of the document, that's okay;
            // but if words are missing in the middle, that's not.
            String[] partial = new String[numFound];
            for (int i = 0; i < numFound; i++) {
                partial[i] = concordanceWords[i];
                if (partial[i] == null) {
                    throw new RuntimeException(
                        "Not all words found (" + numFound + " out of " + concordanceWords.length
                            + "); missing words in the middle of concordance!");
                }
            }
            return partial;
        }
        return concordanceWords;
    } catch (Exception e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
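A hypothetical call site (the field name, doc id, and positions are made up for illustration): fetch the words at positions 10 through 14 of a document's "contents" field, tolerating a document that ends early.

String[] words = getWordsFromTermVector(reader, docId, "contents", 10, 14, true);
// with partialOk == true, trailing entries may be null if the document ends early
for (String w : words) {
    if (w != null) {
        System.out.print(w + " ");
    }
}
System.out.println();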
@Override
public boolean hasPositions() {
    return delegateTerms.hasPositions();
}
@Override
public void execute(String[] args, PrintStream out) throws Exception {
    // parse "field:term" from the first argument
    String field = null;
    String termVal = null;
    try {
        field = args[0];
    } catch (Exception e) {
        field = null;
    }
    if (field != null) {
        String[] parts = field.split(":");
        if (parts.length > 1) {
            field = parts[0];
            termVal = parts[1];
        }
    }
    if (field == null || termVal == null) {
        out.println("usage: field:term");
        out.flush();
        return;
    }

    IndexReader reader = ctx.getIndexReader();
    List<AtomicReaderContext> leaves = reader.leaves();
    int docBase = 0;
    int numPerPage = 20;
    for (AtomicReaderContext leaf : leaves) {
        AtomicReader atomicReader = leaf.reader();
        Terms terms = atomicReader.terms(field);
        if (terms == null) {
            // the doc id base must still advance past segments that lack the field
            docBase += atomicReader.maxDoc();
            continue;
        }
        boolean hasPositions = terms.hasPositions();
        TermsEnum te = terms.iterator(null);
        int count = 0;
        if (te.seekExact(new BytesRef(termVal), true)) {
            if (hasPositions) {
                DocsAndPositionsEnum iter = te.docsAndPositions(atomicReader.getLiveDocs(), null);
                int docid;
                while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    count++;
                    out.print("docid: " + (docid + docBase) + ", freq: " + iter.freq() + ", ");
                    for (int i = 0; i < iter.freq(); ++i) {
                        out.print("pos " + i + ": " + iter.nextPosition());
                        BytesRef payload = iter.getPayload();
                        if (payload != null) {
                            out.print(",payload: " + payload);
                        }
                        out.print(";");
                    }
                    out.println();
                    if (ctx.isInteractiveMode() && count % numPerPage == 0) {
                        out.println("Ctrl-D to break");
                        int ch = System.in.read();
                        if (ch == -1) {
                            out.flush();
                            return;
                        }
                    }
                }
            } else {
                DocsEnum iter = te.docs(atomicReader.getLiveDocs(), null);
                int docid;
                while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    count++;
                    out.println("docid: " + (docid + docBase));
                    if (ctx.isInteractiveMode() && count % numPerPage == 0) {
                        out.println("Ctrl-D to break");
                        int ch = System.in.read();
                        if (ch == -1) {
                            out.flush();
                            return;
                        }
                    }
                }
            }
        }
        docBase += atomicReader.maxDoc();
    }
}
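The command above is written against the Lucene 4.x API (AtomicReader, DocsAndPositionsEnum, the Bits liveDocs parameters). On Lucene 5+ the same lookup collapses onto PostingsEnum; the following is a rough sketch of the equivalent per-segment seek, ignoring deleted documents and interactive paging for brevity. The class and method names are illustrative.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermPositionsDemo {
    public static void printPositions(IndexReader reader, String field, String termVal)
            throws IOException {
        for (LeafReaderContext leaf : reader.leaves()) {
            Terms terms = leaf.reader().terms(field);
            if (terms == null || !terms.hasPositions()) {
                continue;
            }
            TermsEnum te = terms.iterator();
            if (!te.seekExact(new BytesRef(termVal))) {
                continue; // term absent from this segment
            }
            PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS);
            int docid;
            while ((docid = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                StringBuilder sb = new StringBuilder(
                    "docid: " + (docid + leaf.docBase) + ", freq: " + pe.freq());
                for (int i = 0; i < pe.freq(); i++) {
                    sb.append(", pos ").append(i).append(": ").append(pe.nextPosition());
                }
                System.out.println(sb);
            }
        }
    }
}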
private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException {
    Fields memFields = memIndexReader.fields();
    for (String field : MultiFields.getFields(other)) {
        Terms memTerms = memFields.terms(field);
        // read the "expected" side from the IndexWriter-based reader, not from the
        // memory index again, so the two sides are actually compared against each other
        Terms iwTerms = MultiFields.getTerms(other, field);
        if (iwTerms == null) {
            assertNull(memTerms);
        } else {
            NumericDocValues normValues = MultiDocValues.getNormValues(other, field);
            NumericDocValues memNormValues = memIndexReader.getNormValues(field);
            if (normValues != null) {
                // mem idx always computes norms on the fly
                assertNotNull(memNormValues);
                assertEquals(normValues.get(0), memNormValues.get(0));
            }
            assertNotNull(memTerms);
            assertEquals(iwTerms.getDocCount(), memTerms.getDocCount());
            assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq());
            assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq());
            TermsEnum iwTermsIter = iwTerms.iterator();
            TermsEnum memTermsIter = memTerms.iterator();
            if (iwTerms.hasPositions()) {
                final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets();
                while (iwTermsIter.next() != null) {
                    assertNotNull(memTermsIter.next());
                    assertEquals(iwTermsIter.term(), memTermsIter.term());
                    PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL);
                    PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL);
                    while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
                        assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
                        for (int i = 0; i < iwDocsAndPos.freq(); i++) {
                            assertEquals("term: " + iwTermsIter.term().utf8ToString(),
                                iwDocsAndPos.nextPosition(), memDocsAndPos.nextPosition());
                            if (offsets) {
                                assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
                                assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
                            }
                            if (iwTerms.hasPayloads()) {
                                assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
                            }
                        }
                    }
                }
            } else {
                while (iwTermsIter.next() != null) {
                    assertEquals(iwTermsIter.term(), memTermsIter.term());
                    PostingsEnum iwDocsAndPos = iwTermsIter.postings(null);
                    PostingsEnum memDocsAndPos = memTermsIter.postings(null);
                    while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
                        assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
                    }
                }
            }
        }
    }
}
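A minimal sketch of how such a duel might be driven, not the original test harness: index one document through an IndexWriter and the same text into a MemoryIndex, then hand both readers to duellReaders(). This would live in the same test class; the analyzer, field name, and text are made up.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public void testDuelSketch() throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    Directory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
        Document doc = new Document();
        doc.add(new TextField("body", "the quick brown fox", Field.Store.NO));
        writer.addDocument(doc);
    }
    MemoryIndex mi = new MemoryIndex(true); // true = also store offsets
    mi.addField("body", "the quick brown fox", analyzer);
    try (DirectoryReader iwReader = DirectoryReader.open(dir)) {
        duellReaders(iwReader, (LeafReader) mi.createSearcher().getIndexReader());
    }
}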