/*
 * Utility function to display a term vector.
 */
static void termVectorDisplay(Terms terms) throws IOException {
  if ((terms == null) || (terms.size() == -1)) {
    System.out.println("    The field is not stored.");
  } else {
    /*
     * The terms for this field are stored.
     */
    System.out.println("    Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this document.
     * Information about a term's occurrences (tf and
     * positions) is accessed via the indexing API, which
     * returns inverted lists that describe (only) the
     * current document.
     */
    while (ithTerm.next() != null) {
      System.out.format(
          "      %10d %-20s %d ",
          ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

      DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
      currDoc.nextDoc();
      for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++) {
        System.out.print(currDoc.nextPosition() + " ");
      }
      System.out.println();
    }
  }
}
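The helper above expects the per-document Terms object of a stored term vector. A minimal usage sketch (not from the original source; the field name "body" and the method name are placeholders) showing how such a Terms instance is typically obtained:

static void printVectorForDoc(IndexReader reader, int docID) throws IOException {
  // getTermVector returns the single-document term vector for this field,
  // or null if the field was not indexed with term vectors.
  Terms termVector = reader.getTermVector(docID, "body");
  termVectorDisplay(termVector);
}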
/**
 * We implement this according to the Lucene specification. The formula used, summing over the
 * query terms qi:
 *   IDF(qi) * (tf(qi,D) * (k1+1)) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL))
 * IDF and avgFL computation are described above.
 *
 * @param doc the document being scored
 * @param terms the (single-document) term vector of the document
 * @param context the reranker context, providing the query tokens and index access
 * @return the BM25 score of the document for the query
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
  Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

  TermsEnum termsEnum = null;
  try {
    termsEnum = terms.iterator();
  } catch (IOException e) {
    LOG.warn("Error computing BM25, unable to retrieve terms enum");
    return 0.0f;
  }

  IndexReader reader = context.getIndexSearcher().getIndexReader();
  long maxDocs = reader.numDocs();
  long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
  // Compute by iterating
  long docSize = 0L;

  // NOTE: df cannot be retrieved just from the term vector; the term vector here is only a
  // partial term vector that behaves as if we had only one document in the index.
  Map<String, Integer> docFreqMap = null;
  try {
    docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
  } catch (IOException e) {
    LOG.warn("Unable to retrieve document frequencies.");
    docFreqMap = new HashMap<>();
  }

  Map<String, Long> termFreqMap = new HashMap<>();
  try {
    while (termsEnum.next() != null) {
      String termString = termsEnum.term().utf8ToString();
      docSize += termsEnum.totalTermFreq();
      if (queryTokens.contains(termString)) {
        termFreqMap.put(termString, termsEnum.totalTermFreq());
      }
    }
  } catch (IOException e) {
    LOG.warn("Unable to retrieve termsEnum, treating as 0");
  }

  float score = 0.0f;
  // Iterate over the query tokens
  double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
  for (String token : queryTokens) {
    long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
    double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
    double numerator = (this.k1 + 1) * termFreq;
    double docLengthFactor = this.b * (docSize / avgFL);
    double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
    score += computeIDF(docFreq, maxDocs) * numerator / denominator;
  }

  return score;
}
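The computeIDF and computeAvgFL helpers referenced above are not shown in this snippet. A minimal sketch of what they would look like under the standard BM25 definitions (the same IDF form Lucene's BM25Similarity uses); the bodies are inferred from the call sites, not taken from the source:

/** IDF component of BM25: log(1 + (N - df + 0.5) / (df + 0.5)). */
private double computeIDF(long docFreq, long numDocs) {
  return Math.log(1 + (numDocs - docFreq + 0.5d) / (docFreq + 0.5d));
}

/** Average field length: total token count of the field divided by the document count. */
private double computeAvgFL(long sumTotalTermFreq, long maxDocs) {
  return maxDocs == 0 ? 0.0d : (double) sumTotalTermFreq / (double) maxDocs;
}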
private void buildTermStatistics(XContentBuilder builder, TermsEnum termIter) throws IOException {
  // write term statistics. At this point we do not naturally have a
  // boolean that says if these values actually were requested.
  // However, we can assume that they were not if the statistic values are
  // <= 0.
  assert (((termIter.docFreq() > 0) && (termIter.totalTermFreq() > 0))
      || ((termIter.docFreq() == -1) && (termIter.totalTermFreq() == -1)));
  int docFreq = termIter.docFreq();
  if (docFreq > 0) {
    builder.field(FieldStrings.DOC_FREQ, docFreq);
    builder.field(FieldStrings.TTF, termIter.totalTermFreq());
  }
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName the name of the field the vector belongs to
 */
private void addTermFrequencies(
    Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
    throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
  if (termFreqMap == null) {
    termFreqMap = new HashMap<>();
    field2termFreqMap.put(fieldName, termFreqMap);
  }
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
/*
 * listTermDictionary displays the term dictionary for a field.
 */
static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {
  System.out.println("\nTerm Dictionary:  field " + fieldName);

  /* Grant says: MultiFields.getTerms(IndexReader, fieldName) */
  Terms terms = MultiFields.getTerms(reader, fieldName);

  if ((terms == null) || (terms.size() == -1)) {
    System.out.println("    The term dictionary is empty.");
  } else {
    System.out.println("    Vocabulary size: " + terms.size() + " terms");

    TermsEnum ithTerm = terms.iterator(null);

    /*
     * Iterate over the terms in this field. For each term, docFreq() and
     * totalTermFreq() report collection-wide statistics: the number of
     * documents containing the term, and its total number of occurrences
     * across all documents.
     */
    while (ithTerm.next() != null) {
      System.out.format(
          "      %-30s %d %d\n",
          ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
    }
  }
}
protected void compareTermVectors(Terms terms, Terms memTerms, String field_name)
    throws IOException {
  TermsEnum termEnum = terms.iterator();
  TermsEnum memTermEnum = memTerms.iterator();

  while (termEnum.next() != null) {
    assertNotNull(memTermEnum.next());
    assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq()));

    PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS);
    PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS);
    String currentTerm = termEnum.term().utf8ToString();

    assertThat(
        "Token mismatch for field: " + field_name,
        currentTerm,
        equalTo(memTermEnum.term().utf8ToString()));

    docsPosEnum.nextDoc();
    memDocsPosEnum.nextDoc();

    int freq = docsPosEnum.freq();
    assertThat(freq, equalTo(memDocsPosEnum.freq()));
    for (int i = 0; i < freq; i++) {
      String failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
      int memPos = memDocsPosEnum.nextPosition();
      int pos = docsPosEnum.nextPosition();
      assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
      assertThat(
          "Start offset test failed" + failDesc,
          memDocsPosEnum.startOffset(),
          equalTo(docsPosEnum.startOffset()));
      assertThat(
          "End offset test failed" + failDesc,
          memDocsPosEnum.endOffset(),
          equalTo(docsPosEnum.endOffset()));
      assertThat(
          "Missing payload test failed" + failDesc,
          memDocsPosEnum.getPayload(),
          equalTo(docsPosEnum.getPayload()));
    }
  }
  assertNull("Still some tokens not processed", memTermEnum.next());
}
/* Copied from lucene 4.2.x core */
private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
  final Terms terms = MultiFields.getTerms(r, field);
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator(null);
    if (termsEnum.seekExact(text, true)) {
      return termsEnum.totalTermFreq();
    }
  }
  return 0;
}
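A quick usage sketch for this helper (hypothetical caller; the field name "body" and the term are placeholders):

static void printTotalTermFreq(IndexReader reader) throws IOException {
  // Counts every occurrence of "lucene" in the "body" field across the whole index.
  long ttf = totalTermFreq(reader, "body", new BytesRef("lucene"));
  System.out.println("'lucene' occurs " + ttf + " times in field 'body'");
}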
public void collectTermContext(
    IndexReader reader,
    List<LeafReaderContext> leaves,
    TermContext[] contextArray,
    Term[] queryTerms)
    throws IOException {
  TermsEnum termsEnum = null;
  for (LeafReaderContext context : leaves) {
    final Fields fields = context.reader().fields();
    for (int i = 0; i < queryTerms.length; i++) {
      Term term = queryTerms[i];
      TermContext termContext = contextArray[i];
      final Terms terms = fields.terms(term.field());
      if (terms == null) {
        // field does not exist
        continue;
      }
      termsEnum = terms.iterator();
      assert termsEnum != null;

      if (termsEnum == TermsEnum.EMPTY) {
        continue;
      }
      if (termsEnum.seekExact(term.bytes())) {
        if (termContext == null) {
          contextArray[i] =
              new TermContext(
                  reader.getContext(),
                  termsEnum.termState(),
                  context.ord,
                  termsEnum.docFreq(),
                  termsEnum.totalTermFreq());
        } else {
          termContext.register(
              termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
        }
      }
    }
  }
}
@Override
public long totalTermFreq() throws IOException {
  return termsEnum.totalTermFreq();
}
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
  File tmpDir = folder.newFolder();

  CollectionReaderDescription reader =
      CollectionReaderFactory.createReaderDescription(
          TextReader.class,
          TextReader.PARAM_SOURCE_LOCATION,
          "src/test/resources/data/",
          TextReader.PARAM_LANGUAGE,
          "en",
          TextReader.PARAM_PATTERNS,
          "text*.txt");

  AnalysisEngineDescription segmenter =
      AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

  AnalysisEngineDescription metaCollector =
      AnalysisEngineFactory.createEngineDescription(
          LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

  // Run the pipeline so the meta collector populates the Lucene index.
  for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
    // no per-document assertions needed here
  }

  int i = 0;
  IndexReader index;
  try {
    index = DirectoryReader.open(FSDirectory.open(tmpDir));
    Fields fields = MultiFields.getFields(index);
    if (fields != null) {
      Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text = null;
        while ((text = termsEnum.next()) != null) {
          if (text.utf8ToString().equals("this")) {
            assertEquals(2, termsEnum.docFreq());
            assertEquals(3, termsEnum.totalTermFreq());
          }
          i++;
        }
      }
    }
  } catch (Exception e) {
    throw new ResourceInitializationException(e);
  }
  assertEquals(35, i);
}
private void mapOneVector(
    NamedList<Object> docNL,
    FieldOptions fieldOptions,
    IndexReader reader,
    int docID,
    TermsEnum termsEnum,
    String field)
    throws IOException {
  NamedList<Object> fieldNL = new NamedList<Object>();
  docNL.add(field, fieldNL);

  BytesRef text;
  DocsAndPositionsEnum dpEnum = null;
  while ((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<Object>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    dpEnum = termsEnum.docsAndPositions(null, dpEnum);
    boolean useOffsets = false;
    boolean usePositions = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      usePositions = fieldOptions.positions;
      useOffsets = fieldOptions.offsets;
    }

    NamedList<Integer> positionsNL = null;
    NamedList<Number> theOffsets = null;

    if (usePositions || useOffsets) {
      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (usePositions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<Integer>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        if (useOffsets && theOffsets == null) {
          if (dpEnum.startOffset() == -1) {
            useOffsets = false;
          } else {
            theOffsets = new NamedList<Number>();
            termInfo.add("offsets", theOffsets);
          }
        }

        if (theOffsets != null) {
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }
      }
    }

    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
private void checkAllInfo(
    int numDocs,
    String[] values,
    int[] freq,
    int[][] pos,
    int[][] startOffset,
    int[][] endOffset,
    int i)
    throws IOException {
  TermVectorsRequestBuilder resp =
      client()
          .prepareTermVectors("test", "type1", Integer.toString(i))
          .setPayloads(true)
          .setOffsets(true)
          .setPositions(true)
          .setFieldStatistics(true)
          .setTermStatistics(true)
          .setSelectedFields();
  assertThat(resp.request().fieldStatistics(), equalTo(true));
  TermVectorsResponse response = resp.execute().actionGet();
  assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
  Fields fields = response.getFields();
  assertThat(fields.size(), equalTo(1));
  Terms terms = fields.terms("field");
  assertThat(terms.size(), equalTo(8L));
  assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
  assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
  assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
  TermsEnum iterator = terms.iterator();
  for (int j = 0; j < values.length; j++) {
    String string = values[j];
    BytesRef next = iterator.next();
    assertThat(next, Matchers.notNullValue());
    assertThat("expected " + string, string, equalTo(next.utf8ToString()));
    assertThat(next, Matchers.notNullValue());
    if (string.equals("the")) {
      assertThat(
          "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
    } else {
      assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
    }
    PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
    assertThat(docsAndPositions.nextDoc(), equalTo(0));
    assertThat(freq[j], equalTo(docsAndPositions.freq()));
    assertThat(iterator.docFreq(), equalTo(numDocs));
    int[] termPos = pos[j];
    int[] termStartOffset = startOffset[j];
    int[] termEndOffset = endOffset[j];
    assertThat(termPos.length, equalTo(freq[j]));
    assertThat(termStartOffset.length, equalTo(freq[j]));
    assertThat(termEndOffset.length, equalTo(freq[j]));
    for (int k = 0; k < freq[j]; k++) {
      int nextPosition = docsAndPositions.nextPosition();
      assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
      assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
      assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
      assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
    }
  }
  assertThat(iterator.next(), Matchers.nullValue());

  XContentBuilder xBuilder = XContentFactory.jsonBuilder();
  xBuilder.startObject();
  response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
  xBuilder.endObject();
  BytesStream bytesStream = xBuilder.bytesStream();
  String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
  String expectedString =
      "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
          + i
          + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
  assertThat(utf8, equalTo(expectedString));
}
public SparseInstances readIndex(String indexPath, String destFile, int threshold)
    throws Exception {
  if (indexPath == null || destFile == null) {
    System.out.println("error: indexPath or destFile is null\n");
    return null;
  }
  DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
  Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);
  int capacity = (int) terms.size();
  HashMap<String, Integer> wordDict = new HashMap<>(capacity);
  capacity = capacity > 65535 ? 65535 : capacity;
  SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
  TermsEnum termsEnum = terms.iterator();
  int index = 0;
  BytesRef term = null;
  String strTerm = null;
  // Build the attribute dictionary from terms that pass the frequency threshold.
  while ((term = termsEnum.next()) != null) {
    strTerm = term.utf8ToString();
    if (termsEnum.totalTermFreq() < threshold) {
      continue;
    }
    if (strTerm.isEmpty()) {
      continue;
    }
    if (wordDict.get(strTerm) != null) {
      continue;
    }
    instData.addAttribute(strTerm);
    wordDict.put(strTerm, index);
    index++;
  }
  int numAtt = instData.numAttributes();
  int numInst = instData.numInstances();
  Integer attIndex = null;
  String id = null;
  int termIndex = 0;
  for (int docIndex = 0; docIndex < numInst; docIndex++) {
    id = reader.document(docIndex).getField(idKey).stringValue();
    Terms docTerms = reader.getTermVector(docIndex, reviewKey);
    if (docTerms == null) {
      continue;
    }
    int[] indices = new int[(int) docTerms.size()];
    double[] attValues = new double[(int) docTerms.size()];
    termsEnum = docTerms.iterator();
    termIndex = 0;
    while ((term = termsEnum.next()) != null) {
      strTerm = term.utf8ToString();
      attIndex = wordDict.get(strTerm);
      if (attIndex == null) {
        continue;
      }
      indices[termIndex] = attIndex.intValue();
      // In a single-document term vector, totalTermFreq() is the within-document tf.
      attValues[termIndex] = termsEnum.totalTermFreq();
      termIndex++;
    }
    ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
    instData.addInstance(instance);
  }
  return instData;
}
/** checks term-level statistics */
public void assertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum) throws Exception {
  assertEquals(leftTermsEnum.docFreq(), rightTermsEnum.docFreq());
  if (leftTermsEnum.totalTermFreq() != -1 && rightTermsEnum.totalTermFreq() != -1) {
    assertEquals(leftTermsEnum.totalTermFreq(), rightTermsEnum.totalTermFreq());
  }
}
protected void validateResponse(
    TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
  TestDoc testDoc = testConfig.doc;
  HashSet<String> selectedFields =
      testConfig.selectedFields == null
          ? null
          : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
  Fields esTermVectorFields = esResponse.getFields();
  for (TestFieldSetting field : testDoc.fieldSettings) {
    Terms esTerms = esTermVectorFields.terms(field.name);
    if (selectedFields != null && !selectedFields.contains(field.name)) {
      assertNull(esTerms);
      continue;
    }

    assertNotNull(esTerms);

    Terms luceneTerms = luceneFields.terms(field.name);
    TermsEnum esTermEnum = esTerms.iterator(null);
    TermsEnum luceneTermEnum = luceneTerms.iterator(null);

    while (esTermEnum.next() != null) {
      assertNotNull(luceneTermEnum.next());

      assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
      DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
      DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
      if (luceneDocsPosEnum == null) {
        // test we expect that...
        assertFalse(field.storedOffset);
        assertFalse(field.storedPayloads);
        assertFalse(field.storedPositions);
        continue;
      }

      String currentTerm = esTermEnum.term().utf8ToString();

      assertThat(
          "Token mismatch for field: " + field.name,
          currentTerm,
          equalTo(luceneTermEnum.term().utf8ToString()));

      esDocsPosEnum.nextDoc();
      luceneDocsPosEnum.nextDoc();

      int freq = esDocsPosEnum.freq();
      assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
      for (int i = 0; i < freq; i++) {
        String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
        int lucenePos = luceneDocsPosEnum.nextPosition();
        int esPos = esDocsPosEnum.nextPosition();
        if (field.storedPositions && testConfig.requestPositions) {
          assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
        } else {
          assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
        }
        if (field.storedOffset && testConfig.requestOffsets) {
          assertThat(
              "Offset test failed" + failDesc,
              luceneDocsPosEnum.startOffset(),
              equalTo(esDocsPosEnum.startOffset()));
          assertThat(
              "Offset test failed" + failDesc,
              luceneDocsPosEnum.endOffset(),
              equalTo(esDocsPosEnum.endOffset()));
        } else {
          assertThat(
              "Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
          assertThat(
              "Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
        }
        if (field.storedPayloads && testConfig.requestPayloads) {
          assertThat(
              "Payload test failed" + failDesc,
              luceneDocsPosEnum.getPayload(),
              equalTo(esDocsPosEnum.getPayload()));
        } else {
          assertThat(
              "Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
        }
      }
    }
    assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
  }
}
/**
 * Safe (but, slowish) default method to write every vector field in the document. This default
 * implementation requires that the vectors implement both Fields.size and Terms.size.
 */
protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
  if (vectors == null) {
    startDocument(0);
    return;
  }

  final int numFields = vectors.size();
  if (numFields == -1) {
    throw new IllegalStateException("vectors.size() must be implemented (it returned -1)");
  }
  startDocument(numFields);

  final FieldsEnum fieldsEnum = vectors.iterator();
  String fieldName;
  String lastFieldName = null;

  while ((fieldName = fieldsEnum.next()) != null) {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);

    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
        : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = fieldsEnum.terms();
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final int numTerms = (int) terms.size();
    if (numTerms == -1) {
      throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
    }

    final TermsEnum termsEnum = terms.iterator(null);
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    boolean startedField = false;

    // NOTE: this is tricky, because TermVectors allow
    // indexing offsets but NOT positions. So we must
    // lazily init the field by checking whether first
    // position we see is -1 or not.

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      final int freq = (int) termsEnum.totalTermFreq();

      if (startedField) {
        startTerm(termsEnum.term(), freq);
      }

      // TODO: we need a "query" API where we can ask (via
      // flex API) what this term was indexed with...

      // Both positions & offsets:
      docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
      final boolean hasOffsets;
      boolean hasPositions = false;
      if (docsAndPositionsEnum == null) {
        // Fallback: no offsets
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
        hasOffsets = false;
      } else {
        hasOffsets = true;
      }

      if (docsAndPositionsEnum != null) {
        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          if (!startedField) {
            assert numTerms > 0;
            hasPositions = pos != -1;
            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
            startTerm(termsEnum.term(), freq);
            startedField = true;
          }

          final int startOffset;
          final int endOffset;
          if (hasOffsets) {
            startOffset = docsAndPositionsEnum.startOffset();
            endOffset = docsAndPositionsEnum.endOffset();
            assert startOffset != -1;
            assert endOffset != -1;
          } else {
            startOffset = -1;
            endOffset = -1;
          }

          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset);
        }
      } else {
        if (!startedField) {
          assert numTerms > 0;
          startField(fieldInfo, numTerms, hasPositions, hasOffsets);
          startTerm(termsEnum.term(), freq);
          startedField = true;
        }
      }
    }
    assert termCount == numTerms;
  }
}