/** checks collection-level statistics on Terms */ public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception { if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) { assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount()); } if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) { assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq()); } if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) { assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq()); } if (leftTerms.size() != -1 && rightTerms.size() != -1) { assertEquals(leftTerms.size(), rightTerms.size()); } }
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException { long sumDocFreq = curTerms.getSumDocFreq(); int docCount = curTerms.getDocCount(); long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq(); if (docCount > 0) { assert ((sumDocFreq > 0)) : "docCount >= 0 but sumDocFreq ain't!"; assert ((sumTotalTermFrequencies > 0)) : "docCount >= 0 but sumTotalTermFrequencies ain't!"; builder.startObject(FieldStrings.FIELD_STATISTICS); builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq); builder.field(FieldStrings.DOC_COUNT, docCount); builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies); builder.endObject(); } else if (docCount == -1) { // this should only be -1 if the field // statistics were not requested at all. In // this case all 3 values should be -1 assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!"; assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies ain't!"; } else { throw new ElasticsearchIllegalStateException( "Something is wrong with the field statistics of the term vector request: Values are " + "\n" + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n" + FieldStrings.DOC_COUNT + " " + docCount + "\n" + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies); } }
@Override public FieldStats stats(Terms terms, int maxDoc) throws IOException { long minValue = NumericUtils.getMinInt(terms); long maxValue = NumericUtils.getMaxInt(terms); return new FieldStats.Long( maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue); }
@Override public FieldStats stats(Terms terms, int maxDoc) throws IOException { float minValue = NumericUtils.sortableIntToFloat(NumericUtils.getMinInt(terms)); float maxValue = NumericUtils.sortableIntToFloat(NumericUtils.getMaxInt(terms)); return new FieldStats.Float( maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue); }
@Override public FieldStats.Long stats(IndexReader reader) throws IOException { int maxDoc = reader.maxDoc(); Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, name()); if (terms == null) { return null; } long minValue = LegacyNumericUtils.getMinInt(terms); long maxValue = LegacyNumericUtils.getMaxInt(terms); return new FieldStats.Long( maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), isSearchable(), isAggregatable(), minValue, maxValue); }
public static TermsEnum filter( TermsEnum toFilter, Terms terms, AtomicReader reader, Settings settings) throws IOException { int docCount = terms.getDocCount(); if (docCount == -1) { docCount = reader.maxDoc(); } final double minFrequency = settings.getAsDouble("min", 0d); final double maxFrequency = settings.getAsDouble("max", docCount + 1d); final double minSegmentSize = settings.getAsInt("min_segment_size", 0); if (minSegmentSize < docCount) { final int minFreq = minFrequency >= 1.0 ? (int) minFrequency : (int) (docCount * minFrequency); final int maxFreq = maxFrequency >= 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency); assert minFreq < maxFreq; return new FrequencyFilter(toFilter, minFreq, maxFreq); } return toFilter; }
@Override public int getDocCount() throws IOException { return delegateTerms.getDocCount(); }
private void checkAllInfo( int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException { TermVectorsRequestBuilder resp = client() .prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(true) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .setSelectedFields(); assertThat(resp.request().fieldStatistics(), equalTo(true)); TermVectorsResponse response = resp.execute().actionGet(); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); Terms terms = fields.terms("field"); assertThat(terms.size(), equalTo(8l)); assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, Matchers.notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, Matchers.notNullValue()); if (string.equals("the")) { assertThat( "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); } else { assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); } PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); assertThat(iterator.docFreq(), equalTo(numDocs)); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } assertThat(iterator.next(), Matchers.nullValue()); XContentBuilder xBuilder = XContentFactory.jsonBuilder(); xBuilder.startObject(); response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS); xBuilder.endObject(); BytesStream bytesStream = xBuilder.bytesStream(); String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", ""); ; String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; assertThat(utf8, equalTo(expectedString)); }
private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException { Fields memFields = memIndexReader.fields(); for (String field : MultiFields.getFields(other)) { Terms memTerms = memFields.terms(field); Terms iwTerms = memIndexReader.terms(field); if (iwTerms == null) { assertNull(memTerms); } else { NumericDocValues normValues = MultiDocValues.getNormValues(other, field); NumericDocValues memNormValues = memIndexReader.getNormValues(field); if (normValues != null) { // mem idx always computes norms on the fly assertNotNull(memNormValues); assertEquals(normValues.get(0), memNormValues.get(0)); } assertNotNull(memTerms); assertEquals(iwTerms.getDocCount(), memTerms.getDocCount()); assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq()); assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq()); TermsEnum iwTermsIter = iwTerms.iterator(); TermsEnum memTermsIter = memTerms.iterator(); if (iwTerms.hasPositions()) { final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets(); while (iwTermsIter.next() != null) { assertNotNull(memTermsIter.next()); assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL); PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); for (int i = 0; i < iwDocsAndPos.freq(); i++) { assertEquals( "term: " + iwTermsIter.term().utf8ToString(), iwDocsAndPos.nextPosition(), memDocsAndPos.nextPosition()); if (offsets) { assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset()); assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset()); } if (iwTerms.hasPayloads()) { assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload()); } } } } } else { while (iwTermsIter.next() != null) { assertEquals(iwTermsIter.term(), memTermsIter.term()); PostingsEnum iwDocsAndPos = iwTermsIter.postings(null); PostingsEnum memDocsAndPos = memTermsIter.postings(null); while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) { assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc()); assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq()); } } } } } }