/**
 * Indexes one document through an alias and verifies that the term vectors API
 * resolves the alias to the concrete index and returns all 8 distinct terms of
 * the field.
 */
public void testGetTermVector() throws IOException {
  createIndexWithAlias();
  // Map "field" with full term vectors (positions + offsets + payloads).
  assertAcked(
      client()
          .admin()
          .indices()
          .preparePutMapping("test")
          .setType("type1")
          .setSource("field", "type=text,term_vector=with_positions_offsets_payloads")
          .get());
  ensureYellow("test");

  client()
      .prepareIndex(indexOrAlias(), "type1", "1")
      .setSource("field", "the quick brown fox jumps over the lazy dog")
      .get();
  refresh();

  TermVectorsResponse tvResponse =
      client().prepareTermVectors(indexOrAlias(), "type1", "1").get();
  // The response must report the concrete index name, not the alias.
  assertThat(tvResponse.getIndex(), equalTo("test"));
  assertThat(tvResponse.isExists(), equalTo(true));

  Fields tvFields = tvResponse.getFields();
  assertThat(tvFields.size(), equalTo(1));
  // 9 tokens, but "the" appears twice -> 8 distinct terms.
  assertThat(tvFields.terms("field").size(), equalTo(8L));
}
private void checkAllInfo( int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i) throws IOException { TermVectorsRequestBuilder resp = client() .prepareTermVectors("test", "type1", Integer.toString(i)) .setPayloads(true) .setOffsets(true) .setPositions(true) .setFieldStatistics(true) .setTermStatistics(true) .setSelectedFields(); assertThat(resp.request().fieldStatistics(), equalTo(true)); TermVectorsResponse response = resp.execute().actionGet(); assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); Terms terms = fields.terms("field"); assertThat(terms.size(), equalTo(8l)); assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs))); assertThat(terms.getDocCount(), Matchers.equalTo(numDocs)); assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length)); TermsEnum iterator = terms.iterator(); for (int j = 0; j < values.length; j++) { String string = values[j]; BytesRef next = iterator.next(); assertThat(next, Matchers.notNullValue()); assertThat("expected " + string, string, equalTo(next.utf8ToString())); assertThat(next, Matchers.notNullValue()); if (string.equals("the")) { assertThat( "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq())); } else { assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq())); } PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL); assertThat(docsAndPositions.nextDoc(), equalTo(0)); assertThat(freq[j], equalTo(docsAndPositions.freq())); assertThat(iterator.docFreq(), equalTo(numDocs)); int[] termPos = pos[j]; int[] termStartOffset = startOffset[j]; int[] termEndOffset = endOffset[j]; assertThat(termPos.length, equalTo(freq[j])); assertThat(termStartOffset.length, equalTo(freq[j])); assertThat(termEndOffset.length, equalTo(freq[j])); for (int k = 0; k < 
freq[j]; k++) { int nextPosition = docsAndPositions.nextPosition(); assertThat("term: " + string, nextPosition, equalTo(termPos[k])); assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); } } assertThat(iterator.next(), Matchers.nullValue()); XContentBuilder xBuilder = XContentFactory.jsonBuilder(); xBuilder.startObject(); response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS); xBuilder.endObject(); BytesStream bytesStream = xBuilder.bytesStream(); String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", ""); ; String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":
\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}"; assertThat(utf8, equalTo(expectedString)); }
/**
 * Command-line index inspector. Opens the index named by {@code -index <dir>},
 * then processes the remaining arguments in order, dispatching to the various
 * {@code -list-*} dump commands.
 *
 * <p>Fixed: removed the many stray empty statements ({@code } ;}) scattered
 * through the original, and unified the single-argument advance to {@code i++}.
 *
 * @param args command-line arguments; {@code -index <dir>} is required
 * @throws IOException if the index cannot be read
 */
public static void main(String[] args) throws IOException {
  IndexReader reader = null;

  /*
   * Opening the index first simplifies the processing of the
   * rest of the command line arguments.
   */
  for (int i = 0; i < args.length; i++) {
    if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
      reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));
      // NOTE(review): DirectoryReader.open throws rather than returning null,
      // so this guard is likely dead code; kept to preserve behavior.
      if (reader == null) {
        System.err.println("Error: Can't open index " + args[i + 1]);
        System.exit(1);
      }
      break;
    }
  }

  if (reader == null) {
    System.err.println(usage);
    System.exit(1);
  }

  /*
   * Process the command line arguments sequentially.
   */
  for (int i = 0; i < args.length; i++) {
    if ("-index".equals(args[i])) {
      /*
       * Handled in the previous loop, so just skip the argument.
       */
      i++;
    } else if ("-list-edocid".equals(args[i])) {
      System.out.println("-list-edocid:");
      if ((i + 1) >= args.length) {
        System.out.println(usage);
        break;
      }
      Document d = reader.document(Integer.parseInt(args[i + 1]));
      System.out.println(
          "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));
      i++;
    } else if ("-list-docids".equals(args[i])) {
      System.out.println("-list-docids:");
      for (int j = 0; j < reader.numDocs(); j++) {
        Document d = reader.document(j);
        System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
      }
    } else if ("-list-fields".equals(args[i])) {
      Fields fields = MultiFields.getFields(reader);
      System.out.print("\nNumber of fields: ");
      if (fields == null) {
        System.out.println("0");
      } else {
        System.out.println(fields.size());
        Iterator<String> is = fields.iterator();
        while (is.hasNext()) {
          System.out.println("\t" + is.next());
        }
      }
    } else if ("-list-postings".equals(args[i])) {
      if ((i + 2) >= args.length) {
        System.out.println(usage);
        break;
      }
      listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
      i += 2;
    } else if ("-list-postings-sample".equals(args[i])) {
      if ((i + 2) >= args.length) {
        System.out.println(usage);
        break;
      }
      // Same as -list-postings but capped at 5 postings per term.
      listPostings(reader, args[i + 1], args[i + 2], 5);
      i += 2;
    } else if ("-list-stats".equals(args[i])) {
      System.out.println("Corpus statistics:");
      System.out.println("\tnumdocs\t\t" + reader.numDocs());
      System.out.println(
          "\turl:\t"
              + "\tnumdocs="
              + reader.getDocCount("url")
              + "\tsumTotalTF="
              + reader.getSumTotalTermFreq("url")
              + "\tavglen="
              + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));
      System.out.println(
          "\tkeywords:"
              + "\tnumdocs="
              + reader.getDocCount("keywords")
              + "\tsumTotalTF="
              + reader.getSumTotalTermFreq("keywords")
              + "\tavglen="
              + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));
      System.out.println(
          "\ttitle:\t"
              + "\tnumdocs="
              + reader.getDocCount("title")
              + "\tsumTotalTF="
              + reader.getSumTotalTermFreq("title")
              + "\tavglen="
              + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));
      System.out.println(
          "\tbody:\t"
              + "\tnumdocs="
              + reader.getDocCount("body")
              + "\tsumTotalTF="
              + reader.getSumTotalTermFreq("body")
              + "\tavglen="
              + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));
      System.out.println(
          "\tinlink:\t"
              + "\tnumdocs="
              + reader.getDocCount("inlink")
              + "\tsumTotalTF="
              + reader.getSumTotalTermFreq("inlink")
              + "\tavglen="
              + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));
    } else if ("-list-terms".equals(args[i])) {
      if ((i + 1) >= args.length) {
        System.out.println(usage);
        break;
      }
      listTermDictionary(reader, args[i + 1]);
      i++;
    } else if ("-list-termvector".equals(args[i])) {
      if ((i + 1) >= args.length) {
        System.out.println(usage);
        break;
      }
      listTermVectors(reader, args[i + 1]);
      i++;
    } else if ("-list-termvector-field".equals(args[i])) {
      if ((i + 2) >= args.length) {
        System.out.println(usage);
        break;
      }
      listTermVectorField(reader, args[i + 1], args[i + 2]);
      i += 2;
    } else {
      System.err.println("\nWarning: Unknown argument " + args[i] + " ignored.");
    }
  }

  /*
   * Close the index and exit gracefully.
   */
  reader.close();
}
/**
 * Asserts that two {@link Fields} instances contain the same fields, terms,
 * frequencies, positions and (when available) offsets.
 *
 * <p>A null {@code d1} is accepted as equivalent to a null or empty {@code d2}.
 * Doc IDs are deliberately NOT compared (the two sources may assign different
 * internal IDs); everything else is stepped through in lockstep.
 *
 * @param d1 expected fields (may be null)
 * @param d2 actual fields, compared field-by-field against {@code d1}
 * @throws IOException on iteration failure
 */
public static void verifyEquals(Fields d1, Fields d2) throws IOException {
  if (d1 == null) {
    // null is treated the same as "no fields at all".
    assertTrue(d2 == null || d2.size() == 0);
    return;
  }
  assertTrue(d2 != null);

  // Walk both field iterators in parallel; field names must match pairwise.
  Iterator<String> fieldsEnum2 = d2.iterator();

  for (String field1 : d1) {
    String field2 = fieldsEnum2.next();
    assertEquals(field1, field2);

    Terms terms1 = d1.terms(field1);
    assertNotNull(terms1);
    TermsEnum termsEnum1 = terms1.iterator(null);

    Terms terms2 = d2.terms(field2);
    assertNotNull(terms2);
    TermsEnum termsEnum2 = terms2.iterator(null);

    // Reused across terms so the enums can be recycled by docsAndPositions/docs.
    DocsAndPositionsEnum dpEnum1 = null;
    DocsAndPositionsEnum dpEnum2 = null;
    DocsEnum dEnum1 = null;
    DocsEnum dEnum2 = null;

    BytesRef term1;
    while ((term1 = termsEnum1.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertEquals(term1, term2);
      assertEquals(termsEnum1.totalTermFreq(), termsEnum2.totalTermFreq());

      dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1);
      dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2);
      if (dpEnum1 != null) {
        // Positions are available: both sides must have them.
        assertNotNull(dpEnum2);
        int docID1 = dpEnum1.nextDoc();
        dpEnum2.nextDoc();
        // docIDs are not supposed to be equal
        // int docID2 = dpEnum2.nextDoc();
        // assertEquals(docID1, docID2);
        assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

        int freq1 = dpEnum1.freq();
        int freq2 = dpEnum2.freq();
        assertEquals(freq1, freq2);

        // Offsets (if indexed) are exposed via an OffsetAttribute on the enum;
        // either both sides expose it or neither does.
        OffsetAttribute offsetAtt1 =
            dpEnum1.attributes().hasAttribute(OffsetAttribute.class)
                ? dpEnum1.attributes().getAttribute(OffsetAttribute.class)
                : null;
        OffsetAttribute offsetAtt2 =
            dpEnum2.attributes().hasAttribute(OffsetAttribute.class)
                ? dpEnum2.attributes().getAttribute(OffsetAttribute.class)
                : null;

        if (offsetAtt1 != null) {
          assertNotNull(offsetAtt2);
        } else {
          assertNull(offsetAtt2);
        }

        // Compare every position (and offsets, when present) for this doc.
        for (int posUpto = 0; posUpto < freq1; posUpto++) {
          int pos1 = dpEnum1.nextPosition();
          int pos2 = dpEnum2.nextPosition();
          assertEquals(pos1, pos2);
          if (offsetAtt1 != null) {
            assertEquals(offsetAtt1.startOffset(), offsetAtt2.startOffset());
            assertEquals(offsetAtt1.endOffset(), offsetAtt2.endOffset());
          }
        }
        // Term vectors hold exactly one document per term.
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc());
      } else {
        // No positions indexed: fall back to comparing doc frequencies only.
        dEnum1 = TestUtil.docs(random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
        dEnum2 = TestUtil.docs(random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
        assertNotNull(dEnum1);
        assertNotNull(dEnum2);
        int docID1 = dEnum1.nextDoc();
        dEnum2.nextDoc();
        // docIDs are not supposed to be equal
        // int docID2 = dEnum2.nextDoc();
        // assertEquals(docID1, docID2);
        assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

        int freq1 = dEnum1.freq();
        int freq2 = dEnum2.freq();
        assertEquals(freq1, freq2);

        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum1.nextDoc());
        assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum2.nextDoc());
      }
    }

    // d2 must not have extra terms beyond those in d1.
    assertNull(termsEnum2.next());
  }

  // d2 must not have extra fields beyond those in d1.
  assertFalse(fieldsEnum2.hasNext());
}
/**
 * Safe (but, slowish) default method to write every vector field in the document. This default
 * implementation requires that the vectors implement both Fields.size and Terms.size.
 *
 * <p>Fields are visited in name order (asserted below); for each field the writer
 * callbacks are invoked as startField -> (startTerm -> addPosition*)* in term order.
 * The startField call is deferred until the first position of the first term is
 * known, because whether positions were indexed can only be detected from that
 * first value (see the "tricky" note below).
 */
protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
  if (vectors == null) {
    // No vectors for this document: still emit an (empty) document entry.
    startDocument(0);
    return;
  }

  final int numFields = vectors.size();
  if (numFields == -1) {
    throw new IllegalStateException("vectors.size() must be implemented (it returned -1)");
  }
  startDocument(numFields);

  final FieldsEnum fieldsEnum = vectors.iterator();
  String fieldName;
  String lastFieldName = null;

  while ((fieldName = fieldsEnum.next()) != null) {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);

    // Fields must arrive in strictly increasing name order.
    assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
        : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
    lastFieldName = fieldName;

    final Terms terms = fieldsEnum.terms();
    if (terms == null) {
      // FieldsEnum shouldn't lie...
      continue;
    }

    final int numTerms = (int) terms.size();
    if (numTerms == -1) {
      throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
    }
    final TermsEnum termsEnum = terms.iterator(null);
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    boolean startedField = false;

    // NOTE: this is tricky, because TermVectors allow
    // indexing offsets but NOT positions.  So we must
    // lazily init the field by checking whether first
    // position we see is -1 or not.

    int termCount = 0;
    while (termsEnum.next() != null) {
      termCount++;

      // For term vectors, totalTermFreq is the within-document frequency.
      final int freq = (int) termsEnum.totalTermFreq();

      if (startedField) {
        startTerm(termsEnum.term(), freq);
      }
      // else: startTerm for the FIRST term is issued below, right after the
      // deferred startField, once we know whether positions exist.

      // TODO: we need a "query" API where we can ask (via
      // flex API) what this term was indexed with...
      // Both positions & offsets:
      docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
      final boolean hasOffsets;
      boolean hasPositions = false;
      if (docsAndPositionsEnum == null) {
        // Fallback: no offsets
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
        hasOffsets = false;
      } else {
        hasOffsets = true;
      }

      if (docsAndPositionsEnum != null) {
        // A term vector's postings contain exactly one document.
        final int docID = docsAndPositionsEnum.nextDoc();
        assert docID != DocIdSetIterator.NO_MORE_DOCS;
        assert docsAndPositionsEnum.freq() == freq;

        for (int posUpto = 0; posUpto < freq; posUpto++) {
          final int pos = docsAndPositionsEnum.nextPosition();
          if (!startedField) {
            assert numTerms > 0;
            // First position observed: -1 means offsets-only (no positions).
            hasPositions = pos != -1;
            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
            startTerm(termsEnum.term(), freq);
            startedField = true;
          }
          final int startOffset;
          final int endOffset;
          if (hasOffsets) {
            startOffset = docsAndPositionsEnum.startOffset();
            endOffset = docsAndPositionsEnum.endOffset();
            assert startOffset != -1;
            assert endOffset != -1;
          } else {
            startOffset = -1;
            endOffset = -1;
          }
          assert !hasPositions || pos >= 0;
          addPosition(pos, startOffset, endOffset);
        }
      } else {
        // Neither positions nor offsets were indexed for this field.
        if (!startedField) {
          assert numTerms > 0;
          startField(fieldInfo, numTerms, hasPositions, hasOffsets);
          startTerm(termsEnum.term(), freq);
          startedField = true;
        }
      }
    }
    assert termCount == numTerms;
  }
}