private Fields generateTermVectorsFromDoc(TermVectorRequest request, boolean doAllFields) throws IOException {
    // parse the document, at the moment we do update the mapping, just like percolate
    ParsedDocument parsedDocument = parseDocument(indexShard.shardId().getIndex(), request.type(), request.doc());
    // select the right fields and generate term vectors
    ParseContext.Document doc = parsedDocument.rootDoc();
    Collection<String> seenFields = new HashSet<>();
    Collection<GetField> getFields = new HashSet<>();
    for (IndexableField field : doc.getFields()) {
        FieldMapper fieldMapper = indexShard.mapperService().smartNameFieldMapper(field.name());
        if (seenFields.contains(field.name())) {
            continue;
        } else {
            seenFields.add(field.name());
        }
        if (!isValidField(fieldMapper)) {
            continue;
        }
        if (request.selectedFields() == null && !doAllFields && !fieldMapper.fieldType().storeTermVectors()) {
            continue;
        }
        if (request.selectedFields() != null && !request.selectedFields().contains(field.name())) {
            continue;
        }
        String[] values = doc.getValues(field.name());
        getFields.add(new GetField(field.name(), Arrays.asList((Object[]) values)));
    }
    return generateTermVectors(getFields, request.offsets(), request.perFieldAnalyzer());
}
public void assertDeleteContent(Store store, DirectoryService service) throws IOException {
    store.deleteContent();
    assertThat(Arrays.toString(store.directory().listAll()), store.directory().listAll().length, equalTo(0));
    assertThat(store.stats().sizeInBytes(), equalTo(0L));
    for (Directory dir : service.build()) {
        assertThat(dir.listAll().length, equalTo(0));
    }
}
public void testDuelGlobalOrdinals() throws Exception {
    Random random = getRandom();
    final int numDocs = scaledRandomIntBetween(10, 1000);
    final int numValues = scaledRandomIntBetween(10, 500);
    final String[] values = new String[numValues];
    for (int i = 0; i < numValues; ++i) {
        values[i] = RandomStrings.randomAsciiOfLength(random, 10);
    }
    for (int i = 0; i < numDocs; i++) {
        Document d = new Document();
        final int numVals = randomInt(3);
        for (int j = 0; j < numVals; ++j) {
            final String value = RandomPicks.randomFrom(random, Arrays.asList(values));
            d.add(new StringField("string", value, Field.Store.NO));
            d.add(new SortedSetDocValuesField("bytes", new BytesRef(value)));
        }
        writer.addDocument(d);
        if (randomInt(10) == 0) {
            refreshReader();
        }
    }
    refreshReader();

    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes);
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")), Type.Bytes);
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "doc_values")), Type.Bytes);

    for (Map.Entry<FieldDataType, Type> entry : typeMap.entrySet()) {
        ifdService.clear();
        IndexOrdinalsFieldData fieldData = getForField(entry.getKey(), entry.getValue().name().toLowerCase(Locale.ROOT));
        RandomAccessOrds left = fieldData.load(readerContext).getOrdinalsValues();
        fieldData.clear();
        RandomAccessOrds right = fieldData.loadGlobal(topLevelReader)
                .load(topLevelReader.leaves().get(0))
                .getOrdinalsValues();
        assertEquals(left.getValueCount(), right.getValueCount());
        for (long ord = 0; ord < left.getValueCount(); ++ord) {
            assertEquals(left.lookupOrd(ord), right.lookupOrd(ord));
        }
    }
}
public void seek(TermEnum terms) throws IOException {
    original.seek(terms);
    docFreq = terms.docFreq();
    pointer = -1;
    if (docFreq > postingMaps.length) { // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
            newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
    }

    out.reset();

    int i = 0;
    while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer();     // save pointer to buffer

        final int tf = original.freq();        // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) {         // delta encode positions
            int p = original.nextPosition();
            out.writeVInt(p - prevPosition);
            prevPosition = p;
        }
    }
    out.flush();
    docFreq = i; // allow for deletions

    Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
    // HeapSorter.sort(postingMaps, docFreq); // TODO MC - due to the lack of space

    // NOTE: this might be substantially faster if RAMInputStream were public
    // and supported a reset() operation.
    in = tempDir.openInput(TEMP_FILE);
}
/**
 * Reconstruct document fields.
 *
 * @param docNum document number. If this document is deleted but the index is not optimized yet,
 *     the reconstruction process may still yield the reconstructed field content, even from
 *     deleted documents.
 * @return reconstructed document
 * @throws Exception if the document number is out of range or the document is deleted
 */
public Reconstructed reconstruct(int docNum) throws Exception {
    // valid document numbers are 0 .. maxDoc() - 1
    if (docNum < 0 || docNum >= reader.maxDoc()) {
        throw new Exception("Document number outside of valid range.");
    }
    Reconstructed res = new Reconstructed();
    if (deleted != null && deleted.get(docNum)) {
        throw new Exception("Document is deleted.");
    } else {
        Document doc = reader.document(docNum);
        for (int i = 0; i < fieldNames.length; i++) {
            Field[] fs = doc.getFields(fieldNames[i]);
            if (fs != null && fs.length > 0) {
                res.getStoredFields().put(fieldNames[i], fs);
            }
        }
    }
    // collect values from unstored fields
    HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
    // try to use term vectors if available
    progress.maxValue = fieldNames.length;
    progress.curValue = 0;
    progress.minValue = 0;
    for (int i = 0; i < fieldNames.length; i++) {
        TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
        if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
            TermPositionVector tpv = (TermPositionVector) tvf;
            progress.message = "Reading term vectors ...";
            progress.curValue = i;
            setChanged();
            notifyObservers(progress);
            BytesRef[] tv = tpv.getTerms();
            for (int k = 0; k < tv.length; k++) {
                // do we have positions?
                int[] posArr = tpv.getTermPositions(k);
                if (posArr == null) {
                    // only offsets
                    TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
                    if (offsets.length == 0) {
                        continue;
                    }
                    // convert offsets into positions
                    posArr = convertOffsets(offsets);
                }
                GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
                if (gsa == null) {
                    gsa = new GrowableStringArray();
                    res.getReconstructedFields().put(fieldNames[i], gsa);
                }
                for (int m = 0; m < posArr.length; m++) {
                    gsa.append(posArr[m], "|", tv[k].utf8ToString());
                }
            }
            fields.remove(fieldNames[i]); // got what we wanted
        }
    }
    // this loop collects data only from left-over fields
    // not yet collected through term vectors
    progress.maxValue = fields.size();
    progress.curValue = 0;
    progress.minValue = 0;
    for (String fld : fields) {
        progress.message = "Collecting terms in " + fld + " ...";
        progress.curValue++;
        setChanged();
        notifyObservers(progress);
        Terms terms = MultiFields.getTerms(reader, fld);
        if (terms == null) { // no terms in this field
            continue;
        }
        TermsEnum te = terms.iterator();
        while (te.next() != null) {
            DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
            if (dpe == null) { // no position info for this field
                break;
            }
            int num = dpe.advance(docNum);
            if (num != docNum) { // either greater than docNum or NO_MORE_DOCS
                continue; // no data for this term in this doc
            }
            String term = te.term().utf8ToString();
            GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
            if (gsa == null) {
                gsa = new GrowableStringArray();
                res.getReconstructedFields().put(fld, gsa);
            }
            for (int k = 0; k < dpe.freq(); k++) {
                int pos = dpe.nextPosition();
                gsa.append(pos, "|", term);
            }
        }
    }
    progress.message = "Done.";
    progress.curValue = 100;
    setChanged();
    notifyObservers(progress);
    return res;
}
@Test
public void testDuelAllTypesSingleValue() throws Exception {
    final String mapping = XContentFactory.jsonBuilder()
            .startObject()
            .startObject("type")
            .startObject("properties")
            .startObject("bytes")
            .field("type", "string")
            .field("index", "not_analyzed")
            .startObject("fielddata")
            .field("format", LuceneTestCase.defaultCodecSupportsSortedSet() ? "doc_values" : "fst")
            .endObject()
            .endObject()
            .startObject("byte")
            .field("type", "byte")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("short")
            .field("type", "short")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("integer")
            .field("type", "integer")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("long")
            .field("type", "long")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("float")
            .field("type", "float")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("double")
            .field("type", "double")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .string();
    final DocumentMapper mapper = mapperService.documentMapperParser().parse(mapping);
    Random random = getRandom();
    int atLeast = scaledRandomIntBetween(1000, 1500);
    for (int i = 0; i < atLeast; i++) {
        String s = Integer.toString(randomByte());
        XContentBuilder doc = XContentFactory.jsonBuilder().startObject();
        for (String fieldName : Arrays.asList("bytes", "byte", "short", "integer", "long", "float", "double")) {
            doc = doc.field(fieldName, s);
        }
        doc = doc.endObject();
        final ParsedDocument d = mapper.parse("type", Integer.toString(i), doc.bytes());
        writer.addDocument(d.rootDoc());
        if (random.nextInt(10) == 0) {
            refreshReader();
        }
    }
    AtomicReaderContext context = refreshReader();

    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes);
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")), Type.Bytes);
    typeMap.put(new FieldDataType("byte", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("short", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("int", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("long", ImmutableSettings.builder().put("format", "array")), Type.Long);
    typeMap.put(new FieldDataType("double", ImmutableSettings.builder().put("format", "array")), Type.Double);
    typeMap.put(new FieldDataType("float", ImmutableSettings.builder().put("format", "array")), Type.Float);
    typeMap.put(new FieldDataType("byte", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("short", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("int", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("long", ImmutableSettings.builder().put("format", "doc_values")), Type.Long);
    typeMap.put(new FieldDataType("double", ImmutableSettings.builder().put("format", "doc_values")), Type.Double);
    typeMap.put(new FieldDataType("float", ImmutableSettings.builder().put("format", "doc_values")), Type.Float);
    typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "doc_values")), Type.Bytes);

    ArrayList<Entry<FieldDataType, Type>> list = new ArrayList<>(typeMap.entrySet());
    Preprocessor pre = new ToDoublePreprocessor();
    while (!list.isEmpty()) {
        Entry<FieldDataType, Type> left;
        Entry<FieldDataType, Type> right;
        if (list.size() > 1) {
            left = list.remove(random.nextInt(list.size()));
            right = list.remove(random.nextInt(list.size()));
        } else {
            right = left = list.remove(0);
        }
        ifdService.clear();
        IndexFieldData<?> leftFieldData = getForField(left.getKey(), left.getValue().name().toLowerCase(Locale.ROOT));
        ifdService.clear();
        IndexFieldData<?> rightFieldData = getForField(right.getKey(), right.getValue().name().toLowerCase(Locale.ROOT));
        duelFieldDataBytes(random, context, leftFieldData, rightFieldData, pre);
        duelFieldDataBytes(random, context, rightFieldData, leftFieldData, pre);

        DirectoryReader perSegment = DirectoryReader.open(writer, true);
        CompositeReaderContext composite = perSegment.getContext();
        List<AtomicReaderContext> leaves = composite.leaves();
        for (AtomicReaderContext atomicReaderContext : leaves) {
            duelFieldDataBytes(random, atomicReaderContext, leftFieldData, rightFieldData, pre);
        }
    }
}
@Test
public void testDuelIntegers() throws Exception {
    final String mapping = XContentFactory.jsonBuilder()
            .startObject()
            .startObject("type")
            .startObject("properties")
            .startObject("byte")
            .field("type", "byte")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("short")
            .field("type", "short")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("integer")
            .field("type", "integer")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("long")
            .field("type", "long")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .string();
    final DocumentMapper mapper = mapperService.documentMapperParser().parse(mapping);
    Random random = getRandom();
    int atLeast = scaledRandomIntBetween(1000, 1500);
    final int maxNumValues = randomBoolean() ? 1 : randomIntBetween(2, 40);
    byte[] values = new byte[maxNumValues];
    for (int i = 0; i < atLeast; i++) {
        int numValues = randomInt(maxNumValues);
        // FD loses values if they are duplicated, so we must deduplicate for this test
        Set<Byte> vals = new HashSet<Byte>();
        for (int j = 0; j < numValues; ++j) {
            vals.add(randomByte());
        }
        numValues = vals.size();
        int upto = 0;
        for (Byte bb : vals) {
            values[upto++] = bb.byteValue();
        }
        XContentBuilder doc = XContentFactory.jsonBuilder().startObject();
        for (String fieldName : Arrays.asList("byte", "short", "integer", "long")) {
            doc = doc.startArray(fieldName);
            for (int j = 0; j < numValues; ++j) {
                doc = doc.value(values[j]);
            }
            doc = doc.endArray();
        }
        doc = doc.endObject();
        final ParsedDocument d = mapper.parse("type", Integer.toString(i), doc.bytes());
        writer.addDocument(d.rootDoc());
        if (random.nextInt(10) == 0) {
            refreshReader();
        }
    }
    AtomicReaderContext context = refreshReader();

    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(new FieldDataType("byte", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("short", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("int", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(new FieldDataType("long", ImmutableSettings.builder().put("format", "array")), Type.Long);
    typeMap.put(new FieldDataType("byte", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("short", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("int", ImmutableSettings.builder().put("format", "doc_values")), Type.Integer);
    typeMap.put(new FieldDataType("long", ImmutableSettings.builder().put("format", "doc_values")), Type.Long);

    ArrayList<Entry<FieldDataType, Type>> list = new ArrayList<>(typeMap.entrySet());
    while (!list.isEmpty()) {
        Entry<FieldDataType, Type> left;
        Entry<FieldDataType, Type> right;
        if (list.size() > 1) {
            left = list.remove(random.nextInt(list.size()));
            right = list.remove(random.nextInt(list.size()));
        } else {
            right = left = list.remove(0);
        }
        ifdService.clear();
        IndexNumericFieldData leftFieldData = getForField(left.getKey(), left.getValue().name().toLowerCase(Locale.ROOT));
        ifdService.clear();
        IndexNumericFieldData rightFieldData = getForField(right.getKey(), right.getValue().name().toLowerCase(Locale.ROOT));
        duelFieldDataLong(random, context, leftFieldData, rightFieldData);
        duelFieldDataLong(random, context, rightFieldData, leftFieldData);

        DirectoryReader perSegment = DirectoryReader.open(writer, true);
        CompositeReaderContext composite = perSegment.getContext();
        List<AtomicReaderContext> leaves = composite.leaves();
        for (AtomicReaderContext atomicReaderContext : leaves) {
            duelFieldDataLong(random, atomicReaderContext, leftFieldData, rightFieldData);
        }
    }
}
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    // byte[] boosts = reader.norms("site"); TODO MC

    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String[] includeExtensions = includes.split(value);
    Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
    for (int i = 0; i < includeExtensions.length; i++) {
        validExtensions.put(includeExtensions[i], true);
        System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
        float score;
        if (reader.isDeleted(oldDoc)) {
            // score = 0.0f;
            score = -1f; // TODO MC
        } else {
            // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
            /* TODO MC */
            docMeta = searcher.doc(oldDoc);
            if (validExtensions.get(docMeta.get("subType")) == null) {
                // searched extensions will have higher scores
                score = -0.5f;
            } else {
                score = Integer.parseInt(docMeta.get("inlinks"));
                /*
                if (score == 0) {
                    score = 0.001f; // TODO MC - to not erase
                }
                */
            }
            /* TODO MC */
            // System.out.println("Score for old document " + oldDoc + " is " + score
            //     + " and type " + docMeta.get("subType")); // TODO MC debug remove
        }
        DocScore docScore = new DocScore();
        docScore.doc = oldDoc;
        docScore.score = score;
        newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
        DocScore docScore = newToOld[newDoc];
        // oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
        oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    TODO MC */

    /* TODO MC
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
        DocScore docScore = newToOld[newDoc];
        System.out.println("Score for new document " + newDoc + " is " + docScore.score); // TODO MC debug remove
    }
    TODO MC */

    // return oldToNew; TODO MC
    return newToOld; // TODO MC
}
protected void validateResponse(TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
        throws IOException {
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields = testConfig.selectedFields == null
            ? null
            : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
        Terms esTerms = esTermVectorFields.terms(field.name);
        if (selectedFields != null && !selectedFields.contains(field.name)) {
            assertNull(esTerms);
            continue;
        }
        assertNotNull(esTerms);
        Terms luceneTerms = luceneFields.terms(field.name);
        TermsEnum esTermEnum = esTerms.iterator(null);
        TermsEnum luceneTermEnum = luceneTerms.iterator(null);
        while (esTermEnum.next() != null) {
            assertNotNull(luceneTermEnum.next());
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
            DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
            if (luceneDocsPosEnum == null) {
                // test we expect that...
                assertFalse(field.storedOffset);
                assertFalse(field.storedPayloads);
                assertFalse(field.storedPositions);
                continue;
            }
            String currentTerm = esTermEnum.term().utf8ToString();
            assertThat("Token mismatch for field: " + field.name, currentTerm,
                    equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (field.storedPositions && testConfig.requestPositions) {
                    assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
                } else {
                    assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
                }
                if (field.storedOffset && testConfig.requestOffsets) {
                    assertThat("Offset test failed" + failDesc,
                            luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
                    assertThat("Offset test failed" + failDesc,
                            luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat("Missing offset test failed" + failDesc,
                            esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat("Missing offset test failed" + failDesc,
                            esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (field.storedPayloads && testConfig.requestPayloads) {
                    assertThat("Payload test failed" + failDesc,
                            luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat("Missing payload test failed" + failDesc,
                            esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
        assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
}