private Fields generateTermVectorsFromDoc(TermVectorRequest request, boolean doAllFields)
      throws IOException {
    // parse the document; note that this currently updates the mapping as a side effect, just like percolate
    ParsedDocument parsedDocument =
        parseDocument(indexShard.shardId().getIndex(), request.type(), request.doc());

    // select the right fields and generate term vectors
    ParseContext.Document doc = parsedDocument.rootDoc();
    Collection<String> seenFields = new HashSet<>();
    Collection<GetField> getFields = new HashSet<>();
    for (IndexableField field : doc.getFields()) {
      FieldMapper fieldMapper = indexShard.mapperService().smartNameFieldMapper(field.name());
      if (!seenFields.add(field.name())) {
        continue; // this field name was already handled
      }
      if (!isValidField(fieldMapper)) {
        continue;
      }
      if (request.selectedFields() == null
          && !doAllFields
          && !fieldMapper.fieldType().storeTermVectors()) {
        continue;
      }
      if (request.selectedFields() != null && !request.selectedFields().contains(field.name())) {
        continue;
      }
      String[] values = doc.getValues(field.name());
      getFields.add(new GetField(field.name(), Arrays.asList((Object[]) values)));
    }
    return generateTermVectors(getFields, request.offsets(), request.perFieldAnalyzer());
  }
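  // Illustration (not from the original source): a minimal, hedged sketch of the
  // technique behind the generateTermVectors(...) call above. Field values are fed
  // into an org.apache.lucene.index.memory.MemoryIndex, whose single in-memory
  // document then exposes its postings as a Fields view. The method name, the
  // StandardAnalyzer choice, and the omission of per-field analyzer handling are
  // assumptions of this sketch, not the actual implementation.
  private Fields generateTermVectorsSketch(Collection<GetField> getFields, boolean withOffsets)
      throws IOException {
    MemoryIndex index = new MemoryIndex(withOffsets);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); // assumed analyzer
    for (GetField getField : getFields) {
      for (Object value : getField.getValues()) {
        // each value is analyzed and appended to the in-memory field
        index.addField(getField.getName(), value.toString(), analyzer);
      }
    }
    // the MemoryIndex holds exactly one document, so its postings double as that
    // document's term vectors
    return MultiFields.getFields(index.createSearcher().getIndexReader());
  }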
  public void testDuelGlobalOrdinals() throws Exception {
    Random random = getRandom();
    final int numDocs = scaledRandomIntBetween(10, 1000);
    final int numValues = scaledRandomIntBetween(10, 500);
    final String[] values = new String[numValues];
    for (int i = 0; i < numValues; ++i) {
      values[i] = RandomStrings.randomAsciiOfLength(random, 10);
    }
    for (int i = 0; i < numDocs; i++) {
      Document d = new Document();
      final int numVals = randomInt(3);
      for (int j = 0; j < numVals; ++j) {
        final String value = RandomPicks.randomFrom(random, Arrays.asList(values));
        d.add(new StringField("string", value, Field.Store.NO));
        d.add(new SortedSetDocValuesField("bytes", new BytesRef(value)));
      }
      writer.addDocument(d);
      if (randomInt(10) == 0) {
        refreshReader();
      }
    }
    refreshReader();

    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes);
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")),
        Type.Bytes);
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Bytes);

    for (Map.Entry<FieldDataType, Type> entry : typeMap.entrySet()) {
      ifdService.clear();
      IndexOrdinalsFieldData fieldData =
          getForField(entry.getKey(), entry.getValue().name().toLowerCase(Locale.ROOT));
      RandomAccessOrds left = fieldData.load(readerContext).getOrdinalsValues();
      fieldData.clear();
      RandomAccessOrds right =
          fieldData
              .loadGlobal(topLevelReader)
              .load(topLevelReader.leaves().get(0))
              .getOrdinalsValues();
      assertEquals(left.getValueCount(), right.getValueCount());
      for (long ord = 0; ord < left.getValueCount(); ++ord) {
        assertEquals(left.lookupOrd(ord), right.lookupOrd(ord));
      }
    }
  }
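  // Illustration (not from the original source): what the global-ordinals duel above
  // compares. A hedged sketch of reading one document's segment-local ordinals from
  // SortedSetDocValues; loadGlobal(...) exposes the same lookupOrd() view with
  // ordinals that are consistent across all segments. The lookupOrd(long) -> BytesRef
  // signature is assumed to match the one used in the test above.
  private void printSegmentOrdinals(AtomicReader reader, int docId) throws IOException {
    SortedSetDocValues dv = reader.getSortedSetDocValues("bytes");
    if (dv == null) {
      return; // field has no doc values in this segment
    }
    dv.setDocument(docId);
    for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
      BytesRef value = dv.lookupOrd(ord); // segment-local ordinal -> term bytes
      System.out.println(ord + " -> " + value.utf8ToString());
    }
  }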
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number; must lie within [0, maxDoc) and must not refer to a deleted
  *     document (deleted documents are rejected below, even though an index that has not been
  *     optimized yet may still contain postings data from deleted documents)
  * @return reconstructed document
  * @throws Exception if the document number is out of range or the document is deleted
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
    if (docNum < 0 || docNum >= reader.maxDoc()) { // valid document numbers are [0, maxDoc)
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
            TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
            if (offsets == null || offsets.length == 0) { // offsets may also be unstored
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
      TermsEnum te = terms.iterator(null);
      while (te.next() != null) {
        // docNum was already verified to be live, so no liveDocs filter is needed here;
        // passing the deleted-docs bits as liveDocs would have inverted the filter
        DocsAndPositionsEnum dpe = te.docsAndPositions(null, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
        GrowableStringArray gsa = res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }
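 // Illustration (not from the original source): a hypothetical caller of
 // reconstruct(...) above. The Reconstructed accessors are taken from this snippet;
 // the method name and the use of toString() on GrowableStringArray are assumptions.
 public void printReconstructed(int docNum) throws Exception {
   Reconstructed rec = reconstruct(docNum);
   // stored fields come back verbatim from the stored-fields reader
   for (Map.Entry<String, Field[]> e : rec.getStoredFields().entrySet()) {
     System.out.println("stored " + e.getKey() + ": " + e.getValue().length + " value(s)");
   }
   // unstored fields are approximations rebuilt from term vectors or postings:
   // terms re-assembled in position order, with alternatives at the same position
   // separated by "|" (see the gsa.append(pos, "|", term) calls above)
   for (Map.Entry<String, GrowableStringArray> e : rec.getReconstructedFields().entrySet()) {
     System.out.println("reconstructed " + e.getKey() + ": " + e.getValue());
   }
 }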
  @Test
  public void testDuelAllTypesSingleValue() throws Exception {
    final String mapping =
        XContentFactory.jsonBuilder()
            .startObject()
            .startObject("type")
            .startObject("properties")
            .startObject("bytes")
            .field("type", "string")
            .field("index", "not_analyzed")
            .startObject("fielddata")
            .field("format", LuceneTestCase.defaultCodecSupportsSortedSet() ? "doc_values" : "fst")
            .endObject()
            .endObject()
            .startObject("byte")
            .field("type", "byte")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("short")
            .field("type", "short")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("integer")
            .field("type", "integer")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("long")
            .field("type", "long")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("float")
            .field("type", "float")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("double")
            .field("type", "double")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .string();
    final DocumentMapper mapper = mapperService.documentMapperParser().parse(mapping);
    Random random = getRandom();
    int atLeast = scaledRandomIntBetween(1000, 1500);
    for (int i = 0; i < atLeast; i++) {
      String s = Integer.toString(randomByte());

      XContentBuilder doc = XContentFactory.jsonBuilder().startObject();
      for (String fieldName :
          Arrays.asList("bytes", "byte", "short", "integer", "long", "float", "double")) {
        doc = doc.field(fieldName, s);
      }

      doc = doc.endObject();

      final ParsedDocument d = mapper.parse("type", Integer.toString(i), doc.bytes());

      writer.addDocument(d.rootDoc());

      if (random.nextInt(10) == 0) {
        refreshReader();
      }
    }
    AtomicReaderContext context = refreshReader();
    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes);
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")),
        Type.Bytes);
    typeMap.put(
        new FieldDataType("byte", ImmutableSettings.builder().put("format", "array")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("short", ImmutableSettings.builder().put("format", "array")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("int", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(
        new FieldDataType("long", ImmutableSettings.builder().put("format", "array")), Type.Long);
    typeMap.put(
        new FieldDataType("double", ImmutableSettings.builder().put("format", "array")),
        Type.Double);
    typeMap.put(
        new FieldDataType("float", ImmutableSettings.builder().put("format", "array")), Type.Float);
    typeMap.put(
        new FieldDataType("byte", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("short", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("int", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("long", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Long);
    typeMap.put(
        new FieldDataType("double", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Double);
    typeMap.put(
        new FieldDataType("float", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Float);
    typeMap.put(
        new FieldDataType("string", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Bytes);
    ArrayList<Entry<FieldDataType, Type>> list = new ArrayList<>(typeMap.entrySet());
    Preprocessor pre = new ToDoublePreprocessor();
    while (!list.isEmpty()) {
      Entry<FieldDataType, Type> left;
      Entry<FieldDataType, Type> right;
      if (list.size() > 1) {
        left = list.remove(random.nextInt(list.size()));
        right = list.remove(random.nextInt(list.size()));
      } else {
        right = left = list.remove(0);
      }

      ifdService.clear();
      IndexFieldData<?> leftFieldData =
          getForField(left.getKey(), left.getValue().name().toLowerCase(Locale.ROOT));
      ifdService.clear();
      IndexFieldData<?> rightFieldData =
          getForField(right.getKey(), right.getValue().name().toLowerCase(Locale.ROOT));
      duelFieldDataBytes(random, context, leftFieldData, rightFieldData, pre);
      duelFieldDataBytes(random, context, rightFieldData, leftFieldData, pre);

      DirectoryReader perSegment = DirectoryReader.open(writer, true);
      CompositeReaderContext composite = perSegment.getContext();
      List<AtomicReaderContext> leaves = composite.leaves();
      for (AtomicReaderContext atomicReaderContext : leaves) {
        duelFieldDataBytes(random, atomicReaderContext, leftFieldData, rightFieldData, pre);
      }
    }
  }
  @Test
  public void testDuelIntegers() throws Exception {
    final String mapping =
        XContentFactory.jsonBuilder()
            .startObject()
            .startObject("type")
            .startObject("properties")
            .startObject("byte")
            .field("type", "byte")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("short")
            .field("type", "short")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("integer")
            .field("type", "integer")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .startObject("long")
            .field("type", "long")
            .startObject("fielddata")
            .field("format", "doc_values")
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .endObject()
            .string();

    final DocumentMapper mapper = mapperService.documentMapperParser().parse(mapping);
    Random random = getRandom();
    int atLeast = scaledRandomIntBetween(1000, 1500);
    final int maxNumValues = randomBoolean() ? 1 : randomIntBetween(2, 40);
    byte[] values = new byte[maxNumValues];
    for (int i = 0; i < atLeast; i++) {
      int numValues = randomInt(maxNumValues);
      // field data does not preserve duplicated values, so deduplicate them for this test
      Set<Byte> vals = new HashSet<Byte>();
      for (int j = 0; j < numValues; ++j) {
        vals.add(randomByte());
      }

      numValues = vals.size();
      int upto = 0;
      for (Byte bb : vals) {
        values[upto++] = bb.byteValue();
      }

      XContentBuilder doc = XContentFactory.jsonBuilder().startObject();
      for (String fieldName : Arrays.asList("byte", "short", "integer", "long")) {
        doc = doc.startArray(fieldName);
        for (int j = 0; j < numValues; ++j) {
          doc = doc.value(values[j]);
        }
        doc = doc.endArray();
      }
      doc = doc.endObject();

      final ParsedDocument d = mapper.parse("type", Integer.toString(i), doc.bytes());

      writer.addDocument(d.rootDoc());
      if (random.nextInt(10) == 0) {
        refreshReader();
      }
    }
    AtomicReaderContext context = refreshReader();
    Map<FieldDataType, Type> typeMap = new HashMap<>();
    typeMap.put(
        new FieldDataType("byte", ImmutableSettings.builder().put("format", "array")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("short", ImmutableSettings.builder().put("format", "array")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("int", ImmutableSettings.builder().put("format", "array")), Type.Integer);
    typeMap.put(
        new FieldDataType("long", ImmutableSettings.builder().put("format", "array")), Type.Long);
    typeMap.put(
        new FieldDataType("byte", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("short", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("int", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Integer);
    typeMap.put(
        new FieldDataType("long", ImmutableSettings.builder().put("format", "doc_values")),
        Type.Long);
    ArrayList<Entry<FieldDataType, Type>> list = new ArrayList<>(typeMap.entrySet());
    while (!list.isEmpty()) {
      Entry<FieldDataType, Type> left;
      Entry<FieldDataType, Type> right;
      if (list.size() > 1) {
        left = list.remove(random.nextInt(list.size()));
        right = list.remove(random.nextInt(list.size()));
      } else {
        right = left = list.remove(0);
      }
      ifdService.clear();
      IndexNumericFieldData leftFieldData =
          getForField(left.getKey(), left.getValue().name().toLowerCase(Locale.ROOT));
      ifdService.clear();
      IndexNumericFieldData rightFieldData =
          getForField(right.getKey(), right.getValue().name().toLowerCase(Locale.ROOT));

      duelFieldDataLong(random, context, leftFieldData, rightFieldData);
      duelFieldDataLong(random, context, rightFieldData, leftFieldData);

      DirectoryReader perSegment = DirectoryReader.open(writer, true);
      CompositeReaderContext composite = perSegment.getContext();
      List<AtomicReaderContext> leaves = composite.leaves();
      for (AtomicReaderContext atomicReaderContext : leaves) {
        duelFieldDataLong(random, atomicReaderContext, leftFieldData, rightFieldData);
      }
    }
  }
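  // Illustration (not from the original source): a hedged sketch of what a
  // duelFieldDataLong(...)-style comparison boils down to, assuming the ES 1.x
  // field-data API in which load(context).getLongValues() returns a
  // SortedNumericDocValues view. The method name is invented for this sketch.
  private void duelLongsSketch(
      AtomicReaderContext context, IndexNumericFieldData left, IndexNumericFieldData right) {
    SortedNumericDocValues leftValues = left.load(context).getLongValues();
    SortedNumericDocValues rightValues = right.load(context).getLongValues();
    for (int doc = 0; doc < context.reader().maxDoc(); doc++) {
      leftValues.setDocument(doc);
      rightValues.setDocument(doc);
      // both implementations must expose the same number of values per document...
      assertEquals(leftValues.count(), rightValues.count());
      for (int i = 0; i < leftValues.count(); i++) {
        // ...and the values themselves, in sorted order, must match
        assertEquals(leftValues.valueAt(i), rightValues.valueAt(i));
      }
    }
  }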
  protected void validateResponse(
      TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
      throws IOException {
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields =
        testConfig.selectedFields == null
            ? null
            : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
      Terms esTerms = esTermVectorFields.terms(field.name);
      if (selectedFields != null && !selectedFields.contains(field.name)) {
        assertNull(esTerms);
        continue;
      }

      assertNotNull(esTerms);

      Terms luceneTerms = luceneFields.terms(field.name);
      TermsEnum esTermEnum = esTerms.iterator(null);
      TermsEnum luceneTermEnum = luceneTerms.iterator(null);

      while (esTermEnum.next() != null) {
        assertNotNull(luceneTermEnum.next());

        assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
        DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
        DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
        if (luceneDocsPosEnum == null) {
          // in that case we expect that no positions, offsets, or payloads were stored
          assertFalse(field.storedOffset);
          assertFalse(field.storedPayloads);
          assertFalse(field.storedPositions);
          continue;
        }

        String currentTerm = esTermEnum.term().utf8ToString();

        assertThat(
            "Token mismatch for field: " + field.name,
            currentTerm,
            equalTo(luceneTermEnum.term().utf8ToString()));

        esDocsPosEnum.nextDoc();
        luceneDocsPosEnum.nextDoc();

        int freq = esDocsPosEnum.freq();
        assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
        for (int i = 0; i < freq; i++) {
          String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
          int lucenePos = luceneDocsPosEnum.nextPosition();
          int esPos = esDocsPosEnum.nextPosition();
          if (field.storedPositions && testConfig.requestPositions) {
            assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
          } else {
            assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
          }
          if (field.storedOffset && testConfig.requestOffsets) {
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.startOffset(),
                equalTo(esDocsPosEnum.startOffset()));
            assertThat(
                "Offset test failed" + failDesc,
                luceneDocsPosEnum.endOffset(),
                equalTo(esDocsPosEnum.endOffset()));
          } else {
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
            assertThat(
                "Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
          }
          if (field.storedPayloads && testConfig.requestPayloads) {
            assertThat(
                "Payload test failed" + failDesc,
                luceneDocsPosEnum.getPayload(),
                equalTo(esDocsPosEnum.getPayload()));
          } else {
            assertThat(
                "Missing payload test failed" + failDesc,
                esDocsPosEnum.getPayload(),
                equalTo(null));
          }
        }
      }

      assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
    }
  }
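  // Illustration (not from the original source): a hedged sketch of obtaining the
  // TermVectorResponse validated above through the ES 1.x Java client. The index,
  // type, and id values are placeholders, and the fluent flag setters are assumed
  // to mirror the request accessors (offsets(), etc.) used earlier in this code.
  private Fields fetchTermVectors(Client client) throws IOException {
    TermVectorRequest request = new TermVectorRequest("test", "type", "1");
    request.offsets(true).positions(true).payloads(true).termStatistics(true);
    TermVectorResponse response = client.termVector(request).actionGet();
    return response.getFields();
  }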