コード例 #1
0
ファイル: IndexMerger.java プロジェクト: Poneyo/druid-1
  private static File makeIndexFiles(
      final List<IndexableAdapter> indexes,
      final File outDir,
      final ProgressIndicator progress,
      final List<String> mergedDimensions,
      final List<String> mergedMetrics,
      final Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn)
      throws IOException {
    Map<String, String> metricTypes = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
    for (IndexableAdapter adapter : indexes) {
      for (String metric : adapter.getAvailableMetrics()) {
        metricTypes.put(metric, adapter.getMetricType(metric));
      }
    }
    final Interval dataInterval;

    /** *********** Main index.drd file ************* */
    progress.progress();
    long startTime = System.currentTimeMillis();
    File indexFile = new File(outDir, "index.drd");

    FileOutputStream fileOutputStream = null;
    FileChannel channel = null;
    try {
      fileOutputStream = new FileOutputStream(indexFile);
      channel = fileOutputStream.getChannel();
      channel.write(ByteBuffer.wrap(new byte[] {IndexIO.CURRENT_VERSION_ID}));

      GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.stringStrategy)
          .writeToChannel(channel);
      GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.stringStrategy)
          .writeToChannel(channel);

      DateTime minTime = new DateTime(Long.MAX_VALUE);
      DateTime maxTime = new DateTime(0l);

      for (IndexableAdapter index : indexes) {
        minTime = JodaUtils.minDateTime(minTime, index.getDataInterval().getStart());
        maxTime = JodaUtils.maxDateTime(maxTime, index.getDataInterval().getEnd());
      }

      dataInterval = new Interval(minTime, maxTime);
      serializerUtils.writeString(channel, String.format("%s/%s", minTime, maxTime));
    } finally {
      Closeables.closeQuietly(channel);
      channel = null;
      Closeables.closeQuietly(fileOutputStream);
      fileOutputStream = null;
    }
    IndexIO.checkFileSize(indexFile);
    log.info(
        "outDir[%s] completed index.drd in %,d millis.",
        outDir, System.currentTimeMillis() - startTime);

    /** *********** Setup Dim Conversions ************* */
    progress.progress();
    startTime = System.currentTimeMillis();

    IOPeon ioPeon = new TmpFileIOPeon();
    ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
    Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
    ArrayList<Map<String, IntBuffer>> dimConversions =
        Lists.newArrayListWithCapacity(indexes.size());

    for (IndexableAdapter index : indexes) {
      dimConversions.add(Maps.<String, IntBuffer>newHashMap());
    }

    for (String dimension : mergedDimensions) {
      final FlattenedArrayWriter<String> writer =
          new FlattenedArrayWriter<String>(ioPeon, dimension, GenericIndexed.stringStrategy);
      writer.open();

      List<Indexed<String>> dimValueLookups = Lists.newArrayListWithCapacity(indexes.size());
      DimValueConverter[] converters = new DimValueConverter[indexes.size()];
      for (int i = 0; i < indexes.size(); i++) {
        Indexed<String> dimValues = indexes.get(i).getDimValueLookup(dimension);
        if (dimValues != null) {
          dimValueLookups.add(dimValues);
          converters[i] = new DimValueConverter(dimValues);
        }
      }

      Iterable<String> dimensionValues =
          CombiningIterable.createSplatted(
              Iterables.transform(
                  dimValueLookups,
                  new Function<Indexed<String>, Iterable<String>>() {
                    @Override
                    public Iterable<String> apply(@Nullable Indexed<String> indexed) {
                      return Iterables.transform(
                          indexed,
                          new Function<String, String>() {
                            @Override
                            public String apply(@Nullable String input) {
                              return (input == null) ? "" : input;
                            }
                          });
                    }
                  }),
              Ordering.<String>natural().nullsFirst());

      int count = 0;
      for (String value : dimensionValues) {
        value = value == null ? "" : value;
        writer.write(value);

        for (int i = 0; i < indexes.size(); i++) {
          DimValueConverter converter = converters[i];
          if (converter != null) {
            converter.convert(value, count);
          }
        }

        ++count;
      }
      dimensionCardinalities.put(dimension, count);

      FileOutputSupplier dimOut =
          new FileOutputSupplier(IndexIO.makeDimFile(outDir, dimension), true);
      dimOuts.add(dimOut);

      writer.close();
      serializerUtils.writeString(dimOut, dimension);
      ByteStreams.copy(writer.combineStreams(), dimOut);
      for (int i = 0; i < indexes.size(); ++i) {
        DimValueConverter converter = converters[i];
        if (converter != null) {
          dimConversions.get(i).put(dimension, converters[i].getConversionBuffer());
        }
      }

      ioPeon.cleanup();
    }
    log.info(
        "outDir[%s] completed dim conversions in %,d millis.",
        outDir, System.currentTimeMillis() - startTime);

    /** *********** Walk through data sets and merge them ************ */
    progress.progress();
    startTime = System.currentTimeMillis();

    ArrayList<Iterable<Rowboat>> boats = Lists.newArrayListWithCapacity(indexes.size());

    for (int i = 0; i < indexes.size(); ++i) {
      final IndexableAdapter adapter = indexes.get(i);

      final int[] dimLookup = new int[mergedDimensions.size()];
      int count = 0;
      for (String dim : adapter.getAvailableDimensions()) {
        dimLookup[count] = mergedDimensions.indexOf(dim.toLowerCase());
        count++;
      }

      final int[] metricLookup = new int[mergedMetrics.size()];
      count = 0;
      for (String metric : adapter.getAvailableMetrics()) {
        metricLookup[count] = mergedMetrics.indexOf(metric);
        count++;
      }

      boats.add(
          new MMappedIndexRowIterable(
              Iterables.transform(
                  indexes.get(i).getRows(),
                  new Function<Rowboat, Rowboat>() {
                    @Override
                    public Rowboat apply(@Nullable Rowboat input) {
                      int[][] newDims = new int[mergedDimensions.size()][];
                      int j = 0;
                      for (int[] dim : input.getDims()) {
                        newDims[dimLookup[j]] = dim;
                        j++;
                      }

                      Object[] newMetrics = new Object[mergedMetrics.size()];
                      j = 0;
                      for (Object met : input.getMetrics()) {
                        newMetrics[metricLookup[j]] = met;
                        j++;
                      }

                      return new Rowboat(
                          input.getTimestamp(), newDims, newMetrics, input.getRowNum());
                    }
                  }),
              mergedDimensions,
              dimConversions.get(i),
              i));
    }

    Iterable<Rowboat> theRows = rowMergerFn.apply(boats);

    CompressedLongsSupplierSerializer littleEndianTimeWriter =
        CompressedLongsSupplierSerializer.create(
            ioPeon, "little_end_time", ByteOrder.LITTLE_ENDIAN);
    CompressedLongsSupplierSerializer bigEndianTimeWriter =
        CompressedLongsSupplierSerializer.create(ioPeon, "big_end_time", ByteOrder.BIG_ENDIAN);

    littleEndianTimeWriter.open();
    bigEndianTimeWriter.open();

    ArrayList<VSizeIndexedWriter> forwardDimWriters =
        Lists.newArrayListWithCapacity(mergedDimensions.size());
    for (String dimension : mergedDimensions) {
      VSizeIndexedWriter writer =
          new VSizeIndexedWriter(ioPeon, dimension, dimensionCardinalities.get(dimension));
      writer.open();
      forwardDimWriters.add(writer);
    }

    ArrayList<MetricColumnSerializer> metWriters =
        Lists.newArrayListWithCapacity(mergedMetrics.size());
    for (Map.Entry<String, String> entry : metricTypes.entrySet()) {
      String metric = entry.getKey();
      String typeName = entry.getValue();
      if ("float".equals(typeName)) {
        metWriters.add(new FloatMetricColumnSerializer(metric, outDir, ioPeon));
      } else {
        ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(typeName);

        if (serde == null) {
          throw new ISE("Unknown type[%s]", typeName);
        }

        metWriters.add(new ComplexMetricColumnSerializer(metric, outDir, ioPeon, serde));
      }
    }
    for (MetricColumnSerializer metWriter : metWriters) {
      metWriter.open();
    }

    int rowCount = 0;
    long time = System.currentTimeMillis();
    List<IntBuffer> rowNumConversions = Lists.newArrayListWithCapacity(indexes.size());
    for (IndexableAdapter index : indexes) {
      int[] arr = new int[index.getNumRows()];
      Arrays.fill(arr, INVALID_ROW);
      rowNumConversions.add(IntBuffer.wrap(arr));
    }

    for (Rowboat theRow : theRows) {
      progress.progress();
      littleEndianTimeWriter.add(theRow.getTimestamp());
      bigEndianTimeWriter.add(theRow.getTimestamp());

      final Object[] metrics = theRow.getMetrics();
      for (int i = 0; i < metrics.length; ++i) {
        metWriters.get(i).serialize(metrics[i]);
      }

      int[][] dims = theRow.getDims();
      for (int i = 0; i < dims.length; ++i) {
        List<Integer> listToWrite =
            (i >= dims.length || dims[i] == null) ? null : Ints.asList(dims[i]);
        forwardDimWriters.get(i).write(listToWrite);
      }

      for (Map.Entry<Integer, TreeSet<Integer>> comprisedRow :
          theRow.getComprisedRows().entrySet()) {
        final IntBuffer conversionBuffer = rowNumConversions.get(comprisedRow.getKey());

        for (Integer rowNum : comprisedRow.getValue()) {
          while (conversionBuffer.position() < rowNum) {
            conversionBuffer.put(INVALID_ROW);
          }
          conversionBuffer.put(rowCount);
        }
      }

      if ((++rowCount % 500000) == 0) {
        log.info(
            "outDir[%s] walked 500,000/%,d rows in %,d millis.",
            outDir, rowCount, System.currentTimeMillis() - time);
        time = System.currentTimeMillis();
      }
    }

    for (IntBuffer rowNumConversion : rowNumConversions) {
      rowNumConversion.rewind();
    }

    final File littleEndianFile = IndexIO.makeTimeFile(outDir, ByteOrder.LITTLE_ENDIAN);
    littleEndianFile.delete();
    OutputSupplier<FileOutputStream> out = Files.newOutputStreamSupplier(littleEndianFile, true);
    littleEndianTimeWriter.closeAndConsolidate(out);
    IndexIO.checkFileSize(littleEndianFile);

    final File bigEndianFile = IndexIO.makeTimeFile(outDir, ByteOrder.BIG_ENDIAN);
    bigEndianFile.delete();
    out = Files.newOutputStreamSupplier(bigEndianFile, true);
    bigEndianTimeWriter.closeAndConsolidate(out);
    IndexIO.checkFileSize(bigEndianFile);

    for (int i = 0; i < mergedDimensions.size(); ++i) {
      forwardDimWriters.get(i).close();
      ByteStreams.copy(forwardDimWriters.get(i).combineStreams(), dimOuts.get(i));
    }

    for (MetricColumnSerializer metWriter : metWriters) {
      metWriter.close();
    }

    ioPeon.cleanup();
    log.info(
        "outDir[%s] completed walk through of %,d rows in %,d millis.",
        outDir, rowCount, System.currentTimeMillis() - startTime);

    /** ********** Create Inverted Indexes ************ */
    startTime = System.currentTimeMillis();

    final File invertedFile = new File(outDir, "inverted.drd");
    Files.touch(invertedFile);
    out = Files.newOutputStreamSupplier(invertedFile, true);
    for (int i = 0; i < mergedDimensions.size(); ++i) {
      long dimStartTime = System.currentTimeMillis();
      String dimension = mergedDimensions.get(i);

      File dimOutFile = dimOuts.get(i).getFile();
      final MappedByteBuffer dimValsMapped = Files.map(dimOutFile);

      if (!dimension.equals(serializerUtils.readString(dimValsMapped))) {
        throw new ISE("dimensions[%s] didn't equate!?  This is a major WTF moment.", dimension);
      }
      Indexed<String> dimVals =
          GenericIndexed.readFromByteBuffer(dimValsMapped, GenericIndexed.stringStrategy);
      log.info("Starting dimension[%s] with cardinality[%,d]", dimension, dimVals.size());

      FlattenedArrayWriter<ImmutableConciseSet> writer =
          new FlattenedArrayWriter<ImmutableConciseSet>(
              ioPeon, dimension, ConciseCompressedIndexedInts.objectStrategy);
      writer.open();

      for (String dimVal : IndexedIterable.create(dimVals)) {
        progress.progress();
        List<Iterable<Integer>> convertedInverteds = Lists.newArrayListWithCapacity(indexes.size());
        for (int j = 0; j < indexes.size(); ++j) {
          convertedInverteds.add(
              new ConvertingIndexedInts(
                  indexes.get(j).getInverteds(dimension, dimVal), rowNumConversions.get(j)));
        }

        ConciseSet bitset = new ConciseSet();
        for (Integer row :
            CombiningIterable.createSplatted(
                convertedInverteds, Ordering.<Integer>natural().nullsFirst())) {
          if (row != INVALID_ROW) {
            bitset.add(row);
          }
        }

        writer.write(ImmutableConciseSet.newImmutableFromMutable(bitset));
      }
      writer.close();

      serializerUtils.writeString(out, dimension);
      ByteStreams.copy(writer.combineStreams(), out);
      ioPeon.cleanup();

      log.info(
          "Completed dimension[%s] in %,d millis.",
          dimension, System.currentTimeMillis() - dimStartTime);
    }
    log.info(
        "outDir[%s] completed inverted.drd in %,d millis.",
        outDir, System.currentTimeMillis() - startTime);

    final ArrayList<String> expectedFiles =
        Lists.newArrayList(
            Iterables.concat(
                Arrays.asList(
                    "index.drd", "inverted.drd", "time_BIG_ENDIAN.drd", "time_LITTLE_ENDIAN.drd"),
                Iterables.transform(mergedDimensions, GuavaUtils.formatFunction("dim_%s.drd")),
                Iterables.transform(
                    mergedMetrics, GuavaUtils.formatFunction("met_%s_LITTLE_ENDIAN.drd")),
                Iterables.transform(
                    mergedMetrics, GuavaUtils.formatFunction("met_%s_BIG_ENDIAN.drd"))));

    Map<String, File> files = Maps.newLinkedHashMap();
    for (String fileName : expectedFiles) {
      files.put(fileName, new File(outDir, fileName));
    }

    File smooshDir = new File(outDir, "smoosher");
    smooshDir.mkdir();

    for (Map.Entry<String, File> entry : Smoosh.smoosh(outDir, smooshDir, files).entrySet()) {
      entry.getValue().delete();
    }

    for (File file : smooshDir.listFiles()) {
      Files.move(file, new File(outDir, file.getName()));
    }

    if (!smooshDir.delete()) {
      log.info(
          "Unable to delete temporary dir[%s], contains[%s]",
          smooshDir, Arrays.asList(smooshDir.listFiles()));
      throw new IOException(String.format("Unable to delete temporary dir[%s]", smooshDir));
    }

    createIndexDrdFile(
        IndexIO.CURRENT_VERSION_ID,
        outDir,
        GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.stringStrategy),
        GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.stringStrategy),
        dataInterval);

    return outDir;
  }
コード例 #2
0
ファイル: TestIndex.java プロジェクト: heptagon/druid
  public static Index convertMMapToIndex(MMappedIndex mmappedIndex) {
    Indexed<String> dimsIndexed = mmappedIndex.getAvailableDimensions();
    String[] dimensions = new String[dimsIndexed.size()];
    for (int i = 0; i < dimsIndexed.size(); ++i) {
      dimensions[i] = dimsIndexed.get(i);
    }

    Indexed<String> metricsIndexed = mmappedIndex.getAvailableMetrics();
    String[] metrics = new String[metricsIndexed.size()];
    for (int i = 0; i < metricsIndexed.size(); ++i) {
      metrics[i] = metricsIndexed.get(i);
    }

    IndexedLongs timeBuf = mmappedIndex.getReadOnlyTimestamps();
    long[] timestamps = new long[timeBuf.size()];
    timeBuf.fill(0, timestamps);
    Closeables.closeQuietly(timeBuf);

    Map<String, MetricHolder> metricVals = Maps.newLinkedHashMap();
    for (String metric : metrics) {
      MetricHolder holder = mmappedIndex.getMetricHolder(metric);
      switch (holder.getType()) {
        case FLOAT:
          IndexedFloats mmappedFloats = holder.getFloatType();
          float[] metricValsArray = new float[mmappedFloats.size()];
          mmappedFloats.fill(0, metricValsArray);
          Closeables.closeQuietly(mmappedFloats);

          metricVals.put(
              metric,
              MetricHolder.floatMetric(
                  metric,
                  CompressedFloatsIndexedSupplier.fromFloatBuffer(
                      FloatBuffer.wrap(metricValsArray), ByteOrder.nativeOrder())));
          break;
        case COMPLEX:
          Indexed complexObjects = holder.getComplexType();
          Object[] vals = new Object[complexObjects.size()];
          for (int i = 0; i < complexObjects.size(); ++i) {
            vals[i] = complexObjects.get(i);
          }

          final ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(holder.getTypeName());
          if (serde == null) {
            throw new ISE("Unknown type[%s]", holder.getTypeName());
          }

          metricVals.put(
              metric,
              MetricHolder.complexMetric(
                  metric,
                  holder.getTypeName(),
                  new ArrayIndexed(vals, serde.getObjectStrategy().getClazz())));
          break;
      }
    }

    Map<String, Map<String, Integer>> dimIdLookup = Maps.newHashMap();
    Map<String, String[]> reverseDimLookup = Maps.newHashMap();
    Map<String, ImmutableConciseSet[]> invertedIndexesMap = Maps.newHashMap();
    Map<String, DimensionColumn> dimensionColumns = Maps.newHashMap();

    for (String dimension : dimensions) {
      final Indexed<String> dimValueLookup = mmappedIndex.getDimValueLookup(dimension);
      String[] values = new String[dimValueLookup.size()];
      for (int i = 0; i < dimValueLookup.size(); ++i) {
        values[i] = dimValueLookup.get(i);
      }

      Map<String, Integer> lookupMap = Maps.newHashMapWithExpectedSize(dimValueLookup.size());
      for (int i = 0; i < values.length; i++) {
        lookupMap.put(values[i], i);
      }

      ImmutableConciseSet[] invertedIndexes = new ImmutableConciseSet[values.length];
      final Indexed<String> dimValuesIndexed = mmappedIndex.getDimValueLookup(dimension);
      for (int i = 0; i < dimValuesIndexed.size(); ++i) {
        invertedIndexes[i] = mmappedIndex.getInvertedIndex(dimension, dimValuesIndexed.get(i));
      }

      int[] dimValues = new int[timestamps.length];
      Map<List<Integer>, Integer> rowGroupings = Maps.newHashMap();
      final Indexed<? extends IndexedInts> dimColumn = mmappedIndex.getDimColumn(dimension);
      for (int i = 0; i < dimColumn.size(); ++i) {
        int[] expansionValue = Indexedids.arrayFromIndexedInts(dimColumn.get(i));
        Integer value = rowGroupings.get(Ints.asList(expansionValue));
        if (value == null) {
          value = rowGroupings.size();
          rowGroupings.put(Ints.asList(expansionValue), value);
        }
        dimValues[i] = value;
      }

      int[][] expansionValues = new int[rowGroupings.size()][];
      for (Map.Entry<List<Integer>, Integer> entry : rowGroupings.entrySet()) {
        expansionValues[entry.getValue()] = Ints.toArray(entry.getKey());
      }

      dimIdLookup.put(dimension, lookupMap);
      reverseDimLookup.put(dimension, values);
      invertedIndexesMap.put(dimension, invertedIndexes);
      dimensionColumns.put(dimension, new DimensionColumn(expansionValues, dimValues));
    }

    return new Index(
        dimensions,
        metrics,
        mmappedIndex.getDataInterval(),
        timestamps,
        metricVals,
        dimIdLookup,
        reverseDimLookup,
        invertedIndexesMap,
        dimensionColumns);
  }