/**
 * Offers the next value of the merged dictionary to this converter and, when it matches the
 * converter's own current dictionary entry, records {@code index} for that entry.
 *
 * <p>The converter walks {@code dimSet} front to back using {@code currIndex}. Null entries of
 * {@code dimSet} are stepped over by moving the position of {@code conversionBuf} forward
 * without writing anything. Once the cursor has reached the end of {@code dimSet}, the value
 * seen at that moment is remembered in {@code lastVal}; every later call only verifies that
 * the offered values keep strictly increasing.
 *
 * @param value the value being offered
 * @param index the number stored in {@code conversionBuf} when {@code value} matches the
 *     current {@code dimSet} entry
 * @throws ISE if {@code value} is not greater than {@code lastVal} after {@code dimSet} has
 *     been exhausted, or if the current {@code dimSet} entry sorts before {@code value}
 */
public void convert(String value, int index) {
  // Nothing to map for an empty dictionary.
  if (dimSet.size() == 0) {
    return;
  }

  // Dictionary already exhausted: only sanity-check the ordering of what is offered.
  if (lastVal != null) {
    if (value.compareTo(lastVal) > 0) {
      return;
    }
    throw new ISE(
        "Value[%s] is less than the last value[%s] I have, cannot be.", value, lastVal);
  }

  // Step over null dictionary entries, leaving their buffer slots untouched.
  String currValue = dimSet.get(currIndex);
  while (currValue == null) {
    conversionBuf.position(conversionBuf.position() + 1);
    ++currIndex;
    if (currIndex == dimSet.size()) {
      lastVal = value;
      return;
    }
    currValue = dimSet.get(currIndex);
  }

  // currValue is non-null here, so a plain equals() is enough.
  if (currValue.equals(value)) {
    conversionBuf.put(index);
    ++currIndex;
    if (currIndex == dimSet.size()) {
      lastVal = value;
    }
    return;
  }

  // The current entry should never sort before the offered value; that would mean it was
  // passed over without being matched.
  if (currValue.compareTo(value) < 0) {
    throw new ISE(
        "Skipped currValue[%s], currIndex[%,d]; incoming value[%s], index[%,d]",
        currValue, currIndex, value, index);
  }
}
/**
 * Returns the number of entries in the value lookup of {@code dimension}, or {@code 0} when
 * the underlying index has no lookup for that dimension.
 *
 * @param dimension name of the dimension to inspect
 * @return size of the dimension's value lookup, or {@code 0} if the lookup is {@code null}
 */
@Override
public int getDimensionCardinality(String dimension) {
  final Indexed<String> lookup = index.getDimValueLookup(dimension);
  return (lookup != null) ? lookup.size() : 0;
}
DimValueConverter(Indexed<String> dimSet) { this.dimSet = dimSet; conversionBuf = ByteBuffer.allocateDirect(dimSet.size() * Ints.BYTES).asIntBuffer(); currIndex = 0; }
/**
 * Writes the merged form of {@code indexes} into {@code outDir} and returns {@code outDir}.
 *
 * <p>The work is done in phases, most of which log their elapsed time:
 *
 * <ol>
 *   <li>Write {@code index.drd}: version byte, merged dimension names, merged metric names and
 *       an interval string built from the earliest start and latest end of the inputs.
 *   <li>For each merged dimension, write the combined value dictionary to that dimension's
 *       file and build, per input index, a conversion buffer via {@code DimValueConverter}.
 *   <li>Hand the per-index row iterables to {@code rowMergerFn}, then walk the merged rows
 *       writing timestamps (little- and big-endian), metric columns and forward dimension
 *       columns, while recording for each input index which merged row each of its rows
 *       ended up in.
 *   <li>Write {@code inverted.drd}: per dimension value, a concise bitmap of merged row
 *       numbers.
 *   <li>Smoosh the expected output files, delete the files reported by the smoosher, move the
 *       smoosh directory's contents into {@code outDir} and call {@code createIndexDrdFile}.
 * </ol>
 *
 * @param indexes the input indexes to merge
 * @param outDir directory that receives all output files; also the return value
 * @param progress notified via {@code progress()} at phase boundaries, once per merged row and
 *     once per dimension value while building inverted indexes
 * @param mergedDimensions dimension names of the merged index; positions in this list are
 *     used as dimension slots in merged rows
 * @param mergedMetrics metric names of the merged index; positions in this list are used as
 *     metric slots in merged rows
 * @param rowMergerFn combines the per-index row iterables into the final row sequence
 * @return {@code outDir}
 * @throws IOException if the temporary smoosh directory cannot be deleted, or from any of the
 *     underlying file operations
 * @throws ISE if a non-float metric type has no registered serde, or if the dimension name
 *     read back from a dimension file does not match the expected one
 */
private static File makeIndexFiles(
    final List<IndexableAdapter> indexes,
    final File outDir,
    final ProgressIndicator progress,
    final List<String> mergedDimensions,
    final List<String> mergedMetrics,
    final Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn)
    throws IOException {
  // Collect metric name -> type name across all inputs; later inputs overwrite earlier ones.
  // The TreeMap keeps the keys in natural (nulls-first) order.
  Map<String, String> metricTypes = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
  for (IndexableAdapter adapter : indexes) {
    for (String metric : adapter.getAvailableMetrics()) {
      metricTypes.put(metric, adapter.getMetricType(metric));
    }
  }
  final Interval dataInterval;

  // ===== Phase 1: main index.drd file =====
  progress.progress();
  long startTime = System.currentTimeMillis();
  File indexFile = new File(outDir, "index.drd");
  FileOutputStream fileOutputStream = null;
  FileChannel channel = null;
  try {
    fileOutputStream = new FileOutputStream(indexFile);
    channel = fileOutputStream.getChannel();
    channel.write(ByteBuffer.wrap(new byte[] {IndexIO.CURRENT_VERSION_ID}));
    GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.stringStrategy)
        .writeToChannel(channel);
    GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.stringStrategy)
        .writeToChannel(channel);
    // The merged interval spans from the earliest start to the latest end of all inputs.
    DateTime minTime = new DateTime(Long.MAX_VALUE);
    DateTime maxTime = new DateTime(0l);
    for (IndexableAdapter index : indexes) {
      minTime = JodaUtils.minDateTime(minTime, index.getDataInterval().getStart());
      maxTime = JodaUtils.maxDateTime(maxTime, index.getDataInterval().getEnd());
    }
    dataInterval = new Interval(minTime, maxTime);
    serializerUtils.writeString(channel, String.format("%s/%s", minTime, maxTime));
  } finally {
    // NOTE(review): closeQuietly on an output stream presumably suppresses close failures;
    // confirm that a failed close cannot leave a truncated index.drd unnoticed.
    Closeables.closeQuietly(channel);
    channel = null;
    Closeables.closeQuietly(fileOutputStream);
    fileOutputStream = null;
  }
  IndexIO.checkFileSize(indexFile);
  log.info(
      "outDir[%s] completed index.drd in %,d millis.",
      outDir, System.currentTimeMillis() - startTime);

  // ===== Phase 2: set up dimension value conversions =====
  progress.progress();
  startTime = System.currentTimeMillis();
  IOPeon ioPeon = new TmpFileIOPeon();
  ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
  Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
  // One (dimension name -> conversion buffer) map per input index, in input order.
  ArrayList<Map<String, IntBuffer>> dimConversions =
      Lists.newArrayListWithCapacity(indexes.size());
  for (IndexableAdapter index : indexes) {
    dimConversions.add(Maps.<String, IntBuffer>newHashMap());
  }
  for (String dimension : mergedDimensions) {
    final FlattenedArrayWriter<String> writer =
        new FlattenedArrayWriter<String>(ioPeon, dimension, GenericIndexed.stringStrategy);
    writer.open();
    // converters[i] stays null for inputs that have no lookup for this dimension.
    List<Indexed<String>> dimValueLookups = Lists.newArrayListWithCapacity(indexes.size());
    DimValueConverter[] converters = new DimValueConverter[indexes.size()];
    for (int i = 0; i < indexes.size(); i++) {
      Indexed<String> dimValues = indexes.get(i).getDimValueLookup(dimension);
      if (dimValues != null) {
        dimValueLookups.add(dimValues);
        converters[i] = new DimValueConverter(dimValues);
      }
    }
    // Combine all per-index dictionaries into one stream, mapping null values to "".
    // Presumably createSplatted performs a sorted, de-duplicating merge — confirm against
    // CombiningIterable.
    Iterable<String> dimensionValues =
        CombiningIterable.createSplatted(
            Iterables.transform(
                dimValueLookups,
                new Function<Indexed<String>, Iterable<String>>() {
                  @Override
                  public Iterable<String> apply(@Nullable Indexed<String> indexed) {
                    return Iterables.transform(
                        indexed,
                        new Function<String, String>() {
                          @Override
                          public String apply(@Nullable String input) {
                            return (input == null) ? "" : input;
                          }
                        });
                  }
                }),
            Ordering.<String>natural().nullsFirst());
    // count is the position of each value in the merged dictionary; every converter is offered
    // every merged value together with that position.
    int count = 0;
    for (String value : dimensionValues) {
      value = value == null ? "" : value;
      writer.write(value);
      for (int i = 0; i < indexes.size(); i++) {
        DimValueConverter converter = converters[i];
        if (converter != null) {
          converter.convert(value, count);
        }
      }
      ++count;
    }
    dimensionCardinalities.put(dimension, count);
    // The dimension file starts with the dimension name followed by the merged dictionary.
    FileOutputSupplier dimOut =
        new FileOutputSupplier(IndexIO.makeDimFile(outDir, dimension), true);
    dimOuts.add(dimOut);
    writer.close();
    serializerUtils.writeString(dimOut, dimension);
    ByteStreams.copy(writer.combineStreams(), dimOut);
    for (int i = 0; i < indexes.size(); ++i) {
      DimValueConverter converter = converters[i];
      if (converter != null) {
        dimConversions.get(i).put(dimension, converters[i].getConversionBuffer());
      }
    }
    ioPeon.cleanup();
  }
  log.info(
      "outDir[%s] completed dim conversions in %,d millis.",
      outDir, System.currentTimeMillis() - startTime);

  // ===== Phase 3: walk through the data sets and merge them =====
  progress.progress();
  startTime = System.currentTimeMillis();
  ArrayList<Iterable<Rowboat>> boats = Lists.newArrayListWithCapacity(indexes.size());
  for (int i = 0; i < indexes.size(); ++i) {
    final IndexableAdapter adapter = indexes.get(i);
    // dimLookup[k] / metricLookup[k]: slot in the merged row for the adapter's k-th
    // dimension / metric. Dimension names are lower-cased before the lookup, metric names
    // are not.
    final int[] dimLookup = new int[mergedDimensions.size()];
    int count = 0;
    for (String dim : adapter.getAvailableDimensions()) {
      dimLookup[count] = mergedDimensions.indexOf(dim.toLowerCase());
      count++;
    }
    final int[] metricLookup = new int[mergedMetrics.size()];
    count = 0;
    for (String metric : adapter.getAvailableMetrics()) {
      metricLookup[count] = mergedMetrics.indexOf(metric);
      count++;
    }
    boats.add(
        new MMappedIndexRowIterable(
            Iterables.transform(
                indexes.get(i).getRows(),
                new Function<Rowboat, Rowboat>() {
                  @Override
                  public Rowboat apply(@Nullable Rowboat input) {
                    // Re-slot the row's dimensions and metrics into merged positions;
                    // slots the adapter does not provide stay null.
                    int[][] newDims = new int[mergedDimensions.size()][];
                    int j = 0;
                    for (int[] dim : input.getDims()) {
                      newDims[dimLookup[j]] = dim;
                      j++;
                    }
                    Object[] newMetrics = new Object[mergedMetrics.size()];
                    j = 0;
                    for (Object met : input.getMetrics()) {
                      newMetrics[metricLookup[j]] = met;
                      j++;
                    }
                    return new Rowboat(
                        input.getTimestamp(), newDims, newMetrics, input.getRowNum());
                  }
                }),
            mergedDimensions,
            dimConversions.get(i),
            i));
  }
  Iterable<Rowboat> theRows = rowMergerFn.apply(boats);
  CompressedLongsSupplierSerializer littleEndianTimeWriter =
      CompressedLongsSupplierSerializer.create(
          ioPeon, "little_end_time", ByteOrder.LITTLE_ENDIAN);
  CompressedLongsSupplierSerializer bigEndianTimeWriter =
      CompressedLongsSupplierSerializer.create(ioPeon, "big_end_time", ByteOrder.BIG_ENDIAN);
  littleEndianTimeWriter.open();
  bigEndianTimeWriter.open();
  // One forward-column writer per merged dimension, in mergedDimensions order.
  ArrayList<VSizeIndexedWriter> forwardDimWriters =
      Lists.newArrayListWithCapacity(mergedDimensions.size());
  for (String dimension : mergedDimensions) {
    VSizeIndexedWriter writer =
        new VSizeIndexedWriter(ioPeon, dimension, dimensionCardinalities.get(dimension));
    writer.open();
    forwardDimWriters.add(writer);
  }
  // NOTE(review): metWriters is filled in metricTypes key order, but below it is addressed
  // by the position of a metric inside the merged row (i.e. mergedMetrics order). Confirm
  // that callers always pass mergedMetrics in the same order and with the same names as
  // the metricTypes keys.
  ArrayList<MetricColumnSerializer> metWriters =
      Lists.newArrayListWithCapacity(mergedMetrics.size());
  for (Map.Entry<String, String> entry : metricTypes.entrySet()) {
    String metric = entry.getKey();
    String typeName = entry.getValue();
    if ("float".equals(typeName)) {
      metWriters.add(new FloatMetricColumnSerializer(metric, outDir, ioPeon));
    } else {
      ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(typeName);
      if (serde == null) {
        throw new ISE("Unknown type[%s]", typeName);
      }
      metWriters.add(new ComplexMetricColumnSerializer(metric, outDir, ioPeon, serde));
    }
  }
  for (MetricColumnSerializer metWriter : metWriters) {
    metWriter.open();
  }
  int rowCount = 0;
  long time = System.currentTimeMillis();
  // Per input index: old row number -> merged row number, pre-filled with INVALID_ROW.
  List<IntBuffer> rowNumConversions = Lists.newArrayListWithCapacity(indexes.size());
  for (IndexableAdapter index : indexes) {
    int[] arr = new int[index.getNumRows()];
    Arrays.fill(arr, INVALID_ROW);
    rowNumConversions.add(IntBuffer.wrap(arr));
  }
  for (Rowboat theRow : theRows) {
    progress.progress();
    littleEndianTimeWriter.add(theRow.getTimestamp());
    bigEndianTimeWriter.add(theRow.getTimestamp());
    final Object[] metrics = theRow.getMetrics();
    for (int i = 0; i < metrics.length; ++i) {
      metWriters.get(i).serialize(metrics[i]);
    }
    int[][] dims = theRow.getDims();
    for (int i = 0; i < dims.length; ++i) {
      // NOTE(review): `i >= dims.length` can never be true inside this loop; only the
      // null check has an effect.
      List<Integer> listToWrite =
          (i >= dims.length || dims[i] == null) ? null : Ints.asList(dims[i]);
      forwardDimWriters.get(i).write(listToWrite);
    }
    // For every source row folded into this merged row, record the merged row number at
    // the source row's position, padding any gap before it with INVALID_ROW.
    for (Map.Entry<Integer, TreeSet<Integer>> comprisedRow :
        theRow.getComprisedRows().entrySet()) {
      final IntBuffer conversionBuffer = rowNumConversions.get(comprisedRow.getKey());
      for (Integer rowNum : comprisedRow.getValue()) {
        while (conversionBuffer.position() < rowNum) {
          conversionBuffer.put(INVALID_ROW);
        }
        conversionBuffer.put(rowCount);
      }
    }
    if ((++rowCount % 500000) == 0) {
      log.info(
          "outDir[%s] walked 500,000/%,d rows in %,d millis.",
          outDir, rowCount, System.currentTimeMillis() - time);
      time = System.currentTimeMillis();
    }
  }
  // Reset positions so the buffers can be read from the start in the next phase.
  for (IntBuffer rowNumConversion : rowNumConversions) {
    rowNumConversion.rewind();
  }
  // Flush the two timestamp columns; any pre-existing file is removed first.
  final File littleEndianFile = IndexIO.makeTimeFile(outDir, ByteOrder.LITTLE_ENDIAN);
  littleEndianFile.delete();
  OutputSupplier<FileOutputStream> out = Files.newOutputStreamSupplier(littleEndianFile, true);
  littleEndianTimeWriter.closeAndConsolidate(out);
  IndexIO.checkFileSize(littleEndianFile);
  final File bigEndianFile = IndexIO.makeTimeFile(outDir, ByteOrder.BIG_ENDIAN);
  bigEndianFile.delete();
  out = Files.newOutputStreamSupplier(bigEndianFile, true);
  bigEndianTimeWriter.closeAndConsolidate(out);
  IndexIO.checkFileSize(bigEndianFile);
  // Append each forward dimension column to the dimension file started in phase 2.
  for (int i = 0; i < mergedDimensions.size(); ++i) {
    forwardDimWriters.get(i).close();
    ByteStreams.copy(forwardDimWriters.get(i).combineStreams(), dimOuts.get(i));
  }
  for (MetricColumnSerializer metWriter : metWriters) {
    metWriter.close();
  }
  ioPeon.cleanup();
  log.info(
      "outDir[%s] completed walk through of %,d rows in %,d millis.",
      outDir, rowCount, System.currentTimeMillis() - startTime);

  // ===== Phase 4: create inverted indexes =====
  startTime = System.currentTimeMillis();
  final File invertedFile = new File(outDir, "inverted.drd");
  Files.touch(invertedFile);
  out = Files.newOutputStreamSupplier(invertedFile, true);
  for (int i = 0; i < mergedDimensions.size(); ++i) {
    long dimStartTime = System.currentTimeMillis();
    String dimension = mergedDimensions.get(i);
    // Re-read the merged dictionary from the dimension file written above and verify that
    // the file really belongs to this dimension.
    File dimOutFile = dimOuts.get(i).getFile();
    final MappedByteBuffer dimValsMapped = Files.map(dimOutFile);
    if (!dimension.equals(serializerUtils.readString(dimValsMapped))) {
      throw new ISE("dimensions[%s] didn't equate!? This is a major WTF moment.", dimension);
    }
    Indexed<String> dimVals =
        GenericIndexed.readFromByteBuffer(dimValsMapped, GenericIndexed.stringStrategy);
    log.info("Starting dimension[%s] with cardinality[%,d]", dimension, dimVals.size());
    FlattenedArrayWriter<ImmutableConciseSet> writer =
        new FlattenedArrayWriter<ImmutableConciseSet>(
            ioPeon, dimension, ConciseCompressedIndexedInts.objectStrategy);
    writer.open();
    for (String dimVal : IndexedIterable.create(dimVals)) {
      progress.progress();
      // Per input index: that index's inverted rows for this value, wrapped together with
      // the index's old-row -> merged-row conversion buffer.
      List<Iterable<Integer>> convertedInverteds =
          Lists.newArrayListWithCapacity(indexes.size());
      for (int j = 0; j < indexes.size(); ++j) {
        convertedInverteds.add(
            new ConvertingIndexedInts(
                indexes.get(j).getInverteds(dimension, dimVal), rowNumConversions.get(j)));
      }
      ConciseSet bitset = new ConciseSet();
      for (Integer row :
          CombiningIterable.createSplatted(
              convertedInverteds, Ordering.<Integer>natural().nullsFirst())) {
        // NOTE(review): this is a numeric comparison only if INVALID_ROW is a primitive
        // int; confirm its declaration.
        if (row != INVALID_ROW) {
          bitset.add(row);
        }
      }
      writer.write(ImmutableConciseSet.newImmutableFromMutable(bitset));
    }
    writer.close();
    serializerUtils.writeString(out, dimension);
    ByteStreams.copy(writer.combineStreams(), out);
    ioPeon.cleanup();
    log.info(
        "Completed dimension[%s] in %,d millis.",
        dimension, System.currentTimeMillis() - dimStartTime);
  }
  log.info(
      "outDir[%s] completed inverted.drd in %,d millis.",
      outDir, System.currentTimeMillis() - startTime);

  // ===== Phase 5: smoosh the individual files and write the final index.drd =====
  final ArrayList<String> expectedFiles =
      Lists.newArrayList(
          Iterables.concat(
              Arrays.asList(
                  "index.drd", "inverted.drd", "time_BIG_ENDIAN.drd", "time_LITTLE_ENDIAN.drd"),
              Iterables.transform(mergedDimensions, GuavaUtils.formatFunction("dim_%s.drd")),
              Iterables.transform(
                  mergedMetrics, GuavaUtils.formatFunction("met_%s_LITTLE_ENDIAN.drd")),
              Iterables.transform(
                  mergedMetrics, GuavaUtils.formatFunction("met_%s_BIG_ENDIAN.drd"))));
  // Insertion-ordered so the files are handed to the smoosher in the order listed above.
  Map<String, File> files = Maps.newLinkedHashMap();
  for (String fileName : expectedFiles) {
    files.put(fileName, new File(outDir, fileName));
  }
  File smooshDir = new File(outDir, "smoosher");
  smooshDir.mkdir();
  // Delete every file the smoosher reports back.
  for (Map.Entry<String, File> entry : Smoosh.smoosh(outDir, smooshDir, files).entrySet()) {
    entry.getValue().delete();
  }
  // NOTE(review): File.listFiles() is documented to return null on I/O error or when the
  // path is not a directory; that case is not handled here.
  for (File file : smooshDir.listFiles()) {
    Files.move(file, new File(outDir, file.getName()));
  }
  if (!smooshDir.delete()) {
    log.info(
        "Unable to delete temporary dir[%s], contains[%s]",
        smooshDir, Arrays.asList(smooshDir.listFiles()));
    throw new IOException(String.format("Unable to delete temporary dir[%s]", smooshDir));
  }
  createIndexDrdFile(
      IndexIO.CURRENT_VERSION_ID,
      outDir,
      GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.stringStrategy),
      GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.stringStrategy),
      dataInterval);
  return outDir;
}
/**
 * Builds an {@link Index} from a memory-mapped index by copying its contents into arrays and
 * maps.
 *
 * <p>Copied pieces: dimension and metric names, timestamps, one {@code MetricHolder} per
 * metric (float or complex), and for each dimension its value array, value-to-position map,
 * inverted indexes and a {@code DimensionColumn}. For the dimension column, rows that carry
 * the same list of value ids share one entry in the expansion table; the per-row array stores
 * the position of that entry.
 *
 * @param mmappedIndex the memory-mapped index to copy from
 * @return a new {@code Index} populated from {@code mmappedIndex}
 * @throws ISE if a complex metric's type name has no registered serde
 */
public static Index convertMMapToIndex(MMappedIndex mmappedIndex) {
  // --- names ---
  final Indexed<String> availableDims = mmappedIndex.getAvailableDimensions();
  final String[] dimensions = new String[availableDims.size()];
  for (int d = 0; d < dimensions.length; ++d) {
    dimensions[d] = availableDims.get(d);
  }

  final Indexed<String> availableMetrics = mmappedIndex.getAvailableMetrics();
  final String[] metrics = new String[availableMetrics.size()];
  for (int m = 0; m < metrics.length; ++m) {
    metrics[m] = availableMetrics.get(m);
  }

  // --- timestamps ---
  final IndexedLongs mappedTimes = mmappedIndex.getReadOnlyTimestamps();
  final long[] timestamps = new long[mappedTimes.size()];
  mappedTimes.fill(0, timestamps);
  Closeables.closeQuietly(mappedTimes);

  // --- metrics ---
  final Map<String, MetricHolder> metricVals = Maps.newLinkedHashMap();
  for (String metric : metrics) {
    final MetricHolder holder = mmappedIndex.getMetricHolder(metric);
    switch (holder.getType()) {
      case FLOAT: {
        final IndexedFloats mappedFloats = holder.getFloatType();
        final float[] heapFloats = new float[mappedFloats.size()];
        mappedFloats.fill(0, heapFloats);
        Closeables.closeQuietly(mappedFloats);

        final FloatBuffer wrapped = FloatBuffer.wrap(heapFloats);
        metricVals.put(
            metric,
            MetricHolder.floatMetric(
                metric,
                CompressedFloatsIndexedSupplier.fromFloatBuffer(
                    wrapped, ByteOrder.nativeOrder())));
        break;
      }
      case COMPLEX: {
        final Indexed complexObjects = holder.getComplexType();
        final Object[] vals = new Object[complexObjects.size()];
        for (int v = 0; v < vals.length; ++v) {
          vals[v] = complexObjects.get(v);
        }

        final String typeName = holder.getTypeName();
        final ComplexMetricSerde serde = ComplexMetrics.getSerdeForType(typeName);
        if (serde == null) {
          throw new ISE("Unknown type[%s]", typeName);
        }

        metricVals.put(
            metric,
            MetricHolder.complexMetric(
                metric,
                typeName,
                new ArrayIndexed(vals, serde.getObjectStrategy().getClazz())));
        break;
      }
    }
  }

  // --- dimensions ---
  final Map<String, Map<String, Integer>> dimIdLookup = Maps.newHashMap();
  final Map<String, String[]> reverseDimLookup = Maps.newHashMap();
  final Map<String, ImmutableConciseSet[]> invertedIndexesMap = Maps.newHashMap();
  final Map<String, DimensionColumn> dimensionColumns = Maps.newHashMap();

  for (String dimension : dimensions) {
    // Value array and value -> position map.
    final Indexed<String> dictionary = mmappedIndex.getDimValueLookup(dimension);
    final String[] values = new String[dictionary.size()];
    final Map<String, Integer> valueToId = Maps.newHashMapWithExpectedSize(values.length);
    for (int id = 0; id < values.length; ++id) {
      values[id] = dictionary.get(id);
      valueToId.put(values[id], id);
    }

    // One inverted index per dictionary value.
    final ImmutableConciseSet[] invertedIndexes = new ImmutableConciseSet[values.length];
    final Indexed<String> dictionaryForInverted = mmappedIndex.getDimValueLookup(dimension);
    for (int id = 0; id < dictionaryForInverted.size(); ++id) {
      invertedIndexes[id] =
          mmappedIndex.getInvertedIndex(dimension, dictionaryForInverted.get(id));
    }

    // Group rows by their list of value ids; each distinct list gets the next free number.
    final int[] rowToGroup = new int[timestamps.length];
    final Map<List<Integer>, Integer> groupIds = Maps.newHashMap();
    final Indexed<? extends IndexedInts> dimColumn = mmappedIndex.getDimColumn(dimension);
    for (int row = 0; row < dimColumn.size(); ++row) {
      final List<Integer> key = Ints.asList(Indexedids.arrayFromIndexedInts(dimColumn.get(row)));
      Integer groupId = groupIds.get(key);
      if (groupId == null) {
        groupId = groupIds.size();
        groupIds.put(key, groupId);
      }
      rowToGroup[row] = groupId;
    }

    // Invert the grouping map into a table indexed by group number.
    final int[][] expansionValues = new int[groupIds.size()][];
    for (Map.Entry<List<Integer>, Integer> group : groupIds.entrySet()) {
      expansionValues[group.getValue()] = Ints.toArray(group.getKey());
    }

    dimIdLookup.put(dimension, valueToId);
    reverseDimLookup.put(dimension, values);
    invertedIndexesMap.put(dimension, invertedIndexes);
    dimensionColumns.put(dimension, new DimensionColumn(expansionValues, rowToGroup));
  }

  return new Index(
      dimensions,
      metrics,
      mmappedIndex.getDataInterval(),
      timestamps,
      metricVals,
      dimIdLookup,
      reverseDimLookup,
      invertedIndexesMap,
      dimensionColumns);
}