/**
 * Collects, per metric column, the set of distinct metric values found in the StarTree table.
 *
 * <p>The unique values gathered from the original data set are not sufficient here: once rows
 * have been aggregated, the table almost certainly holds values that never appeared in the raw
 * input, and those need to be encoded as well.
 *
 * @return a map from metric name to every distinct value seen for that metric while walking all
 *     combinations of the StarTree table
 */
private Map<String, Set<Object>> computeUniqueMetricValues() {
  Map<String, Set<Object>> valuesByMetric = new HashMap<String, Set<Object>>();

  for (Iterator<StarTreeTableRow> rows = starTreeBuilder.getTable().getAllCombinations();
      rows.hasNext(); ) {
    StarTreeTableRow current = rows.next();

    // Metric values are positional: index idx in the row matches metric name idx in the schema.
    for (int idx = 0; idx < schema.getMetricNames().size(); idx++) {
      String name = schema.getMetricNames().get(idx);
      Object observed = current.getMetrics().get(idx);

      Set<Object> seen = valuesByMetric.get(name);
      if (seen == null) {
        // First value for this metric: create its bucket lazily.
        seen = new HashSet<Object>();
        valuesByMetric.put(name, seen);
      }
      seen.add(observed);
    }
  }

  return valuesByMetric;
}
/** Converts a raw row into its (possibly partial) dimension and complete metric values */ private StarTreeTableRow extractValues(GenericRow row) { List<Integer> dimensions = new ArrayList<Integer>(); for (String dimensionName : schema.getDimensionNames()) { Integer valueId; if (schema.getFieldSpecFor(dimensionName).isSingleValueField() && !starTreeIndexSpec.getExcludedDimensions().contains(dimensionName)) { Object value = row.getValue(dimensionName); valueId = dictionaryCreatorMap.get(dimensionName).indexOfSV(value); } else { // Multi-value fields are not supported - always ALL valueId = V1Constants.STARTREE_ALL_NUMBER.intValue(); } dimensions.add(valueId); } List<Number> metrics = new ArrayList<Number>(schema.getMetricNames().size()); for (MetricFieldSpec metricFieldSpec : schema.getMetricFieldSpecs()) { Object value = row.getValue(metricFieldSpec.getName()); switch (metricFieldSpec.getDataType()) { case INT: metrics.add((Integer) value); break; case LONG: metrics.add((Long) value); break; case DOUBLE: metrics.add((Double) value); break; case FLOAT: metrics.add((Float) value); break; default: throw new IllegalStateException("Unsupported data type " + metricFieldSpec.getDataType()); } } return new StarTreeTableRow(dimensions, metrics); }
@Override public void init( SegmentGeneratorConfig config, Map<String, ColumnIndexCreationInfo> columnInfo, Schema schema, int totalDocs, File outDir) throws Exception { // Member variables this.config = config; this.columnInfo = columnInfo; this.schema = schema; this.outDir = outDir; this.starTreeDimensionDictionary = new HashMap<String, Integer>(); this.starTreeMetricDictionary = new HashMap<String, Integer>(); // Dictionaries (will go in root segment) initializeAndBuildDictionaries(schema, columnInfo, outDir); // Compute dimension dictionary for (int i = 0; i < schema.getDimensionNames().size(); i++) { starTreeDimensionDictionary.put(schema.getDimensionNames().get(i), i); } LOG.info("StarTree dimension dictionary: {}", starTreeDimensionDictionary); // Compute the metric dictionary for (int i = 0; i < schema.getMetricNames().size(); i++) { starTreeMetricDictionary.put(schema.getMetricNames().get(i), i); } LOG.info("StarTree metric dictionary: {}", starTreeDimensionDictionary); // Compute StarTree split order splitOrder = computeSplitOrder(columnInfo); LOG.info("Computed split order {}", splitOrder); List<Integer> splitOrderIndexes = new ArrayList<Integer>(); for (String dimensionName : splitOrder) { Integer dimensionId = starTreeDimensionDictionary.get(dimensionName); splitOrderIndexes.add(dimensionId); } Collections.reverse(splitOrderIndexes); // StarTree builder / table StarTreeTable table = new LinkedListStarTreeTable(); // TODO: ByteBuffer-based StarTreeDocumentIdMap documentIdMap = new HashMapStarTreeDocumentIdMap(); // TODO: ByteBuffer-based starTreeBuilder.init( splitOrderIndexes, starTreeIndexSpec.getMaxLeafRecords(), table, documentIdMap); // Build the StarTree structure and table LOG.info("Building StarTree table..."); int count = 0; long startMillis = System.currentTimeMillis(); recordReader.rewind(); while (recordReader.hasNext()) { GenericRow row = recordReader.next(); StarTreeTableRow starTreeTableRow = extractValues(row); 
starTreeBuilder.append(starTreeTableRow); count++; } long endMillis = System.currentTimeMillis(); LOG.info( "Finished building StarTree table ({} documents, took {} ms)", count, endMillis - startMillis); LOG.info("Building StarTree (computing aggregates)..."); startMillis = System.currentTimeMillis(); starTreeBuilder.build(); endMillis = System.currentTimeMillis(); LOG.info("Finished building StarTree, took {} ms", endMillis - startMillis); // Re-compute the unique values for metrics including aggregates to allow for dictionary // encoding LOG.info("Re-computing unique metric values for dictionary encoding..."); startMillis = System.currentTimeMillis(); Map<String, Set<Object>> uniqueMetricValues = computeUniqueMetricValues(); resetMetricDictionaries(uniqueMetricValues); endMillis = System.currentTimeMillis(); LOG.info("Finished re-computing unique metric values (took {} ms)", endMillis - startMillis); // StarTree directory starTreeDir = new File(outDir, V1Constants.STARTREE_DIR); if (!starTreeDir.mkdir()) { throw new RuntimeException( "Could not create star tree directory " + starTreeDir.getAbsolutePath()); } // For each column, build its dictionary and initialize a forwards and an inverted index for raw // / agg segment int totalAggDocs = starTreeBuilder.getTotalAggregateDocumentCount(); int totalRawDocs = starTreeBuilder.getTotalRawDocumentCount(); for (final String column : dictionaryCreatorMap.keySet()) { ColumnIndexCreationInfo indexCreationInfo = columnInfo.get(column); Object[] uniqueValues = indexCreationInfo.getSortedUniqueElementsArray(); if (schema.getMetricNames().contains(column)) { // Use the unique values including the new aggregate values uniqueValues = uniqueMetricValues.get(column).toArray(); } if (schema.getFieldSpecFor(column).isSingleValueField()) { if (indexCreationInfo.isSorted()) { forwardIndexCreatorMap.put( column, new SingleValueSortedForwardIndexCreator( outDir, uniqueValues.length, schema.getFieldSpecFor(column))); 
aggregateForwardIndexCreatorMap.put( column, new SingleValueSortedForwardIndexCreator( starTreeDir, uniqueValues.length, schema.getFieldSpecFor(column))); } else { forwardIndexCreatorMap.put( column, new SingleValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), outDir, uniqueValues.length, totalRawDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); aggregateForwardIndexCreatorMap.put( column, new SingleValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), starTreeDir, indexCreationInfo.getSortedUniqueElementsArray().length, totalAggDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } } else { forwardIndexCreatorMap.put( column, new MultiValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), outDir, uniqueValues.length, totalRawDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); aggregateForwardIndexCreatorMap.put( column, new MultiValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), starTreeDir, uniqueValues.length, totalAggDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap.put( column, new BitmapInvertedIndexCreator( outDir, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); aggregateInvertedIndexCreatorMap.put( column, new BitmapInvertedIndexCreator( starTreeDir, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); } } }