/**
 * Recreates {@code INDEXES_DIR} and fills {@code _indexSegmentList} with {@code numberOfSegments}
 * segments generated from the {@code SMALL_AVRO_DATA} classpath resource.
 *
 * <p>Segment {@code i} is generated into the {@code segment_<i>} sub-directory, and the first
 * entry found in that directory after the build is loaded in {@code ReadMode.mmap}.
 *
 * @param numberOfSegments number of segments to generate and load
 * @throws IllegalStateException if a segment directory is empty or unreadable after the build
 * @throws Exception propagated from resource lookup, segment generation or segment loading
 */
private void setupSegmentList(int numberOfSegments) throws Exception {
  final String filePath =
      TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(SMALL_AVRO_DATA));
  _indexSegmentList.clear();

  // Start from an empty index directory on every call.
  if (INDEXES_DIR.exists()) {
    FileUtils.deleteQuietly(INDEXES_DIR);
  }
  INDEXES_DIR.mkdir();

  for (int i = 0; i < numberOfSegments; ++i) {
    final File segmentDir = new File(INDEXES_DIR, "segment_" + i);
    final SegmentGeneratorConfig config =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
            new File(filePath), segmentDir, "dim" + i, TimeUnit.DAYS, "midas");
    config.setSegmentNamePostfix(String.valueOf(i));

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    driver.init(config);
    driver.build();

    // File.list() returns null when the path is not a readable directory; fail with a clear
    // message instead of a bare NullPointerException / ArrayIndexOutOfBoundsException.
    final String[] generated = segmentDir.list();
    if (generated == null || generated.length == 0) {
      throw new IllegalStateException(
          "No segment was generated under " + segmentDir.getAbsolutePath());
    }
    final String segmentName = generated[0];
    _indexSegmentList.add(
        ColumnarSegmentLoader.load(new File(segmentDir, segmentName), ReadMode.mmap));
    System.out.println("built at : " + segmentDir.getAbsolutePath());
  }
}
/**
 * Recreates {@code INDEXES_DIR} and fills {@code _indexSegmentList} with {@code numberOfSegments}
 * segment data managers built from the {@code AVRO_DATA} classpath resource.
 *
 * <p>Each segment is generated into its own {@code segment_<n>} directory and then loaded from
 * the sub-directory named by the driver, using {@code ReadMode.heap}.
 *
 * @param numberOfSegments number of segments to generate and load
 * @throws Exception propagated from resource lookup, segment generation or segment loading
 */
private void setupSegmentList(int numberOfSegments) throws Exception {
  final String avroPath =
      TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(AVRO_DATA));
  final File avroFile = new File(avroPath);

  // Reset both the in-memory list and the on-disk index directory.
  _indexSegmentList.clear();
  if (INDEXES_DIR.exists()) {
    FileUtils.deleteQuietly(INDEXES_DIR);
  }
  INDEXES_DIR.mkdir();

  int segmentId = 0;
  while (segmentId < numberOfSegments) {
    final File outputDir = new File(INDEXES_DIR, "segment_" + segmentId);

    final SegmentGeneratorConfig generatorConfig =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
            avroFile, outputDir, "time_day", TimeUnit.DAYS, "test");
    generatorConfig.setSegmentNamePostfix(Integer.toString(segmentId));

    final SegmentIndexCreationDriver creationDriver = SegmentCreationDriverFactory.get(null);
    creationDriver.init(generatorConfig);
    creationDriver.build();
    LOGGER.debug("built at : {}", outputDir.getAbsolutePath());

    // The driver reports the name of the directory it created inside outputDir.
    final File builtSegmentDir = new File(outputDir, creationDriver.getSegmentName());
    _indexSegmentList.add(
        new OfflineSegmentDataManager(
            ColumnarSegmentLoader.load(builtSegmentDir, ReadMode.heap)));

    ++segmentId;
  }
}
@Override public void indexRow(GenericRow row) { // Find matching leaves in StarTree for row currentMatchingNodes.clear(); StarTreeTableRow tableRow = extractValues(row); findMatchingLeaves(starTreeBuilder.getTree(), tableRow.getDimensions(), currentMatchingNodes); // Only write the raw value, maintaining sort order (we will write aggregates when sealing) for (StarTreeIndexNode node : currentMatchingNodes) { Map<Integer, Integer> pathValues = node.getPathValues(); if (!pathValues.containsValue(StarTreeIndexNode.all())) { StarTreeTableRange range = starTreeBuilder.getDocumentIdRange(node.getNodeId()); StarTreeTable subTable = starTreeBuilder.getTable().view(range.getStartDocumentId(), range.getDocumentCount()); Integer nextMatchingDocumentId = starTreeBuilder.getNextDocumentId(tableRow.getDimensions()); if (nextMatchingDocumentId == null) { throw new IllegalStateException("Could not assign document ID for row " + tableRow); } // Write using that document ID to all columns for (final String column : dictionaryCreatorMap.keySet()) { Object columnValueToIndex = row.getValue(column); if (schema.getFieldSpecFor(column).isSingleValueField()) { System.out.println(column + ": " + columnValueToIndex); int dictionaryIndex = dictionaryCreatorMap.get(column).indexOfSV(columnValueToIndex); ((SingleValueForwardIndexCreator) forwardIndexCreatorMap.get(column)) .index(nextMatchingDocumentId, dictionaryIndex); if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap .get(column) .add(nextMatchingDocumentId, (Object) dictionaryIndex); } } else { int[] dictionaryIndex = dictionaryCreatorMap.get(column).indexOfMV(columnValueToIndex); ((MultiValueForwardIndexCreator) forwardIndexCreatorMap.get(column)) .index(nextMatchingDocumentId, dictionaryIndex); if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap.get(column).add(nextMatchingDocumentId, dictionaryIndex); } } } } } }
/**
 * Finishes the segment: for every column with a forward index, closes the forward index, seals
 * the inverted index when inverted indexes are enabled, and closes the dictionary; then writes
 * the segment metadata.
 *
 * @throws ConfigurationException declared for the metadata write
 * @throws IOException declared for closing the index and dictionary creators
 */
@Override
public void seal() throws ConfigurationException, IOException {
  for (final String columnName : forwardIndexCreatorMap.keySet()) {
    // Per column: forward index, then optional inverted index, then dictionary.
    forwardIndexCreatorMap.get(columnName).close();

    final boolean invertedIndexEnabled = config.createInvertedIndexEnabled();
    if (invertedIndexEnabled) {
      invertedIndexCreatorMap.get(columnName).seal();
    }

    dictionaryCreatorMap.get(columnName).close();
  }

  writeMetadata();
}
/**
 * Indexes one row under the current document id and then advances the id.
 *
 * <p>For each dictionary column the value's dictionary index is taken from the per-column cache
 * when present, otherwise looked up in the dictionary creator and cached. The index is written
 * to the forward index and, when inverted indexes are enabled, to the inverted index.
 *
 * @param row the row to index
 */
@Override
public void indexRow(GenericRow row) {
  for (final String columnName : dictionaryCreatorMap.keySet()) {
    final Object rawValue = row.getValue(columnName);
    final Map<Object, Object> columnCache = dictionaryCache.get(columnName);

    // Resolve the dictionary index, consulting the dictionary creator only on a cache miss.
    final Object dictionaryId;
    if (!columnCache.containsKey(rawValue)) {
      dictionaryId = dictionaryCreatorMap.get(columnName).indexOf(rawValue);
      columnCache.put(rawValue, dictionaryId);
    } else {
      dictionaryId = columnCache.get(rawValue);
    }

    forwardIndexCreatorMap.get(columnName).index(docIdCounter, dictionaryId);

    final boolean invertedIndexEnabled = config.createInvertedIndexEnabled();
    if (invertedIndexEnabled) {
      invertedIndexCreatorMap.get(columnName).add(docIdCounter, dictionaryId);
    }
  }

  docIdCounter++;
}
@Override public void init( SegmentGeneratorConfig config, Map<String, ColumnIndexCreationInfo> columnInfo, Schema schema, int totalDocs, File outDir) throws Exception { // Member variables this.config = config; this.columnInfo = columnInfo; this.schema = schema; this.outDir = outDir; this.starTreeDimensionDictionary = new HashMap<String, Integer>(); this.starTreeMetricDictionary = new HashMap<String, Integer>(); // Dictionaries (will go in root segment) initializeAndBuildDictionaries(schema, columnInfo, outDir); // Compute dimension dictionary for (int i = 0; i < schema.getDimensionNames().size(); i++) { starTreeDimensionDictionary.put(schema.getDimensionNames().get(i), i); } LOG.info("StarTree dimension dictionary: {}", starTreeDimensionDictionary); // Compute the metric dictionary for (int i = 0; i < schema.getMetricNames().size(); i++) { starTreeMetricDictionary.put(schema.getMetricNames().get(i), i); } LOG.info("StarTree metric dictionary: {}", starTreeDimensionDictionary); // Compute StarTree split order splitOrder = computeSplitOrder(columnInfo); LOG.info("Computed split order {}", splitOrder); List<Integer> splitOrderIndexes = new ArrayList<Integer>(); for (String dimensionName : splitOrder) { Integer dimensionId = starTreeDimensionDictionary.get(dimensionName); splitOrderIndexes.add(dimensionId); } Collections.reverse(splitOrderIndexes); // StarTree builder / table StarTreeTable table = new LinkedListStarTreeTable(); // TODO: ByteBuffer-based StarTreeDocumentIdMap documentIdMap = new HashMapStarTreeDocumentIdMap(); // TODO: ByteBuffer-based starTreeBuilder.init( splitOrderIndexes, starTreeIndexSpec.getMaxLeafRecords(), table, documentIdMap); // Build the StarTree structure and table LOG.info("Building StarTree table..."); int count = 0; long startMillis = System.currentTimeMillis(); recordReader.rewind(); while (recordReader.hasNext()) { GenericRow row = recordReader.next(); StarTreeTableRow starTreeTableRow = extractValues(row); 
starTreeBuilder.append(starTreeTableRow); count++; } long endMillis = System.currentTimeMillis(); LOG.info( "Finished building StarTree table ({} documents, took {} ms)", count, endMillis - startMillis); LOG.info("Building StarTree (computing aggregates)..."); startMillis = System.currentTimeMillis(); starTreeBuilder.build(); endMillis = System.currentTimeMillis(); LOG.info("Finished building StarTree, took {} ms", endMillis - startMillis); // Re-compute the unique values for metrics including aggregates to allow for dictionary // encoding LOG.info("Re-computing unique metric values for dictionary encoding..."); startMillis = System.currentTimeMillis(); Map<String, Set<Object>> uniqueMetricValues = computeUniqueMetricValues(); resetMetricDictionaries(uniqueMetricValues); endMillis = System.currentTimeMillis(); LOG.info("Finished re-computing unique metric values (took {} ms)", endMillis - startMillis); // StarTree directory starTreeDir = new File(outDir, V1Constants.STARTREE_DIR); if (!starTreeDir.mkdir()) { throw new RuntimeException( "Could not create star tree directory " + starTreeDir.getAbsolutePath()); } // For each column, build its dictionary and initialize a forwards and an inverted index for raw // / agg segment int totalAggDocs = starTreeBuilder.getTotalAggregateDocumentCount(); int totalRawDocs = starTreeBuilder.getTotalRawDocumentCount(); for (final String column : dictionaryCreatorMap.keySet()) { ColumnIndexCreationInfo indexCreationInfo = columnInfo.get(column); Object[] uniqueValues = indexCreationInfo.getSortedUniqueElementsArray(); if (schema.getMetricNames().contains(column)) { // Use the unique values including the new aggregate values uniqueValues = uniqueMetricValues.get(column).toArray(); } if (schema.getFieldSpecFor(column).isSingleValueField()) { if (indexCreationInfo.isSorted()) { forwardIndexCreatorMap.put( column, new SingleValueSortedForwardIndexCreator( outDir, uniqueValues.length, schema.getFieldSpecFor(column))); 
aggregateForwardIndexCreatorMap.put( column, new SingleValueSortedForwardIndexCreator( starTreeDir, uniqueValues.length, schema.getFieldSpecFor(column))); } else { forwardIndexCreatorMap.put( column, new SingleValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), outDir, uniqueValues.length, totalRawDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); aggregateForwardIndexCreatorMap.put( column, new SingleValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), starTreeDir, indexCreationInfo.getSortedUniqueElementsArray().length, totalAggDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } } else { forwardIndexCreatorMap.put( column, new MultiValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), outDir, uniqueValues.length, totalRawDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); aggregateForwardIndexCreatorMap.put( column, new MultiValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), starTreeDir, uniqueValues.length, totalAggDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap.put( column, new BitmapInvertedIndexCreator( outDir, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); aggregateInvertedIndexCreatorMap.put( column, new BitmapInvertedIndexCreator( starTreeDir, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); } } }
/** Constructs the segment metadata file, and writes in outputDir */ private void writeMetadata(File outputDir, int totalDocs) throws ConfigurationException { final PropertiesConfiguration properties = new PropertiesConfiguration( new File(outputDir, V1Constants.MetadataKeys.METADATA_FILE_NAME)); properties.setProperty(SEGMENT_NAME, segmentName); properties.setProperty(TABLE_NAME, config.getTableName()); properties.setProperty(DIMENSIONS, config.getDimensions()); properties.setProperty(METRICS, config.getMetrics()); properties.setProperty(TIME_COLUMN_NAME, config.getTimeColumnName()); properties.setProperty(TIME_INTERVAL, "not_there"); properties.setProperty(SEGMENT_TOTAL_DOCS, String.valueOf(totalDocs)); // StarTree Joiner csv = Joiner.on(","); properties.setProperty(SPLIT_ORDER, csv.join(splitOrder)); properties.setProperty(SPLIT_EXCLUDES, csv.join(starTreeIndexSpec.getSplitExcludes())); properties.setProperty(MAX_LEAF_RECORDS, starTreeIndexSpec.getMaxLeafRecords()); properties.setProperty( EXCLUDED_DIMENSIONS, csv.join(starTreeIndexSpec.getExcludedDimensions())); String timeColumn = config.getTimeColumnName(); if (columnInfo.get(timeColumn) != null) { properties.setProperty(SEGMENT_START_TIME, columnInfo.get(timeColumn).getMin()); properties.setProperty(SEGMENT_END_TIME, columnInfo.get(timeColumn).getMax()); properties.setProperty(TIME_UNIT, config.getTimeUnitForSegment()); } if (config.containsKey(SEGMENT_START_TIME)) { properties.setProperty(SEGMENT_START_TIME, config.getStartTime()); } if (config.containsKey(SEGMENT_END_TIME)) { properties.setProperty(SEGMENT_END_TIME, config.getStartTime()); } if (config.containsKey(TIME_UNIT)) { properties.setProperty(TIME_UNIT, config.getTimeUnitForSegment()); } for (final String key : config.getAllCustomKeyValuePair().keySet()) { properties.setProperty(key, config.getAllCustomKeyValuePair().get(key)); } for (final String column : columnInfo.keySet()) { properties.setProperty( 
V1Constants.MetadataKeys.Column.getKeyFor(column, CARDINALITY), String.valueOf(columnInfo.get(column).getSortedUniqueElementsArray().length)); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, TOTAL_DOCS), String.valueOf(totalDocs)); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, DATA_TYPE), schema.getFieldSpecFor(column).getDataType().toString()); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, BITS_PER_ELEMENT), String.valueOf( SingleValueUnsortedForwardIndexCreator.getNumOfBits( columnInfo.get(column).getSortedUniqueElementsArray().length))); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, DICTIONARY_ELEMENT_SIZE), String.valueOf(dictionaryCreatorMap.get(column).getStringColumnMaxLength())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, COLUMN_TYPE), String.valueOf(schema.getFieldSpecFor(column).getFieldType().toString())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, IS_SORTED), String.valueOf(columnInfo.get(column).isSorted())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, HAS_NULL_VALUE), String.valueOf(columnInfo.get(column).hasNulls())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor( column, V1Constants.MetadataKeys.Column.HAS_DICTIONARY), String.valueOf(columnInfo.get(column).isCreateDictionary())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, HAS_INVERTED_INDEX), String.valueOf(true)); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, IS_SINGLE_VALUED), String.valueOf(schema.getFieldSpecFor(column).isSingleValueField())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, MAX_MULTI_VALUE_ELEMTS), String.valueOf(columnInfo.get(column).getMaxNumberOfMutiValueElements())); properties.setProperty( V1Constants.MetadataKeys.Column.getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES), 
String.valueOf(columnInfo.get(column).getTotalNumberOfEntries())); } properties.save(); }
@Override public void seal() throws ConfigurationException, IOException { // Write all the aggregate rows to the aggregate segment LOG.info("Writing aggregate segment..."); long startMillis = System.currentTimeMillis(); int currentAggregateDocumentId = 0; Iterator<StarTreeTableRow> itr = starTreeBuilder.getTable().getAllCombinations(); while (itr.hasNext()) { StarTreeTableRow next = itr.next(); if (next.getDimensions().contains(StarTreeIndexNode.all())) { // Write using that document ID to all columns for (final String column : dictionaryCreatorMap.keySet()) { Object dictionaryIndex = null; // TODO: Is this okay? if (starTreeDimensionDictionary.containsKey(column)) { // Index the dimension value Integer dimensionId = starTreeDimensionDictionary.get(column); Integer dimensionValue = next.getDimensions().get(dimensionId); if (dimensionValue == StarTreeIndexNode.all()) { // Use all value Object allValue = StarTreeIndexNode.getAllValue(schema.getFieldSpecFor(column)); if (schema.getFieldSpecFor(column).isSingleValueField()) { dictionaryIndex = dictionaryCreatorMap.get(column).indexOfSV(allValue); } else { dictionaryIndex = dictionaryCreatorMap.get(column).indexOfMV(allValue); } } else { dictionaryIndex = dimensionValue; } } else if (starTreeMetricDictionary.containsKey(column)) { // Index the aggregate metric Integer metricId = starTreeMetricDictionary.get(column); Object columnValueToIndex = next.getMetrics().get(metricId); if (schema.getFieldSpecFor(column).isSingleValueField()) { dictionaryIndex = dictionaryCreatorMap.get(column).indexOfSV(columnValueToIndex); } else { dictionaryIndex = dictionaryCreatorMap.get(column).indexOfMV(columnValueToIndex); } } else { // Just index the raw value Object columnValueToIndex = StarTreeIndexNode.getAllValue(schema.getFieldSpecFor(column)); if (schema.getFieldSpecFor(column).isSingleValueField()) { dictionaryIndex = dictionaryCreatorMap.get(column).indexOfSV(columnValueToIndex); } else { dictionaryIndex = 
dictionaryCreatorMap.get(column).indexOfMV(columnValueToIndex); } } if (schema.getFieldSpecFor(column).isSingleValueField()) { ((SingleValueForwardIndexCreator) aggregateForwardIndexCreatorMap.get(column)) .index(currentAggregateDocumentId, (Integer) dictionaryIndex); } else { ((MultiValueForwardIndexCreator) aggregateForwardIndexCreatorMap.get(column)) .index(currentAggregateDocumentId, (int[]) dictionaryIndex); } if (config.createInvertedIndexEnabled()) { aggregateInvertedIndexCreatorMap .get(column) .add(currentAggregateDocumentId, dictionaryIndex); } } currentAggregateDocumentId++; } } long endMillis = System.currentTimeMillis(); LOG.info("Done writing aggregate segment (took {} ms)", endMillis - startMillis); for (final String column : forwardIndexCreatorMap.keySet()) { forwardIndexCreatorMap.get(column).close(); if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap.get(column).seal(); } dictionaryCreatorMap.get(column).close(); } for (final String column : aggregateForwardIndexCreatorMap.keySet()) { aggregateForwardIndexCreatorMap.get(column).close(); if (config.createInvertedIndexEnabled()) { aggregateInvertedIndexCreatorMap.get(column).seal(); } // n.b. The dictionary from raw data is used } writeMetadata(outDir, starTreeBuilder.getTotalRawDocumentCount()); // Write star tree LOG.info("Writing " + V1Constants.STARTREE_FILE); startMillis = System.currentTimeMillis(); File starTreeFile = new File(starTreeDir, V1Constants.STARTREE_FILE); OutputStream starTreeOutputStream = new FileOutputStream(starTreeFile); starTreeBuilder.getTree().writeTree(starTreeOutputStream); starTreeOutputStream.close(); endMillis = System.currentTimeMillis(); LOG.info("Wrote StarTree file (took {} ms)", endMillis - startMillis); // Copy the dictionary files into startree directory // n.b. 
this is done so the segment is as stand-alone as possible, though could be removed as an // optimization File[] dictionaryFiles = outDir.listFiles( new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.endsWith(V1Constants.Dict.FILE_EXTENTION); } }); for (File dictionaryFile : dictionaryFiles) { FileUtils.copyFile(dictionaryFile, new File(starTreeDir, dictionaryFile.getName())); } // Write star tree metadata writeMetadata(starTreeDir, starTreeBuilder.getTotalAggregateDocumentCount()); }
@Override public void init( SegmentGeneratorConfig segmentCreationSpec, Map<String, ColumnIndexCreationInfo> indexCreationInfoMap, Schema schema, int totalDocs, File outDir) throws Exception { docIdCounter = 0; config = segmentCreationSpec; this.indexCreationInfoMap = indexCreationInfoMap; dictionaryCreatorMap = new HashMap<String, SegmentDictionaryCreator>(); forwardIndexCreatorMap = new HashMap<String, ForwardIndexCreator>(); this.indexCreationInfoMap = indexCreationInfoMap; invertedIndexCreatorMap = new HashMap<String, InvertedIndexCreator>(); file = outDir; // Check that the output directory does not exist if (file.exists()) { throw new RuntimeException( "Segment output directory " + file.getAbsolutePath() + " already exists."); } file.mkdir(); this.schema = schema; this.totalDocs = totalDocs; // Initialize and build dictionaries for (final FieldSpec spec : schema.getAllFieldSpecs()) { final ColumnIndexCreationInfo info = indexCreationInfoMap.get(spec.getName()); if (info.isCreateDictionary()) { dictionaryCreatorMap.put( spec.getName(), new SegmentDictionaryCreator( info.hasNulls(), info.getSortedUniqueElementsArray(), spec, file)); } else { throw new RuntimeException("Creation of indices without dictionaries is not implemented!"); } } // For each column, build its dictionary and initialize a forwards and an inverted index for (final String column : dictionaryCreatorMap.keySet()) { dictionaryCreatorMap.get(column).build(); dictionaryCache.put(column, new HashMap<Object, Object>()); ColumnIndexCreationInfo indexCreationInfo = indexCreationInfoMap.get(column); if (schema.getFieldSpecFor(column).isSingleValueField()) { if (indexCreationInfo.isSorted()) { forwardIndexCreatorMap.put( column, new SingleValueSortedForwardIndexCreator( file, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); } else { forwardIndexCreatorMap.put( column, new SingleValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), file, 
indexCreationInfo.getSortedUniqueElementsArray().length, totalDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } } else { forwardIndexCreatorMap.put( column, new MultiValueUnsortedForwardIndexCreator( schema.getFieldSpecFor(column), file, indexCreationInfo.getSortedUniqueElementsArray().length, totalDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } if (config.createInvertedIndexEnabled()) { invertedIndexCreatorMap.put( column, new BitmapInvertedIndexCreator( file, indexCreationInfo.getSortedUniqueElementsArray().length, schema.getFieldSpecFor(column))); } } }