/**
 * One-time fixture setup: builds a segment from the {@code AVRO_DATA} resource into
 * {@code INDEX_DIR}, then reads the same Avro file record by record and stores, per column, the
 * set of distinct values (after conversion through {@code getAppropriateType}) in
 * {@code uniqueEntries}.
 *
 * @throws Exception if the segment cannot be built or the Avro file cannot be read
 */
@BeforeClass
public static void before() throws Exception {
  final String filePath =
      TestUtils.getFileFromResourceUrl(
          DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
  final File avroFile = new File(filePath);

  // Start from a clean index directory so a previous run cannot influence this one.
  if (INDEX_DIR.exists()) {
    FileUtils.deleteQuietly(INDEX_DIR);
  }

  final SegmentGeneratorConfig config =
      SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
          avroFile, INDEX_DIR, "time_day", TimeUnit.DAYS, "test");
  final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
  driver.init(config);
  driver.build();

  final Schema schema = AvroUtils.extractSchemaFromAvro(avroFile);
  final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(avroFile);
  try {
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();

    // Column names in Avro schema order.
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
      columns[i] = f.name();
      i++;
    }

    uniqueEntries = new HashMap<String, Set<Object>>();
    for (final String column : columns) {
      uniqueEntries.put(column, new HashSet<Object>());
    }

    while (avroReader.hasNext()) {
      final GenericRecord rec = avroReader.next();
      for (final String column : columns) {
        Object val = rec.get(column);
        // Avro hands strings back as Utf8; normalise to java.lang.String before conversion.
        if (val instanceof Utf8) {
          val = ((Utf8) val).toString();
        }
        uniqueEntries
            .get(column)
            .add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), val));
      }
    }
  } finally {
    // The reader wraps an open file; release it even when reading fails part-way.
    avroReader.close();
  }
}
/**
 * Writes the fixture event {@code e} through {@link AvroNativeFileOutputFormat} into an
 * in-memory buffer, reads the first record back with a reflection-based Avro reader, and checks
 * that host, nanos, priority and body match the original event.
 *
 * @throws IOException if writing or reading the Avro data fails
 */
@Test
public void testAvroNativeJson() throws IOException {
  // Encode: format the event into a byte buffer.
  AvroNativeFileOutputFormat avroFormat = new AvroNativeFileOutputFormat();
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  avroFormat.format(buffer, e);
  avroFormat.close();

  // Decode: read the buffer back using the schema reflected from EventImpl.
  Schema eventSchema = ReflectData.get().getSchema(EventImpl.class);
  DataFileStream<EventImpl> decoder =
      new DataFileStream<EventImpl>(
          new ByteArrayInputStream(buffer.toByteArray()),
          new ReflectDatumReader<EventImpl>(eventSchema));
  Event decoded = decoder.next();

  assertEquals(e.getHost(), decoded.getHost());
  assertEquals(e.getNanos(), decoded.getNanos());
  assertEquals(e.getPriority(), decoded.getPriority());
  boolean sameBody = Arrays.equals(e.getBody(), decoded.getBody());
  assertTrue(sameBody);
}
public void generateSimpleAggregationOnSingleColumnFilters() throws IOException { final Map<String, Map<Object, Integer>> cardinalityCountsMap = new HashMap<String, Map<Object, Integer>>(); final Map<String, Map<Object, Map<String, Double>>> sumMap = new HashMap<String, Map<Object, Map<String, Double>>>(); // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>(); aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>(); groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>(); for (final Field f : schema.getFields()) { final String fieldName = f.name(); if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) { isSingleValueMap.put(fieldName, isSingleValueField(f)); dataTypeMap.put(fieldName, getColumnType(f)); if (!metrics.contains(fieldName)) { cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>()); } } } for (final String column : cardinalityCountsMap.keySet()) { sumMap.put(column, new HashMap<Object, Map<String, Double>>()); } // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue while (dataStream.hasNext()) { final GenericRecord record = dataStream.next(); for (final String column : cardinalityCountsMap.keySet()) { Object value = record.get(column); if (value == null) { switch (schema.getField(column).schema().getType()) { case INT: value = 0; break; case FLOAT: value = 0F; break; case LONG: value = 0L; break; case DOUBLE: value = 0D; break; case STRING: case BOOLEAN: value = "null"; break; } } if (value instanceof Utf8) { value = ((Utf8) value).toString(); } if (value instanceof Array) { continue; } // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue for (final String metricName : metrics) { final String groupbyKeyBase = column + ":" + record.get(column) + 
":" + metricName; int dimCounter = 1; for (final String dim : cardinalityCountsMap.keySet()) { if (!dim.equals(column)) { dimCounter++; final String groupbyKey = groupbyKeyBase + ":" + dim; if (sumGroupBy.containsKey(groupbyKey)) { if (sumGroupBy.get(groupbyKey).containsKey(record.get(dim))) { sumGroupBy .get(groupbyKey) .put( record.get(dim), getAppropriateNumberType( metricName, record.get(metricName), sumGroupBy.get(groupbyKey).get(record.get(dim)))); } else { sumGroupBy .get(groupbyKey) .put(record.get(dim), Double.parseDouble(record.get(metricName).toString())); } } else { sumGroupBy.put(groupbyKey, new HashMap<Object, Double>()); sumGroupBy .get(groupbyKey) .put(record.get(dim), Double.parseDouble(record.get(metricName).toString())); } } if (dimCounter == 4) { break; } } } if (cardinalityCountsMap.get(column).containsKey(value)) { cardinalityCountsMap .get(column) .put(value, cardinalityCountsMap.get(column).get(value) + 1); } else { cardinalityCountsMap.get(column).put(value, 1); } if (!sumMap.get(column).containsKey(value)) { sumMap.get(column).put(value, new HashMap<String, Double>()); } for (final String metric : metrics) { if (!sumMap.get(column).get(value).containsKey(metric)) { sumMap .get(column) .get(value) .put(metric, getAppropriateNumberType(metric, record.get(metric), 0D)); } else { sumMap .get(column) .get(value) .put( metric, getAppropriateNumberType( metric, record.get(metric), sumMap.get(column).get(value).get(metric))); } } // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue } } dataStream.close(); if (!isRealtimeSegment) { for (final String column : cardinalityCountsMap.keySet()) { for (final Object entry : cardinalityCountsMap.get(column).keySet()) { final StringBuilder bld = new StringBuilder(); bld.append("select count(*) from "); bld.append(resourceName); bld.append(" where "); bld.append(column); bld.append("="); bld.append("'"); bld.append(entry); bld.append("'"); bld.append(" "); 
bld.append("limit 0"); String queryString = bld.toString(); if (!queryString.contains("null")) { aggregationQueries.add( new TestSimpleAggreationQuery( queryString, new Double(cardinalityCountsMap.get(column).get(entry)))); } } } } for (final String column : sumMap.keySet()) { for (final Object value : sumMap.get(column).keySet()) { for (final String metric : sumMap.get(column).get(value).keySet()) { final StringBuilder bld = new StringBuilder(); bld.append("select sum('" + metric + "') from "); bld.append(resourceName); bld.append(" where "); bld.append(column); bld.append("="); bld.append("'"); bld.append(value); bld.append("'"); bld.append(" "); bld.append("limit 0"); String queryString = bld.toString(); if (!queryString.contains("null")) { aggregationQueries.add( new TestSimpleAggreationQuery( bld.toString(), sumMap.get(column).get(value).get(metric))); } } } } for (final String groupKey : sumGroupBy.keySet()) { final String columnName = groupKey.split(":")[0]; final String columnValue = groupKey.split(":")[1]; final String metricColumn = groupKey.split(":")[2]; final String groupByColumnName = groupKey.split(":")[3]; final StringBuilder bld = new StringBuilder(); bld.append("select sum('" + metricColumn + "') from "); bld.append(resourceName); bld.append(" where "); bld.append(columnName); bld.append("="); bld.append("'"); bld.append(columnValue); bld.append("'"); bld.append(" "); bld.append(" group by "); bld.append(groupByColumnName); bld.append(" top 10 "); bld.append("limit 0"); String queryString = bld.toString(); if (!queryString.contains("null")) { groupByQueries.add( new TestGroupByAggreationQuery(bld.toString(), sumGroupBy.get(groupKey))); } } }
/**
 * Opens {@code avroFile} as an Avro data stream and caches its schema in {@code schema}.
 *
 * <p>If the Avro stream cannot be created, the underlying file stream is closed before the
 * exception propagates so that no file handle is leaked.
 *
 * @throws FileNotFoundException if {@code avroFile} cannot be opened for reading
 * @throws IOException if the Avro container header cannot be read
 */
public void init() throws FileNotFoundException, IOException {
  final FileInputStream avroInput = new FileInputStream(avroFile);
  boolean opened = false;
  try {
    dataStream =
        new DataFileStream<GenericRecord>(avroInput, new GenericDatumReader<GenericRecord>());
    schema = dataStream.getSchema();
    opened = true;
  } finally {
    if (!opened) {
      try {
        avroInput.close();
      } catch (IOException ignored) {
        // Best-effort cleanup; the original failure is the one worth reporting.
      }
    }
  }
}