/**
 * Returns true if the types of two Avro schemas are equal. This ignores things like custom field
 * properties that the equals() implementation of Schema checks.
 *
 * @param schema1 The first schema to compare
 * @param schema2 The second schema to compare
 * @return True if the types are equal, otherwise false.
 */
public static boolean avroSchemaTypesEqual(Schema schema1, Schema schema2) {
  if (schema1.getType() != schema2.getType()) {
    // If the types aren't equal, there is no need to go further. Return false.
    return false;
  }

  if (schema1.getType() == Schema.Type.ENUM || schema1.getType() == Schema.Type.FIXED) {
    // Enum and Fixed schemas must be equal according to the Schema.equals method.
    return schema1.equals(schema2);
  }

  if (schema1.getType() == Schema.Type.ARRAY) {
    // Array element schemas must be equal, which is tested by recursively
    // calling this method.
    return avroSchemaTypesEqual(schema1.getElementType(), schema2.getElementType());
  } else if (schema1.getType() == Schema.Type.MAP) {
    // Map value schemas must be equal, which is tested by recursively
    // calling this method.
    return avroSchemaTypesEqual(schema1.getValueType(), schema2.getValueType());
  } else if (schema1.getType() == Schema.Type.UNION) {
    // Compare union branches in the same position by comparing their schemas,
    // recursively calling this method.
    if (schema1.getTypes().size() != schema2.getTypes().size()) {
      return false;
    }
    for (int i = 0; i < schema1.getTypes().size(); i++) {
      if (!avroSchemaTypesEqual(schema1.getTypes().get(i), schema2.getTypes().get(i))) {
        return false;
      }
    }
    return true;
  } else if (schema1.getType() == Schema.Type.RECORD) {
    // Compare record fields that match by name by comparing their schemas,
    // recursively calling this method.
    if (schema1.getFields().size() != schema2.getFields().size()) {
      return false;
    }
    for (Field field1 : schema1.getFields()) {
      Field field2 = schema2.getField(field1.name());
      if (field2 == null) {
        return false;
      }
      if (!avroSchemaTypesEqual(field1.schema(), field2.schema())) {
        return false;
      }
    }
    return true;
  } else {
    // All other types are primitive, so matching in type is enough.
    return true;
  }
}
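/* Usage sketch (not from the original source; assumes Avro's SchemaBuilder and
 * illustrative names): two record schemas that differ only in a custom field
 * property are unequal under Schema.equals but type-equal under this check. */
Schema a = SchemaBuilder.record("R").fields().requiredInt("x").endRecord();
Schema b = SchemaBuilder.record("R").fields().requiredInt("x").endRecord();
b.getField("x").addProp("custom", "value"); // extra property, same type structure
assert !a.equals(b);               // Schema.equals also compares properties
assert avroSchemaTypesEqual(a, b); // this check compares only the types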
/**
 * Get a map of field names to default values for an Avro schema.
 *
 * @param avroRecordSchema The schema from which to get the map of field names to default values.
 * @return The map.
 */
public static Map<String, Object> getDefaultValueMap(Schema avroRecordSchema) {
  List<Field> defaultFields = new ArrayList<Field>();
  for (Field f : avroRecordSchema.getFields()) {
    if (f.defaultValue() != null) {
      // Need to create a new Field here, or we will get
      // org.apache.avro.AvroRuntimeException: Field already used: schemaVersion
      defaultFields.add(new Field(f.name(), f.schema(), f.doc(), f.defaultValue(), f.order()));
    }
  }

  Schema defaultSchema = Schema.createRecord(defaultFields);
  Schema emptyRecordSchema = Schema.createRecord(new ArrayList<Field>());

  DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(emptyRecordSchema);
  DatumReader<GenericRecord> reader =
      new GenericDatumReader<GenericRecord>(emptyRecordSchema, defaultSchema);

  GenericRecord emptyRecord = new GenericData.Record(emptyRecordSchema);
  GenericRecord defaultRecord =
      AvroUtils.readAvroEntity(AvroUtils.writeAvroEntity(emptyRecord, writer), reader);

  Map<String, Object> defaultValueMap = new HashMap<String, Object>();
  for (Field f : defaultFields) {
    defaultValueMap.put(f.name(), defaultRecord.get(f.name()));
  }
  return defaultValueMap;
}
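/* Usage sketch (illustrative schema, not from the original source): resolving an
 * empty record against the schema that carries the defaults makes each default
 * value materialize, which this method exposes as a name-to-value map. */
Schema s = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Rec\",\"fields\":["
        + "{\"name\":\"count\",\"type\":\"int\",\"default\":0},"
        + "{\"name\":\"label\",\"type\":\"string\",\"default\":\"none\"}]}");
Map<String, Object> defaults = getDefaultValueMap(s);
// defaults.get("count") -> 0; defaults.get("label") -> "none" (as a Utf8/String)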
@BeforeClass
public static void before() throws Exception {
  final String filePath = TestUtils.getFileFromResourceUrl(
      DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
  if (INDEX_DIR.exists()) {
    FileUtils.deleteQuietly(INDEX_DIR);
  }

  final SegmentGeneratorConfig config =
      SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
          new File(filePath), INDEX_DIR, "time_day", TimeUnit.DAYS, "test");
  final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
  driver.init(config);
  driver.build();

  final Schema schema = AvroUtils.extractSchemaFromAvro(new File(filePath));
  final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
  final org.apache.avro.Schema avroSchema = avroReader.getSchema();
  final String[] columns = new String[avroSchema.getFields().size()];
  int i = 0;
  for (final Field f : avroSchema.getFields()) {
    columns[i] = f.name();
    i++;
  }

  uniqueEntries = new HashMap<String, Set<Object>>();
  for (final String column : columns) {
    uniqueEntries.put(column, new HashSet<Object>());
  }

  while (avroReader.hasNext()) {
    final GenericRecord rec = avroReader.next();
    for (final String column : columns) {
      Object val = rec.get(column);
      if (val instanceof Utf8) {
        val = ((Utf8) val).toString();
      }
      uniqueEntries
          .get(column)
          .add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), val));
    }
  }
}
@Override
public void insertValues(NullWritable key, AvroGenericRecordWritable value) throws SQLException {
  deleteStmt.execute();
  for (int i = 0; i < schema.getFields().size(); i++) {
    insertStmt.setObject(i + 1, convertIn(value.getRecord().get(i)));
  }
  insertStmt.execute();
}
private static int hashCode(HashData data, Schema schema) throws IOException {
  Decoder decoder = data.decoder;
  switch (schema.getType()) {
    case RECORD: {
      int hashCode = 1;
      for (Field field : schema.getFields()) {
        if (field.order() == Field.Order.IGNORE) {
          GenericDatumReader.skip(field.schema(), decoder);
          continue;
        }
        hashCode = hashCode * 31 + hashCode(data, field.schema());
      }
      return hashCode;
    }
    case ENUM:
    case INT:
      return decoder.readInt();
    case FLOAT:
      return Float.floatToIntBits(decoder.readFloat());
    case LONG: {
      long l = decoder.readLong();
      return (int) (l ^ (l >>> 32));
    }
    case DOUBLE: {
      long l = Double.doubleToLongBits(decoder.readDouble());
      return (int) (l ^ (l >>> 32));
    }
    case ARRAY: {
      Schema elementType = schema.getElementType();
      int hashCode = 1;
      for (long l = decoder.readArrayStart(); l != 0; l = decoder.arrayNext()) {
        for (long i = 0; i < l; i++) {
          hashCode = hashCode * 31 + hashCode(data, elementType);
        }
      }
      return hashCode;
    }
    case MAP:
      throw new AvroRuntimeException("Can't hashCode maps!");
    case UNION:
      return hashCode(data, schema.getTypes().get(decoder.readInt()));
    case FIXED:
      return hashBytes(1, data, schema.getFixedSize(), false);
    case STRING:
      return hashBytes(0, data, decoder.readInt(), false);
    case BYTES:
      return hashBytes(1, data, decoder.readInt(), true);
    case BOOLEAN:
      return decoder.readBoolean() ? 1231 : 1237;
    case NULL:
      return 0;
    default:
      throw new AvroRuntimeException("Unexpected schema to hashCode!");
  }
}
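/* hashBytes is referenced above but not shown; a minimal sketch consistent with
 * the call sites (seed value, byte count, optional reversed iteration), assuming
 * the decoder exposes getBuf/getPos/skipFixed as the compare code below also
 * relies on: */
private static int hashBytes(int init, HashData data, int len, boolean rev) throws IOException {
  int hashCode = init;
  byte[] bytes = data.decoder.getBuf();
  int start = data.decoder.getPos();
  int end = start + len;
  if (rev) {
    for (int i = end - 1; i >= start; i--) hashCode = hashCode * 31 + bytes[i];
  } else {
    for (int i = start; i < end; i++) hashCode = hashCode * 31 + bytes[i];
  }
  data.decoder.skipFixed(len); // advance past the hashed bytes
  return hashCode;
}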
@Override
public void setConf(org.apache.hadoop.conf.Configuration conf) {
  if (conf == null) {
    return; // you first get a null configuration - ignore that
  }
  String mos = conf.get(AvroJob.MAP_OUTPUT_SCHEMA);
  Schema schema = Schema.parse(mos);
  pair = new Pair<Object, Object>(schema);
  Schema keySchema = Pair.getKeySchema(schema);
  final List<Field> fields = keySchema.getFields();
  final GenericRecord key = new GenericData.Record(keySchema);
  projector = new Projector(key, fields);
}
public static DataType getColumnType(Field field) {
  org.apache.avro.Schema fieldSchema = field.schema();
  fieldSchema = extractSchemaFromUnionIfNeeded(fieldSchema);

  final Type type = fieldSchema.getType();
  if (type == Type.ARRAY) {
    org.apache.avro.Schema elementSchema =
        extractSchemaFromUnionIfNeeded(fieldSchema.getElementType());
    if (elementSchema.getType() == Type.RECORD) {
      if (elementSchema.getFields().size() == 1) {
        elementSchema = elementSchema.getFields().get(0).schema();
      } else {
        throw new RuntimeException("More than one schema in Multi-value column!");
      }
      elementSchema = extractSchemaFromUnionIfNeeded(elementSchema);
    }
    return DataType.valueOf(elementSchema.getType());
  } else {
    return DataType.valueOf(type);
  }
}
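/* extractSchemaFromUnionIfNeeded is referenced above but not shown; a plausible
 * minimal sketch: unwrap a nullable union such as ["null", "string"] to its
 * first non-null branch, and pass any other schema through unchanged. */
private static org.apache.avro.Schema extractSchemaFromUnionIfNeeded(org.apache.avro.Schema schema) {
  if (schema.getType() != Type.UNION) {
    return schema;
  }
  for (org.apache.avro.Schema branch : schema.getTypes()) {
    if (branch.getType() != Type.NULL) {
      return branch; // first non-null branch
    }
  }
  return schema; // union of only null: nothing to unwrap
}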
@Override @SuppressWarnings("unchecked") protected void writeRecord(Schema schema, Object datum, Encoder out) throws IOException { if (persistent == null) { persistent = (T) datum; } if (!writeDirtyBits) { super.writeRecord(schema, datum, out); return; } // check if top level schema if (schema.equals(persistent.getSchema())) { // write readable fields and dirty fields info boolean[] dirtyFields = new boolean[schema.getFields().size()]; boolean[] readableFields = new boolean[schema.getFields().size()]; StateManager manager = persistent.getStateManager(); int i = 0; for (@SuppressWarnings("unused") Field field : schema.getFields()) { dirtyFields[i] = manager.isDirty(persistent, i); readableFields[i] = manager.isReadable(persistent, i); i++; } IOUtils.writeBoolArray(out, dirtyFields); IOUtils.writeBoolArray(out, readableFields); for (Field field : schema.getFields()) { if (readableFields[field.pos()]) { write(field.schema(), getData().getField(datum, field.name(), field.pos()), out); } } } else { super.writeRecord(schema, datum, out); } }
@Test
public void test_getHiveTypeFromAvroType_primitive() throws Exception {
  // Expected Hive types
  String[] expectedTypes = {
    "INT", "BIGINT", "BOOLEAN", "FLOAT", "DOUBLE", "BINARY", "STRING",
  };

  Schema testSchema = buildPrimitiveAvroSchema();
  List<Schema.Field> fields = testSchema.getFields();
  for (int i = 0; i < fields.size(); i++) {
    assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
  }
}
/**
 * Called by {@link #containsRecursiveRecord(Schema)}; recursively checks whether the input
 * schema contains recursive records.
 */
protected static boolean containsRecursiveRecord(Schema s, Set<String> definedRecordNames) {
  /* if it is a record, check itself and all fields */
  if (s.getType().equals(Schema.Type.RECORD)) {
    String name = s.getName();
    if (definedRecordNames.contains(name)) {
      return true;
    }

    /* add its own name into the defined record set */
    definedRecordNames.add(s.getName());

    /* check all fields */
    List<Field> fields = s.getFields();
    for (Field field : fields) {
      Schema fs = field.schema();
      if (containsRecursiveRecord(fs, definedRecordNames)) {
        return true;
      }
    }

    /* remove its own name from the name set */
    definedRecordNames.remove(s.getName());
    return false;
  }

  /* if it is an array, check its element type */
  else if (s.getType().equals(Schema.Type.ARRAY)) {
    Schema fs = s.getElementType();
    return containsRecursiveRecord(fs, definedRecordNames);
  }

  /* if it is a map, check its value type */
  else if (s.getType().equals(Schema.Type.MAP)) {
    Schema vs = s.getValueType();
    return containsRecursiveRecord(vs, definedRecordNames);
  }

  /* if it is a union, check all possible types */
  else if (s.getType().equals(Schema.Type.UNION)) {
    List<Schema> types = s.getTypes();
    for (Schema type : types) {
      if (containsRecursiveRecord(type, definedRecordNames)) {
        return true;
      }
    }
    return false;
  }

  /* return false for other cases */
  else {
    return false;
  }
}
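/* The single-argument entry point named in the javadoc above is not shown; a
 * minimal wrapper that seeds the recursion with an empty set of record names: */
public static boolean containsRecursiveRecord(Schema s) {
  return containsRecursiveRecord(s, new HashSet<String>());
}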
@Test
public void test_getHiveTypeFromAvroType_complex() throws Exception {
  // Expected Hive types
  String[] expectedTypes = {
    "INT", "MAP<STRING, DOUBLE>", "STRING", "UNIONTYPE<BIGINT, FLOAT>", "ARRAY<INT>"
  };

  Schema testSchema = buildComplexAvroSchema();
  List<Schema.Field> fields = testSchema.getFields();
  for (int i = 0; i < fields.size(); i++) {
    assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
  }

  assertEquals(
      "STRUCT<myInt:INT, myMap:MAP<STRING, DOUBLE>, myEnum:STRING, myLongOrFloat:UNIONTYPE<BIGINT, FLOAT>, myIntList:ARRAY<INT>>",
      NiFiOrcUtils.getHiveTypeFromAvroType(testSchema));
}
@Override
public void initialize(Connection conn) throws SQLException {
  this.conn = conn;

  StringBuilder create = new StringBuilder("CREATE TABLE ").append(tableName).append(" (");
  StringBuilder insert = new StringBuilder("INSERT INTO ").append(tableName).append(" VALUES (");
  for (int i = 0; i < schema.getFields().size(); i++) {
    Schema.Field sf = schema.getFields().get(i);
    create.append(sf.name()).append(' ').append(getSQLType(sf.schema())).append(',');
    insert.append("?,");
  }
  create.deleteCharAt(create.length() - 1).append(")");
  insert.deleteCharAt(insert.length() - 1).append(")");

  conn.createStatement().execute(create.toString());
  insertStmt = conn.prepareStatement(insert.toString());
  deleteStmt = conn.prepareStatement("DELETE FROM " + tableName);
}
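/* For illustration (hypothetical table "cars" with fields "id: long" and
 * "name: string", and assuming getSQLType maps them to BIGINT and VARCHAR),
 * the statements built above come out as:
 *
 *   CREATE TABLE cars (id BIGINT,name VARCHAR)
 *   INSERT INTO cars VALUES (?,?)
 *
 * The trailing comma appended on each loop iteration is removed by deleteCharAt
 * before the closing parenthesis is added. */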
/*
 * For each Avro field, search for a nested schema.
 * If the field is nested, create a subtree and recurse;
 * else fetch the field as an element.
 */
private static void fetchToTree(Field field, Tree parent) {
  if (field.schema().getType() == Schema.Type.RECORD) {
    Tree child = new Tree(parent);
    child.setName(field.name());
    for (Field fieldOfField : field.schema().getFields()) {
      fetchToTree(fieldOfField, child);
    }
    parent.getTrees().add(child);
  } else if (field.schema().getType() == Schema.Type.ARRAY) {
    if (field.schema().getElementType().getType() == Schema.Type.RECORD) {
      if (!multipleData) {
        multipleData = true;
      }
      Schema arraySchema = field.schema().getElementType();
      Tree childParent = new Tree(parent);
      childParent.setName(field.name()); // e.g. "employee"
      for (Field fieldOfField : arraySchema.getFields()) {
        fetchToTree(fieldOfField, childParent);
      }
      parent.getTrees().add(childParent);
    }
  } else {
    Element elementNew = new Element(parent);
    elementNew.setName(field.name());
    parent.getElements().add(elementNew);
  }
}
@Test
public void test_getOrcField_primitive() throws Exception {
  // Expected ORC types
  TypeInfo[] expectedTypes = {
    TypeInfoFactory.getPrimitiveTypeInfo("int"),
    TypeInfoFactory.getPrimitiveTypeInfo("bigint"),
    TypeInfoFactory.getPrimitiveTypeInfo("boolean"),
    TypeInfoFactory.getPrimitiveTypeInfo("float"),
    TypeInfoFactory.getPrimitiveTypeInfo("double"),
    TypeInfoFactory.getPrimitiveTypeInfo("binary"),
    TypeInfoFactory.getPrimitiveTypeInfo("string")
  };

  // Build a fake Avro record with all primitive types
  Schema testSchema = buildPrimitiveAvroSchema();
  List<Schema.Field> fields = testSchema.getFields();
  for (int i = 0; i < fields.size(); i++) {
    assertEquals(expectedTypes[i], NiFiOrcUtils.getOrcField(fields.get(i).schema()));
  }
}
@Override
public boolean retrieveResults(NullWritable key, AvroGenericRecordWritable value) throws SQLException {
  Statement stmt = conn.createStatement();
  try {
    ResultSet rs = stmt.executeQuery("select * from " + tableName);
    if (rs.next()) {
      for (int i = 0; i < schema.getFields().size(); i++) {
        value.getRecord().put(i, convertOut(rs.getObject(i + 1)));
      }
      return false;
    } else {
      return true;
    }
  } finally {
    if (stmt != null) {
      stmt.close();
    }
  }
}
@Test
public void test_getPrimitiveOrcTypeFromPrimitiveAvroType() throws Exception {
  // Expected ORC types
  TypeInfo[] expectedTypes = {
    TypeInfoCreator.createInt(),
    TypeInfoCreator.createLong(),
    TypeInfoCreator.createBoolean(),
    TypeInfoCreator.createFloat(),
    TypeInfoCreator.createDouble(),
    TypeInfoCreator.createBinary(),
    TypeInfoCreator.createString(),
  };

  Schema testSchema = buildPrimitiveAvroSchema();
  List<Schema.Field> fields = testSchema.getFields();
  for (int i = 0; i < fields.size(); i++) {
    assertEquals(
        expectedTypes[i],
        NiFiOrcUtils.getPrimitiveOrcTypeFromPrimitiveAvroType(fields.get(i).schema().getType()));
  }
}
@Test
public void testProjection() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  Configuration conf = new Configuration();

  Schema schema = Car.getClassSchema();
  List<Schema.Field> fields = schema.getFields();

  List<Schema.Field> projectedFields = new ArrayList<Schema.Field>();
  for (Schema.Field field : fields) {
    String name = field.name();
    if ("optionalExtra".equals(name) || "serviceHistory".equals(name)) {
      continue;
    }
    // Field objects cannot be reused across schemas, so clone each one.
    Schema.Field fieldClone =
        new Schema.Field(name, field.schema(), field.doc(), field.defaultValue());
    projectedFields.add(fieldClone);
  }

  Schema projectedSchema = Schema.createRecord(
      schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
  projectedSchema.setFields(projectedFields);
  AvroReadSupport.setRequestedProjection(conf, projectedSchema);

  ParquetReader<Car> reader = new AvroParquetReader<Car>(conf, path);
  for (Car car = reader.read(); car != null; car = reader.read()) {
    assertNotNull(car.getDoors());
    assertNotNull(car.getEngine());
    assertNotNull(car.getMake());
    assertNotNull(car.getModel());
    assertNotNull(car.getYear());
    assertNotNull(car.getVin());
    assertNull(car.getOptionalExtra());
    assertNull(car.getServiceHistory());
  }
}
/**
 * Precondition-style validation that the DatasetDescriptor is compatible.
 *
 * @param descriptor a {@link DatasetDescriptor}
 */
public static void checkDescriptor(DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  Schema schema = descriptor.getSchema();
  checkSchema(schema);

  if (descriptor.isPartitioned()) {
    // marked as [BUG] because this is checked in DatasetDescriptor
    Preconditions.checkArgument(
        schema.getType() == Schema.Type.RECORD,
        "[BUG] Partitioned datasets must have record schemas");

    Set<String> names = Sets.newHashSet();
    for (Schema.Field field : schema.getFields()) {
      names.add(field.name());
    }

    List<String> incompatible = Lists.newArrayList();
    List<String> duplicates = Lists.newArrayList();
    for (FieldPartitioner fp : descriptor.getPartitionStrategy().getFieldPartitioners()) {
      String name = fp.getName();
      if (!isCompatibleName(name)) {
        incompatible.add(name);
      } else if (names.contains(name)) {
        duplicates.add(name);
      } else {
        names.add(name);
      }
    }
    Preconditions.checkState(
        incompatible.isEmpty(),
        "Hive incompatible: partition names are not alphanumeric (plus '_'): %s",
        Joiner.on(", ").join(incompatible));
    Preconditions.checkState(
        duplicates.isEmpty(),
        "Hive incompatible: partition names duplicate data fields: %s",
        Joiner.on(", ").join(duplicates));
  }
}
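/* isCompatibleName is referenced above but not shown; a sketch matching the
 * error message's rule (alphanumeric plus '_', i.e. Hive-style identifiers).
 * The exact pattern is an assumption: */
private static final Pattern COMPATIBLE_NAME_PATTERN = Pattern.compile("[a-zA-Z_][a-zA-Z0-9_]*");

static boolean isCompatibleName(String name) {
  return COMPATIBLE_NAME_PATTERN.matcher(name).matches();
}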
/** Determine whether the input schema contains generic unions. */
public static boolean containsGenericUnion(Schema s) {
  /* if it is a record, check all fields */
  if (s.getType().equals(Schema.Type.RECORD)) {
    List<Field> fields = s.getFields();
    for (Field field : fields) {
      Schema fs = field.schema();
      if (containsGenericUnion(fs)) {
        return true;
      }
    }
    return false;
  }

  /* if it is an array, check its element type */
  else if (s.getType().equals(Schema.Type.ARRAY)) {
    Schema fs = s.getElementType();
    return containsGenericUnion(fs);
  }

  /* if it is a map, check its value type */
  else if (s.getType().equals(Schema.Type.MAP)) {
    Schema vs = s.getValueType();
    return containsGenericUnion(vs);
  }

  /* if it is a union, check all possible types and itself */
  else if (s.getType().equals(Schema.Type.UNION)) {
    List<Schema> types = s.getTypes();
    for (Schema type : types) {
      if (containsGenericUnion(type)) {
        return true;
      }
    }
    /* check whether itself is acceptable (null-union) */
    return !isAcceptableUnion(s);
  }

  /* return false for other cases */
  else {
    return false;
  }
}
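/* isAcceptableUnion is referenced above but not shown; a minimal sketch under
 * the usual null-union convention (a union is acceptable only when it has at
 * most one branch, or exactly two branches one of which is null): */
protected static boolean isAcceptableUnion(Schema s) {
  if (!s.getType().equals(Schema.Type.UNION)) {
    return false;
  }
  List<Schema> types = s.getTypes();
  if (types.size() <= 1) {
    return true;  // trivial union
  }
  if (types.size() > 2) {
    return false; // more than two branches: a generic union
  }
  // exactly two branches: acceptable only if one of them is null
  return types.get(0).getType().equals(Schema.Type.NULL)
      || types.get(1).getType().equals(Schema.Type.NULL);
}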
private Object translate(Object value, DataSchema dataSchema, Schema avroSchema) {
  AvroOverride avroOverride = getAvroOverride(dataSchema);
  if (avroOverride != null) {
    return avroOverride
        .getCustomDataTranslator()
        .avroGenericToData(this, value, avroSchema, dataSchema);
  }

  DataSchema dereferencedDataSchema = dataSchema.getDereferencedDataSchema();
  DataSchema.Type type = dereferencedDataSchema.getType();
  Object result;
  switch (type) {
    case NULL:
      if (value != null) {
        appendMessage("value must be null for null schema");
        result = BAD_RESULT;
        break;
      }
      result = Data.NULL;
      break;
    case BOOLEAN:
      result = ((Boolean) value).booleanValue();
      break;
    case INT:
      result = ((Number) value).intValue();
      break;
    case LONG:
      result = ((Number) value).longValue();
      break;
    case FLOAT:
      result = ((Number) value).floatValue();
      break;
    case DOUBLE:
      result = ((Number) value).doubleValue();
      break;
    case STRING:
      result = value.toString();
      break;
    case BYTES:
      ByteBuffer byteBuffer = (ByteBuffer) value;
      ByteString byteString = ByteString.copy(byteBuffer);
      byteBuffer.rewind();
      result = byteString;
      break;
    case ENUM:
      String enumValue = value.toString();
      EnumDataSchema enumDataSchema = (EnumDataSchema) dereferencedDataSchema;
      if (!enumDataSchema.getSymbols().contains(enumValue)) {
        appendMessage("enum value %1$s not one of %2$s", enumValue, enumDataSchema.getSymbols());
        result = BAD_RESULT;
        break;
      }
      result = enumValue;
      break;
    case FIXED:
      GenericFixed fixed = (GenericFixed) value;
      byte[] fixedBytes = fixed.bytes();
      FixedDataSchema fixedDataSchema = (FixedDataSchema) dereferencedDataSchema;
      if (fixedDataSchema.getSize() != fixedBytes.length) {
        appendMessage(
            "GenericFixed size %1$d != FixedDataSchema size %2$d",
            fixedBytes.length, fixedDataSchema.getSize());
        result = BAD_RESULT;
        break;
      }
      byteString = ByteString.copy(fixedBytes);
      result = byteString;
      break;
    case MAP:
      @SuppressWarnings("unchecked")
      Map<?, Object> map = (Map<?, Object>) value;
      DataSchema valueDataSchema = ((MapDataSchema) dereferencedDataSchema).getValues();
      Schema valueAvroSchema = avroSchema.getValueType();
      DataMap dataMap = new DataMap(map.size());
      for (Map.Entry<?, Object> entry : map.entrySet()) {
        String key = entry.getKey().toString();
        _path.addLast(key);
        Object entryValue = translate(entry.getValue(), valueDataSchema, valueAvroSchema);
        _path.removeLast();
        dataMap.put(key, entryValue);
      }
      result = dataMap;
      break;
    case ARRAY:
      GenericArray<?> list = (GenericArray<?>) value;
      DataSchema elementDataSchema = ((ArrayDataSchema) dereferencedDataSchema).getItems();
      Schema elementAvroSchema = avroSchema.getElementType();
      DataList dataList = new DataList(list.size());
      for (int i = 0; i < list.size(); i++) {
        _path.addLast(i);
        Object entryValue = translate(list.get(i), elementDataSchema, elementAvroSchema);
        _path.removeLast();
        dataList.add(entryValue);
      }
      result = dataList;
      break;
    case RECORD:
      GenericRecord record = (GenericRecord) value;
      RecordDataSchema recordDataSchema = (RecordDataSchema) dereferencedDataSchema;
      dataMap = new DataMap(avroSchema.getFields().size());
      for (RecordDataSchema.Field field : recordDataSchema.getFields()) {
        String fieldName = field.getName();
        Object fieldValue = record.get(fieldName);
        // fieldValue could be null if the Avro schema does not contain the named field or
        // the field is present with a null value. In either case we do not add a value
        // to the translated DataMap. We do not consider optional/required/default here
        // either (i.e. it is not an error if a required field is missing); the user can
        // later call ValidateDataAgainstSchema with various settings for RequiredMode to
        // obtain the desired behaviour.
        if (fieldValue == null) {
          continue;
        }
        boolean isOptional = field.getOptional();
        DataSchema fieldDataSchema = field.getType();
        Schema fieldAvroSchema = avroSchema.getField(fieldName).schema();
        if (isOptional && (fieldDataSchema.getDereferencedType() != DataSchema.Type.UNION)) {
          // The Avro schema should be a union with 2 types: null and the field's type.
          Map.Entry<String, Schema> fieldAvroEntry =
              findUnionMember(fieldDataSchema, fieldAvroSchema);
          if (fieldAvroEntry == null) {
            continue;
          }
          fieldAvroSchema = fieldAvroEntry.getValue();
        }
        _path.addLast(fieldName);
        dataMap.put(fieldName, translate(fieldValue, fieldDataSchema, fieldAvroSchema));
        _path.removeLast();
      }
      result = dataMap;
      break;
    case UNION:
      UnionDataSchema unionDataSchema = (UnionDataSchema) dereferencedDataSchema;
      Map.Entry<DataSchema, Schema> memberSchemas =
          findUnionMemberSchema(value, unionDataSchema, avroSchema);
      if (memberSchemas == null) {
        result = BAD_RESULT;
        break;
      }
      if (value == null) {
        // schema must be the "null" schema
        result = Data.NULL;
      } else {
        DataSchema memberDataSchema = memberSchemas.getKey();
        Schema memberAvroSchema = memberSchemas.getValue();
        String key = memberDataSchema.getUnionMemberKey();
        dataMap = new DataMap(1);
        _path.addLast(key);
        dataMap.put(key, translate(value, memberDataSchema, memberAvroSchema));
        _path.removeLast();
        result = dataMap;
      }
      break;
    default:
      appendMessage("schema type unknown %1$s", dereferencedDataSchema.getType());
      result = BAD_RESULT;
      break;
  }
  return result;
}
/**
 * Compares two encoded values. Returns zero if they are equal, a positive value if the first is
 * greater, and a negative value if the first is less.
 */
private static int compare(Decoders d, Schema schema) throws IOException {
  Decoder d1 = d.d1;
  Decoder d2 = d.d2;
  switch (schema.getType()) {
    case RECORD: {
      for (Field field : schema.getFields()) {
        if (field.order() == Field.Order.IGNORE) {
          GenericDatumReader.skip(field.schema(), d1);
          GenericDatumReader.skip(field.schema(), d2);
          continue;
        }
        int c = compare(d, field.schema());
        if (c != 0) {
          return (field.order() != Field.Order.DESCENDING) ? c : -c;
        }
      }
      return 0;
    }
    case ENUM:
    case INT: {
      int i1 = d1.readInt();
      int i2 = d2.readInt();
      return i1 == i2 ? 0 : (i1 > i2 ? 1 : -1);
    }
    case LONG: {
      long l1 = d1.readLong();
      long l2 = d2.readLong();
      return l1 == l2 ? 0 : (l1 > l2 ? 1 : -1);
    }
    case ARRAY: {
      long i = 0;          // position in array
      long r1 = 0, r2 = 0; // remaining in current block
      long l1 = 0, l2 = 0; // total array length
      while (true) {
        if (r1 == 0) {     // refill block(s)
          r1 = d1.readLong();
          if (r1 < 0) {
            r1 = -r1;
            d1.readLong(); // skip the block's byte count
          }
          l1 += r1;
        }
        if (r2 == 0) {
          r2 = d2.readLong();
          if (r2 < 0) {
            r2 = -r2;
            d2.readLong();
          }
          l2 += r2;
        }
        if (r1 == 0 || r2 == 0) { // empty block: done
          return (l1 == l2) ? 0 : ((l1 > l2) ? 1 : -1);
        }
        long l = Math.min(l1, l2);
        while (i < l) { // compare to end of block
          int c = compare(d, schema.getElementType());
          if (c != 0) {
            return c;
          }
          i++;
          r1--;
          r2--;
        }
      }
    }
    case MAP:
      throw new AvroRuntimeException("Can't compare maps!");
    case UNION: {
      int i1 = d1.readInt();
      int i2 = d2.readInt();
      if (i1 == i2) {
        return compare(d, schema.getTypes().get(i1));
      } else {
        return i1 - i2;
      }
    }
    case FIXED: {
      int size = schema.getFixedSize();
      int c = compareBytes(d.d1.getBuf(), d.d1.getPos(), size, d.d2.getBuf(), d.d2.getPos(), size);
      d.d1.skipFixed(size);
      d.d2.skipFixed(size);
      return c;
    }
    case STRING:
    case BYTES: {
      int l1 = d1.readInt();
      int l2 = d2.readInt();
      int c = compareBytes(d.d1.getBuf(), d.d1.getPos(), l1, d.d2.getBuf(), d.d2.getPos(), l2);
      d.d1.skipFixed(l1);
      d.d2.skipFixed(l2);
      return c;
    }
    case FLOAT: {
      float f1 = d1.readFloat();
      float f2 = d2.readFloat();
      return (f1 == f2) ? 0 : ((f1 > f2) ? 1 : -1);
    }
    case DOUBLE: {
      double f1 = d1.readDouble();
      double f2 = d2.readDouble();
      return (f1 == f2) ? 0 : ((f1 > f2) ? 1 : -1);
    }
    case BOOLEAN: {
      boolean b1 = d1.readBoolean();
      boolean b2 = d2.readBoolean();
      return (b1 == b2) ? 0 : (b1 ? 1 : -1);
    }
    case NULL:
      return 0;
    default:
      throw new AvroRuntimeException("Unexpected schema to compare!");
  }
}
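/* compareBytes is referenced above but not shown; a sketch of an unsigned
 * lexicographic byte comparison consistent with the call sites (buffer, start
 * offset, and length for each side): */
private static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
  int end1 = s1 + l1;
  int end2 = s2 + l2;
  for (int i = s1, j = s2; i < end1 && j < end2; i++, j++) {
    int a = b1[i] & 0xff; // compare as unsigned
    int b = b2[j] & 0xff;
    if (a != b) {
      return a - b;
    }
  }
  return l1 - l2; // equal common prefix: shorter value sorts first
}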
/** Writes the given Avro datum into the given record, using the given Avro schema. */
private void extractTree(Object datum, Schema schema, Record outputRecord, String prefix) {
  // RECORD, ENUM, ARRAY, MAP, UNION, FIXED, STRING, BYTES, INT, LONG, FLOAT,
  // DOUBLE, BOOLEAN, NULL
  switch (schema.getType()) {
    case RECORD: {
      IndexedRecord avroRecord = (IndexedRecord) datum;
      String prefix2 = prefix + "/";
      for (Field field : schema.getFields()) {
        extractTree(avroRecord.get(field.pos()), field.schema(), outputRecord, prefix2 + field.name());
      }
      break;
    }
    case ENUM: {
      GenericEnumSymbol symbol = (GenericEnumSymbol) datum;
      outputRecord.put(prefix, symbol.toString());
      break;
    }
    case ARRAY: {
      Iterator iter = ((Collection) datum).iterator();
      while (iter.hasNext()) {
        extractTree(iter.next(), schema.getElementType(), outputRecord, prefix);
      }
      break;
    }
    case MAP: {
      Map<CharSequence, ?> map = (Map<CharSequence, ?>) datum;
      for (Map.Entry<CharSequence, ?> entry : map.entrySet()) {
        extractTree(entry.getValue(), schema.getValueType(), outputRecord,
            prefix + "/" + entry.getKey().toString());
      }
      break;
    }
    case UNION: {
      int index = GenericData.get().resolveUnion(schema, datum);
      extractTree(datum, schema.getTypes().get(index), outputRecord, prefix);
      break;
    }
    case FIXED: {
      GenericFixed fixed = (GenericFixed) datum;
      outputRecord.put(prefix, fixed.bytes());
      break;
    }
    case BYTES: {
      ByteBuffer buf = (ByteBuffer) datum;
      int pos = buf.position();
      byte[] bytes = new byte[buf.remaining()];
      buf.get(bytes);
      buf.position(pos); // undo relative read
      outputRecord.put(prefix, bytes);
      break;
    }
    case STRING: {
      outputRecord.put(prefix, datum.toString());
      break;
    }
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
    case BOOLEAN: {
      outputRecord.put(prefix, datum);
      break;
    }
    case NULL: {
      break;
    }
    default:
      throw new MorphlineRuntimeException("Unknown Avro schema type: " + schema.getType());
  }
}
public void generateSimpleAggregationOnSingleColumnFilters() throws IOException {
  final Map<String, Map<Object, Integer>> cardinalityCountsMap =
      new HashMap<String, Map<Object, Integer>>();
  final Map<String, Map<Object, Map<String, Double>>> sumMap =
      new HashMap<String, Map<Object, Map<String, Double>>>();
  // here the string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue
  final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>();

  aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>();
  groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>();

  for (final Field f : schema.getFields()) {
    final String fieldName = f.name();
    if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) {
      isSingleValueMap.put(fieldName, isSingleValueField(f));
      dataTypeMap.put(fieldName, getColumnType(f));
      if (!metrics.contains(fieldName)) {
        cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>());
      }
    }
  }

  for (final String column : cardinalityCountsMap.keySet()) {
    sumMap.put(column, new HashMap<Object, Map<String, Double>>());
  }

  while (dataStream.hasNext()) {
    final GenericRecord record = dataStream.next();

    for (final String column : cardinalityCountsMap.keySet()) {
      Object value = record.get(column);

      if (value == null) {
        switch (schema.getField(column).schema().getType()) {
          case INT:
            value = 0;
            break;
          case FLOAT:
            value = 0F;
            break;
          case LONG:
            value = 0L;
            break;
          case DOUBLE:
            value = 0D;
            break;
          case STRING:
          case BOOLEAN:
            value = "null";
            break;
        }
      }

      if (value instanceof Utf8) {
        value = ((Utf8) value).toString();
      }

      if (value instanceof Array) {
        continue;
      }

      for (final String metricName : metrics) {
        final String groupbyKeyBase = column + ":" + record.get(column) + ":" + metricName;
        int dimCounter = 1;
        for (final String dim : cardinalityCountsMap.keySet()) {
          if (!dim.equals(column)) {
            dimCounter++;
            final String groupbyKey = groupbyKeyBase + ":" + dim;
            if (sumGroupBy.containsKey(groupbyKey)) {
              if (sumGroupBy.get(groupbyKey).containsKey(record.get(dim))) {
                sumGroupBy.get(groupbyKey).put(
                    record.get(dim),
                    getAppropriateNumberType(
                        metricName,
                        record.get(metricName),
                        sumGroupBy.get(groupbyKey).get(record.get(dim))));
              } else {
                sumGroupBy.get(groupbyKey)
                    .put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
              }
            } else {
              sumGroupBy.put(groupbyKey, new HashMap<Object, Double>());
              sumGroupBy.get(groupbyKey)
                  .put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
            }
          }
          if (dimCounter == 4) {
            break;
          }
        }
      }

      if (cardinalityCountsMap.get(column).containsKey(value)) {
        cardinalityCountsMap.get(column).put(value, cardinalityCountsMap.get(column).get(value) + 1);
      } else {
        cardinalityCountsMap.get(column).put(value, 1);
      }

      if (!sumMap.get(column).containsKey(value)) {
        sumMap.get(column).put(value, new HashMap<String, Double>());
      }

      for (final String metric : metrics) {
        if (!sumMap.get(column).get(value).containsKey(metric)) {
          sumMap.get(column).get(value)
              .put(metric, getAppropriateNumberType(metric, record.get(metric), 0D));
        } else {
          sumMap.get(column).get(value)
              .put(metric,
                  getAppropriateNumberType(
                      metric, record.get(metric), sumMap.get(column).get(value).get(metric)));
        }
      }
    }
  }

  dataStream.close();

  if (!isRealtimeSegment) {
    for (final String column : cardinalityCountsMap.keySet()) {
      for (final Object entry : cardinalityCountsMap.get(column).keySet()) {
        final StringBuilder bld = new StringBuilder();
        bld.append("select count(*) from ");
        bld.append(resourceName);
        bld.append(" where ");
        bld.append(column);
        bld.append("=");
        bld.append("'");
        bld.append(entry);
        bld.append("'");
        bld.append(" ");
        bld.append("limit 0");

        String queryString = bld.toString();
        if (!queryString.contains("null")) {
          aggregationQueries.add(new TestSimpleAggreationQuery(
              queryString, new Double(cardinalityCountsMap.get(column).get(entry))));
        }
      }
    }
  }

  for (final String column : sumMap.keySet()) {
    for (final Object value : sumMap.get(column).keySet()) {
      for (final String metric : sumMap.get(column).get(value).keySet()) {
        final StringBuilder bld = new StringBuilder();
        bld.append("select sum('" + metric + "') from ");
        bld.append(resourceName);
        bld.append(" where ");
        bld.append(column);
        bld.append("=");
        bld.append("'");
        bld.append(value);
        bld.append("'");
        bld.append(" ");
        bld.append("limit 0");

        String queryString = bld.toString();
        if (!queryString.contains("null")) {
          aggregationQueries.add(new TestSimpleAggreationQuery(
              bld.toString(), sumMap.get(column).get(value).get(metric)));
        }
      }
    }
  }

  // here the string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue
  for (final String groupKey : sumGroupBy.keySet()) {
    final String columnName = groupKey.split(":")[0];
    final String columnValue = groupKey.split(":")[1];
    final String metricColumn = groupKey.split(":")[2];
    final String groupByColumnName = groupKey.split(":")[3];

    final StringBuilder bld = new StringBuilder();
    bld.append("select sum('" + metricColumn + "') from ");
    bld.append(resourceName);
    bld.append(" where ");
    bld.append(columnName);
    bld.append("=");
    bld.append("'");
    bld.append(columnValue);
    bld.append("'");
    bld.append(" ");
    bld.append(" group by ");
    bld.append(groupByColumnName);
    bld.append(" top 10 ");
    bld.append("limit 0");

    String queryString = bld.toString();
    if (!queryString.contains("null")) {
      groupByQueries.add(new TestGroupByAggreationQuery(bld.toString(), sumGroupBy.get(groupKey)));
    }
  }
}
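/* For illustration (hypothetical resource and column names), the three query
 * shapes generated above look like:
 *
 *   select count(*) from myResource where dimX='v' limit 0
 *   select sum('metricY') from myResource where dimX='v' limit 0
 *   select sum('metricY') from myResource where dimX='v'  group by dimZ top 10 limit 0
 */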