Example #1
File: AvroUtils.java Project: jmhsieh/kite
  /**
   * Returns true if the types of two Avro schemas are equal. This ignores attributes, such as
   * custom field properties, that Schema's equals() implementation also checks.
   *
   * @param schema1 The first schema to compare
   * @param schema2 The second schema to compare
   * @return True if the types are equal, otherwise false.
   */
  public static boolean avroSchemaTypesEqual(Schema schema1, Schema schema2) {
    if (schema1.getType() != schema2.getType()) {
      // if the types aren't equal, no need to go further. Return false
      return false;
    }

    if (schema1.getType() == Schema.Type.ENUM || schema1.getType() == Schema.Type.FIXED) {
      // Enum and Fixed schemas are compared directly with Schema.equals.
      return schema1.equals(schema2);
    }
    if (schema1.getType() == Schema.Type.ARRAY) {
      // Array element schemas must be equal, which is tested by recursively
      // calling this method.
      return avroSchemaTypesEqual(schema1.getElementType(), schema2.getElementType());
    } else if (schema1.getType() == Schema.Type.MAP) {
      // Map type values schemas should be equal, which is tested by recursively
      // calling this method.
      return avroSchemaTypesEqual(schema1.getValueType(), schema2.getValueType());
    } else if (schema1.getType() == Schema.Type.UNION) {
      // Compare union branches in the same position by recursively comparing
      // their schemas.
      if (schema1.getTypes().size() != schema2.getTypes().size()) {
        return false;
      }
      for (int i = 0; i < schema1.getTypes().size(); i++) {
        if (!avroSchemaTypesEqual(schema1.getTypes().get(i), schema2.getTypes().get(i))) {
          return false;
        }
      }
      return true;
    } else if (schema1.getType() == Schema.Type.RECORD) {
      // Compare record fields that match by name, recursively comparing their
      // schemas.
      if (schema1.getFields().size() != schema2.getFields().size()) {
        return false;
      }
      for (Field field1 : schema1.getFields()) {
        Field field2 = schema2.getField(field1.name());
        if (field2 == null) {
          return false;
        }
        if (!avroSchemaTypesEqual(field1.schema(), field2.schema())) {
          return false;
        }
      }
      return true;
    } else {
      // All other types are primitive, so matching types are sufficient.
      return true;
    }
  }
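A brief usage sketch (the two inline schemas are hypothetical): the fields match in name and type, so the type-level comparison succeeds even though the record names and docs differ and strict Schema.equals fails.

    Schema a = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"A\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
    Schema b = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"B\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"long\",\"doc\":\"primary key\"}]}");
    boolean sameTypes = AvroUtils.avroSchemaTypesEqual(a, b); // true: matching field names and types
    boolean strictEqual = a.equals(b);                        // false: record names and docs differ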
Example #2
File: AvroUtils.java Project: jmhsieh/kite
  /**
   * Get a map of field names to default values for an Avro schema.
   *
   * @param avroRecordSchema The record schema from which to extract field defaults.
   * @return A map from field name to that field's default value.
   */
  public static Map<String, Object> getDefaultValueMap(Schema avroRecordSchema) {
    List<Field> defaultFields = new ArrayList<Field>();
    for (Field f : avroRecordSchema.getFields()) {
      if (f.defaultValue() != null) {
        // Need to create a new Field here or we will get
        // org.apache.avro.AvroRuntimeException: Field already used:
        // schemaVersion
        defaultFields.add(new Field(f.name(), f.schema(), f.doc(), f.defaultValue(), f.order()));
      }
    }

    Schema defaultSchema = Schema.createRecord(defaultFields);
    Schema emptyRecordSchema = Schema.createRecord(new ArrayList<Field>());
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(emptyRecordSchema);
    DatumReader<GenericRecord> reader =
        new GenericDatumReader<GenericRecord>(emptyRecordSchema, defaultSchema);

    GenericRecord emptyRecord = new GenericData.Record(emptyRecordSchema);
    GenericRecord defaultRecord =
        AvroUtils.readAvroEntity(AvroUtils.writeAvroEntity(emptyRecord, writer), reader);

    Map<String, Object> defaultValueMap = new HashMap<String, Object>();
    for (Field f : defaultFields) {
      defaultValueMap.put(f.name(), defaultRecord.get(f.name()));
    }
    return defaultValueMap;
  }
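A usage sketch with a hypothetical schema: only fields that declare a default value appear in the returned map.

    Schema s = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"R\",\"fields\":["
            + "{\"name\":\"count\",\"type\":\"int\",\"default\":0},"
            + "{\"name\":\"label\",\"type\":\"string\"}]}");
    Map<String, Object> defaults = AvroUtils.getDefaultValueMap(s);
    // defaults maps "count" to 0; "label" declares no default and is absent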
Example #3
  @BeforeClass
  public static void before() throws Exception {
    final String filePath =
        TestUtils.getFileFromResourceUrl(
            DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
      FileUtils.deleteQuietly(INDEX_DIR);
    }

    final SegmentGeneratorConfig config =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
            new File(filePath), INDEX_DIR, "time_day", TimeUnit.DAYS, "test");

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    driver.init(config);
    driver.build();

    final Schema schema = AvroUtils.extractSchemaFromAvro(new File(filePath));

    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
      columns[i] = f.name();
      i++;
    }

    uniqueEntries = new HashMap<String, Set<Object>>();
    for (final String column : columns) {
      uniqueEntries.put(column, new HashSet<Object>());
    }

    while (avroReader.hasNext()) {
      final GenericRecord rec = avroReader.next();
      for (final String column : columns) {
        Object val = rec.get(column);
        if (val instanceof Utf8) {
          val = ((Utf8) val).toString();
        }
        uniqueEntries
            .get(column)
            .add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), val));
      }
    }
  }
Example #4
 @Override
 public void insertValues(NullWritable key, AvroGenericRecordWritable value)
     throws SQLException {
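   // Replace the table contents: clear any existing row, then bind each Avro
   // field to its 1-based JDBC parameter before executing the insert.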
   deleteStmt.execute();
   for (int i = 0; i < schema.getFields().size(); i++) {
     insertStmt.setObject(i + 1, convertIn(value.getRecord().get(i)));
   }
   insertStmt.execute();
 }
Example #5
 private static int hashCode(HashData data, Schema schema) throws IOException {
   Decoder decoder = data.decoder;
   switch (schema.getType()) {
     case RECORD:
       {
         int hashCode = 1;
         for (Field field : schema.getFields()) {
           if (field.order() == Field.Order.IGNORE) {
             GenericDatumReader.skip(field.schema(), decoder);
             continue;
           }
           hashCode = hashCode * 31 + hashCode(data, field.schema());
         }
         return hashCode;
       }
     case ENUM:
     case INT:
       return decoder.readInt();
     case FLOAT:
       return Float.floatToIntBits(decoder.readFloat());
     case LONG:
       {
         long l = decoder.readLong();
         return (int) (l ^ (l >>> 32));
       }
     case DOUBLE:
       {
         long l = Double.doubleToLongBits(decoder.readDouble());
         return (int) (l ^ (l >>> 32));
       }
     case ARRAY:
       {
         Schema elementType = schema.getElementType();
         int hashCode = 1;
          for (long l = decoder.readArrayStart(); l != 0; l = decoder.arrayNext()) {
            for (long i = 0; i < l; i++) {
              hashCode = hashCode * 31 + hashCode(data, elementType);
            }
          }
         return hashCode;
       }
     case MAP:
       throw new AvroRuntimeException("Can't hashCode maps!");
     case UNION:
       return hashCode(data, schema.getTypes().get(decoder.readInt()));
     case FIXED:
       return hashBytes(1, data, schema.getFixedSize(), false);
     case STRING:
       return hashBytes(0, data, decoder.readInt(), false);
     case BYTES:
       return hashBytes(1, data, decoder.readInt(), true);
     case BOOLEAN:
       return decoder.readBoolean() ? 1231 : 1237;
     case NULL:
       return 0;
     default:
       throw new AvroRuntimeException("Unexpected schema to hashCode!");
   }
 }
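Avro exposes this capability publicly as org.apache.avro.io.BinaryData#hashCode(byte[], int, int, Schema), which hashes a datum straight from its encoded bytes; a minimal usage sketch:

    static int hashEncoded(byte[] encodedDatum, Schema schema) {
      // The hash is computed directly over the binary encoding; the datum is never deserialized.
      return org.apache.avro.io.BinaryData.hashCode(encodedDatum, 0, encodedDatum.length, schema);
    }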
Example #6
 @Override
 public void setConf(org.apache.hadoop.conf.Configuration conf) {
   if (conf == null) return; // Hadoop first calls setConf with a null configuration; ignore it
   String mos = conf.get(AvroJob.MAP_OUTPUT_SCHEMA);
   Schema schema = Schema.parse(mos);
   pair = new Pair<Object, Object>(schema);
   Schema keySchema = Pair.getKeySchema(schema);
   final List<Field> fields = keySchema.getFields();
   final GenericRecord key = new GenericData.Record(keySchema);
   projector = new Projector(key, fields);
 }
Example #7
  public static DataType getColumnType(Field field) {
    org.apache.avro.Schema fieldSchema = field.schema();
    fieldSchema = extractSchemaFromUnionIfNeeded(fieldSchema);

    final Type type = fieldSchema.getType();
    if (type == Type.ARRAY) {
      org.apache.avro.Schema elementSchema =
          extractSchemaFromUnionIfNeeded(fieldSchema.getElementType());
      if (elementSchema.getType() == Type.RECORD) {
        if (elementSchema.getFields().size() == 1) {
          elementSchema = elementSchema.getFields().get(0).schema();
        } else {
          throw new RuntimeException("More than one schema in Multi-value column!");
        }
        elementSchema = extractSchemaFromUnionIfNeeded(elementSchema);
      }
      return DataType.valueOf(elementSchema.getType());
    } else {
      return DataType.valueOf(type);
    }
  }
Example #8
  @Override
  @SuppressWarnings("unchecked")
  protected void writeRecord(Schema schema, Object datum, Encoder out) throws IOException {

    if (persistent == null) {
      persistent = (T) datum;
    }

    if (!writeDirtyBits) {
      super.writeRecord(schema, datum, out);
      return;
    }

    // check whether this is the top-level schema
    if (schema.equals(persistent.getSchema())) {
      // write readable fields and dirty fields info
      boolean[] dirtyFields = new boolean[schema.getFields().size()];
      boolean[] readableFields = new boolean[schema.getFields().size()];
      StateManager manager = persistent.getStateManager();

      int i = 0;
      for (@SuppressWarnings("unused") Field field : schema.getFields()) {
        dirtyFields[i] = manager.isDirty(persistent, i);
        readableFields[i] = manager.isReadable(persistent, i);
        i++;
      }

      IOUtils.writeBoolArray(out, dirtyFields);
      IOUtils.writeBoolArray(out, readableFields);

      for (Field field : schema.getFields()) {
        if (readableFields[field.pos()]) {
          write(field.schema(), getData().getField(datum, field.name(), field.pos()), out);
        }
      }

    } else {
      super.writeRecord(schema, datum, out);
    }
  }
Example #9
  @Test
  public void test_getHiveTypeFromAvroType_primitive() throws Exception {
    // Expected ORC types
    String[] expectedTypes = {
      "INT", "BIGINT", "BOOLEAN", "FLOAT", "DOUBLE", "BINARY", "STRING",
    };

    Schema testSchema = buildPrimitiveAvroSchema();
    List<Schema.Field> fields = testSchema.getFields();
    for (int i = 0; i < fields.size(); i++) {
      assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
    }
  }
Example #10
  /**
   * Called by {@link #containsRecursiveRecord(Schema)}; recursively checks whether the input
   * schema contains recursive records.
   */
  protected static boolean containsRecursiveRecord(Schema s, Set<String> definedRecordNames) {

    /* if it is a record, check itself and all fields*/
    if (s.getType().equals(Schema.Type.RECORD)) {
      String name = s.getName();
      if (definedRecordNames.contains(name)) return true;

      /* add its own name into defined record set*/
      definedRecordNames.add(s.getName());

      /* check all fields */
      List<Field> fields = s.getFields();
      for (Field field : fields) {
        Schema fs = field.schema();
        if (containsRecursiveRecord(fs, definedRecordNames)) return true;
      }

      /* remove its own name from the name set */
      definedRecordNames.remove(s.getName());

      return false;
    }

    /* if it is an array, check its element type */
    else if (s.getType().equals(Schema.Type.ARRAY)) {
      Schema fs = s.getElementType();
      return containsRecursiveRecord(fs, definedRecordNames);
    }

    /* if it is a map, check its value type */
    else if (s.getType().equals(Schema.Type.MAP)) {
      Schema vs = s.getValueType();
      return containsRecursiveRecord(vs, definedRecordNames);
    }

    /* if it is a union, check all possible types */
    else if (s.getType().equals(Schema.Type.UNION)) {
      List<Schema> types = s.getTypes();
      for (Schema type : types) {
        if (containsRecursiveRecord(type, definedRecordNames)) return true;
      }
      return false;
    }

    /* return false for other cases */
    else {
      return false;
    }
  }
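A usage sketch with a hypothetical self-referential schema: the linked-list node below references its own type, so the check reports a recursive record.

    Schema node = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Node\",\"fields\":["
            + "{\"name\":\"value\",\"type\":\"int\"},"
            + "{\"name\":\"next\",\"type\":[\"null\",\"Node\"]}]}");
    boolean recursive = containsRecursiveRecord(node, new HashSet<String>()); // true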
Example #11
  @Test
  public void test_getHiveTypeFromAvroType_complex() throws Exception {
    // Expected ORC types
    String[] expectedTypes = {
      "INT", "MAP<STRING, DOUBLE>", "STRING", "UNIONTYPE<BIGINT, FLOAT>", "ARRAY<INT>"
    };

    Schema testSchema = buildComplexAvroSchema();
    List<Schema.Field> fields = testSchema.getFields();
    for (int i = 0; i < fields.size(); i++) {
      assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
    }

    assertEquals(
        "STRUCT<myInt:INT, myMap:MAP<STRING, DOUBLE>, myEnum:STRING, myLongOrFloat:UNIONTYPE<BIGINT, FLOAT>, myIntList:ARRAY<INT>>",
        NiFiOrcUtils.getHiveTypeFromAvroType(testSchema));
  }
Example #12
    @Override
    public void initialize(Connection conn) throws SQLException {
      this.conn = conn;
      StringBuilder create = new StringBuilder("CREATE TABLE ").append(tableName).append(" (");
      StringBuilder insert =
          new StringBuilder("INSERT INTO ").append(tableName).append(" VALUES (");
      for (int i = 0; i < schema.getFields().size(); i++) {
        Schema.Field sf = schema.getFields().get(i);
        create.append(sf.name()).append(' ').append(getSQLType(sf.schema())).append(',');
        insert.append("?,");
      }
      create.deleteCharAt(create.length() - 1).append(")");
      insert.deleteCharAt(insert.length() - 1).append(")");

      conn.createStatement().execute(create.toString());
      insertStmt = conn.prepareStatement(insert.toString());
      deleteStmt = conn.prepareStatement("DELETE FROM " + tableName);
    }
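For illustration only (the VARCHAR/INTEGER mapping below is an assumption about getSQLType, not the project's verified behavior), a record schema with fields name:string and age:int and a tableName of "people" would yield statements shaped like:

    CREATE TABLE people (name VARCHAR,age INTEGER)
    INSERT INTO people VALUES (?,?)
    DELETE FROM people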
Example #13
  /*
   * For each Avro field, look for a nested schema: if the field is a record
   * (or an array of records), create a subtree and recurse; otherwise add the
   * field as a leaf element.
   */
  private static void fetchToTree(Field field, Tree parent) {
    if (field.schema().getType() == Schema.Type.RECORD) {
      Tree child = new Tree(parent);
      child.setName(field.name());

      for (Field nested : field.schema().getFields()) {
        fetchToTree(nested, child);
      }
      parent.getTrees().add(child);
    } else if (field.schema().getType() == Schema.Type.ARRAY) {
      if (field.schema().getElementType().getType() == Schema.Type.RECORD) {
        multipleData = true;

        Schema arraySchema = field.schema().getElementType();
        Tree childParent = new Tree(parent);
        childParent.setName(field.name()); // e.g. the "employee" wrapper node

        for (Field nested : arraySchema.getFields()) {
          fetchToTree(nested, childParent);
        }
        parent.getTrees().add(childParent);
      }
    } else {
      Element elementNew = new Element(parent);
      elementNew.setName(field.name());
      parent.getElements().add(elementNew);
    }
  }
Example #14
  @Test
  public void test_getOrcField_primitive() throws Exception {
    // Expected ORC types
    TypeInfo[] expectedTypes = {
      TypeInfoFactory.getPrimitiveTypeInfo("int"),
      TypeInfoFactory.getPrimitiveTypeInfo("bigint"),
      TypeInfoFactory.getPrimitiveTypeInfo("boolean"),
      TypeInfoFactory.getPrimitiveTypeInfo("float"),
      TypeInfoFactory.getPrimitiveTypeInfo("double"),
      TypeInfoFactory.getPrimitiveTypeInfo("binary"),
      TypeInfoFactory.getPrimitiveTypeInfo("string")
    };

    // Build a fake Avro record with all types
    Schema testSchema = buildPrimitiveAvroSchema();
    List<Schema.Field> fields = testSchema.getFields();
    for (int i = 0; i < fields.size(); i++) {
      assertEquals(expectedTypes[i], NiFiOrcUtils.getOrcField(fields.get(i).schema()));
    }
  }
Example #15
 @Override
 public boolean retrieveResults(NullWritable key, AvroGenericRecordWritable value)
     throws SQLException {
   Statement stmt = conn.createStatement();
   try {
     ResultSet rs = stmt.executeQuery("select * from " + tableName);
     if (rs.next()) {
       for (int i = 0; i < schema.getFields().size(); i++) {
         value.getRecord().put(i, convertOut(rs.getObject(i + 1)));
       }
       return false;
     } else {
       return true;
     }
   } finally {
     if (stmt != null) {
       stmt.close();
     }
   }
 }
Example #16
  @Test
  public void test_getPrimitiveOrcTypeFromPrimitiveAvroType() throws Exception {
    // Expected ORC types
    TypeInfo[] expectedTypes = {
      TypeInfoCreator.createInt(),
      TypeInfoCreator.createLong(),
      TypeInfoCreator.createBoolean(),
      TypeInfoCreator.createFloat(),
      TypeInfoCreator.createDouble(),
      TypeInfoCreator.createBinary(),
      TypeInfoCreator.createString(),
    };

    Schema testSchema = buildPrimitiveAvroSchema();
    List<Schema.Field> fields = testSchema.getFields();
    for (int i = 0; i < fields.size(); i++) {
      assertEquals(
          expectedTypes[i],
          NiFiOrcUtils.getPrimitiveOrcTypeFromPrimitiveAvroType(fields.get(i).schema().getType()));
    }
  }
Example #17
  @Test
  public void testProjection() throws IOException {
    Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
    Configuration conf = new Configuration();

    Schema schema = Car.getClassSchema();
    List<Schema.Field> fields = schema.getFields();

    List<Schema.Field> projectedFields = new ArrayList<Schema.Field>();
    for (Schema.Field field : fields) {
      String name = field.name();
      if ("optionalExtra".equals(name) || "serviceHistory".equals(name)) {
        continue;
      }

      // A Schema.Field instance cannot be reused in a second record schema, so clone it.
      Schema.Field fieldClone =
          new Schema.Field(name, field.schema(), field.doc(), field.defaultValue());
      projectedFields.add(fieldClone);
    }

    Schema projectedSchema =
        Schema.createRecord(
            schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
    projectedSchema.setFields(projectedFields);
    AvroReadSupport.setRequestedProjection(conf, projectedSchema);

    ParquetReader<Car> reader = new AvroParquetReader<Car>(conf, path);
    for (Car car = reader.read(); car != null; car = reader.read()) {
      assertNotNull(car.getDoors());
      assertNotNull(car.getEngine());
      assertNotNull(car.getMake());
      assertNotNull(car.getModel());
      assertNotNull(car.getYear());
      assertNotNull(car.getVin());
      assertNull(car.getOptionalExtra());
      assertNull(car.getServiceHistory());
    }
  }
Example #18
  /**
   * Precondition-style validation that the DatasetDescriptor is compatible.
   *
   * @param descriptor a {@link DatasetDescriptor}
   */
  public static void checkDescriptor(DatasetDescriptor descriptor) {
    Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

    Schema schema = descriptor.getSchema();
    checkSchema(schema);

    if (descriptor.isPartitioned()) {
      // marked as [BUG] because this is checked in DatasetDescriptor
      Preconditions.checkArgument(
          schema.getType() == Schema.Type.RECORD,
          "[BUG] Partitioned datasets must have record schemas");

      Set<String> names = Sets.newHashSet();
      for (Schema.Field field : schema.getFields()) {
        names.add(field.name());
      }

      List<String> incompatible = Lists.newArrayList();
      List<String> duplicates = Lists.newArrayList();
      for (FieldPartitioner fp : descriptor.getPartitionStrategy().getFieldPartitioners()) {
        String name = fp.getName();
        if (!isCompatibleName(name)) {
          incompatible.add(name);
        } else if (names.contains(name)) {
          duplicates.add(name);
        } else {
          names.add(name);
        }
      }
      Preconditions.checkState(
          incompatible.isEmpty(),
          "Hive incompatible: partition names are not alphanumeric (plus '_'): %s",
          Joiner.on(", ").join(incompatible));
      Preconditions.checkState(
          duplicates.isEmpty(),
          "Hive incompatible: partition names duplicate data fields: %s",
          Joiner.on(", ").join(duplicates));
    }
  }
Example #19
  /** determine whether the input schema contains generic unions */
  public static boolean containsGenericUnion(Schema s) {

    /* if it is a record, check all fields*/
    if (s.getType().equals(Schema.Type.RECORD)) {
      List<Field> fields = s.getFields();
      for (Field field : fields) {
        Schema fs = field.schema();
        if (containsGenericUnion(fs)) return true;
      }
      return false;
    }

    /* if it is an array, check its element type */
    else if (s.getType().equals(Schema.Type.ARRAY)) {
      Schema fs = s.getElementType();
      return containsGenericUnion(fs);
    }

    /* if it is a map, check its value type */
    else if (s.getType().equals(Schema.Type.MAP)) {
      Schema vs = s.getValueType();
      return containsGenericUnion(vs);
    }

    /* if it is a union, check all possible types and itself */
    else if (s.getType().equals(Schema.Type.UNION)) {
      List<Schema> types = s.getTypes();
      for (Schema type : types) {
        if (containsGenericUnion(type)) return true;
      }
      /* check whether itself is acceptable (null-union) */
      return !isAcceptableUnion(s);
    }

    /* return false for other cases */
    else {
      return false;
    }
  }
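A sketch of the expected behavior, assuming isAcceptableUnion accepts only ["null", T]-style unions:

    Schema nullableString = new Schema.Parser().parse("[\"null\", \"string\"]");
    Schema intOrString = new Schema.Parser().parse("[\"int\", \"string\"]");
    containsGenericUnion(nullableString); // false: a nullable union is acceptable
    containsGenericUnion(intOrString);    // true: a general union is "generic"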
Example #20
    private Object translate(Object value, DataSchema dataSchema, Schema avroSchema) {
      AvroOverride avroOverride = getAvroOverride(dataSchema);
      if (avroOverride != null) {
        return avroOverride
            .getCustomDataTranslator()
            .avroGenericToData(this, value, avroSchema, dataSchema);
      }

      DataSchema dereferencedDataSchema = dataSchema.getDereferencedDataSchema();
      DataSchema.Type type = dereferencedDataSchema.getType();
      Object result;
      switch (type) {
        case NULL:
          if (value != null) {
            appendMessage("value must be null for null schema");
            result = BAD_RESULT;
            break;
          }
          result = Data.NULL;
          break;
        case BOOLEAN:
          result = ((Boolean) value).booleanValue();
          break;
        case INT:
          result = ((Number) value).intValue();
          break;
        case LONG:
          result = ((Number) value).longValue();
          break;
        case FLOAT:
          result = ((Number) value).floatValue();
          break;
        case DOUBLE:
          result = ((Number) value).doubleValue();
          break;
        case STRING:
          result = value.toString();
          break;
        case BYTES:
          ByteBuffer byteBuffer = (ByteBuffer) value;
          ByteString byteString = ByteString.copy(byteBuffer);
          byteBuffer.rewind();
          result = byteString;
          break;
        case ENUM:
          String enumValue = value.toString();
          EnumDataSchema enumDataSchema = (EnumDataSchema) dereferencedDataSchema;
          if (!enumDataSchema.getSymbols().contains(enumValue)) {
            appendMessage(
                "enum value %1$s not one of %2$s", enumValue, enumDataSchema.getSymbols());
            result = BAD_RESULT;
            break;
          }
          result = enumValue;
          break;
        case FIXED:
          GenericFixed fixed = (GenericFixed) value;
          byte[] fixedBytes = fixed.bytes();
          FixedDataSchema fixedDataSchema = (FixedDataSchema) dereferencedDataSchema;
          if (fixedDataSchema.getSize() != fixedBytes.length) {
            appendMessage(
                "GenericFixed size %1$d != FixedDataSchema size %2$d",
                fixedBytes.length, fixedDataSchema.getSize());
            result = BAD_RESULT;
            break;
          }
          byteString = ByteString.copy(fixedBytes);
          result = byteString;
          break;
        case MAP:
          @SuppressWarnings("unchecked")
          Map<?, Object> map = (Map<?, Object>) value;
          DataSchema valueDataSchema = ((MapDataSchema) dereferencedDataSchema).getValues();
          Schema valueAvroSchema = avroSchema.getValueType();
          DataMap dataMap = new DataMap(map.size());
          for (Map.Entry<?, Object> entry : map.entrySet()) {
            String key = entry.getKey().toString();
            _path.addLast(key);
            Object entryValue = translate(entry.getValue(), valueDataSchema, valueAvroSchema);
            _path.removeLast();
            dataMap.put(key, entryValue);
          }
          result = dataMap;
          break;
        case ARRAY:
          GenericArray<?> list = (GenericArray<?>) value;
          DataSchema elementDataSchema = ((ArrayDataSchema) dereferencedDataSchema).getItems();
          Schema elementAvroSchema = avroSchema.getElementType();
          DataList dataList = new DataList(list.size());
          for (int i = 0; i < list.size(); i++) {
            _path.addLast(i);
            Object entryValue = translate(list.get(i), elementDataSchema, elementAvroSchema);
            _path.removeLast();
            dataList.add(entryValue);
          }
          result = dataList;
          break;
        case RECORD:
          GenericRecord record = (GenericRecord) value;
          RecordDataSchema recordDataSchema = (RecordDataSchema) dereferencedDataSchema;
          dataMap = new DataMap(avroSchema.getFields().size());
          for (RecordDataSchema.Field field : recordDataSchema.getFields()) {
            String fieldName = field.getName();
            Object fieldValue = record.get(fieldName);
            // fieldValue could be null if the Avro schema does not contain the named field or
            // the field is present with a null value. In either case we do not add a value
            // to the translated DataMap. We do not consider optional/required/default here
            // either (i.e. it is not an error if a required field is missing); the user can
            // later call ValidateDataAgainstSchema with various
            // settings for RequiredMode to obtain the desired behaviour.
            if (fieldValue == null) {
              continue;
            }
            boolean isOptional = field.getOptional();
            DataSchema fieldDataSchema = field.getType();
            Schema fieldAvroSchema = avroSchema.getField(fieldName).schema();
            if (isOptional && (fieldDataSchema.getDereferencedType() != DataSchema.Type.UNION)) {
              // Avro schema should be union with 2 types: null and the field's type.
              Map.Entry<String, Schema> fieldAvroEntry =
                  findUnionMember(fieldDataSchema, fieldAvroSchema);
              if (fieldAvroEntry == null) {
                continue;
              }
              fieldAvroSchema = fieldAvroEntry.getValue();
            }
            _path.addLast(fieldName);
            dataMap.put(fieldName, translate(fieldValue, fieldDataSchema, fieldAvroSchema));
            _path.removeLast();
          }
          result = dataMap;
          break;
        case UNION:
          UnionDataSchema unionDataSchema = (UnionDataSchema) dereferencedDataSchema;
          Map.Entry<DataSchema, Schema> memberSchemas =
              findUnionMemberSchema(value, unionDataSchema, avroSchema);
          if (memberSchemas == null) {
            result = BAD_RESULT;
            break;
          }
          if (value == null) {
            // schema must be "null" schema
            result = Data.NULL;
          } else {
            DataSchema memberDataSchema = memberSchemas.getKey();
            Schema memberAvroSchema = memberSchemas.getValue();
            String key = memberDataSchema.getUnionMemberKey();
            dataMap = new DataMap(1);
            _path.addLast(key);
            dataMap.put(key, translate(value, memberDataSchema, memberAvroSchema));
            _path.removeLast();
            result = dataMap;
          }
          break;
        default:
          appendMessage("schema type unknown %1$s", dereferencedDataSchema.getType());
          result = BAD_RESULT;
          break;
      }
      return result;
    }
Example #21
 /**
  * Compares the next datum read from each decoder. Returns 0 if they are equal, a positive value
  * if the first is greater, and a negative value if it is less.
  */
 private static int compare(Decoders d, Schema schema) throws IOException {
   Decoder d1 = d.d1;
   Decoder d2 = d.d2;
   switch (schema.getType()) {
     case RECORD:
       {
         for (Field field : schema.getFields()) {
           if (field.order() == Field.Order.IGNORE) {
             GenericDatumReader.skip(field.schema(), d1);
             GenericDatumReader.skip(field.schema(), d2);
             continue;
           }
           int c = compare(d, field.schema());
           if (c != 0) return (field.order() != Field.Order.DESCENDING) ? c : -c;
         }
         return 0;
       }
     case ENUM:
     case INT:
       {
         int i1 = d1.readInt();
         int i2 = d2.readInt();
         return i1 == i2 ? 0 : (i1 > i2 ? 1 : -1);
       }
     case LONG:
       {
         long l1 = d1.readLong();
         long l2 = d2.readLong();
         return l1 == l2 ? 0 : (l1 > l2 ? 1 : -1);
       }
     case ARRAY:
       {
         long i = 0; // position in array
         long r1 = 0, r2 = 0; // remaining in current block
         long l1 = 0, l2 = 0; // total array length
         while (true) {
            if (r1 == 0) { // refill block(s)
             r1 = d1.readLong();
             if (r1 < 0) {
               r1 = -r1;
               d1.readLong();
             }
             l1 += r1;
           }
           if (r2 == 0) {
             r2 = d2.readLong();
             if (r2 < 0) {
               r2 = -r2;
               d2.readLong();
             }
             l2 += r2;
           }
            if (r1 == 0 || r2 == 0) { // empty block: done
              return (l1 == l2) ? 0 : ((l1 > l2) ? 1 : -1);
            }
           long l = Math.min(l1, l2);
           while (i < l) { // compare to end of block
             int c = compare(d, schema.getElementType());
             if (c != 0) return c;
             i++;
             r1--;
             r2--;
           }
         }
       }
     case MAP:
       throw new AvroRuntimeException("Can't compare maps!");
     case UNION:
       {
         int i1 = d1.readInt();
         int i2 = d2.readInt();
         if (i1 == i2) {
           return compare(d, schema.getTypes().get(i1));
         } else {
           return i1 - i2;
         }
       }
     case FIXED:
       {
         int size = schema.getFixedSize();
         int c =
             compareBytes(d.d1.getBuf(), d.d1.getPos(), size, d.d2.getBuf(), d.d2.getPos(), size);
         d.d1.skipFixed(size);
         d.d2.skipFixed(size);
         return c;
       }
     case STRING:
     case BYTES:
       {
         int l1 = d1.readInt();
         int l2 = d2.readInt();
         int c = compareBytes(d.d1.getBuf(), d.d1.getPos(), l1, d.d2.getBuf(), d.d2.getPos(), l2);
         d.d1.skipFixed(l1);
         d.d2.skipFixed(l2);
         return c;
       }
     case FLOAT:
       {
         float f1 = d1.readFloat();
         float f2 = d2.readFloat();
         return (f1 == f2) ? 0 : ((f1 > f2) ? 1 : -1);
       }
     case DOUBLE:
       {
         double f1 = d1.readDouble();
         double f2 = d2.readDouble();
         return (f1 == f2) ? 0 : ((f1 > f2) ? 1 : -1);
       }
     case BOOLEAN:
       boolean b1 = d1.readBoolean();
       boolean b2 = d2.readBoolean();
       return (b1 == b2) ? 0 : (b1 ? 1 : -1);
     case NULL:
       return 0;
     default:
       throw new AvroRuntimeException("Unexpected schema to compare!");
   }
 }
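As with the hash helper in Example #5, the public counterpart is org.apache.avro.io.BinaryData#compare(byte[], int, byte[], int, Schema); a minimal sketch that orders two encoded datums without deserializing either:

    static int compareEncoded(byte[] a, byte[] b, Schema schema) {
      // Comparison runs directly over the binary encodings, honoring each field's sort order.
      return org.apache.avro.io.BinaryData.compare(a, 0, b, 0, schema);
    }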
Example #22
 /** Writes the given Avro datum into the given record, using the given Avro schema */
 private void extractTree(Object datum, Schema schema, Record outputRecord, String prefix) {
   // RECORD, ENUM, ARRAY, MAP, UNION, FIXED, STRING, BYTES, INT, LONG, FLOAT,
   // DOUBLE, BOOLEAN, NULL
   switch (schema.getType()) {
     case RECORD:
       {
         IndexedRecord avroRecord = (IndexedRecord) datum;
         String prefix2 = prefix + "/";
         for (Field field : schema.getFields()) {
           extractTree(
               avroRecord.get(field.pos()),
               field.schema(),
               outputRecord,
               prefix2 + field.name());
         }
         break;
       }
     case ENUM:
       {
         GenericEnumSymbol symbol = (GenericEnumSymbol) datum;
         outputRecord.put(prefix, symbol.toString());
         break;
       }
     case ARRAY:
       {
          for (Object element : (Collection<?>) datum) {
            extractTree(element, schema.getElementType(), outputRecord, prefix);
          }
         break;
       }
     case MAP:
       {
         Map<CharSequence, ?> map = (Map<CharSequence, ?>) datum;
         for (Map.Entry<CharSequence, ?> entry : map.entrySet()) {
           extractTree(
               entry.getValue(),
               schema.getValueType(),
               outputRecord,
               prefix + "/" + entry.getKey().toString());
         }
         break;
       }
     case UNION:
       {
         int index = GenericData.get().resolveUnion(schema, datum);
         // String typeName = schema.getTypes().get(index).getName();
         // String prefix2 = prefix + "/" + typeName;
         String prefix2 = prefix;
         extractTree(datum, schema.getTypes().get(index), outputRecord, prefix2);
         break;
       }
     case FIXED:
       {
         GenericFixed fixed = (GenericFixed) datum;
         outputRecord.put(prefix, fixed.bytes());
         // outputRecord.put(prefix, utf8toString(fixed.bytes()));
         break;
       }
     case BYTES:
       {
         ByteBuffer buf = (ByteBuffer) datum;
         int pos = buf.position();
         byte[] bytes = new byte[buf.remaining()];
         buf.get(bytes);
         buf.position(pos); // undo relative read
         outputRecord.put(prefix, bytes);
         // outputRecord.put(prefix, utf8toString(bytes));
         break;
       }
     case STRING:
       {
         outputRecord.put(prefix, datum.toString());
         break;
       }
      case INT:
      case LONG:
      case FLOAT:
      case DOUBLE:
      case BOOLEAN:
        {
          outputRecord.put(prefix, datum);
          break;
        }
     case NULL:
       {
         break;
       }
     default:
       throw new MorphlineRuntimeException("Unknown Avro schema type: " + schema.getType());
   }
 }
Example #23
  public void generateSimpleAggregationOnSingleColumnFilters() throws IOException {
    final Map<String, Map<Object, Integer>> cardinalityCountsMap =
        new HashMap<String, Map<Object, Integer>>();
    final Map<String, Map<Object, Map<String, Double>>> sumMap =
        new HashMap<String, Map<Object, Map<String, Double>>>();
    // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue

    final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>();

    aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>();
    groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>();
    for (final Field f : schema.getFields()) {
      final String fieldName = f.name();
      if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) {
        isSingleValueMap.put(fieldName, isSingleValueField(f));
        dataTypeMap.put(fieldName, getColumnType(f));
        if (!metrics.contains(fieldName)) {
          cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>());
        }
      }
    }

    for (final String column : cardinalityCountsMap.keySet()) {
      sumMap.put(column, new HashMap<Object, Map<String, Double>>());
    }

    // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue

    while (dataStream.hasNext()) {
      final GenericRecord record = dataStream.next();

      for (final String column : cardinalityCountsMap.keySet()) {
        Object value = record.get(column);

        if (value == null) {
          switch (schema.getField(column).schema().getType()) {
            case INT:
              value = 0;
              break;
            case FLOAT:
              value = 0F;
              break;
            case LONG:
              value = 0L;
              break;
            case DOUBLE:
              value = 0D;
              break;
            case STRING:
            case BOOLEAN:
              value = "null";
              break;
          }
        }

        if (value instanceof Utf8) {
          value = ((Utf8) value).toString();
        }

        if (value instanceof Array) {
          continue;
        }

        // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue

        for (final String metricName : metrics) {
          final String groupbyKeyBase = column + ":" + record.get(column) + ":" + metricName;
          int dimCounter = 1;
          for (final String dim : cardinalityCountsMap.keySet()) {
            if (!dim.equals(column)) {
              dimCounter++;
              final String groupbyKey = groupbyKeyBase + ":" + dim;
              if (sumGroupBy.containsKey(groupbyKey)) {
                if (sumGroupBy.get(groupbyKey).containsKey(record.get(dim))) {
                  sumGroupBy
                      .get(groupbyKey)
                      .put(
                          record.get(dim),
                          getAppropriateNumberType(
                              metricName,
                              record.get(metricName),
                              sumGroupBy.get(groupbyKey).get(record.get(dim))));
                } else {
                  sumGroupBy
                      .get(groupbyKey)
                      .put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
                }
              } else {
                sumGroupBy.put(groupbyKey, new HashMap<Object, Double>());
                sumGroupBy
                    .get(groupbyKey)
                    .put(record.get(dim), Double.parseDouble(record.get(metricName).toString()));
              }
            }
            if (dimCounter == 4) {
              break;
            }
          }
        }

        if (cardinalityCountsMap.get(column).containsKey(value)) {
          cardinalityCountsMap
              .get(column)
              .put(value, cardinalityCountsMap.get(column).get(value) + 1);
        } else {
          cardinalityCountsMap.get(column).put(value, 1);
        }

        if (!sumMap.get(column).containsKey(value)) {
          sumMap.get(column).put(value, new HashMap<String, Double>());
        }

        for (final String metric : metrics) {
          if (!sumMap.get(column).get(value).containsKey(metric)) {
            sumMap
                .get(column)
                .get(value)
                .put(metric, getAppropriateNumberType(metric, record.get(metric), 0D));
          } else {
            sumMap
                .get(column)
                .get(value)
                .put(
                    metric,
                    getAppropriateNumberType(
                        metric, record.get(metric), sumMap.get(column).get(value).get(metric)));
          }
        }
        // here string key is columnName:columnValue:MetricName:GroupColumnName:groupKey:metricValue
      }
    }

    dataStream.close();

    if (!isRealtimeSegment) {
      for (final String column : cardinalityCountsMap.keySet()) {
        for (final Object entry : cardinalityCountsMap.get(column).keySet()) {
          final StringBuilder bld = new StringBuilder();
          bld.append("select count(*) from ");
          bld.append(resourceName);
          bld.append(" where ");
          bld.append(column);
          bld.append("=");
          bld.append("'");
          bld.append(entry);
          bld.append("'");
          bld.append(" ");
          bld.append("limit 0");
          String queryString = bld.toString();
          if (!queryString.contains("null")) {
            aggregationQueries.add(
                new TestSimpleAggreationQuery(
                    queryString, Double.valueOf(cardinalityCountsMap.get(column).get(entry))));
          }
        }
      }
    }

    for (final String column : sumMap.keySet()) {
      for (final Object value : sumMap.get(column).keySet()) {
        for (final String metric : sumMap.get(column).get(value).keySet()) {
          final StringBuilder bld = new StringBuilder();
          bld.append("select sum('" + metric + "') from ");
          bld.append(resourceName);
          bld.append(" where ");
          bld.append(column);
          bld.append("=");
          bld.append("'");
          bld.append(value);
          bld.append("'");
          bld.append(" ");
          bld.append("limit 0");
          String queryString = bld.toString();
          if (!queryString.contains("null")) {
            aggregationQueries.add(
                new TestSimpleAggreationQuery(
                    bld.toString(), sumMap.get(column).get(value).get(metric)));
          }
        }
      }
    }

    for (final String groupKey : sumGroupBy.keySet()) {
      final String[] parts = groupKey.split(":");
      final String columnName = parts[0];
      final String columnValue = parts[1];
      final String metricColumn = parts[2];
      final String groupByColumnName = parts[3];

      final StringBuilder bld = new StringBuilder();
      bld.append("select sum('" + metricColumn + "') from ");
      bld.append(resourceName);
      bld.append(" where ");
      bld.append(columnName);
      bld.append("=");
      bld.append("'");
      bld.append(columnValue);
      bld.append("'");
      bld.append(" ");
      bld.append(" group by ");
      bld.append(groupByColumnName);
      bld.append(" top 10 ");
      bld.append("limit 0");
      String queryString = bld.toString();
      if (!queryString.contains("null")) {
        groupByQueries.add(
            new TestGroupByAggreationQuery(bld.toString(), sumGroupBy.get(groupKey)));
      }
    }
  }