// Extract schema of the value field
  /**
   * Returns the string form of the schema declared for the value field.
   *
   * <p>The record schema is obtained from the input path on every call, and the field named by
   * {@code valueField} is looked up in it.
   *
   * @return the value field's schema, as produced by its {@code toString()}
   * @throws IOException declared for the schema lookup on the input path
   */
  public String getValueSchema() throws IOException {
    final Schema inputSchema = AvroUtils.getAvroSchemaFromPath(getInputPath());
    return inputSchema.getField(valueField).schema().toString();
  }
  /**
   * Returns the string form of the schema declared for the key field.
   *
   * <p>The record schema is obtained from the input path on every call, and the field named by
   * {@code keyField} is looked up in it.
   *
   * @return the key field's schema, as produced by its {@code toString()}
   * @throws IOException declared for the schema lookup on the input path
   */
  public String getKeySchema() throws IOException {
    final Schema inputSchema = AvroUtils.getAvroSchemaFromPath(getInputPath());
    return inputSchema.getField(keyField).schema().toString();
  }
Beispiel #3
0
 /**
  * Rebuilds a key schema so that its field definitions are taken from a specific record class.
  *
  * <p>The static {@code SCHEMA$} field of {@code specificClass} is read reflectively. For each
  * field of {@code keySchema}'s Avro schema, the same-named field of the class schema is copied
  * into a new record schema that carries the name, doc, namespace and error flag of the key
  * schema. Fields present only on the class are left out, since the class may describe a whole
  * entity that is merely being used as a key.
  *
  * @param specificClass the specific record class whose {@code SCHEMA$} supplies the fields
  * @param keySchema the key schema that defines which fields to keep and the record identity
  * @return a new key schema wrapping the rebuilt record, with {@code keySchema}'s partition
  *     strategy
  * @throws DatasetException if {@code SCHEMA$} cannot be read from {@code specificClass}
  */
 public static AvroKeySchema mergeSpecificStringTypes(
     Class<? extends SpecificRecord> specificClass, AvroKeySchema keySchema) {
   final Schema classSchema;
   try {
     classSchema = (Schema) specificClass.getField("SCHEMA$").get(null);
   } catch (NoSuchFieldException e) {
     throw new DatasetException(e);
   } catch (IllegalAccessException e) {
     throw new DatasetException(e);
   } catch (IllegalArgumentException e) {
     throw new DatasetException(e);
   } catch (SecurityException e) {
     throw new DatasetException(e);
   }

   final Schema keyAvroSchema = keySchema.getAvroSchema();

   // Keep only the fields the key schema names; take their definitions from the class schema.
   List<Field> keptFields = Lists.newArrayList();
   for (Schema.Field keyField : keyAvroSchema.getFields()) {
     Schema.Field classField = classSchema.getField(keyField.name());
     keptFields.add(copy(classField));
   }

   Schema merged =
       Schema.createRecord(
           keyAvroSchema.getName(),
           keyAvroSchema.getDoc(),
           keyAvroSchema.getNamespace(),
           keyAvroSchema.isError());
   merged.setFields(keptFields);
   return new AvroKeySchema(merged, keySchema.getPartitionStrategy());
 }
Beispiel #4
0
 /** Verifies that an Avro enum field is mapped to the ORC string type. */
 @Test
 public void test_getOrcField_enum() throws Exception {
   final SchemaBuilder.FieldAssembler<Schema> fields =
       SchemaBuilder.record("testRecord").namespace("any.data").fields();
   fields.name("enumField").type().enumeration("enum").symbols("a", "b", "c").enumDefault("a");
   final Schema recordSchema = fields.endRecord();

   final Schema enumSchema = recordSchema.getField("enumField").schema();
   final TypeInfo actual = NiFiOrcUtils.getOrcField(enumSchema);

   final TypeInfo expected = TypeInfoCreator.createString();
   assertEquals(expected, actual);
 }
Beispiel #5
0
 /** Verifies that an Avro array of longs is mapped to an ORC list of the long type. */
 @Test
 public void test_getOrcField_array() throws Exception {
   final SchemaBuilder.FieldAssembler<Schema> fields =
       SchemaBuilder.record("testRecord").namespace("any.data").fields();
   fields.name("array").type().array().items().longType().noDefault();
   final Schema recordSchema = fields.endRecord();

   final Schema arraySchema = recordSchema.getField("array").schema();
   final TypeInfo actual = NiFiOrcUtils.getOrcField(arraySchema);

   final TypeInfo expected = TypeInfoFactory.getListTypeInfo(TypeInfoCreator.createLong());
   assertEquals(expected, actual);
 }
 /**
  * Reads the stored values of one field and returns them as timestamped values.
  *
  * <p>The query is restricted by {@code id} only when the schema declares the
  * {@code EXPRESSION_ID} field; otherwise {@code null} is passed as the id. Each row is converted
  * according to the field's type. When {@code timespan} is zero, at most one value is returned.
  *
  * @param fieldName the field (and cursor column) to read
  * @param id the id to filter on, used only if the schema has an {@code EXPRESSION_ID} field
  * @param now passed through to the cursor query
  * @param timespan passed through to the cursor query; {@code 0} limits the result to one value
  * @return the converted values, never {@code null}; empty when the query yields no rows
  * @throws RuntimeException with message "Unsupported type." if the field's type has no
  *     conversion here
  */
 public List<TimestampedValue> getValuesForValuePath(
     final String fieldName, String id, final long now, final long timespan) {
   Type fieldType = getType(fieldName);
   Cursor values;
   if (schema.getField(EXPRESSION_ID) != null) {
     values = getValuesCursor(this, uri, new String[] {fieldName}, now, timespan, id);
   } else {
     values = getValuesCursor(this, uri, new String[] {fieldName}, now, timespan, null);
   }
   List<TimestampedValue> ret = null;
   try {
     if (values != null && values.moveToFirst()) {
       int column = values.getColumnIndex(fieldName);
       ret = new ArrayList<TimestampedValue>(values.getCount());
       do {
         // NOTE(review): column 0 is used as the timestamp for every row - confirm against
         // the projection built by getValuesCursor.
         switch (fieldType) {
           case INT:
             ret.add(new TimestampedValue(values.getInt(column), values.getLong(0)));
             break;
           case LONG:
             ret.add(new TimestampedValue(values.getLong(column), values.getLong(0)));
             break;
           case ENUM:
           case STRING:
             ret.add(new TimestampedValue(values.getString(column), values.getLong(0)));
             break;
           case FLOAT:
             ret.add(new TimestampedValue(values.getFloat(column), values.getLong(0)));
             break;
           case DOUBLE:
             ret.add(new TimestampedValue(values.getDouble(column), values.getLong(0)));
             break;
           case FIXED:
           case BYTES:
             ret.add(new TimestampedValue(values.getBlob(column), values.getLong(0)));
             // Without this break the blob case fell through to default and always threw.
             break;
           default:
             throw new RuntimeException("Unsupported type.");
         }
         // Limit to one result if timespan is zero
         if (timespan == 0) {
           break;
         }
       } while (values.moveToNext());
     }
   } finally {
     // Close in finally so the cursor is released even when a conversion above throws.
     if (values != null) {
       try {
         values.close();
       } catch (Exception e) {
         LOG.warn("Error closing cursor ignored.", e);
       }
     }
   }
   if (ret == null) {
     ret = new ArrayList<TimestampedValue>(0);
   }
   return ret;
 }
Beispiel #7
0
 /** Verifies that an Avro union of int and boolean is mapped to the matching ORC union type. */
 @Test
 public void test_getOrcField_union() throws Exception {
   final SchemaBuilder.FieldAssembler<Schema> fields =
       SchemaBuilder.record("testRecord").namespace("any.data").fields();
   fields.name("union").type().unionOf().intType().and().booleanType().endUnion().noDefault();
   final Schema recordSchema = fields.endRecord();

   final Schema unionSchema = recordSchema.getField("union").schema();
   final TypeInfo actual = NiFiOrcUtils.getOrcField(unionSchema);

   final TypeInfo expected =
       TypeInfoFactory.getUnionTypeInfo(
           Arrays.asList(TypeInfoCreator.createInt(), TypeInfoCreator.createBoolean()));
   assertEquals(expected, actual);
 }
Beispiel #8
0
 /** Verifies that an Avro map of doubles is mapped to an ORC map from string to double. */
 @Test
 public void test_getOrcField_map() throws Exception {
   final SchemaBuilder.FieldAssembler<Schema> fields =
       SchemaBuilder.record("testRecord").namespace("any.data").fields();
   fields.name("map").type().map().values().doubleType().noDefault();
   final Schema recordSchema = fields.endRecord();

   final Schema mapSchema = recordSchema.getField("map").schema();
   final TypeInfo actual = NiFiOrcUtils.getOrcField(mapSchema);

   final TypeInfo expected =
       TypeInfoFactory.getMapTypeInfo(
           TypeInfoCreator.createString(), TypeInfoCreator.createDouble());
   assertEquals(expected, actual);
 }
Beispiel #9
0
  /**
   * Compares two Avro schemas by type structure only.
   *
   * <p>Unlike {@code Schema.equals()}, this does not look at extras such as custom field
   * properties. Enum and fixed schemas are still compared with {@code equals()}; arrays, maps,
   * unions and records are compared recursively; every other type matches as soon as the type
   * tags match.
   *
   * @param schema1 The first schema to compare
   * @param schema2 The second schema to compare
   * @return True if the types are equal, otherwise false.
   */
  public static boolean avroSchemaTypesEqual(Schema schema1, Schema schema2) {
    final Schema.Type type = schema1.getType();
    if (type != schema2.getType()) {
      // Different type tags can never match.
      return false;
    }

    switch (type) {
      case ENUM:
      case FIXED:
        // Named leaf types: defer to Schema.equals.
        return schema1.equals(schema2);

      case ARRAY:
        // Arrays match when their element schemas match.
        return avroSchemaTypesEqual(schema1.getElementType(), schema2.getElementType());

      case MAP:
        // Maps match when their value schemas match.
        return avroSchemaTypesEqual(schema1.getValueType(), schema2.getValueType());

      case UNION:
        {
          // Unions match when they have the same branches, position by position.
          final int branchCount = schema1.getTypes().size();
          if (branchCount != schema2.getTypes().size()) {
            return false;
          }
          for (int idx = 0; idx < branchCount; idx++) {
            if (!avroSchemaTypesEqual(schema1.getTypes().get(idx), schema2.getTypes().get(idx))) {
              return false;
            }
          }
          return true;
        }

      case RECORD:
        {
          // Records match when they have the same number of fields and each field of the
          // first has a same-named, type-equal counterpart in the second.
          if (schema1.getFields().size() != schema2.getFields().size()) {
            return false;
          }
          for (Field left : schema1.getFields()) {
            final Field right = schema2.getField(left.name());
            if (right == null || !avroSchemaTypesEqual(left.schema(), right.schema())) {
              return false;
            }
          }
          return true;
        }

      default:
        // All remaining types are primitive, so equal type tags are sufficient.
        return true;
    }
  }
/**
 * Tests for {@code AvroRecordConverter}: name-based default conversion, explicit field mapping
 * from a nested record, failure on an unparsable value, and reporting of unmapped output fields.
 */
public class TestAvroRecordConverter {
  static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
  static final Map<String, String> EMPTY_MAPPING = ImmutableMap.of();
  // Input schema with two top-level fields and a nullable nested "parent" record.
  static final String NESTED_RECORD_SCHEMA_STRING =
      "{\n"
          + "    \"type\": \"record\",\n"
          + "    \"name\": \"NestedInput\",\n"
          + "    \"namespace\": \"org.apache.example\",\n"
          + "    \"fields\": [\n"
          + "        {\n"
          + "            \"name\": \"l1\",\n"
          + "            \"type\": \"long\"\n"
          + "        },\n"
          + "        {\n"
          + "            \"name\": \"s1\",\n"
          + "            \"type\": \"string\"\n"
          + "        },\n"
          + "        {\n"
          + "            \"name\": \"parent\",\n"
          + "            \"type\": [\"null\", {\n"
          + "              \"type\": \"record\",\n"
          + "              \"name\": \"parent\",\n"
          + "              \"fields\": [\n"
          + "                { \"name\": \"id\", \"type\": \"long\" },\n"
          + "                { \"name\": \"name\", \"type\": \"string\" }\n"
          + "              ]"
          + "            } ]"
          + "        }"
          + "   ] }";
  static final Schema NESTED_RECORD_SCHEMA = new Schema.Parser().parse(NESTED_RECORD_SCHEMA_STRING);
  static final Schema NESTED_PARENT_SCHEMA =
      AvroRecordConverter.getNonNullSchema(NESTED_RECORD_SCHEMA.getField("parent").schema());
  // Flat output schema; s1 is a long here although it is a string in the nested input schema.
  static final Schema UNNESTED_OUTPUT_SCHEMA =
      SchemaBuilder.record("Output")
          .namespace("org.apache.example")
          .fields()
          .requiredLong("l1")
          .requiredLong("s1")
          .optionalLong("parentId")
          .endRecord();

  /** Tests the case where we don't use a mapping file and just map records by name. */
  @Test
  public void testDefaultConversion() throws Exception {
    // We will convert s1 from string to long (or leave it null), ignore s2,
    // convert s3 from string to double, convert l1 from long to string,
    // and leave l2 the same.
    Schema input =
        SchemaBuilder.record("Input")
            .namespace("com.cloudera.edh")
            .fields()
            .nullableString("s1", "")
            .requiredString("s2")
            .requiredString("s3")
            .optionalLong("l1")
            .requiredLong("l2")
            .endRecord();
    Schema output =
        SchemaBuilder.record("Output")
            .namespace("com.cloudera.edh")
            .fields()
            .optionalLong("s1")
            .optionalString("l1")
            .requiredLong("l2")
            .requiredDouble("s3")
            .endRecord();

    AvroRecordConverter converter =
        new AvroRecordConverter(input, output, EMPTY_MAPPING, LocaleUtils.toLocale("en_US"));

    // First pass: nullable fields left null.
    Record inputRecord = new Record(input);
    inputRecord.put("s1", null);
    inputRecord.put("s2", "blah");
    inputRecord.put("s3", "5.5");
    inputRecord.put("l1", null);
    inputRecord.put("l2", 5L);
    Record outputRecord = converter.convert(inputRecord);
    assertNull(outputRecord.get("s1"));
    assertNull(outputRecord.get("l1"));
    assertEquals(5L, outputRecord.get("l2"));
    assertEquals(5.5, outputRecord.get("s3"));

    // Second pass: every field populated, including a double in exponent notation.
    inputRecord.put("s1", "500");
    inputRecord.put("s2", "blah");
    inputRecord.put("s3", "5.5e-5");
    inputRecord.put("l1", 100L);
    inputRecord.put("l2", 2L);
    outputRecord = converter.convert(inputRecord);
    assertEquals(500L, outputRecord.get("s1"));
    assertEquals("100", outputRecord.get("l1"));
    assertEquals(2L, outputRecord.get("l2"));
    assertEquals(5.5e-5, outputRecord.get("s3"));
  }

  /** Tests the case where we want to default map one field and explicitly map another. */
  @Test
  public void testExplicitMapping() throws Exception {
    // l1 is copied as-is and s1 is converted from string to long, both matched by name;
    // the nested parent.id is explicitly mapped onto the top-level parentId field.
    Schema input = NESTED_RECORD_SCHEMA;
    Schema parent = NESTED_PARENT_SCHEMA;
    Schema output = UNNESTED_OUTPUT_SCHEMA;
    Map<String, String> mapping = ImmutableMap.of("parent.id", "parentId");

    AvroRecordConverter converter = new AvroRecordConverter(input, output, mapping);

    Record inputRecord = new Record(input);
    inputRecord.put("l1", 5L);
    inputRecord.put("s1", "1000");
    Record parentRecord = new Record(parent);
    parentRecord.put("id", 200L);
    parentRecord.put("name", "parent");
    inputRecord.put("parent", parentRecord);
    Record outputRecord = converter.convert(inputRecord);
    assertEquals(5L, outputRecord.get("l1"));
    assertEquals(1000L, outputRecord.get("s1"));
    assertEquals(200L, outputRecord.get("parentId"));
  }

  /** Tests the case where we try to convert a string to a long incorrectly. */
  @Test(
      expected = org.apache.nifi.processors.kite.AvroRecordConverter.AvroConversionException.class)
  public void testIllegalConversion() throws Exception {
    // s1 is a string in the input and a long in the output; the value "blah" set below is
    // not a number, so the conversion is expected to fail.
    Schema input =
        SchemaBuilder.record("Input")
            .namespace("com.cloudera.edh")
            .fields()
            .nullableString("s1", "")
            .requiredString("s2")
            .optionalLong("l1")
            .requiredLong("l2")
            .endRecord();
    Schema output =
        SchemaBuilder.record("Output")
            .namespace("com.cloudera.edh")
            .fields()
            .optionalLong("s1")
            .optionalString("l1")
            .requiredLong("l2")
            .endRecord();

    AvroRecordConverter converter = new AvroRecordConverter(input, output, EMPTY_MAPPING);

    Record inputRecord = new Record(input);
    inputRecord.put("s1", "blah");
    inputRecord.put("s2", "blah");
    inputRecord.put("l1", null);
    inputRecord.put("l2", 5L);
    converter.convert(inputRecord);
  }

  /**
   * Tests that output fields without a usable source are reported as unmapped, and that a valid
   * explicit mapping leaves nothing unmapped.
   */
  @Test
  public void testGetUnmappedFields() throws Exception {
    Schema input =
        SchemaBuilder.record("Input")
            .namespace("com.cloudera.edh")
            .fields()
            .nullableString("s1", "")
            .requiredString("s2")
            .optionalLong("l1")
            .requiredLong("l2")
            .endRecord();
    Schema output =
        SchemaBuilder.record("Output")
            .namespace("com.cloudera.edh")
            .fields()
            .optionalLong("field")
            .endRecord();

    // Test the case where the field isn't mapped at all.
    AvroRecordConverter converter = new AvroRecordConverter(input, output, EMPTY_MAPPING);
    assertEquals(ImmutableList.of("field"), converter.getUnmappedFields());

    // Test the case where we tried to map from a non-existent field.
    converter =
        new AvroRecordConverter(input, output, ImmutableMap.of("nonExistentField", "field"));
    assertEquals(ImmutableList.of("field"), converter.getUnmappedFields());

    // Test the case where we tried to map from a non-existent record.
    converter =
        new AvroRecordConverter(input, output, ImmutableMap.of("parent.nonExistentField", "field"));
    assertEquals(ImmutableList.of("field"), converter.getUnmappedFields());

    // Test a valid case
    converter = new AvroRecordConverter(input, output, ImmutableMap.of("l2", "field"));
    assertEquals(Collections.emptyList(), converter.getUnmappedFields());
  }
}
Beispiel #11
0
  /**
   * Builds an Avro record for a status (tweet) according to the given record schema.
   *
   * <p>Scalar attributes are copied field by field. Optional attributes (reply ids equal to
   * {@code -1}, null withheld countries, geo location and place) are simply not set. The
   * {@code user}, {@code quoted_status} and {@code retweeted_status} sub-records are only built
   * when the schema actually declares that field; nested statuses are built by calling this
   * method recursively with the field's sub-schema.
   *
   * @param schema the record schema to build against
   * @param status the status to convert
   * @return the record produced by the builder
   */
  public static Record buildTweet(Schema schema, Status status) {
    GenericRecordBuilder builderTweet = new GenericRecordBuilder(schema);

    builderTweet.set("created_at", status.getCreatedAt().getTime());
    builderTweet.set("favorite_count", status.getFavoriteCount());
    builderTweet.set("favorited", status.isFavorited());
    builderTweet.set("id", status.getId());
    builderTweet.set("in_reply_to_screen_name", status.getInReplyToScreenName());
    // -1 is treated as "not a reply"; the field is left unset in that case.
    if (status.getInReplyToStatusId() != -1) {
      builderTweet.set("in_reply_to_status_id", status.getInReplyToStatusId());
    }
    if (status.getInReplyToUserId() != -1) {
      builderTweet.set("in_reply_to_user_id", status.getInReplyToUserId());
    }
    builderTweet.set("lang", status.getLang());
    builderTweet.set("possibly_sensitive", status.isPossiblySensitive());
    builderTweet.set("retweet_count", status.getRetweetCount());
    builderTweet.set("retweeted", status.isRetweeted());
    builderTweet.set("source", status.getSource());
    builderTweet.set("text", status.getText());
    builderTweet.set("truncated", status.isTruncated());
    if (status.getWithheldInCountries() != null) {
      builderTweet.set("withheld_in_countries", Arrays.asList(status.getWithheldInCountries()));
    }
    if (status.getGeoLocation() != null) {
      builderTweet.set(
          "coordinates",
          Arrays.asList(
              status.getGeoLocation().getLatitude(), status.getGeoLocation().getLongitude()));
    }

    builderTweet.set("entities", buildEntities(schema.getField("entities").schema(), status));

    // NOTE(review): getTypes().get(1) assumes the field is a union whose second branch is the
    // record schema (presumably ["null", record]) - confirm against the schema definition.
    if (status.getPlace() != null) {
      builderTweet.set(
          "place",
          buildPlace(schema.getField("place").schema().getTypes().get(1), status.getPlace()));
    }

    User user = status.getUser();
    if (user != null && schema.getField("user") != null) {
      Schema schemaUser = schema.getField("user").schema();
      GenericRecordBuilder builderUser = new GenericRecordBuilder(schemaUser);
      builderUser.set("contributors_enabled", user.isContributorsEnabled());
      builderUser.set("created_at", user.getCreatedAt().getTime());
      builderUser.set("default_profile", user.isDefaultProfile());
      builderUser.set("default_profile_image", user.isDefaultProfileImage());
      builderUser.set("description", user.getDescription());
      builderUser.set(
          "entities",
          buildURLEntity(schemaUser.getField("entities").schema(), user.getURLEntity()));
      builderUser.set("favourites_count", user.getFavouritesCount());
      builderUser.set("followers_count", user.getFollowersCount());
      builderUser.set("friends_count", user.getFriendsCount());
      builderUser.set("geo_enabled", user.isGeoEnabled());
      builderUser.set("id", user.getId());
      builderUser.set("is_translator", user.isTranslator());
      builderUser.set("lang", user.getLang());
      builderUser.set("listed_count", user.getListedCount());
      builderUser.set("location", user.getLocation());
      builderUser.set("name", user.getName());
      builderUser.set("screen_name", user.getScreenName());
      builderUser.set("profile_background_color", user.getProfileBackgroundColor());
      builderUser.set("profile_background_image_url", user.getProfileBackgroundImageURL());
      builderUser.set(
          "profile_background_image_url_https", user.getProfileBackgroundImageUrlHttps());
      builderUser.set("profile_background_tile", user.isProfileBackgroundTiled());
      builderUser.set("profile_banner_url", user.getProfileBannerURL());
      builderUser.set("profile_image_url", user.getProfileImageURL());
      // Was populated from the background-image getter (copy-paste slip); use the HTTPS
      // profile image URL that the field name asks for.
      builderUser.set("profile_image_url_https", user.getProfileImageURLHttps());
      builderUser.set("profile_link_color", user.getProfileLinkColor());
      builderUser.set("profile_sidebar_border_color", user.getProfileSidebarBorderColor());
      builderUser.set("profile_sidebar_fill_color", user.getProfileSidebarFillColor());
      builderUser.set("profile_text_color", user.getProfileTextColor());
      builderUser.set("profile_use_background_image", user.isProfileUseBackgroundImage());
      builderUser.set("protected", user.isProtected());
      builderUser.set("show_all_inline_media", user.isShowAllInlineMedia());
      builderUser.set("statuses_count", user.getStatusesCount());
      builderUser.set("time_zone", user.getTimeZone());
      builderUser.set("url", user.getURL());
      builderUser.set("utc_offset", user.getUtcOffset());
      builderUser.set("verified", user.isVerified());
      if (user.getStatus() != null && schemaUser.getField("status") != null) {
        builderUser.set(
            "status",
            buildTweet(schemaUser.getField("status").schema().getTypes().get(1), user.getStatus()));
      }
      if (user.getWithheldInCountries() != null) {
        builderUser.set("withheld_in_countries", Arrays.asList(user.getWithheldInCountries()));
      }
      builderTweet.set("user", builderUser.build());
    }

    if (status.getQuotedStatus() != null && schema.getField("quoted_status") != null) {
      builderTweet.set(
          "quoted_status",
          buildTweet(
              schema.getField("quoted_status").schema().getTypes().get(1),
              status.getQuotedStatus()));
    }

    if (status.getRetweetedStatus() != null && schema.getField("retweeted_status") != null) {
      builderTweet.set(
          "retweeted_status",
          buildTweet(
              schema.getField("retweeted_status").schema().getTypes().get(1),
              status.getRetweetedStatus()));
    }

    return builderTweet.build();
  }
Beispiel #12
0
  /**
   * Builds the "entities" record of a status.
   *
   * <p>Six entity lists are filled in: hashtags, symbols, media, urls, user mentions and
   * extended media. Each list field is set to an empty list when the status has no entities of
   * that kind; otherwise one record per entity is built against the element schema of the
   * corresponding array field.
   *
   * @param schemaEntities the record schema of the entities field
   * @param status the status whose entities are converted
   * @return the record produced by the builder
   */
  private static Record buildEntities(Schema schemaEntities, Status status) {
    GenericRecordBuilder entities = new GenericRecordBuilder(schemaEntities);

    // Hashtags.
    if (status.getHashtagEntities().length == 0) {
      entities.set("hashtags", Collections.emptyList());
    } else {
      Schema hashtagSchema = schemaEntities.getField("hashtags").schema().getElementType();
      List<GenericRecord> hashtags = new ArrayList<>();
      for (HashtagEntity hashtag : status.getHashtagEntities()) {
        GenericRecordBuilder hashtagBuilder = new GenericRecordBuilder(hashtagSchema);
        hashtagBuilder.set("text", hashtag.getText());
        hashtagBuilder.set("start", hashtag.getStart());
        hashtagBuilder.set("end", hashtag.getEnd());
        hashtags.add(hashtagBuilder.build());
      }
      entities.set("hashtags", hashtags);
    }

    // Symbols.
    if (status.getSymbolEntities().length == 0) {
      entities.set("symbols", Collections.emptyList());
    } else {
      Schema symbolSchema = schemaEntities.getField("symbols").schema().getElementType();
      List<GenericRecord> symbols = new ArrayList<>();
      for (SymbolEntity symbol : status.getSymbolEntities()) {
        GenericRecordBuilder symbolBuilder = new GenericRecordBuilder(symbolSchema);
        symbolBuilder.set("text", symbol.getText());
        symbolBuilder.set("start", symbol.getStart());
        symbolBuilder.set("end", symbol.getEnd());
        symbols.add(symbolBuilder.build());
      }
      entities.set("symbols", symbols);
    }

    // Media, including the per-size records keyed by the stringified size id.
    if (status.getMediaEntities().length == 0) {
      entities.set("media", Collections.emptyList());
    } else {
      Schema mediaSchema = schemaEntities.getField("media").schema().getElementType();
      List<GenericRecord> mediaRecords = new ArrayList<>();
      for (MediaEntity media : status.getMediaEntities()) {
        GenericRecordBuilder mediaBuilder = new GenericRecordBuilder(mediaSchema);
        mediaBuilder.set("url", media.getURL());
        mediaBuilder.set("display_url", media.getDisplayURL());
        mediaBuilder.set("expanded_url", media.getExpandedURL());
        mediaBuilder.set("id", media.getId());
        mediaBuilder.set("media_url", media.getMediaURL());
        mediaBuilder.set("media_url_https", media.getMediaURLHttps());
        mediaBuilder.set("type", media.getType());
        mediaBuilder.set("text", media.getText());
        mediaBuilder.set("start", media.getStart());
        mediaBuilder.set("end", media.getEnd());

        Schema sizeSchema = mediaSchema.getField("sizes").schema().getValueType();
        // One builder is reused for all sizes; each build() call yields the record for that size.
        GenericRecordBuilder sizeBuilder = new GenericRecordBuilder(sizeSchema);
        Map<String, GenericRecord> sizesByKey = new HashMap<>(4);
        for (int sizeKey : media.getSizes().keySet()) {
          Size size = media.getSizes().get(sizeKey);
          sizeBuilder.set("h", size.getHeight());
          sizeBuilder.set("w", size.getWidth());
          sizeBuilder.set("resize", size.getResize());
          sizesByKey.put(Integer.toString(sizeKey), sizeBuilder.build());
        }
        mediaBuilder.set("sizes", sizesByKey);
        mediaRecords.add(mediaBuilder.build());
      }
      entities.set("media", mediaRecords);
    }

    // URLs, delegated to buildURLEntity.
    if (status.getURLEntities().length == 0) {
      entities.set("urls", Collections.emptyList());
    } else {
      Schema urlSchema = schemaEntities.getField("urls").schema().getElementType();
      List<GenericRecord> urls = new ArrayList<>();
      for (URLEntity url : status.getURLEntities()) {
        urls.add(buildURLEntity(urlSchema, url));
      }
      entities.set("urls", urls);
    }

    // User mentions.
    if (status.getUserMentionEntities().length == 0) {
      entities.set("user_mentions", Collections.emptyList());
    } else {
      Schema mentionSchema = schemaEntities.getField("user_mentions").schema().getElementType();
      List<GenericRecord> mentions = new ArrayList<>();
      for (UserMentionEntity mention : status.getUserMentionEntities()) {
        GenericRecordBuilder mentionBuilder = new GenericRecordBuilder(mentionSchema);
        mentionBuilder.set("name", mention.getName());
        mentionBuilder.set("screen_name", mention.getScreenName());
        mentionBuilder.set("text", mention.getText());
        mentionBuilder.set("id", mention.getId());
        mentionBuilder.set("start", mention.getStart());
        mentionBuilder.set("end", mention.getEnd());
        mentions.add(mentionBuilder.build());
      }
      entities.set("user_mentions", mentions);
    }

    // Extended media: same attributes as media plus a video_info record with its variants.
    if (status.getExtendedMediaEntities().length == 0) {
      entities.set("extended_entities", Collections.emptyList());
    } else {
      Schema extendedSchema =
          schemaEntities.getField("extended_entities").schema().getElementType();
      List<GenericRecord> extendedRecords = new ArrayList<>();
      for (ExtendedMediaEntity extended : status.getExtendedMediaEntities()) {
        GenericRecordBuilder extendedBuilder = new GenericRecordBuilder(extendedSchema);
        extendedBuilder.set("url", extended.getURL());
        extendedBuilder.set("display_url", extended.getDisplayURL());
        extendedBuilder.set("expanded_url", extended.getExpandedURL());
        extendedBuilder.set("id", extended.getId());
        extendedBuilder.set("media_url", extended.getMediaURL());
        extendedBuilder.set("media_url_https", extended.getMediaURLHttps());
        extendedBuilder.set("type", extended.getType());
        extendedBuilder.set("text", extended.getText());
        extendedBuilder.set("start", extended.getStart());
        extendedBuilder.set("end", extended.getEnd());

        Schema sizeSchema = extendedSchema.getField("sizes").schema().getValueType();
        GenericRecordBuilder sizeBuilder = new GenericRecordBuilder(sizeSchema);
        Map<String, GenericRecord> sizesByKey = new HashMap<>(4);
        for (int sizeKey : extended.getSizes().keySet()) {
          Size size = extended.getSizes().get(sizeKey);
          sizeBuilder.set("h", size.getHeight());
          sizeBuilder.set("w", size.getWidth());
          sizeBuilder.set("resize", size.getResize());
          sizesByKey.put(Integer.toString(sizeKey), sizeBuilder.build());
        }
        extendedBuilder.set("sizes", sizesByKey);

        Schema videoInfoSchema = extendedSchema.getField("video_info").schema();
        GenericRecordBuilder videoInfoBuilder = new GenericRecordBuilder(videoInfoSchema);
        videoInfoBuilder.set("h", extended.getVideoAspectRatioHeight());
        videoInfoBuilder.set("w", extended.getVideoAspectRatioWidth());
        videoInfoBuilder.set("duration_millis", extended.getVideoDurationMillis());

        Schema variantSchema = videoInfoSchema.getField("variants").schema().getElementType();
        List<GenericRecord> variants = new ArrayList<>();
        for (Variant variant : extended.getVideoVariants()) {
          GenericRecordBuilder variantBuilder = new GenericRecordBuilder(variantSchema);
          variantBuilder.set("bitrate", variant.getBitrate());
          variantBuilder.set("content_type", variant.getContentType());
          variantBuilder.set("url", variant.getUrl());
          variants.add(variantBuilder.build());
        }
        videoInfoBuilder.set("variants", variants);
        extendedBuilder.set("video_info", videoInfoBuilder.build());

        extendedRecords.add(extendedBuilder.build());
      }
      entities.set("extended_entities", extendedRecords);
    }

    return entities.build();
  }
Beispiel #13
0
  /**
   * Scans every record of {@code dataStream} and derives the expected results for the generated
   * test queries, replacing {@code aggregationQueries} and {@code groupByQueries}.
   *
   * <p>Three query families are produced, each filtering on a single {@code column='value'} pair:
   *
   * <ul>
   *   <li>{@code select count(*)} queries (skipped when {@code isRealtimeSegment} is set),
   *   <li>{@code select sum('metric')} queries,
   *   <li>{@code select sum('metric') ... group by} queries.
   * </ul>
   *
   * <p>Any generated query whose text contains the substring {@code "null"} is dropped. As a side
   * effect, {@code isSingleValueMap} and {@code dataTypeMap} are filled for every schema field that
   * is a dimension, a metric or the time column.
   *
   * @throws IOException if reading from or closing {@code dataStream} fails
   */
  public void generateSimpleAggregationOnSingleColumnFilters() throws IOException {
    // column -> column value -> number of records carrying that value
    final Map<String, Map<Object, Integer>> cardinalityCountsMap =
        new HashMap<String, Map<Object, Integer>>();
    // column -> column value -> metric -> running sum of that metric
    final Map<String, Map<Object, Map<String, Double>>> sumMap =
        new HashMap<String, Map<Object, Map<String, Double>>>();
    // "columnName:columnValue:metricName:groupByColumnName" -> group value -> running sum.
    // NOTE(review): the key is split on ':' further down, so a column value that itself contains
    // ':' yields a malformed query; left unchanged to keep the generated queries identical.
    final Map<String, Map<Object, Double>> sumGroupBy = new HashMap<String, Map<Object, Double>>();

    aggregationQueries = new ArrayList<AvroQueryGenerator.TestSimpleAggreationQuery>();
    groupByQueries = new ArrayList<AvroQueryGenerator.TestGroupByAggreationQuery>();
    for (final Field f : schema.getFields()) {
      final String fieldName = f.name();
      if (dimensions.contains(fieldName) || metrics.contains(fieldName) || time.equals(fieldName)) {
        isSingleValueMap.put(fieldName, isSingleValueField(f));
        dataTypeMap.put(fieldName, getColumnType(f));
        // Only non-metric columns are used as filter columns.
        if (!metrics.contains(fieldName)) {
          cardinalityCountsMap.put(fieldName, new HashMap<Object, Integer>());
        }
      }
    }

    for (final String column : cardinalityCountsMap.keySet()) {
      sumMap.put(column, new HashMap<Object, Map<String, Double>>());
    }

    // Close the stream even when a record cannot be processed, so the handle is not leaked.
    try {
      while (dataStream.hasNext()) {
        final GenericRecord record = dataStream.next();

        for (final String column : cardinalityCountsMap.keySet()) {
          Object value = record.get(column);

          // Substitute a type-specific placeholder for missing values.
          if (value == null) {
            switch (schema.getField(column).schema().getType()) {
              case INT:
                value = 0;
                break;
              case FLOAT:
                value = 0F;
                break;
              case LONG:
                value = 0L;
                break;
              case DOUBLE:
                value = 0D;
                break;
              case STRING:
              case BOOLEAN:
                value = "null";
                break;
              default:
                // Other schema types keep a null value.
                break;
            }
          }

          if (value instanceof Utf8) {
            value = ((Utf8) value).toString();
          }

          // Array-valued columns are not used as filters.
          if (value instanceof Array) {
            continue;
          }

          for (final String metricName : metrics) {
            // The raw (un-normalised) column value is used on purpose: a missing value renders as
            // "null" and the resulting query is filtered out below.
            final String groupbyKeyBase = column + ":" + record.get(column) + ":" + metricName;
            int dimCounter = 1;
            for (final String dim : cardinalityCountsMap.keySet()) {
              if (!dim.equals(column)) {
                dimCounter++;
                final String groupbyKey = groupbyKeyBase + ":" + dim;
                Map<Object, Double> groupSums = sumGroupBy.get(groupbyKey);
                if (groupSums == null) {
                  groupSums = new HashMap<Object, Double>();
                  sumGroupBy.put(groupbyKey, groupSums);
                }
                final Object groupValue = record.get(dim);
                if (groupSums.containsKey(groupValue)) {
                  groupSums.put(
                      groupValue,
                      getAppropriateNumberType(
                          metricName, record.get(metricName), groupSums.get(groupValue)));
                } else {
                  groupSums.put(
                      groupValue, Double.parseDouble(record.get(metricName).toString()));
                }
              }
              // Stop after three group-by columns have been recorded for this filter column.
              if (dimCounter == 4) {
                break;
              }
            }
          }

          final Map<Object, Integer> counts = cardinalityCountsMap.get(column);
          final Integer seenSoFar = counts.get(value);
          counts.put(value, seenSoFar == null ? 1 : seenSoFar + 1);

          final Map<Object, Map<String, Double>> sumsByValue = sumMap.get(column);
          Map<String, Double> metricSums = sumsByValue.get(value);
          if (metricSums == null) {
            metricSums = new HashMap<String, Double>();
            sumsByValue.put(value, metricSums);
          }

          for (final String metric : metrics) {
            if (metricSums.containsKey(metric)) {
              metricSums.put(
                  metric,
                  getAppropriateNumberType(metric, record.get(metric), metricSums.get(metric)));
            } else {
              metricSums.put(metric, getAppropriateNumberType(metric, record.get(metric), 0D));
            }
          }
        }
      }
    } finally {
      dataStream.close();
    }

    if (!isRealtimeSegment) {
      for (final Map.Entry<String, Map<Object, Integer>> columnEntry :
          cardinalityCountsMap.entrySet()) {
        for (final Map.Entry<Object, Integer> valueEntry : columnEntry.getValue().entrySet()) {
          final StringBuilder bld = new StringBuilder();
          bld.append("select count(*) from ");
          bld.append(resourceName);
          bld.append(" where ");
          bld.append(columnEntry.getKey());
          bld.append("=");
          bld.append("'");
          bld.append(valueEntry.getKey());
          bld.append("'");
          bld.append(" ");
          bld.append("limit 0");
          final String queryString = bld.toString();
          if (!queryString.contains("null")) {
            aggregationQueries.add(
                new TestSimpleAggreationQuery(
                    queryString, Double.valueOf(valueEntry.getValue().doubleValue())));
          }
        }
      }
    }

    for (final Map.Entry<String, Map<Object, Map<String, Double>>> columnEntry :
        sumMap.entrySet()) {
      for (final Map.Entry<Object, Map<String, Double>> valueEntry :
          columnEntry.getValue().entrySet()) {
        for (final Map.Entry<String, Double> metricEntry : valueEntry.getValue().entrySet()) {
          final StringBuilder bld = new StringBuilder();
          bld.append("select sum('" + metricEntry.getKey() + "') from ");
          bld.append(resourceName);
          bld.append(" where ");
          bld.append(columnEntry.getKey());
          bld.append("=");
          bld.append("'");
          bld.append(valueEntry.getKey());
          bld.append("'");
          bld.append(" ");
          bld.append("limit 0");
          final String queryString = bld.toString();
          if (!queryString.contains("null")) {
            aggregationQueries.add(
                new TestSimpleAggreationQuery(queryString, metricEntry.getValue()));
          }
        }
      }
    }

    for (final Map.Entry<String, Map<Object, Double>> groupEntry : sumGroupBy.entrySet()) {
      // Key layout: columnName:columnValue:metricName:groupByColumnName
      final String[] keyParts = groupEntry.getKey().split(":");
      final String columnName = keyParts[0];
      final String columnValue = keyParts[1];
      final String metricColumn = keyParts[2];
      final String groupByColumnName = keyParts[3];

      final StringBuilder bld = new StringBuilder();
      bld.append("select sum('" + metricColumn + "') from ");
      bld.append(resourceName);
      bld.append(" where ");
      bld.append(columnName);
      bld.append("=");
      bld.append("'");
      bld.append(columnValue);
      bld.append("'");
      bld.append(" ");
      bld.append(" group by ");
      bld.append(groupByColumnName);
      bld.append(" top 10 ");
      bld.append("limit 0");
      final String queryString = bld.toString();
      if (!queryString.contains("null")) {
        groupByQueries.add(new TestGroupByAggreationQuery(queryString, groupEntry.getValue()));
      }
    }
  }
  /**
   * Converts the shared {@code fields} fixture with {@code BigQueryAvroUtils.toGenericAvroSchema}
   * and checks the Avro schema produced for each field: one plain long, nullable unions for the
   * scalar fields, a nullable nested record and an array of nested records.
   */
  @Test
  public void testConvertBigQuerySchemaToAvroSchema() {
    TableSchema tableSchema = new TableSchema();
    tableSchema.setFields(fields);
    Schema avroSchema =
        BigQueryAvroUtils.toGenericAvroSchema("testSchema", tableSchema.getFields());

    // Expected schemas, named once so every assertion reads as "field -> expected type".
    final Schema nullableString =
        Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING));
    final Schema nullableLong =
        Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.LONG));
    final Schema nullableDouble =
        Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.DOUBLE));
    final Schema nullableBoolean =
        Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BOOLEAN));
    final Schema nullableBytes =
        Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BYTES));

    assertThat(avroSchema.getField("number").schema(), equalTo(Schema.create(Type.LONG)));
    assertThat(avroSchema.getField("species").schema(), equalTo(nullableString));
    assertThat(avroSchema.getField("quality").schema(), equalTo(nullableDouble));
    assertThat(avroSchema.getField("quantity").schema(), equalTo(nullableLong));
    assertThat(avroSchema.getField("birthday").schema(), equalTo(nullableLong));
    assertThat(avroSchema.getField("flighted").schema(), equalTo(nullableBoolean));
    assertThat(avroSchema.getField("sound").schema(), equalTo(nullableBytes));
    assertThat(avroSchema.getField("anniversaryDate").schema(), equalTo(nullableString));
    assertThat(avroSchema.getField("anniversaryDatetime").schema(), equalTo(nullableString));
    assertThat(avroSchema.getField("anniversaryTime").schema(), equalTo(nullableString));

    // Nested record: nullable union wrapping a record with a single nullable string field.
    final Schema scionRecord =
        Schema.createRecord(
            "scion",
            "org.apache.beam.sdk.io.gcp.bigquery",
            "Translated Avro Schema for scion",
            false,
            ImmutableList.of(new Field("species", nullableString, null, (Object) null)));
    assertThat(
        avroSchema.getField("scion").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), scionRecord)));

    // Repeated nested record: an array of records. A fresh Field instance is built for it.
    final Schema associatesRecord =
        Schema.createRecord(
            "associates",
            "org.apache.beam.sdk.io.gcp.bigquery",
            "Translated Avro Schema for associates",
            false,
            ImmutableList.of(new Field("species", nullableString, null, (Object) null)));
    assertThat(
        avroSchema.getField("associates").schema(), equalTo(Schema.createArray(associatesRecord)));
  }
  /**
   * Writes a record whose {@code myarrayofoptional} array contains null elements and expects the
   * write to fail with a message that mentions {@code parquet.avro.write-old-list-structure}.
   *
   * @throws Exception if the schema resource or the temporary file cannot be set up
   */
  @Test
  public void testArrayWithNullValues() throws Exception {
    Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream());

    // Only the unique path is needed: the file is removed right away and its path handed to the
    // writer. The result of delete() is deliberately not checked.
    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    GenericData.Record nestedRecord =
        new GenericRecordBuilder(schema.getField("mynestedrecord").schema())
            .set("mynestedint", 1)
            .build();

    List<Integer> integerArray = Arrays.asList(1, 2, 3);
    GenericData.Array<Integer> genericIntegerArray =
        new GenericData.Array<Integer>(
            Schema.createArray(Schema.create(Schema.Type.INT)), integerArray);

    GenericFixed genericFixed =
        new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65});

    List<Integer> emptyArray = new ArrayList<Integer>();
    ImmutableMap<String, Integer> emptyMap = new ImmutableMap.Builder<String, Integer>().build();

    // The array under test: optional ints with null elements mixed in.
    Schema arrayOfOptionalIntegers = Schema.createArray(optional(Schema.create(Schema.Type.INT)));
    GenericData.Array<Integer> genericIntegerArrayWithNulls =
        new GenericData.Array<Integer>(arrayOfOptionalIntegers, Arrays.asList(1, null, 2, null, 3));

    GenericData.Record record =
        new GenericRecordBuilder(schema)
            .set("mynull", null)
            .set("myboolean", true)
            .set("myint", 1)
            .set("mylong", 2L)
            .set("myfloat", 3.1f)
            .set("mydouble", 4.1)
            .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
            .set("mystring", "hello")
            .set("mynestedrecord", nestedRecord)
            .set("myenum", "a")
            .set("myarray", genericIntegerArray)
            .set("myemptyarray", emptyArray)
            .set("myoptionalarray", genericIntegerArray)
            .set("myarrayofoptional", genericIntegerArrayWithNulls)
            .set("mymap", ImmutableMap.of("a", 1, "b", 2))
            .set("myemptymap", emptyMap)
            .set("myfixed", genericFixed)
            .build();

    final AvroParquetWriter<GenericRecord> writer =
        new AvroParquetWriter<GenericRecord>(file, schema);

    try {
      writer.write(record);
      fail("Should not succeed writing an array with null values");
    } catch (Exception e) {
      // Guard against a null message so a message-less exception fails the assertion cleanly
      // instead of raising a NullPointerException.
      final String message = e.getMessage();
      Assert.assertTrue(
          "Error message should provide context and help",
          message != null && message.contains("parquet.avro.write-old-list-structure"));
    } finally {
      writer.close();
    }
  }
  /**
   * Round-trips one record covering every field of {@code all.avsc} through an
   * {@code AvroParquetWriter} and an {@code AvroParquetReader}, then checks each field of the
   * record that was read back.
   *
   * @throws Exception if the schema resource, the temporary file, or the Parquet I/O fails
   */
  @Test
  public void testAll() throws Exception {
    Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream());

    // Only the unique path is needed: the file is removed right away and its path handed to the
    // writer. The result of delete() is deliberately not checked.
    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    GenericData.Record nestedRecord =
        new GenericRecordBuilder(schema.getField("mynestedrecord").schema())
            .set("mynestedint", 1)
            .build();

    List<Integer> integerArray = Arrays.asList(1, 2, 3);
    GenericData.Array<Integer> genericIntegerArray =
        new GenericData.Array<Integer>(
            Schema.createArray(Schema.create(Schema.Type.INT)), integerArray);

    GenericFixed genericFixed =
        new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65});

    List<Integer> emptyArray = new ArrayList<Integer>();
    ImmutableMap<String, Integer> emptyMap = new ImmutableMap.Builder<String, Integer>().build();

    GenericData.Record record =
        new GenericRecordBuilder(schema)
            .set("mynull", null)
            .set("myboolean", true)
            .set("myint", 1)
            .set("mylong", 2L)
            .set("myfloat", 3.1f)
            .set("mydouble", 4.1)
            .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
            .set("mystring", "hello")
            .set("mynestedrecord", nestedRecord)
            .set("myenum", "a")
            .set("myarray", genericIntegerArray)
            .set("myemptyarray", emptyArray)
            .set("myoptionalarray", genericIntegerArray)
            .set("myarrayofoptional", genericIntegerArray)
            .set("mymap", ImmutableMap.of("a", 1, "b", 2))
            .set("myemptymap", emptyMap)
            .set("myfixed", genericFixed)
            .build();

    // The writer is closed even when write() throws, so a failure does not leak the handle.
    AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema);
    try {
      writer.write(record);
    } finally {
      writer.close();
    }

    // The reader was never closed originally; the assertions stay inside the try so the record
    // is inspected while the reader is still open.
    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    try {
      GenericRecord nextRecord = reader.read();

      // In compat mode the enum is expected back as a plain string instead of an EnumSymbol.
      Object expectedEnumSymbol =
          compat ? "a" : new GenericData.EnumSymbol(schema.getField("myenum").schema(), "a");

      assertNotNull(nextRecord);
      assertEquals(null, nextRecord.get("mynull"));
      assertEquals(true, nextRecord.get("myboolean"));
      assertEquals(1, nextRecord.get("myint"));
      assertEquals(2L, nextRecord.get("mylong"));
      assertEquals(3.1f, nextRecord.get("myfloat"));
      assertEquals(4.1, nextRecord.get("mydouble"));
      assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
      assertEquals(str("hello"), nextRecord.get("mystring"));
      assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
      assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
      assertEquals(integerArray, nextRecord.get("myarray"));
      assertEquals(emptyArray, nextRecord.get("myemptyarray"));
      assertEquals(integerArray, nextRecord.get("myoptionalarray"));
      assertEquals(integerArray, nextRecord.get("myarrayofoptional"));
      assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap"));
      assertEquals(emptyMap, nextRecord.get("myemptymap"));
      assertEquals(genericFixed, nextRecord.get("myfixed"));
    } finally {
      reader.close();
    }
  }
 /**
  * Looks up a field of {@code schema} by name and reports the type of that field's schema.
  *
  * @param fieldName the name of the field to look up in {@code schema}
  * @return the type reported by the schema of the named field
  */
 private Type getType(final String fieldName) {
   final Type fieldType = schema.getField(fieldName).schema().getType();
   return fieldType;
 }
Beispiel #18
0
    /**
     * Translates an Avro generic value into its Data counterpart, recursing into maps, arrays,
     * records and unions and tracking the current position in {@code _path}.
     *
     * <p>When the data schema has an {@code AvroOverride}, the whole translation is delegated to
     * its custom translator. Validation problems are reported through {@code appendMessage} and
     * yield {@code BAD_RESULT} for the offending value.
     *
     * @param value the Avro generic value to translate
     * @param dataSchema the data schema describing the expected result
     * @param avroSchema the Avro schema describing {@code value}
     * @return the translated value, or {@code BAD_RESULT} if the value failed a check
     */
    private Object translate(Object value, DataSchema dataSchema, Schema avroSchema) {
      final AvroOverride override = getAvroOverride(dataSchema);
      if (override != null) {
        return override
            .getCustomDataTranslator()
            .avroGenericToData(this, value, avroSchema, dataSchema);
      }

      final DataSchema resolvedSchema = dataSchema.getDereferencedDataSchema();
      final DataSchema.Type kind = resolvedSchema.getType();
      switch (kind) {
        case NULL: {
          if (value != null) {
            appendMessage("value must be null for null schema");
            return BAD_RESULT;
          }
          return Data.NULL;
        }
        case BOOLEAN:
          return ((Boolean) value).booleanValue();
        case INT:
          return ((Number) value).intValue();
        case LONG:
          return ((Number) value).longValue();
        case FLOAT:
          return ((Number) value).floatValue();
        case DOUBLE:
          return ((Number) value).doubleValue();
        case STRING:
          return value.toString();
        case BYTES: {
          final ByteBuffer buffer = (ByteBuffer) value;
          final ByteString copied = ByteString.copy(buffer);
          // Rewind after copying so the caller's buffer is positioned at its start again.
          buffer.rewind();
          return copied;
        }
        case ENUM: {
          final String symbol = value.toString();
          final EnumDataSchema enumSchema = (EnumDataSchema) resolvedSchema;
          if (!enumSchema.getSymbols().contains(symbol)) {
            appendMessage("enum value %1$s not one of %2$s", symbol, enumSchema.getSymbols());
            return BAD_RESULT;
          }
          return symbol;
        }
        case FIXED: {
          final GenericFixed avroFixed = (GenericFixed) value;
          final byte[] rawBytes = avroFixed.bytes();
          final FixedDataSchema fixedSchema = (FixedDataSchema) resolvedSchema;
          if (fixedSchema.getSize() != rawBytes.length) {
            appendMessage(
                "GenericFixed size %1$d != FixedDataSchema size %2$d",
                rawBytes.length, fixedSchema.getSize());
            return BAD_RESULT;
          }
          return ByteString.copy(rawBytes);
        }
        case MAP: {
          @SuppressWarnings("unchecked")
          final Map<?, Object> avroMap = (Map<?, Object>) value;
          final DataSchema valueDataSchema = ((MapDataSchema) resolvedSchema).getValues();
          final Schema valueAvroSchema = avroSchema.getValueType();
          final DataMap translatedMap = new DataMap(avroMap.size());
          for (Map.Entry<?, Object> avroEntry : avroMap.entrySet()) {
            final String key = avroEntry.getKey().toString();
            _path.addLast(key);
            final Object translatedValue =
                translate(avroEntry.getValue(), valueDataSchema, valueAvroSchema);
            _path.removeLast();
            translatedMap.put(key, translatedValue);
          }
          return translatedMap;
        }
        case ARRAY: {
          final GenericArray<?> avroArray = (GenericArray<?>) value;
          final DataSchema itemDataSchema = ((ArrayDataSchema) resolvedSchema).getItems();
          final Schema itemAvroSchema = avroSchema.getElementType();
          final DataList translatedList = new DataList(avroArray.size());
          for (int index = 0; index < avroArray.size(); index++) {
            _path.addLast(index);
            final Object translatedItem =
                translate(avroArray.get(index), itemDataSchema, itemAvroSchema);
            _path.removeLast();
            translatedList.add(translatedItem);
          }
          return translatedList;
        }
        case RECORD: {
          final GenericRecord avroRecord = (GenericRecord) value;
          final RecordDataSchema recordSchema = (RecordDataSchema) resolvedSchema;
          final DataMap translatedRecord = new DataMap(avroSchema.getFields().size());
          for (RecordDataSchema.Field dataField : recordSchema.getFields()) {
            final String name = dataField.getName();
            final Object avroFieldValue = avroRecord.get(name);
            // A null here means the Avro schema lacks the field or the field holds null. Either
            // way nothing is added to the result. Optional/required/default are not enforced
            // here; callers can run ValidateDataAgainstSchema with the RequiredMode they want.
            if (avroFieldValue == null) {
              continue;
            }
            final boolean optional = dataField.getOptional();
            final DataSchema fieldDataSchema = dataField.getType();
            Schema fieldAvroSchema = avroSchema.getField(name).schema();
            if (optional && (fieldDataSchema.getDereferencedType() != DataSchema.Type.UNION)) {
              // An optional non-union field is a two-member Avro union: null and the field type.
              final Map.Entry<String, Schema> unionMember =
                  findUnionMember(fieldDataSchema, fieldAvroSchema);
              if (unionMember == null) {
                continue;
              }
              fieldAvroSchema = unionMember.getValue();
            }
            _path.addLast(name);
            translatedRecord.put(name, translate(avroFieldValue, fieldDataSchema, fieldAvroSchema));
            _path.removeLast();
          }
          return translatedRecord;
        }
        case UNION: {
          final UnionDataSchema unionSchema = (UnionDataSchema) resolvedSchema;
          final Map.Entry<DataSchema, Schema> member =
              findUnionMemberSchema(value, unionSchema, avroSchema);
          if (member == null) {
            return BAD_RESULT;
          }
          if (value == null) {
            // The matched member must be the "null" schema.
            return Data.NULL;
          }
          final DataSchema memberDataSchema = member.getKey();
          final Schema memberAvroSchema = member.getValue();
          final String memberKey = memberDataSchema.getUnionMemberKey();
          final DataMap wrapped = new DataMap(1);
          _path.addLast(memberKey);
          wrapped.put(memberKey, translate(value, memberDataSchema, memberAvroSchema));
          _path.removeLast();
          return wrapped;
        }
        default:
          appendMessage("schema type unknown %1$s", resolvedSchema.getType());
          return BAD_RESULT;
      }
    }
  /**
   * Checks that the Avro schema about to be pushed matches the one already registered for
   * {@code storeName} on the cluster, creating the store when it does not exist yet.
   *
   * <p>No Avro schema-evolution check is performed: local and remote key/value schemas must be
   * equal. On success {@code storeDefs} and {@code cluster} are updated.
   *
   * @param url bootstrap URL handed to the {@code AdminClient}
   * @throws Exception if schemas or store definitions do not match, if a new store lacks a
   *     description or owners, or if talking to the cluster fails
   */
  public void verifyAvroSchema(String url) throws Exception {
    final String avroGenericOpen =
        "\n\t\t<type>avro-generic</type>\n\t\t<schema-info version=\"0\">";
    final String schemaInfoClose = "</schema-info>";

    // Build a new store definition from the schema stored with the input path.
    Schema schema = AvroUtils.getAvroSchemaFromPath(getInputPath());
    int replicationFactor = props.getInt("build.replication.factor", 2);
    int requiredReads = props.getInt("build.required.reads", 1);
    int requiredWrites = props.getInt("build.required.writes", 1);
    String description = props.getString("push.store.description", "");
    String owners = props.getString("push.store.owners", "");

    String keySchema =
        avroGenericOpen + schema.getField(keyField).schema() + schemaInfoClose + "\n\t";
    String valSchema =
        avroGenericOpen + schema.getField(valueField).schema() + schemaInfoClose + "\n\t";

    final boolean hasCompression = props.containsKey("build.compress.value");
    if (hasCompression) {
      valSchema += "\t<compression><type>gzip</type></compression>\n\t";
    }

    // Explicitly forced schemas replace the ones derived from the input data.
    if (props.containsKey("build.force.schema.key")) {
      keySchema = props.get("build.force.schema.key");
    }
    if (props.containsKey("build.force.schema.value")) {
      valSchema = props.get("build.force.schema.value");
    }

    // Optional settings: null when the property is absent.
    final Integer preferredReads =
        props.containsKey("build.preferred.reads") ? props.getInt("build.preferred.reads") : null;
    final Integer preferredWrites =
        props.containsKey("build.preferred.writes")
            ? props.getInt("build.preferred.writes")
            : null;
    final String pushedKeySchema =
        props.containsKey("push.force.schema.key")
            ? props.getString("push.force.schema.key")
            : keySchema;
    final String pushedValSchema =
        props.containsKey("push.force.schema.value")
            ? props.getString("push.force.schema.value")
            : valSchema;

    String newStoreDefXml =
        VoldemortUtils.getStoreDefXml(
            storeName,
            replicationFactor,
            requiredReads,
            requiredWrites,
            preferredReads,
            preferredWrites,
            pushedKeySchema,
            pushedValSchema,
            description,
            owners);

    log.info("Verifying store: \n" + newStoreDefXml.toString());

    StoreDefinition newStoreDef = VoldemortUtils.getStoreDef(newStoreDefXml);

    // Fetch the store definitions currently known to the cluster.
    log.info("Getting store definition from: " + url + " (node id " + this.nodeId + ")");

    AdminClient adminClient = new AdminClient(url, new AdminClientConfig());
    try {
      List<StoreDefinition> remoteStoreDefs =
          adminClient.getRemoteStoreDefList(this.nodeId).getValue();
      boolean foundStore = false;

      // Look for a remote store with the same name as the one being built.
      for (StoreDefinition remoteStoreDef : remoteStoreDefs) {
        if (!remoteStoreDef.getName().equals(storeName)) {
          continue;
        }

        // The store exists; a mismatch with what we want to push needs a closer look.
        if (!remoteStoreDef.equals(newStoreDef)) {
          // Compare the key/value serializers as parsed schemas, not as strings.
          SerializerDefinition localKeySerializerDef = newStoreDef.getKeySerializer();
          SerializerDefinition localValueSerializerDef = newStoreDef.getValueSerializer();
          SerializerDefinition remoteKeySerializerDef = remoteStoreDef.getKeySerializer();
          SerializerDefinition remoteValueSerializerDef = remoteStoreDef.getValueSerializer();

          final boolean remoteIsSingleVersionAvro =
              remoteKeySerializerDef.getName().equals("avro-generic")
                  && remoteValueSerializerDef.getName().equals("avro-generic")
                  && remoteKeySerializerDef.getAllSchemaInfoVersions().size() == 1
                  && remoteValueSerializerDef.getAllSchemaInfoVersions().size() == 1;

          if (remoteIsSingleVersionAvro) {
            Schema remoteKeyDef = Schema.parse(remoteKeySerializerDef.getCurrentSchemaInfo());
            Schema remoteValDef = Schema.parse(remoteValueSerializerDef.getCurrentSchemaInfo());
            Schema localKeyDef = Schema.parse(localKeySerializerDef.getCurrentSchemaInfo());
            Schema localValDef = Schema.parse(localValueSerializerDef.getCurrentSchemaInfo());

            if (!(remoteKeyDef.equals(localKeyDef) && remoteValDef.equals(localValDef))) {
              // The serializers differ as parsed schemas too, so this is a genuine mismatch.
              throw new RuntimeException(
                  "Your store definition does not match the store definition that is already in the cluster. Tried to resolve identical schemas between local and remote, but failed. Have: "
                      + newStoreDef
                      + "\nBut expected: "
                      + remoteStoreDef);
            }

            String compressionPolicy = "";
            if (hasCompression) {
              compressionPolicy = "\n\t\t<compression><type>gzip</type></compression>";
            }

            // The schemas are equal even though their strings may differ: rebuild the local
            // definition from the remote schema strings and compare once more.
            newStoreDefXml =
                VoldemortUtils.getStoreDefXml(
                    storeName,
                    replicationFactor,
                    requiredReads,
                    requiredWrites,
                    preferredReads,
                    preferredWrites,
                    avroGenericOpen
                        + remoteKeySerializerDef.getCurrentSchemaInfo()
                        + schemaInfoClose
                        + "\n\t",
                    avroGenericOpen
                        + remoteValueSerializerDef.getCurrentSchemaInfo()
                        + schemaInfoClose
                        + compressionPolicy
                        + "\n\t");

            newStoreDef = VoldemortUtils.getStoreDef(newStoreDefXml);

            if (!remoteStoreDef.equals(newStoreDef)) {
              // Still different: the mismatch lies outside the key/value serializers.
              throw new RuntimeException(
                  "Your store schema is identical, but the store definition does not match. Have: "
                      + newStoreDef
                      + "\nBut expected: "
                      + remoteStoreDef);
            }
          }
        }

        foundStore = true;
        break;
      }

      // The store does not exist yet, so create it.
      if (!foundStore) {
        // A new store must come with a description and owners.
        if (description.length() == 0) {
          throw new RuntimeException(
              "Description field missing in store definition. "
                  + "Please add \"push.store.description\" with a line describing your store");
        }

        if (owners.length() == 0) {
          throw new RuntimeException(
              "Owner field missing in store definition. "
                  + "Please add \"push.store.owners\" with value being comma-separated list of LinkedIn email ids");
        }

        log.info("Could not find store " + storeName + " on Voldemort. Adding it to all nodes ");
        adminClient.addStore(newStoreDef);
      }

      final String finalStoreDefXml =
          VoldemortUtils.getStoreDefXml(
              storeName,
              replicationFactor,
              requiredReads,
              requiredWrites,
              preferredReads,
              preferredWrites,
              keySchema,
              valSchema);
      storeDefs = ImmutableList.of(VoldemortUtils.getStoreDef(finalStoreDefXml));
      cluster = adminClient.getAdminClientCluster();
    } finally {
      adminClient.stop();
    }
  }
Beispiel #20
0
    /**
     * Translates a Data value into its Avro generic representation.
     *
     * <p>If an {@code AvroOverride} is registered for {@code dataSchema}, translation is delegated
     * to its custom translator. Otherwise the dereferenced schema type selects the conversion.
     * Container types (map, array, record, union) recurse into this method, pushing the current
     * key, index, field name or member key onto {@code _path} for the duration of the nested call.
     *
     * <p>Validation problems are recorded through {@code appendMessage} and the affected value is
     * reported as {@code BAD_RESULT}; a record field that fails validation is left unset.
     *
     * @param value the Data value to translate
     * @param dataSchema the Data schema describing {@code value}
     * @param avroSchema the Avro schema the result has to conform to
     * @return the translated Avro value, {@code null} for a valid null value, or
     *     {@code BAD_RESULT} when the value could not be translated
     */
    private Object translate(Object value, DataSchema dataSchema, Schema avroSchema) {
      AvroOverride avroOverride = getAvroOverride(dataSchema);
      if (avroOverride != null) {
        return avroOverride
            .getCustomDataTranslator()
            .dataToAvroGeneric(this, value, dataSchema, avroSchema);
      }

      DataSchema dereferencedDataSchema = dataSchema.getDereferencedDataSchema();
      DataSchema.Type type = dereferencedDataSchema.getType();
      Object result;
      switch (type) {
        case NULL:
          if (value != Data.NULL) {
            appendMessage("value must be null for null schema");
            result = BAD_RESULT;
            break;
          }
          result = null;
          break;
        case BOOLEAN:
          result = ((Boolean) value).booleanValue();
          break;
        case INT:
          result = ((Number) value).intValue();
          break;
        case LONG:
          result = ((Number) value).longValue();
          break;
        case FLOAT:
          result = ((Number) value).floatValue();
          break;
        case DOUBLE:
          result = ((Number) value).doubleValue();
          break;
        case STRING:
          result = new Utf8((String) value);
          break;
        case BYTES:
          result = ByteBuffer.wrap(translateBytes(value));
          break;
        case ENUM:
          String enumValue = value.toString();
          EnumDataSchema enumDataSchema = (EnumDataSchema) dereferencedDataSchema;
          if (!enumDataSchema.getSymbols().contains(enumValue)) {
            appendMessage(
                "enum value %1$s not one of %2$s", enumValue, enumDataSchema.getSymbols());
            result = BAD_RESULT;
            break;
          }
          result = _avroAdapter.createEnumSymbol(avroSchema, enumValue);
          break;
        case FIXED:
          byte[] bytes = translateBytes(value);
          FixedDataSchema fixedDataSchema = (FixedDataSchema) dereferencedDataSchema;
          if (fixedDataSchema.getSize() != bytes.length) {
            appendMessage(
                "ByteString size %1$d != FixedDataSchema size %2$d",
                bytes.length, fixedDataSchema.getSize());
            // Use the same failure sentinel as every other error branch.
            result = BAD_RESULT;
            break;
          }
          GenericData.Fixed fixed = new GenericData.Fixed(avroSchema);
          fixed.bytes(bytes);
          result = fixed;
          break;
        case MAP:
          DataMap map = (DataMap) value;
          DataSchema valueDataSchema = ((MapDataSchema) dereferencedDataSchema).getValues();
          Schema valueAvroSchema = avroSchema.getValueType();
          Map<String, Object> avroMap = new HashMap<String, Object>(map.size());
          for (Map.Entry<String, Object> entry : map.entrySet()) {
            String key = entry.getKey();
            _path.addLast(key);
            Object entryAvroValue = translate(entry.getValue(), valueDataSchema, valueAvroSchema);
            _path.removeLast();
            avroMap.put(key, entryAvroValue);
          }
          result = avroMap;
          break;
        case ARRAY:
          DataList list = (DataList) value;
          DataSchema elementDataSchema = ((ArrayDataSchema) dereferencedDataSchema).getItems();
          Schema elementAvroSchema = avroSchema.getElementType();
          GenericData.Array<Object> avroList =
              new GenericData.Array<Object>(list.size(), avroSchema);
          for (int i = 0; i < list.size(); i++) {
            _path.addLast(i);
            Object entryAvroValue = translate(list.get(i), elementDataSchema, elementAvroSchema);
            _path.removeLast();
            avroList.add(entryAvroValue);
          }
          result = avroList;
          break;
        case RECORD:
          map = (DataMap) value;
          RecordDataSchema recordDataSchema = (RecordDataSchema) dereferencedDataSchema;
          GenericData.Record avroRecord = new GenericData.Record(avroSchema);
          for (RecordDataSchema.Field field : recordDataSchema.getFields()) {
            String fieldName = field.getName();
            DataSchema fieldDataSchema = field.getType();
            Schema.Field avroField = avroSchema.getField(fieldName);
            if (avroField == null) {
              // The Data schema declares a field that the Avro schema does not have.
              // TODO: Whether and how to indicate this condition to clients.
              continue;
            }
            _path.addLast(fieldName);
            Schema fieldAvroSchema = avroField.schema();
            Object fieldValue = map.get(fieldName);
            boolean isOptional = field.getOptional();
            if (isOptional) {
              if (fieldDataSchema.getDereferencedType() != DataSchema.Type.UNION) {
                // Optional non-union field: the Avro field schema is searched as a union for
                // the member matching either the field type or null (when the value is absent).
                if (fieldValue == null) {
                  fieldValue = Data.NULL;
                  fieldDataSchema = DataSchemaConstants.NULL_DATA_SCHEMA;
                }
                Map.Entry<String, Schema> fieldAvroEntry =
                    findUnionMember(fieldDataSchema, fieldAvroSchema);
                if (fieldAvroEntry == null) {
                  _path.removeLast();
                  continue;
                }
                fieldAvroSchema = fieldAvroEntry.getValue();
              } else {
                // already a union
                if (fieldValue == null) {
                  // field is not present
                  fieldValue = Data.NULL;
                  fieldDataSchema = DataSchemaConstants.NULL_DATA_SCHEMA;
                }
              }
            } else {
              if (fieldValue == null) {
                appendMessage("required field is absent");
                _path.removeLast();
                continue;
              }
            }
            Object fieldAvroValue = translate(fieldValue, fieldDataSchema, fieldAvroSchema);
            avroRecord.put(fieldName, fieldAvroValue);
            _path.removeLast();
          }
          result = avroRecord;
          break;
        case UNION:
          UnionDataSchema unionDataSchema = (UnionDataSchema) dereferencedDataSchema;
          String key;
          Object memberValue;
          if (value == Data.NULL) {
            key = DataSchemaConstants.NULL_TYPE;
            memberValue = Data.NULL;
          } else {
            map = (DataMap) value;
            if (map.isEmpty()) {
              // Without this guard iterator().next() would throw NoSuchElementException.
              appendMessage("union value is an empty map, expected a member key");
              result = BAD_RESULT;
              break;
            }
            // Only the first entry is consulted; its key selects the union member.
            Map.Entry<String, Object> entry = map.entrySet().iterator().next();
            key = entry.getKey();
            memberValue = entry.getValue();
          }
          DataSchema memberDataSchema = unionDataSchema.getType(key);
          if (memberDataSchema == null) {
            // NOTE(review): assumes getType returns null for a key that is not a member of the
            // union - confirm against UnionDataSchema.
            appendMessage("union member key %1$s is not a member of %2$s", key, unionDataSchema);
            result = BAD_RESULT;
            break;
          }
          Map.Entry<String, Schema> memberAvroEntry = findUnionMember(memberDataSchema, avroSchema);
          if (memberAvroEntry == null) {
            result = BAD_RESULT;
            break;
          }
          Schema memberAvroSchema = memberAvroEntry.getValue();
          _path.addLast(memberAvroEntry.getKey());
          Object memberAvroValue = translate(memberValue, memberDataSchema, memberAvroSchema);
          _path.removeLast();
          result = memberAvroValue;
          break;
        default:
          appendMessage("schema type unknown %1$s", dereferencedDataSchema.getType());
          result = BAD_RESULT;
          break;
      }
      return result;
    }
 /**
  * Looks up the field of a schema by index number.
  *
  * <p>The index is first converted to a field name with {@code getDummyFieldName}, and that
  * name is then used for the lookup.
  *
  * @param s schema to search
  * @param index index number handed to {@code getDummyFieldName}
  * @return whatever {@code s.getField} yields for the derived name
  */
 public static Field getUDField(Schema s, int index) {
   final String dummyFieldName = getDummyFieldName(index);
   return s.getField(dummyFieldName);
 }