/**
 * Returns a nullable form of the given Avro schema when one is requested. For instance, the
 * schema "int" becomes the union ["null", "int"].
 *
 * @param schema the schema to wrap
 * @param nullable whether the returned schema must admit null
 * @return {@code schema} itself when {@code nullable} is false or when it is already a union
 *     accepted by {@code isAcceptableUnion}; otherwise a new union of {@code NullSchema} and
 *     {@code schema}
 */
public static Schema wrapAsUnion(Schema schema, boolean nullable) {
  // Nothing to do when the caller does not ask for a nullable schema.
  if (!nullable) {
    return schema;
  }

  // A union that is already acceptable is returned as-is.
  boolean acceptableUnion =
      schema.getType().equals(Schema.Type.UNION) && isAcceptableUnion(schema);

  // NOTE(review): a union that fails isAcceptableUnion is wrapped inside another union here;
  // Avro may reject nested unions - confirm this path is intended or unreachable.
  return acceptableUnion ? schema : Schema.createUnion(Arrays.asList(NullSchema, schema));
}
/**
 * Maps a {@link FieldType} to the Avro schema used to represent it.
 *
 * <ul>
 *   <li>STRING maps to Avro string, BINARY to bytes, BOOLEAN to boolean.
 *   <li>DOUBLE and DECIMAL both map to Avro double.
 *   <li>DATE, TIME and INTEGER map to Avro int.
 *   <li>LONG and TIMESTAMP map to Avro long.
 *   <li>Map types become an Avro map whose values are the union ["null", valueSchema].
 *   <li>Array types become an Avro array whose items are the union ["null", elementSchema].
 * </ul>
 *
 * @param type the field type to translate
 * @return the Avro schema for {@code type}
 * @throws IllegalStateException if {@code type} is none of the scalar types above and is neither
 *     a map nor an array
 */
public static Schema getAvroSchema(FieldType type) {
  switch (type) {
    case STRING:
      return Schema.create(Schema.Type.STRING);
    case BINARY:
      return Schema.create(Schema.Type.BYTES);
    case DOUBLE:
    case DECIMAL:
      return Schema.create(Schema.Type.DOUBLE);
    case BOOLEAN:
      return Schema.create(Schema.Type.BOOLEAN);
    case DATE:
    case TIME:
    case INTEGER:
      return Schema.create(Schema.Type.INT);
    case LONG:
    case TIMESTAMP:
      return Schema.create(Schema.Type.LONG);
    default:
      if (type.isMap()) {
        // Map values are nullable: wrap the recursively computed value schema in ["null", V].
        Schema union =
            Schema.createUnion(
                ImmutableList.of(
                    Schema.create(Schema.Type.NULL), getAvroSchema(type.getMapValueType())));
        return Schema.createMap(union);
      }
      if (type.isArray()) {
        // Array elements are nullable: wrap the recursively computed item schema in ["null", E].
        Schema union =
            Schema.createUnion(
                ImmutableList.of(
                    Schema.create(Schema.Type.NULL), getAvroSchema(type.getArrayElementType())));
        return Schema.createArray(union);
      }
      // Name the offending type so the failure can be diagnosed from the stack trace alone.
      throw new IllegalStateException("Unsupported field type for Avro schema: " + type);
  }
}
/**
 * Builds the Avro union schema made of the distinct schemas of {@code unionTypes}.
 *
 * <p>Members whose {@code getAvroSchema()} returns null are skipped. Two members are treated as
 * duplicates when their schemas have the same {@code toString()} rendering; only the first one
 * is kept, so the union preserves the iteration order of {@code unionTypes}.
 *
 * @return a union schema over the distinct, non-null member schemas
 */
Schema computeAvroSchema() {
  HashSet<String> observedSchemas = new HashSet<String>();
  List<Schema> fields = new ArrayList<Schema>();
  for (InferredType it : unionTypes) {
    Schema itS = it.getAvroSchema();
    if (itS == null) {
      continue;
    }
    // add() returns false for an already-seen description, so one call both tests and records.
    // Reuse itS rather than calling getAvroSchema() a second time: the schema that is added is
    // then exactly the one that was de-duplicated, and it is computed only once.
    if (observedSchemas.add(itS.toString())) {
      fields.add(itS);
    }
  }
  return Schema.createUnion(fields);
}
static { Schemas.STRING_LANG.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("label", Schemas.STRING, null, null), new Schema.Field("language", Schemas.STRING, null, null))); Schemas.SHORT.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("short", Schemas.INT, null, null))); Schemas.BYTE.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("byte", Schemas.INT, null, null))); Schemas.BIGINTEGER.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("biginteger", Schemas.STRING, null, null))); Schemas.BIGDECIMAL.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("bigdecimal", Schemas.STRING, null, null))); Schemas.PLAIN_IDENTIFIER.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("identifier", Schemas.STRING, null, null))); Schemas.COMPRESSED_IDENTIFIER.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null))); Schemas.CALENDAR.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("timezone", Schemas.INT, null, null), new Schema.Field("timestamp", Schemas.LONG, null, null))); Schemas.STATEMENT.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("subject", Schemas.IDENTIFIER, null, null), new Schema.Field("predicate", Schemas.IDENTIFIER, null, null), new Schema.Field( "object", Schema.createUnion( ImmutableList.<Schema>of( Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)), null, null), // new Schema.Field("context", Schemas.IDENTIFIER, null, null))); Schemas.PROPERTY.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null), new Schema.Field( "propertyValue", Schema.createUnion( ImmutableList.<Schema>of( Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT, Schemas.SHORT, 
Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER, Schemas.STATEMENT, Schemas.RECORD, Schemas.LIST)), null, null))); Schemas.RECORD.setFields( ImmutableList.<Schema.Field>of( new Schema.Field( "id", Schema.createUnion( ImmutableList.<Schema>of( Schemas.NULL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)), null, null), // new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null, null))); }
private static final class Schemas { /** The namespace for KS-specific AVRO schemas. */ public static final String NAMESPACE = "eu.fbk.knowledgestore"; /** AVRO schema for NULL. */ public static final Schema NULL = Schema.create(Schema.Type.NULL); /** AVRO schema for boolean literals. */ public static final Schema BOOLEAN = Schema.create(Schema.Type.BOOLEAN); /** AVRO schema for string literals. */ public static final Schema STRING = Schema.create(Schema.Type.STRING); /** AVRO schema for string literals with a language. */ public static final Schema STRING_LANG = Schema.createRecord("stringlang", null, Schemas.NAMESPACE, false); /** AVRO schema for long literals. */ public static final Schema LONG = Schema.create(Schema.Type.LONG); /** AVRO schema for int literals. */ public static final Schema INT = Schema.create(Schema.Type.INT); /** AVRO schema for short literals. */ public static final Schema SHORT = Schema.createRecord("short", null, Schemas.NAMESPACE, false); /** AVRO schema for byte literals. */ public static final Schema BYTE = Schema.createRecord("byte", null, Schemas.NAMESPACE, false); /** AVRO schema for double literals. */ public static final Schema DOUBLE = Schema.create(Schema.Type.DOUBLE); /** AVRO schema for float literals. */ public static final Schema FLOAT = Schema.create(Schema.Type.FLOAT); /** AVRO schema for big integer literals. */ public static final Schema BIGINTEGER = Schema.createRecord("biginteger", null, Schemas.NAMESPACE, false); /** AVRO schema for big decimal literals. */ public static final Schema BIGDECIMAL = Schema.createRecord("bigdecimal", null, Schemas.NAMESPACE, false); /** AVRO schema for non-compressed IDs (URIs, BNodes). */ public static final Schema PLAIN_IDENTIFIER = Schema // .createRecord("plainidentifier", null, Schemas.NAMESPACE, false); /** AVRO schema for compressed ID (URIs, BNodes). 
*/ public static final Schema COMPRESSED_IDENTIFIER = Schema // .createRecord("compressedidentifier", null, Schemas.NAMESPACE, false); /** AVRO schema for any ID (URIs, BNodes). */ public static final Schema IDENTIFIER = Schema.createUnion(ImmutableList.<Schema>of(PLAIN_IDENTIFIER, COMPRESSED_IDENTIFIER)); /** AVRO schema for calendar literals. */ public static final Schema CALENDAR = Schema.createRecord("calendar", null, Schemas.NAMESPACE, false); /** AVRO schema for RDF statements. */ public static final Schema STATEMENT = Schema.createRecord("statement", null, Schemas.NAMESPACE, false); /** AVRO schema for record nodes ({@code Record}). */ public static final Schema RECORD = Schema.createRecord("struct", null, Schemas.NAMESPACE, false); /** AVRO schema for generic data model nodes. */ public static final Schema NODE = Schema.createUnion( ImmutableList.<Schema>of( Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER, Schemas.CALENDAR, Schemas.STATEMENT, Schemas.RECORD)); /** AVRO schema for lists of nodes. */ public static final Schema LIST = Schema.createArray(Schemas.NODE); /** AVRO schema for properties of a record node. 
*/ public static final Schema PROPERTY = Schema.createRecord("property", null, Schemas.NAMESPACE, false); private Schemas() {} static { Schemas.STRING_LANG.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("label", Schemas.STRING, null, null), new Schema.Field("language", Schemas.STRING, null, null))); Schemas.SHORT.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("short", Schemas.INT, null, null))); Schemas.BYTE.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("byte", Schemas.INT, null, null))); Schemas.BIGINTEGER.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("biginteger", Schemas.STRING, null, null))); Schemas.BIGDECIMAL.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("bigdecimal", Schemas.STRING, null, null))); Schemas.PLAIN_IDENTIFIER.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("identifier", Schemas.STRING, null, null))); Schemas.COMPRESSED_IDENTIFIER.setFields( ImmutableList.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null))); Schemas.CALENDAR.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("timezone", Schemas.INT, null, null), new Schema.Field("timestamp", Schemas.LONG, null, null))); Schemas.STATEMENT.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("subject", Schemas.IDENTIFIER, null, null), new Schema.Field("predicate", Schemas.IDENTIFIER, null, null), new Schema.Field( "object", Schema.createUnion( ImmutableList.<Schema>of( Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)), null, null), // new Schema.Field("context", Schemas.IDENTIFIER, null, null))); Schemas.PROPERTY.setFields( ImmutableList.<Schema.Field>of( new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null), new Schema.Field( "propertyValue", 
Schema.createUnion( ImmutableList.<Schema>of( Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER, Schemas.STATEMENT, Schemas.RECORD, Schemas.LIST)), null, null))); Schemas.RECORD.setFields( ImmutableList.<Schema.Field>of( new Schema.Field( "id", Schema.createUnion( ImmutableList.<Schema>of( Schemas.NULL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)), null, null), // new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null, null))); } }
/**
 * Checks the Avro schema that {@code BigQueryAvroUtils.toGenericAvroSchema} produces for the
 * test table: "number" must be a plain long, every other scalar field a ["null", T] union,
 * "scion" a nullable record and "associates" an array of records.
 */
@Test
public void testConvertBigQuerySchemaToAvroSchema() {
  TableSchema tableSchema = new TableSchema();
  tableSchema.setFields(fields);
  Schema avroSchema =
      BigQueryAvroUtils.toGenericAvroSchema("testSchema", tableSchema.getFields());

  // Expected schemas shared by several assertions.
  Schema nullType = Schema.create(Type.NULL);
  Schema nullableString = Schema.createUnion(nullType, Schema.create(Type.STRING));
  Schema nullableLong = Schema.createUnion(nullType, Schema.create(Type.LONG));
  Schema nullableDouble = Schema.createUnion(nullType, Schema.create(Type.DOUBLE));
  Schema nullableBoolean = Schema.createUnion(nullType, Schema.create(Type.BOOLEAN));
  Schema nullableBytes = Schema.createUnion(nullType, Schema.create(Type.BYTES));

  // NOTE(review): Avro's createRecord takes (name, doc, namespace, isError, fields), so the
  // package-like string lands in the doc slot and the description in the namespace slot. The
  // strings are kept exactly as the test has always passed them - confirm against
  // toGenericAvroSchema before changing either side.
  Schema scionRecord =
      Schema.createRecord(
          "scion",
          "org.apache.beam.sdk.io.gcp.bigquery",
          "Translated Avro Schema for scion",
          false,
          ImmutableList.of(new Field("species", nullableString, null, (Object) null)));
  Schema associatesRecord =
      Schema.createRecord(
          "associates",
          "org.apache.beam.sdk.io.gcp.bigquery",
          "Translated Avro Schema for associates",
          false,
          ImmutableList.of(new Field("species", nullableString, null, (Object) null)));

  // Scalar fields.
  assertThat(avroSchema.getField("number").schema(), equalTo(Schema.create(Type.LONG)));
  assertThat(avroSchema.getField("species").schema(), equalTo(nullableString));
  assertThat(avroSchema.getField("quality").schema(), equalTo(nullableDouble));
  assertThat(avroSchema.getField("quantity").schema(), equalTo(nullableLong));
  assertThat(avroSchema.getField("birthday").schema(), equalTo(nullableLong));
  assertThat(avroSchema.getField("flighted").schema(), equalTo(nullableBoolean));
  assertThat(avroSchema.getField("sound").schema(), equalTo(nullableBytes));
  assertThat(avroSchema.getField("anniversaryDate").schema(), equalTo(nullableString));
  assertThat(avroSchema.getField("anniversaryDatetime").schema(), equalTo(nullableString));
  assertThat(avroSchema.getField("anniversaryTime").schema(), equalTo(nullableString));

  // Nested record fields.
  assertThat(
      avroSchema.getField("scion").schema(),
      equalTo(Schema.createUnion(nullType, scionRecord)));
  assertThat(
      avroSchema.getField("associates").schema(),
      equalTo(Schema.createArray(associatesRecord)));
}
/**
 * Builds the nullable Avro schema for a field: a union whose first member is the null type and
 * whose second member is the schema returned by {@code getAvroSchema(field)}.
 *
 * @param field the field type to translate
 * @return the union of the null schema and the field's schema
 */
public static Schema generateAvroSchema(FieldType field) {
  Schema nullType = Schema.create(NULL);
  Schema valueType = getAvroSchema(field);
  return Schema.createUnion(Lists.newArrayList(nullType, valueType));
}