@Test
public void testCollections() throws Exception {
  Collection<String> j = Lists.newArrayList();
  j.add("a");
  j.add("b");
  Schema collectionSchema = Schema.createArray(Avros.strings().getSchema());
  GenericData.Array<Utf8> w = new GenericData.Array<Utf8>(2, collectionSchema);
  w.add(new Utf8("a"));
  w.add(new Utf8("b"));
  testInputOutputFn(Avros.collections(Avros.strings()), j, w);
}
/** {@inheritDoc} */
@Override
public Array<CharSequence> convert(List<String> recommendationList) {
  List<CharSequence> recommendationValues = new ArrayList<CharSequence>();
  for (String s : recommendationList) {
    recommendationValues.add(s);
  }
  Array<CharSequence> recommendationArray =
      new Array<CharSequence>(
          Schema.createArray(Schema.create(Schema.Type.STRING)), recommendationValues);
  return recommendationArray;
}
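For reference, the same conversion can be written against the generic Avro API alone. A minimal sketch; the class and method names below are illustrative and not part of the surrounding code:

import java.util.Arrays;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;

public class StringListToAvroArray {
  // Converts a Java List<String> into an Avro array of strings.
  static GenericData.Array<CharSequence> toAvroArray(List<String> values) {
    Schema arraySchema = Schema.createArray(Schema.create(Schema.Type.STRING));
    GenericData.Array<CharSequence> result =
        new GenericData.Array<CharSequence>(values.size(), arraySchema);
    for (String v : values) {
      result.add(v);
    }
    return result;
  }

  public static void main(String[] args) {
    // Prints the converted array.
    System.out.println(toAvroArray(Arrays.asList("a", "b")));
  }
}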
public static Schema getAvroSchema(FieldType type) {
  switch (type) {
    case STRING:
      return Schema.create(Schema.Type.STRING);
    case BINARY:
      return Schema.create(Schema.Type.BYTES);
    case DOUBLE:
    case DECIMAL:
      return Schema.create(Schema.Type.DOUBLE);
    case BOOLEAN:
      return Schema.create(Schema.Type.BOOLEAN);
    case DATE:
    case TIME:
    case INTEGER:
      return Schema.create(Schema.Type.INT);
    case LONG:
    case TIMESTAMP:
      return Schema.create(Schema.Type.LONG);
    default:
      if (type.isMap()) {
        Schema union =
            Schema.createUnion(
                ImmutableList.of(
                    Schema.create(Schema.Type.NULL), getAvroSchema(type.getMapValueType())));
        return Schema.createMap(union);
      }
      if (type.isArray()) {
        Schema union =
            Schema.createUnion(
                ImmutableList.of(
                    Schema.create(Schema.Type.NULL), getAvroSchema(type.getArrayElementType())));
        return Schema.createArray(union);
      }
      throw new IllegalStateException();
  }
}
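In the map and array branches above, the value/element schema is wrapped in a union with null so missing entries can be represented. A small sketch of the shape this produces for an array of strings, using only the core Avro API (no project types involved); the class name is illustrative:

import com.google.common.collect.ImmutableList;
import org.apache.avro.Schema;

public class NullableArraySchemaDemo {
  public static void main(String[] args) {
    // Union of null and string, matching the default branch above.
    Schema nullableString =
        Schema.createUnion(
            ImmutableList.of(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
    Schema arraySchema = Schema.createArray(nullableString);
    // e.g. {"type":"array","items":["null","string"]}
    System.out.println(arraySchema);
  }
}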
private static final class Schemas {

  /** The namespace for KS-specific AVRO schemas. */
  public static final String NAMESPACE = "eu.fbk.knowledgestore";

  /** AVRO schema for NULL. */
  public static final Schema NULL = Schema.create(Schema.Type.NULL);

  /** AVRO schema for boolean literals. */
  public static final Schema BOOLEAN = Schema.create(Schema.Type.BOOLEAN);

  /** AVRO schema for string literals. */
  public static final Schema STRING = Schema.create(Schema.Type.STRING);

  /** AVRO schema for string literals with a language. */
  public static final Schema STRING_LANG =
      Schema.createRecord("stringlang", null, Schemas.NAMESPACE, false);

  /** AVRO schema for long literals. */
  public static final Schema LONG = Schema.create(Schema.Type.LONG);

  /** AVRO schema for int literals. */
  public static final Schema INT = Schema.create(Schema.Type.INT);

  /** AVRO schema for short literals. */
  public static final Schema SHORT = Schema.createRecord("short", null, Schemas.NAMESPACE, false);

  /** AVRO schema for byte literals. */
  public static final Schema BYTE = Schema.createRecord("byte", null, Schemas.NAMESPACE, false);

  /** AVRO schema for double literals. */
  public static final Schema DOUBLE = Schema.create(Schema.Type.DOUBLE);

  /** AVRO schema for float literals. */
  public static final Schema FLOAT = Schema.create(Schema.Type.FLOAT);

  /** AVRO schema for big integer literals. */
  public static final Schema BIGINTEGER =
      Schema.createRecord("biginteger", null, Schemas.NAMESPACE, false);

  /** AVRO schema for big decimal literals. */
  public static final Schema BIGDECIMAL =
      Schema.createRecord("bigdecimal", null, Schemas.NAMESPACE, false);

  /** AVRO schema for non-compressed IDs (URIs, BNodes). */
  public static final Schema PLAIN_IDENTIFIER =
      Schema.createRecord("plainidentifier", null, Schemas.NAMESPACE, false);

  /** AVRO schema for compressed ID (URIs, BNodes). */
  public static final Schema COMPRESSED_IDENTIFIER =
      Schema.createRecord("compressedidentifier", null, Schemas.NAMESPACE, false);

  /** AVRO schema for any ID (URIs, BNodes). */
  public static final Schema IDENTIFIER =
      Schema.createUnion(ImmutableList.<Schema>of(PLAIN_IDENTIFIER, COMPRESSED_IDENTIFIER));

  /** AVRO schema for calendar literals. */
  public static final Schema CALENDAR =
      Schema.createRecord("calendar", null, Schemas.NAMESPACE, false);

  /** AVRO schema for RDF statements. */
  public static final Schema STATEMENT =
      Schema.createRecord("statement", null, Schemas.NAMESPACE, false);

  /** AVRO schema for record nodes ({@code Record}). */
  public static final Schema RECORD =
      Schema.createRecord("struct", null, Schemas.NAMESPACE, false);

  /** AVRO schema for generic data model nodes. */
  public static final Schema NODE =
      Schema.createUnion(
          ImmutableList.<Schema>of(
              Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG, Schemas.INT,
              Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT, Schemas.BIGINTEGER,
              Schemas.BIGDECIMAL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER,
              Schemas.CALENDAR, Schemas.STATEMENT, Schemas.RECORD));

  /** AVRO schema for lists of nodes. */
  public static final Schema LIST = Schema.createArray(Schemas.NODE);

  /** AVRO schema for properties of a record node. */
  public static final Schema PROPERTY =
      Schema.createRecord("property", null, Schemas.NAMESPACE, false);

  private Schemas() {}

  static {
    Schemas.STRING_LANG.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("label", Schemas.STRING, null, null),
            new Schema.Field("language", Schemas.STRING, null, null)));
    Schemas.SHORT.setFields(
        ImmutableList.<Schema.Field>of(new Schema.Field("short", Schemas.INT, null, null)));
    Schemas.BYTE.setFields(
        ImmutableList.<Schema.Field>of(new Schema.Field("byte", Schemas.INT, null, null)));
    Schemas.BIGINTEGER.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("biginteger", Schemas.STRING, null, null)));
    Schemas.BIGDECIMAL.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("bigdecimal", Schemas.STRING, null, null)));
    Schemas.PLAIN_IDENTIFIER.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("identifier", Schemas.STRING, null, null)));
    Schemas.COMPRESSED_IDENTIFIER.setFields(
        ImmutableList.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null)));
    Schemas.CALENDAR.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("timezone", Schemas.INT, null, null),
            new Schema.Field("timestamp", Schemas.LONG, null, null)));
    Schemas.STATEMENT.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("subject", Schemas.IDENTIFIER, null, null),
            new Schema.Field("predicate", Schemas.IDENTIFIER, null, null),
            new Schema.Field(
                "object",
                Schema.createUnion(
                    ImmutableList.<Schema>of(
                        Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG,
                        Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT,
                        Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR,
                        Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)),
                null,
                null),
            new Schema.Field("context", Schemas.IDENTIFIER, null, null)));
    Schemas.PROPERTY.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null),
            new Schema.Field(
                "propertyValue",
                Schema.createUnion(
                    ImmutableList.<Schema>of(
                        Schemas.BOOLEAN, Schemas.STRING, Schemas.STRING_LANG, Schemas.LONG,
                        Schemas.INT, Schemas.SHORT, Schemas.BYTE, Schemas.DOUBLE, Schemas.FLOAT,
                        Schemas.BIGINTEGER, Schemas.BIGDECIMAL, Schemas.CALENDAR,
                        Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER,
                        Schemas.STATEMENT, Schemas.RECORD, Schemas.LIST)),
                null,
                null)));
    Schemas.RECORD.setFields(
        ImmutableList.<Schema.Field>of(
            new Schema.Field(
                "id",
                Schema.createUnion(
                    ImmutableList.<Schema>of(
                        Schemas.NULL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)),
                null,
                null),
            new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null, null)));
  }
}
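After the static initializer has populated the record schemas, nodes can be built with the generic Avro API. A minimal sketch, assuming it runs inside the enclosing class (so the private Schemas type is visible); the literal values are invented for illustration:

// A language-tagged string wrapped as the value of a record-node property.
GenericData.Record id = new GenericData.Record(Schemas.COMPRESSED_IDENTIFIER);
id.put("identifier", 42);

GenericData.Record label = new GenericData.Record(Schemas.STRING_LANG);
label.put("label", "Trento");
label.put("language", "it");

GenericData.Record property = new GenericData.Record(Schemas.PROPERTY);
property.put("propertyURI", id);
property.put("propertyValue", label);

GenericData.Record node = new GenericData.Record(Schemas.RECORD);
node.put("id", id);
node.put("properties", ImmutableList.of(property));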
@Test public void testAllUsingDefaultAvroSchema() throws Exception { File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp"); tmp.deleteOnExit(); tmp.delete(); Path file = new Path(tmp.getPath()); // write file using Parquet APIs ParquetWriter<Map<String, Object>> parquetWriter = new ParquetWriter<Map<String, Object>>( file, new WriteSupport<Map<String, Object>>() { private RecordConsumer recordConsumer; @Override public WriteContext init(Configuration configuration) { return new WriteContext( MessageTypeParser.parseMessageType(TestAvroSchemaConverter.ALL_PARQUET_SCHEMA), new HashMap<String, String>()); } @Override public void prepareForWrite(RecordConsumer recordConsumer) { this.recordConsumer = recordConsumer; } @Override public void write(Map<String, Object> record) { recordConsumer.startMessage(); int index = 0; recordConsumer.startField("myboolean", index); recordConsumer.addBoolean((Boolean) record.get("myboolean")); recordConsumer.endField("myboolean", index++); recordConsumer.startField("myint", index); recordConsumer.addInteger((Integer) record.get("myint")); recordConsumer.endField("myint", index++); recordConsumer.startField("mylong", index); recordConsumer.addLong((Long) record.get("mylong")); recordConsumer.endField("mylong", index++); recordConsumer.startField("myfloat", index); recordConsumer.addFloat((Float) record.get("myfloat")); recordConsumer.endField("myfloat", index++); recordConsumer.startField("mydouble", index); recordConsumer.addDouble((Double) record.get("mydouble")); recordConsumer.endField("mydouble", index++); recordConsumer.startField("mybytes", index); recordConsumer.addBinary( Binary.fromReusedByteBuffer((ByteBuffer) record.get("mybytes"))); recordConsumer.endField("mybytes", index++); recordConsumer.startField("mystring", index); recordConsumer.addBinary(Binary.fromString((String) record.get("mystring"))); recordConsumer.endField("mystring", index++); recordConsumer.startField("mynestedrecord", index); recordConsumer.startGroup(); recordConsumer.startField("mynestedint", 0); recordConsumer.addInteger((Integer) record.get("mynestedint")); recordConsumer.endField("mynestedint", 0); recordConsumer.endGroup(); recordConsumer.endField("mynestedrecord", index++); recordConsumer.startField("myenum", index); recordConsumer.addBinary(Binary.fromString((String) record.get("myenum"))); recordConsumer.endField("myenum", index++); recordConsumer.startField("myarray", index); recordConsumer.startGroup(); recordConsumer.startField("array", 0); for (int val : (int[]) record.get("myarray")) { recordConsumer.addInteger(val); } recordConsumer.endField("array", 0); recordConsumer.endGroup(); recordConsumer.endField("myarray", index++); recordConsumer.startField("myoptionalarray", index); recordConsumer.startGroup(); recordConsumer.startField("array", 0); for (int val : (int[]) record.get("myoptionalarray")) { recordConsumer.addInteger(val); } recordConsumer.endField("array", 0); recordConsumer.endGroup(); recordConsumer.endField("myoptionalarray", index++); recordConsumer.startField("myarrayofoptional", index); recordConsumer.startGroup(); recordConsumer.startField("list", 0); for (Integer val : (Integer[]) record.get("myarrayofoptional")) { recordConsumer.startGroup(); if (val != null) { recordConsumer.startField("element", 0); recordConsumer.addInteger(val); recordConsumer.endField("element", 0); } recordConsumer.endGroup(); } recordConsumer.endField("list", 0); recordConsumer.endGroup(); recordConsumer.endField("myarrayofoptional", index++); 
recordConsumer.startField("myrecordarray", index); recordConsumer.startGroup(); recordConsumer.startField("array", 0); recordConsumer.startGroup(); recordConsumer.startField("a", 0); for (int val : (int[]) record.get("myrecordarraya")) { recordConsumer.addInteger(val); } recordConsumer.endField("a", 0); recordConsumer.startField("b", 1); for (int val : (int[]) record.get("myrecordarrayb")) { recordConsumer.addInteger(val); } recordConsumer.endField("b", 1); recordConsumer.endGroup(); recordConsumer.endField("array", 0); recordConsumer.endGroup(); recordConsumer.endField("myrecordarray", index++); recordConsumer.startField("mymap", index); recordConsumer.startGroup(); recordConsumer.startField("map", 0); recordConsumer.startGroup(); Map<String, Integer> mymap = (Map<String, Integer>) record.get("mymap"); recordConsumer.startField("key", 0); for (String key : mymap.keySet()) { recordConsumer.addBinary(Binary.fromString(key)); } recordConsumer.endField("key", 0); recordConsumer.startField("value", 1); for (int val : mymap.values()) { recordConsumer.addInteger(val); } recordConsumer.endField("value", 1); recordConsumer.endGroup(); recordConsumer.endField("map", 0); recordConsumer.endGroup(); recordConsumer.endField("mymap", index++); recordConsumer.startField("myfixed", index); recordConsumer.addBinary( Binary.fromReusedByteArray((byte[]) record.get("myfixed"))); recordConsumer.endField("myfixed", index++); recordConsumer.endMessage(); } }); Map<String, Object> record = new HashMap<String, Object>(); record.put("myboolean", true); record.put("myint", 1); record.put("mylong", 2L); record.put("myfloat", 3.1f); record.put("mydouble", 4.1); record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8))); record.put("mystring", "hello"); record.put("myenum", "a"); record.put("mynestedint", 1); record.put("myarray", new int[] {1, 2, 3}); record.put("myoptionalarray", new int[] {1, 2, 3}); record.put("myarrayofoptional", new Integer[] {1, null, 2, null, 3}); record.put("myrecordarraya", new int[] {1, 2, 3}); record.put("myrecordarrayb", new int[] {4, 5, 6}); record.put("mymap", ImmutableMap.of("a", 1, "b", 2)); record.put("myfixed", new byte[] {(byte) 65}); parquetWriter.write(record); parquetWriter.close(); Schema nestedRecordSchema = Schema.createRecord("mynestedrecord", null, null, false); nestedRecordSchema.setFields( Arrays.asList(new Schema.Field("mynestedint", Schema.create(Schema.Type.INT), null, null))); GenericData.Record nestedRecord = new GenericRecordBuilder(nestedRecordSchema).set("mynestedint", 1).build(); List<Integer> integerArray = Arrays.asList(1, 2, 3); Schema recordArraySchema = Schema.createRecord("array", null, null, false); recordArraySchema.setFields( Arrays.asList( new Schema.Field("a", Schema.create(Schema.Type.INT), null, null), new Schema.Field("b", Schema.create(Schema.Type.INT), null, null))); GenericRecordBuilder builder = new GenericRecordBuilder(recordArraySchema); List<GenericData.Record> recordArray = new ArrayList<GenericData.Record>(); recordArray.add(builder.set("a", 1).set("b", 4).build()); recordArray.add(builder.set("a", 2).set("b", 5).build()); recordArray.add(builder.set("a", 3).set("b", 6).build()); GenericData.Array<GenericData.Record> genericRecordArray = new GenericData.Array<GenericData.Record>( Schema.createArray(recordArraySchema), recordArray); GenericFixed genericFixed = new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65}); // 3-level lists are deserialized with the extra layer present Schema 
elementSchema = record("list", optionalField("element", primitive(Schema.Type.INT))); GenericRecordBuilder elementBuilder = new GenericRecordBuilder(elementSchema); GenericData.Array<GenericData.Record> genericRecordArrayWithNullIntegers = new GenericData.Array<GenericData.Record>( array(elementSchema), Arrays.asList( elementBuilder.set("element", 1).build(), elementBuilder.set("element", null).build(), elementBuilder.set("element", 2).build(), elementBuilder.set("element", null).build(), elementBuilder.set("element", 3).build())); AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file); GenericRecord nextRecord = reader.read(); assertNotNull(nextRecord); assertEquals(true, nextRecord.get("myboolean")); assertEquals(1, nextRecord.get("myint")); assertEquals(2L, nextRecord.get("mylong")); assertEquals(3.1f, nextRecord.get("myfloat")); assertEquals(4.1, nextRecord.get("mydouble")); assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes")); assertEquals(str("hello"), nextRecord.get("mystring")); assertEquals(str("a"), nextRecord.get("myenum")); assertEquals(nestedRecord, nextRecord.get("mynestedrecord")); assertEquals(integerArray, nextRecord.get("myarray")); assertEquals(integerArray, nextRecord.get("myoptionalarray")); assertEquals(genericRecordArrayWithNullIntegers, nextRecord.get("myarrayofoptional")); assertEquals(genericRecordArray, nextRecord.get("myrecordarray")); assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap")); assertEquals(genericFixed, nextRecord.get("myfixed")); }
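The "3-level list" comment above refers to the element wrapper records that the Parquet list encoding introduces. The `record`, `optionalField`, `primitive`, and `array` calls are test helpers; the sketch below builds the same element schema with only the core Avro API (self-contained, illustrative class name):

import java.util.Arrays;
import org.apache.avro.Schema;

public class ThreeLevelListElementDemo {
  public static void main(String[] args) {
    // Equivalent of record("list", optionalField("element", primitive(Schema.Type.INT))):
    // an element record named "list" with an optional "element" field.
    Schema optionalInt =
        Schema.createUnion(
            Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.INT)));
    Schema elementRecord = Schema.createRecord("list", null, null, false);
    elementRecord.setFields(
        Arrays.asList(new Schema.Field("element", optionalInt, null, null)));
    // Prints the array schema the reader materializes for "myarrayofoptional".
    System.out.println(Schema.createArray(elementRecord));
  }
}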
@Test public void testArrayWithNullValues() throws Exception { Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream()); File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp"); tmp.deleteOnExit(); tmp.delete(); Path file = new Path(tmp.getPath()); GenericData.Record nestedRecord = new GenericRecordBuilder(schema.getField("mynestedrecord").schema()) .set("mynestedint", 1) .build(); List<Integer> integerArray = Arrays.asList(1, 2, 3); GenericData.Array<Integer> genericIntegerArray = new GenericData.Array<Integer>( Schema.createArray(Schema.create(Schema.Type.INT)), integerArray); GenericFixed genericFixed = new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65}); List<Integer> emptyArray = new ArrayList<Integer>(); ImmutableMap emptyMap = new ImmutableMap.Builder<String, Integer>().build(); Schema arrayOfOptionalIntegers = Schema.createArray(optional(Schema.create(Schema.Type.INT))); GenericData.Array<Integer> genericIntegerArrayWithNulls = new GenericData.Array<Integer>(arrayOfOptionalIntegers, Arrays.asList(1, null, 2, null, 3)); GenericData.Record record = new GenericRecordBuilder(schema) .set("mynull", null) .set("myboolean", true) .set("myint", 1) .set("mylong", 2L) .set("myfloat", 3.1f) .set("mydouble", 4.1) .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8))) .set("mystring", "hello") .set("mynestedrecord", nestedRecord) .set("myenum", "a") .set("myarray", genericIntegerArray) .set("myemptyarray", emptyArray) .set("myoptionalarray", genericIntegerArray) .set("myarrayofoptional", genericIntegerArrayWithNulls) .set("mymap", ImmutableMap.of("a", 1, "b", 2)) .set("myemptymap", emptyMap) .set("myfixed", genericFixed) .build(); final AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema); try { writer.write(record); fail("Should not succeed writing an array with null values"); } catch (Exception e) { Assert.assertTrue( "Error message should provide context and help", e.getMessage().contains("parquet.avro.write-old-list-structure")); } finally { writer.close(); } }
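The failure asserted above is parquet-avro's guard against writing arrays containing nulls with the old two-level list encoding. A hedged sketch of how a writer could be configured to use the new list structure instead, reusing `file`, `schema`, and `record` from the test above; it assumes a parquet-avro version that provides the `AvroParquetWriter.builder` API, so treat the exact calls as an assumption:

// Sketch: disable the old list structure so nullable array elements can be written.
Configuration conf = new Configuration();
conf.setBoolean("parquet.avro.write-old-list-structure", false);
ParquetWriter<GenericRecord> newListWriter =
    AvroParquetWriter.<GenericRecord>builder(file)
        .withSchema(schema)
        .withConf(conf)
        .build();
try {
  newListWriter.write(record);
} finally {
  newListWriter.close();
}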
@Test public void testAll() throws Exception { Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream()); File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp"); tmp.deleteOnExit(); tmp.delete(); Path file = new Path(tmp.getPath()); AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema); GenericData.Record nestedRecord = new GenericRecordBuilder(schema.getField("mynestedrecord").schema()) .set("mynestedint", 1) .build(); List<Integer> integerArray = Arrays.asList(1, 2, 3); GenericData.Array<Integer> genericIntegerArray = new GenericData.Array<Integer>( Schema.createArray(Schema.create(Schema.Type.INT)), integerArray); GenericFixed genericFixed = new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65}); List<Integer> emptyArray = new ArrayList<Integer>(); ImmutableMap emptyMap = new ImmutableMap.Builder<String, Integer>().build(); GenericData.Record record = new GenericRecordBuilder(schema) .set("mynull", null) .set("myboolean", true) .set("myint", 1) .set("mylong", 2L) .set("myfloat", 3.1f) .set("mydouble", 4.1) .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8))) .set("mystring", "hello") .set("mynestedrecord", nestedRecord) .set("myenum", "a") .set("myarray", genericIntegerArray) .set("myemptyarray", emptyArray) .set("myoptionalarray", genericIntegerArray) .set("myarrayofoptional", genericIntegerArray) .set("mymap", ImmutableMap.of("a", 1, "b", 2)) .set("myemptymap", emptyMap) .set("myfixed", genericFixed) .build(); writer.write(record); writer.close(); AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file); GenericRecord nextRecord = reader.read(); Object expectedEnumSymbol = compat ? "a" : new GenericData.EnumSymbol(schema.getField("myenum").schema(), "a"); assertNotNull(nextRecord); assertEquals(null, nextRecord.get("mynull")); assertEquals(true, nextRecord.get("myboolean")); assertEquals(1, nextRecord.get("myint")); assertEquals(2L, nextRecord.get("mylong")); assertEquals(3.1f, nextRecord.get("myfloat")); assertEquals(4.1, nextRecord.get("mydouble")); assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes")); assertEquals(str("hello"), nextRecord.get("mystring")); assertEquals(expectedEnumSymbol, nextRecord.get("myenum")); assertEquals(nestedRecord, nextRecord.get("mynestedrecord")); assertEquals(integerArray, nextRecord.get("myarray")); assertEquals(emptyArray, nextRecord.get("myemptyarray")); assertEquals(integerArray, nextRecord.get("myoptionalarray")); assertEquals(integerArray, nextRecord.get("myarrayofoptional")); assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap")); assertEquals(emptyMap, nextRecord.get("myemptymap")); assertEquals(genericFixed, nextRecord.get("myfixed")); }
@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
  // Create a test input file.
  File inputFile = createInputFile();

  // Configure the job input.
  Job job = new Job();
  FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
  job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
  AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
  AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

  // Configure a mapper.
  job.setMapperClass(IndexMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);

  // Configure a reducer.
  job.setReducerClass(IndexReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(AvroValue.class);
  AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

  // Configure the output format.
  job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
  Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
  FileOutputFormat.setOutputPath(job, outputPath);

  // Run the job.
  assertTrue(job.waitForCompletion(true));

  // Verify that the output Avro container file has the expected data.
  File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
  DatumReader<GenericRecord> datumReader =
      new SpecificDatumReader<GenericRecord>(
          AvroKeyValue.getSchema(
              Schema.create(Schema.Type.STRING),
              Schema.createArray(Schema.create(Schema.Type.INT))));
  DataFileReader<GenericRecord> avroFileReader =
      new DataFileReader<GenericRecord>(avroFile, datumReader);

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> appleRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertNotNull(appleRecord.get());
  assertEquals("apple", appleRecord.getKey().toString());
  List<Integer> appleDocs = appleRecord.getValue();
  assertEquals(3, appleDocs.size());
  assertTrue(appleDocs.contains(1));
  assertTrue(appleDocs.contains(2));
  assertTrue(appleDocs.contains(3));

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> bananaRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertNotNull(bananaRecord.get());
  assertEquals("banana", bananaRecord.getKey().toString());
  List<Integer> bananaDocs = bananaRecord.getValue();
  assertEquals(2, bananaDocs.size());
  assertTrue(bananaDocs.contains(1));
  assertTrue(bananaDocs.contains(2));

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> carrotRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertEquals("carrot", carrotRecord.getKey().toString());
  List<Integer> carrotDocs = carrotRecord.getValue();
  assertEquals(1, carrotDocs.size());
  assertTrue(carrotDocs.contains(1));

  assertFalse(avroFileReader.hasNext());
  avroFileReader.close();
}
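The reducer output is read back through `AvroKeyValue`, which wraps a generic record with "key" and "value" fields. A small sketch of constructing such a record directly, mirroring the schemas configured in the job above; it assumes the avro-mapred `AvroKeyValue` helper behaves as used in the test, and the class name is illustrative:

import java.util.Arrays;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.hadoop.io.AvroKeyValue;

public class AvroKeyValueRecordDemo {
  public static void main(String[] args) {
    // Same key/value shape the job above writes: string key, array-of-int value.
    Schema kvSchema =
        AvroKeyValue.getSchema(
            Schema.create(Schema.Type.STRING),
            Schema.createArray(Schema.create(Schema.Type.INT)));
    AvroKeyValue<CharSequence, List<Integer>> entry =
        new AvroKeyValue<CharSequence, List<Integer>>(new GenericData.Record(kvSchema));
    entry.setKey("apple");
    entry.setValue(Arrays.asList(1, 2, 3));
    // entry.get() is the underlying GenericRecord with "key" and "value" fields.
    System.out.println(entry.get());
  }
}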
@Test public void testConvertBigQuerySchemaToAvroSchema() { TableSchema tableSchema = new TableSchema(); tableSchema.setFields(fields); Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema("testSchema", tableSchema.getFields()); assertThat(avroSchema.getField("number").schema(), equalTo(Schema.create(Type.LONG))); assertThat( avroSchema.getField("species").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING)))); assertThat( avroSchema.getField("quality").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.DOUBLE)))); assertThat( avroSchema.getField("quantity").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.LONG)))); assertThat( avroSchema.getField("birthday").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.LONG)))); assertThat( avroSchema.getField("flighted").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BOOLEAN)))); assertThat( avroSchema.getField("sound").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BYTES)))); assertThat( avroSchema.getField("anniversaryDate").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING)))); assertThat( avroSchema.getField("anniversaryDatetime").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING)))); assertThat( avroSchema.getField("anniversaryTime").schema(), equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING)))); assertThat( avroSchema.getField("scion").schema(), equalTo( Schema.createUnion( Schema.create(Type.NULL), Schema.createRecord( "scion", "org.apache.beam.sdk.io.gcp.bigquery", "Translated Avro Schema for scion", false, ImmutableList.of( new Field( "species", Schema.createUnion( Schema.create(Type.NULL), Schema.create(Type.STRING)), null, (Object) null)))))); assertThat( avroSchema.getField("associates").schema(), equalTo( Schema.createArray( Schema.createRecord( "associates", "org.apache.beam.sdk.io.gcp.bigquery", "Translated Avro Schema for associates", false, ImmutableList.of( new Field( "species", Schema.createUnion( Schema.create(Type.NULL), Schema.create(Type.STRING)), null, (Object) null)))))); }
Schema computeAvroSchema() { return Schema.createArray(bodyType.getAvroSchema()); }
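The derived schema simply wraps the body type's schema as the array items type. For illustration only, assuming `bodyType` resolved to a plain Avro string:

// Hypothetical: if bodyType.getAvroSchema() were a string schema,
// computeAvroSchema() would return an array-of-string schema.
Schema arraySchema = Schema.createArray(Schema.create(Schema.Type.STRING));
// arraySchema.toString() -> {"type":"array","items":"string"}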