Example #1
 @Test
 public void testCollections() throws Exception {
   Collection<String> j = Lists.newArrayList();
   j.add("a");
   j.add("b");
   Schema collectionSchema = Schema.createArray(Avros.strings().getSchema());
   GenericData.Array<Utf8> w = new GenericData.Array<Utf8>(2, collectionSchema);
   w.add(new Utf8("a"));
   w.add(new Utf8("b"));
   testInputOutputFn(Avros.collections(Avros.strings()), j, w);
 }
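For context: Avros.collections(Avros.strings()) is the Apache Crunch PType whose Avro representation is exactly the array-of-strings value built above. A minimal plain-Avro sketch (no Crunch; assumes the usual org.apache.avro generic/io imports and an enclosing method that may throw IOException) round-trips the same data through that schema:

 Schema arraySchema = Schema.createArray(Schema.create(Schema.Type.STRING));
 GenericData.Array<Utf8> data = new GenericData.Array<Utf8>(2, arraySchema);
 data.add(new Utf8("a"));
 data.add(new Utf8("b"));

 // Write and read back with the same schema to confirm the mapping.
 ByteArrayOutputStream out = new ByteArrayOutputStream();
 BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
 new GenericDatumWriter<GenericData.Array<Utf8>>(arraySchema).write(data, encoder);
 encoder.flush();
 BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
 GenericData.Array<Utf8> copy =
     new GenericDatumReader<GenericData.Array<Utf8>>(arraySchema).read(null, decoder);
 // copy.equals(data) -> true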
Example #2
 /** {@inheritDoc} */
 @Override
 public Array<CharSequence> convert(List<String> recommendationList) {
   // Copy the strings into a CharSequence-typed list.
   List<CharSequence> recommendationValues = new ArrayList<CharSequence>();
   for (String s : recommendationList) {
     recommendationValues.add(s);
   }
   // Wrap the list in an Avro array whose schema is an array of strings.
   Array<CharSequence> recommendationArray =
       new Array<CharSequence>(
           Schema.createArray(Schema.create(Schema.Type.STRING)), recommendationValues);
   return recommendationArray;
 }
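The converter above relies on the GenericData.Array(Schema, Collection) constructor, which requires an ARRAY schema and copies the supplied collection. A hedged usage sketch (the converter variable is hypothetical):

 // Hypothetical caller; `converter` is an instance of the class declaring convert().
 Array<CharSequence> recs = converter.convert(Arrays.asList("rec1", "rec2"));
 // recs.getSchema().getType() == Schema.Type.ARRAY
 // recs.getSchema().getElementType().getType() == Schema.Type.STRING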
Example #3
 public static Schema getAvroSchema(FieldType type) {
   switch (type) {
     case STRING:
       return Schema.create(Schema.Type.STRING);
     case BINARY:
       return Schema.create(Schema.Type.BYTES);
     case DOUBLE:
     case DECIMAL:
       return Schema.create(Schema.Type.DOUBLE);
     case BOOLEAN:
       return Schema.create(Schema.Type.BOOLEAN);
     case DATE:
     case TIME:
     case INTEGER:
       return Schema.create(Schema.Type.INT);
     case LONG:
     case TIMESTAMP:
       return Schema.create(Schema.Type.LONG);
     default:
       if (type.isMap()) {
         Schema union =
             Schema.createUnion(
                 ImmutableList.of(
                     Schema.create(Schema.Type.NULL), getAvroSchema(type.getMapValueType())));
         return Schema.createMap(union);
       }
       if (type.isArray()) {
         Schema union =
             Schema.createUnion(
                 ImmutableList.of(
                     Schema.create(Schema.Type.NULL), getAvroSchema(type.getArrayElementType())));
         return Schema.createArray(union);
       }
        throw new IllegalStateException("Unsupported field type: " + type);
   }
 }
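Note how the map and array branches wrap the value/element schema in a union with NULL, so individual entries may be null. For instance, the expected result for an array field with string elements would look like this (a sketch, assuming the element FieldType maps to STRING):

 Schema nullableString =
     Schema.createUnion(
         ImmutableList.of(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
 Schema arrayOfNullableString = Schema.createArray(nullableString);
 // arrayOfNullableString.toString() -> {"type":"array","items":["null","string"]}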
Example #4
    static {
      Schemas.STRING_LANG.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("label", Schemas.STRING, null, null),
              new Schema.Field("language", Schemas.STRING, null, null)));
      Schemas.SHORT.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("short", Schemas.INT, null, null)));
      Schemas.BYTE.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("byte", Schemas.INT, null, null)));
      Schemas.BIGINTEGER.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("biginteger", Schemas.STRING, null, null)));
      Schemas.BIGDECIMAL.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("bigdecimal", Schemas.STRING, null, null)));
      Schemas.PLAIN_IDENTIFIER.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("identifier", Schemas.STRING, null, null)));
      Schemas.COMPRESSED_IDENTIFIER.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null)));
      Schemas.CALENDAR.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("timezone", Schemas.INT, null, null),
              new Schema.Field("timestamp", Schemas.LONG, null, null)));

      Schemas.STATEMENT.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("subject", Schemas.IDENTIFIER, null, null),
              new Schema.Field("predicate", Schemas.IDENTIFIER, null, null),
              new Schema.Field(
                  "object",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.BOOLEAN,
                          Schemas.STRING,
                          Schemas.STRING_LANG,
                          Schemas.LONG,
                          Schemas.INT,
                          Schemas.SHORT,
                          Schemas.BYTE,
                          Schemas.DOUBLE,
                          Schemas.FLOAT,
                          Schemas.BIGINTEGER,
                          Schemas.BIGDECIMAL,
                          Schemas.CALENDAR,
                          Schemas.PLAIN_IDENTIFIER,
                          Schemas.COMPRESSED_IDENTIFIER)),
                  null,
                  null), //
              new Schema.Field("context", Schemas.IDENTIFIER, null, null)));

      Schemas.PROPERTY.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null),
              new Schema.Field(
                  "propertyValue",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.BOOLEAN,
                          Schemas.STRING,
                          Schemas.STRING_LANG,
                          Schemas.LONG,
                          Schemas.INT,
                          Schemas.SHORT,
                          Schemas.BYTE,
                          Schemas.DOUBLE,
                          Schemas.FLOAT,
                          Schemas.BIGINTEGER,
                          Schemas.BIGDECIMAL,
                          Schemas.CALENDAR,
                          Schemas.PLAIN_IDENTIFIER,
                          Schemas.COMPRESSED_IDENTIFIER,
                          Schemas.STATEMENT,
                          Schemas.RECORD,
                          Schemas.LIST)),
                  null,
                  null)));

      Schemas.RECORD.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field(
                  "id",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.NULL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)),
                  null,
                  null), //
              new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null, null)));
    }
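The record schemas are declared empty first (see Example #5 below) and only given their fields in this static block because several of them reference each other: PROPERTY contains RECORD and LIST, while RECORD contains an array of PROPERTY. A minimal sketch of the same two-phase pattern, with hypothetical schema names and the same Schema.Field constructor usage as above:

 // Hypothetical mutually referencing records: declare both, then wire the fields.
 Schema node = Schema.createRecord("node", null, "example.ns", false);
 Schema edge = Schema.createRecord("edge", null, "example.ns", false);
 edge.setFields(ImmutableList.of(new Schema.Field("target", node, null, null)));
 node.setFields(ImmutableList.of(
     new Schema.Field("edges", Schema.createArray(edge), null, null)));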
Example #5
  private static final class Schemas {

    /** The namespace for KS-specific AVRO schemas. */
    public static final String NAMESPACE = "eu.fbk.knowledgestore";

    /** AVRO schema for NULL. */
    public static final Schema NULL = Schema.create(Schema.Type.NULL);

    /** AVRO schema for boolean literals. */
    public static final Schema BOOLEAN = Schema.create(Schema.Type.BOOLEAN);

    /** AVRO schema for string literals. */
    public static final Schema STRING = Schema.create(Schema.Type.STRING);

    /** AVRO schema for string literals with a language. */
    public static final Schema STRING_LANG =
        Schema.createRecord("stringlang", null, Schemas.NAMESPACE, false);

    /** AVRO schema for long literals. */
    public static final Schema LONG = Schema.create(Schema.Type.LONG);

    /** AVRO schema for int literals. */
    public static final Schema INT = Schema.create(Schema.Type.INT);

    /** AVRO schema for short literals. */
    public static final Schema SHORT = Schema.createRecord("short", null, Schemas.NAMESPACE, false);

    /** AVRO schema for byte literals. */
    public static final Schema BYTE = Schema.createRecord("byte", null, Schemas.NAMESPACE, false);

    /** AVRO schema for double literals. */
    public static final Schema DOUBLE = Schema.create(Schema.Type.DOUBLE);

    /** AVRO schema for float literals. */
    public static final Schema FLOAT = Schema.create(Schema.Type.FLOAT);

    /** AVRO schema for big integer literals. */
    public static final Schema BIGINTEGER =
        Schema.createRecord("biginteger", null, Schemas.NAMESPACE, false);

    /** AVRO schema for big decimal literals. */
    public static final Schema BIGDECIMAL =
        Schema.createRecord("bigdecimal", null, Schemas.NAMESPACE, false);

    /** AVRO schema for non-compressed IDs (URIs, BNodes). */
    public static final Schema PLAIN_IDENTIFIER =
        Schema //
            .createRecord("plainidentifier", null, Schemas.NAMESPACE, false);

    /** AVRO schema for compressed ID (URIs, BNodes). */
    public static final Schema COMPRESSED_IDENTIFIER =
        Schema //
            .createRecord("compressedidentifier", null, Schemas.NAMESPACE, false);

    /** AVRO schema for any ID (URIs, BNodes). */
    public static final Schema IDENTIFIER =
        Schema.createUnion(ImmutableList.<Schema>of(PLAIN_IDENTIFIER, COMPRESSED_IDENTIFIER));

    /** AVRO schema for calendar literals. */
    public static final Schema CALENDAR =
        Schema.createRecord("calendar", null, Schemas.NAMESPACE, false);

    /** AVRO schema for RDF statements. */
    public static final Schema STATEMENT =
        Schema.createRecord("statement", null, Schemas.NAMESPACE, false);

    /** AVRO schema for record nodes ({@code Record}). */
    public static final Schema RECORD =
        Schema.createRecord("struct", null, Schemas.NAMESPACE, false);

    /** AVRO schema for generic data model nodes. */
    public static final Schema NODE =
        Schema.createUnion(
            ImmutableList.<Schema>of(
                Schemas.BOOLEAN,
                Schemas.STRING,
                Schemas.STRING_LANG,
                Schemas.LONG,
                Schemas.INT,
                Schemas.SHORT,
                Schemas.BYTE,
                Schemas.DOUBLE,
                Schemas.FLOAT,
                Schemas.BIGINTEGER,
                Schemas.BIGDECIMAL,
                Schemas.PLAIN_IDENTIFIER,
                Schemas.COMPRESSED_IDENTIFIER,
                Schemas.CALENDAR,
                Schemas.STATEMENT,
                Schemas.RECORD));

    /** AVRO schema for lists of nodes. */
    public static final Schema LIST = Schema.createArray(Schemas.NODE);

    /** AVRO schema for properties of a record node. */
    public static final Schema PROPERTY =
        Schema.createRecord("property", null, Schemas.NAMESPACE, false);

    private Schemas() {}

    static {
      Schemas.STRING_LANG.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("label", Schemas.STRING, null, null),
              new Schema.Field("language", Schemas.STRING, null, null)));
      Schemas.SHORT.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("short", Schemas.INT, null, null)));
      Schemas.BYTE.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("byte", Schemas.INT, null, null)));
      Schemas.BIGINTEGER.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("biginteger", Schemas.STRING, null, null)));
      Schemas.BIGDECIMAL.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("bigdecimal", Schemas.STRING, null, null)));
      Schemas.PLAIN_IDENTIFIER.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("identifier", Schemas.STRING, null, null)));
      Schemas.COMPRESSED_IDENTIFIER.setFields(
          ImmutableList.<Schema.Field>of(new Schema.Field("identifier", Schemas.INT, null, null)));
      Schemas.CALENDAR.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("timezone", Schemas.INT, null, null),
              new Schema.Field("timestamp", Schemas.LONG, null, null)));

      Schemas.STATEMENT.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("subject", Schemas.IDENTIFIER, null, null),
              new Schema.Field("predicate", Schemas.IDENTIFIER, null, null),
              new Schema.Field(
                  "object",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.BOOLEAN,
                          Schemas.STRING,
                          Schemas.STRING_LANG,
                          Schemas.LONG,
                          Schemas.INT,
                          Schemas.SHORT,
                          Schemas.BYTE,
                          Schemas.DOUBLE,
                          Schemas.FLOAT,
                          Schemas.BIGINTEGER,
                          Schemas.BIGDECIMAL,
                          Schemas.CALENDAR,
                          Schemas.PLAIN_IDENTIFIER,
                          Schemas.COMPRESSED_IDENTIFIER)),
                  null,
                  null), //
              new Schema.Field("context", Schemas.IDENTIFIER, null, null)));

      Schemas.PROPERTY.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field("propertyURI", Schemas.COMPRESSED_IDENTIFIER, null, null),
              new Schema.Field(
                  "propertyValue",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.BOOLEAN,
                          Schemas.STRING,
                          Schemas.STRING_LANG,
                          Schemas.LONG,
                          Schemas.INT,
                          Schemas.SHORT,
                          Schemas.BYTE,
                          Schemas.DOUBLE,
                          Schemas.FLOAT,
                          Schemas.BIGINTEGER,
                          Schemas.BIGDECIMAL,
                          Schemas.CALENDAR,
                          Schemas.PLAIN_IDENTIFIER,
                          Schemas.COMPRESSED_IDENTIFIER,
                          Schemas.STATEMENT,
                          Schemas.RECORD,
                          Schemas.LIST)),
                  null,
                  null)));

      Schemas.RECORD.setFields(
          ImmutableList.<Schema.Field>of(
              new Schema.Field(
                  "id",
                  Schema.createUnion(
                      ImmutableList.<Schema>of(
                          Schemas.NULL, Schemas.PLAIN_IDENTIFIER, Schemas.COMPRESSED_IDENTIFIER)),
                  null,
                  null), //
              new Schema.Field("properties", Schema.createArray(Schemas.PROPERTY), null, null)));
    }
  }
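Once the static initializer has populated the fields, any of these schemas can be rendered as standard Avro JSON, which is a convenient way to inspect the unions and arrays defined above. A small sketch, callable from the enclosing class:

 System.out.println(Schemas.LIST.toString(true));   // pretty-printed array-of-NODE schema
 System.out.println(Schemas.RECORD.toString(true)); // record with its id union and property array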
  @Test
  public void testAllUsingDefaultAvroSchema() throws Exception {
    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    // write file using Parquet APIs
    ParquetWriter<Map<String, Object>> parquetWriter =
        new ParquetWriter<Map<String, Object>>(
            file,
            new WriteSupport<Map<String, Object>>() {

              private RecordConsumer recordConsumer;

              @Override
              public WriteContext init(Configuration configuration) {
                return new WriteContext(
                    MessageTypeParser.parseMessageType(TestAvroSchemaConverter.ALL_PARQUET_SCHEMA),
                    new HashMap<String, String>());
              }

              @Override
              public void prepareForWrite(RecordConsumer recordConsumer) {
                this.recordConsumer = recordConsumer;
              }

              @Override
              public void write(Map<String, Object> record) {
                recordConsumer.startMessage();

                int index = 0;

                recordConsumer.startField("myboolean", index);
                recordConsumer.addBoolean((Boolean) record.get("myboolean"));
                recordConsumer.endField("myboolean", index++);

                recordConsumer.startField("myint", index);
                recordConsumer.addInteger((Integer) record.get("myint"));
                recordConsumer.endField("myint", index++);

                recordConsumer.startField("mylong", index);
                recordConsumer.addLong((Long) record.get("mylong"));
                recordConsumer.endField("mylong", index++);

                recordConsumer.startField("myfloat", index);
                recordConsumer.addFloat((Float) record.get("myfloat"));
                recordConsumer.endField("myfloat", index++);

                recordConsumer.startField("mydouble", index);
                recordConsumer.addDouble((Double) record.get("mydouble"));
                recordConsumer.endField("mydouble", index++);

                recordConsumer.startField("mybytes", index);
                recordConsumer.addBinary(
                    Binary.fromReusedByteBuffer((ByteBuffer) record.get("mybytes")));
                recordConsumer.endField("mybytes", index++);

                recordConsumer.startField("mystring", index);
                recordConsumer.addBinary(Binary.fromString((String) record.get("mystring")));
                recordConsumer.endField("mystring", index++);

                recordConsumer.startField("mynestedrecord", index);
                recordConsumer.startGroup();
                recordConsumer.startField("mynestedint", 0);
                recordConsumer.addInteger((Integer) record.get("mynestedint"));
                recordConsumer.endField("mynestedint", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("mynestedrecord", index++);

                recordConsumer.startField("myenum", index);
                recordConsumer.addBinary(Binary.fromString((String) record.get("myenum")));
                recordConsumer.endField("myenum", index++);

                recordConsumer.startField("myarray", index);
                recordConsumer.startGroup();
                recordConsumer.startField("array", 0);
                for (int val : (int[]) record.get("myarray")) {
                  recordConsumer.addInteger(val);
                }
                recordConsumer.endField("array", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("myarray", index++);

                recordConsumer.startField("myoptionalarray", index);
                recordConsumer.startGroup();
                recordConsumer.startField("array", 0);
                for (int val : (int[]) record.get("myoptionalarray")) {
                  recordConsumer.addInteger(val);
                }
                recordConsumer.endField("array", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("myoptionalarray", index++);

                recordConsumer.startField("myarrayofoptional", index);
                recordConsumer.startGroup();
                recordConsumer.startField("list", 0);
                for (Integer val : (Integer[]) record.get("myarrayofoptional")) {
                  recordConsumer.startGroup();
                  if (val != null) {
                    recordConsumer.startField("element", 0);
                    recordConsumer.addInteger(val);
                    recordConsumer.endField("element", 0);
                  }
                  recordConsumer.endGroup();
                }
                recordConsumer.endField("list", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("myarrayofoptional", index++);

                recordConsumer.startField("myrecordarray", index);
                recordConsumer.startGroup();
                recordConsumer.startField("array", 0);
                recordConsumer.startGroup();
                recordConsumer.startField("a", 0);
                for (int val : (int[]) record.get("myrecordarraya")) {
                  recordConsumer.addInteger(val);
                }
                recordConsumer.endField("a", 0);
                recordConsumer.startField("b", 1);
                for (int val : (int[]) record.get("myrecordarrayb")) {
                  recordConsumer.addInteger(val);
                }
                recordConsumer.endField("b", 1);
                recordConsumer.endGroup();
                recordConsumer.endField("array", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("myrecordarray", index++);

                recordConsumer.startField("mymap", index);
                recordConsumer.startGroup();
                recordConsumer.startField("map", 0);
                recordConsumer.startGroup();
                Map<String, Integer> mymap = (Map<String, Integer>) record.get("mymap");
                recordConsumer.startField("key", 0);
                for (String key : mymap.keySet()) {
                  recordConsumer.addBinary(Binary.fromString(key));
                }
                recordConsumer.endField("key", 0);
                recordConsumer.startField("value", 1);
                for (int val : mymap.values()) {
                  recordConsumer.addInteger(val);
                }
                recordConsumer.endField("value", 1);
                recordConsumer.endGroup();
                recordConsumer.endField("map", 0);
                recordConsumer.endGroup();
                recordConsumer.endField("mymap", index++);

                recordConsumer.startField("myfixed", index);
                recordConsumer.addBinary(
                    Binary.fromReusedByteArray((byte[]) record.get("myfixed")));
                recordConsumer.endField("myfixed", index++);

                recordConsumer.endMessage();
              }
            });
    Map<String, Object> record = new HashMap<String, Object>();
    record.put("myboolean", true);
    record.put("myint", 1);
    record.put("mylong", 2L);
    record.put("myfloat", 3.1f);
    record.put("mydouble", 4.1);
    record.put("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)));
    record.put("mystring", "hello");
    record.put("myenum", "a");
    record.put("mynestedint", 1);
    record.put("myarray", new int[] {1, 2, 3});
    record.put("myoptionalarray", new int[] {1, 2, 3});
    record.put("myarrayofoptional", new Integer[] {1, null, 2, null, 3});
    record.put("myrecordarraya", new int[] {1, 2, 3});
    record.put("myrecordarrayb", new int[] {4, 5, 6});
    record.put("mymap", ImmutableMap.of("a", 1, "b", 2));
    record.put("myfixed", new byte[] {(byte) 65});
    parquetWriter.write(record);
    parquetWriter.close();

    Schema nestedRecordSchema = Schema.createRecord("mynestedrecord", null, null, false);
    nestedRecordSchema.setFields(
        Arrays.asList(new Schema.Field("mynestedint", Schema.create(Schema.Type.INT), null, null)));
    GenericData.Record nestedRecord =
        new GenericRecordBuilder(nestedRecordSchema).set("mynestedint", 1).build();

    List<Integer> integerArray = Arrays.asList(1, 2, 3);

    Schema recordArraySchema = Schema.createRecord("array", null, null, false);
    recordArraySchema.setFields(
        Arrays.asList(
            new Schema.Field("a", Schema.create(Schema.Type.INT), null, null),
            new Schema.Field("b", Schema.create(Schema.Type.INT), null, null)));
    GenericRecordBuilder builder = new GenericRecordBuilder(recordArraySchema);
    List<GenericData.Record> recordArray = new ArrayList<GenericData.Record>();
    recordArray.add(builder.set("a", 1).set("b", 4).build());
    recordArray.add(builder.set("a", 2).set("b", 5).build());
    recordArray.add(builder.set("a", 3).set("b", 6).build());
    GenericData.Array<GenericData.Record> genericRecordArray =
        new GenericData.Array<GenericData.Record>(
            Schema.createArray(recordArraySchema), recordArray);

    GenericFixed genericFixed =
        new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65});

    // 3-level lists are deserialized with the extra layer present
    Schema elementSchema = record("list", optionalField("element", primitive(Schema.Type.INT)));
    GenericRecordBuilder elementBuilder = new GenericRecordBuilder(elementSchema);
    GenericData.Array<GenericData.Record> genericRecordArrayWithNullIntegers =
        new GenericData.Array<GenericData.Record>(
            array(elementSchema),
            Arrays.asList(
                elementBuilder.set("element", 1).build(),
                elementBuilder.set("element", null).build(),
                elementBuilder.set("element", 2).build(),
                elementBuilder.set("element", null).build(),
                elementBuilder.set("element", 3).build()));

    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    GenericRecord nextRecord = reader.read();
    assertNotNull(nextRecord);
    assertEquals(true, nextRecord.get("myboolean"));
    assertEquals(1, nextRecord.get("myint"));
    assertEquals(2L, nextRecord.get("mylong"));
    assertEquals(3.1f, nextRecord.get("myfloat"));
    assertEquals(4.1, nextRecord.get("mydouble"));
    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
    assertEquals(str("hello"), nextRecord.get("mystring"));
    assertEquals(str("a"), nextRecord.get("myenum"));
    assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
    assertEquals(integerArray, nextRecord.get("myarray"));
    assertEquals(integerArray, nextRecord.get("myoptionalarray"));
    assertEquals(genericRecordArrayWithNullIntegers, nextRecord.get("myarrayofoptional"));
    assertEquals(genericRecordArray, nextRecord.get("myrecordarray"));
    assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap"));
    assertEquals(genericFixed, nextRecord.get("myfixed"));
  }
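A detail worth calling out: GenericData.Array implements java.util.List, which is why the assertions above can compare decoded arrays directly against Arrays.asList(1, 2, 3). A small sketch of that equivalence:

 Schema intArraySchema = Schema.createArray(Schema.create(Schema.Type.INT));
 GenericData.Array<Integer> decoded =
     new GenericData.Array<Integer>(intArraySchema, Arrays.asList(1, 2, 3));
 // decoded.equals(Arrays.asList(1, 2, 3)) -> true, per List's element-wise equals semantics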
  @Test
  public void testArrayWithNullValues() throws Exception {
    Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream());

    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    GenericData.Record nestedRecord =
        new GenericRecordBuilder(schema.getField("mynestedrecord").schema())
            .set("mynestedint", 1)
            .build();

    List<Integer> integerArray = Arrays.asList(1, 2, 3);
    GenericData.Array<Integer> genericIntegerArray =
        new GenericData.Array<Integer>(
            Schema.createArray(Schema.create(Schema.Type.INT)), integerArray);

    GenericFixed genericFixed =
        new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65});

    List<Integer> emptyArray = new ArrayList<Integer>();
    ImmutableMap emptyMap = new ImmutableMap.Builder<String, Integer>().build();

    Schema arrayOfOptionalIntegers = Schema.createArray(optional(Schema.create(Schema.Type.INT)));
    GenericData.Array<Integer> genericIntegerArrayWithNulls =
        new GenericData.Array<Integer>(arrayOfOptionalIntegers, Arrays.asList(1, null, 2, null, 3));

    GenericData.Record record =
        new GenericRecordBuilder(schema)
            .set("mynull", null)
            .set("myboolean", true)
            .set("myint", 1)
            .set("mylong", 2L)
            .set("myfloat", 3.1f)
            .set("mydouble", 4.1)
            .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
            .set("mystring", "hello")
            .set("mynestedrecord", nestedRecord)
            .set("myenum", "a")
            .set("myarray", genericIntegerArray)
            .set("myemptyarray", emptyArray)
            .set("myoptionalarray", genericIntegerArray)
            .set("myarrayofoptional", genericIntegerArrayWithNulls)
            .set("mymap", ImmutableMap.of("a", 1, "b", 2))
            .set("myemptymap", emptyMap)
            .set("myfixed", genericFixed)
            .build();

    final AvroParquetWriter<GenericRecord> writer =
        new AvroParquetWriter<GenericRecord>(file, schema);

    try {
      writer.write(record);
      fail("Should not succeed writing an array with null values");
    } catch (Exception e) {
      Assert.assertTrue(
          "Error message should provide context and help",
          e.getMessage().contains("parquet.avro.write-old-list-structure"));
    } finally {
      writer.close();
    }
  }
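The failure here is expected because parquet-avro's legacy two-level list encoding cannot represent null elements; the error message points at the parquet.avro.write-old-list-structure setting. A hedged sketch of the switch (assuming the writer is built with this Configuration):

 // Disable the old two-level list structure so nullable array elements can be written.
 Configuration conf = new Configuration();
 conf.setBoolean("parquet.avro.write-old-list-structure", false);
 // A writer constructed with `conf` should then accept myarrayofoptional containing nulls.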
  @Test
  public void testAll() throws Exception {
    Schema schema = new Schema.Parser().parse(Resources.getResource("all.avsc").openStream());

    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema);

    GenericData.Record nestedRecord =
        new GenericRecordBuilder(schema.getField("mynestedrecord").schema())
            .set("mynestedint", 1)
            .build();

    List<Integer> integerArray = Arrays.asList(1, 2, 3);
    GenericData.Array<Integer> genericIntegerArray =
        new GenericData.Array<Integer>(
            Schema.createArray(Schema.create(Schema.Type.INT)), integerArray);

    GenericFixed genericFixed =
        new GenericData.Fixed(Schema.createFixed("fixed", null, null, 1), new byte[] {(byte) 65});

    List<Integer> emptyArray = new ArrayList<Integer>();
    ImmutableMap emptyMap = new ImmutableMap.Builder<String, Integer>().build();

    GenericData.Record record =
        new GenericRecordBuilder(schema)
            .set("mynull", null)
            .set("myboolean", true)
            .set("myint", 1)
            .set("mylong", 2L)
            .set("myfloat", 3.1f)
            .set("mydouble", 4.1)
            .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
            .set("mystring", "hello")
            .set("mynestedrecord", nestedRecord)
            .set("myenum", "a")
            .set("myarray", genericIntegerArray)
            .set("myemptyarray", emptyArray)
            .set("myoptionalarray", genericIntegerArray)
            .set("myarrayofoptional", genericIntegerArray)
            .set("mymap", ImmutableMap.of("a", 1, "b", 2))
            .set("myemptymap", emptyMap)
            .set("myfixed", genericFixed)
            .build();

    writer.write(record);
    writer.close();

    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    GenericRecord nextRecord = reader.read();

    Object expectedEnumSymbol =
        compat ? "a" : new GenericData.EnumSymbol(schema.getField("myenum").schema(), "a");

    assertNotNull(nextRecord);
    assertEquals(null, nextRecord.get("mynull"));
    assertEquals(true, nextRecord.get("myboolean"));
    assertEquals(1, nextRecord.get("myint"));
    assertEquals(2L, nextRecord.get("mylong"));
    assertEquals(3.1f, nextRecord.get("myfloat"));
    assertEquals(4.1, nextRecord.get("mydouble"));
    assertEquals(ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)), nextRecord.get("mybytes"));
    assertEquals(str("hello"), nextRecord.get("mystring"));
    assertEquals(expectedEnumSymbol, nextRecord.get("myenum"));
    assertEquals(nestedRecord, nextRecord.get("mynestedrecord"));
    assertEquals(integerArray, nextRecord.get("myarray"));
    assertEquals(emptyArray, nextRecord.get("myemptyarray"));
    assertEquals(integerArray, nextRecord.get("myoptionalarray"));
    assertEquals(integerArray, nextRecord.get("myarrayofoptional"));
    assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap"));
    assertEquals(emptyMap, nextRecord.get("myemptymap"));
    assertEquals(genericFixed, nextRecord.get("myfixed"));
  }
  @Test
  public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file has the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader =
        new SpecificDatumReader<GenericRecord>(
            AvroKeyValue.getSchema(
                Schema.create(Schema.Type.STRING),
                Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader =
        new DataFileReader<GenericRecord>(avroFile, datumReader);
    assertTrue(avroFileReader.hasNext());

    AvroKeyValue<CharSequence, List<Integer>> appleRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
  }
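createInputFile() is not shown in this example; presumably it writes an Avro container of int/string key-value pairs in the AvroKeyValue generic layout. A hypothetical sketch of such a helper (file name and contents assumed for illustration):

 File inputFile = new File(mTempDir.getRoot(), "input.avro");
 Schema kvSchema = AvroKeyValue.getSchema(
     Schema.create(Schema.Type.INT), Schema.create(Schema.Type.STRING));
 AvroKeyValue<Integer, CharSequence> kv =
     new AvroKeyValue<Integer, CharSequence>(new GenericData.Record(kvSchema));
 kv.setKey(1);
 kv.setValue("apple banana carrot");
 DataFileWriter<GenericRecord> fileWriter =
     new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(kvSchema));
 fileWriter.create(kvSchema, inputFile);
 fileWriter.append(kv.get());
 fileWriter.close();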
  @Test
  public void testConvertBigQuerySchemaToAvroSchema() {
    TableSchema tableSchema = new TableSchema();
    tableSchema.setFields(fields);
    Schema avroSchema =
        BigQueryAvroUtils.toGenericAvroSchema("testSchema", tableSchema.getFields());

    assertThat(avroSchema.getField("number").schema(), equalTo(Schema.create(Type.LONG)));
    assertThat(
        avroSchema.getField("species").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING))));
    assertThat(
        avroSchema.getField("quality").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.DOUBLE))));
    assertThat(
        avroSchema.getField("quantity").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.LONG))));
    assertThat(
        avroSchema.getField("birthday").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.LONG))));
    assertThat(
        avroSchema.getField("flighted").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BOOLEAN))));
    assertThat(
        avroSchema.getField("sound").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.BYTES))));
    assertThat(
        avroSchema.getField("anniversaryDate").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING))));
    assertThat(
        avroSchema.getField("anniversaryDatetime").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING))));
    assertThat(
        avroSchema.getField("anniversaryTime").schema(),
        equalTo(Schema.createUnion(Schema.create(Type.NULL), Schema.create(Type.STRING))));

    assertThat(
        avroSchema.getField("scion").schema(),
        equalTo(
            Schema.createUnion(
                Schema.create(Type.NULL),
                Schema.createRecord(
                    "scion",
                    "org.apache.beam.sdk.io.gcp.bigquery",
                    "Translated Avro Schema for scion",
                    false,
                    ImmutableList.of(
                        new Field(
                            "species",
                            Schema.createUnion(
                                Schema.create(Type.NULL), Schema.create(Type.STRING)),
                            null,
                            (Object) null))))));
    assertThat(
        avroSchema.getField("associates").schema(),
        equalTo(
            Schema.createArray(
                Schema.createRecord(
                    "associates",
                    "org.apache.beam.sdk.io.gcp.bigquery",
                    "Translated Avro Schema for associates",
                    false,
                    ImmutableList.of(
                        new Field(
                            "species",
                            Schema.createUnion(
                                Schema.create(Type.NULL), Schema.create(Type.STRING)),
                            null,
                            (Object) null))))));
  }
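The `fields` list passed to tableSchema.setFields(...) is defined elsewhere in the test class; judging from the assertions, it is a list of BigQuery TableFieldSchema entries roughly like the following sketch (only a few entries shown, names and modes assumed for illustration):

 List<TableFieldSchema> fields = ImmutableList.of(
     new TableFieldSchema().setName("number").setType("INTEGER").setMode("REQUIRED"),
     new TableFieldSchema().setName("species").setType("STRING").setMode("NULLABLE"),
     new TableFieldSchema().setName("associates").setType("RECORD").setMode("REPEATED")
         .setFields(ImmutableList.of(
             new TableFieldSchema().setName("species").setType("STRING").setMode("NULLABLE"))));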
Example #11
 Schema computeAvroSchema() {
   return Schema.createArray(bodyType.getAvroSchema());
 }
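computeAvroSchema() simply wraps the element schema exposed by bodyType in an Avro array. For instance, if bodyType.getAvroSchema() returned a string schema (an assumption about bodyType), the result would be:

 Schema elements = Schema.create(Schema.Type.STRING); // assumed result of bodyType.getAvroSchema()
 Schema computed = Schema.createArray(elements);
 // computed.toString() -> {"type":"array","items":"string"}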