  @Test
  public void testMapWithNulls() throws Exception {
    Schema schema =
        new Schema.Parser().parse(Resources.getResource("map_with_nulls.avsc").openStream());

    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema);

    // Write a record with a null value
    Map<CharSequence, Integer> map = new HashMap<CharSequence, Integer>();
    map.put(str("thirty-four"), 34);
    map.put(str("eleventy-one"), null);
    map.put(str("one-hundred"), 100);

    GenericData.Record record = new GenericRecordBuilder(schema).set("mymap", map).build();
    writer.write(record);
    writer.close();

    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    GenericRecord nextRecord = reader.read();

    assertNotNull(nextRecord);
    assertEquals(map, nextRecord.get("mymap"));
  }
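
The test above loads its schema from the map_with_nulls.avsc resource, which is not shown here. A minimal sketch of a schema that would satisfy it, with the record name assumed: one map field whose values are a union of null and int, so the null-valued entry can round-trip.

// Hypothetical stand-in for map_with_nulls.avsc; the real resource may differ.
Schema schema = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"MapWithNulls\", \"fields\": ["
        + " {\"name\": \"mymap\", \"type\": {\"type\": \"map\", \"values\": [\"null\", \"int\"]}}"
        + "]}");
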
  @Test
  public void testMapWithUtf8Key() throws Exception {
    Schema schema = new Schema.Parser().parse(Resources.getResource("map.avsc").openStream());

    File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
    tmp.deleteOnExit();
    tmp.delete();
    Path file = new Path(tmp.getPath());

    AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(file, schema);

    // Write a record with a map with Utf8 keys.
    GenericData.Record record =
        new GenericRecordBuilder(schema)
            .set("mymap", ImmutableMap.of(new Utf8("a"), 1, new Utf8("b"), 2))
            .build();
    writer.write(record);
    writer.close();

    AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(testConf, file);
    GenericRecord nextRecord = reader.read();

    assertNotNull(nextRecord);
    assertEquals(ImmutableMap.of(str("a"), 1, str("b"), 2), nextRecord.get("mymap"));
  }
  @Test
  public void testRead_SpecificReader() throws IOException {
    GenericRecord savedRecord = new GenericData.Record(schema);
    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord));

    AvroFileReaderFactory<Person> specificReader =
        new AvroFileReaderFactory<Person>(Avros.records(Person.class), new Configuration());
    Iterator<Person> recordIterator =
        specificReader.read(
            FileSystem.getLocal(new Configuration()), new Path(this.avroFile.getAbsolutePath()));

    Person expectedPerson = new Person();
    expectedPerson.setAge(42);
    expectedPerson.setName("John Doe");
    List<CharSequence> siblingNames = Lists.newArrayList();
    siblingNames.add("Jimmy");
    siblingNames.add("Jane");
    expectedPerson.setSiblingnames(siblingNames);

    Person person = recordIterator.next();

    assertEquals(expectedPerson, person);
    assertFalse(recordIterator.hasNext());
  }
Example #4
 private static GenericRecord buildUser(Schema schema, String name, String office, String color) {
   GenericRecord user = new GenericData.Record(schema);
   user.put("name", name);
   user.put("office", office);
   if (color != null) user.put("favorite_color", color);
   return user;
 }
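
buildUser only sets favorite_color when a value is supplied, which implies the field is optional in the schema. A hedged sketch of a compatible schema and call, with everything beyond the three field names assumed:

// Hypothetical user schema: favorite_color is a nullable union with a null default,
// so buildUser may legitimately skip it.
Schema userSchema = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + " {\"name\": \"name\", \"type\": \"string\"},"
        + " {\"name\": \"office\", \"type\": \"string\"},"
        + " {\"name\": \"favorite_color\", \"type\": [\"null\", \"string\"], \"default\": null}"
        + "]}");
GenericRecord user = buildUser(userSchema, "Alice", "HQ", null);
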
 private void buildAttribute(Object element, LuceneWorksBuilder hydrator) {
   if (element instanceof GenericRecord) {
     GenericRecord record = (GenericRecord) element;
     String name = record.getSchema().getName();
     if ("TokenTrackingAttribute".equals(name)) {
       @SuppressWarnings("unchecked")
       List<Integer> positionList = (List<Integer>) record.get("positions");
       hydrator.addTokenTrackingAttribute(positionList);
     } else if ("CharTermAttribute".equals(name)) {
       hydrator.addCharTermAttribute((CharSequence) record.get("sequence"));
     } else if ("PayloadAttribute".equals(name)) {
       hydrator.addPayloadAttribute(asByteArray(record, "payload"));
     } else if ("KeywordAttribute".equals(name)) {
       hydrator.addKeywordAttribute(asBoolean(record, "isKeyword"));
     } else if ("PositionIncrementAttribute".equals(name)) {
       hydrator.addPositionIncrementAttribute(asInt(record, "positionIncrement"));
     } else if ("FlagsAttribute".equals(name)) {
       hydrator.addFlagsAttribute(asInt(record, "flags"));
     } else if ("TypeAttribute".equals(name)) {
       hydrator.addTypeAttribute(asString(record, "type"));
     } else if ("OffsetAttribute".equals(name)) {
       hydrator.addOffsetAttribute(asInt(record, "startOffset"), asInt(record, "endOffset"));
     } else {
       throw log.unknownAttributeSerializedRepresentation(name);
     }
   } else if (element instanceof ByteBuffer) {
     hydrator.addSerializedAttribute(asByteArray((ByteBuffer) element));
   } else {
     throw log.unknownAttributeSerializedRepresentation(element.getClass().getName());
   }
 }
Example #6
 private void assertField(GenericRecord field) {
   assertThat(field.get("name")).isInstanceOf(Utf8.class);
   assertThat(field.get("name").toString()).isEqualTo(field.getSchema().getName());
   assertThat(field.get("boost")).isEqualTo(2.3f);
   assertThat(field.get("omitNorms")).isEqualTo(true);
   assertThat(field.get("omitTermFreqAndPositions")).isEqualTo(true);
 }
 private void processRecordField(
     CommonRecord record, GenericRecord deltaRecord, String fieldName) {
   CommonRecord nextRecord = null;
   CommonValue nextValue = record.getField(fieldName);
   if (nextValue != null
       && nextValue.isRecord()
       && nextValue
           .getRecord()
           .getSchema()
           .getFullName()
           .equals(deltaRecord.getSchema().getFullName())) {
     nextRecord = nextValue.getRecord();
     GenericFixed uuidFixed = (GenericFixed) deltaRecord.get(UUID);
     if (uuidFixed != null) {
       UUID uuid = AvroGenericUtils.createUuidFromFixed(uuidFixed);
       // Checking if the uuid was changed
       if (!uuid.equals(nextRecord.getUuid())) {
         records.remove(nextRecord.getUuid());
         records.put(uuid, nextRecord);
         nextRecord.setUuid(uuid);
       }
     }
   } else {
     nextRecord = createCommonRecord(deltaRecord);
     record.setField(fieldName, commonFactory.createCommonValue(nextRecord));
   }
   updateRecord(nextRecord, deltaRecord);
 }
  /**
   * Uses the resource key (optional) and the REST JSON entry as templates and fills them in using
   * the Avro record as the reference. e.g.: REST JSON entry HOCON template:
   * AccountId=${sf_account_id},Member_Id__c=${member_id} Avro:
   * {"sf_account_id":{"string":"0016000000UiCYHAA3"},"member_id":{"long":296458833}}
   *
   * <p>Converted JSON: {"AccountId":"0016000000UiCYHAA3","Member_Id__c":296458833}
   *
   * <p>Because the approach is template based, it can produce a nested JSON structure even when
   * the Avro record is flat (or vice versa).
   *
   * <p>e.g.: REST resource template: /sobject/account/memberId/${member_id} Avro:
   * {"sf_account_id":{"string":"0016000000UiCYHAA3"},"member_id":{"long":296458833}} Converted
   * resource: /sobject/account/memberId/296458833
   *
   * <p>The converted resource is used to form the endpoint, e.g.
   * http://www.server.com:9090/sobject/account/memberId/296458833
   *
   * <p>{@inheritDoc}
   *
   * @see gobblin.converter.Converter#convertRecord(java.lang.Object, java.lang.Object,
   *     gobblin.configuration.WorkUnitState)
   */
  @Override
  public Iterable<RestEntry<JsonObject>> convertRecord(
      Void outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
      throws DataConversionException {

    Config srcConfig =
        ConfigFactory.parseString(
            inputRecord.toString(), ConfigParseOptions.defaults().setSyntax(ConfigSyntax.JSON));

    String resourceKey = workUnit.getProp(CONVERTER_AVRO_REST_ENTRY_RESOURCE_KEY, "");
    if (!StringUtils.isEmpty(resourceKey)) {
      final String dummyKey = "DUMMY";
      Config tmpConfig =
          ConfigFactory.parseString(dummyKey + "=" + resourceKey).resolveWith(srcConfig);
      resourceKey = tmpConfig.getString(dummyKey);
    }

    String hoconInput = workUnit.getProp(CONVERTER_AVRO_REST_JSON_ENTRY_TEMPLATE);
    if (StringUtils.isEmpty(hoconInput)) {
      return new SingleRecordIterable<>(
          new RestEntry<>(resourceKey, parser.parse(inputRecord.toString()).getAsJsonObject()));
    }

    Config destConfig = ConfigFactory.parseString(hoconInput).resolveWith(srcConfig);
    JsonObject json =
        parser.parse(destConfig.root().render(ConfigRenderOptions.concise())).getAsJsonObject();
    return new SingleRecordIterable<>(new RestEntry<>(resourceKey, json));
  }
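
The heart of the conversion is Typesafe Config substitution: the Avro record's JSON form is parsed as the resolution source, and the HOCON template is resolved against it. A standalone sketch of that step, with the values from the Javadoc example hard-coded for illustration:

// Minimal sketch of the substitution mechanism used above; the source config stands in
// for inputRecord.toString().
Config source = ConfigFactory.parseString(
    "{\"sf_account_id\": \"0016000000UiCYHAA3\", \"member_id\": 296458833}",
    ConfigParseOptions.defaults().setSyntax(ConfigSyntax.JSON));
Config resolved = ConfigFactory
    .parseString("AccountId=${sf_account_id},Member_Id__c=${member_id}")
    .resolveWith(source);
// Renders (field order may vary) as {"AccountId":"0016000000UiCYHAA3","Member_Id__c":296458833}
String json = resolved.root().render(ConfigRenderOptions.concise());
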
Example #9
  /**
   * Get a map of field names to default values for an Avro schema.
   *
   * @param avroRecordSchema The schema from which to extract field names and default values.
   * @return A map from field name to default value.
   */
  public static Map<String, Object> getDefaultValueMap(Schema avroRecordSchema) {
    List<Field> defaultFields = new ArrayList<Field>();
    for (Field f : avroRecordSchema.getFields()) {
      if (f.defaultValue() != null) {
        // Need to create a new Field here or we will get
        // org.apache.avro.AvroRuntimeException: Field already used:
        // schemaVersion
        defaultFields.add(new Field(f.name(), f.schema(), f.doc(), f.defaultValue(), f.order()));
      }
    }

    Schema defaultSchema = Schema.createRecord(defaultFields);
    Schema emptyRecordSchema = Schema.createRecord(new ArrayList<Field>());
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(emptyRecordSchema);
    DatumReader<GenericRecord> reader =
        new GenericDatumReader<GenericRecord>(emptyRecordSchema, defaultSchema);

    GenericRecord emptyRecord = new GenericData.Record(emptyRecordSchema);
    GenericRecord defaultRecord =
        AvroUtils.readAvroEntity(AvroUtils.writeAvroEntity(emptyRecord, writer), reader);

    Map<String, Object> defaultValueMap = new HashMap<String, Object>();
    for (Field f : defaultFields) {
      defaultValueMap.put(f.name(), defaultRecord.get(f.name()));
    }
    return defaultValueMap;
  }
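
A hedged usage sketch of getDefaultValueMap, against a hypothetical schema in which only one field declares a default:

// Hypothetical schema: "count" declares a default, "id" does not, so only "count"
// appears in the returned map.
Schema schema = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"Example\", \"fields\": ["
        + " {\"name\": \"id\", \"type\": \"string\"},"
        + " {\"name\": \"count\", \"type\": \"int\", \"default\": 0}"
        + "]}");
Map<String, Object> defaults = getDefaultValueMap(schema);  // e.g. {count=0}
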
  private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
    String AVRO_SCHEMA =
        "{\n"
            + "\"type\": \"record\",\n"
            + "\"name\": \"Employee\",\n"
            + "\"fields\": [\n"
            + " {\"name\": \"name\", \"type\": \"string\"},\n"
            + " {\"name\": \"age\", \"type\": \"int\"},\n"
            + " {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
            + " {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n"
            + "]}";
    Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    GenericRecord e1 = new GenericData.Record(schema);
    e1.put("name", name);
    e1.put("age", age);
    e1.put("emails", emails);
    e1.put("boss", null);

    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(schema, out);
    dataFileWriter.append(e1);
    dataFileWriter.close();
    return out.toByteArray();
  }
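
createAvroData returns a complete Avro container file as a byte array, so it can be read back with DataFileStream; a minimal sketch (the argument values are made up):

// Read the container-format bytes back; the schema is taken from the file header.
byte[] bytes = createAvroData("Jane", 31, Arrays.asList("jane@example.com"));
try (DataFileStream<GenericRecord> in = new DataFileStream<>(
    new ByteArrayInputStream(bytes), new GenericDatumReader<GenericRecord>())) {
  for (GenericRecord rec : in) {
    System.out.println(rec.get("name") + " is " + rec.get("age"));
  }
}
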
Example #11
 private GenericRecord createField(Schema schema) {
   GenericRecord field = new GenericData.Record(schema);
   field.put("name", schema.getName());
   field.put("boost", 2.3f);
   field.put("omitNorms", true);
   field.put("omitTermFreqAndPositions", true);
   return field;
 }
 @Override
 protected void processEvent(GenericRecord record, EventAggregator eventAggregator) {
   System.out.println(schemaName + "-Stream: " + record.toString());
   long userId = (long) record.get("userId");
   long time = (long) record.get("time");
   String contactHash = record.get("contactHash").toString();
   int msgLength = (int) record.get("msgLength");
   eventAggregator.processSmsReceived(userId, time, contactHash, msgLength);
 }
  @Override
  public void deserialize(byte[] data, LuceneWorksBuilder hydrator) {
    final ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
    final int majorVersion = inputStream.read();
    final int minorVersion = inputStream.read();
    final Protocol protocol = protocols.getProtocol(majorVersion, minorVersion);

    Decoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null);
    GenericDatumReader<GenericRecord> reader =
        new GenericDatumReader<>(protocol.getType("Message"));
    GenericRecord result;
    try {
      result = reader.read(null, decoder);
    } catch (IOException e) {
      throw log.unableToDeserializeAvroStream(e);
    }

    classReferences = asListOfString(result, "classReferences");
    final List<GenericRecord> operations = asListOfGenericRecords(result, "operations");
    final ConversionContext conversionContext = new ContextualExceptionBridgeHelper();
    for (GenericRecord operation : operations) {
      String schema = operation.getSchema().getName();
      if ("OptimizeAll".equals(schema)) {
        hydrator.addOptimizeAll();
      } else if ("PurgeAll".equals(schema)) {
        hydrator.addPurgeAllLuceneWork(asClass(operation, "class"));
      } else if ("Flush".equals(schema)) {
        hydrator.addFlush();
      } else if ("Delete".equals(schema)) {
        processId(operation, hydrator);
        hydrator.addDeleteLuceneWork(asClass(operation, "class"), conversionContext);
      } else if ("DeleteByQuery".equals(schema)) {
        String entityClassName = asClass(operation, "class");
        int queryKey = asInt(operation, "key");
        DeleteByQuerySupport.StringToQueryMapper mapper =
            DeleteByQuerySupport.getStringToQueryMapper(queryKey);
        List<Utf8> stringList = asListOfString(operation, "query");
        String[] query = new String[stringList.size()];
        for (int i = 0; i < stringList.size(); ++i) {
          query[i] = stringList.get(i).toString();
        }
        hydrator.addDeleteByQueryLuceneWork(entityClassName, mapper.fromString(query));
      } else if ("Add".equals(schema)) {
        buildLuceneDocument(asGenericRecord(operation, "document"), hydrator);
        Map<String, String> analyzers = getAnalyzers(operation);
        processId(operation, hydrator);
        hydrator.addAddLuceneWork(asClass(operation, "class"), analyzers, conversionContext);
      } else if ("Update".equals(schema)) {
        buildLuceneDocument(asGenericRecord(operation, "document"), hydrator);
        Map<String, String> analyzers = getAnalyzers(operation);
        processId(operation, hydrator);
        hydrator.addUpdateLuceneWork(asClass(operation, "class"), analyzers, conversionContext);
      } else {
        throw log.cannotDeserializeOperation(schema);
      }
    }
  }
 private Value decodeValue(final Object generic) {
   if (generic instanceof GenericRecord) {
     final GenericRecord record = (GenericRecord) generic;
     final Schema schema = record.getSchema();
     if (schema.equals(Schemas.COMPRESSED_IDENTIFIER) || schema.equals(Schemas.PLAIN_IDENTIFIER)) {
       return decodeIdentifier(record);
     }
   }
   return decodeLiteral(generic);
 }
 private Statement decodeStatement(final GenericRecord record) {
   final Resource subj = decodeIdentifier((GenericRecord) record.get(0));
   final URI pred = (URI) decodeIdentifier((GenericRecord) record.get(1));
   final Value obj = decodeValue(record.get(2));
   final Resource ctx = decodeIdentifier((GenericRecord) record.get(3));
   if (ctx == null) {
     return this.factory.createStatement(subj, pred, obj);
   } else {
     return this.factory.createStatement(subj, pred, obj, ctx);
   }
 }
 private CommonRecord createCommonRecord(GenericRecord avroRecord) {
   GenericFixed uuidFixed = (GenericFixed) avroRecord.get(UUID);
   if (uuidFixed != null) {
     UUID uuid = AvroGenericUtils.createUuidFromFixed(uuidFixed);
     CommonRecord newRecord = commonFactory.createCommonRecord(uuid, avroRecord.getSchema());
     records.put(uuid, newRecord);
     return newRecord;
   } else {
     return commonFactory.createCommonRecord(avroRecord.getSchema());
   }
 }
 private void assertEqualsWithGeneric(List<Bird> expected, List<GenericRecord> actual) {
   assertEquals(expected.size(), actual.size());
   for (int i = 0; i < expected.size(); i++) {
     Bird fixed = expected.get(i);
     GenericRecord generic = actual.get(i);
     assertEquals(fixed.number, generic.get("number"));
     assertEquals(fixed.quality, generic.get("quality").toString()); // From Avro util.Utf8
     assertEquals(fixed.quantity, generic.get("quantity"));
     assertEquals(fixed.species, generic.get("species").toString());
   }
 }
  private GenericRecord convertRecord(String inputRecord) {
    Gson gson = new Gson();
    JsonElement element = gson.fromJson(inputRecord, JsonElement.class);
    Map<String, Object> fields = gson.fromJson(element, FIELD_ENTRY_TYPE);
    GenericRecord outputRecord = new GenericData.Record(schema);
    for (Map.Entry<String, Object> entry : fields.entrySet()) {
      outputRecord.put(entry.getKey(), entry.getValue());
    }

    return outputRecord;
  }
 private void processId(GenericRecord operation, LuceneWorksBuilder hydrator) {
   GenericRecord id = (GenericRecord) operation.get("id");
   Object value = id.get("value");
   if (value instanceof ByteBuffer) {
     hydrator.addIdAsJavaSerialized(asByteArray((ByteBuffer) value));
   } else if (value instanceof Utf8) {
     hydrator.addId(value.toString());
   } else {
     // the rest are serialized objects
     hydrator.addId((Serializable) value);
   }
 }
 public void execute(TridentTuple tuple, TridentCollector collector) {
   GenericRecord docEntry = new GenericData.Record(schema);
   docEntry.put("docid", tuple.getStringByField("documentId"));
   docEntry.put("time", Time.currentTimeMillis());
   docEntry.put("line", tuple.getStringByField("document"));
   try {
     dataFileWriter.append(docEntry);
     dataFileWriter.flush();
   } catch (IOException e) {
     LOG.error("Error writing to document record: " + e);
     throw new RuntimeException(e);
   }
 }
    public void reduce(
        Utf8 key, Iterable<Long> values, AvroCollector<GenericRecord> collector, Reporter reporter)
        throws IOException {
      long sum = 0;
      for (Long val : values) {
        sum += val;
      }

      GenericRecord value = new GenericData.Record(OUTPUT_SCHEMA);
      value.put("shape", key);
      value.put("count", sum);

      collector.collect(value);
    }
Example #22
  @Override
  public void execute(Tuple inputTuple) {
    /* Processing tuples of the shape
    (DATASOURCE_ID, TIMESTAMP_FIELD, CONTENT_FIELD) */

    // get datasource
    String datasource = inputTuple.getStringByField(RestIngestionSpout.DATASOURCE_ID);
    // compute month
    long timestamp = inputTuple.getLongByField(RestIngestionSpout.TIMESTAMP_FIELD);
    // this computation is completely stateless
    String month = timestampToMonth(timestamp);

    // now get the DataFileWriter
    DataFileWriter<GenericRecord> writer = null;
    try {
      writer = this.writersCache.get(DatasourceMonth.create(datasource, month));
    } catch (ExecutionException ee) {
      LOGGER.error(
          "Error getting DataFileWriter for tuple for datasource "
              + datasource
              + " and timestamp "
              + timestamp
              + " : "
              + ee.getMessage());
      this.collector.fail(inputTuple);
      return;
    }

    // create and write a new record
    GenericRecord newDataRecord = new GenericData.Record(AVRO_SCHEMA);
    newDataRecord.put(AVRO_TIMESTAMP_FIELD, timestamp);
    newDataRecord.put(
        AVRO_CONTENT_FIELD, inputTuple.getStringByField(RestIngestionSpout.CONTENT_FIELD));
    try {
      writer.append(newDataRecord);
    } catch (IOException ioe) {
      LOGGER.error(
          "Error writing Avro record for datasource "
              + datasource
              + " and timestamp "
              + timestamp
              + " : "
              + ioe.getMessage());
      this.collector.fail(inputTuple);
      return;
    }

    // ack this tuple as successfully processed
    this.collector.ack(inputTuple);
  }
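
AVRO_SCHEMA, AVRO_TIMESTAMP_FIELD and AVRO_CONTENT_FIELD are defined elsewhere in the bolt; a plausible sketch of those constants, consistent with the two puts above (record and field names are assumptions):

// Assumed constants: a long ingestion timestamp plus the raw content string.
private static final String AVRO_TIMESTAMP_FIELD = "timestamp";
private static final String AVRO_CONTENT_FIELD = "content";
private static final Schema AVRO_SCHEMA = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"IngestedEvent\", \"fields\": ["
        + " {\"name\": \"timestamp\", \"type\": \"long\"},"
        + " {\"name\": \"content\", \"type\": \"string\"}"
        + "]}");
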
 private Object decodeNode(final Object generic) {
   if (generic instanceof GenericRecord) {
     final GenericRecord record = (GenericRecord) generic;
     final Schema schema = record.getSchema();
     if (schema.equals(Schemas.RECORD)) {
       return decodeRecord(record, null);
     } else if (schema.equals(Schemas.PLAIN_IDENTIFIER)
         || schema.equals(Schemas.COMPRESSED_IDENTIFIER)) {
       return decodeIdentifier(record);
     } else if (schema.equals(Schemas.STATEMENT)) {
       return decodeStatement(record);
     }
   }
   return decodeLiteral(generic);
 }
 public void validateAvroFile(File file) throws IOException {
   // read the events back using GenericRecord
   DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
   DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader);
   GenericRecord record = new GenericData.Record(fileReader.getSchema());
   int numEvents = 0;
   while (fileReader.hasNext()) {
     fileReader.next(record);
     String bodyStr = record.get("message").toString();
     System.out.println(bodyStr);
     numEvents++;
   }
   fileReader.close();
   Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);
 }
Example #25
  public Iterable<GenericRecord> convertRecord(
      Schema outputSchema, JsonElement inputRecord, WorkUnitState workUnit) {
    JsonElement element = GSON.fromJson(inputRecord, JsonElement.class);
    Map<String, Object> fields = GSON.fromJson(element, FIELD_ENTRY_TYPE);
    GenericRecord record = new GenericData.Record(outputSchema);
    for (Map.Entry<String, Object> entry : fields.entrySet()) {
      if (entry.getKey().equals("*")) {
        // switch '*' to 'content' since '*' is not a valid avro schema field name
        record.put(JSON_CONTENT_MEMBER, entry.getValue());
      } else {
        record.put(entry.getKey(), entry.getValue());
      }
    }

    return new SingleRecordIterable<GenericRecord>(record);
  }
  public void testWrite() throws IOException {

    URL url = this.getClass().getClassLoader().getResource("input/Company.avsc");
    assertNotNull(url);
    Schema schema = new Schema.Parser().parse(new File(url.getFile()));
    assertNotNull(schema);

    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    // Another way of loading a file
    File file = new File("src/test/resources/input/companies.avro");
    DataFileReader<GenericRecord> dataFileReader =
        new DataFileReader<GenericRecord>(file, datumReader);

    File fileOut = new File("target/companies2.avro");
    Schema schemaOut =
        new Schema.Parser().parse(new File("src/test/resources/input/Company2.avsc"));
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schemaOut);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);

    GenericRecord company = null;
    int count = 0;
    while (dataFileReader.hasNext()) {
      company = dataFileReader.next(company);
      if (company.get("name").toString().equals("aol")) {
        dataFileWriter.create(schemaOut, fileOut);

        GenericRecord recordOut = new GenericData.Record(schemaOut);
        recordOut.put("id", company.get("id"));
        recordOut.put("name", company.get("name"));
        assertTrue(recordOut.getSchema().getField("address") != null);
        assertTrue(recordOut.getSchema().getField("employeeCount") == null);

        // address is of complex type
        GenericRecord address =
            new GenericData.Record((GenericData.Record) company.get("address"), true);
        recordOut.put("address", address);

        dataFileWriter.append(recordOut);

        count++;
      }
    }
    assertTrue(count > 0);

    dataFileWriter.close();
  }
Example #27
  @BeforeClass
  public static void before() throws Exception {
    final String filePath =
        TestUtils.getFileFromResourceUrl(
            DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
      FileUtils.deleteQuietly(INDEX_DIR);
    }

    final SegmentGeneratorConfig config =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
            new File(filePath), INDEX_DIR, "time_day", TimeUnit.DAYS, "test");

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    driver.init(config);
    driver.build();

    final Schema schema = AvroUtils.extractSchemaFromAvro(new File(filePath));

    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
      columns[i] = f.name();
      i++;
    }

    uniqueEntries = new HashMap<String, Set<Object>>();
    for (final String column : columns) {
      uniqueEntries.put(column, new HashSet<Object>());
    }

    while (avroReader.hasNext()) {
      final GenericRecord rec = avroReader.next();
      for (final String column : columns) {
        Object val = rec.get(column);
        if (val instanceof Utf8) {
          val = ((Utf8) val).toString();
        }
        uniqueEntries
            .get(column)
            .add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), val));
      }
    }
  }
 @SuppressWarnings("unchecked")
 private Record decodeRecord(
     final GenericRecord generic, @Nullable final Set<URI> propertiesToDecode) {
   final Record record = Record.create();
   final GenericRecord encodedID = (GenericRecord) generic.get(0);
   if (encodedID != null) {
     record.setID((URI) decodeIdentifier(encodedID));
   }
   for (final GenericRecord prop : (Iterable<GenericRecord>) generic.get(1)) {
     final URI property = (URI) decodeIdentifier((GenericRecord) prop.get(0));
     final List<Object> values = decodeNodes(prop.get(1));
     if (propertiesToDecode == null || propertiesToDecode.contains(property)) {
       record.set(property, values);
     }
   }
   return record;
 }
  @Test
  public void testRead_GenericReader() throws IOException {
    GenericRecord savedRecord = new GenericData.Record(schema);
    savedRecord.put("name", "John Doe");
    savedRecord.put("age", 42);
    savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
    populateGenericFile(Lists.newArrayList(savedRecord));

    AvroFileReaderFactory<GenericData.Record> genericReader =
        new AvroFileReaderFactory<GenericData.Record>(Avros.generics(schema), new Configuration());
    Iterator<GenericData.Record> recordIterator =
        genericReader.read(
            FileSystem.getLocal(new Configuration()), new Path(this.avroFile.getAbsolutePath()));

    GenericRecord genericRecord = recordIterator.next();
    assertEquals(savedRecord, genericRecord);
    assertFalse(recordIterator.hasNext());
  }
 private Resource decodeIdentifier(final GenericRecord record) {
   final Schema schema = record.getSchema();
   if (schema.equals(Schemas.COMPRESSED_IDENTIFIER)) {
     try {
       return this.dictionary.objectFor((Integer) record.get(0));
     } catch (final IOException ex) {
       throw new IllegalStateException("Cannot access dictionary: " + ex.getMessage(), ex);
     }
   } else if (schema.equals(Schemas.PLAIN_IDENTIFIER)) {
     final String string = record.get(0).toString();
     if (string.startsWith("_:")) {
       return this.factory.createBNode(string.substring(2));
     } else {
       return this.factory.createURI(string);
     }
   }
   throw new IllegalArgumentException("Unsupported encoded identifier: " + record);
 }