Example #1
 @Test
 public void testHadoopCodecFactoryGZip() {
   // Hadoop's GZipCodec is DEFLATE-based, so its factory should resolve to the same
   // implementation class as Avro's built-in "deflate" codec.
   CodecFactory hadoopGzipCodec =
       HadoopCodecFactory.fromHadoopString("org.apache.hadoop.io.compress.GZipCodec");
   CodecFactory avroDeflateCodec = CodecFactory.fromString("deflate");
   assertEquals(avroDeflateCodec.getClass(), hadoopGzipCodec.getClass());
 }
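For orientation, here is a minimal, hedged sketch of the two lookups this test compares: CodecFactory.fromString resolves Avro's built-in codec names, while HadoopCodecFactory.fromHadoopString maps Hadoop codec class names onto equivalent Avro factories. The class name is invented for illustration, and the HadoopCodecFactory import path is an assumption based on avro-mapred.

import org.apache.avro.file.CodecFactory;
import org.apache.avro.hadoop.file.HadoopCodecFactory;

// Illustrative sketch only: both lookups should resolve to the same factory class,
// since Hadoop's GZipCodec and Avro's "deflate" codec both use DEFLATE.
public class CodecLookupSketch {
  public static void main(String[] args) {
    CodecFactory fromHadoopName =
        HadoopCodecFactory.fromHadoopString("org.apache.hadoop.io.compress.GZipCodec");
    CodecFactory fromAvroName = CodecFactory.fromString("deflate");

    System.out.println(fromHadoopName.getClass().getName());
    System.out.println(fromAvroName.getClass().getName());
  }
}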
Example #2
    public WriteAvroToByteArray(
        CommandBuilder builder,
        Config config,
        Command parent,
        Command child,
        MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.format =
          new Validator<Format>()
              .validateEnum(
                  config,
                  getConfigs().getString(config, "format", Format.container.toString()),
                  Format.class);

      // Optional compression codec name, e.g. "deflate" or "snappy"; when absent, no codec is set.
      String codec = getConfigs().getString(config, "codec", null);
      if (codec == null) {
        this.codecFactory = null;
      } else {
        this.codecFactory = CodecFactory.fromString(codec);
      }

      Config metadataConfig = getConfigs().getConfig(config, "metadata", ConfigFactory.empty());
      for (Map.Entry<String, Object> entry : new Configs().getEntrySet(metadataConfig)) {
        this.metadata.put(entry.getKey(), entry.getValue().toString());
      }

      validateArguments();
    }
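The constructor above only resolves and stores the codec; the actual serialization happens later in the command. As a rough, self-contained sketch of that write path (the class, schema, and field names here are invented for illustration, not taken from the Morphlines code), an Avro container can be built in memory by pointing a DataFileWriter at a ByteArrayOutputStream and applying the codec only when one was configured:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Collections;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

// Illustrative sketch: serialize records into an in-memory Avro container file.
public class AvroToByteArraySketch {
  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Event\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}");

  static byte[] toAvroBytes(Iterable<GenericRecord> records, CodecFactory codecFactory)
      throws IOException {
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(SCHEMA))) {
      if (codecFactory != null) {       // mirror the constructor above: null means "no codec set"
        writer.setCodec(codecFactory);  // must be called before create()
      }
      writer.create(SCHEMA, bout);
      for (GenericRecord record : records) {
        writer.append(record);
      }
    }                                   // close() flushes the final block into bout
    return bout.toByteArray();
  }

  public static void main(String[] args) throws IOException {
    GenericRecord record = new GenericData.Record(SCHEMA);
    record.put("body", "hello");
    byte[] bytes =
        toAvroBytes(Collections.singletonList(record), CodecFactory.fromString("deflate"));
    System.out.println("container size: " + bytes.length + " bytes");
  }
}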
Example #3
    /**
     * Ensures that currentWriter is populated and refers to the correct data file. This may roll
     * over the existing data file. It also ensures that writing one more span will not exceed the
     * configured limit on span storage.
     *
     * @throws IOException if the current data file cannot be closed or a new one cannot be created
     */
    private void assureCurrentWriter() throws IOException {
      boolean createNewFile = false;

      // Evict the oldest files until writing one more span stays within the maxSpans policy.
      while (this.spansSoFar >= maxSpans) {
        File oldest = null;
        // If spansSoFar is positive, there must be at least one file
        synchronized (this.files) {
          oldest = this.files.remove(this.files.firstKey());
        }
        this.spansSoFar -= spansPerFile.get(oldest);
        spansPerFile.remove(oldest);
        oldest.delete();
      }
      if (files.isEmpty()) {
        // Corner case: the eviction loop above may have removed the current file;
        // if so, clear the current-writer state.
        currentTimestamp = 0L;
        currentWriter = null;
      }
      long rightNow = System.currentTimeMillis() / 1000L;

      // Which file should we currently be writing to?
      long cutOff = floorSecond(rightNow);

      if (currentWriter == null) {
        createNewFile = true;
      }
      // Test for roll-over.
      else if (cutOff >= (currentTimestamp + secondsPerFile)) {
        currentWriter.close();
        createNewFile = true;
      }

      if (createNewFile) {
        File newFile =
            new File(
                traceFileDir + "/" + Thread.currentThread().getId() + "_" + cutOff + FILE_SUFFIX);
        synchronized (this.files) {
          this.files.put(cutOff, newFile);
        }
        this.spansPerFile.put(newFile, 0L);
        this.currentWriter = new DataFileWriter<Span>(SPAN_WRITER);
        this.currentWriter.setCodec(CodecFactory.deflateCodec(compressionLevel));
        this.currentWriter.create(Span.SCHEMA$, newFile);
        this.currentTimestamp = cutOff;
      }
    }
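The writer above is configured with CodecFactory.deflateCodec(compressionLevel), which accepts a standard zlib/Deflater level. A small illustrative program (schema and file names are invented for the sketch) showing how that level trades write speed against output size:

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

// Illustrative sketch: write the same records at deflate levels 1 (fastest) and 9 (best
// compression) and print the resulting file sizes for comparison.
public class DeflateLevelSketch {
  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Line\",\"fields\":[{\"name\":\"text\",\"type\":\"string\"}]}");

  public static void main(String[] args) throws IOException {
    for (int level : new int[] {1, 9}) {
      File out = new File("deflate-level-" + level + ".avro");
      try (DataFileWriter<GenericRecord> writer =
          new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(SCHEMA))) {
        writer.setCodec(CodecFactory.deflateCodec(level));
        writer.create(SCHEMA, out);
        GenericRecord record = new GenericData.Record(SCHEMA);
        record.put("text", "the quick brown fox jumps over the lazy dog");
        for (int i = 0; i < 10_000; i++) {
          writer.append(record);
        }
      }
      System.out.println(out.getName() + ": " + out.length() + " bytes");
    }
  }
}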
Example #4
  /**
   * Generates an input Avro file containing the given records in the temporary directory and
   * returns the full path of the file.
   */
  private <T> String generateTestFile(
      String filename,
      List<T> elems,
      SyncBehavior syncBehavior,
      int syncInterval,
      AvroCoder<T> coder,
      String codec)
      throws IOException {
    Random random = new Random(0);
    File tmpFile = tmpFolder.newFile(filename);
    String path = tmpFile.toString();

    FileOutputStream os = new FileOutputStream(tmpFile);
    DatumWriter<T> datumWriter = coder.createDatumWriter();
    try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
      writer.setCodec(CodecFactory.fromString(codec));
      writer.create(coder.getSchema(), os);

      int recordIndex = 0;
      int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

      for (T elem : elems) {
        writer.append(elem);
        recordIndex++;

        switch (syncBehavior) {
          case SYNC_REGULAR:
            if (recordIndex == syncInterval) {
              recordIndex = 0;
              writer.sync();
            }
            break;
          case SYNC_RANDOM:
            if (recordIndex == syncIndex) {
              recordIndex = 0;
              writer.sync();
              syncIndex = random.nextInt(syncInterval);
            }
            break;
          case SYNC_DEFAULT:
          default:
        }
      }
    }
    return path;
  }
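To sanity-check a file produced by a helper like generateTestFile, the container can be read back with DataFileReader; the codec it was written with is stored in the file header metadata under the "avro.codec" key. A hedged sketch (the class name and path below are hypothetical):

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

// Illustrative sketch: read a container file back and report its codec and record count.
public class ReadBackSketch {
  public static void main(String[] args) throws IOException {
    File file = new File("test-output.avro");  // hypothetical path, e.g. generateTestFile's output
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(file, new GenericDatumReader<GenericRecord>())) {
      System.out.println("codec: " + reader.getMetaString("avro.codec"));
      long count = 0;
      for (GenericRecord ignored : reader) {
        count++;
      }
      System.out.println("records: " + count);
    }
  }
}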
Example #5
  @Override
  public void onTrigger(ProcessContext context, final ProcessSession session)
      throws ProcessException {
    FlowFile incomingAvro = session.get();
    if (incomingAvro == null) {
      return;
    }

    String inputSchemaProperty =
        context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema inputSchema;
    try {
      inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
      getLogger().error("Cannot find schema: " + inputSchemaProperty);
      session.transfer(incomingAvro, FAILURE);
      return;
    }
    String outputSchemaProperty =
        context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
    final Schema outputSchema;
    try {
      outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
    } catch (SchemaNotFoundException e) {
      getLogger().error("Cannot find schema: " + outputSchemaProperty);
      session.transfer(incomingAvro, FAILURE);
      return;
    }
    final Map<String, String> fieldMapping = new HashMap<>();
    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
      if (entry.getKey().isDynamic()) {
        fieldMapping.put(entry.getKey().getName(), entry.getValue());
      }
    }
    final AvroRecordConverter converter =
        new AvroRecordConverter(inputSchema, outputSchema, fieldMapping);

    final DataFileWriter<Record> writer =
        new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    writer.setCodec(CodecFactory.snappyCodec());

    final DataFileWriter<Record> failureWriter =
        new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
    failureWriter.setCodec(CodecFactory.snappyCodec());

    try {
      final LongHolder written = new LongHolder(0L);
      final FailureTracker failures = new FailureTracker();

      final List<Record> badRecords = Lists.newLinkedList();
      FlowFile incomingAvroCopy = session.clone(incomingAvro);
      FlowFile outgoingAvro =
          session.write(
              incomingAvro,
              new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {
                  try (DataFileStream<Record> stream =
                      new DataFileStream<Record>(
                          in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                    try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                      for (Record record : stream) {
                        try {
                          Record converted = converter.convert(record);
                          w.append(converted);
                          written.incrementAndGet();
                        } catch (AvroConversionException e) {
                          failures.add(e);
                          getLogger().error("Error converting data: " + e.getMessage());
                          badRecords.add(record);
                        }
                      }
                    }
                  }
                }
              });

      FlowFile badOutput =
          session.write(
              incomingAvroCopy,
              new StreamCallback() {
                @Override
                public void process(InputStream in, OutputStream out) throws IOException {

                  try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                    for (Record record : badRecords) {
                      w.append(record);
                    }
                  }
                }
              });

      long errors = failures.count();

      // These counters are applied only when the session commits successfully.
      session.adjustCounter("Converted records", written.get(), false);
      session.adjustCounter("Conversion errors", errors, false);

      if (written.get() > 0L) {
        session.transfer(outgoingAvro, SUCCESS);
      } else {
        session.remove(outgoingAvro);

        if (errors == 0L) {
          badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
          session.transfer(badOutput, FAILURE);
        }
      }

      if (errors > 0L) {
        getLogger()
            .warn(
                "Failed to convert {}/{} records between Avro Schemas",
                new Object[] {errors, errors + written.get()});
        badOutput = session.putAttribute(badOutput, "errors", failures.summary());
        session.transfer(badOutput, FAILURE);
      } else {
        session.remove(badOutput);
      }
    } catch (ProcessException | DatasetIOException e) {
      getLogger().error("Failed reading or writing", e);
      session.transfer(incomingAvro, FAILURE);
    } catch (DatasetException e) {
      getLogger().error("Failed to read FlowFile", e);
      session.transfer(incomingAvro, FAILURE);
    }
  }