/**
 * Builds {@code parserFactory} for the configured {@code dataFormat}.
 *
 * <p>The charset is always forced to UTF-8; every other builder setting depends on the selected
 * data format. Formats without a matching case receive no format-specific settings.
 *
 * @param issues list of configuration issues; this method does not add to it
 */
private void validateParserFactoryConfigs(List<ConfigIssue> issues) {
  DataParserFactoryBuilder parserBuilder =
      new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat())
          .setCharset(Charset.defaultCharset());

  // TextInputFormat hands over Hadoop Text values, which are standard UTF-8, so the default
  // charset set above is replaced unconditionally.
  parserBuilder.setCharset(StandardCharsets.UTF_8);

  switch (dataFormat) {
    case DELIMITED:
      // An ignored header is passed to the parser as "no header".
      CsvHeader effectiveHeader;
      if (csvHeader == CsvHeader.IGNORE_HEADER) {
        effectiveHeader = CsvHeader.NO_HEADER;
      } else {
        effectiveHeader = csvHeader;
      }
      parserBuilder.setMaxDataLen(csvMaxObjectLen);
      parserBuilder.setMode(csvFileFormat);
      parserBuilder.setMode(effectiveHeader);
      parserBuilder.setMode(csvRecordType);
      parserBuilder.setConfig(DelimitedDataParserFactory.DELIMITER_CONFIG, csvCustomDelimiter);
      parserBuilder.setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, csvCustomEscape);
      parserBuilder.setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, csvCustomQuote);
      break;
    case TEXT:
      parserBuilder.setMaxDataLen(textMaxLineLen);
      break;
    case JSON:
      parserBuilder.setMode(JsonMode.MULTIPLE_OBJECTS).setMaxDataLen(jsonMaxObjectLen);
      break;
    case LOG:
      logDataFormatValidator.populateBuilder(parserBuilder);
      break;
    case AVRO:
      parserBuilder.setMaxDataLen(Integer.MAX_VALUE);
      parserBuilder.setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema);
      parserBuilder.setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, true);
      break;
  }

  parserFactory = parserBuilder.build();
}
Exemplo n.º 2
0
  /**
   * Resolves the message charset and builds {@code parserFactory} for the configured
   * {@code dataFormat}.
   *
   * <p>A {@code null} charset defaults to UTF-8. A charset name that cannot be resolved is
   * reported as a config issue and UTF-8 is used instead, so the parser factory is still built.
   *
   * @param issues list that receives a config issue when the configured charset is invalid
   */
  private void validateParserFactoryConfigs(List<ConfigIssue> issues) {
    DataParserFactoryBuilder builder =
        new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat());
    if (charset == null) {
      messageCharset = StandardCharsets.UTF_8;
    } else {
      try {
        messageCharset = Charset.forName(charset);
      } catch (IllegalArgumentException ex) {
        // Charset.forName throws IllegalCharsetNameException for malformed names (including the
        // empty string) and UnsupportedCharsetException for unknown ones; both extend
        // IllegalArgumentException. Fall back to a valid charset so the parser factory can still
        // be configured and tested for more errors.
        messageCharset = StandardCharsets.UTF_8;
        issues.add(
            getContext()
                .createConfigIssue(Groups.KAFKA.name(), "charset", KafkaErrors.KAFKA_08, charset));
      }
    }
    builder.setCharset(messageCharset).setRemoveCtrlChars(removeCtrlChars);

    switch (dataFormat) {
      case TEXT:
        builder.setMaxDataLen(textMaxLineLen);
        break;
      case JSON:
        builder.setMode(jsonContent);
        builder.setMaxDataLen(jsonMaxObjectLen);
        break;
      case DELIMITED:
        builder
            .setMaxDataLen(csvMaxObjectLen)
            .setMode(csvFileFormat)
            .setMode(csvHeader)
            .setMode(csvRecordType)
            .setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter)
            .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape)
            .setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote);
        break;
      case XML:
        builder.setMaxDataLen(xmlMaxObjectLen);
        builder.setConfig(XmlDataParserFactory.RECORD_ELEMENT_KEY, xmlRecordElement);
        break;
      case SDC_JSON:
        builder.setMaxDataLen(-1);
        break;
      case LOG:
        // The factory is built once after the switch, like every other format.
        logDataFormatValidator.populateBuilder(builder);
        break;
      case AVRO:
        builder
            .setMaxDataLen(Integer.MAX_VALUE)
            .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema)
            .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, messageHasSchema);
        break;
      case BINARY:
        builder.setMaxDataLen(binaryMaxObjectLen);
        break;
      default:
        // No format-specific parser settings for any other data format.
        break;
    }
    parserFactory = builder.build();
  }
  /**
   * Validates the data generator configuration for {@code dataFormat} and, when no problem is
   * found, builds {@code dataGeneratorFactory}.
   *
   * <p>A {@code null} or blank {@code charset} is replaced with {@code CHARSET_UTF8}. Every
   * problem found is added to {@code issues}; the factory is only built when none was found.
   *
   * @param context stage context used to create config issues and the factory builder
   * @param dataFormat data format whose generator settings are validated
   * @param groupName configuration group the charset issue is reported under
   * @param issues list that receives every config issue found
   * @return {@code true} if no issue was found and the generator factory was built
   */
  private boolean validateDataGenerator(
      Stage.Context context,
      DataFormat dataFormat,
      String groupName,
      List<Stage.ConfigIssue> issues) {
    boolean valid = true;

    DataGeneratorFactoryBuilder builder =
        new DataGeneratorFactoryBuilder(context, dataFormat.getGeneratorFormat());
    if (charset == null || charset.trim().isEmpty()) {
      charset = CHARSET_UTF8;
    }

    Charset cSet;
    try {
      cSet = Charset.forName(charset);
    } catch (IllegalArgumentException ex) {
      // Charset.forName throws IllegalCharsetNameException for malformed names and
      // UnsupportedCharsetException for unknown ones; both extend IllegalArgumentException.
      // Fall back to a valid charset so the format-specific checks below can still run.
      cSet = StandardCharsets.UTF_8;
      issues.add(
          context.createConfigIssue(
              groupName, "charset", DataFormatErrors.DATA_FORMAT_05, charset));
      valid = false;
    }

    builder.setCharset(cSet);

    switch (dataFormat) {
      case SDC_JSON:
        break;
      case DELIMITED:
        builder.setMode(csvFileFormat);
        builder.setMode(csvHeader);
        builder.setConfig(DelimitedDataGeneratorFactory.REPLACE_NEWLINES_KEY, csvReplaceNewLines);
        builder.setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter);
        builder.setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape);
        builder.setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote);
        break;
      case TEXT:
        builder.setConfig(TextDataGeneratorFactory.FIELD_PATH_KEY, textFieldPath);
        builder.setConfig(TextDataGeneratorFactory.EMPTY_LINE_IF_NULL_KEY, textEmptyLineIfNull);
        break;
      case JSON:
        builder.setMode(jsonMode);
        break;
      case AVRO:
        Schema schema = null;
        Map<String, Object> defaultValues = new HashMap<>();
        try {
          schema =
              new Schema.Parser().setValidate(true).setValidateDefaults(true).parse(avroSchema);
        } catch (Exception e) {
          // Any schema parsing failure is reported as a config issue rather than propagated.
          issues.add(
              context.createConfigIssue(
                  DataFormatGroups.AVRO.name(),
                  "avroSchema",
                  DataFormatErrors.DATA_FORMAT_300,
                  e.toString(),
                  e));
          valid = false;
        }
        // Default values can only be extracted from a schema that parsed successfully.
        if (schema != null) {
          try {
            defaultValues.putAll(
                AvroTypeUtil.getDefaultValuesFromSchema(schema, new HashSet<String>()));
          } catch (IOException e) {
            issues.add(
                context.createConfigIssue(
                    DataFormatGroups.AVRO.name(),
                    "avroSchema",
                    DataFormatErrors.DATA_FORMAT_301,
                    e.toString(),
                    e));
            valid = false;
          }
        }
        builder.setConfig(AvroDataGeneratorFactory.SCHEMA_KEY, avroSchema);
        builder.setConfig(AvroDataGeneratorFactory.INCLUDE_SCHEMA_KEY, includeSchema);
        builder.setConfig(AvroDataGeneratorFactory.DEFAULT_VALUES_KEY, defaultValues);
        break;
      case BINARY:
        builder.setConfig(BinaryDataGeneratorFactory.FIELD_PATH_KEY, binaryFieldPath);
        break;
      case PROTOBUF:
        builder
            .setConfig(ProtobufConstants.PROTO_DESCRIPTOR_FILE_KEY, protoDescriptorFile)
            .setConfig(ProtobufConstants.MESSAGE_TYPE_KEY, messageType);
        break;
      default:
        // no action needed
        break;
    }

    // Only build the factory when every preceding check passed.
    if (valid) {
      try {
        dataGeneratorFactory = builder.build();
      } catch (Exception ex) {
        issues.add(
            context.createConfigIssue(
                null, null, DataFormatErrors.DATA_FORMAT_201, ex.toString(), ex));
        valid = false;
      }
    }
    return valid;
  }
Exemplo n.º 4
0
  /**
   * Validates the data format configuration and builds {@code parserFactory}.
   *
   * <p>The method first runs the format-specific checks, then resolves the message charset
   * (defaulting to UTF-8), and finally configures and builds the parser factory. The factory is
   * built even when the supported format's checks reported issues. For an unsupported data format
   * the corresponding issue is returned immediately and no parser factory is built.
   *
   * @param context source context used to create config issues and the parser factory builder
   * @return the config issues found; empty when the configuration is valid
   * @throws IllegalStateException if the builder switch meets a format that passed validation but
   *     has no builder configuration
   */
  public List<Stage.ConfigIssue> init(Source.Context context) {
    List<Stage.ConfigIssue> issues = new ArrayList<>();

    // Step 1: format-specific validation of the configured limits and names.
    switch (dataFormat) {
      case JSON:
        if (dataFormatConfig.jsonMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.JSON.name(),
                  "dataFormatConfig.maxJsonObjectLen",
                  ParserErrors.PARSER_04));
        }
        break;
      case TEXT:
        if (dataFormatConfig.textMaxLineLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.TEXT.name(),
                  "dataFormatConfig.maxLogLineLength",
                  ParserErrors.PARSER_04));
        }
        break;
      case DELIMITED:
        if (dataFormatConfig.csvMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.DELIMITED.name(),
                  "dataFormatConfig.csvMaxObjectLen",
                  ParserErrors.PARSER_04));
        }
        break;
      case XML:
        if (messageConfig != null && messageConfig.produceSingleRecordPerMessage) {
          issues.add(
              context.createConfigIssue(
                  parentName,
                  "messageConfig.produceSingleRecordPerMessage",
                  ParserErrors.PARSER_06));
        }
        if (dataFormatConfig.xmlMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.XML.name(),
                  "dataFormatConfig.maxXmlObjectLen",
                  ParserErrors.PARSER_04));
        }
        // The record element is optional; it is only checked when one was provided.
        if (dataFormatConfig.xmlRecordElement != null
            && !dataFormatConfig.xmlRecordElement.isEmpty()
            && !XMLChar.isValidName(dataFormatConfig.xmlRecordElement)) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.XML.name(),
                  "dataFormatConfig.xmlRecordElement",
                  ParserErrors.PARSER_02,
                  dataFormatConfig.xmlRecordElement));
        }
        break;
      case SDC_JSON:
        break;
      case LOG:
        // The validator is kept in a field because it also populates the builder in step 3.
        logDataFormatValidator =
            new LogDataFormatValidator(
                dataFormatConfig.logMode,
                dataFormatConfig.logMaxObjectLen,
                dataFormatConfig.retainOriginalLine,
                dataFormatConfig.customLogFormat,
                dataFormatConfig.regex,
                dataFormatConfig.grokPatternDefinition,
                dataFormatConfig.grokPattern,
                dataFormatConfig.enableLog4jCustomLogFormat,
                dataFormatConfig.log4jCustomLogFormat,
                dataFormatConfig.onParseError,
                dataFormatConfig.maxStackTraceLines,
                DataFormat.LOG.name(),
                getFieldPathToGroupMap(dataFormatConfig.fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, context);
        break;
      case AVRO:
        // A schema must be configured unless each message carries its own.
        if (!dataFormatConfig.schemaInMessage
            && (dataFormatConfig.avroSchema == null || dataFormatConfig.avroSchema.isEmpty())) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.AVRO.name(),
                  "dataFormatConfig.avroSchema",
                  ParserErrors.PARSER_07,
                  dataFormatConfig.avroSchema));
        }
        break;
      default:
        issues.add(
            context.createConfigIssue(
                parentName, "dataFormat", ParserErrors.PARSER_05, dataFormat));
        // No parser factory can be built for an unsupported format. Return the issue now,
        // otherwise the builder switch below would throw and the issue would never be reported.
        return issues;
    }

    // Step 2: resolve the message charset, defaulting to UTF-8.
    DataParserFactoryBuilder builder =
        new DataParserFactoryBuilder(context, dataFormat.getParserFormat());
    if (dataFormatConfig.charset == null) {
      messageCharset = StandardCharsets.UTF_8;
    } else {
      try {
        messageCharset = Charset.forName(dataFormatConfig.charset);
      } catch (IllegalArgumentException ex) {
        // Charset.forName throws IllegalCharsetNameException for malformed names (including the
        // empty string) and UnsupportedCharsetException for unknown ones; both extend
        // IllegalArgumentException. Fall back to a valid charset so the parser factory can still
        // be configured and tested for more errors.
        messageCharset = StandardCharsets.UTF_8;
        issues.add(
            context.createConfigIssue(
                parentName, "charset", ParserErrors.PARSER_01, dataFormatConfig.charset));
      }
    }
    builder.setCharset(messageCharset).setRemoveCtrlChars(dataFormatConfig.removeCtrlChars);

    // Step 3: apply the format-specific parser settings.
    switch (dataFormat) {
      case TEXT:
        builder.setMaxDataLen(dataFormatConfig.textMaxLineLen);
        break;
      case JSON:
        builder.setMode(dataFormatConfig.jsonContent);
        builder.setMaxDataLen(dataFormatConfig.jsonMaxObjectLen);
        break;
      case DELIMITED:
        builder
            .setMaxDataLen(dataFormatConfig.csvMaxObjectLen)
            .setMode(dataFormatConfig.csvFileFormat)
            .setMode(dataFormatConfig.csvHeader)
            .setMode(dataFormatConfig.csvRecordType)
            .setConfig(
                DelimitedDataParserFactory.DELIMITER_CONFIG, dataFormatConfig.csvCustomDelimiter)
            .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, dataFormatConfig.csvCustomEscape)
            .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, dataFormatConfig.csvCustomQuote);
        break;
      case XML:
        builder.setMaxDataLen(dataFormatConfig.xmlMaxObjectLen);
        builder.setConfig(
            XmlDataParserFactory.RECORD_ELEMENT_KEY, dataFormatConfig.xmlRecordElement);
        break;
      case SDC_JSON:
        builder.setMaxDataLen(-1);
        break;
      case LOG:
        logDataFormatValidator.populateBuilder(builder);
        break;
      case AVRO:
        builder
            .setMaxDataLen(Integer.MAX_VALUE)
            .setConfig(AvroDataParserFactory.SCHEMA_KEY, dataFormatConfig.avroSchema)
            .setConfig(
                AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, dataFormatConfig.schemaInMessage);
        break;
      default:
        // Guard for the case where the two switches get out of sync: a format accepted by the
        // validation switch above must also be handled here.
        throw new IllegalStateException("Unknown data format: " + dataFormat);
    }
    parserFactory = builder.build();
    return issues;
  }