private void validateParserFactoryConfigs(List<ConfigIssue> issues) { DataParserFactoryBuilder builder = new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); // TextInputFormat supports Hadoop Text class which is Standard UTF-8 builder.setCharset(StandardCharsets.UTF_8); switch (dataFormat) { case DELIMITED: builder .setMaxDataLen(csvMaxObjectLen) .setMode(csvFileFormat) .setMode((csvHeader == CsvHeader.IGNORE_HEADER) ? CsvHeader.NO_HEADER : csvHeader) .setMode(csvRecordType) .setConfig(DelimitedDataParserFactory.DELIMITER_CONFIG, csvCustomDelimiter) .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, csvCustomQuote); break; case TEXT: builder.setMaxDataLen(textMaxLineLen); break; case JSON: builder.setMode(JsonMode.MULTIPLE_OBJECTS); builder.setMaxDataLen(jsonMaxObjectLen); break; case LOG: logDataFormatValidator.populateBuilder(builder); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema) .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, true); break; } parserFactory = builder.build(); }
private void validateParserFactoryConfigs(List<ConfigIssue> issues) { DataParserFactoryBuilder builder = new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); if (charset == null) { messageCharset = StandardCharsets.UTF_8; } else { try { messageCharset = Charset.forName(charset); } catch (UnsupportedCharsetException ex) { // setting it to a valid one so the parser factory can be configured and tested for more // errors messageCharset = StandardCharsets.UTF_8; issues.add( getContext() .createConfigIssue(Groups.KAFKA.name(), "charset", KafkaErrors.KAFKA_08, charset)); } } builder.setCharset(messageCharset).setRemoveCtrlChars(removeCtrlChars); switch ((dataFormat)) { case TEXT: builder.setMaxDataLen(textMaxLineLen); break; case JSON: builder.setMode(jsonContent); builder.setMaxDataLen(jsonMaxObjectLen); break; case DELIMITED: builder .setMaxDataLen(csvMaxObjectLen) .setMode(csvFileFormat) .setMode(csvHeader) .setMode(csvRecordType) .setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter) .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote); break; case XML: builder.setMaxDataLen(xmlMaxObjectLen); builder.setConfig(XmlDataParserFactory.RECORD_ELEMENT_KEY, xmlRecordElement); break; case SDC_JSON: builder.setMaxDataLen(-1); break; case LOG: logDataFormatValidator.populateBuilder(builder); parserFactory = builder.build(); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema) .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, messageHasSchema); break; case BINARY: builder.setMaxDataLen(binaryMaxObjectLen); } parserFactory = builder.build(); }
private boolean validateDataGenerator( Stage.Context context, DataFormat dataFormat, String groupName, List<Stage.ConfigIssue> issues) { boolean valid = true; DataGeneratorFactoryBuilder builder = new DataGeneratorFactoryBuilder(context, dataFormat.getGeneratorFormat()); if (charset == null || charset.trim().isEmpty()) { charset = CHARSET_UTF8; } Charset cSet; try { cSet = Charset.forName(charset); } catch (UnsupportedCharsetException ex) { // setting it to a valid one so the parser factory can be configured and tested for more // errors cSet = StandardCharsets.UTF_8; issues.add( context.createConfigIssue( groupName, "charset", DataFormatErrors.DATA_FORMAT_05, charset)); valid &= false; } builder.setCharset(cSet); switch (dataFormat) { case SDC_JSON: break; case DELIMITED: builder.setMode(csvFileFormat); builder.setMode(csvHeader); builder.setConfig(DelimitedDataGeneratorFactory.REPLACE_NEWLINES_KEY, csvReplaceNewLines); builder.setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter); builder.setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape); builder.setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote); break; case TEXT: builder.setConfig(TextDataGeneratorFactory.FIELD_PATH_KEY, textFieldPath); builder.setConfig(TextDataGeneratorFactory.EMPTY_LINE_IF_NULL_KEY, textEmptyLineIfNull); break; case JSON: builder.setMode(jsonMode); break; case AVRO: Schema schema = null; Map<String, Object> defaultValues = new HashMap<>(); try { schema = new Schema.Parser().setValidate(true).setValidateDefaults(true).parse(avroSchema); } catch (Exception e) { issues.add( context.createConfigIssue( DataFormatGroups.AVRO.name(), "avroSchema", DataFormatErrors.DATA_FORMAT_300, e.toString(), e)); valid &= false; } if (schema != null) { try { defaultValues.putAll( AvroTypeUtil.getDefaultValuesFromSchema(schema, new HashSet<String>())); } catch (IOException e) { issues.add( context.createConfigIssue( DataFormatGroups.AVRO.name(), "avroSchema", DataFormatErrors.DATA_FORMAT_301, e.toString(), e)); valid &= false; } } builder.setConfig(AvroDataGeneratorFactory.SCHEMA_KEY, avroSchema); builder.setConfig(AvroDataGeneratorFactory.INCLUDE_SCHEMA_KEY, includeSchema); builder.setConfig(AvroDataGeneratorFactory.DEFAULT_VALUES_KEY, defaultValues); break; case BINARY: builder.setConfig(BinaryDataGeneratorFactory.FIELD_PATH_KEY, binaryFieldPath); break; case PROTOBUF: builder .setConfig(ProtobufConstants.PROTO_DESCRIPTOR_FILE_KEY, protoDescriptorFile) .setConfig(ProtobufConstants.MESSAGE_TYPE_KEY, messageType); break; default: // no action needed break; } if (valid) { try { dataGeneratorFactory = builder.build(); } catch (Exception ex) { issues.add( context.createConfigIssue( null, null, DataFormatErrors.DATA_FORMAT_201, ex.toString(), ex)); valid &= false; } } return valid; }
public List<Stage.ConfigIssue> init(Source.Context context) { List<Stage.ConfigIssue> issues = new ArrayList<>(); switch (dataFormat) { case JSON: if (dataFormatConfig.jsonMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.JSON.name(), "dataFormatConfig.maxJsonObjectLen", ParserErrors.PARSER_04)); } break; case TEXT: if (dataFormatConfig.textMaxLineLen < 1) { issues.add( context.createConfigIssue( DataFormat.TEXT.name(), "dataFormatConfig.maxLogLineLength", ParserErrors.PARSER_04)); } break; case DELIMITED: if (dataFormatConfig.csvMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.DELIMITED.name(), "dataFormatConfig.csvMaxObjectLen", ParserErrors.PARSER_04)); } break; case XML: if (messageConfig != null && messageConfig.produceSingleRecordPerMessage) { issues.add( context.createConfigIssue( parentName, "messageConfig.produceSingleRecordPerMessage", ParserErrors.PARSER_06)); } if (dataFormatConfig.xmlMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.XML.name(), "dataFormatConfig.maxXmlObjectLen", ParserErrors.PARSER_04)); } if (dataFormatConfig.xmlRecordElement != null && !dataFormatConfig.xmlRecordElement.isEmpty() && !XMLChar.isValidName(dataFormatConfig.xmlRecordElement)) { issues.add( context.createConfigIssue( DataFormat.XML.name(), "dataFormatConfig.xmlRecordElement", ParserErrors.PARSER_02, dataFormatConfig.xmlRecordElement)); } break; case SDC_JSON: break; case LOG: logDataFormatValidator = new LogDataFormatValidator( dataFormatConfig.logMode, dataFormatConfig.logMaxObjectLen, dataFormatConfig.retainOriginalLine, dataFormatConfig.customLogFormat, dataFormatConfig.regex, dataFormatConfig.grokPatternDefinition, dataFormatConfig.grokPattern, dataFormatConfig.enableLog4jCustomLogFormat, dataFormatConfig.log4jCustomLogFormat, dataFormatConfig.onParseError, dataFormatConfig.maxStackTraceLines, DataFormat.LOG.name(), getFieldPathToGroupMap(dataFormatConfig.fieldPathsToGroupName)); logDataFormatValidator.validateLogFormatConfig(issues, context); break; case AVRO: if (!dataFormatConfig.schemaInMessage && (dataFormatConfig.avroSchema == null || dataFormatConfig.avroSchema.isEmpty())) { issues.add( context.createConfigIssue( DataFormat.AVRO.name(), "dataFormatConfig.avroSchema", ParserErrors.PARSER_07, dataFormatConfig.avroSchema)); } break; default: issues.add( context.createConfigIssue( parentName, "dataFormat", ParserErrors.PARSER_05, dataFormat)); } DataParserFactoryBuilder builder = new DataParserFactoryBuilder(context, dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); if (dataFormatConfig.charset == null) { messageCharset = StandardCharsets.UTF_8; } else { try { messageCharset = Charset.forName(dataFormatConfig.charset); } catch (UnsupportedCharsetException ex) { // setting it to a valid one so the parser factory can be configured and tested for more // errors messageCharset = StandardCharsets.UTF_8; issues.add( context.createConfigIssue( parentName, "charset", ParserErrors.PARSER_01, dataFormatConfig.charset)); } } builder.setCharset(messageCharset).setRemoveCtrlChars(dataFormatConfig.removeCtrlChars); switch (dataFormat) { case TEXT: builder.setMaxDataLen(dataFormatConfig.textMaxLineLen); break; case JSON: builder.setMode(dataFormatConfig.jsonContent); builder.setMaxDataLen(dataFormatConfig.jsonMaxObjectLen); break; case DELIMITED: builder .setMaxDataLen(dataFormatConfig.csvMaxObjectLen) .setMode(dataFormatConfig.csvFileFormat) .setMode(dataFormatConfig.csvHeader) .setMode(dataFormatConfig.csvRecordType) .setConfig( DelimitedDataParserFactory.DELIMITER_CONFIG, dataFormatConfig.csvCustomDelimiter) .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, dataFormatConfig.csvCustomEscape) .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, dataFormatConfig.csvCustomQuote); break; case XML: builder.setMaxDataLen(dataFormatConfig.xmlMaxObjectLen); builder.setConfig( XmlDataParserFactory.RECORD_ELEMENT_KEY, dataFormatConfig.xmlRecordElement); break; case SDC_JSON: builder.setMaxDataLen(-1); break; case LOG: logDataFormatValidator.populateBuilder(builder); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, dataFormatConfig.avroSchema) .setConfig( AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, dataFormatConfig.schemaInMessage); break; default: throw new IllegalStateException("Unknown data format: " + dataFormat); } parserFactory = builder.build(); return issues; }