private void validateParserFactoryConfigs(List<ConfigIssue> issues) { DataParserFactoryBuilder builder = new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); // TextInputFormat supports Hadoop Text class which is Standard UTF-8 builder.setCharset(StandardCharsets.UTF_8); switch (dataFormat) { case DELIMITED: builder .setMaxDataLen(csvMaxObjectLen) .setMode(csvFileFormat) .setMode((csvHeader == CsvHeader.IGNORE_HEADER) ? CsvHeader.NO_HEADER : csvHeader) .setMode(csvRecordType) .setConfig(DelimitedDataParserFactory.DELIMITER_CONFIG, csvCustomDelimiter) .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, csvCustomQuote); break; case TEXT: builder.setMaxDataLen(textMaxLineLen); break; case JSON: builder.setMode(JsonMode.MULTIPLE_OBJECTS); builder.setMaxDataLen(jsonMaxObjectLen); break; case LOG: logDataFormatValidator.populateBuilder(builder); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema) .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, true); break; } parserFactory = builder.build(); }
private void validateParserFactoryConfigs(List<ConfigIssue> issues) { DataParserFactoryBuilder builder = new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); if (charset == null) { messageCharset = StandardCharsets.UTF_8; } else { try { messageCharset = Charset.forName(charset); } catch (UnsupportedCharsetException ex) { // setting it to a valid one so the parser factory can be configured and tested for more // errors messageCharset = StandardCharsets.UTF_8; issues.add( getContext() .createConfigIssue(Groups.KAFKA.name(), "charset", KafkaErrors.KAFKA_08, charset)); } } builder.setCharset(messageCharset).setRemoveCtrlChars(removeCtrlChars); switch ((dataFormat)) { case TEXT: builder.setMaxDataLen(textMaxLineLen); break; case JSON: builder.setMode(jsonContent); builder.setMaxDataLen(jsonMaxObjectLen); break; case DELIMITED: builder .setMaxDataLen(csvMaxObjectLen) .setMode(csvFileFormat) .setMode(csvHeader) .setMode(csvRecordType) .setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter) .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape) .setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote); break; case XML: builder.setMaxDataLen(xmlMaxObjectLen); builder.setConfig(XmlDataParserFactory.RECORD_ELEMENT_KEY, xmlRecordElement); break; case SDC_JSON: builder.setMaxDataLen(-1); break; case LOG: logDataFormatValidator.populateBuilder(builder); parserFactory = builder.build(); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema) .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, messageHasSchema); break; case BINARY: builder.setMaxDataLen(binaryMaxObjectLen); } parserFactory = builder.build(); }
public List<Stage.ConfigIssue> init(Source.Context context) { List<Stage.ConfigIssue> issues = new ArrayList<>(); switch (dataFormat) { case JSON: if (dataFormatConfig.jsonMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.JSON.name(), "dataFormatConfig.maxJsonObjectLen", ParserErrors.PARSER_04)); } break; case TEXT: if (dataFormatConfig.textMaxLineLen < 1) { issues.add( context.createConfigIssue( DataFormat.TEXT.name(), "dataFormatConfig.maxLogLineLength", ParserErrors.PARSER_04)); } break; case DELIMITED: if (dataFormatConfig.csvMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.DELIMITED.name(), "dataFormatConfig.csvMaxObjectLen", ParserErrors.PARSER_04)); } break; case XML: if (messageConfig != null && messageConfig.produceSingleRecordPerMessage) { issues.add( context.createConfigIssue( parentName, "messageConfig.produceSingleRecordPerMessage", ParserErrors.PARSER_06)); } if (dataFormatConfig.xmlMaxObjectLen < 1) { issues.add( context.createConfigIssue( DataFormat.XML.name(), "dataFormatConfig.maxXmlObjectLen", ParserErrors.PARSER_04)); } if (dataFormatConfig.xmlRecordElement != null && !dataFormatConfig.xmlRecordElement.isEmpty() && !XMLChar.isValidName(dataFormatConfig.xmlRecordElement)) { issues.add( context.createConfigIssue( DataFormat.XML.name(), "dataFormatConfig.xmlRecordElement", ParserErrors.PARSER_02, dataFormatConfig.xmlRecordElement)); } break; case SDC_JSON: break; case LOG: logDataFormatValidator = new LogDataFormatValidator( dataFormatConfig.logMode, dataFormatConfig.logMaxObjectLen, dataFormatConfig.retainOriginalLine, dataFormatConfig.customLogFormat, dataFormatConfig.regex, dataFormatConfig.grokPatternDefinition, dataFormatConfig.grokPattern, dataFormatConfig.enableLog4jCustomLogFormat, dataFormatConfig.log4jCustomLogFormat, dataFormatConfig.onParseError, dataFormatConfig.maxStackTraceLines, DataFormat.LOG.name(), getFieldPathToGroupMap(dataFormatConfig.fieldPathsToGroupName)); logDataFormatValidator.validateLogFormatConfig(issues, context); break; case AVRO: if (!dataFormatConfig.schemaInMessage && (dataFormatConfig.avroSchema == null || dataFormatConfig.avroSchema.isEmpty())) { issues.add( context.createConfigIssue( DataFormat.AVRO.name(), "dataFormatConfig.avroSchema", ParserErrors.PARSER_07, dataFormatConfig.avroSchema)); } break; default: issues.add( context.createConfigIssue( parentName, "dataFormat", ParserErrors.PARSER_05, dataFormat)); } DataParserFactoryBuilder builder = new DataParserFactoryBuilder(context, dataFormat.getParserFormat()) .setCharset(Charset.defaultCharset()); if (dataFormatConfig.charset == null) { messageCharset = StandardCharsets.UTF_8; } else { try { messageCharset = Charset.forName(dataFormatConfig.charset); } catch (UnsupportedCharsetException ex) { // setting it to a valid one so the parser factory can be configured and tested for more // errors messageCharset = StandardCharsets.UTF_8; issues.add( context.createConfigIssue( parentName, "charset", ParserErrors.PARSER_01, dataFormatConfig.charset)); } } builder.setCharset(messageCharset).setRemoveCtrlChars(dataFormatConfig.removeCtrlChars); switch (dataFormat) { case TEXT: builder.setMaxDataLen(dataFormatConfig.textMaxLineLen); break; case JSON: builder.setMode(dataFormatConfig.jsonContent); builder.setMaxDataLen(dataFormatConfig.jsonMaxObjectLen); break; case DELIMITED: builder .setMaxDataLen(dataFormatConfig.csvMaxObjectLen) .setMode(dataFormatConfig.csvFileFormat) .setMode(dataFormatConfig.csvHeader) .setMode(dataFormatConfig.csvRecordType) .setConfig( DelimitedDataParserFactory.DELIMITER_CONFIG, dataFormatConfig.csvCustomDelimiter) .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, dataFormatConfig.csvCustomEscape) .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, dataFormatConfig.csvCustomQuote); break; case XML: builder.setMaxDataLen(dataFormatConfig.xmlMaxObjectLen); builder.setConfig( XmlDataParserFactory.RECORD_ELEMENT_KEY, dataFormatConfig.xmlRecordElement); break; case SDC_JSON: builder.setMaxDataLen(-1); break; case LOG: logDataFormatValidator.populateBuilder(builder); break; case AVRO: builder .setMaxDataLen(Integer.MAX_VALUE) .setConfig(AvroDataParserFactory.SCHEMA_KEY, dataFormatConfig.avroSchema) .setConfig( AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, dataFormatConfig.schemaInMessage); break; default: throw new IllegalStateException("Unknown data format: " + dataFormat); } parserFactory = builder.build(); return issues; }