private void validateParserFactoryConfigs(List<ConfigIssue> issues) {
  DataParserFactoryBuilder builder =
      new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat())
          .setCharset(Charset.defaultCharset());

  // TextInputFormat supports Hadoop Text class which is Standard UTF-8
  builder.setCharset(StandardCharsets.UTF_8);

  switch (dataFormat) {
    case DELIMITED:
      builder
          .setMaxDataLen(csvMaxObjectLen)
          .setMode(csvFileFormat)
          .setMode((csvHeader == CsvHeader.IGNORE_HEADER) ? CsvHeader.NO_HEADER : csvHeader)
          .setMode(csvRecordType)
          .setConfig(DelimitedDataParserFactory.DELIMITER_CONFIG, csvCustomDelimiter)
          .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, csvCustomEscape)
          .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, csvCustomQuote);
      break;
    case TEXT:
      builder.setMaxDataLen(textMaxLineLen);
      break;
    case JSON:
      builder.setMode(JsonMode.MULTIPLE_OBJECTS);
      builder.setMaxDataLen(jsonMaxObjectLen);
      break;
    case LOG:
      logDataFormatValidator.populateBuilder(builder);
      break;
    case AVRO:
      builder
          .setMaxDataLen(Integer.MAX_VALUE)
          .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema)
          .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, true);
      break;
  }
  parserFactory = builder.build();
}
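// --- Illustrative sketch (not part of the original stage) ---
// Why the charset is forced to UTF-8 above: Hadoop's TextInputFormat produces
// org.apache.hadoop.io.Text values, which are UTF-8 by definition, so the bytes must be
// decoded as UTF-8 regardless of any configured charset. A minimal illustration; the
// helper name is hypothetical, the Text accessors are the standard Hadoop API.
private static String textToString(org.apache.hadoop.io.Text value) {
  // Text.getBytes() returns the backing array, which may be longer than the valid length
  return new String(value.getBytes(), 0, value.getLength(), StandardCharsets.UTF_8);
}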
private void validateParserFactoryConfigs(List<ConfigIssue> issues) {
  DataParserFactoryBuilder builder =
      new DataParserFactoryBuilder(getContext(), dataFormat.getParserFormat())
          .setCharset(Charset.defaultCharset());

  if (charset == null) {
    messageCharset = StandardCharsets.UTF_8;
  } else {
    try {
      messageCharset = Charset.forName(charset);
    } catch (UnsupportedCharsetException ex) {
      // setting it to a valid one so the parser factory can be configured and tested for more errors
      messageCharset = StandardCharsets.UTF_8;
      issues.add(
          getContext()
              .createConfigIssue(Groups.KAFKA.name(), "charset", KafkaErrors.KAFKA_08, charset));
    }
  }
  builder.setCharset(messageCharset).setRemoveCtrlChars(removeCtrlChars);

  switch (dataFormat) {
    case TEXT:
      builder.setMaxDataLen(textMaxLineLen);
      break;
    case JSON:
      builder.setMode(jsonContent);
      builder.setMaxDataLen(jsonMaxObjectLen);
      break;
    case DELIMITED:
      builder
          .setMaxDataLen(csvMaxObjectLen)
          .setMode(csvFileFormat)
          .setMode(csvHeader)
          .setMode(csvRecordType)
          .setConfig(DelimitedDataConstants.DELIMITER_CONFIG, csvCustomDelimiter)
          .setConfig(DelimitedDataConstants.ESCAPE_CONFIG, csvCustomEscape)
          .setConfig(DelimitedDataConstants.QUOTE_CONFIG, csvCustomQuote);
      break;
    case XML:
      builder.setMaxDataLen(xmlMaxObjectLen);
      builder.setConfig(XmlDataParserFactory.RECORD_ELEMENT_KEY, xmlRecordElement);
      break;
    case SDC_JSON:
      builder.setMaxDataLen(-1);
      break;
    case LOG:
      logDataFormatValidator.populateBuilder(builder);
      parserFactory = builder.build();
      break;
    case AVRO:
      builder
          .setMaxDataLen(Integer.MAX_VALUE)
          .setConfig(AvroDataParserFactory.SCHEMA_KEY, avroSchema)
          .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, messageHasSchema);
      break;
    case BINARY:
      builder.setMaxDataLen(binaryMaxObjectLen);
      break;
  }
  parserFactory = builder.build();
}
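// --- Illustrative sketch (not part of the original stage) ---
// The charset handling above follows a "fail soft" pattern: fall back to a known-good
// charset so the parser factory can still be built and the rest of the configuration can
// be validated, while the bad value is recorded as a ConfigIssue. A minimal standalone
// sketch of that pattern; the helper name and the reportIssue callback are hypothetical,
// only the java.nio.charset calls are real APIs.
private static Charset resolveCharset(String name, java.util.function.Consumer<String> reportIssue) {
  if (name == null) {
    return StandardCharsets.UTF_8;
  }
  try {
    return Charset.forName(name);
  } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
    reportIssue.accept(name); // record the invalid name, keep validating with a safe default
    return StandardCharsets.UTF_8;
  }
}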
@Override
protected List<ConfigIssue> init() {
  List<ConfigIssue> issues = new ArrayList<>();

  if (topic == null || topic.isEmpty()) {
    issues.add(getContext().createConfigIssue(Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_05));
  }
  // maxWaitTime
  if (maxWaitTime < 1) {
    issues.add(
        getContext().createConfigIssue(Groups.KAFKA.name(), "maxWaitTime", KafkaErrors.KAFKA_35));
  }

  switch (dataFormat) {
    case JSON:
      if (jsonMaxObjectLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.JSON.name(), "maxJsonObjectLen", KafkaErrors.KAFKA_38));
      }
      break;
    case TEXT:
      if (textMaxLineLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.TEXT.name(), "maxLogLineLength", KafkaErrors.KAFKA_38));
      }
      break;
    case DELIMITED:
      if (csvMaxObjectLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen", KafkaErrors.KAFKA_38));
      }
      break;
    case XML:
      if (produceSingleRecordPerMessage) {
        issues.add(
            getContext()
                .createConfigIssue(
                    Groups.KAFKA.name(), "produceSingleRecordPerMessage", KafkaErrors.KAFKA_40));
      }
      if (xmlMaxObjectLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.XML.name(), "maxXmlObjectLen", KafkaErrors.KAFKA_38));
      }
      if (xmlRecordElement != null
          && !xmlRecordElement.isEmpty()
          && !XMLChar.isValidName(xmlRecordElement)) {
        issues.add(
            getContext()
                .createConfigIssue(
                    Groups.XML.name(), "xmlRecordElement", KafkaErrors.KAFKA_36, xmlRecordElement));
      }
      break;
    case SDC_JSON:
    case BINARY:
      break;
    case LOG:
      logDataFormatValidator =
          new LogDataFormatValidator(
              logMode,
              logMaxObjectLen,
              logRetainOriginalLine,
              customLogFormat,
              regex,
              grokPatternDefinition,
              grokPattern,
              enableLog4jCustomLogFormat,
              log4jCustomLogFormat,
              onParseError,
              maxStackTraceLines,
              Groups.LOG.name(),
              getFieldPathToGroupMap(fieldPathsToGroupName));
      logDataFormatValidator.validateLogFormatConfig(issues, getContext());
      break;
    case AVRO:
      if (!messageHasSchema && (avroSchema == null || avroSchema.isEmpty())) {
        issues.add(
            getContext()
                .createConfigIssue(
                    Groups.AVRO.name(), "avroSchema", KafkaErrors.KAFKA_43, avroSchema));
      }
      break;
    default:
      issues.add(
          getContext()
              .createConfigIssue(Groups.KAFKA.name(), "dataFormat", KafkaErrors.KAFKA_39, dataFormat));
  }

  validateParserFactoryConfigs(issues);

  // Validate broker config
  try {
    int partitionCount = KafkaUtil.getPartitionCount(metadataBrokerList, topic, 3, 1000);
    if (partitionCount < 1) {
      issues.add(
          getContext().createConfigIssue(Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_42, topic));
    } else {
      // cache the partition count as parallelism for future use
      originParallelism = partitionCount;
    }
  } catch (IOException e) {
    issues.add(
        getContext()
            .createConfigIssue(
                Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_41, topic, e.toString(), e));
  }

  // Validate zookeeper config
  List<KafkaBroker> kafkaBrokers =
      KafkaUtil.validateZkConnectionString(
          issues, zookeeperConnect, Groups.KAFKA.name(), "zookeeperConnect", getContext());

  // validate connecting to kafka
  if (kafkaBrokers != null && !kafkaBrokers.isEmpty() && topic != null && !topic.isEmpty()) {
    kafkaConsumer =
        new KafkaConsumer(
            zookeeperConnect,
            topic,
            consumerGroup,
            maxBatchSize,
            maxWaitTime,
            kafkaConsumerConfigs,
            getContext());
    kafkaConsumer.validate(issues, getContext());
  }

  // consumerGroup
  if (consumerGroup == null || consumerGroup.isEmpty()) {
    issues.add(
        getContext().createConfigIssue(Groups.KAFKA.name(), "consumerGroup", KafkaErrors.KAFKA_33));
  }
  return issues;
}
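// --- Illustrative sketch (not part of the original stage) ---
// The XML branch above rejects record element names that are not valid XML names.
// A small standalone illustration of that check using the same XMLChar utility the code
// relies on; the main() wrapper and the sample names are only for demonstration.
public static void main(String[] args) {
  for (String element : new String[] {"record", "ns:record", "1record", ""}) {
    boolean valid = !element.isEmpty() && XMLChar.isValidName(element);
    System.out.println(element + " -> " + (valid ? "valid" : "rejected"));
  }
}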
@Override
public List<ConfigIssue> init() {
  List<ConfigIssue> issues = super.init();
  validateHadoopFS(issues);
  // This is for getting no of splits - no of executors
  hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
  hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
  for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
    hadoopConf.set(config.getKey(), config.getValue());
  }

  List<Path> hdfsDirPaths = new ArrayList<>();
  if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
    issues.add(
        getContext()
            .createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations", Errors.HADOOPFS_18));
  } else if (issues.isEmpty()) {
    for (String hdfsDirLocation : hdfsDirLocations) {
      try {
        FileSystem fs = getFileSystemForInitDestroy();
        Path ph = fs.makeQualified(new Path(hdfsDirLocation));
        hdfsDirPaths.add(ph);
        if (!fs.exists(ph)) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.HADOOP_FS.name(),
                      "hdfsDirLocations",
                      Errors.HADOOPFS_10,
                      hdfsDirLocation));
        } else if (!fs.getFileStatus(ph).isDirectory()) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.HADOOP_FS.name(),
                      "hdfsDirLocations",
                      Errors.HADOOPFS_15,
                      hdfsDirLocation));
        } else {
          try {
            FileStatus[] files = fs.listStatus(ph);
            if (files == null || files.length == 0) {
              issues.add(
                  getContext()
                      .createConfigIssue(
                          Groups.HADOOP_FS.name(),
                          "hdfsDirLocations",
                          Errors.HADOOPFS_16,
                          hdfsDirLocation));
            } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
              for (FileStatus fileStatus : files) {
                if (fileStatus.isFile()) {
                  String path = fileStatus.getPath().toString();
                  try {
                    List<Map.Entry> buffer;
                    if (dataFormat == DataFormat.AVRO) {
                      buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                    } else {
                      buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                    }
                    for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                      Map.Entry entry = buffer.get(i);
                      previewBuffer.put(
                          String.valueOf(entry.getKey()),
                          entry.getValue() == null ? null : entry.getValue());
                    }
                  } catch (IOException | InterruptedException ex) {
                    String msg = "Error opening " + path + ": " + ex;
                    LOG.info(msg, ex);
                    issues.add(
                        getContext()
                            .createConfigIssue(
                                Groups.HADOOP_FS.name(),
                                "hdfsDirLocations",
                                Errors.HADOOPFS_16,
                                fileStatus.getPath()));
                  }
                }
              }
            }
          } catch (IOException ex) {
            issues.add(
                getContext()
                    .createConfigIssue(
                        Groups.HADOOP_FS.name(),
                        "hdfsDirLocations",
                        Errors.HADOOPFS_09,
                        hdfsDirLocation,
                        ex.toString(),
                        ex));
          }
        }
      } catch (IOException ioe) {
        LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
        issues.add(
            getContext()
                .createConfigIssue(
                    Groups.HADOOP_FS.name(),
                    "hdfsDirLocations",
                    Errors.HADOOPFS_11,
                    hdfsDirLocation,
                    ioe.toString(),
                    ioe));
      }
    }
  }
  hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
  hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));

  switch (dataFormat) {
    case JSON:
      if (jsonMaxObjectLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
      }
      break;
    case TEXT:
      if (textMaxLineLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
      }
      break;
    case LOG:
      logDataFormatValidator =
          new LogDataFormatValidator(
              logMode,
              logMaxObjectLen,
              retainOriginalLine,
              customLogFormat,
              regex,
              grokPatternDefinition,
              grokPattern,
              enableLog4jCustomLogFormat,
              log4jCustomLogFormat,
              OnParseError.ERROR,
              0,
              Groups.LOG.name(),
              getFieldPathToGroupMap(fieldPathsToGroupName));
      logDataFormatValidator.validateLogFormatConfig(issues, getContext());
      break;
    case DELIMITED:
      if (csvMaxObjectLen < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen", Errors.HADOOPFS_30));
      }
      break;
    case AVRO:
      if (avroSchema != null && !avroSchema.isEmpty()) {
        hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
        hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
      }
      break;
    default:
      issues.add(
          getContext()
              .createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
  }
  validateParserFactoryConfigs(issues);
  LOG.info("Issues: " + issues);
  return issues;
}
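// --- Illustrative sketch (not part of the original stage) ---
// A condensed, standalone version of the directory checks performed above: qualify the
// location, verify it exists, is a directory, and is non-empty. The helper name and the
// boolean return are hypothetical; the FileSystem/Path/FileStatus calls are the standard
// Hadoop APIs already used by the code above.
private static boolean isReadableNonEmptyDir(FileSystem fs, String location) throws IOException {
  Path path = fs.makeQualified(new Path(location));
  if (!fs.exists(path) || !fs.getFileStatus(path).isDirectory()) {
    return false;
  }
  FileStatus[] children = fs.listStatus(path);
  return children != null && children.length > 0;
}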
public List<Stage.ConfigIssue> init(Source.Context context) {
  List<Stage.ConfigIssue> issues = new ArrayList<>();

  switch (dataFormat) {
    case JSON:
      if (dataFormatConfig.jsonMaxObjectLen < 1) {
        issues.add(
            context.createConfigIssue(
                DataFormat.JSON.name(), "dataFormatConfig.maxJsonObjectLen", ParserErrors.PARSER_04));
      }
      break;
    case TEXT:
      if (dataFormatConfig.textMaxLineLen < 1) {
        issues.add(
            context.createConfigIssue(
                DataFormat.TEXT.name(), "dataFormatConfig.maxLogLineLength", ParserErrors.PARSER_04));
      }
      break;
    case DELIMITED:
      if (dataFormatConfig.csvMaxObjectLen < 1) {
        issues.add(
            context.createConfigIssue(
                DataFormat.DELIMITED.name(),
                "dataFormatConfig.csvMaxObjectLen",
                ParserErrors.PARSER_04));
      }
      break;
    case XML:
      if (messageConfig != null && messageConfig.produceSingleRecordPerMessage) {
        issues.add(
            context.createConfigIssue(
                parentName, "messageConfig.produceSingleRecordPerMessage", ParserErrors.PARSER_06));
      }
      if (dataFormatConfig.xmlMaxObjectLen < 1) {
        issues.add(
            context.createConfigIssue(
                DataFormat.XML.name(), "dataFormatConfig.maxXmlObjectLen", ParserErrors.PARSER_04));
      }
      if (dataFormatConfig.xmlRecordElement != null
          && !dataFormatConfig.xmlRecordElement.isEmpty()
          && !XMLChar.isValidName(dataFormatConfig.xmlRecordElement)) {
        issues.add(
            context.createConfigIssue(
                DataFormat.XML.name(),
                "dataFormatConfig.xmlRecordElement",
                ParserErrors.PARSER_02,
                dataFormatConfig.xmlRecordElement));
      }
      break;
    case SDC_JSON:
      break;
    case LOG:
      logDataFormatValidator =
          new LogDataFormatValidator(
              dataFormatConfig.logMode,
              dataFormatConfig.logMaxObjectLen,
              dataFormatConfig.retainOriginalLine,
              dataFormatConfig.customLogFormat,
              dataFormatConfig.regex,
              dataFormatConfig.grokPatternDefinition,
              dataFormatConfig.grokPattern,
              dataFormatConfig.enableLog4jCustomLogFormat,
              dataFormatConfig.log4jCustomLogFormat,
              dataFormatConfig.onParseError,
              dataFormatConfig.maxStackTraceLines,
              DataFormat.LOG.name(),
              getFieldPathToGroupMap(dataFormatConfig.fieldPathsToGroupName));
      logDataFormatValidator.validateLogFormatConfig(issues, context);
      break;
    case AVRO:
      if (!dataFormatConfig.schemaInMessage
          && (dataFormatConfig.avroSchema == null || dataFormatConfig.avroSchema.isEmpty())) {
        issues.add(
            context.createConfigIssue(
                DataFormat.AVRO.name(),
                "dataFormatConfig.avroSchema",
                ParserErrors.PARSER_07,
                dataFormatConfig.avroSchema));
      }
      break;
    default:
      issues.add(
          context.createConfigIssue(parentName, "dataFormat", ParserErrors.PARSER_05, dataFormat));
  }

  DataParserFactoryBuilder builder =
      new DataParserFactoryBuilder(context, dataFormat.getParserFormat())
          .setCharset(Charset.defaultCharset());

  if (dataFormatConfig.charset == null) {
    messageCharset = StandardCharsets.UTF_8;
  } else {
    try {
      messageCharset = Charset.forName(dataFormatConfig.charset);
    } catch (UnsupportedCharsetException ex) {
      // setting it to a valid one so the parser factory can be configured and tested for more errors
      messageCharset = StandardCharsets.UTF_8;
      issues.add(
          context.createConfigIssue(
              parentName, "charset", ParserErrors.PARSER_01, dataFormatConfig.charset));
    }
  }
  builder.setCharset(messageCharset).setRemoveCtrlChars(dataFormatConfig.removeCtrlChars);

  switch (dataFormat) {
    case TEXT:
      builder.setMaxDataLen(dataFormatConfig.textMaxLineLen);
      break;
    case JSON:
      builder.setMode(dataFormatConfig.jsonContent);
      builder.setMaxDataLen(dataFormatConfig.jsonMaxObjectLen);
      break;
    case DELIMITED:
      builder
          .setMaxDataLen(dataFormatConfig.csvMaxObjectLen)
          .setMode(dataFormatConfig.csvFileFormat)
          .setMode(dataFormatConfig.csvHeader)
          .setMode(dataFormatConfig.csvRecordType)
          .setConfig(DelimitedDataParserFactory.DELIMITER_CONFIG, dataFormatConfig.csvCustomDelimiter)
          .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, dataFormatConfig.csvCustomEscape)
          .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, dataFormatConfig.csvCustomQuote);
      break;
    case XML:
      builder.setMaxDataLen(dataFormatConfig.xmlMaxObjectLen);
      builder.setConfig(XmlDataParserFactory.RECORD_ELEMENT_KEY, dataFormatConfig.xmlRecordElement);
      break;
    case SDC_JSON:
      builder.setMaxDataLen(-1);
      break;
    case LOG:
      logDataFormatValidator.populateBuilder(builder);
      break;
    case AVRO:
      builder
          .setMaxDataLen(Integer.MAX_VALUE)
          .setConfig(AvroDataParserFactory.SCHEMA_KEY, dataFormatConfig.avroSchema)
          .setConfig(AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, dataFormatConfig.schemaInMessage);
      break;
    default:
      throw new IllegalStateException("Unknown data format: " + dataFormat);
  }
  parserFactory = builder.build();
  return issues;
}
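// --- Illustrative sketch (not part of the original stage) ---
// Rough idea of how a built DataParserFactory is typically consumed: obtain a DataParser
// for a message id and payload, then call parse() until it returns null. The exact
// getParser/parse signatures are assumptions based on how these factories are used
// elsewhere in this codebase, not a verified API reference.
private List<Record> parsePayload(String messageId, byte[] payload) throws Exception {
  List<Record> records = new ArrayList<>();
  try (DataParser parser = parserFactory.getParser(messageId, payload)) { // assumed overload
    Record record = parser.parse();
    while (record != null) {
      records.add(record);
      record = parser.parse();
    }
  }
  return records;
}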