Example #1
 @Override
 public List<ConfigIssue> init() {
   List<ConfigIssue> issues = super.init();
   validateHadoopFS(issues);
    // These settings determine the number of splits, and hence the number of executors
   hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
   hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
   for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
     hadoopConf.set(config.getKey(), config.getValue());
   }
   List<Path> hdfsDirPaths = new ArrayList<>();
   if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
     issues.add(
         getContext()
             .createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations", Errors.HADOOPFS_18));
   } else if (issues.isEmpty()) {
     for (String hdfsDirLocation : hdfsDirLocations) {
       try {
         FileSystem fs = getFileSystemForInitDestroy();
         Path ph = fs.makeQualified(new Path(hdfsDirLocation));
         hdfsDirPaths.add(ph);
         if (!fs.exists(ph)) {
           issues.add(
               getContext()
                   .createConfigIssue(
                       Groups.HADOOP_FS.name(),
                       "hdfsDirLocations",
                       Errors.HADOOPFS_10,
                       hdfsDirLocation));
         } else if (!fs.getFileStatus(ph).isDirectory()) {
           issues.add(
               getContext()
                   .createConfigIssue(
                       Groups.HADOOP_FS.name(),
                       "hdfsDirLocations",
                       Errors.HADOOPFS_15,
                       hdfsDirLocation));
         } else {
           try {
             FileStatus[] files = fs.listStatus(ph);
             if (files == null || files.length == 0) {
               issues.add(
                   getContext()
                       .createConfigIssue(
                           Groups.HADOOP_FS.name(),
                           "hdfsDirLocations",
                           Errors.HADOOPFS_16,
                           hdfsDirLocation));
             } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
               for (FileStatus fileStatus : files) {
                 if (fileStatus.isFile()) {
                   String path = fileStatus.getPath().toString();
                   try {
                     List<Map.Entry> buffer;
                     if (dataFormat == DataFormat.AVRO) {
                       buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                     } else {
                       buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                     }
                     for (int i = 0;
                         i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE;
                         i++) {
                       Map.Entry entry = buffer.get(i);
                        previewBuffer.put(String.valueOf(entry.getKey()), entry.getValue());
                     }
                   } catch (IOException | InterruptedException ex) {
                     String msg = "Error opening " + path + ": " + ex;
                     LOG.info(msg, ex);
                     issues.add(
                         getContext()
                             .createConfigIssue(
                                 Groups.HADOOP_FS.name(),
                                 "hdfsDirLocations",
                                 Errors.HADOOPFS_16,
                                 fileStatus.getPath()));
                   }
                 }
               }
             }
           } catch (IOException ex) {
             issues.add(
                 getContext()
                     .createConfigIssue(
                         Groups.HADOOP_FS.name(),
                         "hdfsDirLocations",
                         Errors.HADOOPFS_09,
                         hdfsDirLocation,
                         ex.toString(),
                         ex));
           }
         }
       } catch (IOException ioe) {
         LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
         issues.add(
             getContext()
                 .createConfigIssue(
                     Groups.HADOOP_FS.name(),
                     "hdfsDirLocations",
                     Errors.HADOOPFS_11,
                     hdfsDirLocation,
                     ioe.toString(),
                     ioe));
       }
     }
   }
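    // Register the qualified input directories with the job and honor the recursive flag.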
   hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
   hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
   switch (dataFormat) {
     case JSON:
       if (jsonMaxObjectLen < 1) {
         issues.add(
             getContext()
                 .createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
       }
       break;
     case TEXT:
       if (textMaxLineLen < 1) {
         issues.add(
             getContext()
                 .createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
       }
       break;
     case LOG:
       logDataFormatValidator =
           new LogDataFormatValidator(
               logMode,
               logMaxObjectLen,
               retainOriginalLine,
               customLogFormat,
               regex,
               grokPatternDefinition,
               grokPattern,
               enableLog4jCustomLogFormat,
               log4jCustomLogFormat,
               OnParseError.ERROR,
               0,
               Groups.LOG.name(),
               getFieldPathToGroupMap(fieldPathsToGroupName));
       logDataFormatValidator.validateLogFormatConfig(issues, getContext());
       break;
     case DELIMITED:
       if (csvMaxObjectLen < 1) {
         issues.add(
             getContext()
                 .createConfigIssue(
                     Groups.DELIMITED.name(), "csvMaxObjectLen", Errors.HADOOPFS_30));
       }
       break;
     case AVRO:
       if (avroSchema != null && !avroSchema.isEmpty()) {
         hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
         hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
       }
       break;
     default:
       issues.add(
           getContext()
               .createConfigIssue(
                   Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
   }
   validateParserFactoryConfigs(issues);
   LOG.info("Issues: " + issues);
   return issues;
 }
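The idiom throughout this example is validate-and-collect: init() never throws on bad configuration, it appends one ConfigIssue per failed check and returns the whole list, so a single pass can report every misconfiguration at once. Below is a minimal self-contained sketch of that shape; ConfigIssue, hdfsUri, and dirLocations are illustrative stand-ins, not the StreamSets API.

import java.util.ArrayList;
import java.util.List;

public class InitValidationSketch {
  // Hypothetical stand-in for the SDK's ConfigIssue.
  record ConfigIssue(String group, String config, String message) {}

  private String hdfsUri;            // assumed config field, e.g. "hdfs://namenode:8020"
  private List<String> dirLocations; // assumed config field

  public List<ConfigIssue> init() {
    List<ConfigIssue> issues = new ArrayList<>();
    // Append an issue per failed check instead of throwing, so one pass
    // surfaces every misconfiguration at once.
    if (hdfsUri == null || hdfsUri.isEmpty()) {
      issues.add(new ConfigIssue("HADOOP_FS", "hdfsUri", "a filesystem URI is required"));
    }
    if (dirLocations == null || dirLocations.isEmpty()) {
      issues.add(
          new ConfigIssue("HADOOP_FS", "hdfsDirLocations", "at least one directory is required"));
    }
    return issues; // an empty list means the stage may start
  }
}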
Example #2
  @Override
  protected List<ConfigIssue> init() {
    List<ConfigIssue> issues = new ArrayList<>();
    if (topic == null || topic.isEmpty()) {
      issues.add(
          getContext().createConfigIssue(Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_05));
    }
    // maxWaitTime must be positive
    if (maxWaitTime < 1) {
      issues.add(
          getContext().createConfigIssue(Groups.KAFKA.name(), "maxWaitTime", KafkaErrors.KAFKA_35));
    }

    switch (dataFormat) {
      case JSON:
        if (jsonMaxObjectLen < 1) {
          issues.add(
              getContext()
                  .createConfigIssue(Groups.JSON.name(), "maxJsonObjectLen", KafkaErrors.KAFKA_38));
        }
        break;
      case TEXT:
        if (textMaxLineLen < 1) {
          issues.add(
              getContext()
                  .createConfigIssue(Groups.TEXT.name(), "maxLogLineLength", KafkaErrors.KAFKA_38));
        }
        break;
      case DELIMITED:
        if (csvMaxObjectLen < 1) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.DELIMITED.name(), "csvMaxObjectLen", KafkaErrors.KAFKA_38));
        }
        break;
      case XML:
        if (produceSingleRecordPerMessage) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.KAFKA.name(), "produceSingleRecordPerMessage", KafkaErrors.KAFKA_40));
        }
        if (xmlMaxObjectLen < 1) {
          issues.add(
              getContext()
                  .createConfigIssue(Groups.XML.name(), "maxXmlObjectLen", KafkaErrors.KAFKA_38));
        }
        if (xmlRecordElement != null
            && !xmlRecordElement.isEmpty()
            && !XMLChar.isValidName(xmlRecordElement)) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.XML.name(),
                      "xmlRecordElement",
                      KafkaErrors.KAFKA_36,
                      xmlRecordElement));
        }
        break;
      case SDC_JSON:
      case BINARY:
        break;
      case LOG:
        logDataFormatValidator =
            new LogDataFormatValidator(
                logMode,
                logMaxObjectLen,
                logRetainOriginalLine,
                customLogFormat,
                regex,
                grokPatternDefinition,
                grokPattern,
                enableLog4jCustomLogFormat,
                log4jCustomLogFormat,
                onParseError,
                maxStackTraceLines,
                Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
      case AVRO:
        if (!messageHasSchema && (avroSchema == null || avroSchema.isEmpty())) {
          issues.add(
              getContext()
                  .createConfigIssue(
                      Groups.AVRO.name(), "avroSchema", KafkaErrors.KAFKA_43, avroSchema));
        }
        break;
      default:
        issues.add(
            getContext()
                .createConfigIssue(
                    Groups.KAFKA.name(), "dataFormat", KafkaErrors.KAFKA_39, dataFormat));
    }

    validateParserFactoryConfigs(issues);

    // Validate broker config
    try {
      int partitionCount = KafkaUtil.getPartitionCount(metadataBrokerList, topic, 3, 1000);
      if (partitionCount < 1) {
        issues.add(
            getContext()
                .createConfigIssue(Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_42, topic));
      } else {
        // cache the partition count as parallelism for future use
        originParallelism = partitionCount;
      }
    } catch (IOException e) {
      issues.add(
          getContext()
              .createConfigIssue(
                  Groups.KAFKA.name(), "topic", KafkaErrors.KAFKA_41, topic, e.toString(), e));
    }

    // Validate zookeeper config
    List<KafkaBroker> kafkaBrokers =
        KafkaUtil.validateZkConnectionString(
            issues, zookeeperConnect, Groups.KAFKA.name(), "zookeeperConnect", getContext());

    // Validate connectivity to Kafka
    if (kafkaBrokers != null && !kafkaBrokers.isEmpty() && topic != null && !topic.isEmpty()) {
      kafkaConsumer =
          new KafkaConsumer(
              zookeeperConnect,
              topic,
              consumerGroup,
              maxBatchSize,
              maxWaitTime,
              kafkaConsumerConfigs,
              getContext());
      kafkaConsumer.validate(issues, getContext());
    }

    // consumerGroup is required
    if (consumerGroup == null || consumerGroup.isEmpty()) {
      issues.add(
          getContext()
              .createConfigIssue(Groups.KAFKA.name(), "consumerGroup", KafkaErrors.KAFKA_33));
    }
    return issues;
  }
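The broker check above calls KafkaUtil.getPartitionCount(metadataBrokerList, topic, 3, 1000), which reads like a bounded retry (three attempts, 1000 ms apart) whose persistent failure surfaces as the IOException handled in the catch block. Here is a hedged sketch of that retry shape, under the assumption that the last two arguments are retries and backoff; RetryingLookup and fetchOnce are illustrative names, not KafkaUtil's API.

import java.io.IOException;
import java.util.concurrent.Callable;

public final class RetryingLookup {
  // Retry a metadata fetch up to maxRetries extra times, sleeping backoffMs between
  // attempts, and convert persistent failure into the IOException callers handle.
  public static int fetchPartitionCount(Callable<Integer> fetchOnce, int maxRetries, long backoffMs)
      throws IOException {
    Exception last = null;
    for (int attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return fetchOnce.call();
      } catch (Exception e) {
        last = e;
        try {
          Thread.sleep(backoffMs);
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
          break;
        }
      }
    }
    throw new IOException("could not fetch partition count", last);
  }
}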
Example #3
  public List<Stage.ConfigIssue> init(Source.Context context) {
    List<Stage.ConfigIssue> issues = new ArrayList<>();
    switch (dataFormat) {
      case JSON:
        if (dataFormatConfig.jsonMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.JSON.name(),
                  "dataFormatConfig.maxJsonObjectLen",
                  ParserErrors.PARSER_04));
        }
        break;
      case TEXT:
        if (dataFormatConfig.textMaxLineLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.TEXT.name(),
                  "dataFormatConfig.maxLogLineLength",
                  ParserErrors.PARSER_04));
        }
        break;
      case DELIMITED:
        if (dataFormatConfig.csvMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.DELIMITED.name(),
                  "dataFormatConfig.csvMaxObjectLen",
                  ParserErrors.PARSER_04));
        }
        break;
      case XML:
        if (messageConfig != null && messageConfig.produceSingleRecordPerMessage) {
          issues.add(
              context.createConfigIssue(
                  parentName,
                  "messageConfig.produceSingleRecordPerMessage",
                  ParserErrors.PARSER_06));
        }
        if (dataFormatConfig.xmlMaxObjectLen < 1) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.XML.name(),
                  "dataFormatConfig.maxXmlObjectLen",
                  ParserErrors.PARSER_04));
        }
        if (dataFormatConfig.xmlRecordElement != null
            && !dataFormatConfig.xmlRecordElement.isEmpty()
            && !XMLChar.isValidName(dataFormatConfig.xmlRecordElement)) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.XML.name(),
                  "dataFormatConfig.xmlRecordElement",
                  ParserErrors.PARSER_02,
                  dataFormatConfig.xmlRecordElement));
        }
        break;
      case SDC_JSON:
        break;
      case LOG:
        logDataFormatValidator =
            new LogDataFormatValidator(
                dataFormatConfig.logMode,
                dataFormatConfig.logMaxObjectLen,
                dataFormatConfig.retainOriginalLine,
                dataFormatConfig.customLogFormat,
                dataFormatConfig.regex,
                dataFormatConfig.grokPatternDefinition,
                dataFormatConfig.grokPattern,
                dataFormatConfig.enableLog4jCustomLogFormat,
                dataFormatConfig.log4jCustomLogFormat,
                dataFormatConfig.onParseError,
                dataFormatConfig.maxStackTraceLines,
                DataFormat.LOG.name(),
                getFieldPathToGroupMap(dataFormatConfig.fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, context);
        break;
      case AVRO:
        if (!dataFormatConfig.schemaInMessage
            && (dataFormatConfig.avroSchema == null || dataFormatConfig.avroSchema.isEmpty())) {
          issues.add(
              context.createConfigIssue(
                  DataFormat.AVRO.name(),
                  "dataFormatConfig.avroSchema",
                  ParserErrors.PARSER_07,
                  dataFormatConfig.avroSchema));
        }
        break;
      default:
        issues.add(
            context.createConfigIssue(
                parentName, "dataFormat", ParserErrors.PARSER_05, dataFormat));
    }

    DataParserFactoryBuilder builder =
        new DataParserFactoryBuilder(context, dataFormat.getParserFormat());
    if (dataFormatConfig.charset == null) {
      messageCharset = StandardCharsets.UTF_8;
    } else {
      try {
        messageCharset = Charset.forName(dataFormatConfig.charset);
      } catch (IllegalCharsetNameException | UnsupportedCharsetException ex) {
        // Fall back to a valid charset so the parser factory can still be configured
        // and checked for further errors.
        messageCharset = StandardCharsets.UTF_8;
        issues.add(
            context.createConfigIssue(
                parentName, "charset", ParserErrors.PARSER_01, dataFormatConfig.charset));
      }
    }
    builder.setCharset(messageCharset).setRemoveCtrlChars(dataFormatConfig.removeCtrlChars);

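    // Second pass: configure the parser factory builder with the per-format limits and options.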
    switch (dataFormat) {
      case TEXT:
        builder.setMaxDataLen(dataFormatConfig.textMaxLineLen);
        break;
      case JSON:
        builder.setMode(dataFormatConfig.jsonContent);
        builder.setMaxDataLen(dataFormatConfig.jsonMaxObjectLen);
        break;
      case DELIMITED:
        builder
            .setMaxDataLen(dataFormatConfig.csvMaxObjectLen)
            .setMode(dataFormatConfig.csvFileFormat)
            .setMode(dataFormatConfig.csvHeader)
            .setMode(dataFormatConfig.csvRecordType)
            .setConfig(
                DelimitedDataParserFactory.DELIMITER_CONFIG, dataFormatConfig.csvCustomDelimiter)
            .setConfig(DelimitedDataParserFactory.ESCAPE_CONFIG, dataFormatConfig.csvCustomEscape)
            .setConfig(DelimitedDataParserFactory.QUOTE_CONFIG, dataFormatConfig.csvCustomQuote);
        break;
      case XML:
        builder.setMaxDataLen(dataFormatConfig.xmlMaxObjectLen);
        builder.setConfig(
            XmlDataParserFactory.RECORD_ELEMENT_KEY, dataFormatConfig.xmlRecordElement);
        break;
      case SDC_JSON:
        builder.setMaxDataLen(-1);
        break;
      case LOG:
        logDataFormatValidator.populateBuilder(builder);
        break;
      case AVRO:
        builder
            .setMaxDataLen(Integer.MAX_VALUE)
            .setConfig(AvroDataParserFactory.SCHEMA_KEY, dataFormatConfig.avroSchema)
            .setConfig(
                AvroDataParserFactory.SCHEMA_IN_MESSAGE_KEY, dataFormatConfig.schemaInMessage);
        break;
      default:
        throw new IllegalStateException("Unknown data format: " + dataFormat);
    }
    parserFactory = builder.build();
    return issues;
  }
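One subtlety in the charset fallback above: Charset.forName can throw IllegalCharsetNameException for a malformed name as well as UnsupportedCharsetException for a well-formed but unavailable one, hence the two-exception catch in the example. Below is a self-contained sketch of the same resolve-or-default logic using plain JDK types; the list of strings stands in for ConfigIssue reporting.

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;

final class CharsetFallback {
  // Resolve a configured charset name, falling back to UTF-8 (and recording an
  // issue) when the name is malformed or unsupported on this JVM.
  static Charset resolveOrUtf8(String name, List<String> issues) {
    if (name == null || name.isEmpty()) {
      return StandardCharsets.UTF_8;
    }
    try {
      return Charset.forName(name);
    } catch (IllegalCharsetNameException | UnsupportedCharsetException ex) {
      issues.add("unsupported or invalid charset: " + name);
      return StandardCharsets.UTF_8; // keep validating with a safe default
    }
  }
}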