private void validateHadoopFS(List<ConfigIssue> issues) {
  boolean validHadoopFsUri = true;
  hadoopConf = getHadoopConfiguration(issues);
  String hdfsUriInConf;
  if (hdfsUri != null && !hdfsUri.isEmpty()) {
    hadoopConf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, hdfsUri);
  } else {
    hdfsUriInConf = hadoopConf.get(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY);
    if (hdfsUriInConf == null) {
      issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsUri", Errors.HADOOPFS_19));
      return;
    } else {
      hdfsUri = hdfsUriInConf;
    }
  }
  if (hdfsUri.contains("://")) {
    try {
      URI uri = new URI(hdfsUri);
      if (!"hdfs".equals(uri.getScheme())) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsUri", Errors.HADOOPFS_12,
            hdfsUri, uri.getScheme()));
        validHadoopFsUri = false;
      } else if (uri.getAuthority() == null) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsUri", Errors.HADOOPFS_13,
            hdfsUri));
        validHadoopFsUri = false;
      }
    } catch (Exception ex) {
      issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsUri", Errors.HADOOPFS_22,
          hdfsUri, ex.getMessage(), ex));
      validHadoopFsUri = false;
    }
  } else {
    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsUri", Errors.HADOOPFS_02, hdfsUri));
    validHadoopFsUri = false;
  }
  StringBuilder logMessage = new StringBuilder();
  try {
    // Force UGI to initialize with the security settings from the stage.
    UserGroupInformation.setConfiguration(hadoopConf);
    Subject subject = Subject.getSubject(AccessController.getContext());
    if (UserGroupInformation.isSecurityEnabled()) {
      loginUgi = UserGroupInformation.getUGIFromSubject(subject);
    } else {
      UserGroupInformation.loginUserFromSubject(subject);
      loginUgi = UserGroupInformation.getLoginUser();
    }
    LOG.info("Subject = {}, Principals = {}, Login UGI = {}", subject,
        subject == null ? "null" : subject.getPrincipals(), loginUgi);
    if (hdfsKerberos) {
      logMessage.append("Using Kerberos");
      if (loginUgi.getAuthenticationMethod() != UserGroupInformation.AuthenticationMethod.KERBEROS) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsKerberos", Errors.HADOOPFS_00,
            loginUgi.getAuthenticationMethod(), UserGroupInformation.AuthenticationMethod.KERBEROS));
      }
    } else {
      logMessage.append("Using Simple");
      hadoopConf.set(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION,
          UserGroupInformation.AuthenticationMethod.SIMPLE.name());
    }
    if (validHadoopFsUri) {
      getUGI().doAs(new PrivilegedExceptionAction<Void>() {
        @Override
        public Void run() throws Exception {
          // Opening the FileSystem in try-with-resources triggers the close,
          // which is enough to verify connectivity.
          try (FileSystem fs = getFileSystemForInitDestroy()) {
            // no-op; open/close is the check
          }
          return null;
        }
      });
    }
  } catch (Exception ex) {
    LOG.info("Error connecting to FileSystem: " + ex, ex);
    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), null, Errors.HADOOPFS_11,
        hdfsUri, String.valueOf(ex), ex));
  }
  LOG.info("Authentication Config: " + logMessage);
}
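// ---------------------------------------------------------------------------
// A minimal, self-contained sketch (not the stage's actual code) of the
// doAs() connectivity check performed at the end of validateHadoopFS(). The
// method name checkFsConnectivity and the explicit Configuration/UGI
// parameters are assumptions introduced for illustration; the stage itself
// uses getUGI() and getFileSystemForInitDestroy().
// ---------------------------------------------------------------------------
private static void checkFsConnectivity(final Configuration hadoopConf, UserGroupInformation ugi)
    throws IOException, InterruptedException {
  ugi.doAs(new PrivilegedExceptionAction<Void>() {
    @Override
    public Void run() throws Exception {
      // newInstance() avoids closing the JVM-wide cached FileSystem that
      // FileSystem.get() would return; open/close surfaces auth and
      // connectivity problems at init time.
      try (FileSystem fs = FileSystem.newInstance(hadoopConf)) {
        fs.getStatus();
      }
      return null;
    }
  });
}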
@Override
public List<ConfigIssue> init() {
  List<ConfigIssue> issues = super.init();
  validateHadoopFS(issues);
  // These settings determine the number of splits, and hence the number of executors.
  hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // per Hive-on-Spark
  hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // per Hive-on-Spark
  for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
    hadoopConf.set(config.getKey(), config.getValue());
  }
  List<Path> hdfsDirPaths = new ArrayList<>();
  if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations", Errors.HADOOPFS_18));
  } else if (issues.isEmpty()) {
    for (String hdfsDirLocation : hdfsDirLocations) {
      try {
        FileSystem fs = getFileSystemForInitDestroy();
        Path ph = fs.makeQualified(new Path(hdfsDirLocation));
        hdfsDirPaths.add(ph);
        if (!fs.exists(ph)) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
              Errors.HADOOPFS_10, hdfsDirLocation));
        } else if (!fs.getFileStatus(ph).isDirectory()) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
              Errors.HADOOPFS_15, hdfsDirLocation));
        } else {
          try {
            FileStatus[] files = fs.listStatus(ph);
            if (files == null || files.length == 0) {
              issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                  Errors.HADOOPFS_16, hdfsDirLocation));
            } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
              for (FileStatus fileStatus : files) {
                if (fileStatus.isFile()) {
                  String path = fileStatus.getPath().toString();
                  try {
                    List<Map.Entry> buffer;
                    if (dataFormat == DataFormat.AVRO) {
                      buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                    } else {
                      buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                    }
                    for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                      Map.Entry entry = buffer.get(i);
                      previewBuffer.put(String.valueOf(entry.getKey()),
                          entry.getValue() == null ? null : entry.getValue());
                    }
                  } catch (IOException | InterruptedException ex) {
                    String msg = "Error opening " + path + ": " + ex;
                    LOG.info(msg, ex);
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_16, fileStatus.getPath()));
                  }
                }
              }
            }
          } catch (IOException ex) {
            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
          }
        }
      } catch (IOException ioe) {
        LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
            Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
      }
    }
  }
  hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
  hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
  switch (dataFormat) {
    case JSON:
      if (jsonMaxObjectLen < 1) {
        issues.add(getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
      }
      break;
    case TEXT:
      if (textMaxLineLen < 1) {
        issues.add(getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
      }
      break;
    case LOG:
      logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
          customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
          log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
          getFieldPathToGroupMap(fieldPathsToGroupName));
      logDataFormatValidator.validateLogFormatConfig(issues, getContext());
      break;
    case DELIMITED:
      if (csvMaxObjectLen < 1) {
        issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
            Errors.HADOOPFS_30));
      }
      break;
    case AVRO:
      if (avroSchema != null && !avroSchema.isEmpty()) {
        hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
        hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
      }
      break;
    default:
      issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
  }
  validateParserFactoryConfigs(issues);
  LOG.info("Issues: " + issues);
  return issues;
}
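// ---------------------------------------------------------------------------
// Hypothetical helper (illustration only, not part of the stage): prints the
// FileInputFormat settings that the init() above writes into hadoopConf,
// using the same constants so the literal key names need not be repeated.
// ---------------------------------------------------------------------------
private static void dumpInputFormatSettings(Configuration hadoopConf) {
  for (String key : new String[] {
      FileInputFormat.LIST_STATUS_NUM_THREADS,
      FileInputFormat.SPLIT_MAXSIZE,
      FileInputFormat.INPUT_DIR,
      FileInputFormat.INPUT_DIR_RECURSIVE}) {
    System.out.println(key + " = " + hadoopConf.get(key));
  }
}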
Configuration getHadoopConfiguration(List<ConfigIssue> issues) {
  Configuration conf = new Configuration();
  if (hdfsKerberos) {
    conf.set(CommonConfigurationKeys.HADOOP_SECURITY_AUTHENTICATION,
        UserGroupInformation.AuthenticationMethod.KERBEROS.name());
    try {
      conf.set(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "hdfs/_HOST@" + KerberosUtil.getDefaultRealm());
    } catch (Exception ex) {
      if (!hdfsConfigs.containsKey(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY)) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), null, Errors.HADOOPFS_28,
            ex.getMessage()));
      }
    }
  }
  if (hadoopConfDir != null && !hadoopConfDir.isEmpty()) {
    File hadoopConfigDir = new File(hadoopConfDir);
    if (hadoopConfigDir.isAbsolute()) {
      // Absolute hadoop config directories are not allowed in cluster mode.
      issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hadoopConfDir",
          Errors.HADOOPFS_29, hadoopConfDir));
    } else {
      hadoopConfigDir = new File(getContext().getResourcesDirectory(), hadoopConfDir).getAbsoluteFile();
    }
    if (!hadoopConfigDir.exists()) {
      issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hadoopConfDir",
          Errors.HADOOPFS_25, hadoopConfigDir.getPath()));
    } else if (!hadoopConfigDir.isDirectory()) {
      issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hadoopConfDir",
          Errors.HADOOPFS_26, hadoopConfigDir.getPath()));
    } else {
      // Load the standard site files, flagging any that exist but are not regular files.
      for (String siteFileName : new String[] {
          "core-site.xml", "hdfs-site.xml", "yarn-site.xml", "mapred-site.xml"}) {
        File siteFile = new File(hadoopConfigDir, siteFileName);
        if (siteFile.exists()) {
          if (!siteFile.isFile()) {
            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hadoopConfDir",
                Errors.HADOOPFS_27, siteFile.getPath()));
          }
          conf.addResource(new Path(siteFile.getAbsolutePath()));
        }
      }
    }
  }
  for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
    conf.set(config.getKey(), config.getValue());
  }
  return conf;
}
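// ---------------------------------------------------------------------------
// A small sketch of the precedence getHadoopConfiguration() relies on
// (the file path and URI are assumptions, illustration only): values loaded
// via addResource() are overridden by later explicit set() calls, unless the
// resource marks a property final. This is why the hdfsConfigs entries are
// applied after the *-site.xml resources.
// ---------------------------------------------------------------------------
private static Configuration layeredConfDemo() {
  Configuration demo = new Configuration();
  demo.addResource(new Path("core-site.xml")); // a local file path, as in the method above
  demo.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, "hdfs://nn.example.com:8020"); // wins over the file value
  return demo;
}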
@Override
public List<ConfigIssue> init() {
  List<ConfigIssue> issues = super.init();
  validateHadoopFS(issues);
  // These settings determine the number of splits, and hence the number of executors.
  hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // per Hive-on-Spark
  hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // per Hive-on-Spark
  for (Map.Entry<String, String> config : conf.hdfsConfigs.entrySet()) {
    hadoopConf.set(config.getKey(), config.getValue());
  }
  List<Path> hdfsDirPaths = new ArrayList<>();
  if (conf.hdfsDirLocations == null || conf.hdfsDirLocations.isEmpty()) {
    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
        ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations", Errors.HADOOPFS_18));
  } else if (issues.isEmpty()) {
    for (String hdfsDirLocation : conf.hdfsDirLocations) {
      try {
        FileSystem fs = getFileSystemForInitDestroy();
        Path ph = fs.makeQualified(new Path(hdfsDirLocation));
        hdfsDirPaths.add(ph);
        if (!fs.exists(ph)) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
              ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
              Errors.HADOOPFS_10, hdfsDirLocation));
        } else if (!fs.getFileStatus(ph).isDirectory()) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
              ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
              Errors.HADOOPFS_15, hdfsDirLocation));
        } else {
          try {
            FileStatus[] files = fs.listStatus(ph);
            if (files == null || files.length == 0) {
              issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                  ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
                  Errors.HADOOPFS_16, hdfsDirLocation));
            } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
              for (FileStatus fileStatus : files) {
                if (fileStatus.isFile()) {
                  String path = fileStatus.getPath().toString();
                  try {
                    List<Map.Entry> buffer;
                    if (conf.dataFormat == DataFormat.AVRO) {
                      buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                    } else {
                      buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                    }
                    for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                      Map.Entry entry = buffer.get(i);
                      previewBuffer.put(String.valueOf(entry.getKey()),
                          entry.getValue() == null ? null : entry.getValue());
                    }
                  } catch (IOException | InterruptedException ex) {
                    String msg = "Error opening " + path + ": " + ex;
                    LOG.info(msg, ex);
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                        ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
                        Errors.HADOOPFS_16, fileStatus.getPath()));
                  }
                }
              }
            }
          } catch (IOException ex) {
            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
          }
        }
      } catch (IOException ioe) {
        LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
            ClusterHdfsConfigBean.CLUSTER_HDFS_CONFIG_BEAN_PREFIX + "hdfsDirLocations",
            Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
      }
    }
  }
  hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
  hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(conf.recursive));
  // CsvHeader.IGNORE_HEADER must be overridden to CsvHeader.NO_HEADER before building the
  // parser, but it must be restored to the original value for the produce() method.
  CsvHeader originalCsvHeader = conf.dataFormatConfig.csvHeader;
  if (originalCsvHeader == CsvHeader.IGNORE_HEADER) {
    conf.dataFormatConfig.csvHeader = CsvHeader.NO_HEADER;
  }
  conf.dataFormatConfig.init(getContext(), conf.dataFormat, Groups.HADOOP_FS.name(),
      DATA_FROMAT_CONFIG_BEAN_PREFIX, issues);
  conf.dataFormatConfig.csvHeader = originalCsvHeader;
  parserFactory = conf.dataFormatConfig.getParserFactory();
  LOG.info("Issues: " + issues);
  return issues;
}
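// ---------------------------------------------------------------------------
// A defensive variant of the override/restore above (sketch under the same
// field names, not the stage's actual implementation): restoring csvHeader in
// a finally block guarantees produce() sees the user-configured value even if
// dataFormatConfig.init() throws.
// ---------------------------------------------------------------------------
// CsvHeader savedCsvHeader = conf.dataFormatConfig.csvHeader;
// if (savedCsvHeader == CsvHeader.IGNORE_HEADER) {
//   conf.dataFormatConfig.csvHeader = CsvHeader.NO_HEADER;
// }
// try {
//   conf.dataFormatConfig.init(getContext(), conf.dataFormat, Groups.HADOOP_FS.name(),
//       DATA_FROMAT_CONFIG_BEAN_PREFIX, issues);
// } finally {
//   conf.dataFormatConfig.csvHeader = savedCsvHeader;
// }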