protected void initConfig(Map<Object, Object> properties, JobConf parentConfig) { if (properties != null) parentConfig = createConfig(properties, parentConfig); if (parentConfig == null) // this is ok, getJobConf will pass a default parent in return; jobConf = HadoopUtil.copyJobConf(parentConfig); // prevent local values from being shared jobConf.set("fs.http.impl", HttpFileSystem.class.getName()); jobConf.set("fs.https.impl", HttpFileSystem.class.getName()); syncPaths = HadoopUtil.addToClassPath(jobConf, getClassPath()); }
/**
 * Configures this tap as a source.
 *
 * <p>Falls back to reading directly from HDFS when the distributed cache cannot be used: in
 * local mode, or when this tap is registered as the current node/step source. Otherwise the
 * backing {@code Hfs} is registered for distributed-cache access.
 *
 * @throws TapException wrapping any {@link IOException} raised while registering the Hfs
 */
@Override
public void sourceConfInit(FlowProcess<? extends Configuration> process, Configuration conf) {
  String tapId = Tap.id(this);
  boolean mustReadFromHdfs =
      HadoopUtil.isLocal(conf)
          || tapId.equals(conf.get("cascading.node.source"))
          || tapId.equals(conf.get("cascading.step.source"));

  if (mustReadFromHdfs) {
    LOG.info("can't use distributed cache. reading '{}' from hdfs", super.getIdentifier());
    super.sourceConfInit(process, conf);
    return;
  }

  try {
    registerHfs(process, conf, getHfs());
  } catch (IOException exception) {
    // Preserve the cause for callers catching TapException.
    throw new TapException(exception);
  }
}
@Override public TupleEntryIterator openForRead( FlowProcess<? extends Configuration> flowProcess, RecordReader input) throws IOException { // always read via Hadoop FileSystem if in standalone/local mode, or if an RecordReader is // provided if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) { LOG.info("delegating to parent"); return super.openForRead(flowProcess, input); } Path[] cachedFiles = getLocalCacheFiles(flowProcess); if (cachedFiles == null || cachedFiles.length == 0) return super.openForRead(flowProcess, null); List<Path> paths = new ArrayList<>(); List<Tap> taps = new ArrayList<>(); if (isSimpleGlob()) { FileSystem fs = FileSystem.get(flowProcess.getConfig()); FileStatus[] statuses = fs.globStatus(getHfs().getPath()); for (FileStatus status : statuses) paths.add(status.getPath()); } else { paths.add(getHfs().getPath()); } for (Path pathToFind : paths) { for (Path path : cachedFiles) { if (path.toString().endsWith(pathToFind.getName())) { LOG.info("found {} in distributed cache", path); taps.add(new Lfs(getScheme(), path.toString())); } } } if (paths.isEmpty()) // not in cache, read from HDFS { LOG.info( "could not find files in local resource path. delegating to parent: {}", super.getIdentifier()); return super.openForRead(flowProcess, input); } return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input); }
/**
 * Lazily captures the HDFS shutdown hook exactly once.
 *
 * <p>Synchronized to guard the check-then-set on the static {@code hdfsShutdown} field.
 */
private static synchronized void getHdfsShutdownHook() {
  if (hdfsShutdown != null) {
    return;
  }

  hdfsShutdown = HadoopUtil.getHDFSShutdownHook();
}
/** Returns {@code true} when the current configuration targets Hadoop local/standalone mode. */
@Override public boolean stepsAreLocal() { return HadoopUtil.isLocal(getConfig()); }
/** Syncs the classpath artifacts collected in {@code syncPaths} into the distributed cache. */
private void copyToDistributedCache() { HadoopUtil.syncPaths(jobConf, syncPaths); }
/** Returns this step's configuration converted to a {@code Map} of properties. */
@Override public Map<Object, Object> getConfigAsProperties() { return HadoopUtil.createProperties(getConfig()); }
/** Returns a defensive copy of this step's {@link JobConf} so callers cannot mutate shared state. */
@Override public JobConf getConfigCopy() { return HadoopUtil.copyJobConf(getConfig()); }
/**
 * Creates the backing {@link JobConf}: a fresh instance when no default is supplied, otherwise
 * a defensive copy of the given default so the original is never mutated.
 *
 * @param defaultConfig configuration to copy; may be {@code null}
 * @return a new or copied JobConf, never {@code null}
 */
@Override
protected JobConf newConfig(JobConf defaultConfig) {
  if (defaultConfig == null) {
    return new JobConf();
  }

  return HadoopUtil.copyJobConf(defaultConfig);
}