Exemple #1
0
  /**
   * Sets up this instance's {@code jobConf} and {@code syncPaths} fields.
   *
   * <p>When {@code properties} is non-null the parent configuration is first replaced by the
   * result of {@code createConfig(properties, parentConfig)}. If no parent configuration is
   * available after that step, the method returns without touching any field.
   *
   * @param properties optional properties handed to {@code createConfig}; may be {@code null}
   * @param parentConfig optional parent configuration; may be {@code null}
   */
  protected void initConfig(Map<Object, Object> properties, JobConf parentConfig) {
    JobConf effectiveParent = parentConfig;

    if (properties != null) {
      effectiveParent = createConfig(properties, parentConfig);
    }

    // A missing parent is acceptable here: getJobConf will pass a default parent in.
    if (effectiveParent == null) {
      return;
    }

    // Work on a copy to prevent local values from being shared.
    jobConf = HadoopUtil.copyJobConf(effectiveParent);

    // Both the http and https schemes are bound to the same file system class.
    final String httpFileSystemName = HttpFileSystem.class.getName();
    jobConf.set("fs.http.impl", httpFileSystemName);
    jobConf.set("fs.https.impl", httpFileSystemName);

    syncPaths = HadoopUtil.addToClassPath(jobConf, getClassPath());
  }
 /**
  * Initializes the source side of the given configuration.
  *
  * <p>The parent implementation is used when the configuration is local, or when this tap's id
  * equals the value stored under {@code cascading.node.source} or {@code cascading.step.source}.
  * In every other case {@code registerHfs} is called with the tap returned by {@code getHfs()}.
  *
  * @param process the current flow process
  * @param conf the configuration being initialized
  * @throws TapException wrapping any {@link IOException} raised by {@code registerHfs}
  */
 @Override
 public void sourceConfInit(FlowProcess<? extends Configuration> process, Configuration conf) {
   // Evaluate the three conditions one at a time, stopping at the first that holds.
   boolean bypassCache = HadoopUtil.isLocal(conf);

   if (!bypassCache) {
     bypassCache = Tap.id(this).equals(conf.get("cascading.node.source"));
   }

   if (!bypassCache) {
     bypassCache = Tap.id(this).equals(conf.get("cascading.step.source"));
   }

   if (!bypassCache) {
     try {
       registerHfs(process, conf, getHfs());
     } catch (IOException exception) {
       throw new TapException(exception);
     }
     return;
   }

   LOG.info("can't use distributed cache. reading '{}' from hdfs", super.getIdentifier());
   super.sourceConfInit(process, conf);
 }
  /**
   * Opens this tap for reading, preferring the local files reported by
   * {@code getLocalCacheFiles} over the parent implementation.
   *
   * <p>The parent implementation is used when the configuration is local, when a
   * {@code RecordReader} is supplied, when no cached files are reported, or when none of the
   * cached files matches a wanted path. Otherwise each matching cached file is wrapped in an
   * {@code Lfs} tap and all of them are read through a single {@code MultiSourceTap}.
   *
   * @param flowProcess the current flow process; its configuration drives the decisions above
   * @param input an optional record reader; when non-null the parent implementation is used
   * @return an iterator over the tuples of the selected source
   * @throws IOException if the file system lookup or one of the delegated opens fails
   */
  @Override
  public TupleEntryIterator openForRead(
      FlowProcess<? extends Configuration> flowProcess, RecordReader input) throws IOException {
    // always read via Hadoop FileSystem if in standalone/local mode, or if a RecordReader is
    // provided
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
      LOG.info("delegating to parent");
      return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0) {
      return super.openForRead(flowProcess, null);
    }

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
      FileSystem fs = FileSystem.get(flowProcess.getConfig());
      FileStatus[] statuses = fs.globStatus(getHfs().getPath());

      // globStatus may return null instead of an empty array; treat that as "no matches"
      if (statuses != null) {
        for (FileStatus status : statuses) {
          paths.add(status.getPath());
        }
      }
    } else {
      paths.add(getHfs().getPath());
    }

    // a cached file matches when its full path ends with the wanted file name
    for (Path pathToFind : paths) {
      for (Path path : cachedFiles) {
        if (path.toString().endsWith(pathToFind.getName())) {
          LOG.info("found {} in distributed cache", path);
          taps.add(new Lfs(getScheme(), path.toString()));
        }
      }
    }

    // Test taps rather than paths: paths is never empty in the non-glob case, so checking it
    // would miss a cache miss and build a MultiSourceTap over zero taps.
    if (taps.isEmpty()) { // not in cache, read from HDFS
      LOG.info(
          "could not find files in local resource path. delegating to parent: {}",
          super.getIdentifier());
      return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
  }
Exemple #4
0
 /**
  * Fills the static {@code hdfsShutdown} field with the value returned by
  * {@code HadoopUtil.getHDFSShutdownHook()} if the field is still {@code null}.
  *
  * <p>The method is {@code static synchronized}, so calls are serialized on the class monitor.
  */
 private static synchronized void getHdfsShutdownHook() {
   // Already populated by an earlier call: nothing to do.
   if (hdfsShutdown != null) {
     return;
   }

   hdfsShutdown = HadoopUtil.getHDFSShutdownHook();
 }
Exemple #5
0
 /**
  * Reports whether {@code HadoopUtil.isLocal} considers this instance's configuration local.
  *
  * @return the result of {@code HadoopUtil.isLocal(getConfig())}
  */
 @Override
 public boolean stepsAreLocal() {
   final boolean localMode = HadoopUtil.isLocal(getConfig());
   return localMode;
 }
Exemple #6
0
 /**
  * Hands this instance's {@code jobConf} and {@code syncPaths} to {@code HadoopUtil.syncPaths}.
  *
  * <p>NOTE(review): the method name suggests the paths end up in the distributed cache; that
  * happens inside {@code HadoopUtil.syncPaths} and is not visible here - confirm.
  */
 private void copyToDistributedCache() {
   HadoopUtil.syncPaths(
       jobConf,
       syncPaths);
 }
Exemple #7
0
 /**
  * Exposes the current configuration as a property map.
  *
  * @return the map produced by {@code HadoopUtil.createProperties(getConfig())}
  */
 @Override
 public Map<Object, Object> getConfigAsProperties() {
   final Map<Object, Object> configProperties = HadoopUtil.createProperties(getConfig());
   return configProperties;
 }
Exemple #8
0
 /**
  * Produces a copy of the current configuration.
  *
  * @return the result of {@code HadoopUtil.copyJobConf(getConfig())}
  */
 @Override
 public JobConf getConfigCopy() {
   final JobConf configCopy = HadoopUtil.copyJobConf(getConfig());
   return configCopy;
 }
Exemple #9
0
 /**
  * Creates a configuration, optionally based on a supplied default.
  *
  * @param defaultConfig configuration to copy; may be {@code null}
  * @return a copy of {@code defaultConfig} made with {@code HadoopUtil.copyJobConf}, or a new
  *     {@code JobConf} when {@code defaultConfig} is {@code null}
  */
 @Override
 protected JobConf newConfig(JobConf defaultConfig) {
   if (defaultConfig != null) {
     return HadoopUtil.copyJobConf(defaultConfig);
   }

   return new JobConf();
 }