Java HDFSUtils примеры использования

Язык программирования: Java

Пространство имен/Пакет: distributed.hadoop

Класс/Тип: HDFSUtils

Примеров на hotexamples.com: 3

Java HDFSUtils - 3 примера найдено. Это лучшие примеры Java кода для distributed.hadoop.HDFSUtils, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

addFileToDistributedCache(1)

copyToHDFS(1)

resolvePath(1)

Пример #1

Показать файл

Файл: ArffHeaderHadoopJob.java Проект: TheSengers/Weka

  /**
   * Copies the names file into HDFS (if necessary) and adds it to the distributed cache for the job
   *
   * @param conf
   * @return the filename part of the path
   * @throws IOException
   */
  protected String handleNamesFile(Configuration conf) throws IOException {
    String namesFile = environmentSubstitute(getAttributeNamesFile());

    String filenameOnly =
        HDFSUtils.addFileToDistributedCache(m_mrConfig.getHDFSConfig(), conf, namesFile, m_env);

    return filenameOnly;
  }

Пример #2

Показать файл

Файл: ArffHeaderHadoopJob.java Проект: TheSengers/Weka

  /**
   * Checks for the existence of an existing ARFF header file in HDFS or the local file system. If
   * local, it is copied into HDFS.
   *
   * @throws DistributedWekaException if the file does not exist or there is a problem transfering
   *     it into HDFS
   */
  protected void handleExistingHeaderFile() throws DistributedWekaException {

    String existingPath = getPathToExistingHeader();

    try {
      existingPath = environmentSubstitute(existingPath);
    } catch (Exception ex) {
    }

    // check local file system first
    File f = new File(existingPath);
    boolean success = false;
    if (f.exists()) {
      // copy this file into HDFS
      String hdfsDest = HDFSUtils.WEKA_TEMP_DISTRIBUTED_CACHE_FILES + f.getName();

      try {
        HDFSUtils.copyToHDFS(existingPath, hdfsDest, m_mrConfig.getHDFSConfig(), m_env, true);

        m_hdfsPathToAggregatedHeader = hdfsDest;
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        getFinalHeaderFromHDFS(conf, hdfsDest);
        success = true;
      } catch (IOException e) {
        throw new DistributedWekaException(e);
      }
    } else {
      try {
        Path p = new Path(existingPath);
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(p)) {
          m_hdfsPathToAggregatedHeader = existingPath;
          getFinalHeaderFromHDFS(conf, existingPath);
          success = true;
        }
      } catch (IOException ex) {
        throw new DistributedWekaException(ex);
      }
    }

    if (!success) {
      throw new DistributedWekaException(
          "Was unable to find '"
              + existingPath
              + "' on either "
              + "the local file system or in HDFS");
    }
  }

Пример #3

Показать файл

Файл: ArffHeaderHadoopJob.java Проект: TheSengers/Weka

  @Override
  public boolean runJob() throws DistributedWekaException {
    boolean success = true;
    m_finalHeader = null;

    ClassLoader orig = Thread.currentThread().getContextClassLoader();
    try {
      Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
      if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
        try {
          handleExistingHeaderFile();

          // done!!
          return true;
        } catch (DistributedWekaException ex) {
          logMessage(
              "Unable to laod existing header file from '"
                  + getPathToExistingHeader()
                  + "' (reason: "
                  + ex.getMessage()
                  + "). Running job to create header...");
        }
      }

      setJobStatus(JobStatus.RUNNING);

      if (m_env == null) {
        m_env = Environment.getSystemWide();
      }

      // make sure that we save out to a subdirectory of the output directory
      // so that the arff header doesn't get deleted by any jobs that invoke
      // us first before themselves
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      m_mrConfig.setOutputPath(outputPath);
      Random r = new Random();
      String outHeadName = "" + Math.abs(r.nextInt());
      if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
        outHeadName = environmentSubstitute(getOutputHeaderFileName());
      }
      if (!outHeadName.toLowerCase().endsWith(".arff")) {
        outHeadName += ".arff";
      }
      outputPath += "/" + outHeadName;
      // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
      // m_mrConfig.getHDFSConfig(), outputPath, m_env);
      m_hdfsPathToAggregatedHeader = outputPath;

      // Where to write the consolidated ARFF file to
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

      // Now lets check to see whether this job needs to run at all - i.e.
      // if the header can be produced immediately (no nominal values need to
      // be determined) then we can just write ARFF header directly to the
      // output location in HDFS and quit
      CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
        tempMapTask.setOptions(tempOpts);
      }
      boolean haveAttributeNames = false;
      List<String> tempAttNames = null;
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        tempAttNames = new ArrayList<String>();
        String[] n = environmentSubstitute(getAttributeNames()).split(",");
        for (String nn : n) {
          nn = nn.trim();
          if (nn.length() > 0) {
            tempAttNames.add(nn);
          }
        }
        haveAttributeNames = true;
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        // handle names file - bail out if there are no atts or names file
        String namesFile = environmentSubstitute(getAttributeNamesFile());
        if (namesFile.startsWith("hdfs://")) {
          namesFile = namesFile.replace("hdfs://", "");

          // strip the host and port (if provided)
          namesFile = namesFile.substring(namesFile.indexOf("/"));
          String pathS = HDFSUtils.resolvePath(namesFile, m_env);
          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          FileSystem fs = FileSystem.get(tempConf);
          Path path = new Path(pathS);
          InputStream is = fs.open(path);
          InputStreamReader isr = new InputStreamReader(is);
          BufferedReader br = new BufferedReader(isr);
          // this closes the stream for us
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        } else {
          // local file
          URI uri = new URI(namesFile);
          File path = new File(uri.getPath());
          BufferedReader br = new BufferedReader(new FileReader(path));
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        }
        haveAttributeNames = true;
      }

      // can't generate header immediately if we have to compute summary
      // stats
      if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
        if (tempMapTask.headerAvailableImmediately(
            tempAttNames.size(), tempAttNames, new StringBuffer())) {
          Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(), tempAttNames);
          m_finalHeader = finalHeader;

          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader, outputPath, tempConf);

          // done!
          return true;
        }
      }

      // Otherwise, go ahead with the job...
      Configuration conf = new Configuration();

      // Options to the map task and the underlying general Weka map
      // task
      StringBuilder csvToArffTaskOptions = new StringBuilder();
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        csvToArffTaskOptions.append(" -A ").append(environmentSubstitute(getAttributeNames()));
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        String filenameOnly = handleNamesFile(conf);
        csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
      }

      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
      }

      if (csvToArffTaskOptions.length() > 0) {
        m_mrConfig.setUserSuppliedProperty(
            CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
            environmentSubstitute(csvToArffTaskOptions.toString()));

        setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
      }

      // install the weka libraries and any user-selected packages
      // to HDFS and add to the distributed cache/classpath for
      // the job
      installWekaLibrariesInHDFS(conf);

      Job job = m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf, m_env);

      cleanOutputDirectory(job);

      statusMessage("Submitting job: " + getJobName());
      logMessage("Submitting job: " + getJobName());

      success = runJob(job);

      if (success) {
        getFinalHeaderFromHDFS(conf, outputPath);
      }

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    } finally {
      Thread.currentThread().setContextClassLoader(orig);
    }

    return success;
  }