Exemplo n.º 1
0
  /**
   * Makes the attribute names file available to the job: it is copied into HDFS (if necessary)
   * and registered in the distributed cache.
   *
   * @param conf the job configuration that the distributed cache entry is added to
   * @return the filename part of the names file path
   * @throws IOException if the file cannot be added to the distributed cache
   */
  protected String handleNamesFile(Configuration conf) throws IOException {
    // Resolve any environment variables in the user-supplied path first
    final String resolvedNamesPath = environmentSubstitute(getAttributeNamesFile());

    return HDFSUtils.addFileToDistributedCache(
        m_mrConfig.getHDFSConfig(), conf, resolvedNamesPath, m_env);
  }
Exemplo n.º 2
0
  /**
   * Loads a previously created ARFF header file. The local file system is checked first: a local
   * file is copied into HDFS (to {@code HDFSUtils.WEKA_TEMP_DISTRIBUTED_CACHE_FILES} plus the
   * file's name) and the header is then read from there. If no such local file exists, the path
   * is looked up in HDFS instead. On success {@code m_hdfsPathToAggregatedHeader} points at the
   * header's location in HDFS.
   *
   * @throws DistributedWekaException if no path has been specified, if the file does not exist in
   *     either location, or if there is a problem transferring it into HDFS or reading it
   */
  protected void handleExistingHeaderFile() throws DistributedWekaException {

    String existingPath = getPathToExistingHeader();

    // Fail with a descriptive, declared exception instead of letting a null/empty path
    // surface as an unchecked exception from File/Path construction further down
    if (existingPath == null || existingPath.length() == 0) {
      throw new DistributedWekaException(
          "No path to an existing ARFF header file has been specified");
    }

    try {
      existingPath = environmentSubstitute(existingPath);
    } catch (Exception ex) {
      // Best effort: carry on with the unsubstituted path, but leave a trace in the log
      // rather than swallowing the problem silently
      logMessage(
          "Unable to substitute environment variables in '"
              + existingPath
              + "' (reason: "
              + ex.getMessage()
              + "). Using the path as supplied.");
    }

    // check local file system first
    File f = new File(existingPath);
    boolean success = false;
    if (f.exists()) {
      // copy this file into HDFS
      String hdfsDest = HDFSUtils.WEKA_TEMP_DISTRIBUTED_CACHE_FILES + f.getName();

      try {
        HDFSUtils.copyToHDFS(existingPath, hdfsDest, m_mrConfig.getHDFSConfig(), m_env, true);

        m_hdfsPathToAggregatedHeader = hdfsDest;
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        getFinalHeaderFromHDFS(conf, hdfsDest);
        success = true;
      } catch (IOException e) {
        throw new DistributedWekaException(e);
      }
    } else {
      // not a local file - see whether the path exists in HDFS
      try {
        Path p = new Path(existingPath);
        Configuration conf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(p)) {
          m_hdfsPathToAggregatedHeader = existingPath;
          getFinalHeaderFromHDFS(conf, existingPath);
          success = true;
        }
      } catch (IOException ex) {
        throw new DistributedWekaException(ex);
      }
    }

    if (!success) {
      throw new DistributedWekaException(
          "Was unable to find '"
              + existingPath
              + "' on either "
              + "the local file system or in HDFS");
    }
  }
Exemplo n.º 3
0
  /**
   * Runs the job that produces the ARFF header.
   *
   * <p>The steps are, in order:
   *
   * <ol>
   *   <li>If a path to an existing header has been supplied, try to load it via {@code
   *       handleExistingHeaderFile()} and return immediately on success. A failure is logged and
   *       the method falls through to creating the header.
   *   <li>Redirect output to {@code OUTPUT_SUBDIR} beneath the configured output path and work
   *       out the name of the header file to write.
   *   <li>If attribute names are available (explicit list or names file) and summary statistics
   *       are not requested, ask the map task whether the header can be produced immediately; if
   *       so, write it straight to the output location and return without submitting a job.
   *   <li>Otherwise configure and submit the Hadoop job and, if it succeeds, read the final
   *       header back from HDFS.
   * </ol>
   *
   * <p>The thread's context class loader is set to this class's loader for the duration of the
   * call and restored afterwards.
   *
   * @return true if the header was loaded or created successfully
   * @throws DistributedWekaException if a problem occurs while preparing or running the job
   */
  @Override
  public boolean runJob() throws DistributedWekaException {
    boolean success = true;
    m_finalHeader = null;

    ClassLoader orig = Thread.currentThread().getContextClassLoader();
    try {
      Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
      if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
        try {
          handleExistingHeaderFile();

          // done!!
          return true;
        } catch (DistributedWekaException ex) {
          logMessage(
              "Unable to laod existing header file from '"
                  + getPathToExistingHeader()
                  + "' (reason: "
                  + ex.getMessage()
                  + "). Running job to create header...");
        }
      }

      setJobStatus(JobStatus.RUNNING);

      if (m_env == null) {
        m_env = Environment.getSystemWide();
      }

      // make sure that we save out to a subdirectory of the output directory
      // so that the arff header doesn't get deleted by any jobs that invoke
      // us first before themselves
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      m_mrConfig.setOutputPath(outputPath);
      Random r = new Random();
      // A bounded nextInt() is always non-negative; Math.abs(nextInt()) is not, because
      // Math.abs(Integer.MIN_VALUE) is still negative
      String outHeadName = "" + r.nextInt(Integer.MAX_VALUE);
      if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
        outHeadName = environmentSubstitute(getOutputHeaderFileName());
      }
      if (!outHeadName.toLowerCase().endsWith(".arff")) {
        outHeadName += ".arff";
      }
      outputPath += "/" + outHeadName;
      // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
      // m_mrConfig.getHDFSConfig(), outputPath, m_env);
      m_hdfsPathToAggregatedHeader = outputPath;

      // Where to write the consolidated ARFF file to
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

      // Now lets check to see whether this job needs to run at all - i.e.
      // if the header can be produced immediately (no nominal values need to
      // be determined) then we can just write ARFF header directly to the
      // output location in HDFS and quit
      CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
        tempMapTask.setOptions(tempOpts);
      }
      boolean haveAttributeNames = false;
      List<String> tempAttNames = null;
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        // explicit comma-separated list of attribute names; blank entries are dropped
        tempAttNames = new ArrayList<String>();
        String[] n = environmentSubstitute(getAttributeNames()).split(",");
        for (String nn : n) {
          nn = nn.trim();
          if (nn.length() > 0) {
            tempAttNames.add(nn);
          }
        }
        haveAttributeNames = true;
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        // handle names file - bail out if there are no atts or names file
        String namesFile = environmentSubstitute(getAttributeNamesFile());
        if (namesFile.startsWith("hdfs://")) {
          String hostAndPath = namesFile.replace("hdfs://", "");

          // strip the host and port (if provided); a URL with nothing after the
          // host has no file to read, so report that clearly instead of failing
          // with an index-out-of-range error from substring()
          int pathStart = hostAndPath.indexOf("/");
          if (pathStart < 0) {
            throw new DistributedWekaException(
                "Attribute names file '" + namesFile + "' does not contain a path");
          }
          namesFile = hostAndPath.substring(pathStart);
          String pathS = HDFSUtils.resolvePath(namesFile, m_env);
          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          FileSystem fs = FileSystem.get(tempConf);
          Path path = new Path(pathS);
          InputStream is = fs.open(path);
          InputStreamReader isr = new InputStreamReader(is);
          BufferedReader br = new BufferedReader(isr);
          // this closes the stream for us
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        } else {
          // local file
          URI uri = new URI(namesFile);
          File path = new File(uri.getPath());
          BufferedReader br = new BufferedReader(new FileReader(path));
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        }
        haveAttributeNames = true;
      }

      // can't generate header immediately if we have to compute summary
      // stats
      if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
        if (tempMapTask.headerAvailableImmediately(
            tempAttNames.size(), tempAttNames, new StringBuffer())) {
          Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(), tempAttNames);
          m_finalHeader = finalHeader;

          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader, outputPath, tempConf);

          // done! Record completion so the status does not stay at RUNNING
          setJobStatus(JobStatus.FINISHED);
          return true;
        }
      }

      // Otherwise, go ahead with the job...
      Configuration conf = new Configuration();

      // Options to the map task and the underlying general Weka map
      // task
      StringBuilder csvToArffTaskOptions = new StringBuilder();
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        csvToArffTaskOptions.append(" -A ").append(environmentSubstitute(getAttributeNames()));
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        String filenameOnly = handleNamesFile(conf);
        csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
      }

      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
      }

      if (csvToArffTaskOptions.length() > 0) {
        m_mrConfig.setUserSuppliedProperty(
            CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
            environmentSubstitute(csvToArffTaskOptions.toString()));

        setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
      }

      // install the weka libraries and any user-selected packages
      // to HDFS and add to the distributed cache/classpath for
      // the job
      installWekaLibrariesInHDFS(conf);

      Job job = m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf, m_env);

      cleanOutputDirectory(job);

      statusMessage("Submitting job: " + getJobName());
      logMessage("Submitting job: " + getJobName());

      success = runJob(job);

      if (success) {
        getFinalHeaderFromHDFS(conf, outputPath);
      }

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    } finally {
      // always restore the caller's context class loader
      Thread.currentThread().setContextClassLoader(orig);
    }

    return success;
  }