Example 1
  @Override
  public void setOptions(String[] options) throws Exception {

    // these are options to the hadoop map task (especially the -names-file)

    String existing = Utils.getOption("existing-header", options);
    setPathToExistingHeader(existing);

    String attNames = Utils.getOption('A', options);
    setAttributeNames(attNames);

    String namesFile = Utils.getOption("names-file", options);
    setAttributeNamesFile(namesFile);

    String outputName = Utils.getOption("header-file-name", options);
    setOutputHeaderFileName(outputName);

    super.setOptions(options);

    // any options to pass on to the underlying Weka csv to arff map task?
    CSVToARFFHeaderMapTask tempMap = new CSVToARFFHeaderMapTask();
    tempMap.setOptions(options);

    String optsToWekaMapTask = Utils.joinOptions(tempMap.getOptions());
    if (!DistributedJobConfig.isEmpty(optsToWekaMapTask)) {
      setCsvToArffTaskOptions(optsToWekaMapTask);
    }
  }
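
A minimal usage sketch for the option parsing above; the enclosing class name ArffHeaderHadoopJob and the option values are assumptions for illustration:

  // Hypothetical caller; ArffHeaderHadoopJob and the values are assumptions.
  public static void main(String[] args) throws Exception {
    ArffHeaderHadoopJob job = new ArffHeaderHadoopJob();
    job.setOptions(new String[] {
      "-A", "sepallength,sepalwidth,petallength,petalwidth,class", // comma-separated attribute names
      "-header-file-name", "iris.header"                           // name for the aggregated header file
    });
    // any leftover options end up with the underlying CSV-to-ARFF map task
    System.out.println(job.getCsvToArffTaskOptions());
  }

Example 2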
  /**
   * Adds a tab for the ARFF header job
   *
   * @param tabTitle the title for the tab
   * @param arffJob the ARFF job
   */
  protected void addTabForArffHeaderJob(String tabTitle, HadoopJob arffJob) {
    JPanel jobHolder = new JPanel();
    jobHolder.setLayout(new BorderLayout());

    PropertySheetPanel arffJobEditor = new PropertySheetPanel();
    arffJobEditor.setEnvironment(m_env);
    arffJobEditor.setTarget(arffJob);
    jobHolder.add(arffJobEditor, BorderLayout.NORTH);

    m_arffMapTask = new CSVToARFFHeaderMapTask();
    try {
      m_arffMapTask.setOptions(Utils.splitOptions(m_optionsOrig));
    } catch (Exception ex) {
      ex.printStackTrace();
    }
    PropertySheetPanel mapTaskEditor = new PropertySheetPanel();
    mapTaskEditor.setTarget(m_arffMapTask);

    mapTaskEditor.setEnvironment(m_env);
    jobHolder.add(mapTaskEditor, BorderLayout.CENTER);

    // JScrollPane scroller = new JScrollPane(jobHolder);

    m_configTabs.addTab(tabTitle, jobHolder);
  }
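
A rough sketch of how this tab builder might be wired into the enclosing configuration dialog; m_configTabs is evidently a JTabbedPane (per the addTab call above), while the JFrame and the m_arffHeaderJob instance are assumptions:

  // Hypothetical wiring inside the launcher's UI setup; the JFrame and the
  // m_arffHeaderJob field are assumptions for illustration.
  m_configTabs = new JTabbedPane();
  addTabForArffHeaderJob("ARFF header creation", m_arffHeaderJob);

  JFrame frame = new JFrame("Job configuration");
  frame.getContentPane().add(m_configTabs, BorderLayout.CENTER);
  frame.pack();
  frame.setVisible(true);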
Example 3
  @Override
  public boolean runJob() throws DistributedWekaException {
    boolean success = true;
    m_finalHeader = null;

    ClassLoader orig = Thread.currentThread().getContextClassLoader();
    try {
      Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
      if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
        try {
          handleExistingHeaderFile();

          // done!!
          return true;
        } catch (DistributedWekaException ex) {
          logMessage(
              "Unable to laod existing header file from '"
                  + getPathToExistingHeader()
                  + "' (reason: "
                  + ex.getMessage()
                  + "). Running job to create header...");
        }
      }

      setJobStatus(JobStatus.RUNNING);

      if (m_env == null) {
        m_env = Environment.getSystemWide();
      }

      // make sure that we save out to a subdirectory of the output directory
      // so that the ARFF header doesn't get deleted by any jobs that invoke
      // us before running themselves
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      m_mrConfig.setOutputPath(outputPath);
      Random r = new Random();
      String outHeadName = "" + Math.abs(r.nextInt());
      if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
        outHeadName = environmentSubstitute(getOutputHeaderFileName());
      }
      if (!outHeadName.toLowerCase().endsWith(".arff")) {
        outHeadName += ".arff";
      }
      outputPath += "/" + outHeadName;
      // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
      // m_mrConfig.getHDFSConfig(), outputPath, m_env);
      m_hdfsPathToAggregatedHeader = outputPath;

      // Where to write the consolidated ARFF file to
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

      // Now let's check whether this job needs to run at all - i.e. if the
      // header can be produced immediately (no nominal values need to be
      // determined) then we can just write the ARFF header directly to the
      // output location in HDFS and quit
      CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
        tempMapTask.setOptions(tempOpts);
      }
      boolean haveAttributeNames = false;
      List<String> tempAttNames = null;
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        tempAttNames = new ArrayList<String>();
        String[] n = environmentSubstitute(getAttributeNames()).split(",");
        for (String nn : n) {
          nn = nn.trim();
          if (nn.length() > 0) {
            tempAttNames.add(nn);
          }
        }
        haveAttributeNames = true;
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        // handle a names file (either in HDFS or on the local file system);
        // assumed to contain one attribute name per line
        String namesFile = environmentSubstitute(getAttributeNamesFile());
        if (namesFile.startsWith("hdfs://")) {
          namesFile = namesFile.replace("hdfs://", "");

          // strip the host and port (if provided), e.g.
          // "host:9000/user/names.txt" becomes "/user/names.txt"
          namesFile = namesFile.substring(namesFile.indexOf("/"));
          String pathS = HDFSUtils.resolvePath(namesFile, m_env);
          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          FileSystem fs = FileSystem.get(tempConf);
          Path path = new Path(pathS);
          InputStream is = fs.open(path);
          InputStreamReader isr = new InputStreamReader(is);
          BufferedReader br = new BufferedReader(isr);
          // this closes the stream for us
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        } else {
          // local file
          URI uri = new URI(namesFile);
          File path = new File(uri.getPath());
          BufferedReader br = new BufferedReader(new FileReader(path));
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        }
        haveAttributeNames = true;
      }

      // can't generate header immediately if we have to compute summary
      // stats
      if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
        if (tempMapTask.headerAvailableImmediately(
            tempAttNames.size(), tempAttNames, new StringBuffer())) {
          Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(), tempAttNames);
          m_finalHeader = finalHeader;

          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader, outputPath, tempConf);

          // done!
          return true;
        }
      }

      // Otherwise, go ahead with the job...
      Configuration conf = new Configuration();

      // Options to the map task and the underlying general Weka map
      // task
      StringBuilder csvToArffTaskOptions = new StringBuilder();
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        csvToArffTaskOptions.append(" -A ").append(environmentSubstitute(getAttributeNames()));
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        String filenameOnly = handleNamesFile(conf);
        csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
      }

      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
      }

      if (csvToArffTaskOptions.length() > 0) {
        m_mrConfig.setUserSuppliedProperty(
            CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
            environmentSubstitute(csvToArffTaskOptions.toString()));

        setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
      }

      // install the weka libraries and any user-selected packages
      // to HDFS and add to the distributed cache/classpath for
      // the job
      installWekaLibrariesInHDFS(conf);

      Job job = m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf, m_env);

      cleanOutputDirectory(job);

      statusMessage("Submitting job: " + getJobName());
      logMessage("Submitting job: " + getJobName());

      success = runJob(job);

      if (success) {
        getFinalHeaderFromHDFS(conf, outputPath);
      }

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    } finally {
      Thread.currentThread().setContextClassLoader(orig);
    }

    return success;
  }
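
Finally, a hedged end-to-end sketch tying the two halves together; the class name ArffHeaderHadoopJob and the getFinalHeader() accessor for m_finalHeader are assumptions:

  // Hypothetical end-to-end run; class name and accessor are assumptions.
  ArffHeaderHadoopJob job = new ArffHeaderHadoopJob();
  job.setOptions(Utils.splitOptions(
      "-A sepallength,sepalwidth,petallength,petalwidth,class"
          + " -header-file-name iris.header"));
  if (job.runJob()) {
    // on success, runJob() has already pulled the aggregated header back from HDFS
    Instances header = job.getFinalHeader(); // assumed getter for m_finalHeader
    System.out.println(header);
  } else {
    System.err.println("ARFF header job failed");
  }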