@Override public void setOptions(String[] options) throws Exception { // these are options to the hadoop map task (especially the -names-file) String existing = Utils.getOption("existing-header", options); setPathToExistingHeader(existing); String attNames = Utils.getOption('A', options); setAttributeNames(attNames); String namesFile = Utils.getOption("names-file", options); setAttributeNamesFile(namesFile); String outputName = Utils.getOption("header-file-name", options); setOutputHeaderFileName(outputName); super.setOptions(options); // any options to pass on to the underlying Weka csv to arff map task? CSVToARFFHeaderMapTask tempMap = new CSVToARFFHeaderMapTask(); tempMap.setOptions(options); String optsToWekaMapTask = Utils.joinOptions(tempMap.getOptions()); if (!DistributedJobConfig.isEmpty(optsToWekaMapTask)) { setCsvToArffTaskOptions(optsToWekaMapTask); } }
/** * Adds a tab for the ARFF header job * * @param tabTitle the title for the tab * @param arffJob the ARFF job */ protected void addTabForArffHeaderJob(String tabTitle, HadoopJob arffJob) { JPanel jobHolder = new JPanel(); jobHolder.setLayout(new BorderLayout()); PropertySheetPanel arffJobEditor = new PropertySheetPanel(); arffJobEditor.setEnvironment(m_env); arffJobEditor.setTarget(arffJob); jobHolder.add(arffJobEditor, BorderLayout.NORTH); m_arffMapTask = new CSVToARFFHeaderMapTask(); try { m_arffMapTask.setOptions(Utils.splitOptions(m_optionsOrig)); } catch (Exception ex) { ex.printStackTrace(); } PropertySheetPanel mapTaskEditor = new PropertySheetPanel(); mapTaskEditor.setTarget(m_arffMapTask); mapTaskEditor.setEnvironment(m_env); jobHolder.add(mapTaskEditor, BorderLayout.CENTER); // JScrollPane scroller = new JScrollPane(jobHolder); m_configTabs.addTab(tabTitle, jobHolder); }
@Override public boolean runJob() throws DistributedWekaException { boolean success = true; m_finalHeader = null; ClassLoader orig = Thread.currentThread().getContextClassLoader(); try { Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader()); if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) { try { handleExistingHeaderFile(); // done!! return true; } catch (DistributedWekaException ex) { logMessage( "Unable to laod existing header file from '" + getPathToExistingHeader() + "' (reason: " + ex.getMessage() + "). Running job to create header..."); } } setJobStatus(JobStatus.RUNNING); if (m_env == null) { m_env = Environment.getSystemWide(); } // make sure that we save out to a subdirectory of the output directory // so that the arff header doesn't get deleted by any jobs that invoke // us first before themselves String outputPath = m_mrConfig.getOutputPath(); outputPath += OUTPUT_SUBDIR; m_mrConfig.setOutputPath(outputPath); Random r = new Random(); String outHeadName = "" + Math.abs(r.nextInt()); if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) { outHeadName = environmentSubstitute(getOutputHeaderFileName()); } if (!outHeadName.toLowerCase().endsWith(".arff")) { outHeadName += ".arff"; } outputPath += "/" + outHeadName; // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI( // m_mrConfig.getHDFSConfig(), outputPath, m_env); m_hdfsPathToAggregatedHeader = outputPath; // Where to write the consolidated ARFF file to m_mrConfig.setUserSuppliedProperty( CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath); // Now lets check to see whether this job needs to run at all - i.e. // if the header can be produced immediately (no nominal values need to // be determined) then we can just write ARFF header directly to the // output location in HDFS and quit CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask(); if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) { String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions()); tempMapTask.setOptions(tempOpts); } boolean haveAttributeNames = false; List<String> tempAttNames = null; if (!DistributedJobConfig.isEmpty(getAttributeNames())) { tempAttNames = new ArrayList<String>(); String[] n = environmentSubstitute(getAttributeNames()).split(","); for (String nn : n) { nn = nn.trim(); if (nn.length() > 0) { tempAttNames.add(nn); } } haveAttributeNames = true; } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) { // handle names file - bail out if there are no atts or names file String namesFile = environmentSubstitute(getAttributeNamesFile()); if (namesFile.startsWith("hdfs://")) { namesFile = namesFile.replace("hdfs://", ""); // strip the host and port (if provided) namesFile = namesFile.substring(namesFile.indexOf("/")); String pathS = HDFSUtils.resolvePath(namesFile, m_env); Configuration tempConf = new Configuration(); m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env); FileSystem fs = FileSystem.get(tempConf); Path path = new Path(pathS); InputStream is = fs.open(path); InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr); // this closes the stream for us tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br); } else { // local file URI uri = new URI(namesFile); File path = new File(uri.getPath()); BufferedReader br = new BufferedReader(new FileReader(path)); tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br); } haveAttributeNames = true; } // can't generate header immediately if we have to compute summary // stats if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) { if (tempMapTask.headerAvailableImmediately( tempAttNames.size(), tempAttNames, new StringBuffer())) { Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(), tempAttNames); m_finalHeader = finalHeader; Configuration tempConf = new Configuration(); m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env); CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader, outputPath, tempConf); // done! return true; } } // Otherwise, go ahead with the job... Configuration conf = new Configuration(); // Options to the map task and the underlying general Weka map // task StringBuilder csvToArffTaskOptions = new StringBuilder(); if (!DistributedJobConfig.isEmpty(getAttributeNames())) { csvToArffTaskOptions.append(" -A ").append(environmentSubstitute(getAttributeNames())); } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) { String filenameOnly = handleNamesFile(conf); csvToArffTaskOptions.append(" -names-file ").append(filenameOnly); } if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) { csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions()); } if (csvToArffTaskOptions.length() > 0) { m_mrConfig.setUserSuppliedProperty( CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS, environmentSubstitute(csvToArffTaskOptions.toString())); setJobName(getJobName() + " " + csvToArffTaskOptions.toString()); } // install the weka libraries and any user-selected packages // to HDFS and add to the distributed cache/classpath for // the job installWekaLibrariesInHDFS(conf); Job job = m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf, m_env); cleanOutputDirectory(job); statusMessage("Submitting job: " + getJobName()); logMessage("Submitting job: " + getJobName()); success = runJob(job); if (success) { getFinalHeaderFromHDFS(conf, outputPath); } setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED); } catch (Exception ex) { setJobStatus(JobStatus.FAILED); throw new DistributedWekaException(ex); } finally { Thread.currentThread().setContextClassLoader(orig); } return success; }