  @Override
  public void setOptions(String[] options) throws Exception {
    // these are options to the hadoop map task (especially the -names-file)
    String existing = Utils.getOption("existing-header", options);
    setPathToExistingHeader(existing);

    String attNames = Utils.getOption('A', options);
    setAttributeNames(attNames);

    String namesFile = Utils.getOption("names-file", options);
    setAttributeNamesFile(namesFile);

    String outputName = Utils.getOption("header-file-name", options);
    setOutputHeaderFileName(outputName);

    super.setOptions(options);

    // any options to pass on to the underlying Weka csv to arff map task?
    CSVToARFFHeaderMapTask tempMap = new CSVToARFFHeaderMapTask();
    tempMap.setOptions(options);

    String optsToWekaMapTask = Utils.joinOptions(tempMap.getOptions());
    if (!DistributedJobConfig.isEmpty(optsToWekaMapTask)) {
      setCsvToArffTaskOptions(optsToWekaMapTask);
    }
  }
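  // A hypothetical example of the option strings parsed by setOptions() above
  // (the paths, host and attribute names below are made up for illustration;
  // only the option flags themselves come from this class):
  //
  //   -existing-header /user/someone/headers/previous.arff
  //   -A att1,att2,att3,class
  //   -names-file hdfs://namenode:8020/user/someone/names.txt
  //   -header-file-name myData.arff
  //
  // Any options not consumed here are passed on to the underlying
  // CSVToARFFHeaderMapTask and stored via setCsvToArffTaskOptions().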
  @Override
  public Enumeration<Option> listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.add(new Option(
      "\tPath to header file to use. Set this if you have\n\t"
        + "run previous jobs that have generated a header already. Setting this\n\t"
        + "prevents this job from running.", "existing-header", 1,
      "-existing-header"));

    result.add(new Option(
      "\tComma separated list of attribute names to use.\n\t"
        + "Use either this option, -names-file or neither (in which case\n\t"
        + "attribute names will be generated).", "A", 1,
      "-A <attribute names>"));

    result.add(new Option(
      "\tLocation of a names file to source attribute names\n\t"
        + "from. Can exist locally or in HDFS. "
        + "Use either this option, -A or neither (in which case\n\t"
        + "attribute names will be generated).", "names-file", 1,
      "-names-file <path to file>"));

    result.add(new Option(
      "\tFile name for output ARFF header. Note that this is a name only\n\t"
        + "and not a path. This file will be created in the output directory\n\t"
        + "specified by the -output-path option. (default is a "
        + "randomly generated name)", "header-file-name", 1,
      "-header-file-name <name>"));

    CSVToARFFHeaderMapTask tempTask = new CSVToARFFHeaderMapTask();
    Enumeration<Option> mtOpts = tempTask.listOptions();
    while (mtOpts.hasMoreElements()) {
      result.addElement(mtOpts.nextElement());
    }

    result.add(new Option("", "", 0,
      "\nGeneral Hadoop job configuration options:"));

    Enumeration<Option> superOpts = super.listOptions();
    while (superOpts.hasMoreElements()) {
      result.addElement(superOpts.nextElement());
    }

    return result.elements();
  }
  /**
   * Adds options from the ARFF map task to the supplied list of options
   *
   * @param opts the list of options to add the ARFF options to
   */
  protected void addArffMapTaskOpts(List<String> opts) {
    String[] arffMapOpts = m_arffMapTask.getOptions();

    for (String s : arffMapOpts) {
      opts.add(s);
    }
  }
  /**
   * Adds a tab for the ARFF header job
   *
   * @param tabTitle the title for the tab
   * @param arffJob the ARFF job
   */
  protected void addTabForArffHeaderJob(String tabTitle, HadoopJob arffJob) {
    JPanel jobHolder = new JPanel();
    jobHolder.setLayout(new BorderLayout());

    PropertySheetPanel arffJobEditor = new PropertySheetPanel();
    arffJobEditor.setEnvironment(m_env);
    arffJobEditor.setTarget(arffJob);
    jobHolder.add(arffJobEditor, BorderLayout.NORTH);

    m_arffMapTask = new CSVToARFFHeaderMapTask();
    try {
      m_arffMapTask.setOptions(Utils.splitOptions(m_optionsOrig));
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    PropertySheetPanel mapTaskEditor = new PropertySheetPanel();
    mapTaskEditor.setTarget(m_arffMapTask);
    mapTaskEditor.setEnvironment(m_env);
    jobHolder.add(mapTaskEditor, BorderLayout.CENTER);

    // JScrollPane scroller = new JScrollPane(jobHolder);

    m_configTabs.addTab(tabTitle, jobHolder);
  }
  @Override
  public boolean runJob() throws DistributedWekaException {
    boolean success = true;
    m_finalHeader = null;

    ClassLoader orig = Thread.currentThread().getContextClassLoader();
    try {
      Thread.currentThread().setContextClassLoader(
        this.getClass().getClassLoader());

      if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
        try {
          handleExistingHeaderFile();

          // done!!
          return true;
        } catch (DistributedWekaException ex) {
          logMessage("Unable to load existing header file from '"
            + getPathToExistingHeader() + "' (reason: " + ex.getMessage()
            + "). Running job to create header...");
        }
      }

      setJobStatus(JobStatus.RUNNING);

      if (m_env == null) {
        m_env = Environment.getSystemWide();
      }

      // make sure that we save out to a subdirectory of the output directory
      // so that the arff header doesn't get deleted by any jobs that invoke
      // us first before themselves
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      m_mrConfig.setOutputPath(outputPath);

      Random r = new Random();
      String outHeadName = "" + Math.abs(r.nextInt());
      if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
        outHeadName = environmentSubstitute(getOutputHeaderFileName());
      }
      if (!outHeadName.toLowerCase().endsWith(".arff")) {
        outHeadName += ".arff";
      }
      outputPath += "/" + outHeadName;
      // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
      //   m_mrConfig.getHDFSConfig(), outputPath, m_env);
      m_hdfsPathToAggregatedHeader = outputPath;

      // Where to write the consolidated ARFF file to
      m_mrConfig.setUserSuppliedProperty(
        CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

      // Now let's check to see whether this job needs to run at all - i.e.
      // if the header can be produced immediately (no nominal values need to
      // be determined) then we can just write the ARFF header directly to the
      // output location in HDFS and quit
      CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
        tempMapTask.setOptions(tempOpts);
      }

      boolean haveAttributeNames = false;
      List<String> tempAttNames = null;
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        tempAttNames = new ArrayList<String>();
        String[] n = environmentSubstitute(getAttributeNames()).split(",");
        for (String nn : n) {
          nn = nn.trim();
          if (nn.length() > 0) {
            tempAttNames.add(nn);
          }
        }
        haveAttributeNames = true;
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        // handle names file - bail out if there are no atts or names file
        String namesFile = environmentSubstitute(getAttributeNamesFile());

        if (namesFile.startsWith("hdfs://")) {
          namesFile = namesFile.replace("hdfs://", "");

          // strip the host and port (if provided)
          namesFile = namesFile.substring(namesFile.indexOf("/"));
          String pathS = HDFSUtils.resolvePath(namesFile, m_env);
          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          FileSystem fs = FileSystem.get(tempConf);
          Path path = new Path(pathS);
          InputStream is = fs.open(path);
          InputStreamReader isr = new InputStreamReader(is);
          BufferedReader br = new BufferedReader(isr);

          // this closes the stream for us
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        } else {
          // local file
          URI uri = new URI(namesFile);
          File path = new File(uri.getPath());
          BufferedReader br = new BufferedReader(new FileReader(path));
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        }
        haveAttributeNames = true;
      }

      // can't generate header immediately if we have to compute summary
      // stats
      if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
        if (tempMapTask.headerAvailableImmediately(tempAttNames.size(),
          tempAttNames, new StringBuffer())) {
          Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(),
            tempAttNames);
          m_finalHeader = finalHeader;

          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader,
            outputPath, tempConf);

          // done!
          return true;
        }
      }

      // Otherwise, go ahead with the job...
      Configuration conf = new Configuration();

      // Options to the map task and the underlying general Weka map
      // task
      StringBuilder csvToArffTaskOptions = new StringBuilder();
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        csvToArffTaskOptions.append(" -A ").append(
          environmentSubstitute(getAttributeNames()));
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        String filenameOnly = handleNamesFile(conf);
        csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
      }

      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
      }

      if (csvToArffTaskOptions.length() > 0) {
        m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
          environmentSubstitute(csvToArffTaskOptions.toString()));

        setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
      }

      // install the weka libraries and any user-selected packages
      // to HDFS and add to the distributed cache/classpath for
      // the job
      installWekaLibrariesInHDFS(conf);

      Job job = m_mrConfig.configureForHadoop(
        environmentSubstitute(getJobName()), conf, m_env);

      cleanOutputDirectory(job);

      statusMessage("Submitting job: " + getJobName());
      logMessage("Submitting job: " + getJobName());

      success = runJob(job);

      if (success) {
        getFinalHeaderFromHDFS(conf, outputPath);
      }

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    } finally {
      Thread.currentThread().setContextClassLoader(orig);
    }

    return success;
  }
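  // A minimal usage sketch, assuming this class is the ARFF header Hadoop job
  // (referred to here as ArffHeaderHadoopJob) and that it exposes a
  // getFinalHeader() accessor for the aggregated header - neither name is
  // confirmed by this excerpt. The setters match the mutators called from
  // setOptions() above; the values are illustrative only:
  //
  //   ArffHeaderHadoopJob arffJob = new ArffHeaderHadoopJob();
  //   arffJob.setAttributeNames("att1,att2,class"); // or setAttributeNamesFile("hdfs://.../names.txt")
  //   arffJob.setOutputHeaderFileName("myData.arff");
  //   if (arffJob.runJob()) {
  //     Instances header = arffJob.getFinalHeader(); // aggregated ARFF header
  //   }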