/**
 * Get the options specific to this job only.
 *
 * @return the options specific to this job only
 */
public String[] getJobOptionsOnly() {
  List<String> options = new ArrayList<String>();

  if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
    options.add("-existing-header");
    options.add(getPathToExistingHeader());
  }

  if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
    options.add("-A");
    options.add(getAttributeNames());
  }

  if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
    options.add("-names-file");
    options.add(getAttributeNamesFile());
  }

  if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
    options.add("-header-file-name");
    options.add(getOutputHeaderFileName());
  }

  return options.toArray(new String[options.size()]);
}
@Override
public void setOptions(String[] options) throws Exception {
  // these are options to the hadoop map task (especially the -names-file)
  String existing = Utils.getOption("existing-header", options);
  setPathToExistingHeader(existing);

  String attNames = Utils.getOption('A', options);
  setAttributeNames(attNames);

  String namesFile = Utils.getOption("names-file", options);
  setAttributeNamesFile(namesFile);

  String outputName = Utils.getOption("header-file-name", options);
  setOutputHeaderFileName(outputName);

  super.setOptions(options);

  // any options to pass on to the underlying Weka csv to arff map task?
  CSVToARFFHeaderMapTask tempMap = new CSVToARFFHeaderMapTask();
  tempMap.setOptions(options);

  String optsToWekaMapTask = Utils.joinOptions(tempMap.getOptions());
  if (!DistributedJobConfig.isEmpty(optsToWekaMapTask)) {
    setCsvToArffTaskOptions(optsToWekaMapTask);
  }
}
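A minimal usage sketch for the option handling above (not part of the original source; the enclosing class is assumed to be the ARFF-header Hadoop job, written here as ArffHeaderHadoopJob). Job-level options such as -A and -header-file-name are consumed first; any tokens left over are offered to a temporary CSVToARFFHeaderMapTask, and whatever that task recognises is stored via setCsvToArffTaskOptions().

public static void exampleConfigureFromOptions() throws Exception {
  // ArffHeaderHadoopJob is an assumed name for the class that declares the
  // setOptions() method above
  ArffHeaderHadoopJob job = new ArffHeaderHadoopJob();

  String[] opts = {
    "-A", "sepallength,sepalwidth,petallength,petalwidth,class", // attribute names
    "-header-file-name", "iris.header.arff" }; // name for the aggregated header

  // consumes the job-specific options; anything unrecognised is passed on to
  // the underlying CSVToARFFHeaderMapTask
  job.setOptions(opts);
}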
/**
 * Set the model to use
 *
 * @param model the model to use
 * @param modelHeader the header of the training data used to train the model
 * @param dataHeader the header of the incoming data
 * @throws DistributedWekaException if more than 50% of the attributes
 *           expected by the model are missing or have a type mismatch with
 *           the incoming data
 */
public void setModel(Object model, Instances modelHeader, Instances dataHeader)
  throws DistributedWekaException {

  m_missingMismatch.clear();

  if (dataHeader == null || modelHeader == null) {
    throw new DistributedWekaException(
      "Can't continue without a header for the model and incoming data");
  }

  try {
    m_isUsingStringAttributes = modelHeader.checkForStringAttributes();
    m_model = ScoringModel.createScorer(model);

    if (modelHeader != null) {
      m_model.setHeader(modelHeader);
    }

    if (m_model.isBatchPredicor()) {
      m_batchScoringData = new Instances(modelHeader, 0);
      Environment env = Environment.getSystemWide();
      String batchSize = ((BatchPredictor) model).getBatchSize();
      if (!DistributedJobConfig.isEmpty(batchSize)) {
        m_batchSize = Integer.parseInt(env.substitute(batchSize));
      } else {
        m_batchSize = 1000;
      }
    }
  } catch (Exception ex) {
    throw new DistributedWekaException(ex);
  }

  buildAttributeMap(modelHeader, dataHeader);
}
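A usage sketch for setModel() (illustrative, not from the original source; the enclosing class is assumed to be a scoring map task, written here as WekaScoringMapTask). The model header is the structure of the data the classifier was trained on, and the data header is the structure of the incoming rows to be scored.

public static void exampleSetModel(Classifier trainedClassifier,
  Instances trainingHeader, Instances incomingHeader)
  throws DistributedWekaException {

  // WekaScoringMapTask is an assumed name for the class that declares
  // setModel() above
  WekaScoringMapTask task = new WekaScoringMapTask();

  // throws DistributedWekaException if more than 50% of the attributes the
  // model expects are missing or type-mismatched in the incoming data
  task.setModel(trainedClassifier, trainingHeader, incomingHeader);
}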
@Override
public String[] getOptions() {
  List<String> options = new ArrayList<String>();

  // these are options to the hadoop map task (especially the -names-file)
  if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
    options.add("-existing-header");
    options.add(getPathToExistingHeader());
  }

  if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
    options.add("-A");
    options.add(getAttributeNames());
  }

  if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
    options.add("-names-file");
    options.add(getAttributeNamesFile());
  }

  if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
    options.add("-header-file-name");
    options.add(getOutputHeaderFileName());
  }

  String[] superOpts = super.getOptions();
  for (String o : superOpts) {
    options.add(o);
  }

  if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
    try {
      String[] csvOpts = Utils.splitOptions(getCsvToArffTaskOptions());
      for (String s : csvOpts) {
        options.add(s);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  return options.toArray(new String[options.size()]);
}
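getOptions() is the inverse of setOptions(): it emits the job-specific options, then the superclass options, then the stored CSV task options. A small round-trip sketch (illustrative only; ArffHeaderHadoopJob is again an assumed class name):

public static void exampleOptionRoundTrip(ArffHeaderHadoopJob job)
  throws Exception {
  // flatten the configuration to a single command-line style string
  String cmdLine = Utils.joinOptions(job.getOptions());

  // an equivalently configured copy can be rebuilt from that string
  ArffHeaderHadoopJob copy = new ArffHeaderHadoopJob();
  copy.setOptions(Utils.splitOptions(cmdLine));
}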
/**
 * Constructor
 *
 * @param properties a map of properties to edit
 */
public HadoopPropertyPanel(Map<String, String> properties) {
  setLayout(new BorderLayout());
  setBorder(BorderFactory.createTitledBorder("User defined properties"));
  add(m_table, BorderLayout.CENTER);

  // populate table with supplied properties
  if (properties != null) {
    int row = 0;
    JTable table = m_table.getTable();
    for (Map.Entry<String, String> e : properties.entrySet()) {
      String prop = e.getKey();
      String val = e.getValue();

      // make sure to skip internal weka properties!!
      if (!DistributedJobConfig.isEmpty(val) && !prop.startsWith("*")) {
        table.getModel().setValueAt(prop, row, 0);
        table.getModel().setValueAt(val, row, 1);
        ((InteractiveTableModel) table.getModel()).addEmptyRow();
        row++;
      }
    }
  }
}
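A construction sketch for the panel (illustrative, not from the original source): entries with empty values, or whose keys start with "*" (internal Weka properties), are skipped when the table is populated. The Hadoop property name used below is only an example.

public static void examplePropertyPanel() {
  Map<String, String> props = new LinkedHashMap<String, String>();
  props.put("mapred.child.java.opts", "-Xmx1024m"); // shown in the table
  props.put("*weka.internal.prop", "hidden");       // skipped: starts with "*"

  HadoopPropertyPanel panel = new HadoopPropertyPanel(props);

  // host the panel in a frame purely for demonstration
  javax.swing.JFrame frame = new javax.swing.JFrame("User defined properties");
  frame.getContentPane().add(panel);
  frame.pack();
  frame.setVisible(true);
}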
@Override
public boolean runJob() throws DistributedWekaException {
  boolean success = true;
  m_finalHeader = null;

  ClassLoader orig = Thread.currentThread().getContextClassLoader();
  try {
    Thread.currentThread().setContextClassLoader(
      this.getClass().getClassLoader());

    if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
      try {
        handleExistingHeaderFile();

        // done!!
        return true;
      } catch (DistributedWekaException ex) {
        logMessage("Unable to load existing header file from '"
          + getPathToExistingHeader() + "' (reason: " + ex.getMessage()
          + "). Running job to create header...");
      }
    }

    setJobStatus(JobStatus.RUNNING);

    if (m_env == null) {
      m_env = Environment.getSystemWide();
    }

    // make sure that we save out to a subdirectory of the output directory
    // so that the arff header doesn't get deleted by any jobs that invoke
    // us first before running themselves
    String outputPath = m_mrConfig.getOutputPath();
    outputPath += OUTPUT_SUBDIR;
    m_mrConfig.setOutputPath(outputPath);

    Random r = new Random();
    String outHeadName = "" + Math.abs(r.nextInt());
    if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
      outHeadName = environmentSubstitute(getOutputHeaderFileName());
    }
    if (!outHeadName.toLowerCase().endsWith(".arff")) {
      outHeadName += ".arff";
    }
    outputPath += "/" + outHeadName;
    // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
    // m_mrConfig.getHDFSConfig(), outputPath, m_env);
    m_hdfsPathToAggregatedHeader = outputPath;

    // Where to write the consolidated ARFF file to
    m_mrConfig.setUserSuppliedProperty(
      CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

    // Now let's check to see whether this job needs to run at all - i.e.
    // if the header can be produced immediately (no nominal values need to
    // be determined) then we can just write the ARFF header directly to the
    // output location in HDFS and quit
    CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
    if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
      String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
      tempMapTask.setOptions(tempOpts);
    }

    boolean haveAttributeNames = false;
    List<String> tempAttNames = null;
    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      tempAttNames = new ArrayList<String>();
      String[] n = environmentSubstitute(getAttributeNames()).split(",");
      for (String nn : n) {
        nn = nn.trim();
        if (nn.length() > 0) {
          tempAttNames.add(nn);
        }
      }
      haveAttributeNames = true;
    } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      // handle names file - bail out if there are no atts or names file
      String namesFile = environmentSubstitute(getAttributeNamesFile());
      if (namesFile.startsWith("hdfs://")) {
        namesFile = namesFile.replace("hdfs://", "");

        // strip the host and port (if provided)
        namesFile = namesFile.substring(namesFile.indexOf("/"));
        String pathS = HDFSUtils.resolvePath(namesFile, m_env);
        Configuration tempConf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
        FileSystem fs = FileSystem.get(tempConf);
        Path path = new Path(pathS);
        InputStream is = fs.open(path);
        InputStreamReader isr = new InputStreamReader(is);
        BufferedReader br = new BufferedReader(isr);

        // this closes the stream for us
        tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
      } else {
        // local file
        URI uri = new URI(namesFile);
        File path = new File(uri.getPath());
        BufferedReader br = new BufferedReader(new FileReader(path));
        tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
      }
      haveAttributeNames = true;
    }

    // can't generate the header immediately if we have to compute summary
    // stats
    if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
      if (tempMapTask.headerAvailableImmediately(tempAttNames.size(),
        tempAttNames, new StringBuffer())) {
        Instances finalHeader =
          tempMapTask.getHeader(tempAttNames.size(), tempAttNames);
        m_finalHeader = finalHeader;

        Configuration tempConf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
        CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader,
          outputPath, tempConf);

        // done!
        return true;
      }
    }

    // Otherwise, go ahead with the job...
    Configuration conf = new Configuration();

    // Options to the map task and the underlying general Weka map task
    StringBuilder csvToArffTaskOptions = new StringBuilder();
    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      csvToArffTaskOptions.append(" -A ").append(
        environmentSubstitute(getAttributeNames()));
    } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      String filenameOnly = handleNamesFile(conf);
      csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
    }

    if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
      csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
    }

    if (csvToArffTaskOptions.length() > 0) {
      m_mrConfig.setUserSuppliedProperty(
        CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
        environmentSubstitute(csvToArffTaskOptions.toString()));

      setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
    }

    // install the weka libraries and any user-selected packages
    // to HDFS and add to the distributed cache/classpath for the job
    installWekaLibrariesInHDFS(conf);

    Job job =
      m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf,
        m_env);
    cleanOutputDirectory(job);

    statusMessage("Submitting job: " + getJobName());
    logMessage("Submitting job: " + getJobName());

    success = runJob(job);

    if (success) {
      getFinalHeaderFromHDFS(conf, outputPath);
    }

    setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
  } catch (Exception ex) {
    setJobStatus(JobStatus.FAILED);
    throw new DistributedWekaException(ex);
  } finally {
    Thread.currentThread().setContextClassLoader(orig);
  }

  return success;
}
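A driver sketch for runJob() (illustrative, not from the original source; ArffHeaderHadoopJob is an assumed class name, and input/output paths are assumed to have been configured on the job's MapReduce/HDFS settings beforehand). The job reuses an existing header if one was supplied, writes the header directly when it can be produced without a map-reduce pass, and otherwise submits the Hadoop job and reads the aggregated header back from HDFS.

public static void exampleRunHeaderJob() throws Exception {
  // ArffHeaderHadoopJob is an assumed name for the class that declares
  // runJob() above
  ArffHeaderHadoopJob job = new ArffHeaderHadoopJob();

  // comma-separated attribute names for the CSV input and a name for the
  // aggregated ARFF header written beneath the job's output directory
  job.setAttributeNames("sepallength,sepalwidth,petallength,petalwidth,class");
  job.setOutputHeaderFileName("iris.header.arff");

  boolean ok = job.runJob();
  if (!ok) {
    throw new Exception("ARFF header job failed - check the job logs");
  }
}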