/**
 * Copies the names file into HDFS (if necessary) and adds it to the distributed cache for the
 * job
 *
 * @param conf the Configuration for the job
 * @return the filename part of the path
 * @throws IOException if a problem occurs
 */
protected String handleNamesFile(Configuration conf) throws IOException {
  String namesFile = environmentSubstitute(getAttributeNamesFile());

  String filenameOnly = HDFSUtils.addFileToDistributedCache(
    m_mrConfig.getHDFSConfig(), conf, namesFile, m_env);

  return filenameOnly;
}
/**
 * Checks for the existence of an existing ARFF header file in HDFS or the local file system. If
 * local, it is copied into HDFS.
 *
 * @throws DistributedWekaException if the file does not exist or there is a problem transferring
 *           it into HDFS
 */
protected void handleExistingHeaderFile() throws DistributedWekaException {
  String existingPath = getPathToExistingHeader();

  try {
    existingPath = environmentSubstitute(existingPath);
  } catch (Exception ex) {
    // ignore - use the path as supplied if substitution fails
  }

  // check local file system first
  File f = new File(existingPath);
  boolean success = false;
  if (f.exists()) {
    // copy this file into HDFS
    String hdfsDest = HDFSUtils.WEKA_TEMP_DISTRIBUTED_CACHE_FILES + f.getName();
    try {
      HDFSUtils.copyToHDFS(existingPath, hdfsDest, m_mrConfig.getHDFSConfig(),
        m_env, true);

      m_hdfsPathToAggregatedHeader = hdfsDest;
      Configuration conf = new Configuration();
      m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
      getFinalHeaderFromHDFS(conf, hdfsDest);
      success = true;
    } catch (IOException e) {
      throw new DistributedWekaException(e);
    }
  } else {
    // not on the local file system - check HDFS directly
    try {
      Path p = new Path(existingPath);
      Configuration conf = new Configuration();
      m_mrConfig.getHDFSConfig().configureForHadoop(conf, m_env);
      FileSystem fs = FileSystem.get(conf);

      if (fs.exists(p)) {
        m_hdfsPathToAggregatedHeader = existingPath;
        getFinalHeaderFromHDFS(conf, existingPath);
        success = true;
      }
    } catch (IOException ex) {
      throw new DistributedWekaException(ex);
    }
  }

  if (!success) {
    throw new DistributedWekaException("Was unable to find '" + existingPath
      + "' on either the local file system or in HDFS");
  }
}
@Override
public boolean runJob() throws DistributedWekaException {
  boolean success = true;
  m_finalHeader = null;

  ClassLoader orig = Thread.currentThread().getContextClassLoader();
  try {
    Thread.currentThread().setContextClassLoader(
      this.getClass().getClassLoader());

    if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
      try {
        handleExistingHeaderFile();

        // done!!
        return true;
      } catch (DistributedWekaException ex) {
        logMessage("Unable to load existing header file from '"
          + getPathToExistingHeader() + "' (reason: " + ex.getMessage()
          + "). Running job to create header...");
      }
    }

    setJobStatus(JobStatus.RUNNING);

    if (m_env == null) {
      m_env = Environment.getSystemWide();
    }

    // make sure that we save out to a subdirectory of the output directory
    // so that the arff header doesn't get deleted by any jobs that invoke
    // us first before themselves
    String outputPath = m_mrConfig.getOutputPath();
    outputPath += OUTPUT_SUBDIR;
    m_mrConfig.setOutputPath(outputPath);

    Random r = new Random();
    String outHeadName = "" + Math.abs(r.nextInt());
    if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
      outHeadName = environmentSubstitute(getOutputHeaderFileName());
    }
    if (!outHeadName.toLowerCase().endsWith(".arff")) {
      outHeadName += ".arff";
    }
    outputPath += "/" + outHeadName;
    // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
    //   m_mrConfig.getHDFSConfig(), outputPath, m_env);
    m_hdfsPathToAggregatedHeader = outputPath;

    // Where to write the consolidated ARFF file to
    m_mrConfig.setUserSuppliedProperty(
      CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

    // Now let's check to see whether this job needs to run at all - i.e.
    // if the header can be produced immediately (no nominal values need to
    // be determined) then we can just write the ARFF header directly to the
    // output location in HDFS and quit
    CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
    if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
      String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
      tempMapTask.setOptions(tempOpts);
    }

    boolean haveAttributeNames = false;
    List<String> tempAttNames = null;
    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      tempAttNames = new ArrayList<String>();
      String[] n = environmentSubstitute(getAttributeNames()).split(",");
      for (String nn : n) {
        nn = nn.trim();
        if (nn.length() > 0) {
          tempAttNames.add(nn);
        }
      }
      haveAttributeNames = true;
    } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      // handle names file - bail out if there are no atts or names file
      String namesFile = environmentSubstitute(getAttributeNamesFile());

      if (namesFile.startsWith("hdfs://")) {
        namesFile = namesFile.replace("hdfs://", "");

        // strip the host and port (if provided)
        namesFile = namesFile.substring(namesFile.indexOf("/"));
        String pathS = HDFSUtils.resolvePath(namesFile, m_env);
        Configuration tempConf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
        FileSystem fs = FileSystem.get(tempConf);
        Path path = new Path(pathS);
        InputStream is = fs.open(path);
        InputStreamReader isr = new InputStreamReader(is);
        BufferedReader br = new BufferedReader(isr);

        // this closes the stream for us
        tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
      } else {
        // local file
        URI uri = new URI(namesFile);
        File path = new File(uri.getPath());
        BufferedReader br = new BufferedReader(new FileReader(path));
        tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
      }
      haveAttributeNames = true;
    }

    // can't generate the header immediately if we have to compute summary
    // stats
    if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
      if (tempMapTask.headerAvailableImmediately(tempAttNames.size(),
        tempAttNames, new StringBuffer())) {
        Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(),
          tempAttNames);
        m_finalHeader = finalHeader;

        Configuration tempConf = new Configuration();
        m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
        CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader,
          outputPath, tempConf);

        // done!
        return true;
      }
    }

    // Otherwise, go ahead with the job...
    Configuration conf = new Configuration();

    // Options to the map task and the underlying general Weka map
    // task
    StringBuilder csvToArffTaskOptions = new StringBuilder();
    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      csvToArffTaskOptions.append(" -A ").append(
        environmentSubstitute(getAttributeNames()));
    } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      String filenameOnly = handleNamesFile(conf);
      csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
    }

    if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
      csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
    }

    if (csvToArffTaskOptions.length() > 0) {
      m_mrConfig.setUserSuppliedProperty(
        CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
        environmentSubstitute(csvToArffTaskOptions.toString()));

      setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
    }

    // install the weka libraries and any user-selected packages
    // to HDFS and add to the distributed cache/classpath for
    // the job
    installWekaLibrariesInHDFS(conf);

    Job job = m_mrConfig.configureForHadoop(
      environmentSubstitute(getJobName()), conf, m_env);

    cleanOutputDirectory(job);

    statusMessage("Submitting job: " + getJobName());
    logMessage("Submitting job: " + getJobName());

    success = runJob(job);

    if (success) {
      getFinalHeaderFromHDFS(conf, outputPath);
    }

    setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
  } catch (Exception ex) {
    setJobStatus(JobStatus.FAILED);
    throw new DistributedWekaException(ex);
  } finally {
    Thread.currentThread().setContextClassLoader(orig);
  }

  return success;
}
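/*
 * A minimal driver sketch (not part of the original source) showing how this header job might be
 * invoked programmatically. Assumptions: the enclosing class is the ARFF-header Hadoop job
 * (referred to here as ArffHeaderHadoopJob), setter counterparts exist for the getters used in
 * runJob() (e.g. setAttributeNames()), and a getFinalHeader() accessor exposes m_finalHeader;
 * verify these names against the actual API before use.
 *
 *   ArffHeaderHadoopJob headerJob = new ArffHeaderHadoopJob();      // assumed class name
 *   headerJob.setAttributeNames("sepallength,sepalwidth,petallength,petalwidth,class");
 *   if (headerJob.runJob()) {                                       // returns true on success
 *     Instances header = headerJob.getFinalHeader();                // assumed accessor for m_finalHeader
 *     System.out.println(header);
 *   }
 */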