Example #1
  /**
   * Get the options specific to this job only.
   *
   * @return the options specific to this job only
   */
  public String[] getJobOptionsOnly() {
    List<String> options = new ArrayList<String>();

    if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
      options.add("-existing-header");
      options.add(getPathToExistingHeader());
    }

    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      options.add("-A");
      options.add(getAttributeNames());
    }

    if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      options.add("-names-file");
      options.add(getAttributeNamesFile());
    }

    if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
      options.add("-header-file-name");
      options.add(getOutputHeaderFileName());
    }

    return options.toArray(new String[options.size()]);
  }
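
The String[] built by getJobOptionsOnly() is the usual WEKA option-array form and is typically flattened to a single command-line style string with weka.core.Utils and parsed back later. A minimal, self-contained round-trip sketch (the option values below are made up for illustration):

import weka.core.Utils;

public class JobOptionsRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical values, in the same shape getJobOptionsOnly() produces.
    String[] jobOpts = {
      "-existing-header", "/user/weka/header.arff",
      "-A", "sepallength,sepalwidth,petallength,petalwidth,class"
    };

    // joinOptions() quotes any value containing whitespace, so the
    // resulting string can be split back into an identical array.
    String joined = Utils.joinOptions(jobOpts);
    System.out.println(joined);

    String[] split = Utils.splitOptions(joined);
    System.out.println(split.length == jobOpts.length); // true
  }
}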
Example #2
  @Override
  public void setOptions(String[] options) throws Exception {

    // these are options to the hadoop map task (especially the -names-file)

    String existing = Utils.getOption("existing-header", options);
    setPathToExistingHeader(existing);

    String attNames = Utils.getOption('A', options);
    setAttributeNames(attNames);

    String namesFile = Utils.getOption("names-file", options);
    setAttributeNamesFile(namesFile);

    String outputName = Utils.getOption("header-file-name", options);
    setOutputHeaderFileName(outputName);

    super.setOptions(options);

    // any options to pass on to the underlying Weka csv to arff map task?
    CSVToARFFHeaderMapTask tempMap = new CSVToARFFHeaderMapTask();
    tempMap.setOptions(options);

    String optsToWekaMapTask = Utils.joinOptions(tempMap.getOptions());
    if (!DistributedJobConfig.isEmpty(optsToWekaMapTask)) {
      setCsvToArffTaskOptions(optsToWekaMapTask);
    }
  }
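
Utils.getOption() removes each matched flag and its value from the options array (they are blanked out), which is why the leftover entries can be handed to the temporary CSVToARFFHeaderMapTask afterwards without re-processing the job's own flags. A small stand-alone illustration of that consumption behaviour (the flags and values are arbitrary):

import weka.core.Utils;

public class GetOptionSketch {
  public static void main(String[] args) throws Exception {
    String[] options = {"-A", "a,b,c", "-existing-header", "header.arff"};

    // getOption() returns the value and blanks out both the flag and
    // its value in the array.
    String attNames = Utils.getOption('A', options);
    System.out.println(attNames); // a,b,c

    // Only the unconsumed options remain; these are what would be
    // passed on to another OptionHandler.
    for (String o : options) {
      if (o.length() > 0) {
        System.out.print(o + " "); // -existing-header header.arff
      }
    }
  }
}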
Example #3
  /**
   * Set the model to use
   *
   * @param model the model to use
   * @param modelHeader the header of the training data used to train the model
   * @param dataHeader the header of the incoming data
   * @throws DistributedWekaException if more than 50% of the attributes expected by the model are
   *     missing or have a type mismatch with the incoming data
   */
  public void setModel(Object model, Instances modelHeader, Instances dataHeader)
      throws DistributedWekaException {

    m_missingMismatch.clear();

    if (dataHeader == null || modelHeader == null) {
      throw new DistributedWekaException(
          "Can't continue without a header for the model and incoming data");
    }
    try {
      m_isUsingStringAttributes = modelHeader.checkForStringAttributes();
      m_model = ScoringModel.createScorer(model);

      if (modelHeader != null) {
        m_model.setHeader(modelHeader);
      }

      if (m_model.isBatchPredicor()) {
        m_batchScoringData = new Instances(modelHeader, 0);
        Environment env = Environment.getSystemWide();
        String batchSize = ((BatchPredictor) model).getBatchSize();
        if (!DistributedJobConfig.isEmpty(batchSize)) {
          m_batchSize = Integer.parseInt(env.substitute(batchSize));
        } else {
          m_batchSize = 1000;
        }
      }
    } catch (Exception ex) {
      throw new DistributedWekaException(ex);
    }

    buildAttributeMap(modelHeader, dataHeader);
  }
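
The batch size retrieved from the BatchPredictor may contain environment variables, which is why it is passed through Environment.substitute() before Integer.parseInt(). A minimal sketch of that substitution step (the variable name and value are invented for illustration):

import weka.core.Environment;

public class BatchSizeSubstitutionSketch {
  public static void main(String[] args) throws Exception {
    Environment env = Environment.getSystemWide();
    // Register a variable; in a real job it could also come from the
    // OS environment or Java system properties.
    env.addVariable("BATCH_SIZE", "500");

    // Hypothetical value as it might be returned by getBatchSize().
    String batchSize = "${BATCH_SIZE}";
    int parsed = Integer.parseInt(env.substitute(batchSize));
    System.out.println(parsed); // 500
  }
}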
Example #4
  @Override
  public String[] getOptions() {
    List<String> options = new ArrayList<String>();

    // these are options to the hadoop map task (especially the -names-file)
    if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
      options.add("-existing-header");
      options.add(getPathToExistingHeader());
    }

    if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
      options.add("-A");
      options.add(getAttributeNames());
    }

    if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
      options.add("-names-file");
      options.add(getAttributeNamesFile());
    }

    if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
      options.add("-header-file-name");
      options.add(getOutputHeaderFileName());
    }

    String[] superOpts = super.getOptions();
    for (String o : superOpts) {
      options.add(o);
    }

    if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
      try {
        String[] csvOpts = Utils.splitOptions(getCsvToArffTaskOptions());

        for (String s : csvOpts) {
          options.add(s);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    return options.toArray(new String[options.size()]);
  }
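
The try/catch around Utils.splitOptions() above exists because the stored CSV-to-ARFF option string may not be well formed; splitOptions() throws if, for example, a quote is left unbalanced. A short sketch of both cases (the option string itself is arbitrary):

import weka.core.Utils;

public class SplitOptionsSketch {
  public static void main(String[] args) {
    try {
      // A well-formed option string splits cleanly; the quoted value
      // containing a space comes back as a single element.
      String[] ok = Utils.splitOptions("-M 4 -norm \"min max\"");
      System.out.println(ok.length); // 4
    } catch (Exception e) {
      e.printStackTrace();
    }

    try {
      // An unbalanced quote makes splitOptions() throw.
      Utils.splitOptions("-norm \"min max");
    } catch (Exception e) {
      System.out.println("bad option string: " + e.getMessage());
    }
  }
}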
Example #5
    /**
     * Constructor
     *
     * @param properties a map of properties to edit
     */
    public HadoopPropertyPanel(Map<String, String> properties) {
      setLayout(new BorderLayout());
      setBorder(BorderFactory.createTitledBorder("User defined properties"));
      add(m_table, BorderLayout.CENTER);

      // populate table with supplied properties
      if (properties != null) {
        int row = 0;
        JTable table = m_table.getTable();
        for (Map.Entry<String, String> e : properties.entrySet()) {
          String prop = e.getKey();
          String val = e.getValue();

          // make sure to skip internal weka properties!!
          if (!DistributedJobConfig.isEmpty(val) && !prop.startsWith("*")) {
            table.getModel().setValueAt(prop, row, 0);
            table.getModel().setValueAt(val, row, 1);
            ((InteractiveTableModel) table.getModel()).addEmptyRow();
            row++;
          }
        }
      }
    }
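
Since InteractiveTableModel and m_table are not shown here, the following stand-alone sketch only approximates the population pattern with a plain DefaultTableModel: skip empty values and internal (starred) WEKA properties, then add one row per remaining entry.

import java.awt.BorderLayout;
import java.util.LinkedHashMap;
import java.util.Map;
import javax.swing.JFrame;
import javax.swing.JScrollPane;
import javax.swing.JTable;
import javax.swing.table.DefaultTableModel;

public class PropertyTableSketch {
  public static void main(String[] args) {
    Map<String, String> properties = new LinkedHashMap<String, String>();
    properties.put("mapred.child.java.opts", "-Xmx1024m"); // example property
    properties.put("*internal.weka.prop", "skipped");      // filtered out below

    DefaultTableModel model =
      new DefaultTableModel(new Object[] {"Property", "Value"}, 0);
    for (Map.Entry<String, String> e : properties.entrySet()) {
      // skip empty values and internal (starred) weka properties
      if (e.getValue() != null && e.getValue().length() > 0
        && !e.getKey().startsWith("*")) {
        model.addRow(new Object[] {e.getKey(), e.getValue()});
      }
    }

    JFrame frame = new JFrame("User defined properties");
    frame.getContentPane().add(new JScrollPane(new JTable(model)), BorderLayout.CENTER);
    frame.pack();
    frame.setVisible(true);
  }
}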
Example #6
  @Override
  public boolean runJob() throws DistributedWekaException {
    boolean success = true;
    m_finalHeader = null;

    ClassLoader orig = Thread.currentThread().getContextClassLoader();
    try {
      Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
      if (!DistributedJobConfig.isEmpty(getPathToExistingHeader())) {
        try {
          handleExistingHeaderFile();

          // done!!
          return true;
        } catch (DistributedWekaException ex) {
          logMessage(
              "Unable to laod existing header file from '"
                  + getPathToExistingHeader()
                  + "' (reason: "
                  + ex.getMessage()
                  + "). Running job to create header...");
        }
      }

      setJobStatus(JobStatus.RUNNING);

      if (m_env == null) {
        m_env = Environment.getSystemWide();
      }

      // make sure that we save out to a subdirectory of the output directory
      // so that the arff header doesn't get deleted by any jobs that invoke
      // us first before themselves
      String outputPath = m_mrConfig.getOutputPath();
      outputPath += OUTPUT_SUBDIR;
      m_mrConfig.setOutputPath(outputPath);
      Random r = new Random();
      String outHeadName = "" + Math.abs(r.nextInt());
      if (!DistributedJobConfig.isEmpty(getOutputHeaderFileName())) {
        outHeadName = environmentSubstitute(getOutputHeaderFileName());
      }
      if (!outHeadName.toLowerCase().endsWith(".arff")) {
        outHeadName += ".arff";
      }
      outputPath += "/" + outHeadName;
      // m_hdfsPathToAggregatedHeader = HDFSUtils.constructHDFSURI(
      // m_mrConfig.getHDFSConfig(), outputPath, m_env);
      m_hdfsPathToAggregatedHeader = outputPath;

      // Where to write the consolidated ARFF file to
      m_mrConfig.setUserSuppliedProperty(
          CSVToArffHeaderHadoopReducer.CSV_TO_ARFF_HEADER_WRITE_PATH, outputPath);

      // Now let's check to see whether this job needs to run at all - i.e.
      // if the header can be produced immediately (no nominal values need to
      // be determined) then we can just write ARFF header directly to the
      // output location in HDFS and quit
      CSVToARFFHeaderMapTask tempMapTask = new CSVToARFFHeaderMapTask();
      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        String[] tempOpts = Utils.splitOptions(getCsvToArffTaskOptions());
        tempMapTask.setOptions(tempOpts);
      }
      boolean haveAttributeNames = false;
      List<String> tempAttNames = null;
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        tempAttNames = new ArrayList<String>();
        String[] n = environmentSubstitute(getAttributeNames()).split(",");
        for (String nn : n) {
          nn = nn.trim();
          if (nn.length() > 0) {
            tempAttNames.add(nn);
          }
        }
        haveAttributeNames = true;
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        // handle names file - bail out if there are no atts or names file
        String namesFile = environmentSubstitute(getAttributeNamesFile());
        if (namesFile.startsWith("hdfs://")) {
          namesFile = namesFile.replace("hdfs://", "");

          // strip the host and port (if provided)
          namesFile = namesFile.substring(namesFile.indexOf("/"));
          String pathS = HDFSUtils.resolvePath(namesFile, m_env);
          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          FileSystem fs = FileSystem.get(tempConf);
          Path path = new Path(pathS);
          InputStream is = fs.open(path);
          InputStreamReader isr = new InputStreamReader(is);
          BufferedReader br = new BufferedReader(isr);
          // this closes the stream for us
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        } else {
          // local file
          URI uri = new URI(namesFile);
          File path = new File(uri.getPath());
          BufferedReader br = new BufferedReader(new FileReader(path));
          tempAttNames = CSVToArffHeaderHadoopMapper.readNames(br);
        }
        haveAttributeNames = true;
      }

      // can't generate header immediately if we have to compute summary
      // stats
      if (haveAttributeNames && !tempMapTask.getComputeSummaryStats()) {
        if (tempMapTask.headerAvailableImmediately(
            tempAttNames.size(), tempAttNames, new StringBuffer())) {
          Instances finalHeader = tempMapTask.getHeader(tempAttNames.size(), tempAttNames);
          m_finalHeader = finalHeader;

          Configuration tempConf = new Configuration();
          m_mrConfig.getHDFSConfig().configureForHadoop(tempConf, m_env);
          CSVToArffHeaderHadoopReducer.writeHeaderToDestination(finalHeader, outputPath, tempConf);

          // done!
          return true;
        }
      }

      // Otherwise, go ahead with the job...
      Configuration conf = new Configuration();

      // Options to the map task and the underlying general Weka map
      // task
      StringBuilder csvToArffTaskOptions = new StringBuilder();
      if (!DistributedJobConfig.isEmpty(getAttributeNames())) {
        csvToArffTaskOptions.append(" -A ").append(environmentSubstitute(getAttributeNames()));
      } else if (!DistributedJobConfig.isEmpty(getAttributeNamesFile())) {
        String filenameOnly = handleNamesFile(conf);
        csvToArffTaskOptions.append(" -names-file ").append(filenameOnly);
      }

      if (!DistributedJobConfig.isEmpty(getCsvToArffTaskOptions())) {
        csvToArffTaskOptions.append(" ").append(getCsvToArffTaskOptions());
      }

      if (csvToArffTaskOptions.length() > 0) {
        m_mrConfig.setUserSuppliedProperty(
            CSVToArffHeaderHadoopMapper.CSV_TO_ARFF_HEADER_MAP_TASK_OPTIONS,
            environmentSubstitute(csvToArffTaskOptions.toString()));

        setJobName(getJobName() + " " + csvToArffTaskOptions.toString());
      }

      // install the weka libraries and any user-selected packages
      // to HDFS and add to the distributed cache/classpath for
      // the job
      installWekaLibrariesInHDFS(conf);

      Job job = m_mrConfig.configureForHadoop(environmentSubstitute(getJobName()), conf, m_env);

      cleanOutputDirectory(job);

      statusMessage("Submitting job: " + getJobName());
      logMessage("Submitting job: " + getJobName());

      success = runJob(job);

      if (success) {
        getFinalHeaderFromHDFS(conf, outputPath);
      }

      setJobStatus(success ? JobStatus.FINISHED : JobStatus.FAILED);
    } catch (Exception ex) {
      setJobStatus(JobStatus.FAILED);
      throw new DistributedWekaException(ex);
    } finally {
      Thread.currentThread().setContextClassLoader(orig);
    }

    return success;
  }
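
When the names file lives in HDFS, the code above strips the hdfs:// scheme plus an optional host:port before resolving the remainder as a path. A small sketch of just that string manipulation (the URI is invented for illustration):

public class HdfsPathStripSketch {
  public static void main(String[] args) {
    String namesFile = "hdfs://namenode:9000/user/weka/att-names.txt"; // hypothetical

    if (namesFile.startsWith("hdfs://")) {
      // drop the scheme
      namesFile = namesFile.replace("hdfs://", "");
      // drop the host and port (if any) up to the first '/'
      namesFile = namesFile.substring(namesFile.indexOf("/"));
    }

    System.out.println(namesFile); // /user/weka/att-names.txt
  }
}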