Beispiel #1
0
    /**
     * Compute the actual train_samples_per_iteration size from the user-given parameter
     *
     * @param mp Model parameter (DeepLearning object)
     * @param numRows number of training rows
     * @param model DL model
     * @return The total number of training rows to be processed per iteration (summed over on all
     *     nodes)
     */
    private long computeTrainSamplesPerIteration(
        final DeepLearningParameters mp, final long numRows, final DeepLearningModel model) {
      long tspi = mp._train_samples_per_iteration;
      assert (tspi == 0 || tspi == -1 || tspi == -2 || tspi >= 1);
      if (tspi == 0 || (!mp._replicate_training_data && tspi == -1)) {
        tspi = numRows;
        if (!mp._quiet_mode)
          Log.info(
              "Setting train_samples_per_iteration ("
                  + mp._train_samples_per_iteration
                  + ") to one epoch: #rows ("
                  + tspi
                  + ").");
      } else if (tspi == -1) {
        tspi = (mp._single_node_mode ? 1 : H2O.CLOUD.size()) * numRows;
        if (!mp._quiet_mode)
          Log.info(
              "Setting train_samples_per_iteration ("
                  + mp._train_samples_per_iteration
                  + ") to #nodes x #rows ("
                  + tspi
                  + ").");
      } else if (tspi == -2) {
        // automatic tuning based on CPU speed, network speed and model size

        // measure cpu speed
        double total_gflops = 0;
        for (H2ONode h2o : H2O.CLOUD._memary) {
          HeartBeat hb = h2o._heartbeat;
          total_gflops += hb._gflops; // can be NaN if not yet run
        }
        if (mp._single_node_mode) total_gflops /= H2O.CLOUD.size();
        if (Double.isNaN(total_gflops)) {
          total_gflops =
              Linpack.run(H2O.SELF._heartbeat._cpus_allowed)
                  * (mp._single_node_mode ? 1 : H2O.CLOUD.size());
        }
        assert (!Double.isNaN(total_gflops));

        final long model_size = model.model_info().size();
        int[] msg_sizes =
            new int[] {
              1,
              (int) (model_size * 4) == (model_size * 4)
                  ? (int) (model_size * 4)
                  : Integer.MAX_VALUE
            };
        double[] microseconds_collective = new double[msg_sizes.length];
        NetworkTest.NetworkTester nt =
            new NetworkTest.NetworkTester(
                msg_sizes,
                null,
                microseconds_collective,
                model_size > 1e6 ? 1 : 5 /*repeats*/,
                false,
                true /*only collectives*/);
        nt.compute2();

        // length of the network traffic queue based on log-tree rollup (2 log(nodes))
        int network_queue_length =
            mp._single_node_mode || H2O.CLOUD.size() == 1
                ? 1
                : 2 * (int) Math.floor(Math.log(H2O.CLOUD.size()) / Math.log(2));

        // heuristics
        double flops_overhead_per_row = 50;
        if (mp._activation == DeepLearningParameters.Activation.Maxout
            || mp._activation == DeepLearningParameters.Activation.MaxoutWithDropout) {
          flops_overhead_per_row *= 8;
        } else if (mp._activation == DeepLearningParameters.Activation.Tanh
            || mp._activation == DeepLearningParameters.Activation.TanhWithDropout) {
          flops_overhead_per_row *= 5;
        }

        // target fraction of comm vs cpu time: 5%
        double fraction =
            mp._single_node_mode || H2O.CLOUD.size() == 1
                ? 1e-3
                : mp._target_ratio_comm_to_comp; // one single node mode, there's no model averaging
                                                 // effect, so less need to shorten the M/R
                                                 // iteration

        // estimate the time for communication (network) and training (compute)
        model.time_for_communication_us =
            (H2O.CLOUD.size() == 1
                    ? 1e4 /* add 10ms for single-node */
                    : 1e5 /* add 100ms for multi-node MR overhead */)
                + network_queue_length * microseconds_collective[1];
        double time_per_row_us =
            (flops_overhead_per_row * model_size + 10000 * model.model_info().units[0])
                / (total_gflops * 1e9)
                / H2O.SELF._heartbeat._cpus_allowed
                * 1e6;
        assert (!Double.isNaN(time_per_row_us));

        // compute the optimal number of training rows per iteration
        // fraction := time_comm_us / (time_comm_us + tspi * time_per_row_us)  ==>  tspi =
        // (time_comm_us/fraction - time_comm_us)/time_per_row_us
        tspi =
            (long)
                ((model.time_for_communication_us / fraction - model.time_for_communication_us)
                    / time_per_row_us);

        tspi =
            Math.min(
                tspi,
                (mp._single_node_mode ? 1 : H2O.CLOUD.size())
                    * numRows
                    * 10); // not more than 10x of what train_samples_per_iteration=-1 would do

        // If the number is close to a multiple of epochs, use that -> prettier scoring
        if (tspi > numRows && Math.abs(tspi % numRows) / (double) numRows < 0.2)
          tspi -= tspi % numRows;
        tspi =
            Math.min(
                tspi,
                (long)
                    (mp._epochs
                        * numRows
                        / 10)); // limit to number of epochs desired, but at least 10 iterations
                                // total
        if (H2O.CLOUD.size() == 1 || mp._single_node_mode) {
          tspi =
              Math.min(
                  tspi,
                  10
                      * (int)
                          (1e6
                              / time_per_row_us)); // in single-node mode, only run for at most 10
                                                   // seconds
        }
        tspi = Math.max(1, tspi); // at least 1 row
        tspi =
            Math.min(
                100000 * H2O.CLOUD.size(),
                tspi); // at most 100k rows per node for initial guess - can always relax later on

        if (!mp._quiet_mode) {
          Log.info("Auto-tuning parameter 'train_samples_per_iteration':");
          Log.info("Estimated compute power : " + Math.round(total_gflops * 100) / 100 + " GFlops");
          Log.info(
              "Estimated time for comm : "
                  + PrettyPrint.usecs((long) model.time_for_communication_us));
          Log.info(
              "Estimated time per row  : "
                  + ((long) time_per_row_us > 0
                      ? PrettyPrint.usecs((long) time_per_row_us)
                      : time_per_row_us + " usecs"));
          Log.info("Estimated training speed: " + (int) (1e6 / time_per_row_us) + " rows/sec");
          Log.info(
              "Setting train_samples_per_iteration ("
                  + mp._train_samples_per_iteration
                  + ") to auto-tuned value: "
                  + tspi);
        }

      } else {
        // limit user-given value to number of epochs desired
        tspi = Math.max(1, Math.min(tspi, (long) (mp._epochs * numRows)));
      }
      assert (tspi != 0 && tspi != -1 && tspi != -2 && tspi >= 1);
      model.tspiGuess = tspi;
      return tspi;
    }