/**
   * Prepares all per-training state for the given data set.
   *
   * <p>Stores {@code d} as the training data, allocates the per-instance score cache and the
   * per-class residual label buffer, draws the first random sample of instance indices
   * ({@code SAMPLE_RATE} of all instances, truncated to an int), and builds one data set per class
   * that shares the training feature matrix.
   *
   * @param d the training data; also queried for its instance count and label/class mapping
   */
  @Override
  public void initialize(DataSet d) {
    trainData = d;
    indices = new TIntArrayList(RandomUtils.getIndexes(d.getInstanceLength()));
    classCount = trainData.getLabels().getClassIndexMap().size();

    final int instanceCount = trainData.getInstanceLength();

    // scoreCache is indexed [instance][class]; tempLabels is indexed [class][instance].
    scoreCache = new float[instanceCount][classCount];
    roundData = new DataSet[classCount];
    roundIndices = new int[(int) (instanceCount * SAMPLE_RATE)];
    tempLabels = new float[classCount][instanceCount];

    // The first roundIndices.length entries of the shuffled index list form this round's sample.
    indices.shuffle(new Random());
    for (int pos = 0; pos < roundIndices.length; pos++) {
      roundIndices[pos] = indices.get(pos);
    }

    // Every per-class data set reuses the same feature matrix; only the label differs.
    // NOTE(review): makeClassLabel(c) presumably yields a one-vs-rest target for class c - confirm.
    for (int c = 0; c < classCount; c++) {
      roundData[c] = new DataSet(trainData.getFeatureMatrix(), makeClassLabel(c));
    }

    // Two float tables of equal size (scoreCache and tempLabels), 4 bytes per entry.
    final double estimatedGb =
        2.0 * scoreCache.length * scoreCache[0].length * 4 / 1024 / 1024 / 1024;
    log.info("initialize finished, tempLabels + ScoreCache MEM use ~ {} GB", estimatedGb);
  }
  /**
   * Runs all boosting rounds in sequence.
   *
   * <p>Each round:
   *
   * <ol>
   *   <li>trains one booster per class in parallel on {@code roundData} restricted to
   *       {@code roundIndices}, and waits for all of them to finish;
   *   <li>adds each booster's prediction, scaled by {@code LEARNING_RATE}, to {@code scoreCache};
   *   <li>normalizes the exponentiated scores into class probabilities, accumulates the mean KL
   *       divergence against the one-hot label into {@code roundKL}, and stores the residual
   *       {@code oneHot - probs} in {@code tempLabels};
   *   <li>rebuilds {@code roundData} from those residuals and draws a new random index sample;
   *   <li>optionally emits a statistics report and logs timings.
   * </ol>
   *
   * <p>A booster task that throws is logged and still counted as finished, so the round continues
   * with whatever state that booster is left in. If the calling thread is interrupted while
   * waiting for the boosters, the interrupt flag is restored, the workers are cancelled and
   * training stops without completing the current round.
   */
  @Override
  public void train() {

    for (int i = 0; i < boosters.length; i++) {

      final int round = i;

      long t1 = System.currentTimeMillis();

      // A fresh pool and latch per round; the latch opens once every class task has ended.
      service = Executors.newFixedThreadPool(MAX_THREADS);
      countDownLatch = new CountDownLatch(classCount);
      try {
        IntStream.range(0, classCount)
            .forEach(
                j ->
                    service.submit(
                        () -> {
                          long tic = System.currentTimeMillis();
                          try {
                            try {
                              boosters[round][j].boostInitialize(roundData[j], roundIndices);
                              boosters[round][j].boost();
                            } catch (Throwable t) {
                              // Best effort: one failing class must not stall the round.
                              log.error(t.getMessage(), t);
                            }

                            long toc = System.currentTimeMillis();
                            log.debug(
                                "round {}, task: {}/{} finished, elapsed {} ms",
                                round,
                                j,
                                classCount,
                                toc - tic);
                          } finally {
                            // Always release the latch, otherwise await() below blocks forever.
                            countDownLatch.countDown();
                          }
                        }));

        // The latch alone guarantees completion; no extra fixed delay is needed.
        countDownLatch.await();
      } catch (InterruptedException e) {
        // Workers may still be mutating the boosters, so their results must not be read.
        Thread.currentThread().interrupt();
        service.shutdownNow();
        log.error("round {} interrupted while waiting for boosters, training aborted", round, e);
        return;
      } finally {
        // No-op after shutdownNow(); otherwise lets the idle worker threads terminate.
        service.shutdown();
      }

      roundIndicator[round] = true;

      long t2 = System.currentTimeMillis();

      float[] probs = new float[classCount];
      for (int j = 0; j < trainData.getInstanceLength(); j++) {

        double[] x = trainData.getInstance(j);
        double y = trainData.getLabel(j);

        // One-hot encoding of the true class; the label value is used as the class index.
        float[] ys = new float[classCount];
        ys[(int) y] = 1;

        // NOTE(review): exp of a large raw score overflows float to Infinity; confirm that
        // ArraySumUtil.normalize copes with that or that scores stay small in practice.
        for (int k = 0; k < classCount; k++) {
          scoreCache[j][k] += LEARNING_RATE * boosters[round][k].boostPredict(x);
          probs[k] = (float) Math.exp(scoreCache[j][k]);
        }

        ArraySumUtil.normalize(probs);

        roundKL[round] += (float) ArrayUtil.KLDivergence(ys, probs);

        // Residuals become the regression targets of the next round.
        for (int k = 0; k < classCount; k++) {
          tempLabels[k][j] = ys[k] - probs[k];
        }
      }

      roundKL[round] /= trainData.getInstanceLength();

      for (int j = 0; j < classCount; j++) {
        roundData[j] = new DataSet(trainData.getFeatureMatrix(), new Label(tempLabels[j], null));
      }

      // Draw a new random subsample of instance indices for the next round.
      indices.shuffle(new Random());
      for (int j = 0; j < roundIndices.length; j++) {
        roundIndices[j] = indices.get(j);
      }

      long t3 = System.currentTimeMillis();

      if (NEED_REPORT) {
        statisticReport(round);
      }

      long t4 = System.currentTimeMillis();

      log.info("round {}, KL {}", round, roundKL[round]);
      log.info(
          "boost {} |update gradient {} |report {}| total {}", t2 - t1, t3 - t2, t4 - t3, t4 - t1);
    }

    log.info("GradientBoostClassification training finished ...");

    if (NEED_REPORT) {
      printRoundReport();
    }
  }