Example #1
0
  /**
   * Convert data to probability co-occurrences (aka calculating the kernel). For each row i a
   * per-row Gaussian precision (beta) is found by bisection so that the conditional distribution
   * over the remaining rows has entropy log(u), i.e. perplexity u. The result is symmetrized and
   * normalized.
   *
   * @param d the data to convert
   * @param u the perplexity of the model
   * @return the probabilities of co-occurrence (symmetrized, normalized, NaN-free)
   */
  public INDArray computeGaussianPerplexity(final INDArray d, double u) {
    int n = d.rows();
    final INDArray p = zeros(n, n);
    // beta acts as the per-row Gaussian precision; tuned per row in the loop below.
    final INDArray beta = ones(n, 1);
    final double logU = Math.log(u);

    log.info("Calculating probabilities of data similarities..");
    for (int i = 0; i < n; i++) {
      if (i % 500 == 0 && i > 0) log.info("Handled " + i + " records");

      // Bisection bounds for beta; start unbounded and tighten as we search.
      double betaMin = Double.NEGATIVE_INFINITY;
      double betaMax = Double.POSITIVE_INFINITY;
      // All column indices except i: excludes the self-similarity term.
      // NOTE(review): the exclusion uses d.columns() while the loop runs over d.rows() —
      // assumes d is square (n x n); confirm against callers.
      int[] vals = Ints.concat(ArrayUtil.range(0, i), ArrayUtil.range(i + 1, d.columns()));
      INDArrayIndex[] range = new INDArrayIndex[] {new NDArrayIndex(vals)};

      INDArray row = d.slice(i).get(range);
      // hBeta presumably returns (entropy H, conditional probabilities) for this beta — see helper.
      Pair<INDArray, INDArray> pair = hBeta(row, beta.getDouble(i));
      // hDiff = H - log(u): positive when the entropy is still too high for perplexity u.
      INDArray hDiff = pair.getFirst().sub(logU);
      int tries = 0;

      // while hdiff > tolerance
      while (BooleanIndexing.and(abs(hDiff), Conditions.greaterThan(tolerance)) && tries < 50) {
        // if hdiff > 0
        if (BooleanIndexing.and(hDiff, Conditions.greaterThan(0))) {
          // Entropy too high -> raise beta: double while unbounded above, else bisect toward betaMax.
          if (Double.isInfinite(betaMax)) beta.putScalar(i, beta.getDouble(i) * 2.0);
          else beta.putScalar(i, (beta.getDouble(i) + betaMax) / 2.0);
          betaMin = beta.getDouble(i);
        } else {
          // Entropy too low -> lower beta: halve while unbounded below, else bisect toward betaMin.
          if (Double.isInfinite(betaMin)) beta.putScalar(i, beta.getDouble(i) / 2.0);
          else beta.putScalar(i, (beta.getDouble(i) + betaMin) / 2.0);
          betaMax = beta.getDouble(i);
        }

        pair = hBeta(row, beta.getDouble(i));
        hDiff = pair.getFirst().subi(logU);
        tries++;
      }

      // Store the converged conditional probabilities for row i (diagonal entry stays 0).
      p.slice(i).put(range, pair.getSecond());
    }

    // dont need data in memory after
    // sigma = sqrt(1 / beta); logged for diagnostics only.
    log.info("Mean value of sigma " + sqrt(beta.rdiv(1)).mean(Integer.MAX_VALUE));
    // Replace NaNs with the smallest representable value so downstream log()/division stay finite.
    BooleanIndexing.applyWhere(p, Conditions.isNan(), new Value(realMin));

    // set 0 along the diagonal
    INDArray permute = p.transpose();

    // Symmetrize: pOut(i,j) = p(i,j) + p(j,i), then normalize over all entries.
    INDArray pOut = p.add(permute);

    pOut.divi(pOut.sum(Integer.MAX_VALUE));
    // Clamp tiny values to EPS so no entry underflows to zero.
    BooleanIndexing.applyWhere(
        pOut, Conditions.lessThan(Nd4j.EPS_THRESHOLD), new Value(Nd4j.EPS_THRESHOLD));
    // ensure no nans
    return pOut;
  }
Example #2
0
  /**
   * Compute the gradient (and KL-divergence cost) given the current solution, the probabilities
   * and the constant. Updates the fields {@code yIncs} (momentum-accumulated step) and
   * {@code gains} in place.
   *
   * @param p the high-dimensional co-occurrence probability matrix
   * @return pair of (KL cost, {@code yIncs} — the in-place-updated step to apply to {@code y})
   */
  protected Pair<Double, INDArray> gradient(INDArray p) {
    // Squared norm of each embedded point: sum_k y_{ik}^2.
    INDArray sumY = pow(y, 2).sum(1);
    // Lazily initialize the step accumulator and per-element gain factors.
    if (yIncs == null) yIncs = zeros(y.shape());
    if (gains == null) gains = ones(y.shape());

    // Student-t distribution
    // also un normalized q
    // qu(i,j) = 1 / (1 + ||y_i - y_j||^2), built in place via the identity
    // ||y_i - y_j||^2 = ||y_i||^2 + ||y_j||^2 - 2 * (y_i . y_j).
    INDArray qu =
        y.mmul(y.transpose())
            .muli(-2)
            .addiRowVector(sumY)
            .transpose()
            .addiRowVector(sumY)
            .addi(1)
            .rdivi(1);

    int n = y.rows();

    // set diagonal to zero
    doAlongDiagonal(qu, new Zero());

    // normalize to get probabilities
    INDArray q = qu.div(qu.sum(Integer.MAX_VALUE));

    // Floor q so the log() in the cost computation below never sees zero.
    BooleanIndexing.applyWhere(q, Conditions.lessThan(realMin), new Value(realMin));

    INDArray PQ = p.sub(q);

    INDArray yGrads = getYGradient(n, PQ, qu);

    // Gains heuristic: grow the gain (+0.2) where the gradient sign differs from the last
    // step's sign, shrink it (*0.8) where they agree. cond() yields 0/1 masks; neqi/eqi
    // combine them so exactly one of the two terms contributes per element.
    gains =
        gains
            .add(.2)
            .muli(
                yGrads.cond(Conditions.greaterThan(0)).neqi(yIncs.cond(Conditions.greaterThan(0))))
            .addi(
                gains
                    .mul(0.8)
                    .muli(
                        yGrads
                            .cond(Conditions.greaterThan(0))
                            .eqi(yIncs.cond(Conditions.greaterThan(0)))));

    // Never let a gain fall below the configured minimum.
    BooleanIndexing.applyWhere(gains, Conditions.lessThan(minGain), new Value(minGain));

    INDArray gradChange = gains.mul(yGrads);

    // Scale by AdaGrad when enabled, otherwise by the fixed learning rate (in place).
    if (useAdaGrad) gradChange = adaGrad.getGradient(gradChange, 0);
    else gradChange.muli(learningRate);

    // Momentum update, in place: yIncs = momentum * yIncs - gradChange.
    yIncs.muli(momentum).subi(gradChange);

    // KL divergence: sum_ij p_ij * log(p_ij / q_ij).
    double cost = p.mul(log(p.div(q), false)).sum(Integer.MAX_VALUE).getDouble(0);
    return new Pair<>(cost, yIncs);
  }
Example #3
0
  /**
   * Apply gradient normalization: scale based on L2 norm, element-wise clipping etc.
   *
   * <p>The gradient arrays inside {@code gradient} are modified in place. Zero-norm gradients are
   * left untouched instead of being divided by zero (which previously produced NaNs).
   *
   * @param layer the layer whose configuration supplies the normalization strategy and threshold
   * @param gradient the gradient to normalize (modified in place)
   * @param iteration current iteration (unused by the strategies implemented here)
   */
  public void preApply(Layer layer, Gradient gradient, int iteration) {
    GradientNormalization normalization = layer.conf().getLayer().getGradientNormalization();
    if (normalization == null || normalization == GradientNormalization.None) return; // no op

    final double threshold = layer.conf().getLayer().getGradientNormalizationThreshold();

    switch (normalization) {
      case RenormalizeL2PerLayer:
        // Divide every gradient array by the L2 norm taken over ALL arrays together.
        double sumSquares = 0.0;
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          // l2 norm: sqrt(sum_i g_i^2)
          sumSquares += l2 * l2;
        }
        double layerL2 = FastMath.sqrt(sumSquares);
        if (layerL2 == 0.0) break; // all-zero gradient: dividing would yield 0/0 -> NaN
        for (INDArray g : gradient.gradientForVariable().values()) {
          g.divi(layerL2);
        }
        break;
      case RenormalizeL2PerParamType:
        // Divide each gradient array by its own L2 norm.
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          if (l2 > 0.0) g.divi(l2); // skip zero arrays: avoid 0/0 -> NaN
        }
        break;
      case ClipElementWiseAbsoluteValue:
        // Clamp each element to the interval [-threshold, threshold].
        Condition absValueCondition = new AbsValueGreaterThan(threshold);
        Function<Number, Number> clipFn =
            new Function<Number, Number>() {
              @Override
              public Number apply(Number number) {
                // Only invoked where |value| > threshold, so a value not above +threshold
                // must be below -threshold.
                return (number.doubleValue() > threshold ? threshold : -threshold);
              }
            };

        for (INDArray g : gradient.gradientForVariable().values()) {
          BooleanIndexing.applyWhere(g, absValueCondition, clipFn);
        }
        break;
      case ClipL2PerLayer:
        // If the L2 norm over ALL arrays exceeds the threshold, rescale so it equals it.
        double sumSquares2 = 0.0;
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          // l2 norm: sqrt(sum_i g_i^2)
          sumSquares2 += l2 * l2;
        }
        double layerL22 = FastMath.sqrt(sumSquares2);
        if (layerL22 > threshold) {
          double scalingFactor = threshold / layerL22; // g = g / l2 * threshold
          for (INDArray g : gradient.gradientForVariable().values()) {
            g.muli(scalingFactor);
          }
        }
        break;
      case ClipL2PerParamType:
        // If an individual array's L2 norm exceeds the threshold, rescale that array.
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          if (l2 > threshold) {
            double scalingFactor = l2 / threshold;
            g.divi(scalingFactor);
          }
        }
        break;
      default:
        throw new RuntimeException(
            "Unknown (or not implemented) gradient normalization strategy: " + normalization);
    }
  }