/** * Convert data to probability co-occurrences (aka calculating the kernel) * * @param d the data to convert * @param u the perplexity of the model * @return the probabilities of co-occurrence */ public INDArray computeGaussianPerplexity(final INDArray d, double u) { int n = d.rows(); final INDArray p = zeros(n, n); final INDArray beta = ones(n, 1); final double logU = Math.log(u); log.info("Calculating probabilities of data similarities.."); for (int i = 0; i < n; i++) { if (i % 500 == 0 && i > 0) log.info("Handled " + i + " records"); double betaMin = Double.NEGATIVE_INFINITY; double betaMax = Double.POSITIVE_INFINITY; int[] vals = Ints.concat(ArrayUtil.range(0, i), ArrayUtil.range(i + 1, d.columns())); INDArrayIndex[] range = new INDArrayIndex[] {new NDArrayIndex(vals)}; INDArray row = d.slice(i).get(range); Pair<INDArray, INDArray> pair = hBeta(row, beta.getDouble(i)); INDArray hDiff = pair.getFirst().sub(logU); int tries = 0; // while hdiff > tolerance while (BooleanIndexing.and(abs(hDiff), Conditions.greaterThan(tolerance)) && tries < 50) { // if hdiff > 0 if (BooleanIndexing.and(hDiff, Conditions.greaterThan(0))) { if (Double.isInfinite(betaMax)) beta.putScalar(i, beta.getDouble(i) * 2.0); else beta.putScalar(i, (beta.getDouble(i) + betaMax) / 2.0); betaMin = beta.getDouble(i); } else { if (Double.isInfinite(betaMin)) beta.putScalar(i, beta.getDouble(i) / 2.0); else beta.putScalar(i, (beta.getDouble(i) + betaMin) / 2.0); betaMax = beta.getDouble(i); } pair = hBeta(row, beta.getDouble(i)); hDiff = pair.getFirst().subi(logU); tries++; } p.slice(i).put(range, pair.getSecond()); } // dont need data in memory after log.info("Mean value of sigma " + sqrt(beta.rdiv(1)).mean(Integer.MAX_VALUE)); BooleanIndexing.applyWhere(p, Conditions.isNan(), new Value(realMin)); // set 0 along the diagonal INDArray permute = p.transpose(); INDArray pOut = p.add(permute); pOut.divi(pOut.sum(Integer.MAX_VALUE)); BooleanIndexing.applyWhere( pOut, Conditions.lessThan(Nd4j.EPS_THRESHOLD), new Value(Nd4j.EPS_THRESHOLD)); // ensure no nans return pOut; }
/* compute the gradient given the current solution, the probabilities and the constant */ protected Pair<Double, INDArray> gradient(INDArray p) { INDArray sumY = pow(y, 2).sum(1); if (yIncs == null) yIncs = zeros(y.shape()); if (gains == null) gains = ones(y.shape()); // Student-t distribution // also un normalized q INDArray qu = y.mmul(y.transpose()) .muli(-2) .addiRowVector(sumY) .transpose() .addiRowVector(sumY) .addi(1) .rdivi(1); int n = y.rows(); // set diagonal to zero doAlongDiagonal(qu, new Zero()); // normalize to get probabilities INDArray q = qu.div(qu.sum(Integer.MAX_VALUE)); BooleanIndexing.applyWhere(q, Conditions.lessThan(realMin), new Value(realMin)); INDArray PQ = p.sub(q); INDArray yGrads = getYGradient(n, PQ, qu); gains = gains .add(.2) .muli( yGrads.cond(Conditions.greaterThan(0)).neqi(yIncs.cond(Conditions.greaterThan(0)))) .addi( gains .mul(0.8) .muli( yGrads .cond(Conditions.greaterThan(0)) .eqi(yIncs.cond(Conditions.greaterThan(0))))); BooleanIndexing.applyWhere(gains, Conditions.lessThan(minGain), new Value(minGain)); INDArray gradChange = gains.mul(yGrads); if (useAdaGrad) gradChange = adaGrad.getGradient(gradChange, 0); else gradChange.muli(learningRate); yIncs.muli(momentum).subi(gradChange); double cost = p.mul(log(p.div(q), false)).sum(Integer.MAX_VALUE).getDouble(0); return new Pair<>(cost, yIncs); }
/** Apply gradient normalization: scale based on L2, clipping etc. */ public void preApply(Layer layer, Gradient gradient, int iteration) { GradientNormalization normalization = layer.conf().getLayer().getGradientNormalization(); if (normalization == null || normalization == GradientNormalization.None) return; // no op final double threshold = layer.conf().getLayer().getGradientNormalizationThreshold(); switch (normalization) { case RenormalizeL2PerLayer: double sumSquares = 0.0; for (INDArray g : gradient.gradientForVariable().values()) { double l2 = g.norm2Number().doubleValue(); // l2 norm: sqrt(sum_i g_i^2) sumSquares += l2 * l2; } double layerL2 = FastMath.sqrt(sumSquares); for (INDArray g : gradient.gradientForVariable().values()) { g.divi(layerL2); } break; case RenormalizeL2PerParamType: for (INDArray g : gradient.gradientForVariable().values()) { double l2 = Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue(); g.divi(l2); } break; case ClipElementWiseAbsoluteValue: Condition absValueCondition = new AbsValueGreaterThan(threshold); Function<Number, Number> clipFn = new Function<Number, Number>() { @Override public Number apply(Number number) { return (number.doubleValue() > threshold ? threshold : -threshold); } }; for (INDArray g : gradient.gradientForVariable().values()) { BooleanIndexing.applyWhere(g, absValueCondition, clipFn); } break; case ClipL2PerLayer: double sumSquares2 = 0.0; for (INDArray g : gradient.gradientForVariable().values()) { double l2 = Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue(); // l2 norm: sqrt(sum_i g_i^2) sumSquares2 += l2 * l2; } double layerL22 = FastMath.sqrt(sumSquares2); if (layerL22 > threshold) { double scalingFactor = threshold / layerL22; // g = g / l2 * threshold -> for (INDArray g : gradient.gradientForVariable().values()) { g.muli(scalingFactor); } } break; case ClipL2PerParamType: for (INDArray g : gradient.gradientForVariable().values()) { double l2 = g.norm2Number().doubleValue(); if (l2 > threshold) { double scalingFactor = l2 / threshold; g.divi(scalingFactor); } } break; default: throw new RuntimeException( "Unknown (or not implemented) gradient normalization strategy: " + normalization); } }