Example #1
0
 /**
  * Builds an {@link AdaGrad} updater from the accumulated sums held by this object.
  *
  * <p>The learning rate is {@code lrSum / count}, the historical gradient is
  * {@code historicalGradientSum} divided by {@code count}, and the iteration count is
  * {@code numIterationsSum / count} cast to {@code int}.
  *
  * <p>NOTE(review): {@code count} is used as a divisor without a zero check here; presumably
  * callers only invoke this after at least one contribution was accumulated - confirm.
  *
  * @return a newly constructed AdaGrad configured from the values above
  */
 @Override
 public GradientUpdater getUpdater() {
   AdaGrad combined = new AdaGrad(lrSum / count);
   combined.setHistoricalGradient(historicalGradientSum.div(count));
   combined.setNumIterations((int) (numIterationsSum / count));
   return combined;
 }
  /**
   * Feeds a 1x1 array of ones through a 1x1 AdaGrad (learning rate 1e-3) and checks the first
   * element of the result.
   */
  @Test
  public void testAdaGrad1() {
    AdaGrad adaGrad = new AdaGrad(1, 1, 1e-3);
    INDArray ones = Nd4j.ones(1, 1);

    double firstValue = adaGrad.getGradient(ones, 0).getDouble(0);

    // Expected value and tolerance are both 0.1, so any result in [0, 0.2] passes.
    assertEquals(1e-1, firstValue, 1e-1);
  }
  /**
   * Computes the AdaGrad-adjusted update for one word's vector and the new value of its bias.
   *
   * @param weightAdaGrad AdaGrad used for the word-vector gradient
   * @param biasAdaGrad AdaGrad used for the bias gradient
   * @param syn0 array whose shape is passed to {@code weightAdaGrad}
   * @param bias bias array; read at the word's index, and its shape is passed to {@code
   *     biasAdaGrad}
   * @param w1 the word whose index selects the AdaGrad state and the bias entry
   * @param wordVector not used by this method
   * @param contextVector vector that is multiplied by {@code gradient} to form the raw gradient
   * @param gradient scalar gradient
   * @return a pair of (adjusted word-vector gradient, current bias minus adjusted bias gradient)
   */
  private Pair<INDArray, Double> update(
      AdaGrad weightAdaGrad,
      AdaGrad biasAdaGrad,
      INDArray syn0,
      INDArray bias,
      VocabWord w1,
      INDArray wordVector,
      INDArray contextVector,
      double gradient) {
    final int wordIndex = w1.getIndex();

    // raw gradient for the word vector, then its AdaGrad-adjusted form
    INDArray rawVectorGradient = contextVector.mul(gradient);
    INDArray vectorUpdate =
        weightAdaGrad.getGradient(rawVectorGradient, wordIndex, syn0.shape());

    // bias: subtract the AdaGrad-adjusted scalar gradient from the current value
    double currentBias = bias.getDouble(wordIndex);
    double adjustedBiasGradient = biasAdaGrad.getGradient(gradient, wordIndex, bias.shape());
    double newBias = currentBias - adjustedBiasGradient;

    return new Pair<>(vectorUpdate, newBias);
  }
Example #4
0
  /**
   * Computes the cost and the step to apply to the current solution {@code y}, given the
   * probabilities {@code p}.
   *
   * <p>Side effects: lazily creates {@code yIncs} (zeros) and {@code gains} (ones) when they are
   * null, replaces {@code gains} with the adapted gains, and updates {@code yIncs} in place.
   *
   * @param p the probabilities compared against the {@code q} computed from {@code y}
   * @return a pair of the cost, {@code sum(p * log(p / q))}, and the updated {@code yIncs}
   */
  protected Pair<Double, INDArray> gradient(INDArray p) {
    INDArray sumY = pow(y, 2).sum(1);
    if (yIncs == null) {
      yIncs = zeros(y.shape());
    }
    if (gains == null) {
      gains = ones(y.shape());
    }

    // Student-t distribution; this is also the un-normalized q.
    // Built as 1 / (1 + (-2 * y * y^T with sumY added as a row vector before and after a
    // transpose)).
    INDArray scaledGram = y.mmul(y.transpose()).muli(-2);
    INDArray pairwise = scaledGram.addiRowVector(sumY).transpose().addiRowVector(sumY);
    INDArray qu = pairwise.addi(1).rdivi(1);

    // set diagonal to zero
    doAlongDiagonal(qu, new Zero());

    // normalize to get probabilities, replacing anything below realMin with realMin
    INDArray q = qu.div(qu.sum(Integer.MAX_VALUE));
    BooleanIndexing.applyWhere(q, Conditions.lessThan(realMin), new Value(realMin));

    INDArray pMinusQ = p.sub(q);
    int n = y.rows();
    INDArray yGrads = getYGradient(n, pMinusQ, qu);

    // Gain adaptation: (gains + 0.2) where the "> 0" masks of yGrads and yIncs differ,
    // plus (gains * 0.8) where those masks agree.
    INDArray raised = gains.add(.2);
    INDArray masksDiffer =
        yGrads.cond(Conditions.greaterThan(0)).neqi(yIncs.cond(Conditions.greaterThan(0)));
    raised = raised.muli(masksDiffer);

    INDArray lowered = gains.mul(0.8);
    INDArray masksAgree =
        yGrads.cond(Conditions.greaterThan(0)).eqi(yIncs.cond(Conditions.greaterThan(0)));
    lowered = lowered.muli(masksAgree);

    gains = raised.addi(lowered);

    // keep every gain at or above minGain
    BooleanIndexing.applyWhere(gains, Conditions.lessThan(minGain), new Value(minGain));

    INDArray gradChange = gains.mul(yGrads);
    if (useAdaGrad) {
      gradChange = adaGrad.getGradient(gradChange, 0);
    } else {
      gradChange.muli(learningRate);
    }

    // momentum step: yIncs = momentum * yIncs - gradChange (in place)
    yIncs.muli(momentum).subi(gradChange);

    INDArray ratio = p.div(q);
    double cost = p.mul(log(ratio, false)).sum(Integer.MAX_VALUE).getDouble(0);
    return new Pair<>(cost, yIncs);
  }
  /**
   * Logs the AdaGrad output for five iterations over a 10x2 matrix whose rows are sampled from
   * the distribution created by {@code createNormal(1, 1)}. Makes no assertions.
   */
  @Test
  public void testAdaGrad() {
    final int rows = 10;
    final int cols = 2;

    // Open item from the original author: BaseElementWiseOp is having issues with reshape
    // (it produces inconsistent results); AdaGrad was the test case for this.
    AdaGrad adaGrad = new AdaGrad(rows, cols, 0.1);
    INDArray W = Nd4j.zeros(rows, cols);
    Distribution normal = Nd4j.getDistributions().createNormal(1, 1);
    for (int row = 0; row < W.rows(); row++) {
      W.putRow(row, Nd4j.create(normal.sample(W.columns())));
    }

    for (int iteration = 0; iteration < 5; iteration++) {
      // one line per ';'-separated segment in the logged output
      String formatted = ("\nAdagrad\n " + adaGrad.getGradient(W, iteration)).replaceAll(";", "\n");
      log.info(formatted);
      // perturb W before the next iteration
      W.addi(Nd4j.randn(rows, cols));
    }
  }
Example #6
0
  /**
   * Runs {@code maxIter} optimization steps for the input {@code X} and returns the solution
   * {@code y}.
   *
   * <p>Steps, in order: optional PCA to at most 50 columns; optional normalization using
   * in-place operations on the array {@code X} refers to at that point; computation of the
   * matrix {@code D} handed to {@code computeGaussianPerplexity}; lazy initialization of
   * {@code y} and (when enabled) {@code adaGrad}; then the iteration loop. The probabilities
   * are multiplied by 4 before the loop and divided by 4 at {@code stopLyingIteration};
   * {@code momentum} is set to {@code finalMomentum} at {@code switchMomentumIteration}.
   *
   * @param X the input matrix
   * @param nDims number of columns for a newly created {@code y}; capped at {@code X.columns()}
   * @param perplexity passed through to {@code computeGaussianPerplexity}
   * @return the field {@code y} after the loop completes
   */
  public INDArray calculate(INDArray X, int nDims, double perplexity) {
    if (usePca) {
      X = PCA.pca(X, Math.min(50, X.columns()), normalize);
    }

    // NOTE(review): the original comment here read "don't normalize again after pca", yet this
    // block runs whenever normalize is set, including after the PCA call above - confirm intent.
    if (normalize) {
      X.subi(X.min(Integer.MAX_VALUE));
      X = X.divi(X.max(Integer.MAX_VALUE));
      X = X.subiRowVector(X.mean(0));
    }

    nDims = Math.min(nDims, X.columns());

    INDArray squaredRowSums = pow(X, 2).sum(1);
    INDArray scaledGram = X.mmul(X.transpose()).muli(-2);
    INDArray D =
        scaledGram.addRowVector(squaredRowSums).transpose().addRowVector(squaredRowSums);

    // output: only created when no solution exists yet
    if (y == null) {
      y = randn(X.rows(), nDims, Nd4j.getRandom()).muli(1e-3f);
    }

    INDArray p = computeGaussianPerplexity(D, perplexity);

    // lie for better local minima (undone at stopLyingIteration)
    p.muli(4);

    // init adagrad where needed
    if (useAdaGrad && adaGrad == null) {
      adaGrad = new AdaGrad(y.shape());
      adaGrad.setMasterStepSize(learningRate);
    }

    for (int iteration = 0; iteration < maxIter; iteration++) {
      step(p, iteration);

      if (iteration == switchMomentumIteration) {
        momentum = finalMomentum;
      }
      if (iteration == stopLyingIteration) {
        p.divi(4);
      }
      if (iterationListener != null) {
        iterationListener.iterationDone(null, iteration);
      }
    }

    return y;
  }
Example #7
0
  /**
   * Creates a new AdaGrad that carries only the historical gradient found at {@code index}.
   *
   * <p>If this instance has no historical gradient yet, it is first initialized to ones of
   * {@code shape}. For a matrix shape the subset is 1 x columns and receives a copy of the
   * slice at {@code index}; otherwise it is 1 x 1 and receives a scalar holding the value at
   * {@code index}. The learning rate is copied over in both cases.
   *
   * @param index the slice (matrix shape) or element (otherwise) to extract
   * @return the newly created AdaGrad
   */
  public AdaGrad createSubset(int index) {
    if (historicalGradient == null) {
      this.historicalGradient = Nd4j.ones(shape);
    }

    final boolean matrixShaped = Shape.isMatrix(shape);

    AdaGrad subset =
        matrixShaped ? new AdaGrad(1, historicalGradient.columns()) : new AdaGrad(1, 1);

    // grab only the needed elements
    subset.historicalGradient =
        matrixShaped
            ? historicalGradient.slice(index).dup()
            : Nd4j.scalar(historicalGradient.getDouble(index));
    subset.setLearningRate(learningRate);
    return subset;
  }
Example #8
0
  /**
   * Computes the flattened, AdaGrad-adjusted gradient for a batch of training trees and stores
   * the scaled, regularized cost in the {@code value} field.
   *
   * <p>Steps: allocate one derivative array per parameter array; forward-propagate a copy of
   * every training tree; back-propagate derivatives and accumulate each tree's error sum; scale
   * by {@code 1 / batch size} and add the regularization terms; flatten all derivatives into one
   * array; pass that array through {@code paramAdaGrad} (created on first use).
   *
   * <p>NOTE(review): the derivative maps are shared by every back-propagation callback handed
   * to {@code Parallelization.iterateInParallel}; whether concurrent updates are safe depends on
   * that helper and on {@code backpropDerivativesAndError} - confirm.
   *
   * @param trainingBatch the trees to train on; for a null or empty batch no propagation is run
   *     and a scale of 1 is used
   * @return the flattened gradient after adjustment by {@code paramAdaGrad}
   * @throws IllegalStateException if the flattened gradient's length differs from {@code
   *     numParameters}
   */
  public INDArray getValueGradient(final List<Tree> trainingBatch) {

    // The scale computation below already treats a null/empty batch as valid input, so the
    // forward pass must not dereference the batch before checking for that case.
    final boolean hasTrainingData = trainingBatch != null && !trainingBatch.isEmpty();

    // We use TreeMap for each of these so that they stay in a
    // canonical sorted order
    // TODO: factor out the initialization routines
    // binaryTD stands for Transform Derivatives
    final MultiDimensionalMap<String, String, INDArray> binaryTD =
        MultiDimensionalMap.newTreeBackedMap();
    // the derivatives of the INd4j for the binary nodes
    final MultiDimensionalMap<String, String, INDArray> binaryINDArrayTD =
        MultiDimensionalMap.newTreeBackedMap();
    // binaryCD stands for Classification Derivatives
    final MultiDimensionalMap<String, String, INDArray> binaryCD =
        MultiDimensionalMap.newTreeBackedMap();

    // unaryCD stands for Classification Derivatives
    final Map<String, INDArray> unaryCD = new TreeMap<>();

    // word vector derivatives
    final Map<String, INDArray> wordVectorD = new TreeMap<>();

    for (MultiDimensionalMap.Entry<String, String, INDArray> entry : binaryTransform.entrySet()) {
      int numRows = entry.getValue().rows();
      int numCols = entry.getValue().columns();

      binaryTD.put(entry.getFirstKey(), entry.getSecondKey(), Nd4j.create(numRows, numCols));
    }

    if (!combineClassification) {
      for (MultiDimensionalMap.Entry<String, String, INDArray> entry :
          binaryClassification.entrySet()) {
        int numRows = entry.getValue().rows();
        int numCols = entry.getValue().columns();

        binaryCD.put(entry.getFirstKey(), entry.getSecondKey(), Nd4j.create(numRows, numCols));
      }
    }

    if (useDoubleTensors) {
      for (MultiDimensionalMap.Entry<String, String, INDArray> entry : binaryTensors.entrySet()) {
        int numRows = entry.getValue().size(1);
        int numCols = entry.getValue().size(2);
        int numSlices = entry.getValue().slices();

        binaryINDArrayTD.put(
            entry.getFirstKey(), entry.getSecondKey(), Nd4j.create(numRows, numCols, numSlices));
      }
    }

    for (Map.Entry<String, INDArray> entry : unaryClassification.entrySet()) {
      int numRows = entry.getValue().rows();
      int numCols = entry.getValue().columns();
      unaryCD.put(entry.getKey(), Nd4j.create(numRows, numCols));
    }

    for (String s : vocabCache.words()) {
      INDArray vector = featureVectors.vector(s);
      int numRows = vector.rows();
      int numCols = vector.columns();
      wordVectorD.put(s, Nd4j.create(numRows, numCols));
    }

    // Forward pass: each callback works on its own copy of the tree and appends it to the
    // shared list.
    final List<Tree> forwardPropTrees = new CopyOnWriteArrayList<>();
    if (hasTrainingData) {
      Parallelization.iterateInParallel(
          trainingBatch,
          new Parallelization.RunnableWithParams<Tree>() {

            public void run(Tree currentItem, Object[] args) {
              Tree trainingTree = new Tree(currentItem);
              trainingTree.connect(new ArrayList<>(currentItem.children()));
              // this will attach the error vectors and the node vectors
              // to each node in the tree
              forwardPropagateTree(trainingTree);
              forwardPropTrees.add(trainingTree);
            }
          },
          rnTnActorSystem);
    }

    // Backward pass: accumulate derivatives into the maps above and sum the per-tree errors.
    // TODO: we may find a big speedup by separating the derivatives and then summing
    final AtomicDouble error = new AtomicDouble(0);
    if (!forwardPropTrees.isEmpty()) {
      Parallelization.iterateInParallel(
          forwardPropTrees,
          new Parallelization.RunnableWithParams<Tree>() {

            public void run(Tree currentItem, Object[] args) {
              backpropDerivativesAndError(
                  currentItem, binaryTD, binaryCD, binaryINDArrayTD, unaryCD, wordVectorD);
              error.addAndGet(currentItem.errorSum());
            }
          },
          new Parallelization.RunnableWithParams<Tree>() {

            public void run(Tree currentItem, Object[] args) {}
          },
          rnTnActorSystem,
          new Object[] {binaryTD, binaryCD, binaryINDArrayTD, unaryCD, wordVectorD});
    }

    // scale the error by the number of sentences so that the
    // regularization isn't drowned out for large training batches.
    // Computed in double: a float division widened to double would lose precision.
    double scale = hasTrainingData ? 1.0 / trainingBatch.size() : 1.0;
    value = error.doubleValue() * scale;

    value += scaleAndRegularize(binaryTD, binaryTransform, scale, regTransformMatrix);
    value += scaleAndRegularize(binaryCD, binaryClassification, scale, regClassification);
    value +=
        scaleAndRegularizeINDArray(binaryINDArrayTD, binaryTensors, scale, regTransformINDArray);
    value += scaleAndRegularize(unaryCD, unaryClassification, scale, regClassification);
    value += scaleAndRegularize(wordVectorD, featureVectors, scale, regWordVector);

    INDArray derivative =
        Nd4j.toFlattened(
            getNumParameters(),
            binaryTD.values().iterator(),
            binaryCD.values().iterator(),
            binaryINDArrayTD.values().iterator(),
            unaryCD.values().iterator(),
            wordVectorD.values().iterator());

    if (derivative.length() != numParameters) {
      throw new IllegalStateException(
          "Gradient has wrong number of parameters "
              + derivative.length()
              + " should have been "
              + numParameters);
    }

    if (paramAdaGrad == null) {
      paramAdaGrad = new AdaGrad(1, derivative.columns());
    }

    derivative = paramAdaGrad.getGradient(derivative, 0);

    return derivative;
  }