/**
 * Builds the Hadoop job that multiplies the matrix rows found at
 * {@code matrixInputPath} against the vector {@code v}. The vector is written
 * to a timestamped SequenceFile under the output base path and shipped to the
 * mappers via the DistributedCache.
 */
public static Job createTimesSquaredJob(Configuration initialConf,
                                        Vector v,
                                        int outputVectorDim,
                                        Path matrixInputPath,
                                        Path outputVectorPathBase,
                                        Class<? extends TimesSquaredMapper> mapClass,
                                        Class<? extends VectorSummingReducer> redClass)
    throws IOException {

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  // Serialize the input vector to a unique, timestamped path.
  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

  SequenceFile.Writer inputVectorPathWriter = null;
  try {
    inputVectorPathWriter = new SequenceFile.Writer(fs,
                                                    initialConf,
                                                    inputVectorPath,
                                                    NullWritable.class,
                                                    VectorWritable.class);
    inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(inputVectorPathWriter, false);
  }

  // Make the serialized vector available to every mapper.
  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

  Job job = HadoopUtil.prepareJob(matrixInputPath,
                                  new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
                                  SequenceFileInputFormat.class,
                                  mapClass,
                                  NullWritable.class,
                                  VectorWritable.class,
                                  redClass,
                                  NullWritable.class,
                                  VectorWritable.class,
                                  SequenceFileOutputFormat.class,
                                  initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + matrixInputPath);

  Configuration conf = job.getConfiguration();
  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  return job;
}
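// A minimal driver sketch for the job above, assuming the enclosing class is
// Mahout's TimesSquaredJob and that TimesSquaredMapper / VectorSummingReducer
// are the concrete map and reduce classes. All paths and vector values here
// are hypothetical placeholders, not part of the original code.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class TimesSquaredDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Vector v = new DenseVector(new double[] {1.0, 2.0, 3.0});

    Job job = TimesSquaredJob.createTimesSquaredJob(conf,
                                                    v,
                                                    v.size(),
                                                    new Path("/data/matrix"),        // assumed matrix input
                                                    new Path("/data/timesSquared"),  // assumed scratch dir
                                                    TimesSquaredJob.TimesSquaredMapper.class,
                                                    TimesSquaredJob.VectorSummingReducer.class);
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("TimesSquaredJob failed");
    }
  }
}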
/**
 * A version of computeYRow that accumulates into a Vector rather than a raw
 * double[], so the result can stay sparse for extremely sparse input matrices.
 *
 * @param aRow    row of matrix A (size n)
 * @param yRowOut row of matrix Y (result); must be pre-allocated to size (k+p)
 */
public void computeYRow(Vector aRow, Vector yRowOut) {
  yRowOut.assign(0.0);
  if (aRow.isDense()) {
    // Dense row: every coordinate contributes a dot-product term.
    int n = aRow.size();
    for (int j = 0; j < n; j++) {
      accumDots(j, aRow.getQuick(j), yRowOut);
    }
  } else {
    // Sparse row: only non-zero coordinates contribute.
    for (Iterator<Element> iter = aRow.iterateNonZero(); iter.hasNext();) {
      Element el = iter.next();
      accumDots(el.index(), el.get(), yRowOut);
    }
  }
}
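// For context, a sketch of the accumDots contract both overloads rely on: fold
// aRow[j] times row j of Omega into each of the k+p accumulator slots. This is
// an assumption about the helper, not the original implementation; it presumes
// Omega exposes a getQuick(row, column) that materializes entries of the random
// projection matrix on demand, and a field kp holding the projected width k+p.
protected void accumDots(int j, double mul, Vector yRow) {
  for (int i = 0; i < kp; i++) {
    yRow.setQuick(i, yRow.getQuick(i) + getQuick(j, i) * mul);
  }
}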
/**
 * Computes yRow = aRow * Omega.
 *
 * @param aRow row of matrix A (size n)
 * @param yRow row of matrix Y (result); must be pre-allocated to size (k+p)
 */
@Deprecated
public void computeYRow(Vector aRow, double[] yRow) {
  // assert yRow.length == kp;
  Arrays.fill(yRow, 0.0);
  if (aRow.isDense()) {
    int n = aRow.size();
    for (int j = 0; j < n; j++) {
      accumDots(j, aRow.getQuick(j), yRow);
    }
  } else {
    for (Iterator<Element> iter = aRow.iterateNonZero(); iter.hasNext();) {
      Element el = iter.next();
      accumDots(el.index(), el.get(), yRow);
    }
  }
}
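// A small usage sketch contrasting the two overloads. The Omega constructor
// arguments (seed and projected width) and all dimensions are illustrative
// assumptions, not values from the original source.
int kp = 110;  // k + p, e.g. rank 100 plus oversampling 10
Omega omega = new Omega(1234L, kp);  // hypothetical constructor signature

Vector aRow = new SequentialAccessSparseVector(1_000_000);
aRow.setQuick(42, 3.5);
aRow.setQuick(31337, -1.2);

// Preferred overload: accumulates into a Vector.
Vector yRow = new DenseVector(kp);
omega.computeYRow(aRow, yRow);

// Deprecated overload: fills a caller-allocated double[].
double[] yRowArr = new double[kp];
omega.computeYRow(aRow, yRowArr);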
@Override
protected void map(Writable key, VectorWritable value, Context context)
    throws IOException, InterruptedException {
  omega.computeYRow(value.get(), yRow);

  // Accumulate the outer-product update yRow * yRow' into the upper triangle
  // of YtY; since YtY is symmetric, only entries with j >= i are stored.
  if (yRow.isDense()) {
    for (int i = 0; i < kp; i++) {
      double yi = yRow.getQuick(i);
      if (yi == 0.0) {
        continue; // skip zero entries to avoid densifying the accumulator
      }
      for (int j = i; j < kp; j++) {
        double yj = yRow.getQuick(j);
        if (yj != 0.0) {
          mYtY.setQuick(i, j, mYtY.getQuick(i, j) + yi * yj);
        }
      }
    }
  } else {
    /*
     * The disadvantage of using a sparse vector here (aside from creating some
     * short-lived references) is that we do roughly twice as many iterations
     * as necessary if the y row is fairly dense, because the inner loop
     * re-scans all non-zeros and skips the j < i half of each pair.
     */
    for (Iterator<Vector.Element> iterI = yRow.iterateNonZero(); iterI.hasNext();) {
      Vector.Element eli = iterI.next();
      int i = eli.index();
      for (Iterator<Vector.Element> iterJ = yRow.iterateNonZero(); iterJ.hasNext();) {
        Vector.Element elj = iterJ.next();
        int j = elj.index();
        if (j < i) {
          continue;
        }
        mYtY.setQuick(i, j, mYtY.getQuick(i, j) + eli.get() * elj.get());
      }
    }
  }
}
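// Because only the upper triangle of YtY is accumulated above, a downstream
// consumer has to mirror it to recover the full symmetric matrix. A minimal
// sketch, assuming mYtY supports getQuick(i, j) and kp is the projected width;
// 'full' and its construction are hypothetical, not from the original code.
Matrix full = new DenseMatrix(kp, kp);
for (int i = 0; i < kp; i++) {
  for (int j = i; j < kp; j++) {
    double val = mYtY.getQuick(i, j);
    full.setQuick(i, j, val);
    full.setQuick(j, i, val);  // mirror into the lower triangle
  }
}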