/**
 * Normalizes one input row, emits each of its non-zero entries keyed by column index, and records
 * per-row statistics.
 *
 * <p>For every non-zero element of the normalized row, a single-entry sparse vector holding the
 * element's value at position {@code row} is written under the element's index. When a threshold
 * is configured ({@code threshold != NO_THRESHOLD}), the number of non-zero entries and the
 * largest non-zero value of the row are stored as well. The row's norm is always stored, and the
 * {@code ROWS} counter is incremented once per call.
 *
 * @param row index of the row being processed
 * @param vectorWritable the row vector
 * @param ctx context used to emit the partial column vectors and to update counters
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(IntWritable row, VectorWritable vectorWritable, Context ctx)
    throws IOException, InterruptedException {
  Vector rowVector = similarity.normalize(vectorWritable.get());
  int rowIndex = row.get();

  int numNonZeroEntries = 0;
  // Seed with negative infinity, not Double.MIN_VALUE: MIN_VALUE is the smallest POSITIVE double,
  // so using it as the seed would hide the true maximum of a row whose entries are all negative.
  double maxValue = Double.NEGATIVE_INFINITY;

  Iterator<Vector.Element> nonZeroElements = rowVector.iterateNonZero();
  while (nonZeroElements.hasNext()) {
    Vector.Element element = nonZeroElements.next();
    double value = element.get();

    RandomAccessSparseVector partialColumnVector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    partialColumnVector.setQuick(rowIndex, value);
    ctx.write(new IntWritable(element.index()), new VectorWritable(partialColumnVector));

    numNonZeroEntries++;
    if (maxValue < value) {
      maxValue = value;
    }
  }

  if (numNonZeroEntries == 0) {
    // No non-zero entry was seen, so there is no observed maximum. Keep Double.MIN_VALUE as the
    // placeholder so that such rows are recorded exactly as they were before.
    maxValue = Double.MIN_VALUE;
  }

  if (threshold != NO_THRESHOLD) {
    nonZeroEntries.setQuick(rowIndex, numNonZeroEntries);
    maxValues.setQuick(rowIndex, maxValue);
  }
  norms.setQuick(rowIndex, similarity.norm(rowVector));

  ctx.getCounter(Counters.ROWS).increment(1);
}
/**
 * Sums the partial dot-product vectors received for {@code row}, converts each accumulated
 * non-zero dot product into a similarity value, and writes the resulting similarity vector.
 *
 * <p>The first partial vector is used as the accumulator and all further partial vectors are added
 * into it entry by entry. A similarity value is kept only if it is at least {@code treshold}.
 * When {@code excludeSelfSimilarity} is set, the entry at the row's own index is set to zero
 * before the vector is written.
 *
 * @param row index of the row whose similarities are computed
 * @param partialDots partial dot-product vectors for this row; the first element is read
 *     unconditionally
 * @param ctx context used to emit the similarity vector
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx)
    throws IOException, InterruptedException {
  Iterator<VectorWritable> partials = partialDots.iterator();

  // The first partial vector doubles as the accumulator for all the others.
  Vector dots = partials.next().get();
  while (partials.hasNext()) {
    Vector addend = partials.next().get();
    for (Iterator<Vector.Element> entries = addend.iterateNonZero(); entries.hasNext();) {
      Vector.Element entry = entries.next();
      int index = entry.index();
      dots.setQuick(index, dots.getQuick(index) + entry.get());
    }
  }

  int rowIndex = row.get();
  Vector similarities = dots.like();
  double normA = norms.getQuick(rowIndex);

  for (Iterator<Vector.Element> accumulated = dots.iterateNonZero(); accumulated.hasNext();) {
    Vector.Element other = accumulated.next();
    double normB = norms.getQuick(other.index());
    double similarityValue = similarity.similarity(other.get(), normA, normB, numberOfColumns);
    if (similarityValue >= treshold) {
      similarities.set(other.index(), similarityValue);
    }
  }

  if (excludeSelfSimilarity) {
    similarities.setQuick(rowIndex, 0);
  }

  ctx.write(row, new VectorWritable(similarities));
}
/**
 * Emits, for every occurrence in a column, a vector of aggregated values with each occurrence at
 * the same or a later position in index order.
 *
 * <p>The occurrences are sorted with {@code BY_INDEX}. For each occurrence A, every occurrence B
 * from A's own position onwards (so A is paired with itself too) is aggregated via
 * {@code similarity.aggregate} and stored under B's index, unless a threshold is configured and
 * {@link #consider} rejects the pair. One vector is written per occurrence A, keyed by A's index,
 * even if every pair was rejected. The numbers of kept and rejected pairs are added to the
 * {@code COOCCURRENCES} and {@code PRUNED_COOCCURRENCES} counters.
 *
 * @param column index of the column being processed (not read by this method)
 * @param occurrenceVector the entries of the column
 * @param ctx context used to emit the per-occurrence vectors and to update counters
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx)
    throws IOException, InterruptedException {
  Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);
  Arrays.sort(occurrences, BY_INDEX);

  // Use long tallies: the number of pairs is n * (n + 1) / 2 for n occurrences, which exceeds
  // Integer.MAX_VALUE from n = 65536 onwards and would silently wrap around in an int.
  long cooccurrences = 0;
  long prunedCooccurrences = 0;

  for (int n = 0; n < occurrences.length; n++) {
    Vector.Element occurrenceA = occurrences[n];
    Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE);

    // Start at n so that each unordered pair is visited once and A is paired with itself.
    for (int m = n; m < occurrences.length; m++) {
      Vector.Element occurrenceB = occurrences[m];
      if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {
        dots.setQuick(
            occurrenceB.index(), similarity.aggregate(occurrenceA.get(), occurrenceB.get()));
        cooccurrences++;
      } else {
        prunedCooccurrences++;
      }
    }

    ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));
  }

  ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);
  ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);
}
/**
 * Decides whether a pair of occurrences should be taken into account.
 *
 * <p>Looks up the recorded number of non-zero entries and the recorded maximum value for the
 * index of each occurrence and passes them, together with {@code threshold}, to
 * {@code similarity.consider}.
 *
 * @param occurrenceA first occurrence of the pair
 * @param occurrenceB second occurrence of the pair
 * @return whatever {@code similarity.consider} returns for the looked-up statistics
 */
private boolean consider(Vector.Element occurrenceA, Vector.Element occurrenceB) {
  int indexA = occurrenceA.index();
  int indexB = occurrenceB.index();

  int nonZerosA = numNonZeroEntries.get(indexA);
  int nonZerosB = numNonZeroEntries.get(indexB);
  double largestA = maxValues.get(indexA);
  double largestB = maxValues.get(indexB);

  return similarity.consider(nonZerosA, nonZerosB, largestA, largestB, threshold);
}