private boolean outputHelper( HalfPair hp, int vectorID, VectorComponentArrayWritable vector, float similarity, OutputCollector<VectorPair, FloatWritable> output, Reporter reporter) throws IOException { reporter.incrCounter(APS.EVALUATED, 1); reporter.progress(); if (haspruned) { VectorComponentArrayWritable remainder = pruned.get(hp.getID()); if (remainder != null) { // cheap upper bound dot(x,y) <= min(|x|,|y|) * maxweight(x) * maxweight(y) // double dotProdBound = min(remainder.length(), vector.length()) * // remainder.getMaxWeight() // * vector.getMaxWeight(); // if (compare(similarity + dotProdBound, threshold) >= 0) similarity += VectorComponentArrayWritable.dotProduct(vector, remainder); } else { LOG.warn("No remainder found for vector " + hp.getID()); } } if (compare(similarity, threshold) >= 0) { int firstID = VectorPair.canonicalFirst(vectorID, hp.getID()); int secondID = VectorPair.canonicalSecond(vectorID, hp.getID()); outKey.set(firstID, secondID); outValue.set(similarity); output.collect(outKey, outValue); reporter.incrCounter(APS.SIMILAR, 1); return true; } return false; }
@Override public void reduce( GenericKey key, Iterator<GenericValue> values, OutputCollector<GenericKey, GenericValue> output, Reporter reporter) throws IOException { if (key.getSecondary() < Preprocesser.MINIMUM_ID) { // vector output.collect(key, values.next()); if (values.hasNext()) assert false : "Vectors should not get grouped by combiner: " + key; } else { // addend reporter.progress(); int counter = 0; float sim = 0; HalfPair hp = null; while (values.hasNext()) { hp = (HalfPair) values.next().get(); sim += hp.getSimilarity(); if (counter++ % REPORTER_INTERVAL == 0) reporter.progress(); } if (hp != null) { payload.set(hp.getID(), sim); outValue.set(payload); output.collect(key, outValue); } else { assert false : "There is nothing to combine!"; } } }
@Override public void map( LongWritable key, IndexItemArrayWritable value, OutputCollector<GenericKey, GenericValue> output, Reporter reporter) throws IOException { IndexItem[] postingList = value.toIndexItemArray(); for (int i = 1; i < postingList.length; i++) { for (int j = 0; j < i; j++) { IndexItem x = postingList[i]; IndexItem y = postingList[j]; // |y| >= t / maxweight(x) && |x| >= t / maxweight(y) if (compare(x.vectorLength(), Math.ceil(threshold / y.vectorMaxWeight())) >= 0 && compare(y.vectorLength(), Math.ceil(threshold / x.vectorMaxWeight())) >= 0 // tight upper bound on similarity score && compare( min(x.vectorMaxWeight() * y.vectorSum(), y.vectorMaxWeight() * x.vectorSum()), threshold) >= 0) { // positional filter // && compare( // min(x.positionalMaxWeight() * y.positionalSum(), // y.positionalMaxWeight() * x.positionalSum()) // + x.getWeight() * y.getWeight(), threshold) >= 0) if (j % REPORTER_INTERVAL == 0) reporter.progress(); int lpv = IndexItem.getLeastPrunedVectorID(x, y); int mpv = IndexItem.getMostPrunedVectorID(x, y); float psim = (float) (x.getWeight() * y.getWeight()); outKey.set(lpv, mpv); payload.set(mpv, psim); outValue.set(payload); output.collect(outKey, outValue); reporter.incrCounter(APS.ADDEND, 1); } } } }
@Override public void reduce( GenericKey key, Iterator<GenericValue> values, OutputCollector<VectorPair, FloatWritable> output, Reporter reporter) throws IOException { int vectorID = key.getPrimary(); assert (key.getSecondary() == -1); // the vector is the first value VectorComponentArrayWritable vector = (VectorComponentArrayWritable) values.next().get(); // half pairs are sorted such that all equal pairs are consecutive if (values.hasNext()) { reporter.incrCounter(APS.COMBINED, 1); HalfPair hp1 = (HalfPair) values.next().get(); float similarity = hp1.getSimilarity(); HalfPair hp2; int counter = 0; while (values.hasNext()) { reporter.incrCounter(APS.COMBINED, 1); if (counter++ % REPORTER_INTERVAL == 0) reporter.progress(); hp2 = (HalfPair) values.next().get(); if (hp1.equals(hp2)) { similarity += hp2.getSimilarity(); } else { // output outputHelper(hp1, vectorID, vector, similarity, output, reporter); // start new stripe hp1 = hp2; similarity = hp1.getSimilarity(); } } // output the last one outputHelper(hp1, vectorID, vector, similarity, output, reporter); } }