/**
 * Merges {@code aCanopy} into {@code canopies}.
 *
 * <p>Every existing canopy whose center lies strictly within {@code t1} of the new canopy's
 * center is passed to {@code aCanopy.touch(...)}. Among the existing canopies whose center lies
 * strictly within {@code t2}, the one at the smallest distance is remembered; if such a canopy
 * exists, {@code aCanopy} is merged into it, otherwise {@code aCanopy} is appended to the
 * collection.
 *
 * @param aCanopy the MeanShiftCanopy to be merged
 * @param canopies the collection of existing canopies; it is appended to when no existing
 *     canopy covers {@code aCanopy}
 */
public void mergeCanopy(MeanShiftCanopy aCanopy, Collection<MeanShiftCanopy> canopies) {
  MeanShiftCanopy nearestCover = null;
  double nearestDistance = Double.MAX_VALUE;

  for (MeanShiftCanopy existing : canopies) {
    double separation = measure.distance(existing.getCenter(), aCanopy.getCenter());

    // Within the looser threshold: the two canopies touch.
    if (separation < t1) {
      aCanopy.touch(existing);
    }

    // Within the tighter threshold: candidate for absorbing the new canopy.
    boolean isCovering = separation < t2;
    boolean isNearerThanBest = nearestCover == null || separation < nearestDistance;
    if (isCovering && isNearerThanBest) {
      nearestDistance = separation;
      nearestCover = existing;
    }
  }

  if (nearestCover != null) {
    nearestCover.merge(aCanopy);
    return;
  }
  canopies.add(aCanopy);
}
/**
 * Records {@code point} as the most distant point of its cluster when its summed distance to the
 * cluster's representative points exceeds the weight of the currently recorded entry (or when no
 * entry exists yet).
 *
 * <p>The stored entry is a new {@code WeightedVectorWritable} whose weight is the summed distance
 * and whose vector is a clone of the point's vector.
 *
 * @param clusterId id of the cluster the point belongs to; used as the key into both maps
 * @param point the point being considered
 * @param measure distance measure used between each representative point and {@code point}
 * @param representativePoints representative points per cluster id; a cluster id without an
 *     entry is treated as having no representative points (summed distance 0)
 * @param mostDistantPoints per-cluster most distant point found so far; updated in place
 */
public static void mapPoint(
    IntWritable clusterId,
    WeightedVectorWritable point,
    DistanceMeasure measure,
    Map<Integer, List<VectorWritable>> representativePoints,
    Map<Integer, WeightedVectorWritable> mostDistantPoints) {
  int key = clusterId.get();
  WeightedVectorWritable currentMDP = mostDistantPoints.get(key);
  List<VectorWritable> repPoints = representativePoints.get(key);

  double totalDistance = 0.0;
  // Map.get returns null for an unknown cluster id; iterating that would throw a
  // NullPointerException, so a missing entry simply contributes no distance.
  if (repPoints != null) {
    for (VectorWritable refPoint : repPoints) {
      totalDistance += measure.distance(refPoint.get(), point.getVector());
    }
  }

  if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
    // Clone so the stored entry does not alias the caller's vector instance.
    mostDistantPoints.put(
        key, new WeightedVectorWritable(totalDistance, point.getVector().clone()));
  }
}
public MeanShiftCanopyClusterer(Configuration configuration) { try { measure = Class.forName(configuration.get(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY)) .asSubclass(DistanceMeasure.class) .newInstance(); measure.configure(configuration); } catch (ClassNotFoundException e) { throw new IllegalStateException(e); } catch (IllegalAccessException e) { throw new IllegalStateException(e); } catch (InstantiationException e) { throw new IllegalStateException(e); } // nextCanopyId = 0; // never read? t1 = Double.parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T1_KEY)); t2 = Double.parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T2_KEY)); convergenceDelta = Double.parseDouble(configuration.get(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY)); }
// @Test public void testSearch() throws Exception { int nVar = 10; final DistanceMeasure distance = new EuclideanDistanceMeasure(); // WeightedEuclideanDistanceMeasure weightFunction = new WeightedEuclideanDistanceMeasure(); // Vector w = new DenseVector(nVar); // w.assign(1); // w.viewPart(0, 5).assign(2); // w.viewPart(5, 5).assign(1); // weightFunction.setWeights(w); double d1 = 0; double d2 = 0; double d3 = 0; double t1 = 0; double t2 = 0; double t3 = 0; double tsim = 0.0; double sim; int nearest = 100; int numberOfNeighbors = 100; int sz; int tsz = 0; // LocalitySensitiveHash lsh = new LocalitySensitiveHash(weightFunction, nVar); LocalitySensitiveHash lsh = new LocalitySensitiveHash(distance, nVar, 2000); List<Vector> randomNeighbor = Lists.newArrayList(); List<Vector> orgNeighbor = Lists.newArrayList(); List<Vector> ref = Lists.newArrayList(); // final DoubleFunction random = Functions.random(); List<Vector> inputList = readInputFile("/Users/dixu/Documents/Amex/kNN/kMeansTestFile.csv"); for (int i = 0; i < 40000; i++) { // Vector v = inputList.get(i); // v.assign(random); lsh.add(inputList.get(i), i); ref.add(inputList.get(i)); orgNeighbor.add(inputList.get(i)); } randomNeighbor.addAll(ref.subList(0, numberOfNeighbors)); long runningTime = 0; for (int i = 40100; i < (40100 + nearest); i++) { final Vector v = inputList.get(i); // v.assign(random); long time1 = System.nanoTime(); List<WeightedVector> rx = lsh.search(v, numberOfNeighbors); List<Vector> lshNeighbor = Lists.newArrayList(); for (WeightedVector obs : rx) { lshNeighbor.add(obs.getVector()); } long time2 = System.nanoTime(); runningTime = runningTime + time2 - time1; sz = lsh.countVectors(); Ordering<Vector> queryOrder = new Ordering<Vector>() { @Override public int compare(Vector v1, Vector v2) { return Double.compare(distance.distance(v, v1), distance.distance(v, v2)); } }; Collections.sort(orgNeighbor, queryOrder); List<Vector> trueNeighbor = orgNeighbor.subList(0, numberOfNeighbors); List<Vector> 
intersection1 = ListUtils.intersection(trueNeighbor, lshNeighbor); sim = intersection1.size() / (double) numberOfNeighbors; for (int j = 0; j < numberOfNeighbors; j++) { d1 += distance.distance(v, lshNeighbor.get(j)); d2 += distance.distance(v, randomNeighbor.get(j)); d3 += distance.distance(v, trueNeighbor.get(j)); } d1 = d1 / numberOfNeighbors; d2 = d2 / numberOfNeighbors; d3 = d3 / numberOfNeighbors; t1 += d1; t2 += d2; t3 += d3; tsim += sim; tsz += sz; } t1 = t1 / nearest; t2 = t2 / nearest; t3 = t3 / nearest; tsim = tsim / nearest; tsz = tsz / nearest; System.out.printf( "ave_search=%d ave_sim=%.2f trueNeighbor_dist=%.2f proxyNeighbor_dist=%.2f " + "randomNeighbor_dist=%.2f \n", tsz, tsim, t3, t1, t2); System.out.printf("running time = %.2f seconds \n", runningTime / 1e9); }
/**
 * Determines whether this cluster has converged and stores the outcome in {@code converged}.
 *
 * <p>The cluster counts as converged when the distance between the freshly computed centroid
 * and the current center is less than or equal to {@code convergenceDelta}. The centroid's
 * squared length is passed to the distance measure along with the two vectors.
 *
 * @param measure the distance measure used to compare centroid and center
 * @param convergenceDelta the largest distance still regarded as converged
 * @return the updated value of {@code converged}
 */
public boolean computeConvergence(DistanceMeasure measure, double convergenceDelta) {
  Vector newCentroid = computeCentroid();
  double centroidLengthSquared = newCentroid.getLengthSquared();
  double shift = measure.distance(centroidLengthSquared, newCentroid, getCenter());
  converged = shift <= convergenceDelta;
  return converged;
}
/**
 * Tells whether {@code point} lies strictly within the {@code t2} distance of the canopy's
 * center.
 *
 * @param canopy the canopy whose center is compared against
 * @param point the point to test
 * @return {@code true} if the measured distance from the canopy center to the point is less
 *     than {@code t2}
 */
public boolean closelyBound(MeanShiftCanopy canopy, Vector point) {
  double distanceToCenter = measure.distance(canopy.getCenter(), point);
  return distanceToCenter < t2;
}
/**
 * Tells whether {@code point} lies strictly within the {@code t1} distance of the canopy's
 * center.
 *
 * @param canopy the canopy whose center is compared against
 * @param point the point to test
 * @return {@code true} if the measured distance from the canopy center to the point is less
 *     than {@code t1}
 */
boolean covers(MeanShiftCanopy canopy, Vector point) {
  double distanceToCenter = measure.distance(canopy.getCenter(), point);
  return distanceToCenter < t1;
}