@Override public Tuple2<String, Instance> call(Tuple2<String, Instance> inst) throws Exception { Instance bestCluster = null; double bestScore = Double.MAX_VALUE; for (String clusterId : clusters.keySet()) { Instance cluster = clusters.get(clusterId); double d = distFunc.distance(inst._2, cluster); if (d < bestScore && d < threshold) { bestScore = d; bestCluster = cluster; } } // System.out.println(inst._1 + " -> " + bestCluster); if (bestCluster == null) { return null; } return new Tuple2<String, Instance>(bestCluster.getId(), inst._2); }
@Override public ClusterContext clusterEntities( Collection<FL_Entity> entities, Collection<FL_Cluster> immutableClusters, Collection<FL_Cluster> clusters, ClusterContext context) { Map<String, FL_Entity> entityIndex = createEntityIndex(entities); Map<String, FL_Cluster> immutableClusterIndex = createClusterIndex(immutableClusters); Map<String, FL_Cluster> clusterIndex = createClusterIndex(clusters); DataSet ds = createDataSet(entities, immutableClusters, context); BaseClusterer clusterer = createNumericClusterer(); List<Cluster> existingClusters = new LinkedList<Cluster>(); for (FL_Cluster cluster : clusters) { double val = 0; PropertyHelper prop = getFirstProperty(cluster, toClusterPropertyName(clusterField)); if (prop != null) { val = getDoubleValue(prop); } Cluster c = clusterer.createCluster(); c.setId(cluster.getUid()); NumericVectorFeature feature = new NumericVectorFeature("num"); feature.setValue(new double[] {val}); c.addFeature(feature); existingClusters.add(c); } ClusterResult rs = null; if (existingClusters.isEmpty()) { rs = clusterer.doCluster(ds); } else { rs = clusterer.doIncrementalCluster(ds, existingClusters); } // clean up clusterer.terminate(); Map<String, FL_Cluster> modifiedClusters = new HashMap<String, FL_Cluster>(); for (Cluster c : rs) { List<FL_Cluster> subClusters = new LinkedList<FL_Cluster>(); List<FL_Entity> members = new LinkedList<FL_Entity>(); for (Instance inst : c.getMembers()) { String id = inst.getId(); if (entityIndex.containsKey(id)) { members.add(entityIndex.get(id)); } else if (immutableClusterIndex.containsKey(id)) { subClusters.add(immutableClusterIndex.get(id)); } } FL_Cluster cluster = clusterIndex.get(c.getId()); if (cluster == null) { cluster = clusterFactory.toCluster(members, subClusters); // cache the cluster property NumericVectorFeature feature = (NumericVectorFeature) c.getFeature("num"); double value = feature.getValue()[0]; addClusterProperty(cluster, clusterField, value); } else { ClusterHelper.addMembers(cluster, members); EntityClusterFactory.setEntityCluster(members, cluster); for (FL_Cluster subCluster : subClusters) { ClusterHelper.addSubCluster(cluster, subCluster); subCluster.setParent(cluster.getUid()); } // cache the cluster property NumericVectorFeature feature = (NumericVectorFeature) c.getFeature("num"); double value = feature.getValue()[0]; addClusterProperty(cluster, clusterField, value); } modifiedClusters.put(cluster.getUid(), cluster); } ClusterContext result = new ClusterContext(); result.roots.putAll(modifiedClusters); result.clusters.putAll(modifiedClusters); return result; }