/**
 * Builds an instance that carries a single one-element numeric vector feature named "num".
 *
 * @param id identifier passed to the new instance
 * @param value number stored as the only component of the "num" feature vector; it is unboxed,
 *     so {@code null} results in a {@link NullPointerException}
 * @return the new instance with the "num" feature attached
 */
private Instance createInstance(String id, Double value) {
  Instance created = new Instance(id);
  NumericVectorFeature numeric = new NumericVectorFeature("num");
  double[] vector = {value};
  numeric.setValue(vector);
  created.addFeature(numeric);
  return created;
}
@Override public Tuple2<String, Instance> call(Tuple2<String, Instance> inst) throws Exception { Instance bestCluster = null; double bestScore = Double.MAX_VALUE; for (String clusterId : clusters.keySet()) { Instance cluster = clusters.get(clusterId); double d = distFunc.distance(inst._2, cluster); if (d < bestScore && d < threshold) { bestScore = d; bestCluster = cluster; } } // System.out.println(inst._1 + " -> " + bestCluster); if (bestCluster == null) { return null; } return new Tuple2<String, Instance>(bestCluster.getId(), inst._2); }
@Override public ClusterContext clusterEntities( Collection<FL_Entity> entities, Collection<FL_Cluster> immutableClusters, Collection<FL_Cluster> clusters, ClusterContext context) { Map<String, FL_Entity> entityIndex = createEntityIndex(entities); Map<String, FL_Cluster> immutableClusterIndex = createClusterIndex(immutableClusters); Map<String, FL_Cluster> clusterIndex = createClusterIndex(clusters); DataSet ds = createDataSet(entities, immutableClusters, context); BaseClusterer clusterer = createNumericClusterer(); List<Cluster> existingClusters = new LinkedList<Cluster>(); for (FL_Cluster cluster : clusters) { double val = 0; PropertyHelper prop = getFirstProperty(cluster, toClusterPropertyName(clusterField)); if (prop != null) { val = getDoubleValue(prop); } Cluster c = clusterer.createCluster(); c.setId(cluster.getUid()); NumericVectorFeature feature = new NumericVectorFeature("num"); feature.setValue(new double[] {val}); c.addFeature(feature); existingClusters.add(c); } ClusterResult rs = null; if (existingClusters.isEmpty()) { rs = clusterer.doCluster(ds); } else { rs = clusterer.doIncrementalCluster(ds, existingClusters); } // clean up clusterer.terminate(); Map<String, FL_Cluster> modifiedClusters = new HashMap<String, FL_Cluster>(); for (Cluster c : rs) { List<FL_Cluster> subClusters = new LinkedList<FL_Cluster>(); List<FL_Entity> members = new LinkedList<FL_Entity>(); for (Instance inst : c.getMembers()) { String id = inst.getId(); if (entityIndex.containsKey(id)) { members.add(entityIndex.get(id)); } else if (immutableClusterIndex.containsKey(id)) { subClusters.add(immutableClusterIndex.get(id)); } } FL_Cluster cluster = clusterIndex.get(c.getId()); if (cluster == null) { cluster = clusterFactory.toCluster(members, subClusters); // cache the cluster property NumericVectorFeature feature = (NumericVectorFeature) c.getFeature("num"); double value = feature.getValue()[0]; addClusterProperty(cluster, clusterField, value); } else { 
ClusterHelper.addMembers(cluster, members); EntityClusterFactory.setEntityCluster(members, cluster); for (FL_Cluster subCluster : subClusters) { ClusterHelper.addSubCluster(cluster, subCluster); subCluster.setParent(cluster.getUid()); } // cache the cluster property NumericVectorFeature feature = (NumericVectorFeature) c.getFeature("num"); double value = feature.getValue()[0]; addClusterProperty(cluster, clusterField, value); } modifiedClusters.put(cluster.getUid(), cluster); } ClusterContext result = new ClusterContext(); result.roots.putAll(modifiedClusters); result.clusters.putAll(modifiedClusters); return result; }
/** @param args */ public static void main(String[] args) { DataSet ds = new DataSet(); Instance inst = new Instance("1"); StringFeature feature = new StringFeature(FEATURE_NAME1); feature.setValue("jack black"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san diego"); inst.addFeature(feature); ds.add(inst); inst = new Instance("2"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jack black"); inst.addFeature(feature); ds.add(inst); inst = new Instance("3"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jack"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san diego"); inst.addFeature(feature); ds.add(inst); inst = new Instance("4"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jack l. black"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san diego"); inst.addFeature(feature); ds.add(inst); inst = new Instance("5"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("j. 
black"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san diego"); inst.addFeature(feature); ds.add(inst); inst = new Instance("6"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("j black"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("new york"); inst.addFeature(feature); ds.add(inst); inst = new Instance("7"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("black"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("new york"); inst.addFeature(feature); ds.add(inst); inst = new Instance("8"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jackie black"); inst.addFeature(feature); ds.add(inst); inst = new Instance("9"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jack brown"); inst.addFeature(feature); ds.add(inst); inst = new Instance("10"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jackie green"); inst.addFeature(feature); ds.add(inst); inst = new Instance("11"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("bob"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san fran"); inst.addFeature(feature); ds.add(inst); inst = new Instance("12"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("bobbie"); inst.addFeature(feature); feature = new StringFeature(FEATURE_NAME2); feature.setValue("san fran"); inst.addFeature(feature); ds.add(inst); inst = new Instance("13"); feature = new StringFeature(FEATURE_NAME1); feature.setValue("jackie"); inst.addFeature(feature); ds.add(inst); DPMeans clusterer = new DPMeans(5, true); clusterer.registerFeatureType(FEATURE_NAME1, StringMedianCentroid.class, new EditDistance(0.5)); clusterer.registerFeatureType( FEATURE_NAME2, StringMedianCentroid.class, new ExactTokenMatchDistance(0.5)); clusterer.setThreshold(0.5); ClusterResult clusters = clusterer.doCluster(ds); 
for (Cluster c : clusters) { System.out.println(c.toString(true)); } clusterer.terminate(); }