Ejemplo n.º 1
0
  /**
   * Builds an {@code Instance} with the given id that carries a single numeric feature.
   *
   * <p>The feature is named {@code "num"} and holds a one-element vector containing {@code value}.
   *
   * @param id identifier passed to the {@code Instance} constructor
   * @param value number stored as the only component of the feature vector; it is unboxed, so a
   *     {@code null} results in a {@code NullPointerException}
   * @return the newly created instance with the {@code "num"} feature attached
   */
  private Instance createInstance(String id, Double value) {
    final Instance instance = new Instance(id);

    final NumericVectorFeature numeric = new NumericVectorFeature("num");
    final double[] vector = new double[1];
    vector[0] = value;
    numeric.setValue(vector);

    instance.addFeature(numeric);
    return instance;
  }
  /**
   * Pairs an instance with the id of its closest cluster.
   *
   * <p>Every cluster held in {@code clusters} is measured against the instance with {@code
   * distFunc}. A cluster is only eligible when its distance is strictly below {@code threshold};
   * among eligible clusters the one with the strictly smallest distance wins (the first one
   * encountered wins a tie).
   *
   * @param inst tuple whose second element is the instance to assign
   * @return a tuple of the winning cluster's id and the original instance, or {@code null} when
   *     no cluster is within the threshold
   * @throws Exception declared by the overridden method; this body does not throw one explicitly
   */
  @Override
  public Tuple2<String, Instance> call(Tuple2<String, Instance> inst) throws Exception {
    Instance nearest = null;
    double nearestDistance = Double.MAX_VALUE;

    for (Instance candidate : clusters.values()) {
      final double distance = distFunc.distance(inst._2, candidate);

      // Both comparisons stay in their positive form so that a NaN distance is never accepted.
      final boolean closerThanBest = distance < nearestDistance;
      final boolean withinThreshold = distance < threshold;
      if (closerThanBest && withinThreshold) {
        nearest = candidate;
        nearestDistance = distance;
      }
    }

    // No cluster qualified: signal that to the caller with null.
    return nearest == null ? null : new Tuple2<String, Instance>(nearest.getId(), inst._2);
  }
Ejemplo n.º 3
0
  /**
   * Clusters the given entities and immutable clusters on a single numeric value, optionally
   * growing a set of already existing clusters.
   *
   * <p>Steps performed:
   *
   * <ol>
   *   <li>Index the entities, the immutable clusters and the existing clusters by id.
   *   <li>Build the data set from the entities and immutable clusters.
   *   <li>Turn each existing cluster into a seed {@code Cluster} whose {@code "num"} feature holds
   *       the value of its first {@code toClusterPropertyName(clusterField)} property ({@code 0}
   *       when the property is missing).
   *   <li>Run a fresh clustering when there are no seeds, otherwise an incremental clustering.
   *   <li>For every resulting cluster, either create a new {@code FL_Cluster} from its members or
   *       add the members to the matching existing cluster, then store the cluster's {@code "num"}
   *       value on it via {@code addClusterProperty}.
   * </ol>
   *
   * <p>The clusterer is always terminated, even when seeding or clustering throws.
   *
   * @param entities entities to cluster
   * @param immutableClusters clusters that take part in the clustering as members (sub-clusters)
   * @param clusters existing clusters that may receive new members
   * @param context forwarded to {@code createDataSet}; not otherwise used here
   * @return a new {@code ClusterContext} whose {@code roots} and {@code clusters} both contain
   *     every cluster created or modified by this call, keyed by uid
   */
  @Override
  public ClusterContext clusterEntities(
      Collection<FL_Entity> entities,
      Collection<FL_Cluster> immutableClusters,
      Collection<FL_Cluster> clusters,
      ClusterContext context) {

    Map<String, FL_Entity> entityIndex = createEntityIndex(entities);
    Map<String, FL_Cluster> immutableClusterIndex = createClusterIndex(immutableClusters);
    Map<String, FL_Cluster> clusterIndex = createClusterIndex(clusters);

    DataSet ds = createDataSet(entities, immutableClusters, context);

    BaseClusterer clusterer = createNumericClusterer();

    ClusterResult rs;
    try {
      // Seed the clusterer with one Cluster per existing FL_Cluster, reusing the FL_Cluster uid so
      // that results can be mapped back through clusterIndex below.
      List<Cluster> existingClusters = new LinkedList<Cluster>();
      for (FL_Cluster cluster : clusters) {
        double val = 0;
        PropertyHelper prop = getFirstProperty(cluster, toClusterPropertyName(clusterField));
        if (prop != null) {
          val = getDoubleValue(prop);
        }
        Cluster c = clusterer.createCluster();
        c.setId(cluster.getUid());
        NumericVectorFeature feature = new NumericVectorFeature("num");
        feature.setValue(new double[] {val});
        c.addFeature(feature);
        existingClusters.add(c);
      }

      if (existingClusters.isEmpty()) {
        rs = clusterer.doCluster(ds);
      } else {
        rs = clusterer.doIncrementalCluster(ds, existingClusters);
      }
    } finally {
      // clean up, regardless of whether clustering succeeded
      clusterer.terminate();
    }

    Map<String, FL_Cluster> modifiedClusters = new HashMap<String, FL_Cluster>();

    for (Cluster c : rs) {
      List<FL_Cluster> subClusters = new LinkedList<FL_Cluster>();
      List<FL_Entity> members = new LinkedList<FL_Entity>();

      // Split the members back into plain entities and immutable sub-clusters; ids found in
      // neither index are ignored.
      for (Instance inst : c.getMembers()) {
        String id = inst.getId();
        if (entityIndex.containsKey(id)) {
          members.add(entityIndex.get(id));
        } else if (immutableClusterIndex.containsKey(id)) {
          subClusters.add(immutableClusterIndex.get(id));
        }
      }

      FL_Cluster cluster = clusterIndex.get(c.getId());
      if (cluster == null) {
        // The result does not correspond to an existing cluster: build a new one.
        cluster = clusterFactory.toCluster(members, subClusters);
      } else {
        // The result extends an existing cluster: attach the new members and sub-clusters to it.
        ClusterHelper.addMembers(cluster, members);
        EntityClusterFactory.setEntityCluster(members, cluster);

        for (FL_Cluster subCluster : subClusters) {
          ClusterHelper.addSubCluster(cluster, subCluster);
          subCluster.setParent(cluster.getUid());
        }
      }

      // cache the cluster property (same for new and existing clusters)
      NumericVectorFeature feature = (NumericVectorFeature) c.getFeature("num");
      double value = feature.getValue()[0];
      addClusterProperty(cluster, clusterField, value);

      modifiedClusters.put(cluster.getUid(), cluster);
    }

    ClusterContext result = new ClusterContext();
    result.roots.putAll(modifiedClusters);
    result.clusters.putAll(modifiedClusters);

    return result;
  }
  /**
   * Small demo: builds thirteen instances with string features, clusters them with {@code
   * DPMeans} and prints every resulting cluster to standard output.
   *
   * <p>Each instance always has a {@code FEATURE_NAME1} feature and optionally a {@code
   * FEATURE_NAME2} feature. The clusterer is terminated in a {@code finally} block so that it is
   * cleaned up even when clustering fails.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    // One row per instance: {id, FEATURE_NAME1 value, FEATURE_NAME2 value or null when absent}.
    final String[][] rows = {
      {"1", "jack black", "san diego"},
      {"2", "jack black", null},
      {"3", "jack", "san diego"},
      {"4", "jack l. black", "san diego"},
      {"5", "j. black", "san diego"},
      {"6", "j black", "new york"},
      {"7", "black", "new york"},
      {"8", "jackie black", null},
      {"9", "jack brown", null},
      {"10", "jackie green", null},
      {"11", "bob", "san fran"},
      {"12", "bobbie", "san fran"},
      {"13", "jackie", null},
    };

    DataSet ds = new DataSet();
    for (String[] row : rows) {
      Instance inst = new Instance(row[0]);

      StringFeature feature = new StringFeature(FEATURE_NAME1);
      feature.setValue(row[1]);
      inst.addFeature(feature);

      if (row[2] != null) {
        feature = new StringFeature(FEATURE_NAME2);
        feature.setValue(row[2]);
        inst.addFeature(feature);
      }

      ds.add(inst);
    }

    DPMeans clusterer = new DPMeans(5, true);
    try {
      clusterer.registerFeatureType(
          FEATURE_NAME1, StringMedianCentroid.class, new EditDistance(0.5));
      clusterer.registerFeatureType(
          FEATURE_NAME2, StringMedianCentroid.class, new ExactTokenMatchDistance(0.5));
      clusterer.setThreshold(0.5);

      ClusterResult clusters = clusterer.doCluster(ds);
      for (Cluster c : clusters) {
        System.out.println(c.toString(true));
      }
    } finally {
      // Always release the clusterer, even if registration, clustering or printing throws.
      clusterer.terminate();
    }
  }