Example #1
  // GAMA operator: the annotations and types (IScope, IAgent, IList, GamaMap, ...)
  // come from the GAMA platform (msi.gama.* / msi.gaml.* packages).
  @operator(
      value = {"clustering_em"},
      content_type = IType.LIST,
      category = {IOperatorCategory.STATISTICAL},
      concept = {IConcept.STATISTIC})
  @doc(
      value =
          "A list of agent groups clustered by EM Algorithm based on the given attributes. Some paremeters can be defined: "
              + "max_iterations: the maximum number of iterations to perform;"
              + "num_clusters: the number of clusters; minStdDev: minimum allowable standard deviation",
      examples = {
        @example(
            value =
                "clustering_em([ag1, ag2, ag3, ag4, ag5],[\"size\",\"age\", \"weight\"],[\"max_iterations\"::10, \"num_clusters\"::3])",
            equals = "for example, can return [[ag1, ag3], [ag2], [ag4, ag5]]",
            isExecutable = false)
      },
      see = {
        "clustering_xmeans",
        "clustering_simple_kmeans",
        "clustering_farthestFirst",
        "clustering_DBScan",
        "clustering_cobweb"
      })
  public static IList<IList<IAgent>> primClusteringEM(
      final IScope scope,
      final IAddressableContainer<Integer, IAgent, Integer, IAgent> agents,
      final IList<String> attributes,
      final GamaMap<String, Object> parameters) {
    // Seed Weka's EM from the simulation's RNG so results are reproducible.
    EM em = new EM();
    em.setSeed(Cast.asInt(scope, scope.getRandom().getSeed()));

    if (parameters != null) {
      try {
        // Optional tuning parameters; any that are absent keep Weka's defaults.
        if (parameters.containsKey("max_iterations")) {
          em.setMaxIterations(Cast.asInt(scope, parameters.get("max_iterations")));
        }
        if (parameters.containsKey("num_clusters")) {
          em.setNumClusters(Cast.asInt(scope, parameters.get("num_clusters")));
        }
        if (parameters.containsKey("minStdDev")) {
          em.setMinStdDev(Cast.asFloat(scope, parameters.get("minStdDev")));
        }
      } catch (Exception e) {
        // A value that fails to cast is ignored; the clusterer keeps its
        // defaults for that parameter and any that follow it.
      }
    }
    return clusteringUsingWeka(scope, em, attributes, agents);
  }
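
The operator above delegates the actual work to Weka's EM implementation through clusteringUsingWeka. As a minimal, self-contained sketch of that underlying API (assuming the Weka 3.6 interfaces this example already uses, i.e. FastVector and the Instance constructors), EM can also be driven directly; the attribute names and sample values below are illustrative only:

import weka.clusterers.EM;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class EMDemo {
  public static void main(String[] args) throws Exception {
    // Two numeric attributes, mirroring the "size"/"age" attributes in the doc example.
    FastVector attrs = new FastVector();
    attrs.addElement(new Attribute("size"));
    attrs.addElement(new Attribute("age"));
    Instances data = new Instances("agents", attrs, 0);

    // Two well-separated groups of points (made-up values).
    double[][] rows = { {1.0, 2.0}, {1.1, 1.9}, {8.0, 9.0}, {8.2, 9.1} };
    for (double[] row : rows) {
      data.add(new Instance(1.0, row));
    }

    EM em = new EM();
    em.setNumClusters(2);    // counterpart of the "num_clusters" parameter above
    em.setMaxIterations(10); // counterpart of "max_iterations"
    em.buildClusterer(data);

    // Print the cluster assigned to each instance.
    for (int i = 0; i < data.numInstances(); i++) {
      System.out.println("instance " + i + " -> cluster " + em.clusterInstance(data.instance(i)));
    }
  }
}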
  // Requires weka.core.{Instances, Instance}, weka.core.converters.ArffLoader,
  // weka.clusterers.{EM, MakeDensityBasedClusterer}, org.json.{JSONArray, JSONObject},
  // java.io.File and java.util collections; MyTerm, MyTermCompare and linkMap are
  // members of the enclosing class.
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    // Load the ARFF file; the string attribute (the word itself) stays in
    // originIns for reporting, while the clustering copy drops it below.
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // Fit EM (wrapped in a MakeDensityBasedClusterer) on the numeric attributes.
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    // Assign each word to a cluster: copy the instance, drop the string
    // attribute (the word, at index 0), and classify the numeric remainder.
    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      result.get(cluster).add(word);
    }

    // Build the JSON result: for each cluster, keep the highest-ranked terms.
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
      ArrayList<String> words = result.get(k);
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      // Queue each term for ranking and rewrite its list entry as "(word:freq)" for printing.
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        int freq = linkMap.get(s).totalFreq;
        clusterQueue.add(linkMap.get(s));
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      int num = clusterQueue.size() / 10 + 1; // keep roughly the top 10% of terms
      int totalFreq = 0;
      int totalLength = 0;
      // Emit the `num` top-ranked terms; single-character words are skipped
      // and do not count toward the quota.
      for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
        JSONObject mem = new JSONObject();
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue;
        }
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        i++;
        totalFreq += myTerm.totalFreq;
        totalLength += word.length();
      }

      // Averages over the group of terms just emitted (num is the intended
      // count; skipped single-character words are not subtracted).
      double averFreq = totalFreq * 1.0 / num;
      double averLength = totalLength * 1.0 / num;

      // Examine up to `num` further terms: long words whose frequency is
      // still high relative to the group averages are kept as "extra" keywords.
      int count = 0;
      while (!clusterQueue.isEmpty() && count < num) {
        MyTerm myTerm = clusterQueue.poll();
        count++; // bound the number of candidates examined
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averLength) + 1; // longer words tolerate a lower threshold
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (result.get(k).size() < 100) {
        System.out.println(result.get(k));
      }
    }
    return keyWords;
  }
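
For context, a call site would look roughly like this; the class name KeywordClusterer and the ARFF path are hypothetical, and the instance must already hold the linkMap and MyTermCompare members the method relies on:

// Hypothetical usage sketch.
KeywordClusterer clusterer = new KeywordClusterer();
JSONArray clusters = clusterer.Cluster("keywords.arff", 5); // group the terms into 5 clusters
System.out.println(clusters.toString(2)); // pretty-print the JSON with an indent of 2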