public EM_Cluster(String tf, String[] options) throws Exception {
    this.model = new EM();
    this.training_f = tf;
    this.options = options;
    // Load the ARFF training file and build the clusterer with the given Weka options.
    // try-with-resources ensures the reader is closed even if building fails.
    try (BufferedReader r = new BufferedReader(new FileReader(tf))) {
        Instances data = new Instances(r);
        model.setOptions(options);
        model.buildClusterer(data);
    }
}
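// A minimal usage sketch (the file path and option values are hypothetical):
// "-I" sets Weka EM's maximum number of iterations and "-N" the number of
// clusters. Assumes this class lives in the same package as EM_Cluster.
public class EMClusterDemo {
    public static void main(String[] args) throws Exception {
        String[] options = { "-I", "100", "-N", "3" };
        EM_Cluster clusterer = new EM_Cluster("data/training.arff", options);
        // The fitted EM model is now held in the clusterer's `model` field.
    }
}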
@operator(
    value = { "clustering_em" },
    content_type = IType.LIST,
    category = { IOperatorCategory.STATISTICAL },
    concept = { IConcept.STATISTIC })
@doc(
    value = "A list of agent groups clustered by the EM algorithm based on the given attributes. Some parameters can be defined: "
        + "max_iterations: the maximum number of iterations to perform; "
        + "num_clusters: the number of clusters; minStdDev: the minimum allowable standard deviation.",
    examples = {
        @example(
            value = "clustering_em([ag1, ag2, ag3, ag4, ag5],[\"size\",\"age\", \"weight\"],[\"max_iterations\"::10, \"num_clusters\"::3])",
            equals = "for example, can return [[ag1, ag3], [ag2], [ag4, ag5]]",
            isExecutable = false) },
    see = { "clustering_xmeans", "clustering_simple_kmeans", "clustering_farthestFirst", "clustering_DBScan", "clustering_cobweb" })
public static IList<IList<IAgent>> primClusteringEM(final IScope scope,
        final IAddressableContainer<Integer, IAgent, Integer, IAgent> agents,
        final IList<String> attributes, final GamaMap<String, Object> parameters) {
    final EM em = new EM();
    // Seed Weka's RNG from the simulation RNG so results are reproducible within a run.
    em.setSeed(Cast.asInt(scope, scope.getRandom().getSeed()));
    if (parameters != null) {
        try {
            if (parameters.containsKey("max_iterations")) {
                em.setMaxIterations(Cast.asInt(scope, parameters.get("max_iterations")));
            }
            if (parameters.containsKey("num_clusters")) {
                em.setNumClusters(Cast.asInt(scope, parameters.get("num_clusters")));
            }
            if (parameters.containsKey("minStdDev")) {
                em.setMinStdDev(Cast.asFloat(scope, parameters.get("minStdDev")));
            }
        } catch (final Exception e) {
            // Invalid parameter values are ignored; Weka's defaults are used instead.
        }
    }
    return clusteringUsingWeka(scope, em, attributes, agents);
}
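// Outside of GAMA, the same parameter handling maps directly onto plain Weka
// calls. A standalone sketch under those assumptions (attribute names and data
// values are made up; uses the pre-3.7 Weka API seen elsewhere in these snippets):
import weka.clusterers.EM;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class PlainWekaEM {
    public static void main(String[] args) throws Exception {
        // Build a small numeric data set in memory (values are illustrative).
        FastVector attrs = new FastVector();
        attrs.addElement(new Attribute("size"));
        attrs.addElement(new Attribute("age"));
        attrs.addElement(new Attribute("weight"));
        Instances data = new Instances("agents", attrs, 0);
        double[][] rows = { { 1.2, 3, 50 }, { 0.9, 2, 40 }, { 2.1, 7, 80 },
                            { 2.0, 6, 75 }, { 1.0, 3, 45 } };
        for (double[] row : rows) {
            data.add(new Instance(1.0, row));
        }

        // The same setters the operator drives from its parameter map.
        EM em = new EM();
        em.setSeed(42);
        em.setMaxIterations(10);
        em.setNumClusters(3);
        em.buildClusterer(data);

        // Group row indices by assigned cluster, analogous to grouping agents.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println("row " + i + " -> cluster " + em.clusterInstance(data.instance(i)));
        }
    }
}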
public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    // Load the ARFF data set; keep the original copy (with the string attribute)
    // for looking up words, and a string-free copy for clustering.
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // Wrap EM in a density-based clusterer so clusterInstance() is available.
    // SimpleKMeans sm = new SimpleKMeans();
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);
    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());

    // Assign every instance to a cluster, collecting the words per cluster index.
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
        result.put(i, new ArrayList<String>());
    }
    for (int i = 0; i < totalNum; i++) {
        Instance ins = originIns.instance(i);
        String word = ins.stringValue(0);
        // Drop the string attribute before classifying, to match the training data.
        Instance tempIns = new Instance(ins);
        tempIns.deleteAttributeAt(0);
        int cluster = sm.clusterInstance(tempIns);
        result.get(cluster).add(word);
    }

    // Build the JSON result: for each cluster, emit roughly the top 10% of terms by frequency.
    ArrayList<String> words = new ArrayList<String>();
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
        words = result.get(k);
        PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
        for (int i = 0; i < words.size(); i++) {
            String s = words.get(i);
            assert linkMap.containsKey(s);
            int freq = linkMap.get(s).totalFreq;
            clusterQueue.add(linkMap.get(s));
            words.set(i, "(" + s + ":" + freq + ")");
        }
        JSONArray clusterArray = new JSONArray();
        int num = clusterQueue.size() / 10 + 1; // roughly the top 10% of the cluster
        int totalFreq = 0;
        int totalLength = 0;
        for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
            JSONObject mem = new JSONObject();
            MyTerm myTerm = clusterQueue.poll();
            String word = myTerm.originTrem.text();
            if (word.length() == 1) {
                continue; // skip single-character terms
            }
            mem.put("text", word);
            mem.put("freq", myTerm.totalFreq);
            clusterArray.put(mem);
            i++;
            totalFreq += myTerm.totalFreq;
            totalLength += word.length();
        }
        double averFreq = totalFreq * 1.0 / num;
        double averLength = totalLength * 1.0 / num;

        // Examine up to `num` extra terms whose frequency, scaled by word length,
        // still clears the average-frequency threshold.
        int count = 0;
        while (!clusterQueue.isEmpty() && count < num) {
            MyTerm myTerm = clusterQueue.poll();
            count++; // cap the number of extra candidates examined at num
            String word = myTerm.originTrem.text();
            int freq = myTerm.totalFreq;
            int times = (int) (word.length() / averFreq) + 1;
            if (freq > averFreq / times) {
                JSONObject mem = new JSONObject();
                mem.put("text", word);
                mem.put("freq", freq);
                mem.put("extra", true);
                clusterArray.put(mem);
            }
        }
        keyWords.put(clusterArray);
        System.out.println("cluster" + k + ":" + words.size() + ":\t"
            + (int) (words.size() * 1.0 / totalNum * 100));
        if (result.get(k).size() < 100) {
            System.out.println(result.get(k));
        }
    }
    // System.out.println("errorNum:" + errorNum);
    return keyWords;
}
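// Hypothetical invocation sketch: the ARFF file's first attribute must be a
// string (the word), followed by the numeric features used for clustering, and
// the enclosing class's linkMap and MyTermCompare must be populated beforehand.
// TermClusterer is an assumed name for the class that owns Cluster().
import org.json.JSONArray;

public class ClusterDemo {
    public static void main(String[] args) throws Exception {
        TermClusterer tc = new TermClusterer(); // hypothetical owner of Cluster()
        JSONArray clusters = tc.Cluster("data/terms.arff", 5);
        System.out.println(clusters.toString(2)); // pretty-print with 2-space indent
    }
}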