/**
 * Collects the sequences that were assigned to a given cluster.
 *
 * <p>The entry of {@code seqList} at position {@code i} is selected when {@code cluster[i]}
 * equals {@code clusterNum}; the original list order is kept.
 *
 * @param clusterNum the cluster number to select
 * @return a new list holding the selected sequences (empty when nothing matches)
 */
public ArrayList<String> getClusterIDs(int clusterNum) {
  ArrayList<String> members = new ArrayList<String>();
  int position = 0;
  for (String seq : seqList) {
    if (cluster[position] == clusterNum) {
      members.add(seq);
    }
    position++;
  }
  return members;
}
public void getIndexInfo(String indexdir, int freqThreshold) { IndexReader reader = null; try { Directory dir = FSDirectory.open(new File(indexdir)); System.out.println(dir); reader = IndexReader.open(dir); System.out.println("document num:" + reader.numDocs()); System.out.println("======================"); TermEnum terms = reader.terms(); sortedTermQueue.clear(); maxDocNum = reader.maxDoc(); linkMap.clear(); termList.clear(); while (terms.next()) { // System.out.print(terms.term() + "\tDocFreq:" + TermDocs termDocs = reader.termDocs(terms.term()); MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum); if (temp.totalFreq < freqThreshold) { continue; } /* * if(temp.originTrem.text().length()==1){ continue; } */ linkMap.put(temp.originTrem.text(), temp); sortedTermQueue.add(temp); termList.add(temp); } System.out.println("total Size:" + sortedTermQueue.size()); System.out.println("mapsize:" + linkMap.keySet().size()); // System.exit(0); int num = 0; this.maxFreq = sortedTermQueue.peek().totalFreq; while (!sortedTermQueue.isEmpty()) { num++; System.out.println(num + ":" + sortedTermQueue.poll()); } System.out.println("read index info done"); } catch (IOException e) { e.printStackTrace(); } finally { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } }
/**
 * Counts how many sequences were assigned to a given cluster.
 *
 * <p>Position {@code i} is counted when {@code cluster[i]} equals {@code clusterNum}; the
 * positions examined are {@code 0 .. seqList.size() - 1}.
 *
 * @param clusterNum the cluster number to count
 * @return the number of matching positions
 */
public int getClusterSize(int clusterNum) {
  int matches = 0;
  int position = 0;
  while (position < seqList.size()) {
    if (cluster[position] == clusterNum) {
      matches++;
    }
    position++;
  }
  return matches;
}
public void buildClusterer(ArrayList<String> seqDB, double[][] sm) { seqList = seqDB; this.setSimMatrix(sm); Attribute seqString = new Attribute("sequence", (FastVector) null); FastVector attrInfo = new FastVector(); attrInfo.addElement(seqString); Instances data = new Instances("data", attrInfo, 0); for (int i = 0; i < seqList.size(); i++) { Instance currentInst = new Instance(1); currentInst.setDataset(data); currentInst.setValue(0, seqList.get(i)); data.add(currentInst); } try { buildClusterer(data); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath) throws IOException { String text = "@relation interest\n"; text += "@attribute text string\n"; for (int i = 0; i < maxDocNum; i++) { text += "@attribute doc" + i + "\treal\n"; } text += "@data\n"; for (int j = 0; j < myTerms.size(); j++) { MyTerm term = myTerms.get(j); String line = ""; line += term.originTrem.text(); for (int i = 0; i < term.vector.length; i++) { line += "," + term.vector[i]; } line += "\n"; text += line; } // System.out.println(text); PrintWriter Pout = new PrintWriter(new FileWriter(wekaFilePath)); Pout.println(text); Pout.close(); }
public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception { File inputFile = new File(wekaFilePath); ArffLoader arf = new ArffLoader(); arf.setFile(inputFile); Instances originIns = arf.getDataSet(); Instances insTest = new Instances(originIns); insTest.deleteStringAttributes(); int totalNum = insTest.numInstances(); // SimpleKMeans sm = new SimpleKMeans(); EM em = new EM(); em.setNumClusters(clusterNum); MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer(); sm.setClusterer(em); sm.buildClusterer(insTest); System.out.println("totalNum:" + insTest.numInstances()); System.out.println("============================"); System.out.println(sm.toString()); Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>(); for (int i = 0; i < clusterNum; i++) { result.put(i, new ArrayList<String>()); } for (int i = 0; i < totalNum; i++) { Instance ins = originIns.instance(i); String word = ins.stringValue(0); Instance tempIns = new Instance(ins); tempIns.deleteAttributeAt(0); int cluster = sm.clusterInstance(tempIns); result.get(cluster).add(word); } // print the result ArrayList<String> words = new ArrayList<String>(); JSONArray keyWords = new JSONArray(); for (int k : result.keySet()) { words = result.get(k); PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare); for (int i = 0; i < words.size(); i++) { String s = words.get(i); assert linkMap.containsKey(s); int freq = linkMap.get(s).totalFreq; clusterQueue.add(linkMap.get(s)); words.set(i, "(" + s + ":" + freq + ")"); } JSONArray clusterArray = new JSONArray(); int num = clusterQueue.size() / 10 + 1; // 5% int totalFreq = 0; int totalLength = 0; for (int i = 0; i < num && !clusterQueue.isEmpty(); ) { JSONObject mem = new JSONObject(); MyTerm myTerm = clusterQueue.poll(); String word = myTerm.originTrem.text(); if (word.length() == 1) { continue; } mem.put("text", word); mem.put("freq", myTerm.totalFreq); clusterArray.put(mem); i++; totalFreq += 
myTerm.totalFreq; totalLength += word.length(); } double averFreq = totalFreq * 1.0 / num; double averLength = totalLength * 1.0 / num; int count = 0; while (!clusterQueue.isEmpty() && count < num) { MyTerm myTerm = clusterQueue.poll(); String word = myTerm.originTrem.text(); int freq = myTerm.totalFreq; int times = (int) (word.length() / averFreq) + 1; if (freq > averFreq / times) { JSONObject mem = new JSONObject(); mem.put("text", word); mem.put("freq", freq); mem.put("extra", true); clusterArray.put(mem); } } keyWords.put(clusterArray); System.out.println( "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100)); if (result.get(k).size() < 100) { System.out.println(result.get(k)); } } // System.out.println("errorNum:"+errorNum); return keyWords; }
/** * ************************************************** Convert a table to a set of instances, with * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing * <b>attributes</b> (e.g. as is common with microarray data) */ public Instances tableColsToInstances(Table t, String relationName) { System.err.print("Converting table cols to instances..."); // Set up attributes, which for colInstances will be the rowNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int r = 0; r < t.numRows; r++) { if (rowIsNumeric(t, r)) { isNominal.add(false); atts.addElement(new Attribute(t.rowNames[r])); allAttVals.add(null); // No enumeration of attribute values. } else { // It's nominal... determine the range of values and create a nominal attribute... isNominal.add(true); FastVector attVals = getRowValues(t, r); atts.addElement(new Attribute(t.rowNames[r], attVals)); // Save it for later allAttVals.add(attVals); } } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); /** ***** CREATE INSTANCES ************* */ // Fill the instances with data... // For each instance... for (int c = 0; c < t.numCols; c++) { double[] vals = new double[data.numAttributes()]; // Even nominal values are stored as double pointers. // For each attribute fill in the numeric or attributeValue index... for (int r = 0; r < t.numRows; r++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[r] = Instance.missingValue(); else if (isNominal.get(r)) { vals[r] = allAttVals.get(r).indexOf(val); } else { vals[r] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... 
data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); /** ***** ADD FEATURE NAMES ************* */ // takes basically zero time... all time is in previous 2 chunks. if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... instanceNames = new ArrayList<String>(); for (int c = 0; c < t.colNames.length; c++) { instanceNames.add(t.colNames[c]); newData.instance(c).setValue(attrIdx, t.colNames[c]); } data = newData; } System.err.println("done."); return (data); }
/** * ************************************************** Convert a table to a set of instances, with * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing * <b>attributes</b> */ public Instances tableRowsToNominalInstances(Table t, String relationName) { System.err.print("Converting table rows to instances..."); // Set up attributes, which for rowInstances will be the colNames... FastVector atts = new FastVector(); ArrayList<Boolean> isNominal = new ArrayList<Boolean>(); ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later... System.err.print("creating attributes..."); for (int c = 0; c < t.numCols; c++) { // It's nominal... determine the range of values isNominal.add(true); FastVector attVals = getColValues(t, c); atts.addElement(new Attribute(t.colNames[c], attVals)); // Save it for later allAttVals.add(attVals); } System.err.print("creating instances..."); // Create Instances object.. Instances data = new Instances(relationName, atts, 0); data.setRelationName(relationName); // Fill the instances with data... // For each instance... for (int r = 0; r < t.numRows; r++) { double[] vals = new double[data.numAttributes()]; // for each attribute for (int c = 0; c < t.numCols; c++) { String val = (String) t.matrix.getQuick(r, c); if (val == "?") vals[c] = Instance.missingValue(); else if (isNominal.get(c)) { vals[c] = allAttVals.get(c).indexOf(val); } else { vals[c] = Double.parseDouble((String) val); } } // Add the a newly minted instance with those attribute values... data.add(new Instance(1.0, vals)); } System.err.print("add feature names..."); if (addInstanceNamesAsFeatures) { Instances newData = new Instances(data); newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0); int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0 // We save the instanceNames in a list because it's handy later on... 
instanceNames = new ArrayList<String>(); for (int r = 0; r < t.rowNames.length; r++) { instanceNames.add(t.rowNames[r]); newData.instance(r).setValue(attrIdx, t.rowNames[r]); } data = newData; } System.err.println("done."); return (data); }
/** if clusterIdx is -1, all instances are used (a single metric for all clusters is used) */ public boolean trainMetric(int clusterIdx) throws Exception { Init(clusterIdx); double[] weights = new double[m_numAttributes]; int violatedConstraints = 0; int numInstances = 0; for (int instIdx = 0; instIdx < m_instances.numInstances(); instIdx++) { int assignment = m_clusterAssignments[instIdx]; // only instances assigned to this cluster are of importance if (assignment == clusterIdx || clusterIdx == -1) { numInstances++; if (clusterIdx < 0) { m_centroid = m_kmeans.getClusterCentroids().instance(assignment); } // accumulate variance Instance instance = m_instances.instance(instIdx); Instance diffInstance = m_metric.createDiffInstance(instance, m_centroid); for (int attr = 0; attr < m_numAttributes; attr++) { weights[attr] += diffInstance.value(attr); } // check all constraints for this instance Object list = m_instanceConstraintMap.get(new Integer(instIdx)); if (list != null) { // there are constraints associated with this instance ArrayList constraintList = (ArrayList) list; for (int i = 0; i < constraintList.size(); i++) { InstancePair pair = (InstancePair) constraintList.get(i); int linkType = pair.linkType; int firstIdx = pair.first; int secondIdx = pair.second; Instance instance1 = m_instances.instance(firstIdx); Instance instance2 = m_instances.instance(secondIdx); int otherIdx = (firstIdx == instIdx) ? 
m_clusterAssignments[secondIdx] : m_clusterAssignments[firstIdx]; if (otherIdx != -1) { // check whether the constraint is violated if (otherIdx != assignment && linkType == InstancePair.MUST_LINK) { diffInstance = m_metric.createDiffInstance(instance1, instance2); for (int attr = 0; attr < m_numAttributes; attr++) { weights[attr] += 0.5 * m_MLweight * diffInstance.value(attr); } } else if (otherIdx == assignment && linkType == InstancePair.CANNOT_LINK) { diffInstance = m_metric.createDiffInstance(instance1, instance2); for (int attr = 0; attr < m_numAttributes; attr++) { // this constraint will be counted twice, hence 0.5 weights[attr] += 0.5 * m_CLweight * m_maxCLDiffInstance.value(attr); weights[attr] -= 0.5 * m_CLweight * diffInstance.value(attr); } } } } } } } // System.out.println("Updating cluster " + clusterIdx // + " containing " + numInstances); // check the weights double[] newWeights = new double[m_numAttributes]; double[] currentWeights = m_metric.getWeights(); boolean needNewtonRaphson = false; for (int attr = 0; attr < m_numAttributes; attr++) { if (weights[attr] <= 0) { // check to avoid divide by 0 - TODO! System.out.println( "Negative weight " + weights[attr] + " for clusterIdx=" + clusterIdx + "; using prev value=" + currentWeights[attr]); newWeights[attr] = currentWeights[attr]; // needNewtonRaphson = true; // break; } else { if (m_regularize) { // solution of quadratic equation - TODO! 
int n = m_instances.numInstances(); double ratio = (m_logTermWeight * n) / (2 * weights[attr]); newWeights[attr] = ratio + Math.sqrt(ratio * ratio + (m_regularizerTermWeight * n) / weights[attr]); } else { newWeights[attr] = m_logTermWeight * numInstances / weights[attr]; } } } // do NR if needed if (needNewtonRaphson) { System.out.println("GOING TO NEWTON-RAPHSON!!!\n"); newWeights = updateWeightsUsingNewtonRaphson(currentWeights, weights); } // PRINT routine // System.out.println("Total constraints violated: " + violatedConstraints/2 + "; weights // are:"); // for (int attr=0; attr<numAttributes; attr++) { // System.out.print(newWeights[attr] + "\t"); // } // System.out.println(); // end PRINT routine m_metric.setWeights(newWeights); return true; }