public int getClusterSize(int clusterNum) {
    int count = 0;
    for (int i = 0; i < seqList.size(); i++) {
      if (cluster[i] == clusterNum) count++;
    }

    return count;
  }
  public ArrayList<String> getClusterIDs(int clusterNum) {
    ArrayList<String> clusterSeqs = new ArrayList<String>();

    for (int i = 0; i < seqList.size(); i++) {
      if (cluster[i] == clusterNum) clusterSeqs.add(seqList.get(i));
    }

    return clusterSeqs;
  }
  public void buildClusterer(ArrayList<String> seqDB, double[][] sm) {
    seqList = seqDB;

    this.setSimMatrix(sm);

    Attribute seqString = new Attribute("sequence", (FastVector) null);
    FastVector attrInfo = new FastVector();
    attrInfo.addElement(seqString);
    Instances data = new Instances("data", attrInfo, 0);

    for (int i = 0; i < seqList.size(); i++) {
      Instance currentInst = new Instance(1);
      currentInst.setDataset(data);
      currentInst.setValue(0, seqList.get(i));
      data.add(currentInst);
    }

    try {
      buildClusterer(data);
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
예제 #4
0
  public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath)
      throws IOException {

    String text = "@relation interest\n";
    text += "@attribute text string\n";
    for (int i = 0; i < maxDocNum; i++) {
      text += "@attribute doc" + i + "\treal\n";
    }
    text += "@data\n";
    for (int j = 0; j < myTerms.size(); j++) {
      MyTerm term = myTerms.get(j);
      String line = "";
      line += term.originTrem.text();
      for (int i = 0; i < term.vector.length; i++) {
        line += "," + term.vector[i];
      }
      line += "\n";
      text += line;
    }
    // System.out.println(text);
    PrintWriter Pout = new PrintWriter(new FileWriter(wekaFilePath));
    Pout.println(text);
    Pout.close();
  }
예제 #5
0
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // SimpleKMeans sm = new SimpleKMeans();
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      result.get(cluster).add(word);
    }

    // print the result
    ArrayList<String> words = new ArrayList<String>();
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
      words = result.get(k);
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        int freq = linkMap.get(s).totalFreq;
        clusterQueue.add(linkMap.get(s));
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      int num = clusterQueue.size() / 10 + 1; // 5%
      int totalFreq = 0;
      int totalLength = 0;
      for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
        JSONObject mem = new JSONObject();
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue;
        }
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        i++;
        totalFreq += myTerm.totalFreq;
        totalLength += word.length();
      }

      double averFreq = totalFreq * 1.0 / num;
      double averLength = totalLength * 1.0 / num;
      int count = 0;
      while (!clusterQueue.isEmpty() && count < num) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averFreq) + 1;
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (result.get(k).size() < 100) {
        System.out.println(result.get(k));
      }
    }
    // System.out.println("errorNum:"+errorNum);
    return keyWords;
  }
  /** if clusterIdx is -1, all instances are used (a single metric for all clusters is used) */
  public boolean trainMetric(int clusterIdx) throws Exception {
    Init(clusterIdx);

    double[] weights = new double[m_numAttributes];
    int violatedConstraints = 0;
    int numInstances = 0;

    for (int instIdx = 0; instIdx < m_instances.numInstances(); instIdx++) {
      int assignment = m_clusterAssignments[instIdx];

      // only instances assigned to this cluster are of importance
      if (assignment == clusterIdx || clusterIdx == -1) {
        numInstances++;
        if (clusterIdx < 0) {
          m_centroid = m_kmeans.getClusterCentroids().instance(assignment);
        }

        // accumulate variance
        Instance instance = m_instances.instance(instIdx);
        Instance diffInstance = m_metric.createDiffInstance(instance, m_centroid);
        for (int attr = 0; attr < m_numAttributes; attr++) {
          weights[attr] += diffInstance.value(attr);
        }

        // check all constraints for this instance
        Object list = m_instanceConstraintMap.get(new Integer(instIdx));
        if (list != null) { // there are constraints associated with this instance
          ArrayList constraintList = (ArrayList) list;
          for (int i = 0; i < constraintList.size(); i++) {
            InstancePair pair = (InstancePair) constraintList.get(i);
            int linkType = pair.linkType;
            int firstIdx = pair.first;
            int secondIdx = pair.second;
            Instance instance1 = m_instances.instance(firstIdx);
            Instance instance2 = m_instances.instance(secondIdx);
            int otherIdx =
                (firstIdx == instIdx)
                    ? m_clusterAssignments[secondIdx]
                    : m_clusterAssignments[firstIdx];

            if (otherIdx != -1) { // check whether the constraint is violated
              if (otherIdx != assignment && linkType == InstancePair.MUST_LINK) {
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  weights[attr] += 0.5 * m_MLweight * diffInstance.value(attr);
                }
              } else if (otherIdx == assignment && linkType == InstancePair.CANNOT_LINK) {
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  // this constraint will be counted twice, hence 0.5
                  weights[attr] += 0.5 * m_CLweight * m_maxCLDiffInstance.value(attr);
                  weights[attr] -= 0.5 * m_CLweight * diffInstance.value(attr);
                }
              }
            }
          }
        }
      }
    }
    //      System.out.println("Updating cluster " + clusterIdx
    //  		       + " containing " + numInstances);

    // check the weights
    double[] newWeights = new double[m_numAttributes];
    double[] currentWeights = m_metric.getWeights();

    boolean needNewtonRaphson = false;
    for (int attr = 0; attr < m_numAttributes; attr++) {
      if (weights[attr] <= 0) { // check to avoid divide by 0 - TODO!
        System.out.println(
            "Negative weight "
                + weights[attr]
                + " for clusterIdx="
                + clusterIdx
                + "; using prev value="
                + currentWeights[attr]);
        newWeights[attr] = currentWeights[attr];
        //  	needNewtonRaphson = true;
        //	break;
      } else {
        if (m_regularize) { // solution of quadratic equation - TODO!
          int n = m_instances.numInstances();
          double ratio = (m_logTermWeight * n) / (2 * weights[attr]);
          newWeights[attr] =
              ratio + Math.sqrt(ratio * ratio + (m_regularizerTermWeight * n) / weights[attr]);
        } else {
          newWeights[attr] = m_logTermWeight * numInstances / weights[attr];
        }
      }
    }

    // do NR if needed
    if (needNewtonRaphson) {
      System.out.println("GOING TO NEWTON-RAPHSON!!!\n");
      newWeights = updateWeightsUsingNewtonRaphson(currentWeights, weights);
    }

    // PRINT routine
    //      System.out.println("Total constraints violated: " + violatedConstraints/2 + "; weights
    // are:");
    //      for (int attr=0; attr<numAttributes; attr++) {
    //        System.out.print(newWeights[attr] + "\t");
    //      }
    //      System.out.println();
    // end PRINT routine

    m_metric.setWeights(newWeights);
    return true;
  }