public ArrayList<String> getClusterIDs(int clusterNum) {
    ArrayList<String> clusterSeqs = new ArrayList<String>();

    for (int i = 0; i < seqList.size(); i++) {
      if (cluster[i] == clusterNum) clusterSeqs.add(seqList.get(i));
    }

    return clusterSeqs;
  }
Пример #2
0
  public void getIndexInfo(String indexdir, int freqThreshold) {
    IndexReader reader = null;

    try {
      Directory dir = FSDirectory.open(new File(indexdir));
      System.out.println(dir);
      reader = IndexReader.open(dir);

      System.out.println("document num:" + reader.numDocs());
      System.out.println("======================");

      TermEnum terms = reader.terms();
      sortedTermQueue.clear();
      maxDocNum = reader.maxDoc();
      linkMap.clear();
      termList.clear();
      while (terms.next()) {
        // System.out.print(terms.term() + "\tDocFreq:" +
        TermDocs termDocs = reader.termDocs(terms.term());
        MyTerm temp = new MyTerm(terms.term(), termDocs, maxDocNum);
        if (temp.totalFreq < freqThreshold) {
          continue;
        } /*
           * if(temp.originTrem.text().length()==1){ continue; }
           */
        linkMap.put(temp.originTrem.text(), temp);
        sortedTermQueue.add(temp);
        termList.add(temp);
      }
      System.out.println("total Size:" + sortedTermQueue.size());
      System.out.println("mapsize:" + linkMap.keySet().size());
      // System.exit(0);
      int num = 0;
      this.maxFreq = sortedTermQueue.peek().totalFreq;
      while (!sortedTermQueue.isEmpty()) {
        num++;
        System.out.println(num + ":" + sortedTermQueue.poll());
      }
      System.out.println("read index info done");
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        reader.close();

      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  public int getClusterSize(int clusterNum) {
    int count = 0;
    for (int i = 0; i < seqList.size(); i++) {
      if (cluster[i] == clusterNum) count++;
    }

    return count;
  }
  public void buildClusterer(ArrayList<String> seqDB, double[][] sm) {
    seqList = seqDB;

    this.setSimMatrix(sm);

    Attribute seqString = new Attribute("sequence", (FastVector) null);
    FastVector attrInfo = new FastVector();
    attrInfo.addElement(seqString);
    Instances data = new Instances("data", attrInfo, 0);

    for (int i = 0; i < seqList.size(); i++) {
      Instance currentInst = new Instance(1);
      currentInst.setDataset(data);
      currentInst.setValue(0, seqList.get(i));
      data.add(currentInst);
    }

    try {
      buildClusterer(data);
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
Пример #5
0
  public void generateWekaFile(ArrayList<MyTerm> myTerms, int maxDocNum, String wekaFilePath)
      throws IOException {

    String text = "@relation interest\n";
    text += "@attribute text string\n";
    for (int i = 0; i < maxDocNum; i++) {
      text += "@attribute doc" + i + "\treal\n";
    }
    text += "@data\n";
    for (int j = 0; j < myTerms.size(); j++) {
      MyTerm term = myTerms.get(j);
      String line = "";
      line += term.originTrem.text();
      for (int i = 0; i < term.vector.length; i++) {
        line += "," + term.vector[i];
      }
      line += "\n";
      text += line;
    }
    // System.out.println(text);
    PrintWriter Pout = new PrintWriter(new FileWriter(wekaFilePath));
    Pout.println(text);
    Pout.close();
  }
Пример #6
0
  public JSONArray Cluster(String wekaFilePath, int clusterNum) throws Exception {
    File inputFile = new File(wekaFilePath);
    ArffLoader arf = new ArffLoader();
    arf.setFile(inputFile);
    Instances originIns = arf.getDataSet();
    Instances insTest = new Instances(originIns);
    insTest.deleteStringAttributes();
    int totalNum = insTest.numInstances();

    // SimpleKMeans sm = new SimpleKMeans();
    EM em = new EM();
    em.setNumClusters(clusterNum);
    MakeDensityBasedClusterer sm = new MakeDensityBasedClusterer();
    sm.setClusterer(em);
    sm.buildClusterer(insTest);

    System.out.println("totalNum:" + insTest.numInstances());
    System.out.println("============================");
    System.out.println(sm.toString());
    Map<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
    for (int i = 0; i < clusterNum; i++) {
      result.put(i, new ArrayList<String>());
    }

    for (int i = 0; i < totalNum; i++) {
      Instance ins = originIns.instance(i);
      String word = ins.stringValue(0);
      Instance tempIns = new Instance(ins);
      tempIns.deleteAttributeAt(0);
      int cluster = sm.clusterInstance(tempIns);
      result.get(cluster).add(word);
    }

    // print the result
    ArrayList<String> words = new ArrayList<String>();
    JSONArray keyWords = new JSONArray();
    for (int k : result.keySet()) {
      words = result.get(k);
      PriorityQueue<MyTerm> clusterQueue = new PriorityQueue<MyTerm>(1, MyTermCompare);
      for (int i = 0; i < words.size(); i++) {
        String s = words.get(i);
        assert linkMap.containsKey(s);
        int freq = linkMap.get(s).totalFreq;
        clusterQueue.add(linkMap.get(s));
        words.set(i, "(" + s + ":" + freq + ")");
      }

      JSONArray clusterArray = new JSONArray();
      int num = clusterQueue.size() / 10 + 1; // 5%
      int totalFreq = 0;
      int totalLength = 0;
      for (int i = 0; i < num && !clusterQueue.isEmpty(); ) {
        JSONObject mem = new JSONObject();
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        if (word.length() == 1) {
          continue;
        }
        mem.put("text", word);
        mem.put("freq", myTerm.totalFreq);
        clusterArray.put(mem);
        i++;
        totalFreq += myTerm.totalFreq;
        totalLength += word.length();
      }

      double averFreq = totalFreq * 1.0 / num;
      double averLength = totalLength * 1.0 / num;
      int count = 0;
      while (!clusterQueue.isEmpty() && count < num) {
        MyTerm myTerm = clusterQueue.poll();
        String word = myTerm.originTrem.text();
        int freq = myTerm.totalFreq;
        int times = (int) (word.length() / averFreq) + 1;
        if (freq > averFreq / times) {
          JSONObject mem = new JSONObject();
          mem.put("text", word);
          mem.put("freq", freq);
          mem.put("extra", true);
          clusterArray.put(mem);
        }
      }

      keyWords.put(clusterArray);
      System.out.println(
          "cluster" + k + ":" + words.size() + ":\t" + (int) (words.size() * 1.0 / totalNum * 100));
      if (result.get(k).size() < 100) {
        System.out.println(result.get(k));
      }
    }
    // System.out.println("errorNum:"+errorNum);
    return keyWords;
  }
Пример #7
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>columns</b> representing individual </b>instances</b> and <b>rows</b> representing
   * <b>attributes</b> (e.g. as is common with microarray data)
   */
  public Instances tableColsToInstances(Table t, String relationName) {

    System.err.print("Converting table cols to instances...");

    // Set up attributes, which for colInstances will be the rowNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...

    System.err.print("creating attributes...");

    for (int r = 0; r < t.numRows; r++) {
      if (rowIsNumeric(t, r)) {
        isNominal.add(false);
        atts.addElement(new Attribute(t.rowNames[r]));
        allAttVals.add(null); // No enumeration of attribute values.
      } else {
        // It's nominal... determine the range of values and create a nominal attribute...
        isNominal.add(true);
        FastVector attVals = getRowValues(t, r);
        atts.addElement(new Attribute(t.rowNames[r], attVals));
        // Save it for later
        allAttVals.add(attVals);
      }
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    /** ***** CREATE INSTANCES ************* */
    // Fill the instances with data...
    // For each instance...
    for (int c = 0; c < t.numCols; c++) {
      double[] vals =
          new double[data.numAttributes()]; // Even nominal values are stored as double pointers.

      // For each attribute fill in the numeric or attributeValue index...
      for (int r = 0; r < t.numRows; r++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[r] = Instance.missingValue();
        else if (isNominal.get(r)) {
          vals[r] = allAttVals.get(r).indexOf(val);
        } else {
          vals[r] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    /** ***** ADD FEATURE NAMES ************* */
    // takes basically zero time... all time is in previous 2 chunks.
    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int c = 0; c < t.colNames.length; c++) {
        instanceNames.add(t.colNames[c]);
        newData.instance(c).setValue(attrIdx, t.colNames[c]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
Пример #8
0
  /**
   * ************************************************** Convert a table to a set of instances, with
   * <b>rows</b> representing individual </b>instances</b> and <b>columns</b> representing
   * <b>attributes</b>
   */
  public Instances tableRowsToNominalInstances(Table t, String relationName) {

    System.err.print("Converting table rows to instances...");

    // Set up attributes, which for rowInstances will be the colNames...
    FastVector atts = new FastVector();
    ArrayList<Boolean> isNominal = new ArrayList<Boolean>();
    ArrayList<FastVector> allAttVals = new ArrayList<FastVector>(); // Save values for later...			

    System.err.print("creating attributes...");

    for (int c = 0; c < t.numCols; c++) {
      // It's nominal... determine the range of values
      isNominal.add(true);
      FastVector attVals = getColValues(t, c);
      atts.addElement(new Attribute(t.colNames[c], attVals));
      // Save it for later
      allAttVals.add(attVals);
    }

    System.err.print("creating instances...");

    // Create Instances object..
    Instances data = new Instances(relationName, atts, 0);
    data.setRelationName(relationName);

    // Fill the instances with data...
    // For each instance...
    for (int r = 0; r < t.numRows; r++) {
      double[] vals = new double[data.numAttributes()];

      // for each attribute
      for (int c = 0; c < t.numCols; c++) {
        String val = (String) t.matrix.getQuick(r, c);
        if (val == "?") vals[c] = Instance.missingValue();
        else if (isNominal.get(c)) {
          vals[c] = allAttVals.get(c).indexOf(val);
        } else {
          vals[c] = Double.parseDouble((String) val);
        }
      }
      // Add the a newly minted instance with those attribute values...
      data.add(new Instance(1.0, vals));
    }

    System.err.print("add feature names...");

    if (addInstanceNamesAsFeatures) {
      Instances newData = new Instances(data);
      newData.insertAttributeAt(new Attribute("ID", (FastVector) null), 0);
      int attrIdx = newData.attribute("ID").index(); // Paranoid... should be 0

      // We save the instanceNames in a list because it's handy later on...
      instanceNames = new ArrayList<String>();

      for (int r = 0; r < t.rowNames.length; r++) {
        instanceNames.add(t.rowNames[r]);
        newData.instance(r).setValue(attrIdx, t.rowNames[r]);
      }
      data = newData;
    }

    System.err.println("done.");

    return (data);
  }
  /** if clusterIdx is -1, all instances are used (a single metric for all clusters is used) */
  public boolean trainMetric(int clusterIdx) throws Exception {
    Init(clusterIdx);

    double[] weights = new double[m_numAttributes];
    int violatedConstraints = 0;
    int numInstances = 0;

    for (int instIdx = 0; instIdx < m_instances.numInstances(); instIdx++) {
      int assignment = m_clusterAssignments[instIdx];

      // only instances assigned to this cluster are of importance
      if (assignment == clusterIdx || clusterIdx == -1) {
        numInstances++;
        if (clusterIdx < 0) {
          m_centroid = m_kmeans.getClusterCentroids().instance(assignment);
        }

        // accumulate variance
        Instance instance = m_instances.instance(instIdx);
        Instance diffInstance = m_metric.createDiffInstance(instance, m_centroid);
        for (int attr = 0; attr < m_numAttributes; attr++) {
          weights[attr] += diffInstance.value(attr);
        }

        // check all constraints for this instance
        Object list = m_instanceConstraintMap.get(new Integer(instIdx));
        if (list != null) { // there are constraints associated with this instance
          ArrayList constraintList = (ArrayList) list;
          for (int i = 0; i < constraintList.size(); i++) {
            InstancePair pair = (InstancePair) constraintList.get(i);
            int linkType = pair.linkType;
            int firstIdx = pair.first;
            int secondIdx = pair.second;
            Instance instance1 = m_instances.instance(firstIdx);
            Instance instance2 = m_instances.instance(secondIdx);
            int otherIdx =
                (firstIdx == instIdx)
                    ? m_clusterAssignments[secondIdx]
                    : m_clusterAssignments[firstIdx];

            if (otherIdx != -1) { // check whether the constraint is violated
              if (otherIdx != assignment && linkType == InstancePair.MUST_LINK) {
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  weights[attr] += 0.5 * m_MLweight * diffInstance.value(attr);
                }
              } else if (otherIdx == assignment && linkType == InstancePair.CANNOT_LINK) {
                diffInstance = m_metric.createDiffInstance(instance1, instance2);
                for (int attr = 0; attr < m_numAttributes; attr++) {
                  // this constraint will be counted twice, hence 0.5
                  weights[attr] += 0.5 * m_CLweight * m_maxCLDiffInstance.value(attr);
                  weights[attr] -= 0.5 * m_CLweight * diffInstance.value(attr);
                }
              }
            }
          }
        }
      }
    }
    //      System.out.println("Updating cluster " + clusterIdx
    //  		       + " containing " + numInstances);

    // check the weights
    double[] newWeights = new double[m_numAttributes];
    double[] currentWeights = m_metric.getWeights();

    boolean needNewtonRaphson = false;
    for (int attr = 0; attr < m_numAttributes; attr++) {
      if (weights[attr] <= 0) { // check to avoid divide by 0 - TODO!
        System.out.println(
            "Negative weight "
                + weights[attr]
                + " for clusterIdx="
                + clusterIdx
                + "; using prev value="
                + currentWeights[attr]);
        newWeights[attr] = currentWeights[attr];
        //  	needNewtonRaphson = true;
        //	break;
      } else {
        if (m_regularize) { // solution of quadratic equation - TODO!
          int n = m_instances.numInstances();
          double ratio = (m_logTermWeight * n) / (2 * weights[attr]);
          newWeights[attr] =
              ratio + Math.sqrt(ratio * ratio + (m_regularizerTermWeight * n) / weights[attr]);
        } else {
          newWeights[attr] = m_logTermWeight * numInstances / weights[attr];
        }
      }
    }

    // do NR if needed
    if (needNewtonRaphson) {
      System.out.println("GOING TO NEWTON-RAPHSON!!!\n");
      newWeights = updateWeightsUsingNewtonRaphson(currentWeights, weights);
    }

    // PRINT routine
    //      System.out.println("Total constraints violated: " + violatedConstraints/2 + "; weights
    // are:");
    //      for (int attr=0; attr<numAttributes; attr++) {
    //        System.out.print(newWeights[attr] + "\t");
    //      }
    //      System.out.println();
    // end PRINT routine

    m_metric.setWeights(newWeights);
    return true;
  }