コード例 #1
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
 /**
  * Builds a {@code SpectralClustering} for {@code data} with {@code k} clusters, passing 0.27 as
  * the third constructor argument, and prints timing and summary diagnostics to standard output.
  *
  * @param data the sample matrix handed to the {@code SpectralClustering} constructor
  * @param k the requested number of clusters
  * @return the newly constructed clustering
  */
 private SpectralClustering cluster(double[][] data, int k) {
   final long startedAt = System.currentTimeMillis();
   final SpectralClustering model = new SpectralClustering(data, k, 0.27);
   final long elapsedMs = System.currentTimeMillis() - startedAt;

   // The log label says "DBSCAN" although the model is a SpectralClustering; it is kept verbatim
   // so the emitted output does not change.
   System.out.printf("DBSCAN clusterings %d samples in %dms\n", data.length, elapsedMs);
   System.out.println("getNumClusters:" + model.getNumClusters());
   System.out.println("getClusterSize:" + model.getClusterSize());
   System.out.println("toString:" + model.toString());
   return model;
 }
コード例 #2
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
 /**
  * Runs the clustering pipeline end to end: builds the sample matrix, clusters it once with
  * {@code data.length - 1} clusters to obtain the eigenvalues, derives a cluster count from them
  * via {@code bestCount}, clusters again with that count, and then prints the result, writes it
  * to files and writes it to Solr.
  */
 public void main() {
   double[][] data = buildData();

   // First pass: its labels are discarded; only the eigenvalues are used to choose the count.
   SpectralClustering probe = cluster(data, data.length - 1);
   int count = bestCount(probe.getEigen().getEigenValues());

   // Second pass: the clustering that is actually reported and persisted.
   SpectralClustering clustering = cluster(data, count);
   print(clustering);
   writeFile(clustering);
   writeSolr(clustering);
 }
コード例 #3
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
 /**
  * Groups the documents in {@code list} by the cluster label at the same index and prints each
  * group to standard output. The label of every group is printed; the documents themselves
  * (url, title, text) are printed only for groups with fewer than 10 members.
  *
  * @param cluster the clustering whose label array is index-aligned with {@code list}
  */
 private void print(SpectralClustering cluster) {
   Map<Integer, List<Doc>> groups = new HashMap<Integer, List<Doc>>();
   int[] lab = cluster.getClusterLabel();
   for (int x = 0; x < lab.length; x++) {
     Doc doc = list.get(x);
     Integer label = Integer.valueOf(lab[x]);
     // A single lookup per element; a plain ArrayList replaces the former anonymous subclass,
     // which held a hidden reference to the enclosing instance.
     List<Doc> members = groups.get(label);
     if (members == null) {
       members = new ArrayList<Doc>();
       groups.put(label, members);
     }
     members.add(doc);
   }

   for (Map.Entry<Integer, List<Doc>> e : groups.entrySet()) {
     System.out.println("type:" + e.getKey());
     // Groups with 10 or more members are only listed by label, not dumped in full.
     if (e.getValue().size() < 10) {
       for (Doc d : e.getValue()) {
         System.out.println("-------------------------------------");
         System.out.println("url:" + d.url);
         System.out.println("title:" + d.title);
         System.out.println("doc:" + d.text);
       }
     }
     System.out.println("==============================");
   }
 }
コード例 #4
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
 /**
  * Cleans {@code output} and then appends every document in {@code list} to the file
  * {@code <label>.txt} under {@code output}, where the label is the cluster label at the same
  * index. Each record is the document's url, title and text on separate lines followed by
  * {@code hr}.
  *
  * @param cluster the clustering whose label array is index-aligned with {@code list}
  */
 private void writeFile(SpectralClustering cluster) {
   FileUtil.clean(output);
   final int[] labels = cluster.getClusterLabel();
   int index = 0;
   for (int label : labels) {
     Doc current = list.get(index);
     index++;
     String record = current.url + "\n" + current.title + "\n" + current.text + "\n" + hr;
     File target = new File(output, label + ".txt");
     // The third argument is passed as true, exactly as before (documents of one cluster share
     // a file, so it presumably selects append mode; confirm against FileUtil).
     FileUtil.write(target, record, true);
   }
 }
コード例 #5
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
  /**
   * Builds a {@code SpectralClustering} for {@code data} with {@code k} clusters, passing 0.355 as
   * the third constructor argument, and prints timing and summary diagnostics plus the standard
   * deviation of the eigenvalues. When the smallest eigenvalue is greater than 0.3 the clustering
   * is stored in {@code result} and {@code cluster(data, k + 1)} is invoked; otherwise nothing
   * further happens.
   *
   * @param data the sample matrix handed to the {@code SpectralClustering} constructor
   * @param k the requested number of clusters
   */
  private void _cluster(double[][] data, int k) {
    long clock = System.currentTimeMillis();
    SpectralClustering clustering = new SpectralClustering(data, k, 0.355);
    System.out.format(
        "DBSCAN clusterings %d samples in %dms\n", data.length, System.currentTimeMillis() - clock);
    System.out.println("getNumClusters:" + clustering.getNumClusters());
    System.out.println("getClusterSize:" + clustering.getClusterSize());
    System.out.println("toString:" + clustering.toString());

    // Fetch the eigenvalues once and reuse them for both statistics.
    double[] eigenValues = clustering.getEigen().getEigenValues();
    System.out.println("sd(eigen.getEigenValues()):" + smile.math.Math.sd(eigenValues));

    // NOTE(review): Math.min(double[]) is not a java.lang.Math overload, so this presumably
    // resolves to an imported smile.math.Math; confirm against the file's imports.
    if (Math.min(eigenValues) > 0.3) {
      result = clustering;
      // NOTE(review): the return value of cluster(...) is discarded here. A recursive call to
      // _cluster(data, k + 1) may have been intended; left unchanged pending confirmation.
      cluster(data, k + 1);
    }
  }
コード例 #6
0
ファイル: SimilarFilter.java プロジェクト: chord1645/jackals
  /**
   * Groups the documents in {@code list} by cluster label and, for every group with fewer than 10
   * members, tags the matching Solr documents with a shared group id and re-indexes them.
   *
   * <p>For each such group a query of the form {@code url:"a" OR url:"b" } is built from the
   * member urls, a name-based UUID of that query string becomes the group id, and the documents
   * returned by {@code indexDao.sortList} (sorted by {@code infoTime_dt desc}) are copied with
   * three extra fields: {@code useful_i} (1 for the first returned document, 0 for the rest),
   * {@code sim_i} (number of returned documents) and {@code group_s} (the group id).
   *
   * @param cluster the clustering whose label array is index-aligned with {@code list}
   */
  private void writeSolr(SpectralClustering cluster) {
    Map<Integer, List<Doc>> groups = new HashMap<Integer, List<Doc>>();
    int[] lab = cluster.getClusterLabel();
    for (int x = 0; x < lab.length; x++) {
      Doc doc = list.get(x);
      Integer label = Integer.valueOf(lab[x]);
      List<Doc> members = groups.get(label);
      if (members == null) {
        members = new ArrayList<Doc>();
        groups.put(label, members);
      }
      members.add(doc);
    }

    for (Map.Entry<Integer, List<Doc>> e : groups.entrySet()) {
      System.out.println("type:" + e.getKey());

      List<Doc> members = e.getValue();
      if (members.size() >= 10) {
        // Only groups with fewer than 10 members are written back.
        continue;
      }

      StringBuilder query = new StringBuilder();
      for (Doc doc : members) {
        query.append("url:\"").append(doc.url).append("\"");
        query.append(" OR ");
      }
      // Drop the trailing "OR " but keep the space before it: the group id is derived from this
      // exact string, so its shape must not change.
      query.setLength(query.length() - "OR ".length());

      // NOTE(review): getBytes() uses the platform default charset, so ids for non-ASCII urls
      // may differ between machines; left as is to keep existing ids stable.
      String groupId = UUID.nameUUIDFromBytes(query.toString().getBytes()).toString();
      System.out.println("group:" + groupId + " = " + members.size());

      // Named "hits" so it no longer shadows the field "list" used for grouping above.
      List<SolrDocument> hits =
          (List<SolrDocument>) indexDao.sortList(query.toString(), 1, 100, "infoTime_dt desc");

      List<Map<String, Object>> toSave = new ArrayList<Map<String, Object>>();
      int useful = 1;
      for (SolrDocument hit : hits) {
        Map<String, Object> inputDoc = new HashMap<String, Object>(hit);
        inputDoc.put("useful_i", useful);
        inputDoc.put("sim_i", hits.size());
        inputDoc.put("group_s", groupId);
        toSave.add(inputDoc);
        // Only the first returned document keeps useful_i = 1.
        useful = 0;
      }
      indexDao.addIndex(toSave);
    }
  }