예제 #1
0
 /**
  * Calculates the average similarity between a document and the documents of a cluster.
  *
  * <p>The cluster's documents are obtained with {@code cluster.getDocumentsList(datasetHandler,
  * neo4jHandler)}. Each of them is compared with {@code document} by a {@code DDSimilairty}
  * calculator, and the resulting similarities are averaged.
  *
  * @param document the target document
  * @param cluster the target cluster
  * @param datasetHandler dataset handler; also supplies the document count that is passed to the
  *     similarity calculator
  * @param neo4jHandler neo4j handler
  * @return the average similarity, or {@code 0} when the cluster has no documents
  * @throws IOException propagated from the calls made by this method
  * @throws ClassNotFoundException propagated from the calls made by this method
  */
 private double calculateAvgSimilairtyToCluster(
     Neo4jDocument document,
     Neo4jCluster cluster,
     DatasetLoader datasetHandler,
     Neo4jHandler neo4jHandler)
     throws IOException, ClassNotFoundException {
   DDSimIF similairtyCalculator = new DDSimilairty();
   ArrayList<Neo4jDocument> clusterDocuments =
       cluster.getDocumentsList(datasetHandler, neo4jHandler);
   // Without this guard an empty cluster would evaluate 0.0 / 0, i.e. NaN.
   if (clusterDocuments.isEmpty()) {
     return 0;
   }
   double similairtySum = 0;
   for (Neo4jDocument clusterDocument : clusterDocuments) {
     similairtySum +=
         similairtyCalculator.calculateSimilarity(
             document, clusterDocument, datasetHandler.numberOfDocuments());
   }
   return similairtySum / clusterDocuments.size();
 }
예제 #2
0
  /**
   * Clusters the documents of the dataset, processing them one at a time.
   *
   * <p>For each document loaded from {@code datasetHandler}, the documents returned by {@code
   * getSimilarDocuments} are examined. When a similar document already belongs to clusters and
   * its similarity to the current document exceeds {@code similairtyThreshold}, each of its
   * clusters becomes a candidate if the current document's average similarity to that cluster
   * also exceeds the threshold. The document is then added to the candidate selected by {@code
   * getNearestCluster}; when there is no candidate, a new cluster with the next sequential id
   * is created for it.
   *
   * @param datasetHandler source of the documents and of the document count passed to the
   *     similarity calculator
   * @param neo4jHandler used to load the Neo4j representation of each document
   * @param similairtyThreshold value that both the document-to-document similarity and the
   *     average similarity to a cluster must exceed
   * @param softClusteringThreshold not used by this method
   * @return the clusters created by this call, keyed by cluster id
   * @throws Exception propagated from the handlers and helper methods called here
   */
  public Hashtable<String, Neo4jCluster> perform(
      DatasetLoader datasetHandler,
      Neo4jHandler neo4jHandler,
      double similairtyThreshold,
      int softClusteringThreshold)
      throws Exception {
    Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>();
    Hashtable<String, Document> docsHash = datasetHandler.loadDocuments();
    DDSimIF similarityCalculator = new DDSimilairty();
    Enumeration<String> documentIDs = docsHash.keys();
    int numberOfClusters = 0;
    // loop over the documents in the dataset
    while (documentIDs.hasMoreElements()) {
      // candidate cluster id -> average similarity of the current document to that cluster
      Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>();
      String documentID = documentIDs.nextElement();
      System.out.println("Processing document " + documentID);
      Document document = docsHash.get(documentID);
      Neo4jDocument neo4jDocument = neo4jHandler.loadDocument(document);
      // get the documents similar to the current document
      ArrayList<Neo4jDocument> similarDocuments =
          getSimilarDocuments(neo4jDocument, neo4jHandler, datasetHandler);
      for (Neo4jDocument neo4jSimilarDocument : similarDocuments) {
        // a similar document without clusters cannot contribute candidates
        if (neo4jSimilarDocument.getClustersHash().isEmpty()) {
          continue;
        }
        // only follow similar documents whose similarity is greater than the threshold
        if (similarityCalculator.calculateSimilarity(
                neo4jDocument, neo4jSimilarDocument, datasetHandler.numberOfDocuments())
            > similairtyThreshold) {
          ArrayList<String> candidateDocumentClustersIDs = neo4jSimilarDocument.getClusterIDsList();
          for (String candidateClusterID : candidateDocumentClustersIDs) {
            Neo4jCluster candidateNeo4jCluster = clustersList.get(candidateClusterID);
            // The id may refer to a cluster that was not created by this call; there is
            // nothing to compare against, and passing null on would fail with a
            // NullPointerException.
            if (candidateNeo4jCluster == null) {
              continue;
            }
            double averageSimilairtyToCluster =
                calculateAvgSimilairtyToCluster(
                    neo4jDocument, candidateNeo4jCluster, datasetHandler, neo4jHandler);
            // keep the cluster as a candidate when the average similarity is greater than
            // the threshold
            if (averageSimilairtyToCluster > similairtyThreshold) {
              candidateClustersHash.put(candidateClusterID, averageSimilairtyToCluster);
            }
          }
        }
      }

      if (candidateClustersHash.isEmpty()) {
        // no suitable cluster was found: create a new one holding only this document
        numberOfClusters++;
        Neo4jCluster newCluster = new Neo4jCluster(String.valueOf(numberOfClusters));
        newCluster.addDcoument(documentID);
        neo4jDocument.addCluster(newCluster.getId(), 1);
        clustersList.put(newCluster.getId(), newCluster);
      } else {
        // add the document to the closest candidate cluster
        String nearestClusterID = getNearestCluster(candidateClustersHash);
        Neo4jCluster cluster = clustersList.get(nearestClusterID);
        cluster.addDcoument(neo4jDocument.getDocumentID());
        neo4jDocument.addCluster(nearestClusterID, 1);
      }
    }

    return clustersList;
  }