/**
 * Calculates the average similarity between a document and the documents of a cluster.
 *
 * <p>The similarity of {@code document} to each document returned by
 * {@code cluster.getDocumentsList(datasetHandler, neo4jHandler)} is summed and then divided by
 * the number of those documents.
 *
 * @param document the target document
 * @param cluster the target cluster
 * @param datasetHandler dataset handler; its {@code numberOfDocuments()} value is passed to the
 *     similarity calculator for every comparison
 * @param neo4jHandler neo4j handler
 * @return the average similarity, or {@code 0} when the cluster has no documents
 * @throws IOException if propagated while reading the cluster documents or computing a
 *     similarity
 * @throws ClassNotFoundException if propagated while reading the cluster documents or
 *     computing a similarity
 */
private double calculateAvgSimilairtyToCluster(
    Neo4jDocument document,
    Neo4jCluster cluster,
    DatasetLoader datasetHandler,
    Neo4jHandler neo4jHandler)
    throws IOException, ClassNotFoundException {
  DDSimIF similairtyCalculator = new DDSimilairty();
  ArrayList<Neo4jDocument> clusterDocuments =
      cluster.getDocumentsList(datasetHandler, neo4jHandler);

  // Without this guard the division below would be 0.0 / 0, which evaluates to NaN.
  if (clusterDocuments.isEmpty()) {
    return 0;
  }

  double similairty = 0;
  for (Neo4jDocument clusterDocument : clusterDocuments) {
    similairty +=
        similairtyCalculator.calculateSimilarity(
            document, clusterDocument, datasetHandler.numberOfDocuments());
  }
  return similairty / clusterDocuments.size();
}
public Hashtable<String, Neo4jCluster> perform( DatasetLoader datasetHandler, Neo4jHandler neo4jHandler, double similairtyThreshold, int softClusteringThreshold) throws Exception { Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>(); Hashtable<String, Document> docsHash = datasetHandler.loadDocuments(); DDSimIF similarityCalculator = new DDSimilairty(); Enumeration e = docsHash.keys(); int numberOfClusters = 0; // loop for documents in the dataset while (e.hasMoreElements()) { Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>(); String documentID = (String) e.nextElement(); System.out.println("Processing document " + documentID); Document document = docsHash.get(documentID); Neo4jDocument neo4jDocument = neo4jHandler.loadDocument(document); boolean clusteredYet = false; // get similar documents to the document ArrayList<Neo4jDocument> similarDocuments = getSimilarDocuments(neo4jDocument, neo4jHandler, datasetHandler); // loop over the similar documents for (Iterator iterator = similarDocuments.iterator(); iterator.hasNext(); ) { Neo4jDocument neo4jSimilarDocument = (Neo4jDocument) iterator.next(); // continue if the current similar document has no clusters if (neo4jSimilarDocument.getClustersHash().isEmpty()) continue; // check if the distance to the document is greater than the threshold if (similarityCalculator.calculateSimilarity( neo4jDocument, neo4jSimilarDocument, datasetHandler.numberOfDocuments()) > similairtyThreshold) { // get the clusters of the similar document ArrayList<String> candidateDocumentClustersIDs = neo4jSimilarDocument.getClusterIDsList(); // loop over the clusters of the similar document for (Iterator iterator2 = candidateDocumentClustersIDs.iterator(); iterator2.hasNext(); ) { // loop for candidate clusters String candidateClusterID = (String) iterator2.next(); // get the cluster Neo4jCluster candidateNeo4jCluster = clustersList.get(candidateClusterID); // calculate the average 
similarity to the cluster double averageSimilairtyToCluster = calculateAvgSimilairtyToCluster( neo4jDocument, candidateNeo4jCluster, datasetHandler, neo4jHandler); // if the average similarity greater the the threshold then the cluster to the candidate // clusters if (averageSimilairtyToCluster > similairtyThreshold) { clusteredYet = true; candidateClustersHash.put(candidateClusterID, averageSimilairtyToCluster); } // end if adding cluster to candidate cluster hash } // end loop for candidate clusters } // end if [checking if the distance to the candidate document is less than the threshold] } // end looping for similar documents if (!clusteredYet) { // create new cluster numberOfClusters++; Neo4jCluster newCluster = new Neo4jCluster(String.valueOf(numberOfClusters)); newCluster.addDcoument(documentID); neo4jDocument.addCluster(newCluster.getId(), 1); clustersList.put(newCluster.getId(), newCluster); } else { // add to the cloeset cluster String nearestClusterID = getNearestCluster(candidateClustersHash); Neo4jCluster cluster = clustersList.get(nearestClusterID); cluster.addDcoument(neo4jDocument.getDocumentID()); neo4jDocument.addCluster(nearestClusterID, 1); } } // end loop for all documents in the data set return clustersList; }