/** * Get similar documents to the current document by matching the nodes * * @param doc target document * @param neo4jHandler neo4j handler * @param datasetHandler dataset handler * @return arraylist of matching document * @throws IOException * @throws ClassNotFoundException */ private ArrayList<Neo4jDocument> getSimilarDocuments( Neo4jDocument doc, Neo4jHandler neo4jHandler, DatasetLoader datasetHandler) throws IOException, ClassNotFoundException { ArrayList<Neo4jDocument> similarDocument = new ArrayList<Neo4jDocument>(); Hashtable<String, Neo4jDocument> similarDocumentHash = new Hashtable<String, Neo4jDocument>(); Neo4jDocument neo4jDocument = doc; ArrayList<Neo4jNode> nodes = neo4jDocument.getNodesList(); for (Iterator iterator = nodes.iterator(); iterator.hasNext(); ) { Neo4jNode neo4jNode = (Neo4jNode) iterator.next(); Hashtable<String, ArrayList<String>> documentTable = neo4jNode.getDocumentTable(); Enumeration e = documentTable.keys(); while (e.hasMoreElements()) { String simDocumentID = (String) e.nextElement(); if (!simDocumentID.equalsIgnoreCase(doc.getDocumentID()) && !similarDocumentHash.containsKey(simDocumentID)) { Document d = datasetHandler.getDocument(simDocumentID); Neo4jDocument nd = neo4jHandler.loadDocument(d); similarDocumentHash.put(simDocumentID, nd); similarDocument.add(nd); } // end if } // end looping for document at document table in the current node } // end looping for the node return similarDocument; }
public Hashtable<String, Neo4jCluster> perform( DatasetLoader datasetHandler, Neo4jHandler neo4jHandler, double similairtyThreshold, int softClusteringThreshold) throws Exception { Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>(); Hashtable<String, Document> docsHash = datasetHandler.loadDocuments(); DDSimIF similarityCalculator = new DDSimilairty(); Enumeration e = docsHash.keys(); int numberOfClusters = 0; // loop for documents in the dataset while (e.hasMoreElements()) { Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>(); String documentID = (String) e.nextElement(); System.out.println("Processing document " + documentID); Document document = docsHash.get(documentID); Neo4jDocument neo4jDocument = neo4jHandler.loadDocument(document); boolean clusteredYet = false; // get similar documents to the document ArrayList<Neo4jDocument> similarDocuments = getSimilarDocuments(neo4jDocument, neo4jHandler, datasetHandler); // loop over the similar documents for (Iterator iterator = similarDocuments.iterator(); iterator.hasNext(); ) { Neo4jDocument neo4jSimilarDocument = (Neo4jDocument) iterator.next(); // continue if the current similar document has no clusters if (neo4jSimilarDocument.getClustersHash().isEmpty()) continue; // check if the distance to the document is greater than the threshold if (similarityCalculator.calculateSimilarity( neo4jDocument, neo4jSimilarDocument, datasetHandler.numberOfDocuments()) > similairtyThreshold) { // get the clusters of the similar document ArrayList<String> candidateDocumentClustersIDs = neo4jSimilarDocument.getClusterIDsList(); // loop over the clusters of the similar document for (Iterator iterator2 = candidateDocumentClustersIDs.iterator(); iterator2.hasNext(); ) { // loop for candidate clusters String candidateClusterID = (String) iterator2.next(); // get the cluster Neo4jCluster candidateNeo4jCluster = clustersList.get(candidateClusterID); // calculate the average similarity to the cluster double averageSimilairtyToCluster = calculateAvgSimilairtyToCluster( neo4jDocument, candidateNeo4jCluster, datasetHandler, neo4jHandler); // if the average similarity greater the the threshold then the cluster to the candidate // clusters if (averageSimilairtyToCluster > similairtyThreshold) { clusteredYet = true; candidateClustersHash.put(candidateClusterID, averageSimilairtyToCluster); } // end if adding cluster to candidate cluster hash } // end loop for candidate clusters } // end if [checking if the distance to the candidate document is less than the threshold] } // end looping for similar documents if (!clusteredYet) { // create new cluster numberOfClusters++; Neo4jCluster newCluster = new Neo4jCluster(String.valueOf(numberOfClusters)); newCluster.addDcoument(documentID); neo4jDocument.addCluster(newCluster.getId(), 1); clustersList.put(newCluster.getId(), newCluster); } else { // add to the cloeset cluster String nearestClusterID = getNearestCluster(candidateClustersHash); Neo4jCluster cluster = clustersList.get(nearestClusterID); cluster.addDcoument(neo4jDocument.getDocumentID()); neo4jDocument.addCluster(nearestClusterID, 1); } } // end loop for all documents in the data set return clustersList; }