public static void main(String[] args) throws Exception { Neo4jHandler neo4jHandler = Neo4jHandler.getInstance("/media/disk/master/Noe4j/reuters"); ReutersDataset datasetHandler = new ReutersDataset("/media/disk/master/Noe4j/datasets/reuters_mod"); long startTime = System.currentTimeMillis(); SinglePass singlePassAlgorithm = new SinglePass(); double threshold = 0.1; Hashtable<String, Neo4jCluster> clusters = singlePassAlgorithm.perform(datasetHandler, neo4jHandler, threshold, 5); long endTime = System.currentTimeMillis(); FMeasure fmeasureCalculate = new FMeasure(); fmeasureCalculate.calculate(clusters, datasetHandler, neo4jHandler); System.out.println("*********************"); System.out.println("Total elapsed time in execution is :" + (endTime - startTime)); System.out.println("******* For Threshold = " + threshold); System.out.println("Fmeasure = " + fmeasureCalculate.getFmeasure()); System.out.println("Precision = " + fmeasureCalculate.getPrecision()); System.out.println("Recall = " + fmeasureCalculate.getRecall()); System.out.println("*********************"); // System.out.println("Number of documents = "+ datasetHandler.numberOfDocuments()); // Enumeration e = clusters.keys(); // while (e.hasMoreElements()) { // String clusterID = (String) e.nextElement(); // System.out.println("Cluster = " + clusterID + " has number of documents = " + // clusters.get(clusterID).getDocumentIDs().size()); // ArrayList<Neo4jDocument> documents = // clusters.get(clusterID).getDocumentsList(datasetHandler, neo4jHandler); // for (int i = 0; i < documents.size(); i++) { // // System.out.println(datasetHandler.getDocument(documents.get(i).getDocumentID()).getOrginalCluster()); // } // // System.out.println(("*********************************************************************************")); // } neo4jHandler.registerShutdownHook(); }
/** * Get similar documents to the current document by matching the nodes * * @param doc target document * @param neo4jHandler neo4j handler * @param datasetHandler dataset handler * @return arraylist of matching document * @throws IOException * @throws ClassNotFoundException */ private ArrayList<Neo4jDocument> getSimilarDocuments( Neo4jDocument doc, Neo4jHandler neo4jHandler, DatasetLoader datasetHandler) throws IOException, ClassNotFoundException { ArrayList<Neo4jDocument> similarDocument = new ArrayList<Neo4jDocument>(); Hashtable<String, Neo4jDocument> similarDocumentHash = new Hashtable<String, Neo4jDocument>(); Neo4jDocument neo4jDocument = doc; ArrayList<Neo4jNode> nodes = neo4jDocument.getNodesList(); for (Iterator iterator = nodes.iterator(); iterator.hasNext(); ) { Neo4jNode neo4jNode = (Neo4jNode) iterator.next(); Hashtable<String, ArrayList<String>> documentTable = neo4jNode.getDocumentTable(); Enumeration e = documentTable.keys(); while (e.hasMoreElements()) { String simDocumentID = (String) e.nextElement(); if (!simDocumentID.equalsIgnoreCase(doc.getDocumentID()) && !similarDocumentHash.containsKey(simDocumentID)) { Document d = datasetHandler.getDocument(simDocumentID); Neo4jDocument nd = neo4jHandler.loadDocument(d); similarDocumentHash.put(simDocumentID, nd); similarDocument.add(nd); } // end if } // end looping for document at document table in the current node } // end looping for the node return similarDocument; }
public Hashtable<String, Neo4jCluster> perform( DatasetLoader datasetHandler, Neo4jHandler neo4jHandler, double similairtyThreshold, int softClusteringThreshold) throws Exception { Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>(); Hashtable<String, Document> docsHash = datasetHandler.loadDocuments(); DDSimIF similarityCalculator = new DDSimilairty(); Enumeration e = docsHash.keys(); int numberOfClusters = 0; // loop for documents in the dataset while (e.hasMoreElements()) { Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>(); String documentID = (String) e.nextElement(); System.out.println("Processing document " + documentID); Document document = docsHash.get(documentID); Neo4jDocument neo4jDocument = neo4jHandler.loadDocument(document); boolean clusteredYet = false; // get similar documents to the document ArrayList<Neo4jDocument> similarDocuments = getSimilarDocuments(neo4jDocument, neo4jHandler, datasetHandler); // loop over the similar documents for (Iterator iterator = similarDocuments.iterator(); iterator.hasNext(); ) { Neo4jDocument neo4jSimilarDocument = (Neo4jDocument) iterator.next(); // continue if the current similar document has no clusters if (neo4jSimilarDocument.getClustersHash().isEmpty()) continue; // check if the distance to the document is greater than the threshold if (similarityCalculator.calculateSimilarity( neo4jDocument, neo4jSimilarDocument, datasetHandler.numberOfDocuments()) > similairtyThreshold) { // get the clusters of the similar document ArrayList<String> candidateDocumentClustersIDs = neo4jSimilarDocument.getClusterIDsList(); // loop over the clusters of the similar document for (Iterator iterator2 = candidateDocumentClustersIDs.iterator(); iterator2.hasNext(); ) { // loop for candidate clusters String candidateClusterID = (String) iterator2.next(); // get the cluster Neo4jCluster candidateNeo4jCluster = clustersList.get(candidateClusterID); // calculate the average similarity to the cluster double averageSimilairtyToCluster = calculateAvgSimilairtyToCluster( neo4jDocument, candidateNeo4jCluster, datasetHandler, neo4jHandler); // if the average similarity greater the the threshold then the cluster to the candidate // clusters if (averageSimilairtyToCluster > similairtyThreshold) { clusteredYet = true; candidateClustersHash.put(candidateClusterID, averageSimilairtyToCluster); } // end if adding cluster to candidate cluster hash } // end loop for candidate clusters } // end if [checking if the distance to the candidate document is less than the threshold] } // end looping for similar documents if (!clusteredYet) { // create new cluster numberOfClusters++; Neo4jCluster newCluster = new Neo4jCluster(String.valueOf(numberOfClusters)); newCluster.addDcoument(documentID); neo4jDocument.addCluster(newCluster.getId(), 1); clustersList.put(newCluster.getId(), newCluster); } else { // add to the cloeset cluster String nearestClusterID = getNearestCluster(candidateClustersHash); Neo4jCluster cluster = clustersList.get(nearestClusterID); cluster.addDcoument(neo4jDocument.getDocumentID()); neo4jDocument.addCluster(nearestClusterID, 1); } } // end loop for all documents in the data set return clustersList; }