Beispiel #1
0
  /**
   * Runs single-pass clustering over the Reuters dataset and prints the elapsed time together with
   * the F-measure, precision and recall of the resulting clusters.
   *
   * <p>Only the clustering step is timed (in milliseconds); the quality figures are computed after
   * the clock has been stopped.
   *
   * @param args command-line arguments; not read
   * @throws Exception propagated from the clustering or evaluation steps
   */
  public static void main(String[] args) throws Exception {
    final String banner = "*********************";

    Neo4jHandler graphHandler = Neo4jHandler.getInstance("/media/disk/master/Noe4j/reuters");
    ReutersDataset reuters = new ReutersDataset("/media/disk/master/Noe4j/datasets/reuters_mod");

    double threshold = 0.1;
    int softClusteringThreshold = 5;

    // Timed section: constructing the algorithm and running it.
    long startedAt = System.currentTimeMillis();
    Hashtable<String, Neo4jCluster> clusters =
        new SinglePass().perform(reuters, graphHandler, threshold, softClusteringThreshold);
    long finishedAt = System.currentTimeMillis();
    long elapsedMillis = finishedAt - startedAt;

    // Evaluate the clustering against the dataset.
    FMeasure evaluation = new FMeasure();
    evaluation.calculate(clusters, reuters, graphHandler);

    System.out.println(banner);
    System.out.println("Total elapsed time in execution  is :" + elapsedMillis);

    System.out.println("******* For Threshold = " + threshold);
    System.out.println("Fmeasure = " + evaluation.getFmeasure());
    System.out.println("Precision = " + evaluation.getPrecision());
    System.out.println("Recall = " + evaluation.getRecall());
    System.out.println(banner);

    graphHandler.registerShutdownHook();
  }
Beispiel #2
0
 /**
  * Collects the documents that share at least one node with the given document.
  *
  * <p>For every node of {@code doc}, the document IDs keyed in that node's document table are
  * visited. An ID is skipped when it equals the ID of {@code doc} (compared case-insensitively) or
  * when it has already been collected; otherwise the document is looked up in the dataset, loaded
  * through the Neo4j handler and appended to the result. Each ID therefore yields at most one
  * entry.
  *
  * @param doc target document
  * @param neo4jHandler neo4j handler used to load each matching document
  * @param datasetHandler dataset handler used to look documents up by ID
  * @return list of matching documents, in the order they were first encountered
  * @throws IOException declared by this method; presumably raised by the lookup/load calls
  * @throws ClassNotFoundException declared by this method; presumably raised by the lookup/load
  *     calls
  */
 private ArrayList<Neo4jDocument> getSimilarDocuments(
     Neo4jDocument doc, Neo4jHandler neo4jHandler, DatasetLoader datasetHandler)
     throws IOException, ClassNotFoundException {
   ArrayList<Neo4jDocument> similarDocuments = new ArrayList<Neo4jDocument>();
   // IDs already collected, so a document reachable through several nodes is added only once.
   // Hashtable rejects null values, so a null result from loadDocument fails here instead of
   // leaking into the returned list.
   Hashtable<String, Neo4jDocument> loadedById = new Hashtable<String, Neo4jDocument>();
   String targetID = doc.getDocumentID();

   for (Neo4jNode node : doc.getNodesList()) {
     Hashtable<String, ArrayList<String>> documentTable = node.getDocumentTable();
     // Enumeration (rather than keySet iteration) keeps the original traversal semantics.
     Enumeration<String> candidateIDs = documentTable.keys();
     while (candidateIDs.hasMoreElements()) {
       String candidateID = candidateIDs.nextElement();
       if (candidateID.equalsIgnoreCase(targetID) || loadedById.containsKey(candidateID)) {
         continue; // the target itself, or a document that was collected earlier
       }
       Document candidate = datasetHandler.getDocument(candidateID);
       Neo4jDocument loaded = neo4jHandler.loadDocument(candidate);
       loadedById.put(candidateID, loaded);
       similarDocuments.add(loaded);
     }
   }
   return similarDocuments;
 }
Beispiel #3
0
  /**
   * Clusters every document of the dataset in a single pass.
   *
   * <p>For each document, the documents sharing nodes with it are fetched. Every such neighbour
   * that already belongs to a cluster and whose pairwise similarity exceeds {@code
   * similairtyThreshold} contributes its clusters as candidates. A candidate cluster is kept when
   * the document's average similarity to it also exceeds the threshold. The document then joins
   * the cluster chosen by {@code getNearestCluster}, or starts a new cluster (numbered from 1)
   * when no candidate qualified.
   *
   * @param datasetHandler source of the documents and of the total document count
   * @param neo4jHandler handler used to load documents
   * @param similairtyThreshold value that both the pairwise and the average cluster similarity
   *     must strictly exceed
   * @param softClusteringThreshold not read by this method
   * @return the clusters created, keyed by cluster ID
   * @throws Exception propagated from loading documents or computing similarities
   */
  public Hashtable<String, Neo4jCluster> perform(
      DatasetLoader datasetHandler,
      Neo4jHandler neo4jHandler,
      double similairtyThreshold,
      int softClusteringThreshold)
      throws Exception {
    Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>();
    Hashtable<String, Document> docsHash = datasetHandler.loadDocuments();
    DDSimIF similarityCalculator = new DDSimilairty();
    int numberOfClusters = 0;

    // loop over the documents in the dataset
    Enumeration<String> documentIDs = docsHash.keys();
    while (documentIDs.hasMoreElements()) {
      String documentID = documentIDs.nextElement();
      System.out.println("Processing document " + documentID);
      Neo4jDocument neo4jDocument = neo4jHandler.loadDocument(docsHash.get(documentID));

      // clusters whose average similarity to this document exceeds the threshold
      Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>();
      // Every cluster already scored for this document. Several neighbours usually share a
      // cluster, and neither the document nor any cluster changes until this loop is over, so one
      // score per cluster is enough (assumes calculateAvgSimilairtyToCluster is deterministic
      // for fixed arguments).
      Hashtable<String, Double> scoredClusters = new Hashtable<String, Double>();

      for (Neo4jDocument neo4jSimilarDocument :
          getSimilarDocuments(neo4jDocument, neo4jHandler, datasetHandler)) {
        // a neighbour that has not been clustered yet offers no candidate clusters
        if (neo4jSimilarDocument.getClustersHash().isEmpty()) {
          continue;
        }
        // only neighbours whose similarity is strictly greater than the threshold count
        boolean similarEnough =
            similarityCalculator.calculateSimilarity(
                    neo4jDocument, neo4jSimilarDocument, datasetHandler.numberOfDocuments())
                > similairtyThreshold;
        if (!similarEnough) {
          continue;
        }
        // loop over the clusters of the similar document
        for (String candidateClusterID : neo4jSimilarDocument.getClusterIDsList()) {
          if (scoredClusters.containsKey(candidateClusterID)) {
            continue; // already evaluated through an earlier neighbour
          }
          Neo4jCluster candidateNeo4jCluster = clustersList.get(candidateClusterID);
          double averageSimilairtyToCluster =
              calculateAvgSimilairtyToCluster(
                  neo4jDocument, candidateNeo4jCluster, datasetHandler, neo4jHandler);
          scoredClusters.put(candidateClusterID, averageSimilairtyToCluster);
          // keep the cluster as a candidate when the average similarity exceeds the threshold
          if (averageSimilairtyToCluster > similairtyThreshold) {
            candidateClustersHash.put(candidateClusterID, averageSimilairtyToCluster);
          }
        } // end loop over candidate clusters
      } // end loop over similar documents

      if (candidateClustersHash.isEmpty()) { // no cluster qualified: create a new one
        numberOfClusters++;
        Neo4jCluster newCluster = new Neo4jCluster(String.valueOf(numberOfClusters));
        newCluster.addDcoument(documentID);
        neo4jDocument.addCluster(newCluster.getId(), 1);
        clustersList.put(newCluster.getId(), newCluster);
      } else { // add to the closest cluster
        String nearestClusterID = getNearestCluster(candidateClustersHash);
        Neo4jCluster cluster = clustersList.get(nearestClusterID);
        cluster.addDcoument(neo4jDocument.getDocumentID());
        neo4jDocument.addCluster(nearestClusterID, 1);
      }
    } // end loop over all documents in the dataset

    return clustersList;
  }