Beispiel #1
0
 /**
  * Collects the documents that share at least one node with the given document.
  *
  * <p>Every node of {@code doc} is visited and the keys of its document table are treated as
  * candidate document IDs. A candidate is fetched through {@code datasetHandler} and loaded
  * through {@code neo4jHandler} the first time its ID is seen. The ID of {@code doc} itself is
  * skipped using a case-insensitive comparison.
  *
  * @param doc document whose neighbours are looked up
  * @param neo4jHandler handler used to load the Neo4j view of each candidate
  * @param datasetHandler dataset loader used to fetch each candidate by its ID
  * @return the loaded candidates in the order they were first encountered; empty when none
  * @throws IOException propagated from the calls made while collecting the candidates
  * @throws ClassNotFoundException propagated from the calls made while collecting the candidates
  */
 private ArrayList<Neo4jDocument> getSimilarDocuments(
     Neo4jDocument doc, Neo4jHandler neo4jHandler, DatasetLoader datasetHandler)
     throws IOException, ClassNotFoundException {
   ArrayList<Neo4jDocument> matches = new ArrayList<Neo4jDocument>();
   // Remembers which document IDs were already loaded so each one is added only once.
   Hashtable<String, Neo4jDocument> loadedById = new Hashtable<String, Neo4jDocument>();

   ArrayList<Neo4jNode> nodes = doc.getNodesList();
   for (Neo4jNode node : nodes) {
     Hashtable<String, ArrayList<String>> documentTable = node.getDocumentTable();
     for (Enumeration<String> ids = documentTable.keys(); ids.hasMoreElements(); ) {
       String candidateId = ids.nextElement();
       // Skip the target document itself and anything that has already been collected.
       if (candidateId.equalsIgnoreCase(doc.getDocumentID())
           || loadedById.containsKey(candidateId)) {
         continue;
       }
       Document source = datasetHandler.getDocument(candidateId);
       Neo4jDocument candidate = neo4jHandler.loadDocument(source);
       loadedById.put(candidateId, candidate);
       matches.add(candidate);
     }
   }
   return matches;
 }
Beispiel #2
0
  /**
   * Assigns every document of the dataset to a cluster in a single pass.
   *
   * <p>Documents are processed in the enumeration order of the keys returned by {@code
   * datasetHandler.loadDocuments()}. For each document, the documents found by {@code
   * getSimilarDocuments} are examined. A similar document that already belongs to at least one
   * cluster, and whose similarity to the current document exceeds {@code similairtyThreshold},
   * contributes its clusters. Each contributed cluster becomes a candidate when the average
   * similarity of the current document to that cluster also exceeds the threshold.
   *
   * <p>When there is no candidate, a new cluster is created whose ID is the next value of a
   * counter starting at 1. Otherwise the document is added to the cluster chosen by {@code
   * getNearestCluster}.
   *
   * @param datasetHandler source of the documents and of the total document count
   * @param neo4jHandler handler used to load the Neo4j view of each document
   * @param similairtyThreshold value a similarity must strictly exceed to count as a match
   * @param softClusteringThreshold not read anywhere in this method
   * @return the clusters created during the pass, keyed by cluster ID
   * @throws Exception propagated from the dataset, handler or similarity calls
   */
  public Hashtable<String, Neo4jCluster> perform(
      DatasetLoader datasetHandler,
      Neo4jHandler neo4jHandler,
      double similairtyThreshold,
      int softClusteringThreshold)
      throws Exception {
    Hashtable<String, Neo4jCluster> clustersList = new Hashtable<String, Neo4jCluster>();
    Hashtable<String, Document> docsHash = datasetHandler.loadDocuments();
    DDSimIF similarityCalculator = new DDSimilairty();
    int numberOfClusters = 0;

    for (Enumeration<String> ids = docsHash.keys(); ids.hasMoreElements(); ) {
      String documentID = ids.nextElement();
      System.out.println("Processing document " + documentID);

      // Clusters that are close enough to the current document, with their average similarity.
      Hashtable<String, Double> candidateClustersHash = new Hashtable<String, Double>();
      Document document = docsHash.get(documentID);
      Neo4jDocument current = neo4jHandler.loadDocument(document);

      ArrayList<Neo4jDocument> neighbours =
          getSimilarDocuments(current, neo4jHandler, datasetHandler);
      for (Neo4jDocument neighbour : neighbours) {
        // A neighbour that has not been clustered has no clusters to offer.
        if (neighbour.getClustersHash().isEmpty()) {
          continue;
        }
        // Only neighbours whose similarity strictly exceeds the threshold are considered.
        boolean similarEnough =
            similarityCalculator.calculateSimilarity(
                    current, neighbour, datasetHandler.numberOfDocuments())
                > similairtyThreshold;
        if (!similarEnough) {
          continue;
        }

        ArrayList<String> neighbourClusterIds = neighbour.getClusterIDsList();
        for (String clusterId : neighbourClusterIds) {
          Neo4jCluster neighbourCluster = clustersList.get(clusterId);
          double averageSimilarity =
              calculateAvgSimilairtyToCluster(
                  current, neighbourCluster, datasetHandler, neo4jHandler);
          // Keep the cluster as a candidate only when the average similarity beats the threshold.
          if (averageSimilarity > similairtyThreshold) {
            candidateClustersHash.put(clusterId, averageSimilarity);
          }
        }
      }

      if (candidateClustersHash.isEmpty()) {
        // No suitable cluster was found: open a new one holding just this document.
        numberOfClusters++;
        Neo4jCluster freshCluster = new Neo4jCluster(String.valueOf(numberOfClusters));
        freshCluster.addDcoument(documentID);
        current.addCluster(freshCluster.getId(), 1);
        clustersList.put(freshCluster.getId(), freshCluster);
      } else {
        // Join the candidate cluster selected by getNearestCluster.
        String nearestClusterID = getNearestCluster(candidateClustersHash);
        Neo4jCluster nearest = clustersList.get(nearestClusterID);
        nearest.addDcoument(current.getDocumentID());
        current.addCluster(nearestClusterID, 1);
      }
    }

    return clustersList;
  }