/** Replace document refids with the actual references upon deserialization. */
  private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
    if (cluster.documentIds != null) {
      for (Cluster.DocumentRefid documentRefid : cluster.documentIds) {
        cluster.addDocuments(documents.get(documentRefid.refid));
      }
    }

    for (Cluster subcluster : cluster.getSubclusters()) {
      documentIdToReference(subcluster, documents);
    }
  }
  private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue, Document document) {
    if (fieldValue == null) {
      return;
    }

    Cluster cluster = clusters.get(fieldValue);
    if (cluster == null) {
      cluster = new Cluster();
      cluster.addPhrases(buildClusterLabel(fieldValue));
      clusters.put(fieldValue, cluster);
    }

    cluster.addDocuments(document);
  }
Exemple #3
0
  /**
   * Builds an "Other Topics" cluster that groups those documents from <code>allDocument</code> that
   * were not referenced in any cluster in <code>clusters</code>.
   *
   * @param allDocuments all documents to check against
   * @param clusters list of clusters with assigned documents
   * @param label label for the "Other Topics" group
   * @return the "Other Topics" cluster
   */
  public static Cluster buildOtherTopics(
      List<Document> allDocuments, List<Cluster> clusters, String label) {
    final Set<Document> unclusteredDocuments = Sets.newLinkedHashSet(allDocuments);
    final Set<Document> assignedDocuments = Sets.newHashSet();

    for (Cluster cluster : clusters) {
      collectAllDocuments(cluster, assignedDocuments);
    }

    unclusteredDocuments.removeAll(assignedDocuments);

    final Cluster otherTopics = new Cluster(label);
    otherTopics.addDocuments(unclusteredDocuments);
    otherTopics.setOtherTopics(true);

    return otherTopics;
  }