/**
 * Builds the "Other Topics" group for the given input and, when that group contains at least one
 * document, adds it at the end of <code>clusters</code>. When every document is already
 * referenced by some cluster, <code>clusters</code> is left untouched.
 *
 * @param allDocuments all documents to check against
 * @param clusters clusters with assigned documents; modified in place when a group is appended
 * @param label label for the "Other Topics" group
 * @see #buildOtherTopics(List, List, String)
 */
public static void appendOtherTopics(
    List<Document> allDocuments, List<Cluster> clusters, String label) {
  final Cluster unclustered = buildOtherTopics(allDocuments, clusters, label);

  // Nothing left over: do not pollute the result with an empty group.
  if (unclustered.getDocuments().isEmpty()) {
    return;
  }

  clusters.add(unclustered);
}
/** * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters). * If a cluster already has an identifier, the identifier will not be changed. * * @param clusters Clusters to assign identifiers to. * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers */ public static void assignClusterIds(Collection<Cluster> clusters) { final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size()); flatten(flattened, clusters); synchronized (clusters) { final HashSet<Integer> ids = Sets.newHashSet(); // First, find the start value for the id and check uniqueness of the ids // already provided. int maxId = Integer.MIN_VALUE; for (final Cluster cluster : flattened) { if (cluster.id != null) { if (!ids.add(cluster.id)) { throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id); } maxId = Math.max(maxId, cluster.id); } } // We'd rather start with 0 maxId = Math.max(maxId, -1); // Assign missing ids for (final Cluster c : flattened) { if (c.id == null) { c.id = ++maxId; } } } }
/**
 * Appends each cluster from <code>clusters</code> to <code>flattened</code>, immediately followed
 * by all of its sub-clusters (recursively), so that a parent always precedes its descendants.
 *
 * @param flattened output list receiving the clusters
 * @param clusters clusters to descend into
 */
private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) {
  for (final Cluster current : clusters) {
    flattened.add(current);

    final List<Cluster> children = current.getSubclusters();
    if (children.isEmpty()) {
      continue;
    }

    // Depth-first: finish this cluster's subtree before moving on to its sibling.
    flatten(flattened, children);
  }
}
/**
 * Recursively gathers the documents of <code>cluster</code> and of all its sub-clusters into
 * <code>docs</code>. Because the target is a set, each document is collected once.
 *
 * @param cluster cluster to collect from; <code>null</code> is accepted and contributes nothing
 * @param docs set the documents are added to
 * @return the same <code>docs</code> instance that was passed in
 */
private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
  if (cluster != null) {
    docs.addAll(cluster.getDocuments());

    for (final Cluster child : cluster.getSubclusters()) {
      collectAllDocuments(child, docs);
    }
  }
  return docs;
}
/**
 * Creates an "Other Topics" cluster holding those documents from <code>allDocuments</code> that
 * are not referenced by any cluster (or sub-cluster) in <code>clusters</code>. The documents keep
 * the relative order they have in <code>allDocuments</code>. The returned cluster may be empty.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(
    List<Document> allDocuments, List<Cluster> clusters, String label) {
  // Everything referenced anywhere in the cluster hierarchy.
  final Set<Document> clustered = Sets.newHashSet();
  for (final Cluster topLevel : clusters) {
    collectAllDocuments(topLevel, clustered);
  }

  // Linked set preserves the input order of the remaining documents.
  final Set<Document> leftovers = Sets.newLinkedHashSet(allDocuments);
  leftovers.removeAll(clustered);

  final Cluster result = new Cluster(label);
  result.addDocuments(leftovers);
  result.setOtherTopics(true);
  return result;
}
/**
 * Searches <code>clusters</code> and, depth-first, their sub-clusters for a cluster whose
 * identifier equals <code>id</code>. <code>null</code> entries and clusters without an identifier
 * are never matched, but sub-clusters of the latter are still searched.
 *
 * @param id identifier to look for
 * @param clusters clusters to search
 * @return the first matching cluster in traversal order, or <code>null</code> if no such cluster
 *     could be found
 */
public static Cluster find(int id, Collection<Cluster> clusters) {
  for (final Cluster candidate : clusters) {
    if (candidate == null) {
      continue;
    }

    if (candidate.id != null && candidate.id == id) {
      return candidate;
    }

    final List<Cluster> children = candidate.getSubclusters();
    if (children.isEmpty()) {
      continue;
    }

    final Cluster nestedMatch = find(id, children);
    if (nestedMatch != null) {
      return nestedMatch;
    }
  }
  return null;
}
/**
 * Maps a cluster to a numeric indicator: <code>1.0</code> when the cluster is flagged as "Other
 * Topics", <code>-1.0</code> otherwise.
 */
// NOTE(review): presumably used as a sort key that pushes "Other Topics" to the end; confirm
// against the enclosing declaration.
public Double apply(Cluster cluster) {
  if (cluster.isOtherTopics()) {
    return 1.0;
  } else {
    return -1.0;
  }
}
/** Maps a cluster to the value returned by its <code>getLabel()</code> method. */
public String apply(Cluster cluster) {
  final String label = cluster.getLabel();
  return label;
}
/** Maps a cluster to the value of its attribute stored under the <code>SCORE</code> key. */
// NOTE(review): the result is presumably null when no score was set; confirm against
// getAttribute.
public Double apply(Cluster cluster) {
  final Double score = cluster.getAttribute(SCORE);
  return score;
}
/** Maps a cluster to the value reported by its <code>size()</code> method. */
public Integer apply(Cluster cluster) {
  final Integer clusterSize = cluster.size();
  return clusterSize;
}