/** * If there are unclustered documents, appends the "Other Topics" group to the <code>clusters * </code>. * * @see #buildOtherTopics(List, List, String) */ public static void appendOtherTopics( List<Document> allDocuments, List<Cluster> clusters, String label) { final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label); if (!otherTopics.getDocuments().isEmpty()) { clusters.add(otherTopics); } }
/** * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters). * If a cluster already has an identifier, the identifier will not be changed. * * @param clusters Clusters to assign identifiers to. * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers */ public static void assignClusterIds(Collection<Cluster> clusters) { final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size()); flatten(flattened, clusters); synchronized (clusters) { final HashSet<Integer> ids = Sets.newHashSet(); // First, find the start value for the id and check uniqueness of the ids // already provided. int maxId = Integer.MIN_VALUE; for (final Cluster cluster : flattened) { if (cluster.id != null) { if (!ids.add(cluster.id)) { throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id); } maxId = Math.max(maxId, cluster.id); } } // We'd rather start with 0 maxId = Math.max(maxId, -1); // Assign missing ids for (final Cluster c : flattened) { if (c.id == null) { c.id = ++maxId; } } } }
/* * Recursive descent into subclusters. */ private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) { for (Cluster c : clusters) { flattened.add(c); final List<Cluster> subclusters = c.getSubclusters(); if (!subclusters.isEmpty()) { flatten(flattened, subclusters); } } }
/** Replace document refids with the actual references upon deserialization. */ private void documentIdToReference(Cluster cluster, Map<String, Document> documents) { if (cluster.documentIds != null) { for (Cluster.DocumentRefid documentRefid : cluster.documentIds) { cluster.addDocuments(documents.get(documentRefid.refid)); } } for (Cluster subcluster : cluster.getSubclusters()) { documentIdToReference(subcluster, documents); } }
/** A recursive routine for collecting unique documents from this cluster and subclusters. */ private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) { if (cluster == null) { return docs; } docs.addAll(cluster.getDocuments()); final List<Cluster> subclusters = cluster.getSubclusters(); for (final Cluster subcluster : subclusters) { collectAllDocuments(subcluster, docs); } return docs; }
/** * Builds an "Other Topics" cluster that groups those documents from <code>allDocument</code> that * were not referenced in any cluster in <code>clusters</code>. * * @param allDocuments all documents to check against * @param clusters list of clusters with assigned documents * @param label label for the "Other Topics" group * @return the "Other Topics" cluster */ public static Cluster buildOtherTopics( List<Document> allDocuments, List<Cluster> clusters, String label) { final Set<Document> unclusteredDocuments = Sets.newLinkedHashSet(allDocuments); final Set<Document> assignedDocuments = Sets.newHashSet(); for (Cluster cluster : clusters) { collectAllDocuments(cluster, assignedDocuments); } unclusteredDocuments.removeAll(assignedDocuments); final Cluster otherTopics = new Cluster(label); otherTopics.addDocuments(unclusteredDocuments); otherTopics.setOtherTopics(true); return otherTopics; }
/** * Locate the first cluster that has id equal to <code>id</code>. The search includes all the * clusters in the input and their sub-clusters. The first cluster with matching identifier is * returned or <code>null</code> if no such cluster could be found. */ public static Cluster find(int id, Collection<Cluster> clusters) { for (Cluster c : clusters) { if (c != null) { if (c.id != null && c.id == id) { return c; } if (!c.getSubclusters().isEmpty()) { final Cluster sub = find(id, c.getSubclusters()); if (sub != null) { return sub; } } } } return null; }
/** * Creates a {@link ProcessingResult} with the provided <code>attributes</code>. Assigns unique * document identifiers if documents are present in the <code>attributes</code> map (under the key * {@link AttributeNames#DOCUMENTS}). */ @SuppressWarnings("unchecked") ProcessingResult(Map<String, Object> attributes) { this.attributes = attributes; // Replace a modifiable collection of documents with an unmodifiable one final List<Document> documents = (List<Document>) attributes.get(AttributeNames.DOCUMENTS); if (documents != null) { Document.assignDocumentIds(documents); attributes.put(AttributeNames.DOCUMENTS, Collections.unmodifiableList(documents)); } // Replace a modifiable collection of clusters with an unmodifiable one final List<Cluster> clusters = (List<Cluster>) attributes.get(AttributeNames.CLUSTERS); if (clusters != null) { Cluster.assignClusterIds(clusters); attributes.put(AttributeNames.CLUSTERS, Collections.unmodifiableList(clusters)); } // Store a reference to attributes as an unmodifiable map this.attributesView = Collections.unmodifiableMap(attributes); }
public Double apply(Cluster cluster) { return cluster.isOtherTopics() ? 1.0 : -1.0; }
public String apply(Cluster cluster) { return cluster.getLabel(); }
public Double apply(Cluster cluster) { return cluster.getAttribute(SCORE); }
public Integer apply(Cluster cluster) { return cluster.size(); }