/** * Creates a {@link ProcessingResult} with the provided <code>attributes</code>. Assigns unique * document identifiers if documents are present in the <code>attributes</code> map (under the key * {@link AttributeNames#DOCUMENTS}). */ @SuppressWarnings("unchecked") ProcessingResult(Map<String, Object> attributes) { this.attributes = attributes; // Replace a modifiable collection of documents with an unmodifiable one final List<Document> documents = (List<Document>) attributes.get(AttributeNames.DOCUMENTS); if (documents != null) { Document.assignDocumentIds(documents); attributes.put(AttributeNames.DOCUMENTS, Collections.unmodifiableList(documents)); } // Replace a modifiable collection of clusters with an unmodifiable one final List<Cluster> clusters = (List<Cluster>) attributes.get(AttributeNames.CLUSTERS); if (clusters != null) { Cluster.assignClusterIds(clusters); attributes.put(AttributeNames.CLUSTERS, Collections.unmodifiableList(clusters)); } // Store a reference to attributes as an unmodifiable map this.attributesView = Collections.unmodifiableMap(attributes); }
/** Transfers document and cluster lists to the attributes map after deserialization. */ @Commit private void afterDeserialization() throws Exception { if (otherAttributesForSerialization != null) { attributes = SimpleXmlWrappers.unwrap(otherAttributesForSerialization); } attributesView = Collections.unmodifiableMap(attributes); attributes.put(AttributeNames.QUERY, query != null ? query.trim() : null); attributes.put(AttributeNames.DOCUMENTS, documents); attributes.put(AttributeNames.CLUSTERS, clusters); // Convert document ids to the actual references if (clusters != null && documents != null) { final Map<String, Document> documentsById = Maps.newHashMap(); for (Document document : documents) { documentsById.put(document.getStringId(), document); } for (Cluster cluster : clusters) { documentIdToReference(cluster, documentsById); } } }
/** * A cluster (group) of {@link Document}s. Each cluster has a human-readable label consisting of one * or more phrases, a list of documents it contains and a list of its subclusters. Optionally, * additional attributes can be associated with a cluster, e.g. {@link #OTHER_TOPICS}. This class is * <strong>not</strong> thread-safe. */ @Root(name = "group", strict = false) @JsonAutoDetect(JsonMethod.NONE) @JsonWriteNullProperties(false) public final class Cluster { /** * Indicates that the cluster is an <i>Other Topics</i> cluster. Such a cluster contains documents * that remain unclustered at given level of cluster hierarchy. * * <p>Type of this attribute is {@link Boolean}. * * @see #setAttribute(String, Object) * @see #getAttribute(String) */ public static final String OTHER_TOPICS = "other-topics"; /** Default label for the <i>Other Topics</i> cluster. */ static final String OTHER_TOPICS_LABEL = "Other Topics"; /** * Score of this cluster that indicates the clustering algorithm's beliefs on the quality of this * cluster. The exact semantics of the score varies across algorithms. * * <p>Type of this attribute is {@link Double}. * * @see #setAttribute(String, Object) * @see #getAttribute(String) */ public static final String SCORE = "score"; /** @see #getId() */ @Attribute(required = false) Integer id; /** Phrases describing this cluster. */ @ElementList(required = false, name = "title", entry = "phrase") private ArrayList<String> phrases = new ArrayList<String>(); /** A read-only list of phrases exposed in {@link #getPhrases()}. */ private List<String> phrasesView = Collections.unmodifiableList(phrases); /** Subclusters of this cluster. */ @ElementList(required = false, inline = true) private ArrayList<Cluster> subclusters = new ArrayList<Cluster>(); /** A read-only list of subclusters exposed in {@link #getSubclusters()}. */ private List<Cluster> subclustersView = Collections.unmodifiableList(subclusters); /** Documents contained in this cluster. */ private final ArrayList<Document> documents = new ArrayList<Document>(); /** A read-only list of this cluster's document exposed in {@link #getDocuments()}. */ private final List<Document> documentsView = Collections.unmodifiableList(documents); /** Attributes of this cluster. */ private Map<String, Object> attributes = new HashMap<String, Object>(); /** A Read-only view of the attributes of this cluster. */ private Map<String, Object> attributesView = Collections.unmodifiableMap(attributes); /** Cached concatenated label */ private String labelCache = null; /** Cached list of documents from this cluster and subclusters */ private List<Document> allDocuments; /** Attributes of this cluster for serialization/ deserialization purposes. */ @ElementMap(entry = "attribute", key = "key", attribute = true, inline = true, required = false) private HashMap<String, SimpleXmlWrapperValue> otherAttributesForSerialization; /** List of document ids used for serialization/ deserialization purposes. */ @ElementList(required = false, inline = true) List<DocumentRefid> documentIds; /** A helper class for serialization/ deserialization of documents with refids. */ @Root(name = "document") static class DocumentRefid { @Attribute Integer refid; DocumentRefid() {} DocumentRefid(Integer refid) { this.refid = refid; } } /** Creates a {@link Cluster} with an empty label, no documents and no subclusters. */ public Cluster() {} /** * Creates a {@link Cluster} with the provided <code>phrase</code> to be used as the cluster's * label and <code>documents</code> contained in the cluster. * * @param phrase the phrase to form the cluster's label * @param documents documents contained in the cluster */ public Cluster(String phrase, Document... documents) { addPhrases(phrase); addDocuments(documents); } /** * Formats this cluster's label. If there is more than one phrase describing this cluster, phrases * will be separated by a comma followed by a space, e.g. "Phrase one, Phrase two". To format * multi-phrase label in a different way, use {@link #getPhrases()}. * * @return formatted label of this cluster */ public String getLabel() { if (labelCache == null) { labelCache = StringUtils.toString(phrases, ", "); } return labelCache; } /** * Returns all phrases describing this cluster. The returned list is unmodifiable. * * @return phrases describing this cluster */ @JsonProperty public List<String> getPhrases() { return phrasesView; } /** * Returns all subclusters of this cluster. The returned list is unmodifiable. * * @return subclusters of this cluster */ public List<Cluster> getSubclusters() { return subclustersView; } /** For JSON serialization only. */ @JsonProperty("clusters") @SuppressWarnings("unused") private List<Cluster> getSubclustersForSerialization() { return subclustersView.isEmpty() ? null : subclustersView; } /** * Returns all documents contained in this cluster. The returned list is unmodifiable. * * @return documents contained in this cluster */ public List<Document> getDocuments() { return documentsView; } /** * Returns all documents contained in this cluster and (recursively) all documents from this * cluster's subclusters. The returned list contains unique documents, i.e. if a document is * attached to multiple subclusters if this cluster, the document will appear only once on the * list. The documents are enumerated in breadth first order, i.e. first come documents returned * by {@link #getDocuments()} and then documents from subclusters. * * @return all documents from this cluster and its subclusters */ public List<Document> getAllDocuments() { if (allDocuments == null) { allDocuments = new ArrayList<Document>(collectAllDocuments(this, new LinkedHashSet<Document>())); } return allDocuments; } /** * Returns all documents in this cluster ordered according to the provided comparator. See {@link * Document} for common comparators, e.g. {@link Document#BY_ID_COMPARATOR} . */ public List<Document> getAllDocuments(Comparator<Document> comparator) { final List<Document> sortedDocuments = Lists.newArrayList(getAllDocuments()); Collections.sort(sortedDocuments, comparator); return sortedDocuments; } /** A recursive routine for collecting unique documents from this cluster and subclusters. */ private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) { if (cluster == null) { return docs; } docs.addAll(cluster.getDocuments()); final List<Cluster> subclusters = cluster.getSubclusters(); for (final Cluster subcluster : subclusters) { collectAllDocuments(subcluster, docs); } return docs; } /** * Adds phrases to the description of this cluster. * * @param phrases to be added to the description of this cluster * @return this cluster for convenience */ public Cluster addPhrases(String... phrases) { labelCache = null; for (final String phrase : phrases) { this.phrases.add(phrase); } return this; } /** * Adds phrases to the description of this cluster. * * @param phrases to be added to the description of this cluster * @return this cluster for convenience */ public Cluster addPhrases(Iterable<String> phrases) { labelCache = null; for (final String phrase : phrases) { this.phrases.add(phrase); } return this; } /** * Adds document to this cluster. * * @param documents to be added to this cluster * @return this cluster for convenience */ public Cluster addDocuments(Document... documents) { for (final Document document : documents) { this.documents.add(document); } allDocuments = null; return this; } /** * Adds document to this cluster. * * @param documents to be added to this cluster * @return this cluster for convenience */ public Cluster addDocuments(Iterable<Document> documents) { for (final Document document : documents) { this.documents.add(document); } allDocuments = null; return this; } /** * Adds subclusters to this cluster * * @param subclusters to be added to this cluster * @return this cluster for convenience */ public Cluster addSubclusters(Cluster... subclusters) { for (final Cluster cluster : subclusters) { this.subclusters.add(cluster); } allDocuments = null; return this; } /** * Adds subclusters to this cluster * * @param clusters to be added to this cluster * @return this cluster for convenience */ public Cluster addSubclusters(Iterable<Cluster> clusters) { for (final Cluster cluster : clusters) { this.subclusters.add(cluster); } allDocuments = null; return this; } /** Returns this cluster's {@value #SCORE} field. */ @JsonProperty @Attribute(required = false) public Double getScore() { return getAttribute(SCORE); } /** * Sets this cluster's {@link #SCORE} field. * * @param score score to set * @return this cluster for convenience */ @Attribute(required = false) public Cluster setScore(Double score) { return setAttribute(SCORE, score); } /** * Returns the attribute associated with this cluster under the provided <code>key</code>. If * there is no attribute under the provided <code>key</code>, <code>null</code> will be returned. * * @param key of the attribute * @return attribute value of <code>null</code> */ @SuppressWarnings("unchecked") public <T> T getAttribute(String key) { return (T) attributes.get(key); } /** * Associates an attribute with this cluster. * * @param key for the attribute * @param value for the attribute * @return this cluster for convenience */ public <T> Cluster setAttribute(String key, T value) { synchronized (attributes) { attributes.put(key, value); } return this; } /** * Returns all attributes of this cluster. The returned map is unmodifiable. * * @return all attributes of this cluster */ public Map<String, Object> getAttributes() { return attributesView; } /** * Returns the size of the cluster calculated as the number of unique documents it contains, * including its subclusters. * * @return size of the cluster */ public int size() { return getAllDocuments().size(); } /** For serialization only. */ @JsonProperty @Attribute(required = false) @SuppressWarnings("unused") private int getSize() { return size(); } /** Empty implementation, SimpleXML requires both a getter and a setter. */ @Attribute(required = false) @SuppressWarnings("unused") private void setSize(int size) { // We only serialize the size, hence empty implementation } /** * Internal identifier of this cluster within the {@link ProcessingResult}. This identifier is * assigned dynamically after clusters are passed to {@link ProcessingResult}. * * @see ProcessingResult */ @JsonProperty public Integer getId() { return id; } /** Returns <code>true</code> if this cluster is the {@link #OTHER_TOPICS} cluster. */ public boolean isOtherTopics() { final Boolean otherTopics = getAttribute(OTHER_TOPICS); return otherTopics != null && otherTopics.booleanValue(); } /** * Sets the {@link #OTHER_TOPICS} attribute of this cluster. * * @param isOtherTopics if <code>true</code>, this cluster will be marked as an <i>Other * Topics</i> cluster. * @return this cluster for convenience */ public Cluster setOtherTopics(boolean isOtherTopics) { return setAttribute(OTHER_TOPICS, isOtherTopics).setScore(0.0); } /** * Compares clusters by size as returned by {@link #size()}. Clusters with more documents are * larger. */ public static final Comparator<Cluster> BY_SIZE_COMPARATOR = Ordering.natural() .nullsFirst() .onResultOf( new Function<Cluster, Integer>() { public Integer apply(Cluster cluster) { return cluster.size(); } }); /** * Compares clusters by score as returned by {@link #SCORE}. Clusters with larger score are * larger. */ public static final Comparator<Cluster> BY_SCORE_COMPARATOR = Ordering.natural() .nullsFirst() .onResultOf( new Function<Cluster, Double>() { public Double apply(Cluster cluster) { return cluster.getAttribute(SCORE); } }); /** Compares clusters by the natural order of their labels as returned by {@link #getLabel()}. */ public static final Comparator<Cluster> BY_LABEL_COMPARATOR = Ordering.natural() .nullsFirst() .onResultOf( new Function<Cluster, String>() { public String apply(Cluster cluster) { return cluster.getLabel(); } }); /** * Compares clusters first by their size as returned by {@link #size()} and labels as returned by * {@link #getLabel()}. In case of equal sizes, natural order of the labels decides. * * <p><b>Please note</b>: this is a reversed comparator, so "larger" clusters end up nearer the * beginning of the list being sorted (which is usually the order in which the applications want * to display clusters). */ public static final Comparator<Cluster> BY_REVERSED_SIZE_AND_LABEL_COMPARATOR = Ordering.from(Collections.reverseOrder(BY_SIZE_COMPARATOR)).compound(BY_LABEL_COMPARATOR); /** * Compares clusters first by their size as returned by {@link #SCORE} and labels as returned by * {@link #getLabel()}. In case of equal scores, natural order of the labels decides. * * <p><b>Please note</b>: this is a reversed comparator, so "larger" clusters end up nearer the * beginning of the list being sorted (which is usually the order in which the applications want * to display clusters). */ public static final Comparator<Cluster> BY_REVERSED_SCORE_AND_LABEL_COMPARATOR = Ordering.from(Collections.reverseOrder(BY_SCORE_COMPARATOR)).compound(BY_LABEL_COMPARATOR); /** * Returns a comparator that compares clusters based on the aggregation of their size and score. * If <code>scoreWeight</code> is 0.0, the order depends only on cluster sizes. If <code> * scoreWeight</code> is 1.1, the order depends only on cluster scores. For <code>scoreWeight * </code> values between 0.0 and 1.0, the higher the <code>scoreWeight</code>, the more * contribution of cluster scores to the order. In case of a tie on the aggregated cluster size * and score, clusters are compared by the natural order of their labels. * * <p><b>Please note</b>: this is a reversed comparator, so "larger" clusters end up nearer the * beginning of the list being sorted (which is usually the order in which the applications want * to display clusters). */ public static Comparator<Cluster> byReversedWeightedScoreAndSizeComparator( final double scoreWeight) { if (scoreWeight < 0 || scoreWeight > 1) { throw new IllegalArgumentException( "Score weight must be between 0.0 (inclusive) and 1.0 (inclusive) "); } return Ordering.natural() .onResultOf( new Function<Cluster, Double>() { public Double apply(Cluster cluster) { return -Math.pow(cluster.size(), (1 - scoreWeight)) * Math.pow((Double) cluster.getAttribute(SCORE), scoreWeight); } }) .compound(BY_LABEL_COMPARATOR); } /** * A comparator that puts {@link #OTHER_TOPICS} clusters at the end of the list. In other words, * to this comparator an {@link #OTHER_TOPICS} topics cluster is "bigger" than a non-{{@link * #OTHER_TOPICS} cluster. * * <p><strong>Note:</strong> This comparator is designed for use in combination with other * comparators, such as {@link #BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}. If you only need to * partition a list of clusters into regular and other topic ones, this is better done in linear * time without resorting to {@link Collections#sort(List)}. */ public static final Comparator<Cluster> OTHER_TOPICS_AT_THE_END = Ordering.natural() .onResultOf( new Function<Cluster, Double>() { public Double apply(Cluster cluster) { return cluster.isOtherTopics() ? 1.0 : -1.0; } }); /** * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters). * If a cluster already has an identifier, the identifier will not be changed. * * @param clusters Clusters to assign identifiers to. * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers */ public static void assignClusterIds(Collection<Cluster> clusters) { final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size()); flatten(flattened, clusters); synchronized (clusters) { final HashSet<Integer> ids = Sets.newHashSet(); // First, find the start value for the id and check uniqueness of the ids // already provided. int maxId = Integer.MIN_VALUE; for (final Cluster cluster : flattened) { if (cluster.id != null) { if (!ids.add(cluster.id)) { throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id); } maxId = Math.max(maxId, cluster.id); } } // We'd rather start with 0 maxId = Math.max(maxId, -1); // Assign missing ids for (final Cluster c : flattened) { if (c.id == null) { c.id = ++maxId; } } } } /* * Recursive descent into subclusters. */ private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) { for (Cluster c : clusters) { flattened.add(c); final List<Cluster> subclusters = c.getSubclusters(); if (!subclusters.isEmpty()) { flatten(flattened, subclusters); } } } /** * Locate the first cluster that has id equal to <code>id</code>. The search includes all the * clusters in the input and their sub-clusters. The first cluster with matching identifier is * returned or <code>null</code> if no such cluster could be found. */ public static Cluster find(int id, Collection<Cluster> clusters) { for (Cluster c : clusters) { if (c != null) { if (c.id != null && c.id == id) { return c; } if (!c.getSubclusters().isEmpty()) { final Cluster sub = find(id, c.getSubclusters()); if (sub != null) { return sub; } } } } return null; } /** * Builds an "Other Topics" cluster that groups those documents from <code>allDocument</code> that * were not referenced in any cluster in <code>clusters</code>. * * @param allDocuments all documents to check against * @param clusters list of clusters with assigned documents * @return the "Other Topics" cluster */ public static Cluster buildOtherTopics(List<Document> allDocuments, List<Cluster> clusters) { return buildOtherTopics(allDocuments, clusters, OTHER_TOPICS_LABEL); } /** * Builds an "Other Topics" cluster that groups those documents from <code>allDocument</code> that * were not referenced in any cluster in <code>clusters</code>. * * @param allDocuments all documents to check against * @param clusters list of clusters with assigned documents * @param label label for the "Other Topics" group * @return the "Other Topics" cluster */ public static Cluster buildOtherTopics( List<Document> allDocuments, List<Cluster> clusters, String label) { final Set<Document> unclusteredDocuments = Sets.newLinkedHashSet(allDocuments); final Set<Document> assignedDocuments = Sets.newHashSet(); for (Cluster cluster : clusters) { collectAllDocuments(cluster, assignedDocuments); } unclusteredDocuments.removeAll(assignedDocuments); final Cluster otherTopics = new Cluster(label); otherTopics.addDocuments(unclusteredDocuments); otherTopics.setOtherTopics(true); return otherTopics; } /** * If there are unclustered documents, appends the "Other Topics" group to the <code>clusters * </code>. * * @see #buildOtherTopics(List, List) */ public static void appendOtherTopics(List<Document> allDocuments, List<Cluster> clusters) { appendOtherTopics(allDocuments, clusters, OTHER_TOPICS_LABEL); } /** * If there are unclustered documents, appends the "Other Topics" group to the <code>clusters * </code>. * * @see #buildOtherTopics(List, List, String) */ public static void appendOtherTopics( List<Document> allDocuments, List<Cluster> clusters, String label) { final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label); if (!otherTopics.getDocuments().isEmpty()) { clusters.add(otherTopics); } } @Persist @SuppressWarnings("unused") private void beforeSerialization() { documentIds = Lists.transform( documents, new Function<Document, DocumentRefid>() { public DocumentRefid apply(Document document) { return new DocumentRefid(document.getId()); } }); // Remove score from attributes for serialization otherAttributesForSerialization = MapUtils.asHashMap(SimpleXmlWrappers.wrap(attributes)); otherAttributesForSerialization.remove(SCORE); if (otherAttributesForSerialization.isEmpty()) { otherAttributesForSerialization = null; } } @Commit @SuppressWarnings("unused") private void afterDeserialization() throws Exception { if (otherAttributesForSerialization != null) { attributes.putAll(SimpleXmlWrappers.unwrap(otherAttributesForSerialization)); } phrasesView = Collections.unmodifiableList(phrases); subclustersView = Collections.unmodifiableList(subclusters); // Documents will be restored on the ProcessingResult level } /** For JSON serialization only. */ @JsonProperty("documents") @SuppressWarnings("unused") private List<Integer> getDocumentIds() { return Lists.transform(documents, Document.DocumentToId.INSTANCE); } /** For JSON and XML serialization only. */ @JsonProperty("attributes") @SuppressWarnings("unused") private Map<String, Object> getOtherAttributes() { final Map<String, Object> otherAttributes = Maps.newHashMap(attributesView); return otherAttributes.isEmpty() ? null : otherAttributes; } }