/**
 * Appends the "Other Topics" group to <code>clusters</code> when some of the
 * <code>allDocuments</code> were left unclustered.
 *
 * @see #buildOtherTopics(List, List, String)
 */
public static void appendOtherTopics(
    List<Document> allDocuments, List<Cluster> clusters, String label) {
  final Cluster other = buildOtherTopics(allDocuments, clusters, label);
  // Only append the group if it actually contains documents.
  final boolean hasUnclustered = !other.getDocuments().isEmpty();
  if (hasUnclustered) {
    clusters.add(other);
  }
}
@SuppressWarnings("unchecked") @Override public List<String> getClusterByCarrot2(String query) { // TODO Auto-generated method stub List<String> strs = new ArrayList<String>(); final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class); final List<org.carrot2.core.Document> documents = Lists.newArrayList(); try { q = getParser().parse(QueryParserUtil.escape(query)); docs = getIndexSearcher().search(q, Integer.MAX_VALUE); hits = docs.scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = getIndexSearcher().doc(hits[i].doc); documents.add( new org.carrot2.core.Document( doc.get(CONTENTS_FIELD), doc.get(TITLE_FIELD), doc.get(USER_FIELD))); } final ProcessingResult byTopicClusters = controller.process(documents, query, LingoClusteringAlgorithm.class); final List<Cluster> clustersByTopic = byTopicClusters.getClusters(); final ProcessingResult byDomainClusters = controller.process(documents, query, ByUrlClusteringAlgorithm.class); final List<Cluster> clustersByDomain = byDomainClusters.getClusters(); for (Cluster c : clustersByDomain) { strs.add(c.getLabel()); } for (Cluster c : clustersByTopic) { strs.add(c.getLabel()); } } catch (Exception ex) { } return strs; }
/** * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters). * If a cluster already has an identifier, the identifier will not be changed. * * @param clusters Clusters to assign identifiers to. * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers */ public static void assignClusterIds(Collection<Cluster> clusters) { final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size()); flatten(flattened, clusters); synchronized (clusters) { final HashSet<Integer> ids = Sets.newHashSet(); // First, find the start value for the id and check uniqueness of the ids // already provided. int maxId = Integer.MIN_VALUE; for (final Cluster cluster : flattened) { if (cluster.id != null) { if (!ids.add(cluster.id)) { throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id); } maxId = Math.max(maxId, cluster.id); } } // We'd rather start with 0 maxId = Math.max(maxId, -1); // Assign missing ids for (final Cluster c : flattened) { if (c.id == null) { c.id = ++maxId; } } } }
/**
 * Clusters index content for {@code query} using Carrot2's {@code LuceneDocumentSource}
 * (registered under the component id "lucene") with the Lingo algorithm, instead of
 * collecting hits manually. Returns one entry per cluster: the label followed by the
 * size reported by {@code getAllDocuments()}.
 *
 * @param query query handed to the Carrot2 processing pipeline
 * @return formatted "label >>>> count" strings, one per cluster
 */
@SuppressWarnings("unchecked") @Override public List<String> getClusterByCarrotVersion2(String query) { List<String> strs = new ArrayList<String>(); final Controller controller = ControllerFactory.createPooling(); // Global attributes for the Lucene source: index directory plus the fields used
// for titles, content and searching.
final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>(); LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes).directory(directory); SimpleFieldMapperDescriptor.attributeBuilder(luceneGlobalAttributes) .titleField(TITLE_FIELD) .contentField(CONTENTS_FIELD) .searchFields(Arrays.asList(new String[] {TITLE_FIELD, CONTENTS_FIELD})); // Register the configured document source with the controller.
controller.init( new HashMap<String, Object>(), new ProcessingComponentConfiguration( LuceneDocumentSource.class, "lucene", luceneGlobalAttributes)); final Map<String, Object> processingAttributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(processingAttributes).query(query); ProcessingResult process = controller.process( processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName()); for (Cluster c : process.getClusters()) { strs.add(c.getLabel() + " >>>> " + c.getAllDocuments().size()); } return strs; }
/*
 * Depth-first collection of the given clusters and all of their subclusters
 * into the flat output list.
 */
private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) {
  for (final Cluster cluster : clusters) {
    flattened.add(cluster);
    final List<Cluster> children = cluster.getSubclusters();
    if (!children.isEmpty()) {
      flatten(flattened, children);
    }
  }
}
/**
 * Recursively collects the documents of the given clusters and all their subclusters
 * into <code>documents</code>.
 *
 * @param clusters clusters to traverse
 * @param documents accumulator that receives the documents of every visited cluster
 * @return the <code>documents</code> accumulator, for call chaining
 */
private Collection<Document> collectDocuments(
    Collection<Cluster> clusters, Collection<Document> documents) {
  for (final Cluster cluster : clusters) {
    documents.addAll(cluster.getDocuments());
    // Pass the same accumulator down. The previous code recursed without it,
    // so documents of subclusters were never added to the result.
    collectDocuments(cluster.getSubclusters(), documents);
  }
  return documents;
}
/**
 * Replaces document refids with the actual {@link Document} references upon
 * deserialization, descending recursively into subclusters.
 */
private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
  // Resolve this cluster's refids (if any were deserialized).
  if (cluster.documentIds != null) {
    for (final Cluster.DocumentRefid refid : cluster.documentIds) {
      cluster.addDocuments(documents.get(refid.refid));
    }
  }
  // Then resolve every subcluster the same way.
  for (final Cluster subcluster : cluster.getSubclusters()) {
    documentIdToReference(subcluster, documents);
  }
}
/**
 * Adds <code>document</code> to the cluster keyed by <code>fieldValue</code>,
 * lazily creating and labeling the cluster on first use. Null field values are
 * ignored and form no cluster.
 */
private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue, Document document) {
  if (fieldValue == null) {
    return;
  }

  Cluster target = clusters.get(fieldValue);
  if (target == null) {
    // First document with this value: create and label the cluster.
    target = new Cluster();
    target.addPhrases(buildClusterLabel(fieldValue));
    clusters.put(fieldValue, target);
  }
  target.addDocuments(document);
}
/** A recursive routine for collecting unique documents from this cluster and subclusters. */
private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
  // Tolerate null clusters by returning the accumulator untouched.
  if (cluster == null) {
    return docs;
  }

  docs.addAll(cluster.getDocuments());
  for (final Cluster subcluster : cluster.getSubclusters()) {
    collectAllDocuments(subcluster, docs);
  }
  return docs;
}
/**
 * Performs Lingo clustering of {@link #documents}.
 *
 * <p>Side effects: temporarily reassigns the {@code documents} field (restored before
 * returning) and overwrites the {@code clusters} field with the final result.
 */
@Override @SuppressWarnings("unchecked") public void process() throws ProcessingException { // There is a tiny trick here to support multilingual clustering without
// refactoring the whole component: we remember the original list of documents
// and invoke clustering for each language separately within the
// IMonolingualClusteringAlgorithm implementation below. This is safe because
// processing components are not thread-safe by definition and
// IMonolingualClusteringAlgorithm forbids concurrent execution by contract.
final List<Document> originalDocuments = documents; clusters = multilingualClustering.process( documents, new IMonolingualClusteringAlgorithm() { public List<Cluster> process(List<Document> documents, LanguageCode language) { // cluster(language) reads the component's documents field, so swap in
// the per-language subset for the duration of the call.
LingoClusteringAlgorithm.this.documents = documents; LingoClusteringAlgorithm.this.cluster(language); return LingoClusteringAlgorithm.this.clusters; } }); documents = originalDocuments; if (multilingualClustering.languageAggregationStrategy == LanguageAggregationStrategy.FLATTEN_ALL) { // When all languages are flattened into one list, re-sort globally, keeping
// "Other Topics" groups at the end.
Collections.sort( clusters, Ordering.compound( Lists.newArrayList( Cluster.OTHER_TOPICS_AT_THE_END, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)))); } }
/**
 * Builds an "Other Topics" cluster that groups those documents from <code>allDocuments</code>
 * that were not referenced in any cluster in <code>clusters</code> (including subclusters).
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(
    List<Document> allDocuments, List<Cluster> clusters, String label) {
  // Collect every document referenced anywhere in the cluster tree.
  final Set<Document> assigned = Sets.newHashSet();
  for (final Cluster cluster : clusters) {
    collectAllDocuments(cluster, assigned);
  }

  // Keep the input's iteration order while dropping the assigned documents.
  final Set<Document> unclustered = Sets.newLinkedHashSet(allDocuments);
  unclustered.removeAll(assigned);

  final Cluster otherTopics = new Cluster(label);
  otherTopics.addDocuments(unclustered);
  otherTopics.setOtherTopics(true);
  return otherTopics;
}
/**
 * Locate the first cluster that has id equal to <code>id</code>. The search includes all the
 * clusters in the input and their sub-clusters. The first cluster with matching identifier is
 * returned or <code>null</code> if no such cluster could be found.
 */
public static Cluster find(int id, Collection<Cluster> clusters) {
  for (final Cluster cluster : clusters) {
    if (cluster == null) {
      continue;
    }
    if (cluster.id != null && cluster.id == id) {
      return cluster;
    }
    // Depth-first descent; recursing into an empty subcluster list yields null.
    final Cluster match = find(id, cluster.getSubclusters());
    if (match != null) {
      return match;
    }
  }
  return null;
}
/**
 * Calculates the H measure for a cluster from the per-partition counts of its
 * documents (delegates to {@code calculateH(counts)}).
 */
int calculateH(Cluster cluster) {
  final Map<Object, Integer> countsByPartition =
      getDocumentCountByPartition(cluster.getAllDocuments());
  final ArrayList<Integer> counts = Lists.newArrayList(countsByPartition.values());
  return calculateH(counts);
}
/**
 * Create the junk (unassigned documents) cluster and create the final set of clusters in Carrot2
 * format.
 *
 * @param clusters cluster candidates to convert; converted clusters are appended to the
 *     component's {@code clusters} field
 */
private void postProcessing(ArrayList<ClusterCandidate> clusters) { // Adapt to Carrot2 classes, counting used documents on the way.
final BitSet all = new BitSet(documents.size()); final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size()); final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3); for (ClusterCandidate c : clusters) { final Cluster c2 = new Cluster(); c2.addPhrases(collectPhrases(phrases, c)); c2.addDocuments(collectDocuments(docs, c.documents)); c2.setScore((double) c.score); this.clusters.add(c2); all.or(c.documents); // NOTE(review): "all" accumulates used documents but is never read in this
// method — verify whether it is dead code or was meant to drive the junk cluster.
// Scratch buffers are reused across candidates; clear them for the next round.
docs.clear(); phrases.clear(); } // Group any documents not claimed by a candidate under "Other Topics".
Cluster.appendOtherTopics(this.documents, this.clusters); }
/**
 * Computes per-cluster contamination (stored in each cluster's {@code CONTAMINATION}
 * attribute) and the size-weighted average contamination over all regular clusters.
 * "Other Topics" clusters are excluded from both. No-op when the documents define
 * no partitions.
 */
public void calculate() {
  final int partitionCount = getPartitionsCount(documents);
  if (partitionCount == 0) {
    return;
  }

  int weightSum = 0;
  double contaminationSum = 0;
  for (Cluster cluster : clusters) {
    // Junk clusters do not contribute to the average.
    if (cluster.isOtherTopics()) {
      continue;
    }
    final double contamination = calculate(cluster, partitionCount);
    cluster.setAttribute(CONTAMINATION, contamination);
    contaminationSum += contamination * cluster.size();
    weightSum += cluster.size();
  }

  // Guard against 0/0 -> NaN when there were no regular clusters (previously this
  // stored NaN in weightedAverageContamination).
  weightedAverageContamination = (weightSum > 0) ? contaminationSum / weightSum : 0;
}
/**
 * Calculates the contamination of one cluster: its H measure normalized by the
 * worst-case H for the cluster's total number of partition assignments. Returns 0
 * when the worst-case H is 0.
 */
@SuppressWarnings("unchecked")
double calculate(Cluster cluster, int partitionCount) {
  // Total number of (document, partition) assignments inside the cluster.
  int assignments = 0;
  for (final Document document : cluster.getAllDocuments()) {
    final Collection<Object> partitions =
        (Collection<Object>) document.getField(Document.PARTITIONS);
    assignments += partitions.size();
  }

  final double worstCaseH = calculateWorstCaseH(assignments, partitionCount);
  return worstCaseH == 0 ? 0 : calculateH(cluster) / worstCaseH;
}
/** Performs by URL clustering. */
@Override
public void process() throws ProcessingException {
  final Map<Object, Cluster> clustersByValue = Maps.newHashMap();
  for (final Document document : documents) {
    final Object field = document.getField(fieldName);
    if (field instanceof Collection<?>) {
      // Multi-valued field: the document joins one cluster per value.
      for (final Object value : (Collection<?>) field) {
        addToCluster(clustersByValue, value, document);
      }
    } else {
      addToCluster(clustersByValue, field, document);
    }
  }

  clusters = Lists.newArrayList(clustersByValue.values());
  Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
  Cluster.appendOtherTopics(documents, clusters);
}
/**
 * Creates a {@link ProcessingResult} with the provided <code>attributes</code>. Assigns unique
 * document identifiers if documents are present in the <code>attributes</code> map (under the key
 * {@link AttributeNames#DOCUMENTS}).
 *
 * <p>Note: the caller's <code>attributes</code> map is mutated in place — document and cluster
 * lists are replaced with unmodifiable views, and ids are assigned to their elements.
 */
@SuppressWarnings("unchecked") ProcessingResult(Map<String, Object> attributes) { this.attributes = attributes; // Replace a modifiable collection of documents with an unmodifiable one
final List<Document> documents = (List<Document>) attributes.get(AttributeNames.DOCUMENTS); if (documents != null) { Document.assignDocumentIds(documents); attributes.put(AttributeNames.DOCUMENTS, Collections.unmodifiableList(documents)); } // Replace a modifiable collection of clusters with an unmodifiable one
final List<Cluster> clusters = (List<Cluster>) attributes.get(AttributeNames.CLUSTERS); if (clusters != null) { Cluster.assignClusterIds(clusters); attributes.put(AttributeNames.CLUSTERS, Collections.unmodifiableList(clusters)); } // Store a reference to attributes as an unmodifiable map
this.attributesView = Collections.unmodifiableMap(attributes); }
/** Extracts the cluster's size, presumably as a comparison/grouping key — see callers. */
public Integer apply(Cluster cluster) { return cluster.size(); }
/** Extracts the cluster's SCORE attribute; may be null if no score was assigned. */
public Double apply(Cluster cluster) { return cluster.getAttribute(SCORE); }
/** Extracts the cluster's label. */
public String apply(Cluster cluster) { return cluster.getLabel(); }
/**
 * Performs the actual clustering with an assumption that all documents are written in one <code>
 * language</code>.
 *
 * <p>Reads the component's {@code documents} and {@code query} fields and overwrites the
 * {@code clusters} field with the result; always ends by appending the "Other Topics" group.
 */
private void cluster(LanguageCode language) { // Preprocessing of documents
final PreprocessingContext context = preprocessingPipeline.preprocess(documents, query, language); // Further processing only if there are words to process
clusters = Lists.newArrayList(); if (context.hasLabels()) { // Term-document matrix building and reduction
final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context); final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(vsmContext); LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext); matrixBuilder.buildTermDocumentMatrix(vsmContext); matrixBuilder.buildTermPhraseMatrix(vsmContext); matrixReducer.reduce( reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size())); // Cluster label building
clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting); // Document assignment
clusterBuilder.assignDocuments(lingoContext); // Cluster merging
clusterBuilder.merge(lingoContext); // Format final clusters
final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex; final BitSet[] clusterDocuments = lingoContext.clusterDocuments; final double[] clusterLabelScore = lingoContext.clusterLabelScore; for (int i = 0; i < clusterLabelIndex.length; i++) { final Cluster cluster = new Cluster(); final int labelFeature = clusterLabelIndex[i]; if (labelFeature < 0) { // Cluster removed during merging
continue; } // Add label and score
cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]); // Add documents assigned to this cluster, iterating the set bits of its BitSet
final BitSet bs = clusterDocuments[i]; for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) { cluster.addDocuments(documents.get(bit)); } // Add cluster
clusters.add(cluster); } Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); } Cluster.appendOtherTopics(documents, clusters); }
/** Maps a cluster to 1.0 when it is an "Other Topics" group and to -1.0 otherwise. */
public Double apply(Cluster cluster) {
  if (cluster.isOtherTopics()) {
    return 1.0;
  }
  return -1.0;
}