/**
 * Verifies that a {@link Cluster} of {@code CLUSTER_SIZE} members can be built and
 * subsequently destroyed.
 *
 * <p>The cluster is realized through a {@link ClusterBuilder}; the test then polls, via a
 * deferred resource provider, until the first member reports a cluster size equal to
 * {@code CLUSTER_SIZE}. The cluster is destroyed in a {@code finally} block whether or not
 * the check succeeds.
 *
 * @throws Exception if building or inspecting the cluster fails; the exception is propagated
 *         unchanged so that the test report shows the original cause and stack trace
 */
@Test
public void testClusterBuilding() throws Exception {
    final int CLUSTER_SIZE = 5;

    // Ports are handed out starting from 40000.
    AvailablePortIterator portIterator = new AvailablePortIterator(40000);

    ClusterMemberSchema schema = new ClusterMemberSchema()
        .setEnvironmentVariables(PropertiesBuilder.fromCurrentEnvironmentVariables())
        .setSingleServerMode()
        .setClusterPort(portIterator.next())
        .setJMXPort(portIterator)
        .setJMXManagementMode(JMXManagementMode.LOCAL_ONLY);

    Cluster cluster = null;

    try {
        ClusterBuilder builder = new ClusterBuilder();
        builder.addBuilder(new ClusterMemberBuilder(), schema, "DCCF", CLUSTER_SIZE);

        cluster = builder.realize(new SystemApplicationConsole());

        // The anonymous class below needs a final reference to the realized cluster.
        final Cluster initialCluster = cluster;

        // NOTE(review): 500 and 60000 are presumably the retry delay and total timeout in
        // milliseconds - confirm against AbstractDeferredResourceProvider.
        ResourceProvider<Cluster> resource =
            new AbstractDeferredResourceProvider<Cluster>("Cluster", 500, 60000) {
                @Override
                protected Cluster ensureResource() throws ResourceUnavailableException {
                    // Returning null presumably signals "not available yet, try again".
                    return initialCluster.iterator().next().getClusterSize() == CLUSTER_SIZE
                        ? initialCluster
                        : null;
                }
            };

        Assert.assertNotNull(resource.getResource());
    } finally {
        // No catch block: any exception propagates to the test runner with its cause intact.
        if (cluster != null) {
            cluster.destroy();
        }
    }
}
/** * Performs the actual clustering with an assumption that all documents are written in one <code> * language</code>. */ private void cluster(LanguageCode language) { // Preprocessing of documents final PreprocessingContext context = preprocessingPipeline.preprocess(documents, query, language); // Further processing only if there are words to process clusters = Lists.newArrayList(); if (context.hasLabels()) { // Term-document matrix building and reduction final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context); final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(vsmContext); LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext); matrixBuilder.buildTermDocumentMatrix(vsmContext); matrixBuilder.buildTermPhraseMatrix(vsmContext); matrixReducer.reduce( reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size())); // Cluster label building clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting); // Document assignment clusterBuilder.assignDocuments(lingoContext); // Cluster merging clusterBuilder.merge(lingoContext); // Format final clusters final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex; final BitSet[] clusterDocuments = lingoContext.clusterDocuments; final double[] clusterLabelScore = lingoContext.clusterLabelScore; for (int i = 0; i < clusterLabelIndex.length; i++) { final Cluster cluster = new Cluster(); final int labelFeature = clusterLabelIndex[i]; if (labelFeature < 0) { // Cluster removed during merging continue; } // Add label and score cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]); // Add documents final BitSet bs = clusterDocuments[i]; for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) { cluster.addDocuments(documents.get(bit)); } // Add cluster clusters.add(cluster); } Collections.sort(clusters, 
Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); } Cluster.appendOtherTopics(documents, clusters); }