/** * Builds a term-phrase matrix in the same space as the main term-document matrix. If the * processing context contains no phrases, {@link VectorSpaceModelContext#termPhraseMatrix} will * remain <code>null</code>. */ public void buildTermPhraseMatrix(VectorSpaceModelContext context) { final PreprocessingContext preprocessingContext = context.preprocessingContext; final IntIntHashMap stemToRowIndex = context.stemToRowIndex; final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex; final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex; if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0) { // Build phrase matrix int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex]; for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++) { phraseFeatureIndices[featureIndex] = labelsFeatureIndex[featureIndex + firstPhraseIndex]; } final DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder.buildAlignedMatrix( context, phraseFeatureIndices, termWeighting); MatrixUtils.normalizeColumnL2(phraseMatrix, null); context.termPhraseMatrix = phraseMatrix.viewDice(); } }
/** * Performs the actual clustering with an assumption that all documents are written in one <code> * language</code>. */ private void cluster(LanguageCode language) { // Preprocessing of documents final PreprocessingContext context = preprocessingPipeline.preprocess(documents, query, language); // Further processing only if there are words to process clusters = Lists.newArrayList(); if (context.hasLabels()) { // Term-document matrix building and reduction final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context); final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(vsmContext); LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext); matrixBuilder.buildTermDocumentMatrix(vsmContext); matrixBuilder.buildTermPhraseMatrix(vsmContext); matrixReducer.reduce( reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size())); // Cluster label building clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting); // Document assignment clusterBuilder.assignDocuments(lingoContext); // Cluster merging clusterBuilder.merge(lingoContext); // Format final clusters final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex; final BitSet[] clusterDocuments = lingoContext.clusterDocuments; final double[] clusterLabelScore = lingoContext.clusterLabelScore; for (int i = 0; i < clusterLabelIndex.length; i++) { final Cluster cluster = new Cluster(); final int labelFeature = clusterLabelIndex[i]; if (labelFeature < 0) { // Cluster removed during merging continue; } // Add label and score cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]); // Add documents final BitSet bs = clusterDocuments[i]; for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) { cluster.addDocuments(documents.get(bit)); } // Add cluster clusters.add(cluster); } Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); } Cluster.appendOtherTopics(documents, clusters); }