Пример #1
0
  /** Build the cluster's label from suffix tree edge indices. */
  private String buildLabel(int[] phraseIndices) {
    final boolean joinWithSpace =
        context.language.getLanguageCode() != LanguageCode.CHINESE_SIMPLIFIED;

    // Count the number of terms first.
    int termsCount = 0;
    for (int j = 0; j < phraseIndices.length; j += 2) {
      termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1;
    }

    // Extract terms info for the phrase and construct the label.
    final boolean[] stopwords = new boolean[termsCount];
    final char[][] images = new char[termsCount][];
    final short[] tokenTypes = context.allWords.type;

    int k = 0;
    for (int i = 0; i < phraseIndices.length; i += 2) {
      for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) {
        final int termIndex = sb.input.get(j);
        images[k] = context.allWords.image[termIndex];
        stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]);
      }
    }

    return LabelFormatter.format(images, stopwords, joinWithSpace);
  }
  /**
   * Performs the actual clustering with an assumption that all documents are written in one <code>
   * language</code>.
   */
  private void cluster(LanguageCode language) {
    // Preprocessing of documents
    final PreprocessingContext context =
        preprocessingPipeline.preprocess(documents, query, language);

    // Further processing only if there are words to process
    clusters = Lists.newArrayList();
    if (context.hasLabels()) {
      // Term-document matrix building and reduction
      final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
      final ReducedVectorSpaceModelContext reducedVsmContext =
          new ReducedVectorSpaceModelContext(vsmContext);
      LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

      matrixBuilder.buildTermDocumentMatrix(vsmContext);
      matrixBuilder.buildTermPhraseMatrix(vsmContext);

      matrixReducer.reduce(
          reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

      // Cluster label building
      clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

      // Document assignment
      clusterBuilder.assignDocuments(lingoContext);

      // Cluster merging
      clusterBuilder.merge(lingoContext);

      // Format final clusters
      final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
      final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
      final double[] clusterLabelScore = lingoContext.clusterLabelScore;
      for (int i = 0; i < clusterLabelIndex.length; i++) {
        final Cluster cluster = new Cluster();

        final int labelFeature = clusterLabelIndex[i];
        if (labelFeature < 0) {
          // Cluster removed during merging
          continue;
        }

        // Add label and score
        cluster.addPhrases(labelFormatter.format(context, labelFeature));
        cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

        // Add documents
        final BitSet bs = clusterDocuments[i];
        for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
          cluster.addDocuments(documents.get(bit));
        }

        // Add cluster
        clusters.add(cluster);
      }

      Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
    }

    Cluster.appendOtherTopics(documents, clusters);
  }