Example 1
  /** Performs tokenization and saves the results to the <code>context</code>. */
  public void tokenize(PreprocessingContext context) {
    // Documents to tokenize
    final List<Document> documents = context.documents;

    // Fields to tokenize
    final String[] fieldNames = documentFields.toArray(new String[documentFields.size()]);

    if (fieldNames.length > 8) {
      throw new ProcessingException("Maximum number of tokenized fields is 8.");
    }

    // Prepare arrays
    images = Lists.newArrayList();
    tokenTypes = new ShortArrayList();
    documentIndices = new IntArrayList();
    fieldIndices = new ByteArrayList();

    final Iterator<Document> docIterator = documents.iterator();
    int documentIndex = 0;
    final ITokenizer ts = context.language.getTokenizer();
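    // Reusable character buffer: the tokenizer writes each token's characters
    // here, and context.intern() below deduplicates the token image.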
    final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);

    while (docIterator.hasNext()) {
      final Document doc = docIterator.next();

      boolean hadTokens = false;
      for (int i = 0; i < fieldNames.length; i++) {
        final byte fieldIndex = (byte) i;
        final String fieldName = fieldNames[i];
        final String fieldValue = doc.getField(fieldName);

        if (!StringUtils.isEmpty(fieldValue)) {
          try {
            short tokenType;

            ts.reset(new StringReader(fieldValue));
            if ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF) {
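              // Insert a field separator between non-empty fields of the same document.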
              if (hadTokens) addFieldSeparator(documentIndex);
              do {
                ts.setTermBuffer(wrapper);
                add(documentIndex, fieldIndex, context.intern(wrapper), tokenType);
              } while ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF);
              hadTokens = true;
            }
          } catch (IOException e) {
            // Unreachable in practice: StringReader does not throw IOException.
            throw ExceptionUtils.wrapAsRuntimeException(e);
          }
        }
      }

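      // Insert a document separator between consecutive documents.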
      if (docIterator.hasNext()) {
        addDocumentSeparator();
      }

      documentIndex++;
    }

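    // Terminate the flat token stream.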
    addTerminator();

    // Save results in the PreprocessingContext
    context.allTokens.documentIndex = documentIndices.toArray();
    context.allTokens.fieldIndex = fieldIndices.toArray();
    context.allTokens.image = images.toArray(new char[images.size()][]);
    context.allTokens.type = tokenTypes.toArray();
    context.allFields.name = fieldNames;

    // Clean up
    images = null;
    fieldIndices = null;
    tokenTypes = null;
    documentIndices = null;
  }
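
The method above accumulates tokens into four growable lists that stay aligned by position (token image, token type, document index, field index) and flattens them into context.allTokens at the end. Below is a minimal, self-contained sketch of that parallel-array pattern using plain JDK lists instead of the primitive list classes used above; all names in the sketch are illustrative, not part of the original API.

import java.util.ArrayList;
import java.util.List;

/** Sketch of the parallel-array token accumulation pattern used by tokenize(). */
class ParallelTokenArrays {
  private final List<char[]> images = new ArrayList<>();
  private final List<Short> types = new ArrayList<>();
  private final List<Integer> documentIndices = new ArrayList<>();
  private final List<Byte> fieldIndices = new ArrayList<>();

  /** Appends one token; all four lists remain aligned by position. */
  void add(int documentIndex, byte fieldIndex, char[] image, short type) {
    images.add(image);
    types.add(type);
    documentIndices.add(documentIndex);
    fieldIndices.add(fieldIndex);
  }

  /** Flattens the collected images into a dense array, mirroring context.allTokens.image. */
  char[][] imageArray() {
    return images.toArray(new char[images.size()][]);
  }
}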

Example 2

  /** Performs the actual clustering, assuming all documents are written in a single <code>language</code>. */
  private void cluster(LanguageCode language) {
    // Preprocessing of documents
    final PreprocessingContext context =
        preprocessingPipeline.preprocess(documents, query, language);

    // Further processing only if there are words to process
    clusters = Lists.newArrayList();
    if (context.hasLabels()) {
      // Term-document matrix building and reduction
      final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
      final ReducedVectorSpaceModelContext reducedVsmContext =
          new ReducedVectorSpaceModelContext(vsmContext);
      final LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

      matrixBuilder.buildTermDocumentMatrix(vsmContext);
      matrixBuilder.buildTermPhraseMatrix(vsmContext);

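      // Reduce the term-document matrix to base vectors; each base vector
      // seeds one cluster label candidate.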
      matrixReducer.reduce(
          reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

      // Cluster label building
      clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

      // Document assignment
      clusterBuilder.assignDocuments(lingoContext);

      // Cluster merging
      clusterBuilder.merge(lingoContext);

      // Format final clusters
      final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
      final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
      final double[] clusterLabelScore = lingoContext.clusterLabelScore;
      for (int i = 0; i < clusterLabelIndex.length; i++) {
        final Cluster cluster = new Cluster();

        final int labelFeature = clusterLabelIndex[i];
        if (labelFeature < 0) {
          // Cluster removed during merging
          continue;
        }

        // Add label and score
        cluster.addPhrases(labelFormatter.format(context, labelFeature));
        cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

        // Add documents
        final BitSet bs = clusterDocuments[i];
        for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
          cluster.addDocuments(documents.get(bit));
        }

        // Add cluster
        clusters.add(cluster);
      }

      Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
    }

    Cluster.appendOtherTopics(documents, clusters);
  }
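
The document-assignment loop above uses the standard nextSetBit idiom for visiting every set bit of a BitSet. Below is a self-contained demonstration with java.util.BitSet, whose nextSetBit follows the same contract (a negative return value means no further bits are set); the document numbers are hypothetical.

import java.util.BitSet;

class BitSetIterationDemo {
  public static void main(String[] args) {
    // Hypothetical cluster membership: documents 0, 3 and 7.
    final BitSet clusterDocuments = new BitSet();
    clusterDocuments.set(0);
    clusterDocuments.set(3);
    clusterDocuments.set(7);

    // Same idiom as the assignment loop above: restart the scan at bit + 1
    // until nextSetBit reports that no more bits are set.
    for (int bit = clusterDocuments.nextSetBit(0);
        bit >= 0;
        bit = clusterDocuments.nextSetBit(bit + 1)) {
      System.out.println("Document " + bit + " belongs to the cluster.");
    }
  }
}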