@Override
public void process() throws ProcessingException {
  // Preprocess the input documents: tokenization, stop-word marking and
  // stemming. The query is empty and the language is fixed to English.
  final PreprocessingContext preprocessingContext =
      preprocessing.preprocess(documents, "", LanguageCode.ENGLISH);

  // allTokens.image holds one char[] per token position; separator and
  // terminator positions carry null. Create one cluster per real token.
  clusters = Lists.newArrayListWithCapacity(preprocessingContext.allTokens.image.length);
  for (char[] token : preprocessingContext.allTokens.image) {
    if (token != null) {
      clusters.add(new Cluster(new String(token)));
    }
  }
}
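For context, a minimal harness around this example could look as follows. This is a sketch: the concrete pipeline class (BasicPreprocessingPipeline) and the sample documents are assumptions, while the preprocess(documents, "", LanguageCode.ENGLISH) call and the null-image convention for separators come from the example itself.

// Sketch of a harness for the example above; BasicPreprocessingPipeline and
// the sample texts are assumptions, not taken from this example.
final List<Document> documents = new ArrayList<Document>();
documents.add(new Document("Suffix tree clustering", "Phrases shared by documents form clusters."));
documents.add(new Document("Preprocessing", "Tokenization precedes stemming."));

final BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
final PreprocessingContext ctx = preprocessing.preprocess(documents, "", LanguageCode.ENGLISH);

// One line per real token; separator positions have a null image.
for (char[] token : ctx.allTokens.image) {
  if (token != null) {
    System.out.println(new String(token));
  }
}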
Example #2
  /**
   * Performs the actual clustering, with the assumption that all documents are written in one
   * <code>language</code>.
   */
  private void cluster(LanguageCode language) {
    clusters = new ArrayList<Cluster>();

    /*
     * Step 1. Preprocessing: tokenization, stop word marking and stemming (if available).
     */
    context = preprocessingPipeline.preprocess(documents, query, language);

    /*
     * Step 2: Create a generalized suffix tree from phrases in the input.
     */
    sb = new GeneralizedSuffixTree.SequenceBuilder();

    final int[] tokenIndex = context.allTokens.wordIndex;
    final short[] tokenType = context.allTokens.type;
    for (int i = 0; i < tokenIndex.length; i++) {
      /* Advance until the first real token. */
      if (tokenIndex[i] == -1) {
        if ((tokenType[i] & (ITokenizer.TF_SEPARATOR_DOCUMENT | ITokenizer.TF_TERMINATOR)) != 0) {
          sb.endDocument();
        }
        continue;
      }

      /* We have the first token. Advance until a non-token position. */
      final int s = i;

      /* The stream ends with a terminator entry (wordIndex == -1), so i + 1 stays in range. */
      while (tokenIndex[i + 1] != -1) i++;
      final int phraseLength = 1 + i - s;
      if (phraseLength >= 1) {
        /* We have a phrase. */
        sb.addPhrase(tokenIndex, s, phraseLength);
      }
    }
    sb.buildSuffixTree();

    /*
     * Step 3: Find "base" clusters by looking up frequently recurring phrases in the
     * generalized suffix tree.
     */
    final ArrayList<ClusterCandidate> baseClusters = createBaseClusters(sb);

    /*
     * Step 4: Merge base clusters that overlap too much to form final clusters.
     */
    final ArrayList<ClusterCandidate> mergedClusters = createMergedClusters(baseClusters);
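    /*
     * (Editorial sketch, not part of the original source: in classic STC,
     * two base clusters merge when their document sets overlap strongly in
     * both directions, e.g. |A ∩ B| / |A| > 0.5 and |A ∩ B| / |B| > 0.5.
     * The exact threshold applied by createMergedClusters() is not shown
     * in this excerpt.)
     */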

    /*
     * Step 5: Create the junk (unassigned documents) cluster and create the final
     * set of clusters in Carrot2 format.
     */
    postProcessing(mergedClusters);
  }
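The scan in Step 2 is easiest to follow on a concrete token stream. The standalone sketch below replays the same loop over a hand-built wordIndex array; the values are hypothetical, and only the convention that -1 marks separator/terminator positions is taken from the method above (a bounds guard is added so the fragment runs even without a trailing terminator).

// Sketch: replaying the Step 2 phrase scan over hypothetical data.
final int[] wordIndex = { 0, 1, -1, 2, -1 };   // "a b <sep> c <term>"
for (int i = 0; i < wordIndex.length; i++) {
  if (wordIndex[i] == -1) continue;            // skip separator positions
  final int s = i;
  while (i + 1 < wordIndex.length && wordIndex[i + 1] != -1) i++;
  System.out.println("phrase at [" + s + ".." + i + "], length " + (1 + i - s));
}
// Prints:
//   phrase at [0..1], length 2
//   phrase at [3..3], length 1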
Example #3

@Override
public void process() throws ProcessingException {
  final PreprocessingContext preprocessingContext =
      preprocessing.preprocess(documents, "", LanguageCode.ENGLISH);
  final AllTokens allTokens = preprocessingContext.allTokens;
  final AllWords allWords = preprocessingContext.allWords;
  final AllStems allStems = preprocessingContext.allStems;

  // Resolve each token to its stem: token position -> word index -> stem
  // index -> stem image. Positions with wordIndex < 0 are separators.
  clusters = Lists.newArrayListWithCapacity(allTokens.image.length);
  for (int i = 0; i < allTokens.image.length; i++) {
    if (allTokens.wordIndex[i] >= 0) {
      clusters.add(
          new Cluster(new String(allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
    }
  }
}
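To make the triple indirection concrete, here is a hand-worked example with hypothetical array contents; the field names are the PreprocessingContext arrays used above, but the values (and the Porter-style stemming of "mining"/"mined" to "mine") are illustrative assumptions.

// Hypothetical contents (not from the source): two surface words, one stem.
// allWords.image:      ["mining", "mined"]   (word indices 0 and 1)
// allWords.stemIndex:  [0, 0]                (both words point at stem 0)
// allStems.image:      ["mine"]              (stored as char[])
// allTokens.wordIndex: [0, -1, 1]            (token, separator, token)
//
// For token position 2: wordIndex[2] == 1, stemIndex[1] == 0, so
// new String(allStems.image[0]) yields "mine" -- the same cluster label
// as for token position 0.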
Example #4
/**
 * Performs the actual clustering, with the assumption that all documents are written in one
 * <code>language</code>.
 */
private void cluster(LanguageCode language) {
  clusters = new ArrayList<Cluster>();

  /* Step 1. Preprocessing, as in Example #2; later steps are omitted in this excerpt. */
  context = preprocessingPipeline.preprocess(documents, null, language);
}