/**
 * Runs the preprocessing pipeline on all input documents (empty query, English) and emits
 * one {@link Cluster} per non-null token image found in the preprocessing context.
 */
@Override
public void process() throws ProcessingException {
  final PreprocessingContext ctx = preprocessing.preprocess(documents, "", LanguageCode.ENGLISH);

  final char[][] tokenImages = ctx.allTokens.image;
  clusters = Lists.newArrayListWithCapacity(tokenImages.length);
  for (final char[] image : tokenImages) {
    if (image == null) {
      // Non-word entries (separators/terminators) carry no image; skip them.
      continue;
    }
    clusters.add(new Cluster(new String(image)));
  }
}
/**
 * Performs the actual clustering with an assumption that all documents are written in one
 * <code>language</code>.
 *
 * <p>Pipeline: (1) preprocess (tokenize, mark stop words, stem), (2) feed each contiguous
 * run of real tokens as a phrase into a generalized suffix tree, (3) extract frequent
 * phrases as base clusters, (4) merge overlapping base clusters, (5) post-process into the
 * final Carrot2 cluster list (including the junk/unassigned cluster).
 *
 * @param language language shared by all input documents; selects stemmer/stop words.
 */
private void cluster(LanguageCode language) {
  clusters = new ArrayList<Cluster>();

  /* Step 1. Preprocessing: tokenization, stop word marking and stemming (if available). */
  context = preprocessingPipeline.preprocess(documents, query, language);

  /* Step 2. Build a generalized suffix tree from phrases in the input. */
  sb = new GeneralizedSuffixTree.SequenceBuilder();
  final int[] wordIndexes = context.allTokens.wordIndex;
  final short[] tokenTypes = context.allTokens.type;
  for (int i = 0; i < wordIndexes.length; i++) {
    if (wordIndexes[i] == -1) {
      // Not a real word: close the current document on document separators/terminators.
      if ((tokenTypes[i] & (ITokenizer.TF_SEPARATOR_DOCUMENT | ITokenizer.TF_TERMINATOR))
          != 0) {
        sb.endDocument();
      }
      continue;
    }

    /*
     * First token of a phrase found; advance to the last consecutive real token.
     * The bounds check guards against a token stream that does not end with a -1
     * sentinel — the original code assumed a trailing terminator and would throw
     * ArrayIndexOutOfBoundsException otherwise.
     */
    final int phraseStart = i;
    while (i + 1 < wordIndexes.length && wordIndexes[i + 1] != -1) {
      i++;
    }

    // Phrase length is always >= 1 here (i >= phraseStart), so no emptiness check needed.
    final int phraseLength = 1 + i - phraseStart;
    sb.addPhrase(wordIndexes, phraseStart, phraseLength);
  }
  sb.buildSuffixTree();

  /* Step 3. Find "base" clusters: frequently recurring phrases in the suffix tree. */
  final ArrayList<ClusterCandidate> baseClusters = createBaseClusters(sb);

  /* Step 4. Merge base clusters that overlap too much to form final clusters. */
  final ArrayList<ClusterCandidate> mergedClusters = createMergedClusters(baseClusters);

  /*
   * Step 5. Create the junk (unassigned documents) cluster and the final set of
   * clusters in Carrot2 format.
   */
  postProcessing(mergedClusters);
}
/**
 * Runs the preprocessing pipeline on all input documents (empty query, English) and emits
 * one {@link Cluster} per real token, labeled with the stem image of that token's word.
 */
@Override
public void process() throws ProcessingException {
  final PreprocessingContext ctx = preprocessing.preprocess(documents, "", LanguageCode.ENGLISH);

  final AllTokens tokens = ctx.allTokens;
  final AllWords words = ctx.allWords;
  final AllStems stems = ctx.allStems;

  clusters = Lists.newArrayListWithCapacity(tokens.image.length);
  for (int i = 0; i < tokens.image.length; i++) {
    final int wordIndex = tokens.wordIndex[i];
    if (wordIndex < 0) {
      // Negative wordIndex marks non-word tokens (separators, punctuation); skip them.
      continue;
    }
    // Map token -> word -> stem and use the stem's character image as the cluster label.
    final char[] stemImage = stems.image[words.stemIndex[wordIndex]];
    clusters.add(new Cluster(new String(stemImage)));
  }
}
/**
 * Performs the actual clustering with an assumption that all documents are written in one
 * <code>language</code>.
 *
 * @param language language shared by all input documents.
 */
private void cluster(LanguageCode language) {
  // Start from an empty result list; preprocessing populates the shared context field.
  clusters = new ArrayList<Cluster>();
  context = preprocessingPipeline.preprocess(documents, null, language);
}