/** Performs tokenization and saves the results to the <code>context</code>. */ public void tokenize(PreprocessingContext context) { // Documents to tokenize final List<Document> documents = context.documents; // Fields to tokenize final String[] fieldNames = documentFields.toArray(new String[documentFields.size()]); if (fieldNames.length > 8) { throw new ProcessingException("Maximum number of tokenized fields is 8."); } // Prepare arrays images = Lists.newArrayList(); tokenTypes = new ShortArrayList(); documentIndices = new IntArrayList(); fieldIndices = new ByteArrayList(); final Iterator<Document> docIterator = documents.iterator(); int documentIndex = 0; final ITokenizer ts = context.language.getTokenizer(); final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY); while (docIterator.hasNext()) { final Document doc = docIterator.next(); boolean hadTokens = false; for (int i = 0; i < fieldNames.length; i++) { final byte fieldIndex = (byte) i; final String fieldName = fieldNames[i]; final String fieldValue = doc.getField(fieldName); if (!StringUtils.isEmpty(fieldValue)) { try { short tokenType; ts.reset(new StringReader(fieldValue)); if ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF) { if (hadTokens) addFieldSeparator(documentIndex); do { ts.setTermBuffer(wrapper); add(documentIndex, fieldIndex, context.intern(wrapper), tokenType); } while ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF); hadTokens = true; } } catch (IOException e) { // Not possible (StringReader above)? throw ExceptionUtils.wrapAsRuntimeException(e); } } } if (docIterator.hasNext()) { addDocumentSeparator(); } documentIndex++; } addTerminator(); // Save results in the PreprocessingContext context.allTokens.documentIndex = documentIndices.toArray(); context.allTokens.fieldIndex = fieldIndices.toArray(); context.allTokens.image = images.toArray(new char[images.size()][]); context.allTokens.type = tokenTypes.toArray(); context.allFields.name = fieldNames; // Clean up images = null; fieldIndices = null; tokenTypes = null; documentIndices = null; }
/** * Performs the actual clustering with an assumption that all documents are written in one <code> * language</code>. */ private void cluster(LanguageCode language) { // Preprocessing of documents final PreprocessingContext context = preprocessingPipeline.preprocess(documents, query, language); // Further processing only if there are words to process clusters = Lists.newArrayList(); if (context.hasLabels()) { // Term-document matrix building and reduction final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context); final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(vsmContext); LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext); matrixBuilder.buildTermDocumentMatrix(vsmContext); matrixBuilder.buildTermPhraseMatrix(vsmContext); matrixReducer.reduce( reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size())); // Cluster label building clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting); // Document assignment clusterBuilder.assignDocuments(lingoContext); // Cluster merging clusterBuilder.merge(lingoContext); // Format final clusters final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex; final BitSet[] clusterDocuments = lingoContext.clusterDocuments; final double[] clusterLabelScore = lingoContext.clusterLabelScore; for (int i = 0; i < clusterLabelIndex.length; i++) { final Cluster cluster = new Cluster(); final int labelFeature = clusterLabelIndex[i]; if (labelFeature < 0) { // Cluster removed during merging continue; } // Add label and score cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]); // Add documents final BitSet bs = clusterDocuments[i]; for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) { cluster.addDocuments(documents.get(bit)); } // Add cluster clusters.add(cluster); } Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); } Cluster.appendOtherTopics(documents, clusters); }