@Override
public byte[] encode() {
  byte[] bytes = new byte[encodedSize()];
  bytes[0] = (byte) 0x80;
  BitWriter bitWriter = new BitWriter(bytes, 0, bytes.length, 5, true);
  int lastSetBit = -1;
  for (int setPos = bitSet.nextSetBit(0);
      setPos >= 0;
      setPos = bitSet.nextSetBit(setPos + 1)) {
    // Skip the distance between setPos and lastSetBit: each absent field
    // occupies a single 0 bit.
    bitWriter.skip(setPos - lastSetBit - 1);
    /*
     * Because this field is present, we need to use 2 bits to indicate the
     * type information necessary to parse. The format for the type bits is:
     *
     * Untyped: 00
     * Double:  01
     * Float:   10
     * Scalar:  11
     */
    if (scalarFields != null && scalarFields.get(setPos)) {
      bitWriter.set(3); // presence bit + type bits 11 (scalar)
    } else if (floatFields != null && floatFields.get(setPos)) {
      bitWriter.set(2); // presence bit + high type bit
      bitWriter.skipNext(); // type bits 10 (float)
    } else if (doubleFields != null && doubleFields.get(setPos)) {
      bitWriter.setNext(); // presence bit
      bitWriter.skipNext();
      bitWriter.setNext(); // type bits 01 (double)
    } else {
      bitWriter.setNext(); // presence bit
      bitWriter.skip(2); // type bits 00 (untyped)
    }
    lastSetBit = setPos;
  }
  return bytes;
}
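// --------------------------------------------------------------------------
// Illustrative decoder-side sketch for the presence/type stream produced by
// encode() above. This is an assumption-laden example, not part of the
// original code: it reads from a boolean[] of bits already unpacked in write
// order (the real header offset and byte packing used by BitWriter are not
// reproduced here), and FieldType/DecodedField are hypothetical helper types.
// It assumes a well-formed stream with two type bits after every presence bit.
// --------------------------------------------------------------------------
import java.util.ArrayList;
import java.util.List;

final class PresenceDecoderSketch {

  // Ordinals line up with the two type bits documented above: 00, 01, 10, 11.
  enum FieldType { UNTYPED, DOUBLE, FLOAT, SCALAR }

  static final class DecodedField {
    final int index;
    final FieldType type;

    DecodedField(int index, FieldType type) {
      this.index = index;
      this.type = type;
    }
  }

  static List<DecodedField> decode(boolean[] bits, int fieldCount) {
    List<DecodedField> present = new ArrayList<>();
    int cursor = 0;
    for (int field = 0; field < fieldCount && cursor < bits.length; field++) {
      if (!bits[cursor++]) {
        continue; // presence bit 0: an absent field consumes exactly one bit
      }
      // Presence bit 1: the next two bits carry the type, high bit first.
      int type = (bits[cursor++] ? 2 : 0) | (bits[cursor++] ? 1 : 0);
      present.add(new DecodedField(field, FieldType.values()[type]));
    }
    return present;
  }
}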
@Override
public int cardinality(int position) {
  // Count the set bits strictly below the given position.
  int count = 0;
  for (int i = bitSet.nextSetBit(0); i >= 0 && i < position; i = bitSet.nextSetBit(i + 1)) {
    count++;
  }
  return count;
}
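// Quick standalone check of the cardinality(position) semantics above, using
// java.util.BitSet directly. Illustrative only: the bit positions are made up
// and the class name is hypothetical.
import java.util.BitSet;

class CardinalityExample {
  public static void main(String[] args) {
    BitSet bitSet = new BitSet();
    bitSet.set(1);
    bitSet.set(3);
    bitSet.set(6);

    int position = 4;
    int count = 0;
    // Same traversal as cardinality(position): count set bits strictly below position.
    for (int i = bitSet.nextSetBit(0); i >= 0 && i < position; i = bitSet.nextSetBit(i + 1)) {
      count++;
    }
    System.out.println(count); // 2 (bits 1 and 3)

    // Equivalent result via the standard library:
    System.out.println(bitSet.get(0, position).cardinality()); // also 2
  }
}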
@Override
public int nextSetBit(int currentPosition) {
  return bitSet.nextSetBit(currentPosition);
}
/**
 * Performs the actual clustering with an assumption that all documents are written in one
 * <code>language</code>.
 */
private void cluster(LanguageCode language) {
  // Preprocessing of documents
  final PreprocessingContext context =
      preprocessingPipeline.preprocess(documents, query, language);

  // Further processing only if there are words to process
  clusters = Lists.newArrayList();
  if (context.hasLabels()) {
    // Term-document matrix building and reduction
    final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
    final ReducedVectorSpaceModelContext reducedVsmContext =
        new ReducedVectorSpaceModelContext(vsmContext);
    LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

    matrixBuilder.buildTermDocumentMatrix(vsmContext);
    matrixBuilder.buildTermPhraseMatrix(vsmContext);

    matrixReducer.reduce(
        reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

    // Cluster label building
    clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

    // Document assignment
    clusterBuilder.assignDocuments(lingoContext);

    // Cluster merging
    clusterBuilder.merge(lingoContext);

    // Format final clusters
    final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
    final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
    final double[] clusterLabelScore = lingoContext.clusterLabelScore;
    for (int i = 0; i < clusterLabelIndex.length; i++) {
      final Cluster cluster = new Cluster();

      final int labelFeature = clusterLabelIndex[i];
      if (labelFeature < 0) {
        // Cluster removed during merging
        continue;
      }

      // Add label and score
      cluster.addPhrases(labelFormatter.format(context, labelFeature));
      cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

      // Add documents
      final BitSet bs = clusterDocuments[i];
      for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
        cluster.addDocuments(documents.get(bit));
      }

      // Add cluster
      clusters.add(cluster);
    }

    Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
  }

  Cluster.appendOtherTopics(documents, clusters);
}
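// --------------------------------------------------------------------------
// Hedged usage sketch: one way cluster(...) is typically reached from client
// code. This assumes the Carrot2 3.x core API (Controller, ProcessingResult,
// Document); class and method names may differ in other Carrot2 versions, and
// LingoUsageSketch itself is a hypothetical example class.
// --------------------------------------------------------------------------
import java.util.ArrayList;
import java.util.List;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;

class LingoUsageSketch {
  public static void main(String[] args) {
    List<Document> documents = new ArrayList<>();
    documents.add(new Document("Data mining", "Clustering search results with Lingo."));
    documents.add(new Document("Another title", "Another snippet to cluster."));

    // The controller runs the preprocessing pipeline and the Lingo algorithm,
    // whose processing eventually calls cluster(language) shown above.
    Controller controller = ControllerFactory.createSimple();
    ProcessingResult result =
        controller.process(documents, "data mining", LingoClusteringAlgorithm.class);

    for (Cluster cluster : result.getClusters()) {
      System.out.println(cluster.getLabel() + " (" + cluster.getAllDocuments().size() + " docs)");
    }
  }
}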