@Override
public byte[] encode() {
  byte[] bytes = new byte[encodedSize()];
  bytes[0] = (byte) 0x80;
  BitWriter bitWriter = new BitWriter(bytes, 0, bytes.length, 5, true);
  int lastSetBit = -1;
  for (int setPos = bitSet.nextSetBit(0); setPos >= 0; setPos = bitSet.nextSetBit(setPos + 1)) {
    // skip the distance between setPos and lastSetBit
    bitWriter.skip(setPos - lastSetBit - 1);
    /*
     * Because this field is present, we need to use 2 bits to indicate the
     * type information necessary to parse. The format for the type bits is
     *
     * Untyped: 00
     * Double:  01
     * Float:   10
     * Scalar:  11
     */
    if (scalarFields != null && scalarFields.get(setPos)) {
      bitWriter.set(3);
    } else if (floatFields != null && floatFields.get(setPos)) {
      bitWriter.set(2);
      bitWriter.skipNext();
    } else if (doubleFields != null && doubleFields.get(setPos)) {
      bitWriter.setNext();
      bitWriter.skipNext();
      bitWriter.setNext();
    } else {
      bitWriter.setNext();
      bitWriter.skip(2);
    }
    lastSetBit = setPos;
  }
  return bytes;
}
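/*
 * Illustration (a sketch, not part of the original class): every field present in the index
 * consumes a presence bit followed by the two type bits described above, so the per-field
 * patterns written by encode() are 1|00 (untyped), 1|01 (double), 1|10 (float) and 1|11
 * (scalar), while every absent field costs a single 0 bit via bitWriter.skip(). A minimal
 * mapping of that scheme, with a hypothetical FieldType enum standing in for the three type
 * BitSets:
 */
enum FieldType { UNTYPED, DOUBLE, FLOAT, SCALAR }

static String bitPatternFor(FieldType type) {
  switch (type) {
    case SCALAR: return "111"; // presence bit + "11"
    case FLOAT:  return "110"; // presence bit + "10"
    case DOUBLE: return "101"; // presence bit + "01"
    default:     return "100"; // presence bit + "00" (untyped)
  }
}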
/**
 * Computes stem indices of words that are one-word label candidates or are non-stop words from
 * phrase label candidates.
 */
private int[] computeRequiredStemIndices(PreprocessingContext context) {
  final int[] labelsFeatureIndex = context.allLabels.featureIndex;
  final int[] wordsStemIndex = context.allWords.stemIndex;
  final short[] wordsTypes = context.allWords.type;
  final int[][] phrasesWordIndices = context.allPhrases.wordIndices;
  final int wordCount = wordsStemIndex.length;
  final int[][] stemsTfByDocument = context.allStems.tfByDocument;
  int documentCount = context.documents.size();

  final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);
  for (int i = 0; i < labelsFeatureIndex.length; i++) {
    final int featureIndex = labelsFeatureIndex[i];
    if (featureIndex < wordCount) {
      addStemIndex(
          wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, featureIndex);
    } else {
      final int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
      for (int j = 0; j < wordIndices.length; j++) {
        final int wordIndex = wordIndices[j];
        if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) {
          addStemIndex(
              wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, wordIndex);
        }
      }
    }
  }
  return requiredStemIndices.asIntLookupContainer().toArray();
}
@Override
public int cardinality(int position) {
  int count = 0;
  for (int i = bitSet.nextSetBit(0); i >= 0 && i < position; i = bitSet.nextSetBit(i + 1)) {
    count++;
  }
  return count;
}
@Override
public int hashCode() {
  int result = bitSet.hashCode();
  result = 31 * result + (scalarFields != null ? scalarFields.hashCode() : 0);
  result = 31 * result + (floatFields != null ? floatFields.hashCode() : 0);
  result = 31 * result + (doubleFields != null ? doubleFields.hashCode() : 0);
  return result;
}
public static void main(final String[] args) {
  final com.carrotsearch.hppc.BitSet hppcBitSet = new com.carrotsearch.hppc.BitSet(Long.MAX_VALUE);
  hppcBitSet.set(Integer.MAX_VALUE);
  final java.util.BitSet javaBitSet = new java.util.BitSet(Integer.MAX_VALUE);
  javaBitSet.set(Integer.MAX_VALUE - 1);
  System.out.println(ObjectSizeCalculator.getObjectSize(hppcBitSet));
  System.out.println(ObjectSizeCalculator.getObjectSize(javaBitSet));
}
/** Collect documents from a bitset. */
private List<Document> collectDocuments(List<Document> l, BitSet bitset) {
  if (l == null) {
    l = Lists.newArrayListWithCapacity((int) bitset.cardinality());
  }

  final BitSetIterator i = bitset.iterator();
  for (int d = i.nextSetBit(); d >= 0; d = i.nextSetBit()) {
    l.add(documents.get(d));
  }
  return l;
}
@Override
public BitSet findMatchings(T expectedElement, List<T> annotatorResult, BitSet alreadyUsedResults) {
  BitSet matchings =
      matchingsCounter[0].findMatchings(expectedElement, annotatorResult, alreadyUsedResults);
  for (int i = 1; (i < matchingsCounter.length) && (!matchings.isEmpty()); i++) {
    matchings.intersect(
        matchingsCounter[i].findMatchings(expectedElement, annotatorResult, alreadyUsedResults));
  }
  return matchings;
}
private ByteEntryAccumulator getKeyAccumulator() {
  if (indexKeyAccumulator == null) {
    BitSet keyFields = new BitSet();
    for (int keyColumn : mainColToIndexPosMap) {
      if (keyColumn >= 0) keyFields.set(keyColumn);
    }
    indexKeyAccumulator = new ByteEntryAccumulator(EntryPredicateFilter.emptyPredicate(), keyFields);
  }
  return indexKeyAccumulator;
}
public static BitIndex wrap(byte[] data, int position, int limit) {
  // create a BitSet underneath
  BitSet bitSet = new BitSet();
  BitSet scalarFields = new BitSet();
  BitSet floatFields = new BitSet();
  BitSet doubleFields = new BitSet();
  BitReader bitReader = new BitReader(data, position, limit, 5, true);

  int bitPos = 0;
  while (bitReader.hasNext()) {
    int zeros = bitReader.nextSetBit();
    if (zeros < 0) break;
    bitPos += zeros;
    bitSet.set(bitPos);
    if (bitReader.next() != 0) {
      // either float or scalar
      if (bitReader.next() != 0) {
        scalarFields.set(bitPos);
      } else {
        floatFields.set(bitPos);
      }
    } else {
      // either a double or untyped
      if (bitReader.next() != 0) doubleFields.set(bitPos);
    }
    bitPos++;
  }
  return new UncompressedBitIndex(bitSet, scalarFields, floatFields, doubleFields);
}
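/*
 * Illustrative round trip (a sketch, not part of the original sources): bytes produced by
 * encode() can be handed back to wrap(), and the recovered index queried with the accessors
 * defined in this class. The `encoded` array below is assumed to come from some
 * BitIndex.encode() call.
 */
static void describeIndex(byte[] encoded) {
  BitIndex index = UncompressedBitIndex.wrap(encoded, 0, encoded.length);
  for (int pos = index.nextSetBit(0); pos >= 0; pos = index.nextSetBit(pos + 1)) {
    String type = index.isScalarType(pos) ? "scalar"
        : index.isFloatType(pos) ? "float"
        : index.isDoubleType(pos) ? "double"
        : "untyped";
    System.out.println("field " + pos + " -> " + type);
  }
}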
@Test
public void testAlwaysAcceptEntryWorks() throws Exception {
  BitSet fields = new BitSet();
  fields.set(0);
  fields.set(2);
  EntryPredicateFilter predicateFilter = new EntryPredicateFilter(fields);
  EntryAccumulator accumulator = new ByteEntryAccumulator(predicateFilter, false, null);

  byte[] encodedOne = Encoding.encode(1);
  accumulator.add(2, encodedOne, 0, encodedOne.length);
  byte[] encodedTwo = Encoding.encode(2);
  accumulator.add(0, encodedTwo, 0, encodedTwo.length);

  byte[] bytes = accumulator.finish();
  MultiFieldDecoder decoder = MultiFieldDecoder.wrap(bytes);
  Assert.assertEquals(2, decoder.decodeNextInt());
  Assert.assertEquals(1, decoder.decodeNextInt());
}
@Override
public boolean equals(Object o) {
  if (this == o) return true;
  if (!(o instanceof UncompressedBitIndex)) return false;

  UncompressedBitIndex that = (UncompressedBitIndex) o;
  if (!bitSet.equals(that.bitSet)) return false;
  if (doubleFields != null ? !doubleFields.equals(that.doubleFields) : that.doubleFields != null)
    return false;
  if (floatFields != null ? !floatFields.equals(that.floatFields) : that.floatFields != null)
    return false;
  if (scalarFields != null ? !scalarFields.equals(that.scalarFields) : that.scalarFields != null)
    return false;
  return true;
}
/**
 * Do we need to update the index, i.e. did any of the indexed values change?
 *
 * @param mutation the incoming base-table mutation
 * @param indexedColumns positions of the columns covered by the index
 * @return true if the mutation touches at least one indexed column
 */
public boolean areIndexKeysModified(KVPair mutation, BitSet indexedColumns) {
  EntryDecoder newPutDecoder = new EntryDecoder();
  newPutDecoder.set(mutation.getValue());
  BitIndex updateIndex = newPutDecoder.getCurrentIndex();
  for (int i = updateIndex.nextSetBit(0); i >= 0; i = updateIndex.nextSetBit(i + 1)) {
    if (indexedColumns.get(i)) return true;
  }
  return false;
}
/**
 * Create the junk (unassigned documents) cluster and create the final set of clusters in Carrot2
 * format.
 */
private void postProcessing(ArrayList<ClusterCandidate> clusters) {
  // Adapt to Carrot2 classes, counting used documents on the way.
  final BitSet all = new BitSet(documents.size());
  final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
  final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
  for (ClusterCandidate c : clusters) {
    final Cluster c2 = new Cluster();
    c2.addPhrases(collectPhrases(phrases, c));
    c2.addDocuments(collectDocuments(docs, c.documents));
    c2.setScore((double) c.score);
    this.clusters.add(c2);

    all.or(c.documents);
    docs.clear();
    phrases.clear();
  }

  Cluster.appendOtherTopics(this.documents, this.clusters);
}
private EntryEncoder getRowEncoder() {
  if (indexValueEncoder == null) {
    BitSet nonNullFields = new BitSet();
    int highestSetPosition = 0;
    for (int keyColumn : mainColToIndexPosMap) {
      if (keyColumn > highestSetPosition) highestSetPosition = keyColumn;
    }
    nonNullFields.set(highestSetPosition + 1);
    indexValueEncoder =
        EntryEncoder.create(
            SpliceKryoRegistry.getInstance(),
            1,
            nonNullFields,
            new BitSet(),
            new BitSet(),
            new BitSet());
  }
  return indexValueEncoder;
}
/** Adds stem index to the set with a check on the stem's document frequency. */
private void addStemIndex(
    final int[] wordsStemIndex,
    int documentCount,
    int[][] stemsTfByDocument,
    final BitSet requiredStemIndices,
    final int featureIndex) {
  final int stemIndex = wordsStemIndex[featureIndex];
  final int df = stemsTfByDocument[stemIndex].length / 2;
  if (((double) df / documentCount) <= maxWordDf) {
    requiredStemIndices.set(stemIndex);
  }
}
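/*
 * Illustration (a sketch, not part of the original sources): stemsTfByDocument appears to store
 * interleaved (documentIndex, tf) pairs per stem, which is why length / 2 above gives the stem's
 * document frequency. For example, a stem occurring twice in document 0 and three times in
 * document 4 has df = 2 and is kept only if 2 / documentCount <= maxWordDf.
 */
static int documentFrequency(int[] tfByDocument) {
  // tfByDocument = {doc0, tf0, doc1, tf1, ...}, e.g. {0, 2, 4, 3} -> df == 2
  return tfByDocument.length / 2;
}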
@Override
public int encodedSize() {
  /*
   * The number of bytes goes as follows:
   *
   * You need at least as many bits as the highest 1-bit in the bitSet (equivalent
   * to bitSet.length()). Because each set bit will have an additional 2-bit "type delimiter"
   * set afterwards, we need 3 bits for every set bit, but only 1 for every non-set bit.
   *
   * This is equivalent to length() + 2 * numSetBits().
   *
   * We have 4 available bits in the header, and 7 bits in each subsequent byte (we use a
   * continuation bit).
   */
  int numBits = (int) (bitSet.length() + 2 * bitSet.cardinality());
  int numBytes = 1;
  numBits -= 4;
  if (numBits > 0) {
    numBytes += numBits / 7;
    if (numBits % 7 != 0) numBytes++;
  }
  return numBytes;
}
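/*
 * Worked example (illustrative only): for an index with bits {0, 2} set, length() == 3 and
 * cardinality() == 2, so numBits = 3 + 2 * 2 = 7. The header byte absorbs 4 of those bits,
 * leaving 3, which do not fill a whole 7-bit payload, so one more byte is added and
 * encodedSize() returns 2.
 */
static int expectedSizeForBitsZeroAndTwo() {
  int numBits = 3 + 2 * 2; // length() + 2 * cardinality()
  int numBytes = 1;        // header byte holds 4 data bits
  numBits -= 4;
  if (numBits > 0) {
    numBytes += numBits / 7;
    if (numBits % 7 != 0) numBytes++;
  }
  return numBytes;         // == 2
}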
private DataResult fetchBaseRow(KVPair mutation, WriteContext ctx, BitSet indexedColumns)
    throws IOException {
  baseGet =
      SIDriver.driver()
          .getOperationFactory()
          .newDataGet(ctx.getTxn(), mutation.getRowKey(), baseGet);

  EntryPredicateFilter epf;
  if (indexedColumns != null && indexedColumns.size() > 0) {
    epf = new EntryPredicateFilter(indexedColumns);
  } else {
    epf = EntryPredicateFilter.emptyPredicate();
  }

  TransactionalRegion region = ctx.txnRegion();
  TxnFilter txnFilter = region.packedFilter(ctx.getTxn(), epf, false);
  baseGet.setFilter(txnFilter);
  baseResult = ctx.getRegion().get(baseGet, baseResult);
  return baseResult;
}
/**
 * Translate the given base table record mutation into its associated, referencing index record.
 * <br>
 * Encapsulates the logic required to create an index record for a given base table record with
 * all the required discriminating and encoding rules (column is part of a PK, value is null,
 * etc).
 *
 * @param mutation KVPair containing the rowKey of the base table record that we want to
 *     translate to the associated index record. This mutation should already have its required
 *     {@link KVPair.Type Type} set.
 * @return A KVPair representing the index record of the given base table mutation. This KVPair is
 *     suitable for performing the required modification of the index record associated with this
 *     mutation.
 * @throws IOException for encoding/decoding problems.
 */
public KVPair translate(KVPair mutation) throws IOException {
  if (mutation == null) {
    return null;
  }

  EntryAccumulator keyAccumulator = getKeyAccumulator();
  keyAccumulator.reset();
  boolean hasNullKeyFields = false;

  /*
   * Handle index columns from the source table's primary key.
   */
  if (table.getColumnOrderingCount() > 0) {
    // we have key columns to check
    MultiFieldDecoder keyDecoder = getSrcKeyDecoder();
    keyDecoder.set(mutation.getRowKey());
    for (int i = 0; i < table.getColumnOrderingCount(); i++) {
      int sourceKeyColumnPos = table.getColumnOrdering(i);
      int indexKeyPos =
          sourceKeyColumnPos < mainColToIndexPosMap.length
              ? mainColToIndexPosMap[sourceKeyColumnPos]
              : -1;
      int offset = keyDecoder.offset();
      boolean isNull = skip(keyDecoder, table.getFormatIds(sourceKeyColumnPos));
      if (!indexedCols.get(sourceKeyColumnPos)) continue;
      if (indexKeyPos >= 0) {
        /*
         * Since primary keys have an implicit NOT NULL constraint here, we don't need to check
         * for it, and isNull==true would represent a programmer error rather than an actual
         * state the system can be in.
         */
        assert !isNull : "Programmer error: Cannot update a primary key to a null value!";
        int length = keyDecoder.offset() - offset - 1;
        /*
         * A note about sort order:
         *
         * We are in the primary key section, which means that the element is ordered in
         * ASCENDING order. In an ideal world, that wouldn't matter because
         */
        accumulate(
            keyAccumulator,
            indexKeyPos,
            table.getFormatIds(sourceKeyColumnPos),
            index.getDescColumns(indexKeyPos),
            keyDecoder.array(),
            offset,
            length);
      }
    }
  }

  /*
   * Handle non-null index columns from the source table's non-primary key columns.
   *
   * This will set indexed columns with values taken from the incoming mutation (rather than
   * backfilling them with existing values, which would occur elsewhere).
   */
  EntryDecoder rowDecoder = getSrcValueDecoder();
  rowDecoder.set(mutation.getValue());
  BitIndex bitIndex = rowDecoder.getCurrentIndex();
  MultiFieldDecoder rowFieldDecoder = rowDecoder.getEntryDecoder();
  for (int i = bitIndex.nextSetBit(0); i >= 0; i = bitIndex.nextSetBit(i + 1)) {
    if (!indexedCols.get(i)) {
      // skip non-indexed columns
      rowDecoder.seekForward(rowFieldDecoder, i);
      continue;
    }

    int keyColumnPos = i < mainColToIndexPosMap.length ? mainColToIndexPosMap[i] : -1;
    if (keyColumnPos < 0) {
      rowDecoder.seekForward(rowFieldDecoder, i);
    } else {
      int offset = rowFieldDecoder.offset();
      boolean isNull = rowDecoder.seekForward(rowFieldDecoder, i);
      hasNullKeyFields = isNull || hasNullKeyFields;
      int length;
      if (!isNull) {
        length = rowFieldDecoder.offset() - offset - 1;
        accumulate(
            keyAccumulator,
            keyColumnPos,
            table.getFormatIds(i),
            index.getDescColumns(keyColumnPos),
            rowFieldDecoder.array(),
            offset,
            length);
      } else {
        /*
         * Because the field is NULL and its source is the incoming mutation, we
         * still need to accumulate it. We must be careful, however, to accumulate the
         * proper null value.
         *
         * In theory, we could use a sparse encoding here--just accumulate a length 0 entry,
         * which will allow us to use a very short row key to determine nullity. However, that
         * doesn't work correctly, because doubles and floats at the end of the index might decode
         * the row key as a double, resulting in goofball answers.
         *
         * Instead, we must use the dense encoding approach here. That means that we must
         * select the proper dense type based on columnTypes[i]. For most data types, this is still
         * a length-0 array, but for floats and doubles it will put the proper type into place.
         */
        accumulateNull(keyAccumulator, keyColumnPos, table.getFormatIds(i));
      }
    }
  }

  /*
   * Handle NULL index columns from the source table's non-primary key columns.
   */
  for (int srcColIndex = 0; srcColIndex < mainColToIndexPosMap.length; srcColIndex++) {
    /* position of the source column within the index encoding */
    int indexColumnPosition = mainColToIndexPosMap[srcColIndex];
    if (!isSourceColumnPrimaryKey(srcColIndex)
        && indexColumnPosition >= 0
        && !bitIndex.isSet(srcColIndex)) {
      hasNullKeyFields = true;
      keyAccumulator.add(indexColumnPosition, new byte[] {}, 0, 0);
    }
  }

  // add the row key to the end of the index key
  byte[] srcRowKey = Encoding.encodeBytesUnsorted(mutation.getRowKey());

  EntryEncoder rowEncoder = getRowEncoder();
  MultiFieldEncoder entryEncoder = rowEncoder.getEntryEncoder();
  entryEncoder.reset();
  entryEncoder.setRawBytes(srcRowKey);
  byte[] indexValue = rowEncoder.encode();

  byte[] indexRowKey;
  if (index.getUnique()) {
    boolean nonUnique =
        index.getUniqueWithDuplicateNulls() && (hasNullKeyFields || !keyAccumulator.isFinished());
    indexRowKey = getIndexRowKey(srcRowKey, nonUnique);
  } else {
    indexRowKey = getIndexRowKey(srcRowKey, true);
  }

  return new KVPair(indexRowKey, indexValue, mutation.getType());
}
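/*
 * Illustration (a hypothetical sketch, not the actual Splice encoding): the comment inside
 * translate() argues for a "dense" null, i.e. a placeholder whose type matches the column so
 * that trailing float or double index columns are not mis-decoded. Conceptually the selection
 * looks like this, with FLOAT_FORMAT_ID / DOUBLE_FORMAT_ID and the typed placeholder arrays
 * standing in for whatever the real encoder uses:
 */
static byte[] denseNullFor(int formatId, byte[] floatNull, byte[] doubleNull) {
  // hypothetical format ids; in the real code the decision is driven by table.getFormatIds(i)
  final int FLOAT_FORMAT_ID = 1;
  final int DOUBLE_FORMAT_ID = 2;
  if (formatId == FLOAT_FORMAT_ID) return floatNull;   // typed placeholder for float columns
  if (formatId == DOUBLE_FORMAT_ID) return doubleNull; // typed placeholder for double columns
  return new byte[0];                                  // most types: a length-0 entry suffices
}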
/**
 * Performs the actual clustering with an assumption that all documents are written in one <code>
 * language</code>.
 */
private void cluster(LanguageCode language) {
  // Preprocessing of documents
  final PreprocessingContext context =
      preprocessingPipeline.preprocess(documents, query, language);

  // Further processing only if there are words to process
  clusters = Lists.newArrayList();
  if (context.hasLabels()) {
    // Term-document matrix building and reduction
    final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
    final ReducedVectorSpaceModelContext reducedVsmContext =
        new ReducedVectorSpaceModelContext(vsmContext);
    LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

    matrixBuilder.buildTermDocumentMatrix(vsmContext);
    matrixBuilder.buildTermPhraseMatrix(vsmContext);

    matrixReducer.reduce(
        reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

    // Cluster label building
    clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

    // Document assignment
    clusterBuilder.assignDocuments(lingoContext);

    // Cluster merging
    clusterBuilder.merge(lingoContext);

    // Format final clusters
    final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
    final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
    final double[] clusterLabelScore = lingoContext.clusterLabelScore;
    for (int i = 0; i < clusterLabelIndex.length; i++) {
      final Cluster cluster = new Cluster();

      final int labelFeature = clusterLabelIndex[i];
      if (labelFeature < 0) {
        // Cluster removed during merging
        continue;
      }

      // Add label and score
      cluster.addPhrases(labelFormatter.format(context, labelFeature));
      cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

      // Add documents
      final BitSet bs = clusterDocuments[i];
      for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
        cluster.addDocuments(documents.get(bit));
      }

      // Add cluster
      clusters.add(cluster);
    }

    Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
  }

  Cluster.appendOtherTopics(documents, clusters);
}
@Override
public int nextSetBit(int currentPosition) {
  return bitSet.nextSetBit(currentPosition);
}
/**
 * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a
 * greedy process of compacting clusters with document sets that overlap by a certain ratio. In
 * other words, phrases that "cover" nearly identical document sets will be conflated.
 */
private ArrayList<ClusterCandidate> createMergedClusters(ArrayList<ClusterCandidate> baseClusters) {
  /*
   * Calculate overlap between base clusters first, saving adjacency lists for
   * each base cluster.
   */

  // [i] - next neighbor or END, [i + 1] - neighbor cluster index.
  final int END = -1;
  final IntStack neighborList = new IntStack();
  neighborList.push(END);

  final int[] neighbors = new int[baseClusters.size()];
  final float m = (float) mergeThreshold;
  for (int i = 0; i < baseClusters.size(); i++) {
    for (int j = i + 1; j < baseClusters.size(); j++) {
      final ClusterCandidate c1 = baseClusters.get(i);
      final ClusterCandidate c2 = baseClusters.get(j);
      final float a = c1.cardinality;
      final float b = c2.cardinality;
      final float c = BitSet.intersectionCount(c1.documents, c2.documents);

      if (c / a > m && c / b > m) {
        neighborList.push(neighbors[i], j);
        neighbors[i] = neighborList.size() - 2;
        neighborList.push(neighbors[j], i);
        neighbors[j] = neighborList.size() - 2;
      }
    }
  }

  /*
   * Find connected components in the similarity graph using Tarjan's algorithm
   * (flattened to use the stack instead of recursion).
   */
  final int NO_INDEX = -1;
  final int[] merged = new int[baseClusters.size()];
  Arrays.fill(merged, NO_INDEX);

  final ArrayList<ClusterCandidate> mergedClusters =
      Lists.newArrayListWithCapacity(baseClusters.size());
  final IntStack stack = new IntStack(baseClusters.size());
  final IntStack mergeList = new IntStack(baseClusters.size());
  int mergedIndex = 0;
  for (int v = 0; v < baseClusters.size(); v++) {
    if (merged[v] != NO_INDEX) continue;

    // Recursively mark all connected components from an unmerged cluster.
    stack.push(v);
    while (stack.size() > 0) {
      final int c = stack.pop();
      assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
      if (merged[c] == mergedIndex) continue;

      merged[c] = mergedIndex;
      mergeList.push(c);

      for (int i = neighbors[c]; neighborList.get(i) != END; ) {
        final int neighbor = neighborList.get(i + 1);
        if (merged[neighbor] == NO_INDEX) {
          stack.push(neighbor);
        } else {
          assert merged[neighbor] == mergedIndex;
        }
        i = neighborList.get(i);
      }
    }
    mergedIndex++;

    /*
     * Aggregate documents from each base cluster of the current merge, compute
     * the score and labels.
     */
    mergedClusters.add(merge(mergeList, baseClusters));
    mergeList.clear();
  }

  /*
   * Sort merged clusters.
   */
  Collections.sort(
      mergedClusters,
      new Comparator<ClusterCandidate>() {
        public int compare(ClusterCandidate c1, ClusterCandidate c2) {
          if (c1.score < c2.score) return 1;
          if (c1.score > c2.score) return -1;
          if (c1.cardinality < c2.cardinality) return 1;
          if (c1.cardinality > c2.cardinality) return -1;
          return 0;
        }
      });

  if (mergedClusters.size() > maxClusters) {
    mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
  }
  return mergedClusters;
}
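/*
 * Illustration (a sketch, not part of the original sources): the neighborList / neighbors pair
 * above is a singly linked adjacency list flattened into one int array. Each edge occupies two
 * slots, [pointer to the previous head, neighbor index], index 0 holds the END sentinel, and
 * neighbors[v] points at the head cell for cluster v. A minimal re-implementation with a plain
 * ArrayList shows the same push/traverse pattern:
 */
static void adjacencyListDemo() {
  final int END = -1;
  java.util.List<Integer> list = new java.util.ArrayList<>();
  list.add(END);            // sentinel at index 0
  int[] heads = new int[3]; // heads[v] == 0 means "no neighbors yet"

  // add edge 0 -> 2: push (old head, neighbor), then point the head at the new cell
  list.add(heads[0]);
  list.add(2);
  heads[0] = list.size() - 2;

  // add edge 0 -> 1
  list.add(heads[0]);
  list.add(1);
  heads[0] = list.size() - 2;

  // traverse neighbors of 0 (prints 1 then 2, most recently added first)
  for (int i = heads[0]; list.get(i) != END; i = list.get(i)) {
    System.out.println(list.get(i + 1));
  }
}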
@Override
public BitSet and(BitSet bitSet) {
  final BitSet result = (BitSet) this.bitSet.clone();
  result.and(bitSet);
  return result;
}

@Override
public boolean isSet(int pos) {
  return bitSet.get(pos);
}

@Override
public int length() {
  return (int) bitSet.length();
}

@Override
public boolean isDoubleType(int position) {
  return doubleFields != null && doubleFields.get(position);
}

@Override
public boolean isScalarType(int position) {
  return scalarFields != null && scalarFields.get(position);
}

@Override
public boolean isFloatType(int position) {
  return floatFields != null && floatFields.get(position);
}

@Override
public int cardinality() {
  return (int) bitSet.cardinality();
}

@Override
public boolean isEmpty() {
  return bitSet.isEmpty();
}