@Override
 public byte[] encode() {
   byte[] bytes = new byte[encodedSize()];
   bytes[0] = (byte) 0x80;
   BitWriter bitWriter = new BitWriter(bytes, 0, bytes.length, 5, true);
   int lastSetBit = -1;
   for (int setPos = bitSet.nextSetBit(0); setPos >= 0; setPos = bitSet.nextSetBit(setPos + 1)) {
     // skip the distance between setPos and lastSetBit
     bitWriter.skip(setPos - lastSetBit - 1);
     /*
      * Because this field is present, we need to use 2 bits to indicate the
      * type information necessary to parse. The format for the type bit is
      *
      * Untyped: 00
      * Double: 01
      * Float: 10
      * Scalar: 11
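       *
       * For example, a set scalar field is written as the bits 111 (the presence bit
       * followed by the type bits 11), while a set untyped field is written as 100.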
      */
     if (scalarFields != null && scalarFields.get(setPos)) {
       bitWriter.set(3);
     } else if (floatFields != null && floatFields.get(setPos)) {
       bitWriter.set(2);
       bitWriter.skipNext();
     } else if (doubleFields != null && doubleFields.get(setPos)) {
       bitWriter.setNext();
       bitWriter.skipNext();
       bitWriter.setNext();
     } else {
       bitWriter.setNext();
       bitWriter.skip(2);
     }
     lastSetBit = setPos;
   }
   return bytes;
 }
  /**
   * Computes stem indices of words that are one-word label candidates or are non-stop words from
   * phrase label candidates.
   */
  private int[] computeRequiredStemIndices(PreprocessingContext context) {
    final int[] labelsFeatureIndex = context.allLabels.featureIndex;
    final int[] wordsStemIndex = context.allWords.stemIndex;
    final short[] wordsTypes = context.allWords.type;
    final int[][] phrasesWordIndices = context.allPhrases.wordIndices;
    final int wordCount = wordsStemIndex.length;

    final int[][] stemsTfByDocument = context.allStems.tfByDocument;
    int documentCount = context.documents.size();
    final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);

    for (int i = 0; i < labelsFeatureIndex.length; i++) {
      final int featureIndex = labelsFeatureIndex[i];
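      // Feature indices below wordCount denote single words; larger values denote
      // phrases, offset by wordCount into allPhrases.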
      if (featureIndex < wordCount) {
        addStemIndex(
            wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, featureIndex);
      } else {
        final int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
        for (int j = 0; j < wordIndices.length; j++) {
          final int wordIndex = wordIndices[j];
          if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) {
            addStemIndex(
                wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, wordIndex);
          }
        }
      }
    }

    return requiredStemIndices.asIntLookupContainer().toArray();
  }
 @Override
 public int cardinality(int position) {
   int count = 0;
   for (int i = bitSet.nextSetBit(0); i >= 0 && i < position; i = bitSet.nextSetBit(i + 1)) {
     count++;
   }
   return count;
 }
 @Override
 public int hashCode() {
   int result = bitSet.hashCode();
   result = 31 * result + (scalarFields != null ? scalarFields.hashCode() : 0);
   result = 31 * result + (floatFields != null ? floatFields.hashCode() : 0);
   result = 31 * result + (doubleFields != null ? doubleFields.hashCode() : 0);
   return result;
 }
  public static void main(final String[] args) {
    final com.carrotsearch.hppc.BitSet hppcBitSet =
        new com.carrotsearch.hppc.BitSet(Long.MAX_VALUE);
    hppcBitSet.set(Integer.MAX_VALUE);
    final java.util.BitSet javaBitSet = new java.util.BitSet(Integer.MAX_VALUE);
    javaBitSet.set(Integer.MAX_VALUE - 1);

    System.out.println(ObjectSizeCalculator.getObjectSize(hppcBitSet));
    System.out.println(ObjectSizeCalculator.getObjectSize(javaBitSet));
  }
  /** Collect documents from a bitset. */
  private List<Document> collectDocuments(List<Document> l, BitSet bitset) {
    if (l == null) {
      l = Lists.newArrayListWithCapacity((int) bitset.cardinality());
    }

    final BitSetIterator i = bitset.iterator();
    for (int d = i.nextSetBit(); d >= 0; d = i.nextSetBit()) {
      l.add(documents.get(d));
    }
    return l;
  }
 @Override
 public BitSet findMatchings(
     T expectedElement, List<T> annotatorResult, BitSet alreadyUsedResults) {
   BitSet matchings;
   matchings =
       matchingsCounter[0].findMatchings(expectedElement, annotatorResult, alreadyUsedResults);
   for (int i = 1; (i < matchingsCounter.length) && (!matchings.isEmpty()); i++) {
     matchings.intersect(
         matchingsCounter[i].findMatchings(expectedElement, annotatorResult, alreadyUsedResults));
   }
   return matchings;
 }
  private ByteEntryAccumulator getKeyAccumulator() {
    if (indexKeyAccumulator == null) {
      BitSet keyFields = new BitSet();
      for (int keyColumn : mainColToIndexPosMap) {
        if (keyColumn >= 0) keyFields.set(keyColumn);
      }
      indexKeyAccumulator =
          new ByteEntryAccumulator(EntryPredicateFilter.emptyPredicate(), keyFields);
    }

    return indexKeyAccumulator;
  }
  public static BitIndex wrap(byte[] data, int position, int limit) {
    // create a BitSet underneath
    BitSet bitSet = new BitSet();
    BitSet scalarFields = new BitSet();
    BitSet floatFields = new BitSet();
    BitSet doubleFields = new BitSet();
    BitReader bitReader = new BitReader(data, position, limit, 5, true);

    int bitPos = 0;
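    // Mirror the encoder above: each present field is preceded by a run of zeros (one per
    // absent field) and followed by two type bits (11 = scalar, 10 = float, 01 = double,
    // 00 = untyped).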
    while (bitReader.hasNext()) {
      int zeros = bitReader.nextSetBit();
      if (zeros < 0) break;
      bitPos += zeros;
      bitSet.set(bitPos);
      if (bitReader.next() != 0) {
        // either float or scalar
        if (bitReader.next() != 0) {
          scalarFields.set(bitPos);
        } else floatFields.set(bitPos);
      } else {
        // either a double or untyped
        if (bitReader.next() != 0) doubleFields.set(bitPos);
      }
      bitPos++;
    }
    return new UncompressedBitIndex(bitSet, scalarFields, floatFields, doubleFields);
  }
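  // A minimal round-trip sketch, assuming this wrap(...) factory and the encode()/encodedSize()
  // methods shown above live on the same UncompressedBitIndex class:
  //
  //   byte[] encoded = index.encode();
  //   BitIndex decoded = UncompressedBitIndex.wrap(encoded, 0, encoded.length);
  //   assert decoded.equals(index);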
  @Test
  public void testAlwaysAcceptEntryWorks() throws Exception {
    BitSet fields = new BitSet();
    fields.set(0);
    fields.set(2);
    EntryPredicateFilter predicateFilter = new EntryPredicateFilter(fields);
    EntryAccumulator accumulator = new ByteEntryAccumulator(predicateFilter, false, null);
    byte[] encodedOne = Encoding.encode(1);
    accumulator.add(2, encodedOne, 0, encodedOne.length);
    byte[] encodedTwo = Encoding.encode(2);
    accumulator.add(0, encodedTwo, 0, encodedTwo.length);

    byte[] bytes = accumulator.finish();
    MultiFieldDecoder decoder = MultiFieldDecoder.wrap(bytes);
    Assert.assertEquals(2, decoder.decodeNextInt());
    Assert.assertEquals(1, decoder.decodeNextInt());
  }
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof UncompressedBitIndex)) return false;

    UncompressedBitIndex that = (UncompressedBitIndex) o;

    if (!bitSet.equals(that.bitSet)) return false;
    if (doubleFields != null ? !doubleFields.equals(that.doubleFields) : that.doubleFields != null)
      return false;
    if (floatFields != null ? !floatFields.equals(that.floatFields) : that.floatFields != null)
      return false;
    if (scalarFields != null ? !scalarFields.equals(that.scalarFields) : that.scalarFields != null)
      return false;

    return true;
  }
 /**
  * Do we need to update the index, i.e. did any of the values change?
  *
  * @param mutation the incoming write whose encoded values should be inspected
  * @param indexedColumns the column positions covered by the index
  * @return true if the mutation sets at least one indexed column
  */
 public boolean areIndexKeysModified(KVPair mutation, BitSet indexedColumns) {
   EntryDecoder newPutDecoder = new EntryDecoder();
   newPutDecoder.set(mutation.getValue());
   BitIndex updateIndex = newPutDecoder.getCurrentIndex();
   for (int i = updateIndex.nextSetBit(0); i >= 0; i = updateIndex.nextSetBit(i + 1)) {
     if (indexedColumns.get(i)) return true;
   }
   return false;
 }
  /**
   * Create the junk (unassigned documents) cluster and create the final set of clusters in Carrot2
   * format.
   */
  private void postProcessing(ArrayList<ClusterCandidate> clusters) {
    // Adapt to Carrot2 classes, counting used documents on the way.
    final BitSet all = new BitSet(documents.size());
    final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
    final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
    for (ClusterCandidate c : clusters) {
      final Cluster c2 = new Cluster();
      c2.addPhrases(collectPhrases(phrases, c));
      c2.addDocuments(collectDocuments(docs, c.documents));
      c2.setScore((double) c.score);
      this.clusters.add(c2);

      all.or(c.documents);
      docs.clear();
      phrases.clear();
    }

    Cluster.appendOtherTopics(this.documents, this.clusters);
  }
 private EntryEncoder getRowEncoder() {
   if (indexValueEncoder == null) {
     BitSet nonNullFields = new BitSet();
     int highestSetPosition = 0;
     for (int keyColumn : mainColToIndexPosMap) {
       if (keyColumn > highestSetPosition) highestSetPosition = keyColumn;
     }
     nonNullFields.set(highestSetPosition + 1);
     indexValueEncoder =
         EntryEncoder.create(
             SpliceKryoRegistry.getInstance(),
             1,
             nonNullFields,
             new BitSet(),
             new BitSet(),
             new BitSet());
   }
   return indexValueEncoder;
 }
 /** Adds stem index to the set with a check on the stem's document frequency. */
 private void addStemIndex(
     final int[] wordsStemIndex,
     int documentCount,
     int[][] stemsTfByDocument,
     final BitSet requiredStemIndices,
     final int featureIndex) {
   final int stemIndex = wordsStemIndex[featureIndex];
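    // tfByDocument stores flattened (documentIndex, tf) pairs, so half its length is the
    // stem's document frequency.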
   final int df = stemsTfByDocument[stemIndex].length / 2;
   if (((double) df / documentCount) <= maxWordDf) {
     requiredStemIndices.set(stemIndex);
   }
 }
  @Override
  public int encodedSize() {
    /*
     * The number of bytes goes as follows:
     *
     * You need at least as many bits as the highest 1-bit in the bitSet (equivalent
     * to bitSet.length()). Because each set bit will have an additional 2-bit "type delimiter"
     * set afterwards, we need 3 bits for every set bit, but 1 for every non-set bit.
     *
     * This is equivalent to length() + 2*numSetBits().
     *
     * We have 4 available bits in the header, and 7 bits in each subsequent byte (we use a
     * continuation bit).
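     *
     * For example, with the highest set bit at index 5 (length() == 6) and 3 set bits,
     * numBits = 6 + 2*3 = 12; the header holds 4 of those and the remaining 8 need
     * ceil(8/7) = 2 more bytes, for 3 bytes in total.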
     */
    int numBits = (int) (bitSet.length() + 2 * bitSet.cardinality());
    int numBytes = 1;
    numBits -= 4;
    if (numBits > 0) {
      numBytes += numBits / 7;
      if (numBits % 7 != 0) numBytes++;
    }

    return numBytes;
  }
  private DataResult fetchBaseRow(KVPair mutation, WriteContext ctx, BitSet indexedColumns)
      throws IOException {
    baseGet =
        SIDriver.driver()
            .getOperationFactory()
            .newDataGet(ctx.getTxn(), mutation.getRowKey(), baseGet);

    EntryPredicateFilter epf;
    if (indexedColumns != null && indexedColumns.size() > 0) {
      epf = new EntryPredicateFilter(indexedColumns);
    } else epf = EntryPredicateFilter.emptyPredicate();

    TransactionalRegion region = ctx.txnRegion();
    TxnFilter txnFilter = region.packedFilter(ctx.getTxn(), epf, false);
    baseGet.setFilter(txnFilter);
    baseResult = ctx.getRegion().get(baseGet, baseResult);
    return baseResult;
  }
  /**
   * Translate the given base table record mutation into its associated, referencing index record.
   * <br>
   * Encapsulates the logic required to create an index record for a given base table record with
   * all the required discriminating and encoding rules (column is part of a PK, value is null,
   * etc).
   *
   * @param mutation KVPair containing the rowKey of the base table record that we want to
   *     translate to the associated index. This mutation should already have its required {@link
   *     KVPair.Type Type} set.
   * @return A KVPair representing the index record of the given base table mutation. This KVPair is
   *     suitable for performing the required modification of the index record associated with this
   *     mutation.
   * @throws IOException for encoding/decoding problems.
   */
  public KVPair translate(KVPair mutation) throws IOException {
    if (mutation == null) {
      return null;
    }

    EntryAccumulator keyAccumulator = getKeyAccumulator();
    keyAccumulator.reset();
    boolean hasNullKeyFields = false;

    /*
     * Handle index columns from the source table's primary key.
     */
    if (table.getColumnOrderingCount() > 0) {
      // we have key columns to check
      MultiFieldDecoder keyDecoder = getSrcKeyDecoder();
      keyDecoder.set(mutation.getRowKey());
      for (int i = 0; i < table.getColumnOrderingCount(); i++) {
        int sourceKeyColumnPos = table.getColumnOrdering(i);

        int indexKeyPos =
            sourceKeyColumnPos < mainColToIndexPosMap.length
                ? mainColToIndexPosMap[sourceKeyColumnPos]
                : -1;
        int offset = keyDecoder.offset();
        boolean isNull = skip(keyDecoder, table.getFormatIds(sourceKeyColumnPos));
        if (!indexedCols.get(sourceKeyColumnPos)) continue;
        if (indexKeyPos >= 0) {
          /*
           * since primary keys have an implicit NOT NULL constraint here, we don't need to check for it,
           * and isNull==true would represent a programmer error, rather than an actual state the
           * system can be in.
           */
          assert !isNull : "Programmer error: Cannot update a primary key to a null value!";
          int length = keyDecoder.offset() - offset - 1;
          /*
           * A note about sort order:
           *
           * We are in the primary key section, which means that the element is ordered in
           * ASCENDING order. In an ideal world, that wouldn't matter because
           */
          accumulate(
              keyAccumulator,
              indexKeyPos,
              table.getFormatIds(sourceKeyColumnPos),
              index.getDescColumns(indexKeyPos),
              keyDecoder.array(),
              offset,
              length);
        }
      }
    }

    /*
     * Handle non-null index columns from the source table's non-primary key columns.
     *
     * This will set indexed columns with values taken from the incoming mutation (rather than
     * backfilling them with existing values, which would occur elsewhere).
     */
    EntryDecoder rowDecoder = getSrcValueDecoder();
    rowDecoder.set(mutation.getValue());
    BitIndex bitIndex = rowDecoder.getCurrentIndex();
    MultiFieldDecoder rowFieldDecoder = rowDecoder.getEntryDecoder();
    for (int i = bitIndex.nextSetBit(0); i >= 0; i = bitIndex.nextSetBit(i + 1)) {
      if (!indexedCols.get(i)) {
        // skip non-indexed columns
        rowDecoder.seekForward(rowFieldDecoder, i);
        continue;
      }
      int keyColumnPos = i < mainColToIndexPosMap.length ? mainColToIndexPosMap[i] : -1;
      if (keyColumnPos < 0) {
        rowDecoder.seekForward(rowFieldDecoder, i);
      } else {
        int offset = rowFieldDecoder.offset();
        boolean isNull = rowDecoder.seekForward(rowFieldDecoder, i);
        hasNullKeyFields = isNull || hasNullKeyFields;
        int length;
        if (!isNull) {
          length = rowFieldDecoder.offset() - offset - 1;
          accumulate(
              keyAccumulator,
              keyColumnPos,
              table.getFormatIds(i),
              index.getDescColumns(keyColumnPos),
              rowFieldDecoder.array(),
              offset,
              length);
        } else {
          /*
           * Because the field is NULL and its source is the incoming mutation, we
           * still need to accumulate it. We must be careful, however, to accumulate the
           * proper null value.
           *
           * In theory, we could use a sparse encoding here--just accumulate a length 0 entry,
           * which will allow us to use a very short row key to determine nullity. However, that
           * doesn't work correctly, because doubles and floats at the end of the index might decode
           * the row key as a double, resulting in goofball answers.
           *
           * Instead, we must use the dense encoding approach here. That means that we must
           * select the proper dense type based on columnTypes[i]. For most data types, this is still
           * a length-0 array, but for floats and doubles it will put the proper type into place.
           */
          accumulateNull(keyAccumulator, keyColumnPos, table.getFormatIds(i));
        }
      }
    }

    /*
     * Handle NULL index columns from the source table's non-primary key columns.
     */
    for (int srcColIndex = 0; srcColIndex < mainColToIndexPosMap.length; srcColIndex++) {
      /* position of the source column within the index encoding */
      int indexColumnPosition = mainColToIndexPosMap[srcColIndex];
      if (!isSourceColumnPrimaryKey(srcColIndex)
          && indexColumnPosition >= 0
          && !bitIndex.isSet(srcColIndex)) {
        hasNullKeyFields = true;
        keyAccumulator.add(indexColumnPosition, new byte[] {}, 0, 0);
      }
    }

    // add the row key to the end of the index key
    byte[] srcRowKey = Encoding.encodeBytesUnsorted(mutation.getRowKey());

    EntryEncoder rowEncoder = getRowEncoder();
    MultiFieldEncoder entryEncoder = rowEncoder.getEntryEncoder();
    entryEncoder.reset();
    entryEncoder.setRawBytes(srcRowKey);
    byte[] indexValue = rowEncoder.encode();

    byte[] indexRowKey;
    if (index.getUnique()) {
      boolean nonUnique =
          index.getUniqueWithDuplicateNulls() && (hasNullKeyFields || !keyAccumulator.isFinished());
      indexRowKey = getIndexRowKey(srcRowKey, nonUnique);
    } else indexRowKey = getIndexRowKey(srcRowKey, true);

    return new KVPair(indexRowKey, indexValue, mutation.getType());
  }
  /**
   * Performs the actual clustering with an assumption that all documents are written in one <code>
   * language</code>.
   */
  private void cluster(LanguageCode language) {
    // Preprocessing of documents
    final PreprocessingContext context =
        preprocessingPipeline.preprocess(documents, query, language);

    // Further processing only if there are words to process
    clusters = Lists.newArrayList();
    if (context.hasLabels()) {
      // Term-document matrix building and reduction
      final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
      final ReducedVectorSpaceModelContext reducedVsmContext =
          new ReducedVectorSpaceModelContext(vsmContext);
      LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

      matrixBuilder.buildTermDocumentMatrix(vsmContext);
      matrixBuilder.buildTermPhraseMatrix(vsmContext);

      matrixReducer.reduce(
          reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

      // Cluster label building
      clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

      // Document assignment
      clusterBuilder.assignDocuments(lingoContext);

      // Cluster merging
      clusterBuilder.merge(lingoContext);

      // Format final clusters
      final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
      final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
      final double[] clusterLabelScore = lingoContext.clusterLabelScore;
      for (int i = 0; i < clusterLabelIndex.length; i++) {
        final Cluster cluster = new Cluster();

        final int labelFeature = clusterLabelIndex[i];
        if (labelFeature < 0) {
          // Cluster removed during merging
          continue;
        }

        // Add label and score
        cluster.addPhrases(labelFormatter.format(context, labelFeature));
        cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

        // Add documents
        final BitSet bs = clusterDocuments[i];
        for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
          cluster.addDocuments(documents.get(bit));
        }

        // Add cluster
        clusters.add(cluster);
      }

      Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
    }

    Cluster.appendOtherTopics(documents, clusters);
  }
 @Override
 public int nextSetBit(int currentPosition) {
   return bitSet.nextSetBit(currentPosition);
 }
  /**
   * Create final clusters by merging base clusters and pruning their labels. Cluster merging is a
   * greedy process of compacting clusters with document sets that overlap by a certain ratio. In
   * other words, phrases that "cover" nearly identical document sets will be conflated.
   */
  private ArrayList<ClusterCandidate> createMergedClusters(
      ArrayList<ClusterCandidate> baseClusters) {
    /*
     * Calculate overlap between base clusters first, saving adjacency lists for
     * each base cluster.
     */

    // [i] - next neighbor or END, [i + 1] - neighbor cluster index.
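    // neighbors[x] holds the index of the head entry of cluster x's list inside
    // neighborList (initially 0, which holds the END sentinel); pushing the pair
    // (previous head, j) prepends neighbor j to that list.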
    final int END = -1;
    final IntStack neighborList = new IntStack();
    neighborList.push(END);
    final int[] neighbors = new int[baseClusters.size()];
    final float m = (float) mergeThreshold;
    for (int i = 0; i < baseClusters.size(); i++) {
      for (int j = i + 1; j < baseClusters.size(); j++) {
        final ClusterCandidate c1 = baseClusters.get(i);
        final ClusterCandidate c2 = baseClusters.get(j);

        final float a = c1.cardinality;
        final float b = c2.cardinality;
        final float c = BitSet.intersectionCount(c1.documents, c2.documents);

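        // Merge candidates when the intersection covers more than mergeThreshold
        // of both clusters' document sets.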
        if (c / a > m && c / b > m) {
          neighborList.push(neighbors[i], j);
          neighbors[i] = neighborList.size() - 2;
          neighborList.push(neighbors[j], i);
          neighbors[j] = neighborList.size() - 2;
        }
      }
    }

    /*
     * Find connected components in the similarity graph using Tarjan's algorithm
     * (flattened to use the stack instead of recursion).
     */

    final int NO_INDEX = -1;
    final int[] merged = new int[baseClusters.size()];
    Arrays.fill(merged, NO_INDEX);

    final ArrayList<ClusterCandidate> mergedClusters =
        Lists.newArrayListWithCapacity(baseClusters.size());
    final IntStack stack = new IntStack(baseClusters.size());
    final IntStack mergeList = new IntStack(baseClusters.size());
    int mergedIndex = 0;
    for (int v = 0; v < baseClusters.size(); v++) {
      if (merged[v] != NO_INDEX) continue;

      // Recursively mark all connected components from an unmerged cluster.
      stack.push(v);
      while (stack.size() > 0) {
        final int c = stack.pop();

        assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
        if (merged[c] == mergedIndex) continue;

        merged[c] = mergedIndex;
        mergeList.push(c);

        for (int i = neighbors[c]; neighborList.get(i) != END; ) {
          final int neighbor = neighborList.get(i + 1);
          if (merged[neighbor] == NO_INDEX) {
            stack.push(neighbor);
          } else {
            assert merged[neighbor] == mergedIndex;
          }
          i = neighborList.get(i);
        }
      }
      mergedIndex++;

      /*
       * Aggregate documents from each base cluster of the current merge, compute
       * the score and labels.
       */
      mergedClusters.add(merge(mergeList, baseClusters));
      mergeList.clear();
    }

    /*
     * Sort merged clusters.
     */
    Collections.sort(
        mergedClusters,
        new Comparator<ClusterCandidate>() {
          public int compare(ClusterCandidate c1, ClusterCandidate c2) {
            if (c1.score < c2.score) return 1;
            if (c1.score > c2.score) return -1;
            if (c1.cardinality < c2.cardinality) return 1;
            if (c1.cardinality > c2.cardinality) return -1;
            return 0;
          }
        });

    if (mergedClusters.size() > maxClusters) {
      mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
    }

    return mergedClusters;
  }
 @Override
 public BitSet and(BitSet bitSet) {
   final BitSet result = (BitSet) this.bitSet.clone();
   result.and(bitSet);
   return result;
 }
 @Override
 public boolean isSet(int pos) {
   return bitSet.get(pos);
 }
 @Override
 public int length() {
   return (int) bitSet.length();
 }
 @Override
 public boolean isDoubleType(int position) {
   return doubleFields != null && doubleFields.get(position);
 }
 @Override
 public boolean isScalarType(int position) {
   return scalarFields != null && scalarFields.get(position);
 }
 @Override
 public boolean isFloatType(int position) {
   return floatFields != null && floatFields.get(position);
 }
 @Override
 public int cardinality() {
   return (int) bitSet.cardinality();
 }
 @Override
 public boolean isEmpty() {
   return bitSet.isEmpty();
 }