Ejemplo n.º 1
  /**
   * Builds a sparse term-document-like matrix for the features listed in {@code featureIndex}, in
   * the same term space as the original term-document matrix.
   */
  static DoubleMatrix2D buildAlignedMatrix(
      VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
    final IntIntHashMap stemToRowIndex = vsmContext.stemToRowIndex;
    if (featureIndex.length == 0) {
      return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);

    final DoubleMatrix2D phraseMatrix =
        new SparseDoubleMatrix2D(stemToRowIndex.size(), featureIndex.length);

    final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
    final int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
    final int[] stemsTf = preprocessingContext.allStems.tf;
    final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
    final int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
    final int documentCount = preprocessingContext.documents.size();
    final int wordCount = wordsStemIndex.length;

    for (int i = 0; i < featureIndex.length; i++) {
      final int feature = featureIndex[i];
      final int[] wordIndices;
      if (feature < wordCount) {
        wordIndices = new int[] {feature};
      } else {
        wordIndices = phrasesWordIndices[feature - wordCount];

      for (int wordIndex = 0; wordIndex < wordIndices.length; wordIndex++) {
        final int stemIndex = wordsStemIndex[wordIndices[wordIndex]];
        final int index = stemToRowIndex.indexOf(stemIndex);
        if (stemToRowIndex.indexExists(index)) {
          final int rowIndex = stemToRowIndex.indexGet(index);

          double weight =
                  stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2, documentCount);

          phraseMatrix.setQuick(rowIndex, i, weight);

    return phraseMatrix;
Ejemplo n.º 2
  /**
   * Builds a term-document matrix from the data provided in {@code vsmContext} and stores the
   * result back in that context.
   */
  public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
    final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;

    final int documentCount = preprocessingContext.documents.size();
    final int[] stemsTf = preprocessingContext.allStems.tf;
    final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
    final byte[] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

    if (documentCount == 0) {
      vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
      vsmContext.stemToRowIndex = new IntIntHashMap();

    // Determine the index of the title field
    int titleFieldIndex = -1;
    final String[] fieldsName = preprocessingContext.allFields.name;
    for (int i = 0; i < fieldsName.length; i++) {
      if (Document.TITLE.equals(fieldsName[i])) {
        titleFieldIndex = i;

    // Determine the stems we, ideally, should include in the matrix
    int[] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

    // Sort stems by weight, so that stems get included in the matrix in the order
    // of frequency
    final double[] stemsWeight = new double[stemsToInclude.length];
    for (int i = 0; i < stemsToInclude.length; i++) {
      final int stemIndex = stemsToInclude[i];
      stemsWeight[i] =
                  stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2, documentCount)
              * getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
    final int[] stemWeightOrder =
            0, stemsWeight.length, new IndirectComparator.DescendingDoubleComparator(stemsWeight));

    // Calculate the number of terms we can include to fulfill the max matrix size
    final int maxRows = maximumMatrixSize / documentCount;
    final DoubleMatrix2D tdMatrix =
        new DenseDoubleMatrix2D(Math.min(maxRows, stemsToInclude.length), documentCount);

    for (int i = 0; i < stemWeightOrder.length && i < maxRows; i++) {
      final int stemIndex = stemsToInclude[stemWeightOrder[i]];
      final int[] tfByDocument = stemsTfByDocument[stemIndex];
      final int df = tfByDocument.length / 2;
      final byte fieldIndices = stemsFieldIndices[stemIndex];

      for (int j = 0; j < df; j++) {
        double weight =
            termWeighting.calculateTermWeight(tfByDocument[j * 2 + 1], df, documentCount);

        weight *= getWeightBoost(titleFieldIndex, fieldIndices);
        tdMatrix.set(i, tfByDocument[j * 2], weight);

    // Convert stemsToInclude into tdMatrixStemIndices
    final IntIntHashMap stemToRowIndex = new IntIntHashMap();
    for (int i = 0; i < stemWeightOrder.length && i < tdMatrix.rows(); i++) {
      stemToRowIndex.put(stemsToInclude[stemWeightOrder[i]], i);

    // Store the results
    vsmContext.termDocumentMatrix = tdMatrix;
    vsmContext.stemToRowIndex = stemToRowIndex;