コード例 #1
0
  /**
   * Computes the term-phrase matrix for the given context and stores it in {@link
   * VectorSpaceModelContext#termPhraseMatrix}.
   *
   * <p>The feature indices of all labels starting at {@code allLabels.firstPhraseIndex} are handed
   * to {@code TermDocumentMatrixBuilder.buildAlignedMatrix} together with this builder's term
   * weighting. The resulting matrix is passed through {@code MatrixUtils.normalizeColumnL2} and its
   * {@code viewDice()} view is what ends up in the context.
   *
   * <p>Nothing is assigned when {@code firstPhraseIndex} is negative or when {@code
   * context.stemToRowIndex} has no entries; in that case {@code termPhraseMatrix} keeps whatever
   * value it had before the call.
   *
   * @param context the vector space model context that supplies the input data and receives the
   *     result
   */
  public void buildTermPhraseMatrix(VectorSpaceModelContext context) {
    final PreprocessingContext preprocessing = context.preprocessingContext;
    final IntIntHashMap rowByStem = context.stemToRowIndex;
    final int[] allLabelFeatures = preprocessing.allLabels.featureIndex;
    final int phraseStart = preprocessing.allLabels.firstPhraseIndex;

    // Bail out when there are no phrase labels or no stem rows to align against.
    if (phraseStart < 0 || rowByStem.size() <= 0) {
      return;
    }

    // Phrase labels occupy the tail of the label feature array, from phraseStart to the end.
    final int[] phraseFeatures = new int[allLabelFeatures.length - phraseStart];
    System.arraycopy(allLabelFeatures, phraseStart, phraseFeatures, 0, phraseFeatures.length);

    final DoubleMatrix2D alignedPhrases =
        TermDocumentMatrixBuilder.buildAlignedMatrix(context, phraseFeatures, termWeighting);
    MatrixUtils.normalizeColumnL2(alignedPhrases, null);

    context.termPhraseMatrix = alignedPhrases.viewDice();
  }
コード例 #2
0
  /**
   * Computes the term-document matrix and the stem-to-row mapping for the given context and stores
   * both in it ({@code termDocumentMatrix} and {@code stemToRowIndex}).
   *
   * <p>When the preprocessing context holds no documents, an empty 0x0 matrix and an empty mapping
   * are stored. Otherwise the stems returned by {@code computeRequiredStemIndices} are weighted,
   * ordered with a {@code DescendingDoubleComparator} over those weights, and assigned to matrix
   * rows in that order. The number of rows is capped at {@code maximumMatrixSize / documentCount};
   * stems beyond the cap are left out of both the matrix and the mapping.
   *
   * <p>Each stem's {@code tfByDocument} array is read as consecutive pairs: the first element of a
   * pair is used as the matrix column and the second as the term frequency; half of the array
   * length is used as the document frequency.
   *
   * @param vsmContext the vector space model context that supplies the input data and receives the
   *     results
   */
  public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
    final PreprocessingContext preprocessing = vsmContext.preprocessingContext;

    final int documentCount = preprocessing.documents.size();
    final int[] totalTfByStem = preprocessing.allStems.tf;
    final int[][] tfPairsByStem = preprocessing.allStems.tfByDocument;
    final byte[] fieldIndicesByStem = preprocessing.allStems.fieldIndices;

    // Without documents there is nothing to weigh: publish empty results and stop.
    if (documentCount == 0) {
      vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
      vsmContext.stemToRowIndex = new IntIntHashMap();
      return;
    }

    // Find the first field named Document.TITLE; stays at -1 when no such field exists.
    int titleField = -1;
    final String[] fieldNames = preprocessing.allFields.name;
    for (int f = 0; titleField < 0 && f < fieldNames.length; f++) {
      if (Document.TITLE.equals(fieldNames[f])) {
        titleField = f;
      }
    }

    // Stems that would ideally all get a row in the matrix.
    final int[] candidateStems = computeRequiredStemIndices(preprocessing);

    // Weigh every candidate so that rows can be handed out in comparator order.
    final double[] candidateWeights = new double[candidateStems.length];
    int slot = 0;
    for (final int stem : candidateStems) {
      candidateWeights[slot++] =
          termWeighting.calculateTermWeight(
                  totalTfByStem[stem], tfPairsByStem[stem].length / 2, documentCount)
              * getWeightBoost(titleField, fieldIndicesByStem[stem]);
    }
    final IndirectComparator.DescendingDoubleComparator byWeight =
        new IndirectComparator.DescendingDoubleComparator(candidateWeights);
    final int[] order = IndirectSort.mergesort(0, candidateWeights.length, byWeight);

    // The row budget follows from the maximum matrix size and the number of columns.
    final int maxRows = maximumMatrixSize / documentCount;
    final int rowCount = Math.min(maxRows, candidateStems.length);
    final DoubleMatrix2D tdMatrix = new DenseDoubleMatrix2D(rowCount, documentCount);

    final int rowsToFill = Math.min(order.length, maxRows);
    for (int row = 0; row < rowsToFill; row++) {
      final int stem = candidateStems[order[row]];
      final int[] tfPairs = tfPairsByStem[stem];
      final int df = tfPairs.length / 2;
      final byte stemFieldIndices = fieldIndicesByStem[stem];

      // Walk the array pair by pair: [column, term frequency].
      for (int pair = 0; pair + 1 < tfPairs.length; pair += 2) {
        final int column = tfPairs[pair];
        final int tf = tfPairs[pair + 1];

        double weight = termWeighting.calculateTermWeight(tf, df, documentCount);
        weight *= getWeightBoost(titleField, stemFieldIndices);
        tdMatrix.set(row, column, weight);
      }
    }

    // Record which matrix row each included stem ended up in.
    final IntIntHashMap rowByStem = new IntIntHashMap();
    final int rowsToMap = Math.min(order.length, tdMatrix.rows());
    for (int row = 0; row < rowsToMap; row++) {
      rowByStem.put(candidateStems[order[row]], row);
    }

    // Publish the results.
    vsmContext.termDocumentMatrix = tdMatrix;
    vsmContext.stemToRowIndex = rowByStem;
  }