/** * Builds a term-phrase matrix in the same space as the main term-document matrix. If the * processing context contains no phrases, {@link VectorSpaceModelContext#termPhraseMatrix} will * remain <code>null</code>. */ public void buildTermPhraseMatrix(VectorSpaceModelContext context) { final PreprocessingContext preprocessingContext = context.preprocessingContext; final IntIntHashMap stemToRowIndex = context.stemToRowIndex; final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex; final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex; if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0) { // Build phrase matrix int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex]; for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++) { phraseFeatureIndices[featureIndex] = labelsFeatureIndex[featureIndex + firstPhraseIndex]; } final DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder.buildAlignedMatrix( context, phraseFeatureIndices, termWeighting); MatrixUtils.normalizeColumnL2(phraseMatrix, null); context.termPhraseMatrix = phraseMatrix.viewDice(); } }
/**
 * Builds a term document matrix from data provided in the <code>context</code>, stores the result
 * in there.
 */
public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
  final PreprocessingContext preprocessing = vsmContext.preprocessingContext;
  final int documentCount = preprocessing.documents.size();
  final int[] stemTf = preprocessing.allStems.tf;
  final int[][] stemTfByDocument = preprocessing.allStems.tfByDocument;
  final byte[] stemFieldIndices = preprocessing.allStems.fieldIndices;

  // No documents: store empty results and bail out early.
  if (documentCount == 0) {
    vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
    vsmContext.stemToRowIndex = new IntIntHashMap();
    return;
  }

  // Locate the title field, if present; terms appearing in titles get a weight boost.
  int titleFieldIndex = -1;
  final String[] fieldNames = preprocessing.allFields.name;
  for (int i = 0; i < fieldNames.length; i++) {
    if (Document.TITLE.equals(fieldNames[i])) {
      titleFieldIndex = i;
      break;
    }
  }

  // Candidate stems we would ideally include in the matrix.
  final int[] candidateStems = computeRequiredStemIndices(preprocessing);

  // Weight every candidate so that stems enter the matrix in order of importance.
  // tfByDocument stores [documentIndex, tf] pairs, hence length / 2 is the document frequency.
  final double[] candidateWeights = new double[candidateStems.length];
  for (int i = 0; i < candidateStems.length; i++) {
    final int stem = candidateStems[i];
    candidateWeights[i] =
        termWeighting.calculateTermWeight(
                stemTf[stem], stemTfByDocument[stem].length / 2, documentCount)
            * getWeightBoost(titleFieldIndex, stemFieldIndices[stem]);
  }
  final int[] byWeightDesc =
      IndirectSort.mergesort(
          0,
          candidateWeights.length,
          new IndirectComparator.DescendingDoubleComparator(candidateWeights));

  // Cap the row count so the matrix stays within the configured maximum size.
  final int maxRows = maximumMatrixSize / documentCount;
  final DoubleMatrix2D tdMatrix =
      new DenseDoubleMatrix2D(Math.min(maxRows, candidateStems.length), documentCount);

  for (int row = 0; row < byWeightDesc.length && row < maxRows; row++) {
    final int stem = candidateStems[byWeightDesc[row]];
    final int[] tfByDocument = stemTfByDocument[stem];
    final int df = tfByDocument.length / 2;
    final byte fieldIndices = stemFieldIndices[stem];

    // Fill this stem's row: one weighted entry per document the stem occurs in.
    for (int pair = 0; pair < df; pair++) {
      double weight =
          termWeighting.calculateTermWeight(tfByDocument[pair * 2 + 1], df, documentCount);
      weight *= getWeightBoost(titleFieldIndex, fieldIndices);
      tdMatrix.set(row, tfByDocument[pair * 2], weight);
    }
  }

  // Map each included stem index to the matrix row it landed in.
  final IntIntHashMap stemToRowIndex = new IntIntHashMap();
  for (int row = 0; row < byWeightDesc.length && row < tdMatrix.rows(); row++) {
    stemToRowIndex.put(candidateStems[byWeightDesc[row]], row);
  }

  // Store the results.
  vsmContext.termDocumentMatrix = tdMatrix;
  vsmContext.stemToRowIndex = stemToRowIndex;
}