Java ITermWeighting.calculateTermWeight 예제들

프로그래밍 언어: Java

클래스/타입: ITermWeighting

메소드/함수: calculateTermWeight

hotexamples.com에서의 예제들: 2

Java ITermWeighting.calculateTermWeight - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, 평가가 가장 높은 Java ITermWeighting.calculateTermWeight의 실제 사용 예제입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

calculateTermWeight(2)

자주 사용되는 메소드들

calculateTermWeight (2)

예제 #1

파일 보기

파일: TermDocumentMatrixBuilder.java 프로젝트: renodim/carrot2

  /**
   * Builds a sparse matrix with one column per entry of {@code featureIndex} and one row per
   * entry of {@code vsmContext.stemToRowIndex}, i.e. in the same row space as that mapping.
   *
   * <p>A feature value smaller than the number of words is treated as a single word index; any
   * other value is resolved, after subtracting the word count, to the word indices of a phrase.
   * For each of those words whose stem has a row in the mapping, the cell at (row, column) is set
   * to the weight returned by {@code termWeighting}. Stems without a row are skipped.
   *
   * @param vsmContext supplies the stem-to-row mapping and the preprocessing data
   * @param featureIndex feature identifiers, one per output column
   * @param termWeighting weighting function applied to each stem
   * @return a sparse matrix of {@code stemToRowIndex.size()} rows by {@code featureIndex.length}
   *     columns, or a dense matrix with zero columns when {@code featureIndex} is empty
   */
  static DoubleMatrix2D buildAlignedMatrix(
      VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
    final IntIntHashMap rowByStem = vsmContext.stemToRowIndex;
    final int rowCount = rowByStem.size();
    final int columnCount = featureIndex.length;
    if (columnCount == 0) {
      return new DenseDoubleMatrix2D(rowCount, 0);
    }

    final DoubleMatrix2D aligned = new SparseDoubleMatrix2D(rowCount, columnCount);

    final PreprocessingContext context = vsmContext.preprocessingContext;
    final int[] stemOfWord = context.allWords.stemIndex;
    final int[] stemTf = context.allStems.tf;
    final int[][] stemTfByDocument = context.allStems.tfByDocument;
    final int[][] phraseWords = context.allPhrases.wordIndices;
    final int documentCount = context.documents.size();
    final int wordCount = stemOfWord.length;

    for (int column = 0; column < columnCount; column++) {
      final int feature = featureIndex[column];

      // Features below wordCount denote a single word, the rest index into the phrase table.
      final int[] featureWords =
          feature < wordCount ? new int[] {feature} : phraseWords[feature - wordCount];

      for (final int word : featureWords) {
        final int stem = stemOfWord[word];
        final int slot = rowByStem.indexOf(stem);
        if (!rowByStem.indexExists(slot)) {
          // This stem has no row in the mapping, nothing to write.
          continue;
        }

        final int row = rowByStem.indexGet(slot);

        // NOTE(review): tfByDocument is presumably laid out as (document, tf) pairs, which would
        // make length / 2 the stem's document frequency -- confirm against the preprocessing code.
        final double weight =
            termWeighting.calculateTermWeight(
                stemTf[stem], stemTfByDocument[stem].length / 2, documentCount);

        aligned.setQuick(row, column, weight);
      }
    }

    return aligned;
  }

예제 #2

파일 보기

파일: TermDocumentMatrixBuilder.java 프로젝트: renodim/carrot2

  /**
   * Builds the term-document matrix from the preprocessing data referenced by {@code vsmContext}
   * and stores both the matrix and the stem-to-row mapping back into {@code vsmContext}.
   *
   * <p>When there are no documents, an empty 0x0 matrix and an empty mapping are stored. Otherwise
   * candidate stems are ranked by descending weight and the top ones become matrix rows, with the
   * row count limited to {@code maximumMatrixSize / documentCount}.
   *
   * @param vsmContext source of the preprocessing data and destination of the results
   */
  public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
    final PreprocessingContext context = vsmContext.preprocessingContext;

    final int documentCount = context.documents.size();
    final int[] stemTf = context.allStems.tf;
    final int[][] stemTfByDocument = context.allStems.tfByDocument;
    final byte[] stemFieldIndices = context.allStems.fieldIndices;

    // Nothing to build for an empty document set.
    if (documentCount == 0) {
      vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
      vsmContext.stemToRowIndex = new IntIntHashMap();
      return;
    }

    // Locate the title field; stays at -1 when no field carries the title name.
    final String[] fieldNames = context.allFields.name;
    int titleFieldIndex = -1;
    for (int field = 0; field < fieldNames.length && titleFieldIndex < 0; field++) {
      if (Document.TITLE.equals(fieldNames[field])) {
        titleFieldIndex = field;
      }
    }

    // Candidate stems for the matrix rows.
    final int[] candidateStems = computeRequiredStemIndices(context);

    // Weight every candidate so that the heaviest stems are placed in the matrix first.
    final double[] candidateWeights = new double[candidateStems.length];
    for (int c = 0; c < candidateStems.length; c++) {
      final int stem = candidateStems[c];
      final double baseWeight =
          termWeighting.calculateTermWeight(
              stemTf[stem], stemTfByDocument[stem].length / 2, documentCount);
      final double boost = getWeightBoost(titleFieldIndex, stemFieldIndices[stem]);
      candidateWeights[c] = baseWeight * boost;
    }
    final int[] byDescendingWeight =
        IndirectSort.mergesort(
            0,
            candidateWeights.length,
            new IndirectComparator.DescendingDoubleComparator(candidateWeights));

    // Cap the number of rows so that rows * documents does not exceed the maximum matrix size.
    final int maxRows = maximumMatrixSize / documentCount;
    final DoubleMatrix2D tdMatrix =
        new DenseDoubleMatrix2D(Math.min(maxRows, candidateStems.length), documentCount);
    final int rowCount = tdMatrix.rows();

    // Fill the rows in weight order.
    for (int row = 0; row < rowCount; row++) {
      final int stem = candidateStems[byDescendingWeight[row]];
      final int[] tfByDocument = stemTfByDocument[stem];
      final int df = tfByDocument.length / 2;
      final byte fieldIndices = stemFieldIndices[stem];

      // tfByDocument is read in pairs: the even slot is used as the column (document) index and
      // the odd slot is passed to the weighting function as the term frequency.
      for (int pair = 0; pair < df; pair++) {
        final int document = tfByDocument[2 * pair];
        final int tf = tfByDocument[2 * pair + 1];

        final double baseWeight = termWeighting.calculateTermWeight(tf, df, documentCount);
        final double boost = getWeightBoost(titleFieldIndex, fieldIndices);
        tdMatrix.set(row, document, baseWeight * boost);
      }
    }

    // Record which matrix row each included stem ended up in.
    final IntIntHashMap stemToRowIndex = new IntIntHashMap();
    for (int row = 0; row < rowCount; row++) {
      stemToRowIndex.put(candidateStems[byDescendingWeight[row]], row);
    }

    // Publish the results.
    vsmContext.termDocumentMatrix = tdMatrix;
    vsmContext.stemToRowIndex = stemToRowIndex;
  }