Exemplo n.º 1
0
 /**
  * Rewrites the argument IDs of the given unresolved dependencies according to {@code
  * substitutions}.
  *
  * <p>A dependency whose argument ID has an entry in {@code substitutions} is added to {@code
  * newUnresolvedDependencies}: the same instance when the ID maps to itself, otherwise a copy
  * carrying the substituted ID. A dependency without an entry is added to {@code
  * newResolvedDependencies} as an {@link UnlabelledDependency} built with a singleton list
  * holding the dependency's own head.
  *
  * @param unresolvedDependencies dependencies to process; not modified
  * @param substitutions mapping from old argument IDs to new argument IDs
  * @param newUnresolvedDependencies receives dependencies that have a substitution entry
  * @param newResolvedDependencies receives dependencies that have no substitution entry
  */
 private static void normalize(
     final Collection<UnresolvedDependency> unresolvedDependencies,
     final IntIntHashMap substitutions,
     final Set<UnresolvedDependency> newUnresolvedDependencies,
     final Collection<UnlabelledDependency> newResolvedDependencies) {
   // Value handed back by the lookup when an argument ID has no substitution entry.
   final int noSubstitution = Integer.MIN_VALUE;

   for (final UnresolvedDependency dependency : unresolvedDependencies) {
     final int substitutedID = substitutions.getOrDefault(dependency.argumentID, noSubstitution);

     if (substitutedID == noSubstitution) {
       // Dependencies that don't get any attachment (e.g. in arbitrary control).
       newResolvedDependencies.add(
           new UnlabelledDependency(
               dependency.getHead(),
               dependency.getCategory(),
               dependency.getArgNumber(),
               Collections.singletonList(dependency.getHead()),
               dependency.getPreposition()));
       continue;
     }

     // Reuse the existing instance when the substitution leaves the ID unchanged.
     final UnresolvedDependency normalized =
         substitutedID == dependency.argumentID
             ? dependency
             : new UnresolvedDependency(
                 dependency.getHead(),
                 dependency.getCategory(),
                 dependency.getArgNumber(),
                 substitutedID,
                 dependency.getPreposition());
     newUnresolvedDependencies.add(normalized);
   }
 }
  /**
   * Computes a term-phrase matrix aligned with the rows of the main term-document matrix and
   * stores it in {@link VectorSpaceModelContext#termPhraseMatrix}.
   *
   * <p>The matrix is built by {@code TermDocumentMatrixBuilder.buildAlignedMatrix} for the label
   * features starting at {@code allLabels.firstPhraseIndex}, passed through {@code
   * MatrixUtils.normalizeColumnL2} and stored as its {@code viewDice()} view. When {@code
   * firstPhraseIndex} is negative or {@code stemToRowIndex} is empty, nothing is computed and
   * {@link VectorSpaceModelContext#termPhraseMatrix} is left untouched.
   *
   * @param context vector space model context to read from and to store the result in
   */
  public void buildTermPhraseMatrix(VectorSpaceModelContext context) {
    final PreprocessingContext preprocessingContext = context.preprocessingContext;
    final IntIntHashMap stemToRowIndex = context.stemToRowIndex;
    final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
    final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;

    // Nothing to do without phrase labels or without any rows to align to.
    if (firstPhraseIndex < 0 || stemToRowIndex.size() <= 0) {
      return;
    }

    // Phrase features are the tail of the label feature index, from firstPhraseIndex onwards.
    final int phraseCount = labelsFeatureIndex.length - firstPhraseIndex;
    final int[] phraseFeatureIndices = new int[phraseCount];
    System.arraycopy(labelsFeatureIndex, firstPhraseIndex, phraseFeatureIndices, 0, phraseCount);

    final DoubleMatrix2D alignedPhrases =
        TermDocumentMatrixBuilder.buildAlignedMatrix(context, phraseFeatureIndices, termWeighting);
    MatrixUtils.normalizeColumnL2(alignedPhrases, null);
    context.termPhraseMatrix = alignedPhrases.viewDice();
  }
  /**
   * Builds a term-document-like matrix for the given features in the same term space as the
   * original term-document matrix.
   *
   * <p>The result has one row per entry of {@code vsmContext.stemToRowIndex} and one column per
   * element of {@code featureIndex}. A feature smaller than the number of words is treated as a
   * single word index; any other feature is looked up in {@code allPhrases.wordIndices} after
   * subtracting the number of words. For every word of a feature whose stem is present in {@code
   * stemToRowIndex}, the cell at (stem row, feature column) is set to the weight returned by
   * {@code termWeighting}.
   *
   * @param vsmContext context providing the stem-to-row mapping and the preprocessing data
   * @param featureIndex features (word or phrase indices) to create columns for
   * @param termWeighting weighting used to compute cell values
   * @return a dense matrix with zero columns when {@code featureIndex} is empty, otherwise a
   *     sparse matrix filled as described above
   */
  static DoubleMatrix2D buildAlignedMatrix(
      VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
    final IntIntHashMap stemToRowIndex = vsmContext.stemToRowIndex;
    if (featureIndex.length == 0) {
      return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);
    }

    final DoubleMatrix2D alignedMatrix =
        new SparseDoubleMatrix2D(stemToRowIndex.size(), featureIndex.length);

    final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
    final int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
    final int[] stemsTf = preprocessingContext.allStems.tf;
    final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
    final int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
    final int documentCount = preprocessingContext.documents.size();
    final int wordCount = wordsStemIndex.length;

    for (int column = 0; column < featureIndex.length; column++) {
      final int feature = featureIndex[column];

      // Single words map to themselves, phrases to the list of words they consist of.
      final int[] featureWords =
          feature < wordCount ? new int[] {feature} : phrasesWordIndices[feature - wordCount];

      for (final int word : featureWords) {
        final int stem = wordsStemIndex[word];
        final int slot = stemToRowIndex.indexOf(stem);
        if (!stemToRowIndex.indexExists(slot)) {
          // The stem has no row in the main matrix, so it cannot contribute here.
          continue;
        }

        final int row = stemToRowIndex.indexGet(slot);
        final double weight =
            termWeighting.calculateTermWeight(
                stemsTf[stem], stemsTfByDocument[stem].length / 2, documentCount);
        alignedMatrix.setQuick(row, column, weight);
      }
    }

    return alignedMatrix;
  }
  /**
   * Builds the term-document matrix from the preprocessing data found in {@code vsmContext} and
   * stores the matrix and the matching stem-to-row mapping back in {@code vsmContext}.
   *
   * <p>Candidate stems come from {@code computeRequiredStemIndices} and are ordered by descending
   * weight (term weight multiplied by the boost from {@code getWeightBoost}). The matrix gets one
   * column per document and at most {@code maximumMatrixSize / documentCount} rows, so only the
   * highest-weighted stems are included when there are more candidates than rows. With no
   * documents, an empty matrix and an empty mapping are stored.
   *
   * @param vsmContext context to read the preprocessing data from and to store the results in
   */
  public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
    final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;

    final int documentCount = preprocessingContext.documents.size();
    final int[] stemsTf = preprocessingContext.allStems.tf;
    final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
    final byte[] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

    if (documentCount == 0) {
      // No documents: publish empty results and stop.
      vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
      vsmContext.stemToRowIndex = new IntIntHashMap();
      return;
    }

    // Find the first field named as the title field; -1 when there is none.
    final String[] fieldsName = preprocessingContext.allFields.name;
    int titleFieldIndex = -1;
    for (int field = 0; titleFieldIndex < 0 && field < fieldsName.length; field++) {
      if (Document.TITLE.equals(fieldsName[field])) {
        titleFieldIndex = field;
      }
    }

    // Stems we would ideally like to have in the matrix.
    final int[] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

    // Weight of each candidate stem, used to decide which stems make it into the matrix first.
    final double[] stemsWeight = new double[stemsToInclude.length];
    for (int candidate = 0; candidate < stemsToInclude.length; candidate++) {
      final int stem = stemsToInclude[candidate];
      stemsWeight[candidate] =
          termWeighting.calculateTermWeight(
                  stemsTf[stem], stemsTfByDocument[stem].length / 2, documentCount)
              * getWeightBoost(titleFieldIndex, stemsFieldIndices[stem]);
    }
    final int[] stemWeightOrder =
        IndirectSort.mergesort(
            0, stemsWeight.length, new IndirectComparator.DescendingDoubleComparator(stemsWeight));

    // Number of rows that fit in the maximum matrix size for this many documents.
    final int maxRows = maximumMatrixSize / documentCount;
    final DoubleMatrix2D tdMatrix =
        new DenseDoubleMatrix2D(Math.min(maxRows, stemsToInclude.length), documentCount);

    for (int row = 0; row < stemWeightOrder.length && row < maxRows; row++) {
      final int stem = stemsToInclude[stemWeightOrder[row]];
      final int[] tfByDocument = stemsTfByDocument[stem];
      final int df = tfByDocument.length / 2;
      final byte fieldIndices = stemsFieldIndices[stem];

      // tfByDocument is read in pairs: [offset] is the document index, [offset + 1] the tf.
      final int pairsEnd = df * 2;
      for (int offset = 0; offset < pairsEnd; offset += 2) {
        final double termWeight =
            termWeighting.calculateTermWeight(tfByDocument[offset + 1], df, documentCount);
        final double boostedWeight = termWeight * getWeightBoost(titleFieldIndex, fieldIndices);
        tdMatrix.set(row, tfByDocument[offset], boostedWeight);
      }
    }

    // Record which matrix row each included stem ended up in.
    final IntIntHashMap stemToRowIndex = new IntIntHashMap();
    for (int row = 0; row < stemWeightOrder.length && row < tdMatrix.rows(); row++) {
      stemToRowIndex.put(stemsToInclude[stemWeightOrder[row]], row);
    }

    // Store the results
    vsmContext.termDocumentMatrix = tdMatrix;
    vsmContext.stemToRowIndex = stemToRowIndex;
  }