/**
 * Creates a matrix expressed in the same term space as the original term-document matrix: it has
 * one row per entry of {@link VectorSpaceModelContext#stemToRowIndex} and one column per element
 * of {@code featureIndex}.
 *
 * <p>A feature smaller than the number of words is treated as a single word index; any other
 * feature is looked up, offset by the word count, in {@code allPhrases.wordIndices}. For every
 * word of a feature whose stem has a row in {@code stemToRowIndex}, the corresponding cell is set
 * to the weight returned by {@code termWeighting}. Words whose stems have no row are skipped.
 *
 * @param vsmContext supplies the stem-to-row mapping and the preprocessing data
 * @param featureIndex features (words or phrases) to turn into columns, in column order
 * @param termWeighting computes the value stored in each populated cell
 * @return a sparse matrix with the weights filled in, or an empty-column dense matrix when {@code
 *     featureIndex} is empty
 */
static DoubleMatrix2D buildAlignedMatrix(
    VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
  final IntIntHashMap stemToRowIndex = vsmContext.stemToRowIndex;
  final int columnCount = featureIndex.length;
  if (columnCount == 0) {
    // Nothing to align: hand back a matrix with the right number of rows and no columns.
    return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);
  }

  final DoubleMatrix2D alignedMatrix =
      new SparseDoubleMatrix2D(stemToRowIndex.size(), columnCount);

  final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
  final int[] stemOfWord = preprocessingContext.allWords.stemIndex;
  final int[] stemTf = preprocessingContext.allStems.tf;
  final int[][] stemTfByDocument = preprocessingContext.allStems.tfByDocument;
  final int[][] wordsOfPhrase = preprocessingContext.allPhrases.wordIndices;
  final int documentCount = preprocessingContext.documents.size();
  final int wordCount = stemOfWord.length;

  for (int column = 0; column < columnCount; column++) {
    final int feature = featureIndex[column];

    // Features below wordCount are single words, the rest are phrases offset by wordCount.
    final int[] featureWords =
        feature < wordCount ? new int[] {feature} : wordsOfPhrase[feature - wordCount];

    for (final int word : featureWords) {
      final int stem = stemOfWord[word];
      final int slot = stemToRowIndex.indexOf(stem);
      if (!stemToRowIndex.indexExists(slot)) {
        // This stem has no row in the term space.
        continue;
      }

      final int row = stemToRowIndex.indexGet(slot);
      // NOTE(review): half of the tfByDocument length is passed as the second weighting argument;
      // presumably the array holds (document, tf) pairs, making this the document frequency.
      final double weight =
          termWeighting.calculateTermWeight(
              stemTf[stem], stemTfByDocument[stem].length / 2, documentCount);
      alignedMatrix.setQuick(row, column, weight);
    }
  }

  return alignedMatrix;
}
/** * Builds a term-phrase matrix in the same space as the main term-document matrix. If the * processing context contains no phrases, {@link VectorSpaceModelContext#termPhraseMatrix} will * remain <code>null</code>. */ public void buildTermPhraseMatrix(VectorSpaceModelContext context) { final PreprocessingContext preprocessingContext = context.preprocessingContext; final IntIntHashMap stemToRowIndex = context.stemToRowIndex; final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex; final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex; if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0) { // Build phrase matrix int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex]; for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++) { phraseFeatureIndices[featureIndex] = labelsFeatureIndex[featureIndex + firstPhraseIndex]; } final DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder.buildAlignedMatrix( context, phraseFeatureIndices, termWeighting); MatrixUtils.normalizeColumnL2(phraseMatrix, null); context.termPhraseMatrix = phraseMatrix.viewDice(); } }