/** * Builds a term-phrase matrix in the same space as the main term-document matrix. If the * processing context contains no phrases, {@link VectorSpaceModelContext#termPhraseMatrix} will * remain <code>null</code>. */ public void buildTermPhraseMatrix(VectorSpaceModelContext context) { final PreprocessingContext preprocessingContext = context.preprocessingContext; final IntIntHashMap stemToRowIndex = context.stemToRowIndex; final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex; final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex; if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0) { // Build phrase matrix int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex]; for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++) { phraseFeatureIndices[featureIndex] = labelsFeatureIndex[featureIndex + firstPhraseIndex]; } final DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder.buildAlignedMatrix( context, phraseFeatureIndices, termWeighting); MatrixUtils.normalizeColumnL2(phraseMatrix, null); context.termPhraseMatrix = phraseMatrix.viewDice(); } }
/**
 * Builds a term document matrix from data provided in the <code>context</code>, stores the result
 * in there.
 */
public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
  final PreprocessingContext preprocessing = vsmContext.preprocessingContext;
  final int documentCount = preprocessing.documents.size();
  final int[] stemTf = preprocessing.allStems.tf;
  final int[][] stemTfByDocument = preprocessing.allStems.tfByDocument;
  final byte[] stemFieldIndices = preprocessing.allStems.fieldIndices;

  // No documents: store empty results and bail out early.
  if (documentCount == 0) {
    vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
    vsmContext.stemToRowIndex = new IntIntHashMap();
    return;
  }

  // Locate the title field, if present; terms appearing in titles get a weight boost.
  int titleFieldIndex = -1;
  final String[] fieldNames = preprocessing.allFields.name;
  for (int i = 0; i < fieldNames.length; i++) {
    if (Document.TITLE.equals(fieldNames[i])) {
      titleFieldIndex = i;
      break;
    }
  }

  // Candidate stems we would ideally include in the matrix.
  final int[] candidateStems = computeRequiredStemIndices(preprocessing);

  // Weight every candidate so that stems enter the matrix in order of importance.
  // tfByDocument stores [documentIndex, tf] pairs, hence length / 2 is the document frequency.
  final double[] candidateWeights = new double[candidateStems.length];
  for (int i = 0; i < candidateStems.length; i++) {
    final int stem = candidateStems[i];
    candidateWeights[i] =
        termWeighting.calculateTermWeight(
                stemTf[stem], stemTfByDocument[stem].length / 2, documentCount)
            * getWeightBoost(titleFieldIndex, stemFieldIndices[stem]);
  }
  final int[] byWeightDesc =
      IndirectSort.mergesort(
          0,
          candidateWeights.length,
          new IndirectComparator.DescendingDoubleComparator(candidateWeights));

  // Cap the row count so the matrix stays within the configured maximum size.
  final int maxRows = maximumMatrixSize / documentCount;
  final DoubleMatrix2D tdMatrix =
      new DenseDoubleMatrix2D(Math.min(maxRows, candidateStems.length), documentCount);

  for (int row = 0; row < byWeightDesc.length && row < maxRows; row++) {
    final int stem = candidateStems[byWeightDesc[row]];
    final int[] tfByDocument = stemTfByDocument[stem];
    final int df = tfByDocument.length / 2;
    final byte fieldIndices = stemFieldIndices[stem];

    // Fill this stem's row: one weighted entry per document the stem occurs in.
    for (int pair = 0; pair < df; pair++) {
      double weight =
          termWeighting.calculateTermWeight(tfByDocument[pair * 2 + 1], df, documentCount);
      weight *= getWeightBoost(titleFieldIndex, fieldIndices);
      tdMatrix.set(row, tfByDocument[pair * 2], weight);
    }
  }

  // Map each included stem index to the matrix row it landed in.
  final IntIntHashMap stemToRowIndex = new IntIntHashMap();
  for (int row = 0; row < byWeightDesc.length && row < tdMatrix.rows(); row++) {
    stemToRowIndex.put(candidateStems[byWeightDesc[row]], row);
  }

  // Store the results.
  vsmContext.termDocumentMatrix = tdMatrix;
  vsmContext.stemToRowIndex = stemToRowIndex;
}