/**
 * Creates a matrix with one column per entry of <code>featureIndex</code>, laid out in the same
 * row space as <code>vsmContext.stemToRowIndex</code>.
 *
 * <p>A feature index smaller than the number of words denotes that single word; any other value
 * selects a phrase from <code>allPhrases.wordIndices</code> after subtracting the number of words.
 * For every word of a feature whose stem has a row in <code>stemToRowIndex</code>, the cell at
 * (stem row, feature column) is set to the weight calculated from the stem's total term
 * frequency, its document frequency and the number of documents. Stems without a row are
 * skipped.
 *
 * @param vsmContext provides <code>stemToRowIndex</code> and the preprocessing context
 * @param featureIndex indices of the words/phrases that become the columns of the result
 * @param termWeighting calculates the weight stored in each cell
 * @return a sparse matrix of <code>stemToRowIndex.size()</code> rows and
 *     <code>featureIndex.length</code> columns, or a dense matrix with zero columns when
 *     <code>featureIndex</code> is empty
 */
static DoubleMatrix2D buildAlignedMatrix(
    VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
  final IntIntHashMap rowByStem = vsmContext.stemToRowIndex;
  final int columnCount = featureIndex.length;
  if (columnCount == 0) {
    return new DenseDoubleMatrix2D(rowByStem.size(), 0);
  }

  final DoubleMatrix2D aligned = new SparseDoubleMatrix2D(rowByStem.size(), columnCount);

  final PreprocessingContext context = vsmContext.preprocessingContext;
  final int[] stemOfWord = context.allWords.stemIndex;
  final int[] totalTfOfStem = context.allStems.tf;
  final int[][] tfByDocumentOfStem = context.allStems.tfByDocument;
  final int[][] wordsOfPhrase = context.allPhrases.wordIndices;
  final int documentCount = context.documents.size();
  final int wordCount = stemOfWord.length;

  for (int column = 0; column < columnCount; column++) {
    final int feature = featureIndex[column];

    // Single words are wrapped in a one-element array so that words and phrases
    // can be handled by the same loop below.
    final int[] featureWords =
        feature < wordCount ? new int[] {feature} : wordsOfPhrase[feature - wordCount];

    for (final int word : featureWords) {
      final int stem = stemOfWord[word];
      final int slot = rowByStem.indexOf(stem);
      if (!rowByStem.indexExists(slot)) {
        // The stem has no row in the term space, nothing to store.
        continue;
      }

      final int row = rowByStem.indexGet(slot);
      final int termFrequency = totalTfOfStem[stem];
      // Document frequency is taken as half the length of the stem's tfByDocument array.
      final int documentFrequency = tfByDocumentOfStem[stem].length / 2;
      final double weight =
          termWeighting.calculateTermWeight(termFrequency, documentFrequency, documentCount);

      aligned.setQuick(row, column, weight);
    }
  }

  return aligned;
}
/**
 * Builds the term-document matrix for the documents in <code>vsmContext.preprocessingContext
 * </code> and stores it, together with the stem-to-row mapping, back in <code>vsmContext</code>.
 *
 * <p>Rows correspond to stems ordered by descending weight, columns to documents. The number of
 * rows is limited to <code>maximumMatrixSize / documentCount</code>. When there are no documents,
 * an empty 0x0 matrix and an empty mapping are stored.
 *
 * @param vsmContext source of the preprocessing data; receives <code>termDocumentMatrix</code>
 *     and <code>stemToRowIndex</code>
 */
public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
  final PreprocessingContext context = vsmContext.preprocessingContext;
  final int documentCount = context.documents.size();
  final int[] totalTfOfStem = context.allStems.tf;
  final int[][] tfByDocumentOfStem = context.allStems.tfByDocument;
  final byte[] fieldIndicesOfStem = context.allStems.fieldIndices;

  // Nothing to build without documents.
  if (documentCount == 0) {
    vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
    vsmContext.stemToRowIndex = new IntIntHashMap();
    return;
  }

  // Locate the title field; stays at -1 when no field carries the title name.
  final String[] fieldNames = context.allFields.name;
  int titleFieldIndex = -1;
  for (int field = 0; field < fieldNames.length && titleFieldIndex < 0; field++) {
    if (Document.TITLE.equals(fieldNames[field])) {
      titleFieldIndex = field;
    }
  }

  // Candidate stems for the matrix rows.
  final int[] candidateStems = computeRequiredStemIndices(context);

  // Global (boosted) weight of every candidate, used only to rank the candidates so that
  // the highest-weighted stems get into the matrix first.
  final double[] candidateWeights = new double[candidateStems.length];
  for (int c = 0; c < candidateStems.length; c++) {
    final int stem = candidateStems[c];
    final int termFrequency = totalTfOfStem[stem];
    final int documentFrequency = tfByDocumentOfStem[stem].length / 2;
    candidateWeights[c] =
        termWeighting.calculateTermWeight(termFrequency, documentFrequency, documentCount)
            * getWeightBoost(titleFieldIndex, fieldIndicesOfStem[stem]);
  }
  final int[] ranking =
      IndirectSort.mergesort(
          0,
          candidateWeights.length,
          new IndirectComparator.DescendingDoubleComparator(candidateWeights));

  // Cap the number of rows so that rows * documentCount does not exceed maximumMatrixSize.
  final int maxRows = maximumMatrixSize / documentCount;
  final DoubleMatrix2D matrix =
      new DenseDoubleMatrix2D(Math.min(maxRows, candidateStems.length), documentCount);

  final int filledRows = Math.min(ranking.length, maxRows);
  for (int row = 0; row < filledRows; row++) {
    final int stem = candidateStems[ranking[row]];
    final int[] tfByDocument = tfByDocumentOfStem[stem];
    final int documentFrequency = tfByDocument.length / 2;
    final byte stemFieldIndices = fieldIndicesOfStem[stem];

    // tfByDocument is read as consecutive pairs: the first element of a pair is used as the
    // column (document) index, the second one as the term frequency for that column.
    for (int pair = 0; pair < documentFrequency; pair++) {
      final int offset = pair * 2;
      final double baseWeight =
          termWeighting.calculateTermWeight(
              tfByDocument[offset + 1], documentFrequency, documentCount);
      final double boostedWeight =
          baseWeight * getWeightBoost(titleFieldIndex, stemFieldIndices);
      matrix.set(row, tfByDocument[offset], boostedWeight);
    }
  }

  // Remember which matrix row each included stem ended up in.
  final IntIntHashMap rowByStem = new IntIntHashMap();
  final int mappedRows = Math.min(ranking.length, matrix.rows());
  for (int row = 0; row < mappedRows; row++) {
    rowByStem.put(candidateStems[ranking[row]], row);
  }

  // Publish the results.
  vsmContext.termDocumentMatrix = matrix;
  vsmContext.stemToRowIndex = rowByStem;
}