public void compute() {
  int n = A.columns();

  // Cosine similarities between centroids and documents
  DoubleMatrix2D D = new DenseDoubleMatrix2D(k, n);

  // Object-cluster assignments
  V = new DenseDoubleMatrix2D(n, k);

  // Initialize the centroids with the first k document vectors
  U = new DenseDoubleMatrix2D(A.rows(), k);
  U.assign(A.viewPart(0, 0, A.rows(), k));

  int[] maxIndices = new int[D.columns()];
  double[] maxValues = new double[D.columns()];

  for (iterationsCompleted = 0; iterationsCompleted < maxIterations; iterationsCompleted++) {
    // Calculate cosine similarities: D = U^T * A (columns assumed L2-normalized)
    U.zMult(A, D, 1, 0, true, false);

    V.assign(0);
    U.assign(0);

    // Assign each document to its closest centroid, i.e. the column-wise
    // maximum similarity
    MatrixUtils.maxInColumns(D, maxIndices, maxValues);
    for (int i = 0; i < maxIndices.length; i++) {
      V.setQuick(i, maxIndices[i], 1);
    }

    // Update centroids
    for (int c = 0; c < V.columns(); c++) {
      // Sum the vectors of documents assigned to centroid c
      int count = 0;
      for (int d = 0; d < V.rows(); d++) {
        if (V.getQuick(d, c) != 0) {
          count++;
          U.viewColumn(c).assign(A.viewColumn(d), Functions.PLUS);
        }
      }

      // Divide by the cluster size; guard against empty clusters, which
      // would otherwise produce NaN columns after normalization
      if (count > 0) {
        U.viewColumn(c).assign(Mult.div(count));
      }
    }

    // Bring all centroids back to unit length (once per iteration is enough)
    MatrixUtils.normalizeColumnL2(U, null);
  }
}
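// A minimal, self-contained sketch (not part of the original class) of why the
// call U.zMult(A, D, 1, 0, true, false) above yields cosine similarities: with
// L2-normalized columns, D = U^T * A holds dot products of unit vectors. The
// class and variable names below are illustrative assumptions.
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.impl.DenseDoubleMatrix2D;

class CosineSimilaritySketch {
  public static void main(String[] args) {
    // Two unit-length centroid columns (2 terms x 2 centroids)
    DoubleMatrix2D U = new DenseDoubleMatrix2D(new double[][] {{1, 0}, {0, 1}});
    // Three unit-length document columns (2 terms x 3 documents)
    DoubleMatrix2D A = new DenseDoubleMatrix2D(new double[][] {{1, 0, 0.6}, {0, 1, 0.8}});
    DoubleMatrix2D D = new DenseDoubleMatrix2D(2, 3);
    U.zMult(A, D, 1, 0, true, false); // D = U^T * A
    // D.get(c, d) is now the cosine between centroid c and document d; the
    // column-wise maxima picked by maxInColumns identify the closest centroids.
    System.out.println(D);
  }
}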
/**
 * Builds a sparse term-document-like matrix for the provided feature indices in the same term
 * space as the original term-document matrix.
 */
static DoubleMatrix2D buildAlignedMatrix(
    VectorSpaceModelContext vsmContext, int[] featureIndex, ITermWeighting termWeighting) {
  final IntIntHashMap stemToRowIndex = vsmContext.stemToRowIndex;
  if (featureIndex.length == 0) {
    return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);
  }

  final DoubleMatrix2D phraseMatrix =
      new SparseDoubleMatrix2D(stemToRowIndex.size(), featureIndex.length);

  final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
  final int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
  final int[] stemsTf = preprocessingContext.allStems.tf;
  final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
  final int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
  final int documentCount = preprocessingContext.documents.size();
  final int wordCount = wordsStemIndex.length;

  for (int i = 0; i < featureIndex.length; i++) {
    final int feature = featureIndex[i];
    final int[] wordIndices;
    if (feature < wordCount) {
      // Single-word feature
      wordIndices = new int[] {feature};
    } else {
      // Phrase feature: resolve to its constituent words
      wordIndices = phrasesWordIndices[feature - wordCount];
    }

    for (int wordIndex = 0; wordIndex < wordIndices.length; wordIndex++) {
      final int stemIndex = wordsStemIndex[wordIndices[wordIndex]];
      final int index = stemToRowIndex.indexOf(stemIndex);
      if (stemToRowIndex.indexExists(index)) {
        final int rowIndex = stemToRowIndex.indexGet(index);
        // tfByDocument stores flattened [documentIndex, tf] pairs, so
        // length / 2 is the stem's document frequency
        double weight =
            termWeighting.calculateTermWeight(
                stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2, documentCount);
        phraseMatrix.setQuick(rowIndex, i, weight);
      }
    }
  }

  return phraseMatrix;
}
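// A hedged illustration of the data layout assumed above: allStems.tfByDocument
// appears to store flattened [documentIndex, tf] pairs, which is why
// stemsTfByDocument[stemIndex].length / 2 serves as the document frequency.
// The values below are made up for illustration only.
int[] tfByDocument = {0, 3, 4, 1, 7, 2}; // stem occurs in docs 0, 4, 7 with tf 3, 1, 2
int documentFrequency = tfByDocument.length / 2; // == 3
int firstDocumentIndex = tfByDocument[0]; // == 0
int firstDocumentTf = tfByDocument[1]; // == 3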
/**
 * Builds a term-phrase matrix in the same space as the main term-document matrix. If the
 * processing context contains no phrases, {@link VectorSpaceModelContext#termPhraseMatrix} will
 * remain <code>null</code>.
 */
public void buildTermPhraseMatrix(VectorSpaceModelContext context) {
  final PreprocessingContext preprocessingContext = context.preprocessingContext;
  final IntIntHashMap stemToRowIndex = context.stemToRowIndex;
  final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
  final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;

  if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0) {
    // Build the phrase matrix from the phrase features only (those at
    // firstPhraseIndex and beyond)
    int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex];
    for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++) {
      phraseFeatureIndices[featureIndex] = labelsFeatureIndex[featureIndex + firstPhraseIndex];
    }

    final DoubleMatrix2D phraseMatrix =
        TermDocumentMatrixBuilder.buildAlignedMatrix(context, phraseFeatureIndices, termWeighting);
    MatrixUtils.normalizeColumnL2(phraseMatrix, null);
    context.termPhraseMatrix = phraseMatrix.viewDice();
  }
}
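// A minimal sketch (with assumed names) of what viewDice() does above: it
// returns a transposed *view* sharing storage with the original matrix, so the
// terms x phrases matrix is exposed as phrases x terms without copying.
DoubleMatrix2D termByPhrase = new DenseDoubleMatrix2D(3, 2); // 3 terms x 2 phrases
DoubleMatrix2D phraseByTerm = termByPhrase.viewDice(); // 2 x 3 view, no copy
phraseByTerm.set(0, 2, 1.0); // writes through: termByPhrase.get(2, 0) == 1.0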
/**
 * A native implementation of Colt's original multiplication method.
 *
 * <p>NOTE: this method will use native calls only when:
 *
 * <ul>
 *   <li>all input matrices are {@link DenseDoubleMatrix2D} or its subclasses (e.g. {@link
 *       NNIDenseDoubleMatrix2D})
 *   <li>none of the input matrices is a view
 *   <li>the dynamic libraries required by the NNI are available
 * </ul>
 */
public DoubleMatrix2D zMult(
    DoubleMatrix2D B,
    DoubleMatrix2D C,
    double alpha,
    double beta,
    boolean transposeA,
    boolean transposeB) {
  // A workaround for a bug in DenseDoubleMatrix2D. If B is a
  // SelectedDenseDoubleMatrix2D, the implementation of this method throws a
  // ClassCastException. The workaround is to swap and transpose the arguments
  // and then transpose the result. As SelectedDenseDoubleMatrix2D is
  // package-private, referring to it directly here would cause a VerifyError
  // if it was loaded with a different class loader than the one used for this
  // class. Hence the hacky string comparison.
  if (B.getClass().getName().endsWith("SelectedDenseDoubleMatrix2D")) {
    return B.zMult(this, C, alpha, beta, !transposeB, !transposeA).viewDice();
  }

  // Check the sizes
  int rowsB = (transposeB ? B.columns() : B.rows());
  int columnsB = (transposeB ? B.rows() : B.columns());
  int rowsA = (transposeA ? columns() : rows());
  int columnsA = (transposeA ? rows() : columns());

  if (C == null) {
    C = new NNIDenseDoubleMatrix2D(rowsA, columnsB);
  }

  if (this == C || B == C) {
    throw new IllegalArgumentException("Matrices must not be identical");
  }

  final int rowsC = C.rows();
  final int columnsC = C.columns();

  if (rowsB != columnsA) {
    throw new IllegalArgumentException(
        "Matrix2D inner dimensions must agree: " + toStringShort() + ", " + B.toStringShort());
  }

  if (rowsC != rowsA || columnsC != columnsB) {
    throw new IllegalArgumentException(
        "Incompatible result matrix: "
            + toStringShort()
            + ", "
            + B.toStringShort()
            + ", "
            + C.toStringShort());
  }

  // Need native BLAS, dense matrices and no views to operate.
  // Default to Colt's implementation otherwise.
  if (!NNIInterface.isNativeBlasAvailable()
      || !(B instanceof NNIDenseDoubleMatrix2D)
      || !(C instanceof NNIDenseDoubleMatrix2D)
      || isView()
      || ((NNIDenseDoubleMatrix2D) B).isView()
      || ((NNIDenseDoubleMatrix2D) C).isView()) {
    return super.zMult(B, C, alpha, beta, transposeA, transposeB);
  }

  NNIInterface.getBlas()
      .gemm(
          this,
          (NNIDenseDoubleMatrix2D) B,
          (NNIDenseDoubleMatrix2D) C,
          transposeA,
          transposeB,
          columnsA,
          alpha,
          columns,
          beta);

  return C;
}
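// A hedged usage sketch: per the checks above, the native BLAS path is taken
// only for dense, non-view NNI matrices with the native libraries loaded; any
// other combination silently falls back to Colt's pure-Java super.zMult().
// Dimensions follow the usual gemm convention:
// C (rowsA x columnsB) = alpha * A * B + beta * C.
NNIDenseDoubleMatrix2D a = new NNIDenseDoubleMatrix2D(4, 3);
NNIDenseDoubleMatrix2D b = new NNIDenseDoubleMatrix2D(3, 5);
DoubleMatrix2D c = a.zMult(b, null, 1, 0, false, false); // 4 x 5 result
// Passing a view of a, b or c (e.g. anything obtained via viewDice or
// viewPart) disables the native path even when native BLAS is available.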
/**
 * Builds a term-document matrix from data provided in the <code>context</code> and stores the
 * result there.
 */
public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext) {
  final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
  final int documentCount = preprocessingContext.documents.size();
  final int[] stemsTf = preprocessingContext.allStems.tf;
  final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
  final byte[] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

  if (documentCount == 0) {
    vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
    vsmContext.stemToRowIndex = new IntIntHashMap();
    return;
  }

  // Determine the index of the title field
  int titleFieldIndex = -1;
  final String[] fieldsName = preprocessingContext.allFields.name;
  for (int i = 0; i < fieldsName.length; i++) {
    if (Document.TITLE.equals(fieldsName[i])) {
      titleFieldIndex = i;
      break;
    }
  }

  // Determine the stems we, ideally, should include in the matrix
  int[] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

  // Sort stems by weight, so that stems get included in the matrix in the
  // order of frequency
  final double[] stemsWeight = new double[stemsToInclude.length];
  for (int i = 0; i < stemsToInclude.length; i++) {
    final int stemIndex = stemsToInclude[i];
    stemsWeight[i] =
        termWeighting.calculateTermWeight(
                stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2, documentCount)
            * getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
  }
  final int[] stemWeightOrder =
      IndirectSort.mergesort(
          0, stemsWeight.length, new IndirectComparator.DescendingDoubleComparator(stemsWeight));

  // Calculate the number of terms we can include to fulfill the maximum matrix size
  final int maxRows = maximumMatrixSize / documentCount;
  final DoubleMatrix2D tdMatrix =
      new DenseDoubleMatrix2D(Math.min(maxRows, stemsToInclude.length), documentCount);

  for (int i = 0; i < stemWeightOrder.length && i < maxRows; i++) {
    final int stemIndex = stemsToInclude[stemWeightOrder[i]];
    final int[] tfByDocument = stemsTfByDocument[stemIndex];
    final int df = tfByDocument.length / 2;
    final byte fieldIndices = stemsFieldIndices[stemIndex];

    for (int j = 0; j < df; j++) {
      // tfByDocument stores flattened [documentIndex, tf] pairs
      double weight =
          termWeighting.calculateTermWeight(tfByDocument[j * 2 + 1], df, documentCount);
      weight *= getWeightBoost(titleFieldIndex, fieldIndices);
      tdMatrix.set(i, tfByDocument[j * 2], weight);
    }
  }

  // Convert stemsToInclude into tdMatrixStemIndices
  final IntIntHashMap stemToRowIndex = new IntIntHashMap();
  for (int i = 0; i < stemWeightOrder.length && i < tdMatrix.rows(); i++) {
    stemToRowIndex.put(stemsToInclude[stemWeightOrder[i]], i);
  }

  // Store the results
  vsmContext.termDocumentMatrix = tdMatrix;
  vsmContext.stemToRowIndex = stemToRowIndex;
}
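// A small sketch of the indirect sort used above (HPPC's IndirectSort): it
// returns a permutation of indices ordering the weights descending while
// leaving the weights array itself untouched. Values are illustrative.
double[] weights = {0.2, 0.9, 0.5};
int[] order =
    IndirectSort.mergesort(
        0, weights.length, new IndirectComparator.DescendingDoubleComparator(weights));
// order == {1, 2, 0}; weights[order[0]] is the largest weight, so iterating
// stemWeightOrder fills the matrix rows from the highest-weighted stems down.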