/** {@inheritDoc} */
public void processSpace(Properties properties) {
    SparseMatrix cleanedMatrix =
        (SparseMatrix) transform.transform(cooccurrenceMatrix);
    for (String term : basis.keySet()) {
        int index = basis.getDimension(term);
        SparseDoubleVector sdv = cleanedMatrix.getRowVector(index);
        // Score the term as the sum of its transformed co-occurrence
        // weights
        double score = 0;
        for (int i : sdv.getNonZeroIndices())
            score += sdv.get(i);
        wordScores.put(term, score);
    }
}
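A minimal, self-contained sketch of the scoring step above, using plain arrays in place of the SparseMatrix and basis-mapping APIs; the terms and matrix values are made up for illustration.

import java.util.LinkedHashMap;
import java.util.Map;

public class RowSumScoreSketch {
    public static void main(String[] args) {
        // Hypothetical transformed co-occurrence rows, one per term.
        String[] terms = { "cat", "dog" };
        double[][] rows = {
            { 0.0, 1.5, 0.25 },   // row for "cat"
            { 2.0, 0.0, 0.75 }    // row for "dog"
        };
        Map<String, Double> wordScores = new LinkedHashMap<String, Double>();
        for (int r = 0; r < terms.length; r++) {
            double score = 0;
            // Summing all entries is equivalent to summing only the
            // nonzero ones, since zeros contribute nothing.
            for (double v : rows[r])
                score += v;
            wordScores.put(terms[r], score);
        }
        System.out.println(wordScores);  // {cat=1.75, dog=2.75}
    }
}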
@Override
public ScoreResult score(SparseMatrix m) {
    // Start from the plain HITS scores; the column (authority) scores are
    // used as a reference vector for selecting singular dimensions.
    ScoreResult hitsResult = hits.score(m);
    double[] colScore = hitsResult.getColScore();

    // Compute a truncated SVD of the matrix.
    SingularValueDecompositionLibJ svd = new SingularValueDecompositionLibJ();
    svd.factorize(m, Math.min(20, Math.min(m.rows(), m.columns()) * 7 / 11));
    Matrix u = svd.dataClasses();
    Matrix v = new TransposedMatrix(svd.classFeatures());

    // If the factorization produced NaN values, fall back to the plain
    // HITS result.
    if (hasNaN(u) || hasNaN(v))
        return hitsResult;

    double[] svs = svd.singularValues();

    // Keep the (at most five) singular dimensions whose right-singular
    // vectors are most similar to the HITS column scores.
    PriorityQueue<Double, Integer> cos2dim = PriorityQueue.make(
        Math.min(5, svs.length),
        SortUtils.reverse(Double.class),
        SortUtils.comp(Integer.class));
    for (int j = 0; j < v.columns(); j++) {
        double cos = MatrixUtils.cosine(colScore, v.getColumn(j));
        cos2dim.add(cos, j);
    }
    List<Integer> dims = cos2dim.values();

    // Hub scores: singular-value-weighted row norms of U, restricted to
    // the selected dimensions.
    double[] hub = new double[u.rows()];
    for (int k = 0; k < u.rows(); k++) {
        double sum = 0;
        for (int i : dims)
            sum += u.get(k, i) * u.get(k, i) * svs[i] * svs[i];
        hub[k] = Math.sqrt(sum);
    }
    MatrixUtils.normalize(hub);

    // Authority scores: the analogous weighted row norms of V.
    double[] authority = new double[v.rows()];
    for (int k = 0; k < v.rows(); k++) {
        double sum = 0;
        for (int i : dims)
            sum += v.get(k, i) * v.get(k, i) * svs[i] * svs[i];
        authority[k] = Math.sqrt(sum);
    }
    MatrixUtils.normalize(authority);

    return new ScoreResult(hub, authority);
}
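The hub and authority loops above reduce to singular-value-weighted row norms over the selected dimensions. The following self-contained sketch repeats that computation with plain arrays; the matrix U, the singular values, and the chosen dimensions are made-up stand-ins, and scaling to unit length is only one plausible reading of what MatrixUtils.normalize does.

import java.util.Arrays;
import java.util.List;

public class WeightedRowNormSketch {
    public static void main(String[] args) {
        // Hypothetical left-singular vectors (rows of U) and singular values.
        double[][] u = {
            { 0.6, 0.1, 0.2 },
            { 0.1, 0.7, 0.1 },
            { 0.3, 0.2, 0.5 }
        };
        double[] svs = { 3.0, 2.0, 1.0 };
        // Hypothetical dimensions selected by the cosine ranking.
        List<Integer> dims = Arrays.asList(0, 2);

        double[] hub = new double[u.length];
        double norm = 0;
        for (int k = 0; k < u.length; k++) {
            double sum = 0;
            for (int i : dims)
                sum += u[k][i] * u[k][i] * svs[i] * svs[i];
            hub[k] = Math.sqrt(sum);
            norm += hub[k] * hub[k];
        }
        // Scale to unit length; the original code delegates normalization
        // to MatrixUtils.normalize, whose exact scheme may differ.
        norm = Math.sqrt(norm);
        for (int k = 0; k < hub.length; k++)
            hub[k] /= norm;
        System.out.println(Arrays.toString(hub));
    }
}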
/**
 * Returns an array containing the row indices of the neighbors of the
 * impost node and the row index of the impost node itself.
 */
private static int[] getImpostNeighbors(SparseMatrix sm, int rowIndex) {
    int[] impost1edges = sm.getRowVector(rowIndex).getNonZeroIndices();
    int[] neighbors = Arrays.copyOf(impost1edges, impost1edges.length + 1);
    neighbors[neighbors.length - 1] = rowIndex;
    return neighbors;
}
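A small stand-alone sketch of the same copy-and-append pattern with plain arrays; the neighbor indices and row index are made up.

import java.util.Arrays;

public class ImpostNeighborsSketch {
    public static void main(String[] args) {
        int rowIndex = 4;
        int[] nonZeroIndices = { 1, 7, 9 };   // stand-in for the sparse row
        // Copy the neighbor indices and append the node's own index.
        int[] neighbors =
            Arrays.copyOf(nonZeroIndices, nonZeroIndices.length + 1);
        neighbors[neighbors.length - 1] = rowIndex;
        System.out.println(Arrays.toString(neighbors));  // [1, 7, 9, 4]
    }
}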
/**
 * {@inheritDoc}
 *
 * @throws IllegalArgumentException if {@code matrix} is not square, or is
 *         not an instance of {@link SparseMatrix}
 */
public Assignments cluster(Matrix matrix, Properties props) {
    if (matrix.rows() != matrix.columns())
        throw new IllegalArgumentException(
            "Input matrix is not square. "
            + "Matrix is expected to be a square matrix whose values (i,j) "
            + "denote an edge from row i to row j");
    if (!(matrix instanceof SparseMatrix))
        throw new IllegalArgumentException(
            "Input matrix must be a sparse matrix.");
    SparseMatrix sm = (SparseMatrix) matrix;

    String inMemProp =
        props.getProperty(KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY);
    boolean keepSimMatrixInMem = (inMemProp != null)
        ? Boolean.parseBoolean(inMemProp)
        : true;

    // IMPLEMENTATION NOTE: Ahn et al. used single-linkage HAC, which can
    // be efficiently implemented in O(n^2) time as a special case of HAC.
    // However, we currently don't optimize for this special case and
    // instead use our HAC class.  Because of the complexity of the edge
    // similarity function, we build our own similarity matrix and then
    // pass it in, rather than passing in the edge matrix directly.

    final int rows = sm.rows();
    numRows = rows;
    LOGGER.fine("Generating link similarity matrix for " + rows + " nodes");

    // Rather than create an O(row^3) matrix for representing the edges,
    // compress the edge matrix by getting a mapping for each edge to a
    // row in the new matrix.
    final List<Edge> edgeList = new ArrayList<Edge>();
    this.edgeList = edgeList;
    for (int r = 0; r < rows; ++r) {
        SparseDoubleVector row = sm.getRowVector(r);
        int[] edges = row.getNonZeroIndices();
        for (int col : edges) {
            // Always add edges found in the lower triangular
            if (r > col)
                edgeList.add(new Edge(r, col));
            // Otherwise, only add the edge from the upper triangular if
            // its symmetric entry in the lower triangular is absent.
            // This avoids counting duplicate edges.
            else if (r < col && sm.get(col, r) == 0)
                edgeList.add(new Edge(r, col));
        }
    }
    final int numEdges = edgeList.size();
    LOGGER.fine("Number of edges to cluster: " + numEdges);

    Matrix edgeSimMatrix =
        getEdgeSimMatrix(edgeList, sm, keepSimMatrixInMem);

    LOGGER.fine("Computing single linkage link clustering");
    final List<Merge> mergeOrder = new HierarchicalAgglomerativeClustering()
        .buildDendrogram(edgeSimMatrix, ClusterLinkage.SINGLE_LINKAGE);
    this.mergeOrder = mergeOrder;

    LOGGER.fine("Calculating partition densities");

    // Set up a concurrent map that each thread will update once it has
    // calculated the densities of each of its partitions.  This map is
    // only written to once per thread.
    final ConcurrentNavigableMap<Double, Integer> partitionDensities =
        new ConcurrentSkipListMap<Double, Integer>();

    // Register a task group for calculating all of the partition
    // densities
    Object key = WORK_QUEUE.registerTaskGroup(mergeOrder.size());
    for (int p = 0; p < mergeOrder.size(); ++p) {
        final int part = p;
        WORK_QUEUE.add(key, new Runnable() {
            public void run() {
                // Get the merges for this particular partitioning of
                // the links
                List<Merge> mergeSteps = mergeOrder.subList(0, part);

                // Convert the merges to a specific cluster labeling
                MultiMap<Integer, Integer> clusterToElements =
                    convertMergesToAssignments(mergeSteps, numEdges);

                // Based on the link partitioning, calculate the
                // partition density for each cluster
                double partitionDensitySum = 0d;
                for (Integer cluster : clusterToElements.keySet()) {
                    Set<Integer> linkPartition =
                        clusterToElements.get(cluster);
                    int numLinks = linkPartition.size();
                    BitSet nodesInPartition = new BitSet(rows);
                    for (Integer linkIndex : linkPartition) {
                        Edge link = edgeList.get(linkIndex);
                        nodesInPartition.set(link.from);
                        nodesInPartition.set(link.to);
                    }
                    int numNodes = nodesInPartition.cardinality();
                    // This reflects the density of this particular
                    // cluster
                    double partitionDensity =
                        (numLinks - (numNodes - 1d))
                        / (((numNodes * (numNodes - 1d)) / 2d)
                           - (numLinks - 1));
                    partitionDensitySum += partitionDensity;
                }

                // Compute the density for the total partitioning
                // solution
                double partitionDensity =
                    (2d / numEdges) * partitionDensitySum;
                LOGGER.log(Level.FINER,
                           "Partition solution {0} had density {1}",
                           new Object[] { part, partitionDensity });

                // Update the thread-shared partition density map with
                // this task's calculation
                partitionDensities.put(partitionDensity, part);
            }
        });
    }

    // Wait for all the partition densities to be calculated
    WORK_QUEUE.await(key);
    Map.Entry<Double, Integer> densest = partitionDensities.lastEntry();
    LOGGER.fine("Partition " + densest.getValue()
                + " had the highest density: " + densest.getKey());
    int partitionWithMaxDensity = densest.getValue();

    // Select the solution with the highest partition density and assign
    // nodes accordingly
    MultiMap<Integer, Integer> bestEdgeAssignment =
        convertMergesToAssignments(
            mergeOrder.subList(0, partitionWithMaxDensity), numEdges);

    List<Set<Integer>> nodeClusters = new ArrayList<Set<Integer>>(rows);
    for (int i = 0; i < rows; ++i)
        nodeClusters.add(new HashSet<Integer>());

    // Ignore the original partition labeling, and use our own cluster
    // labeling to ensure that the IDs are contiguous.
    int clusterId = 0;

    // For each of the partitions, add the partition's cluster ID to all
    // the nodes that are connected by one of the partition's edges
    for (Integer cluster : bestEdgeAssignment.keySet()) {
        Set<Integer> edgePartition = bestEdgeAssignment.get(cluster);
        for (Integer edgeId : edgePartition) {
            Edge e = edgeList.get(edgeId);
            nodeClusters.get(e.from).add(clusterId);
            nodeClusters.get(e.to).add(clusterId);
        }
        // Update the cluster id
        clusterId++;
    }

    // The number of distinct cluster labels assigned above
    int numClusters = clusterId;
    Assignment[] nodeAssignments = new Assignment[rows];
    for (int i = 0; i < nodeAssignments.length; ++i)
        nodeAssignments[i] = new SoftAssignment(nodeClusters.get(i));
    return new Assignments(numClusters, nodeAssignments, matrix);
}
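The per-cluster density expression and the (2 / numEdges) aggregation above can be exercised in isolation. The sketch below mirrors that arithmetic for two hypothetical link partitions described only by their link and node counts; the counts are made up for illustration.

public class PartitionDensitySketch {
    public static void main(String[] args) {
        // Hypothetical link partitions: { numLinks, numNodes } per cluster.
        int[][] clusters = { { 3, 3 }, { 4, 4 } };
        int totalEdges = 7;

        double partitionDensitySum = 0d;
        for (int[] c : clusters) {
            int numLinks = c[0];
            int numNodes = c[1];
            // Same per-cluster expression as in the task above.
            double density = (numLinks - (numNodes - 1d))
                / (((numNodes * (numNodes - 1d)) / 2d) - (numLinks - 1));
            partitionDensitySum += density;
        }
        // Density of the whole partitioning solution.
        double partitionDensity = (2d / totalEdges) * partitionDensitySum;
        System.out.println(partitionDensity);  // ~0.381
    }
}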