Ejemplo n.º 1
0
    /**
     * {@inheritDoc}
     *
     * <p>Applies {@code transform} to the co-occurrence matrix and then, for every term in the
     * basis, stores the sum of the non-zero entries of that term's row of the transformed matrix
     * in {@code wordScores}.
     */
    public void processSpace(Properties properties) {
      SparseMatrix transformed = (SparseMatrix) transform.transform(cooccurrenceMatrix);
      for (String word : basis.keySet()) {
        SparseDoubleVector rowVector = transformed.getRowVector(basis.getDimension(word));

        // Only the non-zero entries can contribute to the row total.
        double total = 0;
        for (int column : rowVector.getNonZeroIndices()) {
          total += rowVector.get(column);
        }

        wordScores.put(word, total);
      }
    }
Ejemplo n.º 2
0
 /**
  * Collects the row indices of the neighbors of the impost node, followed by the row index of
  * the impost node itself.
  *
  * @param sm the matrix whose row {@code rowIndex} is read
  * @param rowIndex the row index of the impost node
  * @return the indices of the non-zero entries of row {@code rowIndex}, with {@code rowIndex}
  *     appended as the final element
  */
 private static int[] getImpostNeighbors(SparseMatrix sm, int rowIndex) {
   int[] adjacent = sm.getRowVector(rowIndex).getNonZeroIndices();
   int[] result = new int[adjacent.length + 1];
   System.arraycopy(adjacent, 0, result, 0, adjacent.length);
   // The extra trailing slot holds the impost node itself.
   result[adjacent.length] = rowIndex;
   return result;
 }
Ejemplo n.º 3
0
  /**
   * {@inheritDoc}
   *
   * <p>The input is treated as an edge matrix: a non-zero value at (i,j) denotes an edge between
   * rows i and j. Edges (not nodes) are clustered with single-linkage agglomerative clustering,
   * the partitioning with the highest partition density is selected, and each node is then
   * assigned to every cluster that contains one of its edges, so a node may receive several
   * cluster labels.
   *
   * <p>As a side effect this method stores the row count, the extracted edge list and the merge
   * order in the {@code numRows}, {@code edgeList} and {@code mergeOrder} fields.
   *
   * @param matrix a square {@link SparseMatrix} of edges
   * @param props properties for this run; {@code KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY} is
   *     read from it and treated as {@code true} when absent
   * @return the soft assignment of each row (node) to zero or more clusters, together with the
   *     number of clusters that were labeled
   * @throws IllegalArgumentException if {@code matrix} is not square, or is not an instance of
   *     {@link SparseMatrix}
   */
  public Assignments cluster(Matrix matrix, Properties props) {
    if (matrix.rows() != matrix.columns()) {
      throw new IllegalArgumentException(
          "Input matrix is not square. "
              + "Matrix is expected to be a square matrix whose values (i,j) "
              + "denote an edge from row i to row j");
    }
    if (!(matrix instanceof SparseMatrix)) {
      throw new IllegalArgumentException("Input matrix must be a " + "sparse matrix.");
    }
    SparseMatrix sm = (SparseMatrix) matrix;

    // The similarity matrix is kept in memory unless the property explicitly says otherwise.
    String inMemProp = props.getProperty(KEEP_SIMILARITY_MATRIX_IN_MEMORY_PROPERTY);
    boolean keepSimMatrixInMem = (inMemProp == null) || Boolean.parseBoolean(inMemProp);

    // IMPLEMENTATION NOTE: Ahn et al. used single-linkage HAC, which can be
    // efficiently implemented in O(n^2) time as a special case of HAC.
    // However, we currently don't optimize for this special case and
    // instead use our HAC class.  Because of the complexity of the edge
    // similarity function, we build our own similarity matrix and then pass
    // it in, rather than passing in the edge matrix directly.

    final int rows = sm.rows();
    numRows = rows;
    LOGGER.fine("Generating link similarity matrix for " + rows + " nodes");

    // Rather than create an O(row^3) matrix for representing the edges,
    // compress the edge matrix by getting a mapping for each edge to a row
    // in the new matrix.  An edge's position in this list is its index in
    // the edge similarity matrix.
    final List<Edge> edgeList = new ArrayList<Edge>();
    this.edgeList = edgeList;

    for (int r = 0; r < rows; ++r) {
      SparseDoubleVector row = sm.getRowVector(r);
      int[] edges = row.getNonZeroIndices();
      for (int col : edges) {
        if (r > col) {
          // Always add edges found below the diagonal (lower triangle).
          edgeList.add(new Edge(r, col));
        } else if (r < col && sm.get(col, r) == 0) {
          // An edge above the diagonal is only added when its mirror
          // entry below the diagonal is absent.  This avoids counting
          // duplicate edges.
          edgeList.add(new Edge(r, col));
        }
        // Entries on the diagonal (r == col, i.e. self-loops) are skipped.
      }
    }

    final int numEdges = edgeList.size();
    LOGGER.fine("Number of edges to cluster: " + numEdges);

    Matrix edgeSimMatrix = getEdgeSimMatrix(edgeList, sm, keepSimMatrixInMem);

    LOGGER.fine("Computing single linkage link clustering");

    final List<Merge> mergeOrder =
        new HierarchicalAgglomerativeClustering()
            .buildDendrogram(edgeSimMatrix, ClusterLinkage.SINGLE_LINKAGE);
    this.mergeOrder = mergeOrder;

    LOGGER.fine("Calculating partition densitities");

    // Set up a concurrent map from partition density to the number of
    // merges that produced it.  Each task writes to this map exactly once,
    // after it has scored its own partitioning.  Because the density is the
    // key, partitionings with identical density overwrite one another.
    final ConcurrentNavigableMap<Double, Integer> partitionDensities =
        new ConcurrentSkipListMap<Double, Integer>();

    // Register a task group for calculating all of the partition densities.
    // Task p scores the partitioning obtained by applying the first p
    // merges.
    Object key = WORK_QUEUE.registerTaskGroup(mergeOrder.size());
    for (int p = 0; p < mergeOrder.size(); ++p) {
      final int part = p;
      WORK_QUEUE.add(
          key,
          new Runnable() {
            public void run() {
              // Get the merges for this particular partitioning of
              // the links
              List<Merge> mergeSteps = mergeOrder.subList(0, part);

              // Convert the merges to a specific cluster labeling
              MultiMap<Integer, Integer> clusterToElements =
                  convertMergesToAssignments(mergeSteps, numEdges);

              // Based on the link partitioning, calculate the
              // partition density for each cluster
              double partitionDensitySum = 0d;
              for (Integer cluster : clusterToElements.keySet()) {
                Set<Integer> linkPartition = clusterToElements.get(cluster);
                int numLinks = linkPartition.size();
                // Count the distinct nodes touched by this cluster's links.
                BitSet nodesInPartition = new BitSet(rows);
                for (Integer linkIndex : linkPartition) {
                  Edge link = edgeList.get(linkIndex);
                  nodesInPartition.set(link.from);
                  nodesInPartition.set(link.to);
                }
                int numNodes = nodesInPartition.cardinality();
                // This reflects the density of this particular
                // cluster.
                // NOTE(review): this expression appears to differ from
                // the partition density in Ahn et al., which subtracts
                // (numNodes - 1) in the denominator and weights each
                // cluster by its link count.  It is left unchanged here
                // because altering it would change clustering results;
                // confirm the intended formula.
                double partitionDensity =
                    (numLinks - (numNodes - 1d))
                        / (((numNodes * (numNodes - 1d)) / 2d) - (numLinks - 1));
                partitionDensitySum += partitionDensity;
              }
              // Compute the density for the total partitioning
              // solution
              double partitionDensity = (2d / numEdges) * partitionDensitySum;
              LOGGER.log(
                  Level.FINER,
                  "Partition solution {0} had " + "density {1}",
                  new Object[] {part, partitionDensity});

              // Update the thread-shared partition density map with
              // this task's calculation
              partitionDensities.put(partitionDensity, part);
            }
          });
    }

    // Wait for all the partition densities to be calculated
    WORK_QUEUE.await(key);

    // The map is sorted by density, so the last entry is the densest
    // partitioning.  When no partitioning was scored (no merges were
    // produced) the map is empty and lastEntry() is null; in that case fall
    // back to the partitioning with no merges applied instead of failing.
    int partitionWithMaxDensity = 0;
    Map.Entry<Double, Integer> densest = partitionDensities.lastEntry();
    if (densest != null) {
      LOGGER.fine(
          "Partition " + densest.getValue() + " had the highest density: " + densest.getKey());
      partitionWithMaxDensity = densest.getValue();
    }

    // Select the solution with the highest partition density and assign
    // nodes accordingly
    MultiMap<Integer, Integer> bestEdgeAssignment =
        convertMergesToAssignments(mergeOrder.subList(0, partitionWithMaxDensity), numEdges);

    List<Set<Integer>> nodeClusters = new ArrayList<Set<Integer>>(rows);
    for (int i = 0; i < rows; ++i) {
      nodeClusters.add(new HashSet<Integer>());
    }

    // Ignore the original partition labeling, and use our own cluster
    // labeling to ensure that the IDs are contiguous.  This counter doubles
    // as the next cluster ID and, once the loop finishes, as the total
    // number of clusters.
    int numClusters = 0;

    // For each of the partitions, add the partition's cluster ID to all the
    // nodes that are connected by one of the partition's edges
    for (Integer cluster : bestEdgeAssignment.keySet()) {
      Set<Integer> edgePartition = bestEdgeAssignment.get(cluster);
      for (Integer edgeId : edgePartition) {
        Edge e = edgeList.get(edgeId);
        nodeClusters.get(e.from).add(numClusters);
        nodeClusters.get(e.to).add(numClusters);
      }
      // Advance to the next cluster id
      numClusters++;
    }

    Assignment[] nodeAssignments = new Assignment[rows];
    for (int i = 0; i < nodeAssignments.length; ++i) {
      nodeAssignments[i] = new SoftAssignment(nodeClusters.get(i));
    }
    // Report the number of clusters that were actually labeled above.
    return new Assignments(numClusters, nodeAssignments, matrix);
  }