예제 #1
0
 /**
  * If there are unclustered documents, appends the "Other Topics" group to the <code>clusters
  * </code>.
  *
  * @see #buildOtherTopics(List, List, String)
  */
 public static void appendOtherTopics(
     List<Document> allDocuments, List<Cluster> clusters, String label) {
   final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label);
   if (!otherTopics.getDocuments().isEmpty()) {
     clusters.add(otherTopics);
   }
 }
예제 #2
0
  /**
   * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters).
   * If a cluster already has an identifier, the identifier will not be changed.
   *
   * @param clusters Clusters to assign identifiers to.
   * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers
   */
  public static void assignClusterIds(Collection<Cluster> clusters) {
    final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size());

    flatten(flattened, clusters);

    synchronized (clusters) {
      final HashSet<Integer> ids = Sets.newHashSet();

      // First, find the start value for the id and check uniqueness of the ids
      // already provided.
      int maxId = Integer.MIN_VALUE;
      for (final Cluster cluster : flattened) {
        if (cluster.id != null) {
          if (!ids.add(cluster.id)) {
            throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id);
          }
          maxId = Math.max(maxId, cluster.id);
        }
      }

      // We'd rather start with 0
      maxId = Math.max(maxId, -1);

      // Assign missing ids
      for (final Cluster c : flattened) {
        if (c.id == null) {
          c.id = ++maxId;
        }
      }
    }
  }
예제 #3
0
 /*
  * Recursive descent into subclusters.
  */
 private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) {
   for (Cluster c : clusters) {
     flattened.add(c);
     final List<Cluster> subclusters = c.getSubclusters();
     if (!subclusters.isEmpty()) {
       flatten(flattened, subclusters);
     }
   }
 }
예제 #4
0
  /** A recursive routine for collecting unique documents from this cluster and subclusters. */
  private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
    if (cluster == null) {
      return docs;
    }

    docs.addAll(cluster.getDocuments());

    final List<Cluster> subclusters = cluster.getSubclusters();
    for (final Cluster subcluster : subclusters) {
      collectAllDocuments(subcluster, docs);
    }

    return docs;
  }
예제 #5
0
  /**
   * Builds an "Other Topics" cluster that groups those documents from <code>allDocument</code> that
   * were not referenced in any cluster in <code>clusters</code>.
   *
   * @param allDocuments all documents to check against
   * @param clusters list of clusters with assigned documents
   * @param label label for the "Other Topics" group
   * @return the "Other Topics" cluster
   */
  public static Cluster buildOtherTopics(
      List<Document> allDocuments, List<Cluster> clusters, String label) {
    final Set<Document> unclusteredDocuments = Sets.newLinkedHashSet(allDocuments);
    final Set<Document> assignedDocuments = Sets.newHashSet();

    for (Cluster cluster : clusters) {
      collectAllDocuments(cluster, assignedDocuments);
    }

    unclusteredDocuments.removeAll(assignedDocuments);

    final Cluster otherTopics = new Cluster(label);
    otherTopics.addDocuments(unclusteredDocuments);
    otherTopics.setOtherTopics(true);

    return otherTopics;
  }
예제 #6
0
  /**
   * Locate the first cluster that has id equal to <code>id</code>. The search includes all the
   * clusters in the input and their sub-clusters. The first cluster with matching identifier is
   * returned or <code>null</code> if no such cluster could be found.
   */
  public static Cluster find(int id, Collection<Cluster> clusters) {
    for (Cluster c : clusters) {
      if (c != null) {
        if (c.id != null && c.id == id) {
          return c;
        }

        if (!c.getSubclusters().isEmpty()) {
          final Cluster sub = find(id, c.getSubclusters());
          if (sub != null) {
            return sub;
          }
        }
      }
    }

    return null;
  }
예제 #7
0
 public Double apply(Cluster cluster) {
   return cluster.isOtherTopics() ? 1.0 : -1.0;
 }
예제 #8
0
 public String apply(Cluster cluster) {
   return cluster.getLabel();
 }
예제 #9
0
 public Double apply(Cluster cluster) {
   return cluster.getAttribute(SCORE);
 }
예제 #10
0
 public Integer apply(Cluster cluster) {
   return cluster.size();
 }