示例#1
0
 /**
  * Adds an "Other Topics" group to <code>clusters</code>, but only when at least one document
  * from <code>allDocuments</code> is not referenced by any of the provided clusters.
  *
  * @param allDocuments all documents to check against
  * @param clusters clusters with assigned documents; a non-empty "Other Topics" cluster is
  *     appended to this list
  * @param label label for the "Other Topics" group
  * @see #buildOtherTopics(List, List, String)
  */
 public static void appendOtherTopics(
     List<Document> allDocuments, List<Cluster> clusters, String label) {
   final Cluster remainder = buildOtherTopics(allDocuments, clusters, label);
   if (remainder.getDocuments().isEmpty()) {
     // Every document is already referenced by some cluster; nothing to append.
     return;
   }
   clusters.add(remainder);
 }
示例#2
0
  /**
   * Gives every cluster in <code>clusters</code> (sub-clusters included) that has no identifier a
   * fresh sequential one. Identifiers that are already present are left untouched; new
   * identifiers continue from the largest existing one, or start at 0 when no existing
   * identifier is non-negative.
   *
   * @param clusters Clusters to assign identifiers to.
   * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers
   */
  public static void assignClusterIds(Collection<Cluster> clusters) {
    final ArrayList<Cluster> all = Lists.newArrayListWithExpectedSize(clusters.size());
    flatten(all, clusters);

    synchronized (clusters) {
      final HashSet<Integer> seen = Sets.newHashSet();

      // Pass 1: verify that pre-assigned ids are unique and remember the largest one.
      // Starting at -1 guarantees that freshly assigned ids are never negative.
      int highest = -1;
      for (final Cluster candidate : all) {
        if (candidate.id == null) {
          continue;
        }
        if (!seen.add(candidate.id)) {
          throw new IllegalArgumentException("Non-unique cluster id found: " + candidate.id);
        }
        if (candidate.id > highest) {
          highest = candidate.id;
        }
      }

      // Pass 2: hand out consecutive ids to the clusters that still lack one.
      int next = highest + 1;
      for (final Cluster candidate : all) {
        if (candidate.id == null) {
          candidate.id = next++;
        }
      }
    }
  }
示例#3
0
 /*
  * Appends every cluster, immediately followed by all of its descendants, to
  * <code>flattened</code> (recursive, parent before its subclusters).
  */
 private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) {
   for (final Cluster current : clusters) {
     flattened.add(current);

     final List<Cluster> children = current.getSubclusters();
     if (children.isEmpty()) {
       // Leaf cluster: no need to descend.
       continue;
     }
     flatten(flattened, children);
   }
 }
示例#4
0
  /**
   * Upon deserialization, turns the document refids stored on <code>cluster</code> into actual
   * document references, then does the same for each of its subclusters.
   *
   * @param cluster cluster whose <code>documentIds</code> (if any) are resolved
   * @param documents documents available for lookup, keyed by refid
   */
  private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
    if (cluster.documentIds != null) {
      for (final Cluster.DocumentRefid ref : cluster.documentIds) {
        final Document resolved = documents.get(ref.refid);
        cluster.addDocuments(resolved);
      }
    }

    // Descend into the hierarchy so that nested clusters get their references too.
    final List<Cluster> children = cluster.getSubclusters();
    for (final Cluster child : children) {
      documentIdToReference(child, documents);
    }
  }
示例#5
0
  /**
   * Recursively gathers the documents of <code>cluster</code> and of all its subclusters into
   * <code>docs</code>. A <code>null</code> cluster contributes nothing.
   *
   * @param cluster root of the cluster subtree to visit, may be <code>null</code>
   * @param docs accumulator the documents are added to
   * @return the same <code>docs</code> instance that was passed in
   */
  private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
    if (cluster != null) {
      docs.addAll(cluster.getDocuments());

      for (final Cluster child : cluster.getSubclusters()) {
        collectAllDocuments(child, docs);
      }
    }

    return docs;
  }
示例#6
0
  /**
   * Creates an "Other Topics" cluster holding those documents from <code>allDocuments</code>
   * that are not referenced by any cluster in <code>clusters</code> (subclusters included). The
   * returned cluster is flagged as "other topics" and may contain no documents at all.
   *
   * @param allDocuments all documents to check against
   * @param clusters list of clusters with assigned documents
   * @param label label for the "Other Topics" group
   * @return the "Other Topics" cluster
   */
  public static Cluster buildOtherTopics(
      List<Document> allDocuments, List<Cluster> clusters, String label) {
    // Everything reachable from the provided clusters counts as assigned.
    final Set<Document> assigned = Sets.newHashSet();
    for (final Cluster cluster : clusters) {
      collectAllDocuments(cluster, assigned);
    }

    // Whatever remains after removing the assigned documents is unclustered.
    final Set<Document> leftovers = Sets.newLinkedHashSet(allDocuments);
    leftovers.removeAll(assigned);

    final Cluster result = new Cluster(label);
    result.addDocuments(leftovers);
    result.setOtherTopics(true);
    return result;
  }
示例#7
0
  /**
   * Searches <code>clusters</code> and their sub-clusters for a cluster whose identifier equals
   * <code>id</code>. A cluster is checked before its sub-clusters, and <code>null</code> entries
   * are skipped.
   *
   * @param id identifier to look for
   * @param clusters clusters to search
   * @return the first matching cluster, or <code>null</code> if no such cluster could be found
   */
  public static Cluster find(int id, Collection<Cluster> clusters) {
    for (final Cluster candidate : clusters) {
      if (candidate == null) {
        continue;
      }

      if (candidate.id != null && candidate.id == id) {
        return candidate;
      }

      final List<Cluster> children = candidate.getSubclusters();
      if (children.isEmpty()) {
        continue;
      }

      final Cluster nested = find(id, children);
      if (nested != null) {
        return nested;
      }
    }

    return null;
  }
示例#8
0
  /**
   * Creates a {@link ProcessingResult} backed by the provided <code>attributes</code> map. If the
   * map contains documents (under {@link AttributeNames#DOCUMENTS}) or clusters (under
   * {@link AttributeNames#CLUSTERS}), identifiers are assigned to them and the corresponding map
   * entry is replaced with an unmodifiable view of the same list.
   */
  @SuppressWarnings("unchecked")
  ProcessingResult(Map<String, Object> attributes) {
    this.attributes = attributes;

    // Documents: assign ids, then expose the list read-only.
    final Object rawDocuments = attributes.get(AttributeNames.DOCUMENTS);
    if (rawDocuments != null) {
      final List<Document> docs = (List<Document>) rawDocuments;
      Document.assignDocumentIds(docs);
      attributes.put(AttributeNames.DOCUMENTS, Collections.unmodifiableList(docs));
    }

    // Clusters: same treatment as documents.
    final Object rawClusters = attributes.get(AttributeNames.CLUSTERS);
    if (rawClusters != null) {
      final List<Cluster> groups = (List<Cluster>) rawClusters;
      Cluster.assignClusterIds(groups);
      attributes.put(AttributeNames.CLUSTERS, Collections.unmodifiableList(groups));
    }

    // Keep a read-only view of the attribute map itself.
    this.attributesView = Collections.unmodifiableMap(attributes);
  }
示例#9
0
 /** Maps an "other topics" cluster to 1.0 and any other cluster to -1.0. */
 public Double apply(Cluster cluster) {
   if (cluster.isOtherTopics()) {
     return 1.0;
   }
   return -1.0;
 }
示例#10
0
 /** Maps a cluster to its label. */
 public String apply(Cluster cluster) {
   final String label = cluster.getLabel();
   return label;
 }
示例#11
0
 /** Maps a cluster to the value of its <code>SCORE</code> attribute. */
 public Double apply(Cluster cluster) {
   final Double score = cluster.getAttribute(SCORE);
   return score;
 }
示例#12
0
 /** Maps a cluster to the value returned by its <code>size()</code> method. */
 public Integer apply(Cluster cluster) {
   final int size = cluster.size();
   return size;
 }