Example #1
 /**
  * If there are unclustered documents, appends the "Other Topics" group to the <code>clusters
  * </code>.
  *
  * @see #buildOtherTopics(List, List, String)
  */
 public static void appendOtherTopics(
     List<Document> allDocuments, List<Cluster> clusters, String label) {
   final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label);
   if (!otherTopics.getDocuments().isEmpty()) {
     clusters.add(otherTopics);
   }
 }
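A minimal usage sketch of the helper above (the document list, cluster list, and label are hypothetical); per Example #14 the method is a static member of Cluster:

  // Hypothetical data: all documents fed to clustering and the clusters produced so far.
  final List<Document> allDocuments = new ArrayList<Document>();
  final List<Cluster> clusters = new ArrayList<Cluster>();
  // Appends an "Other Topics" group only if some documents were left unassigned.
  Cluster.appendOtherTopics(allDocuments, clusters, "Other Topics");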
Example #2
  @SuppressWarnings("unchecked")
  @Override
  public List<String> getClusterByCarrot2(String query) {
    List<String> strs = new ArrayList<String>();
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);
    final List<org.carrot2.core.Document> documents = Lists.newArrayList();
    try {
      final Query q = getParser().parse(QueryParserUtil.escape(query));
      final TopDocs docs = getIndexSearcher().search(q, Integer.MAX_VALUE);
      final ScoreDoc[] hits = docs.scoreDocs;
      for (int i = 0; i < hits.length; i++) {
        Document doc = getIndexSearcher().doc(hits[i].doc);
        documents.add(
            new org.carrot2.core.Document(
                doc.get(CONTENTS_FIELD), doc.get(TITLE_FIELD), doc.get(USER_FIELD)));
      }
      final ProcessingResult byTopicClusters =
          controller.process(documents, query, LingoClusteringAlgorithm.class);
      final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
      final ProcessingResult byDomainClusters =
          controller.process(documents, query, ByUrlClusteringAlgorithm.class);
      final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
      for (Cluster c : clustersByDomain) {
        strs.add(c.getLabel());
      }
      for (Cluster c : clustersByTopic) {
        strs.add(c.getLabel());
      }
    } catch (Exception ex) {
      // Parse, search, and clustering failures are silently swallowed here;
      // at minimum the exception should be logged.
    }
    return strs;
  }
Example #3
  /**
   * Assigns sequential identifiers to the provided <code>clusters</code> (and their sub-clusters).
   * If a cluster already has an identifier, the identifier will not be changed.
   *
   * @param clusters Clusters to assign identifiers to.
   * @throws IllegalArgumentException if the provided clusters contain non-unique identifiers
   */
  public static void assignClusterIds(Collection<Cluster> clusters) {
    final ArrayList<Cluster> flattened = Lists.newArrayListWithExpectedSize(clusters.size());

    flatten(flattened, clusters);

    synchronized (clusters) {
      final HashSet<Integer> ids = Sets.newHashSet();

      // First, find the start value for the id and check uniqueness of the ids
      // already provided.
      int maxId = Integer.MIN_VALUE;
      for (final Cluster cluster : flattened) {
        if (cluster.id != null) {
          if (!ids.add(cluster.id)) {
            throw new IllegalArgumentException("Non-unique cluster id found: " + cluster.id);
          }
          maxId = Math.max(maxId, cluster.id);
        }
      }

      // We'd rather start with 0
      maxId = Math.max(maxId, -1);

      // Assign missing ids
      for (final Cluster c : flattened) {
        if (c.id == null) {
          c.id = ++maxId;
        }
      }
    }
  }
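A minimal usage sketch (hypothetical clusters) of the identifier assignment above; as in Example #18, the method is called as a static member of Cluster:

  // Hypothetical clusters built with the label constructor shown in Example #11.
  final List<Cluster> clusters =
      Lists.newArrayList(new Cluster("Topic A"), new Cluster("Topic B"));
  // Pre-assigned ids are preserved, missing ones are filled in sequentially;
  // duplicate pre-assigned ids would raise an IllegalArgumentException.
  Cluster.assignClusterIds(clusters);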
Example #4
 @SuppressWarnings("unchecked")
 @Override
 public List<String> getClusterByCarrotVersion2(String query) {
   List<String> strs = new ArrayList<String>();
   final Controller controller = ControllerFactory.createPooling();
   final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>();
   LuceneDocumentSourceDescriptor.attributeBuilder(luceneGlobalAttributes).directory(directory);
   SimpleFieldMapperDescriptor.attributeBuilder(luceneGlobalAttributes)
       .titleField(TITLE_FIELD)
       .contentField(CONTENTS_FIELD)
       .searchFields(Arrays.asList(new String[] {TITLE_FIELD, CONTENTS_FIELD}));
   controller.init(
       new HashMap<String, Object>(),
       new ProcessingComponentConfiguration(
           LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));
   final Map<String, Object> processingAttributes = Maps.newHashMap();
   CommonAttributesDescriptor.attributeBuilder(processingAttributes).query(query);
   ProcessingResult process =
       controller.process(
           processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName());
   for (Cluster c : process.getClusters()) {
     strs.add(c.getLabel() + " >>>> " + c.getAllDocuments().size());
   }
   return strs;
 }
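Note the difference from Example #2: there the Lucene index is searched manually and each hit is wrapped in an org.carrot2.core.Document before calling Controller.process(documents, query, ...), whereas here a LuceneDocumentSource is registered with the controller under the "lucene" id so Carrot2 fetches the documents from the index itself, and only the query is passed through the processing attributes.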
Example #5
 /*
  * Recursive descent into subclusters.
  */
 private static void flatten(ArrayList<Cluster> flattened, Collection<Cluster> clusters) {
   for (Cluster c : clusters) {
     flattened.add(c);
     final List<Cluster> subclusters = c.getSubclusters();
     if (!subclusters.isEmpty()) {
       flatten(flattened, subclusters);
     }
   }
 }
  private Collection<Document> collectDocuments(
      Collection<Cluster> clusters, Collection<Document> documents) {
    for (final Cluster cluster : clusters) {
      documents.addAll(cluster.getDocuments());
       collectDocuments(cluster.getSubclusters(), documents);
    }

    return documents;
  }
Example #7
  /** Replace document refids with the actual references upon deserialization. */
  private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
    if (cluster.documentIds != null) {
      for (Cluster.DocumentRefid documentRefid : cluster.documentIds) {
        cluster.addDocuments(documents.get(documentRefid.refid));
      }
    }

    for (Cluster subcluster : cluster.getSubclusters()) {
      documentIdToReference(subcluster, documents);
    }
  }
  private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue, Document document) {
    if (fieldValue == null) {
      return;
    }

    Cluster cluster = clusters.get(fieldValue);
    if (cluster == null) {
      cluster = new Cluster();
      cluster.addPhrases(buildClusterLabel(fieldValue));
      clusters.put(fieldValue, cluster);
    }

    cluster.addDocuments(document);
  }
Example #9
  /** A recursive routine for collecting unique documents from this cluster and subclusters. */
  private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
    if (cluster == null) {
      return docs;
    }

    docs.addAll(cluster.getDocuments());

    final List<Cluster> subclusters = cluster.getSubclusters();
    for (final Cluster subcluster : subclusters) {
      collectAllDocuments(subcluster, docs);
    }

    return docs;
  }
  /** Performs Lingo clustering of {@link #documents}. */
  @Override
  @SuppressWarnings("unchecked")
  public void process() throws ProcessingException {
    // There is a tiny trick here to support multilingual clustering without
    // refactoring the whole component: we remember the original list of documents
    // and invoke clustering for each language separately within the
    // IMonolingualClusteringAlgorithm implementation below. This is safe because
    // processing components are not thread-safe by definition and
    // IMonolingualClusteringAlgorithm forbids concurrent execution by contract.
    final List<Document> originalDocuments = documents;
    clusters =
        multilingualClustering.process(
            documents,
            new IMonolingualClusteringAlgorithm() {
              public List<Cluster> process(List<Document> documents, LanguageCode language) {
                LingoClusteringAlgorithm.this.documents = documents;
                LingoClusteringAlgorithm.this.cluster(language);
                return LingoClusteringAlgorithm.this.clusters;
              }
            });
    documents = originalDocuments;

    if (multilingualClustering.languageAggregationStrategy
        == LanguageAggregationStrategy.FLATTEN_ALL) {
      Collections.sort(
          clusters,
          Ordering.compound(
              Lists.newArrayList(
                  Cluster.OTHER_TOPICS_AT_THE_END,
                  Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight))));
    }
  }
Example #11
  /**
   * Builds an "Other Topics" cluster that groups those documents from <code>allDocuments</code> that
   * were not referenced in any cluster in <code>clusters</code>.
   *
   * @param allDocuments all documents to check against
   * @param clusters list of clusters with assigned documents
   * @param label label for the "Other Topics" group
   * @return the "Other Topics" cluster
   */
  public static Cluster buildOtherTopics(
      List<Document> allDocuments, List<Cluster> clusters, String label) {
    final Set<Document> unclusteredDocuments = Sets.newLinkedHashSet(allDocuments);
    final Set<Document> assignedDocuments = Sets.newHashSet();

    for (Cluster cluster : clusters) {
      collectAllDocuments(cluster, assignedDocuments);
    }

    unclusteredDocuments.removeAll(assignedDocuments);

    final Cluster otherTopics = new Cluster(label);
    otherTopics.addDocuments(unclusteredDocuments);
    otherTopics.setOtherTopics(true);

    return otherTopics;
  }
Example #12
  /**
   * Locate the first cluster that has id equal to <code>id</code>. The search includes all the
   * clusters in the input and their sub-clusters. The first cluster with matching identifier is
   * returned or <code>null</code> if no such cluster could be found.
   */
  public static Cluster find(int id, Collection<Cluster> clusters) {
    for (Cluster c : clusters) {
      if (c != null) {
        if (c.id != null && c.id == id) {
          return c;
        }

        if (!c.getSubclusters().isEmpty()) {
          final Cluster sub = find(id, c.getSubclusters());
          if (sub != null) {
            return sub;
          }
        }
      }
    }

    return null;
  }
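A minimal usage sketch (the clusters collection and the identifier are hypothetical), assuming find lives in Cluster alongside the other static helpers, as the unqualified recursive call suggests:

  // Look up a cluster by the identifier assigned in Example #3; the search includes sub-clusters.
  final Cluster match = Cluster.find(42, clusters);
  if (match != null) {
    System.out.println("Found cluster: " + match.getLabel());
  }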
Example #13
  int calculateH(Cluster cluster) {
    final Map<Object, Integer> documentCountByPartition =
        getDocumentCountByPartition(cluster.getAllDocuments());

    final ArrayList<Integer> counts = Lists.newArrayList();
    counts.addAll(documentCountByPartition.values());

    return calculateH(counts);
  }
Example #14
  /**
   * Create the junk (unassigned documents) cluster and create the final set of clusters in Carrot2
   * format.
   */
  private void postProcessing(ArrayList<ClusterCandidate> clusters) {
    // Adapt to Carrot2 classes, counting used documents on the way.
    final BitSet all = new BitSet(documents.size());
    final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
    final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
    for (ClusterCandidate c : clusters) {
      final Cluster c2 = new Cluster();
      c2.addPhrases(collectPhrases(phrases, c));
      c2.addDocuments(collectDocuments(docs, c.documents));
      c2.setScore((double) c.score);
      this.clusters.add(c2);

      all.or(c.documents);
      docs.clear();
      phrases.clear();
    }

    Cluster.appendOtherTopics(this.documents, this.clusters);
  }
Example #15
  public void calculate() {
    final int partitionCount = getPartitionsCount(documents);
    if (partitionCount == 0) {
      return;
    }

    int weightSum = 0;
    double contaminationSum = 0;

    for (Cluster cluster : clusters) {
      if (cluster.isOtherTopics()) {
        continue;
      }
      final double contamination = calculate(cluster, partitionCount);
      cluster.setAttribute(CONTAMINATION, contamination);

      contaminationSum += contamination * cluster.size();
      weightSum += cluster.size();
    }

    weightedAverageContamination = contaminationSum / weightSum;
  }
Example #16
  @SuppressWarnings("unchecked")
  double calculate(Cluster cluster, int partitionCount) {
    int clusterPartitionAssignments = 0;
    for (Document document : cluster.getAllDocuments()) {
      clusterPartitionAssignments +=
          ((Collection<Object>) document.getField(Document.PARTITIONS)).size();
    }

    final double worstCaseH = calculateWorstCaseH(clusterPartitionAssignments, partitionCount);
    if (worstCaseH == 0) {
      return 0;
    } else {
      return calculateH(cluster) / worstCaseH;
    }
  }
  /** Performs by URL clustering. */
  @Override
  public void process() throws ProcessingException {
    final Map<Object, Cluster> clusterMap = Maps.newHashMap();
    for (Document document : documents) {
      final Object field = document.getField(fieldName);
      if (field instanceof Collection<?>) {
        for (Object value : (Collection<?>) field) {
          addToCluster(clusterMap, value, document);
        }
      } else {
        addToCluster(clusterMap, field, document);
      }
    }

    clusters = Lists.newArrayList(clusterMap.values());
    Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
    Cluster.appendOtherTopics(documents, clusters);
  }
Example #18
  /**
   * Creates a {@link ProcessingResult} with the provided <code>attributes</code>. Assigns unique
   * document identifiers if documents are present in the <code>attributes</code> map (under the key
   * {@link AttributeNames#DOCUMENTS}).
   */
  @SuppressWarnings("unchecked")
  ProcessingResult(Map<String, Object> attributes) {
    this.attributes = attributes;

    // Replace a modifiable collection of documents with an unmodifiable one
    final List<Document> documents = (List<Document>) attributes.get(AttributeNames.DOCUMENTS);
    if (documents != null) {
      Document.assignDocumentIds(documents);
      attributes.put(AttributeNames.DOCUMENTS, Collections.unmodifiableList(documents));
    }

    // Replace a modifiable collection of clusters with an unmodifiable one
    final List<Cluster> clusters = (List<Cluster>) attributes.get(AttributeNames.CLUSTERS);
    if (clusters != null) {
      Cluster.assignClusterIds(clusters);
      attributes.put(AttributeNames.CLUSTERS, Collections.unmodifiableList(clusters));
    }

    // Store a reference to attributes as an unmodifiable map
    this.attributesView = Collections.unmodifiableMap(attributes);
  }
Example #19
 public Integer apply(Cluster cluster) {
   return cluster.size();
 }
Example #20
 public Double apply(Cluster cluster) {
   return cluster.getAttribute(SCORE);
 }
Example #21
 public String apply(Cluster cluster) {
   return cluster.getLabel();
 }
  /**
   * Performs the actual clustering with an assumption that all documents are written in one <code>
   * language</code>.
   */
  private void cluster(LanguageCode language) {
    // Preprocessing of documents
    final PreprocessingContext context =
        preprocessingPipeline.preprocess(documents, query, language);

    // Further processing only if there are words to process
    clusters = Lists.newArrayList();
    if (context.hasLabels()) {
      // Term-document matrix building and reduction
      final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(context);
      final ReducedVectorSpaceModelContext reducedVsmContext =
          new ReducedVectorSpaceModelContext(vsmContext);
      LingoProcessingContext lingoContext = new LingoProcessingContext(reducedVsmContext);

      matrixBuilder.buildTermDocumentMatrix(vsmContext);
      matrixBuilder.buildTermPhraseMatrix(vsmContext);

      matrixReducer.reduce(
          reducedVsmContext, computeClusterCount(desiredClusterCountBase, documents.size()));

      // Cluster label building
      clusterBuilder.buildLabels(lingoContext, matrixBuilder.termWeighting);

      // Document assignment
      clusterBuilder.assignDocuments(lingoContext);

      // Cluster merging
      clusterBuilder.merge(lingoContext);

      // Format final clusters
      final int[] clusterLabelIndex = lingoContext.clusterLabelFeatureIndex;
      final BitSet[] clusterDocuments = lingoContext.clusterDocuments;
      final double[] clusterLabelScore = lingoContext.clusterLabelScore;
      for (int i = 0; i < clusterLabelIndex.length; i++) {
        final Cluster cluster = new Cluster();

        final int labelFeature = clusterLabelIndex[i];
        if (labelFeature < 0) {
          // Cluster removed during merging
          continue;
        }

        // Add label and score
        cluster.addPhrases(labelFormatter.format(context, labelFeature));
        cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

        // Add documents
        final BitSet bs = clusterDocuments[i];
        for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
          cluster.addDocuments(documents.get(bit));
        }

        // Add cluster
        clusters.add(cluster);
      }

      Collections.sort(clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
    }

    Cluster.appendOtherTopics(documents, clusters);
  }
Example #23
 public Double apply(Cluster cluster) {
   return cluster.isOtherTopics() ? 1.0 : -1.0;
 }