Exemplo n.º 1
0
  /**
   * Clusters {@link #documents} and stores the outcome in {@code clusters}.
   *
   * <p>The actual work is delegated to {@code multilingualClustering}, which calls back into this
   * component's {@code cluster(LanguageCode)} method with the list of documents and the language
   * it supplies. The callback temporarily overwrites the {@code documents} field, so the original
   * list is remembered up front and put back once the delegate returns.
   *
   * <p>When the language aggregation strategy is {@code FLATTEN_ALL}, the resulting clusters are
   * sorted so that "other topics" clusters come last; the remaining ties are broken by descending
   * score and then by descending size.
   *
   * <p>NOTE(review): the previous Javadoc called this "STC clustering" although the enclosing
   * class is referenced as {@code BasicPreprocessing} -- confirm which description is intended.
   *
   * @throws ProcessingException as declared by the overridden method
   */
  @Override
  public void process() throws ProcessingException {
    // Remember the caller-supplied list: the callback below replaces the field on each call.
    final List<Document> originalDocuments = documents;
    clusters =
        multilingualClustering.process(
            documents,
            new IMonolingualClusteringAlgorithm() {
              public List<Cluster> process(List<Document> documents, LanguageCode language) {
                BasicPreprocessing.this.documents = documents;
                BasicPreprocessing.this.cluster(language);
                return BasicPreprocessing.this.clusters;
              }
            });
    documents = originalDocuments;

    if (multilingualClustering.languageAggregationStrategy
        == LanguageAggregationStrategy.FLATTEN_ALL) {
      Collections.sort(
          clusters,
          new Comparator<Cluster>() {
            public int compare(Cluster c1, Cluster c2) {
              // Only order by the "other topics" flag when the two clusters differ on it.
              // Returning 1 whenever c1 is "other topics" (as before) made compare(a, b) and
              // compare(b, a) both positive for two such clusters, which violates the
              // Comparator contract and can make the sort fail or misorder.
              final boolean other1 = c1.isOtherTopics();
              final boolean other2 = c2.isOtherTopics();
              if (other1 != other2) {
                return other1 ? 1 : -1;
              }
              // Higher score first.
              if (c1.getScore() < c2.getScore()) return 1;
              if (c1.getScore() > c2.getScore()) return -1;
              // Then larger cluster first.
              if (c1.size() < c2.size()) return 1;
              if (c1.size() > c2.size()) return -1;
              return 0;
            }
          });
    }
  }
  /**
   * Runs Lingo clustering over {@link #documents} and publishes the result in {@code clusters}.
   *
   * <p>If the language aggregation strategy is {@code FLATTEN_ALL}, the clusters are additionally
   * sorted with a compound ordering built from {@code Cluster.OTHER_TOPICS_AT_THE_END} and
   * {@code Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)}.
   *
   * @throws ProcessingException as declared by the overridden method
   */
  @Override
  @SuppressWarnings("unchecked")
  public void process() throws ProcessingException {
    // Multilingual support is obtained without restructuring the component: the incoming
    // document list is kept aside, and the callback below points the component at each
    // language's documents in turn before clustering them. Swapping the field like this is
    // acceptable because processing components are not thread-safe by definition and the
    // IMonolingualClusteringAlgorithm contract rules out concurrent execution.
    final List<Document> allDocuments = documents;

    final IMonolingualClusteringAlgorithm perLanguageClustering =
        new IMonolingualClusteringAlgorithm() {
          public List<Cluster> process(
              List<Document> languageDocuments, LanguageCode languageCode) {
            LingoClusteringAlgorithm.this.documents = languageDocuments;
            LingoClusteringAlgorithm.this.cluster(languageCode);
            return LingoClusteringAlgorithm.this.clusters;
          }
        };

    clusters = multilingualClustering.process(allDocuments, perLanguageClustering);

    // Put the original list back now that the per-language passes are done.
    documents = allDocuments;

    // Only a flattened result needs a global re-sort.
    if (multilingualClustering.languageAggregationStrategy
        != LanguageAggregationStrategy.FLATTEN_ALL) {
      return;
    }

    Collections.sort(
        clusters,
        Ordering.compound(
            Lists.newArrayList(
                Cluster.OTHER_TOPICS_AT_THE_END,
                Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight))));
  }
Exemplo n.º 3
0
  /**
   * Runs STC clustering over {@link #documents} and publishes the result in {@code clusters}.
   *
   * <p>If the language aggregation strategy is {@code FLATTEN_ALL}, the clusters are additionally
   * sorted: clusters flagged as "other topics" go after the rest, and otherwise clusters are
   * ordered by descending score and then by descending size.
   *
   * @throws ProcessingException as declared by the overridden method
   */
  @Override
  public void process() throws ProcessingException {
    // Multilingual support is obtained without restructuring the component: the incoming
    // document list is kept aside, and the callback below points the component at each
    // language's documents in turn before clustering them. Swapping the field like this is
    // acceptable because processing components are not thread-safe by definition and the
    // IMonolingualClusteringAlgorithm contract rules out concurrent execution.
    final List<Document> allDocuments = documents;

    final IMonolingualClusteringAlgorithm perLanguageClustering =
        new IMonolingualClusteringAlgorithm() {
          public List<Cluster> process(
              List<Document> languageDocuments, LanguageCode languageCode) {
            STCClusteringAlgorithm.this.documents = languageDocuments;
            STCClusteringAlgorithm.this.cluster(languageCode);
            return STCClusteringAlgorithm.this.clusters;
          }
        };

    clusters = multilingualClustering.process(allDocuments, perLanguageClustering);

    // Put the original list back now that the per-language passes are done.
    documents = allDocuments;

    // Only a flattened result needs a global re-sort.
    if (multilingualClustering.languageAggregationStrategy
        != LanguageAggregationStrategy.FLATTEN_ALL) {
      return;
    }

    // TODO: align with the Lingo implementation, which sorts with a compound ordering.
    final Comparator<Cluster> flattenedOrder =
        new Comparator<Cluster>() {
          public int compare(Cluster left, Cluster right) {
            if (left.isOtherTopics() == right.isOtherTopics()) {
              // Same "other topics" status: higher score first, then larger cluster first.
              if (left.getScore() < right.getScore()) {
                return 1;
              } else if (left.getScore() > right.getScore()) {
                return -1;
              } else if (left.size() < right.size()) {
                return 1;
              } else if (left.size() > right.size()) {
                return -1;
              }
              return 0;
            }
            // Exactly one of the two is "other topics": it goes last.
            return left.isOtherTopics() ? 1 : -1;
          }
        };
    Collections.sort(clusters, flattenedOrder);
  }