Ejemplo n.º 1
0
  /**
   * Runs a query with {@code /xml/carrot2-no-ids.xml} supplied as the {@code xml} attribute of
   * {@link XmlDocumentSource} and expects two documents whose values under {@code
   * DOCUMENT_TO_INT_ID} are 0 and 1, in that order.
   */
  @Test
  public void testNoIdsInSourceXml() {
    final IResource noIdsXml = resourceLocator.getFirst("/xml/carrot2-no-ids.xml");
    final String xmlAttributeKey = AttributeUtils.getKey(XmlDocumentSource.class, "xml");
    processingAttributes.put(xmlAttributeKey, noIdsXml);

    final int fetchedDocuments = runQuery();

    assertEquals(2, fetchedDocuments);
    assertEquals(Lists.newArrayList(0, 1), Lists.transform(getDocuments(), DOCUMENT_TO_INT_ID));
  }
Ejemplo n.º 2
0
  /**
   * Builds a case-insensitive pattern from the words of {@code query} and passes every configured
   * field of a clone of each document to {@code highlightQueryTerms}.
   *
   * <p>Does nothing when the component is not {@code enabled}. A {@code null} query is replaced by
   * the empty string. Query words that are empty or fully match {@code
   * dontHighlightPatternCompiled} are left out of the pattern; when no word remains, {@code null}
   * is passed as the pattern. On completion {@code documents} refers to the list of clones.
   *
   * @throws ProcessingException declared by the overridden method
   */
  @Override
  public void process() throws ProcessingException {
    if (!enabled) {
      return;
    }

    if (query == null) {
      query = "";
    }

    // Strip unwanted characters first, then split the remainder on whitespace.
    final String sanitizedQuery = querySanitizePatternCompiled.matcher(query).replaceAll("");

    // One quoted, parenthesised alternative per query word that should be highlighted.
    final List<String> alternatives = Lists.newArrayList();
    for (String term : sanitizedQuery.split("\\s+")) {
      final boolean highlightable =
          !Strings.isNullOrEmpty(term)
              && (dontHighlightPatternCompiled == null
                  || !dontHighlightPatternCompiled.matcher(term).matches());
      if (highlightable) {
        alternatives.add("(" + Pattern.quote(escapeLtGt(term)) + ")");
      }
    }

    final Pattern queryPattern =
        alternatives.isEmpty()
            ? null
            : Pattern.compile(
                Joiner.on("|").join(alternatives),
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    // The documents get modified, so work on clones and swap the list in at the end.
    final List<Document> originals = documents;
    final List<Document> highlighted = Lists.newArrayListWithCapacity(originals.size());
    for (Document original : originals) {
      final Document copy = original.clone();
      for (String fieldName : fields) {
        highlightQueryTerms(copy, fieldName, queryPattern);
      }
      highlighted.add(copy);
    }
    documents = highlighted;
  }
  /**
   * Performs a very simple stress test using a pooling {@link Controller}. The test is performed
   * with default init attributes.
   *
   * <p>A fixed thread pool of a random size (1 to 10 threads) runs {@code numberOfThreads *
   * queriesPerThread} clustering tasks, each on one of the {@code SampleDocumentData.ALL} data
   * sets. Results are grouped by the data set they were computed from, and every clustering in a
   * group must be equivalent to the first clustering of that group.
   *
   * @throws InterruptedException if the test thread is interrupted while waiting for the tasks
   * @throws ExecutionException if any of the clustering tasks threw an exception
   */
  @Nightly
  @Test
  @ThreadLeakLingering(linger = 5000)
  public void testStress() throws InterruptedException, ExecutionException {
    final int numberOfThreads = randomIntBetween(1, 10);
    final int queriesPerThread = scaledRandomIntBetween(5, 25);

    /*
     * This yields a pooling controller effectively, because no cache interfaces are passed.
     */
    @SuppressWarnings("unchecked")
    final Controller controller = getCachingController(initAttributes);

    ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads);
    List<Callable<ProcessingResult>> callables = Lists.newArrayList();
    for (int i = 0; i < numberOfThreads * queriesPerThread; i++) {
      // Tag each task with the index of the data set it actually clusters. Tagging with the
      // raw loop counter would make every key unique, so each group below would contain a
      // single clustering and the equivalence check would never run.
      final int dataSetIndex = i % SampleDocumentData.ALL.size();
      callables.add(
          new Callable<ProcessingResult>() {
            @Override
            public ProcessingResult call() throws Exception {
              Map<String, Object> localAttributes = Maps.newHashMap();
              localAttributes.put(
                  AttributeNames.DOCUMENTS, SampleDocumentData.ALL.get(dataSetIndex));
              localAttributes.put("dataSetIndex", dataSetIndex);
              return controller.process(localAttributes, getComponentClass());
            }
          });
    }

    try {
      List<Future<ProcessingResult>> results = executorService.invokeAll(callables);
      Multimap<Integer, List<Cluster>> clusterings = ArrayListMultimap.create();

      // Group results by data set
      for (Future<ProcessingResult> future : results) {
        final ProcessingResult processingResult = future.get();
        final Integer dataSetIndex = (Integer) processingResult.getAttributes().get("dataSetIndex");
        clusterings.put(dataSetIndex, processingResult.getClusters());
      }

      // Make sure results are the same within each data set
      for (Integer dataSetIndex : clusterings.keySet()) {
        Collection<List<Cluster>> clustering = clusterings.get(dataSetIndex);
        Iterator<List<Cluster>> iterator = clustering.iterator();
        if (!iterator.hasNext()) {
          continue;
        }

        // The first clustering of the group is the reference all others are compared against.
        final List<Cluster> firstClusterList = iterator.next();
        Assertions.assertThat(firstClusterList).isNotEmpty();
        while (iterator.hasNext()) {
          assertThatClusters(firstClusterList).isEquivalentTo(iterator.next());
        }
      }
    } finally {
      executorService.shutdown();
    }
  }
  /**
   * Clusters between 1 and 100 documents created with the no-argument {@link Document}
   * constructor and expects a non-null result holding exactly one cluster whose {@code size()}
   * equals the number of input documents.
   *
   * @see "http://issues.carrot2.org/browse/CARROT-400"
   */
  @Test
  public void testEmptyDocuments() {
    final List<Document> blankDocuments = Lists.newArrayList();
    final int expectedSize = randomIntBetween(1, 100);
    while (blankDocuments.size() < expectedSize) {
      blankDocuments.add(new Document());
    }

    final List<Cluster> result = cluster(blankDocuments).getClusters();

    assertNotNull(result);
    assertEquals(1, result.size());
    assertThat(result.get(0).size()).isEqualTo(expectedSize);
  }
Ejemplo n.º 5
0
 /**
  * Asserts that the current results match the two expected transformed documents: the document
  * count, the {@code AttributeNames.QUERY} result attribute, and the ids, titles, summaries and
  * content URLs of the documents, in order.
  *
  * @param documentCount number of documents reported by the query; must be 2
  */
 private void assertTransformedDocumentsEqual(final int documentCount) {
   // Expected values of the first document.
   final int firstId = 498967;
   final String firstTitle = "IBM's MARS Block Cipher.";
   final String firstSummary = "The company's AES proposal using 128 bit blocks.";
   final String firstUrl = "http://www.research.ibm.com/security/mars.html";

   // Expected values of the second document.
   final int secondId = 831478;
   final String secondTitle = "IBM WebSphere Studio Device Developer";
   final String secondSummary = "An integrated development environment.";
   final String secondUrl = "http://www-3.ibm.com/software/wireless/wsdd/";

   assertEquals(2, documentCount);
   assertEquals("xslt test", resultAttributes.get(AttributeNames.QUERY));
   assertEquals(
       Lists.newArrayList(firstId, secondId),
       Lists.transform(getDocuments(), DOCUMENT_TO_INT_ID));
   assertEquals(
       Lists.newArrayList(firstTitle, secondTitle),
       Lists.transform(getDocuments(), DOCUMENT_TO_TITLE));
   assertEquals(
       Lists.newArrayList(firstSummary, secondSummary),
       Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY));
   assertEquals(
       Lists.newArrayList(firstUrl, secondUrl),
       Lists.transform(getDocuments(), DOCUMENT_TO_CONTENT_URL));
 }
Ejemplo n.º 6
0
  /**
   * Checks that an {@code xslt} attribute supplied at processing time takes effect on a
   * controller that was given a different {@code xslt} at initialization time.
   *
   * <p>The controller is created with {@code /xsl/carrot2-identity.xsl} as its init-time
   * stylesheet. The first query supplies only the XML and expects titles and snippets as they
   * are; the second query additionally supplies {@code /xsl/carrot2-title-snippet-switch.xsl} and
   * expects titles and snippets to be exchanged.
   */
  @Test
  public void testOverridingInitializationTimeXslt() {
    final String xsltKey = AttributeUtils.getKey(XmlDocumentSource.class, "xslt");
    final String xmlKey = AttributeUtils.getKey(XmlDocumentSource.class, "xml");

    final IResource identityXslt = resourceLocator.getFirst("/xsl/carrot2-identity.xsl");
    initAttributes.put(xsltKey, identityXslt);

    @SuppressWarnings("unchecked")
    final Controller controller = getCachingController(initAttributes);

    // First pass: run with the identity XSLT.
    final IResource identityPassXml = resourceLocator.getFirst("/xml/carrot2-test.xml");
    processingAttributes.put(xmlKey, identityPassXml);

    final int identityPassCount = runQuery(controller);
    assertEquals(2, identityPassCount);
    assertEquals(
        Lists.newArrayList("Title 0", "Title 1"),
        Lists.transform(getDocuments(), DOCUMENT_TO_TITLE));
    assertEquals(
        Lists.newArrayList("Snippet 0", "Snippet 1"),
        Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY));

    // Second pass: run with the swapping XSLT supplied at processing time.
    final IResource swapPassXml = resourceLocator.getFirst("/xml/carrot2-test.xml");
    final IResource swapXslt = resourceLocator.getFirst("/xsl/carrot2-title-snippet-switch.xsl");
    processingAttributes.put(xmlKey, swapPassXml);
    processingAttributes.put(xsltKey, swapXslt);

    final int swapPassCount = runQuery(controller);
    assertEquals(2, swapPassCount);
    assertEquals(
        Lists.newArrayList("Snippet 0", "Snippet 1"),
        Lists.transform(getDocuments(), DOCUMENT_TO_TITLE));
    assertEquals(
        Lists.newArrayList("Title 0", "Title 1"),
        Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY));
  }
Ejemplo n.º 7
0
  /**
   * Loads stop words and stop labels for every {@link LanguageCode} through the given lookup.
   *
   * <p>For each language the resources named {@code "stopwords." + isoCode} and {@code
   * "stoplabels." + isoCode} are loaded and wrapped in a {@link DefaultLexicalData}. An extra
   * entry under the {@code null} key holds the stop words and stop labels of all languages
   * merged together.
   *
   * @param resourceLookup lookup handed to {@code load} for each resource name
   * @return map from language code (plus the {@code null} key) to its lexical data
   */
  private static HashMap<LanguageCode, ILexicalData> reloadResources(
      ResourceLookup resourceLookup) {
    final HashMap<LanguageCode, ILexicalData> lexicalDataByLanguage = new HashMap<>();

    // Accumulators for the language-independent entry stored under the null key.
    final ObjectHashSet<MutableCharArray> allStopwords = new ObjectHashSet<>();
    final ArrayList<Pattern> allStoplabels = new ArrayList<>();

    for (LanguageCode language : LanguageCode.values()) {
      final String resourceSuffix = language.getIsoCode();

      final ObjectHashSet<MutableCharArray> languageStopwords =
          toLower(load(resourceLookup, "stopwords." + resourceSuffix));
      final ArrayList<Pattern> languageStoplabels =
          compile(load(resourceLookup, "stoplabels." + resourceSuffix));

      allStopwords.addAll(languageStopwords);
      allStoplabels.addAll(languageStoplabels);

      lexicalDataByLanguage.put(
          language, new DefaultLexicalData(languageStopwords, languageStoplabels));
    }

    lexicalDataByLanguage.put(null, new DefaultLexicalData(allStopwords, allStoplabels));
    return lexicalDataByLanguage;
  }