/**
 * Runs a query against {@code /xml/carrot2-no-ids.xml} and expects two documents whose integer
 * identifiers are {@code 0} and {@code 1}, in that order.
 */
@Test
public void testNoIdsInSourceXml() {
  final IResource sourceWithoutIds = resourceLocator.getFirst("/xml/carrot2-no-ids.xml");
  final String xmlAttributeKey = AttributeUtils.getKey(XmlDocumentSource.class, "xml");
  processingAttributes.put(xmlAttributeKey, sourceWithoutIds);

  final int fetched = runQuery();

  assertEquals(2, fetched);
  assertEquals(
      Lists.newArrayList(0, 1),
      Lists.transform(getDocuments(), DOCUMENT_TO_INT_ID));
}
@Override public void process() throws ProcessingException { if (!enabled) { return; } if (query == null) { query = ""; } // Create regexp patterns for each query word final String[] queryTerms = querySanitizePatternCompiled.matcher(query).replaceAll("").split("\\s+"); Pattern queryPattern = null; List<String> patterns = Lists.newArrayList(); for (String queryTerm : queryTerms) { if (Strings.isNullOrEmpty(queryTerm)) { continue; } if (dontHighlightPatternCompiled != null && dontHighlightPatternCompiled.matcher(queryTerm).matches()) { continue; } patterns.add("(" + Pattern.quote(escapeLtGt(queryTerm)) + ")"); } if (patterns.size() > 0) { queryPattern = Pattern.compile( Joiner.on("|").join(patterns), Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); } // As we're going to modify documents, we need to copy them to // avoid ConcurrentModificationExceptions. final List<Document> inputDocuments = documents; final List<Document> outputDocuments = Lists.newArrayListWithCapacity(inputDocuments.size()); for (Document document : inputDocuments) { final Document clonedDocument = document.clone(); for (String fieldName : fields) { highlightQueryTerms(clonedDocument, fieldName, queryPattern); } outputDocuments.add(clonedDocument); } documents = outputDocuments; }
/** * Performs a very simple stress test using a pooling {@link Controller}. The test is performed * with default init attributes. */ @Nightly @Test @ThreadLeakLingering(linger = 5000) public void testStress() throws InterruptedException, ExecutionException { final int numberOfThreads = randomIntBetween(1, 10); final int queriesPerThread = scaledRandomIntBetween(5, 25); /* * This yields a pooling controller effectively, because no cache interfaces are passed. */ @SuppressWarnings("unchecked") final Controller controller = getCachingController(initAttributes); ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads); List<Callable<ProcessingResult>> callables = Lists.newArrayList(); for (int i = 0; i < numberOfThreads * queriesPerThread; i++) { final int dataSetIndex = i; callables.add( new Callable<ProcessingResult>() { public ProcessingResult call() throws Exception { Map<String, Object> localAttributes = Maps.newHashMap(); localAttributes.put( AttributeNames.DOCUMENTS, SampleDocumentData.ALL.get(dataSetIndex % SampleDocumentData.ALL.size())); localAttributes.put("dataSetIndex", dataSetIndex); return controller.process(localAttributes, getComponentClass()); } }); } try { List<Future<ProcessingResult>> results = executorService.invokeAll(callables); Multimap<Integer, List<Cluster>> clusterings = ArrayListMultimap.create(); // Group results by query for (Future<ProcessingResult> future : results) { final ProcessingResult processingResult = future.get(); final Integer dataSetIndex = (Integer) processingResult.getAttributes().get("dataSetIndex"); clusterings.put(dataSetIndex, processingResult.getClusters()); } // Make sure results are the same within each data set for (Integer dataSetIndex : clusterings.keySet()) { Collection<List<Cluster>> clustering = clusterings.get(dataSetIndex); Iterator<List<Cluster>> iterator = clustering.iterator(); if (!iterator.hasNext()) { continue; } final List<Cluster> firstClusterList = iterator.next(); 
Assertions.assertThat(firstClusterList).isNotEmpty(); while (iterator.hasNext()) { assertThatClusters(firstClusterList).isEquivalentTo(iterator.next()); } } } finally { executorService.shutdown(); } }
/**
 * Clusters a random number (1 to 100) of blank documents and expects exactly one cluster whose
 * size equals the number of input documents.
 *
 * @see "http://issues.carrot2.org/browse/CARROT-400"
 */
@Test
public void testEmptyDocuments() {
  final int documentCount = randomIntBetween(1, 100);

  final List<Document> blankDocuments = Lists.newArrayListWithCapacity(documentCount);
  while (blankDocuments.size() < documentCount) {
    blankDocuments.add(new Document());
  }

  final List<Cluster> clusters = cluster(blankDocuments).getClusters();

  assertNotNull(clusters);
  assertEquals(1, clusters.size());

  final Cluster onlyCluster = clusters.get(0);
  assertThat(onlyCluster.size()).isEqualTo(documentCount);
}
/**
 * Asserts that the last query produced the two expected documents: checks the document count,
 * the {@code AttributeNames.QUERY} result attribute, and the ids, titles, summaries and content
 * URLs of the documents returned by {@code getDocuments()}.
 *
 * @param documentCount the number of documents reported by the query run; must be 2
 */
private void assertTransformedDocumentsEqual(final int documentCount) {
  assertEquals(2, documentCount);

  // Query attribute
  final Object actualQuery = resultAttributes.get(AttributeNames.QUERY);
  assertEquals("xslt test", actualQuery);

  // Identifiers
  assertEquals(
      Lists.newArrayList(498967, 831478),
      Lists.transform(getDocuments(), DOCUMENT_TO_INT_ID));

  // Titles
  assertEquals(
      Lists.newArrayList(
          "IBM's MARS Block Cipher.",
          "IBM WebSphere Studio Device Developer"),
      Lists.transform(getDocuments(), DOCUMENT_TO_TITLE));

  // Summaries
  assertEquals(
      Lists.newArrayList(
          "The company's AES proposal using 128 bit blocks.",
          "An integrated development environment."),
      Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY));

  // Content URLs
  assertEquals(
      Lists.newArrayList(
          "http://www.research.ibm.com/security/mars.html",
          "http://www-3.ibm.com/software/wireless/wsdd/"),
      Lists.transform(getDocuments(), DOCUMENT_TO_CONTENT_URL));
}
@Test public void testOverridingInitializationTimeXslt() { IResource initXslt = resourceLocator.getFirst("/xsl/carrot2-identity.xsl"); initAttributes.put(AttributeUtils.getKey(XmlDocumentSource.class, "xslt"), initXslt); @SuppressWarnings("unchecked") Controller controller = getCachingController(initAttributes); // Run with identity XSLT { IResource xml = resourceLocator.getFirst("/xml/carrot2-test.xml"); processingAttributes.put(AttributeUtils.getKey(XmlDocumentSource.class, "xml"), xml); final int documentCount = runQuery(controller); assertEquals(2, documentCount); assertEquals( Lists.newArrayList("Title 0", "Title 1"), Lists.transform(getDocuments(), DOCUMENT_TO_TITLE)); assertEquals( Lists.newArrayList("Snippet 0", "Snippet 1"), Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY)); } // Run with swapping XSLT { IResource xml = resourceLocator.getFirst("/xml/carrot2-test.xml"); IResource xslt = resourceLocator.getFirst("/xsl/carrot2-title-snippet-switch.xsl"); processingAttributes.put(AttributeUtils.getKey(XmlDocumentSource.class, "xml"), xml); processingAttributes.put(AttributeUtils.getKey(XmlDocumentSource.class, "xslt"), xslt); final int documentCount = runQuery(controller); assertEquals(2, documentCount); assertEquals( Lists.newArrayList("Snippet 0", "Snippet 1"), Lists.transform(getDocuments(), DOCUMENT_TO_TITLE)); assertEquals( Lists.newArrayList("Title 0", "Title 1"), Lists.transform(getDocuments(), DOCUMENT_TO_SUMMARY)); } }
/** Reload all lexical resources associated with the given key. */ private static HashMap<LanguageCode, ILexicalData> reloadResources( ResourceLookup resourceLookup) { // Load lexical resources. ObjectHashSet<MutableCharArray> mergedStopwords = new ObjectHashSet<>(); ArrayList<Pattern> mergedStoplabels = Lists.newArrayList(); HashMap<LanguageCode, ILexicalData> resourceMap = Maps.newHashMap(); for (LanguageCode languageCode : LanguageCode.values()) { final String isoCode = languageCode.getIsoCode(); ObjectHashSet<MutableCharArray> stopwords = toLower(load(resourceLookup, "stopwords." + isoCode)); ArrayList<Pattern> stoplabels = compile(load(resourceLookup, "stoplabels." + isoCode)); mergedStopwords.addAll(stopwords); mergedStoplabels.addAll(stoplabels); resourceMap.put(languageCode, new DefaultLexicalData(stopwords, stoplabels)); } resourceMap.put(null, new DefaultLexicalData(mergedStopwords, mergedStoplabels)); return resourceMap; }