/**
 * Builds a job that matches the "product" source column against three dictionaries,
 * converts one match column to a number, runs a value distribution on all outputs,
 * and verifies the exact true/false (and 1/0) counts per dictionary.
 *
 * @throws Throwable the first analysis error, if the run was not successful
 */
public void testParseAndAssignDictionaries() throws Throwable {
  // Reference data: three dictionaries consumed by the DictionaryMatcherTransformer below.
  Collection<Dictionary> dictionaries = new ArrayList<Dictionary>();
  dictionaries.add(
      new SimpleDictionary("eobjects.org products", "MetaModel", "DataCleaner", "AnalyzerBeans"));
  dictionaries.add(
      new SimpleDictionary(
          "apache products", "commons-lang", "commons-math", "commons-codec", "commons-logging"));
  dictionaries.add(
      new SimpleDictionary(
          "logging products", "commons-logging", "log4j", "slf4j", "java.util.Logging"));

  Collection<SynonymCatalog> synonymCatalogs = new ArrayList<SynonymCatalog>();
  synonymCatalogs.add(
      new SimpleSynonymCatalog(
          "translated terms",
          new SimpleSynonym("hello", "howdy", "hi", "yo", "hey"),
          new SimpleSynonym("goodbye", "bye", "see you", "hey")));

  Collection<StringPattern> stringPatterns = new ArrayList<StringPattern>();

  ReferenceDataCatalogImpl ref =
      new ReferenceDataCatalogImpl(dictionaries, synonymCatalogs, stringPatterns);
  Datastore datastore = new CsvDatastore("my database", "src/test/resources/projects.csv");

  DataCleanerConfigurationImpl conf = new DataCleanerConfigurationImpl();

  // try-with-resources guarantees job.close() even when an assertion below fails.
  // (The original closed the builder manually, leaking it on assertion failure;
  // the sibling tests in this file already use try-with-resources for the builder.)
  try (AnalysisJobBuilder job = new AnalysisJobBuilder(conf)) {
    job.setDatastore(datastore);
    job.addSourceColumns("product", "version");

    TransformerComponentBuilder<DictionaryMatcherTransformer> tjb1 =
        job.addTransformer(DictionaryMatcherTransformer.class);
    tjb1.setConfiguredProperty(
        "Dictionaries",
        new Dictionary[] {
          ref.getDictionary("eobjects.org products"),
          ref.getDictionary("apache products"),
          ref.getDictionary("logging products")
        });
    tjb1.addInputColumn(job.getSourceColumnByName("product"));

    // One output column per configured dictionary.
    List<MutableInputColumn<?>> outputColumns = tjb1.getOutputColumns();
    assertEquals(3, outputColumns.size());
    outputColumns.get(0).setName("eobjects match");
    outputColumns.get(1).setName("apache match");
    outputColumns.get(2).setName("logging match");

    TransformerComponentBuilder<ConvertToNumberTransformer> tjb2 =
        job.addTransformer(ConvertToNumberTransformer.class);
    tjb2.addInputColumn(outputColumns.get(2));
    tjb2.getOutputColumns().get(0).setName("logging match -> number");

    AnalyzerComponentBuilder<ValueDistributionAnalyzer> ajb =
        job.addAnalyzer(ValueDistributionAnalyzer.class);
    ajb.addInputColumns(tjb1.getOutputColumns());
    ajb.addInputColumns(tjb2.getOutputColumns());

    assertTrue(job.isConfigured());
    AnalysisJob analysisJob = job.toAnalysisJob();

    AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob);
    if (!resultFuture.isSuccessful()) {
      // Rethrow the first error; the builder is closed by try-with-resources.
      throw resultFuture.getErrors().get(0);
    }

    // One ValueDistributionAnalyzerResult per analyzed input column.
    List<AnalyzerResult> results = resultFuture.getResults();
    assertEquals(4, results.size());

    ValueDistributionAnalyzerResult res = (ValueDistributionAnalyzerResult) results.get(0);
    assertEquals("eobjects match", res.getName());
    assertEquals(8, res.getCount("true").intValue());
    assertEquals(4, res.getCount("false").intValue());

    res = (ValueDistributionAnalyzerResult) results.get(1);
    assertEquals("apache match", res.getName());
    assertEquals(2, res.getCount("true").intValue());
    assertEquals(10, res.getCount("false").intValue());

    res = (ValueDistributionAnalyzerResult) results.get(2);
    assertEquals("logging match", res.getName());
    assertEquals(3, res.getCount("true").intValue());
    assertEquals(9, res.getCount("false").intValue());

    // The number-converted column mirrors "logging match": true -> 1, false -> 0.
    res = (ValueDistributionAnalyzerResult) results.get(3);
    assertEquals("logging match -> number", res.getName());
    assertEquals(3, res.getCount("1").intValue());
    assertEquals(9, res.getCount("0").intValue());
  }
}
public void testScenario() throws Throwable { final AnalysisJob job; try (DatastoreConnection connection = datastore.openConnection(); ) { final DataContext dataContext = connection.getDataContext(); final Table table = dataContext.getTableByQualifiedLabel("PUBLIC.CUSTOMERS"); final Row row = MetaModelHelper.executeSingleRowQuery( dataContext, dataContext.query().from(table).selectCount().toQuery()); assertEquals(recordsInTable, ((Number) row.getValue(0)).intValue()); try (AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration)) { jobBuilder.setDatastore(datastore); jobBuilder.addSourceColumns("CUSTOMERS.CONTACTFIRSTNAME"); jobBuilder.addSourceColumns("CUSTOMERS.CONTACTLASTNAME"); // although not semantically correct, we pretend that EVEN is // the // success-state in our cleansing street and that ODD is the // reject-state. final Category valid = org.datacleaner.test.mock.EvenOddFilter.Category.EVEN; final Category invalid = org.datacleaner.test.mock.EvenOddFilter.Category.ODD; final TransformerComponentBuilder<MockTransformer> trans1 = jobBuilder.addTransformer(MockTransformer.class); trans1.setName("trans1"); trans1.addInputColumn(jobBuilder.getSourceColumns().get(0)); final FilterComponentBuilder< EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category> filter1 = jobBuilder.addFilter(EvenOddFilter.class); filter1.setName("filter1"); filter1.addInputColumn(trans1.getOutputColumns().get(0)); final TransformerComponentBuilder<MockTransformer> trans2 = jobBuilder.addTransformer(MockTransformer.class); trans2.setName("trans2"); trans2.addInputColumn(jobBuilder.getSourceColumns().get(1)); trans2.setRequirement(filter1, valid); final FilterComponentBuilder< EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category> filter2 = jobBuilder.addFilter(EvenOddFilter.class); filter2.setName("filter2"); filter2.addInputColumn(trans2.getOutputColumns().get(0)); final AnalyzerComponentBuilder<MockAnalyzer> analyzer1 = 
jobBuilder.addAnalyzer(MockAnalyzer.class); analyzer1.setName("success"); analyzer1.addInputColumn(jobBuilder.getSourceColumns().get(0)); analyzer1.addInputColumn(jobBuilder.getSourceColumns().get(1)); analyzer1.addInputColumn(trans1.getOutputColumns().get(0)); analyzer1.addInputColumn(trans2.getOutputColumns().get(0)); analyzer1.setRequirement(filter2, valid); final FilterOutcome invalid1 = filter1.getFilterOutcome(invalid); final FilterOutcome invalid2 = filter2.getFilterOutcome(invalid); final AnalyzerComponentBuilder<MockAnalyzer> analyzer2 = jobBuilder.addAnalyzer(MockAnalyzer.class); analyzer2.setName("rejects"); analyzer2.addInputColumn(jobBuilder.getSourceColumns().get(0)); analyzer2.addInputColumn(jobBuilder.getSourceColumns().get(1)); analyzer2.setComponentRequirement(new CompoundComponentRequirement(invalid1, invalid2)); job = jobBuilder.toAnalysisJob(); } } final AnalysisRunner runner = new AnalysisRunnerImpl(configuration); final AnalysisResultFuture resultFuture = runner.run(job); resultFuture.await(); if (resultFuture.isErrornous()) { throw resultFuture.getErrors().get(0); } int recordsInResults = 0; final Map<ComponentJob, AnalyzerResult> map = resultFuture.getResultMap(); for (Entry<ComponentJob, AnalyzerResult> entry : map.entrySet()) { final ComponentJob componentJob = entry.getKey(); @SuppressWarnings("unchecked") final ListResult<InputRow> result = (ListResult<InputRow>) entry.getValue(); final List<InputRow> values = result.getValues(); final int recordsInResult = values.size(); recordsInResults += recordsInResult; switch (componentJob.getName()) { case "success": case "rejects": // expected states assertTrue( "Expected records in all buckets of the cleansing street, but did not find any in: " + componentJob, recordsInResult > 0); assertTrue( "Expected records to be distributed across buckets, but found all in: " + componentJob, recordsInResult != recordsInTable); break; default: fail("Unexpected component in result map: " + componentJob); } } 
assertEquals(recordsInTable, recordsInResults); }
public void runScenario() { MultiThreadedTaskRunner taskRunner = TestEnvironment.getMultiThreadedTaskRunner(); ThreadPoolExecutor executorService = (ThreadPoolExecutor) taskRunner.getExecutorService(); assertEquals(TestEnvironment.THREAD_COUNT, executorService.getMaximumPoolSize()); assertEquals(0, executorService.getActiveCount()); DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl() .withEnvironment(new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner)); AnalysisRunner runner = new AnalysisRunnerImpl(configuration); Datastore ds = TestHelper.createSampleDatabaseDatastore("foobar"); try (DatastoreConnection con = ds.openConnection()) { AnalysisJob job; try (AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration)) { analysisJobBuilder.setDatastore(ds); Table table = con.getDataContext().getDefaultSchema().getTableByName("ORDERFACT"); assertNotNull(table); Column statusColumn = table.getColumnByName("STATUS"); Column commentsColumn = table.getColumnByName("COMMENTS"); analysisJobBuilder.addSourceColumns(statusColumn, commentsColumn); analysisJobBuilder .addAnalyzer(AnalyzerMock.class) .addInputColumns(analysisJobBuilder.getSourceColumns()); job = analysisJobBuilder.toAnalysisJob(); } AnalysisResultFuture resultFuture = runner.run(job); try { Thread.sleep(550); } catch (InterruptedException e) { e.printStackTrace(); fail("Interrupted! " + e.getMessage()); } resultFuture.cancel(); assertFalse(resultFuture.isSuccessful()); assertTrue(resultFuture.isCancelled()); assertTrue(resultFuture.isErrornous()); try { Thread.sleep(400); } catch (InterruptedException e) { e.printStackTrace(); fail("Interrupted! 
" + e.getMessage()); } assertEquals(TestEnvironment.THREAD_COUNT, executorService.getMaximumPoolSize()); long completedTaskCount = executorService.getCompletedTaskCount(); assertTrue("completedTaskCount was: " + completedTaskCount, completedTaskCount > 3); int largestPoolSize = executorService.getLargestPoolSize(); assertTrue("largestPoolSize was: " + largestPoolSize, largestPoolSize > 5); assertEquals(0, executorService.getActiveCount()); } taskRunner.shutdown(); }