Ejemplo n.º 1
0
  /**
   * Checks that a {@link SourceColumnFinder} fed with a job builder containing a single
   * {@code MockConvertToMonthObjectTransformer} reports exactly one input column when asked for
   * columns of type {@code Month}.
   *
   * @throws Exception if building the job or closing the builder fails
   */
  public void testFindInputColumns() throws Exception {
    final SourceColumnFinder columnFinder = new SourceColumnFinder();

    // The builder is closeable; try-with-resources releases it even if an assertion fails.
    try (AnalysisJobBuilder analysisJobBuilder =
        new AnalysisJobBuilder(new DataCleanerConfigurationImpl())) {
      // The transformer is fed a String column named "month"; presumably its output column is
      // Month-typed, which is what the lookup below is expected to find.
      analysisJobBuilder
          .addTransformer(MockConvertToMonthObjectTransformer.class)
          .addInputColumn(new MockInputColumn<String>("month", String.class));
      columnFinder.addSources(analysisJobBuilder);

      final List<InputColumn<?>> findInputColumns = columnFinder.findInputColumns(Month.class);
      assertEquals(1, findInputColumns.size());
    }
  }
  /**
   * Builds and runs a job that matches the "product" column of {@code projects.csv} against
   * three dictionaries, converts the "logging match" output to a number, and feeds all four
   * resulting columns into a value distribution analyzer.
   *
   * <p>The test then verifies the name and the per-value counts of each of the four analyzer
   * results.
   *
   * @throws Throwable the first error reported by the analysis run, or any failure raised while
   *     building or closing the job
   */
  public void testParseAndAssignDictionaries() throws Throwable {
    final Collection<Dictionary> dictionaries = new ArrayList<>();
    dictionaries.add(
        new SimpleDictionary("eobjects.org products", "MetaModel", "DataCleaner", "AnalyzerBeans"));
    dictionaries.add(
        new SimpleDictionary(
            "apache products", "commons-lang", "commons-math", "commons-codec", "commons-logging"));
    dictionaries.add(
        new SimpleDictionary(
            "logging products", "commons-logging", "log4j", "slf4j", "java.util.Logging"));

    final Collection<SynonymCatalog> synonymCatalogs = new ArrayList<>();
    synonymCatalogs.add(
        new SimpleSynonymCatalog(
            "translated terms",
            new SimpleSynonym("hello", "howdy", "hi", "yo", "hey"),
            new SimpleSynonym("goodbye", "bye", "see you", "hey")));

    // No string patterns are needed for this scenario; the catalog still requires a collection.
    final Collection<StringPattern> stringPatterns = new ArrayList<>();

    final ReferenceDataCatalogImpl ref =
        new ReferenceDataCatalogImpl(dictionaries, synonymCatalogs, stringPatterns);

    final Datastore datastore = new CsvDatastore("my database", "src/test/resources/projects.csv");
    final DataCleanerConfigurationImpl conf = new DataCleanerConfigurationImpl();

    // try-with-resources guarantees the builder is closed on every exit path, including failed
    // assertions and the rethrown analysis error below.
    try (AnalysisJobBuilder job = new AnalysisJobBuilder(conf)) {
      job.setDatastore(datastore);
      job.addSourceColumns("product", "version");

      final TransformerComponentBuilder<DictionaryMatcherTransformer> tjb1 =
          job.addTransformer(DictionaryMatcherTransformer.class);
      tjb1.setConfiguredProperty(
          "Dictionaries",
          new Dictionary[] {
            ref.getDictionary("eobjects.org products"),
            ref.getDictionary("apache products"),
            ref.getDictionary("logging products")
          });
      tjb1.addInputColumn(job.getSourceColumnByName("product"));

      // One output column is expected per configured dictionary, in configuration order.
      final List<MutableInputColumn<?>> outputColumns = tjb1.getOutputColumns();
      assertEquals(3, outputColumns.size());
      outputColumns.get(0).setName("eobjects match");
      outputColumns.get(1).setName("apache match");
      outputColumns.get(2).setName("logging match");

      final TransformerComponentBuilder<ConvertToNumberTransformer> tjb2 =
          job.addTransformer(ConvertToNumberTransformer.class);
      tjb2.addInputColumn(outputColumns.get(2));
      tjb2.getOutputColumns().get(0).setName("logging match -> number");

      final AnalyzerComponentBuilder<ValueDistributionAnalyzer> ajb =
          job.addAnalyzer(ValueDistributionAnalyzer.class);
      ajb.addInputColumns(tjb1.getOutputColumns());
      ajb.addInputColumns(tjb2.getOutputColumns());

      assertTrue(job.isConfigured());

      final AnalysisJob analysisJob = job.toAnalysisJob();
      final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob);

      if (!resultFuture.isSuccessful()) {
        // Surface the underlying cause instead of a generic assertion failure.
        throw resultFuture.getErrors().get(0);
      }

      final List<AnalyzerResult> results = resultFuture.getResults();

      // Three dictionary-match columns plus the numeric conversion column.
      assertEquals(4, results.size());
      ValueDistributionAnalyzerResult res = (ValueDistributionAnalyzerResult) results.get(0);
      assertEquals("eobjects match", res.getName());
      assertEquals(8, res.getCount("true").intValue());
      assertEquals(4, res.getCount("false").intValue());

      res = (ValueDistributionAnalyzerResult) results.get(1);
      assertEquals("apache match", res.getName());
      assertEquals(2, res.getCount("true").intValue());
      assertEquals(10, res.getCount("false").intValue());

      res = (ValueDistributionAnalyzerResult) results.get(2);
      assertEquals("logging match", res.getName());
      assertEquals(3, res.getCount("true").intValue());
      assertEquals(9, res.getCount("false").intValue());

      // The numeric column mirrors "logging match": the expected counts for "1"/"0" equal those
      // for "true"/"false" above.
      res = (ValueDistributionAnalyzerResult) results.get(3);
      assertEquals("logging match -> number", res.getName());
      assertEquals(3, res.getCount("1").intValue());
      assertEquals(9, res.getCount("0").intValue());
    }
  }
  /**
   * Runs a two-stage filtering job over {@code PUBLIC.CUSTOMERS} and checks how the records are
   * split between a "success" analyzer and a "rejects" analyzer.
   *
   * <p>Job layout, as wired below:
   *
   * <ul>
   *   <li>"trans1" reads the first source column and feeds "filter1";
   *   <li>"trans2" reads the second source column, requires the EVEN outcome of "filter1", and
   *       feeds "filter2";
   *   <li>"success" requires the EVEN outcome of "filter2";
   *   <li>"rejects" requires a compound of the ODD outcomes of both filters.
   * </ul>
   *
   * <p>Every analyzer must receive some, but not all, records, and the two result sets together
   * must contain exactly {@code recordsInTable} records.
   *
   * @throws Throwable the first error reported by the analysis run, or any setup failure
   */
  public void testScenario() throws Throwable {
    final AnalysisJob analysisJob;

    try (DatastoreConnection con = datastore.openConnection()) {
      final DataContext dc = con.getDataContext();
      final Table customers = dc.getTableByQualifiedLabel("PUBLIC.CUSTOMERS");

      // Sanity check: the table holds the number of rows the rest of the test relies on.
      final Row countRow =
          MetaModelHelper.executeSingleRowQuery(
              dc, dc.query().from(customers).selectCount().toQuery());
      final Number rowCount = (Number) countRow.getValue(0);
      assertEquals(recordsInTable, rowCount.intValue());

      try (AnalysisJobBuilder builder = new AnalysisJobBuilder(configuration)) {
        builder.setDatastore(datastore);
        builder.addSourceColumns("CUSTOMERS.CONTACTFIRSTNAME");
        builder.addSourceColumns("CUSTOMERS.CONTACTLASTNAME");

        // Not semantically meaningful, but for this cleansing street EVEN plays the role of the
        // success state and ODD the role of the reject state.
        final Category accepted = org.datacleaner.test.mock.EvenOddFilter.Category.EVEN;
        final Category rejected = org.datacleaner.test.mock.EvenOddFilter.Category.ODD;

        // Stage 1: transform the first source column and filter on its output.
        final TransformerComponentBuilder<MockTransformer> stageOneTransformer =
            builder.addTransformer(MockTransformer.class);
        stageOneTransformer.setName("trans1");
        stageOneTransformer.addInputColumn(builder.getSourceColumns().get(0));

        final FilterComponentBuilder<
                EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category>
            stageOneFilter = builder.addFilter(EvenOddFilter.class);
        stageOneFilter.setName("filter1");
        stageOneFilter.addInputColumn(stageOneTransformer.getOutputColumns().get(0));

        // Stage 2: only records accepted by stage 1 reach the second transformer and filter.
        final TransformerComponentBuilder<MockTransformer> stageTwoTransformer =
            builder.addTransformer(MockTransformer.class);
        stageTwoTransformer.setName("trans2");
        stageTwoTransformer.addInputColumn(builder.getSourceColumns().get(1));
        stageTwoTransformer.setRequirement(stageOneFilter, accepted);

        final FilterComponentBuilder<
                EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category>
            stageTwoFilter = builder.addFilter(EvenOddFilter.class);
        stageTwoFilter.setName("filter2");
        stageTwoFilter.addInputColumn(stageTwoTransformer.getOutputColumns().get(0));

        // Bucket for records that were accepted by stage 2.
        final AnalyzerComponentBuilder<MockAnalyzer> successAnalyzer =
            builder.addAnalyzer(MockAnalyzer.class);
        successAnalyzer.setName("success");
        successAnalyzer.addInputColumn(builder.getSourceColumns().get(0));
        successAnalyzer.addInputColumn(builder.getSourceColumns().get(1));
        successAnalyzer.addInputColumn(stageOneTransformer.getOutputColumns().get(0));
        successAnalyzer.addInputColumn(stageTwoTransformer.getOutputColumns().get(0));
        successAnalyzer.setRequirement(stageTwoFilter, accepted);

        // Bucket for records tied to the reject outcome of either stage, expressed as a
        // compound requirement over both outcomes.
        final FilterOutcome stageOneRejected = stageOneFilter.getFilterOutcome(rejected);
        final FilterOutcome stageTwoRejected = stageTwoFilter.getFilterOutcome(rejected);
        final AnalyzerComponentBuilder<MockAnalyzer> rejectsAnalyzer =
            builder.addAnalyzer(MockAnalyzer.class);
        rejectsAnalyzer.setName("rejects");
        rejectsAnalyzer.addInputColumn(builder.getSourceColumns().get(0));
        rejectsAnalyzer.addInputColumn(builder.getSourceColumns().get(1));
        rejectsAnalyzer.setComponentRequirement(
            new CompoundComponentRequirement(stageOneRejected, stageTwoRejected));

        analysisJob = builder.toAnalysisJob();
      }
    }

    final AnalysisRunner analysisRunner = new AnalysisRunnerImpl(configuration);
    final AnalysisResultFuture future = analysisRunner.run(analysisJob);
    future.await();

    if (future.isErrornous()) {
      throw future.getErrors().get(0);
    }

    int totalRecords = 0;

    for (Entry<ComponentJob, AnalyzerResult> resultEntry : future.getResultMap().entrySet()) {
      final ComponentJob component = resultEntry.getKey();
      @SuppressWarnings("unchecked")
      final ListResult<InputRow> rows = (ListResult<InputRow>) resultEntry.getValue();
      final int bucketSize = rows.getValues().size();
      totalRecords += bucketSize;

      final String componentName = component.getName();
      switch (componentName) {
        case "success":
        case "rejects":
          // The only two analyzers in the job: each must hold some, but not all, records.
          assertTrue(
              "Expected records in all buckets of the cleansing street, but did not find any in: "
                  + component,
              bucketSize > 0);
          assertTrue(
              "Expected records to be distributed across buckets, but found all in: "
                  + component,
              bucketSize != recordsInTable);
          break;
        default:
          fail("Unexpected component in result map: " + component);
      }
    }

    // Every record must have ended up in exactly one of the two buckets.
    assertEquals(recordsInTable, totalRecords);
  }
  /**
   * Starts an analysis job on the shared multi-threaded task runner, cancels it while it is
   * running, and verifies both the state of the result future and the state of the underlying
   * thread pool afterwards.
   *
   * <p>The task runner is shut down on every exit path, including failed assertions.
   */
  public void runScenario() {
    final MultiThreadedTaskRunner taskRunner = TestEnvironment.getMultiThreadedTaskRunner();

    try {
      final ThreadPoolExecutor executorService =
          (ThreadPoolExecutor) taskRunner.getExecutorService();
      assertEquals(TestEnvironment.THREAD_COUNT, executorService.getMaximumPoolSize());
      assertEquals(0, executorService.getActiveCount());

      final DataCleanerConfiguration configuration =
          new DataCleanerConfigurationImpl()
              .withEnvironment(new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner));

      final AnalysisRunner runner = new AnalysisRunnerImpl(configuration);

      final Datastore ds = TestHelper.createSampleDatabaseDatastore("foobar");
      try (DatastoreConnection con = ds.openConnection()) {

        final AnalysisJob job;
        try (AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration)) {
          analysisJobBuilder.setDatastore(ds);

          final Table table = con.getDataContext().getDefaultSchema().getTableByName("ORDERFACT");
          assertNotNull(table);

          final Column statusColumn = table.getColumnByName("STATUS");
          final Column commentsColumn = table.getColumnByName("COMMENTS");

          analysisJobBuilder.addSourceColumns(statusColumn, commentsColumn);
          analysisJobBuilder
              .addAnalyzer(AnalyzerMock.class)
              .addInputColumns(analysisJobBuilder.getSourceColumns());

          job = analysisJobBuilder.toAnalysisJob();
        }

        final AnalysisResultFuture resultFuture = runner.run(job);

        // Pause before cancelling; presumably this lets the job get under way first.
        sleepOrFail(550);

        resultFuture.cancel();

        assertFalse(resultFuture.isSuccessful());
        assertTrue(resultFuture.isCancelled());
        assertTrue(resultFuture.isErrornous());

        // Pause again before inspecting the pool; presumably this lets in-flight tasks wind
        // down so that the active count can reach zero.
        sleepOrFail(400);

        assertEquals(TestEnvironment.THREAD_COUNT, executorService.getMaximumPoolSize());

        final long completedTaskCount = executorService.getCompletedTaskCount();
        assertTrue("completedTaskCount was: " + completedTaskCount, completedTaskCount > 3);

        final int largestPoolSize = executorService.getLargestPoolSize();
        assertTrue("largestPoolSize was: " + largestPoolSize, largestPoolSize > 5);
        assertEquals(0, executorService.getActiveCount());
      }
    } finally {
      // Always release the pool so a failed assertion does not leave worker threads behind.
      taskRunner.shutdown();
    }
  }

  /**
   * Sleeps for the given number of milliseconds and fails the test if the thread is interrupted.
   *
   * @param millis how long to sleep, in milliseconds
   */
  private void sleepOrFail(final long millis) {
    try {
      Thread.sleep(millis);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      fail("Interrupted! " + e.getMessage());
    }
  }