コード例 #1
0
 /**
  * Returns the {@link TransformerJob} that belongs to the given {@link
  * TransformerComponentBuilder}, building and caching it the first time it is requested.
  *
  * <p>Since {@link MultiStreamComponent}s are subtypes of {@link Transformer} it is necessary to
  * have this caching mechanism in place in order to allow diamond-shaped component graphs where
  * multiple streams include the same component.
  *
  * @param validate passed on to {@code tjb.toTransformerJob(...)} when a new job has to be built
  * @param tjb the builder to look up; it is also the key of the cache entry
  * @return the previously cached job for {@code tjb}, or the newly built (and now cached) one
  * @throws IllegalStateException if building the job fails with an {@link IllegalStateException};
  *     the original exception is kept as the cause
  */
 public TransformerJob getOrCreateTransformerJob(
     boolean validate, TransformerComponentBuilder<?> tjb) {
   final TransformerJob cachedJob = (TransformerJob) _componentJobs.get(tjb);
   if (cachedJob != null) {
     // Already built for this builder - reuse the same instance.
     return cachedJob;
   }

   TransformerJob createdJob;
   try {
     createdJob = tjb.toTransformerJob(validate, this);
     _componentJobs.put(tjb, createdJob);
   } catch (IllegalStateException e) {
     // Re-throw with the offending builder in the message, keeping the original as cause.
     final String message =
         "Could not create transformer job from builder: " + tjb + ", (" + e.getMessage() + ")";
     throw new IllegalStateException(message, e);
   }
   return createdJob;
 }
  /**
   * Builds a job with two transformers and two chained filters over the CUSTOMERS table, routes
   * records to a "success" and a "rejects" analyzer, runs it, and checks that both analyzers
   * received some (but not all) records and that their record counts add up to the table size.
   *
   * @throws Throwable the first error reported by the analysis run, or any failure while building
   *     the job
   */
  public void testScenario() throws Throwable {
    final AnalysisJob job;

    try (DatastoreConnection connection = datastore.openConnection()) {
      final DataContext dataContext = connection.getDataContext();
      final Table customers = dataContext.getTableByQualifiedLabel("PUBLIC.CUSTOMERS");
      final Row countRow =
          MetaModelHelper.executeSingleRowQuery(
              dataContext, dataContext.query().from(customers).selectCount().toQuery());
      final int actualRecordCount = ((Number) countRow.getValue(0)).intValue();
      assertEquals(recordsInTable, actualRecordCount);

      try (AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration)) {
        jobBuilder.setDatastore(datastore);
        jobBuilder.addSourceColumns("CUSTOMERS.CONTACTFIRSTNAME");
        jobBuilder.addSourceColumns("CUSTOMERS.CONTACTLASTNAME");

        // Not semantically meaningful, but for this test EVEN plays the role of the
        // success-state of the cleansing street and ODD the role of the reject-state.
        final Category accepted = org.datacleaner.test.mock.EvenOddFilter.Category.EVEN;
        final Category rejected = org.datacleaner.test.mock.EvenOddFilter.Category.ODD;

        // Stage 1: transform the first source column and filter on the result.
        final TransformerComponentBuilder<MockTransformer> firstTransformer =
            jobBuilder.addTransformer(MockTransformer.class);
        firstTransformer.setName("trans1");
        firstTransformer.addInputColumn(jobBuilder.getSourceColumns().get(0));

        final FilterComponentBuilder<
                EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category>
            firstFilter = jobBuilder.addFilter(EvenOddFilter.class);
        firstFilter.setName("filter1");
        firstFilter.addInputColumn(firstTransformer.getOutputColumns().get(0));

        // Stage 2: only records accepted by the first filter reach the second transformer.
        final TransformerComponentBuilder<MockTransformer> secondTransformer =
            jobBuilder.addTransformer(MockTransformer.class);
        secondTransformer.setName("trans2");
        secondTransformer.addInputColumn(jobBuilder.getSourceColumns().get(1));
        secondTransformer.setRequirement(firstFilter, accepted);

        final FilterComponentBuilder<
                EvenOddFilter, org.datacleaner.test.mock.EvenOddFilter.Category>
            secondFilter = jobBuilder.addFilter(EvenOddFilter.class);
        secondFilter.setName("filter2");
        secondFilter.addInputColumn(secondTransformer.getOutputColumns().get(0));

        // Analyzer for records accepted by the second filter.
        final AnalyzerComponentBuilder<MockAnalyzer> successAnalyzer =
            jobBuilder.addAnalyzer(MockAnalyzer.class);
        successAnalyzer.setName("success");
        successAnalyzer.addInputColumn(jobBuilder.getSourceColumns().get(0));
        successAnalyzer.addInputColumn(jobBuilder.getSourceColumns().get(1));
        successAnalyzer.addInputColumn(firstTransformer.getOutputColumns().get(0));
        successAnalyzer.addInputColumn(secondTransformer.getOutputColumns().get(0));
        successAnalyzer.setRequirement(secondFilter, accepted);

        // Analyzer whose requirement combines the reject outcomes of both filters.
        final FilterOutcome rejectedByFirst = firstFilter.getFilterOutcome(rejected);
        final FilterOutcome rejectedBySecond = secondFilter.getFilterOutcome(rejected);
        final AnalyzerComponentBuilder<MockAnalyzer> rejectsAnalyzer =
            jobBuilder.addAnalyzer(MockAnalyzer.class);
        rejectsAnalyzer.setName("rejects");
        rejectsAnalyzer.addInputColumn(jobBuilder.getSourceColumns().get(0));
        rejectsAnalyzer.addInputColumn(jobBuilder.getSourceColumns().get(1));
        rejectsAnalyzer.setComponentRequirement(
            new CompoundComponentRequirement(rejectedByFirst, rejectedBySecond));

        job = jobBuilder.toAnalysisJob();
      }
    }

    final AnalysisRunner runner = new AnalysisRunnerImpl(configuration);
    final AnalysisResultFuture resultFuture = runner.run(job);
    resultFuture.await();

    if (resultFuture.isErrornous()) {
      throw resultFuture.getErrors().get(0);
    }

    int totalRecords = 0;

    final Map<ComponentJob, AnalyzerResult> resultsByComponent = resultFuture.getResultMap();
    for (Entry<ComponentJob, AnalyzerResult> resultEntry : resultsByComponent.entrySet()) {
      final ComponentJob componentJob = resultEntry.getKey();
      @SuppressWarnings("unchecked")
      final ListResult<InputRow> listResult = (ListResult<InputRow>) resultEntry.getValue();
      final int bucketSize = listResult.getValues().size();
      totalRecords += bucketSize;

      final String componentName = componentJob.getName();
      if (componentName.equals("success") || componentName.equals("rejects")) {
        // expected states
        assertTrue(
            "Expected records in all buckets of the cleansing street, but did not find any in: "
                + componentJob,
            bucketSize > 0);
        assertTrue(
            "Expected records to be distributed across buckets, but found all in: "
                + componentJob,
            bucketSize != recordsInTable);
      } else {
        fail("Unexpected component in result map: " + componentJob);
      }
    }

    assertEquals(recordsInTable, totalRecords);
  }
コード例 #3
0
  /**
   * Builds a job that matches the "product" column of a CSV datastore against three dictionaries,
   * converts the third match column to a number, and runs a value distribution over all four
   * resulting columns; then asserts the expected true/false (and 1/0) counts per column.
   *
   * <p>The {@link AnalysisJobBuilder} is managed by try-with-resources so that it is closed on
   * every exit path, including failing assertions and a failed analysis run.
   *
   * @throws Throwable the first error reported by the analysis run, or any failure while building
   *     or closing the job builder
   */
  public void testParseAndAssignDictionaries() throws Throwable {
    final Collection<Dictionary> dictionaries = new ArrayList<>();
    dictionaries.add(
        new SimpleDictionary("eobjects.org products", "MetaModel", "DataCleaner", "AnalyzerBeans"));
    dictionaries.add(
        new SimpleDictionary(
            "apache products", "commons-lang", "commons-math", "commons-codec", "commons-logging"));
    dictionaries.add(
        new SimpleDictionary(
            "logging products", "commons-logging", "log4j", "slf4j", "java.util.Logging"));

    final Collection<SynonymCatalog> synonymCatalogs = new ArrayList<>();
    synonymCatalogs.add(
        new SimpleSynonymCatalog(
            "translated terms",
            new SimpleSynonym("hello", "howdy", "hi", "yo", "hey"),
            new SimpleSynonym("goodbye", "bye", "see you", "hey")));

    final Collection<StringPattern> stringPatterns = new ArrayList<>();

    final ReferenceDataCatalogImpl ref =
        new ReferenceDataCatalogImpl(dictionaries, synonymCatalogs, stringPatterns);

    final Datastore datastore = new CsvDatastore("my database", "src/test/resources/projects.csv");
    final DataCleanerConfigurationImpl conf = new DataCleanerConfigurationImpl();

    // try-with-resources replaces the manual close() calls, which were skipped whenever an
    // assertion or any other statement in between threw.
    try (AnalysisJobBuilder job = new AnalysisJobBuilder(conf)) {
      job.setDatastore(datastore);
      job.addSourceColumns("product", "version");

      // One boolean output column is expected per configured dictionary.
      final TransformerComponentBuilder<DictionaryMatcherTransformer> tjb1 =
          job.addTransformer(DictionaryMatcherTransformer.class);
      tjb1.setConfiguredProperty(
          "Dictionaries",
          new Dictionary[] {
            ref.getDictionary("eobjects.org products"),
            ref.getDictionary("apache products"),
            ref.getDictionary("logging products")
          });
      tjb1.addInputColumn(job.getSourceColumnByName("product"));
      final List<MutableInputColumn<?>> outputColumns = tjb1.getOutputColumns();
      assertEquals(3, outputColumns.size());
      outputColumns.get(0).setName("eobjects match");
      outputColumns.get(1).setName("apache match");
      outputColumns.get(2).setName("logging match");

      // Feed the "logging match" column into a number conversion.
      final TransformerComponentBuilder<ConvertToNumberTransformer> tjb2 =
          job.addTransformer(ConvertToNumberTransformer.class);
      tjb2.addInputColumn(outputColumns.get(2));
      tjb2.getOutputColumns().get(0).setName("logging match -> number");

      final AnalyzerComponentBuilder<ValueDistributionAnalyzer> ajb =
          job.addAnalyzer(ValueDistributionAnalyzer.class);
      ajb.addInputColumns(tjb1.getOutputColumns());
      ajb.addInputColumns(tjb2.getOutputColumns());

      assertTrue(job.isConfigured());

      final AnalysisJob analysisJob = job.toAnalysisJob();
      final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob);

      if (!resultFuture.isSuccessful()) {
        throw resultFuture.getErrors().get(0);
      }

      final List<AnalyzerResult> results = resultFuture.getResults();

      assertEquals(4, results.size());
      ValueDistributionAnalyzerResult res = (ValueDistributionAnalyzerResult) results.get(0);
      assertEquals("eobjects match", res.getName());
      assertEquals(8, res.getCount("true").intValue());
      assertEquals(4, res.getCount("false").intValue());

      res = (ValueDistributionAnalyzerResult) results.get(1);
      assertEquals("apache match", res.getName());
      assertEquals(2, res.getCount("true").intValue());
      assertEquals(10, res.getCount("false").intValue());

      res = (ValueDistributionAnalyzerResult) results.get(2);
      assertEquals("logging match", res.getName());
      assertEquals(3, res.getCount("true").intValue());
      assertEquals(9, res.getCount("false").intValue());

      res = (ValueDistributionAnalyzerResult) results.get(3);
      assertEquals("logging match -> number", res.getName());
      assertEquals(3, res.getCount("1").intValue());
      assertEquals(9, res.getCount("0").intValue());
    }
  }