Exemplo n.º 1
0
  /**
   * Computes the reduction ratio of a blocking run and stores it in {@code reductionRatio}. Must
   * be called by all sub classes in generatePairs(...).
   *
   * <p>The stored value is the size of the full cross product of both data sets divided by the
   * number of blocked pairs. The division is carried out in floating point, so an empty {@code
   * blockedPairs} list produces infinity (or NaN when the cross product is empty as well) instead
   * of an exception.
   *
   * @param dataset1 the first data set
   * @param dataset2 the second data set
   * @param blockedPairs the list of pairs that resulted from the blocking
   */
  protected void calculatePerformance(
      DataSet<RecordType> dataset1,
      DataSet<RecordType> dataset2,
      List<Pair<RecordType, RecordType>> blockedPairs) {
    // Widen each size to long before multiplying so the product is computed in 64 bits.
    final long firstSize = (long) dataset1.getSize();
    final long secondSize = (long) dataset2.getSize();
    final double crossProductSize = (double) (firstSize * secondSize);
    final double blockedPairCount = (double) blockedPairs.size();

    reductionRatio = crossProductSize / blockedPairCount;
  }
  /**
   * Loads the actors and academy awards movie data sets from XML and runs the matching engine on
   * them, using a linear combination rule with three comparators and a cross product blocker.
   *
   * <p>The method contains no assertions; it passes as long as no exception is thrown. All
   * declared checked exceptions are propagated unchanged from the calls below.
   */
  public void testRunMatching()
      throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {
    // Both input files are read with the same record path.
    final String recordPath = "/movies/movie";

    DataSet<Movie> actors = new DataSet<>();
    actors.loadFromXML(
        new File("usecase/movie/input/actors.xml"), new MovieFactory(), recordPath);

    DataSet<Movie> academyAwards = new DataSet<>();
    academyAwards.loadFromXML(
        new File("usecase/movie/input/academy_awards.xml"), new MovieFactory(), recordPath);

    // Title, director and date comparators are registered with 0.5, 0.25 and 0.25.
    LinearCombinationMatchingRule<Movie> matchingRule = new LinearCombinationMatchingRule<>(0, 0);
    matchingRule.addComparator(new MovieTitleComparator(), 0.5);
    matchingRule.addComparator(new MovieDirectorComparatorLevenshtein(), 0.25);
    matchingRule.addComparator(new MovieDateComparator(), 0.25);

    Blocker<Movie> crossProduct = new CrossProductBlocker<>();
    MatchingEngine<Movie> matcher = new MatchingEngine<>(matchingRule, crossProduct);

    matcher.runMatching(actors, academyAwards);
  }
  /**
   * Loads the actors and academy awards movie data sets plus a gold standard CSV and asks the
   * matching engine to generate training data for learning from them.
   *
   * <p>The method contains no assertions; it passes as long as no exception is thrown. All
   * declared checked exceptions are propagated unchanged from the calls below.
   */
  public void testGenerateFeaturesForOptimisation()
      throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {
    // Both input files are read with the same record path.
    final String recordPath = "/movies/movie";

    DataSet<Movie> actors = new DataSet<>();
    actors.loadFromXML(
        new File("usecase/movie/input/actors.xml"), new MovieFactory(), recordPath);

    DataSet<Movie> academyAwards = new DataSet<>();
    academyAwards.loadFromXML(
        new File("usecase/movie/input/academy_awards.xml"), new MovieFactory(), recordPath);

    // Title, director and date comparators are registered with 0.5, 0.25 and 0.25.
    LinearCombinationMatchingRule<Movie> matchingRule = new LinearCombinationMatchingRule<>(0, 0);
    matchingRule.addComparator(new MovieTitleComparator(), 0.5);
    matchingRule.addComparator(new MovieDirectorComparatorLevenshtein(), 0.25);
    matchingRule.addComparator(new MovieDateComparator(), 0.25);

    Blocker<Movie> crossProduct = new CrossProductBlocker<>();
    MatchingEngine<Movie> matcher = new MatchingEngine<>(matchingRule, crossProduct);

    GoldStandard goldStandard = new GoldStandard();
    File goldStandardFile =
        new File("usecase/movie/goldstandard/gs_academy_awards_2_actors.csv");
    goldStandard.loadFromCSVFile(goldStandardFile);

    matcher.generateTrainingDataForLearning(actors, academyAwards, goldStandard);
  }
  /**
   * Runs the company data fusion use case: loads four source data sets and the correspondences
   * between them, fuses them with one fuser/evaluation rule pair per attribute, writes the fused
   * data set to XML and prints the accuracy measured against the gold standard.
   *
   * <p>All declared checked exceptions are propagated unchanged from the load and write calls
   * below; nothing is caught here.
   *
   * @param args command line arguments (not used)
   */
  public static void main(String[] args)
      throws XPathExpressionException, ParserConfigurationException, SAXException, IOException,
          TransformerException {
    // Passed to every FusableCompanyFactory; null here. A previously used value was
    // "Forbes_Company_35".
    String printCompanyID = null;
    // Every XML file in this run is read with the same record path.
    final String companyPath = "/companies/company";

    // --- source data sets ---
    FusableDataSet<FusableCompany> forbes = new FusableDataSet<>();
    FusableDataSet<FusableCompany> freebase = new FusableDataSet<>();
    FusableDataSet<FusableCompany> dbpedia = new FusableDataSet<>();
    FusableDataSet<FusableCompany> locations = new FusableDataSet<>();
    forbes.loadFromXML(
        new File("data/mappingResults/IntegratedCompanyForbes.xml"),
        new FusableCompanyFactory(printCompanyID),
        companyPath);
    freebase.loadFromXML(
        new File("data/mappingResults/IntegratedCompanyFreebase.xml"),
        new FusableCompanyFactory(printCompanyID),
        companyPath);
    dbpedia.loadFromXML(
        new File("data/mappingResults/IntegratedCompanyDBpedia.xml"),
        new FusableCompanyFactory(printCompanyID),
        companyPath);
    locations.loadFromXML(
        new File("data/mappingResults/IntegratedLocationDBpedia.xml"),
        new FusableCompanyFactory(printCompanyID),
        companyPath);

    // --- data set metadata: a score per source, a date only for Forbes ---
    forbes.setScore(2.0);
    freebase.setScore(1.0);
    dbpedia.setScore(1.0);
    locations.setScore(0.5);
    forbes.setDate(DateTime.parse("2014-01-01"));
    // Disabled: setDate(DateTime.parse("2015-11-21")) for the Freebase, DBpedia and location
    // data sets.
    // Disabled: printDataSetDensityReport() for each of the four source data sets.

    // --- correspondences between the sources ---
    CorrespondenceSet<FusableCompany> links = new CorrespondenceSet<>();
    links.loadCorrespondences(
        new File("data/resolutionResults/companyForbes_2_companyFreebase_correspondences.csv"),
        forbes,
        freebase,
        true);
    links.loadCorrespondences(
        new File("data/resolutionResults/companyFreebase_2_companyDBpedia_correspondences.csv"),
        freebase,
        dbpedia,
        true);
    links.loadCorrespondences(
        new File("data/resolutionResults/company_2_location_correspondences.csv"),
        dbpedia,
        locations,
        false);

    // Dump the group size distribution of the loaded correspondences.
    File groupSizeReport = new File("data/fusionResults/group_size_distribution.csv");
    links.writeGroupSizeDistribution(groupSizeReport);

    // --- fusion strategy: one fuser and one evaluation rule per attribute ---
    // Note: The attribute name is only used for printing the reports
    DataFusionStrategy<FusableCompany> fusionStrategy =
        new DataFusionStrategy<>(new FusableCompanyFactory(printCompanyID));
    fusionStrategy.addAttributeFuser("name", new NameFuser(), new NameEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "countries", new CountriesFuser(), new CountriesEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "industries", new IndustriesFuser(), new IndustriesEvaluationRule());
    fusionStrategy.addAttributeFuser("revenue", new RevenueFuser(), new RevenueEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "numberOfEmployees", new NumberOfEmployeesFuser(), new NumberOfEmployeesEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "dateFounded", new DateFoundedFuser(), new DateFoundedEvaluationRule());
    fusionStrategy.addAttributeFuser("assets", new AssetsFuser(), new AssetsEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "marketValue", new MarketValueFuser(), new MarketValueEvaluationRule());
    fusionStrategy.addAttributeFuser("profit", new ProfitFuser(), new ProfitEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "continent", new ContinentFuser(), new ContinentEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "keyPeople", new KeyPeopleFuser(), new KeyPeopleEvaluationRule());
    fusionStrategy.addAttributeFuser(
        "locations", new LocationsFuser(), new LocationsEvaluationRule());

    // --- run the fusion ---
    DataFusionEngine<FusableCompany> fusionEngine = new DataFusionEngine<>(fusionStrategy);
    // Disabled: fusionEngine.printClusterConsistencyReport(links);
    FusableDataSet<FusableCompany> fused = fusionEngine.run(links);

    System.out.println("FUSED RESULT");
    // Disabled: fused.printDataSetDensityReport();

    // --- write the fused data set; the formatter is given all four source data sets ---
    ArrayList<FusableDataSet<FusableCompany>> sources = new ArrayList<>();
    sources.add(forbes);
    sources.add(freebase);
    sources.add(dbpedia);
    sources.add(locations);
    File fusedOutput = new File("data/fusionResults/fused.xml");
    fused.writeXML(fusedOutput, new CompanyXMLFormatter(sources));

    // --- evaluate against the gold standard ---
    DataSet<FusableCompany> goldStandard = new FusableDataSet<>();
    goldStandard.loadFromXML(
        new File("data/goldstandard/fused.xml"),
        new FusableCompanyFactory(printCompanyID),
        companyPath);

    DataFusionEvaluator<FusableCompany> fusionEvaluator =
        new DataFusionEvaluator<>(fusionStrategy);
    fusionEvaluator.setVerbose(true);
    double accuracy = fusionEvaluator.evaluate(fused, goldStandard);

    String accuracyLine = String.format("Accuracy: %.2f", accuracy);
    System.out.println(accuracyLine);
  }
Exemplo n.º 5
0
  /**
   * Runs the books data fusion use case: loads four source data sets and the correspondences
   * between three of them, prints density and cluster consistency reports, fuses the data with
   * one fuser/evaluation rule pair per attribute, writes the fused data set to XML and prints the
   * accuracy measured against the gold standard.
   *
   * <p>All declared checked exceptions are propagated unchanged from the load and write calls
   * below; nothing is caught here.
   *
   * @param args command line arguments (not used)
   */
  public static void main(String[] args)
      throws XPathExpressionException, ParserConfigurationException, SAXException, IOException,
          TransformerException {
    // load the data sets
    FusableDataSet<FusableBooks> ds1 = new FusableDataSet<>();
    FusableDataSet<FusableBooks> ds2 = new FusableDataSet<>();
    FusableDataSet<FusableBooks> ds3 = new FusableDataSet<>();
    FusableDataSet<FusableBooks> ds4 = new FusableDataSet<>();
    ds1.loadFromXML(
        new File("usecase/books/input/AuthorTargetSchemaB.xml"),
        new FusableBooksFactory(),
        "/Books/Book");
    ds2.loadFromXML(
        new File("usecase/books/input/DBPediaTargetSchemaBooks.xml"),
        new FusableBooksFactory(),
        "/Books/Book");
    ds3.loadFromXML(
        new File("usecase/books/input/GoodReadsTargetSchema.xml"),
        new FusableBooksFactory(),
        "/Books/Book");
    // The Freiburg file belongs in ds4: ds4 receives its own score and date below and its
    // density report is printed under the Freiburg file name. Loading it into ds3 again (as the
    // code did before) left ds4 empty.
    ds4.loadFromXML(
        new File("usecase/books/input/FreiburgTargetSchemaOutput.xml"),
        new FusableBooksFactory(),
        "/Books/Book");

    // set dataset metadata
    ds1.setScore(4.0);
    ds2.setScore(3.0);
    ds3.setScore(1.0);
    ds4.setScore(2.0);

    ds1.setDate(DateTime.parse("2015-11-12"));
    ds2.setDate(DateTime.parse("2015-11-12"));
    ds3.setDate(DateTime.parse("2015-10-15"));
    ds4.setDate(DateTime.parse("2014-09-01"));

    // print dataset density, one report per source file
    System.out.println("AuthorTargetSchemaB.xml");
    ds1.printDataSetDensityReport();
    System.out.println("DBPediaTargetSchemaBooks.xml");
    ds2.printDataSetDensityReport();
    System.out.println("GoodReadsTargetSchema.xml");
    ds3.printDataSetDensityReport();
    System.out.println("FreiburgTargetSchemaOutput.xml");
    ds4.printDataSetDensityReport();

    // load the correspondences
    // NOTE(review): no correspondence file is loaded for ds4 in this method - confirm whether
    // one is missing or ds4 is intentionally only used for the density report.
    CorrespondenceSet<FusableBooks> correspondences = new CorrespondenceSet<>();
    correspondences.loadCorrespondences(
        new File("usecase/books/correspondences/Author_2_DbpediaBooks_Correspondences.csv"),
        ds1,
        ds2);
    correspondences.loadCorrespondences(
        new File("usecase/books/correspondences/Author_2_GoodReads_Correspondences.csv"), ds1, ds3);
    correspondences.loadCorrespondences(
        new File("usecase/books/correspondences/GoodReads_2_DbpediaBooks_Correspondences.csv"),
        ds3,
        ds2);

    // write group size distribution
    // NOTE(review): this path uses "usecase/Books" while the inputs above use "usecase/books";
    // the two differ on case-sensitive file systems - confirm the directory name.
    correspondences.writeGroupSizeDistribution(
        new File("usecase/Books/output/group_size_distribution.csv"));

    // define the fusion strategy
    DataFusionStrategy<FusableBooks> strategy = new DataFusionStrategy<>(new FusableBooksFactory());
    // add attribute fusers
    // Note: The attribute name is only used for printing the reports
    strategy.addAttributeFuser("ISBN", new ISBNFuser(), new ISBNEvaluationRule());
    strategy.addAttributeFuser("Book_Name", new BookTitleFuser(), new BookTitleEvaluationRule());
    strategy.addAttributeFuser("Authors", new AuthorFuser(), new AuthorsEvaluationRule());
    strategy.addAttributeFuser("Publisher", new PublisherFuser(), new PublisherEvaluationRule());
    // Currently disabled attribute fusers:
    //		strategy.addAttributeFuser("Genre", new GenreFuser(), new GenreEvaluationRule());
    //		strategy.addAttributeFuser("Pages", new PagesFuser(), new PagesEvaluationRule());
    //		strategy.addAttributeFuser("Publication_Country", new PubCountryFuser(), new
    // PubCountryEvaluationRule());
    strategy.addAttributeFuser("Publication_Date", new PubDateFuser(), new PubYearEvaluationRule());
    strategy.addAttributeFuser("Rating", new RatingFuser(), new RatingEvaluationRule());

    // create the fusion engine
    DataFusionEngine<FusableBooks> engine = new DataFusionEngine<>(strategy);

    // calculate cluster consistency
    engine.printClusterConsistencyReport(correspondences);

    // run the fusion
    FusableDataSet<FusableBooks> fusedDataSet = engine.run(correspondences);

    // write the result
    // NOTE(review): "usecase/book/output" (singular "book") is a different directory from the
    // "usecase/Books/output" used for the group size distribution - confirm which is intended.
    fusedDataSet.writeXML(new File("usecase/book/output/fused.xml"), new BookXMLFormatter());

    // load the gold standard
    // NOTE(review): the record path here is "/Books/Books" whereas every source file above is
    // read with "/Books/Book" - confirm against the gold standard file's structure.
    DataSet<FusableBooks> gs = new FusableDataSet<>();
    gs.loadFromXML(
        new File("usecase/Books/goldstandard/fused.xml"),
        new FusableBooksFactory(),
        "/Books/Books");

    // evaluate
    DataFusionEvaluator<FusableBooks> evaluator = new DataFusionEvaluator<>(strategy);
    evaluator.setVerbose(true);
    double accuracy = evaluator.evaluate(fusedDataSet, gs);

    System.out.println(String.format("Accuracy: %.2f", accuracy));
  }