/**
 * Calculates the reduction ratio and stores it in {@code reductionRatio}. Must be called by all
 * sub classes in generatePairs(...).
 *
 * <p>The stored value is the size of the full cross product of both data sets divided by the
 * number of blocked pairs. Because this is a floating-point division, an empty {@code
 * blockedPairs} list does not throw: the result is {@code Infinity}, or {@code NaN} when the
 * cross product is empty as well.
 *
 * <p>NOTE(review): the reduction ratio is often defined as {@code 1 - blocked / max}; this method
 * computes {@code max / blocked}. Confirm which definition the consumers of {@code
 * reductionRatio} expect before changing it.
 *
 * @param dataset1 the first data set
 * @param dataset2 the second data set
 * @param blockedPairs the list of pairs that resulted from the blocking
 */
protected void calculatePerformance(
    DataSet<RecordType> dataset1,
    DataSet<RecordType> dataset2,
    List<Pair<RecordType, RecordType>> blockedPairs) {
  // Widen to long before multiplying so the cross-product size is computed in 64-bit arithmetic.
  final long firstSize = (long) dataset1.getSize();
  final long secondSize = (long) dataset2.getSize();
  final long crossProductSize = firstSize * secondSize;

  final double blockedCount = (double) blockedPairs.size();
  reductionRatio = (double) crossProductSize / blockedCount;
}
/**
 * Smoke test for {@code MatchingEngine.runMatching}: loads the two movie data sets from XML,
 * builds a linear combination rule with three comparators and runs the matching using a
 * {@code CrossProductBlocker}. The test contains no assertions; it passes as long as no exception
 * is thrown.
 *
 * @throws XPathExpressionException propagated from the framework calls below
 * @throws ParserConfigurationException propagated from the framework calls below
 * @throws SAXException propagated from the framework calls below
 * @throws IOException propagated from the framework calls below
 */
public void testRunMatching()
    throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {
  final String recordPath = "/movies/movie";

  // first data set
  DataSet<Movie> actors = new DataSet<>();
  File actorsXml = new File("usecase/movie/input/actors.xml");
  actors.loadFromXML(actorsXml, new MovieFactory(), recordPath);

  // second data set
  DataSet<Movie> academyAwards = new DataSet<>();
  File academyAwardsXml = new File("usecase/movie/input/academy_awards.xml");
  academyAwards.loadFromXML(academyAwardsXml, new MovieFactory(), recordPath);

  // matching rule: title, director and date comparators (second argument presumably the weight)
  LinearCombinationMatchingRule<Movie> matchingRule = new LinearCombinationMatchingRule<>(0, 0);
  matchingRule.addComparator(new MovieTitleComparator(), 0.5);
  matchingRule.addComparator(new MovieDirectorComparatorLevenshtein(), 0.25);
  matchingRule.addComparator(new MovieDateComparator(), 0.25);

  Blocker<Movie> crossProduct = new CrossProductBlocker<>();
  MatchingEngine<Movie> matcher = new MatchingEngine<>(matchingRule, crossProduct);

  matcher.runMatching(actors, academyAwards);
}
/**
 * Smoke test for {@code MatchingEngine.generateTrainingDataForLearning}: loads the two movie data
 * sets and the gold standard, builds a linear combination rule with three comparators and asks
 * the engine to generate training data. The test contains no assertions; it passes as long as no
 * exception is thrown.
 *
 * @throws XPathExpressionException propagated from the framework calls below
 * @throws ParserConfigurationException propagated from the framework calls below
 * @throws SAXException propagated from the framework calls below
 * @throws IOException propagated from the framework calls below
 */
public void testGenerateFeaturesForOptimisation()
    throws XPathExpressionException, ParserConfigurationException, SAXException, IOException {
  final String recordPath = "/movies/movie";

  // first data set
  DataSet<Movie> actors = new DataSet<>();
  File actorsXml = new File("usecase/movie/input/actors.xml");
  actors.loadFromXML(actorsXml, new MovieFactory(), recordPath);

  // second data set
  DataSet<Movie> academyAwards = new DataSet<>();
  File academyAwardsXml = new File("usecase/movie/input/academy_awards.xml");
  academyAwards.loadFromXML(academyAwardsXml, new MovieFactory(), recordPath);

  // matching rule: title, director and date comparators (second argument presumably the weight)
  LinearCombinationMatchingRule<Movie> matchingRule = new LinearCombinationMatchingRule<>(0, 0);
  matchingRule.addComparator(new MovieTitleComparator(), 0.5);
  matchingRule.addComparator(new MovieDirectorComparatorLevenshtein(), 0.25);
  matchingRule.addComparator(new MovieDateComparator(), 0.25);

  Blocker<Movie> crossProduct = new CrossProductBlocker<>();
  MatchingEngine<Movie> matcher = new MatchingEngine<>(matchingRule, crossProduct);

  // gold standard for the academy_awards / actors pair
  File goldStandardCsv = new File("usecase/movie/goldstandard/gs_academy_awards_2_actors.csv");
  GoldStandard goldStandard = new GoldStandard();
  goldStandard.loadFromCSVFile(goldStandardCsv);

  matcher.generateTrainingDataForLearning(actors, academyAwards, goldStandard);
}
public static void main(String[] args) throws XPathExpressionException, ParserConfigurationException, SAXException, IOException, TransformerException { // String printCompanyID = "Forbes_Company_35"; //null; String printCompanyID = null; // load the data sets FusableDataSet<FusableCompany> dsForbes = new FusableDataSet<>(); FusableDataSet<FusableCompany> dsFreebase = new FusableDataSet<>(); FusableDataSet<FusableCompany> dsDBpedia = new FusableDataSet<>(); FusableDataSet<FusableCompany> dsLocation = new FusableDataSet<>(); dsForbes.loadFromXML( new File("data/mappingResults/IntegratedCompanyForbes.xml"), new FusableCompanyFactory(printCompanyID), "/companies/company"); dsFreebase.loadFromXML( new File("data/mappingResults/IntegratedCompanyFreebase.xml"), new FusableCompanyFactory(printCompanyID), "/companies/company"); dsDBpedia.loadFromXML( new File("data/mappingResults/IntegratedCompanyDBpedia.xml"), new FusableCompanyFactory(printCompanyID), "/companies/company"); dsLocation.loadFromXML( new File("data/mappingResults/IntegratedLocationDBpedia.xml"), new FusableCompanyFactory(printCompanyID), "/companies/company"); // set dataset metadata dsForbes.setScore(2.0); dsFreebase.setScore(1.0); dsDBpedia.setScore(1.0); dsLocation.setScore(0.5); dsForbes.setDate(DateTime.parse("2014-01-01")); // dsFreebase.setDate(DateTime.parse("2015-11-21")); // dsDBpedia.setDate(DateTime.parse("2015-11-21")); // dsLocation.setDate(DateTime.parse("2015-11-21")); // print dataset density // System.out.println("IntegratedCompanyForbes.xml"); // dsForbes.printDataSetDensityReport(); // System.out.println("IntegratedCompanyFreebase.xml"); // dsFreebase.printDataSetDensityReport(); // System.out.println("IntegratedCompanyDBpedia.xml"); // dsDBpedia.printDataSetDensityReport(); // System.out.println("IntegratedLocationDBpedia.xml"); // dsLocation.printDataSetDensityReport(); // load the correspondences CorrespondenceSet<FusableCompany> correspondences = new CorrespondenceSet<>(); 
correspondences.loadCorrespondences( new File("data/resolutionResults/companyForbes_2_companyFreebase_correspondences.csv"), dsForbes, dsFreebase, true); correspondences.loadCorrespondences( new File("data/resolutionResults/companyFreebase_2_companyDBpedia_correspondences.csv"), dsFreebase, dsDBpedia, true); correspondences.loadCorrespondences( new File("data/resolutionResults/company_2_location_correspondences.csv"), dsDBpedia, dsLocation, false); // write group size distribution correspondences.writeGroupSizeDistribution( new File("data/fusionResults/group_size_distribution.csv")); // define the fusion strategy DataFusionStrategy<FusableCompany> strategy = new DataFusionStrategy<>(new FusableCompanyFactory(printCompanyID)); // add attribute fusers // Note: The attribute name is only used for printing the reports strategy.addAttributeFuser("name", new NameFuser(), new NameEvaluationRule()); strategy.addAttributeFuser("countries", new CountriesFuser(), new CountriesEvaluationRule()); strategy.addAttributeFuser("industries", new IndustriesFuser(), new IndustriesEvaluationRule()); strategy.addAttributeFuser("revenue", new RevenueFuser(), new RevenueEvaluationRule()); strategy.addAttributeFuser( "numberOfEmployees", new NumberOfEmployeesFuser(), new NumberOfEmployeesEvaluationRule()); strategy.addAttributeFuser( "dateFounded", new DateFoundedFuser(), new DateFoundedEvaluationRule()); strategy.addAttributeFuser("assets", new AssetsFuser(), new AssetsEvaluationRule()); strategy.addAttributeFuser( "marketValue", new MarketValueFuser(), new MarketValueEvaluationRule()); strategy.addAttributeFuser("profit", new ProfitFuser(), new ProfitEvaluationRule()); strategy.addAttributeFuser("continent", new ContinentFuser(), new ContinentEvaluationRule()); strategy.addAttributeFuser("keyPeople", new KeyPeopleFuser(), new KeyPeopleEvaluationRule()); strategy.addAttributeFuser("locations", new LocationsFuser(), new LocationsEvaluationRule()); // create the fusion engine 
DataFusionEngine<FusableCompany> engine = new DataFusionEngine<>(strategy); // calculate cluster consistency // engine.printClusterConsistencyReport(correspondences); // run the fusion FusableDataSet<FusableCompany> fusedDataSet = engine.run(correspondences); System.out.println("FUSED RESULT"); // fusedDataSet.printDataSetDensityReport(); // write the result ArrayList<FusableDataSet<FusableCompany>> datasets = new ArrayList<>(); datasets.add(dsForbes); datasets.add(dsFreebase); datasets.add(dsDBpedia); datasets.add(dsLocation); fusedDataSet.writeXML( new File("data/fusionResults/fused.xml"), new CompanyXMLFormatter(datasets)); // load the gold standard DataSet<FusableCompany> gs = new FusableDataSet<>(); gs.loadFromXML( new File("data/goldstandard/fused.xml"), new FusableCompanyFactory(printCompanyID), "/companies/company"); // evaluate DataFusionEvaluator<FusableCompany> evaluator = new DataFusionEvaluator<>(strategy); evaluator.setVerbose(true); double accuracy = evaluator.evaluate(fusedDataSet, gs); System.out.println(String.format("Accuracy: %.2f", accuracy)); }
public static void main(String[] args) throws XPathExpressionException, ParserConfigurationException, SAXException, IOException, TransformerException { // load the data sets FusableDataSet<FusableBooks> ds1 = new FusableDataSet<>(); FusableDataSet<FusableBooks> ds2 = new FusableDataSet<>(); FusableDataSet<FusableBooks> ds3 = new FusableDataSet<>(); FusableDataSet<FusableBooks> ds4 = new FusableDataSet<>(); ds1.loadFromXML( new File("usecase/books/input/AuthorTargetSchemaB.xml"), new FusableBooksFactory(), "/Books/Book"); ds2.loadFromXML( new File("usecase/books/input/DBPediaTargetSchemaBooks.xml"), new FusableBooksFactory(), "/Books/Book"); ds3.loadFromXML( new File("usecase/books/input/GoodReadsTargetSchema.xml"), new FusableBooksFactory(), "/Books/Book"); ds3.loadFromXML( new File("usecase/books/input/FreiburgTargetSchemaOutput.xml"), new FusableBooksFactory(), "/Books/Book"); // set dataset metadata ds1.setScore(4.0); ds2.setScore(3.0); ds3.setScore(1.0); ds4.setScore(2.0); ds1.setDate(DateTime.parse("2015-11-12")); ds2.setDate(DateTime.parse("2015-11-12")); ds3.setDate(DateTime.parse("2015-10-15")); ds4.setDate(DateTime.parse("2014-09-01")); // print dataset density System.out.println("AuthorTargetSchemaB.xml"); ds1.printDataSetDensityReport(); System.out.println("DBPediaTargetSchemaBooks.xml"); ds2.printDataSetDensityReport(); System.out.println("GoodReadsTargetSchema.xml"); ds3.printDataSetDensityReport(); System.out.println("FreiburgTargetSchemaOutput.xml"); ds4.printDataSetDensityReport(); // load the correspondences CorrespondenceSet<FusableBooks> correspondences = new CorrespondenceSet<>(); correspondences.loadCorrespondences( new File("usecase/books/correspondences/Author_2_DbpediaBooks_Correspondences.csv"), ds1, ds2); correspondences.loadCorrespondences( new File("usecase/books/correspondences/Author_2_GoodReads_Correspondences.csv"), ds1, ds3); correspondences.loadCorrespondences( new 
File("usecase/books/correspondences/GoodReads_2_DbpediaBooks_Correspondences.csv"), ds3, ds2); // write group size distribution correspondences.writeGroupSizeDistribution( new File("usecase/Books/output/group_size_distribution.csv")); // define the fusion strategy DataFusionStrategy<FusableBooks> strategy = new DataFusionStrategy<>(new FusableBooksFactory()); // add attribute fusers // Note: The attribute name is only used for printing the reports strategy.addAttributeFuser("ISBN", new ISBNFuser(), new ISBNEvaluationRule()); strategy.addAttributeFuser("Book_Name", new BookTitleFuser(), new BookTitleEvaluationRule()); strategy.addAttributeFuser("Authors", new AuthorFuser(), new AuthorsEvaluationRule()); strategy.addAttributeFuser("Publisher", new PublisherFuser(), new PublisherEvaluationRule()); // strategy.addAttributeFuser("Genre", new GenreFuser(), new GenreEvaluationRule()); // strategy.addAttributeFuser("Pages", new PagesFuser(), new PagesEvaluationRule()); // strategy.addAttributeFuser("Publication_Country", new PubCountryFuser(), new // PubCountryEvaluationRule()); strategy.addAttributeFuser("Publication_Date", new PubDateFuser(), new PubYearEvaluationRule()); strategy.addAttributeFuser("Rating", new RatingFuser(), new RatingEvaluationRule()); // create the fusion engine DataFusionEngine<FusableBooks> engine = new DataFusionEngine<>(strategy); // calculate cluster consistency engine.printClusterConsistencyReport(correspondences); // run the fusion FusableDataSet<FusableBooks> fusedDataSet = engine.run(correspondences); // write the result fusedDataSet.writeXML(new File("usecase/book/output/fused.xml"), new BookXMLFormatter()); // load the gold standard DataSet<FusableBooks> gs = new FusableDataSet<>(); gs.loadFromXML( new File("usecase/Books/goldstandard/fused.xml"), new FusableBooksFactory(), "/Books/Books"); // evaluate DataFusionEvaluator<FusableBooks> evaluator = new DataFusionEvaluator<>(strategy); evaluator.setVerbose(true); double accuracy = 
evaluator.evaluate(fusedDataSet, gs); System.out.println(String.format("Accuracy: %.2f", accuracy)); }