private void generateSample() {
    logger.info("Generating sample...");
    sample = ModelFactory.createDefaultModel();

    // we have to set up a new query execution factory working on our local model
    qef = new QueryExecutionFactoryModel(sample);
    reasoner = new SPARQLReasoner(qef);

    // get the page size
    // TODO put to base class
    long pageSize = 10000; // PaginationUtils.adjustPageSize(globalQef, 10000);

    ParameterizedSparqlString sampleQueryTemplate = getSampleQuery();
    sampleQueryTemplate.setIri("p", entityToDescribe.toStringID());
    Query query = sampleQueryTemplate.asQuery();
    query.setLimit(pageSize);

    boolean isEmpty = false;
    int i = 0;
    while (!isTimeout() && !isEmpty) {
        // get the next page of the sample
        logger.debug("Extending sample...");
        query.setOffset(i++ * pageSize);
        QueryExecution qe = ksQef.createQueryExecution(query);
        Model tmp = qe.execConstruct();
        qe.close();
        sample.add(tmp);

        // if the last call returned an empty model, we can leave the loop
        isEmpty = tmp.isEmpty();
    }
    logger.info("...done. Sample size: " + sample.size() + " triples");
}
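The loop above is a generic LIMIT/OFFSET pagination pattern over a CONSTRUCT query. Below is a self-contained sketch of the same pattern with plain Apache Jena (assuming Jena 3+, where QueryExecution is AutoCloseable); the endpoint URL and query string are placeholders, not taken from the surrounding code.

import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QueryFactory;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;

public class PaginatedConstruct {
    public static Model fetchAll(String endpoint, String constructQuery, long pageSize) {
        Model result = ModelFactory.createDefaultModel();
        Query query = QueryFactory.create(constructQuery);
        query.setLimit(pageSize);
        long page = 0;
        while (true) {
            // advance the window by one page per iteration
            query.setOffset(page++ * pageSize);
            try (QueryExecution qe = QueryExecutionFactory.sparqlService(endpoint, query)) {
                Model tmp = qe.execConstruct();
                if (tmp.isEmpty()) { // an empty page means there are no more results
                    return result;
                }
                result.add(tmp);
            }
        }
    }
}

As in generateSample, the first empty page terminates the loop, so no separate COUNT query is needed to know the total result size up front.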
private Model execute(Model inputModel, String endpoint) {
    Model cube = createModel();
    Calendar calendar = Calendar.getInstance(TimeZone.getDefault());
    Resource dataset = cube.createResource(
            GK.uri + "Properties_per_Class" + calendar.getTimeInMillis(), QB.Dataset);
    dataset.addLiteral(RDFS.comment, "Properties per class");
    dataset.addLiteral(DCTerms.date, cube.createTypedLiteral(calendar));
    dataset.addLiteral(DCTerms.publisher, "R & D, Unister GmbH, Geoknow");
    dataset.addProperty(QB.structure, cube.createResource(STRUCTURE));

    // query either the local model or the remote endpoint for all classes
    QueryExecution qExec;
    if (inputModel != null) {
        qExec = QueryExecutionFactory.create(INSTANCES, inputModel);
    } else {
        qExec = QueryExecutionFactory.sparqlService(endpoint, INSTANCES, defaultGraphs, defaultGraphs);
    }

    ResultSet result = qExec.execSelect();
    int i = 0;
    while (result.hasNext()) {
        Resource owlClass = result.next().getResource("class");
        NUMBER_OF_PROPERTIES.setIri("class", owlClass.getURI());
        QueryExecution propertiesQexec;
        if (inputModel != null) {
            propertiesQexec = QueryExecutionFactory.create(NUMBER_OF_PROPERTIES.asQuery(), inputModel);
        } else {
            propertiesQexec = QueryExecutionFactory.sparqlService(
                    endpoint, NUMBER_OF_PROPERTIES.asQuery(), defaultGraphs, defaultGraphs);
        }
        try {
            ResultSet propertiesResult = propertiesQexec.execSelect();
            if (propertiesResult.hasNext()) {
                Resource obs = cube.createResource(
                        "http://www.geoknow.eu/data-cube/metric2/observation" + i, QB.Observation);
                obs.addProperty(QB.dataset, dataset);
                obs.addProperty(GK.DIM.Class, owlClass);
                obs.addLiteral(GK.MEASURE.PropertyCount, propertiesResult.next().getLiteral("count"));
                i++;
            }
        } catch (Exception e) {
            // on failure, record an observation with count -1 and the error message
            Resource obs = cube.createResource(
                    "http://www.geoknow.eu/data-cube/metric2/observation" + i, QB.Observation);
            obs.addProperty(QB.dataset, dataset);
            obs.addProperty(GK.DIM.Class, owlClass);
            obs.addLiteral(GK.MEASURE.PropertyCount, -1);
            obs.addLiteral(RDFS.comment, e.getMessage());
            i++;
        } finally {
            propertiesQexec.close();
        }
    }
    qExec.close();
    return cube;
}
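The INSTANCES and NUMBER_OF_PROPERTIES query constants are not part of this listing. The following is a plausible shape for the NUMBER_OF_PROPERTIES template, reconstructed only from how it is used above (setIri("class", ...) and getLiteral("count")); the query text and the placeholder IRI are assumptions.

import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.Query;

public class NumberOfPropertiesSketch {
    public static void main(String[] args) {
        // assumed shape: count the distinct predicates used on instances of a class
        ParameterizedSparqlString template = new ParameterizedSparqlString(
                "SELECT (COUNT(DISTINCT ?p) AS ?count) WHERE { ?s a ?class ; ?p ?o . }");
        template.setIri("class", "http://example.org/SomeClass"); // placeholder IRI
        Query query = template.asQuery();
        System.out.println(query); // ?class is now bound to the concrete IRI
    }
}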
private Model execute(Model inputModel, String endpoint) {
    Model cube = createModel();
    Calendar calendar = Calendar.getInstance(TimeZone.getDefault());
    Resource dataset = cube.createResource(GK.uri + "Average_Surface", QB.Dataset);
    dataset.addLiteral(RDFS.comment, "Average Surface per class");
    dataset.addLiteral(DCTerms.date, cube.createTypedLiteral(calendar));
    dataset.addLiteral(DCTerms.publisher, "R & D, Unister GmbH, Geoknow");
    dataset.addProperty(QB.structure, cube.createResource(STRUCTURE));
    if (endpoint != null) {
        dataset.addProperty(DCTerms.source, endpoint);
    }

    QueryExecution qExec;
    if (inputModel != null) {
        qExec = QueryExecutionFactory.create(GET_CLASSES, inputModel);
    } else {
        qExec = QueryExecutionFactory.sparqlService(endpoint, GET_CLASSES, defaultGraphs, defaultGraphs);
    }

    ResultSet result = qExec.execSelect();
    int obsCount = 0;
    while (result.hasNext()) {
        double area = 0;
        int i = 0;
        Resource owlClass = result.next().get("class").asResource();
        if (!blacklist.contains(owlClass.toString())) {
            GET_INSTANCES.setIri("class", owlClass.getURI());
            QueryExecution qexecInstances;
            if (inputModel != null) {
                qexecInstances = QueryExecutionFactory.create(GET_INSTANCES.asQuery(), inputModel);
            } else {
                qexecInstances = QueryExecutionFactory.sparqlService(
                        endpoint, GET_INSTANCES.asQuery(), defaultGraphs, defaultGraphs);
            }
            for (ResultSet instancesResult = qexecInstances.execSelect(); instancesResult.hasNext(); ) {
                QuerySolution next = instancesResult.next();
                String instance = next.get("instance").asResource().getURI();
                if (instance == null) { // blank nodes have no URI
                    continue;
                }

                // first try to read the instance's geometry as a simple polygon
                POLYGON.setIri("instance", instance);
                QueryExecution qexecMember;
                if (inputModel != null) {
                    qexecMember = QueryExecutionFactory.create(POLYGON.asQuery(), inputModel);
                } else {
                    qexecMember = QueryExecutionFactory.sparqlService(
                            endpoint, POLYGON.asQuery(), defaultGraphs, defaultGraphs);
                }
                StringBuilder polygonBuilder = new StringBuilder();
                firstLat = null;
                firstLong = null;
                for (ResultSet latLong = qexecMember.execSelect(); latLong.hasNext(); ) {
                    processPoint(latLong.next(), polygonBuilder);
                }

                if (polygonBuilder.length() > 0) {
                    area += calculateArea(polygonBuilder);
                } else {
                    // no simple polygon found; fall back to a multi-polygon query
                    area = 0;
                    polygonBuilder.setLength(0);
                    this.firstLat = null;
                    this.firstLong = null;
                    MULTI_POLYGON.setIri("instance", instance);
                    QueryExecution qexecMultiPolygon;
                    if (inputModel != null) {
                        qexecMultiPolygon = QueryExecutionFactory.create(MULTI_POLYGON.asQuery(), inputModel);
                    } else {
                        qexecMultiPolygon = QueryExecutionFactory.sparqlService(
                                endpoint, MULTI_POLYGON.asQuery(), defaultGraphs, defaultGraphs);
                    }
                    String polygonName = "";
                    for (ResultSet latLong = qexecMultiPolygon.execSelect(); latLong.hasNext(); ) {
                        QuerySolution solution = latLong.next();
                        // a new blank node label means a new member polygon starts
                        if (!polygonName.equals(solution.get("polygon").asNode().getBlankNodeLabel())) {
                            if (polygonBuilder.length() > 0) {
                                area += calculateArea(polygonBuilder);
                            }
                            this.firstLat = null;
                            this.firstLong = null;
                            polygonBuilder.setLength(0);
                        }
                        polygonName = solution.get("polygon").asNode().getBlankNodeLabel();
                        processPoint(solution, polygonBuilder);
                    }
                    // flush the last member polygon, which the loop above never adds
                    if (polygonBuilder.length() > 0) {
                        area += calculateArea(polygonBuilder);
                    }
                }
                i++;
            }
        }
        Resource obs = cube.createResource(structureUri + "/obs/" + obsCount, QB.Observation);
        double average = i == 0 ? 0 : area / i;
        obs.addProperty(GK.MEASURE.Average, cube.createTypedLiteral(average));
        obs.addProperty(GK.DIM.Class, owlClass);
        obs.addProperty(QB.dataset, dataset);
        obsCount++;
    }
    return cube;
}
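calculateArea and processPoint are not shown in this listing. Purely as an illustration of the kind of computation involved, here is a planar shoelace-formula area over a whitespace-separated coordinate string; the real implementation presumably works on geodesic (lat/long) coordinates and a different builder format, so treat this as a toy stand-in.

public class ShoelaceSketch {
    // area of a simple polygon given as "x1 y1 x2 y2 ..." (planar coordinates)
    static double polygonArea(String coords) {
        String[] t = coords.trim().split("\\s+");
        int n = t.length / 2;
        double area = 0;
        for (int i = 0; i < n; i++) {
            double x1 = Double.parseDouble(t[2 * i]);
            double y1 = Double.parseDouble(t[2 * i + 1]);
            double x2 = Double.parseDouble(t[2 * ((i + 1) % n)]);
            double y2 = Double.parseDouble(t[2 * ((i + 1) % n) + 1]);
            area += x1 * y2 - x2 * y1; // shoelace cross term per edge
        }
        return Math.abs(area) / 2.0;
    }

    public static void main(String[] args) {
        // unit square -> prints 1.0
        System.out.println(polygonArea("0 0 1 0 1 1 0 1"));
    }
}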
// exact computation for 5 heuristics; each one adapted to super class learning;
// each one takes the noise parameter into account
public double getAccuracyOrTooWeakExact(OWLClassExpression description, double noise) {
    nanoStartTime = System.nanoTime();

    if (heuristic.equals(HeuristicType.JACCARD)) {
        // computing R(A)
        TreeSet<OWLIndividual> coveredInstancesSet = new TreeSet<>();
        for (OWLIndividual ind : classInstances) {
            if (getReasoner().hasType(description, ind)) {
                coveredInstancesSet.add(ind);
            }
            if (terminationTimeExpired()) {
                return 0;
            }
        }

        // if even the optimal case (no additional instances covered) is not sufficient,
        // the concept is too weak
        if (coveredInstancesSet.size() / (double) classInstances.size() <= 1 - noise) {
            return -1;
        }

        // computing R(C) restricted to relevant instances
        TreeSet<OWLIndividual> additionalInstancesSet = new TreeSet<>();
        for (OWLIndividual ind : superClassInstances) {
            if (getReasoner().hasType(description, ind)) {
                additionalInstancesSet.add(ind);
            }
            if (terminationTimeExpired()) {
                return 0;
            }
        }

        Set<OWLIndividual> union = Helper.union(classInstancesSet, additionalInstancesSet);
        return Heuristics.getJaccardCoefficient(coveredInstancesSet.size(), union.size());
    } else if (heuristic.equals(HeuristicType.AMEASURE)
            || heuristic.equals(HeuristicType.FMEASURE)
            || heuristic.equals(HeuristicType.PRED_ACC)) {
        int additionalInstances = 0;
        int coveredInstances = 0;

        if (reasoner instanceof SPARQLReasoner) {
            // R(C): count covered instances of super classes that are not
            // instances of the class to describe
            String query = "SELECT (COUNT(DISTINCT(?s)) AS ?cnt) WHERE {"
                    + "?s a ?sup . ?classToDescribe <http://www.w3.org/2000/01/rdf-schema#subClassOf> ?sup . "
                    + converter.convert("?s", description)
                    + "FILTER NOT EXISTS {?s a ?classToDescribe}}";
            ParameterizedSparqlString template = new ParameterizedSparqlString(query);
            template.setIri("classToDescribe", classToDescribe.toStringID());

            QueryExecution qe = ((SPARQLReasoner) reasoner)
                    .getQueryExecutionFactory()
                    .createQueryExecution(template.toString());
            additionalInstances = qe.execSelect().next().getLiteral("cnt").getInt();

            // R(A)
            OWLObjectIntersectionOf ce = df.getOWLObjectIntersectionOf(classToDescribe, description);
            coveredInstances = ((SPARQLReasoner) reasoner).getPopularityOf(ce);
        } else {
            // computing R(C) restricted to relevant instances
            if (useInstanceChecks) {
                for (OWLIndividual ind : superClassInstances) {
                    if (getReasoner().hasType(description, ind)) {
                        additionalInstances++;
                    }
                    if (terminationTimeExpired()) {
                        return 0;
                    }
                }
            } else {
                SortedSet<OWLIndividual> individuals = getReasoner().getIndividuals(description);
                individuals.retainAll(superClassInstances);
                additionalInstances = individuals.size();
            }

            // computing R(A)
            if (useInstanceChecks) {
                for (OWLIndividual ind : classInstances) {
                    if (getReasoner().hasType(description, ind)) {
                        coveredInstances++;
                    }
                    if (terminationTimeExpired()) {
                        return 0;
                    }
                }
            } else {
                SortedSet<OWLIndividual> individuals = getReasoner().getIndividuals(description);
                individuals.retainAll(classInstances);
                coveredInstances = individuals.size();
            }
        }

        double recall = coveredInstances / (double) classInstances.size();
        // noise computation is incorrect
        // if (recall < 1 - noise) {
        //     return -1;
        // }
        double precision = (additionalInstances + coveredInstances == 0)
                ? 0
                : coveredInstances / (double) (coveredInstances + additionalInstances);

        if (heuristic.equals(HeuristicType.AMEASURE)) {
            // best reachable concept has the same recall and precision 1:
            // 1/(t+1) * (t*r + 1)
            if ((coverageFactor * recall + 1) / (coverageFactor + 1) < (1 - noise)) {
                return -1;
            } else {
                return Heuristics.getAScore(recall, precision, coverageFactor);
            }
        } else if (heuristic.equals(HeuristicType.FMEASURE)) {
            // best reachable concept has the same recall and precision 1:
            if (((1 + Math.sqrt(coverageFactor)) * recall) / (Math.sqrt(coverageFactor) + 1) < 1 - noise) {
                return -1;
            } else {
                return Heuristics.getFScore(recall, precision, coverageFactor);
            }
        } else if (heuristic.equals(HeuristicType.PRED_ACC)) {
            if ((coverageFactor * coveredInstances + superClassInstances.size())
                    / (coverageFactor * classInstances.size() + superClassInstances.size()) < 1 - noise) {
                return -1;
            } else {
                // correctly classified divided by all examples
                return (coverageFactor * coveredInstances + superClassInstances.size() - additionalInstances)
                        / (coverageFactor * classInstances.size() + superClassInstances.size());
            }
        }
    } else if (heuristic.equals(HeuristicType.GEN_FMEASURE)) {
        // implementation is based on:
        // http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-426/swap2008_submission_14.pdf
        // default negation should be turned off when using the fast instance checker

        // compute I_C (negated and non-negated concepts separately)
        TreeSet<OWLIndividual> icPos = new TreeSet<>();
        TreeSet<OWLIndividual> icNeg = new TreeSet<>();
        OWLClassExpression descriptionNeg = df.getOWLObjectComplementOf(description);
        // loop through all relevant instances
        for (OWLIndividual ind : classAndSuperClassInstances) {
            if (getReasoner().hasType(description, ind)) {
                icPos.add(ind);
            } else if (getReasoner().hasType(descriptionNeg, ind)) {
                icNeg.add(ind);
            }
            if (terminationTimeExpired()) {
                return 0;
            }
        }

        // semantic precision
        // first compute I_C \cap Cn(DC)
        // it seems that in our setting, we can ignore Cn, because the examples (class instances)
        // are already part of the background knowledge
        Set<OWLIndividual> tmp1Pos = Helper.intersection(icPos, classInstancesSet);
        Set<OWLIndividual> tmp1Neg = Helper.intersection(icNeg, negatedClassInstances);
        int tmp1Size = tmp1Pos.size() + tmp1Neg.size();

        // Cn(I_C) \cap D_C is the same set if we ignore Cn ...
        int icSize = icPos.size() + icNeg.size();
        double prec = (icSize == 0) ? 0 : tmp1Size / (double) icSize;
        double rec = tmp1Size / (double) (classInstances.size() + negatedClassInstances.size());

        // the too-weak shortcut used for F-measure above does not work for generalised
        // F-measure, because even very general concepts do not have a recall of 1;
        // we only return too weak if there is no recall at all
        if (rec <= 0.0000001) {
            return -1;
        }

        return getFMeasure(rec, prec);
    }
    throw new Error("ClassLearningProblem error: not implemented");
}
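All three too-weak shortcuts above share one pattern: compute the best score still reachable by further (downward) refinement, where recall stays the same and precision is driven to 1, and prune when even that optimum misses 1 - noise. Below is a minimal sketch of that pattern for the A-measure, whose weighted-mean form 1/(t+1) * (t*r + 1) is given in the comment above; the class and method names here are illustrative, not DL-Learner API.

public class TooWeakSketch {
    // weighted arithmetic mean of recall and precision ("A-score"):
    // 1/(t+1) * (t*r + p), as in the comment above
    static double aScore(double recall, double precision, double t) {
        return (t * recall + precision) / (t + 1);
    }

    // downward refinement can only lower recall, so the best reachable score keeps
    // the current recall and assumes perfect precision; if even that optimum is
    // below 1 - noise, the expression is "too weak" and the search can prune it
    static boolean tooWeak(double recall, double t, double noise) {
        return aScore(recall, 1.0, t) < 1 - noise;
    }

    public static void main(String[] args) {
        System.out.println(tooWeak(0.4, 1.0, 0.3)); // (0.4 + 1)/2 = 0.70 -> false
        System.out.println(tooWeak(0.3, 1.0, 0.3)); // (0.3 + 1)/2 = 0.65 -> true
    }
}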
@Override
public ClassScore computeScore(OWLClassExpression description, double noise) {
    // TODO: reuse code to ensure that we never return inconsistent results
    // between getAccuracy, getAccuracyOrTooWeak and computeScore
    Set<OWLIndividual> additionalInstances = new TreeSet<>();
    Set<OWLIndividual> coveredInstances = new TreeSet<>();

    int additionalInstancesCnt = 0;
    int coveredInstancesCnt = 0;

    if (reasoner instanceof SPARQLReasoner) {
        // note: in this branch only the counts are computed;
        // the instance sets above stay empty
        // R(C)
        String query = "SELECT (COUNT(DISTINCT(?s)) AS ?cnt) WHERE {"
                + "?s a ?sup . ?classToDescribe <http://www.w3.org/2000/01/rdf-schema#subClassOf> ?sup . "
                + converter.convert("?s", description)
                + "FILTER NOT EXISTS {?s a ?classToDescribe}}";
        ParameterizedSparqlString template = new ParameterizedSparqlString(query);
        template.setIri("classToDescribe", classToDescribe.toStringID());

        QueryExecution qe = ((SPARQLReasoner) reasoner)
                .getQueryExecutionFactory()
                .createQueryExecution(template.toString());
        additionalInstancesCnt = qe.execSelect().next().getLiteral("cnt").getInt();

        // R(A)
        OWLObjectIntersectionOf ce = df.getOWLObjectIntersectionOf(classToDescribe, description);
        coveredInstancesCnt = ((SPARQLReasoner) reasoner).getPopularityOf(ce);
    } else {
        // overhang
        for (OWLIndividual ind : superClassInstances) {
            if (getReasoner().hasType(description, ind)) {
                additionalInstances.add(ind);
            }
        }

        // coverage
        for (OWLIndividual ind : classInstances) {
            if (getReasoner().hasType(description, ind)) {
                coveredInstances.add(ind);
            }
        }

        additionalInstancesCnt = additionalInstances.size();
        coveredInstancesCnt = coveredInstances.size();
    }

    double recall = coveredInstancesCnt / (double) classInstances.size();
    double precision = (additionalInstancesCnt + coveredInstancesCnt == 0)
            ? 0
            : coveredInstancesCnt / (double) (coveredInstancesCnt + additionalInstancesCnt);

    double acc = 0;
    if (heuristic.equals(HeuristicType.FMEASURE)) {
        acc = Heuristics.getFScore(recall, precision, coverageFactor);
    } else if (heuristic.equals(HeuristicType.AMEASURE)) {
        acc = Heuristics.getAScore(recall, precision, coverageFactor);
    } else {
        // TODO: some superfluous instance checks are required to compute accuracy =>
        // move accuracy computation here if possible
        acc = getAccuracyOrTooWeakExact(description, noise);
    }

    if (checkConsistency) {
        // for each class expression with less than 100% coverage, we check whether
        // it leads to an inconsistent knowledge base

        // we check whether the axiom already follows from the knowledge base
        boolean followsFromKB = followsFromKB(description);

        // workaround due to a bug (see
        // http://sourceforge.net/tracker/?func=detail&aid=2866610&group_id=203619&atid=986319)
        // (if the axiom follows, then the knowledge base remains consistent)
        boolean isConsistent = followsFromKB || isConsistent(description);

        return new ClassScore(
                coveredInstances,
                Helper.difference(classInstancesSet, coveredInstances),
                recall,
                additionalInstances,
                precision,
                acc,
                isConsistent,
                followsFromKB);
    } else {
        return new ClassScore(
                coveredInstances,
                Helper.difference(classInstancesSet, coveredInstances),
                recall,
                additionalInstances,
                precision,
                acc);
    }
}
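The SPARQL branch in computeScore repeats the R(C)/R(A) counting code from getAccuracyOrTooWeakExact verbatim. One possible extraction into a shared helper is sketched below, under the assumption that it lives in the same class (it reuses the converter, classToDescribe, df, and reasoner fields from above); the method name is invented, not part of the original class.

// hypothetical helper; name and int[] return shape are illustrative only
private int[] countCoverageViaSparql(OWLClassExpression description) {
    // R(C): instances of some super class that satisfy the description
    // but do not belong to the class to describe
    String query = "SELECT (COUNT(DISTINCT(?s)) AS ?cnt) WHERE {"
            + "?s a ?sup . ?classToDescribe <http://www.w3.org/2000/01/rdf-schema#subClassOf> ?sup . "
            + converter.convert("?s", description)
            + "FILTER NOT EXISTS {?s a ?classToDescribe}}";
    ParameterizedSparqlString template = new ParameterizedSparqlString(query);
    template.setIri("classToDescribe", classToDescribe.toStringID());
    QueryExecution qe = ((SPARQLReasoner) reasoner)
            .getQueryExecutionFactory()
            .createQueryExecution(template.toString());
    int additional = qe.execSelect().next().getLiteral("cnt").getInt();
    qe.close();

    // R(A): instances in the intersection of the class to describe and the description
    OWLObjectIntersectionOf ce = df.getOWLObjectIntersectionOf(classToDescribe, description);
    int covered = ((SPARQLReasoner) reasoner).getPopularityOf(ce);

    return new int[] { covered, additional };
}

Both call sites would then reduce to a single call, which also addresses the TODO at the top of computeScore about keeping the two accuracy paths consistent.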