/**
 * Return an array of the distinct predicates in the KB ordered by their
 * descending frequency of use. The {@link IV}s in the returned array will
 * have been resolved to the corresponding {@link BigdataURI}s, which can be
 * accessed using {@link IV#getValue()}.
 *
 * @param kb The KB instance.
 */
protected static IVCount[] predicateUsage(final AbstractTripleStore kb) {

    final SPORelation r = kb.getSPORelation();

    if (r.oneAccessPath) {
        // The necessary index (POS or POCS) does not exist.
        throw new UnsupportedOperationException();
    }

    final boolean quads = kb.isQuads();

    // The index to use for the distinct predicate scan.
    final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;

    // Visit the distinct term identifiers for the predicate position on that index.
    @SuppressWarnings("rawtypes")
    final IChunkedIterator<IV> itr = r.distinctTermScan(keyOrder);

    // Resolve term identifiers to terms efficiently during iteration.
    final BigdataValueIterator itr2 =
            new BigdataValueIteratorImpl(kb /* resolveTerms */, itr);

    try {

        final Set<IV<?, ?>> ivs = new LinkedHashSet<IV<?, ?>>();

        final Map<IV<?, ?>, IVCount> counts = new LinkedHashMap<IV<?, ?>, IVCount>();

        while (itr2.hasNext()) {

            final BigdataValue term = itr2.next();

            final IV<?, ?> iv = term.getIV();

            final long n = r.getAccessPath(null, iv, null, null)
                    .rangeCount(false /* exact */);

            ivs.add(iv);

            counts.put(iv, new IVCount(iv, n));

        }

        // Batch resolve the IVs to Values.
        final Map<IV<?, ?>, BigdataValue> x = kb.getLexiconRelation().getTerms(ivs);

        for (Map.Entry<IV<?, ?>, BigdataValue> e : x.entrySet()) {

            final IVCount count = counts.get(e.getKey());

            count.setValue(e.getValue());

        }

        final IVCount[] a = counts.values().toArray(new IVCount[counts.size()]);

        // Order by descending count.
        Arrays.sort(a);

        return a;

    } finally {

        itr2.close();

    }

}
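/*
 * Usage sketch (illustrative only, not part of the original class): print
 * each predicate together with its estimated triple count. This assumes
 * IVCount exposes the resolved term via getValue() and the range count via
 * a public count field, as suggested by the code above; adjust the
 * accessors to match the actual IVCount API.
 */
private static void printPredicateUsage(final AbstractTripleStore kb) {

    for (IVCount c : predicateUsage(kb)) {

        // One line per predicate partition: term, then estimated frequency.
        System.out.println(c.getValue() + "\t" + c.count);

    }

}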
/**
 * Return an efficient statistical summary for the class partitions. The
 * SPARQL query for this is
 *
 * <pre>
 * SELECT ?class (COUNT(?s) AS ?count) { ?s a ?class } GROUP BY ?class ORDER BY ?count
 * </pre>
 *
 * However, it is much more efficient to scan the POS index for
 *
 * <pre>
 * rdf:type ?o ?s
 * </pre>
 *
 * and report the range count of that access path for each distinct value
 * of <code>?o</code>.
 *
 * @param kb The KB instance.
 * @return The class usage statistics.
 */
protected static IVCount[] classUsage(final AbstractTripleStore kb) {

    final SPORelation r = kb.getSPORelation();

    if (r.oneAccessPath) {
        // The necessary index (POS or POCS) does not exist.
        throw new UnsupportedOperationException();
    }

    final boolean quads = kb.isQuads();

    // The index to use for the distinct term scan.
    final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;

    // Resolve the IV for rdf:type.
    final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);

    kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType },
            1 /* numTerms */, true /* readOnly */);

    if (rdfType.getIV() == null) {
        // No rdf:type assertions since rdf:type is an unknown term.
        return new IVCount[0];
    }

    // Visit the distinct term identifiers for the rdf:type predicate.
    @SuppressWarnings("rawtypes")
    final IChunkedIterator<IV> itr = r.distinctMultiTermScan(keyOrder,
            new IV[] { rdfType.getIV() } /* knownTerms */);

    // Resolve term identifiers to terms efficiently during iteration.
    final BigdataValueIterator itr2 =
            new BigdataValueIteratorImpl(kb /* resolveTerms */, itr);

    try {

        final Set<IV<?, ?>> ivs = new LinkedHashSet<IV<?, ?>>();

        final Map<IV<?, ?>, IVCount> counts = new LinkedHashMap<IV<?, ?>, IVCount>();

        while (itr2.hasNext()) {

            final BigdataValue term = itr2.next();

            final IV<?, ?> iv = term.getIV();

            final long n = r.getAccessPath(null, rdfType.getIV() /* p */,
                    iv /* o */, null).rangeCount(false /* exact */);

            ivs.add(iv);

            counts.put(iv, new IVCount(iv, n));

        }

        // Batch resolve the IVs to Values.
        final Map<IV<?, ?>, BigdataValue> x = kb.getLexiconRelation().getTerms(ivs);

        for (Map.Entry<IV<?, ?>, BigdataValue> e : x.entrySet()) {

            final IVCount count = counts.get(e.getKey());

            count.setValue(e.getValue());

        }

        final IVCount[] a = counts.values().toArray(new IVCount[counts.size()]);

        // Order by descending count.
        Arrays.sort(a);

        return a;

    } finally {

        itr2.close();

    }

}
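/*
 * Usage sketch (illustrative only): collect the class partition statistics
 * into a Map keyed by the resolved class term. Assumes IVCount#getValue()
 * returns the BigdataValue assigned by the batch resolution step above.
 */
private static Map<BigdataValue, IVCount> classUsageMap(final AbstractTripleStore kb) {

    final Map<BigdataValue, IVCount> m = new LinkedHashMap<BigdataValue, IVCount>();

    for (IVCount c : classUsage(kb)) {

        // Insertion order preserves the descending-count sort.
        m.put(c.getValue(), c);

    }

    return m;

}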
/**
 * Describe the default data set (the one identified by the namespace
 * associated with the {@link AbstractTripleStore}).
 *
 * @param describeStatistics When <code>true</code>, the VoID description
 *        will include the {@link VoidVocabularyDecl#vocabulary}
 *        declarations, the property partition statistics, and the class
 *        partition statistics.
 * @param describeNamedGraphs When <code>true</code>, each named graph will
 *        also be described in the same level of detail as the default
 *        graph. Otherwise only the default graph will be described.
 */
public void describeDataSet(final boolean describeStatistics,
        final boolean describeNamedGraphs) {

    final String namespace = tripleStore.getNamespace();

    // This is a VoID data set.
    g.add(aDataset, RDF.TYPE, VoidVocabularyDecl.Dataset);

    // The namespace is used as a title for the data set.
    g.add(aDataset, DCTermsVocabularyDecl.title, f.createLiteral(namespace));

    // Also present the namespace in an unambiguous manner.
    g.add(aDataset, SD.KB_NAMESPACE, f.createLiteral(namespace));

    /*
     * Service end point for this namespace.
     *
     * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/689" >
     *      Missing URL encoding in RemoteRepositoryManager </a>
     */
    for (String uri : serviceURI) {
        g.add(aDataset, VoidVocabularyDecl.sparqlEndpoint,
                f.createURI(uri + "/" + ConnectOptions.urlEncode(namespace)
                        + "/sparql"));
    }

    // Any URI is considered to be an entity.
    g.add(aDataset, VoidVocabularyDecl.uriRegexPattern, f.createLiteral("^.*"));

    if (!describeStatistics) {
        // No statistics.
        return;
    }

    // Frequency count of the predicates in the default graph.
    final IVCount[] predicatePartitionCounts = predicateUsage(tripleStore);

    // Frequency count of the classes in the default graph.
    final IVCount[] classPartitionCounts = classUsage(tripleStore);

    // Describe vocabularies based on the predicate partitions.
    describeVocabularies(predicatePartitionCounts);

    // defaultGraph description.
    {

        // Default graph in the default data set.
        g.add(aDataset, SD.defaultGraph, aDefaultGraph);

        // Describe the default graph using statistics.
        describeGraph(aDefaultGraph, predicatePartitionCounts,
                classPartitionCounts);

    } // end defaultGraph

    /*
     * Report for each named graph.
     */
    if (describeNamedGraphs && tripleStore.isQuads()) {

        final SPORelation r = tripleStore.getSPORelation();

        // The index to use for the distinct term scan.
        final SPOKeyOrder keyOrder = SPOKeyOrder.CSPO;

        // Visit the distinct IVs for the context position on that index.
        @SuppressWarnings("rawtypes")
        final IChunkedIterator<IV> itr = r.distinctTermScan(keyOrder);

        // Resolve IVs to terms efficiently during iteration.
        final BigdataValueIterator itr2 =
                new BigdataValueIteratorImpl(tripleStore /* resolveTerms */, itr);

        try {

            while (itr2.hasNext()) {

                /*
                 * Describe this named graph.
                 *
                 * Note: This is using the predicate and class partition
                 * statistics from the default graph (RDF merge) to identify
                 * the set of all possible predicates and classes within
                 * each named graph. It then tests each predicate and class
                 * partition against the named graph and ignores those which
                 * are not present in a given named graph. This is being
                 * done because we do not have a CPxx index.
                 */

                final BigdataResource graph = (BigdataResource) itr2.next();

                final IVCount[] predicatePartitionCounts2 = predicateUsage(
                        tripleStore, graph.getIV(), predicatePartitionCounts);

                final IVCount[] classPartitionCounts2 = classUsage(
                        tripleStore, graph.getIV(), classPartitionCounts);

                final BNode aNamedGraph = f.createBNode();

                // Named graph in the default data set.
                g.add(aDataset, SD.namedGraph, aNamedGraph);

                // The name of that named graph.
                g.add(aNamedGraph, SD.name, graph);

                // Describe the named graph.
                describeGraph(aNamedGraph, predicatePartitionCounts2,
                        classPartitionCounts2);

            }

        } finally {

            itr2.close();

        }

    }

}
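/*
 * Serialization sketch (illustrative only; assumes the VoID description has
 * already been written into the supplied Graph, e.g. by
 * describeDataSet(true, true), and that the openrdf Rio classes are on the
 * classpath: org.openrdf.rio.Rio, RDFFormat, RDFWriter, and
 * RDFHandlerException, plus java.io.OutputStream).
 */
private static void writeAsTurtle(final Graph g, final OutputStream os)
        throws RDFHandlerException {

    final RDFWriter writer = Rio.createWriter(RDFFormat.TURTLE, os);

    writer.startRDF();

    // Graph extends Collection<Statement>, so its statements can be
    // streamed to the writer directly.
    for (Statement st : g) {
        writer.handleStatement(st);
    }

    writer.endRDF();

}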