Example #1
0
  /**
   * Compute usage statistics for every distinct predicate in the KB.
   *
   * <p>The distinct predicates are found with a distinct term scan over the POS index (POCS when
   * the KB is in quads mode). A fast, non-exact range count is taken for each predicate, and the
   * predicate {@link IV}s are then batch resolved against the lexicon so that each returned {@link
   * IVCount} carries its resolved value. The {@link IV}s in the returned array will have been
   * resolved to the corresponding {@link BigdataURI}s, which can be accessed using {@link
   * IV#getValue()}.
   *
   * @param kb The KB instance.
   * @return One {@link IVCount} per distinct predicate, sorted using the natural order of {@link
   *     IVCount} (intended to be descending frequency of use).
   * @throws UnsupportedOperationException if the statement relation only maintains a single access
   *     path, in which case the required index (POS or POCS) does not exist.
   */
  protected static IVCount[] predicateUsage(final AbstractTripleStore kb) {

    final SPORelation spoRelation = kb.getSPORelation();

    if (spoRelation.oneAccessPath) {
      // Without the POS (or POCS) index there is nothing to run the distinct predicate scan on.
      throw new UnsupportedOperationException();
    }

    // Choose the index used for the distinct predicate scan.
    final SPOKeyOrder predicateIndex;
    if (kb.isQuads()) {
      predicateIndex = SPOKeyOrder.POCS;
    } else {
      predicateIndex = SPOKeyOrder.POS;
    }

    // Distinct term identifiers found in the predicate position of that index.
    @SuppressWarnings("rawtypes")
    final IChunkedIterator<IV> distinctPredicates = spoRelation.distinctTermScan(predicateIndex);

    // Wrap the scan so that the term identifiers are resolved to terms while iterating.
    final BigdataValueIterator predicates =
        new BigdataValueIteratorImpl(kb /* resolveTerms */, distinctPredicates);

    try {

      // Both collections preserve the order in which the predicates were visited.
      final Set<IV<?, ?>> predicateIVs = new LinkedHashSet<IV<?, ?>>();
      final Map<IV<?, ?>, IVCount> countsByIV = new LinkedHashMap<IV<?, ?>, IVCount>();

      while (predicates.hasNext()) {

        final IV<?, ?> predicate = predicates.next().getIV();

        // Fast (non-exact) range count of the statements using this predicate.
        final long useCount =
            spoRelation.getAccessPath(null, predicate, null, null).rangeCount(false /* exact */);

        predicateIVs.add(predicate);
        countsByIV.put(predicate, new IVCount(predicate, useCount));
      }

      // Resolve all of the IVs to Values in one batch and attach each Value to its count.
      final Map<IV<?, ?>, BigdataValue> resolved = kb.getLexiconRelation().getTerms(predicateIVs);

      for (Map.Entry<IV<?, ?>, BigdataValue> entry : resolved.entrySet()) {
        countsByIV.get(entry.getKey()).setValue(entry.getValue());
      }

      final IVCount[] sorted = countsByIV.values().toArray(new IVCount[countsByIV.size()]);

      // Natural order of IVCount: descending count.
      Arrays.sort(sorted);

      return sorted;

    } finally {

      // Always release the scan, including when a range count or the lexicon lookup fails.
      predicates.close();
    }
  }
Example #2
0
  /**
   * Compute an efficient statistical summary of the class partitions in the KB.
   *
   * <p>The equivalent SPARQL query is
   *
   * <pre>
   * SELECT  ?class (COUNT(?s) AS ?count ) { ?s a ?class } GROUP BY ?class ORDER BY ?count
   * </pre>
   *
   * However, it is much more efficient to scan the POS index (POCS in quads mode) for the distinct
   * values of <code>?o</code> in
   *
   * <pre>
   * rdf:type ?o ?s
   * </pre>
   *
   * and then report, for each such <code>?o</code>, the fast (non-exact) range count of
   *
   * <pre>
   * rdf:type ?o ?s
   * </pre>
   *
   * @param kb The KB instance.
   * @return The class usage statistics, sorted using the natural order of {@link IVCount}
   *     (intended to be descending count). The array is empty when <code>rdf:type</code> is not a
   *     known term in the lexicon.
   * @throws UnsupportedOperationException if the statement relation only maintains a single access
   *     path, in which case the required index (POS or POCS) does not exist.
   */
  protected static IVCount[] classUsage(final AbstractTripleStore kb) {

    final SPORelation spoRelation = kb.getSPORelation();

    if (spoRelation.oneAccessPath) {
      // Without the POS (or POCS) index there is nothing to run the distinct term scan on.
      throw new UnsupportedOperationException();
    }

    // Choose the index used for the scan.
    final SPOKeyOrder classIndex;
    if (kb.isQuads()) {
      classIndex = SPOKeyOrder.POCS;
    } else {
      classIndex = SPOKeyOrder.POS;
    }

    // Look up (read-only) the IV for rdf:type in the lexicon.
    final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);

    kb.getLexiconRelation()
        .addTerms(new BigdataValue[] {rdfType}, 1 /* numTerms */, true /* readOnly */);

    final IV<?, ?> rdfTypeIV = rdfType.getIV();

    if (rdfTypeIV == null) {
      // rdf:type is an unknown term, so there cannot be any rdf:type assertions.
      return new IVCount[0];
    }

    // Distinct term identifiers which follow the rdf:type predicate in the index.
    @SuppressWarnings("rawtypes")
    final IChunkedIterator<IV> distinctClasses =
        spoRelation.distinctMultiTermScan(classIndex, new IV[] {rdfTypeIV} /* knownTerms */);

    // Wrap the scan so that the term identifiers are resolved to terms while iterating.
    final BigdataValueIterator classes =
        new BigdataValueIteratorImpl(kb /* resolveTerms */, distinctClasses);

    try {

      // Both collections preserve the order in which the classes were visited.
      final Set<IV<?, ?>> classIVs = new LinkedHashSet<IV<?, ?>>();
      final Map<IV<?, ?>, IVCount> countsByIV = new LinkedHashMap<IV<?, ?>, IVCount>();

      while (classes.hasNext()) {

        final IV<?, ?> clazz = classes.next().getIV();

        // Fast (non-exact) range count of (?s rdf:type clazz).
        final long instanceCount =
            spoRelation
                .getAccessPath(null, rdfTypeIV /* p */, clazz /* o */, null)
                .rangeCount(false /* exact */);

        classIVs.add(clazz);
        countsByIV.put(clazz, new IVCount(clazz, instanceCount));
      }

      // Resolve all of the IVs to Values in one batch and attach each Value to its count.
      final Map<IV<?, ?>, BigdataValue> resolved = kb.getLexiconRelation().getTerms(classIVs);

      for (Map.Entry<IV<?, ?>, BigdataValue> entry : resolved.entrySet()) {
        countsByIV.get(entry.getKey()).setValue(entry.getValue());
      }

      final IVCount[] sorted = countsByIV.values().toArray(new IVCount[countsByIV.size()]);

      // Natural order of IVCount: descending count.
      Arrays.sort(sorted);

      return sorted;

    } finally {

      // Always release the scan, including when a range count or the lexicon lookup fails.
      classes.close();
    }
  }
Example #3
0
  /**
   * Describe the default data set (the one identified by the namespace associated with the {@link
   * AbstractTripleStore}).
   *
   * <p>The basic description (type, title, namespace, SPARQL end points and URI pattern) is always
   * written. The statistics, and optionally the per named graph descriptions, are only written on
   * request.
   *
   * @param describeStatistics When <code>true</code>, the VoID description will include the {@link
   *     VoidVocabularyDecl#vocabulary} declarations, the property partition statistics, and the
   *     class partition statistics. When <code>false</code>, nothing beyond the basic description
   *     is written and <code>describeNamedGraphs</code> is ignored.
   * @param describeNamedGraphs When <code>true</code>, each named graph will also be described in
   *     the same level of detail as the default graph (this only applies to a quads mode KB).
   *     Otherwise only the default graph will be described.
   */
  public void describeDataSet(final boolean describeStatistics, final boolean describeNamedGraphs) {

    final String kbNamespace = tripleStore.getNamespace();

    // Declare the resource as a VoID data set.
    g.add(aDataset, RDF.TYPE, VoidVocabularyDecl.Dataset);

    // Use the namespace as the title of the data set.
    g.add(aDataset, DCTermsVocabularyDecl.title, f.createLiteral(kbNamespace));

    // Also report the namespace under a dedicated, unambiguous property.
    g.add(aDataset, SD.KB_NAMESPACE, f.createLiteral(kbNamespace));

    /*
     * Declare one SPARQL end point for this namespace per service URI. The namespace is URL
     * encoded when it is embedded into the end point URI.
     *
     * See https://sourceforge.net/apps/trac/bigdata/ticket/689 (Missing URL encoding in
     * RemoteRepositoryManager).
     */
    for (String endpointBase : serviceURI) {
      final String sparqlEndpoint =
          endpointBase + "/" + ConnectOptions.urlEncode(kbNamespace) + "/sparql";
      g.add(aDataset, VoidVocabularyDecl.sparqlEndpoint, f.createURI(sparqlEndpoint));
    }

    // Every URI is treated as an entity.
    g.add(aDataset, VoidVocabularyDecl.uriRegexPattern, f.createLiteral("^.*"));

    if (!describeStatistics) {
      // The caller did not ask for statistics, so the basic description is all we write.
      return;
    }

    // Predicate frequency counts for the default graph.
    final IVCount[] defaultPredicateCounts = predicateUsage(tripleStore);

    // Class frequency counts for the default graph.
    final IVCount[] defaultClassCounts = classUsage(tripleStore);

    // The vocabularies are derived from the predicate partitions.
    describeVocabularies(defaultPredicateCounts);

    // Declare the default graph of the data set and describe it using the statistics.
    g.add(aDataset, SD.defaultGraph, aDefaultGraph);
    describeGraph(aDefaultGraph, defaultPredicateCounts, defaultClassCounts);

    // Note: term, URI, literal and blank node counts are not reported here.

    if (!describeNamedGraphs || !tripleStore.isQuads()) {
      // Named graph descriptions were not requested, or the KB is not in quads mode.
      return;
    }

    /*
     * Report on each named graph.
     */
    final SPORelation spoRelation = tripleStore.getSPORelation();

    // Distinct IVs found in the context position of the CSPO index.
    @SuppressWarnings("rawtypes")
    final IChunkedIterator<IV> distinctContexts = spoRelation.distinctTermScan(SPOKeyOrder.CSPO);

    // Wrap the scan so that the IVs are resolved to terms while iterating.
    final BigdataValueIterator namedGraphs =
        new BigdataValueIteratorImpl(tripleStore /* resolveTerms */, distinctContexts);

    try {

      while (namedGraphs.hasNext()) {

        /*
         * Note: The predicate and class partition statistics from the default graph (RDF merge)
         * are used to identify the set of all possible predicates and classes within each named
         * graph. Each of those partitions is then tested against the named graph, and those
         * which are not present in that named graph are ignored. This is done because there is
         * no CPxx index.
         */
        final BigdataResource graphName = (BigdataResource) namedGraphs.next();

        final IVCount[] graphPredicateCounts =
            predicateUsage(tripleStore, graphName.getIV(), defaultPredicateCounts);

        final IVCount[] graphClassCounts =
            classUsage(tripleStore, graphName.getIV(), defaultClassCounts);

        // Each named graph is described through its own blank node.
        final BNode namedGraphNode = f.createBNode();

        // Attach the named graph to the data set.
        g.add(aDataset, SD.namedGraph, namedGraphNode);

        // Record the name of the named graph.
        g.add(namedGraphNode, SD.name, graphName);

        // Describe the named graph using its own statistics.
        describeGraph(namedGraphNode, graphPredicateCounts, graphClassCounts);
      }

    } finally {

      // Always release the scan over the named graphs.
      namedGraphs.close();
    }
  }