/** Runs a harvest pass over the SPARQL queries and RDF dumps unless the harvester is closed. */
  public boolean runIndexAll() {
    logger.info(
        "Starting RDF harvester: endpoint [{}], queries [{}],"
            + "URIs [{}], index name [{}], typeName [{}]",
        rdfEndpoint,
        rdfQueries,
        rdfUris,
        indexName,
        typeName);

    // The closed flag is re-read before every pass; a pass marks the harvester
    // as closed once it completes, so the loop body normally runs at most once.
    while (!this.closed) {
      // SPARQL endpoint harvesting only happens when queries were configured.
      boolean hasQueries = !rdfQueries.isEmpty();
      if (hasQueries) {
        harvestFromEndpoint();
      }

      // RDF dumps are harvested on every pass.
      harvestFromDumps();

      closed = true;
    }

    logger.info(
        "Ended harvest for endpoint [{}], queries [{}],"
            + "URIs [{}], index name {}, type name {}",
        rdfEndpoint,
        rdfQueries,
        rdfUris,
        indexName,
        typeName);

    // The method reports success unconditionally.
    return true;
  }
  /**
   * Sends every subject of a Jena Model to Elasticsearch, one index request per resource.
   *
   * <p>Predicates are first filtered: a predicate is kept when {@code rdfPropList} is empty, when
   * its membership in {@code rdfPropList} agrees with {@code isWhitePropList}, or when it is a key
   * of {@code normalizeProp}. Index requests are executed in batches of {@code
   * EEASettings.DEFAULT_BULK_SIZE}; whatever is left over is executed at the end.
   *
   * @param model the model whose subjects are indexed
   * @param bulkRequest the builder receiving the first batch of index requests; after each executed
   *     batch a new builder is obtained from the client
   * @param getPropLabel if set to true all URI property values will be indexed as their label. The
   *     label is taken as the value of one of the properties set in {@link #uriDescriptionList}.
   */
  private void addModelToES(Model model, BulkRequestBuilder bulkRequest, boolean getPropLabel) {
    final long startedAt = System.currentTimeMillis();
    long indexedCount = 0;

    // Pass 1: gather the predicates that survive the property filter.
    HashSet<Property> acceptedProps = new HashSet<Property>();
    for (StmtIterator statements = model.listStatements(); statements.hasNext(); ) {
      Property predicate = statements.nextStatement().getPredicate();
      String predicateName = predicate.toString();

      boolean accepted =
          rdfPropList.isEmpty()
              || (isWhitePropList
                  ? rdfPropList.contains(predicateName)
                  : !rdfPropList.contains(predicateName))
              || normalizeProp.containsKey(predicateName);
      if (accepted) {
        acceptedProps.add(predicate);
      }
    }

    // Pass 2: turn each subject into a document and queue it on the bulk request.
    for (ResIterator subjects = model.listSubjects(); subjects.hasNext(); ) {
      Resource subject = subjects.nextResource();
      Map<String, ArrayList<String>> document =
          getJsonMap(subject, acceptedProps, model, getPropLabel);

      bulkRequest.add(
          client
              .prepareIndex(indexName, typeName, subject.toString())
              .setSource(mapToString(document)));
      indexedCount++;

      // Nothing to send until a full batch of DEFAULT_BULK_SIZE requests has accumulated.
      if (indexedCount % EEASettings.DEFAULT_BULK_SIZE != 0) {
        continue;
      }

      BulkResponse batchResponse = bulkRequest.execute().actionGet();
      // Start the next batch on a fresh builder.
      bulkRequest = client.prepareBulk();

      if (batchResponse.hasFailures()) {
        processBulkResponseFailure(batchResponse);
      }
    }

    // Send whatever did not fill a complete batch.
    if (bulkRequest.numberOfActions() > 0) {
      BulkResponse tailResponse = bulkRequest.execute().actionGet();
      if (tailResponse.hasFailures()) {
        processBulkResponseFailure(tailResponse);
      }
    }

    // Report how long the indexing took.
    double elapsedSeconds = (System.currentTimeMillis() - startedAt) / 1000.0;
    logger.info(
        "Indexed {} documents on {}/{} in {} seconds",
        indexedCount,
        indexName,
        typeName,
        elapsedSeconds);
  }
  // Example #3
  // 0
  /**
   * Compute the named roots of the class hierarchy of an {@link OntModel}. The result resembles
   * {@link OntModel#listHierarchyRootClasses()}, except that it never contains an anonymous class
   * expression: each anonymous root is replaced, repeatedly, by its direct sub-classes until only
   * named classes remain. The list therefore holds the shallowest fringe of named classes in the
   * hierarchy.
   *
   * @param m An ontology model
   * @return A list of the named root classes of the class hierarchy in <code>m</code>
   */
  public static List<OntClass> namedHierarchyRoots(OntModel m) {
    final List<OntClass> namedRoots = new ArrayList<OntClass>();
    final List<OntClass> pendingAnonymous = new ArrayList<OntClass>();

    // Seed both partitions from the model's hierarchy roots.
    partitionByNamed(m.listHierarchyRootClasses(), namedRoots, pendingAnonymous);

    // Take anonymous classes from the front, oldest first, and partition their
    // direct sub-classes in turn until no anonymous class is left.
    while (pendingAnonymous.size() > 0) {
      OntClass anonymousClass = pendingAnonymous.remove(0);
      partitionByNamed(anonymousClass.listSubClasses(true), namedRoots, pendingAnonymous);
    }

    return namedRoots;
  }
  // Example #4
  // 0
  /**
   * Find a shortest path leading from the <code>start</code> resource to the <code>end</code> RDF
   * node in which every step is accepted by the supplied filter. A path is a {@link List} of RDF
   * {@link Statement}s: the first statement has <code>start</code> as its subject and the last
   * statement has <code>end</code> as its object.
   *
   * <p>The <code>onPath</code> {@link Filter} is applied to each statement and decides whether
   * that statement may take part in a path. Pass {@link Filter#any} for an unconstrained search, or
   * an instance of {@link PredicatesFilter} to restrict the predicates to a fixed set of
   * properties.
   *
   * <p>When several paths of minimal length exist, an arbitrary one of them is returned. The
   * search is a blind breadth-first search with loop detection.
   *
   * @param m The model in which we are seeking a path
   * @param start The starting resource
   * @param end The end, or goal, node
   * @param onPath A filter which determines whether a given statement can be considered part of the
   *     path
   * @return A path, consisting of a list of statements whose first subject is <code>start</code>,
   *     and whose last object is <code>end</code>, or null if no such path exists.
   */
  public static Path findShortestPath(
      Model m, Resource start, RDFNode end, Filter<Statement> onPath) {
    List<Path> frontier = new LinkedList<Path>();
    Set<Resource> expanded = new HashSet<Resource>();

    // Seed the frontier with one single-statement path per accepted statement leaving start.
    Iterator<Statement> seeds = m.listStatements(start, null, (RDFNode) null).filterKeep(onPath);
    while (seeds.hasNext()) {
      frontier.add(new Path().append(seeds.next()));
    }

    // Paths are taken from the front of the list and extensions are added at the back,
    // so candidates are examined in breadth-first order.
    while (!frontier.isEmpty()) {
      Path current = frontier.remove(0);

      if (current.hasTerminus(end)) {
        return current;
      }

      Resource tip = current.getTerminalResource();
      if (tip == null) {
        // No terminal resource to expand from, so this path cannot be extended.
        continue;
      }
      expanded.add(tip);

      Iterator<Statement> outgoing = tip.listProperties().filterKeep(onPath);
      while (outgoing.hasNext()) {
        Statement step = outgoing.next();

        // Loop detection: do not step onto a node that has already been expanded.
        if (expanded.contains(step.getObject())) {
          continue;
        }
        frontier.add(current.append(step));
      }
    }

    // Frontier exhausted without reaching end.
    return null;
  }
 /**
  * Return, as Strings, every value held for the given option. The values found by the superclass
  * lookup win; only when that lookup yields nothing and a parent options object exists is the
  * parent asked instead.
  */
 @Override
 protected List<String> getAllValues(OPT option) {
   List<String> ownValues = super.getAllValues(option);

   // Keep the local result when it has content or when there is no parent to fall back on.
   if (!ownValues.isEmpty() || !hasParent()) {
     return ownValues;
   }
   return getParent().getAllValues(option);
 }
  /**
   * Build a CONSTRUCT query that returns all triples whose subject is one of the given URIs.
   *
   * <p>When {@link #uriDescriptionList} is not empty, the query is extended so that labels are
   * retrieved directly from the SPARQL endpoint: objects that have one of the listed description
   * properties are filtered out of the base pattern, and one UNION branch per description property
   * adds <code>str(?label)</code> as the object instead.
   *
   * @param uris URIs for queried resources
   * @return a CONSTRUCT query string
   */
  private String getSyncQueryStr(Iterable<String> uris) {
    /* Render the URIs as a parenthesised, comma-separated list of <uri> terms */
    StringBuilder listing = new StringBuilder("(");
    boolean first = true;
    for (String uri : uris) {
      if (!first) {
        listing.append(", ");
      }
      listing.append('<').append(uri).append('>');
      first = false;
    }
    final String uriSet = listing.append(")").toString();

    /* Base triples having any element from uris as subject */
    StringBuilder query = new StringBuilder("CONSTRUCT { ?s ?p ?o } WHERE {");
    query.append("{?s ?p ?o");
    query.append(String.format(" . FILTER (?s in %s )", uriSet));

    /* Without description properties there is no label resolution to add */
    final boolean resolveLabels = !uriDescriptionList.isEmpty();
    if (!resolveLabels) {
      return query.append("}}").toString();
    }

    /* Exclude objects that carry one of the description properties; each
     * property gets its own numbered helper variable, starting at 1.
     */
    final String excludeTemplate = " . OPTIONAL { ?o <%s> ?o%d } " + " . FILTER(!BOUND(?o%d))";
    int ordinal = 1;
    for (String prop : uriDescriptionList) {
      query.append(String.format(excludeTemplate, prop, ordinal, ordinal));
      ordinal++;
    }
    query.append("}");

    /* The sub-select below works only with a prior clause, so an always-true
     * clause is placed in front of it.
     */
    final String redundantClause =
        "<http://www.w3.org/2000/01/rdf-schema#Class> "
            + "a <http://www.w3.org/2000/01/rdf-schema#Class>";

    /* One UNION branch per description property: ?o becomes str(?label) where
     * ?res <prop> ?label. Taking str() drops the language tag of the label.
     * A sub-select is used instead of BIND because some Virtuoso versions do
     * not allow BIND.
     */
    final String labelBranchTemplate =
        " UNION "
            + "{ "
            + redundantClause
            + " . "
            + "{ SELECT ?s ?p (str(?label) as ?o) { "
            + "   ?s ?p ?res"
            + "   . FILTER (?s in %s)"
            + "   . ?res <%s> ?label }}}";
    for (String prop : uriDescriptionList) {
      query.append(String.format(labelBranchTemplate, uriSet, prop));
    }

    return query.append("}").toString();
  }
 /**
  * Configures this {@link Harvester}'s {@link #rdfPropList}. An empty argument leaves the
  * current value untouched; otherwise a copy of the argument is stored.
  *
  * @param list - names of the properties that are either required in the object description or
  *     unwanted in it, depending on {@link #isWhitePropList}
  * @return this {@link Harvester}, to allow call chaining
  */
 public Harvester rdfPropList(List<String> list) {
   if (list.isEmpty()) {
     return this;
   }

   // Store a private copy so later changes to the caller's list do not affect the harvester.
   rdfPropList = new ArrayList<String>(list);
   return this;
 }