/**
 * Starts the harvester for queries and/or URLs.
 *
 * <p>While the harvester is not closed, the SPARQL endpoint is harvested (only when at least one
 * query is configured), then the RDF dumps are harvested, and finally the harvester is marked as
 * closed. If the harvester is already closed on entry, nothing is harvested.
 *
 * @return always {@code true}
 */
public boolean runIndexAll() {
  logger.info(
      "Starting RDF harvester: endpoint [{}], queries [{}],"
          + "URIs [{}], index name [{}], typeName [{}]",
      rdfEndpoint,
      rdfQueries,
      rdfUris,
      indexName,
      typeName);

  // The body sets the closed flag itself, so the flag is re-checked after every pass.
  while (!this.closed) {
    // Harvest from a SPARQL endpoint
    if (!rdfQueries.isEmpty()) {
      harvestFromEndpoint();
    }

    // Harvest from RDF dumps
    harvestFromDumps();

    closed = true;
  }

  logger.info(
      "Ended harvest for endpoint [{}], queries [{}]," + "URIs [{}], index name {}, type name {}",
      rdfEndpoint,
      rdfQueries,
      rdfUris,
      indexName,
      typeName);
  return true;
}
/**
 * Index all the resources in a Jena Model to ES.
 *
 * <p>The method first collects the predicates that are eligible for indexing, then creates one
 * index request per subject of the model. Requests are sent in batches: a bulk is executed each
 * time the number of queued documents reaches a multiple of {@code
 * EEASettings.DEFAULT_BULK_SIZE}, and any remainder is executed at the end.
 *
 * @param model the model to index
 * @param bulkRequest the BulkRequestBuilder that receives the first batch of index requests;
 *     after each executed batch a fresh builder obtained from the client is used instead
 * @param getPropLabel if set to true all URI property values will be indexed as their label. The
 *     label is taken as the value of one of the properties set in {@link #uriDescriptionList}.
 */
private void addModelToES(Model model, BulkRequestBuilder bulkRequest, boolean getPropLabel) {
  long startTime = System.currentTimeMillis();
  long bulkLength = 0;

  // Step 1: gather the predicates accepted by the property list configuration.
  HashSet<Property> properties = new HashSet<Property>();
  for (StmtIterator statements = model.listStatements(); statements.hasNext(); ) {
    Property predicate = statements.nextStatement().getPredicate();
    String predicateName = predicate.toString();

    // An empty property list accepts everything. Otherwise the predicate must be listed when
    // the list is a white list, unlisted when it is not, or be a key of normalizeProp.
    boolean accepted = rdfPropList.isEmpty();
    if (!accepted) {
      boolean listed = rdfPropList.contains(predicateName);
      accepted = (isWhitePropList ? listed : !listed) || normalizeProp.containsKey(predicateName);
    }

    if (accepted) {
      properties.add(predicate);
    }
  }

  // Step 2: queue one index request per subject, flushing full batches as we go.
  for (ResIterator subjects = model.listSubjects(); subjects.hasNext(); ) {
    Resource subject = subjects.nextResource();
    Map<String, ArrayList<String>> jsonMap = getJsonMap(subject, properties, model, getPropLabel);

    bulkRequest.add(
        client
            .prepareIndex(indexName, typeName, subject.toString())
            .setSource(mapToString(jsonMap)));
    bulkLength++;

    // Only execute the bulk once every DEFAULT_BULK_SIZE requests
    if (bulkLength % EEASettings.DEFAULT_BULK_SIZE != 0) {
      continue;
    }

    BulkResponse batchResponse = bulkRequest.execute().actionGet();
    // Start over with an empty builder for the next batch
    bulkRequest = client.prepareBulk();

    if (batchResponse.hasFailures()) {
      processBulkResponseFailure(batchResponse);
    }
  }

  // Step 3: execute whatever is left in the last, partially filled batch.
  if (bulkRequest.numberOfActions() > 0) {
    BulkResponse lastResponse = bulkRequest.execute().actionGet();
    if (lastResponse.hasFailures()) {
      processBulkResponseFailure(lastResponse);
    }
  }

  // Report how long the indexing took
  logger.info(
      "Indexed {} documents on {}/{} in {} seconds",
      bulkLength,
      indexName,
      typeName,
      (System.currentTimeMillis() - startTime) / 1000.0);
}
/** * Answer a list of the named hierarchy roots of a given {@link OntModel}. This will be similar to * the results of {@link OntModel#listHierarchyRootClasses()}, with the added constraint that * every member of the returned iterator will be a named class, not an anonymous class expression. * The named root classes are calculated from the root classes, by recursively replacing every * anonymous class with its direct sub-classes. Thus it can be seen that the values in the list * consists of the shallowest fringe of named classes in the hierarchy. * * @param m An ontology model * @return A list of classes whose members are the named root classes of the class hierarchy in * <code>m</code> */ public static List<OntClass> namedHierarchyRoots(OntModel m) { List<OntClass> nhr = new ArrayList<OntClass>(); // named roots List<OntClass> ahr = new ArrayList<OntClass>(); // anon roots // do the initial partition of the root classes partitionByNamed(m.listHierarchyRootClasses(), nhr, ahr); // now push the fringe down until we have only named classes while (!ahr.isEmpty()) { OntClass c = ahr.remove(0); partitionByNamed(c.listSubClasses(true), nhr, ahr); } return nhr; }
/** * Answer the shortest path from the <code>start</code> resource to the <code>end</code> RDF node, * such that every step on the path is accepted by the given filter. A path is a {@link List} of * RDF {@link Statement}s. The subject of the first statement in the list is <code>start</code>, * and the object of the last statement in the list is <code>end</code>. * * <p>The <code>onPath</code> argument is a {@link Filter}, which accepts a statement and returns * true if the statement should be considered to be on the path. To search for an unconstrained * path, pass {@link Filter#any} as an argument. To search for a path whose predicates match a * fixed restricted set of property names, pass an instance of {@link PredicatesFilter}. * * <p>If there is more than one path of minimal length from <code>start</code> to <code>end</code> * , this method returns an arbitrary one. The algorithm is blind breadth-first search, with loop * detection. * * @param m The model in which we are seeking a path * @param start The starting resource * @param end The end, or goal, node * @param onPath A filter which determines whether a given statement can be considered part of the * path * @return A path, consisting of a list of statements whose first subject is <code>start</code>, * and whose last object is <code>end</code>, or null if no such path exists. 
*/ public static Path findShortestPath( Model m, Resource start, RDFNode end, Filter<Statement> onPath) { List<Path> bfs = new LinkedList<Path>(); Set<Resource> seen = new HashSet<Resource>(); // initialise the paths for (Iterator<Statement> i = m.listStatements(start, null, (RDFNode) null).filterKeep(onPath); i.hasNext(); ) { bfs.add(new Path().append(i.next())); } // search Path solution = null; while (solution == null && !bfs.isEmpty()) { Path candidate = bfs.remove(0); if (candidate.hasTerminus(end)) { solution = candidate; } else { Resource terminus = candidate.getTerminalResource(); if (terminus != null) { seen.add(terminus); // breadth-first expansion for (Iterator<Statement> i = terminus.listProperties().filterKeep(onPath); i.hasNext(); ) { Statement link = i.next(); // no looping allowed, so we skip this link if it takes us to a node we've seen if (!seen.contains(link.getObject())) { bfs.add(candidate.append(link)); } } } } } return solution; }
/**
 * Return all values for the given option as Strings. The locally held values win; the parent
 * options object is consulted only when there are no local values and a parent exists.
 *
 * @param option the option whose values are requested
 * @return the local values if there are any, otherwise the parent's values when a parent exists,
 *     otherwise the (empty) local list
 */
@Override
protected List<String> getAllValues(OPT option) {
  List<String> localValues = super.getAllValues(option);

  if (!localValues.isEmpty() || !hasParent()) {
    return localValues;
  }

  return getParent().getAllValues(option);
}
/**
 * Build a query returning all triples in which members of uris are the subjects of the triplets.
 *
 * <p>If {@link #uriDescriptionList} is not empty, the query additionally contains logic to
 * retrieve the labels directly from the SPARQL endpoint: objects that have a value for one of the
 * listed properties are filtered out of the base triples and replaced, through UNION branches,
 * by the string value of that property.
 *
 * @param uris URIs for queried resources
 * @return a CONSTRUCT query string
 */
private String getSyncQueryStr(Iterable<String> uris) {
  /* Render the URIs as a parenthesised, comma separated list: (<a>, <b>) */
  StringBuilder uriSetBuilder = new StringBuilder("(");
  boolean firstUri = true;
  for (String uri : uris) {
    if (!firstUri) {
      uriSetBuilder.append(", ");
    }
    uriSetBuilder.append('<').append(uri).append('>');
    firstUri = false;
  }
  String uriSet = uriSetBuilder.append(")").toString();

  /* Get base triplets having any element from uris as subject */
  StringBuilder query = new StringBuilder("CONSTRUCT { ?s ?p ?o } WHERE {");
  query.append("{?s ?p ?o").append(" . FILTER (?s in ").append(uriSet).append(" )");

  /* Perform uri label resolution only if desired */
  if (uriDescriptionList.isEmpty()) {
    return query.append("}}").toString();
  }

  /* Filter out properties having a label; each property gets its own numbered variable */
  final String filterTemplate = " . OPTIONAL { ?o <%s> ?o%d } " + " . FILTER(!BOUND(?o%d))";
  int index = 1;
  for (String prop : uriDescriptionList) {
    query.append(String.format(filterTemplate, prop, index, index));
    index++;
  }
  query.append("}");

  /* We need this redundant clause as UNION queries can't handle sub-selects
   * without a prior clause.
   */
  final String redundantClause =
      "<http://www.w3.org/2000/01/rdf-schema#Class> "
          + "a <http://www.w3.org/2000/01/rdf-schema#Class>";

  /* Resolve ?o as str(?label) for the resource ?res
   * label is taken as being ?res <prop> ?label
   *
   * We need to take str(?label) in order to drop
   * language references of the terms so that the document
   * is indexed with a language present only in its top-level
   * properties.
   *
   * As some Virtuoso versions do not allow the usage
   * of BIND, we have to create a sub-select in order to bind
   * ?o to str(?label)
   *
   * The sub-select works only with a prior clause.
   * We are using a redundant clause that is always true
   */
  final String unionTemplate =
      " UNION "
          + "{ "
          + redundantClause
          + " . "
          + "{ SELECT ?s ?p (str(?label) as ?o) { "
          + " ?s ?p ?res"
          + " . FILTER (?s in %s)"
          + " . ?res <%s> ?label }}}";

  /* Add labels for filtered out properties */
  for (String prop : uriDescriptionList) {
    query.append(String.format(unionTemplate, uriSet, prop));
  }

  return query.append("}").toString();
}
/**
 * Sets the {@link Harvester}'s {@link #rdfPropList} parameter. An empty list is ignored and
 * leaves the current value untouched; otherwise a copy of the given list is stored.
 *
 * @param list - a list of properties names that are either required in the object description, or
 *     undesired, depending on its {@link #isWhitePropList}
 * @return the same {@link Harvester} with the {@link #rdfPropList} parameter set
 */
public Harvester rdfPropList(List<String> list) {
  if (list.isEmpty()) {
    // nothing to set: keep whatever is currently configured
    return this;
  }

  rdfPropList = new ArrayList<String>(list);
  return this;
}