/**
 * Runs one full harvesting pass over the configured SPARQL queries and RDF dumps.
 *
 * <p>Nothing is harvested when the harvester is already marked as closed. Otherwise the SPARQL
 * endpoint is harvested only if at least one query is configured, the RDF dumps are harvested
 * afterwards, and the harvester is then marked as closed.
 *
 * @return always {@code true}
 */
public boolean runIndexAll() {
  logger.info(
      "Starting RDF harvester: endpoint [{}], queries [{}],"
          + "URIs [{}], index name [{}], typeName [{}]",
      rdfEndpoint,
      rdfQueries,
      rdfUris,
      indexName,
      typeName);

  // The body raises the closed flag itself; the flag is re-read before any further pass.
  while (!this.closed) {
    // SPARQL endpoint first, and only when there is something to ask
    if (!rdfQueries.isEmpty()) {
      harvestFromEndpoint();
    }

    // RDF dumps are harvested unconditionally
    harvestFromDumps();

    closed = true;
  }

  logger.info(
      "Ended harvest for endpoint [{}], queries [{}],"
          + "URIs [{}], index name {}, type name {}",
      rdfEndpoint,
      rdfQueries,
      rdfUris,
      indexName,
      typeName);
  return true;
}
/**
 * Indexes every subject resource of a Jena {@code Model} into Elasticsearch in bulk batches.
 *
 * <p>A predicate of the model is taken into account when {@link #rdfPropList} is empty, when its
 * membership in {@link #rdfPropList} matches {@link #isWhitePropList} (listed for a white list,
 * not listed for a black list), or when it is a key of {@code normalizeProp}.
 *
 * <p>The bulk is executed every {@code EEASettings.DEFAULT_BULK_SIZE} documents; after each such
 * execution a fresh builder obtained from {@code client.prepareBulk()} is used for the following
 * documents. Any remaining documents are sent at the end.
 *
 * @param model the model to index
 * @param bulkRequest the BulkRequestBuilder receiving the first batch of index requests
 * @param getPropLabel if set to true all URI property values will be indexed as their label. The
 *     label is taken as the value of one of the properties set in {@link #uriDescriptionList}.
 */
private void addModelToES(Model model, BulkRequestBuilder bulkRequest, boolean getPropLabel) {
  final long startedAt = System.currentTimeMillis();
  long indexedCount = 0;

  // Step 1: collect the predicates that are allowed into the documents
  HashSet<Property> acceptedProps = new HashSet<Property>();
  for (StmtIterator statements = model.listStatements(); statements.hasNext(); ) {
    Property predicate = statements.nextStatement().getPredicate();
    String predicateName = predicate.toString();

    // White list: keep listed names. Black list: keep unlisted names.
    boolean accepted =
        rdfPropList.isEmpty()
            || isWhitePropList == rdfPropList.contains(predicateName)
            || normalizeProp.containsKey(predicateName);
    if (accepted) {
      acceptedProps.add(predicate);
    }
  }

  // Step 2: turn each subject into a document and queue it on the current bulk builder
  BulkRequestBuilder pending = bulkRequest;
  for (ResIterator subjects = model.listSubjects(); subjects.hasNext(); ) {
    Resource subject = subjects.nextResource();
    Map<String, ArrayList<String>> document =
        getJsonMap(subject, acceptedProps, model, getPropLabel);

    pending.add(
        client
            .prepareIndex(indexName, typeName, subject.toString())
            .setSource(mapToString(document)));
    indexedCount++;

    // Only flush once a whole batch of DEFAULT_BULK_SIZE documents has been queued
    if (indexedCount % EEASettings.DEFAULT_BULK_SIZE != 0) {
      continue;
    }

    BulkResponse batchResult = pending.execute().actionGet();
    // Start over with an empty builder for the next batch
    pending = client.prepareBulk();
    if (batchResult.hasFailures()) {
      processBulkResponseFailure(batchResult);
    }
  }

  // Step 3: send whatever did not fill a complete batch
  if (pending.numberOfActions() > 0) {
    BulkResponse lastResult = pending.execute().actionGet();
    if (lastResult.hasFailures()) {
      processBulkResponseFailure(lastResult);
    }
  }

  // Report how long the indexing took
  logger.info(
      "Indexed {} documents on {}/{} in {} seconds",
      indexedCount,
      indexName,
      typeName,
      (System.currentTimeMillis() - startedAt) / 1000.0);
}
/**
 * Builds a CONSTRUCT query returning all triples whose subject is one of the given URIs.
 *
 * <p>When {@link #uriDescriptionList} is not empty, the query additionally resolves labels
 * directly on the SPARQL endpoint: objects that have a value for one of the description
 * properties are filtered out of the base triples and replaced, through UNION branches, by
 * {@code str(?label)} of that value.
 *
 * @param uris URIs for queried resources
 * @return a CONSTRUCT query string
 */
private String getSyncQueryStr(Iterable<String> uris) {
  // Render the subjects as a SPARQL expression list: (<u1>, <u2>, ...)
  StringBuilder uriList = new StringBuilder("(");
  boolean first = true;
  for (String uri : uris) {
    if (!first) {
      uriList.append(", ");
    }
    uriList.append('<').append(uri).append('>');
    first = false;
  }
  final String uriSet = uriList.append(')').toString();

  // Base pattern: every triple having one of the URIs as subject
  StringBuilder query = new StringBuilder("CONSTRUCT { ?s ?p ?o } WHERE {");
  query.append("{?s ?p ?o").append(" . FILTER (?s in ").append(uriSet).append(" )");

  // Without description properties there is no label resolution to add
  if (uriDescriptionList.isEmpty()) {
    return query.append("}}").toString();
  }

  // Drop objects that carry a label; each property gets its own numbered helper variable
  final String unlabelledOnly = " . OPTIONAL { ?o <%s> ?o%d } " + " . FILTER(!BOUND(?o%d))";
  int labelVar = 1;
  for (String prop : uriDescriptionList) {
    query.append(String.format(unlabelledOnly, prop, labelVar, labelVar));
    labelVar++;
  }
  query.append("}");

  /* UNION queries can't handle sub-selects without a prior clause,
   * so an always-true clause is placed in front of each sub-select.
   */
  final String redundantClause =
      "<http://www.w3.org/2000/01/rdf-schema#Class> "
          + "a <http://www.w3.org/2000/01/rdf-schema#Class>";

  /* Each UNION branch re-adds the filtered-out triples with ?o bound to str(?label),
   * where the label comes from ?res <prop> ?label.
   *
   * str(?label) drops the language tag of the term, so the document is indexed
   * with a language present only in its top-level properties.
   *
   * Some Virtuoso versions do not allow BIND, hence the sub-select that binds
   * ?o to str(?label).
   */
  final String unionHead =
      " UNION "
          + "{ "
          + redundantClause
          + " . "
          + "{ SELECT ?s ?p (str(?label) as ?o) { "
          + " ?s ?p ?res"
          + " . FILTER (?s in "
          + uriSet
          + ")"
          + " . ?res <";
  final String unionTail = "> ?label }}}";
  for (String prop : uriDescriptionList) {
    query.append(unionHead).append(prop).append(unionTail);
  }

  return query.append("}").toString();
}
/**
 * Sets the {@link Harvester}'s {@link #rdfPropList} parameter.
 *
 * <p>The given list is copied, so later changes to it do not affect the harvester. A
 * {@code null} or empty list leaves the current {@link #rdfPropList} untouched.
 *
 * @param list - a list of properties names that are either required in the object description, or
 *     undesired, depending on its {@link #isWhitePropList}; may be {@code null}
 * @return the same {@link Harvester} with the {@link #rdfPropList} parameter set
 */
public Harvester rdfPropList(List<String> list) {
  // A missing list is handled like an empty one instead of failing with a NullPointerException
  if (list != null && !list.isEmpty()) {
    rdfPropList = new ArrayList<String>(list);
  }
  return this;
}