/** * Queries the {@link #rdfEndpoint(String)} with each of the {@link #rdfQueries} and harvests the * results of the query. */ private void harvestFromEndpoint() { Query query; QueryExecution qExec; for (String rdfQuery : rdfQueries) { if (closed) break; logger.info( "Harvesting with query: [{}] on index [{}] and type [{}]", rdfQuery, indexName, typeName); try { query = QueryFactory.create(rdfQuery); } catch (QueryParseException qpe) { logger.error("Could not parse [{}]. Please provide a relevant query. {}", rdfQuery, qpe); continue; } qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query); try { harvest(qExec); } catch (Exception e) { logger.error("Exception [{}] occurred while harvesting", e.getLocalizedMessage()); } finally { qExec.close(); } } }
/** * Returns the string value of the first of the properties in the uriDescriptionList for the given * resource (as an URI). In case the resource does not have any of the properties mentioned, its * URI is returned. The value is obtained by querying the endpoint and the endpoint is queried * repeatedly until it gives a response (value or the lack of it) * * <p>It is highly recommended that the list contains properties like labels or titles, with test * values. * * @param uri - the URI for which a label is required * @return a String value, either a label for the parameter or its value if no label is obtained * from the endpoint */ private String getLabelForUri(String uri) { String result; if (uriLabelCache.containsKey(uri)) { return uriLabelCache.get(uri); } for (String prop : uriDescriptionList) { String innerQuery = "SELECT ?r WHERE {<" + uri + "> <" + prop + "> ?r } LIMIT 1"; try { Query query = QueryFactory.create(innerQuery); QueryExecution qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query); boolean keepTrying = true; while (keepTrying) { keepTrying = false; try { ResultSet results = qExec.execSelect(); if (results.hasNext()) { QuerySolution sol = results.nextSolution(); result = EEASettings.parseForJson(sol.getLiteral("r").getLexicalForm()); if (!result.isEmpty()) { uriLabelCache.put(uri, result); return result; } } } catch (Exception e) { keepTrying = true; logger.warn("Could not get label for uri {}. Retrying.", uri); } finally { qExec.close(); } } } catch (QueryParseException qpe) { logger.error("Exception for query {}. The label cannot be obtained", innerQuery); } } return uri; }
/** * Get a set of unique queryObjName returned from a select query * * <p>Used to retrieve sets of modified objects used in sync * * @param rdfQuery query to execute * @param queryObjName name of the object returned * @return set of values for queryObjectName in the rdfQuery result */ HashSet<String> executeSyncQuery(String rdfQuery, String queryObjName) { HashSet<String> rdfUrls = new HashSet<String>(); Query query; try { query = QueryFactory.create(rdfQuery); } catch (QueryParseException qpe) { logger.warn( "Could not parse [{}]. Please provide a relevant query. {}", rdfQuery, qpe.getLocalizedMessage()); return null; } QueryExecution qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query); try { ResultSet results = qExec.execSelect(); while (results.hasNext()) { QuerySolution sol = results.nextSolution(); try { String value = sol.getResource(queryObjName).toString(); rdfUrls.add(value); } catch (NoSuchElementException e) { logger.error("Encountered a NoSuchElementException: " + e.getLocalizedMessage()); return null; } } } catch (Exception e) { logger.error( "Encountered a [{}] while querying the endpoint for sync", e.getLocalizedMessage()); return null; } finally { qExec.close(); } return rdfUrls; }
/** * Starts a harvester with predefined queries to synchronize with the changes from the SPARQL * endpoint */ public boolean sync() { logger.info("Sync resources newer than {}", startTime); String rdfQueryTemplate = "PREFIX xsd:<http://www.w3.org/2001/XMLSchema#> " + "SELECT DISTINCT ?resource WHERE { " + " GRAPH ?graph { %s }" + " ?graph <%s> ?time . %s " + " FILTER (?time > xsd:dateTime(\"%s\")) }"; String queryStr = String.format( rdfQueryTemplate, syncConditions, syncTimeProp, graphSyncConditions, startTime); Set<String> syncUris = executeSyncQuery(queryStr, "resource"); if (syncUris == null) { logger.error("Errors occurred during sync procedure. Aborting!"); return false; } /** * If desired, query for old data that has the sync conditions modified * * <p>This option is useful in the case in which the application indexes resources that match * some conditions. In this case, if they are modified and no longer match the initial * conditions, they will not be synchronized. When syncOldData is True, the modified resources * that no longer match the conditions are deleted. */ int deleted = 0; int count = 0; if (this.syncOldData) { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); queryStr = String.format( rdfQueryTemplate, syncConditions, syncTimeProp, graphSyncConditions, sdf.format(new Date(0))); HashSet<String> allIndexURIs = executeSyncQuery(queryStr, "resource"); if (allIndexURIs == null) { logger.error("Errors occurred during modified content sync query. Aborting!"); return false; } deleted = removeMissingUris(allIndexURIs); } /* Prepare a series of bulk uris to be described so we can make * a smaller number of calls to the SPARQL endpoint. */ ArrayList<ArrayList<String>> bulks = new ArrayList<ArrayList<String>>(); ArrayList<String> currentBulk = new ArrayList<String>(); for (String uri : syncUris) { currentBulk.add(uri); if (currentBulk.size() == EEASettings.DEFAULT_BULK_SIZE) { bulks.add(currentBulk); currentBulk = new ArrayList<String>(); } } if (currentBulk.size() > 0) { bulks.add(currentBulk); } /* Execute RDF queries for the resources in each bulk */ for (ArrayList<String> bulk : bulks) { String syncQuery = getSyncQueryStr(bulk); try { Query query = QueryFactory.create(syncQuery); QueryExecution qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query); try { Model constructModel = ModelFactory.createDefaultModel(); qExec.execConstruct(constructModel); BulkRequestBuilder bulkRequest = client.prepareBulk(); /** * When adding the model to ES do not use toDescribeURIs as the query already returned the * correct labels. */ addModelToES(constructModel, bulkRequest, false); count += bulk.size(); } catch (Exception e) { logger.error("Error while querying for modified content. {}", e.getLocalizedMessage()); return false; } finally { qExec.close(); } } catch (QueryParseException qpe) { logger.warn( "Could not parse Sync query. Please provide a relevant query. {}", qpe.getLocalizedMessage()); return false; } } logger.info( "Finished synchronisation: Deleted {}, Updated {}/{}", deleted, count, syncUris.size()); return true; }