/**
 * Queries the {@link #rdfEndpoint(String)} with each of the {@link #rdfQueries} and harvests the
 * results of the query.
 */
private void harvestFromEndpoint() {
    Query query;
    QueryExecution qExec;

    for (String rdfQuery : rdfQueries) {
        if (closed) break;

        logger.info(
            "Harvesting with query: [{}] on index [{}] and type [{}]",
            rdfQuery, indexName, typeName);

        try {
            query = QueryFactory.create(rdfQuery);
        } catch (QueryParseException qpe) {
            logger.error("Could not parse [{}]. Please provide a relevant query. {}", rdfQuery, qpe);
            continue;
        }

        qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query);

        try {
            harvest(qExec);
        } catch (Exception e) {
            logger.error("Exception [{}] occurred while harvesting", e.getLocalizedMessage());
        } finally {
            qExec.close();
        }
    }
}
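/*
 * For reference, a minimal self-contained sketch of the parse/execute/close
 * pattern that harvestFromEndpoint() relies on. The endpoint URL and query are
 * illustrative placeholders, and the imports assume Apache Jena ARQ; the
 * package prefix may be com.hp.hpl.jena.query or org.apache.jena.query
 * depending on the Jena version in use.
 */
import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QueryFactory;
import org.apache.jena.query.QueryParseException;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;

public class EndpointQueryExample {
    public static void main(String[] args) {
        String endpoint = "http://example.org/sparql"; // hypothetical endpoint
        String queryStr = "SELECT ?s WHERE { ?s ?p ?o } LIMIT 10";

        Query query;
        try {
            query = QueryFactory.create(queryStr); // fails fast on bad syntax
        } catch (QueryParseException qpe) {
            System.err.println("Bad query: " + qpe.getMessage());
            return;
        }

        QueryExecution qExec = QueryExecutionFactory.sparqlService(endpoint, query);
        try {
            ResultSet results = qExec.execSelect();
            while (results.hasNext()) {
                QuerySolution sol = results.nextSolution();
                System.out.println(sol.get("s"));
            }
        } finally {
            qExec.close(); // always release the underlying HTTP connection
        }
    }
}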
/**
 * Extracts the text following the {@code iOffset}-th occurrence of
 * {@code sPattern} in {@code sText}, up to (but not including) the
 * class-level stop marker {@code sStop}.
 */
public static String Extract(String sPattern, int iOffset, String sText) {
    String sResult = "";
    try {
        Scanner documentScanner = new Scanner(sText);
        int iCountOccur = 0;      // occurrences of sPattern seen so far
        int iTokensRead = 0;      // tokens consumed since reading started
        boolean bReading = false; // true while between the pattern and the stop marker
        boolean bNotDone = true;  // false once the stop marker has been reached

        // Scanner.next() yields whitespace-delimited tokens, not whole lines.
        while (documentScanner.hasNext()) {
            String sToken = documentScanner.next();

            // Pattern detected
            if (sToken.indexOf(sPattern) > -1) {
                iCountOccur++;
            }

            // The requested occurrence of the pattern has been reached
            if (iCountOccur == iOffset) {
                bReading = true;
            }

            if (bReading && bNotDone) {
                if (iTokensRead != 0) {
                    // Stop marker (class field sStop) ends the extraction
                    if (sToken.indexOf(sStop) > -1) {
                        bReading = false;
                        bNotDone = false;
                    } else {
                        sResult += " " + sToken;
                    }
                } else {
                    // Always keep the first token (the match itself)
                    sResult += " " + sToken;
                }
                iTokensRead++;
            }
        }
        documentScanner.close();
    } catch (Exception e) {
        sResult = "Error extracting paragraph: " + e.getMessage();
    }
    return sResult;
}
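/*
 * Hypothetical usage of Extract(). With the class-level stop marker sStop set
 * to "END", this collects the tokens from the second occurrence of "Chapter"
 * up to the stop marker. All values here are made up for illustration.
 */
String sample = "Chapter one intro text END Chapter two body text END trailing";
String paragraph = Extract("Chapter", 2, sample);
// paragraph is " Chapter two body text" (tokens are whitespace-delimited)
System.out.println(paragraph);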
@POST
@Consumes(MediaType.APPLICATION_JSON)
public String GetDocumentParagraph(InputStream incomingData) {
    String output = "";
    StringBuilder builder = new StringBuilder();
    Calendar cal = Calendar.getInstance();
    SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss.SSS");
    try {
        // Read the raw JSON request body
        BufferedReader in = new BufferedReader(new InputStreamReader(incomingData));
        String line = null;
        while ((line = in.readLine()) != null) {
            builder.append(line);
        }

        // Extract the parameters from the JSON payload
        JSONObject jsonObject = new JSONObject(builder.toString());
        String sUrl = jsonObject.getString("url");
        String sPattern = jsonObject.getString("pattern");
        int iOffset = jsonObject.getInt("offset");

        // Callers may send '^' in place of '"' to keep the JSON valid
        sUrl = sUrl.replace('^', '"');

        output += GetDocumentParagraph(sPattern, iOffset, sUrl);
    } catch (Exception e) {
        cal = Calendar.getInstance();
        output += "Error: " + e.toString() + " " + sdf.format(cal.getTime()) + "<br>";
    }

    // Returns HTTP 200 with the extracted paragraph (or an error message) as the body
    return output;
}
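/*
 * A hedged client-side sketch of calling this endpoint. The service path is
 * hypothetical; the JSON keys ("url", "pattern", "offset") match the ones the
 * handler reads above. Note that '^' characters in "url" are converted to '"'
 * by the handler.
 */
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class ParagraphClientExample {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8080/service/paragraph"); // hypothetical path
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setRequestProperty("Content-Type", "application/json");
        conn.setDoOutput(true);

        String payload = "{\"url\":\"http://example.org/docs/report.doc\","
            + "\"pattern\":\"Introduction\",\"offset\":1}";
        try (OutputStream os = conn.getOutputStream()) {
            os.write(payload.getBytes(StandardCharsets.UTF_8));
        }

        System.out.println("HTTP " + conn.getResponseCode());
    }
}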
/** Harvests all the triples from each URI in the {@link #rdfUris} list. */
private void harvestFromDumps() {
    for (String uri : rdfUris) {
        if (uri.isEmpty()) continue;

        logger.info("Harvesting uri [{}]", uri);

        Model model = ModelFactory.createDefaultModel();
        try {
            RDFDataMgr.read(model, uri.trim(), RDFLanguages.RDFXML);
            BulkRequestBuilder bulkRequest = client.prepareBulk();
            addModelToES(model, bulkRequest, true);
        } catch (RiotException re) {
            logger.error("Illegal xml character [{}]", re.getLocalizedMessage());
        } catch (Exception e) {
            logger.error(
                "Exception when harvesting url: {}. Details: {}",
                uri, e.getLocalizedMessage());
        }
    }
}
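/*
 * Minimal sketch of the dump-loading step used above: read an RDF/XML dump
 * into an in-memory Jena model. The dump URI is a placeholder; RDFDataMgr and
 * RDFLanguages live in org.apache.jena.riot, while the Model classes may sit
 * under com.hp.hpl.jena or org.apache.jena depending on the Jena version.
 */
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFLanguages;

public class DumpLoadExample {
    public static void main(String[] args) {
        Model model = ModelFactory.createDefaultModel();
        // Passing RDFXML explicitly tells RIOT not to guess the syntax
        RDFDataMgr.read(model, "http://example.org/dump.rdf", RDFLanguages.RDFXML);
        System.out.println("Loaded " + model.size() + " statements");
    }
}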
public static String GetText(String sUrl) {
    String sRet = "";
    try {
        System.out.print(" Connecting to: " + sUrl + "... \n");
        InputStream inputStream = new URL(sUrl).openStream();
        System.out.print(" Stream read from: " + sUrl + "\n");

        // HWPFDocument handles the legacy binary .doc format (not .docx)
        HWPFDocument doc = new HWPFDocument(inputStream);
        WordExtractor we = new WordExtractor(doc);
        sRet = we.getText();
        we.close();
    } catch (Exception e) {
        sRet = "Error reading the file: " + e.getMessage();
    }
    return sRet;
}
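/*
 * Hypothetical usage chaining GetText() and Extract(): download a legacy
 * binary .doc file, then pull out the paragraph following the first
 * occurrence of "Introduction". The URL is a placeholder. A .docx file would
 * instead require XWPFDocument and XWPFWordExtractor from POI's OOXML module.
 */
String text = GetText("http://example.org/docs/report.doc");
String paragraph = Extract("Introduction", 1, text);
System.out.println(paragraph);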
@Override
protected void runTestForReal() throws Throwable {
    Query query = null;
    try {
        try {
            query = queryFromTestItem(testItem);
        } catch (QueryException qEx) {
            query = null;
            qEx.printStackTrace(System.err);
            fail("Parse failure: " + qEx.getMessage());
            throw qEx;
        }

        Dataset dataset = setUpDataset(query, testItem);
        if (dataset == null && !doesQueryHaveDataset(query))
            fail("No dataset for query");

        QueryExecution qe = null;
        if (dataset == null)
            qe = QueryExecutionFactory.create(query, queryFileManager);
        else
            qe = QueryExecutionFactory.create(query, dataset);

        try {
            if (query.isSelectType())
                runTestSelect(query, qe);
            else if (query.isConstructType())
                runTestConstruct(query, qe);
            else if (query.isDescribeType())
                runTestDescribe(query, qe);
            else if (query.isAskType())
                runTestAsk(query, qe);
        } finally {
            qe.close();
        }
    } catch (IOException ioEx) {
        fail("IOException: " + ioEx.getMessage());
        throw ioEx;
    } catch (NullPointerException ex) {
        throw ex;
    } catch (Exception ex) {
        ex.printStackTrace(System.err);
        fail("Exception: " + ex.getClass().getName() + ": " + ex.getMessage());
    }
}
/**
 * Get a set of unique queryObjName values returned from a select query.
 *
 * <p>Used to retrieve sets of modified objects used in sync.
 *
 * @param rdfQuery query to execute
 * @param queryObjName name of the object returned
 * @return set of values for queryObjName in the rdfQuery result, or null on error
 */
HashSet<String> executeSyncQuery(String rdfQuery, String queryObjName) {
    HashSet<String> rdfUrls = new HashSet<String>();

    Query query;
    try {
        query = QueryFactory.create(rdfQuery);
    } catch (QueryParseException qpe) {
        logger.warn(
            "Could not parse [{}]. Please provide a relevant query. {}",
            rdfQuery, qpe.getLocalizedMessage());
        return null;
    }

    QueryExecution qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query);

    try {
        ResultSet results = qExec.execSelect();
        while (results.hasNext()) {
            QuerySolution sol = results.nextSolution();
            try {
                String value = sol.getResource(queryObjName).toString();
                rdfUrls.add(value);
            } catch (NoSuchElementException e) {
                logger.error("Encountered a NoSuchElementException: " + e.getLocalizedMessage());
                return null;
            }
        }
    } catch (Exception e) {
        logger.error(
            "Encountered a [{}] while querying the endpoint for sync",
            e.getLocalizedMessage());
        return null;
    } finally {
        qExec.close();
    }
    return rdfUrls;
}
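/*
 * Hedged usage sketch of executeSyncQuery(). The query is illustrative; the
 * variable name passed as queryObjName ("resource") must match a variable
 * bound to a resource in the SELECT clause. A null return signals a parse or
 * execution error, not an empty result.
 */
String q = "SELECT DISTINCT ?resource WHERE { ?resource ?p ?o } LIMIT 100";
HashSet<String> uris = executeSyncQuery(q, "resource");
if (uris == null) {
    logger.error("Sync query failed; skipping this sync round");
} else {
    logger.info("Found {} candidate resources", uris.size());
}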
/**
 * Starts a harvester with predefined queries to synchronize with the changes from the SPARQL
 * endpoint.
 */
public boolean sync() {
    logger.info("Sync resources newer than {}", startTime);

    String rdfQueryTemplate =
        "PREFIX xsd:<http://www.w3.org/2001/XMLSchema#> "
            + "SELECT DISTINCT ?resource WHERE { "
            + " GRAPH ?graph { %s }"
            + " ?graph <%s> ?time . %s "
            + " FILTER (?time > xsd:dateTime(\"%s\")) }";

    String queryStr =
        String.format(
            rdfQueryTemplate, syncConditions, syncTimeProp, graphSyncConditions, startTime);

    Set<String> syncUris = executeSyncQuery(queryStr, "resource");

    if (syncUris == null) {
        logger.error("Errors occurred during sync procedure. Aborting!");
        return false;
    }

    /*
     * If desired, query for old data whose sync conditions have been modified.
     *
     * This option is useful when the application indexes resources that match
     * certain conditions. If such resources are modified and no longer match
     * the initial conditions, they will not be synchronized. When syncOldData
     * is true, the modified resources that no longer match the conditions are
     * deleted.
     */
    int deleted = 0;
    int count = 0;
    if (this.syncOldData) {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");

        queryStr =
            String.format(
                rdfQueryTemplate,
                syncConditions, syncTimeProp, graphSyncConditions,
                sdf.format(new Date(0)));

        HashSet<String> allIndexURIs = executeSyncQuery(queryStr, "resource");

        if (allIndexURIs == null) {
            logger.error("Errors occurred during modified content sync query. Aborting!");
            return false;
        }

        deleted = removeMissingUris(allIndexURIs);
    }

    /* Prepare a series of bulk uris to be described so we can make
     * a smaller number of calls to the SPARQL endpoint. */
    ArrayList<ArrayList<String>> bulks = new ArrayList<ArrayList<String>>();
    ArrayList<String> currentBulk = new ArrayList<String>();

    for (String uri : syncUris) {
        currentBulk.add(uri);

        if (currentBulk.size() == EEASettings.DEFAULT_BULK_SIZE) {
            bulks.add(currentBulk);
            currentBulk = new ArrayList<String>();
        }
    }

    if (currentBulk.size() > 0) {
        bulks.add(currentBulk);
    }

    /* Execute RDF queries for the resources in each bulk */
    for (ArrayList<String> bulk : bulks) {
        String syncQuery = getSyncQueryStr(bulk);

        try {
            Query query = QueryFactory.create(syncQuery);
            QueryExecution qExec = QueryExecutionFactory.sparqlService(rdfEndpoint, query);
            try {
                Model constructModel = ModelFactory.createDefaultModel();
                qExec.execConstruct(constructModel);

                BulkRequestBuilder bulkRequest = client.prepareBulk();

                /* When adding the model to ES, do not use toDescribeURIs,
                 * as the query already returned the correct labels. */
                addModelToES(constructModel, bulkRequest, false);
                count += bulk.size();
            } catch (Exception e) {
                logger.error("Error while querying for modified content. {}", e.getLocalizedMessage());
                return false;
            } finally {
                qExec.close();
            }
        } catch (QueryParseException qpe) {
            logger.warn(
                "Could not parse Sync query. Please provide a relevant query. {}",
                qpe.getLocalizedMessage());
            return false;
        }
    }
    logger.info(
        "Finished synchronisation: Deleted {}, Updated {}/{}",
        deleted, count, syncUris.size());
    return true;
}
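/*
 * For illustration, with hypothetical values syncConditions = "?resource ?p ?o",
 * syncTimeProp = "http://purl.org/dc/terms/modified", an empty
 * graphSyncConditions, and startTime = "2015-01-01T00:00:00", the
 * rdfQueryTemplate in sync() expands to roughly:
 *
 *   PREFIX xsd:<http://www.w3.org/2001/XMLSchema#>
 *   SELECT DISTINCT ?resource WHERE {
 *     GRAPH ?graph { ?resource ?p ?o }
 *     ?graph <http://purl.org/dc/terms/modified> ?time .
 *     FILTER (?time > xsd:dateTime("2015-01-01T00:00:00")) }
 */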