/**
   * Runs every query in {@code rdfQueries} against the SPARQL service at {@code rdfEndpoint} and
   * passes the resulting query execution to {@code harvest}.
   *
   * <p>Iteration stops as soon as {@code closed} is set. A query that cannot be parsed is logged
   * and skipped; an exception raised while harvesting is logged and the loop moves on to the next
   * query. Each query execution is closed whether or not harvesting succeeded.
   */
  private void harvestFromEndpoint() {
    for (String sparql : rdfQueries) {
      if (closed) {
        break;
      }

      logger.info(
          "Harvesting with query: [{}] on index [{}] and type [{}]", sparql, indexName, typeName);

      Query parsed;
      try {
        parsed = QueryFactory.create(sparql);
      } catch (QueryParseException parseFailure) {
        // A malformed query only affects itself; carry on with the remaining ones.
        logger.error(
            "Could not parse [{}]. Please provide a relevant query. {}", sparql, parseFailure);
        continue;
      }

      QueryExecution execution = QueryExecutionFactory.sparqlService(rdfEndpoint, parsed);
      try {
        harvest(execution);
      } catch (Exception failure) {
        logger.error("Exception [{}] occurred while harvesting", failure.getLocalizedMessage());
      } finally {
        execution.close();
      }
    }
  }
示例#2
0
  /**
   * Extracts a run of whitespace-delimited tokens from {@code sText}.
   *
   * <p>Tokens are scanned in order while counting those that contain {@code sPattern}. Collection
   * starts at the token on which that count equals {@code iOffset} (so with {@code iOffset == 0}
   * it starts at the first token, provided that token does not itself contain the pattern). The
   * starting token is always kept; collection then ends at the next token containing the stop
   * marker {@code sStop}, which is not included. {@code sStop} is not defined in this method; it
   * is presumably a static field of the enclosing class.
   *
   * @param sPattern substring that marks an occurrence
   * @param iOffset occurrence count at which collection starts
   * @param sText text to scan
   * @return the collected tokens, each preceded by a single space; an empty string when the start
   *     condition is never met; or {@code "Error buscar parrafo: " + message} if any exception is
   *     raised (for example when an argument is {@code null})
   */
  public static String Extract(String sPattern, int iOffset, String sText) {
    // StringBuilder avoids re-copying the whole result for every appended token.
    StringBuilder sResult = new StringBuilder();
    try {
      Scanner document_scanner = new Scanner(sText);
      try {
        int iCountOcurr = 0;
        boolean bReading = false;
        boolean bFirstToken = true;

        while (document_scanner.hasNext()) {
          String sToken = document_scanner.next();

          // Pattern detected
          if (sToken.indexOf(sPattern) > -1) {
            iCountOcurr++;
          }

          // Keep skipping until the requested occurrence count is reached.
          if (!bReading) {
            if (iCountOcurr != iOffset) {
              continue;
            }
            bReading = true;
          }

          // The starting token is never treated as a stop marker; any later one ends the
          // extraction, and nothing after it can contribute, so stop scanning.
          if (!bFirstToken && sToken.indexOf(sStop) > -1) {
            break;
          }

          sResult.append(' ').append(sToken);
          bFirstToken = false;
        }
      } finally {
        document_scanner.close();
      }
    } catch (Exception e) {
      return "Error buscar parrafo: " + e.getMessage();
    }

    return sResult.toString();
  }
示例#3
0
  /**
   * Handles a POST whose JSON body carries {@code url}, {@code pattern} and {@code offset}, and
   * delegates to {@code GetDocumentParagraph(String, int, String)} with those values.
   *
   * <p>Every {@code '^'} in {@code url} is replaced by a double quote before delegating.
   *
   * @param incomingData request body containing the JSON object
   * @return the delegate's result; if anything fails (unreadable body, invalid JSON, missing key,
   *     or an exception from the delegate) the returned string is {@code "Error: "} followed by
   *     the exception text, the current time ({@code HH:mm:ss.SSS}) and {@code "<br>"}. Failures
   *     are reported in the body, not by throwing.
   */
  @POST
  @Consumes(MediaType.APPLICATION_JSON)
  public String GetDocumentParagraph(InputStream incomingData) {
    String output = "";
    try {
      // Read the whole request body. JSON is exchanged as UTF-8, so decode it explicitly instead
      // of relying on the platform default charset.
      BufferedReader in = new BufferedReader(new InputStreamReader(incomingData, "UTF-8"));
      StringBuilder builder = new StringBuilder();
      String line;
      while ((line = in.readLine()) != null) {
        builder.append(line);
      }

      // Pull the request parameters out of the JSON object.
      JSONObject jsonObject = new JSONObject(builder.toString());
      String sUrl = jsonObject.getString("url");
      String sPattern = jsonObject.getString("pattern");
      int iOffset = jsonObject.getInt("offset");

      sUrl = sUrl.replace('^', '"');

      output += GetDocumentParagraph(sPattern, iOffset, sUrl);

    } catch (Exception e) {
      // The formatter is only needed on the error path, so build it here.
      SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss.SSS");
      output += "Error: " + e.toString() + sdf.format(Calendar.getInstance().getTime()) + "<br>";
    }

    return output;
  }
  /**
   * Harvests the triples found at each URI in {@code rdfUris}.
   *
   * <p>Every URI is trimmed, read as RDF/XML into a fresh model and handed to {@code
   * addModelToES} together with a new bulk request. Entries that are {@code null} or blank after
   * trimming are skipped. A failure on one URI is logged and does not stop the remaining URIs
   * from being processed.
   */
  private void harvestFromDumps() {
    for (String uri : rdfUris) {
      // Trim before testing for emptiness so that whitespace-only entries are skipped rather
      // than being passed to the reader as an empty URI.
      String target = (uri == null) ? "" : uri.trim();
      if (target.isEmpty()) {
        continue;
      }

      logger.info("Harvesting uri [{}]", target);

      Model model = ModelFactory.createDefaultModel();
      try {
        RDFDataMgr.read(model, target, RDFLanguages.RDFXML);
        BulkRequestBuilder bulkRequest = client.prepareBulk();
        addModelToES(model, bulkRequest, true);
      } catch (RiotException re) {
        logger.error("Illegal xml character [{}]", re.getLocalizedMessage());
      } catch (Exception e) {
        logger.error(
            "Exception when harvesting url: {}. Details: {}", target, e.getLocalizedMessage());
      }
    }
  }
示例#5
0
  /**
   * Downloads the document at {@code sUrl} and returns its text as extracted by {@code
   * WordExtractor} over an {@code HWPFDocument}.
   *
   * <p>Progress messages are written to standard output. The network stream and the extractor are
   * released whether or not extraction succeeds.
   *
   * @param sUrl location of the document to read
   * @return the extracted text, or {@code "Error al leer el archivo"} followed by the exception
   *     message if the document cannot be fetched or parsed
   */
  public static String GetText(String sUrl) {

    String sRet = "";
    InputStream inputStream = null;
    try {
      System.out.print(" Connecting to: " + sUrl + "... \n");
      inputStream = new URL(sUrl).openStream();
      System.out.print(" Stream readed from: " + sUrl + "\n");

      HWPFDocument docx = new HWPFDocument(inputStream);
      WordExtractor we = new WordExtractor(docx);
      try {
        sRet = we.getText();
      } finally {
        we.close();
      }

    } catch (Exception e) {
      sRet = "Error al leer el archivo" + e.getMessage();
    } finally {
      // The stream opened from the URL is owned by this method and must always be released.
      if (inputStream != null) {
        try {
          inputStream.close();
        } catch (Exception ignored) {
          // Nothing useful can be done if closing fails; the result is already determined.
        }
      }
    }

    return sRet;
  }
示例#6
0
  /**
   * Builds the query for {@code testItem}, executes it, and dispatches to the result check that
   * matches the query form (SELECT, CONSTRUCT, DESCRIBE or ASK).
   *
   * <p>When {@code setUpDataset} yields no dataset, the query is executed through {@code
   * queryFileManager}; in that case the test is failed unless {@code doesQueryHaveDataset} reports
   * true. The query execution is always closed.
   *
   * @throws Throwable parse failures and I/O errors are reported through {@code fail} and
   *     rethrown; a {@link NullPointerException} is rethrown as is; any other exception is
   *     printed to standard error and reported through {@code fail}
   */
  @Override
  protected void runTestForReal() throws Throwable {
    Query query = null;
    try {
      try {
        query = queryFromTestItem(testItem);
      } catch (QueryException parseFailure) {
        parseFailure.printStackTrace(System.err);
        fail("Parse failure: " + parseFailure.getMessage());
        throw parseFailure;
      }

      Dataset dataset = setUpDataset(query, testItem);
      boolean hasDataset = dataset != null;
      if (!hasDataset && !doesQueryHaveDataset(query)) {
        fail("No dataset for query");
      }

      // Without an explicit dataset the query is resolved through the file manager.
      QueryExecution qe =
          hasDataset
              ? QueryExecutionFactory.create(query, dataset)
              : QueryExecutionFactory.create(query, queryFileManager);

      try {
        if (query.isSelectType()) {
          runTestSelect(query, qe);
        } else if (query.isConstructType()) {
          runTestConstruct(query, qe);
        } else if (query.isDescribeType()) {
          runTestDescribe(query, qe);
        } else if (query.isAskType()) {
          runTestAsk(query, qe);
        }
      } finally {
        qe.close();
      }
    } catch (IOException ioFailure) {
      fail("IOException: " + ioFailure.getMessage());
      throw ioFailure;
    } catch (NullPointerException npe) {
      // Let null-pointer errors surface unchanged instead of folding them into a generic failure.
      throw npe;
    } catch (Exception unexpected) {
      unexpected.printStackTrace(System.err);
      fail("Exception: " + unexpected.getClass().getName() + ": " + unexpected.getMessage());
    }
  }
  /**
   * Runs a SELECT query against the SPARQL service at {@code rdfEndpoint} and collects the
   * distinct values bound to one result variable.
   *
   * <p>Used by the sync procedure to obtain sets of resource identifiers.
   *
   * @param rdfQuery SELECT query to execute
   * @param queryObjName name of the result variable whose resource values are collected
   * @return the set of string forms of the resources bound to {@code queryObjName}, or {@code
   *     null} if the query cannot be parsed or any error occurs while executing it or reading
   *     its results
   */
  HashSet<String> executeSyncQuery(String rdfQuery, String queryObjName) {
    Query parsedQuery;
    try {
      parsedQuery = QueryFactory.create(rdfQuery);
    } catch (QueryParseException parseError) {
      logger.warn(
          "Could not parse [{}]. Please provide a relevant query. {}",
          rdfQuery,
          parseError.getLocalizedMessage());
      return null;
    }

    HashSet<String> collected = new HashSet<String>();
    QueryExecution execution = QueryExecutionFactory.sparqlService(rdfEndpoint, parsedQuery);
    try {
      for (ResultSet rows = execution.execSelect(); rows.hasNext(); ) {
        QuerySolution row = rows.nextSolution();
        try {
          collected.add(row.getResource(queryObjName).toString());
        } catch (NoSuchElementException missing) {
          logger.error("Encountered a NoSuchElementException: " + missing.getLocalizedMessage());
          return null;
        }
      }
    } catch (Exception failure) {
      // Any other failure invalidates the whole result; callers treat null as "abort".
      logger.error(
          "Encountered a [{}] while querying the endpoint for sync",
          failure.getLocalizedMessage());
      return null;
    } finally {
      execution.close();
    }

    return collected;
  }
  /**
   * Synchronizes the index with the resources the SPARQL endpoint reports as changed after
   * {@code startTime}.
   *
   * <p>The changed resources are looked up with a generated SELECT query, optionally followed by a
   * clean-up of resources that no longer match the sync conditions, and are then fetched in
   * batches of {@code EEASettings.DEFAULT_BULK_SIZE} and passed to {@code addModelToES}.
   *
   * @return {@code true} when every step completed; {@code false} as soon as a query cannot be
   *     parsed or executed
   */
  public boolean sync() {
    logger.info("Sync resources newer than {}", startTime);

    String template =
        "PREFIX xsd:<http://www.w3.org/2001/XMLSchema#> "
            + "SELECT DISTINCT ?resource WHERE { "
            + " GRAPH ?graph { %s }"
            + " ?graph <%s> ?time .  %s "
            + " FILTER (?time > xsd:dateTime(\"%s\")) }";

    Set<String> syncUris =
        executeSyncQuery(
            String.format(template, syncConditions, syncTimeProp, graphSyncConditions, startTime),
            "resource");
    if (syncUris == null) {
      logger.error("Errors occurred during sync procedure. Aborting!");
      return false;
    }

    /*
     * If desired, query for old data whose sync conditions have been modified.
     *
     * The application may index only resources that match some conditions. A resource that is
     * modified so that it no longer matches would otherwise never be synchronized. With
     * syncOldData enabled, the same query is run from the epoch onwards and the result is handed
     * to removeMissingUris so that resources no longer matching can be deleted.
     */
    int deleted = 0;
    if (syncOldData) {
      String epoch = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(new Date(0));
      HashSet<String> allIndexURIs =
          executeSyncQuery(
              String.format(template, syncConditions, syncTimeProp, graphSyncConditions, epoch),
              "resource");
      if (allIndexURIs == null) {
        logger.error("Errors occurred during modified content sync query. Aborting!");
        return false;
      }

      deleted = removeMissingUris(allIndexURIs);
    }

    /* Group the URIs into batches so that fewer calls are made to the SPARQL endpoint. */
    ArrayList<ArrayList<String>> bulks = new ArrayList<ArrayList<String>>();
    ArrayList<String> pending = new ArrayList<String>();
    for (String uri : syncUris) {
      pending.add(uri);
      if (pending.size() != EEASettings.DEFAULT_BULK_SIZE) {
        continue;
      }
      bulks.add(pending);
      pending = new ArrayList<String>();
    }
    if (!pending.isEmpty()) {
      bulks.add(pending);
    }

    /* Execute one RDF query per batch and index what it returns. */
    int updated = 0;
    for (ArrayList<String> bulk : bulks) {
      String batchQuery = getSyncQueryStr(bulk);

      try {
        QueryExecution execution =
            QueryExecutionFactory.sparqlService(rdfEndpoint, QueryFactory.create(batchQuery));
        try {
          Model fetched = ModelFactory.createDefaultModel();
          execution.execConstruct(fetched);

          /*
           * When adding the model to ES do not use toDescribeURIs, as the query already returned
           * the correct labels.
           */
          addModelToES(fetched, client.prepareBulk(), false);
          updated += bulk.size();
        } catch (Exception failure) {
          logger.error(
              "Error while querying for modified content. {}", failure.getLocalizedMessage());
          return false;
        } finally {
          execution.close();
        }
      } catch (QueryParseException parseError) {
        logger.warn(
            "Could not parse Sync query. Please provide a relevant query. {}",
            parseError.getLocalizedMessage());
        return false;
      }
    }

    logger.info(
        "Finished synchronisation: Deleted {}, Updated {}/{}", deleted, updated, syncUris.size());
    return true;
  }