/**
 * Decides whether a resource carries RDF content, based on the Content-Type headers of its
 * recorded responses, and records the outcome both as a problem quad and on the resource itself.
 *
 * <p>A verdict already stored on the resource ({@code isContainsRDF() != null}) is returned
 * as-is without creating any further quads. Otherwise the responses are scanned in order and the
 * first one whose Content-Type is listed in {@code LinkedDataContent.contentTypes} decides the
 * result. A {@code text/plain} response is additionally read via {@code tryRead}; if that yields
 * an empty model the resource is reported as not containing RDF.
 *
 * @param resource the cached HTTP resource to inspect; {@code null} yields {@code false}
 * @return {@code true} if the resource is considered to contain RDF, {@code false} otherwise
 */
private boolean is200AnRDF(CachedHTTPResource resource) {
  // Guard first: the original dereferenced the parameter before testing it for null.
  // Without a resource there is no URI to attach a problem quad to, so nothing is reported.
  if (resource == null) {
    return false;
  }

  if (resource.isContainsRDF() != null) {
    return resource.isContainsRDF();
  }

  if (resource.getResponses() != null) {
    for (SerialisableHttpResponse response : resource.getResponses()) {
      if (response == null || response.getHeaders("Content-Type") == null) {
        continue;
      }
      if (!LinkedDataContent.contentTypes.contains(response.getHeaders("Content-Type"))) {
        continue;
      }

      // text/plain is ambiguous, so the content itself is parsed before trusting the header.
      if (response.getHeaders("Content-Type").equals(WebContent.contentTypeTextPlain)) {
        Model m = this.tryRead(resource.getUri());
        // NOTE(review): a null model (read failed?) falls through and is reported as RDF;
        // confirm against tryRead's contract that this is intended.
        if (m != null && m.size() == 0) {
          this.createProblemQuad(resource.getUri(), DQM.SC200WithoutRDF);
          resource.setContainsRDF(false);
          return false;
        }
      }

      this.createProblemQuad(resource.getUri(), DQM.SC200WithRDF);
      resource.setContainsRDF(true);
      return true;
    }
  }

  // No response advertised a linked-data content type.
  this.createProblemQuad(resource.getUri(), DQM.SC200WithoutRDF);
  resource.setContainsRDF(false);
  return false;
}
// Private Method for checking forward linking private void checkForForwardLinking() { for (String uri : uriSet) { CachedHTTPResource httpResource = (CachedHTTPResource) DiachronCacheManager.getInstance() .getFromCache(DiachronCacheManager.HTTP_RESOURCE_CACHE, uri); if (httpResource == null || (httpResource.getResponses() == null && httpResource.getDereferencabilityStatusCode() != StatusCode.BAD)) { this.notFetchedQueue.add(uri); } else { logger.info("Checking resource: {}. URIs left: {}.", httpResource.getUri(), uriSet.size()); // We perform a semantic lookup using heuristics to check if we // really need to try parsing or not if (HTTPResourceUtils.semanticURILookup(httpResource)) { logger.info( "Trying to find any dereferencable forward links for {}.", httpResource.getUri()); if (Dereferencer.hasValidDereferencability(httpResource)) { logger.info("Dereferencable resource {}.", httpResource.getUri()); // boolean isValid = ModelParser.snapshotParserForForwardDereference(httpResource, // (Lang) null, httpResource.getUri()); // if (isValid){ // //ok // logger.info("A description exists for resource {}.", httpResource.getUri()); // // totalDerefDataWithSub++; // } else { // //not ok // this.createNotValidForwardLink(httpResource.getUri()); // } Model m = RDFDataMgr.loadModel(httpResource.getUri()); // load partial model Resource r = m.createResource(httpResource.getUri()); List<Statement> stmtList = m.listStatements(r, (Property) null, (RDFNode) null).toList(); if (stmtList.size() > 1) { // ok logger.info("A description exists for resource {}.", httpResource.getUri()); totalDerefDataWithSub++; } else { // not ok this.createNotValidForwardLink(httpResource.getUri()); } } } else { logger.info("Non-meaningful dereferencable resource {}.", httpResource.getUri()); this.createNotValidForwardLink(httpResource.getUri()); } } } }
// Private Methods for Dereferenceability Process private boolean isDereferenceable(CachedHTTPResource httpResource) { if (httpResource.getDereferencabilityStatusCode() == null) { List<Integer> statusCode = this.getStatusCodes(httpResource.getStatusLines()); if (httpResource.getUri().contains("#") && statusCode.contains(200)) httpResource.setDereferencabilityStatusCode(StatusCode.HASH); else if (statusCode.contains(200)) { httpResource.setDereferencabilityStatusCode(StatusCode.SC200); if (statusCode.contains(303)) httpResource.setDereferencabilityStatusCode(StatusCode.SC303); else { if (statusCode.contains(301)) httpResource.setDereferencabilityStatusCode(StatusCode.SC301); else if (statusCode.contains(302)) httpResource.setDereferencabilityStatusCode(StatusCode.SC302); else if (statusCode.contains(307)) httpResource.setDereferencabilityStatusCode(StatusCode.SC307); } } if (has4xxCode(statusCode)) { httpResource.setDereferencabilityStatusCode(StatusCode.SC4XX); } if (has5xxCode(statusCode)) { httpResource.setDereferencabilityStatusCode(StatusCode.SC5XX); } } StatusCode scode = httpResource.getDereferencabilityStatusCode(); return this.mapDerefStatusCode(scode); }
// Private Method to check content type private Model getMeaningfulData(CachedHTTPResource resource) { Model m = null; if (resource != null && resource.getResponses() != null) { for (SerialisableHttpResponse response : resource.getResponses()) { if (response != null && response.getHeaders("Content-Type") != null) { if (CommonDataStructures.ldContentTypes.contains(response.getHeaders("Content-Type"))) { m = this.tryRead(resource.getUri()); } } } } return m; }
/** * Tries to dereference all the URIs contained in the parameter, by retrieving them from the * cache. URIs not found in the cache are added to the queue containing the URIs to be fetched by * the async HTTP retrieval process * * @param uriSet Set of URIs to be dereferenced * @return list with the results of the dereferenceability operations, for those URIs that were * found in the cache */ private List<DerefResult> deReferenceUris(List<String> uriSet) { // Start the dereferencing process, which will be run in parallel httpRetriever.addListOfResourceToQueue(uriSet); httpRetriever.start(); List<DerefResult> lstDerefUris = new ArrayList<DerefResult>(); List<String> lstToDerefUris = new ArrayList<String>(uriSet); // Dereference each and every one of the URIs contained in the specified set while (lstToDerefUris.size() > 0) { // Remove the URI at the head of the queue of URIs to be dereferenced String headUri = lstToDerefUris.remove(0); // First, search for the URI in the cache CachedHTTPResource httpResource = (CachedHTTPResource) dcmgr.getFromCache(DiachronCacheManager.HTTP_RESOURCE_CACHE, headUri); if (httpResource == null || httpResource.getStatusLines() == null) { // URIs not found in the cache, is still to be fetched via HTTP, add it to the end of the // list lstToDerefUris.add(headUri); } else { // URI found in the cache (which means that was fetched at some point), check if // successfully dereferenced DerefResult curUrlResult = new DerefResult(headUri, false, false); lstDerefUris.add(curUrlResult); if (this.isDereferenceable(httpResource)) { curUrlResult.isDeref = true; if (this.is200AnRDF(httpResource)) { curUrlResult.isRdfXml = true; } else this.createProblemQuad(httpResource.getUri(), DQM.NotMeaningful); } else if (httpResource.getDereferencabilityStatusCode() == StatusCode.SC200) { curUrlResult.isDeref = true; // Check if the resource contains RDF on XML if (this.is200AnRDF(httpResource)) { curUrlResult.isRdfXml = true; } } logger.trace( "Resource 
fetched: {}. Deref. status: {}. Is RDF: {}", headUri, httpResource.getDereferencabilityStatusCode(), curUrlResult.isRdfXml); } } return lstDerefUris; }
/**
 * Classifies the dereferenceability of a cached resource from the HTTP status codes found in its
 * status lines, stores the classification on the resource, reports problematic outcomes as
 * problem quads, and maps the classification to a boolean.
 *
 * <p>The classification is only computed when the resource has none yet. When code 200 is
 * present, a URI containing {@code #} is classified as {@code HASH}; any other URI starts as
 * {@code SC200} and is refined as follows, first match wins:
 *
 * <ul>
 *   <li>303: {@code SC303}, no problem reported;
 *   <li>301, 302, 307: the matching status plus the corresponding problem quad;
 *   <li>otherwise, if {@code hasBad3xxCode} holds: a generic 3xx redirection problem quad, with
 *       the status left at {@code SC200}.
 * </ul>
 *
 * <p>Independently of the above, a 4xx code overrides the status with {@code SC4XX} and a 5xx
 * code overrides it again with {@code SC5XX}; each of them reports its own problem quad.
 *
 * @param httpResource the cached resource to classify
 * @return the result of {@code mapDerefStatusCode} for the resource's stored status
 */
private boolean isDereferenceable(CachedHTTPResource httpResource) {
  if (httpResource.getDereferencabilityStatusCode() == null) {
    List<Integer> codes = this.getStatusCodes(httpResource.getStatusLines());
    boolean hashUri = httpResource.getUri().contains("#");
    boolean sawOk = codes.contains(200);

    if (sawOk) {
      if (hashUri) {
        httpResource.setDereferencabilityStatusCode(StatusCode.HASH);
      } else {
        httpResource.setDereferencabilityStatusCode(StatusCode.SC200);
        if (codes.contains(303)) {
          httpResource.setDereferencabilityStatusCode(StatusCode.SC303);
        } else if (codes.contains(301)) {
          httpResource.setDereferencabilityStatusCode(StatusCode.SC301);
          this.createProblemQuad(httpResource.getUri(), DQM.SC301MovedPermanently);
        } else if (codes.contains(302)) {
          httpResource.setDereferencabilityStatusCode(StatusCode.SC302);
          this.createProblemQuad(httpResource.getUri(), DQM.SC302Found);
        } else if (codes.contains(307)) {
          httpResource.setDereferencabilityStatusCode(StatusCode.SC307);
          this.createProblemQuad(httpResource.getUri(), DQM.SC307TemporaryRedirectory);
        } else if (hasBad3xxCode(codes)) {
          this.createProblemQuad(httpResource.getUri(), DQM.SC3XXRedirection);
        }
      }
    }

    // Error classes are checked last so that they overwrite any earlier classification.
    if (has4xxCode(codes)) {
      httpResource.setDereferencabilityStatusCode(StatusCode.SC4XX);
      this.createProblemQuad(httpResource.getUri(), DQM.SC4XXClientError);
    }
    if (has5xxCode(codes)) {
      httpResource.setDereferencabilityStatusCode(StatusCode.SC5XX);
      this.createProblemQuad(httpResource.getUri(), DQM.SC5XXServerError);
    }
  }

  // NOTE(review): the stored status is still null here when no 200/4xx/5xx code was seen;
  // confirm that mapDerefStatusCode tolerates null.
  return this.mapDerefStatusCode(httpResource.getDereferencabilityStatusCode());
}