/**
 * Sets the {@link Harvester}'s {@link #whiteMap} parameter. A whiteMap pairs each property with
 * the list of objects that are meant to be indexed for it.
 *
 * <p>A {@code null} or empty argument leaves the current {@link #whiteMap} unchanged. Otherwise
 * the current value is replaced by a fresh map in which every list has been copied into a set.
 * Elements are stored by their string form, so non-String elements (for example numbers) do not
 * end up as foreign objects inside a {@code Set<String>}.
 *
 * @param whiteMap - a new value for the parameter; every entry value must be a {@link List}
 * @return the same {@link Harvester} with the {@link #whiteMap} parameter set
 * @throws ClassCastException if an entry value is not a {@link List}
 * @throws NullPointerException if an entry value is {@code null}
 */
public Harvester rdfWhiteMap(Map<String, Object> whiteMap) {
  if (whiteMap != null && !whiteMap.isEmpty()) {
    this.whiteMap = new HashMap<String, Set<String>>();
    for (Map.Entry<String, Object> entry : whiteMap.entrySet()) {
      // Wildcard cast: no raw type and no unchecked warning, while a non-List value still
      // fails with ClassCastException exactly as before.
      List<?> values = (List<?>) entry.getValue();
      Set<String> allowed = new HashSet<String>();
      for (Object value : values) {
        // Convert explicitly instead of trusting the element type claimed by the caller.
        allowed.add(value == null ? null : value.toString());
      }
      this.whiteMap.put(entry.getKey(), allowed);
    }
  }
  return this;
}
/**
 * Indexes every subject resource of a Jena Model into ES, sending the index requests in bulks.
 *
 * @param model the model to index
 * @param bulkRequest a BulkRequestBuilder that receives the index requests
 * @param getPropLabel if set to true all URI property values will be indexed as their label. The
 *     label is taken as the value of one of the properties set in {@link #uriDescriptionList}.
 */
private void addModelToES(Model model, BulkRequestBuilder bulkRequest, boolean getPropLabel) {
  long startTime = System.currentTimeMillis();

  // First pass: gather the predicates that pass the property filter.
  Set<Property> properties = new HashSet<Property>();
  for (StmtIterator statements = model.listStatements(); statements.hasNext(); ) {
    Property predicate = statements.nextStatement().getPredicate();
    String predicateName = predicate.toString();

    // A white list keeps listed properties, a black list keeps the unlisted ones; an empty
    // list keeps everything and properties with a normalization are always kept.
    boolean selected =
        rdfPropList.isEmpty()
            || (isWhitePropList
                ? rdfPropList.contains(predicateName)
                : !rdfPropList.contains(predicateName))
            || normalizeProp.containsKey(predicateName);
    if (selected) {
      properties.add(predicate);
    }
  }

  // Second pass: one index request per subject.
  long indexedCount = 0;
  for (ResIterator subjects = model.listSubjects(); subjects.hasNext(); ) {
    Resource subject = subjects.nextResource();
    Map<String, ArrayList<String>> jsonMap =
        getJsonMap(subject, properties, model, getPropLabel);
    bulkRequest.add(
        client
            .prepareIndex(indexName, typeName, subject.toString())
            .setSource(mapToString(jsonMap)));
    indexedCount++;

    // Only flush once a full batch of DEFAULT_BULK_SIZE requests has been collected.
    if (indexedCount % EEASettings.DEFAULT_BULK_SIZE != 0) {
      continue;
    }

    BulkResponse batchResponse = bulkRequest.execute().actionGet();
    // Start over with an empty builder for the next batch.
    bulkRequest = client.prepareBulk();
    if (batchResponse.hasFailures()) {
      processBulkResponseFailure(batchResponse);
    }
  }

  // Send whatever did not fill a complete batch.
  if (bulkRequest.numberOfActions() > 0) {
    BulkResponse lastResponse = bulkRequest.execute().actionGet();
    if (lastResponse.hasFailures()) {
      // Failures are handled per bulk response item.
      processBulkResponseFailure(lastResponse);
    }
  }

  // Report how long the indexing took.
  logger.info(
      "Indexed {} documents on {}/{} in {} seconds",
      indexedCount,
      indexName,
      typeName,
      (System.currentTimeMillis() - startTime) / 1000.0);
}
/** * Get JSON map for a given resource by applying the river settings * * @param rs resource being processed * @param properties properties to be indexed * @param model model returned by the indexing query * @param getPropLabel if set to true all URI property values will be indexed as their label. The * label is taken as the value of one of the properties set in {@link #uriDescriptionList}. * @return map of properties to be indexed for res */ private Map<String, ArrayList<String>> getJsonMap( Resource rs, Set<Property> properties, Model model, boolean getPropLabel) { Map<String, ArrayList<String>> jsonMap = new HashMap<String, ArrayList<String>>(); ArrayList<String> results = new ArrayList<String>(); if (addUriForResource) { results.add("\"" + rs.toString() + "\""); jsonMap.put("http://www.w3.org/1999/02/22-rdf-syntax-ns#about", results); } Set<String> rdfLanguages = new HashSet<String>(); for (Property prop : properties) { NodeIterator niter = model.listObjectsOfProperty(rs, prop); String property = prop.toString(); results = new ArrayList<String>(); String lang; String currValue; while (niter.hasNext()) { RDFNode node = niter.next(); currValue = getStringForResult(node, getPropLabel); if (addLanguage) { if (node.isLiteral()) { lang = node.asLiteral().getLanguage(); if (!lang.isEmpty()) { rdfLanguages.add("\"" + lang + "\""); } } } String shortValue = currValue; int currLen = currValue.length(); // Unquote string if (currLen > 1) shortValue = currValue.substring(1, currLen - 1); // If either whiteMap does contains shortValue // or blackMap contains the value // skip adding it to the index boolean whiteMapCond = whiteMap.containsKey(property) && !whiteMap.get(property).contains(shortValue); boolean blackMapCond = blackMap.containsKey(property) && blackMap.get(property).contains(shortValue); if (whiteMapCond || blackMapCond) { continue; } if (normalizeObj.containsKey(shortValue)) { results.add("\"" + normalizeObj.get(shortValue) + "\""); } else { 
results.add(currValue); } } // Do not index empty properties if (results.isEmpty()) continue; if (normalizeProp.containsKey(property)) { property = normalizeProp.get(property); if (jsonMap.containsKey(property)) { jsonMap.get(property).addAll(results); } else { jsonMap.put(property, results); } } else { jsonMap.put(property, results); } } if (addLanguage) { if (rdfLanguages.isEmpty() && !language.isEmpty()) rdfLanguages.add(language); if (!rdfLanguages.isEmpty()) jsonMap.put("language", new ArrayList<String>(rdfLanguages)); } for (Map.Entry<String, String> it : normalizeMissing.entrySet()) { if (!jsonMap.containsKey(it.getKey())) { ArrayList<String> res = new ArrayList<String>(); res.add("\"" + it.getValue() + "\""); jsonMap.put(it.getKey(), res); } } return jsonMap; }
/**
 * Configures the {@link Harvester}'s {@link #normalizeMissing} parameter, a set of
 * property-value pairs: a property that is missing is indexed with the value given for it.
 *
 * <p>A {@code null} or empty map is ignored and the current setting is kept.
 *
 * @param normalizeMissing - new value for the parameter
 * @return the same {@link Harvester} with the {@link #normalizeMissing} parameter set
 */
public Harvester rdfNormalizationMissing(Map<String, String> normalizeMissing) {
  boolean nothingToApply = normalizeMissing == null || normalizeMissing.isEmpty();
  if (nothingToApply) {
    return this;
  }
  this.normalizeMissing = normalizeMissing;
  return this;
}
/**
 * Configures the {@link Harvester}'s {@link #normalizeObj} parameter, a set of
 * object-replacement pairs. An object is replaced with its given value regardless of the
 * property whose value it represents.
 *
 * <p>A {@code null} or empty map is ignored and the current setting is kept.
 *
 * @param normalizeObj - new value for the parameter
 * @return the same {@link Harvester} with the {@link #normalizeObj} parameter set
 */
public Harvester rdfNormalizationObj(Map<String, String> normalizeObj) {
  boolean nothingToApply = normalizeObj == null || normalizeObj.isEmpty();
  if (nothingToApply) {
    return this;
  }
  this.normalizeObj = normalizeObj;
  return this;
}
/**
 * Configures the {@link Harvester}'s {@link #normalizeProp} parameter, a set of
 * property-replacement pairs. Each property is replaced with its given value, and if one
 * resource has both properties their values are grouped in a list.
 *
 * <p>A {@code null} or empty map is ignored and the current setting is kept.
 *
 * @param normalizeProp - new value for the parameter
 * @return the same {@link Harvester} with the {@link #normalizeProp} parameter set
 */
public Harvester rdfNormalizationProp(Map<String, String> normalizeProp) {
  boolean nothingToApply = normalizeProp == null || normalizeProp.isEmpty();
  if (nothingToApply) {
    return this;
  }
  this.normalizeProp = normalizeProp;
  return this;
}