protected void ensureParsed_(Iterator<Relation> relations) throws IOException { objects.clear(); predicates.clear(); contexts.clear(); subjectTokens.clear(); // Index subject tokens // We index the BNode id. Do we need it? String subject = getSubject(); FastBufferedReader fbr; // remove http/https or _: int startAt = subject.indexOf(':'); if (startAt < 0) { fbr = new FastBufferedReader(subject.toCharArray()); } else { startAt++; fbr = new FastBufferedReader(subject.toCharArray(), startAt, subject.length() - startAt); } MutableString word = new MutableString(); MutableString nonWord = new MutableString(); while (fbr.next(word, nonWord)) { if (word != null && !word.equals("")) { if (CombinedTermProcessor.getInstance().processTerm(word)) { subjectTokens.add(word.toString().toLowerCase()); } } } fbr.close(); while (relations.hasNext()) { Relation relation = relations.next(); String predicate = relation.getPredicate().toString(); // Check if prefix is on blacklist if (RDFDocumentFactory.isOnPredicateBlacklist(predicate.toLowerCase())) { factory.incrementCounter(RdfCounters.BLACKLISTED_TRIPLES, 1); continue; } String predicateId = factory.lookupResource(predicate, false); if (predicateId == null) { throw new IllegalStateException( "Predicate " + predicate + " not in resources hash function!"); } String contextId = NO_CONTEXT; if (factory.isWithContexts() && relation.getContext() != null) { if (relation.getContext() instanceof Resource) { contextId = factory.lookupResource(relation.getContext().toString(), false); if (contextId == null) { throw new IllegalStateException( "Context " + relation.getContext() + " not in resources hash function!"); } } else { throw new IllegalStateException( "Context " + relation.getContext() + " is not a Resource."); } } if (relation.getObject() instanceof Resource) { if (predicate.equals(RDF.TYPE.toString())) { factory.incrementCounter(RdfCounters.RDF_TYPE_TRIPLES, 1); objects.add(relation.getObject().toString()); } else { String objectId = 
factory.lookupResource(relation.getObject().toString(), true); if (objectId == null) { throw new IllegalStateException( "Object " + relation.getObject() + " not in resources hash function!"); } objects.add(objectId); } predicates.add(predicateId); contexts.add(contextId); } else if (relation.getObject() instanceof BNode) { String objectId = factory.lookupResource(relation.getObject().toString(), false); if (objectId == null) { throw new IllegalStateException( "Object " + relation.getObject() + " not in resources hash function!"); } objects.add(objectId); predicates.add(predicateId); contexts.add(contextId); } else { String object = relation.getObject().toString(); // Iterate over the words of the value fbr = new FastBufferedReader(object.toCharArray()); while (fbr.next(word, nonWord)) { if (word != null && !word.equals("")) { if (CombinedTermProcessor.getInstance().processTerm(word)) { // Lowercase terms objects.add(word.toString()); // Preserve casing for properties and // contexts predicates.add(predicateId); contexts.add(contextId); } } } fbr.close(); } factory.incrementCounter(RdfCounters.INDEXED_TRIPLES, 1); } }
/**
 * Command-line entry point: creates a Bloom filter from a newline-separated list of terms
 * read from standard input and serialises it to a file.
 *
 * <p>Arguments (parsed with JSAP): the optional {@code --buffer-size} and {@code --encoding}
 * flags, followed by the required filter filename, the filter size and the filter precision.
 * If the parser printed a message (e.g. usage or a parsing error) the method returns without
 * doing anything else.
 *
 * @param arg the command-line arguments.
 * @throws IOException if reading the terms or storing the filter fails.
 * @throws JSAPException if the command-line parser cannot be set up.
 * @throws NoSuchMethodException declared for the construction of the encoding option parser.
 */
public static void main(final String[] arg) throws IOException, JSAPException, NoSuchMethodException {
    final SimpleJSAP jsap = new SimpleJSAP(
            BloomFilter.class.getName(),
            "Creates a Bloom filter reading from standard input a newline-separated list of terms.",
            new Parameter[] {
                new FlaggedOption(
                        "bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED,
                        'b', "buffer-size", "The size of the I/O buffer used to read terms."),
                new FlaggedOption(
                        "encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED,
                        'e', "encoding", "The term file encoding."),
                // The help text used to mention a "front-coded list"; this tool stores a Bloom filter.
                new UnflaggedOption(
                        "bloomFilter", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY,
                        "The filename for the serialised Bloom filter."),
                new UnflaggedOption(
                        "size", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY,
                        "The size of the filter (i.e., the expected number of elements in the filter; usually, the number of terms)."),
                new UnflaggedOption(
                        "precision", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY,
                        "The precision of the filter.")
            });

    final JSAPResult jsapResult = jsap.parse(arg);
    // Nothing to do if the parser already reported something to the user.
    if (jsap.messagePrinted()) return;

    final int bufferSize = jsapResult.getInt("bufferSize");
    final String filterName = jsapResult.getString("bloomFilter");
    final Charset encoding = (Charset) jsapResult.getObject("encoding");

    final BloomFilter filter = new BloomFilter(jsapResult.getInt("size"), jsapResult.getInt("precision"));

    final ProgressLogger pl = new ProgressLogger();
    pl.itemsName = "terms";
    pl.start("Reading terms...");

    // One term per input line; the same MutableString is reused as the line buffer.
    final MutableString s = new MutableString();
    final FastBufferedReader reader =
            new FastBufferedReader(new InputStreamReader(System.in, encoding), bufferSize);
    while (reader.readLine(s) != null) {
        filter.add(s);
        pl.lightUpdate();
    }
    pl.done();

    BinIO.storeObject(filter, filterName);
}