/**
 * Rebuilds the parsed state of this document from the given relations.
 *
 * <p>The {@code objects}, {@code predicates}, {@code contexts} and {@code subjectTokens}
 * collections are cleared first. The subject (everything after its first {@code ':'}, or the
 * whole subject when it contains no colon) is split into words, and every non-empty word
 * accepted by {@code CombinedTermProcessor} is lower-cased and added to {@code subjectTokens}.
 *
 * <p>Each relation is then handled as follows:
 *
 * <ul>
 *   <li>relations whose lower-cased predicate is on the predicate blacklist are counted as
 *       {@code BLACKLISTED_TRIPLES} and skipped;
 *   <li>a {@code Resource} object is stored as its string form for {@code rdf:type} predicates,
 *       otherwise as the id returned by {@code factory.lookupResource};
 *   <li>a {@code BNode} object is stored as the id returned by {@code factory.lookupResource};
 *   <li>any other object is split into words and every accepted word is stored separately.
 * </ul>
 *
 * <p>For every entry added to {@code objects} one entry is also added to {@code predicates} and
 * to {@code contexts}, so the three collections grow in step. Every relation that is not
 * blacklisted increments {@code INDEXED_TRIPLES}, even when it contributes no words.
 *
 * @param relations the relations of this document; consumed until exhausted
 * @throws IOException if reading from or closing one of the word readers fails
 * @throws IllegalStateException if a predicate, context or object cannot be resolved by
 *     {@code factory.lookupResource}, or if a context is present but is not a {@code Resource}
 */
protected void ensureParsed_(Iterator<Relation> relations) throws IOException {
    objects.clear();
    predicates.clear();
    contexts.clear();
    subjectTokens.clear();

    // Index subject tokens
    // We index the BNode id. Do we need it?
    final String subject = getSubject();

    // Skip everything up to and including the first ':' (e.g. "http:", "https:" or "_:").
    final int colon = subject.indexOf(':');
    final FastBufferedReader subjectReader;
    if (colon < 0) {
      subjectReader = new FastBufferedReader(subject.toCharArray());
    } else {
      final int startAt = colon + 1;
      subjectReader =
          new FastBufferedReader(subject.toCharArray(), startAt, subject.length() - startAt);
    }

    // Reused as scratch buffers by every reader in this method.
    final MutableString word = new MutableString();
    final MutableString nonWord = new MutableString();

    try {
      while (subjectReader.next(word, nonWord)) {
        if (word.length() > 0 && CombinedTermProcessor.getInstance().processTerm(word)) {
          // Locale.ROOT keeps the tokens independent of the JVM default locale (a Turkish
          // default would otherwise map 'I' to a dotless 'i'). Fully qualified so that this
          // method does not depend on an extra import.
          subjectTokens.add(word.toString().toLowerCase(java.util.Locale.ROOT));
        }
      }
    } finally {
      subjectReader.close();
    }

    while (relations.hasNext()) {
      final Relation relation = relations.next();
      final String predicate = relation.getPredicate().toString();

      // Check if prefix is on blacklist (locale-independent lower-casing, see above).
      if (RDFDocumentFactory.isOnPredicateBlacklist(
          predicate.toLowerCase(java.util.Locale.ROOT))) {
        factory.incrementCounter(RdfCounters.BLACKLISTED_TRIPLES, 1);
        continue;
      }

      final String predicateId = factory.lookupResource(predicate, false);
      if (predicateId == null) {
        throw new IllegalStateException(
            "Predicate " + predicate + " not in resources hash function!");
      }

      // The context id stays NO_CONTEXT unless contexts are enabled and one is present.
      String contextId = NO_CONTEXT;
      if (factory.isWithContexts() && relation.getContext() != null) {
        if (relation.getContext() instanceof Resource) {
          contextId = factory.lookupResource(relation.getContext().toString(), false);
          if (contextId == null) {
            throw new IllegalStateException(
                "Context " + relation.getContext() + " not in resources hash function!");
          }
        } else {
          throw new IllegalStateException(
              "Context " + relation.getContext() + " is not a Resource.");
        }
      }

      if (relation.getObject() instanceof Resource) {
        if (predicate.equals(RDF.TYPE.toString())) {
          // rdf:type objects are stored verbatim rather than as a looked-up id.
          factory.incrementCounter(RdfCounters.RDF_TYPE_TRIPLES, 1);
          objects.add(relation.getObject().toString());
        } else {
          final String objectId = factory.lookupResource(relation.getObject().toString(), true);
          if (objectId == null) {
            throw new IllegalStateException(
                "Object " + relation.getObject() + " not in resources hash function!");
          }
          objects.add(objectId);
        }
        predicates.add(predicateId);
        contexts.add(contextId);
      } else if (relation.getObject() instanceof BNode) {
        final String objectId = factory.lookupResource(relation.getObject().toString(), false);
        if (objectId == null) {
          throw new IllegalStateException(
              "Object " + relation.getObject() + " not in resources hash function!");
        }
        objects.add(objectId);
        predicates.add(predicateId);
        contexts.add(contextId);
      } else {
        final String object = relation.getObject().toString();
        // Iterate over the words of the value
        final FastBufferedReader valueReader = new FastBufferedReader(object.toCharArray());
        try {
          while (valueReader.next(word, nonWord)) {
            if (word.length() > 0 && CombinedTermProcessor.getInstance().processTerm(word)) {
              // The word is stored exactly as the term processor left it; no explicit
              // lower-casing happens here.
              // NOTE(review): presumably the processor already normalises case - confirm.
              objects.add(word.toString());

              // Property and context ids are stored unchanged, once per indexed word.
              predicates.add(predicateId);
              contexts.add(contextId);
            }
          }
        } finally {
          valueReader.close();
        }
      }

      factory.incrementCounter(RdfCounters.INDEXED_TRIPLES, 1);
    }
  }
// Example #2
// 0
  /**
   * Command-line entry point: builds a Bloom filter from the terms read on standard input and
   * serialises it to a file.
   *
   * <p>Terms are read one per line from {@code System.in} using the requested encoding and
   * buffer size, each line is added to a filter created with the requested size and precision,
   * and the filter is finally written with {@code BinIO.storeObject} to the given filename.
   * The method returns without reading anything if argument parsing printed a message.
   *
   * @param arg the command-line arguments: optional {@code -b/--buffer-size} and
   *     {@code -e/--encoding}, followed by the output filename, the filter size and the
   *     filter precision
   * @throws IOException if reading the terms or storing the filter fails
   * @throws JSAPException declared by the signature; presumably raised while setting up the
   *     option parser
   * @throws NoSuchMethodException declared by the signature; presumably raised while building
   *     the {@code Charset} parser
   */
  public static void main(final String[] arg)
      throws IOException, JSAPException, NoSuchMethodException {

    final SimpleJSAP jsap =
        new SimpleJSAP(
            BloomFilter.class.getName(),
            "Creates a Bloom filter reading from standard input a newline-separated list of terms.",
            new Parameter[] {
              new FlaggedOption(
                  "bufferSize",
                  IntSizeStringParser.getParser(),
                  "64Ki",
                  JSAP.NOT_REQUIRED,
                  'b',
                  "buffer-size",
                  "The size of the I/O buffer used to read terms."),
              new FlaggedOption(
                  "encoding",
                  ForNameStringParser.getParser(Charset.class),
                  "UTF-8",
                  JSAP.NOT_REQUIRED,
                  'e',
                  "encoding",
                  "The term file encoding."),
              new UnflaggedOption(
                  "bloomFilter",
                  JSAP.STRING_PARSER,
                  JSAP.NO_DEFAULT,
                  JSAP.REQUIRED,
                  JSAP.NOT_GREEDY,
                  // This file receives the serialised filter (see BinIO.storeObject below).
                  "The filename for the serialised Bloom filter."),
              new UnflaggedOption(
                  "size",
                  JSAP.INTSIZE_PARSER,
                  JSAP.NO_DEFAULT,
                  JSAP.REQUIRED,
                  JSAP.NOT_GREEDY,
                  "The size of the filter (i.e., the expected number of elements in the filter; usually, the number of terms)."),
              new UnflaggedOption(
                  "precision",
                  JSAP.INTEGER_PARSER,
                  JSAP.NO_DEFAULT,
                  JSAP.REQUIRED,
                  JSAP.NOT_GREEDY,
                  "The precision of the filter.")
            });

    final JSAPResult jsapResult = jsap.parse(arg);
    // Stop here when parsing already printed something (checked via messagePrinted()).
    if (jsap.messagePrinted()) return;

    final int bufferSize = jsapResult.getInt("bufferSize");
    final String filterName = jsapResult.getString("bloomFilter");
    final Charset encoding = (Charset) jsapResult.getObject("encoding");

    final BloomFilter filter =
        new BloomFilter(jsapResult.getInt("size"), jsapResult.getInt("precision"));
    final ProgressLogger pl = new ProgressLogger();
    pl.itemsName = "terms";
    pl.start("Reading terms...");

    // Scratch buffer refilled by every readLine call.
    final MutableString s = new MutableString();
    // The reader wraps System.in, which this method does not own, so it is left open.
    final FastBufferedReader reader =
        new FastBufferedReader(new InputStreamReader(System.in, encoding), bufferSize);
    while (reader.readLine(s) != null) {
      filter.add(s);
      pl.lightUpdate();
    }
    pl.done();

    BinIO.storeObject(filter, filterName);
  }