Пример #1
0
  // Filter the AllPairsTask file, rejecting all entries that where found to
  // be only used by filtered entries.
  private void filterFeatures() throws FileNotFoundException, IOException {
    IntSet rejectedFeatures = new IntOpenHashSet();

    WeightedTokenSource featureSource =
        BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate);

    File outputFile = tempFiles.createFile();

    WeightedTokenSink featureSink =
        BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate);

    progress.setMessage("Filtering features.");

    // Store an filtered wieght here and record it so as to maintain
    // accurate priors for those features that remain
    double filteredWeight = 0;
    int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

    long inCount = 0;
    long outCount = 0;
    while (featureSource.hasNext()) {
      Weighted<Token> feature = featureSource.read();
      ++inCount;

      if (feature.record().id() == filteredId) {
        filteredWeight += feature.weight();
      } else if (acceptFeature.apply(feature)) {
        featureSink.write(feature);
        ++outCount;
      } else {
        rejectedFeatures.add(feature.record().id());
        filteredWeight += feature.weight();
      }

      if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    if (filteredWeight != 0) {
      featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight));
    }
    featureSource.close();
    featureSink.flush();
    featureSink.close();

    if (!activeFeaturesFile.equals(inputFeaturesFile)) {
      activeFeaturesFile.delete();
    }

    featureFilterRequired = false;
    activeFeaturesFile = outputFile;

    // Update the feature acceptance predicate
    if (rejectedFeatures.size() > 0) {

      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(
                  Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId()));
    }
  }
Пример #2
0
  private void filterEntries() throws FileNotFoundException, IOException {

    final IntSet rejected = new IntOpenHashSet();

    WeightedTokenSource entriesSource =
        BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate());

    File outputFile = tempFiles.createFile();

    WeightedTokenSink entriesSink =
        BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate());

    progress.setMessage("Filtering entries.");

    final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
    double filteredWeight = 0;

    long inCount = 0;
    long outCount = 0;
    while (entriesSource.hasNext()) {
      ++inCount;
      Weighted<Token> record = entriesSource.read();

      if (record.record().id() == filteredEntry) {
        filteredWeight += record.weight();
      } else if (acceptEntry.apply(record)) {
        entriesSink.write(record);
        ++outCount;
      } else {
        rejected.add(record.record().id());
        filteredWeight += record.weight();
      }

      if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    if (filteredWeight != 0) {
      entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight));
    }

    entriesSource.close();
    entriesSink.flush();
    entriesSink.close();

    if (!activeEntriesFile.equals(inputEntriesFile)) {
      activeEntriesFile.delete();
    }

    entryFilterRequired = false;
    activeEntriesFile = outputFile;

    // Update the feature acceptance predicate
    if (rejected.size() > 0) {
      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId()));
    }
  }