// Filter the AllPairsTask file, rejecting all entries that where found to // be only used by filtered entries. private void filterFeatures() throws FileNotFoundException, IOException { IntSet rejectedFeatures = new IntOpenHashSet(); WeightedTokenSource featureSource = BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); WeightedTokenSink featureSink = BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering features."); // Store an filtered wieght here and record it so as to maintain // accurate priors for those features that remain double filteredWeight = 0; int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); long inCount = 0; long outCount = 0; while (featureSource.hasNext()) { Weighted<Token> feature = featureSource.read(); ++inCount; if (feature.record().id() == filteredId) { filteredWeight += feature.weight(); } else if (acceptFeature.apply(feature)) { featureSink.write(feature); ++outCount; } else { rejectedFeatures.add(feature.record().id()); filteredWeight += feature.weight(); } if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount)); LOG.debug(MiscUtil.memoryInfoString()); } } if (filteredWeight != 0) { featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight)); } featureSource.close(); featureSink.flush(); featureSink.close(); if (!activeFeaturesFile.equals(inputFeaturesFile)) { activeFeaturesFile.delete(); } featureFilterRequired = false; activeFeaturesFile = outputFile; // Update the feature acceptance predicate if (rejectedFeatures.size() > 0) { eventFilterRequired = true; acceptEvent = Predicates2.and( acceptEvent, Predicates2.compose( Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId())); } }