// Filter the AllPairsTask file, rejecting all entires that contain entries // dropped in the entries file filter pass. Store a list of featuress that // only appear in filtered entries to filter the featuress file. private void filterEvents() throws FileNotFoundException, IOException { IntSet acceptedEntries = new IntOpenHashSet(); IntSet rejectedEntries = new IntOpenHashSet(); IntSet rejectedFeatures = new IntOpenHashSet(); IntSet acceptedFeatures = new IntOpenHashSet(); WeightedTokenPairSource efSrc = BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); // outputFile.deleteOnExit(); WeightedTokenPairSink efSink = BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering events from."); // Store the id of the special filtered feature and entry // TODO This can probably be removed now but need to check final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING); final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); int currentEntryId = -1; int currentEventCount = 0; double currentEntryFilteredFeatureWeight = 0; double filteredEntryWeight = 0; int readCount = 0; int writeCount = 0; while (efSrc.hasNext()) { Weighted<TokenPair> record = efSrc.read(); ++readCount; if (record.record().id1() == filteredEntry) { filteredEntryWeight += record.weight(); continue; } if (record.record().id1() != currentEntryId) { if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); ++writeCount; } } currentEntryId = record.record().id1(); currentEntryFilteredFeatureWeight = 0; currentEventCount = 0; } if (record.record().id2() == filteredFeature) { currentEntryFilteredFeatureWeight += record.weight(); } else if (acceptEvent.apply(record)) { efSink.write(record); ++writeCount; acceptedEntries.add(record.record().id1()); acceptedFeatures.add(record.record().id2()); ++currentEventCount; } else { rejectedEntries.add(record.record().id1()); rejectedFeatures.add(record.record().id2()); currentEntryFilteredFeatureWeight += record.weight(); } if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage("Accepted " + writeCount + " of " + readCount + " events."); LOG.debug(MiscUtil.memoryInfoString()); } } if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); } } // If there have been entire entries filtered then write their summed // weights to a special filtered entry/feature pair if (filteredEntryWeight != 0) { efSink.write( new Weighted<TokenPair>( new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight)); } efSrc.close(); efSink.flush(); efSink.close(); if (!activeEventsFile.equals(inputEventsFile)) { activeEventsFile.delete(); } eventFilterRequired = false; activeEventsFile = outputFile; rejectedFeatures.removeAll(acceptedFeatures); rejectedEntries.removeAll(acceptedEntries); if (rejectedEntries.size() > 0) { acceptEntry = Predicates2.and( acceptEntry, Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id())); entryFilterRequired = true; } if (rejectedFeatures.size() > 0) { acceptFeature = Predicates2.and( acceptFeature, Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id()))); featureFilterRequired = true; } }
// Filter the AllPairsTask file, rejecting all entries that where found to // be only used by filtered entries. private void filterFeatures() throws FileNotFoundException, IOException { IntSet rejectedFeatures = new IntOpenHashSet(); WeightedTokenSource featureSource = BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); WeightedTokenSink featureSink = BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering features."); // Store an filtered wieght here and record it so as to maintain // accurate priors for those features that remain double filteredWeight = 0; int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); long inCount = 0; long outCount = 0; while (featureSource.hasNext()) { Weighted<Token> feature = featureSource.read(); ++inCount; if (feature.record().id() == filteredId) { filteredWeight += feature.weight(); } else if (acceptFeature.apply(feature)) { featureSink.write(feature); ++outCount; } else { rejectedFeatures.add(feature.record().id()); filteredWeight += feature.weight(); } if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount)); LOG.debug(MiscUtil.memoryInfoString()); } } if (filteredWeight != 0) { featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight)); } featureSource.close(); featureSink.flush(); featureSink.close(); if (!activeFeaturesFile.equals(inputFeaturesFile)) { activeFeaturesFile.delete(); } featureFilterRequired = false; activeFeaturesFile = outputFile; // Update the feature acceptance predicate if (rejectedFeatures.size() > 0) { eventFilterRequired = true; acceptEvent = Predicates2.and( acceptEvent, Predicates2.compose( Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId())); } }
private void filterEntries() throws FileNotFoundException, IOException { final IntSet rejected = new IntOpenHashSet(); WeightedTokenSource entriesSource = BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate()); File outputFile = tempFiles.createFile(); WeightedTokenSink entriesSink = BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate()); progress.setMessage("Filtering entries."); final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING); double filteredWeight = 0; long inCount = 0; long outCount = 0; while (entriesSource.hasNext()) { ++inCount; Weighted<Token> record = entriesSource.read(); if (record.record().id() == filteredEntry) { filteredWeight += record.weight(); } else if (acceptEntry.apply(record)) { entriesSink.write(record); ++outCount; } else { rejected.add(record.record().id()); filteredWeight += record.weight(); } if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount)); LOG.debug(MiscUtil.memoryInfoString()); } } if (filteredWeight != 0) { entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight)); } entriesSource.close(); entriesSink.flush(); entriesSink.close(); if (!activeEntriesFile.equals(inputEntriesFile)) { activeEntriesFile.delete(); } entryFilterRequired = false; activeEntriesFile = outputFile; // Update the feature acceptance predicate if (rejected.size() > 0) { eventFilterRequired = true; acceptEvent = Predicates2.and( acceptEvent, Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId())); } }