// Filter the AllPairsTask file, rejecting all entires that contain entries // dropped in the entries file filter pass. Store a list of featuress that // only appear in filtered entries to filter the featuress file. private void filterEvents() throws FileNotFoundException, IOException { IntSet acceptedEntries = new IntOpenHashSet(); IntSet rejectedEntries = new IntOpenHashSet(); IntSet rejectedFeatures = new IntOpenHashSet(); IntSet acceptedFeatures = new IntOpenHashSet(); WeightedTokenPairSource efSrc = BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); // outputFile.deleteOnExit(); WeightedTokenPairSink efSink = BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering events from."); // Store the id of the special filtered feature and entry // TODO This can probably be removed now but need to check final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING); final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); int currentEntryId = -1; int currentEventCount = 0; double currentEntryFilteredFeatureWeight = 0; double filteredEntryWeight = 0; int readCount = 0; int writeCount = 0; while (efSrc.hasNext()) { Weighted<TokenPair> record = efSrc.read(); ++readCount; if (record.record().id1() == filteredEntry) { filteredEntryWeight += record.weight(); continue; } if (record.record().id1() != currentEntryId) { if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); ++writeCount; } } currentEntryId = record.record().id1(); currentEntryFilteredFeatureWeight = 0; currentEventCount = 0; } if (record.record().id2() == filteredFeature) { currentEntryFilteredFeatureWeight += record.weight(); } else if (acceptEvent.apply(record)) { efSink.write(record); ++writeCount; acceptedEntries.add(record.record().id1()); acceptedFeatures.add(record.record().id2()); ++currentEventCount; } else { rejectedEntries.add(record.record().id1()); rejectedFeatures.add(record.record().id2()); currentEntryFilteredFeatureWeight += record.weight(); } if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage("Accepted " + writeCount + " of " + readCount + " events."); LOG.debug(MiscUtil.memoryInfoString()); } } if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); } } // If there have been entire entries filtered then write their summed // weights to a special filtered entry/feature pair if (filteredEntryWeight != 0) { efSink.write( new Weighted<TokenPair>( new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight)); } efSrc.close(); efSink.flush(); efSink.close(); if (!activeEventsFile.equals(inputEventsFile)) { activeEventsFile.delete(); } eventFilterRequired = false; activeEventsFile = outputFile; rejectedFeatures.removeAll(acceptedFeatures); rejectedEntries.removeAll(acceptedEntries); if (rejectedEntries.size() > 0) { acceptEntry = Predicates2.and( acceptEntry, Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id())); entryFilterRequired = true; } if (rejectedFeatures.size() > 0) { acceptFeature = Predicates2.and( acceptFeature, Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id()))); featureFilterRequired = true; } }