/**
 * Merges the items of {@code sourceA} and {@code sourceB} into {@code sink}.
 *
 * <p>While both sources have a pending item, the one that {@code comparator} orders first is
 * written; when the two compare equal both are written, the item from source A first. Once one
 * source is exhausted the remainder of the other is copied through. A progress message is
 * posted every 1,000,000 steps and once more on completion, after which the sink is flushed if
 * it implements {@link Flushable}.
 *
 * @throws Exception if reading, comparing, writing or flushing fails
 */
@Override
protected void runTask() throws Exception {
    progress.setState(State.RUNNING);

    int steps = 0;
    T headA = sourceA.hasNext() ? sourceA.read() : null;
    T headB = sourceB.hasNext() ? sourceB.read() : null;

    // A null head marks a finished source and is never refilled, so this single loop covers
    // the merge phase as well as draining whichever source still has items.
    while (headA != null || headB != null) {
        final boolean takeA;
        final boolean takeB;
        if (headA != null && headB != null) {
            final int order = comparator.compare(headA, headB);
            takeA = order <= 0;
            takeB = order >= 0;
        } else {
            takeA = headA != null;
            takeB = headB != null;
        }

        // Write first, then advance, so that equal items are emitted as the pair (A, B)
        // before either source is read again.
        if (takeA) {
            sink.write(headA);
        }
        if (takeB) {
            sink.write(headB);
        }
        if (takeA) {
            headA = sourceA.hasNext() ? sourceA.read() : null;
        }
        if (takeB) {
            headB = sourceB.hasNext() ? sourceB.read() : null;
        }

        ++steps;
        if (steps % 1000000 == 0) {
            progress.setMessage(MessageFormat.format("Merged {0} unique items.", steps));
        }
    }

    // Batch the final message and state change between startAdjusting/endAdjusting.
    progress.startAdjusting();
    progress.setMessage(MessageFormat.format("Merged {0} unique items.", steps));
    progress.setState(State.COMPLETED);
    progress.endAdjusting();

    if (sink instanceof Flushable) {
        ((Flushable) sink).flush();
    }
}
// Filter the AllPairsTask file, rejecting all entries that where found to // be only used by filtered entries. private void filterFeatures() throws FileNotFoundException, IOException { IntSet rejectedFeatures = new IntOpenHashSet(); WeightedTokenSource featureSource = BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); WeightedTokenSink featureSink = BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering features."); // Store an filtered wieght here and record it so as to maintain // accurate priors for those features that remain double filteredWeight = 0; int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); long inCount = 0; long outCount = 0; while (featureSource.hasNext()) { Weighted<Token> feature = featureSource.read(); ++inCount; if (feature.record().id() == filteredId) { filteredWeight += feature.weight(); } else if (acceptFeature.apply(feature)) { featureSink.write(feature); ++outCount; } else { rejectedFeatures.add(feature.record().id()); filteredWeight += feature.weight(); } if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount)); LOG.debug(MiscUtil.memoryInfoString()); } } if (filteredWeight != 0) { featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight)); } featureSource.close(); featureSink.flush(); featureSink.close(); if (!activeFeaturesFile.equals(inputFeaturesFile)) { activeFeaturesFile.delete(); } featureFilterRequired = false; activeFeaturesFile = outputFile; // Update the feature acceptance predicate if (rejectedFeatures.size() > 0) { eventFilterRequired = true; acceptEvent = Predicates2.and( acceptEvent, Predicates2.compose( Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId())); } }
private void filterEntries() throws FileNotFoundException, IOException { final IntSet rejected = new IntOpenHashSet(); WeightedTokenSource entriesSource = BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate()); File outputFile = tempFiles.createFile(); WeightedTokenSink entriesSink = BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate()); progress.setMessage("Filtering entries."); final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING); double filteredWeight = 0; long inCount = 0; long outCount = 0; while (entriesSource.hasNext()) { ++inCount; Weighted<Token> record = entriesSource.read(); if (record.record().id() == filteredEntry) { filteredWeight += record.weight(); } else if (acceptEntry.apply(record)) { entriesSink.write(record); ++outCount; } else { rejected.add(record.record().id()); filteredWeight += record.weight(); } if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount)); LOG.debug(MiscUtil.memoryInfoString()); } } if (filteredWeight != 0) { entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight)); } entriesSource.close(); entriesSink.flush(); entriesSink.close(); if (!activeEntriesFile.equals(inputEntriesFile)) { activeEntriesFile.delete(); } entryFilterRequired = false; activeEntriesFile = outputFile; // Update the feature acceptance predicate if (rejected.size() > 0) { eventFilterRequired = true; acceptEvent = Predicates2.and( acceptEvent, Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId())); } }
// Filter the AllPairsTask file, rejecting all entires that contain entries // dropped in the entries file filter pass. Store a list of featuress that // only appear in filtered entries to filter the featuress file. private void filterEvents() throws FileNotFoundException, IOException { IntSet acceptedEntries = new IntOpenHashSet(); IntSet rejectedEntries = new IntOpenHashSet(); IntSet rejectedFeatures = new IntOpenHashSet(); IntSet acceptedFeatures = new IntOpenHashSet(); WeightedTokenPairSource efSrc = BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate); File outputFile = tempFiles.createFile(); // outputFile.deleteOnExit(); WeightedTokenPairSink efSink = BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate); progress.setMessage("Filtering events from."); // Store the id of the special filtered feature and entry // TODO This can probably be removed now but need to check final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING); final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING); int currentEntryId = -1; int currentEventCount = 0; double currentEntryFilteredFeatureWeight = 0; double filteredEntryWeight = 0; int readCount = 0; int writeCount = 0; while (efSrc.hasNext()) { Weighted<TokenPair> record = efSrc.read(); ++readCount; if (record.record().id1() == filteredEntry) { filteredEntryWeight += record.weight(); continue; } if (record.record().id1() != currentEntryId) { if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); ++writeCount; } } currentEntryId = record.record().id1(); currentEntryFilteredFeatureWeight = 0; currentEventCount = 0; } if (record.record().id2() == filteredFeature) { currentEntryFilteredFeatureWeight += 
record.weight(); } else if (acceptEvent.apply(record)) { efSink.write(record); ++writeCount; acceptedEntries.add(record.record().id1()); acceptedFeatures.add(record.record().id2()); ++currentEventCount; } else { rejectedEntries.add(record.record().id1()); rejectedFeatures.add(record.record().id2()); currentEntryFilteredFeatureWeight += record.weight(); } if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) { progress.setMessage("Accepted " + writeCount + " of " + readCount + " events."); LOG.debug(MiscUtil.memoryInfoString()); } } if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) { if (currentEventCount == 0) { filteredEntryWeight += currentEntryFilteredFeatureWeight; } else { efSink.write( new Weighted<TokenPair>( new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight)); } } // If there have been entire entries filtered then write their summed // weights to a special filtered entry/feature pair if (filteredEntryWeight != 0) { efSink.write( new Weighted<TokenPair>( new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight)); } efSrc.close(); efSink.flush(); efSink.close(); if (!activeEventsFile.equals(inputEventsFile)) { activeEventsFile.delete(); } eventFilterRequired = false; activeEventsFile = outputFile; rejectedFeatures.removeAll(acceptedFeatures); rejectedEntries.removeAll(acceptedEntries); if (rejectedEntries.size() > 0) { acceptEntry = Predicates2.and( acceptEntry, Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id())); entryFilterRequired = true; } if (rejectedFeatures.size() > 0) { acceptFeature = Predicates2.and( acceptFeature, Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id()))); featureFilterRequired = true; } }
@Override public void runCommand() throws Exception { if (LOG.isInfoEnabled()) LOG.info("Running filtering."); if (LOG.isDebugEnabled()) LOG.debug(this); if (filterFeatureMinFreq > 0) { addFeaturesMinimumFrequency(filterFeatureMinFreq); } if (filterFeaturePattern != null) { addFeaturesPattern(filterFeaturePattern); } if (filterFeatureWhitelist != null) { addFeaturesWhitelist( com.google.common.io.Files.readLines(filterFeatureWhitelist, getCharset())); } if (filterEntryMinFreq > 0) { addEntryMinimumFrequency(filterEntryMinFreq); } if (filterEntryPattern != null) { addEntryPattern(filterEntryPattern); } if (filterEntryWhitelist != null) { addEntryWhitelist(com.google.common.io.Files.readLines(filterEntryWhitelist, getCharset())); } if (filterEventMinFreq > 0) { addEventMinimumFrequency(filterEventMinFreq); } checkState(); activeEventsFile = inputEventsFile; activeEntriesFile = inputEntriesFile; activeFeaturesFile = inputFeaturesFile; progress.addProgressListener( new ProgressListener() { @Override public void progressChanged(ProgressEvent progressEvent) { LOG.info(progressEvent.getSource().getProgressReport()); } }); progress.setState(State.RUNNING); progress.setProgressPercent(0); // Run the filters forwards then backwards. Each filtering step may // introduce additionaly filters for the other files, so continue // looping until there is no work remaining. Depending on filters this // very unlikely to take more than 3 passes int passCount = 0; int opCount = 0; while (entryFilterRequired || eventFilterRequired || featureFilterRequired) { // if (entryFilterRequired || eventFilterRequired) { progress.setMessage("Running filtering pass (#" + (++passCount) + ")."); if (entryFilterRequired) { filterEntries(); ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 3 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 
1 : 0))); } if (eventFilterRequired) { filterEvents(); ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 3 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); } if (featureFilterRequired) { filterFeatures(); ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 3 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); } // } // // if (featureFilterRequired || eventFilterRequired) { // // progress.setMessage("Running backwards filtering pass (#" + (++passCount) + // ")."); // // if (featureFilterRequired) { // filterFeatures(); // ++opCount; // progress.setProgressPercent(100 * opCount / (opCount // + (entryFilterRequired ? 1 : 0) // + (eventFilterRequired ? 1 : 0) // + (featureFilterRequired ? 1 : 0))); // } if (eventFilterRequired) { filterEvents(); ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 3 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); } if (entryFilterRequired) { filterEntries(); ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 3 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); } // } } // Finished filtering so copy the results files to the outputs. progress.setMessage("Copying final entries file."); outputEntriesFile.delete(); if (!activeEntriesFile.renameTo(outputEntriesFile)) { com.google.common.io.Files.copy(activeEntriesFile, outputEntriesFile); if (!activeEntriesFile.equals(inputEntriesFile)) activeEntriesFile.delete(); } ++opCount; progress.startAdjusting(); progress.setProgressPercent( 100 * opCount / (opCount + 2 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 
1 : 0))); progress.setMessage("Copying finaly events file."); progress.endAdjusting(); outputEventsFile.delete(); if (!activeEventsFile.renameTo(outputEventsFile)) { com.google.common.io.Files.copy(activeEventsFile, outputEventsFile); if (!activeEventsFile.equals(inputEventsFile)) activeEventsFile.delete(); } ++opCount; progress.startAdjusting(); progress.setProgressPercent( 100 * opCount / (opCount + 1 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); progress.setMessage("Copying final features file."); progress.endAdjusting(); outputFeaturesFile.delete(); if (!activeFeaturesFile.renameTo(outputFeaturesFile)) { com.google.common.io.Files.copy(activeFeaturesFile, outputFeaturesFile); if (!activeFeaturesFile.equals(inputFeaturesFile)) activeFeaturesFile.delete(); } ++opCount; progress.setProgressPercent( 100 * opCount / (opCount + 0 + (entryFilterRequired ? 1 : 0) + (eventFilterRequired ? 1 : 0) + (featureFilterRequired ? 1 : 0))); if (indexDeligate.isEnumeratorOpen()) { indexDeligate.saveEnumerator(); indexDeligate.closeEnumerator(); } progress.setState(State.COMPLETED); }