Beispiel #1
0
  /**
   * Merges the items read from {@code sourceA} and {@code sourceB} into {@code sink}, using
   * {@code comparator} to decide which pending item is written next.
   *
   * <p>Assumes both sources are already ordered consistently with {@code comparator} and that a
   * source never yields {@code null} as a real item ({@code null} is used here as the "exhausted"
   * marker) - TODO confirm against the source implementations.
   *
   * <p>When the two pending items compare as equal, both are written (the item from
   * {@code sourceA} first) and the step is counted once.
   *
   * @throws Exception if reading, writing or flushing fails
   */
  @Override
  protected void runTask() throws Exception {

    progress.setState(State.RUNNING);

    final int reportInterval = 1000000;
    int mergeCount = 0;

    T a = sourceA.hasNext() ? sourceA.read() : null;
    T b = sourceB.hasNext() ? sourceB.read() : null;

    while (a != null || b != null) {
      // Once one side is exhausted the other side always wins, which drains the
      // remaining items without needing separate tail loops.
      final int c;
      if (a == null) {
        c = 1;
      } else if (b == null) {
        c = -1;
      } else {
        c = comparator.compare(a, b);
      }
      final boolean takeA = c <= 0;
      final boolean takeB = c >= 0;

      // Write everything for this step before advancing either source.
      if (takeA) {
        sink.write(a);
      }
      if (takeB) {
        sink.write(b);
      }
      if (takeA) {
        a = sourceA.hasNext() ? sourceA.read() : null;
      }
      if (takeB) {
        b = sourceB.hasNext() ? sourceB.read() : null;
      }

      ++mergeCount;
      if (mergeCount % reportInterval == 0) {
        progress.setMessage(MessageFormat.format("Merged {0} unique items.", mergeCount));
      }
    }

    // Flush before announcing completion, so anything reacting to the COMPLETED
    // state does not observe a partially written sink.
    if (sink instanceof Flushable) {
      ((Flushable) sink).flush();
    }

    progress.startAdjusting();
    progress.setMessage(MessageFormat.format("Merged {0} unique items.", mergeCount));
    progress.setState(State.COMPLETED);
    progress.endAdjusting();
  }
Beispiel #2
0
  /**
   * Filters the active features file into a fresh temporary file, keeping only the features
   * accepted by {@code acceptFeature}.
   *
   * <p>The weight of every rejected feature, and of any record already carrying the special
   * {@code FILTERED_STRING} id, is summed and written back as a single record under that id, so
   * the total weight in the file is preserved. The ids of newly rejected features are added to
   * the event acceptance predicate, and the events file is flagged for another filtering pass.
   *
   * @throws FileNotFoundException declared by the original signature; raised by the underlying
   *     I/O helpers
   * @throws IOException if reading or writing a features file fails
   */
  private void filterFeatures() throws FileNotFoundException, IOException {
    final IntSet rejectedFeatures = new IntOpenHashSet();
    final File outputFile;

    final WeightedTokenSource featureSource =
        BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate);
    try {
      outputFile = tempFiles.createFile();

      final WeightedTokenSink featureSink =
          BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate);
      try {
        progress.setMessage("Filtering features.");

        // Accumulate the weight of everything that is filtered out, so it can be
        // recorded under the special filtered id and the priors of the remaining
        // features stay accurate.
        double filteredWeight = 0;
        final int filteredId =
            getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

        long inCount = 0;
        long outCount = 0;
        while (featureSource.hasNext()) {
          final Weighted<Token> feature = featureSource.read();
          ++inCount;

          if (feature.record().id() == filteredId) {
            filteredWeight += feature.weight();
          } else if (acceptFeature.apply(feature)) {
            featureSink.write(feature);
            ++outCount;
          } else {
            rejectedFeatures.add(feature.record().id());
            filteredWeight += feature.weight();
          }

          if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext())
              && LOG.isInfoEnabled()) {
            progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount));
            LOG.debug(MiscUtil.memoryInfoString());
          }
        }

        if (filteredWeight != 0) {
          featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight));
        }
        featureSink.flush();
      } finally {
        // Release the sink even when filtering fails part way through.
        featureSink.close();
      }
    } finally {
      featureSource.close();
    }

    // The previous active file is an intermediate temporary unless it is the
    // original input, which must be left alone.
    if (!activeFeaturesFile.equals(inputFeaturesFile)) {
      activeFeaturesFile.delete();
    }

    featureFilterRequired = false;
    activeFeaturesFile = outputFile;

    // Update the event acceptance predicate: events that refer to a feature
    // rejected in this pass must now be rejected too.
    if (rejectedFeatures.size() > 0) {
      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(
                  Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId()));
    }
  }
Beispiel #3
0
  /**
   * Filters the active entries file into a fresh temporary file, keeping only the entries
   * accepted by {@code acceptEntry}.
   *
   * <p>The weight of every rejected entry, and of any record already carrying the special
   * {@code FILTERED_STRING} id, is summed and written back as a single record under that id, so
   * the total weight in the file is preserved. The ids of newly rejected entries are added to the
   * event acceptance predicate, and the events file is flagged for another filtering pass.
   *
   * @throws FileNotFoundException declared by the original signature; raised by the underlying
   *     I/O helpers
   * @throws IOException if reading or writing an entries file fails
   */
  private void filterEntries() throws FileNotFoundException, IOException {

    final IntSet rejected = new IntOpenHashSet();
    final File outputFile;

    final WeightedTokenSource entriesSource =
        BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate());
    try {
      outputFile = tempFiles.createFile();

      final WeightedTokenSink entriesSink =
          BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate());
      try {
        progress.setMessage("Filtering entries.");

        final int filteredEntry =
            getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
        // Sum of the weights of everything filtered out in this or earlier passes.
        double filteredWeight = 0;

        long inCount = 0;
        long outCount = 0;
        while (entriesSource.hasNext()) {
          ++inCount;
          final Weighted<Token> record = entriesSource.read();

          if (record.record().id() == filteredEntry) {
            filteredWeight += record.weight();
          } else if (acceptEntry.apply(record)) {
            entriesSink.write(record);
            ++outCount;
          } else {
            rejected.add(record.record().id());
            filteredWeight += record.weight();
          }

          if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext())
              && LOG.isInfoEnabled()) {
            progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount));
            LOG.debug(MiscUtil.memoryInfoString());
          }
        }

        if (filteredWeight != 0) {
          entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight));
        }
        entriesSink.flush();
      } finally {
        // Release the sink even when filtering fails part way through.
        entriesSink.close();
      }
    } finally {
      entriesSource.close();
    }

    // The previous active file is an intermediate temporary unless it is the
    // original input, which must be left alone.
    if (!activeEntriesFile.equals(inputEntriesFile)) {
      activeEntriesFile.delete();
    }

    entryFilterRequired = false;
    activeEntriesFile = outputFile;

    // Update the event acceptance predicate: events that refer to an entry
    // rejected in this pass must now be rejected too.
    if (rejected.size() > 0) {
      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId()));
    }
  }
Beispiel #4
0
  /**
   * Filters the active events file into a fresh temporary file, keeping only the events accepted
   * by {@code acceptEvent}.
   *
   * <p>Weight that is filtered out is not discarded:
   *
   * <ul>
   *   <li>for an entry that keeps at least one event, the weight of its rejected events is
   *       written as one extra event pairing that entry with the special filtered feature;
   *   <li>for an entry that keeps no events, the weight is added to a single event pairing the
   *       special filtered entry with the special filtered feature.
   * </ul>
   *
   * <p>Entries and features that occur only in rejected events are then added to the entry and
   * feature acceptance predicates, and the corresponding files are flagged for filtering.
   *
   * <p>The per-entry bookkeeping assumes all events of one entry are adjacent in the source -
   * TODO confirm against the producer of the events file.
   *
   * @throws FileNotFoundException declared by the original signature; raised by the underlying
   *     I/O helpers
   * @throws IOException if reading or writing an events file fails
   */
  private void filterEvents() throws FileNotFoundException, IOException {

    final IntSet acceptedEntries = new IntOpenHashSet();
    final IntSet rejectedEntries = new IntOpenHashSet();

    final IntSet rejectedFeatures = new IntOpenHashSet();
    final IntSet acceptedFeatures = new IntOpenHashSet();

    final File outputFile;

    final WeightedTokenPairSource efSrc =
        BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate);
    try {
      outputFile = tempFiles.createFile();

      final WeightedTokenPairSink efSink =
          BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate);
      try {
        progress.setMessage("Filtering events.");

        // Store the id of the special filtered feature and entry
        // TODO This can probably be removed now but need to check
        final int filteredEntry =
            getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
        final int filteredFeature =
            getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

        // State for the entry whose events are currently being read.
        int currentEntryId = -1;
        int currentEventCount = 0;
        double currentEntryFilteredFeatureWeight = 0;

        // Weight belonging to entries that end up with no events at all.
        double filteredEntryWeight = 0;

        // long, like the counters in the entry and feature filters, so that very
        // large events files cannot overflow the progress counters.
        long readCount = 0;
        long writeCount = 0;

        while (efSrc.hasNext()) {
          final Weighted<TokenPair> record = efSrc.read();
          ++readCount;

          if (record.record().id1() == filteredEntry) {
            filteredEntryWeight += record.weight();
            continue;
          }

          if (record.record().id1() != currentEntryId) {

            // A new entry starts here: settle the filtered weight of the previous one.
            if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
              if (currentEventCount == 0) {
                filteredEntryWeight += currentEntryFilteredFeatureWeight;
              } else {
                efSink.write(
                    new Weighted<TokenPair>(
                        new TokenPair(currentEntryId, filteredFeature),
                        currentEntryFilteredFeatureWeight));
                ++writeCount;
              }
            }

            currentEntryId = record.record().id1();
            currentEntryFilteredFeatureWeight = 0;
            currentEventCount = 0;
          }

          if (record.record().id2() == filteredFeature) {

            currentEntryFilteredFeatureWeight += record.weight();

          } else if (acceptEvent.apply(record)) {

            efSink.write(record);
            ++writeCount;
            acceptedEntries.add(record.record().id1());
            acceptedFeatures.add(record.record().id2());
            ++currentEventCount;

          } else {
            rejectedEntries.add(record.record().id1());
            rejectedFeatures.add(record.record().id2());

            currentEntryFilteredFeatureWeight += record.weight();
          }

          if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) {
            progress.setMessage("Accepted " + writeCount + " of " + readCount + " events.");
            LOG.debug(MiscUtil.memoryInfoString());
          }
        }

        // Settle the filtered weight of the last entry read.
        if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
          if (currentEventCount == 0) {
            filteredEntryWeight += currentEntryFilteredFeatureWeight;
          } else {
            efSink.write(
                new Weighted<TokenPair>(
                    new TokenPair(currentEntryId, filteredFeature),
                    currentEntryFilteredFeatureWeight));
          }
        }

        // If there have been entire entries filtered then write their summed
        // weights to a special filtered entry/feature pair
        if (filteredEntryWeight != 0) {
          efSink.write(
              new Weighted<TokenPair>(
                  new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight));
        }

        efSink.flush();
      } finally {
        // Release the sink even when filtering fails part way through.
        efSink.close();
      }
    } finally {
      efSrc.close();
    }

    // The previous active file is an intermediate temporary unless it is the
    // original input, which must be left alone.
    if (!activeEventsFile.equals(inputEventsFile)) {
      activeEventsFile.delete();
    }

    eventFilterRequired = false;
    activeEventsFile = outputFile;

    // Only ids that never appeared in an accepted event need to be filtered
    // from the entries and features files.
    rejectedFeatures.removeAll(acceptedFeatures);
    rejectedEntries.removeAll(acceptedEntries);

    if (rejectedEntries.size() > 0) {
      acceptEntry =
          Predicates2.and(
              acceptEntry,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id()));
      entryFilterRequired = true;
    }

    if (rejectedFeatures.size() > 0) {
      acceptFeature =
          Predicates2.and(
              acceptFeature,
              Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id())));
      featureFilterRequired = true;
    }
  }
Beispiel #5
0
  /**
   * Runs the complete filtering command.
   *
   * <p>The steps are:
   *
   * <ol>
   *   <li>register the filters selected by the {@code filter*} options;
   *   <li>repeatedly filter the entries, events and features files until none of them is flagged
   *       as requiring another pass (each filter may flag the others);
   *   <li>place the final entries, events and features files at their output locations;
   *   <li>save and close the enumerator if it is open.
   * </ol>
   *
   * @throws Exception if validation, filtering or any file operation fails
   */
  @Override
  public void runCommand() throws Exception {
    if (LOG.isInfoEnabled()) {
      LOG.info("Running filtering.");
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(this);
    }

    if (filterFeatureMinFreq > 0) {
      addFeaturesMinimumFrequency(filterFeatureMinFreq);
    }
    if (filterFeaturePattern != null) {
      addFeaturesPattern(filterFeaturePattern);
    }
    if (filterFeatureWhitelist != null) {
      addFeaturesWhitelist(
          com.google.common.io.Files.readLines(filterFeatureWhitelist, getCharset()));
    }

    if (filterEntryMinFreq > 0) {
      addEntryMinimumFrequency(filterEntryMinFreq);
    }
    if (filterEntryPattern != null) {
      addEntryPattern(filterEntryPattern);
    }
    if (filterEntryWhitelist != null) {
      addEntryWhitelist(com.google.common.io.Files.readLines(filterEntryWhitelist, getCharset()));
    }

    if (filterEventMinFreq > 0) {
      addEventMinimumFrequency(filterEventMinFreq);
    }

    checkState();
    activeEventsFile = inputEventsFile;
    activeEntriesFile = inputEntriesFile;
    activeFeaturesFile = inputFeaturesFile;

    progress.addProgressListener(
        new ProgressListener() {

          @Override
          public void progressChanged(ProgressEvent progressEvent) {
            LOG.info(progressEvent.getSource().getProgressReport());
          }
        });

    progress.setState(State.RUNNING);
    progress.setProgressPercent(0);

    // Run the filters forwards then backwards. Each filtering step may
    // introduce additional filters for the other files, so continue
    // looping until there is no work remaining. Depending on the filters this
    // is very unlikely to take more than 3 passes.

    int passCount = 0;
    int opCount = 0;

    while (entryFilterRequired || eventFilterRequired || featureFilterRequired) {

      progress.setMessage("Running filtering pass (#" + (++passCount) + ").");

      // Forwards: entries -> events -> features.
      if (entryFilterRequired) {
        filterEntries();
        reportFilterProgress(++opCount, 3);
      }

      if (eventFilterRequired) {
        filterEvents();
        reportFilterProgress(++opCount, 3);
      }

      if (featureFilterRequired) {
        filterFeatures();
        reportFilterProgress(++opCount, 3);
      }

      // Backwards: events -> entries.
      if (eventFilterRequired) {
        filterEvents();
        reportFilterProgress(++opCount, 3);
      }

      if (entryFilterRequired) {
        filterEntries();
        reportFilterProgress(++opCount, 3);
      }
    }

    // Finished filtering so copy the results files to the outputs.

    progress.setMessage("Copying final entries file.");
    publishResult(activeEntriesFile, inputEntriesFile, outputEntriesFile);
    ++opCount;

    progress.startAdjusting();
    reportFilterProgress(opCount, 2);
    progress.setMessage("Copying final events file.");
    progress.endAdjusting();

    publishResult(activeEventsFile, inputEventsFile, outputEventsFile);
    ++opCount;

    progress.startAdjusting();
    reportFilterProgress(opCount, 1);
    progress.setMessage("Copying final features file.");
    progress.endAdjusting();

    publishResult(activeFeaturesFile, inputFeaturesFile, outputFeaturesFile);
    ++opCount;
    reportFilterProgress(opCount, 0);

    if (indexDeligate.isEnumeratorOpen()) {
      indexDeligate.saveEnumerator();
      indexDeligate.closeEnumerator();
    }

    progress.setState(State.COMPLETED);
  }

  /**
   * Publishes the completion percentage, estimated as the operations done so far over those done
   * plus those known to be outstanding (remaining copy steps and currently flagged filters).
   *
   * @param opCount number of operations completed so far
   * @param pendingCopies number of final copy steps still to be performed
   */
  private void reportFilterProgress(int opCount, int pendingCopies) {
    final int pendingFilters =
        (entryFilterRequired ? 1 : 0)
            + (eventFilterRequired ? 1 : 0)
            + (featureFilterRequired ? 1 : 0);
    progress.setProgressPercent(100 * opCount / (opCount + pendingCopies + pendingFilters));
  }

  /**
   * Places a final result file at its output location.
   *
   * <p>An intermediate file is moved (falling back to copy-and-delete when the rename fails). The
   * caller's input file is only ever copied, never moved or deleted, and nothing is done when the
   * result already is the output file.
   *
   * @param active the file currently holding the result
   * @param input the original input file, which must be preserved
   * @param output the destination file; any existing file there is replaced
   * @throws IOException if the fallback copy fails
   */
  private void publishResult(File active, File input, File output) throws IOException {
    if (active.equals(output)) {
      // Deleting the output here would destroy the only copy of the data.
      return;
    }

    output.delete();
    if (active.equals(input)) {
      // No filtering was applied to this file; renaming would remove the input.
      com.google.common.io.Files.copy(active, output);
    } else if (!active.renameTo(output)) {
      com.google.common.io.Files.copy(active, output);
      active.delete();
    }
  }