Пример #1
0
  /**
   * Filters the active events file through {@code acceptEvent}, writing the surviving events to a
   * new temporary file which then replaces {@code activeEventsFile}.
   *
   * <p>Each record read from the source is handled as follows:
   *
   * <ul>
   *   <li>records whose entry is the special filtered entry are dropped, and their weight is added
   *       to a running filtered-entry total;
   *   <li>records whose feature is the special filtered feature, and records rejected by {@code
   *       acceptEvent}, are dropped, and their weight is accumulated for the current entry;
   *   <li>accepted records are written to the output unchanged.
   * </ul>
   *
   * <p>Whenever the entry id changes (and once more after the last record) the weight accumulated
   * for the finished entry is flushed: if that entry had at least one accepted event the weight is
   * written as an (entry, filtered-feature) record, otherwise it is folded into the filtered-entry
   * total. A non-zero filtered-entry total is finally written as a single (filtered-entry,
   * filtered-feature) record. NOTE(review): this bookkeeping presumes all records of one entry are
   * adjacent in the input; confirm against whatever produces the events file.
   *
   * <p>Afterwards the previous active events file is deleted (unless it is the original input
   * file), {@code eventFilterRequired} is cleared, and every entry / feature id that was rejected
   * at least once and never accepted is excluded from {@code acceptEntry} / {@code acceptFeature},
   * raising {@code entryFilterRequired} / {@code featureFilterRequired} accordingly.
   *
   * <p>The source and sink are closed even when filtering fails part-way through.
   *
   * @throws FileNotFoundException propagated from the event source / sink / temporary-file calls
   * @throws IOException propagated from the event source / sink / temporary-file calls
   */
  private void filterEvents() throws FileNotFoundException, IOException {

    IntSet acceptedEntries = new IntOpenHashSet();
    IntSet rejectedEntries = new IntOpenHashSet();

    IntSet rejectedFeatures = new IntOpenHashSet();
    IntSet acceptedFeatures = new IntOpenHashSet();

    File outputFile;

    WeightedTokenPairSource efSrc =
        BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate);
    try {
      outputFile = tempFiles.createFile();

      WeightedTokenPairSink efSink =
          BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate);
      try {
        // NOTE(review): this message looks truncated ("from" what?); left unchanged because it is
        // runtime text.
        progress.setMessage("Filtering events from.");

        // Store the id of the special filtered feature and entry
        // TODO This can probably be removed now but need to check
        final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
        final int filteredFeature =
            getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

        // State for the run of records belonging to the entry currently being read.
        int currentEntryId = -1;
        int currentEventCount = 0;
        double currentEntryFilteredFeatureWeight = 0;

        // Total weight of events whose whole entry has been filtered away.
        double filteredEntryWeight = 0;

        int readCount = 0;
        int writeCount = 0;

        while (efSrc.hasNext()) {
          Weighted<TokenPair> record = efSrc.read();
          ++readCount;

          final TokenPair pair = record.record();
          final int entryId = pair.id1();
          final int featureId = pair.id2();

          if (entryId == filteredEntry) {
            filteredEntryWeight += record.weight();
            continue;
          }

          if (entryId != currentEntryId) {

            // A new entry starts here: flush whatever was filtered out of the previous one.
            if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
              if (currentEventCount == 0) {
                // Nothing of the previous entry survived, so its weight belongs to the
                // filtered entry.
                filteredEntryWeight += currentEntryFilteredFeatureWeight;
              } else {
                efSink.write(
                    new Weighted<TokenPair>(
                        new TokenPair(currentEntryId, filteredFeature),
                        currentEntryFilteredFeatureWeight));
                ++writeCount;
              }
            }

            currentEntryId = entryId;
            currentEntryFilteredFeatureWeight = 0;
            currentEventCount = 0;
          }

          if (featureId == filteredFeature) {

            currentEntryFilteredFeatureWeight += record.weight();

          } else if (acceptEvent.apply(record)) {

            efSink.write(record);
            ++writeCount;
            acceptedEntries.add(entryId);
            acceptedFeatures.add(featureId);
            ++currentEventCount;

          } else {
            rejectedEntries.add(entryId);
            rejectedFeatures.add(featureId);

            currentEntryFilteredFeatureWeight += record.weight();
          }

          if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) {
            progress.setMessage("Accepted " + writeCount + " of " + readCount + " events.");
            // Only build the memory report when it will actually be logged.
            if (LOG.isDebugEnabled()) {
              LOG.debug(MiscUtil.memoryInfoString());
            }
          }
        }

        // Flush the last entry. writeCount is not updated here: it is only used for the progress
        // messages emitted inside the loop above.
        if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
          if (currentEventCount == 0) {
            filteredEntryWeight += currentEntryFilteredFeatureWeight;
          } else {
            efSink.write(
                new Weighted<TokenPair>(
                    new TokenPair(currentEntryId, filteredFeature),
                    currentEntryFilteredFeatureWeight));
          }
        }

        // If there have been entire entries filtered then write their summed
        // weights to a special filtered entry/feature pair
        if (filteredEntryWeight != 0) {
          efSink.write(
              new Weighted<TokenPair>(
                  new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight));
        }

        efSink.flush();
      } finally {
        efSink.close();
      }
    } finally {
      efSrc.close();
    }

    // The superseded intermediate file is no longer needed; never remove the user's input file.
    if (!activeEventsFile.equals(inputEventsFile) && !activeEventsFile.delete()) {
      LOG.warn("Failed to delete superseded events file: " + activeEventsFile);
    }

    eventFilterRequired = false;
    activeEventsFile = outputFile;

    // Keep only the ids that were rejected and never accepted anywhere.
    rejectedFeatures.removeAll(acceptedFeatures);
    rejectedEntries.removeAll(acceptedEntries);

    if (!rejectedEntries.isEmpty()) {
      acceptEntry =
          Predicates2.and(
              acceptEntry,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id()));
      entryFilterRequired = true;
    }

    if (!rejectedFeatures.isEmpty()) {
      acceptFeature =
          Predicates2.and(
              acceptFeature,
              Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id())));
      featureFilterRequired = true;
    }
  }