Ejemplo n.º 1
0
  // Filter the AllPairsTask file, rejecting all entires that contain entries
  // dropped in the entries file filter pass. Store a list of featuress that
  // only appear in filtered entries to filter the featuress file.
  private void filterEvents() throws FileNotFoundException, IOException {

    IntSet acceptedEntries = new IntOpenHashSet();
    IntSet rejectedEntries = new IntOpenHashSet();

    IntSet rejectedFeatures = new IntOpenHashSet();
    IntSet acceptedFeatures = new IntOpenHashSet();

    WeightedTokenPairSource efSrc =
        BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate);

    File outputFile = tempFiles.createFile();
    //        outputFile.deleteOnExit();

    WeightedTokenPairSink efSink = BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate);

    progress.setMessage("Filtering events from.");

    // Store the id of the special filtered feature and entry
    // TODO This can probably be removed now but need to check
    final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
    final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

    int currentEntryId = -1;
    int currentEventCount = 0;
    double currentEntryFilteredFeatureWeight = 0;

    double filteredEntryWeight = 0;

    int readCount = 0;
    int writeCount = 0;

    while (efSrc.hasNext()) {
      Weighted<TokenPair> record = efSrc.read();
      ++readCount;

      if (record.record().id1() == filteredEntry) {
        filteredEntryWeight += record.weight();
        continue;
      }

      if (record.record().id1() != currentEntryId) {

        if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
          if (currentEventCount == 0) {
            filteredEntryWeight += currentEntryFilteredFeatureWeight;
          } else {
            efSink.write(
                new Weighted<TokenPair>(
                    new TokenPair(currentEntryId, filteredFeature),
                    currentEntryFilteredFeatureWeight));
            ++writeCount;
          }
        }

        currentEntryId = record.record().id1();
        currentEntryFilteredFeatureWeight = 0;
        currentEventCount = 0;
      }

      if (record.record().id2() == filteredFeature) {

        currentEntryFilteredFeatureWeight += record.weight();

      } else if (acceptEvent.apply(record)) {

        efSink.write(record);
        ++writeCount;
        acceptedEntries.add(record.record().id1());
        acceptedFeatures.add(record.record().id2());
        ++currentEventCount;

      } else {
        rejectedEntries.add(record.record().id1());
        rejectedFeatures.add(record.record().id2());

        currentEntryFilteredFeatureWeight += record.weight();
      }

      if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage("Accepted " + writeCount + " of " + readCount + " events.");
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
      if (currentEventCount == 0) {
        filteredEntryWeight += currentEntryFilteredFeatureWeight;
      } else {
        efSink.write(
            new Weighted<TokenPair>(
                new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight));
      }
    }

    // If there have been entire entries filtered then write their summed
    // weights to a special filtered entry/feature pair
    if (filteredEntryWeight != 0) {
      efSink.write(
          new Weighted<TokenPair>(
              new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight));
    }

    efSrc.close();
    efSink.flush();
    efSink.close();

    if (!activeEventsFile.equals(inputEventsFile)) {
      activeEventsFile.delete();
    }

    eventFilterRequired = false;
    activeEventsFile = outputFile;

    rejectedFeatures.removeAll(acceptedFeatures);
    rejectedEntries.removeAll(acceptedEntries);

    if (rejectedEntries.size() > 0) {
      acceptEntry =
          Predicates2.and(
              acceptEntry,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id()));
      entryFilterRequired = true;
    }

    if (rejectedFeatures.size() > 0) {
      acceptFeature =
          Predicates2.and(
              acceptFeature,
              Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id())));
      featureFilterRequired = true;
    }
  }
Ejemplo n.º 2
0
  // Filter the AllPairsTask file, rejecting all entries that where found to
  // be only used by filtered entries.
  private void filterFeatures() throws FileNotFoundException, IOException {
    IntSet rejectedFeatures = new IntOpenHashSet();

    WeightedTokenSource featureSource =
        BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate);

    File outputFile = tempFiles.createFile();

    WeightedTokenSink featureSink =
        BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate);

    progress.setMessage("Filtering features.");

    // Store an filtered wieght here and record it so as to maintain
    // accurate priors for those features that remain
    double filteredWeight = 0;
    int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

    long inCount = 0;
    long outCount = 0;
    while (featureSource.hasNext()) {
      Weighted<Token> feature = featureSource.read();
      ++inCount;

      if (feature.record().id() == filteredId) {
        filteredWeight += feature.weight();
      } else if (acceptFeature.apply(feature)) {
        featureSink.write(feature);
        ++outCount;
      } else {
        rejectedFeatures.add(feature.record().id());
        filteredWeight += feature.weight();
      }

      if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    if (filteredWeight != 0) {
      featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight));
    }
    featureSource.close();
    featureSink.flush();
    featureSink.close();

    if (!activeFeaturesFile.equals(inputFeaturesFile)) {
      activeFeaturesFile.delete();
    }

    featureFilterRequired = false;
    activeFeaturesFile = outputFile;

    // Update the feature acceptance predicate
    if (rejectedFeatures.size() > 0) {

      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(
                  Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId()));
    }
  }
Ejemplo n.º 3
0
  private void filterEntries() throws FileNotFoundException, IOException {

    final IntSet rejected = new IntOpenHashSet();

    WeightedTokenSource entriesSource =
        BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate());

    File outputFile = tempFiles.createFile();

    WeightedTokenSink entriesSink =
        BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate());

    progress.setMessage("Filtering entries.");

    final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
    double filteredWeight = 0;

    long inCount = 0;
    long outCount = 0;
    while (entriesSource.hasNext()) {
      ++inCount;
      Weighted<Token> record = entriesSource.read();

      if (record.record().id() == filteredEntry) {
        filteredWeight += record.weight();
      } else if (acceptEntry.apply(record)) {
        entriesSink.write(record);
        ++outCount;
      } else {
        rejected.add(record.record().id());
        filteredWeight += record.weight();
      }

      if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    if (filteredWeight != 0) {
      entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight));
    }

    entriesSource.close();
    entriesSink.flush();
    entriesSink.close();

    if (!activeEntriesFile.equals(inputEntriesFile)) {
      activeEntriesFile.delete();
    }

    entryFilterRequired = false;
    activeEntriesFile = outputFile;

    // Update the feature acceptance predicate
    if (rejected.size() > 0) {
      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId()));
    }
  }