Пример #1
0
    /**
     * Performs one propagation pass over the parallel edge lists.
     *
     * <p>Links touching a whitelisted node are discarded outright — nothing
     * propagates to or from them. A blacklisted endpoint propagates to the
     * opposite endpoint; links whose fate is still undecided are copied to
     * {@code linkFrom}/{@code linkTo} so a later pass can revisit them.
     */
    public void process(IntList from, IntList to) {
      for (int i = 0; i < from.size(); i++) {
        final int source = from.getInt(i);
        final int target = to.getInt(i);

        // Whitelisted nodes block propagation in both directions; the link
        // never needs to be visited again.
        if (whitelist.contains(source) || whitelist.contains(target)) {
          continue;
        }

        final boolean sourceBlacklisted = blacklist.contains(source);
        final boolean targetBlacklisted = blacklist.contains(target);

        if (sourceBlacklisted || targetBlacklisted) {
          // Spread the blacklist across the link unless both ends already
          // carry it; either way the link is now resolved.
          if (!targetBlacklisted) {
            blacklist(target);
          } else if (!sourceBlacklisted) {
            blacklist(source);
          }
          continue;
        }

        // Undecided link: keep it for the next pass.
        linkFrom.add(source);
        linkTo.add(target);
      }
    }
Пример #2
0
 /**
  * Loads every partition of {@code input} and unions the stored int arrays
  * into a single set.
  *
  * @param input multi-file whose partitions each hold an {@code int[]}
  * @return the set of all ids across every partition
  * @throws Exception if reading any partition fails
  */
 public static IntSet loadCreation(SerializedMultiFile<int[]> input) throws Exception {
   IntSet output = new IntOpenHashSet();
   for (int i = 0; i < input.getPartitionFunction().getPartitionCount(); i++) {
     // Add the partition's ids directly; the previous code wrapped them in a
     // temporary IntOpenHashSet first, which was a throwaway allocation.
     for (int id : input.readFirstObject(i)) {
       output.add(id);
     }
   }
   return output;
 }
Пример #3
0
 /**
  * Restricts accepted entries to those whose string form appears in
  * {@code strings}, conjoined with the current entry-acceptance predicate.
  */
 public void addEntryWhitelist(List<String> strings) throws IOException {
   final IntSet allowedIds = new IntOpenHashSet();
   for (final String entry : strings) {
     allowedIds.add(getIndexDeligate().getEntryEnumerator().indexOf(entry));
   }
   setAcceptEntry(
       Predicates2.<Weighted<Token>>and(
           getAcceptEntry(), Predicates2.compose(Predicates2.in(allowedIds), id())));
 }
Пример #4
0
 /**
  * Rejects any feature whose string form appears in {@code strings},
  * conjoined with the current feature-acceptance predicate.
  */
 public void addFeaturesBlacklist(List<String> strings) throws IOException {
   final IntSet bannedIds = new IntOpenHashSet();
   for (final String feature : strings) {
     bannedIds.add(getIndexDeligate().getFeatureEnumerator().indexOf(feature));
   }
   setAcceptFeatures(
       Predicates2.<Weighted<Token>>and(
           getAcceptFeatures(),
           Predicates2.compose(Predicates2.not(Predicates2.in(bannedIds)), id())));
 }
 /**
  * Greedily orders variables: at each height it picks the remaining variable
  * with the highest frontier entropy, then splits the frontier on it.
  *
  * <p>NOTE(review): removes ids from the set returned by
  * {@code variableSummary.allVariableIds()} — confirm that method hands back
  * a copy rather than internal state.
  */
 public static VariableOrdering determineOrdering(
     VariableSummary variableSummary, TransitionTable transitionTable) {
   final int bitsPerRow = variableSummary.bitsPerRow();
   final IntSet remainingVariableIds = variableSummary.allVariableIds();
   final List<Variable> variablesWithOrder = new ArrayList<>(bitsPerRow);
   List<TransitionTable> frontier = singletonList(transitionTable);
   for (int height = 0; height < bitsPerRow; height++) {
     final int countPerSplit = countPerSplit(bitsPerRow, height);
     final int chosen = determineNext(frontier, remainingVariableIds, countPerSplit);
     remainingVariableIds.remove(chosen);
     variablesWithOrder.add(new Variable(height, chosen));
     frontier = nextFrontier(frontier, chosen);
   }
   return new VariableOrdering(variableSummary, variablesWithOrder);
 }
 /**
  * Returns the remaining variable whose split yields the highest frontier
  * entropy; ties keep the earlier candidate. Assumes the set is non-empty
  * (the initial {@code nextInt()} would otherwise throw).
  */
 private static int determineNext(
     List<TransitionTable> frontier, IntSet remainingVariableIds, int countPerSplit) {
   int best = remainingVariableIds.iterator().nextInt();
   double bestEntropy = -Double.MAX_VALUE;
   for (final int candidate : remainingVariableIds) {
     final double candidateEntropy = frontierEntropy(frontier, candidate, countPerSplit);
     if (candidateEntropy > bestEntropy) {
       bestEntropy = candidateEntropy;
       best = candidate;
     }
   }
   return best;
 }
Пример #7
0
  /**
   * Parses partition specs ("N" or "N-M") into a sorted array of distinct
   * partition ids.
   *
   * @param partitionArray raw partition tokens from configuration
   * @return sorted, de-duplicated partition ids
   * @throws ConfigurationException if a token is malformed, a range is
   *     inverted, or a bound fails to parse
   */
  public static int[] buildPartitions(String[] partitionArray) throws ConfigurationException {
    IntSet partitions = new IntOpenHashSet();
    try {
      for (int i = 0; i < partitionArray.length; ++i) {
        Matcher matcher = PARTITION_PATTERN.matcher(partitionArray[i]);
        if (!matcher.matches()) {
          throw new ConfigurationException("Invalid partition: " + partitionArray[i]);
        }
        String[] partitionRange = partitionArray[i].split("-");
        int start = Integer.parseInt(partitionRange[0]);
        int end;
        if (partitionRange.length > 1) {
          end = Integer.parseInt(partitionRange[1]);
          if (end < start) {
            throw new ConfigurationException("invalid partition range: " + partitionArray[i]);
          }
        } else {
          end = start;
        }

        for (int k = start; k <= end; ++k) {
          partitions.add(k);
        }
      }
    } catch (ConfigurationException e) {
      // Bug fix: the generic catch below used to swallow our own specific
      // exceptions and re-wrap them in a vaguer message; rethrow unchanged.
      throw e;
    } catch (Exception e) {
      // Anything else (e.g. NumberFormatException from parseInt) gets
      // wrapped with the full configuration context.
      throw new ConfigurationException(
          "Error parsing '"
              + SENSEI_PROPERTIES
              + "': "
              + PARTITIONS
              + "="
              + Arrays.toString(partitionArray),
          e);
    }

    int[] ret = partitions.toIntArray();
    Arrays.sort(ret);
    return ret;
  }
Пример #8
0
  /**
   * Generates {@code edgeCount} distinct random edges over the nodes in
   * {@code nodeStore}, avoiding a small set of randomly chosen excluded
   * ("leaf") nodes. Deterministic: driven by a fixed-seed {@link Random}.
   */
  public static EdgeImpl[] generateEdgeList(
      NodeStore nodeStore, int edgeCount, int type, boolean directed, boolean allowSelfLoops) {
    final int nodeCount = nodeStore.size();
    final List<EdgeImpl> result = new ArrayList<>();
    final LongSet seenEdgeIds = new LongOpenHashSet();
    final Random random = new Random(124);

    // Pick up to min(10, 5% of nodes) distinct nodes that no edge may touch.
    final IntSet excluded = new IntOpenHashSet();
    if (nodeCount > 10) {
      for (int i = 0; i < Math.min(10, (int) (nodeCount * .05)); i++) {
        final int candidate = random.nextInt(nodeCount);
        if (excluded.contains(candidate)) {
          i--; // duplicate draw — retry this slot
        } else {
          excluded.add(candidate);
        }
      }
    }

    long nextEdgeId = 0;
    while (seenEdgeIds.size() < edgeCount) {
      final int sourceId = random.nextInt(nodeCount);
      final int targetId = random.nextInt(nodeCount);
      final Node source = nodeStore.get(sourceId);
      final Node target = nodeStore.get(targetId);
      // The id counter advances on every attempt, accepted or not, which
      // preserves the original generation sequence exactly.
      final EdgeImpl edge = new EdgeImpl(nextEdgeId++, source, target, 1.0, directed);
      final boolean touchesExcluded =
          excluded.contains(sourceId) || excluded.contains(targetId);
      final boolean selfLoopOk = allowSelfLoops || source != target;
      if (!touchesExcluded && selfLoopOk && !seenEdgeIds.contains(edge.getLongId())) {
        result.add(edge);
        seenEdgeIds.add(edge.getLongId());
      }
    }

    return result.toArray(new EdgeImpl[0]);
  }
Пример #9
0
  /**
   * Iteratively propagates the blacklist across link passes until a pass
   * blacklists nothing new, then writes the expanded blacklist to disk.
   */
  public static void main(String[] args) throws Exception {
    blacklist = loadCreation(PartitionsAndFiles.getBlackList());
    whitelist = loadCreation(PartitionsAndFiles.getWhiteList());

    System.out.println("Read lists: wl: " + whitelist.size() + " bl:" + blacklist.size());

    PropagateBlacklist pb =
        initialPropagation(PartitionsAndFiles.getLinkFrom(), PartitionsAndFiles.getLinkTo());

    int pass = 1;
    logger.info(
        "Pass:"
            + pass
            + " wl: "
            + whitelist.size()
            + " bl:"
            + blacklist.size()
            + " remaining links:"
            + pb.linkFrom.size());
    // Keep re-processing the surviving links while the previous pass still
    // blacklisted at least one node.
    while (pb.blacklisted > 0) {
      pass++;
      PropagateBlacklist oldPb = pb;
      pb = new PropagateBlacklist();
      // Bug fix: each pass must re-process the links that SURVIVED the
      // previous pass. The old code passed the fresh pb's own (empty)
      // lists — leaving oldPb unused and making every later pass a no-op.
      pb.process(oldPb.linkFrom, oldPb.linkTo);
      logger.info(
          "Pass:"
              + pass
              + " wl: "
              + whitelist.size()
              + " bl:"
              + blacklist.size()
              + " remaining links:"
              + pb.linkFrom.size());
    }

    new FileOpener()
        .writeObject(PartitionsAndFiles.getExpandedBlackListFile(), blacklist.toIntArray());
  }
Пример #10
0
 /**
  * Tests whether the id {@code fbId}, converted via {@code midToLong} and
  * narrowed to an int, is a member of {@code list}.
  */
 public static boolean isInList(IntSet list, String fbId) {
   return list.contains((int) midToLong(fbId));
 }
Пример #11
0
 /** Adds {@code toId} to the blacklist and bumps this pass's counter. */
 private void blacklist(int toId) {
   blacklisted++;
   blacklist.add(toId);
 }
Пример #12
0
 /**
  * Builds the subgraph reachable within {@code hops} hops of the seed nodes
  * {@code nnodes}, copying both successor and ancestor arcs, and re-numbers
  * the visited nodes into fresh JDBM-backed node maps.
  *
  * <p>A pool of {@code numIterators} node iterators is reused so the
  * underlying graph does not have to be re-scanned from position 0 for every
  * visited node. Progress is periodically committed to the record manager.
  *
  * <p>NOTE(review): several lines below look suspicious and are flagged
  * inline; logic was intentionally left byte-for-byte unchanged.
  */
 public Graph neighbourhoodGraph(int nnodes[], int hops) {
   PrimaryHashMap<Integer, String> nodes;
   PrimaryHashMap<String, Integer> nodesReverse;
   try {
     // Temp JDBM store for the new graph's node<->name maps.
     File auxFile = File.createTempFile("graph-maps-" + System.currentTimeMillis(), "aux");
     auxFile.deleteOnExit();
     RecordManager recMan = RecordManagerFactory.createRecordManager(auxFile.getAbsolutePath());
     nodes = recMan.hashMap("nodes");
     nodesReverse = recMan.hashMap("nodesReverse");
   } catch (IOException ex) {
     throw new Error(ex);
   }
   nodes.clear();
   nodesReverse.clear();
   WeightedArcSet list1 = new WeightedArcSet();
   // map: node id -> hop distance of the frontier it was discovered on.
   Int2IntAVLTreeMap map = new Int2IntAVLTreeMap();
   // set: node ids already fully expanded.
   IntSet set = new IntLinkedOpenHashSet();
   int numIterators = 100;
   // Reflection is used because WeightedArc's constructor is not public.
   Constructor[] cons = WeightedArc.class.getDeclaredConstructors();
   for (int i = 0; i < cons.length; i++) cons[i].setAccessible(true);
   for (int n : nnodes) map.put(n, 0);
   NodeIterator its[] = new NodeIterator[numIterators];
   int itNum[] = new int[numIterators];
   for (int n = 0; n < its.length; n++) {
     its[n] = nodeIterator();
     itNum[n] = 0;
   }
   while (map.size() != 0) {
     // Pick the smallest pending node id at or beyond some iterator position.
     Integer node = 0;
     for (int n = 0; n < its.length; n++) if (itNum[n] <= node) node = itNum[n];
     node = map.tailMap(node).firstKey();
     // NOTE(review): result discarded — likely meant `node = map.firstKey();`.
     if (node == null) map.firstKey();
     // Choose the pooled iterator closest to (but not past) the target node.
     NodeIterator it = null;
     Integer aux1 = 0;
     int iit = 0;
     for (int n = 0; n < its.length; n++) {
       if (!its[n].hasNext()) {
         its[n] = nodeIterator();
         itNum[n] = 0;
       }
       if (itNum[n] == node) {
         // Exact match; NOTE(review): `iit = 0` here looks like it should be
         // `iit = n` — the position update below may credit the wrong slot.
         it = its[n];
         aux1 = itNum[n];
         iit = 0;
         break;
       }
       if (itNum[n] < node && itNum[n] >= aux1) {
         it = its[n];
         aux1 = itNum[n];
         iit = n;
       }
     }
     if (it == null) {
       its[0] = nodeIterator();
       itNum[0] = 0;
       it = its[0];
     }
     // Advance the chosen iterator up to `node`.
     // NOTE(review): `(aux1 = it.nextInt()) != null` can never be false —
     // nextInt() returns a primitive int, so the box is never null.
     while (it != null && (aux1 = it.nextInt()) != null && aux1 >= 0 && aux1 < node) {}
     itNum[iit] = aux1 + 1;
     Integer aux2 = null;
     // Copy outgoing (successor) arcs, renumbering endpoints as encountered.
     ArcLabelledNodeIterator.LabelledArcIterator suc = it.successors();
     while ((aux2 = suc.nextInt()) != null && aux2 >= 0 && (aux2 < graph.numNodes()))
       try {
         if (commit++ % COMMIT_SIZE == 0) {
           // Periodically flush the JDBM maps to bound memory use.
           try {
             nodes.getRecordManager().commit();
           } catch (IOException e) {
             throw new Error(e);
           }
           try {
             nodesReverse.getRecordManager().commit();
           } catch (IOException e) {
             throw new Error(e);
           }
         }
         // Assign compact new ids to both endpoints on first sight.
         if (!nodesReverse.containsKey(this.nodes.get(aux1))) {
           nodes.put(nodes.size(), this.nodes.get(aux1));
           nodesReverse.put(this.nodes.get(aux1), nodesReverse.size());
         }
         if (!nodesReverse.containsKey(this.nodes.get(aux2))) {
           nodes.put(nodes.size(), this.nodes.get(aux2));
           nodesReverse.put(this.nodes.get(aux2), nodesReverse.size());
         }
         int aaux1 = nodesReverse.get(this.nodes.get(aux1));
         int aaux2 = nodesReverse.get(this.nodes.get(aux2));
         WeightedArc arc1 =
             (WeightedArc) cons[0].newInstance(aaux1, aaux2, suc.label().getFloat());
         list1.add(arc1);
         // Enqueue unexpanded endpoints for the next hop level.
         if (map.get(node) < hops) {
           if (!set.contains(aux1) && (map.get(aux1) == null || map.get(aux1) > map.get(node) + 1))
             map.put(aux1.intValue(), map.get(node) + 1);
           if (!set.contains(aux2) && (map.get(aux2) == null || map.get(aux2) > map.get(node) + 1))
             map.put(aux2.intValue(), map.get(node) + 1);
         }
       } catch (Exception ex) {
         ex.printStackTrace();
         throw new Error(ex);
       }
     // Copy incoming (ancestor) arcs — mirror of the successor loop with the
     // arc direction reversed (aaux2 -> aaux1).
     ArcLabelledNodeIterator.LabelledArcIterator anc = it.ancestors();
     while ((aux2 = anc.nextInt()) != null && aux2 >= 0 && (aux2 < graph.numNodes()))
       try {
         if (commit++ % COMMIT_SIZE == 0) {
           try {
             nodes.getRecordManager().commit();
           } catch (IOException e) {
             throw new Error(e);
           }
           try {
             nodesReverse.getRecordManager().commit();
           } catch (IOException e) {
             throw new Error(e);
           }
         }
         if (!nodesReverse.containsKey(this.nodes.get(aux1))) {
           nodes.put(nodes.size(), this.nodes.get(aux1));
           nodesReverse.put(this.nodes.get(aux1), nodesReverse.size());
         }
         if (!nodesReverse.containsKey(this.nodes.get(aux2))) {
           nodes.put(nodes.size(), this.nodes.get(aux2));
           nodesReverse.put(this.nodes.get(aux2), nodesReverse.size());
         }
         int aaux1 = nodesReverse.get(this.nodes.get(aux1));
         int aaux2 = nodesReverse.get(this.nodes.get(aux2));
         WeightedArc arc1 =
             (WeightedArc) cons[0].newInstance(aaux2, aaux1, anc.label().getFloat());
         list1.add(arc1);
         if (map.get(node) < hops) {
           if (!set.contains(aux1) && (map.get(aux1) == null || map.get(aux1) > map.get(node) + 1))
             map.put(aux1.intValue(), map.get(node) + 1);
           if (!set.contains(aux2) && (map.get(aux2) == null || map.get(aux2) > map.get(node) + 1))
             map.put(aux2.intValue(), map.get(node) + 1);
         }
       } catch (Exception ex) {
         ex.printStackTrace();
         throw new Error(ex);
       }
     // Node fully expanded: move from the pending map to the done set.
     map.remove(node);
     set.add(node);
   }
   // Build the result graph and swap in the freshly-numbered node maps.
   Graph newGraph = new Graph(list1.toArray(new WeightedArc[0]));
   newGraph.nodes.clear();
   newGraph.nodesReverse.clear();
   newGraph.nodes = nodes;
   newGraph.nodesReverse = nodesReverse;
   return newGraph;
 }
 /** Collects the distinct file indices appearing among the map's values. */
 private IntCollection getFileIndices(final Int2IntMap transcriptIndex2FileIndex) {
   final IntSet fileIndices = new IntArraySet();
   fileIndices.addAll(transcriptIndex2FileIndex.values());
   return fileIndices;
 }
Пример #14
0
  // Filter the features file, rejecting all entries that were found to
  // be only used by filtered entries. Rejected weight is folded into a single
  // FILTERED feature record so the priors of the surviving features stay
  // accurate; rejected ids are then excluded from future event filtering.
  private void filterFeatures() throws FileNotFoundException, IOException {
    IntSet rejectedFeatures = new IntOpenHashSet();

    WeightedTokenSource featureSource =
        BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate);

    File outputFile = tempFiles.createFile();

    WeightedTokenSink featureSink =
        BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate);

    progress.setMessage("Filtering features.");

    // Store any filtered weight here and record it so as to maintain
    // accurate priors for those features that remain.
    double filteredWeight = 0;
    int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

    long inCount = 0;
    long outCount = 0;
    while (featureSource.hasNext()) {
      Weighted<Token> feature = featureSource.read();
      ++inCount;

      if (feature.record().id() == filteredId) {
        // Weight already attributed to the FILTERED sentinel on a prior pass.
        filteredWeight += feature.weight();
      } else if (acceptFeature.apply(feature)) {
        featureSink.write(feature);
        ++outCount;
      } else {
        // Rejected: remember the id and roll its weight into the sentinel.
        rejectedFeatures.add(feature.record().id());
        filteredWeight += feature.weight();
      }

      // NOTE(review): guarded by isInfoEnabled() but logs at debug —
      // confirm which level is intended.
      if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    // Emit the accumulated rejected weight as one FILTERED record.
    if (filteredWeight != 0) {
      featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight));
    }
    // NOTE(review): sources/sinks are not closed in a finally block — an
    // exception mid-loop leaks them; consider try-with-resources.
    featureSource.close();
    featureSink.flush();
    featureSink.close();

    // Drop the intermediate file from a previous pass (never the input).
    if (!activeFeaturesFile.equals(inputFeaturesFile)) {
      activeFeaturesFile.delete();
    }

    featureFilterRequired = false;
    activeFeaturesFile = outputFile;

    // Update the event acceptance predicate: events referencing a rejected
    // feature must now be filtered as well.
    if (rejectedFeatures.size() > 0) {

      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(
                  Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId()));
    }
  }
Пример #15
0
  // Filter the events file, rejecting all events that contain entries
  // dropped in the entries-file filter pass. Stores a list of features that
  // only appear in filtered entries, to drive the features-file filter.
  //
  // The stream is assumed grouped by entry id (id1): per-entry filtered
  // weight is accumulated and flushed as one (entry, FILTERED) record each
  // time the entry id changes.
  private void filterEvents() throws FileNotFoundException, IOException {

    // Track which entry/feature ids survive vs. get rejected; an id only
    // counts as rejected if it was never accepted anywhere (see removeAll
    // near the bottom).
    IntSet acceptedEntries = new IntOpenHashSet();
    IntSet rejectedEntries = new IntOpenHashSet();

    IntSet rejectedFeatures = new IntOpenHashSet();
    IntSet acceptedFeatures = new IntOpenHashSet();

    WeightedTokenPairSource efSrc =
        BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate);

    File outputFile = tempFiles.createFile();
    //        outputFile.deleteOnExit();

    WeightedTokenPairSink efSink = BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate);

    progress.setMessage("Filtering events from.");

    // Store the id of the special filtered feature and entry
    // TODO This can probably be removed now but need to check
    final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
    final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);

    // State for the current id1 group: -1 means no group started yet.
    int currentEntryId = -1;
    int currentEventCount = 0;
    double currentEntryFilteredFeatureWeight = 0;

    // Weight of events whose entry was filtered away entirely.
    double filteredEntryWeight = 0;

    int readCount = 0;
    int writeCount = 0;

    while (efSrc.hasNext()) {
      Weighted<TokenPair> record = efSrc.read();
      ++readCount;

      // Events already attributed to the FILTERED entry fold straight into
      // the global filtered weight.
      if (record.record().id1() == filteredEntry) {
        filteredEntryWeight += record.weight();
        continue;
      }

      if (record.record().id1() != currentEntryId) {

        // Entry id changed: flush the previous group's filtered weight —
        // either as an (entry, FILTERED) record, or into the global sum if
        // the whole group was filtered (no events written).
        if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
          if (currentEventCount == 0) {
            filteredEntryWeight += currentEntryFilteredFeatureWeight;
          } else {
            efSink.write(
                new Weighted<TokenPair>(
                    new TokenPair(currentEntryId, filteredFeature),
                    currentEntryFilteredFeatureWeight));
            ++writeCount;
          }
        }

        currentEntryId = record.record().id1();
        currentEntryFilteredFeatureWeight = 0;
        currentEventCount = 0;
      }

      if (record.record().id2() == filteredFeature) {

        // Weight already attributed to this entry's FILTERED feature.
        currentEntryFilteredFeatureWeight += record.weight();

      } else if (acceptEvent.apply(record)) {

        efSink.write(record);
        ++writeCount;
        acceptedEntries.add(record.record().id1());
        acceptedFeatures.add(record.record().id2());
        ++currentEventCount;

      } else {
        // Rejected event: remember both ids and fold the weight into this
        // entry's FILTERED feature.
        rejectedEntries.add(record.record().id1());
        rejectedFeatures.add(record.record().id2());

        currentEntryFilteredFeatureWeight += record.weight();
      }

      // NOTE(review): guarded by isInfoEnabled() but logs at debug —
      // confirm which level is intended.
      if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage("Accepted " + writeCount + " of " + readCount + " events.");
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    // Flush the final group (the loop only flushes on an id change).
    if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
      if (currentEventCount == 0) {
        filteredEntryWeight += currentEntryFilteredFeatureWeight;
      } else {
        efSink.write(
            new Weighted<TokenPair>(
                new TokenPair(currentEntryId, filteredFeature), currentEntryFilteredFeatureWeight));
      }
    }

    // If there have been entire entries filtered then write their summed
    // weights to a special filtered entry/feature pair
    if (filteredEntryWeight != 0) {
      efSink.write(
          new Weighted<TokenPair>(
              new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight));
    }

    // NOTE(review): not closed in a finally block — an exception mid-loop
    // leaks the source/sink; consider try-with-resources.
    efSrc.close();
    efSink.flush();
    efSink.close();

    // Drop the intermediate file from a previous pass (never the input).
    if (!activeEventsFile.equals(inputEventsFile)) {
      activeEventsFile.delete();
    }

    eventFilterRequired = false;
    activeEventsFile = outputFile;

    // An id is only truly rejected if it was never accepted in any event.
    rejectedFeatures.removeAll(acceptedFeatures);
    rejectedEntries.removeAll(acceptedEntries);

    // Propagate rejections into the entry- and feature-acceptance predicates
    // so the corresponding files get re-filtered.
    if (rejectedEntries.size() > 0) {
      acceptEntry =
          Predicates2.and(
              acceptEntry,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id()));
      entryFilterRequired = true;
    }

    if (rejectedFeatures.size() > 0) {
      acceptFeature =
          Predicates2.and(
              acceptFeature,
              Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id())));
      featureFilterRequired = true;
    }
  }
Пример #16
0
  /**
   * Filters the entries file: entries passing {@code acceptEntry} are copied
   * to a fresh temp file, while rejected (and previously filtered) entries
   * have their weight folded into one FILTERED record. Rejected ids are then
   * excluded from subsequent event filtering.
   */
  private void filterEntries() throws FileNotFoundException, IOException {

    final IntSet rejectedIds = new IntOpenHashSet();

    WeightedTokenSource source =
        BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate());

    File outputFile = tempFiles.createFile();

    WeightedTokenSink sink =
        BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate());

    progress.setMessage("Filtering entries.");

    final int filteredId = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
    double droppedWeight = 0;

    long seen = 0;
    long kept = 0;
    while (source.hasNext()) {
      ++seen;
      Weighted<Token> entry = source.read();

      if (entry.record().id() == filteredId) {
        // Weight already attributed to the FILTERED sentinel on a prior pass.
        droppedWeight += entry.weight();
      } else if (acceptEntry.apply(entry)) {
        sink.write(entry);
        ++kept;
      } else {
        // Rejected: remember the id and roll its weight into the sentinel.
        rejectedIds.add(entry.record().id());
        droppedWeight += entry.weight();
      }

      if ((seen % PROGRESS_INTERVAL == 0 || !source.hasNext()) && LOG.isInfoEnabled()) {
        progress.setMessage(format("Accepted {0} of {1} entries.", kept, seen));
        LOG.debug(MiscUtil.memoryInfoString());
      }
    }

    // Emit the accumulated rejected weight as one FILTERED record.
    if (droppedWeight != 0) {
      sink.write(new Weighted<Token>(new Token(filteredId), droppedWeight));
    }

    source.close();
    sink.flush();
    sink.close();

    // Drop the intermediate file from a previous pass (never the input).
    if (!activeEntriesFile.equals(inputEntriesFile)) {
      activeEntriesFile.delete();
    }

    entryFilterRequired = false;
    activeEntriesFile = outputFile;

    // Propagate: events referencing a rejected entry must now be filtered too.
    if (rejectedIds.size() > 0) {
      eventFilterRequired = true;
      acceptEvent =
          Predicates2.and(
              acceptEvent,
              Predicates2.compose(Predicates2.not(Predicates2.in(rejectedIds)), eventEntryId()));
    }
  }