public void process(IntList from, IntList to) {
  for (int i = 0; i < from.size(); i++) {
    int fromId = from.getInt(i);
    int toId = to.getInt(i);
    // Nothing propagates to or from whitelisted nodes; we never visit them again.
    if (whitelist.contains(fromId) || whitelist.contains(toId)) {
      continue;
    }
    boolean fromBlacklist = blacklist.contains(fromId);
    boolean toBlacklist = blacklist.contains(toId);
    if (fromBlacklist && toBlacklist) continue;
    if (fromBlacklist) {
      blacklist(toId);
      continue;
    }
    if (toBlacklist) {
      blacklist(fromId);
      continue;
    }
    // If we haven't followed the link, copy it to the list to go again.
    linkFrom.add(fromId);
    linkTo.add(toId);
  }
}
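// Hypothetical usage sketch (the `propagator` instance and the id values are
// made up): with blacklist = {1} and whitelist = {9}, the edges (1,2), (9,3)
// and (4,5) leave node 2 newly blacklisted, drop (9,3) permanently, and carry
// only (4,5) into linkFrom/linkTo for the next pass.
IntList from = IntArrayList.wrap(new int[] {1, 9, 4});
IntList to = IntArrayList.wrap(new int[] {2, 3, 5});
propagator.process(from, to);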
public static IntSet loadCreation(SerializedMultiFile<int[]> input) throws Exception {
  IntSet output = new IntOpenHashSet();
  for (int i = 0; i < input.getPartitionFunction().getPartitionCount(); i++) {
    IntSet part = new IntOpenHashSet(input.readFirstObject(i));
    output.addAll(part);
  }
  return output;
}
public void addEntryWhitelist(List<String> strings) throws IOException {
  IntSet entryIdSet = new IntOpenHashSet();
  for (String string : strings) {
    final int id = getIndexDeligate().getEntryEnumerator().indexOf(string);
    entryIdSet.add(id);
  }
  setAcceptEntry(
      Predicates2.<Weighted<Token>>and(
          getAcceptEntry(),
          Predicates2.compose(Predicates2.in(entryIdSet), id())));
}
public void addFeaturesBlacklist(List<String> strings) throws IOException {
  IntSet featureIdSet = new IntOpenHashSet();
  for (String string : strings) {
    final int id = getIndexDeligate().getFeatureEnumerator().indexOf(string);
    featureIdSet.add(id);
  }
  setAcceptFeatures(
      Predicates2.<Weighted<Token>>and(
          getAcceptFeatures(),
          Predicates2.compose(Predicates2.not(Predicates2.in(featureIdSet)), id())));
}
public static VariableOrdering determineOrdering(
    VariableSummary variableSummary, TransitionTable transitionTable) {
  int bitsPerRow = variableSummary.bitsPerRow();
  IntSet remainingVariableIds = variableSummary.allVariableIds();
  List<Variable> variablesWithOrder = new ArrayList<>(bitsPerRow);
  List<TransitionTable> frontier = singletonList(transitionTable);
  for (int height = 0; height < bitsPerRow; height++) {
    int countPerSplit = countPerSplit(bitsPerRow, height);
    int nextVariable = determineNext(frontier, remainingVariableIds, countPerSplit);
    remainingVariableIds.remove(nextVariable);
    variablesWithOrder.add(new Variable(height, nextVariable));
    frontier = nextFrontier(frontier, nextVariable);
  }
  return new VariableOrdering(variableSummary, variablesWithOrder);
}
private static int determineNext(
    List<TransitionTable> frontier, IntSet remainingVariableIds, int countPerSplit) {
  double maxEntropy = -Double.MAX_VALUE;
  int maxVariable = remainingVariableIds.iterator().nextInt();
  for (int variable : remainingVariableIds) {
    double entropy = frontierEntropy(frontier, variable, countPerSplit);
    if (entropy > maxEntropy) {
      maxEntropy = entropy;
      maxVariable = variable;
    }
  }
  return maxVariable;
}
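// Illustrative only: the greedy criterion above, reduced to plain Shannon
// entropy over a two-way split. frontierEntropy is project code; splitEntropy
// is a hypothetical stand-in and the sizes below are made up. Higher entropy
// means a candidate variable splits the frontier more evenly, which is why
// determineNext maximizes it.
private static double splitEntropy(int left, int right) {
  int total = left + right;
  double e = 0;
  for (int n : new int[] {left, right}) {
    if (n == 0) continue;
    double p = (double) n / total;
    e -= p * (Math.log(p) / Math.log(2)); // log base 2, in bits
  }
  return e;
}
// splitEntropy(5, 5) == 1.0 (even split); splitEntropy(10, 0) == 0.0 (no split)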
public static int[] buildPartitions(String[] partitionArray) throws ConfigurationException {
  IntSet partitions = new IntOpenHashSet();
  try {
    for (int i = 0; i < partitionArray.length; ++i) {
      Matcher matcher = PARTITION_PATTERN.matcher(partitionArray[i]);
      if (!matcher.matches()) {
        throw new ConfigurationException("Invalid partition: " + partitionArray[i]);
      }
      String[] partitionRange = partitionArray[i].split("-");
      int start = Integer.parseInt(partitionRange[0]);
      int end;
      if (partitionRange.length > 1) {
        end = Integer.parseInt(partitionRange[1]);
        if (end < start) {
          throw new ConfigurationException("invalid partition range: " + partitionArray[i]);
        }
      } else {
        end = start;
      }
      for (int k = start; k <= end; ++k) {
        partitions.add(k);
      }
    }
  } catch (Exception e) {
    throw new ConfigurationException(
        "Error parsing '" + SENSEI_PROPERTIES + "': " + PARTITIONS + "="
            + Arrays.toString(partitionArray),
        e);
  }
  int[] ret = partitions.toIntArray();
  Arrays.sort(ret);
  return ret;
}
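// Worked example (hypothetical input; PARTITION_PATTERN is assumed to accept
// either a single number "N" or an inclusive range "N-M"):
//
//   buildPartitions(new String[] {"0-3", "5", "7-7"})
//   // ranges expand to {0,1,2,3}, {5} and {7}; duplicates collapse in the
//   // IntSet, and the result is the sorted array [0, 1, 2, 3, 5, 7]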
public static EdgeImpl[] generateEdgeList(
    NodeStore nodeStore, int edgeCount, int type, boolean directed, boolean allowSelfLoops) {
  int nodeCount = nodeStore.size();
  final List<EdgeImpl> edgeList = new ArrayList<>();
  LongSet idSet = new LongOpenHashSet();
  Random r = new Random(124);
  // Pick a small sample of nodes to keep edge-free ("leafs").
  IntSet leafs = new IntOpenHashSet();
  if (nodeCount > 10) {
    for (int i = 0; i < Math.min(10, (int) (nodeCount * .05)); i++) {
      int id = r.nextInt(nodeCount);
      if (leafs.contains(id)) {
        i--; // duplicate draw; retry this slot
      } else {
        leafs.add(id);
      }
    }
  }
  long cnt = 0;
  // Rejection-sample random node pairs until enough distinct edges are collected.
  while (idSet.size() < edgeCount) {
    int sourceId = r.nextInt(nodeCount);
    int targetId = r.nextInt(nodeCount);
    Node source = nodeStore.get(sourceId);
    Node target = nodeStore.get(targetId);
    EdgeImpl edge = new EdgeImpl(cnt++, source, target, 1.0, directed);
    if (!leafs.contains(sourceId)
        && !leafs.contains(targetId)
        && (allowSelfLoops || source != target)
        && !idSet.contains(edge.getLongId())) {
      edgeList.add(edge);
      idSet.add(edge.getLongId());
    }
  }
  return edgeList.toArray(new EdgeImpl[0]);
}
public static void main(String[] args) throws Exception {
  blacklist = loadCreation(PartitionsAndFiles.getBlackList());
  whitelist = loadCreation(PartitionsAndFiles.getWhiteList());
  System.out.println("Read lists: wl: " + whitelist.size() + " bl:" + blacklist.size());
  PropagateBlacklist pb =
      initialPropagation(PartitionsAndFiles.getLinkFrom(), PartitionsAndFiles.getLinkTo());
  int pass = 1;
  logger.info(
      "Pass:" + pass + " wl: " + whitelist.size() + " bl:" + blacklist.size()
          + " remaining links:" + pb.linkFrom.size());
  // Keep propagating over the surviving links until a pass blacklists nothing new.
  while (pb.blacklisted > 0) {
    pass++;
    PropagateBlacklist oldPb = pb;
    pb = new PropagateBlacklist();
    // Process the previous pass's remaining links, not the fresh (empty) lists.
    pb.process(oldPb.linkFrom, oldPb.linkTo);
    logger.info(
        "Pass:" + pass + " wl: " + whitelist.size() + " bl:" + blacklist.size()
            + " remaining links:" + pb.linkFrom.size());
  }
  new FileOpener()
      .writeObject(PartitionsAndFiles.getExpandedBlackListFile(), blacklist.toIntArray());
}
public static boolean isInList(IntSet list, String fbId) {
  int subjectInt = (int) midToLong(fbId);
  return list.contains(subjectInt);
}
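// Caveat, with illustrative values: the (int) cast above keeps only the low
// 32 bits of midToLong's 64-bit result, so two MIDs whose longs differ only
// in the high word would collide in the IntSet.
//
//   (int) 0x0000000112345678L == (int) 0x0000000212345678L  // both 0x12345678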
private void blacklist(int toId) {
  blacklist.add(toId);
  // System.out.println("blacklisting " + FreebaseMid.longToGuid(toId));
  blacklisted++;
}
public Graph neighbourhoodGraph(int nnodes[], int hops) {
  // Disk-backed maps for the renumbered node ids of the neighbourhood graph.
  PrimaryHashMap<Integer, String> nodes;
  PrimaryHashMap<String, Integer> nodesReverse;
  try {
    File auxFile = File.createTempFile("graph-maps-" + System.currentTimeMillis(), "aux");
    auxFile.deleteOnExit();
    RecordManager recMan = RecordManagerFactory.createRecordManager(auxFile.getAbsolutePath());
    nodes = recMan.hashMap("nodes");
    nodesReverse = recMan.hashMap("nodesReverse");
  } catch (IOException ex) {
    throw new Error(ex);
  }
  nodes.clear();
  nodesReverse.clear();
  WeightedArcSet list1 = new WeightedArcSet();
  // `map` is the BFS frontier (node -> hop count); `set` is the closed list.
  Int2IntAVLTreeMap map = new Int2IntAVLTreeMap();
  IntSet set = new IntLinkedOpenHashSet();
  int numIterators = 100;
  Constructor[] cons = WeightedArc.class.getDeclaredConstructors();
  for (int i = 0; i < cons.length; i++) cons[i].setAccessible(true);
  for (int n : nnodes) map.put(n, 0);
  // Pool of node iterators, each remembering the position it has reached.
  NodeIterator its[] = new NodeIterator[numIterators];
  int itNum[] = new int[numIterators];
  for (int n = 0; n < its.length; n++) {
    its[n] = nodeIterator();
    itNum[n] = 0;
  }
  while (map.size() != 0) {
    // Pick the next frontier node at or after the furthest-behind iterator.
    Integer node = 0;
    for (int n = 0; n < its.length; n++) if (itNum[n] <= node) node = itNum[n];
    node = map.tailMap(node).firstKey();
    if (node == null) node = map.firstKey();
    // Choose the pooled iterator closest behind `node`.
    NodeIterator it = null;
    Integer aux1 = 0;
    int iit = 0;
    for (int n = 0; n < its.length; n++) {
      if (!its[n].hasNext()) {
        its[n] = nodeIterator();
        itNum[n] = 0;
      }
      if (itNum[n] == node) {
        it = its[n];
        aux1 = itNum[n];
        iit = 0;
        break;
      }
      if (itNum[n] < node && itNum[n] >= aux1) {
        it = its[n];
        aux1 = itNum[n];
        iit = n;
      }
    }
    if (it == null) {
      its[0] = nodeIterator();
      itNum[0] = 0;
      it = its[0];
    }
    // Advance the chosen iterator up to `node`.
    while (it != null && (aux1 = it.nextInt()) != null && aux1 >= 0 && aux1 < node) {}
    itNum[iit] = aux1 + 1;
    Integer aux2 = null;
    // Follow outgoing arcs, renumbering endpoints and enqueueing neighbours.
    ArcLabelledNodeIterator.LabelledArcIterator suc = it.successors();
    while ((aux2 = suc.nextInt()) != null && aux2 >= 0 && (aux2 < graph.numNodes()))
      try {
        if (commit++ % COMMIT_SIZE == 0) {
          try {
            nodes.getRecordManager().commit();
          } catch (IOException e) {
            throw new Error(e);
          }
          try {
            nodesReverse.getRecordManager().commit();
          } catch (IOException e) {
            throw new Error(e);
          }
        }
        if (!nodesReverse.containsKey(this.nodes.get(aux1))) {
          nodes.put(nodes.size(), this.nodes.get(aux1));
          nodesReverse.put(this.nodes.get(aux1), nodesReverse.size());
        }
        if (!nodesReverse.containsKey(this.nodes.get(aux2))) {
          nodes.put(nodes.size(), this.nodes.get(aux2));
          nodesReverse.put(this.nodes.get(aux2), nodesReverse.size());
        }
        int aaux1 = nodesReverse.get(this.nodes.get(aux1));
        int aaux2 = nodesReverse.get(this.nodes.get(aux2));
        WeightedArc arc1 = (WeightedArc) cons[0].newInstance(aaux1, aaux2, suc.label().getFloat());
        list1.add(arc1);
        if (map.get(node) < hops) {
          if (!set.contains(aux1) && (map.get(aux1) == null || map.get(aux1) > map.get(node) + 1))
            map.put(aux1.intValue(), map.get(node) + 1);
          if (!set.contains(aux2) && (map.get(aux2) == null || map.get(aux2) > map.get(node) + 1))
            map.put(aux2.intValue(), map.get(node) + 1);
        }
      } catch (Exception ex) {
        ex.printStackTrace();
        throw new Error(ex);
      }
    // Follow incoming arcs the same way, with the arc direction reversed.
    ArcLabelledNodeIterator.LabelledArcIterator anc = it.ancestors();
    while ((aux2 = anc.nextInt()) != null && aux2 >= 0 && (aux2 < graph.numNodes()))
      try {
        if (commit++ % COMMIT_SIZE == 0) {
          try {
            nodes.getRecordManager().commit();
          } catch (IOException e) {
            throw new Error(e);
          }
          try {
            nodesReverse.getRecordManager().commit();
          } catch (IOException e) {
            throw new Error(e);
          }
        }
        if (!nodesReverse.containsKey(this.nodes.get(aux1))) {
          nodes.put(nodes.size(), this.nodes.get(aux1));
          nodesReverse.put(this.nodes.get(aux1), nodesReverse.size());
        }
        if (!nodesReverse.containsKey(this.nodes.get(aux2))) {
          nodes.put(nodes.size(), this.nodes.get(aux2));
          nodesReverse.put(this.nodes.get(aux2), nodesReverse.size());
        }
        int aaux1 = nodesReverse.get(this.nodes.get(aux1));
        int aaux2 = nodesReverse.get(this.nodes.get(aux2));
        WeightedArc arc1 = (WeightedArc) cons[0].newInstance(aaux2, aaux1, anc.label().getFloat());
        list1.add(arc1);
        if (map.get(node) < hops) {
          if (!set.contains(aux1) && (map.get(aux1) == null || map.get(aux1) > map.get(node) + 1))
            map.put(aux1.intValue(), map.get(node) + 1);
          if (!set.contains(aux2) && (map.get(aux2) == null || map.get(aux2) > map.get(node) + 1))
            map.put(aux2.intValue(), map.get(node) + 1);
        }
      } catch (Exception ex) {
        ex.printStackTrace();
        throw new Error(ex);
      }
    map.remove(node);
    set.add(node);
  }
  Graph newGraph = new Graph(list1.toArray(new WeightedArc[0]));
  newGraph.nodes.clear();
  newGraph.nodesReverse.clear();
  newGraph.nodes = nodes;
  newGraph.nodesReverse = nodesReverse;
  return newGraph;
}
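// The traversal above, reduced to its essentials with plain JDK types
// (illustrative only; the real method additionally renumbers node ids,
// follows both successors and ancestors, and commits to the record manager):
// `frontier` maps each pending node to its hop count, `done` is the closed
// set, and neighbours are enqueued only while within the hop budget.
static Set<Integer> reachable(Map<Integer, List<Integer>> adj, int seed, int hops) {
  Map<Integer, Integer> frontier = new TreeMap<>();
  frontier.put(seed, 0);
  Set<Integer> done = new HashSet<>();
  while (!frontier.isEmpty()) {
    int node = frontier.keySet().iterator().next();
    int depth = frontier.remove(node);
    done.add(node);
    for (int next : adj.getOrDefault(node, Collections.emptyList())) {
      if (depth < hops && !done.contains(next)) {
        frontier.merge(next, depth + 1, Math::min); // keep the smaller hop count
      }
    }
  }
  return done;
}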
private IntCollection getFileIndices(final Int2IntMap transcriptIndex2FileIndex) {
  final IntSet result = new IntArraySet();
  result.addAll(transcriptIndex2FileIndex.values());
  return result;
}
// Filter the AllPairsTask file, rejecting all features that were found to
// be used only by filtered entries.
private void filterFeatures() throws FileNotFoundException, IOException {
  IntSet rejectedFeatures = new IntOpenHashSet();
  WeightedTokenSource featureSource =
      BybloIO.openFeaturesSource(activeFeaturesFile, getCharset(), indexDeligate);
  File outputFile = tempFiles.createFile();
  WeightedTokenSink featureSink =
      BybloIO.openFeaturesSink(outputFile, getCharset(), indexDeligate);
  progress.setMessage("Filtering features.");
  // Store any filtered weight here and record it, so as to maintain
  // accurate priors for those features that remain.
  double filteredWeight = 0;
  int filteredId = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);
  long inCount = 0;
  long outCount = 0;
  while (featureSource.hasNext()) {
    Weighted<Token> feature = featureSource.read();
    ++inCount;
    if (feature.record().id() == filteredId) {
      filteredWeight += feature.weight();
    } else if (acceptFeature.apply(feature)) {
      featureSink.write(feature);
      ++outCount;
    } else {
      rejectedFeatures.add(feature.record().id());
      filteredWeight += feature.weight();
    }
    if ((inCount % PROGRESS_INTERVAL == 0 || !featureSource.hasNext()) && LOG.isInfoEnabled()) {
      progress.setMessage(format("Accepted {0} of {1} features.", outCount, inCount));
      LOG.debug(MiscUtil.memoryInfoString());
    }
  }
  if (filteredWeight != 0) {
    featureSink.write(new Weighted<Token>(new Token(filteredId), filteredWeight));
  }
  featureSource.close();
  featureSink.flush();
  featureSink.close();
  if (!activeFeaturesFile.equals(inputFeaturesFile)) {
    activeFeaturesFile.delete();
  }
  featureFilterRequired = false;
  activeFeaturesFile = outputFile;
  // Update the event acceptance predicate.
  if (rejectedFeatures.size() > 0) {
    eventFilterRequired = true;
    acceptEvent =
        Predicates2.and(
            acceptEvent,
            Predicates2.compose(
                Predicates2.not(Predicates2.in(rejectedFeatures)), eventFeatureId()));
  }
}
// Filter the AllPairsTask file, rejecting all events that contain entries
// dropped in the entries-file filter pass. Store a list of features that
// only appear in filtered entries, to be used when filtering the features file.
private void filterEvents() throws FileNotFoundException, IOException {
  IntSet acceptedEntries = new IntOpenHashSet();
  IntSet rejectedEntries = new IntOpenHashSet();
  IntSet rejectedFeatures = new IntOpenHashSet();
  IntSet acceptedFeatures = new IntOpenHashSet();
  WeightedTokenPairSource efSrc =
      BybloIO.openEventsSource(activeEventsFile, getCharset(), indexDeligate);
  File outputFile = tempFiles.createFile();
  // outputFile.deleteOnExit();
  WeightedTokenPairSink efSink =
      BybloIO.openEventsSink(outputFile, getCharset(), indexDeligate);
  progress.setMessage("Filtering events.");
  // Store the id of the special filtered feature and entry.
  // TODO: This can probably be removed now, but needs checking.
  final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
  final int filteredFeature = getIndexDeligate().getFeatureEnumerator().indexOf(FILTERED_STRING);
  int currentEntryId = -1;
  int currentEventCount = 0;
  double currentEntryFilteredFeatureWeight = 0;
  double filteredEntryWeight = 0;
  int readCount = 0;
  int writeCount = 0;
  while (efSrc.hasNext()) {
    Weighted<TokenPair> record = efSrc.read();
    ++readCount;
    if (record.record().id1() == filteredEntry) {
      filteredEntryWeight += record.weight();
      continue;
    }
    if (record.record().id1() != currentEntryId) {
      // Entry boundary: flush the filtered-feature weight accumulated for the
      // previous entry.
      if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
        if (currentEventCount == 0) {
          filteredEntryWeight += currentEntryFilteredFeatureWeight;
        } else {
          efSink.write(
              new Weighted<TokenPair>(
                  new TokenPair(currentEntryId, filteredFeature),
                  currentEntryFilteredFeatureWeight));
          ++writeCount;
        }
      }
      currentEntryId = record.record().id1();
      currentEntryFilteredFeatureWeight = 0;
      currentEventCount = 0;
    }
    if (record.record().id2() == filteredFeature) {
      currentEntryFilteredFeatureWeight += record.weight();
    } else if (acceptEvent.apply(record)) {
      efSink.write(record);
      ++writeCount;
      acceptedEntries.add(record.record().id1());
      acceptedFeatures.add(record.record().id2());
      ++currentEventCount;
    } else {
      rejectedEntries.add(record.record().id1());
      rejectedFeatures.add(record.record().id2());
      currentEntryFilteredFeatureWeight += record.weight();
    }
    if ((readCount % PROGRESS_INTERVAL == 0 || !efSrc.hasNext()) && LOG.isInfoEnabled()) {
      progress.setMessage("Accepted " + writeCount + " of " + readCount + " events.");
      LOG.debug(MiscUtil.memoryInfoString());
    }
  }
  if (currentEntryId != -1 && currentEntryFilteredFeatureWeight != 0) {
    if (currentEventCount == 0) {
      filteredEntryWeight += currentEntryFilteredFeatureWeight;
    } else {
      efSink.write(
          new Weighted<TokenPair>(
              new TokenPair(currentEntryId, filteredFeature),
              currentEntryFilteredFeatureWeight));
    }
  }
  // If entire entries have been filtered, write their summed weights to a
  // special filtered entry/feature pair.
  if (filteredEntryWeight != 0) {
    efSink.write(
        new Weighted<TokenPair>(
            new TokenPair(filteredEntry, filteredFeature), filteredEntryWeight));
  }
  efSrc.close();
  efSink.flush();
  efSink.close();
  if (!activeEventsFile.equals(inputEventsFile)) {
    activeEventsFile.delete();
  }
  eventFilterRequired = false;
  activeEventsFile = outputFile;
  rejectedFeatures.removeAll(acceptedFeatures);
  rejectedEntries.removeAll(acceptedEntries);
  if (rejectedEntries.size() > 0) {
    acceptEntry =
        Predicates2.and(
            acceptEntry,
            Predicates2.compose(Predicates2.not(Predicates2.in(rejectedEntries)), id()));
    entryFilterRequired = true;
  }
  if (rejectedFeatures.size() > 0) {
    acceptFeature =
        Predicates2.and(
            acceptFeature,
            Predicates2.not(Predicates2.compose(Predicates2.in(rejectedFeatures), id())));
    featureFilterRequired = true;
  }
}
private void filterEntries() throws FileNotFoundException, IOException {
  final IntSet rejected = new IntOpenHashSet();
  WeightedTokenSource entriesSource =
      BybloIO.openEntriesSource(activeEntriesFile, getCharset(), getIndexDeligate());
  File outputFile = tempFiles.createFile();
  WeightedTokenSink entriesSink =
      BybloIO.openEntriesSink(outputFile, getCharset(), getIndexDeligate());
  progress.setMessage("Filtering entries.");
  final int filteredEntry = getIndexDeligate().getEntryEnumerator().indexOf(FILTERED_STRING);
  double filteredWeight = 0;
  long inCount = 0;
  long outCount = 0;
  while (entriesSource.hasNext()) {
    ++inCount;
    Weighted<Token> record = entriesSource.read();
    if (record.record().id() == filteredEntry) {
      filteredWeight += record.weight();
    } else if (acceptEntry.apply(record)) {
      entriesSink.write(record);
      ++outCount;
    } else {
      rejected.add(record.record().id());
      filteredWeight += record.weight();
    }
    if ((inCount % PROGRESS_INTERVAL == 0 || !entriesSource.hasNext()) && LOG.isInfoEnabled()) {
      progress.setMessage(format("Accepted {0} of {1} entries.", outCount, inCount));
      LOG.debug(MiscUtil.memoryInfoString());
    }
  }
  if (filteredWeight != 0) {
    entriesSink.write(new Weighted<Token>(new Token(filteredEntry), filteredWeight));
  }
  entriesSource.close();
  entriesSink.flush();
  entriesSink.close();
  if (!activeEntriesFile.equals(inputEntriesFile)) {
    activeEntriesFile.delete();
  }
  entryFilterRequired = false;
  activeEntriesFile = outputFile;
  // Update the event acceptance predicate.
  if (rejected.size() > 0) {
    eventFilterRequired = true;
    acceptEvent =
        Predicates2.and(
            acceptEvent,
            Predicates2.compose(Predicates2.not(Predicates2.in(rejected)), eventEntryId()));
  }
}
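// A minimal driver sketch (assumed, not taken from the source; `runFilters`
// is a hypothetical name): each pass above can re-enable another through the
// *FilterRequired flags, e.g. filterEntries() sets eventFilterRequired when
// it rejects anything, so the passes repeat until a fixed point where no
// flag remains set.
private void runFilters() throws IOException {
  while (entryFilterRequired || eventFilterRequired || featureFilterRequired) {
    if (entryFilterRequired) filterEntries();
    if (eventFilterRequired) filterEvents();
    if (featureFilterRequired) filterFeatures();
  }
}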