/** * For use within an iterator stack. Apply an appropriate column filter based on the input string. * Four modes of operation: 1. Null or blank ("") `colFilter`: do nothing. 2. No ranges * `colFilter`: use Accumulo system ColumnQualifierFilter. 3. Singleton range `colFilter`: use * Accumulo user ColumnSliceFilter. 4. Multi-range `colFilter`: use Graphulo D4mRangeFilter. * * @param colFilter column filter string * @param skvi Parent / source iterator * @return SKVI with appropriate filter iterators placed in front of it. */ public static SortedKeyValueIterator<Key, Value> applyGeneralColumnFilter( String colFilter, SortedKeyValueIterator<Key, Value> skvi, IteratorEnvironment env) throws IOException { if (colFilter == null || colFilter.isEmpty()) return skvi; int pos1 = colFilter.indexOf(':'); if (pos1 == -1) { // no ranges - collection of singleton texts Set<Column> colset = new HashSet<>(); for (Text text : GraphuloUtil.d4mRowToTexts(colFilter)) { byte[] by = text.copyBytes(); // log.debug("Printing characters of string TEXT LIM: "+ Key.toPrintableString(by, 0, // text.getLength(), 100)); // log.debug("Printing characters of string TEXT : "+ Key.toPrintableString(by, 0, // by.length, 100)); colset.add(new Column(EMPTY_BYTES, text.copyBytes(), EMPTY_BYTES)); } return new ColumnQualifierFilter(skvi, colset); } else { SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter); assert ranges.size() > 0; if (ranges.size() == 1) { // single range - use ColumnSliceFilter Range r = ranges.first(); Map<String, String> map = new HashMap<>(); String start = r.isInfiniteStartKey() ? null : r.getStartKey().getRow().toString(), end = r.isInfiniteStopKey() ? 
null : r.getEndKey().getRow().toString(); boolean startInclusive = true, endInclusive = true; if (start != null) map.put(ColumnSliceFilter.START_BOUND, start); if (end != null) map.put(ColumnSliceFilter.END_BOUND, end); map.put(ColumnSliceFilter.START_INCLUSIVE, String.valueOf(startInclusive)); map.put(ColumnSliceFilter.END_INCLUSIVE, String.valueOf(endInclusive)); SortedKeyValueIterator<Key, Value> filter = new ColumnSliceFilter(); filter.init(skvi, map, env); return filter; } else { // multiple ranges SortedKeyValueIterator<Key, Value> filter = new D4mRangeFilter(); filter.init( skvi, D4mRangeFilter.iteratorSetting(1, D4mRangeFilter.KeyPart.COLQ, colFilter).getOptions(), env); return filter; } } }
/** * Generates RemoteSourceIterator (possibly x2), TwoTableIterator, RemoteWriteIterator * configuration through a DynamicIteratorSetting. * * @param map Map of all options. * @param priority Priority to use for the IteratorSetting of the whole stack * @param name Null means use the default name "TableMultIterator" */ public static IteratorSetting tableMultIterator( Map<String, String> map, int priority, String name) { Map<String, String> optDM = new HashMap<>(), optC = new HashMap<>(); { Map<String, Map<String, String>> prefixMap = GraphuloUtil.splitMapPrefix(map); for (Map.Entry<String, Map<String, String>> prefixEntry : prefixMap.entrySet()) { final String prefix = prefixEntry.getKey(); Map<String, String> entryMap = prefixEntry.getValue(); switch (prefix) { case TwoTableIterator.PREFIX_AT: case TwoTableIterator.PREFIX_B: optDM.putAll(GraphuloUtil.preprendPrefixToKey(prefix + '.', entryMap)); break; case "C": optC.putAll(entryMap); break; default: for (Map.Entry<String, String> entry : entryMap.entrySet()) { // switch (entry.getKey()) { // case "dotmode": // case "multiplyOp": // optDM.put(entry.getKey(), entry.getValue()); // break; // default: // log.warn("Unrecognized option: " + prefix + '.' + entry); // break; // } if (prefix.isEmpty()) optDM.put(entry.getKey(), entry.getValue()); else optDM.put(prefix + '.' + entry.getKey(), entry.getValue()); } break; } } } DynamicIteratorSetting dis = new DynamicIteratorSetting(priority, name == null ? "TableMultIterator" : name) .append(new IteratorSetting(1, TwoTableIterator.class, optDM)); if (!optC.isEmpty()) dis.append(new IteratorSetting(1, RemoteWriteIterator.class, optC)); return dis.toIteratorSetting(); }
/**
 * Pads every part of a D4M string with a prefix and/or a suffix. Parts equal to ":" (the range
 * marker) are copied through unpadded. The separator is taken from the last character of
 * {@code str}.
 *
 * <pre>
 * padD4mString_Single("pre|","X","a,b,:,v,:,") ==> "pre|aX,pre|bX,:,pre|vX,:,"
 * </pre>
 *
 * @param prefix text to put before each part; null is treated as ""
 * @param suffix text to put after each part; null is treated as ""
 * @param str D4M string to pad; returned as is when both prefix and suffix are empty
 * @return the padded D4M string
 */
public static String padD4mString_Single(String prefix, String suffix, String str) {
  final String pre = (prefix == null) ? "" : prefix;
  final String suf = (suffix == null) ? "" : suffix;
  if (pre.isEmpty() && suf.isEmpty()) {
    return str;
  }

  final char sep = str.charAt(str.length() - 1);
  final StringBuilder padded = new StringBuilder();
  for (String part : GraphuloUtil.splitD4mString(str)) {
    if (!part.equals(":")) {
      padded.append(pre).append(part).append(suf);
    } else {
      padded.append(part);
    }
    padded.append(sep);
  }
  return padded.toString();
}
/** * Add Cartesian product of prefixes and suffixes to a string, each give as a D4M String. * * @see #padD4mString_Single(String, String, String) */ public static String padD4mString(String prefixes, String suffixes, String str) { if (prefixes == null || prefixes.isEmpty()) prefixes = ","; if (suffixes == null || suffixes.isEmpty()) suffixes = ","; if (prefixes.length() <= 1 && suffixes.length() <= 1) return str; if (d4mStringContainsRange(str)) { // if (suffixes.length()>1) // throw new UnsupportedOperationException("No way to append the suffixes "+suffixes+ // " to a D4M String containing a Range: "+str); // add prefix to v0 Ranges. Goto full Range Objects because ':' is complicated. SortedSet<Range> tmp = GraphuloUtil.d4mRowToRanges(str), tmp2 = new TreeSet<>(); for (String startPre : GraphuloUtil.splitD4mString(prefixes)) for (Range range : tmp) tmp2.add(GraphuloUtil.prependPrefixToRange(startPre, range)); str = GraphuloUtil.rangesToD4MString(tmp2, str.charAt(str.length() - 1)); prefixes = ","; } String s = ""; for (String pre : GraphuloUtil.splitD4mString(prefixes)) { for (String suf : GraphuloUtil.splitD4mString(suffixes)) { s += padD4mString_Single(pre, suf, str); } } return s; }
/** * Apply an appropriate column filter based on the input string. Four modes of operation: * * <ol> * <li>1. Null or blank ("") `colFilter`: do nothing. * <li>2. No ranges `colFilter`: use scanner.fetchColumn() which invokes an Accumulo system * {@link ColumnQualifierFilter}. * <li>3. Singleton range `colFilter`: use Accumulo user {@link ColumnSliceFilter}. * <li>4. Multi-range `colFilter`: use Graphulo {@link D4mRangeFilter}. * </ol> * * @param colFilter column filter string * @param scanner to call fetchColumn() on, for case #2 * @param dis to call append()/prepend() on, for cases #3 and #4 * @param append True means call {@link DynamicIteratorSetting#append}. False means call {@link * DynamicIteratorSetting#prepend} */ public static void applyGeneralColumnFilter( String colFilter, ScannerBase scanner, DynamicIteratorSetting dis, boolean append) { // System.err.println("colFilter: "+colFilter); if (colFilter != null && !colFilter.isEmpty()) { int pos1 = colFilter.indexOf(':'); if (pos1 == -1) { // no ranges - collection of singleton texts // todo - the order this filter applies is different. Ensure no logical bugs when we have // case 2. for (Text text : GraphuloUtil.d4mRowToTexts(colFilter)) { scanner.fetchColumn(GraphuloUtil.EMPTY_TEXT, text); } } else { SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter); assert ranges.size() > 0; IteratorSetting s; if (ranges.size() == 1) { // single range - use ColumnSliceFilter Range r = ranges.first(); s = new IteratorSetting(1, ColumnSliceFilter.class); // System.err.println("start: "+(r.isInfiniteStartKey() ? null : // r.getStartKey().getRow().toString()) // +"end: "+(r.isInfiniteStopKey() ? null : // r.getEndKey().getRow().toString())); ColumnSliceFilter.setSlice( s, r.isInfiniteStartKey() ? null : r.getStartKey().getRow().toString(), true, r.isInfiniteStopKey() ? 
null : r.getEndKey().getRow().toString(), true); // System.err.println("!ok "+GraphuloUtil.d4mRowToRanges(colFilter)); } else { // multiple ranges // System.err.println("ok "+GraphuloUtil.d4mRowToRanges(colFilter)); s = D4mRangeFilter.iteratorSetting(1, D4mRangeFilter.KeyPart.COLQ, colFilter); } if (append) dis.append(s); else dis.prepend(s); } } }
// @Deprecated // * @deprecated Use {@link #applyGeneralColumnFilter(String, ScannerBase, // DynamicIteratorSetting)} for more robust filter setting. public static void applyGeneralColumnFilter(String colFilter, ScannerBase scanner, int priority) { // System.err.println("colFilter: "+colFilter); if (colFilter != null && !colFilter.isEmpty()) { int pos1 = colFilter.indexOf(':'); if (pos1 == -1) { // no ranges - collection of singleton texts for (Text text : GraphuloUtil.d4mRowToTexts(colFilter)) { scanner.fetchColumn(GraphuloUtil.EMPTY_TEXT, text); } } else { SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter); assert ranges.size() > 0; IteratorSetting s; if (ranges.size() == 1) { // single range - use ColumnSliceFilter Range r = ranges.first(); if (r.isInfiniteStartKey() && r.isInfiniteStopKey()) return; // Infinite case: no filtering. s = new IteratorSetting(priority, ColumnSliceFilter.class); // System.err.println("start: "+(r.isInfiniteStartKey() ? null : // r.getStartKey().getRow().toString()) // +"end: "+(r.isInfiniteStopKey() ? null : // r.getEndKey().getRow().toString())); ColumnSliceFilter.setSlice( s, r.isInfiniteStartKey() ? null : r.getStartKey().getRow().toString(), true, r.isInfiniteStopKey() ? null : r.getEndKey().getRow().toString(), true); // System.err.println("!ok "+GraphuloUtil.d4mRowToRanges(colFilter)); } else { // multiple ranges // System.err.println("ok "+GraphuloUtil.d4mRowToRanges(colFilter)); s = D4mRangeFilter.iteratorSetting(1, D4mRangeFilter.KeyPart.COLQ, colFilter); } scanner.addScanIterator(s); } } }
/** * Makes each input term into a prefix range. * * <pre> * "v1,v5," => "v1|,:,v1},v5|,:,v5}," * "v1,:,v3,v5," => "v1,:,v3,v5|,:,v5}," * </pre> */ public static String singletonsAsPrefix(String str) { Preconditions.checkNotNull(str); Preconditions.checkArgument(!str.isEmpty()); // Preconditions.checkArgument(str.indexOf(':') != -1, "Cannot have the ':' character: // "+str); char sep = str.charAt(str.length() - 1); if (d4mStringContainsEmptyString(str)) // empty prefix is full range. return ":" + sep; if (!d4mStringContainsRange(str)) { StringBuilder sb = new StringBuilder(); for (String vktext : GraphuloUtil.splitD4mString(str)) { sb.append(vktext) .append(sep) .append(':') .append(sep) .append(prevRow(Range.followingPrefix(new Text(vktext)).toString())) .append(sep); } return sb.toString(); } // Collection<Range> origRngs = d4mRowToRanges(str); // for (Range rng : origRngs) { // // if a singleton row, then make into a prefix row // // } // String[] strSplit = str.substring(0, str.length() - 1) // .split(String.valueOf(sep)); // List<String> strList = Arrays.asList(strSplit); // PeekingIterator3<String> pi = new PeekingIterator3<>(strList.iterator()); // SortedSet<Range> rngset = new TreeSet<>(); // // if (pi.peekFirst().equals(":")) { // (-Inf, // if (pi.peekSecond() == null) { // return str; // (-Inf,+Inf) // } else { // if (pi.peekSecond().equals(":") || (pi.peekThird() != null && // pi.peekThird().equals(":"))) // throw new IllegalArgumentException("Bad D4M rowStr: " + str); //// sb.append(':').append(sep).append(pi.peekSecond()).append(sep); // (-Inf,2] // rngset.add(new Range(null, false, pi.peekSecond(), true)); // (-Inf,2] // pi.next(); // pi.next(); // } // } // // while (pi.hasNext()) { // if (pi.peekSecond() == null) { // last singleton row [1,1~) //// sb.append(pi.peekFirst()).append(sep) //// .append(':').append(sep) //// .append(Range.followingPrefix(new Text(pi.peekFirst())).toString()).append(sep); // rngset.add(Range.prefix(pi.peekFirst())); // // 
} else if (pi.peekSecond().equals(":")) { // if (pi.peekThird() == null) { // [1,+Inf) //// sb.append(pi.peekFirst()).append(sep).append(':').append(sep); // rngset.add(new Range(pi.peekFirst(), true, null, false)); // // } else { String s = GraphuloUtil.singletonsAsPrefix(vktexts, sep);// [1,3] // if (pi.peekThird().equals(":")) // throw new IllegalArgumentException("Bad D4M rowStr: " + str); //// sb.append(pi.peekFirst()).append(sep) //// .append(':').append(sep) //// .append(pi.peekThird()).append(sep); // rngset.add(new Range(pi.peekFirst(), true, pi.peekThird(), true)); // pi.next(); // pi.next(); // pi.next(); // } // } else { // [1,1~) //// sb.append(pi.peekFirst()).append(sep) //// .append(':').append(sep) //// .append(Range.followingPrefix(new Text(pi.peekFirst())).toString()).append(sep); // rngset.add(Range.prefix(pi.peekFirst())); // pi.next(); // } // } Collection<Range> prefixRngs = d4mRowToRanges(str, true); // log.info(prefixRngs); return rangesToD4MString(prefixRngs, sep); }
@Test public void exampleNMF() throws FileNotFoundException, TableNotFoundException, AccumuloSecurityException, AccumuloException { String Atable = "ex" + SCALE + "A"; // Table base name. String Etable = "ex" + SCALE + "AEdge"; // Incidence table. String ETtable = "ex" + SCALE + "AEdgeT"; // Transpose of incidence table. String EtableSample = "ex" + SCALE + "AEdgeSample"; // Sampled-down version of incidence table. String ETtableSample = "ex" + SCALE + "AEdgeTSample"; // Sampled-down version of transpose of incidence table. String Wtable = "ex" + SCALE + "AEdgeW"; // Output table W. String WTtable = "ex" + SCALE + "AEdgeWT"; // Transpose of output table W. String Htable = "ex" + SCALE + "AEdgeH"; // Output table H. String HTtable = "ex" + SCALE + "AEdgeHT"; // Transpose of output table HT. int K = 3; // 3 topics int maxiter = 5; // 3 iterations of NMF maximum double cutoffThreshold = 0.0; // Threshold to cut off entries with value less than this int maxColsPerTopic = 10; // Threshold - only keep 10 nodes in H per topic String newVisibility = ""; // Column Visibility to use for newly created entries. // In your code, you would connect to an Accumulo instance by writing something similar to: // ClientConfiguration cc = // ClientConfiguration.loadDefault().withInstance("instance").withZkHosts("localhost:2181").withZkTimeout(5000); // Instance instance = new ZooKeeperInstance(cc); // Connector c = instance.getConnector("root", new PasswordToken("secret")); // Here, we connect to the Accumulo instance given by TEST_CONFIG.java. // You can change this by passing the option -DTEST_CONFIG=local or -DTEST_CONFIG=txe1 or // similar. Connector conn = tester.getConnector(); // Delete result table if it exists, so that we don't sum in previous runs with our results. 
GraphuloUtil.deleteTables(conn, Htable, HTtable, Wtable, WTtable); if (conn.tableOperations().exists(Htable)) conn.tableOperations().delete(Htable); // Insert data from the file test/resources/data/10Ar.txt and 10Ac.txt into Accumulo. // Deletes tables if they already exist. ExampleUtil.ingestIncidenceSCALE(SCALE, 'A', Atable, conn); // Create Graphulo executor. Supply the password for your Accumulo user account. Graphulo graphulo = new Graphulo(conn, tester.getPassword()); // DistributedTrace.enable("NMFExample"); // remove this for no tracing // Option to use in-memory version double nmfError = graphulo.NMF_Client( Etable, false, WTtable, true, Htable, false, K, maxiter, cutoffThreshold, maxColsPerTopic); // Sample the graph with 10% uniform sampling and materialize the result in a sampled table // double probability = 0.1; // long nnzSample = graphulo.SampleCopy(Etable, EtableSample+"tmp", null, probability, // trace); // long nnzSample = graphulo.OneTable(Etable, EtableSample, ETtableSample, null, -1, null, // null, null, null, null, // Collections.singletonList(SamplingFilter.iteratorSetting(1, probability)), null, // Authorizations.EMPTY); // System.out.println("Sample finished; #entries in sample is "+nnzSample); // Non-negative matrix factorization. // This call blocks until the NMF completes. // double nmfError = graphulo.NMF(EtableSample, ETtableSample, Wtable, WTtable, Htable, // HTtable, KMER, maxiter, true, // cutoffThreshold); System.out.println("Final NMF absolute difference in error: " + nmfError); // Result is in Htable, HTtable, Wtable, WTtable. Do whatever you like with it. // For this example we will multiply H*W into a new table that approximates the original // incidence matrix. String APtable = "ex" + SCALE + "AEdgeApprox"; // Approximation of the incidence table Etable. 
GraphuloUtil.deleteTables(conn, APtable); graphulo.TableMult( WTtable, Htable, APtable, null, -1, MathTwoScalar.class, MathTwoScalar.optionMap( MathTwoScalar.ScalarOp.TIMES, MathTwoScalar.ScalarType.DOUBLE, newVisibility, false), MathTwoScalar.combinerSetting( Graphulo.PLUS_ITERATOR_BIGDECIMAL.getPriority(), null, MathTwoScalar.ScalarOp.PLUS, MathTwoScalar.ScalarType.DOUBLE, false), null, null, null, false, false, -1); DistributedTrace.disable(); // Now Scanning APtable BatchScanner bs = conn.createBatchScanner(APtable, Authorizations.EMPTY, 2); bs.setRanges(Collections.singleton(new Range())); // Scan whole table. int cnt = 0; for (Map.Entry<Key, Value> entry : bs) { cnt++; // System.out.println(entry.getKey().toStringNoTime()+" -> "+entry.getValue()); } bs.close(); log.info("# of entries in approximation table " + APtable + ": " + cnt); }