Ejemplo n.º 1
0
  /**
   * For use within an iterator stack. Applies an appropriate column filter based on the input
   * string. Four modes of operation:
   *
   * <ol>
   *   <li>Null or empty ("") {@code colFilter}: do nothing; {@code skvi} is returned unchanged.
   *   <li>{@code colFilter} without a ':' character (no ranges): wrap {@code skvi} in an Accumulo
   *       system ColumnQualifierFilter built from the individual column qualifiers.
   *   <li>{@code colFilter} that parses to exactly one range: use an Accumulo user
   *       ColumnSliceFilter, with both bounds inclusive.
   *   <li>{@code colFilter} that parses to several ranges: use a Graphulo D4mRangeFilter on the
   *       column qualifier.
   * </ol>
   *
   * @param colFilter column filter string
   * @param skvi Parent / source iterator
   * @param env iterator environment handed to {@code init} of the filter created in cases 3 and 4
   * @return SKVI with appropriate filter iterators placed in front of it.
   * @throws IOException propagated from the {@code init} call of the created filter
   */
  public static SortedKeyValueIterator<Key, Value> applyGeneralColumnFilter(
      String colFilter, SortedKeyValueIterator<Key, Value> skvi, IteratorEnvironment env)
      throws IOException {
    if (colFilter == null || colFilter.isEmpty()) {
      return skvi;
    }

    if (colFilter.indexOf(':') == -1) { // no ranges - collection of singleton texts
      Set<Column> colset = new HashSet<>();
      for (Text text : GraphuloUtil.d4mRowToTexts(colFilter)) {
        // Only the qualifier is constrained; family and visibility are left empty.
        colset.add(new Column(EMPTY_BYTES, text.copyBytes(), EMPTY_BYTES));
      }
      return new ColumnQualifierFilter(skvi, colset);
    }

    SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter);
    assert ranges.size() > 0;

    if (ranges.size() == 1) { // single range - use ColumnSliceFilter
      Range r = ranges.first();
      String start = r.isInfiniteStartKey() ? null : r.getStartKey().getRow().toString();
      String end = r.isInfiniteStopKey() ? null : r.getEndKey().getRow().toString();

      // An unbounded side is expressed by leaving the corresponding bound option unset.
      Map<String, String> map = new HashMap<>();
      if (start != null) {
        map.put(ColumnSliceFilter.START_BOUND, start);
      }
      if (end != null) {
        map.put(ColumnSliceFilter.END_BOUND, end);
      }
      map.put(ColumnSliceFilter.START_INCLUSIVE, String.valueOf(true));
      map.put(ColumnSliceFilter.END_INCLUSIVE, String.valueOf(true));

      SortedKeyValueIterator<Key, Value> filter = new ColumnSliceFilter();
      filter.init(skvi, map, env);
      return filter;
    }

    // multiple ranges
    SortedKeyValueIterator<Key, Value> filter = new D4mRangeFilter();
    filter.init(
        skvi,
        D4mRangeFilter.iteratorSetting(1, D4mRangeFilter.KeyPart.COLQ, colFilter).getOptions(),
        env);
    return filter;
  }
Ejemplo n.º 2
0
  /**
   * Builds the iterator configuration for a table multiplication as a single IteratorSetting,
   * assembled through a DynamicIteratorSetting.
   *
   * <p>The options in {@code map} are grouped by prefix using {@code GraphuloUtil.splitMapPrefix}:
   *
   * <ul>
   *   <li>Groups under {@code TwoTableIterator.PREFIX_AT} or {@code TwoTableIterator.PREFIX_B} are
   *       forwarded to the TwoTableIterator with their prefix (plus '.') put back on each key.
   *   <li>The group under "C" is passed, without prefix, to a RemoteWriteIterator. That iterator is
   *       only appended when the group is non-empty.
   *   <li>Every other group is forwarded to the TwoTableIterator; keys keep their "prefix."
   *       qualifier unless the prefix is empty.
   * </ul>
   *
   * @param map Map of all options.
   * @param priority Priority to use for the IteratorSetting of the whole stack
   * @param name Null means use the default name "TableMultIterator"
   * @return the combined IteratorSetting produced by the DynamicIteratorSetting
   */
  public static IteratorSetting tableMultIterator(
      Map<String, String> map, int priority, String name) {
    Map<String, String> twoTableOpts = new HashMap<>();
    Map<String, String> writeOpts = new HashMap<>();

    for (Map.Entry<String, Map<String, String>> group :
        GraphuloUtil.splitMapPrefix(map).entrySet()) {
      final String prefix = group.getKey();
      final Map<String, String> groupOpts = group.getValue();

      if (prefix.equals(TwoTableIterator.PREFIX_AT) || prefix.equals(TwoTableIterator.PREFIX_B)) {
        twoTableOpts.putAll(GraphuloUtil.preprendPrefixToKey(prefix + '.', groupOpts));
      } else if (prefix.equals("C")) {
        writeOpts.putAll(groupOpts);
      } else {
        // Unclassified options all go to the TwoTableIterator, re-qualified by their prefix.
        final String qualifier = prefix.isEmpty() ? "" : prefix + '.';
        for (Map.Entry<String, String> option : groupOpts.entrySet()) {
          twoTableOpts.put(qualifier + option.getKey(), option.getValue());
        }
      }
    }

    final String stackName = (name != null) ? name : "TableMultIterator";
    DynamicIteratorSetting dis =
        new DynamicIteratorSetting(priority, stackName)
            .append(new IteratorSetting(1, TwoTableIterator.class, twoTableOpts));
    if (!writeOpts.isEmpty()) {
      dis.append(new IteratorSetting(1, RemoteWriteIterator.class, writeOpts));
    }
    return dis.toIteratorSetting();
  }
Ejemplo n.º 3
0
 /**
  * Add prefix and/or suffix to every part of a D4M string. Parts equal to ":" (the range marker)
  * are copied through without padding.
  *
  * <pre>
  * padD4mString_Single("pre|","X","a,b,:,v,:,") ==> "pre|aX,pre|bX,:,pre|vX,:,"
  * </pre>
  *
  * @param prefix text placed in front of every part; null is treated as ""
  * @param suffix text placed after every part; null is treated as ""
  * @param str D4M string whose last character is its separator
  * @return the padded D4M string; {@code str} itself when there is nothing to pad (both prefix
  *     and suffix empty, or {@code str} null or empty)
  */
 public static String padD4mString_Single(String prefix, String suffix, String str) {
   if (prefix == null) {
     prefix = "";
   }
   if (suffix == null) {
     suffix = "";
   }
   if (prefix.isEmpty() && suffix.isEmpty()) {
     return str;
   }
   // A null or empty string has no separator character and no parts to pad.
   if (str == null || str.isEmpty()) {
     return str;
   }
   char sep = str.charAt(str.length() - 1);
   StringBuilder sb = new StringBuilder();
   for (String part : GraphuloUtil.splitD4mString(str)) {
     if (part.equals(":")) {
       sb.append(part).append(sep);
     } else {
       sb.append(prefix).append(part).append(suffix).append(sep);
     }
   }
   return sb.toString();
 }
Ejemplo n.º 4
0
  /**
   * Add Cartesian product of prefixes and suffixes to a string, each given as a D4M String.
   *
   * <p>When {@code str} contains a range, the prefixes are applied to the parsed Range objects
   * (via {@code GraphuloUtil.prependPrefixToRange}) and the result is converted back to a D4M
   * string; only the suffixes are then applied part-by-part.
   *
   * @param prefixes D4M string of prefixes; null or empty is treated as ","
   * @param suffixes D4M string of suffixes; null or empty is treated as ","
   * @param str D4M string to pad
   * @return concatenation of the padded strings for every prefix/suffix combination; {@code str}
   *     itself when both {@code prefixes} and {@code suffixes} have length at most 1
   * @see #padD4mString_Single(String, String, String)
   */
  public static String padD4mString(String prefixes, String suffixes, String str) {
    if (prefixes == null || prefixes.isEmpty()) {
      prefixes = ",";
    }
    if (suffixes == null || suffixes.isEmpty()) {
      suffixes = ",";
    }
    if (prefixes.length() <= 1 && suffixes.length() <= 1) {
      return str;
    }

    if (d4mStringContainsRange(str)) {
      // add prefix to v0 Ranges. Goto full Range Objects because ':' is complicated.
      SortedSet<Range> tmp = GraphuloUtil.d4mRowToRanges(str);
      SortedSet<Range> tmp2 = new TreeSet<>();
      for (String startPre : GraphuloUtil.splitD4mString(prefixes)) {
        for (Range range : tmp) {
          tmp2.add(GraphuloUtil.prependPrefixToRange(startPre, range));
        }
      }
      str = GraphuloUtil.rangesToD4MString(tmp2, str.charAt(str.length() - 1));
      // Prefixes are already baked into the ranges; reset so they are not applied again below.
      prefixes = ",";
    }

    // Accumulate in a StringBuilder rather than re-copying a String on every iteration.
    StringBuilder sb = new StringBuilder();
    for (String pre : GraphuloUtil.splitD4mString(prefixes)) {
      for (String suf : GraphuloUtil.splitD4mString(suffixes)) {
        sb.append(padD4mString_Single(pre, suf, str));
      }
    }
    return sb.toString();
  }
Ejemplo n.º 5
0
 /**
  * Configures a column filter chosen from the shape of {@code colFilter}:
  *
  * <ol>
  *   <li>Null or empty ("") {@code colFilter}: nothing is configured.
  *   <li>{@code colFilter} without a ':' character (no ranges): each qualifier is registered via
  *       {@code scanner.fetchColumn()}, which invokes an Accumulo system {@link
  *       ColumnQualifierFilter}.
  *   <li>{@code colFilter} that parses to exactly one range: an Accumulo user {@link
  *       ColumnSliceFilter} with both bounds inclusive is added to {@code dis}.
  *   <li>{@code colFilter} that parses to several ranges: a Graphulo {@link D4mRangeFilter} on the
  *       column qualifier is added to {@code dis}.
  * </ol>
  *
  * @param colFilter column filter string
  * @param scanner to call fetchColumn() on, for case #2
  * @param dis to call append()/prepend() on, for cases #3 and #4
  * @param append True means call {@link DynamicIteratorSetting#append}. False means call {@link
  *     DynamicIteratorSetting#prepend}
  */
 public static void applyGeneralColumnFilter(
     String colFilter, ScannerBase scanner, DynamicIteratorSetting dis, boolean append) {
   if (colFilter == null || colFilter.isEmpty()) {
     return;
   }

   final boolean hasRange = colFilter.indexOf(':') != -1;
   if (!hasRange) {
     // todo - the order this filter applies is different. Ensure no logical bugs when we have
     // case 2.
     for (Text qualifier : GraphuloUtil.d4mRowToTexts(colFilter)) {
       scanner.fetchColumn(GraphuloUtil.EMPTY_TEXT, qualifier);
     }
     return;
   }

   SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter);
   assert ranges.size() > 0;

   final IteratorSetting setting;
   if (ranges.size() != 1) {
     // multiple ranges
     setting = D4mRangeFilter.iteratorSetting(1, D4mRangeFilter.KeyPart.COLQ, colFilter);
   } else {
     // single range - use ColumnSliceFilter; a null bound means that side is unbounded
     Range only = ranges.first();
     setting = new IteratorSetting(1, ColumnSliceFilter.class);
     String lower = only.isInfiniteStartKey() ? null : only.getStartKey().getRow().toString();
     String upper = only.isInfiniteStopKey() ? null : only.getEndKey().getRow().toString();
     ColumnSliceFilter.setSlice(setting, lower, true, upper, true);
   }

   if (append) {
     dis.append(setting);
   } else {
     dis.prepend(setting);
   }
 }
Ejemplo n.º 6
0
 /**
  * Apply an appropriate column filter directly to a scanner, based on the input string:
  *
  * <ol>
  *   <li>Null or empty ("") {@code colFilter}: do nothing.
  *   <li>{@code colFilter} without a ':' character (no ranges): call {@code
  *       scanner.fetchColumn()} for each qualifier.
  *   <li>{@code colFilter} that parses to exactly one range: add a ColumnSliceFilter with both
  *       bounds inclusive, unless the range is infinite on both sides, in which case nothing is
  *       added.
  *   <li>{@code colFilter} that parses to several ranges: add a D4mRangeFilter on the column
  *       qualifier.
  * </ol>
  *
  * <p>Consider {@link #applyGeneralColumnFilter(String, ScannerBase, DynamicIteratorSetting,
  * boolean)} for more robust filter setting.
  *
  * @param colFilter column filter string
  * @param scanner scanner to call fetchColumn() / addScanIterator() on
  * @param priority priority of the filter iterator added to the scanner in cases 3 and 4
  */
 public static void applyGeneralColumnFilter(String colFilter, ScannerBase scanner, int priority) {
   if (colFilter == null || colFilter.isEmpty()) {
     return;
   }

   if (colFilter.indexOf(':') == -1) { // no ranges - collection of singleton texts
     for (Text text : GraphuloUtil.d4mRowToTexts(colFilter)) {
       scanner.fetchColumn(GraphuloUtil.EMPTY_TEXT, text);
     }
     return;
   }

   SortedSet<Range> ranges = GraphuloUtil.d4mRowToRanges(colFilter);
   assert ranges.size() > 0;
   IteratorSetting s;
   if (ranges.size() == 1) { // single range - use ColumnSliceFilter
     Range r = ranges.first();
     if (r.isInfiniteStartKey() && r.isInfiniteStopKey()) {
       return; // Infinite case: no filtering.
     }
     s = new IteratorSetting(priority, ColumnSliceFilter.class);
     ColumnSliceFilter.setSlice(
         s,
         r.isInfiniteStartKey() ? null : r.getStartKey().getRow().toString(),
         true,
         r.isInfiniteStopKey() ? null : r.getEndKey().getRow().toString(),
         true);
   } else { // multiple ranges
     // Honor the caller's priority here too; a hard-coded value ignores the parameter and can
     // clash with another scan iterator already registered at that priority.
     s = D4mRangeFilter.iteratorSetting(priority, D4mRangeFilter.KeyPart.COLQ, colFilter);
   }
   scanner.addScanIterator(s);
 }
Ejemplo n.º 7
0
  /**
   * Makes each input term into a prefix range.
   *
   * <p>Examples carried over from the original documentation (the exact boundary characters are
   * produced by {@code prevRow} and {@code Range.followingPrefix}):
   *
   * <pre>
   *  "v1,v5," => "v1|,:,v1},v5|,:,v5},"
   *  "v1,:,v3,v5," => "v1,:,v3,v5|,:,v5},"
   * </pre>
   *
   * <p>The last character of {@code str} is taken as the separator. Three cases are handled, in
   * this order:
   *
   * <ul>
   *   <li>{@code d4mStringContainsEmptyString(str)} is true: the result is the full range, ":"
   *       followed by the separator.
   *   <li>{@code str} contains a range: the string is parsed with {@code d4mRowToRanges(str,
   *       true)} and converted back with {@code rangesToD4MString}.
   *   <li>Otherwise every term {@code t} is emitted as {@code t}, ":", and {@code
   *       prevRow(Range.followingPrefix(t))}, each followed by the separator.
   * </ul>
   *
   * @param str non-null, non-empty D4M string
   * @return D4M string in which the singleton terms have been turned into ranges
   */
  public static String singletonsAsPrefix(String str) {
    Preconditions.checkNotNull(str);
    Preconditions.checkArgument(!str.isEmpty());
    final char sep = str.charAt(str.length() - 1);

    // empty prefix is full range.
    if (d4mStringContainsEmptyString(str)) {
      return ":" + sep;
    }

    if (d4mStringContainsRange(str)) {
      Collection<Range> prefixRngs = d4mRowToRanges(str, true);
      return rangesToD4MString(prefixRngs, sep);
    }

    // Only singleton terms: expand each one into "term,:,upperBound,".
    StringBuilder out = new StringBuilder();
    for (String term : GraphuloUtil.splitD4mString(str)) {
      out.append(term).append(sep);
      out.append(':').append(sep);
      out.append(prevRow(Range.followingPrefix(new Text(term)).toString())).append(sep);
    }
    return out.toString();
  }
Ejemplo n.º 8
0
  /**
   * Example: ingest an incidence table, run non-negative matrix factorization on it with {@code
   * Graphulo.NMF_Client}, multiply the resulting factor tables into an approximation table, and
   * count the entries of that approximation.
   */
  @Test
  public void exampleNMF()
      throws FileNotFoundException, TableNotFoundException, AccumuloSecurityException,
          AccumuloException {
    String Atable = "ex" + SCALE + "A"; // Table base name.
    String Etable = "ex" + SCALE + "AEdge"; // Incidence table.
    String ETtable = "ex" + SCALE + "AEdgeT"; // Transpose of incidence table.
    String EtableSample = "ex" + SCALE + "AEdgeSample"; // Sampled-down version of incidence table.
    String ETtableSample =
        "ex" + SCALE + "AEdgeTSample"; // Sampled-down version of transpose of incidence table.
    String Wtable = "ex" + SCALE + "AEdgeW"; // Output table W.
    String WTtable = "ex" + SCALE + "AEdgeWT"; // Transpose of output table W.
    String Htable = "ex" + SCALE + "AEdgeH"; // Output table H.
    String HTtable = "ex" + SCALE + "AEdgeHT"; // Transpose of output table HT.
    int K = 3; // 3 topics
    int maxiter = 5; // 5 iterations of NMF maximum
    double cutoffThreshold = 0.0; // Threshold to cut off entries with value less than this
    int maxColsPerTopic = 10; // Threshold - only keep 10 nodes in H per topic
    String newVisibility = ""; // Column Visibility to use for newly created entries.

    // In your code, you would connect to an Accumulo instance by writing something similar to:
    //    ClientConfiguration cc =
    // ClientConfiguration.loadDefault().withInstance("instance").withZkHosts("localhost:2181").withZkTimeout(5000);
    //    Instance instance = new ZooKeeperInstance(cc);
    //    Connector c = instance.getConnector("root", new PasswordToken("secret"));
    // Here, we connect to the Accumulo instance given by TEST_CONFIG.java.
    // You can change this by passing the option -DTEST_CONFIG=local or -DTEST_CONFIG=txe1 or
    // similar.
    Connector conn = tester.getConnector();

    // Delete result table if it exists, so that we don't sum in previous runs with our results.
    GraphuloUtil.deleteTables(conn, Htable, HTtable, Wtable, WTtable);
    if (conn.tableOperations().exists(Htable)) conn.tableOperations().delete(Htable);

    // Insert data from the file test/resources/data/10Ar.txt and 10Ac.txt into Accumulo.
    // Deletes tables if they already exist.
    ExampleUtil.ingestIncidenceSCALE(SCALE, 'A', Atable, conn);

    // Create Graphulo executor. Supply the password for your Accumulo user account.
    Graphulo graphulo = new Graphulo(conn, tester.getPassword());

    //    DistributedTrace.enable("NMFExample");  // remove this for no tracing

    // Option to use in-memory version
    double nmfError =
        graphulo.NMF_Client(
            Etable,
            false,
            WTtable,
            true,
            Htable,
            false,
            K,
            maxiter,
            cutoffThreshold,
            maxColsPerTopic);

    // Sample the graph with 10% uniform sampling and materialize the result in a sampled table
    //    double probability = 0.1;
    //    long nnzSample = graphulo.SampleCopy(Etable, EtableSample+"tmp", null, probability,
    // trace);
    //    long nnzSample = graphulo.OneTable(Etable, EtableSample, ETtableSample, null, -1, null,
    // null, null, null, null,
    //        Collections.singletonList(SamplingFilter.iteratorSetting(1, probability)), null,
    // Authorizations.EMPTY);
    //    System.out.println("Sample finished; #entries in sample is "+nnzSample);

    // Non-negative matrix factorization.
    // This call blocks until the NMF completes.
    //    double nmfError = graphulo.NMF(EtableSample, ETtableSample, Wtable, WTtable, Htable,
    // HTtable, KMER, maxiter, true,
    //        cutoffThreshold);
    System.out.println("Final NMF absolute difference in error: " + nmfError);

    // Result is in Htable, HTtable, Wtable, WTtable. Do whatever you like with it.
    // For this example we will multiply H*W into a new table that approximates the original
    // incidence matrix.
    String APtable = "ex" + SCALE + "AEdgeApprox"; // Approximation of the incidence table Etable.
    GraphuloUtil.deleteTables(conn, APtable);
    graphulo.TableMult(
        WTtable,
        Htable,
        APtable,
        null,
        -1,
        MathTwoScalar.class,
        MathTwoScalar.optionMap(
            MathTwoScalar.ScalarOp.TIMES, MathTwoScalar.ScalarType.DOUBLE, newVisibility, false),
        MathTwoScalar.combinerSetting(
            Graphulo.PLUS_ITERATOR_BIGDECIMAL.getPriority(),
            null,
            MathTwoScalar.ScalarOp.PLUS,
            MathTwoScalar.ScalarType.DOUBLE,
            false),
        null,
        null,
        null,
        false,
        false,
        -1);

    DistributedTrace.disable();

    // Now Scanning APtable
    BatchScanner bs = conn.createBatchScanner(APtable, Authorizations.EMPTY, 2);
    int cnt = 0;
    try {
      bs.setRanges(Collections.singleton(new Range())); // Scan whole table.
      for (Map.Entry<Key, Value> ignored : bs) {
        cnt++;
        //      System.out.println(entry.getKey().toStringNoTime()+" -> "+entry.getValue());
      }
    } finally {
      // Release the scanner's resources even if the scan fails part-way through.
      bs.close();
    }
    log.info("# of entries in approximation table " + APtable + ": " + cnt);
  }