Exemple #1
0
  /**
   * Compute relative variable importance for GBM model.
   *
   * <p>See (45), (35) formulas in Friedman: Greedy Function Approximation: A Gradient boosting
   * machine. Algo used here can be used for computation individual importance of features per
   * output class.
   *
   * <p>The improvement of every split node found in {@code ktrees} is added to
   * {@code _improvPerVar} (indexed by the split column); the accumulated totals are then divided
   * by {@code model.ntrees() * model.nclasses()}.
   *
   * @param model model whose tree/class/feature counts drive the averaging
   * @param ktrees trees built in the current iteration; {@code null} entries are skipped
   * @param tid index of the current tree, asserted to equal {@code model.ntrees() - 1}
   * @param validationFrame not used by this implementation
   * @param scale when {@code true}, every importance is divided by the largest one, so the top
   *     variable becomes 1; if the largest importance is zero the values are left unscaled
   *     instead of being turned into NaN
   * @return the variable importances wrapped in a {@link VarImp}
   */
  @Override
  protected VarImp doVarImpCalc(
      GBMModel model, DTree[] ktrees, int tid, Frame validationFrame, boolean scale) {
    assert model.ntrees() - 1 == tid
        : "varimp computation expect model with already serialized trees: tid=" + tid;
    // Accumulate the improvement contributed by every split node of the k-trees
    for (DTree t : ktrees) {
      if (t != null) {
        for (int n = 0; n < t.len() - t.leaves; n++) {
          if (t.node(n) instanceof DecidedNode) { // it is split node
            Split split = t.decided(n)._split;
            _improvPerVar[split._col] += split.improvement(); // least squares improvement
          }
        }
      }
    }
    // Compute variable importance for all trees in model
    float[] varimp = new float[model.nfeatures()];

    int ntreesTotal = model.ntrees() * model.nclasses();
    int maxVar = 0;
    for (int var = 0; var < _improvPerVar.length; var++) {
      varimp[var] = _improvPerVar[var] / ntreesTotal;
      if (varimp[var] > varimp[maxVar]) {
        maxVar = var;
      }
    }
    // Scale relative to the most important variable
    if (scale) {
      float maxVal = varimp[maxVar];
      // A zero maximum would make every quotient NaN (0/0) or infinite, so skip scaling then.
      if (maxVal != 0f) {
        for (int var = 0; var < varimp.length; var++) {
          varimp[var] /= maxVal;
        }
      }
    }

    return new VarImp(varimp);
  }
Exemple #2
0
 /**
  * Reads a split element from {@code reader} and returns it once its end tag is reached.
  *
  * <p>The split is created from the {@code ID} attribute and receives the {@code NEXT} attribute
  * (looked up with {@code true} and {@code false} respectively as the last argument of
  * {@code getAttributeValue} — presumably a "required" flag; confirm against that helper). Only
  * nested {@code FLOW} start elements and the closing {@code SPLIT} end element are accepted.
  *
  * @param reader stream positioned on the split start element
  * @return the populated split
  * @throws XMLStreamException if any other start/end element is met, or the stream runs out
  *     before the split is closed
  */
 private static Split parseSplit(final XMLStreamReader reader) throws XMLStreamException {
   final Split result = new Split(getAttributeValue(reader, XmlAttribute.ID, true));
   result.setAttributeNext(getAttributeValue(reader, XmlAttribute.NEXT, false));
   while (reader.hasNext()) {
     final int event = reader.next();
     final boolean opening = event == START_ELEMENT;
     if (!opening && event != END_ELEMENT) {
       // Characters, comments, etc. carry no structure we care about.
       continue;
     }
     final XmlElement element = XmlElement.forName(reader.getLocalName());
     if (opening) {
       if (element != XmlElement.FLOW) {
         throw BatchLogger.LOGGER.unexpectedXmlElement(
             reader.getLocalName(), reader.getLocation());
       }
       result.addFlow(parseFlow(reader));
     } else {
       if (element != XmlElement.SPLIT) {
         throw BatchLogger.LOGGER.unexpectedXmlElement(
             reader.getLocalName(), reader.getLocation());
       }
       return result;
     }
   }
   throw BatchLogger.LOGGER.unexpectedXmlElement(reader.getLocalName(), reader.getLocation());
 }
Exemple #3
0
 /**
  * Computes the preferred size of the layout subtree rooted at {@code root}.
  *
  * <p>A leaf delegates to {@code preferredComponentSize}; a divider is a square of side
  * {@code getDividerSize()}; any other node is cast to {@code Split} and its children are
  * combined: in a row layout widths are summed and the tallest child wins, otherwise heights are
  * summed and the widest child wins.
  *
  * @param root node to measure
  * @return the preferred size of {@code root}
  */
 private Dimension preferredNodeSize(Node root) {
   if (root instanceof Leaf) {
     return preferredComponentSize(root);
   }
   if (root instanceof Divider) {
     int side = getDividerSize();
     return new Dimension(side, side);
   }

   Split container = (Split) root;
   List<Node> children = container.getChildren();
   boolean horizontal = container.isRowLayout();
   int totalWidth = 0;
   int totalHeight = 0;
   for (Node child : children) {
     Dimension childSize = preferredNodeSize(child);
     if (horizontal) {
       totalWidth += childSize.width;
       totalHeight = Math.max(totalHeight, childSize.height);
     } else {
       totalWidth = Math.max(totalWidth, childSize.width);
       totalHeight += childSize.height;
     }
   }
   return new Dimension(totalWidth, totalHeight);
 }
Exemple #4
0
  /**
   * Splits the single string column of one input row on {@code ":"} and checks that each output
   * row repeats the original string in column 0 and carries one segment in column 1.
   */
  @Test
  public void testGeneratedSplitsSingleColumn() throws DbException {
    final Schema inputSchema = Schema.ofFields("string", Type.STRING_TYPE);
    final Schema outputSchema =
        Schema.appendColumn(inputSchema, Type.STRING_TYPE, "string_splits");
    // Each expected row: {original value, split segment}
    final String[][] expectedRows = {
      {"foo:bar:baz", "foo"}, {"foo:bar:baz", "bar"}, {"foo:bar:baz", "baz"}
    };

    final TupleBatchBuffer source = new TupleBatchBuffer(inputSchema);
    source.putString(0, "foo:bar:baz");
    Split splitOp = new Split(new TupleSource(source), 0, ":");
    splitOp.open(TestEnvVars.get());

    int seen = 0;
    while (!splitOp.eos()) {
      TupleBatch batch = splitOp.nextReady();
      if (batch == null) {
        continue;
      }
      assertEquals(outputSchema, batch.getSchema());
      for (int i = 0; i < batch.numTuples(); ++i) {
        assertEquals(expectedRows[seen][0], batch.getString(0, i));
        assertEquals(expectedRows[seen][1], batch.getString(1, i));
        ++seen;
      }
    }
    assertEquals(expectedRows.length, seen);
    splitOp.close();
  }
Exemple #5
0
  /**
   * Splits the string column (index 1) of two four-column rows on {@code ":"} and checks that
   * every output row keeps the original four values and adds one segment as a fifth column,
   * including the empty segments produced by {@code ":qux::"}.
   */
  @Test
  public void testGeneratedSplits() throws DbException {
    final Schema inputSchema =
        Schema.ofFields(
            "bool",
            Type.BOOLEAN_TYPE,
            "string",
            Type.STRING_TYPE,
            "long",
            Type.LONG_TYPE,
            "double",
            Type.DOUBLE_TYPE);
    final Schema outputSchema =
        Schema.appendColumn(inputSchema, Type.STRING_TYPE, "string_splits");
    // Each expected row: {bool, original string, long, double, split segment}
    final Object[][] expectedRows = {
      {true, "foo:bar:baz", 1L, 0.1, "foo"},
      {true, "foo:bar:baz", 1L, 0.1, "bar"},
      {true, "foo:bar:baz", 1L, 0.1, "baz"},
      {false, ":qux::", 2L, 0.2, ""},
      {false, ":qux::", 2L, 0.2, "qux"},
      {false, ":qux::", 2L, 0.2, ""},
      {false, ":qux::", 2L, 0.2, ""}
    };

    final TupleBatchBuffer source = new TupleBatchBuffer(inputSchema);
    // First input row
    source.putBoolean(0, true);
    source.putString(1, "foo:bar:baz");
    source.putLong(2, 1L);
    source.putDouble(3, 0.1);
    // Second input row
    source.putBoolean(0, false);
    source.putString(1, ":qux::");
    source.putLong(2, 2L);
    source.putDouble(3, 0.2);

    Split splitOp = new Split(new TupleSource(source), 1, ":");
    splitOp.open(TestEnvVars.get());

    int seen = 0;
    while (!splitOp.eos()) {
      TupleBatch batch = splitOp.nextReady();
      if (batch == null) {
        continue;
      }
      assertEquals(outputSchema, batch.getSchema());
      for (int i = 0; i < batch.numTuples(); ++i) {
        final Object[] want = expectedRows[seen];
        assertEquals(((Boolean) want[0]).booleanValue(), batch.getBoolean(0, i));
        assertEquals(want[1].toString(), batch.getString(1, i));
        assertEquals(((Long) want[2]).longValue(), batch.getLong(2, i));
        // Doubles are compared bit-for-bit
        assertEquals(
            Double.doubleToLongBits(((Double) want[3]).doubleValue()),
            Double.doubleToLongBits(batch.getDouble(3, i)));
        assertEquals(want[4].toString(), batch.getString(4, i));
        ++seen;
      }
    }
    assertEquals(expectedRows.length, seen);
    splitOp.close();
  }
Exemple #6
0
 /**
  * Consumes tokens from {@code st} and attaches the nodes they describe to {@code parent} until
  * a closing {@code ')'} or the end of the stream is reached.
  *
  * <p>A bare word becomes a {@code Leaf} child, except {@code WEIGHT} (case-insensitive), which is
  * handed to {@code parseAttribute}. An opening {@code '('} must be followed by a node type:
  * {@code LEAF} is delegated to {@code parseLeaf}; {@code ROW} / {@code COLUMN} create a nested
  * {@code Split} that is parsed recursively. Any other token is ignored.
  *
  * @param st tokenizer supplying the model description
  * @param parent split that receives the parsed children
  * @throws Exception propagated from the tokenizer or the parse helpers
  */
 private static void parseSplit(StreamTokenizer st, Split parent) throws Exception {
   for (int tok = st.nextToken(); tok != StreamTokenizer.TT_EOF; tok = st.nextToken()) {
     switch (tok) {
       case ')':
         // End of this split's child list
         return;

       case StreamTokenizer.TT_WORD:
         if (st.sval.equalsIgnoreCase("WEIGHT")) {
           parseAttribute(st.sval, st, parent);
         } else {
           addSplitChild(parent, new Leaf(st.sval));
         }
         break;

       case '(':
         if (st.nextToken() != StreamTokenizer.TT_WORD) {
           throwParseException(st, "invalid node type");
         }
         String kind = st.sval.toUpperCase();
         boolean isRow = kind.equals("ROW");
         if (kind.equals("LEAF")) {
           parseLeaf(st, parent);
         } else if (isRow || kind.equals("COLUMN")) {
           Split nested = new Split();
           nested.setRowLayout(isRow);
           addSplitChild(parent, nested);
           parseSplit(st, nested);
         } else {
           throwParseException(st, "unrecognized node type '" + kind + "'");
         }
         break;

       default:
         break;
     }
   }
 }
Exemple #7
0
 /**
  * Validates the layout tree rooted at {@code root}; nodes that are not a {@code Split} are
  * accepted as they are.
  *
  * <p>A split is reported through {@code throwInvalidLayout} when it has two or fewer children,
  * when its children do not alternate content node / {@code Divider}, or when the summed weight
  * of its content children exceeds 1.0 (plus a small epsilon). Content children are checked
  * recursively.
  *
  * @param root node to validate
  */
 private void checkLayout(Node root) {
   if (!(root instanceof Split)) {
     return;
   }
   Split split = (Split) root;
   if (split.getChildren().size() <= 2) {
     throwInvalidLayout("Split must have > 2 children", root);
   }

   double totalWeight = 0.0;
   for (Iterator<Node> it = split.getChildren().iterator(); it.hasNext(); ) {
     // Even positions must hold content (a Split or a Leaf) ...
     Node content = it.next();
     if (content instanceof Divider) {
       throwInvalidLayout("expected a Split or Leaf Node", content);
     }
     // ... and every content node except the last is followed by a Divider.
     if (it.hasNext()) {
       Node separator = it.next();
       if (!(separator instanceof Divider)) {
         throwInvalidLayout("expected a Divider Node", separator);
       }
     }
     totalWeight += content.getWeight();
     checkLayout(content);
   }

   // Tolerate floating-point rounding when comparing against 1.0
   final double epsilon = 0.000000001;
   if (totalWeight > 1.0 + epsilon) {
     throwInvalidLayout("Split children's total weight > 1.0", root);
   }
 }
  /**
   * Stores {@code entry} in the tree.
   *
   * <p>An entry with a {@code null} key is delegated to {@code updateNullEntry}; an entry with a
   * {@code null} value is treated as a removal of its key. Otherwise the entry is handed to the
   * root node: a {@code null} result means a new entry was added, a {@code Split} result means
   * the root was split and a new inner root is installed, and any other result is returned to
   * the caller cast to {@code E}.
   *
   * @param entry entry to store
   * @return whatever {@code updateNullEntry} / {@code remove} return for the special cases, the
   *     non-split object reported by the root, or {@code null} when the size grew
   */
  @SuppressWarnings("unchecked")
  @Override
  public <E extends Entry<K, V>> E put(E entry) {
    if (entry.getKey() == null) {
      return updateNullEntry(entry);
    }
    if (entry.getValue() == null) {
      return remove(entry.getKey());
    }

    final Object outcome = root.put(conf, entry);
    if (outcome != null && !(outcome instanceof Split)) {
      return (E) outcome;
    }

    if (outcome != null) {
      // The old root overflowed: grow the tree by one level.
      final Split<K, V> overflow = (Split<K, V>) outcome;
      this.root = new InnerNode<K, V>(conf, root, overflow.getKey(), overflow.getGreater());
    }
    this.size++;
    return null;
  }
Exemple #9
0
 /**
  * Computes the minimum size of the layout subtree rooted at {@code root}.
  *
  * <p>A leaf uses the minimum size of its component, or 0x0 when {@code childForNode} returns
  * {@code null}; a divider is a square of side {@code getDividerSize()}; any other node is cast
  * to {@code Split} and its children are combined: in a row layout widths are summed and the
  * tallest child wins, otherwise heights are summed and the widest child wins.
  *
  * @param root node to measure
  * @return the minimum size of {@code root}
  */
 private Dimension minimumNodeSize(Node root) {
   if (root instanceof Leaf) {
     Component component = childForNode(root);
     if (component == null) {
       return new Dimension(0, 0);
     }
     return component.getMinimumSize();
   }
   if (root instanceof Divider) {
     int side = getDividerSize();
     return new Dimension(side, side);
   }

   Split container = (Split) root;
   List<Node> children = container.getChildren();
   boolean horizontal = container.isRowLayout();
   int totalWidth = 0;
   int totalHeight = 0;
   for (Node child : children) {
     Dimension childSize = minimumNodeSize(child);
     if (horizontal) {
       totalWidth += childSize.width;
       totalHeight = Math.max(totalHeight, childSize.height);
     } else {
       totalWidth = Math.max(totalWidth, childSize.width);
       totalHeight += childSize.height;
     }
   }
   return new Dimension(totalWidth, totalHeight);
 }
Exemple #10
0
  /**
   * Test output spanning multiple batches. The integers {@code 0 .. 2 * TupleBatch.BATCH_SIZE}
   * (inclusive) are joined into a single comma-separated string; the result must contain each
   * integer, in order, in its own row.
   *
   * @throws DbException
   */
  @Test
  public void testAllBatchesReturned() throws DbException {
    final Schema inputSchema = Schema.ofFields("joined_ints", Type.STRING_TYPE);
    final Schema outputSchema =
        Schema.appendColumn(inputSchema, Type.STRING_TYPE, "joined_ints_splits");
    final long expectedCount = 2 * TupleBatch.BATCH_SIZE + 1;

    // Build "0,1,2,...": a separator goes before every number except the first.
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < expectedCount; ++i) {
      if (i > 0) {
        joined.append(",");
      }
      joined.append(i);
    }
    final TupleBatchBuffer source = new TupleBatchBuffer(inputSchema);
    source.putString(0, joined.toString());

    Split splitOp = new Split(new TupleSource(source), 0, ",");
    splitOp.open(TestEnvVars.get());
    long seen = 0;
    while (!splitOp.eos()) {
      TupleBatch batch = splitOp.nextReady();
      if (batch == null) {
        continue;
      }
      assertEquals(outputSchema, batch.getSchema());
      for (int i = 0; i < batch.numTuples(); ++i) {
        assertEquals(seen, Integer.parseInt(batch.getString(1, i)));
        ++seen;
      }
    }
    assertEquals(expectedCount, seen);
    splitOp.close();
  }
Exemple #11
0
 /**
  * Constructing and opening a split operator with the malformed pattern {@code "?:("} is expected
  * to raise a {@link PatternSyntaxException}.
  */
 @Test(expected = PatternSyntaxException.class)
 public void testInvalidRegex() throws DbException {
   final Schema inputSchema = Schema.ofFields("string", Type.STRING_TYPE);
   final TupleBatchBuffer source = new TupleBatchBuffer(inputSchema);
   source.putString(0, "foo");

   final String malformedPattern = "?:(";
   Split splitOp = new Split(new TupleSource(source), 0, malformedPattern);
   splitOp.open(TestEnvVars.get());
 }
Exemple #12
0
 /**
  * For every enabled split, resolves the flow node for the split's target through
  * {@code FlowController.getSourceFromContainedText} and runs that node's source.
  *
  * @param whileWaiting forwarded unchanged to each source
  * @param map forwarded unchanged to each source
  */
 public void run(Boolean whileWaiting, HashMap<String, Object> map) {
   for (Split split : splits) {
     if (!split.isEnabled()) {
       continue;
     }
     FlowNode target = FlowController.getSourceFromContainedText(split.getTarget());
     target.getSource().run(whileWaiting, map);
   }
 }
Exemple #13
0
 /**
  * Sets the enabled flag on every split whose id equals {@code splitId} and saves each changed
  * split through {@code DataBank.saveSplit}.
  *
  * @param splitId id of the split(s) to update
  * @param enabled new enabled state
  */
 public void updateSplitEnabled(Integer splitId, Boolean enabled) {
   for (Split candidate : splits) {
     if (!candidate.getId().equals(splitId)) {
       continue;
     }
     candidate.setEnabled(enabled);
     DataBank.saveSplit(candidate);
   }
 }
Exemple #14
0
 /**
  * Constructing and opening a split operator over a {@code LONG_TYPE} column is expected to raise
  * an {@link IllegalStateException}.
  */
 @Test(expected = IllegalStateException.class)
 public void testSplitColumnInvalidType() throws DbException {
   final Schema inputSchema = Schema.ofFields("long", Type.LONG_TYPE);
   final TupleBatchBuffer source = new TupleBatchBuffer(inputSchema);
   source.putLong(0, 1L);

   final int nonStringColumn = 0;
   Split splitOp = new Split(new TupleSource(source), nonStringColumn, ":");
   splitOp.open(TestEnvVars.get());
 }
Exemple #15
0
 /**
  * Sets the target on every split whose id equals {@code splitId} and saves each changed split
  * through {@code DataBank.saveSplit}.
  *
  * @param splitId id of the split(s) to update
  * @param target new target value
  */
 public void updateSplitTarget(Integer splitId, String target) {
   for (Split candidate : splits) {
     if (!candidate.getId().equals(splitId)) {
       continue;
     }
     candidate.setTarget(target);
     DataBank.saveSplit(candidate);
   }
 }
Exemple #16
0
 /**
  * Returns the sibling located {@code offset} positions away from this node in its parent's
  * child list.
  *
  * @param offset signed distance from this node's own position
  * @return the sibling, or {@code null} when {@code parent_get()} returns {@code null}, this node
  *     is not among the parent's children, or the target position is out of range
  */
 private Node siblingAtOffset(int offset) {
   Split owner = parent_get();
   if (owner == null) {
     return null;
   }
   List<Node> peers = owner.getChildren();
   int ownPosition = peers.indexOf(this);
   if (ownPosition == -1) {
     return null;
   }
   int wanted = ownPosition + offset;
   if (wanted < 0 || wanted >= peers.size()) {
     return null;
   }
   return peers.get(wanted);
 }
Exemple #17
0
  /**
   * Looks up the first split whose id equals {@code id}.
   *
   * @param id id to search for
   * @return the matching split, or {@code null} when none matches
   */
  public Split getSplit(Integer id) {
    Split match = null;
    for (Split candidate : splits) {
      if (candidate.getId().equals(id)) {
        match = candidate;
        break;
      }
    }
    return match;
  }
Exemple #18
0
 /**
  * Appends {@code child} to {@code parent}'s children, inserting a new {@code Divider} in front
  * of it whenever the parent already has children. The parent receives a fresh list through
  * {@code setChildren}.
  *
  * @param parent split receiving the child
  * @param child node to append
  */
 private static void addSplitChild(Split parent, Node child) {
   List<Node> updated = new ArrayList<Node>(parent.getChildren());
   if (!updated.isEmpty()) {
     // Content nodes are always separated by a divider
     updated.add(new Divider());
   }
   updated.add(child);
   parent.setChildren(updated);
 }
Exemple #19
0
 /**
  * Prints {@code root} to standard output prefixed with {@code indent}; when it is a
  * {@code Split}, its children follow recursively, each level indented by two more spaces.
  *
  * @param indent prefix for the current level
  * @param root node to print
  */
 private static void printModel(String indent, Node root) {
   System.out.println(indent + root);
   if (root instanceof Split) {
     for (Node child : ((Split) root).getChildren()) {
       printModel(indent + "  ", child);
     }
   }
 }
Exemple #20
0
 /**
  * Finds the divider under the point {@code (x, y)} within the subtree rooted at {@code root}.
  *
  * <p>For a split, only the first child whose bounds contain the point is searched.
  *
  * @param root subtree to search
  * @param x x coordinate of the point
  * @param y y coordinate of the point
  * @return the divider containing the point, or {@code null} if there is none
  */
 private Divider dividerAt(Node root, int x, int y) {
   if (root instanceof Divider) {
     Divider candidate = (Divider) root;
     if (candidate.getBounds().contains(x, y)) {
       return candidate;
     }
     return null;
   }
   if (!(root instanceof Split)) {
     return null;
   }
   for (Node child : ((Split) root).getChildren()) {
     if (child.getBounds().contains(x, y)) {
       return dividerAt(child, x, y);
     }
   }
   return null;
 }
Exemple #21
0
 /**
  * Parses a layout model from {@code r} and returns its top-level node.
  *
  * <p>The description is parsed into a temporary {@code Split} whose first child is the result.
  * Any exception is passed to {@code Main.error}; the reader is always handed to
  * {@code Utils.close}.
  *
  * @param r source of the model description
  * @return the parsed root node, or {@code null} if parsing failed
  */
 private static Node parseModel(Reader r) {
   StreamTokenizer tokenizer = new StreamTokenizer(r);
   Node model = null;
   try {
     Split holder = new Split();
     parseSplit(tokenizer, holder);
     model = holder.getChildren().get(0);
   } catch (Exception e) {
     Main.error(e);
   } finally {
     Utils.close(r);
   }
   return model;
 }
Exemple #22
0
 /**
  * Sets the output of every leaf of {@code rt} to the mean of {@code pseudoResponses} over the
  * sample indices held by that leaf.
  *
  * @param rt tree whose leaves are updated
  */
 protected void updateTreeOutput(RegressionTree rt) {
   for (Split leaf : rt.leaves()) {
     int[] members = leaf.getSamples();
     float total = 0.0F;
     for (int sample : members) {
       total += pseudoResponses[sample];
     }
     leaf.setOutput(total / members.length);
   }
 }
  /**
   * Builds a rule model for a numerical attribute by repeatedly splitting the examples.
   *
   * <p>While examples remain and the previous round removed some, the best split point for
   * {@code attribute} is requested from {@code splitter}; a "less or equal" rule predicting the
   * mode label of subset 0 is added, and the examples that rule covers are removed. The loop
   * stops early when the split point is NaN. If examples are left over, a default rule
   * predicting their mode label is appended.
   *
   * @param trainingSet examples to learn from
   * @param attribute numerical attribute to split on
   * @return the model holding the generated rules
   */
  private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) {
    RuleModel model = new RuleModel(trainingSet);

    // Peel off one "attribute <= threshold" rule per round
    int previousSize = -1;
    while ((trainingSet.size() > 0) && (trainingSet.size() != previousSize)) {
      ExampleSet workingCopy = (ExampleSet) trainingSet.clone();
      Split best = splitter.getBestSplit(workingCopy, attribute, null);
      double threshold = best.getSplitPoint();
      if (Double.isNaN(threshold)) {
        break;
      }

      SplittedExampleSet partition =
          SplittedExampleSet.splitByAttribute(workingCopy, attribute, threshold);
      Attribute label = partition.getAttributes().getLabel();
      partition.selectSingleSubset(0);
      SplitCondition condition = new LessEqualsSplitCondition(attribute, threshold);

      partition.recalculateAttributeStatistics(label);
      int modeIndex = (int) partition.getStatistics(label, Statistics.MODE);
      Rule rule = new Rule(label.getMapping().mapIndex(modeIndex), condition);

      // Per-label counts within the selected subset
      int[] counts = new int[label.getMapping().size()];
      int slot = 0;
      for (String value : label.getMapping().getValues()) {
        counts[slot] = (int) partition.getStatistics(label, Statistics.COUNT, value);
        slot++;
      }
      rule.setFrequencies(counts);
      model.addRule(rule);

      previousSize = trainingSet.size();
      trainingSet = rule.removeCovered(trainingSet);
    }

    // Whatever is still uncovered gets a default rule
    if (trainingSet.size() > 0) {
      Attribute label = trainingSet.getAttributes().getLabel();
      trainingSet.recalculateAttributeStatistics(label);
      int modeIndex = (int) trainingSet.getStatistics(label, Statistics.MODE);
      Rule fallback = new Rule(label.getMapping().mapIndex(modeIndex));

      int[] counts = new int[label.getMapping().size()];
      int slot = 0;
      for (String value : label.getMapping().getValues()) {
        counts[slot] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value));
        slot++;
      }
      fallback.setFrequencies(counts);
      model.addRule(fallback);
    }

    return model;
  }
Exemple #24
0
 /**
  * Sets {@code split}'s bounds to a rectangle anchored at {@code bounds}' origin that extends,
  * along the layout axis, to the far edge of the last child and, along the other axis, to the far
  * edge of {@code bounds}.
  *
  * @param split split whose bounds are updated
  * @param bounds the area the split was laid out in
  */
 private void minimizeSplitBounds(Split split, Rectangle bounds) {
   Rectangle shrunk = new Rectangle(bounds.x, bounds.y, 0, 0);
   List<Node> children = split.getChildren();
   Rectangle last = children.get(children.size() - 1).getBounds();

   int farX;
   int farY;
   if (split.isRowLayout()) {
     farX = last.x + last.width;
     farY = bounds.y + bounds.height;
   } else {
     farX = bounds.x + bounds.width;
     farY = last.y + last.height;
   }
   // Growing the empty rectangle to include the far corner yields the minimized bounds
   shrunk.add(farX, farY);
   split.setBounds(shrunk);
 }
Exemple #25
0
 /**
  * Parses a layout model from {@code r} and returns its top-level node.
  *
  * <p>The description is parsed into a temporary {@code Split} whose first child is the result.
  * Any exception is printed to {@code System.err}; the reader is always closed, and a failure to
  * close it is ignored.
  *
  * @param r source of the model description
  * @return the parsed root node, or {@code null} if parsing failed
  */
 private static Node parseModel(Reader r) {
   StreamTokenizer tokenizer = new StreamTokenizer(r);
   Node model = null;
   try {
     Split holder = new Split();
     parseSplit(tokenizer, holder);
     model = holder.getChildren().get(0);
   } catch (Exception e) {
     System.err.println(e);
   } finally {
     try {
       r.close();
     } catch (IOException ignore) {
       // Best-effort close: nothing useful can be done here
     }
   }
   return model;
 }
  /**
   * Decomposes by advancing a split until it reports it is done, recording the vertex moved at
   * each step; every recorded vertex is labelled with its position in that sequence and the
   * sequence is turned into a tree through {@code getCaterpillarIBTFromOrdering}.
   *
   * @param vertices vertices passed to {@code createSplit} for the initial split
   * @return the tree built from the recorded ordering
   */
  @Override
  public ImmutableBinaryTree decompose(ArrayList<Vertex<Integer>> vertices) {
    ArrayList<Vertex<Integer>> ordering = new ArrayList<>();

    Split state = createSplit(0, this, vertices);
    while (!state.done()) {
      state = state.decomposeAdvance();
      ordering.add(state.getLastMoved());
    }

    // Label each vertex with its index in the ordering
    for (int position = 0; position < ordering.size(); position++) {
      VertexLabel.setOrder(ordering.get(position), Integer.toString(position));
    }
    return getCaterpillarIBTFromOrdering(ordering);
  }
Exemple #27
0
  /**
   * Appends a graph description of this node and its subtree to {@code buff}.
   *
   * <p>The first non-null child triggers one node line labelled with the text that precedes the
   * operator in that child's branch condition. Every non-null child then gets an edge line whose
   * label is the branch condition starting one character before its operator. Finally each
   * non-null child appends its own subtree.
   *
   * @param buff buffer receiving the output
   */
  @Override
  public void graphTree(StringBuffer buff) {
    boolean nodeLinePending = true;
    for (Map.Entry<String, HNode> entry : m_children.entrySet()) {
      HNode child = entry.getValue();
      String branch = entry.getKey();
      if (child == null) {
        continue;
      }

      String condition = m_split.conditionForBranch(branch);
      if (nodeLinePending) {
        // The attribute name is everything in front of "=" (or "<" for a "<=" condition)
        int nameEnd;
        if (condition.indexOf("<=") < 0) {
          nameEnd = condition.indexOf("=");
        } else {
          nameEnd = condition.indexOf("<");
        }
        String testAttName = condition.substring(0, nameEnd).trim();
        nodeLinePending = false;
        buff.append("N" + m_nodeNum + " [label=\"" + testAttName + "\"]\n");
      }

      // Locate the operator: "<=" first, then "=", otherwise ">"
      int operatorAt;
      if (condition.indexOf("<=") > 0) {
        operatorAt = condition.indexOf("<");
      } else if (condition.indexOf("=") > 0) {
        operatorAt = condition.indexOf("=");
      } else {
        operatorAt = condition.indexOf(">");
      }
      String edgeLabel = condition.substring(operatorAt - 1, condition.length()).trim();

      String edgeLine =
          "N" + m_nodeNum + "->" + "N" + child.m_nodeNum + "[label=\"" + edgeLabel + "\"]\n";
      buff.append(edgeLine).append("\n");
    }

    // Children are emitted only after all of this node's edges
    for (Map.Entry<String, HNode> entry : m_children.entrySet()) {
      HNode child = entry.getValue();
      if (child == null) {
        continue;
      }
      child.graphTree(buff);
    }
  }
Exemple #28
0
 /**
  * Each input node {@code n} corresponds to a {@code <split>} tag in the model file.
  *
  * <p>When the first child of {@code n} is named {@code feature} (case-insensitive), the node is
  * an internal split: children 0 and 1 supply the feature id and threshold, and children 2 and 3
  * are converted recursively into the left and right subtrees. Otherwise the node is a stump and
  * the text of its first child is parsed as the output value.
  *
  * @param n the node to convert
  * @return the split (or stump) built from {@code n}
  */
 private Split create(Node n) {
   boolean isStump = n.getFirstChild().getNodeName().compareToIgnoreCase("feature") != 0;
   if (isStump) {
     String outputText = n.getFirstChild().getFirstChild().getNodeValue().trim();
     float output = Float.parseFloat(outputText);
     Split stump = new Split();
     stump.setOutput(output);
     return stump;
   }

   NodeList parts = n.getChildNodes();
   String featureText = parts.item(0).getFirstChild().getNodeValue().trim(); // <feature>
   int fid = Integer.parseInt(featureText);
   String thresholdText = parts.item(1).getFirstChild().getNodeValue().trim(); // <threshold>
   float threshold = Float.parseFloat(thresholdText);

   Split inner = new Split(fid, threshold, 0);
   inner.setLeft(create(parts.item(2)));
   inner.setRight(create(parts.item(3)));
   return inner;
 }
Exemple #29
0
 /* Second pass of the layout algorithm. Leaves (and their components,
  * when present) and dividers simply take the given bounds. A split is
  * grown when its current extent along the layout axis fits within the
  * given bounds, and shrunk otherwise.
  */
 private void layout2(Node root, Rectangle bounds) {
   if (root instanceof Leaf) {
     Component component = childForNode(root);
     if (component != null) {
       component.setBounds(bounds);
     }
     root.setBounds(bounds);
     return;
   }
   if (root instanceof Divider) {
     root.setBounds(bounds);
     return;
   }
   if (!(root instanceof Split)) {
     return;
   }

   Split split = (Split) root;
   boolean fits;
   if (split.isRowLayout()) {
     fits = split.getBounds().width <= bounds.width;
   } else {
     fits = split.getBounds().height <= bounds.height;
   }

   if (!fits) {
     // layoutShrink() takes care of calling split.setBounds()
     layoutShrink(split, bounds);
     return;
   }
   layoutGrow(split, bounds);
   root.setBounds(bounds);
 }
Exemple #30
0
  /**
   * Appends a textual dump of this node's branches to {@code buff}.
   *
   * <p>Each non-null child starts a new line made of {@code depth} copies of {@code "|   "}, the
   * trimmed branch condition and {@code ": "}, followed by the child's own dump at
   * {@code depth + 1}.
   *
   * @param depth nesting level of this node
   * @param leafCount leaf count carried in from the caller
   * @param buff buffer receiving the output
   * @return the leaf count after all children have been dumped
   */
  @Override
  protected int dumpTree(int depth, int leafCount, StringBuffer buff) {
    int runningCount = leafCount;
    for (Map.Entry<String, HNode> entry : m_children.entrySet()) {
      HNode child = entry.getValue();
      String branch = entry.getKey();
      if (child == null) {
        continue;
      }

      buff.append("\n");
      for (int level = depth; level > 0; level--) {
        buff.append("|   ");
      }
      buff.append(m_split.conditionForBranch(branch).trim()).append(": ");

      // Each child threads the running leaf count through its own dump
      runningCount = child.dumpTree(depth + 1, runningCount, buff);
    }
    return runningCount;
  }