/** * Compute relative variable importance for GBM model. * * <p>See (45), (35) formulas in Friedman: Greedy Function Approximation: A Gradient boosting * machine. Algo used here can be used for computation individual importance of features per * output class. */ @Override protected VarImp doVarImpCalc( GBMModel model, DTree[] ktrees, int tid, Frame validationFrame, boolean scale) { assert model.ntrees() - 1 == tid : "varimp computation expect model with already serialized trees: tid=" + tid; // Iterates over k-tree for (DTree t : ktrees) { // Iterate over trees if (t != null) { for (int n = 0; n < t.len() - t.leaves; n++) if (t.node(n) instanceof DecidedNode) { // it is split node Split split = t.decided(n)._split; _improvPerVar[split._col] += split.improvement(); // least squares improvement } } } // Compute variable importance for all trees in model float[] varimp = new float[model.nfeatures()]; int ntreesTotal = model.ntrees() * model.nclasses(); int maxVar = 0; for (int var = 0; var < _improvPerVar.length; var++) { varimp[var] = _improvPerVar[var] / ntreesTotal; if (varimp[var] > varimp[maxVar]) maxVar = var; } // GBM scale varimp to scale 0..100 if (scale) { float maxVal = varimp[maxVar]; for (int var = 0; var < varimp.length; var++) varimp[var] /= maxVal; } return new VarImp(varimp); }
/**
 * Reads one split element from the stream and returns it populated with its flows.
 *
 * <p>The split is created from the ID attribute of the current element and receives the NEXT
 * attribute. Events other than element starts and ends are ignored. A starting FLOW element is
 * parsed with {@code parseFlow} and added to the split; the closing SPLIT element ends parsing.
 * Any other start or end element is reported through
 * {@code BatchLogger.LOGGER.unexpectedXmlElement}, as is running out of events.
 *
 * @param reader stream reader used to pull events and attributes
 * @return the parsed split
 * @throws XMLStreamException propagated from the reader calls
 */
private static Split parseSplit(final XMLStreamReader reader) throws XMLStreamException {
  final Split result = new Split(getAttributeValue(reader, XmlAttribute.ID, true));
  result.setAttributeNext(getAttributeValue(reader, XmlAttribute.NEXT, false));

  while (reader.hasNext()) {
    final int event = reader.next();
    final boolean opening = event == START_ELEMENT;
    if (!opening && event != END_ELEMENT) {
      continue;
    }

    final XmlElement tag = XmlElement.forName(reader.getLocalName());
    if (opening && tag == XmlElement.FLOW) {
      result.addFlow(parseFlow(reader));
    } else if (!opening && tag == XmlElement.SPLIT) {
      return result;
    } else {
      throw BatchLogger.LOGGER.unexpectedXmlElement(
          reader.getLocalName(), reader.getLocation());
    }
  }

  throw BatchLogger.LOGGER.unexpectedXmlElement(reader.getLocalName(), reader.getLocation());
}
/**
 * Recursively computes the preferred size of a layout node.
 *
 * <p>A Leaf delegates to {@code preferredComponentSize}; a Divider is a square whose side is
 * {@code getDividerSize()}. Any other node is cast to Split: in a row layout the child widths
 * are summed and the largest child height is used, otherwise the child heights are summed and
 * the largest child width is used.
 *
 * @param root node to measure
 * @return the preferred size of {@code root}
 */
private Dimension preferredNodeSize(Node root) {
  if (root instanceof Leaf) {
    return preferredComponentSize(root);
  }
  if (root instanceof Divider) {
    int side = getDividerSize();
    return new Dimension(side, side);
  }

  Split split = (Split) root;
  List<Node> children = split.getChildren();
  boolean inRow = split.isRowLayout();
  int totalWidth = 0;
  int totalHeight = 0;
  for (Node child : children) {
    Dimension childSize = preferredNodeSize(child);
    if (inRow) {
      totalWidth += childSize.width;
      totalHeight = Math.max(totalHeight, childSize.height);
    } else {
      totalWidth = Math.max(totalWidth, childSize.width);
      totalHeight += childSize.height;
    }
  }
  return new Dimension(totalWidth, totalHeight);
}
/**
 * Splits a one-column input row on ":" and checks that each output row carries the original
 * string in column 0 and one piece of it in column 1, in order.
 */
@Test
public void testGeneratedSplitsSingleColumn() throws DbException {
  final Schema inputSchema = Schema.ofFields("string", Type.STRING_TYPE);
  final Schema outputSchema =
      Schema.appendColumn(inputSchema, Type.STRING_TYPE, "string_splits");
  final String[][] expectedResults = {
    {"foo:bar:baz", "foo"},
    {"foo:bar:baz", "bar"},
    {"foo:bar:baz", "baz"}
  };

  final TupleBatchBuffer input = new TupleBatchBuffer(inputSchema);
  input.putString(0, "foo:bar:baz");

  Split splitOp = new Split(new TupleSource(input), 0, ":");
  splitOp.open(TestEnvVars.get());

  int seen = 0;
  while (!splitOp.eos()) {
    TupleBatch batch = splitOp.nextReady();
    if (batch == null) {
      continue;
    }
    assertEquals(outputSchema, batch.getSchema());
    for (int i = 0; i < batch.numTuples(); ++i) {
      String[] expectedRow = expectedResults[seen];
      assertEquals(expectedRow[0], batch.getString(0, i));
      assertEquals(expectedRow[1], batch.getString(1, i));
      ++seen;
    }
  }
  assertEquals(expectedResults.length, seen);
  splitOp.close();
}
@Test public void testGeneratedSplits() throws DbException { final Object[][] expectedResults = { {true, "foo:bar:baz", 1L, 0.1, "foo"}, {true, "foo:bar:baz", 1L, 0.1, "bar"}, {true, "foo:bar:baz", 1L, 0.1, "baz"}, {false, ":qux::", 2L, 0.2, ""}, {false, ":qux::", 2L, 0.2, "qux"}, {false, ":qux::", 2L, 0.2, ""}, {false, ":qux::", 2L, 0.2, ""} }; final Schema schema = Schema.ofFields( "bool", Type.BOOLEAN_TYPE, "string", Type.STRING_TYPE, "long", Type.LONG_TYPE, "double", Type.DOUBLE_TYPE); final Schema expectedResultSchema = Schema.appendColumn(schema, Type.STRING_TYPE, "string_splits"); final TupleBatchBuffer input = new TupleBatchBuffer(schema); // First row to explode input.putBoolean(0, true); input.putString(1, "foo:bar:baz"); input.putLong(2, 1L); input.putDouble(3, 0.1); // Second row to explode input.putBoolean(0, false); input.putString(1, ":qux::"); input.putLong(2, 2L); input.putDouble(3, 0.2); Split splitOp = new Split(new TupleSource(input), 1, ":"); splitOp.open(TestEnvVars.get()); int rowIdx = 0; while (!splitOp.eos()) { TupleBatch result = splitOp.nextReady(); if (result != null) { assertEquals(expectedResultSchema, result.getSchema()); for (int batchIdx = 0; batchIdx < result.numTuples(); ++batchIdx, ++rowIdx) { assertEquals( ((Boolean) expectedResults[rowIdx][0]).booleanValue(), result.getBoolean(0, batchIdx)); assertEquals((expectedResults[rowIdx][1]).toString(), result.getString(1, batchIdx)); assertEquals( ((Long) expectedResults[rowIdx][2]).longValue(), result.getLong(2, batchIdx)); assertEquals( Double.doubleToLongBits(((Double) expectedResults[rowIdx][3]).doubleValue()), Double.doubleToLongBits(result.getDouble(3, batchIdx))); assertEquals((expectedResults[rowIdx][4]).toString(), result.getString(4, batchIdx)); } } } assertEquals(expectedResults.length, rowIdx); splitOp.close(); }
/**
 * Consumes tokens describing the children of {@code parent} until a closing parenthesis or the
 * end of the stream.
 *
 * <p>A bare word is either the WEIGHT attribute (handed to {@code parseAttribute}) or the name of
 * a new Leaf child. An opening parenthesis must be followed by a node type word: LEAF is handled
 * by {@code parseLeaf}; ROW and COLUMN create a nested Split that is added to {@code parent} and
 * parsed recursively; anything else is reported with {@code throwParseException}. Other tokens
 * are ignored.
 *
 * @param st tokenizer positioned inside the split's parentheses
 * @param parent split receiving the parsed children
 * @throws Exception declared for the parsing helpers this method calls
 */
private static void parseSplit(StreamTokenizer st, Split parent) throws Exception {
  for (int token = st.nextToken(); token != StreamTokenizer.TT_EOF; token = st.nextToken()) {
    if (token == ')') {
      return;
    }

    if (token == StreamTokenizer.TT_WORD) {
      if (st.sval.equalsIgnoreCase("WEIGHT")) {
        parseAttribute(st.sval, st, parent);
      } else {
        addSplitChild(parent, new Leaf(st.sval));
      }
      continue;
    }

    if (token != '(') {
      continue;
    }

    if (st.nextToken() != StreamTokenizer.TT_WORD) {
      throwParseException(st, "invalid node type");
    }
    String nodeType = st.sval.toUpperCase();
    boolean isRow = nodeType.equals("ROW");
    if (nodeType.equals("LEAF")) {
      parseLeaf(st, parent);
    } else if (isRow || nodeType.equals("COLUMN")) {
      Split nested = new Split();
      nested.setRowLayout(isRow);
      addSplitChild(parent, nested);
      parseSplit(st, nested);
    } else {
      throwParseException(st, "unrecognized node type '" + nodeType + "'");
    }
  }
}
/**
 * Recursively validates the structure of a layout tree; nodes that are not Splits are accepted
 * as they are.
 *
 * <p>For a Split the checks are: it has more than two children; its children alternate between
 * a non-Divider node and a Divider, starting with a non-Divider; and the weights of the
 * non-Divider children add up to at most 1.0 (with a small tolerance). Violations are reported
 * through {@code throwInvalidLayout}.
 *
 * @param root node to validate
 */
private void checkLayout(Node root) {
  if (!(root instanceof Split)) {
    return;
  }
  Split split = (Split) root;
  if (split.getChildren().size() <= 2) {
    throwInvalidLayout("Split must have > 2 children", root);
  }

  double totalWeight = 0.0;
  for (Iterator<Node> it = split.getChildren().iterator(); it.hasNext(); ) {
    Node content = it.next();
    if (content instanceof Divider) {
      throwInvalidLayout("expected a Split or Leaf Node", content);
    }
    // Every content node except the last one must be followed by a Divider
    if (it.hasNext()) {
      Node separator = it.next();
      if (!(separator instanceof Divider)) {
        throwInvalidLayout("expected a Divider Node", separator);
      }
    }
    totalWeight += content.getWeight();
    checkLayout(content);
  }

  // Allow a small epsilon when comparing the accumulated double
  if (totalWeight > 1.0 + 0.000000001) {
    throwInvalidLayout("Split children's total weight > 1.0", root);
  }
}
/**
 * Stores an entry in the tree.
 *
 * <p>An entry with a {@code null} key is handed to {@code updateNullEntry}; an entry with a
 * {@code null} value is treated as a removal of its key. Otherwise the entry is passed to the
 * root node. When the root reports {@code null} or a Split, the size is incremented and
 * {@code null} is returned; for a Split the root is first replaced by a new InnerNode built from
 * the old root, the split key and the split's greater part. Any other result of the root's
 * {@code put} is returned, cast to {@code E}.
 *
 * @param entry entry to store
 * @return the result described above
 */
@SuppressWarnings("unchecked")
@Override
public <E extends Entry<K, V>> E put(E entry) {
  if (entry.getKey() == null) {
    return updateNullEntry(entry);
  }
  if (entry.getValue() == null) {
    return remove(entry.getKey());
  }

  Object outcome = root.put(conf, entry);
  if (outcome != null && !(outcome instanceof Split)) {
    return (E) outcome;
  }

  if (outcome != null) {
    // The root overflowed: grow the tree by one level
    Split<K, V> overflow = (Split<K, V>) outcome;
    this.root = new InnerNode<K, V>(conf, root, overflow.getKey(), overflow.getGreater());
  }
  this.size++;
  return null;
}
/**
 * Recursively computes the minimum size of a layout node.
 *
 * <p>A Leaf uses the minimum size of its component, or 0x0 when {@code childForNode} finds no
 * component; a Divider is a square whose side is {@code getDividerSize()}. Any other node is
 * cast to Split: in a row layout the child widths are summed and the largest child height is
 * used, otherwise the child heights are summed and the largest child width is used.
 *
 * @param root node to measure
 * @return the minimum size of {@code root}
 */
private Dimension minimumNodeSize(Node root) {
  if (root instanceof Leaf) {
    Component component = childForNode(root);
    if (component == null) {
      return new Dimension(0, 0);
    }
    return component.getMinimumSize();
  }
  if (root instanceof Divider) {
    int side = getDividerSize();
    return new Dimension(side, side);
  }

  Split split = (Split) root;
  List<Node> children = split.getChildren();
  boolean inRow = split.isRowLayout();
  int totalWidth = 0;
  int totalHeight = 0;
  for (Node child : children) {
    Dimension childSize = minimumNodeSize(child);
    if (inRow) {
      totalWidth += childSize.width;
      totalHeight = Math.max(totalHeight, childSize.height);
    } else {
      totalWidth = Math.max(totalWidth, childSize.width);
      totalHeight += childSize.height;
    }
  }
  return new Dimension(totalWidth, totalHeight);
}
/**
 * Tests output that spans multiple batches. The integers 0 through
 * {@code 2 * TupleBatch.BATCH_SIZE} are joined into one comma-separated string; the result must
 * contain each of those integers in its own row, in order.
 *
 * @throws DbException declared for the operator calls made by the test
 */
@Test
public void testAllBatchesReturned() throws DbException {
  final Schema inputSchema = Schema.ofFields("joined_ints", Type.STRING_TYPE);
  final Schema outputSchema =
      Schema.appendColumn(inputSchema, Type.STRING_TYPE, "joined_ints_splits");
  final long expectedResults = 2 * TupleBatch.BATCH_SIZE + 1;

  StringBuilder joined = new StringBuilder();
  for (int i = 0; i < expectedResults; ++i) {
    if (i > 0) {
      joined.append(",");
    }
    joined.append(i);
  }
  final TupleBatchBuffer input = new TupleBatchBuffer(inputSchema);
  input.putString(0, joined.toString());

  Split splitOp = new Split(new TupleSource(input), 0, ",");
  splitOp.open(TestEnvVars.get());

  long seen = 0;
  while (!splitOp.eos()) {
    TupleBatch batch = splitOp.nextReady();
    if (batch == null) {
      continue;
    }
    assertEquals(outputSchema, batch.getSchema());
    for (int i = 0; i < batch.numTuples(); ++i) {
      assertEquals(seen, Integer.parseInt(batch.getString(1, i)));
      ++seen;
    }
  }
  assertEquals(expectedResults, seen);
  splitOp.close();
}
/** Expects a PatternSyntaxException when the split operator is given the malformed regex "?:(". */
@Test(expected = PatternSyntaxException.class)
public void testInvalidRegex() throws DbException {
  final TupleBatchBuffer input =
      new TupleBatchBuffer(Schema.ofFields("string", Type.STRING_TYPE));
  input.putString(0, "foo");

  final TupleSource source = new TupleSource(input);
  Split splitOp = new Split(source, 0, "?:(");
  splitOp.open(TestEnvVars.get());
}
/**
 * Runs the source behind every enabled split.
 *
 * <p>For each enabled split, the flow node is looked up from the split's target text via
 * {@code FlowController.getSourceFromContainedText} and its source is run with the given
 * arguments.
 *
 * @param whileWaiting passed through to each source's {@code run}
 * @param map passed through to each source's {@code run}
 */
public void run(Boolean whileWaiting, HashMap<String, Object> map) {
  for (Split candidate : splits) {
    if (!candidate.isEnabled()) {
      continue;
    }
    FlowNode targetNode = FlowController.getSourceFromContainedText(candidate.getTarget());
    targetNode.getSource().run(whileWaiting, map);
  }
}
/**
 * Sets the enabled flag on every split whose id equals {@code splitId} and saves each updated
 * split through {@code DataBank.saveSplit}.
 *
 * @param splitId id of the split(s) to update
 * @param enabled new value of the enabled flag
 */
public void updateSplitEnabled(Integer splitId, Boolean enabled) {
  for (Split candidate : splits) {
    if (!candidate.getId().equals(splitId)) {
      continue;
    }
    candidate.setEnabled(enabled);
    DataBank.saveSplit(candidate);
  }
}
/** Expects an IllegalStateException when the column to split holds longs instead of strings. */
@Test(expected = IllegalStateException.class)
public void testSplitColumnInvalidType() throws DbException {
  final TupleBatchBuffer input = new TupleBatchBuffer(Schema.ofFields("long", Type.LONG_TYPE));
  input.putLong(0, 1L);

  final TupleSource source = new TupleSource(input);
  Split splitOp = new Split(source, 0, ":");
  splitOp.open(TestEnvVars.get());
}
/**
 * Sets the target on every split whose id equals {@code splitId} and saves each updated split
 * through {@code DataBank.saveSplit}.
 *
 * @param splitId id of the split(s) to update
 * @param target new target value
 */
public void updateSplitTarget(Integer splitId, String target) {
  for (Split candidate : splits) {
    if (!candidate.getId().equals(splitId)) {
      continue;
    }
    candidate.setTarget(target);
    DataBank.saveSplit(candidate);
  }
}
/**
 * Returns the sibling located {@code offset} positions away from this node in its parent's
 * child list.
 *
 * @param offset signed distance from this node's position
 * @return the sibling, or {@code null} when there is no parent, this node is not among the
 *     parent's children, or the resulting position is outside the list
 */
private Node siblingAtOffset(int offset) {
  Split owner = parent_get();
  if (owner == null) {
    return null;
  }

  List<Node> peers = owner.getChildren();
  int ownPosition = peers.indexOf(this);
  if (ownPosition == -1) {
    return null;
  }

  int wanted = ownPosition + offset;
  if (wanted < 0 || wanted >= peers.size()) {
    return null;
  }
  return peers.get(wanted);
}
/**
 * Looks up a split by id.
 *
 * @param id id to search for
 * @return the first split whose id equals {@code id}, or {@code null} if there is none
 */
public Split getSplit(Integer id) {
  Split found = null;
  for (Split candidate : splits) {
    if (candidate.getId().equals(id)) {
      found = candidate;
      break;
    }
  }
  return found;
}
/**
 * Appends {@code child} to {@code parent}'s children, preceded by a new Divider unless it is the
 * first child. The parent receives a fresh list through {@code setChildren}.
 *
 * @param parent split to extend
 * @param child node to append
 */
private static void addSplitChild(Split parent, Node child) {
  List<Node> updated = new ArrayList<Node>(parent.getChildren());
  if (!updated.isEmpty()) {
    updated.add(new Divider());
  }
  updated.add(child);
  parent.setChildren(updated);
}
/**
 * Prints a layout tree to standard output, one node per line, with the children of a Split
 * printed at a deeper indent.
 *
 * @param indent prefix printed before {@code root}
 * @param root node to print
 */
private static void printModel(String indent, Node root) {
  System.out.println(indent + root);
  if (!(root instanceof Split)) {
    return;
  }
  String childIndent = indent + " ";
  for (Node child : ((Split) root).getChildren()) {
    printModel(childIndent, child);
  }
}
/**
 * Finds the Divider containing the point (x, y) within the subtree rooted at {@code root}.
 *
 * <p>For a Split, the search descends only into the first child whose bounds contain the point.
 *
 * @param root subtree to search
 * @param x x coordinate of the point
 * @param y y coordinate of the point
 * @return the Divider at the point, or {@code null} if none is found
 */
private Divider dividerAt(Node root, int x, int y) {
  if (root instanceof Divider) {
    Divider candidate = (Divider) root;
    if (candidate.getBounds().contains(x, y)) {
      return candidate;
    }
    return null;
  }
  if (!(root instanceof Split)) {
    return null;
  }
  for (Node child : ((Split) root).getChildren()) {
    if (child.getBounds().contains(x, y)) {
      return dividerAt(child, x, y);
    }
  }
  return null;
}
/**
 * Parses a layout model from {@code r}.
 *
 * <p>The text is parsed into a temporary Split and that split's first child is returned. Any
 * exception raised while parsing is passed to {@code Main.error}; the reader is always handed to
 * {@code Utils.close}.
 *
 * @param r source of the model text
 * @return the parsed root node, or {@code null} if parsing failed
 */
private static Node parseModel(Reader r) {
  StreamTokenizer tokens = new StreamTokenizer(r);
  Node model = null;
  try {
    Split holder = new Split();
    parseSplit(tokens, holder);
    model = holder.getChildren().get(0);
  } catch (Exception e) {
    Main.error(e);
  } finally {
    Utils.close(r);
  }
  return model;
}
/**
 * Sets the output of every leaf of {@code rt} to the mean of {@code pseudoResponses} over the
 * samples assigned to that leaf.
 *
 * @param rt tree whose leaves are updated
 */
protected void updateTreeOutput(RegressionTree rt) {
  for (Split leaf : rt.leaves()) {
    int[] members = leaf.getSamples();
    float total = 0.0F;
    for (int sample : members) {
      total += pseudoResponses[sample];
    }
    leaf.setOutput(total / members.length);
  }
}
private RuleModel createNumericalRuleModel(ExampleSet trainingSet, Attribute attribute) { RuleModel model = new RuleModel(trainingSet); // split by best attribute int oldSize = -1; while ((trainingSet.size() > 0) && (trainingSet.size() != oldSize)) { ExampleSet exampleSet = (ExampleSet) trainingSet.clone(); Split bestSplit = splitter.getBestSplit(exampleSet, attribute, null); double bestSplitValue = bestSplit.getSplitPoint(); if (!Double.isNaN(bestSplitValue)) { SplittedExampleSet splittedSet = SplittedExampleSet.splitByAttribute(exampleSet, attribute, bestSplitValue); Attribute label = splittedSet.getAttributes().getLabel(); splittedSet.selectSingleSubset(0); SplitCondition condition = new LessEqualsSplitCondition(attribute, bestSplitValue); splittedSet.recalculateAttributeStatistics(label); int labelValue = (int) splittedSet.getStatistics(label, Statistics.MODE); String labelName = label.getMapping().mapIndex(labelValue); Rule rule = new Rule(labelName, condition); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) splittedSet.getStatistics(label, Statistics.COUNT, value); rule.setFrequencies(frequencies); model.addRule(rule); oldSize = trainingSet.size(); trainingSet = rule.removeCovered(trainingSet); } else { break; } } // add default rule if some examples were not yet covered if (trainingSet.size() > 0) { Attribute label = trainingSet.getAttributes().getLabel(); trainingSet.recalculateAttributeStatistics(label); int index = (int) trainingSet.getStatistics(label, Statistics.MODE); String defaultLabel = label.getMapping().mapIndex(index); Rule defaultRule = new Rule(defaultLabel); int[] frequencies = new int[label.getMapping().size()]; int counter = 0; for (String value : label.getMapping().getValues()) frequencies[counter++] = (int) (trainingSet.getStatistics(label, Statistics.COUNT, value)); defaultRule.setFrequencies(frequencies); 
model.addRule(defaultRule); } return model; }
/**
 * Sets the bounds of {@code split} to a rectangle that starts at the origin of {@code bounds}
 * and is extended to include one corner point: in a row layout the right edge of the last child
 * and the bottom of {@code bounds}; otherwise the right edge of {@code bounds} and the bottom
 * edge of the last child.
 *
 * @param split split whose bounds are replaced
 * @param bounds rectangle the split was laid out in
 */
private void minimizeSplitBounds(Split split, Rectangle bounds) {
  List<Node> children = split.getChildren();
  Rectangle tail = children.get(children.size() - 1).getBounds();

  int cornerX;
  int cornerY;
  if (split.isRowLayout()) {
    cornerX = tail.x + tail.width;
    cornerY = bounds.y + bounds.height;
  } else {
    cornerX = bounds.x + bounds.width;
    cornerY = tail.y + tail.height;
  }

  Rectangle shrunk = new Rectangle(bounds.x, bounds.y, 0, 0);
  shrunk.add(cornerX, cornerY);
  split.setBounds(shrunk);
}
/**
 * Parses a layout model from {@code r}.
 *
 * <p>The text is parsed into a temporary Split and that split's first child is returned. Any
 * exception raised while parsing is printed to {@code System.err}. The reader is always closed;
 * an IOException from closing it is ignored.
 *
 * @param r source of the model text
 * @return the parsed root node, or {@code null} if parsing failed
 */
private static Node parseModel(Reader r) {
  StreamTokenizer tokens = new StreamTokenizer(r);
  Node parsed = null;
  try {
    Split container = new Split();
    parseSplit(tokens, container);
    parsed = container.getChildren().get(0);
  } catch (Exception e) {
    System.err.println(e);
  } finally {
    try {
      r.close();
    } catch (IOException ignore) {
      // closing is best effort
    }
  }
  return parsed;
}
@Override public ImmutableBinaryTree decompose(ArrayList<Vertex<Integer>> vertices) { Split split = createSplit(0, this, vertices); ArrayList<Vertex<Integer>> ordering = new ArrayList<>(); while (!split.done()) { split = split.decomposeAdvance(); ordering.add(split.getLastMoved()); } // System.out.printf("ordering: %s\n", Util.labels(ordering)); int i = 0; for (Vertex<Integer> v : ordering) { VertexLabel.setOrder(v, Integer.toString(i)); i += 1; } return getCaterpillarIBTFromOrdering(ordering); }
/**
 * Appends a textual graph description of this split node and its subtree to {@code buff}.
 *
 * <p>For the first non-null child a node line {@code N<num> [label="<attribute>"]} is written,
 * where the attribute name is the part of the branch condition before its comparison operator.
 * For every non-null child an edge line {@code N<num>->N<childNum>[label="<condition>"]} is
 * written, labelled with the condition starting just before the operator. Finally every non-null
 * child appends its own subtree.
 *
 * @param buff buffer receiving the description
 */
@Override
public void graphTree(StringBuffer buff) {
  boolean first = true;
  for (Map.Entry<String, HNode> e : m_children.entrySet()) {
    HNode child = e.getValue();
    String branch = e.getKey();
    if (child == null) {
      continue;
    }

    String conditionForBranch = m_split.conditionForBranch(branch);
    if (first) {
      // The attribute name ends where the comparison operator starts.
      int nameEnd;
      if (conditionForBranch.indexOf("<=") >= 0) {
        nameEnd = conditionForBranch.indexOf("<");
      } else {
        nameEnd = conditionForBranch.indexOf("=");
        if (nameEnd < 0) {
          // Neither "<=" nor "=": the first populated branch is a ">" condition. Without this
          // fallback substring(0, -1) would throw.
          nameEnd = conditionForBranch.indexOf(">");
        }
      }
      String testAttName = conditionForBranch.substring(0, nameEnd).trim();
      first = false;
      buff.append("N" + m_nodeNum + " [label=\"" + testAttName + "\"]\n");
    }

    // The edge label starts one character before the operator.
    int startIndex = 0;
    if (conditionForBranch.indexOf("<=") > 0) {
      startIndex = conditionForBranch.indexOf("<") - 1;
    } else if (conditionForBranch.indexOf("=") > 0) {
      startIndex = conditionForBranch.indexOf("=") - 1;
    } else {
      startIndex = conditionForBranch.indexOf(">") - 1;
    }
    conditionForBranch =
        conditionForBranch.substring(startIndex, conditionForBranch.length()).trim();

    buff.append(
            "N"
                + m_nodeNum
                + "->"
                + "N"
                + child.m_nodeNum
                + "[label=\""
                + conditionForBranch
                + "\"]\n")
        .append("\n");
  }

  for (Map.Entry<String, HNode> e : m_children.entrySet()) {
    HNode child = e.getValue();
    if (child != null) {
      child.graphTree(buff);
    }
  }
}
/** * Each input node @n corersponds to a <split> tag in the model file. * * @param n * @return */ private Split create(Node n) { Split s = null; if (n.getFirstChild().getNodeName().compareToIgnoreCase("feature") == 0) // this is a split { NodeList nl = n.getChildNodes(); int fid = Integer.parseInt( nl.item(0).getFirstChild().getNodeValue().toString().trim()); // <feature> float threshold = Float.parseFloat( nl.item(1).getFirstChild().getNodeValue().toString().trim()); // <threshold> s = new Split(fid, threshold, 0); s.setLeft(create(nl.item(2))); s.setRight(create(nl.item(3))); } else // this is a stump { float output = Float.parseFloat(n.getFirstChild().getFirstChild().getNodeValue().toString().trim()); s = new Split(); s.setOutput(output); } return s; }
/* Second pass of the layout algorithm: branch to layoutGrow/Shrink * as needed. */ private void layout2(Node root, Rectangle bounds) { if (root instanceof Leaf) { Component child = childForNode(root); if (child != null) { child.setBounds(bounds); } root.setBounds(bounds); } else if (root instanceof Divider) { root.setBounds(bounds); } else if (root instanceof Split) { Split split = (Split) root; boolean grow = split.isRowLayout() ? (split.getBounds().width <= bounds.width) : (split.getBounds().height <= bounds.height); if (grow) { layoutGrow(split, bounds); root.setBounds(bounds); } else { layoutShrink(split, bounds); // split.setBounds() called in layoutShrink() } } }
/**
 * Appends a textual dump of this node's subtree to {@code buff}.
 *
 * <p>Each non-null child starts a new line made of {@code depth} indent markers, the trimmed
 * condition of its branch and ": ", followed by the child's own dump at {@code depth + 1}.
 *
 * @param depth nesting depth used for the indent of this node's branches
 * @param leafCount leaf count so far; threaded through the children's dumps
 * @param buff buffer receiving the dump
 * @return the leaf count returned by the last child dumped, or {@code leafCount} if no child was
 *     dumped
 */
@Override
protected int dumpTree(int depth, int leafCount, StringBuffer buff) {
  int runningCount = leafCount;
  for (Map.Entry<String, HNode> entry : m_children.entrySet()) {
    HNode child = entry.getValue();
    if (child == null) {
      continue;
    }

    buff.append("\n");
    for (int level = depth; level > 0; level--) {
      buff.append("| ");
    }
    buff.append(m_split.conditionForBranch(entry.getKey()).trim()).append(": ");
    runningCount = child.dumpTree(depth + 1, runningCount, buff);
  }
  return runningCount;
}