/**
 * Load a directed graph from the given input stream.
 *
 * <p>The stream must start with a {@link MaryHeader}. If the header identifies a plain CART file
 * rather than a directed graph, the stream is rewound and delegated to {@link MaryCARTReader}.
 *
 * @param inStream the stream to read the graph from; it is wrapped in a
 *     {@link BufferedInputStream} so that mark/reset is available for the CART fallback
 * @return the fully linked {@link DirectedGraph}
 * @throws IOException if the stream has the wrong version or type, or is inconsistent
 * @throws MaryConfigurationException if a problem occurs while loading
 */
public DirectedGraph load(InputStream inStream) throws IOException, MaryConfigurationException {
    BufferedInputStream buffInStream = new BufferedInputStream(inStream);
    assert buffInStream.markSupported();
    // Mark so we can rewind to the very beginning if this turns out to be a CART file.
    // NOTE(review): assumes the MaryHeader is read within the first 10000 bytes — TODO confirm.
    buffInStream.mark(10000);

    // Open the CART-File and read the header
    DataInput raf = new DataInputStream(buffInStream);
    MaryHeader maryHeader = new MaryHeader(raf);
    if (!maryHeader.hasCurrentVersion()) {
        throw new IOException("Wrong version of database file");
    }
    if (maryHeader.getType() != MaryHeader.DIRECTED_GRAPH) {
        if (maryHeader.getType() == MaryHeader.CARTS) {
            // Plain CART file: rewind past the header we just consumed and delegate.
            buffInStream.reset();
            return new MaryCARTReader().loadFromStream(buffInStream);
        } else {
            throw new IOException("Not a directed graph file");
        }
    }

    // Read properties: a short length prefix followed by java.util.Properties data;
    // a zero length means "no properties".
    short propDataLength = raf.readShort();
    Properties props;
    if (propDataLength == 0) {
        props = null;
    } else {
        byte[] propsData = new byte[propDataLength];
        raf.readFully(propsData);
        ByteArrayInputStream bais = new ByteArrayInputStream(propsData);
        props = new Properties();
        props.load(bais);
        bais.close();
    }

    // Read the feature definition
    FeatureDefinition featureDefinition = new FeatureDefinition(raf);

    // Read the decision nodes.
    int numDecNodes = raf.readInt(); // number of decision nodes
    // First we need to read all nodes into memory, then we can link them properly
    // in terms of parent/child.
    DecisionNode[] dns = new DecisionNode[numDecNodes];
    int[][] childIndexes = new int[numDecNodes][];
    for (int i = 0; i < numDecNodes; i++) {
        // Read one decision node: feature index, node type, type-specific payload.
        int featureNameIndex = raf.readInt();
        int nodeTypeNr = raf.readInt();
        DecisionNode.Type nodeType = DecisionNode.Type.values()[nodeTypeNr];
        int numChildren = 2; // for binary nodes
        // NOTE(review): no default case — an unexpected node type leaves dns[i] null and
        // triggers a NullPointerException below instead of a descriptive error; verify all
        // DecisionNode.Type values are covered.
        switch (nodeType) {
            case BinaryByteDecisionNode:
                int criterion = raf.readInt();
                dns[i] = new DecisionNode.BinaryByteDecisionNode(
                        featureNameIndex, (byte) criterion, featureDefinition);
                break;
            case BinaryShortDecisionNode:
                criterion = raf.readInt();
                dns[i] = new DecisionNode.BinaryShortDecisionNode(
                        featureNameIndex, (short) criterion, featureDefinition);
                break;
            case BinaryFloatDecisionNode:
                float floatCriterion = raf.readFloat();
                dns[i] = new DecisionNode.BinaryFloatDecisionNode(
                        featureNameIndex, floatCriterion, featureDefinition);
                break;
            case ByteDecisionNode:
                // n-ary node: child count must match the feature's value count.
                numChildren = raf.readInt();
                if (featureDefinition.getNumberOfValues(featureNameIndex) != numChildren) {
                    throw new IOException(
                            "Inconsistent cart file: feature "
                                    + featureDefinition.getFeatureName(featureNameIndex)
                                    + " should have "
                                    + featureDefinition.getNumberOfValues(featureNameIndex)
                                    + " values, but decision node "
                                    + i
                                    + " has only "
                                    + numChildren
                                    + " child nodes");
                }
                dns[i] = new DecisionNode.ByteDecisionNode(featureNameIndex, numChildren, featureDefinition);
                break;
            case ShortDecisionNode:
                numChildren = raf.readInt();
                if (featureDefinition.getNumberOfValues(featureNameIndex) != numChildren) {
                    throw new IOException(
                            "Inconsistent cart file: feature "
                                    + featureDefinition.getFeatureName(featureNameIndex)
                                    + " should have "
                                    + featureDefinition.getNumberOfValues(featureNameIndex)
                                    + " values, but decision node "
                                    + i
                                    + " has only "
                                    + numChildren
                                    + " child nodes");
                }
                dns[i] = new DecisionNode.ShortDecisionNode(featureNameIndex, numChildren, featureDefinition);
        }
        // IDs are 1-based (0 is reserved — presumably for "no node"; verify against
        // childIndexToNode()).
        dns[i].setUniqueDecisionNodeId(i + 1);
        // Now read the children, indexes only; actual linking happens after all nodes exist.
        childIndexes[i] = new int[numChildren];
        for (int k = 0; k < numChildren; k++) {
            childIndexes[i][k] = raf.readInt();
        }
    }

    // Read the leaves.
    int numLeafNodes = raf.readInt(); // number of leaves, it does not include empty leaves
    LeafNode[] lns = new LeafNode[numLeafNodes];
    for (int j = 0; j < numLeafNodes; j++) {
        // Read one leaf node: type tag, then type-specific payload.
        int leafTypeNr = raf.readInt();
        LeafNode.LeafType leafNodeType = LeafNode.LeafType.values()[leafTypeNr];
        switch (leafNodeType) {
            case IntArrayLeafNode:
                int numData = raf.readInt();
                int[] data = new int[numData];
                for (int d = 0; d < numData; d++) {
                    data[d] = raf.readInt();
                }
                lns[j] = new LeafNode.IntArrayLeafNode(data);
                break;
            case FloatLeafNode:
                // On-disk order is (stddev, mean), matching the FloatLeafNode constructor.
                float stddev = raf.readFloat();
                float mean = raf.readFloat();
                lns[j] = new LeafNode.FloatLeafNode(new float[] {stddev, mean});
                break;
            case IntAndFloatArrayLeafNode:
            case StringAndFloatLeafNode:
                // Both types share the same wire format: parallel (int, float) pairs.
                int numPairs = raf.readInt();
                int[] ints = new int[numPairs];
                float[] floats = new float[numPairs];
                for (int d = 0; d < numPairs; d++) {
                    ints[d] = raf.readInt();
                    floats[d] = raf.readFloat();
                }
                if (leafNodeType == LeafNode.LeafType.IntAndFloatArrayLeafNode)
                    lns[j] = new LeafNode.IntAndFloatArrayLeafNode(ints, floats);
                else lns[j] = new LeafNode.StringAndFloatLeafNode(ints, floats);
                break;
            case FeatureVectorLeafNode:
                throw new IllegalArgumentException(
                        "Reading feature vector leaf nodes is not yet implemented");
            case PdfLeafNode:
                throw new IllegalArgumentException("Reading pdf leaf nodes is not yet implemented");
        }
        lns[j].setUniqueLeafId(j + 1);
    }

    // Graph nodes: each stores an index to its leaf and to its decision node;
    // resolve the indices to actual nodes after this loop.
    int numDirectedGraphNodes = raf.readInt();
    DirectedGraphNode[] graphNodes = new DirectedGraphNode[numDirectedGraphNodes];
    int[] dgnLeafIndices = new int[numDirectedGraphNodes];
    int[] dgnDecIndices = new int[numDirectedGraphNodes];
    for (int g = 0; g < numDirectedGraphNodes; g++) {
        graphNodes[g] = new DirectedGraphNode(null, null);
        graphNodes[g].setUniqueGraphNodeID(g + 1);
        dgnLeafIndices[g] = raf.readInt();
        dgnDecIndices[g] = raf.readInt();
    }

    // Now, link up the decision nodes with their daughters.
    for (int i = 0; i < numDecNodes; i++) {
        for (int k = 0; k < childIndexes[i].length; k++) {
            Node child = childIndexToNode(childIndexes[i][k], dns, lns, graphNodes);
            dns[i].addDaughter(child);
        }
    }

    // And link up directed graph nodes.
    for (int g = 0; g < numDirectedGraphNodes; g++) {
        Node leaf = childIndexToNode(dgnLeafIndices[g], dns, lns, graphNodes);
        graphNodes[g].setLeafNode(leaf);
        Node dec = childIndexToNode(dgnDecIndices[g], dns, lns, graphNodes);
        if (dec != null && !dec.isDecisionNode())
            throw new IllegalArgumentException("Only decision nodes allowed, read " + dec.getClass());
        graphNodes[g].setDecisionNode((DecisionNode) dec);
    }

    // Pick the root: first graph node, else first decision node, else single leaf, else empty.
    Node rootNode;
    if (graphNodes.length > 0) {
        rootNode = graphNodes[0];
    } else if (dns.length > 0) {
        rootNode = dns[0];
        // CART behaviour, not sure if this is needed:
        // Now count all data once, so that getNumberOfData()
        // will return the correct figure.
        ((DecisionNode) rootNode).countData();
    } else if (lns.length > 0) {
        rootNode = lns[0]; // single-leaf tree...
    } else {
        rootNode = null;
    }

    // Set the rootNode as the rootNode of the graph.
    return new DirectedGraph(rootNode, featureDefinition, props);
}
/** * Get the theoretical number of leaves, given a feature sequence. * * @return The number of leaves, or -1 if the capacity of the long integer was blown. */ public long getTheoreticalNumberOfLeaves(int[] feaSeq) { long ret = 1; for (int i = 0; i < feaSeq.length; i++) { // System.out.println( "Feature [" + i + "] has [" + // featureDefinition.getNumberOfValues( featureSequence[i] ) + "] values." // + "(Number of leaves = [" + ret + "].)" ); ret *= featureDefinition.getNumberOfValues(feaSeq[i]); if (ret < 0) return (-1); } return (ret); }
/**
 * A local sort at a particular node along the deep sorting operation. This is a recursive
 * function: it partitions the current [from, to) zone of {@code featureVectors} by the current
 * feature's value and recurses into each non-empty sub-zone with the next feature.
 *
 * @param currentFeatureIdx The currently tested feature (an index into {@code featureSequence}).
 * @param currentNode The current node, holding the currently processed zone in the array of
 *     feature vectors.
 */
private void sortNode(int currentFeatureIdx, MaryNode currentNode) {
    /* If we have reached a leaf, do a final sort according to the unit index and return: */
    if (currentFeatureIdx == featureSequence.length) {
        // cui is presumably a unit-index comparator — verify against its declaration.
        Arrays.sort(featureVectors, currentNode.from, currentNode.to, cui);
        numberOfLeaves++;
        return;
    }
    /* Else: */
    int currentFeature = featureSequence[currentFeatureIdx];
    // NOTE(review): assumes featureVectors is non-empty and all vectors share the same
    // feature types — TODO confirm at the call site.
    FeatureVector.FeatureType featureType = featureVectors[0].getFeatureType(currentFeature);
    /* Register the feature currently used for the splitting */
    currentNode.setFeatureIndex(currentFeature);
    /* Perform the sorting according to the currently considered feature: */
    /* 1) position the comparator onto the right feature */
    c.setFeatureIdx(currentFeature, featureType);
    /* 2) do the sorting */
    Arrays.sort(featureVectors, currentNode.from, currentNode.to, c);
    /* Then, seek for the zones where the feature value is the same,
     * and launch the next sort level on these. */
    int nVal = featureDefinition.getNumberOfValues(currentFeature);
    currentNode.split(nVal);
    int nextFrom = currentNode.from;
    int nextTo = currentNode.from;
    // One pass per possible feature value i: after the sort above, all vectors with value i
    // form a contiguous run starting at nextFrom.
    for (int i = 0; i < nVal; i++) {
        nextFrom = nextTo;
        // Advance nextTo to the end of the run of vectors whose feature value equals i.
        while ((nextTo < currentNode.to)
                && (featureVectors[nextTo].getFeatureAsInt(currentFeature) == i)) {
            nextTo++;
        }
        if ((nextTo - nextFrom) != 0) {
            // Non-empty zone: create a child node for it and recurse with the next feature.
            MaryNode nod = new MaryNode(nextFrom, nextTo);
            currentNode.setChild(i, nod);
            sortNode(currentFeatureIdx + 1, nod);
        } else currentNode.setChild(i, null); // empty zone: no child for this value
    }
}
/** * Export this feature definition in the "all.desc" format which can be read by wagon. * * @param out the destination of the data * @param featuresToIgnore a set of Strings containing the names of features that wagon should * ignore. Can be null. */ private void createDescFile() throws IOException { PrintWriter out = new PrintWriter(new FileOutputStream(descFile)); Set<String> featuresToIgnore = new HashSet<String>(); featuresToIgnore.add("unit_logf0"); featuresToIgnore.add("unit_duration"); int numDiscreteFeatures = featureDefinition.getNumberOfByteFeatures() + featureDefinition.getNumberOfShortFeatures(); out.println("("); out.println("(occurid cluster)"); for (int i = 0, n = featureDefinition.getNumberOfFeatures(); i < n; i++) { out.print("( "); String featureName = featureDefinition.getFeatureName(i); out.print(featureName); if (featuresToIgnore != null && featuresToIgnore.contains(featureName)) { out.print(" ignore"); } if (i < numDiscreteFeatures) { // list values for (int v = 0, vmax = featureDefinition.getNumberOfValues(i); v < vmax; v++) { out.print(" "); // Print values surrounded by double quotes, and make sure any // double quotes in the value are preceded by a backslash -- // otherwise, we get problems e.g. for sentence_punc String val = featureDefinition.getFeatureValueAsString(i, v); if (val.indexOf('"') != -1) { StringBuilder buf = new StringBuilder(); for (int c = 0; c < val.length(); c++) { char ch = val.charAt(c); if (ch == '"') buf.append("\\\""); else buf.append(ch); } val = buf.toString(); } out.print("\"" + val + "\""); } out.println(" )"); } else { // float feature out.println(" float )"); } } out.println(")"); out.close(); }