Example #1
2
  /**
   * Load the directed graph from the given file
   *
   * @param fileName the file to load the cart from
   * @param featDefinition the feature definition
   * @param dummy unused, just here for compatibility with the FeatureFileIndexer.
   * @throws IOException , {@link MaryConfigurationException} if a problem occurs while loading
   */
  public DirectedGraph load(InputStream inStream) throws IOException, MaryConfigurationException {
    BufferedInputStream buffInStream = new BufferedInputStream(inStream);
    assert buffInStream.markSupported();
    buffInStream.mark(10000);
    // open the CART-File and read the header
    DataInput raf = new DataInputStream(buffInStream);

    MaryHeader maryHeader = new MaryHeader(raf);
    if (!maryHeader.hasCurrentVersion()) {
      throw new IOException("Wrong version of database file");
    }
    if (maryHeader.getType() != MaryHeader.DIRECTED_GRAPH) {
      if (maryHeader.getType() == MaryHeader.CARTS) {
        buffInStream.reset();
        return new MaryCARTReader().loadFromStream(buffInStream);
      } else {
        throw new IOException("Not a directed graph file");
      }
    }

    // Read properties
    short propDataLength = raf.readShort();
    Properties props;
    if (propDataLength == 0) {
      props = null;
    } else {
      byte[] propsData = new byte[propDataLength];
      raf.readFully(propsData);
      ByteArrayInputStream bais = new ByteArrayInputStream(propsData);
      props = new Properties();
      props.load(bais);
      bais.close();
    }

    // Read the feature definition
    FeatureDefinition featureDefinition = new FeatureDefinition(raf);

    // read the decision nodes
    int numDecNodes = raf.readInt(); // number of decision nodes

    // First we need to read all nodes into memory, then we can link them properly
    // in terms of parent/child.
    DecisionNode[] dns = new DecisionNode[numDecNodes];
    int[][] childIndexes = new int[numDecNodes][];
    for (int i = 0; i < numDecNodes; i++) {
      // read one decision node
      int featureNameIndex = raf.readInt();
      int nodeTypeNr = raf.readInt();
      DecisionNode.Type nodeType = DecisionNode.Type.values()[nodeTypeNr];
      int numChildren = 2; // for binary nodes
      switch (nodeType) {
        case BinaryByteDecisionNode:
          int criterion = raf.readInt();
          dns[i] =
              new DecisionNode.BinaryByteDecisionNode(
                  featureNameIndex, (byte) criterion, featureDefinition);
          break;
        case BinaryShortDecisionNode:
          criterion = raf.readInt();
          dns[i] =
              new DecisionNode.BinaryShortDecisionNode(
                  featureNameIndex, (short) criterion, featureDefinition);
          break;
        case BinaryFloatDecisionNode:
          float floatCriterion = raf.readFloat();
          dns[i] =
              new DecisionNode.BinaryFloatDecisionNode(
                  featureNameIndex, floatCriterion, featureDefinition);
          break;
        case ByteDecisionNode:
          numChildren = raf.readInt();
          if (featureDefinition.getNumberOfValues(featureNameIndex) != numChildren) {
            throw new IOException(
                "Inconsistent cart file: feature "
                    + featureDefinition.getFeatureName(featureNameIndex)
                    + " should have "
                    + featureDefinition.getNumberOfValues(featureNameIndex)
                    + " values, but decision node "
                    + i
                    + " has only "
                    + numChildren
                    + " child nodes");
          }
          dns[i] =
              new DecisionNode.ByteDecisionNode(featureNameIndex, numChildren, featureDefinition);
          break;
        case ShortDecisionNode:
          numChildren = raf.readInt();
          if (featureDefinition.getNumberOfValues(featureNameIndex) != numChildren) {
            throw new IOException(
                "Inconsistent cart file: feature "
                    + featureDefinition.getFeatureName(featureNameIndex)
                    + " should have "
                    + featureDefinition.getNumberOfValues(featureNameIndex)
                    + " values, but decision node "
                    + i
                    + " has only "
                    + numChildren
                    + " child nodes");
          }
          dns[i] =
              new DecisionNode.ShortDecisionNode(featureNameIndex, numChildren, featureDefinition);
      }
      dns[i].setUniqueDecisionNodeId(i + 1);
      // now read the children, indexes only:
      childIndexes[i] = new int[numChildren];
      for (int k = 0; k < numChildren; k++) {
        childIndexes[i][k] = raf.readInt();
      }
    }

    // read the leaves
    int numLeafNodes = raf.readInt(); // number of leaves, it does not include empty leaves
    LeafNode[] lns = new LeafNode[numLeafNodes];

    for (int j = 0; j < numLeafNodes; j++) {
      // read one leaf node
      int leafTypeNr = raf.readInt();
      LeafNode.LeafType leafNodeType = LeafNode.LeafType.values()[leafTypeNr];
      switch (leafNodeType) {
        case IntArrayLeafNode:
          int numData = raf.readInt();
          int[] data = new int[numData];
          for (int d = 0; d < numData; d++) {
            data[d] = raf.readInt();
          }
          lns[j] = new LeafNode.IntArrayLeafNode(data);
          break;
        case FloatLeafNode:
          float stddev = raf.readFloat();
          float mean = raf.readFloat();
          lns[j] = new LeafNode.FloatLeafNode(new float[] {stddev, mean});
          break;
        case IntAndFloatArrayLeafNode:
        case StringAndFloatLeafNode:
          int numPairs = raf.readInt();
          int[] ints = new int[numPairs];
          float[] floats = new float[numPairs];
          for (int d = 0; d < numPairs; d++) {
            ints[d] = raf.readInt();
            floats[d] = raf.readFloat();
          }
          if (leafNodeType == LeafNode.LeafType.IntAndFloatArrayLeafNode)
            lns[j] = new LeafNode.IntAndFloatArrayLeafNode(ints, floats);
          else lns[j] = new LeafNode.StringAndFloatLeafNode(ints, floats);
          break;
        case FeatureVectorLeafNode:
          throw new IllegalArgumentException(
              "Reading feature vector leaf nodes is not yet implemented");
        case PdfLeafNode:
          throw new IllegalArgumentException("Reading pdf leaf nodes is not yet implemented");
      }
      lns[j].setUniqueLeafId(j + 1);
    }

    // Graph nodes
    int numDirectedGraphNodes = raf.readInt();
    DirectedGraphNode[] graphNodes = new DirectedGraphNode[numDirectedGraphNodes];
    int[] dgnLeafIndices = new int[numDirectedGraphNodes];
    int[] dgnDecIndices = new int[numDirectedGraphNodes];
    for (int g = 0; g < numDirectedGraphNodes; g++) {
      graphNodes[g] = new DirectedGraphNode(null, null);
      graphNodes[g].setUniqueGraphNodeID(g + 1);
      dgnLeafIndices[g] = raf.readInt();
      dgnDecIndices[g] = raf.readInt();
    }

    // Now, link up the decision nodes with their daughters
    for (int i = 0; i < numDecNodes; i++) {
      // System.out.print(dns[i]+" "+dns[i].getFeatureName()+" ");
      for (int k = 0; k < childIndexes[i].length; k++) {
        Node child = childIndexToNode(childIndexes[i][k], dns, lns, graphNodes);
        dns[i].addDaughter(child);
        // System.out.print(" "+dns[i].getDaughter(k));
      }
      // System.out.println();
    }
    // And link up directed graph nodes
    for (int g = 0; g < numDirectedGraphNodes; g++) {
      Node leaf = childIndexToNode(dgnLeafIndices[g], dns, lns, graphNodes);
      graphNodes[g].setLeafNode(leaf);
      Node dec = childIndexToNode(dgnDecIndices[g], dns, lns, graphNodes);
      if (dec != null && !dec.isDecisionNode())
        throw new IllegalArgumentException("Only decision nodes allowed, read " + dec.getClass());
      graphNodes[g].setDecisionNode((DecisionNode) dec);
      // System.out.println("Graph node "+(g+1)+", leaf: "+Integer.toHexString(dgnLeafIndices[g])+",
      // "+leaf+" -- dec: "+Integer.toHexString(dgnDecIndices[g])+", "+dec);
    }

    Node rootNode;
    if (graphNodes.length > 0) {
      rootNode = graphNodes[0];
    } else if (dns.length > 0) {
      rootNode = dns[0];
      // CART behaviour, not sure if this is needed:
      // Now count all data once, so that getNumberOfData()
      // will return the correct figure.
      ((DecisionNode) rootNode).countData();
    } else if (lns.length > 0) {
      rootNode = lns[0]; // single-leaf tree...
    } else {
      rootNode = null;
    }

    // set the rootNode as the rootNode of cart
    return new DirectedGraph(rootNode, featureDefinition, props);
  }
Example #2
0
 /**
  * Get the theoretical number of leaves, given a feature sequence.
  *
  * @return The number of leaves, or -1 if the capacity of the long integer was blown.
  */
 public long getTheoreticalNumberOfLeaves(int[] feaSeq) {
   long ret = 1;
   for (int i = 0; i < feaSeq.length; i++) {
     //          System.out.println( "Feature [" + i + "] has [" +
     // featureDefinition.getNumberOfValues( featureSequence[i] ) + "] values."
     //          + "(Number of leaves = [" + ret + "].)" );
     ret *= featureDefinition.getNumberOfValues(feaSeq[i]);
     if (ret < 0) return (-1);
   }
   return (ret);
 }
Example #3
0
  /**
   * A local sort at a particular node along the deep sorting operation. This is a recursive
   * function.
   *
   * @param currentFeatureIdx The currently tested feature.
   * @param currentNode The current node, holding the currently processed zone in the array of
   *     feature vectors.
   */
  private void sortNode(int currentFeatureIdx, MaryNode currentNode) {
    /* If we have reached a leaf, do a final sort according to the unit index and return: */
    if (currentFeatureIdx == featureSequence.length) {
      Arrays.sort(featureVectors, currentNode.from, currentNode.to, cui);
      numberOfLeaves++;
      /*System.out.print( "LEAF ! (" + (currentNode.to-currentNode.from) + " units)" );
      for ( int i = currentNode.from; i < currentNode.to; i++ ) {
          System.out.print( " (" + featureVectors[i].getUnitIndex() + " 0)" );
      }
      System.out.println( "" );*/
      return;
    }
    /* Else: */
    int currentFeature = featureSequence[currentFeatureIdx];
    FeatureVector.FeatureType featureType = featureVectors[0].getFeatureType(currentFeature);
    /* Register the feature currently used for the splitting */
    currentNode.setFeatureIndex(currentFeature);
    /* Perform the sorting according to the currently considered feature: */
    /* 1) position the comparator onto the right feature */
    c.setFeatureIdx(currentFeature, featureType);
    /* 2) do the sorting */
    Arrays.sort(featureVectors, currentNode.from, currentNode.to, c);

    /* Then, seek for the zones where the feature value is the same,
     * and launch the next sort level on these. */
    int nVal = featureDefinition.getNumberOfValues(currentFeature);
    currentNode.split(nVal);
    int nextFrom = currentNode.from;
    int nextTo = currentNode.from;
    for (int i = 0; i < nVal; i++) {
      nextFrom = nextTo;
      // System.out.print( "Next node begins at " + nextFrom );
      while ((nextTo < currentNode.to)
          && (featureVectors[nextTo].getFeatureAsInt(currentFeature) == i)) {
        // System.out.print( " " + featureVectors[nextTo].getFeatureAsInt( currentFeature ) );
        nextTo++;
      }
      // System.out.println( " and ends at " + nextTo + " for a total of " + (nextTo-nextFrom) + "
      // units." );
      if ((nextTo - nextFrom) != 0) {
        MaryNode nod = new MaryNode(nextFrom, nextTo);
        currentNode.setChild(i, nod);
        // System.out.print("(" + i + " isByteOf " + currentFeature + ")" );
        sortNode(currentFeatureIdx + 1, nod);
      } else currentNode.setChild(i, null);
    }
  }
Example #4
0
  /**
   * Export this feature definition in the "all.desc" format which can be read by wagon.
   *
   * @param out the destination of the data
   * @param featuresToIgnore a set of Strings containing the names of features that wagon should
   *     ignore. Can be null.
   */
  private void createDescFile() throws IOException {
    PrintWriter out = new PrintWriter(new FileOutputStream(descFile));
    Set<String> featuresToIgnore = new HashSet<String>();
    featuresToIgnore.add("unit_logf0");
    featuresToIgnore.add("unit_duration");

    int numDiscreteFeatures =
        featureDefinition.getNumberOfByteFeatures() + featureDefinition.getNumberOfShortFeatures();
    out.println("(");
    out.println("(occurid cluster)");
    for (int i = 0, n = featureDefinition.getNumberOfFeatures(); i < n; i++) {
      out.print("( ");
      String featureName = featureDefinition.getFeatureName(i);
      out.print(featureName);
      if (featuresToIgnore != null && featuresToIgnore.contains(featureName)) {
        out.print(" ignore");
      }
      if (i < numDiscreteFeatures) { // list values
        for (int v = 0, vmax = featureDefinition.getNumberOfValues(i); v < vmax; v++) {
          out.print("  ");
          // Print values surrounded by double quotes, and make sure any
          // double quotes in the value are preceded by a backslash --
          // otherwise, we get problems e.g. for sentence_punc
          String val = featureDefinition.getFeatureValueAsString(i, v);
          if (val.indexOf('"') != -1) {
            StringBuilder buf = new StringBuilder();
            for (int c = 0; c < val.length(); c++) {
              char ch = val.charAt(c);
              if (ch == '"') buf.append("\\\"");
              else buf.append(ch);
            }
            val = buf.toString();
          }
          out.print("\"" + val + "\"");
        }
        out.println(" )");
      } else { // float feature
        out.println(" float )");
      }
    }
    out.println(")");
    out.close();
  }