Exemple #1
0
  /**
   * Fills in {@code fFollowList} for the subtree rooted at the given node.
   *
   * <p>Children are always visited before the node itself. What happens at the node's own level
   * depends on its type:
   *
   * <ul>
   *   <li>choice: nothing beyond visiting both children;
   *   <li>sequence: every position in the left child's last-position set gets the right child's
   *       first-position set merged into its follow set;
   *   <li>zero-or-more / one-or-more: every position in this node's last-position set gets this
   *       node's first-position set merged into its follow set;
   *   <li>zero-or-one: nothing beyond visiting the single child;
   *   <li>any other type: the node is ignored.
   * </ul>
   *
   * @param nodeCur The node whose subtree is processed.
   */
  private void calcFollowList(CMNode nodeCur) {
    final int kind = nodeCur.type();

    if (kind == XSModelGroupImpl.MODELGROUP_CHOICE) {
      // A choice contributes nothing itself; only its operands matter.
      final XSCMBinOp choice = (XSCMBinOp) nodeCur;
      calcFollowList(choice.getLeft());
      calcFollowList(choice.getRight());
      return;
    }

    if (kind == XSModelGroupImpl.MODELGROUP_SEQUENCE) {
      // Operands first, then link the end of the left operand to the
      // start of the right operand.
      final XSCMBinOp sequence = (XSCMBinOp) nodeCur;
      final CMNode leftOperand = sequence.getLeft();
      calcFollowList(leftOperand);
      final CMNode rightOperand = sequence.getRight();
      calcFollowList(rightOperand);

      final CMStateSet leftLast = leftOperand.lastPos();
      final CMStateSet rightFirst = rightOperand.firstPos();
      for (int pos = 0; pos < fLeafCount; pos++) {
        if (leftLast.getBit(pos)) {
          fFollowList[pos].union(rightFirst);
        }
      }
      return;
    }

    if (kind == XSParticleDecl.PARTICLE_ZERO_OR_MORE
        || kind == XSParticleDecl.PARTICLE_ONE_OR_MORE) {
      // Operand first, then loop the end of this node back to its start.
      calcFollowList(((XSCMUniOp) nodeCur).getChild());

      final CMStateSet ownFirst = nodeCur.firstPos();
      final CMStateSet ownLast = nodeCur.lastPos();
      for (int pos = 0; pos < fLeafCount; pos++) {
        if (ownLast.getBit(pos)) {
          fFollowList[pos].union(ownFirst);
        }
      }
      return;
    }

    if (kind == XSParticleDecl.PARTICLE_ZERO_OR_ONE) {
      // An optional operand adds no follow information of its own.
      calcFollowList(((XSCMUniOp) nodeCur).getChild());
    }
  }
Exemple #2
0
  /**
   * Builds the internal DFA transition table from the given syntax tree.
   *
   * <p>The steps performed, in order, are:
   *
   * <ol>
   *   <li>append an end-of-content (EOC) leaf to the tree and store the result in
   *       {@code fHeadNode};
   *   <li>allocate {@code fLeafList}/{@code fLeafListType} and call {@code postTreeBuildInit};
   *   <li>allocate {@code fFollowList} and fill it with {@code calcFollowList};
   *   <li>build the element map ({@code fElemMap}, {@code fElemMapType}, {@code fElemMapId},
   *       {@code fElemMapSize}) from the distinct particle ids of the leaves;
   *   <li>run the state-construction loop that fills {@code fTransTable} and
   *       {@code fFinalStateFlags};
   *   <li>if any leaf was an {@code XSCMRepeatingLeaf}, fill {@code fCountingStates};
   *   <li>null out {@code fHeadNode}, {@code fLeafList}, {@code fFollowList},
   *       {@code fLeafListType} and {@code fElemMapId}.
   * </ol>
   *
   * <p>{@code fLeafCount} is read on entry and incremented once for the EOC leaf.
   *
   * @param syntaxTree The syntax tree.
   * @exception RuntimeException NOTE(review): declared by the original documentation; no explicit
   *     throw appears in this method, so any such exception would have to come from a callee.
   */
  private void buildDFA(CMNode syntaxTree) {
    //
    //  NOTE(review): the following historical description talks about a
    //  rewrite of the content model that is not performed in this method;
    //  here the tree arrives ready-made as the syntaxTree parameter. It is
    //  kept for context and presumably describes work done by the caller.
    //
    //  The first step we need to take is to rewrite the content model
    //  using our CMNode objects, and in the process get rid of any
    //  repetition short cuts, converting them into '*' style repetitions
    //  or getting rid of repetitions altogether.
    //
    //  The conversions done are:
    //
    //  x+ -> (x|x*)
    //  x? -> (x|epsilon)
    //
    //  This is a relatively complex scenario. What is happening is that
    //  we create a top level binary node of which the special EOC value
    //  is set as the right side node. The left side is set to the
    //  rewritten syntax tree. The source is the original content model
    //  info from the decl pool. The rewrite is done by buildSyntaxTree()
    //  which recurses the decl pool's content of the element and builds
    //  a new tree in the process.
    //
    //  Note that, during this operation, we set each non-epsilon leaf
    //  node's DFA state position and count the number of such leaves, which
    //  is left in the fLeafCount member.
    //
    //  The nodeTmp object is passed in just as a temp node to use during
    //  the recursion. Otherwise, we'd have to create a new node on every
    //  level of recursion, which would be piggy in Java (as is everything
    //  for that matter.)
    //

    /* MODIFIED (Jan, 2001)
     *
     * Use following rules.
     *   nullable(x+) := nullable(x), first(x+) := first(x),  last(x+) := last(x)
     *   nullable(x?) := true, first(x?) := first(x),  last(x?) := last(x)
     *
     * The same computation of follow as x* is applied to x+
     *
     * The modification drastically reduces computation time of
     * "(a, (b, a+, (c, (b, a+)+, a+, (d,  (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
     */

    //
    //  And handle specially the EOC node, which also must be numbered
    //  and counted as a non-epsilon leaf node. It could not be handled
    //  in the above tree build because it was created before all that
    //  started. We save the EOC position since it's used during the DFA
    //  building loop.
    //
    //  The EOC leaf takes the current fLeafCount as its position (the
    //  post-increment then counts it). The -1 argument is presumably its
    //  particle id: the DEBUG check further down expects the last element
    //  map id to be -1.
    //
    int EOCPos = fLeafCount;
    XSCMLeaf nodeEOC = new XSCMLeaf(XSParticleDecl.PARTICLE_ELEMENT, null, -1, fLeafCount++);
    fHeadNode = new XSCMBinOp(XSModelGroupImpl.MODELGROUP_SEQUENCE, syntaxTree, nodeEOC);

    //
    //  Ok, so now we have to iterate the new tree and do a little more
    //  work now that we know the leaf count. One thing we need to do is
    //  to calculate the first and last position sets of each node. This
    //  is cached away in each of the nodes.
    //
    //  Along the way we also set the leaf count in each node as the
    //  maximum state count. They must know this in order to create their
    //  first/last pos sets.
    //
    //  We also need to build an array of references to the non-epsilon
    //  leaf nodes. Since we iterate it in the same way as before, this
    //  will put them in the array according to their position values.
    //
    //  NOTE(review): all of the above is delegated to postTreeBuildInit(),
    //  which is defined elsewhere; this method only allocates the arrays.
    //
    fLeafList = new XSCMLeaf[fLeafCount];
    fLeafListType = new int[fLeafCount];
    postTreeBuildInit(fHeadNode);

    //
    //  And, moving onward... We now need to build the follow position
    //  sets for all the nodes. So we allocate an array of state sets,
    //  one for each leaf node (i.e. each DFA position.)
    //
    fFollowList = new CMStateSet[fLeafCount];
    for (int index = 0; index < fLeafCount; index++)
      fFollowList[index] = new CMStateSet(fLeafCount);
    calcFollowList(fHeadNode);
    //
    //  And finally the big push... Now we build the DFA using all the
    //  states and the tree we've built up. First we set up the various
    //  data structures we are going to use while we do this.
    //
    //  First of all we need an array of unique element names in our
    //  content model. For each transition table entry, we need a set of
    //  contiguous indices to represent the transitions for a particular
    //  input element. So we need a zero based range of indexes that
    //  map to element types. This element map provides that mapping.
    //
    //  Uniqueness is decided by the leaf's particle id: a leaf whose id
    //  is already recorded in fElemMapId does not get a new entry.
    //
    fElemMap = new Object[fLeafCount];
    fElemMapType = new int[fLeafCount];
    fElemMapId = new int[fLeafCount];
    fElemMapSize = 0;
    // Allocated lazily, only once the first XSCMRepeatingLeaf is seen.
    Occurence[] elemOccurenceMap = null;
    for (int outIndex = 0; outIndex < fLeafCount; outIndex++) {
      // optimization from Henry Zongaro:
      // fElemMap[outIndex] = new Object ();
      // (the slot is left null instead of holding a placeholder object)
      fElemMap[outIndex] = null;

      // Linear search of the ids recorded so far for this leaf's id.
      int inIndex = 0;
      final int id = fLeafList[outIndex].getParticleId();
      for (; inIndex < fElemMapSize; inIndex++) {
        if (id == fElemMapId[inIndex]) break;
      }

      // If it was not in the list, then add it. The EOC leaf is added here
      // as well; it is dropped again after the loop.
      if (inIndex == fElemMapSize) {
        XSCMLeaf leaf = fLeafList[outIndex];
        fElemMap[fElemMapSize] = leaf.getLeaf();
        if (leaf instanceof XSCMRepeatingLeaf) {
          if (elemOccurenceMap == null) {
            elemOccurenceMap = new Occurence[fLeafCount];
          }
          // Indexed by element map index, like fElemMap itself.
          elemOccurenceMap[fElemMapSize] = new Occurence((XSCMRepeatingLeaf) leaf, fElemMapSize);
        }
        fElemMapType[fElemMapSize] = fLeafListType[outIndex];
        fElemMapId[fElemMapSize] = id;
        fElemMapSize++;
      }
    }

    // The last entry in the element map must be the EOC element.
    // Remove it from the map by shrinking the logical size; the array
    // slot itself is not cleared.
    if (DEBUG) {
      if (fElemMapId[fElemMapSize - 1] != -1)
        System.err.println("interal error in DFA: last element is not EOC.");
    }
    fElemMapSize--;

    //
    //  Optimization(Jan, 2001); We sort fLeafList according to elemIndex
    //  which is *uniquely* associated to each leaf. We are *assuming* that
    //  each element appears in at least one leaf.
    //
    //  Layout of fLeafSorter: for each element map index in order, the
    //  positions of all leaves carrying that element's particle id,
    //  followed by a -1 terminator. The EOC leaf is not listed because its
    //  element was just removed from the map.
    //
    int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
    int fSortCount = 0;

    for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
      final int id = fElemMapId[elemIndex];
      for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
        if (id == fLeafList[leafIndex].getParticleId()) fLeafSorter[fSortCount++] = leafIndex;
      }
      fLeafSorter[fSortCount++] = -1;
    }

    /* Optimization(Jan, 2001) */

    //
    //  Next lets create some arrays, some that hold transient
    //  information during the DFA build and some that are permanent.
    //  These are kind of sticky since we cannot know how big they will
    //  get, but we don't want to use any Java collections because of
    //  performance.
    //
    //  Basically they will probably be about fLeafCount*2 on average,
    //  but can be as large as 2^(fLeafCount*2), worst case. So we start
    //  with fLeafCount*4 as a middle ground. This will be very unlikely
    //  to ever have to expand, though if it does, the overhead will be
    //  somewhat ugly.
    //
    int curArraySize = fLeafCount * 4;
    CMStateSet[] statesToDo = new CMStateSet[curArraySize];
    fFinalStateFlags = new boolean[curArraySize];
    fTransTable = new int[curArraySize][];

    //
    //  Ok we start with the initial set as the first pos set of the
    //  head node (which is the seq node that holds the content model
    //  and the EOC node.)
    //
    CMStateSet setT = fHeadNode.firstPos();

    //
    //  Init our two state counters. Basically the unmarked state counter
    //  is always chasing the current state counter. When it catches up,
    //  that means we made a pass through that did not add any new states
    //  to the lists, at which time we are done. We could have used an
    //  expanding array of flags which we used to mark off states as we
    //  complete them, but this is easier though less readable maybe.
    //
    int unmarkedState = 0;
    int curState = 0;

    //
    //  Init the first transition table entry, and put the initial state
    //  into the states to do list, then bump the current state.
    //
    //  NOTE(review): makeDefStateList() is defined elsewhere; presumably it
    //  returns a fresh row with one slot per element map entry, preset to
    //  a "no transition" value - confirm.
    //
    fTransTable[curState] = makeDefStateList();
    statesToDo[curState] = setT;
    curState++;

    /* Optimization(Jan, 2001); This is faster for
     * a large content model such as, "(t001+|t002+|.... |t500+)".
     */

    //  Maps a state set to the Integer index of the DFA state created for
    //  it. NOTE(review): the initial state above is never put into this
    //  table, so a later set equal to it would be given a new index rather
    //  than index 0 - confirm whether that is intended.
    HashMap stateTable = new HashMap();

    /* Optimization(Jan, 2001) */

    //
    //  Ok, almost done with the algorithm... We now enter the
    //  loop where we go until the states done counter catches up with
    //  the states to do counter.
    //
    while (unmarkedState < curState) {
      //
      //  Get the first unmarked state out of the list of states to do.
      //  And get the associated transition table entry.
      //
      setT = statesToDo[unmarkedState];
      int[] transEntry = fTransTable[unmarkedState];

      // Mark this one final if it contains the EOC state
      fFinalStateFlags[unmarkedState] = setT.getBit(EOCPos);

      // Bump up the unmarked state count, marking this state done
      unmarkedState++;

      // Loop through each possible input symbol in the element map
      CMStateSet newSet = null;
      /* Optimization(Jan, 2001) */
      // Cursor into fLeafSorter; restarted for every state and advanced
      // past one -1 terminated group per element.
      int sorterIndex = 0;
      /* Optimization(Jan, 2001) */
      for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
        //
        //  Build up a set of states which is the union of all of
        //  the follow sets of DFA positions that are in the current
        //  state. If we gave away the new set last time through then
        //  create a new one. Otherwise, zero out the existing one.
        //
        if (newSet == null) newSet = new CMStateSet(fLeafCount);
        else newSet.zeroBits();

        /* Optimization(Jan, 2001) */
        int leafIndex = fLeafSorter[sorterIndex++];

        while (leafIndex != -1) {
          // If this leaf index (DFA position) is in the current set...
          if (setT.getBit(leafIndex)) {
            //
            //  Every leaf in this fLeafSorter group carries the
            //  current input symbol, so we add its follow list to
            //  the set of states to transition to from the current
            //  state.
            //
            newSet.union(fFollowList[leafIndex]);
          }

          leafIndex = fLeafSorter[sorterIndex++];
        }
        /* Optimization(Jan, 2001) */

        //
        //  If this new set is not empty, then see if it has already
        //  been given a state index. If not, then add it.
        //
        if (!newSet.isEmpty()) {
          //
          //  Look the new state set up in stateTable. A miss is encoded
          //  as stateIndex == curState, i.e. the index the set will get
          //  if it is added now.
          //

          /* Optimization(Jan, 2001) */
          Integer stateObj = (Integer) stateTable.get(newSet);
          int stateIndex = (stateObj == null ? curState : stateObj.intValue());
          /* Optimization(Jan, 2001) */

          // If we did not find it, then add it
          if (stateIndex == curState) {
            //
            //  Put this new state into the states to do and init
            //  a new entry at the same index in the transition
            //  table.
            //
            statesToDo[curState] = newSet;
            fTransTable[curState] = makeDefStateList();

            /* Optimization(Jan, 2001) */
            stateTable.put(newSet, new Integer(curState));
            /* Optimization(Jan, 2001) */

            // We now have a new state to do so bump the count
            curState++;

            //
            //  Null out the new set to indicate we adopted it.
            //  This will cause the creation of a new set on the
            //  next time around the loop.
            //
            newSet = null;
          }

          //
          //  Now set this state in the transition table's entry
          //  for this element (using its index), with the DFA
          //  state we will move to from the current state when we
          //  see this input element.
          //
          transEntry[elemIndex] = stateIndex;

          // Expand the arrays if we're full. This is checked right after
          // a possible curState bump, so there is always room for the
          // next new state.
          if (curState == curArraySize) {
            //
            //  Yikes, we overflowed the initial array size, so
            //  we've got to expand all of these arrays. So adjust
            //  up the size by 50% and allocate new arrays.
            //
            final int newSize = (int) (curArraySize * 1.5);
            CMStateSet[] newToDo = new CMStateSet[newSize];
            boolean[] newFinalFlags = new boolean[newSize];
            int[][] newTransTable = new int[newSize][];

            // Copy over all of the existing content. transEntry still
            // refers to the same row object, which is carried over by
            // reference.
            System.arraycopy(statesToDo, 0, newToDo, 0, curArraySize);
            System.arraycopy(fFinalStateFlags, 0, newFinalFlags, 0, curArraySize);
            System.arraycopy(fTransTable, 0, newTransTable, 0, curArraySize);

            // Store the new array size
            curArraySize = newSize;
            statesToDo = newToDo;
            fFinalStateFlags = newFinalFlags;
            fTransTable = newTransTable;
          }
        }
      }
    }

    //
    // Fill in the occurrence information for each looping state
    // if we're using counters.
    //
    // A state counts as looping when one of its transitions leads back to
    // itself; the first such element index selects the entry taken from
    // elemOccurenceMap (which is null for elements that were not
    // repeating leaves).
    //
    if (elemOccurenceMap != null) {
      fCountingStates = new Occurence[curState];
      for (int i = 0; i < curState; ++i) {
        int[] transitions = fTransTable[i];
        for (int j = 0; j < transitions.length; ++j) {
          if (i == transitions[j]) {
            fCountingStates[i] = elemOccurenceMap[j];
            break;
          }
        }
      }
    }

    //
    //  And now we can say bye bye to the temp representation since we've
    //  built the DFA.
    //
    if (DEBUG_VALIDATE_CONTENT) dumpTree(fHeadNode, 0);
    fHeadNode = null;
    fLeafList = null;
    fFollowList = null;
    fLeafListType = null;
    fElemMapId = null;
  }