// Builds necessary utf8 edges between start & end void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) { startUTF8.set(startCodePoint); endUTF8.set(endCodePoint); // System.out.println("start = " + startUTF8); // System.out.println(" end = " + endUTF8); build(start, end, startUTF8, endUTF8, 0); }
private void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { if (upto == utf8.len - 1) { // Done recursing start.addTransition( new Transition( utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto) - 1]), utf8.byteAt(upto), end)); // type=end } else { final int startCode; if (utf8.numBits(upto) == 5) { // special case -- avoid created unused edges (utf8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto) - 1]); } if (doAll && utf8.byteAt(upto) != startCode) { all(start, end, startCode, utf8.byteAt(upto) - 1, utf8.len - upto - 1); } State n = newUTF8State(); start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end end(n, end, utf8, 1 + upto, true); } }
private void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { if (upto == utf8.len - 1) { // Done recursing start.addTransition( new Transition( utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto) - 1], end)); // type=start } else { State n = newUTF8State(); start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start start(n, end, utf8, 1 + upto, true); int endCode = utf8.byteAt(upto) | MASKS[utf8.numBits(upto) - 1]; if (doAll && utf8.byteAt(upto) != endCode) { all(start, end, utf8.byteAt(upto) + 1, endCode, utf8.len - upto - 1); } } }
private void build( State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { // Break into start, middle, end: if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { // Degen case: lead with the same byte: if (upto == startUTF8.len - 1 && upto == endUTF8.len - 1) { // Super degen: just single edge, one UTF8 byte: start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); return; } else { assert startUTF8.len > upto + 1; assert endUTF8.len > upto + 1; State n = newUTF8State(); // Single value leading edge start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single // Recurse for the rest build(n, end, startUTF8, endUTF8, 1 + upto); } } else if (startUTF8.len == endUTF8.len) { if (upto == startUTF8.len - 1) { start.addTransition( new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend } else { start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { // There is a middle all( start, end, startUTF8.byteAt(upto) + 1, endUTF8.byteAt(upto) - 1, startUTF8.len - upto - 1); } end(start, end, endUTF8, upto, false); } } else { // start start(start, end, startUTF8, upto, true); // possibly middle, spanning multiple num bytes int byteCount = 1 + startUTF8.len - upto; final int limit = endUTF8.len - upto; while (byteCount < limit) { // wasteful: we only need first byte, and, we should // statically encode this first byte: tmpUTF8a.set(startCodes[byteCount - 1]); tmpUTF8b.set(endCodes[byteCount - 1]); all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), tmpUTF8a.len - 1); byteCount++; } // end end(start, end, endUTF8, upto, true); } }