/**
 * Return the next element of the json document.
 *
 * <p>Picks a random alternative based on the lexical state on top of {@code states}, records the
 * token attributes the tokenizer is expected to produce (image, type, position increment, node
 * path, datatype) into the parallel lists, and returns the raw JSON snippet to append.
 *
 * <p>NOTE(review): {@code shouldFail} is set whenever the emitted snippet makes the document
 * invalid (e.g. unbalanced close, datatype object at the root) — presumably so the caller can
 * assert that parsing fails; confirm against the test harness.
 */
private String getNextNode() {
  // Blank final: assigned at most once per execution path (each assigning case returns).
  final int popState;
  switch (states.peek()) {
    case ARRAY:
      // Inside an array: 9 equiprobable alternatives. Every case returns, so there is no
      // runtime fall-through into ARRAY_OBJECT below.
      switch (rand.nextInt(9)) {
        case 0: // String case
          // Random whitespace is embedded inside the literal itself.
          final String val = "stepha" + this.getWhitespace() + "n" + this.getWhitespace() + "e";
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add(val);
          types.add(ExtendedJsonTokenizer.getTokenTypes()[LITERAL]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_STRING);
          return "\"" + val + "\"" + this.getWhitespace() + ",";

        case 1: // DOUBLE case
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add("34.560e-9");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NUMBER]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_DOUBLE);
          return "34.560e-9" + this.getWhitespace() + ",";

        case 2: // LONG case
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add("34560e-9");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NUMBER]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_LONG);
          return "34560e-9" + this.getWhitespace() + ",";

        case 3: // true case
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add("true");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[TRUE]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_BOOLEAN);
          return "true" + this.getWhitespace() + ",";

        case 4: // false case
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add("false");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[FALSE]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_BOOLEAN);
          return "false" + this.getWhitespace() + ",";

        case 5: // null case
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          images.add("null");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NULL]);
          incr.add(1);
          datatypes.add(XSDDatatype.XSD_STRING);
          return "null" + this.getWhitespace() + ",";

        case 6: // nested array case
          // Depth-limited: past MAX_DEPTH, emit nothing instead of nesting deeper.
          if (states.size() <= MAX_DEPTH) {
            this.addToLastNode(1);
            this.incrNodeObjectPath();
            states.add(ARRAY);
            return "[";
          }
          return "";

        case 7: // nested object case
          if (states.size() <= MAX_DEPTH) {
            this.addToLastNode(1);
            this.incrNodeObjectPath();
            states.add(ARRAY_OBJECT);
            return "{";
          }
          return "";

        case 8: // closing array case
          this.decrNodeObjectPath();
          popState = states.pop();
          if (popState != ARRAY) {
            // Closing an array that was not open: document becomes invalid.
            shouldFail = true;
          }
          // Remove previous comma, this is not allowed
          // (only if nothing but whitespace follows the last comma in the buffer)
          final int comma = sb.lastIndexOf(",");
          if (comma != -1 && sb.substring(comma + 1).matches("\\s*")) {
            sb.deleteCharAt(comma);
          }
          return "],";
      }

    case ARRAY_OBJECT:
    case OBJECT_ATT:
      // Inside an object, expecting an attribute: 3 alternatives. Every case returns.
      switch (rand.nextInt(3)) {
        case 0: // new object field
          types.add(ExtendedJsonTokenizer.getTokenTypes()[LITERAL]);
          images.add("ste ph ane");
          incr.add(1);
          this.addToLastNode(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          datatypes.add(JSONDatatype.JSON_FIELD);
          // The value for this attribute is generated on the next call.
          states.push(OBJECT_VAL);
          return "\"ste ph ane\"" + this.getWhitespace() + ":";

        case 1: // close object
          // Objects reached through case 7 of OBJECT_VAL introduced an extra "blank"
          // path level; undo it here.
          if (states.peek() == OBJECT_ATT && nestedObjs > 0) {
            this.decrNodeObjectPath();
            nestedObjs--;
          }
          this.decrNodeObjectPath();
          popState = states.pop();
          if (popState != OBJECT_ATT && popState != ARRAY_OBJECT) {
            shouldFail = true;
          }
          // Remove previous comma, this is not allowed
          final int comma = sb.lastIndexOf(",");
          if (comma != -1 && sb.substring(comma + 1).matches("\\s*")) {
            sb.deleteCharAt(comma);
          }
          // Root object gets no trailing comma.
          return states.empty() ? "}" : "},";

        case 2: // Datatype
          if (getLastNode() >= 0) {
            // this nested object cannot be a datatype object because other things have been added
            // to it
            return "";
          }
          final String field;
          if (states.isEmpty()) {
            // datatype object at the root are not possible
            shouldFail = true;
            field = "{";
          } else if (states.peek() == OBJECT_ATT) {
            // field name
            this.addToLastNode(1);
            field = "\"field\":{";
            types.add(ExtendedJsonTokenizer.getTokenTypes()[LITERAL]);
            images.add("field");
            incr.add(1);
            nodes.add(IntsRef.deepCopyOf(curNodePath));
            datatypes.add(JSONDatatype.JSON_FIELD);
            // value
            this.incrNodeObjectPath();
            this.setLastNode(0);
          } else if (states.peek() == ARRAY) {
            this.addToLastNode(1);
            field = "{";
          } else if (states.peek() == ARRAY_OBJECT) {
            this.decrNodeObjectPath();
            field = "";
          } else {
            // should not happen
            throw new IllegalStateException("Received unknown state=" + states.peek());
          }
          // Expected token for the datatype value.
          types.add(ExtendedJsonTokenizer.getTokenTypes()[LITERAL]);
          images.add("Luke Skywalker");
          incr.add(1);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          datatypes.add("jedi");
          // close datatype object
          if (states.peek() == ARRAY_OBJECT) {
            popState = states.pop();
          } else {
            this.decrNodeObjectPath();
          }
          return field + this.getWhitespace() + "\"" + ExtendedJsonTokenizer.DATATYPE_LABEL
              + "\":" + this.getWhitespace() + "\"jedi\"," + "\""
              + ExtendedJsonTokenizer.DATATYPE_VALUES + "\":" + this.getWhitespace()
              + "\"Luke Skywalker\"" + this.getWhitespace() + "},";
      }

    case OBJECT_VAL:
      // Expecting the value of an attribute: 8 alternatives. Every case returns, and every
      // path removes the OBJECT_VAL state (directly or via doValString).
      switch (rand.nextInt(8)) {
        case 0: // String
          return this.doValString(
              "stepha" + this.getWhitespace() + "n" + this.getWhitespace() + "e");

        case 1: // DOUBLE case
          images.add("34.560e-9");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NUMBER]);
          incr.add(1);
          // The value node sits one level below the attribute node.
          this.incrNodeObjectPath();
          this.setLastNode(0);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          this.decrNodeObjectPath();
          datatypes.add(XSDDatatype.XSD_DOUBLE);
          states.pop(); // remove OBJECT_VAL state
          return "34.560e-9" + this.getWhitespace() + ",";

        case 2: // LONG case
          images.add("34560e-9");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NUMBER]);
          incr.add(1);
          this.incrNodeObjectPath();
          this.setLastNode(0);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          this.decrNodeObjectPath();
          datatypes.add(XSDDatatype.XSD_LONG);
          states.pop(); // remove OBJECT_VAL state
          return "34560e-9" + this.getWhitespace() + ",";

        case 3: // True
          images.add("true");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[TRUE]);
          incr.add(1);
          this.incrNodeObjectPath();
          this.setLastNode(0);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          this.decrNodeObjectPath();
          datatypes.add(XSDDatatype.XSD_BOOLEAN);
          states.pop(); // remove OBJECT_VAL state
          return "true" + this.getWhitespace() + ",";

        case 4: // False
          images.add("false");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[FALSE]);
          incr.add(1);
          this.incrNodeObjectPath();
          this.setLastNode(0);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          this.decrNodeObjectPath();
          datatypes.add(XSDDatatype.XSD_BOOLEAN);
          states.pop(); // remove OBJECT_VAL state
          return "false" + this.getWhitespace() + ",";

        case 5: // NULL
          images.add("null");
          types.add(ExtendedJsonTokenizer.getTokenTypes()[NULL]);
          incr.add(1);
          this.incrNodeObjectPath();
          this.setLastNode(0);
          nodes.add(IntsRef.deepCopyOf(curNodePath));
          this.decrNodeObjectPath();
          datatypes.add(XSDDatatype.XSD_STRING);
          states.pop(); // remove OBJECT_VAL state
          return "null" + this.getWhitespace() + ",";

        case 6: // New array
          if (states.size() <= MAX_DEPTH) {
            states.pop(); // remove OBJECT_VAL state
            this.incrNodeObjectPath();
            states.add(ARRAY);
            return "[";
          }
          // Depth limit reached: degrade to an empty string value.
          return this.doValString("");

        case 7: // new Object
          if (states.size() <= MAX_DEPTH) {
            states.pop(); // remove OBJECT_VAL state
            // Two incrementations, because the object introduce a "blank" node
            nestedObjs++;
            this.incrNodeObjectPath();
            this.setLastNode(0);
            this.incrNodeObjectPath();
            states.add(OBJECT_ATT);
            return "{";
          }
          return this.doValString("");
      }

    default:
      throw new IllegalStateException("Got unknown lexical state: " + states.peek());
  }
}
/** Create a random Json document with random values */ public String getRandomJson(int nbNodes) { // init sb.setLength(0); sb.append("{"); states.clear(); states.add(OBJECT_ATT); images.clear(); nodes.clear(); incr.clear(); datatypes.clear(); types.clear(); curNodePath.length = 1; curNodePath.offset = 0; Arrays.fill(curNodePath.ints, -1); shouldFail = false; nestedObjs = 0; // <= so that when nbNodes == 1, the json is still valid /* * the generated json might be uncomplete, if states is not empty, and * the maximum number of nodes has been reached. */ for (final int i = 0; i <= nbNodes && !states.empty(); nbNodes++) { sb.append(this.getWhitespace()).append(this.getNextNode()).append(this.getWhitespace()); } shouldFail = shouldFail ? true : !states.empty(); return sb.toString(); }
/**
 * Materializes the current enumerator position into the shared result object.
 *
 * @return {@code null} when the enumerator holds no term ({@code upto == 0}); otherwise the
 *     shared {@code result} with its input trimmed to {@code upto - 1} and the output taken
 *     from {@code output[upto]}
 */
private InputOutput<T> setResult() {
  // Nothing accumulated yet — no current term to expose.
  if (upto == 0) {
    return null;
  }
  current.length = upto - 1;
  result.output = output[upto];
  return result;
}
/** Add a string value to an object entry */ private String doValString(final String val) { images.add(val); types.add(ExtendedJsonTokenizer.getTokenTypes()[LITERAL]); incr.add(1); this.incrNodeObjectPath(); this.setLastNode(0); nodes.add(IntsRef.deepCopyOf(curNodePath)); this.decrNodeObjectPath(); datatypes.add(XSDDatatype.XSD_STRING); states.pop(); // remove OBJECT_VAL state return "\"" + val + "\"" + this.getWhitespace() + ","; }
private void decodeTermFreqs() throws IOException { // logger.debug("Decode Term Freq in Node: {}", this.hashCode()); // logger.debug("Decode Term Freq in Node at {}", in.getFilePointer()); in.readBytes(termFreqCompressedBuffer.bytes, 0, termFreqCompressedBufferLength); termFreqCompressedBuffer.offset = 0; termFreqCompressedBuffer.length = termFreqCompressedBufferLength; nodDecompressor.decompress(termFreqCompressedBuffer, termFreqBuffer); // set length limit based on block size, as certain decompressor with // large window size can set it larger than the blockSize, e.g., AFor termFreqBuffer.length = termFreqBlockSize; termFreqReadPending = false; }
private void decodeNodeLengths() throws IOException { // logger.debug("Decode Nodes Length: {}", this.hashCode()); // logger.debug("Decode Nodes Length at {}", in.getFilePointer()); in.readBytes(nodLenCompressedBuffer.bytes, 0, nodLenCompressedBufferLength); nodLenCompressedBuffer.offset = 0; nodLenCompressedBuffer.length = nodLenCompressedBufferLength; nodDecompressor.decompress(nodLenCompressedBuffer, nodLenBuffer); // set length limit based on block size, as certain decompressor with // large window size can set it larger than the blockSize, e.g., AFor nodLenBuffer.length = nodLenBlockSize; nodLenReadPending = false; }
/**
 * Decode delta of the node.
 *
 * <p>If a new doc has been read (currentNode.length == 0), then update currentNode offset and
 * length. Otherwise, perform delta decoding.
 *
 * <p>Perform delta decoding while current node id and previous node id are equals.
 *
 * <p>NOTE(review): node ids are delta-encoded component-wise against the previous node's path;
 * decoding stops at the first component that differs, since later components are stored as
 * absolute values — inferred from the break condition below, confirm against the encoder.
 */
private final void deltaDecoding() {
  final int[] nodBufferInts = nodBuffer.ints;
  // increment length by one (lengths are stored off-by-one to save space)
  final int nodLength = nodLenBuffer.ints[nodLenBuffer.offset++] + 1;
  final int nodOffset = nodBuffer.offset;
  final int nodEnd = nodOffset + nodLength;
  final int currentNodeOffset = currentNode.offset;
  final int currentNodeEnd = currentNodeOffset + currentNode.length;
  // Walk the new node (i) and the previous node (j) in lockstep; note that when
  // currentNode.length == 0 the loop body never executes and values stay absolute.
  for (int i = nodOffset, j = currentNodeOffset; i < nodEnd && j < currentNodeEnd; i++, j++) {
    nodBufferInts[i] += nodBufferInts[j];
    // if node ids are different, then stop decoding
    if (nodBufferInts[i] != nodBufferInts[j]) {
      break;
    }
  }
  // increment node buffer offset (consume this node)
  nodBuffer.offset += nodLength;
  // update last node offset and length
  currentNode.offset = nodOffset;
  currentNode.length = nodLength;
}
/**
 * Reads the block header: the three uncompressed block sizes followed by the three compressed
 * block lengths, growing each buffer to fit, then eagerly decodes the node-lengths block.
 *
 * <p>NOTE(review): output buffers are sized via {@code getMinimumBufferSize} because a
 * decompressor with a large window may write past the block size (see the clamping in the
 * decode methods).
 *
 * @throws IOException if the underlying input cannot be read
 */
@Override
protected void readHeader() throws IOException {
  // logger.debug("Read Nod header: {}", this.hashCode());
  // logger.debug("Nod header start at {}", in.getFilePointer());
  // read blockSize and check buffer size
  nodLenBlockSize = in.readVInt();
  // ensure that the output buffer has the minimum size required
  final int nodLenBufferLength =
      this.getMinimumBufferSize(nodLenBlockSize, nodDecompressor.getWindowSize());
  nodLenBuffer = ArrayUtils.grow(nodLenBuffer, nodLenBufferLength);
  nodBlockSize = in.readVInt();
  // ensure that the output buffer has the minimum size required
  final int nodBufferLength =
      this.getMinimumBufferSize(nodBlockSize, nodDecompressor.getWindowSize());
  nodBuffer = ArrayUtils.grow(nodBuffer, nodBufferLength);
  termFreqBlockSize = in.readVInt();
  // ensure that the output buffer has the minimum size required
  final int termFreqBufferLength =
      this.getMinimumBufferSize(termFreqBlockSize, nodDecompressor.getWindowSize());
  termFreqBuffer = ArrayUtils.grow(termFreqBuffer, termFreqBufferLength);
  // read size of each compressed data block and check buffer size
  nodLenCompressedBufferLength = in.readVInt();
  nodLenCompressedBuffer = ArrayUtils.grow(nodLenCompressedBuffer, nodLenCompressedBufferLength);
  nodLenReadPending = true;
  nodCompressedBufferLength = in.readVInt();
  nodCompressedBuffer = ArrayUtils.grow(nodCompressedBuffer, nodCompressedBufferLength);
  nodReadPending = true;
  termFreqCompressedBufferLength = in.readVInt();
  termFreqCompressedBuffer =
      ArrayUtils.grow(termFreqCompressedBuffer, termFreqCompressedBufferLength);
  termFreqReadPending = true;
  // decode node lengths (needed immediately; nodes and term freqs stay pending)
  this.decodeNodeLengths();
  // copy reference of node buffer so currentNode aliases the decoded ints
  currentNode.ints = nodBuffer.ints;
}
/**
 * Increments the byte buffer to the next String in binary order after s that will not put the
 * machine into a reject state. If such a string does not exist, returns false.
 *
 * <p>The correctness of this method depends upon the automaton being deterministic, and having no
 * transitions to dead states.
 *
 * @return true if more possible solutions exist for the DFA
 */
private boolean nextString() {
  int state;
  int pos = 0;
  // states[i] holds the DFA state reached after consuming seekBytesRef.bytes[0..i-1].
  savedStates.grow(seekBytesRef.length + 1);
  final int[] states = savedStates.ints;
  states[0] = runAutomaton.getInitialState();

  while (true) {
    // New generation marker for loop (cycle) detection during this walk.
    curGen++;
    linear = false;
    // walk the automaton until a character is rejected.
    for (state = states[pos]; pos < seekBytesRef.length; pos++) {
      visited[state] = curGen;
      int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff);
      if (nextState == -1) break;
      states[pos + 1] = nextState;
      // we found a loop, record it for faster enumeration
      // (only relevant for infinite automata; finite ones cannot cycle)
      if (!finite && !linear && visited[nextState] == curGen) {
        setLinear(pos);
      }
      state = nextState;
    }

    // take the useful portion, and the last non-reject state, and attempt to
    // append characters that will match.
    if (nextString(state, pos)) {
      return true;
    } else { /* no more solutions exist from this useful portion, backtrack */
      if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false;
      final int newState = runAutomaton.step(states[pos], seekBytesRef.bytes[pos] & 0xff);
      if (newState >= 0 && runAutomaton.isAccept(newState)) /* String is good to go as-is */
        return true;
      /* else advance further */
      // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
      // for now, restart from scratch for all infinite DFAs
      if (!finite) pos = 0;
    }
  }
}
/**
 * Resets this reader for a new block: empties the decoded buffer windows, clears the current
 * node, and marks every compressed section as pending with unknown length.
 */
@Override
public void initBlock() {
  // Empty each decoded buffer's window.
  nodLenBuffer.offset = 0;
  nodLenBuffer.length = 0;
  nodBuffer.offset = 0;
  nodBuffer.length = 0;
  termFreqBuffer.offset = 0;
  termFreqBuffer.length = 0;
  this.resetCurrentNode();

  // Every compressed section must be (re)read for the new block.
  nodLenReadPending = true;
  nodReadPending = true;
  termFreqReadPending = true;

  // Compressed lengths are unknown until the next header is read.
  nodLenCompressedBufferLength = 0;
  nodCompressedBufferLength = 0;
  termFreqCompressedBufferLength = 0;
}
/**
 * Compresses {@code input} frame by frame into {@code output}.
 *
 * <p>For each frame, a one-byte compressor code is written followed by the frame's compressed
 * payload. NOTE(review): the selected compressor's {@code compress} call is assumed to advance
 * {@code input.offset} and {@code output.offset} past the consumed/produced data — the loop
 * does not terminate otherwise; confirm against the compressor implementations.
 */
@Override
public void compress(final IntsRef input, final BytesRef output) {
  // NOTE(review): input capacity must be a multiple of 32 — presumably the frame size used by
  // frameCompressorCodes; confirm.
  assert input.ints.length % 32 == 0;
  final int[] uncompressedData = input.ints;
  final byte[] compressedData = output.bytes;
  // prepare the input buffer before starting the compression
  this.prepareInputBuffer(input);
  while (input.offset < input.length) {
    for (final long compressorCode :
        this.frameCompressorCodes(uncompressedData, input.offset, input.length)) {
      // Write the compressor selector byte, then delegate the frame payload.
      compressedData[output.offset] = (byte) compressorCode;
      this.compressors[(int) compressorCode].compress(input, output);
    }
  }
  // flip buffer: expose the written region as [0, length) for the reader
  input.offset = 0;
  output.length = output.offset;
  output.offset = 0;
}
/**
 * Creates the enumerator over the given FST.
 *
 * <p>doFloor controls the behavior of advance: if doFloor is true, advance positions to the
 * biggest term before target. NOTE(review): original comment was garbled ("if it's true doFloor
 * is true"); also, this constructor takes no doFloor argument — presumably the flag lives in a
 * sibling constructor or the superclass; confirm.
 */
public IntsRefFSTEnum(FST<T> fst) {
  super(fst);
  // The result's input side aliases the shared 'current' buffer.
  result.input = current;
  // NOTE(review): offset 1 presumably reserves index 0 as a sentinel used by the
  // superclass enumeration — confirm against FSTEnum.
  current.offset = 1;
}
/** Grows the {@code current} input buffer so it can hold at least one element past {@code upto}. */
@Override
protected void grow() {
  final int minCapacity = upto + 1;
  current.ints = ArrayUtil.grow(current.ints, minCapacity);
}
/** Resets the current node slice to an empty window. */
public void resetCurrentNode() {
  currentNode.offset = 0;
  currentNode.length = 0;
}