/** * finds the next match in the current input, appends it to <code>out</code> and returns the * {@link FaAction} associated with the match. Input is read until a match is found, {@link * #maxCopy} is reached or EOF is hit. Non-matching input is handled according to {@link * #setOnFailedMatch setOnFailedMatch()}. In particular: * * <dl> * <dt>{@link #UNMATCHED_COPY} * <dd>will append up to {@link #maxCopy} non-matching characters in front of the match. If * <code>maxCopy</code> is reached before the match, <b>no matching text is returned</b>, * only the non-matching characters. In this case the return value is <code>null</code>, and * should <code>maxCopy</code> be ≤ 1, then 1 character is always delivered. If a * match is found before <code>maxCopy</code> is reached, the match is appended to <code>out * </code>. To find out where the match actually starts, call {@link #matchStart()}. * <dt>{@link #UNMATCHED_DROP} * <dd>will drop (delete) unmatched text. In this case the matching text is the only text * appended to <code>out</code>. * <dt>{@link #UNMATCHED_THROW} * <dd>causes a {@link monq.jfa.NomatchException} to be thrown. No text will be appended to * <code>out</code> and the offenting text will still be available in the {@link CharSource} * serving as input to <code>this</code>. * </dl> * * <p><b>Hint:</b> Use this method if you are interested only in a simple tokenization of the * input. The actions returned may serve as the token type. If you however want to apply the * actions returned immediately to the match, then rather use one of the <code>read</code> or * <code>filter</code> methods. If you find yourself using <code>if</code> statements on the * <code>FaAction</code> returned, you are definitively doing something wrong. * * @return * <dl> * <dt>eofAction * <dd>When EOF is hit the first time and the <code>Dfa</code> operated has a action set for * EOF which is not <code>null</code> this is returned (see {@link Nfa#compile * Nfa.compile()}). 
* <dt>{@link #EOF} * <dd>if EOF is hit and <code>eofAction</code> was already delivered or is <code>null * </code>. The output may have non-matching input that was found just before EOF. * <dt><code>null</code> * <dd>if <code>UNMATCHED_COPY</code> is active and <code>maxCopy</code> non-matching * characters where found before a match was encountered. * <dt>an action * <dd>found for a match. * </dl> */ public FaAction next(StringBuilder out) throws java.io.IOException { matchStart = out.length(); FaAction a = dfa.match(in, out, smd); if (a == null) { // There was no match, so we have to search for the first // match. Note: there is always at least one character available as // long as not Dfa.EOF is returned by dfa.match() if (onFailedMatch == UNMATCHED_COPY) { int unmatched = 0; do { out.append((char) (in.read())); unmatched += 1; a = dfa.match(in, out, smd); } while (a == null && unmatched < maxCopy); matchStart += unmatched; } else if (onFailedMatch == UNMATCHED_DROP) { do { in.read(); a = dfa.match(in, out, smd); } while (a == null); } else { // everything else is a failure String emsg = lookahead(); throw new NomatchException( "no matching regular expression " + "when looking at `" + emsg + "'"); } } // We handle EOF and eofAction as if we have found a match if (a == EOF && dfa.eofAction != null && eofArmed) { eofArmed = false; return dfa.eofAction; } return a; }
/** * fetch a bit of lookahead for use in messages for exceptions. The lookahead is pushed back into * the input afterwards. */ private String lookahead() { // Read up to 30 chars for a decent error message StringBuilder sb = new StringBuilder(30); int i; try { for (i = 0; i < 30; i++) { int ch = in.read(); if (ch == -1) break; sb.append((char) ch); } } catch (java.io.IOException e) { in.pushBack(sb, 0); return "IOException when trying to generate context info"; } String result; if (i == 30) result = sb.substring(0, 27) + "..."; else result = sb.toString() + "[EOF]"; in.pushBack(sb, 0); return result; }
/**
 * Reads a single character straight from the input source and returns it unfiltered. Filtered
 * characters that are already available because of a previous {@link #read()} are left
 * untouched and will be used by the next call to one of the <code>read()</code> functions.
 *
 * @return the value delivered by the underlying input's <code>read()</code>
 * @throws java.io.IOException if reading from the input fails
 */
public int skip() throws java.io.IOException {
  int raw = in.read();
  return raw;
}