/** * Populates a {@link Tag} object using data from the supplied {@link CharArray}. * * <p>The supplied tag parameter is reset and reused - this avoids excess object creation which * hwlps performance. * * @return the same tag instance that was passed in, except it will be populated with a new * <tt>name</tt> value (and the corresponding <tt>nameEndIdx</tt> value). However if the tag * contained nathing but whitespace, this method will return <tt>null</tt>. */ private Tag parseTag(Tag tag, CharArray buf) { int len = buf.length(); int idx = 0; int begin; // Skip over any leading whitespace in the tag while (idx < len && Character.isWhitespace(buf.charAt(idx))) idx++; if (idx == len) return null; // Find out where the non-whitespace characters end. This will give us the tag name. begin = idx; while (idx < len && !Character.isWhitespace(buf.charAt(idx))) idx++; // Mark the tag name as a substring within the buffer. This allows us to perform // a substring comparison against it at a later date buf.setSubstr(begin, buf.charAt(idx - 1) == '/' ? idx - 1 : idx); // Remember where the name finishes so we can pull out the properties later if need be tag.nameEndIdx = idx; return tag; }
public Page parse(SitemeshBuffer buffer) throws IOException { CharArrayReader reader = new CharArrayReader(buffer.getCharArray(), 0, buffer.getBufferLength()); CharArray _buffer = new CharArray(4096); CharArray _body = new CharArray(4096); CharArray _head = new CharArray(512); CharArray _title = new CharArray(128); Map _htmlProperties = null; Map _metaProperties = new HashMap(6); Map _sitemeshProperties = new HashMap(6); Map _bodyProperties = null; CharArray _currentTaggedContent = new CharArray(1024); String _contentTagId = null; boolean tagged = false; boolean _frameSet = false; int _state = STATE_TEXT; int _tokenType = TOKEN_NONE; int _pushBack = 0; int _comment = 0; int _quote = 0; boolean hide = false; int state = TAG_STATE_NONE; int laststate = TAG_STATE_NONE; boolean doneTitle = false; // This tag object gets reused each iteration. Tag tagObject = new Tag(); while (_tokenType != TOKEN_EOF) { if (tagged) { if (_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) { if (_buffer == null || _buffer.length() == 0) { _tokenType = TOKEN_NONE; continue; } if (parseTag(tagObject, _buffer) == null) continue; if (_buffer.compareLowerSubstr( "/content")) // Note that the '/' survives the | 32 operation { tagged = false; if (_contentTagId != null) { state = TAG_STATE_NONE; _sitemeshProperties.put(_contentTagId, _currentTaggedContent.toString()); _currentTaggedContent.setLength(0); _contentTagId = null; } } else { _currentTaggedContent.append('<').append(_buffer).append('>'); } } else { if (_buffer.length() > 0) _currentTaggedContent.append(_buffer); } } else { if (_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) { if (_buffer == null || _buffer.length() == 0) { _tokenType = TOKEN_NONE; continue; } if (parseTag(tagObject, _buffer) == null) { _tokenType = TOKEN_TEXT; continue; } int tagHash = _buffer.substrHashCode(); if (state == TAG_STATE_XML || state == TAG_STATE_XMP) { writeTag(state, laststate, hide, _head, _buffer, _body); if ((state == TAG_STATE_XML && tagHash == SLASH_XML_HASH) || (state == TAG_STATE_XMP && tagHash == SLASH_XMP_HASH)) { state = laststate; } } else { boolean doDefault = false; switch (tagHash) { case HTML_HASH: if (!_buffer.compareLowerSubstr("html")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HTML; _htmlProperties = parseProperties(tagObject, _buffer).properties; break; case HEAD_HASH: if (!_buffer.compareLowerSubstr("head")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HEAD; break; case XML_HASH: if (!_buffer.compareLowerSubstr("xml")) { // skip any accidental hash collisions doDefault = true; break; } laststate = state; writeTag(state, laststate, hide, _head, _buffer, _body); state = TAG_STATE_XML; break; case XMP_HASH: if (!_buffer.compareLowerSubstr("xmp")) { // skip any accidental hash collisions doDefault = true; break; } laststate = state; writeTag(state, laststate, hide, _head, _buffer, _body); state = TAG_STATE_XMP; break; case TITLE_HASH: if (!_buffer.compareLowerSubstr("title")) { // skip any accidental hash collisions doDefault = true; break; } if (doneTitle) { hide = true; } else { laststate = state; state = TAG_STATE_TITLE; } break; case SLASH_TITLE_HASH: if (!_buffer.compareLowerSubstr("/title")) { // skip any accidental hash collisions doDefault = true; break; } if (doneTitle) { hide = false; } else { doneTitle = true; state = laststate; } break; case PARAMETER_HASH: if (!_buffer.compareLowerSubstr( "parameter")) { // skip any accidental hash collisions doDefault = true; break; } parseProperties(tagObject, _buffer); String name = (String) tagObject.properties.get("name"); String value = (String) tagObject.properties.get("value"); if (name != null && value != null) { _sitemeshProperties.put(name, value); } break; case META_HASH: if (!_buffer.compareLowerSubstr("meta")) { // skip any accidental hash collisions doDefault = true; break; } CharArray metaDestination = state == TAG_STATE_HEAD ? _head : _body; metaDestination.append('<'); metaDestination.append(_buffer); metaDestination.append('>'); parseProperties(tagObject, _buffer); name = (String) tagObject.properties.get("name"); value = (String) tagObject.properties.get("content"); if (name == null) { String httpEquiv = (String) tagObject.properties.get("http-equiv"); if (httpEquiv != null) { name = "http-equiv." + httpEquiv; } } if (name != null && value != null) { _metaProperties.put(name, value); } break; case SLASH_HEAD_HASH: if (!_buffer.compareLowerSubstr("/head")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_HTML; break; case FRAME_HASH: if (!_buffer.compareLowerSubstr("frame")) { // skip any accidental hash collisions doDefault = true; break; } _frameSet = true; break; case FRAMESET_HASH: if (!_buffer.compareLowerSubstr( "frameset")) { // skip any accidental hash collisions doDefault = true; break; } _frameSet = true; break; case BODY_HASH: if (!_buffer.compareLowerSubstr("body")) { // skip any accidental hash collisions doDefault = true; break; } if (_tokenType == TOKEN_EMPTYTAG) { state = TAG_STATE_BODY; } _bodyProperties = parseProperties(tagObject, _buffer).properties; break; case CONTENT_HASH: if (!_buffer.compareLowerSubstr("content")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; Map props = parseProperties(tagObject, _buffer).properties; if (props != null) { tagged = true; _contentTagId = (String) props.get("tag"); } break; case SLASH_XMP_HASH: if (!_buffer.compareLowerSubstr("/xmp")) { // skip any accidental hash collisions doDefault = true; break; } hide = false; break; case SLASH_BODY_HASH: if (!_buffer.compareLowerSubstr("/body")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; hide = true; break; case SLASH_HTML_HASH: if (!_buffer.compareLowerSubstr("/html")) { // skip any accidental hash collisions doDefault = true; break; } state = TAG_STATE_NONE; hide = true; break; default: doDefault = true; } if (doDefault) writeTag(state, laststate, hide, _head, _buffer, _body); } } else if (!hide) { if (_tokenType == TOKEN_TEXT) { if (state == TAG_STATE_TITLE) { _title.append(_buffer); } else if (shouldWriteToHead(state, laststate)) { _head.append(_buffer); } else { _body.append(_buffer); } } else if (_tokenType == TOKEN_COMMENT) { final CharArray commentDestination = shouldWriteToHead(state, laststate) ? _head : _body; commentDestination.append("<!--"); commentDestination.append(_buffer); commentDestination.append("-->"); } else if (_tokenType == TOKEN_CDATA) { final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; commentDestination.append("<![CDATA["); commentDestination.append(_buffer); commentDestination.append("]]>"); } else if (_tokenType == TOKEN_SCRIPT) { final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body; commentDestination.append('<'); commentDestination.append(_buffer); } } } _buffer.setLength(0); start: while (true) { int c; if (_pushBack != 0) { c = _pushBack; _pushBack = 0; } else { try { c = reader.read(); } catch (IOException e) { _tokenType = TOKEN_EOF; break start; } } if (c < 0) { int tmpstate = _state; _state = STATE_EOF; if (_buffer.length() > 0 && tmpstate == STATE_TEXT) { _tokenType = TOKEN_TEXT; break start; } else { _tokenType = TOKEN_EOF; break start; } } switch (_state) { case STATE_TAG: { int buflen = _buffer.length(); if (c == '>') { if (_buffer.length() > 1 && _buffer.charAt(_buffer.length() - 1) == '/') { _tokenType = TOKEN_EMPTYTAG; } else { _tokenType = TOKEN_TAG; } _state = STATE_TEXT; break start; } else if (c == '/') { _buffer.append('/'); } else if (c == '<' && buflen == 0) { _buffer.append("<<"); _state = STATE_TEXT; } else if (c == '-' && buflen == 2 && _buffer.charAt(1) == '-' && _buffer.charAt(0) == '!') { _buffer.setLength(0); _state = STATE_COMMENT; } else if (c == '[' && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.charAt(1) == '[' && _buffer.compareLower("cdata", 2)) { _buffer.setLength(0); _state = STATE_CDATA; } else if ((c == 'e' || c == 'E') && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.compareLower("doctyp", 1)) { _buffer.append((char) c); _state = STATE_DOCTYPE; } else if ((c == 'T' || c == 't') && buflen == 5 && _buffer.compareLower("scrip", 0)) { _buffer.append((char) c); _state = STATE_SCRIPT; } else if (c == '"' || c == '\'') { _quote = c; _buffer.append((char) c); _state = STATE_TAG_QUOTE; } else { _buffer.append((char) c); } } break; case STATE_TEXT: { if (c == '<') { _state = STATE_TAG; if (_buffer.length() > 0) { _tokenType = TOKEN_TEXT; break start; } } else { _buffer.append((char) c); } } break; case STATE_TAG_QUOTE: { if (c == '>') { _pushBack = c; _state = STATE_TAG; } else { _buffer.append((char) c); if (c == _quote) { _state = STATE_TAG; } } } break; case STATE_COMMENT: { if (c == '>' && _comment >= 2) { _buffer.setLength(_buffer.length() - 2); _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_COMMENT; break start; } else if (c == '-') { _comment++; } else { _comment = 0; } _buffer.append((char) c); } break; case STATE_CDATA: { if (c == '>' && _comment >= 2) { _buffer.setLength(_buffer.length() - 2); _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_CDATA; break start; } else if (c == ']') { _comment++; } else { _comment = 0; } _buffer.append((char) c); } break; case STATE_SCRIPT: { _buffer.append((char) c); if (c == '<') { _comment = 0; } else if ((c == '/' && _comment == 0) || ((c == 's' || c == 'S') && _comment == 1) || ((c == 'c' || c == 'C') && _comment == 2) || ((c == 'r' || c == 'R') && _comment == 3) || ((c == 'i' || c == 'I') && _comment == 4) || ((c == 'p' || c == 'P') && _comment == 5) || ((c == 't' || c == 'T') && _comment == 6)) { _comment++; } else if (c == '>' && _comment >= 7) { _comment = 0; _state = STATE_TEXT; _tokenType = TOKEN_SCRIPT; break start; } } break; case STATE_DOCTYPE: { _buffer.append((char) c); if (c == '>') { _state = STATE_TEXT; _tokenType = TOKEN_DOCTYPE; break start; } else { _comment = 0; } } break; } } } // Help the GC _currentTaggedContent = null; _buffer = null; return new FastPage( buffer, _sitemeshProperties, _htmlProperties, _metaProperties, _bodyProperties, _title.toString().trim(), _head.toString().trim(), _body.toString().trim(), _frameSet); }
/** * This is called when we need to extract the properties for the tag from the tag's HTML. We only * call this when necessary since it has quite a lot of overhead. * * @param tag the tag that is currently being processed. This should be the tag that was returned * as a result of a call to {@link #parseTag(FastPageParser.Tag, CharArray)} (ie, it has the * <tt>name</tt> and <tt>nameEndIdx</tt> fields set correctly for the tag in question. The * <tt>properties</tt> field can be in an undefined state - it will get replaced regardless). * @param buffer a <tt>CharArray</tt> containing the entire tag that is being parsed. * @return the same tag instance that was passed in, only it will now be populated with any * properties that were specified in the tag's HTML. */ private static Tag parseProperties(Tag tag, CharArray buffer) { int len = buffer.length(); int idx = tag.nameEndIdx; // Start with an empty hashmap. A new HashMap is lazy-created if we happen to find any // properties tag.properties = Collections.EMPTY_MAP; int begin; while (idx < len) { // Skip forward to the next non-whitespace character while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; if (idx == len) continue; begin = idx; if (buffer.charAt(idx) == '"') { idx++; while (idx < len && buffer.charAt(idx) != '"') idx++; if (idx == len) continue; idx++; } else if (buffer.charAt(idx) == '\'') { idx++; while (idx < len && buffer.charAt(idx) != '\'') idx++; if (idx == len) continue; idx++; } else { while (idx < len && !Character.isWhitespace(buffer.charAt(idx)) && buffer.charAt(idx) != '=') idx++; } // Mark the substring. This is the attribute name buffer.setSubstr(begin, idx); if (idx < len && Character.isWhitespace(buffer.charAt(idx))) { while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; } if (idx == len || buffer.charAt(idx) != '=') continue; idx++; if (idx == len) continue; while (idx < len && (buffer.charAt(idx) == '\n' || buffer.charAt(idx) == '\r')) idx++; if (buffer.charAt(idx) == ' ') { while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++; if (idx == len || (buffer.charAt(idx) != '"' && buffer.charAt(idx) != '"')) continue; } begin = idx; int end; if (buffer.charAt(idx) == '"') { idx++; begin = idx; while (idx < len && buffer.charAt(idx) != '"') idx++; if (idx == len) continue; end = idx; idx++; } else if (buffer.charAt(idx) == '\'') { idx++; begin = idx; while (idx < len && buffer.charAt(idx) != '\'') idx++; if (idx == len) continue; end = idx; idx++; } else { while (idx < len && !Character.isWhitespace(buffer.charAt(idx))) idx++; end = idx; } // Extract the name and value as String objects and add them to the property map String name = buffer.getLowerSubstr(); String value = buffer.substring(begin, end); tag.addProperty(name, value); } return tag; }