Exemplo n.º 1
0
  /**
   * Locates the tag name inside the supplied {@link CharArray} and records its position.
   *
   * <p>The name is the first run of non-whitespace characters in {@code buf}. It is marked on the
   * buffer via {@code setSubstr} so callers can run substring comparisons against it without
   * allocating a string. If that run ends with a {@code '/'} (as in {@code br/}), the slash is
   * left out of the marked name.
   *
   * <p>The supplied {@link Tag} instance is reused rather than a new one being created, which
   * avoids excess object creation.
   *
   * @param tag the tag instance to update; its {@code nameEndIdx} is set to the index just past
   *     the run of non-whitespace characters (a trailing {@code '/'} is included in that run)
   * @param buf the raw contents of the tag
   * @return the same tag instance that was passed in, or {@code null} if the buffer is empty or
   *     contains nothing but whitespace (in which case neither argument is modified)
   */
  private Tag parseTag(Tag tag, CharArray buf) {
    final int length = buf.length();

    // Advance past any whitespace that precedes the tag name.
    int nameStart = 0;
    while (nameStart < length && Character.isWhitespace(buf.charAt(nameStart))) {
      nameStart++;
    }

    // Nothing but whitespace (or nothing at all): there is no tag name to report.
    if (nameStart == length) {
      return null;
    }

    // The name runs up to the next whitespace character or the end of the buffer.
    int nameEnd = nameStart;
    while (nameEnd < length && !Character.isWhitespace(buf.charAt(nameEnd))) {
      nameEnd++;
    }

    // Mark the name on the buffer, dropping a trailing '/' so that "br/" is marked as "br".
    int markedEnd = nameEnd;
    if (buf.charAt(nameEnd - 1) == '/') {
      markedEnd = nameEnd - 1;
    }
    buf.setSubstr(nameStart, markedEnd);

    // Attribute parsing, if it is needed later, resumes from this index.
    tag.nameEndIdx = nameEnd;

    return tag;
  }
Exemplo n.º 2
0
  /**
   * Tokenizes the character content of the supplied buffer in a single pass and builds a page
   * from it.
   *
   * <p>Each iteration of the outer loop first processes the token produced by the previous
   * iteration (routing it to the title, head, body or tagged-content accumulator, or recording
   * properties from it) and then runs the inner character loop to produce the next token.
   *
   * <p>While inside a {@code <content tag="...">} element, everything up to the matching
   * {@code </content>} is accumulated verbatim and stored in the SiteMesh property map under the
   * value of the {@code tag} attribute.
   *
   * @param buffer the buffer whose character array is parsed
   * @return a {@code FastPage} built from the original buffer, the collected property maps, the
   *     trimmed title, head and body text, and the frameset flag
   * @throws IOException declared by the signature; read failures from the in-memory reader are
   *     treated as end of input inside the loop rather than propagated
   */
  public Page parse(SitemeshBuffer buffer) throws IOException {
    CharArrayReader reader =
        new CharArrayReader(buffer.getCharArray(), 0, buffer.getBufferLength());
    // Holds the text of the token currently being read / processed.
    CharArray _buffer = new CharArray(4096);
    CharArray _body = new CharArray(4096);
    CharArray _head = new CharArray(512);
    CharArray _title = new CharArray(128);
    Map _htmlProperties = null;
    Map _metaProperties = new HashMap(6);
    Map _sitemeshProperties = new HashMap(6);
    Map _bodyProperties = null;

    // Accumulates everything seen between <content tag="..."> and </content>.
    CharArray _currentTaggedContent = new CharArray(1024);
    String _contentTagId = null;
    boolean tagged = false;

    boolean _frameSet = false;

    // Tokenizer state: _state is the lexer mode, _tokenType the kind of token just produced.
    int _state = STATE_TEXT;
    int _tokenType = TOKEN_NONE;
    // A single character to be re-read before consuming more input (0 means none).
    int _pushBack = 0;
    // Progress counter shared by the comment, CDATA and script terminator matching.
    int _comment = 0;
    // The quote character that opened the attribute value currently being read.
    int _quote = 0;
    boolean hide = false;

    // Document-level state: which section of the page tokens currently belong to.
    int state = TAG_STATE_NONE;
    int laststate = TAG_STATE_NONE;
    boolean doneTitle = false;

    // This tag object gets reused each iteration.
    Tag tagObject = new Tag();

    while (_tokenType != TOKEN_EOF) {
      if (tagged) {
        if (_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) {
          if (_buffer == null || _buffer.length() == 0) {
            _tokenType = TOKEN_NONE;
            continue;
          }

          if (parseTag(tagObject, _buffer) == null) {
            // Whitespace-only tag. Re-dispatch it as text (as the untagged branch below does)
            // so the buffer is consumed and the tokenizer advances. A bare 'continue' here
            // would re-enter this branch with the same token and buffer and never terminate.
            _tokenType = TOKEN_TEXT;
            continue;
          }

          if (_buffer.compareLowerSubstr(
              "/content")) // Note that the '/' survives the | 32 operation
          {
            tagged = false;
            if (_contentTagId != null) {
              state = TAG_STATE_NONE;
              _sitemeshProperties.put(_contentTagId, _currentTaggedContent.toString());
              _currentTaggedContent.setLength(0);
              _contentTagId = null;
            }
          } else {
            // Any other tag inside <content> is kept verbatim as part of the tagged content.
            _currentTaggedContent.append('<').append(_buffer).append('>');
          }
        } else {
          if (_buffer.length() > 0) _currentTaggedContent.append(_buffer);
        }
      } else {
        if (_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG) {
          if (_buffer == null || _buffer.length() == 0) {
            _tokenType = TOKEN_NONE;
            continue;
          }

          if (parseTag(tagObject, _buffer) == null) {
            // Whitespace-only tag: process its contents as plain text on the next pass.
            _tokenType = TOKEN_TEXT;
            continue;
          }

          int tagHash = _buffer.substrHashCode();

          if (state == TAG_STATE_XML || state == TAG_STATE_XMP) {
            // Inside <xml>/<xmp> every tag is written through untouched; only the matching
            // closing tag restores the previous state.
            writeTag(state, laststate, hide, _head, _buffer, _body);
            if ((state == TAG_STATE_XML && tagHash == SLASH_XML_HASH)
                || (state == TAG_STATE_XMP && tagHash == SLASH_XMP_HASH)) {
              state = laststate;
            }
          } else {
            // Dispatch on the hash of the tag name, then confirm with a real comparison. Tags
            // that are not handled specially (or that collide on the hash) are written out
            // through writeTag.
            boolean doDefault = false;
            switch (tagHash) {
              case HTML_HASH:
                if (!_buffer.compareLowerSubstr("html")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_HTML;
                _htmlProperties = parseProperties(tagObject, _buffer).properties;
                break;
              case HEAD_HASH:
                if (!_buffer.compareLowerSubstr("head")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_HEAD;
                break;
              case XML_HASH:
                if (!_buffer.compareLowerSubstr("xml")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                laststate = state;
                writeTag(state, laststate, hide, _head, _buffer, _body);
                state = TAG_STATE_XML;
                break;
              case XMP_HASH:
                if (!_buffer.compareLowerSubstr("xmp")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                laststate = state;
                writeTag(state, laststate, hide, _head, _buffer, _body);
                state = TAG_STATE_XMP;
                break;
              case TITLE_HASH:
                if (!_buffer.compareLowerSubstr("title")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                if (doneTitle) {
                  // Only the first title is captured; later ones are hidden until </title>.
                  hide = true;
                } else {
                  laststate = state;
                  state = TAG_STATE_TITLE;
                }
                break;
              case SLASH_TITLE_HASH:
                if (!_buffer.compareLowerSubstr("/title")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                if (doneTitle) {
                  hide = false;
                } else {
                  doneTitle = true;
                  state = laststate;
                }
                break;
              case PARAMETER_HASH:
                if (!_buffer.compareLowerSubstr(
                    "parameter")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                parseProperties(tagObject, _buffer);
                String name = (String) tagObject.properties.get("name");
                String value = (String) tagObject.properties.get("value");

                if (name != null && value != null) {
                  _sitemeshProperties.put(name, value);
                }
                break;
              case META_HASH:
                if (!_buffer.compareLowerSubstr("meta")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                // The meta tag is both echoed to the output and recorded as a property.
                CharArray metaDestination = state == TAG_STATE_HEAD ? _head : _body;
                metaDestination.append('<');
                metaDestination.append(_buffer);
                metaDestination.append('>');
                parseProperties(tagObject, _buffer);
                name = (String) tagObject.properties.get("name");
                value = (String) tagObject.properties.get("content");

                if (name == null) {
                  String httpEquiv = (String) tagObject.properties.get("http-equiv");

                  if (httpEquiv != null) {
                    name = "http-equiv." + httpEquiv;
                  }
                }

                if (name != null && value != null) {
                  _metaProperties.put(name, value);
                }
                break;
              case SLASH_HEAD_HASH:
                if (!_buffer.compareLowerSubstr("/head")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_HTML;
                break;
              case FRAME_HASH:
                if (!_buffer.compareLowerSubstr("frame")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                _frameSet = true;
                break;
              case FRAMESET_HASH:
                if (!_buffer.compareLowerSubstr(
                    "frameset")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                _frameSet = true;
                break;
              case BODY_HASH:
                if (!_buffer.compareLowerSubstr("body")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                if (_tokenType == TOKEN_EMPTYTAG) {
                  state = TAG_STATE_BODY;
                }
                _bodyProperties = parseProperties(tagObject, _buffer).properties;
                break;
              case CONTENT_HASH:
                if (!_buffer.compareLowerSubstr("content")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_NONE;
                Map props = parseProperties(tagObject, _buffer).properties;
                if (props != null) {
                  // From here on tokens are routed to the tagged-content branch above.
                  tagged = true;
                  _contentTagId = (String) props.get("tag");
                }
                break;
              case SLASH_XMP_HASH:
                if (!_buffer.compareLowerSubstr("/xmp")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                hide = false;
                break;
              case SLASH_BODY_HASH:
                if (!_buffer.compareLowerSubstr("/body")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_NONE;
                hide = true;
                break;
              case SLASH_HTML_HASH:
                if (!_buffer.compareLowerSubstr("/html")) { // skip any accidental hash collisions
                  doDefault = true;
                  break;
                }
                state = TAG_STATE_NONE;
                hide = true;
                break;
              default:
                doDefault = true;
            }
            if (doDefault) writeTag(state, laststate, hide, _head, _buffer, _body);
          }
        } else if (!hide) {
          if (_tokenType == TOKEN_TEXT) {
            if (state == TAG_STATE_TITLE) {
              _title.append(_buffer);
            } else if (shouldWriteToHead(state, laststate)) {
              _head.append(_buffer);
            } else {
              _body.append(_buffer);
            }
          } else if (_tokenType == TOKEN_COMMENT) {
            // The tokenizer strips the comment delimiters, so they are restored on output.
            final CharArray commentDestination =
                shouldWriteToHead(state, laststate) ? _head : _body;
            commentDestination.append("<!--");
            commentDestination.append(_buffer);
            commentDestination.append("-->");
          } else if (_tokenType == TOKEN_CDATA) {
            final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body;
            commentDestination.append("<![CDATA[");
            commentDestination.append(_buffer);
            commentDestination.append("]]>");
          } else if (_tokenType == TOKEN_SCRIPT) {
            // The script token already contains everything after the opening '<', including
            // the closing tag, so only the leading '<' has to be restored.
            final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body;
            commentDestination.append('<');
            commentDestination.append(_buffer);
          }
        }
      }
      // The previous token has been handled; start collecting the next one.
      _buffer.setLength(0);

      start:
      while (true) {
        int c;

        if (_pushBack != 0) {
          c = _pushBack;
          _pushBack = 0;
        } else {
          try {
            c = reader.read();
          } catch (IOException e) {
            // Best effort: a failed read is treated as end of input.
            _tokenType = TOKEN_EOF;
            break start;
          }
        }

        if (c < 0) {
          int tmpstate = _state;
          _state = STATE_EOF;

          // Trailing text is emitted as a final text token; the following read then reports
          // end of input again and produces TOKEN_EOF.
          if (_buffer.length() > 0 && tmpstate == STATE_TEXT) {
            _tokenType = TOKEN_TEXT;
            break start;
          } else {
            _tokenType = TOKEN_EOF;
            break start;
          }
        }

        switch (_state) {
          case STATE_TAG:
            {
              int buflen = _buffer.length();

              if (c == '>') {
                // A tag whose last character is '/' is reported as an empty tag.
                if (_buffer.length() > 1 && _buffer.charAt(_buffer.length() - 1) == '/') {
                  _tokenType = TOKEN_EMPTYTAG;
                } else {
                  _tokenType = TOKEN_TAG;
                }
                _state = STATE_TEXT;
                break start;
              } else if (c == '/') {
                _buffer.append('/');
              } else if (c == '<' && buflen == 0) {
                // "<<" is not a tag; keep both characters and carry on as text.
                _buffer.append("<<");
                _state = STATE_TEXT;
              } else if (c == '-'
                  && buflen == 2
                  && _buffer.charAt(1) == '-'
                  && _buffer.charAt(0) == '!') {
                // "<!--" : start of a comment.
                _buffer.setLength(0);
                _state = STATE_COMMENT;
              } else if (c == '['
                  && buflen == 7
                  && _buffer.charAt(0) == '!'
                  && _buffer.charAt(1) == '['
                  && _buffer.compareLower("cdata", 2)) {
                // "<![CDATA[" : start of a CDATA section.
                _buffer.setLength(0);
                _state = STATE_CDATA;
              } else if ((c == 'e' || c == 'E')
                  && buflen == 7
                  && _buffer.charAt(0) == '!'
                  && _buffer.compareLower("doctyp", 1)) {
                // "<!DOCTYPE" : start of a doctype declaration.
                _buffer.append((char) c);
                _state = STATE_DOCTYPE;
              } else if ((c == 'T' || c == 't')
                  && buflen == 5
                  && _buffer.compareLower("scrip", 0)) {
                // "<script" : everything up to the closing script tag is read as one token.
                _buffer.append((char) c);
                _state = STATE_SCRIPT;
              } else if (c == '"' || c == '\'') {
                _quote = c;
                _buffer.append((char) c);
                _state = STATE_TAG_QUOTE;
              } else {
                _buffer.append((char) c);
              }
            }
            break;

          case STATE_TEXT:
            {
              if (c == '<') {
                _state = STATE_TAG;
                // Emit any text collected so far before starting on the tag.
                if (_buffer.length() > 0) {
                  _tokenType = TOKEN_TEXT;
                  break start;
                }
              } else {
                _buffer.append((char) c);
              }
            }
            break;

          case STATE_TAG_QUOTE:
            {
              if (c == '>') {
                // An unterminated quote must not swallow the end of the tag: hand the '>'
                // back so the tag state can close the tag.
                _pushBack = c;
                _state = STATE_TAG;
              } else {
                _buffer.append((char) c);
                if (c == _quote) {
                  _state = STATE_TAG;
                }
              }
            }
            break;

          case STATE_COMMENT:
            {
              // _comment counts consecutive '-' characters; "-->" ends the comment.
              if (c == '>' && _comment >= 2) {
                // Drop the two trailing dashes that were already appended.
                _buffer.setLength(_buffer.length() - 2);
                _comment = 0;
                _state = STATE_TEXT;
                _tokenType = TOKEN_COMMENT;
                break start;
              } else if (c == '-') {
                _comment++;
              } else {
                _comment = 0;
              }

              _buffer.append((char) c);
            }
            break;

          case STATE_CDATA:
            {
              // _comment counts consecutive ']' characters; "]]>" ends the section.
              if (c == '>' && _comment >= 2) {
                // Drop the two trailing brackets that were already appended.
                _buffer.setLength(_buffer.length() - 2);
                _comment = 0;
                _state = STATE_TEXT;
                _tokenType = TOKEN_CDATA;
                break start;
              } else if (c == ']') {
                _comment++;
              } else {
                _comment = 0;
              }

              _buffer.append((char) c);
            }
            break;

          case STATE_SCRIPT:
            {
              // _comment tracks how many characters of "/script" have been matched so far.
              // NOTE(review): only '<' resets the counter; a non-matching character leaves it
              // unchanged, so the end-tag match is lenient.
              _buffer.append((char) c);
              if (c == '<') {
                _comment = 0;
              } else if ((c == '/' && _comment == 0)
                  || ((c == 's' || c == 'S') && _comment == 1)
                  || ((c == 'c' || c == 'C') && _comment == 2)
                  || ((c == 'r' || c == 'R') && _comment == 3)
                  || ((c == 'i' || c == 'I') && _comment == 4)
                  || ((c == 'p' || c == 'P') && _comment == 5)
                  || ((c == 't' || c == 'T') && _comment == 6)) {
                _comment++;
              } else if (c == '>' && _comment >= 7) {
                _comment = 0;
                _state = STATE_TEXT;
                _tokenType = TOKEN_SCRIPT;
                break start;
              }
            }
            break;

          case STATE_DOCTYPE:
            {
              _buffer.append((char) c);
              if (c == '>') {
                _state = STATE_TEXT;
                _tokenType = TOKEN_DOCTYPE;
                break start;
              } else {
                _comment = 0;
              }
            }
            break;
        }
      }
    }

    // Help the GC
    _currentTaggedContent = null;
    _buffer = null;

    return new FastPage(
        buffer,
        _sitemeshProperties,
        _htmlProperties,
        _metaProperties,
        _bodyProperties,
        _title.toString().trim(),
        _head.toString().trim(),
        _body.toString().trim(),
        _frameSet);
  }
Exemplo n.º 3
0
  /**
   * Extracts the attributes of a tag from the tag's HTML. This is only called when the attributes
   * are actually needed, since it has quite a lot of overhead.
   *
   * <p>Parsing starts at {@code tag.nameEndIdx}. Attribute names may be bare, double-quoted or
   * single-quoted; values may be double-quoted, single-quoted or unquoted. When whitespace
   * follows the {@code '='}, the value is only accepted if it is quoted. Attributes without a
   * value are skipped. Names are stored via {@code getLowerSubstr()}.
   *
   * @param tag the tag that is currently being processed. This should be the tag that was returned
   *     as a result of a call to {@link #parseTag(FastPageParser.Tag, CharArray)} (ie, its
   *     {@code nameEndIdx} field is set correctly for the tag in question). The
   *     {@code properties} field can be in an undefined state - it gets replaced regardless.
   * @param buffer a {@code CharArray} containing the entire tag that is being parsed.
   * @return the same tag instance that was passed in, with its {@code properties} replaced and
   *     any attributes that were found added through {@code addProperty}.
   */
  private static Tag parseProperties(Tag tag, CharArray buffer) {
    int len = buffer.length();
    int idx = tag.nameEndIdx;

    // Start with the shared empty map; presumably addProperty swaps in a mutable map when the
    // first attribute is added - verify against Tag.addProperty.
    tag.properties = Collections.EMPTY_MAP;
    int begin;
    while (idx < len) {
      // Skip forward to the next non-whitespace character
      while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;

      if (idx == len) continue;

      // Scan the attribute name, which may itself be quoted.
      begin = idx;
      if (buffer.charAt(idx) == '"') {
        idx++;
        while (idx < len && buffer.charAt(idx) != '"') idx++;
        if (idx == len) continue;
        idx++;
      } else if (buffer.charAt(idx) == '\'') {
        idx++;
        while (idx < len && buffer.charAt(idx) != '\'') idx++;
        if (idx == len) continue;
        idx++;
      } else {
        while (idx < len
            && !Character.isWhitespace(buffer.charAt(idx))
            && buffer.charAt(idx) != '=') idx++;
      }

      // Mark the substring. This is the attribute name
      buffer.setSubstr(begin, idx);

      if (idx < len && Character.isWhitespace(buffer.charAt(idx))) {
        while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;
      }

      // No '=' means the attribute has no value; move on to the next one.
      if (idx == len || buffer.charAt(idx) != '=') continue;

      idx++;

      if (idx == len) continue;

      while (idx < len && (buffer.charAt(idx) == '\n' || buffer.charAt(idx) == '\r')) idx++;

      // The tag may end right after the line break(s); stop rather than read past the end.
      if (idx == len) continue;

      if (buffer.charAt(idx) == ' ') {
        while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;
        // After whitespace only a quoted value (either quote style) belongs to this attribute;
        // anything else is treated as the start of the next attribute.
        if (idx == len || (buffer.charAt(idx) != '"' && buffer.charAt(idx) != '\'')) continue;
      }

      // Scan the value. For quoted values the quotes themselves are excluded.
      begin = idx;
      int end;
      if (buffer.charAt(idx) == '"') {
        idx++;
        begin = idx;
        while (idx < len && buffer.charAt(idx) != '"') idx++;
        if (idx == len) continue;
        end = idx;
        idx++;
      } else if (buffer.charAt(idx) == '\'') {
        idx++;
        begin = idx;
        while (idx < len && buffer.charAt(idx) != '\'') idx++;
        if (idx == len) continue;
        end = idx;
        idx++;
      } else {
        while (idx < len && !Character.isWhitespace(buffer.charAt(idx))) idx++;
        end = idx;
      }
      // Extract the name and value as String objects and add them to the property map
      String name = buffer.getLowerSubstr();
      String value = buffer.substring(begin, end);

      tag.addProperty(name, value);
    }
    return tag;
  }