/**
 * Parses a news listing page into news items.
 *
 * <p>HTML comments are stripped from the input, the element with class {@code box_news} is
 * looked up, and every element with class {@code subItem} found inside it produces one item:
 * the title is the plain text of the {@code subItemtitle} element, the description is the plain
 * text of the node four siblings after that title element (with characters outside
 * U+0000..U+00FF replaced by spaces and the "Read More..." suffix removed), and the URL is
 * {@code URL_PREFIX} followed by the {@code href} of the first link returned by
 * {@code extractLinks} for the pattern {@code /index.php.*}.
 *
 * @param content raw HTML of the page
 * @return the parsed items, in the order {@code extractTagsByClassName} returned them
 * @throws Exception propagated from the extraction helpers
 */
public List<Newsitem> parseContent(String content) throws Exception {
    String commentFreeHtml = this.stripHtmlComments(content);
    Tag newsContainer = this.extractTagByClassName(commentFreeHtml, "box_news");
    NodeList itemNodes = this.extractTagsByClassName(newsContainer.toHtml(), "subItem");

    List<Newsitem> parsedItems = new ArrayList<Newsitem>();
    for (int index = 0; index < itemNodes.size(); index++) {
      NewsitemImpl item = new NewsitemImpl();
      Tag itemTable = (Tag) itemNodes.elementAt(index);
      String itemHtml = itemTable.toHtml();

      // Title: plain text of the dedicated title element.
      Tag titleTag = this.extractTagByClassName(itemHtml, "subItemtitle");
      item.setTitle(titleTag.toPlainTextString());

      // Description: the node four siblings after the title element.
      Node descriptionNode = titleTag.getNextSibling();
      for (int hop = 1; hop < 4; hop++) {
        descriptionNode = descriptionNode.getNextSibling();
      }
      String rawDescription = descriptionNode.toPlainTextString();
      String latin1Only = rawDescription.replaceAll("[^\\u0000-\\u00FF]", " ");
      item.setDescription(latin1Only.replace("&nbsp;Read More...", "").trim());

      // URL: first matching link inside the item, made absolute with the site prefix.
      Tag firstLink = (Tag) extractLinks(itemHtml, "/index.php.*").elementAt(0);
      item.setUrl(URL_PREFIX + firstLink.getAttribute("href"));

      parsedItems.add(item);
    }
    return parsedItems;
  }
  /**
   * Checks the parent links produced when an unclosed custom tag, a complete custom tag and a
   * stray end tag are parsed in sequence: all three top-level nodes must have no parent, while
   * the text inside the complete tag must point back to that tag.
   */
  public void testParentConnections() throws ParserException {
    String openOnly = "<custom>";
    String complete = "<custom>something</custom>";
    String closeOnly = "</custom>";
    createParser(openOnly + complete + closeOnly);
    Tag[] prototypes = {new CustomTag(false), new AnotherTag(false)};
    parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
    parseAndAssertNodeCount(3);

    // Node 0: the unclosed tag, rendered with a closing tag appended.
    CustomTag first = (CustomTag) node[0];
    assertStringEquals("first custom tag html", openOnly + "</custom>", first.toHtml());
    assertNull("first custom tag should have no parent", first.getParent());

    // Node 1: the complete tag and its single text child.
    CustomTag second = (CustomTag) node[1];
    assertStringEquals("second custom tag html", complete, second.toHtml());
    assertNull("second custom tag should have no parent", second.getParent());

    Node textChild = second.childAt(0);
    assertType("firstChild", Text.class, textChild);
    Node textParent = textChild.getParent();
    assertNotNull("first child parent should not be null", textParent);
    assertSame("parent and custom tag should be the same", second, textParent);

    // Node 2: the stray end tag.
    Tag strayEnd = (Tag) node[2];
    assertStringEquals("third custom tag html", closeOnly, strayEnd.toHtml());
    assertNull("end tag should have no parent", strayEnd.getParent());
  }
Esempio n. 3
0
 /**
  * Creates a row backed by a fresh {@code TableRow} node with an empty child list and an end
  * tag named after the row's lower-cased tag name.
  */
 public Row() {
   rowNode = (TableRow) newTag(TableRow.class);
   rowNode.setChildren(new NodeList());

   // Install a dedicated end tag, replacing whatever newTag attached.
   Tag closingTag = new TableRow();
   String closingName = "/" + rowNode.getTagName().toLowerCase();
   closingTag.setTagName(closingName);
   rowNode.setEndTag(closingTag);
 }
Esempio n. 4
0
  /**
   * Extract the object <code>PARAM</code> tags from the child list.
   *
   * <p>Only child tags whose tag name equals <code>PARAM</code> and whose <code>NAME</code>
   * attribute is non-null and non-empty contribute an entry. The name is upper-cased before it is
   * used as the key; the value is whatever <code>getAttribute("VALUE")</code> returns for that
   * tag.
   *
   * @return The list of object parameters (keys and values are String objects). Empty when
   *     {@link #getChildren()} returns <code>null</code> or no matching tag is found.
   */
  public HashMap createObjectParamsTable() {
    HashMap ret = new HashMap();
    NodeList kids = getChildren();
    if (null == kids) {
      return ret;
    }

    for (int i = 0; i < kids.size(); i++) {
      // Read from the same list whose size bounds the loop, not from a separate field.
      Node node = kids.elementAt(i);
      if (!(node instanceof Tag)) {
        continue;
      }
      Tag tag = (Tag) node;
      if (!tag.getTagName().equals("PARAM")) {
        continue;
      }
      String paramName = tag.getAttribute("NAME");
      if (null != paramName && 0 != paramName.length()) {
        String paramValue = tag.getAttribute("VALUE");
        ret.put(paramName.toUpperCase(), paramValue);
      }
    }

    return ret;
  }
Esempio n. 5
0
 /**
  * Stamps the given execution result onto every tag child of the row by setting its
  * {@code class} attribute (double-quoted) to the result's string form. Non-tag children are
  * left alone.
  *
  * @param executionResult the result whose {@code toString()} becomes the class value
  */
 private void setExecutionResult(ExecutionResult executionResult) {
   NodeList rowCells = rowNode.getChildren();
   int position = 0;
   while (position < rowCells.size()) {
     Node candidate = rowCells.elementAt(position);
     position++;
     if (!(candidate instanceof Tag)) {
       continue;
     }
     ((Tag) candidate).setAttribute("class", executionResult.toString(), '"');
   }
 }
Esempio n. 6
0
  /**
   * Tells whether a tag is on the list of welcome tags. For a welcome tag, its attributes are
   * additionally checked and the unwanted ones removed; a tag that is not welcome is left
   * untouched.
   *
   * @param node The tag node to analyze; it is cast to {@code Tag} unconditionally
   * @return true if it is a valid tag.
   */
  private boolean isTagWelcome(Node node) {
    Tag candidate = (Tag) node;
    boolean welcome = welcomeTags.contains(candidate.getTagName());

    if (welcome) {
      this.checkAndValidateAttributes(candidate, true);
    }

    return welcome;
  }
Esempio n. 7
0
 /**
  * A bug in the freshmeat page - really bad html tag - &lt;A&gt;Revision&lt;\a&gt;. Reported by
  * Mazlan Mat. Note: Actually, this is completely legal HTML - Derrick
  *
  * <p>The test parses a plain anchor and expects a single tag node that round-trips to the same
  * HTML and holds exactly one text child.
  */
 public void testFreshMeatBug() throws ParserException {
   final String markup = "<a>Revision</a>";
   createParser(markup, "http://www.yahoo.com");
   parseAndAssertNodeCount(1);

   // The only parsed node must be a tag that renders back to the input.
   assertTrue("Node 0 should be a tag", node[0] instanceof Tag);
   Tag anchor = (Tag) node[0];
   assertEquals("Tag Contents", markup, anchor.toHtml());

   // Its single child must be the link text.
   assertEquals("Node 0 should have one child", 1, anchor.getChildren().size());
   assertTrue(
       "The child should be a string node", anchor.getChildren().elementAt(0) instanceof Text);
   Text linkText = (Text) anchor.getChildren().elementAt(0);
   assertEquals("Text Contents", "Revision", linkText.getText());
 }
Esempio n. 8
0
    /**
     * Accept tags that carry the configured attribute and, when a value is configured, whose
     * attribute value starts with that value.
     *
     * @param node The node to check.
     * @return <code>true</code> if the node is a tag that has the attribute named by
     *     <code>mAttribute</code> and either <code>mValue</code> is <code>null</code> or the
     *     attribute's value starts with <code>mValue</code>; <code>false</code> otherwise,
     *     including when the attribute is present but has no value to compare.
     */
    public boolean accept(Node node) {
      if (!(node instanceof Tag)) {
        return false;
      }

      Attribute attribute = ((Tag) node).getAttributeEx(mAttribute);
      if (null == attribute) {
        return false;
      }
      if (null == mValue) {
        // Presence of the attribute is all that was asked for.
        return true;
      }

      // An attribute can exist without a value; treat that as "no match" instead of
      // dereferencing null.
      String value = attribute.getValue();
      return (null != value) && value.startsWith(mValue);
    }
Esempio n. 9
0
 /**
  * Parses a link that wraps an image followed by empty font/bold markup and checks the link URL,
  * the (empty) link text, and the type and content of each of the five nodes contained in the
  * link.
  */
 public void testLinkDataContents() throws ParserException {
   createParser(
       "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>",
       "http://transfer.go.com");
   Tag[] prototypes = {new LinkTag(), new ImageTag()};
   parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
   parseAndAssertNodeCount(1);

   // The link itself.
   assertTrue("Node 0 should be a link tag", node[0] instanceof LinkTag);
   LinkTag anchor = (LinkTag) node[0];
   assertEquals(
       "Link URL",
       "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689",
       anchor.getLink());
   assertEquals("Link Text", "", anchor.getLinkText());

   // Collect the link's children into a fixed-size buffer.
   Node[] containedNodes = new Node[10];
   int childCount = 0;
   SimpleNodeIterator children = anchor.children();
   while (children.hasMoreNodes()) {
     containedNodes[childCount] = children.nextNode();
     childCount++;
   }
   assertEquals("There should be 5 contained nodes in the link tag", 5, childCount);

   // Child 0: the image and its attributes.
   assertTrue(
       "First contained node should be an image tag", containedNodes[0] instanceof ImageTag);
   ImageTag image = (ImageTag) containedNodes[0];
   assertEquals(
       "Image Location",
       "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif",
       image.getImageURL());
   assertEquals("Image Height", "60", image.getAttribute("HEIGHT"));
   assertEquals("Image Width", "468", image.getAttribute("WIDTH"));
   assertEquals("Image Border", "0", image.getAttribute("BORDER"));
   assertEquals(
       "Image Alt",
       "See Signs in Theaters 8-2 - Starring Mel Gibson",
       image.getAttribute("ALT"));

   // Children 1 and 2: the opening font and bold tags.
   assertTrue("Second contained node should be Tag", containedNodes[1] instanceof Tag);
   Tag fontOpen = (Tag) containedNodes[1];
   assertEquals(
       "Tag Contents", "font face=\"verdana,arial,helvetica\" SIZE=\"1\"", fontOpen.getText());
   assertTrue("Third contained node should be Tag", containedNodes[2] instanceof Tag);
   Tag boldOpen = (Tag) containedNodes[2];
   assertEquals("Tag Contents", "b", boldOpen.getText());

   // Children 3 and 4: the matching end tags.
   assertTrue("Fourth contained node should be a Tag", containedNodes[3] instanceof Tag);
   Tag boldClose = (Tag) containedNodes[3];
   assertTrue("Fourth contained node should be an EndTag", boldClose.isEndTag());
   assertEquals("Fourth Tag contents", "/b", boldClose.getText());
   assertTrue("Fifth contained node should be a Tag", containedNodes[4] instanceof Tag);
   Tag fontClose = (Tag) containedNodes[4];
   assertTrue("Fifth contained node should be an EndTag", fontClose.isEndTag());
   assertEquals("Fifth Tag contents", "/font", fontClose.getText());
 }
  /**
   * Look up the index of an attribute by Namespace name.
   *
   * <p>The returned index uses the same zero-based numbering as {@link #getValue(int)} and
   * {@link #getQName(int)}, which read element <code>index + 1</code> of the tag's attribute
   * list; element 0 of that list is never considered here. Entries whose name is
   * <code>null</code> are skipped. The URI is compared exactly, the local name ignoring case.
   *
   * @param uri The Namespace URI, or the empty string if the name has no Namespace URI.
   * @param localName The attribute's local name.
   * @return The index of the attribute, or -1 if it does not appear in the list.
   */
  public int getIndex(String uri, String localName) {
    Vector attributes = mTag.getAttributesEx();
    if (null == attributes) {
      return -1;
    }

    int size = attributes.size();
    for (int i = 1; i < size; i++) {
      String name = ((Attribute) attributes.elementAt(i)).getName();
      if (null == name) {
        continue; // whitespace
      }
      mSupport.processName(name, mParts, true);
      if (uri.equals(mParts[0]) && localName.equalsIgnoreCase(mParts[1])) {
        // Shift from list position to the public index so the result can be fed straight
        // into the index-based accessors.
        return i - 1;
      }
    }

    return -1;
  }
 /**
  * Records the visited tag under every searched-for name it matches (case-insensitively): the
  * matching counter is incremented and the tag is appended to the matching collection.
  *
  * @param tag the tag being visited
  */
 public void visitTag(Tag tag) {
   int slot = 0;
   while (slot < tagsToBeFound.length) {
     boolean wanted = tag.getTagName().equalsIgnoreCase(tagsToBeFound[slot]);
     if (wanted) {
       count[slot]++;
       tags[slot].add(tag);
     }
     slot++;
   }
 }
 /**
  * Records the visited end tag under every searched-for name it matches (case-insensitively),
  * but only when end-tag checking is switched on; otherwise the call does nothing.
  *
  * @param tag the end tag being visited
  */
 public void visitEndTag(Tag tag) {
   if (endTagCheck) {
     int slot = 0;
     while (slot < tagsToBeFound.length) {
       boolean wanted = tag.getTagName().equalsIgnoreCase(tagsToBeFound[slot]);
       if (wanted) {
         endTagCount[slot]++;
         endTags[slot].add(tag);
       }
       slot++;
     }
   }
 }
Esempio n. 13
0
  /**
   * Test child filtering: a child filter built around a tag-name filter for <code>b</code> is
   * expected to yield exactly one node, a link tag with three children whose first child is the
   * <code>B</code> tag.
   */
  public void testChild() throws ParserException {
    String body =
        "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
    String document = "<html>" + body + "</html>";
    createParser(document);

    NodeList matches = parser.extractAllNodesThatMatch(new HasChildFilter(new TagNameFilter("b")));
    assertEquals("only one element", 1, matches.size());
    assertType("should be LinkTag", LinkTag.class, matches.elementAt(0));

    LinkTag anchor = (LinkTag) matches.elementAt(0);
    assertEquals("three children", 3, anchor.getChildCount());
    assertSuperType("should be TagNode", Tag.class, anchor.getChildren().elementAt(0));

    Tag bold = (Tag) anchor.getChildren().elementAt(0);
    assertStringEquals("name", "B", bold.getTagName());
  }
  /**
   * Look up an attribute's value by index.
   *
   * <p>If the attribute value is a list of tokens (IDREFS, ENTITIES, or NMTOKENS), the tokens will
   * be concatenated into a single string with each token separated by a single space.
   *
   * <p>The public index is shifted by one against the tag's attribute list: index
   * <code>0</code> reads element 1 of that list. An attribute that exists but has no value
   * yields the empty string.
   *
   * @param index The attribute index (zero-based).
   * @return The attribute's value as a string, or null if the index is out of range.
   * @see #getLength
   */
  public String getValue(int index) {
    // Valid public indices are 0 .. size - 2; anything else is "out of range" and must not
    // reach elementAt (which would throw, or for -1 silently read element 0).
    if (index < 0 || index >= mTag.getAttributesEx().size() - 1) {
      return null;
    }

    Attribute attribute = (Attribute) (mTag.getAttributesEx().elementAt(index + 1));
    String ret = attribute.getValue();
    return (null == ret) ? "" : ret;
  }
  /**
   * Look up an attribute's XML qualified (prefixed) name by index.
   *
   * <p>The public index is shifted by one against the tag's attribute list: index
   * <code>0</code> reads element 1 of that list. Whitespace entries are reported as
   * <code>#text</code>; every other entry reports its attribute name.
   *
   * @param index The attribute index (zero-based).
   * @return The XML qualified name, or the empty string if none is available, or null if the index
   *     is out of range.
   * @see #getLength
   */
  public String getQName(int index) {
    // Valid public indices are 0 .. size - 2; anything else is "out of range" and must not
    // reach elementAt (which would throw, or for -1 silently read element 0).
    if (index < 0 || index >= mTag.getAttributesEx().size() - 1) {
      return null;
    }

    Attribute attribute = (Attribute) (mTag.getAttributesEx().elementAt(index + 1));
    return attribute.isWhitespace() ? "#text" : attribute.getName();
  }
Esempio n. 16
0
 /**
  * Mirrors a parsed tag as a DOM element: creates an element with the tag's name, remembers the
  * tag-to-element mapping, copies the attributes over and appends the element to the element
  * resolved for the tag's parent.
  *
  * <p>{@code id}, {@code style} and {@code class} attributes (matched case-insensitively) go
  * through their dedicated setters or the style processor; any other attribute is copied only if
  * it is non-empty, not whitespace and has a value.
  *
  * @param tag the tag being visited
  */
 @Override
 public void visitTag(Tag tag) {
   Element created = Document.get().createElement(tag.getTagName());
   map.put(tag, created);

   for (Object raw : tag.getAttributesEx()) {
     Attribute attr = (Attribute) raw;
     String attrName = attr.getName();

     if ("id".equalsIgnoreCase(attrName)) {
       created.setId(attr.getValue());
       continue;
     }
     if ("style".equalsIgnoreCase(attrName)) {
       processStyle(created, attr.getValue());
       continue;
     }
     if ("class".equalsIgnoreCase(attrName)) {
       created.setClassName(attr.getValue());
       continue;
     }

     boolean copyable = !(attr.isEmpty() || attr.isWhitespace()) && attr.isValued();
     if (copyable) {
       created.setAttribute(attrName, attr.getValue());
     }
   }

   Element container = getParent(tag.getParent());
   container.appendChild(created);
 }
Esempio n. 17
0
 /**
  * Instantiates a tag of the given class together with a matching end tag of the same class.
  * Both names are lower-cased, the end tag's name gets a leading slash, and the end tag is
  * linked to the new tag as its parent.
  *
  * <p>Any exception is printed to the error stream and not rethrown; the method then returns
  * whatever had been created so far, which is {@code null} if the first instantiation failed.
  *
  * @param klass the tag class to instantiate via its no-argument constructor
  * @return the new tag, possibly {@code null} or without an end tag after a failure
  */
 private Tag newTag(Class<? extends Tag> klass) {
   Tag created = null;
   try {
     created = klass.newInstance();
     String openingName = created.getTagName().toLowerCase();
     created.setTagName(openingName);

     Tag closer = klass.newInstance();
     String closingName = "/" + created.getTagName().toLowerCase();
     closer.setTagName(closingName);
     closer.setParent(created);
     created.setEndTag(closer);
   } catch (Exception failure) {
     failure.printStackTrace();
   }
   return created;
 }
  /**
   * Parses an unclosed custom tag, a complete custom tag and a stray end tag, and checks that
   * the result is three sibling nodes: the first has no children, is not an empty XML tag and
   * renders with a closing tag appended; the other two render exactly as written.
   */
  public void testCompositeTagWithSelfChildren() throws ParserException {
    String openOnly = "<custom>";
    String complete = "<custom>something</custom>";
    String closeOnly = "</custom>";
    createParser(openOnly + complete + closeOnly);
    Tag[] prototypes = {new CustomTag(false), new AnotherTag(false)};
    parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
    parseAndAssertNodeCount(3);

    // Node 0: the unclosed tag must not have swallowed its sibling.
    CustomTag first = (CustomTag) node[0];
    assertEquals("child count", 0, first.getChildCount());
    assertFalse("custom tag should not be xml end tag", first.isEmptyXmlTag());
    assertStringEquals("first custom tag html", openOnly + "</custom>", first.toHtml());

    // Node 1: the complete tag.
    CustomTag second = (CustomTag) node[1];
    assertStringEquals("second custom tag html", complete, second.toHtml());

    // Node 2: the stray end tag.
    Tag strayEnd = (Tag) node[2];
    assertStringEquals("third custom tag html", closeOnly, strayEnd.toHtml());
  }
Esempio n. 19
0
    /**
     * Remembers the visited tag in {@code scriptTag} when it is a {@code ScriptTag}; every other
     * tag is ignored.
     *
     * @param n the tag being visited
     * @see org.htmlparser.visitors.NodeVisitor
     */
    public void visitTag(final Tag n) {
      // Parent and end-tag state are deliberately not consulted: a script tag is recorded the
      // same way wherever it appears.
      if (n instanceof ScriptTag) {
        this.scriptTag = (ScriptTag) n;
      }
    }
Esempio n. 20
0
  /**
   * Given a tag, check its attributes, removing those unwanted or not secure.
   *
   * <p>Entries without a name, and named entries without a value, are always kept. Every other
   * attribute is kept only if it passes the optional welcome-list check and the safety check; the
   * checks receive the upper-cased name and the lower-cased value. In a kept attribute, every
   * <code>&amp;#</code> in the value is rewritten to <code>&amp;amp;#</code>. The tag's attribute
   * list is then replaced with the surviving entries, in their original order.
   *
   * @param tag The tag to analyze
   * @param checkIfAttributeIsWelcome true if the attribute name should be matched against the list
   *     of welcome attributes, set in the main configuration file.
   */
  private void checkAndValidateAttributes(Tag tag, boolean checkIfAttributeIsWelcome) {
    Vector<Attribute> newAttributes = new Vector<Attribute>();

    for (Iterator<?> iter = tag.getAttributesEx().iterator(); iter.hasNext(); ) {
      Attribute a = (Attribute) iter.next();

      String rawName = a.getName();
      if (rawName == null || a.getValue() == null) {
        // Nothing to validate: keep the entry untouched.
        newAttributes.add(a);
        continue;
      }

      // Case mapping must not depend on the JVM's default locale: under e.g. a Turkish locale
      // "I" lower-cases to a dotless "i", which would let names and values slip past the
      // comparisons done by the checks below.
      String name = rawName.toUpperCase(java.util.Locale.ROOT);
      String value = a.getValue().toLowerCase(java.util.Locale.ROOT);

      if (checkIfAttributeIsWelcome && !this.isAttributeWelcome(name)) {
        continue;
      }

      if (!this.isAttributeSafe(name, value)) {
        continue;
      }

      if (a.getValue().indexOf("&#") > -1) {
        // Literal replacement; no regular expression is needed for a fixed token.
        a.setValue(a.getValue().replace("&#", "&amp;#"));
      }

      newAttributes.add(a);
    }

    tag.setAttributesEx(newAttributes);
  }
Esempio n. 21
0
  /**
   * Given an input, analyze each HTML tag and remove unsecured attributes from them.
   *
   * <p>The input is tokenized node by node. Tags have their attributes validated (without the
   * welcome-list check) before being written out; all other nodes are written out unchanged.
   *
   * @param contents The content to verify
   * @return the content, secure.
   * @throws ForumException wrapping any exception raised while lexing or validating
   */
  public String ensureAllAttributesAreSafe(String contents) {
    StringBuffer output = new StringBuffer(contents.length());

    try {
      Lexer lexer = new Lexer(contents);

      for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
        if (current instanceof Tag) {
          this.checkAndValidateAttributes((Tag) current, false);
        }
        // Tags are rendered after validation; other nodes are rendered as lexed.
        output.append(current.toHtml());
      }
    } catch (Exception e) {
      throw new ForumException("Problems while parsing HTML: " + e, e);
    }

    return output.toString();
  }
 /**
  * Returns the line on which the wrapped tag ends.
  *
  * <p>Without an end tag this is the start line (the reported starting line number plus one).
  * With an end tag it is that tag's ending line number; if the end tag's start and end positions
  * coincide, a warning marker is additionally added at the start line.
  *
  * @return the end line as described above
  */
 public int getEndLine() {
   int startLine = arg0.getStartingLineNumber() + 1;
   Tag closing = arg0.getEndTag();
   if (closing == null) {
     return startLine;
   }

   int endLine = closing.getEndingLineNumber();
   // An end tag that spans no characters is reported as a closing problem.
   boolean zeroWidth = closing.getStartPosition() == closing.getEndPosition();
   if (zeroWidth) {
     String message =
         closing.getTagName().toLowerCase()
             + " is not correctly closed proposed line for closing is line "
             + endLine;
     fEditor.addProblemMarker(message, startLine, IMarker.SEVERITY_WARNING);
   }
   return endLine;
 }
  /**
   * Scan the tag. This implementation only triggers the tag's semantic action; the lexer and the
   * parse stack are not used.
   *
   * @param tag The tag to scan.
   * @param lexer Provides html page access.
   * @param stack The parse stack. May contain pending tags that enclose this tag.
   * @return The same tag instance that was passed in.
   * @throws ParserException propagated from the tag's semantic action
   */
  public Tag scan(Tag tag, Lexer lexer, NodeList stack) throws ParserException {
    final Tag scanned = tag;
    scanned.doSemanticAction();
    return scanned;
  }
 /**
  * Returns the start line of the wrapped node.
  *
  * @return the node's reported starting line number plus one
  */
 public int getStartLine() {
   int reportedLine = arg0.getStartingLineNumber();
   return reportedLine + 1;
 }
 /**
  * Returns an identifier for the wrapped node.
  *
  * @return the string form of the wrapped node
  */
 public String getUniqueID() {
   String id = arg0.toString();
   return id;
 }
 /**
  * Asserts that the given node, cast to a tag, has the expected tag name.
  *
  * @param message the assertion message
  * @param node the node to check; it is cast to {@code Tag} unconditionally
  * @param expectedTagName the tag name the node must report
  */
 public void assertTagNameShouldBe(String message, Node node, String expectedTagName) {
   String actualTagName = ((Tag) node).getTagName();
   assertStringEquals(message, expectedTagName, actualTagName);
 }
 /**
  * Return the number of attributes in the list.
  *
  * <p>The count is one less than the size of the tag's attribute list. Once you know the number
  * of attributes, you can iterate through the list.
  *
  * @return The number of attributes in the list.
  * @see #getURI(int)
  * @see #getLocalName(int)
  * @see #getQName(int)
  * @see #getType(int)
  * @see #getValue(int)
  */
 public int getLength() {
   int listSize = mTag.getAttributesEx().size();
   return listSize - 1;
 }
 /**
  * Look up an attribute's value by Namespace name.
  *
  * <p>The lookup is delegated to the tag using the local name only; the Namespace URI is not
  * consulted. See {@link #getValue(int) getValue(int)} for a description of the possible values.
  *
  * @param uri The Namespace URI, or the empty String if the name has no Namespace URI.
  * @param localName The local name of the attribute.
  * @return The attribute value as a string, or null if the attribute is not in the list.
  */
 public String getValue(String uri, String localName) {
   String value = mTag.getAttribute(localName);
   return value;
 }