public List<Newsitem> parseContent(String content) throws Exception {
    List<Newsitem> newsitems = new ArrayList<Newsitem>();

    Tag newsDiv = this.extractTagByClassName(this.stripHtmlComments(content), "box_news");
    NodeList nodes = this.extractTagsByClassName(newsDiv.toHtml(), "subItem");

    for (int i = 0; i < nodes.size(); i++) {
      NewsitemImpl newsitem = new NewsitemImpl();
      Tag itemTable = (Tag) nodes.elementAt(i);

      Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
      newsitem.setTitle(titleTag.toPlainTextString());

      Node descriptionSpan =
          titleTag.getNextSibling().getNextSibling().getNextSibling().getNextSibling();
      newsitem.setDescription(
          descriptionSpan
              .toPlainTextString()
              .replaceAll("[^\\u0000-\\u00FF]", " ")
              .replace("&nbsp;Read More...", "")
              .trim());

      Tag linkTag = (Tag) extractLinks(itemTable.toHtml(), "/index.php.*").elementAt(0);
      newsitem.setUrl(URL_PREFIX + linkTag.getAttribute("href"));
      newsitems.add(newsitem);
    }
    return newsitems;
  }
  public void testParentConnections() throws ParserException {
    String tag1 = "<custom>";
    String tag2 = "<custom>something</custom>";
    String tag3 = "</custom>";
    createParser(tag1 + tag2 + tag3);
    parser.setNodeFactory(
        new PrototypicalNodeFactory(
            new Tag[] {
              new CustomTag(false), new AnotherTag(false),
            }));
    parseAndAssertNodeCount(3);

    CustomTag customTag = (CustomTag) node[0];

    assertStringEquals("first custom tag html", tag1 + "</custom>", customTag.toHtml());
    assertNull("first custom tag should have no parent", customTag.getParent());

    customTag = (CustomTag) node[1];
    assertStringEquals("second custom tag html", tag2, customTag.toHtml());
    assertNull("second custom tag should have no parent", customTag.getParent());

    Node firstChild = customTag.childAt(0);
    assertType("firstChild", Text.class, firstChild);
    Node parent = firstChild.getParent();
    assertNotNull("first child parent should not be null", parent);
    assertSame("parent and custom tag should be the same", customTag, parent);

    Tag endTag = (Tag) node[2];
    assertStringEquals("third custom tag html", tag3, endTag.toHtml());
    assertNull("end tag should have no parent", endTag.getParent());
  }
Exemple #3
0
 /**
  * A bug in the freshmeat page - really bad html tag - &lt;A&gt;Revision&lt;\a&gt; Reported by
  * Mazlan Mat Note: Actually, this is completely legal HTML - Derrick
  */
 public void testFreshMeatBug() throws ParserException {
   String html = "<a>Revision</a>";
   createParser(html, "http://www.yahoo.com");
   parseAndAssertNodeCount(1);
   assertTrue("Node 0 should be a tag", node[0] instanceof Tag);
   Tag tag = (Tag) node[0];
   assertEquals("Tag Contents", html, tag.toHtml());
   assertEquals("Node 0 should have one child", 1, tag.getChildren().size());
   assertTrue("The child should be a string node", tag.getChildren().elementAt(0) instanceof Text);
   Text stringNode = (Text) tag.getChildren().elementAt(0);
   assertEquals("Text Contents", "Revision", stringNode.getText());
 }
  public void testCompositeTagWithSelfChildren() throws ParserException {
    String tag1 = "<custom>";
    String tag2 = "<custom>something</custom>";
    String tag3 = "</custom>";
    createParser(tag1 + tag2 + tag3);
    parser.setNodeFactory(
        new PrototypicalNodeFactory(
            new Tag[] {
              new CustomTag(false), new AnotherTag(false),
            }));
    parseAndAssertNodeCount(3);

    CustomTag customTag = (CustomTag) node[0];
    assertEquals("child count", 0, customTag.getChildCount());
    assertFalse("custom tag should not be xml end tag", customTag.isEmptyXmlTag());

    assertStringEquals("first custom tag html", tag1 + "</custom>", customTag.toHtml());
    customTag = (CustomTag) node[1];
    assertStringEquals("second custom tag html", tag2, customTag.toHtml());
    Tag endTag = (Tag) node[2];
    assertStringEquals("third custom tag html", tag3, endTag.toHtml());
  }
Exemple #5
0
  /**
   * Given an input, analyze each HTML tag and remove unsecured attributes from them.
   *
   * @param contents The content to verify
   * @return the content, secure.
   */
  public String ensureAllAttributesAreSafe(String contents) {
    StringBuffer sb = new StringBuffer(contents.length());

    try {
      Lexer lexer = new Lexer(contents);
      Node node;

      while ((node = lexer.nextNode()) != null) {
        if (node instanceof Tag) {
          Tag tag = (Tag) node;

          this.checkAndValidateAttributes(tag, false);

          sb.append(tag.toHtml());
        } else {
          sb.append(node.toHtml());
        }
      }
    } catch (Exception e) {
      throw new ForumException("Problems while parsing HTML: " + e, e);
    }

    return sb.toString();
  }