Exemplos de Node em Java, exemplos de org.htmlparser.Node em Java

Exemplo n.º 1

0

Exibir arquivo

Arquivo: CompositeTagScannerTest.java Projeto: theZnorf/esdexercises

  /**
   * Checks parent links for the markup {@code <custom><custom>something</custom></custom>} when
   * parsed with the custom/another tag prototypes: the test expects three top-level nodes, none
   * of which has a parent, while the text inside the second tag must report that tag as its
   * parent.
   *
   * @throws ParserException if parsing the test markup fails
   */
  public void testParentConnections() throws ParserException {
    String openOnly = "<custom>";
    String complete = "<custom>something</custom>";
    String closeOnly = "</custom>";
    createParser(openOnly + complete + closeOnly);

    Tag[] prototypes = {new CustomTag(false), new AnotherTag(false)};
    parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
    parseAndAssertNodeCount(3);

    // Node 0: the opening tag on its own; its HTML is expected to gain an end tag.
    CustomTag first = (CustomTag) node[0];
    assertStringEquals("first custom tag html", openOnly + "</custom>", first.toHtml());
    assertNull("first custom tag should have no parent", first.getParent());

    // Node 1: the complete tag with a text child.
    CustomTag second = (CustomTag) node[1];
    assertStringEquals("second custom tag html", complete, second.toHtml());
    assertNull("second custom tag should have no parent", second.getParent());

    Node textChild = second.childAt(0);
    assertType("firstChild", Text.class, textChild);
    Node textParent = textChild.getParent();
    assertNotNull("first child parent should not be null", textParent);
    assertSame("parent and custom tag should be the same", second, textParent);

    // Node 2: the trailing end tag.
    Tag trailing = (Tag) node[2];
    assertStringEquals("third custom tag html", closeOnly, trailing.toHtml());
    assertNull("end tag should have no parent", trailing.getParent());
  }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: CapitalFootballScraper.java Projeto: tonytw1/brownbag

  /**
   * Builds news items from page markup.
   *
   * <p>HTML comments are stripped, the element with class {@code box_news} is located, and every
   * {@code subItem} element inside it becomes one news item: the title comes from the
   * {@code subItemtitle} element, the description from the node four siblings after the title,
   * and the URL from the first link matching {@code /index.php.*}, prefixed with
   * {@code URL_PREFIX}.
   *
   * @param content raw page markup
   * @return the news items in document order
   * @throws Exception if any of the extraction helpers fails
   */
  public List<Newsitem> parseContent(String content) throws Exception {
    String withoutComments = this.stripHtmlComments(content);
    Tag newsDiv = this.extractTagByClassName(withoutComments, "box_news");
    NodeList items = this.extractTagsByClassName(newsDiv.toHtml(), "subItem");

    List<Newsitem> newsitems = new ArrayList<Newsitem>();
    int itemCount = items.size();
    for (int index = 0; index < itemCount; index++) {
      Tag itemTable = (Tag) items.elementAt(index);
      NewsitemImpl newsitem = new NewsitemImpl();

      Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
      newsitem.setTitle(titleTag.toPlainTextString());

      // The description lives four siblings to the right of the title element.
      Node descriptionSpan = titleTag;
      for (int hop = 0; hop < 4; hop++) {
        descriptionSpan = descriptionSpan.getNextSibling();
      }
      String description = descriptionSpan.toPlainTextString();
      description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
      description = description.replace("&nbsp;Read More...", "");
      newsitem.setDescription(description.trim());

      NodeList itemLinks = extractLinks(itemTable.toHtml(), "/index.php.*");
      Tag linkTag = (Tag) itemLinks.elementAt(0);
      newsitem.setUrl(URL_PREFIX + linkTag.getAttribute("href"));

      newsitems.add(newsitem);
    }
    return newsitems;
  }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: CompositeTagScannerTest.java Projeto: theZnorf/esdexercises

  /**
   * Parses {@code <Custom><Another/></Custom>} and checks that the result is a single
   * {@code CustomTag} holding one {@code AnotherTag} child, with the expected start and end
   * offsets for the outer tag, the child and the closing tag.
   *
   * @throws ParserException if parsing the test markup fails
   */
  public void testCompositeTagWithAnotherTagChild() throws ParserException {
    String childtag = "<Another/>";
    createParser("<Custom>" + childtag + "</Custom>");

    Tag[] prototypes = {new CustomTag(), new AnotherTag(true)};
    parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
    parseAndAssertNodeCount(1);

    assertType("node", CustomTag.class, node[0]);
    CustomTag outer = (CustomTag) node[0];
    assertEquals("child count", 1, outer.getChildCount());
    assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());
    assertEquals("starting loc", 0, outer.getStartPosition());
    assertEquals("ending loc", 8, outer.getEndPosition());
    assertEquals("custom tag starting loc", 0, outer.getStartPosition());
    assertEquals("custom tag ending loc", 27, outer.getEndTag().getEndPosition());

    Node onlyChild = outer.childAt(0);
    assertType("child", AnotherTag.class, onlyChild);
    AnotherTag inner = (AnotherTag) onlyChild;
    assertEquals("another tag start pos", 8, inner.getStartPosition());
    assertEquals("another tag ending pos", 18, inner.getEndPosition());

    assertEquals("custom end tag start pos", 18, outer.getEndTag().getStartPosition());
    assertStringEquals("child html", childtag, onlyChild.toHtml());
  }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: SearchEngineService.java Projeto: Genxl/ShareSystem

  /**
   * Recursively drills into a node and collects its content-bearing descendants.
   *
   * <p>Link tags and non-empty text nodes are collected directly; script, style and select tags
   * are skipped; every other node is searched recursively. When something was collected, the
   * {@code tableNumber} counter is incremented and, unless {@code type} is {@code "1"}, the
   * {@code href} of each collected link is rewritten to the configured search base URL followed
   * by the encrypted original target.
   *
   * @param nodeP the node whose children are examined
   * @param type when equal to {@code "1"} (ignoring case) link targets are left untouched
   * @return the collected nodes, or {@code null} when the node has no children, nothing was
   *     collected, or traversal failed (callers in this method rely on the {@code null} contract)
   * @throws Exception declared for callers; failures during traversal are reported as {@code null}
   */
  @SuppressWarnings("unchecked") // TableContext exposes its link list as a raw List.
  protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
    NodeList nodeList = nodeP.getChildren();
    if ((nodeList == null) || (nodeList.size() == 0)) {
      return null;
    }

    List<Node> tableList = new ArrayList<Node>();
    try {
      for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) {
        Node node = e.nextNode();
        if (node instanceof LinkTag) {
          tableList.add(node);
        } else if (node instanceof ScriptTag
            || node instanceof StyleTag
            || node instanceof SelectTag) {
          // Non-content markup: deliberately ignored, children included.
          continue;
        } else if (node instanceof TextNode) {
          if (node.getText().length() > 0) {
            tableList.add(node);
          }
        } else {
          List<Node> nested = extractHtml(node, type);
          if (nested != null) {
            tableList.addAll(nested);
          }
        }
      }
    } catch (Exception e) {
      // Best-effort extraction: a failure anywhere below discards this subtree.
      return null;
    }

    if (tableList.isEmpty()) {
      return null;
    }

    TableContext tc = new TableContext();
    tc.setLinkList(new ArrayList());
    tc.setTextBuffer(new StringBuffer());
    tableNumber++;
    tc.setTableRow(tableNumber);

    // Read the configured search base URL.
    String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);

    for (Node node : tableList) {
      if (node instanceof LinkTag) {
        LinkTag linkTag = (LinkTag) node;
        if (!"1".equalsIgnoreCase(type)) {
          linkTag.setAttribute(
              "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
        }
        tc.getLinkList().add(linkTag);
      } else {
        tc.getTextBuffer().append(node.getText());
      }
    }
    return tableList;
  }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: AreaTest.java Projeto: zhaoccx/LS

  /**
   * Parses the page at the given news URL and prints what it finds.
   *
   * <p>Selects every node whose parent is a {@code <p class="MsoNormal">} element, prints how
   * many were found, and then prints the text of each node's last child after removing the
   * literal {@code span}, removing ideographic spaces and passing it through
   * {@code replaceBlank}. Parser failures are logged at SEVERE level.
   *
   * @param url the news link to parse
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      // Note carried over from the original author: reset the parser after each use,
      // otherwise later extractions do not return the expected content.

      NodeFilter msoParagraph =
          new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
      NodeFilter childOfMsoParagraph = new HasParentFilter(msoParagraph);
      NodeList matches = parser.extractAllNodesThatMatch(childOfMsoParagraph);
      System.out.println(matches.size());

      for (int index = 0; index < matches.size(); index++) {
        Node match = matches.elementAt(index);
        String lastChildText = match.getLastChild().getText();
        String cleaned = lastChildText.replaceAll("span", "").replaceAll("　", "");
        System.out.println(replaceBlank(cleaned));
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

Exemplo n.º 6

0

Exibir arquivo

Arquivo: HtmlParserTool.java Projeto: edin-chou/myDemos

  /**
   * Collects the links found on a page, keeping only those accepted by {@code cobweb}.
   *
   * <p>Anchor links are percent-encoded for {@code ?}, {@code &} and {@code |}, and have
   * {@code #} removed, before being stored. Frame tags whose text starts with
   * {@code frame src=} contribute the value of their {@code src} attribute. Parser failures are
   * printed and the links gathered so far are returned.
   *
   * @param url the page to fetch and parse
   * @param cobweb supplies the page encoding and decides which links are kept
   * @return the accepted links; never {@code null}
   */
  public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();

    try {
      Parser parser = new Parser(url);
      parser.setEncoding(cobweb.getCharSet());

      // Matches <frame> tags so that their src attribute can be harvested below.
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      // Accept both <a> tags and the <frame> tags matched above.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          String linkUrl = ((LinkTag) tag).getLink();
          if (cobweb.accept(linkUrl)) {
            links.add(
                linkUrl
                    .replace("?", "%3F")
                    .replace("&", "%26")
                    // '|' is 0x7C; the previous "%124" used the decimal code and produced an
                    // invalid escape.
                    .replace("|", "%7C")
                    .replace("#", ""));
          }
        } else { // <frame> tag, e.g. <frame src="test.html"/>
          String frame = tag.getText();
          frame = frame.substring(frame.indexOf("src="));
          int end = frame.indexOf(" ");
          if (end == -1) {
            end = frame.indexOf(">");
          }
          if (end == -1) {
            // The tag text normally carries no closing '>', so src may simply be the last
            // attribute; stop at the end of the text, ignoring a self-closing slash.
            end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
          }
          // "src=" plus the opening quote is 5 characters; end - 1 drops the closing quote.
          if (end - 1 < 5) {
            continue;
          }
          String frameUrl = frame.substring(5, end - 1);
          if (cobweb.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: MParseBrand.java Projeto: user20161119/beiker-Deprecated

 /**
  * Reads the scrolling-brand page for a city and maps each brand id to its image URL.
  *
  * <p>The page location is built from {@code PATH}, the city, {@code INCLUDE}, the file name and
  * {@code STUFF}. Inside every {@code div} whose class is {@code business_list_pic}, each link
  * contributes one entry: the key is the brand id derived from the link target, the value is the
  * {@code src} (or, failing that, {@code original}) attribute of the last image in the link that
  * has a {@code class} attribute. A link without such an image maps to the empty string rather
  * than inheriting the image of a previous link. Any failure is printed and the entries gathered
  * so far are returned.
  *
  * @param path not used by this method
  * @param city city segment of the page location
  * @param fileName file-name segment of the page location
  * @return brand id to image URL, in document order; never {@code null}
  */
 public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
   Map<String, String> brandMap = new LinkedHashMap<String, String>();
   try {
     StringBuilder filePath = new StringBuilder();
     filePath.append(PATH);
     filePath.append(city);
     filePath.append(INCLUDE);
     filePath.append(fileName);
     filePath.append(STUFF);

     // Start parsing: select every <div> on the page.
     Parser parser = new Parser(filePath.toString());
     NodeFilter divFilter = new NodeClassFilter(Div.class);
     NodeList classList = parser.extractAllNodesThatMatch(divFilter);

     for (int i = 0; i < classList.size(); i++) {
       Node picNode = classList.elementAt(i);
       String classStr = ((Div) picNode).getAttribute("class");
       if (!"business_list_pic".equalsIgnoreCase(classStr)) {
         continue;
       }
       NodeList hrefList = picNode.getChildren();
       if (hrefList == null) {
         // An empty div has no child list; skip it instead of aborting the whole scan.
         continue;
       }
       for (int j = 0; j < hrefList.size(); j++) {
         Node hrefNode = hrefList.elementAt(j);
         if (!(hrefNode instanceof LinkTag)) {
           continue;
         }
         // The link target carries the brand id.
         String hrefStr = MParseBrand.getBrandId(((LinkTag) hrefNode).getLink());

         // Reset per link so a link without an image does not reuse the previous one.
         String imgStr = "";
         NodeList imgList = hrefNode.getChildren();
         if (imgList != null) {
           for (int k = 0; k < imgList.size(); k++) {
             Node imgNode = imgList.elementAt(k);
             if (imgNode instanceof ImageTag) {
               ImageTag image = (ImageTag) imgNode;
               if (null != image.getAttribute("class")) {
                 imgStr = image.getAttribute("src");
                 if (null == imgStr) {
                   imgStr = image.getAttribute("original");
                 }
               }
             }
           }
         }
         brandMap.put(hrefStr, imgStr);
       }
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
   return brandMap;
 }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: VelocityReconcilingStrategy.java Projeto: yesen/veloeclipse

 /**
  * Resets {@code htmlTags} to an empty list and parses the current document text as HTML, having
  * each top-level node accept a freshly created {@code VHtmlNodeVisitor}.
  *
  * @throws ParserException if the document text cannot be parsed
  */
 private void parseHtml() throws ParserException {
   htmlTags = new ArrayList();

   Parser parser = new Parser();
   parser.setInputHTML(fDocument.get());

   NodeIterator topLevelNodes = parser.elements();
   while (topLevelNodes.hasMoreNodes()) {
     // One visitor instance per node.
     topLevelNodes.nextNode().accept(new VHtmlNodeVisitor());
   }
 }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

 /**
  * Walks a node list depth-first and turns every issue-comment element into an event.
  *
  * <p>A node whose text matches {@code div id="issuecomment-..."} yields one
  * {@code IssueCommentEvent} carrying the author text, the comment body text and, when a
  * {@code datetime="..."} attribute is found, its value (which is also printed). Non-matching
  * nodes are searched recursively through their children.
  *
  * @param nodeList the nodes to scan
  * @param icList accumulator the events are appended to
  * @return {@code icList}, for convenience
  */
 private List<IssueCommentEvent> processComment(
     NodeList nodeList, List<IssueCommentEvent> icList) {
   for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
     Node node = it.nextNode();

     if (!node.getText().matches("div id=\"issuecomment-.*\".*+")) {
       // Not a comment container: descend when the node has children (leaf nodes have none).
       NodeList children = node.getChildren();
       if (children != null) {
         processComment(children, icList);
       }
       continue;
     }

     IssueCommentEvent event = new IssueCommentEvent();
     // TODO: comment parsing work
     Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
     event.setActor(actorNode.toPlainTextString());

     Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body");
     event.setCommentBody(contentNode.toPlainTextString());

     Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
     Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
     if (timeMatcher.find()) {
       // The value is the first quoted segment of the matched attribute.
       String createdAt = timeMatcher.group().split("\"")[1];
       event.setCreatedAt(createdAt);
       System.out.println(createdAt);
     }
     icList.add(event);
   }
   return icList;
 }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

  /**
   * Walks a node list depth-first and records every "pull request opened" element.
   *
   * <p>A node whose text contains {@code div id="issue-} yields one {@code PullRequestEvent} with
   * action {@code open}, the comment body text, the author text and, when a
   * {@code datetime="..."} attribute is found, its value. Other nodes are searched recursively
   * through their children.
   *
   * @param nodeList the nodes to scan
   * @param pList accumulator the events are appended to
   * @return {@code pList}, for convenience
   */
  public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node node = it.nextNode();

      if (!node.getText().contains("div id=\"issue-")) {
        // Not the opening post: descend when the node has children (leaf nodes have none).
        NodeList children = node.getChildren();
        if (children != null) {
          processOpenPull(children, pList);
        }
        continue;
      }

      PullRequestEvent opened = new PullRequestEvent();
      opened.setAction("open");

      Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body");
      opened.setBody(commentNode.toPlainTextString());

      Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author");
      opened.setActor(actorNode.toPlainTextString());

      Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // The value is the first quoted segment of the matched attribute.
        opened.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }
      pList.add(opened);
    }
    return pList;
  }

Exemplo n.º 11

0

Exibir arquivo

Arquivo: GridUtils.java Projeto: sunbiz/dhis2

  /**
   * Retrieves the value of a table cell by concatenating the text of its child nodes. For
   * composite tags such as span or div the inner text is appended; for any other node its own
   * text is appended. The result is trimmed and {@code &nbsp;} entities are removed.
   *
   * <p>A cell without children (for which the parser returns a {@code null} child list) yields
   * the same result as a cell whose children contribute no text, instead of failing.
   *
   * @param cell the table cell to read
   * @return the cell text with surrounding whitespace and {@code &nbsp;} entities removed
   */
  public static String getValue(TagNode cell) {
    StringBuilder value = new StringBuilder(EMPTY);

    // getChildren() is null, not empty, when the cell has no child nodes.
    if (cell.getChildren() != null) {
      for (Node child : cell.getChildren().toNodeArray()) {
        if (child instanceof CompositeTag) {
          value.append(((CompositeTag) child).getStringText());
        } else {
          value.append(child.getText());
        }
      }
    }

    return value.toString().trim().replaceAll("&nbsp;", EMPTY);
  }

Exemplo n.º 12

0

Exibir arquivo

Arquivo: HtmlParserTool.java Projeto: jason440682/SearchEngine

  /**
   * Collects the links found on a page, keeping only those accepted by {@code filter}.
   *
   * <p>Anchor tags contribute their link; tags whose text starts with {@code iframe} and contains
   * {@code src=} contribute the double-quoted value of their {@code src} attribute. Frame tags
   * whose {@code src} is not double-quoted are skipped. Parser failures are printed and the links
   * gathered so far are returned.
   *
   * @param url the page to fetch and parse
   * @param filter decides which links are kept
   * @return the accepted links; never {@code null}
   */
  public static Set<String> extracLinks(String url, LinkFilter filter) {

    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      // parser.setEncoding("utf8");
      // Matches iframe tags so that the link in their src attribute can be harvested below.
      NodeFilter frameFilter =
          new NodeFilter() {
            /** */
            private static final long serialVersionUID = 1L;

            public boolean accept(Node node) {
              return node.getText().startsWith("iframe") && node.getText().contains("src=");
            }
          };
      // Accept both <a> tags and the frame tags matched above.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink();
          if (filter.accept(linkUrl)) {
            links.add(linkUrl);
          }
        } else { // frame tag, e.g. <iframe src="test.html">
          String frame = tag.getText();
          int start = frame.indexOf("src=\"");
          if (start == -1) {
            // src is unquoted or single-quoted; nothing reliable to extract.
            continue;
          }
          // The value runs from just after the opening quote to the next quote. The previous
          // "\">" / "?" search cut the value one character short or threw when neither
          // marker was present in the tag text.
          int valueStart = start + "src=\"".length();
          int valueEnd = frame.indexOf('"', valueStart);
          if (valueEnd == -1) {
            continue;
          }
          String frameUrl = frame.substring(valueStart, valueEnd);
          if (filter.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: CompositeTagScannerTest.java Projeto: theZnorf/esdexercises

 /**
  * Parses {@code <Custom>Hello</Custom>} and checks that the custom tag has exactly one text
  * child reading {@code Hello}, together with the expected offsets and line numbers.
  *
  * @throws ParserException if parsing the test markup fails
  */
 public void testCompositeTagWithOneTextChild() throws ParserException {
   createParser("<Custom>Hello</Custom>");
   CustomTag outer = parseCustomTag(1);

   assertEquals("child count", 1, outer.getChildCount());
   assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());
   assertEquals("starting loc", 0, outer.getStartPosition());
   assertEquals("ending loc", 8, outer.getEndPosition());
   assertEquals("starting line position", 0, outer.getStartingLineNumber());
   assertEquals("ending line position", 0, outer.getEndingLineNumber());

   Node onlyChild = outer.childAt(0);
   assertType("child", Text.class, onlyChild);
   assertStringEquals("child text", "Hello", onlyChild.toPlainTextString());
 }

Exemplo n.º 14

0

Exibir arquivo

Arquivo: CompositeTagScannerTest.java Projeto: theZnorf/esdexercises

  /**
   * Parses {@code <Custom><Hello></Custom>} and checks that the custom tag has exactly one tag
   * child whose HTML is {@code <Hello>}, together with the expected offsets of the outer tag and
   * its end tag.
   *
   * @throws ParserException if parsing the test markup fails
   */
  public void testCompositeTagWithTagChild() throws ParserException {
    String childtag = "<Hello>";
    createParser("<Custom>" + childtag + "</Custom>");
    CustomTag outer = parseCustomTag(1);

    assertEquals("child count", 1, outer.getChildCount());
    assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());
    assertEquals("starting loc", 0, outer.getStartPosition());
    assertEquals("ending loc", 8, outer.getEndPosition());
    assertEquals("custom tag starting loc", 0, outer.getStartPosition());
    assertEquals("custom tag ending loc", 24, outer.getEndTag().getEndPosition());

    Node onlyChild = outer.childAt(0);
    assertType("child", Tag.class, onlyChild);
    assertStringEquals("child html", childtag, onlyChild.toHtml());
  }

Exemplo n.º 15

0

Exibir arquivo

Arquivo: AnywayParser.java Projeto: alnasfire/tours

 /**
  * Collects references from the nodes of the page at {@code URL}.
  *
  * <p>A node qualifies when its text contains {@code class="latestnews"} and is longer than 40
  * characters. Its reference is {@code URL} without its last two characters, followed by the
  * first double-quoted segment of the node text. Parser failures are printed and whatever was
  * gathered so far is returned.
  *
  * @return the collected references; never {@code null}
  */
 private List<String> getHotTourRefs() {
   List<String> refs = new ArrayList<String>();
   try {
     for (Node node : getNodes(URL)) {
       String text = node.getText();
       boolean isLatestNews = text.contains("class=\"latestnews\"") && text.length() > 40;
       if (isLatestNews) {
         String base = URL.substring(0, URL.length() - 2);
         refs.add(base + text.split("\"")[1]);
       }
     }
   } catch (ParserException e) {
     e.printStackTrace();
   }
   return refs;
 }

Exemplo n.º 16

0

Exibir arquivo

Arquivo: SafeHtml.java Projeto: 3mtee/jforum

  /**
   * Makes the given input safe for display as HTML.
   *
   * <p>The input is tokenized with a lexer. Text nodes containing {@code <} or {@code >} have
   * those characters and double quotes replaced by entities before being emitted; tags accepted
   * by {@code isTagWelcome} are emitted unchanged; every other node is emitted with its angle
   * brackets replaced by entities.
   *
   * @param contents the input to analyze
   * @return the sanitized string, or {@code contents} itself when it is {@code null} or empty
   * @throws ForumException if tokenizing or processing the input fails
   */
  public String makeSafe(String contents) {
    if (contents == null || contents.length() == 0) {
      return contents;
    }

    StringBuffer safe = new StringBuffer(contents.length());

    try {
      Lexer lexer = new Lexer(contents);

      for (Node node = lexer.nextNode(); node != null; node = lexer.nextNode()) {
        if (node instanceof TextNode) {
          // Text nodes are raw data: neutralize anything that could be read as markup.
          String raw = node.toHtml();
          boolean hasAngleBracket = raw.indexOf('>') > -1 || raw.indexOf('<') > -1;
          if (hasAngleBracket) {
            node.setText(
                raw.replaceAll("<", "&lt;").replaceAll(">", "&gt;").replaceAll("\"", "&quot;"));
          }
          safe.append(node.toHtml());
        } else if (node instanceof Tag && this.isTagWelcome(node)) {
          safe.append(node.toHtml());
        } else {
          // Anything not explicitly welcome is rendered inert.
          safe.append(node.toHtml().replaceAll("<", "&lt;").replaceAll(">", "&gt;"));
        }
      }
    } catch (Exception e) {
      throw new ForumException("Error while parsing HTML: " + e, e);
    }

    return safe.toString();
  }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: ParserAmazonPage.java Projeto: kanxg/searchafGae2

 /**
  * Gathers the text values below the given node, prints them, and stores the first one as the
  * product label. Despite the method name, no price is assigned here.
  *
  * @param node the node whose children are scanned
  * @param product the product whose {@code label} is set
  * @throws Exception declared for callers
  */
 private void getPriceAndLabel(Node node, AmazonProduct product) throws Exception {
   List<String> values = new ArrayList<String>();
   processNodeList(node.getChildren(), values);
   System.out.println(values);

   // The label is the first collected value.
   product.label = values.get(0);
 }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: ParserDrugstorePage.java Projeto: kanxg/searchafGae2

 /**
  * Returns the HTML of the given node unchanged.
  *
  * @param node the node to render
  * @return the node's HTML
  */
 private String processHTML(Node node) {
   return node.toHtml();
 }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: HTMLTemplateCompiler.java Projeto: zsigmond-czine-everit/templating-html

 /**
  * Compiles a template by lexing its text and feeding every node to an {@code HTMLNodeVisitor},
  * then wrapping the visitor's root node in a {@code CompiledTemplateImpl}.
  *
  * @param template the template text
  * @param parserConfiguration configuration handed to the visitor
  * @return the compiled template
  * @throws RuntimeException wrapping the {@code ParserException} if lexing fails
  */
 @Override
 public CompiledTemplate compile(
     final String template, final ParserConfiguration parserConfiguration) {
   Lexer lexer = new Lexer(new Page(new StringSource(template)));
   HTMLNodeVisitor visitor =
       new HTMLNodeVisitor(
           ehtAttributeprefix, expressionCompiler, inlineCompilers, parserConfiguration);

   visitor.beginParsing();
   try {
     Node current;
     while ((current = lexer.nextNode()) != null) {
       current.accept(visitor);
     }
   } catch (ParserException e) {
     throw new RuntimeException(e);
   }
   visitor.finishedParsing();

   return new CompiledTemplateImpl(visitor.getRootNode());
 }

Exemplo n.º 20

0

Exibir arquivo

Arquivo: Lexer.java Projeto: Qoiuy/InteresDemo

  /**
   * Command-line entry point: prints every node the lexer produces for the given URL, or a usage
   * message when no argument is supplied. A {@code ParserException} raised while lexing is
   * reported on standard output (message, then the message of its underlying throwable if any).
   *
   * @param args [0] The URL to parse.
   * @exception MalformedURLException If the provided URL cannot be resolved.
   * @exception ParserException If the parse fails.
   */
  public static void main(String[] args) throws MalformedURLException, ParserException {
    if (args.length <= 0) {
      System.out.println("HTML Lexer v" + getVersion() + "\n");
      System.out.println();
      System.out.println("usage: java -jar htmllexer.jar <url>");
      return;
    }

    try {
      ConnectionManager manager = Page.getConnectionManager();
      Lexer lexer = new Lexer(manager.openConnection(args[0]));
      for (Node node = lexer.nextNode(false); node != null; node = lexer.nextNode(false)) {
        System.out.println(node.toString());
      }
    } catch (ParserException pe) {
      System.out.println(pe.getMessage());
      Throwable cause = pe.getThrowable();
      if (cause != null) {
        System.out.println(cause.getMessage());
      }
    }
  }

Exemplo n.º 21

0

Exibir arquivo

Arquivo: ParserDrugstorePage.java Projeto: kanxg/searchafGae2

 /**
  * Walks a node list depth-first and collects the trimmed plain text of every node that has no
  * child list, skipping texts that are empty after trimming.
  *
  * @param list the nodes to scan
  * @param valueList accumulator the texts are appended to
  */
 private void processNodeList(NodeList list, List<String> valueList) {
   for (SimpleNodeIterator it = list.elements(); it.hasMoreNodes(); ) {
     Node node = it.nextNode();
     NodeList children = node.getChildren();

     if (children != null) {
       // Has children: keep descending.
       processNodeList(children, valueList);
       continue;
     }

     // No child list: treat the node as a value node.
     String text = node.toPlainTextString().trim();
     if (!text.isEmpty()) {
       valueList.add(text);
     }
   }
 }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

 /**
  * Walks a node list depth-first looking for review-comment containers; used together with
  * {@code processSubPullRequestReviewComment}.
  *
  * <p>For a node whose text contains {@code div id="diff-for-comment-}, the first double-quoted
  * segment of the text is taken as the discussion id, printed, and passed along with the node's
  * children to {@code processSubPullRequestReviewComment}, whose result replaces the working
  * list. Other nodes are searched recursively through their children.
  *
  * @param nodeList the nodes to scan
  * @param prList accumulator for the events
  * @return the working list after all nodes have been processed
  */
 public List<PullRequestReviewCommentEvent> processPullRequestReviewComment(
     NodeList nodeList, List<PullRequestReviewCommentEvent> prList) {
   for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
     Node node = it.nextNode();
     String text = node.getText();

     if (text.contains("div id=\"diff-for-comment-")) {
       String discussionId = text.split("\"")[1];
       System.out.println(discussionId);
       prList = processSubPullRequestReviewComment(node.getChildren(), prList, discussionId);
     } else {
       // Descend when the node has children (leaf nodes have none).
       NodeList children = node.getChildren();
       if (children != null) {
         processPullRequestReviewComment(children, prList);
       }
     }
   }
   return prList;
 }

Exemplo n.º 23

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

  /**
   * Walks a node list depth-first and records every "referenced this pull request" element.
   *
   * <p>A node whose text contains {@code div class="discussion-item discussion-item-ref"} yields
   * one {@code PullRequestEvent} with action {@code ref}. Its body is the plain text of the
   * {@code title-link} child (empty when absent); if that child's text contains a
   * {@code a/b/c/id}-shaped path, it is stored as the base ref and printed. The author text
   * (empty when absent) and the {@code datetime} attribute value, when found, complete the
   * event. Other nodes are searched recursively through their children.
   *
   * @param nodeList the nodes to scan
   * @param pList accumulator the events are appended to
   * @return {@code pList}, for convenience
   */
  public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node node = it.nextNode();

      if (!node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) {
        // Descend when the node has children (leaf nodes have none).
        NodeList children = node.getChildren();
        if (children != null) {
          processReference(children, pList);
        }
        continue;
      }

      PullRequestEvent referenced = new PullRequestEvent();
      referenced.setAction("ref");

      Node titleLink = DownloadUtil.getSomeChild(node, "class=\"title-link\"");
      String body = "";
      if (titleLink != null) {
        body = titleLink.toPlainTextString();
      }
      referenced.setBody(body);

      Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+");
      String titleLinkText = "";
      if (titleLink != null) {
        titleLinkText = titleLink.getText();
      }
      Matcher artifactMatcher = artifactPattern.matcher(titleLinkText);
      if (artifactMatcher.find()) {
        String artifact = artifactMatcher.group();
        referenced.setPullrequestBaseRef(artifact);
        System.out.println(artifact);
      }

      Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
      String actor = "";
      if (actorNode != null) {
        actor = actorNode.toPlainTextString();
      }
      referenced.setActor(actor);

      Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // The value is the first quoted segment of the matched attribute.
        referenced.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }
      pList.add(referenced);
    }
    return pList;
  }

Exemplo n.º 24

0

Exibir arquivo

Arquivo: SafeHtml.java Projeto: 3mtee/jforum

  /**
   * Given an input, analyzes each HTML tag and removes unsecured attributes from it.
   *
   * <p>The input is tokenized with a lexer; every tag is passed through
   * {@code checkAndValidateAttributes} before its HTML is emitted, and all other nodes are
   * emitted unchanged. A {@code null} or empty input is returned as is, matching
   * {@code makeSafe}.
   *
   * @param contents The content to verify
   * @return the content, secure; {@code contents} itself when it is {@code null} or empty
   * @throws ForumException if tokenizing or validating the input fails
   */
  public String ensureAllAttributesAreSafe(String contents) {
    // Guard first: contents.length() below would otherwise throw on null input.
    if (contents == null || contents.length() == 0) {
      return contents;
    }

    StringBuffer sb = new StringBuffer(contents.length());

    try {
      Lexer lexer = new Lexer(contents);
      Node node;

      while ((node = lexer.nextNode()) != null) {
        if (node instanceof Tag) {
          this.checkAndValidateAttributes((Tag) node, false);
        }
        sb.append(node.toHtml());
      }
    } catch (Exception e) {
      throw new ForumException("Problems while parsing HTML: " + e, e);
    }

    return sb.toString();
  }

Exemplo n.º 25

0

Exibir arquivo

Arquivo: ParserBootsPage.java Projeto: kanxg/kanxg

 /**
  * Reports whether any text value collected below the given node starts with
  * {@code Quantity}.
  *
  * @param node the node whose children are scanned
  * @return {@code true} if such a value exists, {@code false} otherwise
  */
 private boolean getName(Node node) {
   List<String> values = new ArrayList<String>();
   processNodeList(node.getChildren(), values);

   for (String value : values) {
     if (value.startsWith("Quantity")) {
       return true;
     }
   }
   return false;
 }

Exemplo n.º 26

0

Exibir arquivo

Arquivo: LinkStringFilter.java Projeto: kookse/bboss

  /**
   * Accepts nodes that are a LinkTag whose URL contains the pattern supplied in the constructor.
   * The comparison honours {@code mCaseSensitive}; the case-insensitive form compares the
   * upper-cased link against the upper-cased pattern.
   *
   * @param node The node to check.
   * @return <code>true</code> if the node is a link with the pattern.
   */
  public boolean accept(Node node) {
    if (!LinkTag.class.isAssignableFrom(node.getClass())) {
      return false;
    }

    String link = ((LinkTag) node).getLink();
    return mCaseSensitive
        ? link.indexOf(mPattern) > -1
        : link.toUpperCase().indexOf(mPattern.toUpperCase()) > -1;
  }

Exemplo n.º 27

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

  /**
   * Walks a node list depth-first and records every "milestone removed" element.
   *
   * <p>A node whose text contains
   * {@code div class="discussion-item discussion-item-demilestoned"} yields one
   * {@code PullRequestEvent} with action {@code removeMilestone}. If the text of the
   * {@code discussion-item-entity} child contains an {@code a/b/c/...}-shaped path, the part of
   * it before the first double quote becomes the body. The author text and the
   * {@code datetime} attribute value, when found, complete the event. Other nodes are searched
   * recursively through their children.
   *
   * @param nodeList the nodes to scan
   * @param pList accumulator the events are appended to
   * @return {@code pList}, for convenience
   */
  public List<PullRequestEvent> processRemoveMileStone(
      NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node node = it.nextNode();

      if (!node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) {
        // Descend when the node has children (leaf nodes have none).
        NodeList children = node.getChildren();
        if (children != null) {
          processRemoveMileStone(children, pList);
        }
        continue;
      }

      PullRequestEvent removed = new PullRequestEvent();
      removed.setAction("removeMilestone");

      Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\"");
      Matcher milestoneMatcher =
          Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+").matcher(milestoneNode.getText());
      if (milestoneMatcher.find()) {
        // Keep only what precedes the first double quote of the match.
        removed.setBody(milestoneMatcher.group().split("\"")[0]);
      }

      Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
      removed.setActor(actorNode.toPlainTextString());

      Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // The value is the first quoted segment of the matched attribute.
        removed.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }
      pList.add(removed);
    }
    return pList;
  }

Exemplo n.º 28

0

Exibir arquivo

Arquivo: FastArchivalUrlReplayParseEventHandler.java Projeto: nicosensei/wayback

  /**
   * Writes an optional prefix, a node's HTML and an optional suffix to the context's output stream.
   *
   * <p>Nothing is written when the context has no output stream. Each non-null part is encoded
   * using the charset name returned by <code>context.getOutputCharset()</code>; the node is
   * serialized with <code>node.toHtml(true)</code>.
   *
   * @param context supplies the output stream and the output charset name
   * @param pre text written before the node; skipped when null
   * @param node node to serialize; skipped when null
   * @param post text written after the node; skipped when null
   * @throws IOException if writing fails or the charset name is not supported
   */
  protected void emit(ReplayParseContext context, String pre, Node node, String post)
      throws IOException {
    OutputStream sink = context.getOutputStream();
    if (sink == null) {
      return;
    }

    String encoding = context.getOutputCharset();

    if (pre != null) {
      byte[] prefixBytes = pre.getBytes(encoding);
      sink.write(prefixBytes);
    }
    if (node != null) {
      byte[] nodeBytes = node.toHtml(true).getBytes(encoding);
      sink.write(nodeBytes);
    }
    if (post != null) {
      byte[] suffixBytes = post.getBytes(encoding);
      sink.write(suffixBytes);
    }
  }

Exemplo n.º 29

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

  /**
   * Collects "labeled" events from a node tree.
   *
   * <p>Each node whose text contains the <code>discussion-item-labeled</code> marker is converted
   * into a <code>PullRequestEvent</code> with action <code>labeled</code>. The plain text of the
   * nodes returned by <code>DownloadUtil.getLableList</code> is joined with commas and stored as
   * the event's base labels. Any other node is searched recursively through its children.
   *
   * @param nodeList nodes to scan
   * @param pList list that receives the extracted events
   * @return the same <code>pList</code> instance that was passed in
   */
  public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) {
    Pattern datetimeRegex = Pattern.compile("datetime=\".*\"");

    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current.getText().contains("class=\"discussion-item discussion-item-labeled\"")) {
        // Not an event node: descend into its children, if it has any.
        // A null child list means this is a leaf (value) node.
        NodeList children = current.getChildren();
        if (children != null) {
          processLabled(children, pList);
        }
        continue;
      }

      PullRequestEvent event = new PullRequestEvent();
      event.setAction("labeled");

      // Build a comma-separated list of the label texts.
      List<Node> labelNodes =
          DownloadUtil.getLableList(current, "style=\"color:", new ArrayList<Node>());
      StringBuilder joined = new StringBuilder();
      String separator = "";
      for (Node labelNode : labelNodes) {
        joined.append(separator).append(labelNode.toPlainTextString());
        separator = ",";
      }
      String labels = joined.toString();
      System.out.println(labels);
      event.setPullrequestBaseLabels(labels);

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
      event.setActor(authorNode.toPlainTextString());

      // The timestamp is the text between the first pair of quotes after "datetime=".
      Node datetimeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher datetimeMatch = datetimeRegex.matcher(datetimeNode.getText());
      if (datetimeMatch.find()) {
        event.setCreatedAt(datetimeMatch.group().split("\"")[1]);
      }

      pList.add(event);
    }
    return pList;
  }

Exemplo n.º 30

0

Exibir arquivo

Arquivo: PullDataFromHtml.java Projeto: yh443042575/gitdata

  /**
   * Collects "unassigned" events from a node tree.
   *
   * <p>Each node whose text contains the <code>discussion-item-unassigned</code> marker is
   * converted into a <code>PullRequestEvent</code> and appended to <code>pList</code>. Any other
   * node is searched recursively through its children.
   *
   * <p>Original author's note: as with the other handlers, the person being unassigned is the
   * latter one.
   *
   * @param nodeList nodes to scan
   * @param pList list that receives the extracted events
   * @return the same <code>pList</code> instance that was passed in
   */
  private List<PullRequestEvent> processUnassigned(
      NodeList nodeList, List<PullRequestEvent> pList) {
    // Compiled once per call instead of once per matching node.
    Pattern datetimePattern = Pattern.compile("datetime=\".*\"");

    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) {
        PullRequestEvent pEvent = new PullRequestEvent();
        // NOTE(review): the action is "assigned" although this handles unassigned items;
        // this looks like a copy-paste slip, but it is left unchanged because callers may
        // depend on the stored value - confirm before changing.
        pEvent.setAction("assigned");

        Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        String assigneeText = assignedNode.toPlainTextString();
        pEvent.setPullrequestAssgnee(assigneeText);

        // When no separate entity node is found, the author is used as the actor as well.
        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\"");
        String actorText = (actorNode != null) ? actorNode.toPlainTextString() : assigneeText;
        pEvent.setActor(actorText);
        // Print the resolved actor; dereferencing actorNode here would throw when it is null.
        System.out.println(actorText);

        // The timestamp is the text between the first pair of quotes after "datetime=".
        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        Matcher matcher = datetimePattern.matcher(timeNode.getText());
        if (matcher.find()) {
          String time = matcher.group().split("\"")[1];
          pEvent.setCreatedAt(time);
        }
        pList.add(pEvent);

      } else {
        // Get this node's child list; a null list means it is a leaf (value) node.
        NodeList childList = node.getChildren();
        if (null != childList) { // recurse only when there are children
          processUnassigned(childList, pList);
        }
      }
    }
    return pList;
  }