Ejemplo n.º 1
0
  /**
   * Recursively drills into {@code nodeP} and collects the nodes that carry body content.
   *
   * <p>For each child, in iteration order: link tags are collected as-is, script/style/select tags
   * are skipped, text nodes are collected when their text is non-empty, and any other node is
   * descended into, with whatever the recursive call returns appended to the result.
   *
   * <p>When at least one node was collected, {@code tableNumber} is incremented once and the
   * collected links are post-processed according to {@code type} (see below).
   *
   * @param nodeP the node whose children are examined
   * @param type when equal to {@code "1"} (ignoring case) link hrefs are left untouched; otherwise
   *     the {@code href} of every collected link is replaced by the configured search base URL
   *     followed by the encrypted original href
   * @return the collected nodes, or {@code null} when {@code nodeP} has no children, when nothing
   *     was collected, or when the traversal of the children failed
   * @throws Exception if the post-processing of the collected nodes fails; failures during the
   *     traversal itself are swallowed and reported as a {@code null} result
   */
  @SuppressWarnings("unchecked")
  protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
    NodeList nodeList = nodeP.getChildren();
    if ((nodeList == null) || (nodeList.size() == 0)) {
      return null;
    }

    List<Node> tableList = new ArrayList<Node>();
    try {
      for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) {
        Node node = e.nextNode();
        if (node instanceof LinkTag) {
          tableList.add(node);
        } else if (node instanceof ScriptTag
            || node instanceof StyleTag
            || node instanceof SelectTag) {
          // Script, style and select content is never treated as body text.
          continue;
        } else if (node instanceof TextNode) {
          if (node.getText().length() > 0) {
            tableList.add(node);
          }
        } else {
          List<Node> nested = extractHtml(node, type);
          if (nested != null) {
            tableList.addAll(nested);
          }
        }
      }
    } catch (Exception ex) {
      // Best effort: any failure while walking the children (including one raised by a
      // recursive call) makes this level yield nothing rather than propagate.
      return null;
    }

    if (tableList.isEmpty()) {
      return null;
    }

    // NOTE(review): tc is filled in below but is not stored or returned anywhere in this
    // method; confirm whether it was meant to be registered somewhere.
    TableContext tc = new TableContext();
    tc.setLinkList(new ArrayList());
    tc.setTextBuffer(new StringBuffer());
    tableNumber++;
    tc.setTableRow(tableNumber);

    // Base URL configured for the search redirect.
    String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);

    for (Node node : tableList) {
      if (node instanceof LinkTag) {
        LinkTag linkTag = (LinkTag) node;
        if (!"1".equalsIgnoreCase(type)) {
          // NOTE(review): links returned by a recursive call were already rewritten at the
          // deeper level, so a nested link has its href rewritten once per ancestor level.
          // Confirm whether that repeated wrapping is intended.
          linkTag.setAttribute(
              "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
        }
        tc.getLinkList().add(linkTag);
      } else {
        tc.getTextBuffer().append(node.getText());
      }
    }
    return tableList;
  }
 /**
  * Parses the text held by {@code fDocument} and hands every node produced by the parser to a
  * visitor.
  *
  * <p>{@code htmlTags} is replaced by a fresh empty list before parsing starts, and a new
  * {@code VHtmlNodeVisitor} is created for each node returned by the parser.
  *
  * @throws ParserException if the parser fails while accepting the input or iterating the nodes
  */
 private void parseHtml() throws ParserException {
   htmlTags = new ArrayList();

   Parser parser = new Parser();
   parser.setInputHTML(fDocument.get());

   NodeIterator nodes = parser.elements();
   while (nodes.hasMoreNodes()) {
     nodes.nextNode().accept(new VHtmlNodeVisitor());
   }
 }
Ejemplo n.º 3
0
  /**
   * Tests regular expression matching: a {@code RegexFilter} built from a year/date pattern must
   * select exactly one node of the sample page, and that node's HTML must equal the expected
   * text.
   */
  public void testRegularExpression() throws Exception {
    String target =
        "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";

    String html =
        "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";

    Parser parser = new Parser(new Lexer(html));
    RegexFilter filter =
        new RegexFilter(
            "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");

    // Every matching node must be the target text, and there must be exactly one of them.
    int count = 0;
    NodeIterator matches = parser.extractAllNodesThatMatch(filter).elements();
    while (matches.hasMoreNodes()) {
      assertEquals("text wrong", target, matches.nextNode().toHtml());
      count++;
    }
    assertEquals("wrong count", 1, count);
  }
  /**
   * Assign the underlying node filter for this wrapper and refresh the name chooser.
   *
   * <p>The parser is reset and walked so that {@code addName} can gather names into a set; each
   * gathered name is then added to {@code mName}, and the wrapped filter's current name is
   * selected.
   *
   * @param filter The filter to wrap; it is cast to {@code TagNameFilter}.
   * @param context The parser to use for conditioning this filter. Some filters need contextual
   *     information to provide to the user, i.e. for tag names or attribute names or values, so the
   *     Parser context is provided.
   */
  public void setNodeFilter(NodeFilter filter, Parser context) {
    mFilter = (TagNameFilter) filter;

    Set names = new HashSet();
    context.reset();
    try {
      NodeIterator nodes = context.elements();
      while (nodes.hasMoreNodes()) {
        addName(names, nodes.nextNode());
      }
    } catch (ParserException pe) {
      // Best effort: keep whatever names were gathered before the parser failed.
    }

    Iterator gathered = names.iterator();
    while (gathered.hasNext()) {
      mName.addItem(gathered.next());
    }
    mName.setSelectedItem(mFilter.getName());
  }
Ejemplo n.º 5
0
  /**
   * Checks that the CSS selector {@code "li + li"} matches exactly two nodes of the sample page
   * and that each match is an {@code LI} tag.
   */
  public void testSelectors() throws Exception {
    String html =
        "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>&gt;moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";

    Parser parser = new Parser(new Lexer(html));
    CssSelectorNodeFilter selector = new CssSelectorNodeFilter("li + li");

    int count = 0;
    NodeIterator matches = parser.extractAllNodesThatMatch(selector).elements();
    while (matches.hasMoreNodes()) {
      Tag tag = (Tag) matches.nextNode();
      assertEquals("tag name wrong", "LI", tag.getTagName());
      count++;
    }
    assertEquals("wrong count", 2, count);
  }
Ejemplo n.º 6
0
  /**
   * Parses the nodes selected by {@code ROOT}, converts each one into an {@code LCOdds} via
   * {@code parseRow}, and stores the results under {@code "lc_odds"}.
   *
   * <p>For every selected node its children are narrowed to those matching {@code tdFilter}
   * before being handed to {@code parseRow}. Selected nodes that have no children are skipped,
   * as are rows for which {@code parseRow} returns {@code null}. {@code storeData} is called even
   * when nothing was collected; in that case a warning is logged first.
   *
   * @param parser the parser positioned on the page to crawl
   * @throws ParserException if parsing or iterating the selected nodes fails
   */
  @Override
  public void crawl(Parser parser) throws ParserException {
    List<LCOdds> data = new ArrayList<LCOdds>();
    NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT));
    for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) {
      NodeList cells = it.nextNode().getChildren();
      if (null == cells) {
        // A node without children has no cells to parse; skipping it avoids a
        // NullPointerException on the filter call below.
        continue;
      }
      cells.keepAllNodesThatMatch(tdFilter);

      LCOdds lc = parseRow(cells);
      if (null != lc) {
        data.add(lc);
      }
    }

    // persist
    if (data.isEmpty()) {
      log.warn(" -- [ 06_LC_2 ] data is empty !");
    }
    storeData("lc_odds", data);
  }
Ejemplo n.º 7
0
  /**
   * Extracts the page content from the given HTML and saves it as a {@code Page}.
   *
   * <p>Every {@code div} whose {@code class} attribute is {@code blkContainerSblkCon} is
   * inspected, and the string text of each paragraph tag found among its direct children is
   * concatenated. When the concatenated text is blank nothing is saved; otherwise a {@code Page}
   * carrying {@code preUrl} and the text is logged and passed to {@code pageSer.save}.
   *
   * <p>Added by Ethan Lam on 2011-11-23.
   *
   * @param htmlPageContent the HTML text to parse
   * @param preUrl the URL recorded on the saved page
   * @throws ParserException if the HTML cannot be parsed or iterated
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);

    NodeFilter containerFilter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));

    StringBuilder collected = new StringBuilder();
    for (NodeIterator containers = parser.parse(containerFilter).elements();
        containers.hasMoreNodes(); ) {
      Div container = (Div) containers.nextNode();
      NodeList children = container.getChildren();
      if (children != null) {
        for (NodeIterator kids = children.elements(); kids.hasMoreNodes(); ) {
          Node child = kids.nextNode();
          if (child instanceof ParagraphTag) {
            collected.append(((ParagraphTag) child).getStringText());
          }
        }
      }
    }

    String segment = collected.toString();
    if ("".equals(segment.trim())) {
      // Nothing but whitespace was found: do not create a page.
      return;
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(segment);
    LoggerUtil.info(preUrl + "获取到的页面内容:", segment);
    pageSer.save(page);
  }
Ejemplo n.º 8
0
  /**
   * Processes the target hyperlink nodes of a page.
   *
   * <p>Every {@code a} tag whose {@code target} attribute is {@code _blank} is examined; links
   * for which {@code filterHandler.isLinkTagFilter} returns {@code false} are handed to the crawl
   * queue together with {@code preUrl}.
   *
   * <p>This method is best effort: it never throws. Any failure is logged and the remaining
   * links of the page are abandoned.
   *
   * @param htmlPageContent the HTML text to parse
   * @param preUrl passed to the crawl queue with each new link (presumably the URL of the page the
   *     links were found on; verify against callers)
   */
  public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(htmlPageContent);
      NodeFilter filter =
          new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
      NodeList nodeList = parser.parse(filter);
      LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
      if (nodeList == null) {
        // Nothing to iterate; the count logged above already reports zero.
        return;
      }

      NodeIterator it = nodeList.elements();
      while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        if (!(node instanceof LinkTag)) {
          continue;
        }
        LinkTag link = (LinkTag) node;
        if (!filterHandler.isLinkTagFilter(link)) {
          LoggerUtil.debug("ParserHandler  ", link.getLink(), link.getLinkText());
          CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
        }
      }
    } catch (Exception e) {
      // Still best effort, but leave a trace instead of failing silently.
      LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e);
    }
  }
 /**
  * Finds the conversation whose HTML response contains the form that targets the given SAML
  * conversation's request URL.
  *
  * <p>Conversations with a lower index than {@code samlId} are examined from the closest one
  * down to index 0. For each, the response is parsed; when the parsed content is an HTML node
  * list, its form tags are searched for one whose {@code action} attribute equals the request
  * URL of the SAML conversation. Forms without an {@code action} attribute, or whose action is
  * not a well-formed URL, are skipped without affecting the remaining forms.
  *
  * @param samlId the conversation carrying the SAML message
  * @return the id of the first matching conversation, or {@code null} when none is found
  */
 public ConversationID findCorrespondingHTMLFormConversation(ConversationID samlId) {
   ConversationModel conversationModel = this.model.getConversationModel();
   HttpUrl samlHttpUrl = conversationModel.getRequestUrl(samlId);
   int samlConversationIndex = conversationModel.getIndexOfConversation(samlId);
   for (int conversationIndex = samlConversationIndex - 1;
       conversationIndex >= 0;
       conversationIndex--) {
     ConversationID id = conversationModel.getConversationAt(conversationIndex);
     Response response = conversationModel.getResponse(id);
     HttpUrl httpUrl = conversationModel.getRequestUrl(id);
     Object parsedContent = Parser.parse(httpUrl, response);
     // instanceof is false for null, so this also covers unparseable responses.
     if (!(parsedContent instanceof org.htmlparser.util.NodeList)) {
       continue;
     }
     org.htmlparser.util.NodeList htmlNodeList = (org.htmlparser.util.NodeList) parsedContent;
     org.htmlparser.util.NodeList forms = htmlNodeList.searchFor(FormTag.class);
     try {
       for (NodeIterator ni = forms.elements(); ni.hasMoreNodes(); ) {
         FormTag form = (FormTag) ni.nextNode();
         String formAction = form.getAttribute("action");
         if (null == formAction) {
           // A form without an action attribute cannot be compared to the SAML URL.
           continue;
         }
         try {
           if (samlHttpUrl.equals(new HttpUrl(formAction))) {
             return id;
           }
         } catch (MalformedURLException ex) {
           // Handled per form so that one bad action does not hide the other forms.
           this._logger.log(Level.WARNING, "Malformed action url: {0}", ex.getMessage());
         }
       }
     } catch (ParserException ex) {
       // The Throwable overload does not substitute placeholders, so none are used here.
       this._logger.log(Level.WARNING, "Looking for forms, got a parser exception", ex);
     }
   }
   return null;
 }