예제 #1
0
 /**
  * Parses the examiner's current HTML and returns every node accepted by {@code filter}.
  *
  * <p>The whole document is parsed first (with a {@code null} parse filter); the resulting list is
  * then searched with the recursive flag set to {@code true}.
  *
  * @param filter the filter each candidate node is tested against
  * @return the nodes that matched the filter
  * @throws Exception if the HTML cannot be obtained or parsed
  */
 private NodeList getMatchingTags(NodeFilter filter) throws Exception {
   Lexer lexer = new Lexer(new Page(examiner.html()));
   NodeList allNodes = new Parser(lexer).parse(null);
   return allNodes.extractAllNodesThatMatch(filter, true);
 }
예제 #2
0
파일: AreaTest.java 프로젝트: zhaoccx/LS
  /**
   * Parses the news page at the given URL and prints the cleaned text of the last child of every
   * node whose parent is a {@code <p class="MsoNormal">} element.
   *
   * <p>The number of matching nodes is printed first. Matching nodes that have no children are
   * skipped. Parser failures are logged at SEVERE level and not rethrown.
   *
   * @param url the news link to parse
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);

      NodeFilter innerFilter =
          new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
      NodeFilter xk = new HasParentFilter(innerFilter);
      NodeList nodes = parser.extractAllNodesThatMatch(xk);
      System.out.println(nodes.size());
      for (int i = 0; i < nodes.size(); i++) {
        Node lastChild = nodes.elementAt(i).getLastChild();
        if (lastChild == null) {
          // Leaf nodes (for example plain text directly inside the paragraph) have no children.
          // Skip them rather than fail with a NullPointerException that the catch below would
          // not handle.
          continue;
        }
        System.out.println(
            replaceBlank(lastChild.getText().replaceAll("span", "").replaceAll(" ", "")));
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
  }
예제 #3
0
  /**
   * Recursively drills into {@code nodeP} and collects its link nodes and non-empty text nodes.
   *
   * <p>Script, style and select tags are ignored; any other tag is descended into and whatever
   * that nested call returns is appended to the result. After collection, unless {@code type} is
   * {@code "1"} (case-insensitive), the {@code href} of every collected link is replaced with the
   * configured search base URL followed by the encrypted original href. A {@code TableContext} is
   * populated along the way but is neither stored nor returned by this method; {@code tableNumber}
   * is incremented once per non-empty result.
   *
   * <p>NOTE(review): links found by a nested call are processed again by each enclosing call, so
   * their hrefs appear to be rewritten once per nesting level - confirm this is intended.
   *
   * @param nodeP the node whose children are examined
   * @param type rewrite switch; {@code "1"} leaves link hrefs untouched
   * @return the collected nodes, or {@code null} when {@code nodeP} has no children, nothing was
   *     collected, or an exception occurred while traversing
   * @throws Exception declared for callers; traversal failures are converted to a {@code null}
   *     result
   */
  @SuppressWarnings("unchecked")
  protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
    NodeList children = nodeP.getChildren();
    if (children == null || children.size() == 0) {
      return null;
    }

    ArrayList collected = new ArrayList();
    try {
      NodeIterator it = children.elements();
      while (it.hasMoreNodes()) {
        Node child = (Node) it.nextNode();
        if (child instanceof LinkTag) {
          collected.add(child);
          continue;
        }
        if (child instanceof ScriptTag
            || child instanceof StyleTag
            || child instanceof SelectTag) {
          // These tags never contribute to the result.
          continue;
        }
        if (child instanceof TextNode) {
          if (child.getText().length() > 0) {
            collected.add(child);
          }
          continue;
        }
        // Any other tag: descend and merge whatever the nested call found.
        List nested = extractHtml(child, type);
        if (nested != null && nested.size() > 0) {
          collected.addAll(nested);
        }
      }
    } catch (Exception e) {
      return null;
    }

    if (collected.isEmpty()) {
      return null;
    }

    TableContext tc = new TableContext();
    tc.setLinkList(new ArrayList());
    tc.setTextBuffer(new StringBuffer());
    tableNumber++;
    tc.setTableRow(tableNumber);

    // Fetch the configured search URL.
    String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);
    boolean rewriteLinks = !"1".equalsIgnoreCase(type);

    for (Object item : collected) {
      Node node = (Node) item;
      if (!(node instanceof LinkTag)) {
        tc.getTextBuffer().append(node.getText());
        continue;
      }
      LinkTag linkTag = (LinkTag) node;
      if (rewriteLinks) {
        linkTag.setAttribute("href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
      }
      tc.getLinkList().add(linkTag);
    }
    return collected;
  }
 /**
  * Derives the AU name from the first {@code <title>} element on the start page.
  *
  * <p>The {@code recomputeRegName} flag is cleared once the page has been parsed, whether or not
  * a title was found.
  *
  * @return the title text, or {@code null} when not yet started, when no cached URL or title is
  *     available, or when the page cannot be resolved or parsed (the failure is logged)
  */
 String recomputeRegName() {
   if (!isStarted()) {
     // This can be invoked (several times, mostly from logging) before enough mechanism has
     // started to make it possible to resolve the CuUrl below.
     return null;
   }
   try {
     CachedUrl cu = makeCachedUrl(m_registryUrl);
     if (cu == null) {
       return null;
     }
     Parser parser = new Parser(CuUrl.fromCu(cu).toString());
     NodeList titles = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
     Node[] titleNodes = titles.toNodeArray();
     recomputeRegName = false;
     if (titleNodes.length < 1) {
       return null;
     }
     // Only the first title found is used.
     TitleTag first = (TitleTag) titleNodes[0];
     return (first == null) ? null : first.getTitle();
   } catch (MalformedURLException e) {
     log.warning("recomputeRegName", e);
     return null;
   } catch (ParserException e) {
     Throwable cause = e.getThrowable();
     if (cause instanceof FileNotFoundException) {
       // A missing page is reported without the stack trace.
       log.warning("recomputeRegName: " + cause.toString());
     } else {
       log.warning("recomputeRegName", e);
     }
     return null;
   }
 }
예제 #5
0
  /**
   * Extracts the paragraph markup from every {@code <div class="blkContainerSblkCon">} in the
   * given HTML and saves it as a page record.
   *
   * <p>Only {@code ParagraphTag} nodes that are direct children of a matching div contribute.
   * Nothing is saved when the collected content is blank.
   *
   * @param htmlPageContent the HTML to scan
   * @param preUrl the URL stored on the saved page record
   * @throws ParserException if the HTML cannot be parsed
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter containerFilter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList containers = parser.parse(containerFilter);

    StringBuilder collected = new StringBuilder();
    for (NodeIterator outer = containers.elements(); outer.hasMoreNodes(); ) {
      NodeList children = ((Div) outer.nextNode()).getChildren();
      if (children == null) {
        continue;
      }
      for (NodeIterator inner = children.elements(); inner.hasMoreNodes(); ) {
        Node child = inner.nextNode();
        if (child instanceof ParagraphTag) {
          collected.append(((ParagraphTag) child).getStringText());
        }
      }
    }

    String content = collected.toString();
    if ("".equals(content.trim())) {
      return;
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(content);
    LoggerUtil.info(preUrl + "获取到的页面内容:", content);
    pageSer.save(page);
  }
예제 #6
0
 /**
  * Collects the links of an HTML document whose markup contains a given substring.
  *
  * <p>Collection stops as soon as {@code CatchNum} links have been gathered. A parse failure is
  * printed to standard error and whatever was collected so far is returned.
  *
  * @param result the HTML text to scan
  * @param coditions the substring a link's HTML must contain to be kept
  * @param codeKind the character set name handed to the parser
  * @return the matching {@code LinkTag} objects, in document order
  */
 public static List getLinksByConditions(String result, String coditions, String codeKind) {
   // Page encoding configuration - to do by shengf.
   Parser parser = Parser.createParser(result, codeKind);
   List links = new ArrayList();
   try {
     Node[] candidates = parser.parse(new NodeClassFilter(LinkTag.class)).toNodeArray();
     for (int idx = 0; idx < candidates.length; idx++) {
       if (!(candidates[idx] instanceof LinkTag)) {
         continue;
       }
       LinkTag link = (LinkTag) candidates[idx];
       if (link.toHtml().indexOf(coditions) < 0) {
         continue;
       }
       links.add(link);
       if (links.size() >= CatchNum) {
         return links;
       }
     }
   } catch (ParserException e) {
     e.printStackTrace();
   }
   return links;
 }
예제 #7
0
  /**
   * Downloads the page at {@code page.url} and walks over all of its {@code <div>} elements.
   *
   * <p>The branch for the {@code pl_addToBasket} div is currently a placeholder: the div is
   * located but nothing is done with it.
   *
   * @throws Exception if the URL is invalid, the connection fails, or the page cannot be parsed
   */
  public void checkprice() throws Exception {
    HttpURLConnection connection = (HttpURLConnection) new URL(page.url).openConnection();
    connection.setConnectTimeout(Constant.connect_timeout);
    connection.connect();

    Parser parser = new Parser(connection);
    parser.setEncoding(Constant.ENCODE);

    OrFilter filters = new OrFilter();
    filters.setPredicates(new NodeFilter[] {new NodeClassFilter(Div.class)});
    NodeList divs = parser.extractAllNodesThatMatch(filters);

    for (int idx = 0; idx < divs.size(); idx++) {
      Node candidate = divs.elementAt(idx);
      if (!(candidate instanceof Div)) {
        continue;
      }
      String cssClass = ((Div) candidate).getAttribute("class");
      if ("pl_addToBasket".equalsIgnoreCase(cssClass)) {
        // Placeholder: the original intent was to return the product name from this div.
      }
    }
  }
예제 #8
0
  /**
   * Turns the news box of an HTML page into a list of news items.
   *
   * <p>The element with class {@code box_news} is located after HTML comments are stripped; each
   * element of class {@code subItem} inside it yields one item. The title comes from the
   * {@code subItemtitle} element, the description from the fourth sibling after that element, and
   * the URL from the first link matching {@code /index.php.*}, prefixed with {@code URL_PREFIX}.
   *
   * @param content the raw HTML of the page
   * @return the news items in document order
   * @throws Exception if the markup cannot be parsed or does not have the expected structure
   */
  public List<Newsitem> parseContent(String content) throws Exception {
    String withoutComments = this.stripHtmlComments(content);
    Tag newsBox = this.extractTagByClassName(withoutComments, "box_news");
    NodeList itemNodes = this.extractTagsByClassName(newsBox.toHtml(), "subItem");

    List<Newsitem> result = new ArrayList<Newsitem>();
    for (int idx = 0; idx < itemNodes.size(); idx++) {
      NewsitemImpl item = new NewsitemImpl();
      Tag itemTag = (Tag) itemNodes.elementAt(idx);
      String itemHtml = itemTag.toHtml();

      Tag titleTag = this.extractTagByClassName(itemHtml, "subItemtitle");
      item.setTitle(titleTag.toPlainTextString());

      // The description lives four siblings after the title element.
      Node descriptionNode = titleTag;
      for (int hop = 0; hop < 4; hop++) {
        descriptionNode = descriptionNode.getNextSibling();
      }
      String description = descriptionNode.toPlainTextString();
      description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
      description = description.replace("&nbsp;Read More...", "");
      item.setDescription(description.trim());

      Tag firstLink = (Tag) extractLinks(itemTag.toHtml(), "/index.php.*").elementAt(0);
      item.setUrl(URL_PREFIX + firstLink.getAttribute("href"));
      result.add(item);
    }
    return result;
  }
예제 #9
0
파일: AreaTest.java 프로젝트: zhaoccx/LS
  /**
   * Extracts the plain-text body of a news article.
   *
   * <p>The inner HTML of every div matched by {@code newsContentFilter} is concatenated, the
   * supplied parser is reset, and the concatenated markup is converted to collapsed plain text.
   * A script remnant pattern and the literal {@code [我来说两句]} are then removed.
   *
   * @param newsContentFilter filter selecting the div(s) that hold the article body
   * @param parser parser positioned on the article page; it is reset after the divs are read
   * @return the cleaned article text; an empty string when no content was matched; {@code null}
   *     when the initial parse fails (the failure is logged)
   */
  public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();

    try {
      NodeList newsContentList = parser.parse(newsContentFilter);
      for (int i = 0; i < newsContentList.size(); i++) {
        Div newsContenTag = (Div) newsContentList.elementAt(i);
        builder.append(newsContenTag.getStringText());
      }
      content = builder.toString();
      parser.reset();

      // builder.toString() is never null, so emptiness is the only meaningful "no content" test.
      if (!content.isEmpty()) {
        // A separate parser turns the collected markup into plain text.
        Parser textParser = Parser.createParser(content, "utf8");
        StringBean sb = new StringBean();
        sb.setCollapse(true);
        textParser.visitAllNodesWith(sb);
        content = sb.getStrings();

        // Strip leftover inline-script fragments such as
        // "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } ... }catch(e){}".
        content = content.replaceAll("\\\".*[a-z].*\\}", "");

        content = content.replace("[我来说两句]", "");
      } else {
        System.out.println("没有得到新闻内容!");
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }

    return content;
  }
예제 #10
0
  /**
   * Extract the object <code>PARAM</code> tags from the child list.
   *
   * <p>Only child tags named {@code PARAM} that carry a non-empty {@code NAME} attribute are
   * recorded. Keys are the upper-cased names; values are the {@code VALUE} attributes, which may
   * be {@code null}.
   *
   * @return The list of object parameters (keys and values are String objects); empty when this
   *     tag has no children.
   */
  public HashMap createObjectParamsTable() {
    NodeList kids;
    Node node;
    Tag tag;
    String paramName;
    String paramValue;
    HashMap ret;

    ret = new HashMap();
    kids = getChildren();
    if (null != kids) {
      for (int i = 0; i < kids.size(); i++) {
        // Index the same list whose size bounds the loop.
        node = kids.elementAt(i);
        if (node instanceof Tag) {
          tag = (Tag) node;
          if (tag.getTagName().equals("PARAM")) {
            paramName = tag.getAttribute("NAME");
            if (null != paramName && 0 != paramName.length()) {
              paramValue = tag.getAttribute("VALUE");
              ret.put(paramName.toUpperCase(), paramValue);
            }
          }
        }
      }
    }

    return (ret);
  }
예제 #11
0
 /**
  * Wraps a table row node and creates a {@code Cell} for each of its {@code TableColumn}
  * children, in document order.
  *
  * @param rowNode the row tag to wrap
  */
 public Row(CompositeTag rowNode) {
   this.rowNode = rowNode;
   NodeList nodeList = rowNode.getChildren();
   if (nodeList == null) {
     // A tag without any content has no child list; such a row simply has no cells.
     return;
   }
   for (int i = 0; i < nodeList.size(); i++) {
     Node node = nodeList.elementAt(i);
     if (node instanceof TableColumn) {
       cells.add(new Cell((TableColumn) node));
     }
   }
 }
예제 #12
0
  /**
   * Collects the links found on a web page, keeping only those the given {@code Cobweb} accepts.
   *
   * <p>Two kinds of nodes are considered: {@code <a>} tags and nodes whose text starts with
   * {@code frame src=}. Accepted anchor links are stored with {@code ?}, {@code &} and {@code |}
   * percent-encoded and {@code #} removed; accepted frame links are stored as written in the
   * {@code src} attribute, without surrounding quotes.
   *
   * @param url the page to fetch and scan
   * @param cobweb supplies the page character set and decides which links are kept
   * @return the accepted links; empty when the page cannot be parsed (the failure is printed)
   */
  public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();

    try {
      Parser parser = new Parser(url);
      parser.setEncoding(cobweb.getCharSet());

      // Filter for <frame> tags, used to pull out the src attribute of a frame.
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      // Accept both <a> tags and <frame> tags.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          String linkUrl = ((LinkTag) tag).getLink();
          if (cobweb.accept(linkUrl)) {
            links.add(
                linkUrl
                    .replace("?", "%3F")
                    .replace("&", "%26")
                    .replace("|", "%7C") // '|' is 0x7C; percent-encoding uses hex, not decimal 124
                    .replace("#", ""));
          }
        } else { // <frame> tag, e.g. <frame src="test.html"/>
          // The frame filter guarantees the text starts with "frame src=", so "src=" is present.
          String text = tag.getText();
          int valueStart = text.indexOf("src=") + 4;
          int valueEnd = text.indexOf(' ', valueStart);
          if (valueEnd == -1) {
            // src is the last attribute: the tag text carries no closing '>', so the value runs
            // to the end of the text.
            valueEnd = text.length();
          }
          String frameUrl = text.substring(valueStart, valueEnd);
          if (frameUrl.endsWith("\"/") || frameUrl.endsWith("'/")) {
            // Drop the '/' of a self-closing tag that directly follows the closing quote.
            frameUrl = frameUrl.substring(0, frameUrl.length() - 1);
          }
          if (frameUrl.length() >= 2) {
            char first = frameUrl.charAt(0);
            char last = frameUrl.charAt(frameUrl.length() - 1);
            if ((first == '"' || first == '\'') && first == last) {
              frameUrl = frameUrl.substring(1, frameUrl.length() - 1);
            }
          }
          if (cobweb.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
예제 #13
0
 /**
  * Stamps the execution result onto the row by setting the {@code class} attribute of every
  * child tag of the row node to the result's string form.
  *
  * @param executionResult the result whose {@code toString()} becomes the class value
  */
 private void setExecutionResult(ExecutionResult executionResult) {
   NodeList children = rowNode.getChildren();
   for (int idx = 0; idx < children.size(); idx++) {
     Node child = children.elementAt(idx);
     if (!(child instanceof Tag)) {
       continue;
     }
     ((Tag) child).setAttribute("class", executionResult.toString(), '"');
   }
 }
예제 #14
0
  /**
   * Returns the options of this <code>SELECT</code> tag.
   *
   * <p>The search for {@code OptionTag} children is recursive.
   *
   * @return The {@code <OPTION>} tags contained by this tag.
   */
  public OptionTag[] getOptionTags() {
    NodeList options = searchFor(OptionTag.class, true);
    OptionTag[] result = new OptionTag[options.size()];
    options.copyToNodeArray(result);
    return result;
  }
예제 #15
0
  /**
   * Reads the value of the {@code <input id="__VIEWSTATE">} element from the given HTML and
   * stores it in {@code viewState}.
   *
   * @param html the page markup, parsed with the {@code gb2312} character set
   * @throws IllegalStateException if the page contains no {@code __VIEWSTATE} input
   * @throws Exception if the markup cannot be parsed
   */
  public static void setViewState(String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));

    NodeList nodes = parser.parse(filter);
    if (nodes.size() == 0) {
      // Fail with a clear message instead of a NullPointerException on the missing element.
      throw new IllegalStateException("No <input id=\"__VIEWSTATE\"> element found in page");
    }
    InputTag node = (InputTag) nodes.elementAt(0);

    viewState = node.getAttribute("value");
  }
예제 #16
0
 /**
  * Wraps a table node and creates a {@code Row} for each of its {@code TableRow} or
  * {@code TableHeader} children, in document order.
  *
  * @param tableNode the table tag to wrap
  */
 public HtmlTable(TableTag tableNode) {
   this.tableNode = tableNode;
   NodeList nodeList = tableNode.getChildren();
   if (nodeList == null) {
     // A table without any content has no child list; it simply has no rows.
     return;
   }
   for (int i = 0; i < nodeList.size(); i++) {
     Node node = nodeList.elementAt(i);
     if (node instanceof TableRow || node instanceof TableHeader) {
       rows.add(new Row((CompositeTag) node));
     }
   }
 }
예제 #17
0
  /**
   * Reads the value of the {@code <input id="__EVENTVALIDATION">} element from the given HTML
   * and stores it in {@code eventValidation}.
   *
   * @param html the page markup, parsed with the {@code gb2312} character set
   * @throws ParserException if the markup cannot be parsed or contains no
   *     {@code __EVENTVALIDATION} input
   */
  public static void setEventValidation(String html) throws ParserException {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(
            new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
    NodeList nodes = parser.parse(filter);
    if (nodes.size() == 0) {
      // Report the missing element through the declared exception type rather than an NPE.
      throw new ParserException("No <input id=\"__EVENTVALIDATION\"> element found in page");
    }
    InputTag node = (InputTag) nodes.elementAt(0);

    eventValidation = node.getAttribute("value");
  }
 /**
  * Fetches the page at {@code url} and returns the target of every link tag on it.
  *
  * @param url the page to fetch and scan
  * @return the link targets in document order
  * @throws ParserException if the page cannot be fetched or parsed
  */
 public static List<String> getLinks(String url) throws ParserException {
   Parser htmlParser = new Parser(url);
   List<String> targets = new LinkedList<String>();
   NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
   int total = anchors.size();
   for (int idx = 0; idx < total; idx++) {
     targets.add(((LinkTag) anchors.elementAt(idx)).getLink());
   }
   return targets;
 }
  /**
   * Adds the tag name of {@code node}, and recursively those of its children, to {@code set}.
   *
   * <p>Nodes that are not tags contribute nothing; only composite tags are descended into.
   *
   * @param set The set to add to.
   * @param node The node to get the names from.
   */
  protected void addName(Set set, Node node) {
    if (!(node instanceof Tag)) {
      return;
    }
    set.add(((Tag) node).getTagName());

    if (!(node instanceof CompositeTag)) {
      return;
    }
    NodeList kids = ((CompositeTag) node).getChildren();
    if (kids == null) {
      return;
    }
    for (int idx = 0; idx < kids.size(); idx++) {
      addName(set, kids.elementAt(idx));
    }
  }
예제 #20
0
  /**
   * Parses the news page at the given URL and prints the collected fields.
   *
   * <p>Four parse passes are made over the page (decoded as GB2312): the {@code <h1>} title, the
   * {@code media_name} span (source), the {@code pub_date} span (source time, with the Chinese
   * year/month/day markers turned into separators), and the {@code artibody} div (body, with its
   * direct {@code <div>} children removed). The parser is reset after every pass. Any failure is
   * printed to standard error and not rethrown.
   *
   * @param url the news link to parse
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      parser.setEncoding("GB2312");

      // Title
      NodeList headlineNodes = parser.parse(new TagNameFilter("h1"));
      String headline = parserUtil.getNodeListText(headlineNodes);
      parser.reset(); // must reset after every pass, otherwise later passes find nothing
      System.out.println(headline);

      // Source
      NodeFilter mediaFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
      String media = parserUtil.getNodeListText(parser.parse(mediaFilter));
      parser.reset();
      System.out.println(media);

      // Source time
      NodeFilter dateFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
      String rawDate = parserUtil.getNodeListText(parser.parse(dateFilter));
      String published = rawDate.replace("年", "-");
      published = published.replace("月", "-");
      published = published.replace("日", " ");
      published = published.replace("&nbsp;", "");
      parser.reset();
      System.out.println(published);

      // Body
      NodeFilter bodyFilter =
          new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
      NodeList bodyNodes = parser.parse(bodyFilter);
      NodeList bodyChildren = bodyNodes.elementAt(0).getChildren();
      // Keep only the non-div children, dropping the parts that are not body text.
      bodyChildren.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div")));

      String body = parserUtil.getNodeListHTML(bodyNodes);
      System.out.println(body);
      parser.reset();
    } catch (Exception e) {
      // Covers ParserException as well; both were handled identically.
      e.printStackTrace();
    }
  }
예제 #21
0
  /**
   * Test tag name filtering: a {@code TagNameFilter} for a custom tag must match exactly that one
   * tag and report its name in upper case.
   */
  public void testTagName() throws ParserException {
    String inner = "<booty>Now is the time for all good men..</booty>";
    createParser("<html>" + inner + "</html>");

    NodeList matches = parser.extractAllNodesThatMatch(new TagNameFilter("booty"));

    assertEquals("only one element", 1, matches.size());
    Node first = matches.elementAt(0);
    assertSuperType("should be Tag", Tag.class, first);
    assertStringEquals("name", "BOOTY", ((Tag) first).getTagName());
  }
 /**
  * Reads the scrolling-brand fragment for a city and maps each brand id to its image URL.
  *
  * <p>The fragment location is built from {@code PATH}, {@code city}, {@code INCLUDE},
  * {@code fileName} and {@code STUFF}. Every link that is a direct child of a
  * {@code <div class="business_list_pic">} yields one entry: the key is the brand id derived from
  * the link target, the value is the {@code src} (or, when absent, {@code original}) attribute of
  * the last image inside the link that has a {@code class} attribute. A link without such an
  * image maps to an empty string.
  *
  * <p>NOTE(review): the {@code path} parameter is not used; the {@code PATH} constant is used
  * instead - confirm this is intended.
  *
  * @param path unused
  * @param city city segment of the fragment location
  * @param fileName file-name segment of the fragment location
  * @return brand id to image URL, in document order; whatever was collected before a failure is
  *     returned when parsing fails (the failure is printed)
  */
 public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
   Map<String, String> brandMap = new LinkedHashMap<String, String>();
   try {
     StringBuilder filePath = new StringBuilder();
     filePath.append(PATH);
     filePath.append(city);
     filePath.append(INCLUDE);
     filePath.append(fileName);
     filePath.append(STUFF);

     Parser parser = new Parser(filePath.toString());
     // Select every <div>; the class attribute is checked below.
     NodeList divList = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
     for (int i = 0; i < divList.size(); i++) {
       Div picDiv = (Div) divList.elementAt(i);
       if (!"business_list_pic".equalsIgnoreCase(picDiv.getAttribute("class"))) {
         continue;
       }
       NodeList hrefList = picDiv.getChildren();
       if (hrefList == null) {
         // An empty div has no child list; skip it instead of aborting the whole scan.
         continue;
       }
       for (int j = 0; j < hrefList.size(); j++) {
         Node hrefNode = hrefList.elementAt(j);
         if (!(hrefNode instanceof LinkTag)) {
           continue;
         }
         // The link target carries the brand id.
         String brandId = MParseBrand.getBrandId(((LinkTag) hrefNode).getLink());

         // Start from an empty image for every link so that a link without an image does not
         // inherit the image of the previous brand.
         String imgStr = "";
         NodeList imgList = hrefNode.getChildren();
         if (imgList != null) {
           for (int k = 0; k < imgList.size(); k++) {
             Node imgNode = imgList.elementAt(k);
             if (!(imgNode instanceof ImageTag)) {
               continue;
             }
             ImageTag image = (ImageTag) imgNode;
             if (image.getAttribute("class") != null) {
               imgStr = image.getAttribute("src");
               if (imgStr == null) {
                 imgStr = image.getAttribute("original");
               }
             }
           }
         }
         brandMap.put(brandId, imgStr);
       }
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
   return brandMap;
 }
예제 #23
0
파일: AreaTest.java 프로젝트: zhaoccx/LS
  /**
   * Gets the date of a news article.
   *
   * <p>Every node matched by the filter is treated as a div; when several match, the inner HTML
   * of the last one wins.
   *
   * @param dateFilter filter selecting the div that holds the date
   * @param parser parser positioned on the article page
   * @return the date text, or {@code null} when nothing matched or parsing failed (the failure is
   *     logged)
   */
  public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String latest = null;
    try {
      NodeList matches = parser.parse(dateFilter);
      int idx = 0;
      while (idx < matches.size()) {
        latest = ((Div) matches.elementAt(idx)).getStringText();
        idx++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return latest;
  }
예제 #24
0
파일: AreaTest.java 프로젝트: zhaoccx/LS
  /**
   * Gets the responsible editor, i.e. the author, of a news article.
   *
   * <p>Every node matched by the filter is treated as a div; when several match, the inner HTML
   * of the last one wins.
   *
   * @param newsauthorFilter filter selecting the div that holds the author
   * @param parser parser positioned on the article page
   * @return the author text, or an empty string when nothing matched or parsing failed (the
   *     failure is logged)
   */
  public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
    String author = "";
    try {
      NodeList matches = parser.parse(newsauthorFilter);
      int idx = 0;
      while (idx < matches.size()) {
        author = ((Div) matches.elementAt(idx)).getStringText();
        idx++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return author;
  }
예제 #25
0
  /**
   * Collects the links found on a web page, keeping only those the given filter accepts.
   *
   * <p>Two kinds of nodes are considered: {@code <a>} tags and nodes whose text starts with
   * {@code iframe} and contains {@code src=}. For the latter, the complete value of a
   * double-quoted {@code src} attribute is used; nodes without a double-quoted {@code src} are
   * skipped.
   *
   * @param url the page to fetch and scan
   * @param filter decides which links are kept
   * @return the accepted links; empty when the page cannot be parsed (the failure is printed)
   */
  public static Set<String> extracLinks(String url, LinkFilter filter) {

    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      // Filter for <iframe> tags, used to pull out the link held in their src attribute.
      NodeFilter frameFilter =
          new NodeFilter() {
            private static final long serialVersionUID = 1L;

            public boolean accept(Node node) {
              String text = node.getText();
              return text.startsWith("iframe") && text.contains("src=");
            }
          };
      // Accept both <a> tags and <iframe> tags.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          String linkUrl = ((LinkTag) tag).getLink();
          if (filter.accept(linkUrl)) {
            links.add(linkUrl);
          }
        } else { // <iframe> tag, e.g. <iframe src="test.html"/>
          String frame = tag.getText();
          int start = frame.indexOf("src=\"");
          if (start == -1) {
            // src is unquoted or single-quoted; there is no double-quoted value to extract.
            continue;
          }
          // Take everything between the opening quote and the matching closing quote, so the
          // URL keeps its last character and any query string.
          int valueStart = start + 5;
          int valueEnd = frame.indexOf('"', valueStart);
          if (valueEnd == -1) {
            continue;
          }
          String frameUrl = frame.substring(valueStart, valueEnd);
          if (filter.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
예제 #26
0
  /**
   * Scans the HTML read from {@code in} for anchor tags and returns the targets that the blog
   * detector does not classify as {@code NOT_A_BLOG}.
   *
   * <p>Each examined link is reported on standard output. The result contains no duplicates and
   * is in the natural (sorted) order of the link strings.
   *
   * @param in stream holding the page markup
   * @return the detected blog links
   * @throws BlogCrawlingException if the page cannot be read or parsed
   */
  private String[] processBlog(InputStream in) throws BlogCrawlingException {
    // A sorted set keeps the result free of duplicates.
    Set<String> blogLinks = new TreeSet<String>();

    try {
      Parser parser = new Parser(new Lexer(new Page(in, null)));

      // Only anchor tags are of interest.
      NodeList anchors = parser.parse(new TagNameFilter("a"));

      for (int idx = 0; idx < anchors.size(); idx++) {
        String target = ((LinkTag) anchors.elementAt(idx)).getLink();
        boolean isBlog = blogDetector.identifyURL(target, null) != Constants.NOT_A_BLOG;
        if (isBlog) {
          System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + target);
          blogLinks.add(target);
        } else {
          System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + target);
        }
      }

      return blogLinks.toArray(new String[blogLinks.size()]);

    } catch (ParserException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (IOException e) {
      // Also covers UnsupportedEncodingException, which was handled identically.
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    }
  }
예제 #27
0
파일: AreaTest.java 프로젝트: zhaoccx/LS
  /**
   * Gets the title of a news article.
   *
   * <p>Every node matched by the filter is treated as a heading tag; when several match, the
   * inner HTML of the last one wins.
   *
   * @param titleFilter filter selecting the heading that holds the title
   * @param parser parser positioned on the article page
   * @return the title text, or an empty string when nothing matched or parsing failed (the
   *     failure is logged)
   */
  public String getTitle(NodeFilter titleFilter, Parser parser) {
    String headline = "";
    try {
      NodeList matches = parser.parse(titleFilter);
      int idx = 0;
      while (idx < matches.size()) {
        headline = ((HeadingTag) matches.elementAt(idx)).getStringText();
        idx++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return headline;
  }
예제 #28
0
 /**
  * Computes how many table columns a row spans by summing the {@code colspan} of each of its
  * {@code TableColumn} children.
  *
  * <p>A cell without a {@code colspan} attribute, or with one that is not a valid integer,
  * counts as a single column.
  *
  * @param row the row to measure
  * @return the total column span; 0 when the row has no children
  */
 private int colspan(Row row) {
   NodeList rowNodes = row.rowNode.getChildren();
   if (rowNodes == null) {
     return 0;
   }
   int colspan = 0;
   for (int i = 0; i < rowNodes.size(); i++) {
     if (!(rowNodes.elementAt(i) instanceof TableColumn)) {
       continue;
     }
     String s = ((TableColumn) rowNodes.elementAt(i)).getAttribute("colspan");
     int span = 1;
     if (s != null) {
       try {
         span = Integer.parseInt(s.trim());
       } catch (NumberFormatException ignored) {
         // Malformed markup: fall back to the default span of one column instead of failing.
         span = 1;
       }
     }
     colspan += span;
   }
   return colspan;
 }
예제 #29
0
  /**
   * Test attribute filtering: a {@code HasAttributeFilter} on {@code id} must match only the
   * anchor that carries that attribute.
   */
  public void testAttribute() throws ParserException {
    String body =
        "<body>Now is the <a id=target><b>time</b></a> for all good "
            + "<a href=http://bongo.com>men</a>..</body>";
    createParser("<html>" + body + "</html>");

    NodeList matches = parser.extractAllNodesThatMatch(new HasAttributeFilter("id"));

    assertEquals("only one element", 1, matches.size());
    Node first = matches.elementAt(0);
    assertType("should be LinkTag", LinkTag.class, first);
    assertEquals("attribute value", "target", ((LinkTag) first).getAttribute("id"));
  }
예제 #30
0
  /**
   * Extracts the standard ids listed on one result page and records them under {@code pageNo}.
   *
   * <p>The table with id {@code ctl00_ContentPlaceHolder1_StandardView} is located and every row
   * from index 1 onwards is examined (row 0 is skipped - presumably a header row). For each row
   * the link inside the fourth column is read and the value of its {@code standardid} query
   * parameter is appended to {@code map}. Rows whose link, {@code href} or {@code standardid}
   * parameter is missing are recorded by row index in {@code error} instead.
   *
   * @param pageNo page number used as the key in {@code map} and {@code error}
   * @param html the page markup, parsed with the {@code gb2312} character set
   * @throws Exception if the markup cannot be parsed or does not have the expected table layout
   */
  private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter viewStateFilter =
        new AndFilter(
            new TagNameFilter("table"),
            new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));

    NodeList nodes = parser.parse(viewStateFilter);
    TableTag node = (TableTag) nodes.elementAt(0);

    final String idKey = "standardid=";
    TableRow[] rows = node.getRows();
    for (int i = 1; i < rows.length; i++) {
      TableColumn[] cols = rows[i].getColumns();
      TableColumn col = cols[3];
      LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);

      String href = (tag == null) ? null : tag.getAttribute("href");
      int start = (href == null) ? -1 : href.indexOf(idKey);
      if (start == -1) {
        // No link, no href, or no standardid parameter: remember the row as failed.
        List<Integer> failedRows = error.get(pageNo);
        if (failedRows == null) {
          failedRows = new ArrayList<Integer>();
        }
        failedRows.add(i);
        error.put(pageNo, failedRows);
        continue;
      }

      // The value runs up to the next parameter separator ("&" or "&amp;") or, when
      // standardid is the last parameter, to the end of the href.
      int valueStart = start + idKey.length();
      int end = href.indexOf('&', valueStart);
      if (end == -1) {
        end = href.length();
      }
      String standardId = href.substring(valueStart, end);

      List<String> lst = map.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<String>();
      }
      lst.add(standardId);
      map.put(pageNo, lst);
    }
  }