/**
  * Collects the scrolling-brand entries from a city's include file.
  *
  * <p>The file location is assembled as {@code PATH + city + INCLUDE + fileName + STUFF}. Every
  * {@code div} whose class is {@code business_list_pic} is scanned for direct link children; each
  * link contributes one entry that maps the brand id (obtained from the link address through
  * {@code MParseBrand.getBrandId}) to the image address found inside that link.
  *
  * @param path not read by this method; the location is built from the {@code PATH} constant
  * @param city city segment of the file location
  * @param fileName file-name segment of the file location
  * @return brand id to image address in document order; empty when nothing matched or parsing
  *     failed
  */
 public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
   Map<String, String> brandMap = new LinkedHashMap<String, String>();
   try {
     StringBuilder filePath = new StringBuilder();
     filePath.append(PATH).append(city).append(INCLUDE).append(fileName).append(STUFF);

     // Parse the document and keep only the <div> elements.
     Parser parser = new Parser(filePath.toString());
     NodeFilter divFilter = new NodeClassFilter(Div.class);
     NodeList divList = parser.extractAllNodesThatMatch(divFilter);

     for (int i = 0; i < divList.size(); i++) {
       Node picNode = divList.elementAt(i);
       String classStr = ((Div) picNode).getAttribute("class");
       if (!"business_list_pic".equalsIgnoreCase(classStr)) {
         continue;
       }
       NodeList hrefList = picNode.getChildren();
       if (null == hrefList) {
         // An empty container has no child list; nothing to collect from it.
         continue;
       }
       for (int j = 0; j < hrefList.size(); j++) {
         Node hrefNode = hrefList.elementAt(j);
         if (hrefNode instanceof LinkTag) {
           // The link address carries the brand id.
           String brandId = MParseBrand.getBrandId(((LinkTag) hrefNode).getLink());
           // The image is resolved per link so that a link without an image does not
           // inherit the image of the previous link.
           brandMap.put(brandId, findBrandImage(hrefNode.getChildren()));
         }
       }
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
   return brandMap;
 }

 /**
  * Returns the address of the last image in {@code imgList} that carries a {@code class}
  * attribute, preferring {@code src} and falling back to {@code original}.
  *
  * @param imgList children of a brand link; may be {@code null}
  * @return the image address, {@code ""} when no qualifying image exists, or {@code null} when
  *     the qualifying image has neither attribute
  */
 private static String findBrandImage(NodeList imgList) {
   String imgStr = "";
   if (null == imgList) {
     return imgStr;
   }
   for (int k = 0; k < imgList.size(); k++) {
     Node imgNode = imgList.elementAt(k);
     if (imgNode instanceof ImageTag && null != ((ImageTag) imgNode).getAttribute("class")) {
       imgStr = ((ImageTag) imgNode).getAttribute("src");
       if (null == imgStr) {
         imgStr = ((ImageTag) imgNode).getAttribute("original");
       }
     }
   }
   return imgStr;
 }
Beispiel #2
0
  /** Checks that filtering on the presence of an {@code id} attribute selects only that tag. */
  public void testAttribute() throws ParserException {
    final String guts =
        "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
    createParser("<html>" + guts + "</html>");

    NodeList matches = parser.extractAllNodesThatMatch(new HasAttributeFilter("id"));

    assertEquals("only one element", 1, matches.size());
    assertType("should be LinkTag", LinkTag.class, matches.elementAt(0));
    LinkTag anchor = (LinkTag) matches.elementAt(0);
    assertEquals("attribute value", "target", anchor.getAttribute("id"));
  }
Beispiel #3
0
 /**
  * Computes how many grid columns the cells of a row occupy.
  *
  * <p>Each {@code TableColumn} child contributes the integer value of its {@code colspan}
  * attribute. A cell without the attribute, or with a value that is not an integer, contributes
  * one column.
  *
  * @param row the row whose cells are counted
  * @return the summed column span; {@code 0} when the row has no children
  */
 private int colspan(Row row) {
   NodeList rowNodes = row.rowNode.getChildren();
   if (rowNodes == null) {
     // A row without any child nodes has no child list at all.
     return 0;
   }
   int colspan = 0;
   for (int i = 0; i < rowNodes.size(); i++) {
     if (!(rowNodes.elementAt(i) instanceof TableColumn)) {
       continue;
     }
     TableColumn column = (TableColumn) rowNodes.elementAt(i);
     String s = column.getAttribute("colspan");
     int width = 1;
     if (s != null) {
       try {
         width = Integer.parseInt(s.trim());
       } catch (NumberFormatException e) {
         // Malformed markup such as colspan="" or colspan="2px": count the cell once.
         width = 1;
       }
     }
     colspan += width;
   }
   return colspan;
 }
Beispiel #4
0
  /**
   * Replaces the enclosed {@code PARAM} children with tags built from the given map.
   *
   * <p>Existing child tags named {@code PARAM} are removed, each together with a whitespace-only
   * text node directly following it. One new tag per map entry is then appended, and the
   * resulting list is installed with {@code setChildren}.
   *
   * @param newObjectParams The new parameters; keys and values are cast to {@code String}.
   */
  public void setObjectParams(HashMap newObjectParams) {
    NodeList kids = getChildren();
    if (null == kids) {
      kids = new NodeList();
    } else {
      // Erase the current PARAM tags. The index only advances when nothing was removed,
      // because a removal shifts the following node into the current position.
      int i = 0;
      while (i < kids.size()) {
        Node node = kids.elementAt(i);
        boolean isParam = node instanceof Tag && ((Tag) node).getTagName().equals("PARAM");
        if (!isParam) {
          i++;
          continue;
        }
        kids.remove(i);
        // Remove the whitespace that trailed the tag, too.
        if (i < kids.size()) {
          Node follower = kids.elementAt(i);
          if (follower instanceof TextNode) {
            TextNode string = (TextNode) follower;
            if (0 == string.getText().trim().length()) {
              kids.remove(i);
            }
          }
        }
      }
    }

    // Append one PARAM tag per map entry.
    Iterator e = newObjectParams.entrySet().iterator();
    while (e.hasNext()) {
      Map.Entry entry = (Entry) e.next();
      List attributes = new ArrayList(); // should the tag copy the attributes?
      String paramName = (String) entry.getKey();
      String paramValue = (String) entry.getValue();
      attributes.add(new Attribute("PARAM", null));
      attributes.add(new Attribute(" "));
      attributes.add(new Attribute("VALUE", paramValue, '"'));
      attributes.add(new Attribute(" "));
      attributes.add(new Attribute("NAME", paramName.toUpperCase(), '"'));
      Tag tag = new TagNode(null, 0, 0, attributes);
      kids.add(tag);
    }

    // Install the rebuilt list as the new children.
    setChildren(kids);
  }
Beispiel #5
0
  /**
   * Fetches one page and prints the expected sequence values that the page does not contain.
   *
   * <p>The page is narrowed in three passes (nodes with {@code bgcolor="#DDE1FF"}, then nodes
   * having a {@code tr} attribute, then nodes having a {@code span} attribute). Every fifth node
   * of the final list, starting at position 5, is compared with the running value
   * {@code starNo * 100 + index}; each running value that differs from the node text is printed
   * before advancing to the next one.
   *
   * <p>NOTE(review): the inner loop ends only when the running value equals the node text, so a
   * text that is never reached by the counter keeps it spinning; confirm the page format
   * guarantees a match.
   *
   * @param url address of the page to inspect
   * @param starNo prefix of the expected sequence; values start at {@code starNo * 100}
   */
  public static void dealOnePage(String url, int starNo) {
    try {
      HttpURLConnection connection = (HttpURLConnection) (new URL(url)).openConnection();
      Parser parser = new Parser(connection);
      NodeList tableSet =
          parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));

      parser = new Parser(new Lexer(tableSet.toHtml()));
      NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));

      parser = new Parser(new Lexer(tdSet.toHtml()));
      PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
      factory.registerTag(new SpanTag());
      parser.setNodeFactory(factory);
      NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span"));

      int index = 0;
      for (int i = 5; i < spanSet.size(); i += 5) {
        String actual = spanSet.elementAt(i).toPlainTextString();
        String expected = String.valueOf(starNo * 100 + index++);
        // Report every expected value that is skipped before the actual one is reached.
        while (!actual.equals(expected)) {
          System.out.println(expected);
          expected = String.valueOf(starNo * 100 + index++);
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    } catch (IOException e) {
      // Also covers MalformedURLException, which is an IOException.
      e.printStackTrace();
    }
  }
Beispiel #6
0
  /**
   * Downloads {@code page.url} and scans its {@code div} elements for the one whose class is
   * {@code pl_addToBasket}.
   *
   * <p>No action is currently taken on a match; the method only performs the download and scan.
   * The connection is released when the scan finishes or fails.
   *
   * @throws Exception if the URL is malformed, the connection fails, or the page cannot be parsed
   */
  public void checkprice() throws Exception {
    URL url = new URL(page.url);
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    urlConnection.setConnectTimeout(Constant.connect_timeout);
    try {
      urlConnection.connect();

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      NodeClassFilter div_filter = new NodeClassFilter(Div.class);
      NodeList list = parser.extractAllNodesThatMatch(div_filter);

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof Div) {
          Div d = (Div) tag;
          String divclass = d.getAttribute("class");
          if ("pl_addToBasket".equalsIgnoreCase(divclass)) {
            // Match found; handling of the basket block is not implemented here.
          }
        }
      }
    } finally {
      // Release the underlying connection even when parsing throws.
      urlConnection.disconnect();
    }
  }
  /**
   * Collects article links from an HTML string.
   *
   * <p>All link tags of {@code content} are extracted, serialized and parsed a second time. A
   * link is kept when its lower-cased address starts with {@code pre} and its text is not
   * "详细内容". Kept links whose key is absent from {@code articleDocCache} are stored in
   * {@code LINKHASH}; links already cached are only logged.
   *
   * @param content HTML to scan
   * @param pre prefix the lower-cased link address must start with
   * @throws Exception if parsing fails
   */
  void docByHTML(String content, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setInputHTML(content);
    parser.setEncoding("GB2312");

    NodeList anchors = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() <= 0) {
      return;
    }

    // Second pass over the serialized links only.
    Parser p1 = new Parser();
    p1.setInputHTML(anchors.toHtml());
    NodeList linkList = p1.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (linkList == null || linkList.size() <= 0) {
      return;
    }

    for (int i = 0; i < linkList.size(); i++) {
      LinkTag link = (LinkTag) linkList.elementAt(i);
      boolean wanted =
          link.getLink().toLowerCase().startsWith(pre)
              && !link.getLinkText().equalsIgnoreCase("详细内容");
      if (!wanted) {
        continue;
      }
      if (null != articleDocCache.get(getKey(link.getLink()))) {
        logger.info(">> 已存在 [" + link.getLink() + "] 地址");
        continue;
      }
      LinkBean bean = new LinkBean();
      bean.setLink(link.getLink());
      bean.setName(link.getLinkText());
      LINKHASH.put(link.getLink(), bean);
    }
  }
Beispiel #8
0
  /** Checks that filtering by node class selects the single {@code BodyTag} of the document. */
  public void testNodeClass() throws ParserException {
    final String guts = "<body>Now is the time for all good men..</body>";
    createParser("<html>" + guts + "</html>");

    NodeList matches = parser.extractAllNodesThatMatch(new NodeClassFilter(BodyTag.class));

    assertEquals("only one element", 1, matches.size());
    assertType("should be BodyTag", BodyTag.class, matches.elementAt(0));
    BodyTag body = (BodyTag) matches.elementAt(0);
    assertEquals("only one child", 1, body.getChildCount());
    assertSuperType("should be Text", Text.class, body.getChildren().elementAt(0));
    assertStringEquals("html", guts, body.toHtml());
  }
Beispiel #9
0
  /**
   * Extracts the text of a news article.
   *
   * <p>Every node matched by {@code newsContentFilter} is cast to {@code Div} and its inner
   * markup is concatenated. When that markup is non-empty it is parsed again, flattened to text
   * by a collapsing {@code StringBean}, and cleaned of a script-remnant pattern and of the
   * "[我来说两句]" marker. When nothing was matched a notice is printed instead.
   *
   * @param newsContentFilter filter selecting the content elements
   * @param parser parser for the article page; it is reset by this method
   * @return the cleaned text; an empty string when no content was matched; {@code null} when the
   *     first parse throws a {@code ParserException}
   */
  public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;

    try {
      StringBuilder builder = new StringBuilder();
      NodeList newsContentList = parser.parse(newsContentFilter);
      for (int i = 0; i < newsContentList.size(); i++) {
        Div newsContenTag = (Div) newsContentList.elementAt(i);
        builder.append(newsContenTag.getStringText());
      }
      content = builder.toString();

      // Leave the caller's parser rewound regardless of whether content was found.
      parser.reset();

      // builder.toString() is never null, so emptiness is the real "nothing found" signal.
      if (content.length() > 0) {
        Parser textParser = Parser.createParser(content, "utf8");
        StringBean sb = new StringBean();
        sb.setCollapse(true);
        textParser.visitAllNodesWith(sb);
        content = sb.getStrings();

        // Drops leftovers of inline script, e.g. text running from a double quote up to a
        // closing brace.
        content = content.replaceAll("\\\".*[a-z].*\\}", "");

        content = content.replace("[我来说两句]", "");
      } else {
        System.out.println("没有得到新闻内容!");
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }

    return content;
  }
Beispiel #10
0
  /**
   * Parses a news page and prints the text of selected nodes.
   *
   * <p>Selects every node whose parent is a {@code p} tag with class {@code MsoNormal}, prints
   * how many were found, and then prints, for each node that has children, the text of its last
   * child with the substrings "span" and " " removed and passed through {@code replaceBlank}.
   *
   * @param url address of the news page
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);

      NodeFilter innerFilter =
          new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
      NodeFilter xk = new HasParentFilter(innerFilter);
      NodeList nodes = parser.extractAllNodesThatMatch(xk);
      System.out.println(nodes.size());
      for (int i = 0; i < nodes.size(); i++) {
        Node last = nodes.elementAt(i).getLastChild();
        if (last == null) {
          // Childless nodes (plain text directly inside the paragraph) have no last child;
          // dereferencing it would end the whole run with a NullPointerException.
          continue;
        }
        System.out.println(
            replaceBlank(last.getText().replaceAll("span", "").replaceAll(" ", "")));
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
  }
Beispiel #11
0
  /** Checks string filtering, first ignoring case and then case-sensitively. */
  public void testString() throws ParserException {
    final String guts =
        "<body>Now is the <a id=target><b>time</b></a> for all good <time>men</time>..</body>";
    createParser("<html>" + guts + "</html>");

    // Default matching: "Time" finds the text node "time".
    NodeList matches = parser.extractAllNodesThatMatch(new StringFilter("Time"));
    assertEquals("only one element", 1, matches.size());
    assertSuperType("should be String", Text.class, matches.elementAt(0));
    assertStringEquals("name", "time", ((Text) matches.elementAt(0)).getText());

    // Case-sensitive matching: "Time" finds nothing.
    matches = parser.extractAllNodesThatMatch(new StringFilter("Time", true));
    assertEquals("should be no elements", 0, matches.size());
  }
  /**
   * Builds news items from the {@code box_news} section of a page.
   *
   * <p>For every {@code subItem} element inside the section, the title is the plain text of its
   * {@code subItemtitle} element, the description is the plain text of the node four siblings
   * after the title (characters above U+00FF replaced by spaces, the "Read More" suffix removed,
   * trimmed), and the URL is {@code URL_PREFIX} plus the {@code href} of the first link matching
   * {@code /index.php.*}.
   *
   * @param content page markup
   * @return the items in document order
   * @throws Exception if any extraction step fails
   */
  public List<Newsitem> parseContent(String content) throws Exception {
    List<Newsitem> newsitems = new ArrayList<Newsitem>();

    String uncommented = this.stripHtmlComments(content);
    Tag newsDiv = this.extractTagByClassName(uncommented, "box_news");
    NodeList nodes = this.extractTagsByClassName(newsDiv.toHtml(), "subItem");

    for (int i = 0; i < nodes.size(); i++) {
      NewsitemImpl newsitem = new NewsitemImpl();
      Tag itemTable = (Tag) nodes.elementAt(i);

      Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
      newsitem.setTitle(titleTag.toPlainTextString());

      // The description sits four siblings after the title element.
      Node descriptionSpan = titleTag;
      for (int hop = 0; hop < 4; hop++) {
        descriptionSpan = descriptionSpan.getNextSibling();
      }
      String description = descriptionSpan.toPlainTextString();
      description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
      description = description.replace("&nbsp;Read More...", "");
      newsitem.setDescription(description.trim());

      NodeList itemLinks = extractLinks(itemTable.toHtml(), "/index.php.*");
      Tag linkTag = (Tag) itemLinks.elementAt(0);
      newsitem.setUrl(URL_PREFIX + linkTag.getAttribute("href"));

      newsitems.add(newsitem);
    }
    return newsitems;
  }
  /**
   * Collects article links from a page.
   *
   * <p>All link tags of the page are extracted, serialized and parsed a second time. A link is
   * stored in {@code LINKHASH} under its address when the lower-cased address starts with
   * {@code pre} and the link text is not "详细内容".
   *
   * @param url address of the page to scan
   * @param pre prefix the lower-cased link address must start with
   * @throws Exception if fetching or parsing fails
   */
  void doc(String url, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeList anchors = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() <= 0) {
      return;
    }

    // Second pass over the serialized links only.
    Parser p1 = new Parser();
    p1.setInputHTML(anchors.toHtml());
    NodeList linkList = p1.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (linkList == null || linkList.size() <= 0) {
      return;
    }

    for (int i = 0; i < linkList.size(); i++) {
      LinkTag link = (LinkTag) linkList.elementAt(i);
      boolean wanted =
          link.getLink().toLowerCase().startsWith(pre)
              && !link.getLinkText().equalsIgnoreCase("详细内容");
      if (wanted) {
        LinkBean bean = new LinkBean();
        bean.setLink(link.getLink());
        bean.setName(link.getLinkText());
        LINKHASH.put(link.getLink(), bean);
      }
    }
  }
  /**
   * Downloads listing pages 0 through 16 and prints one record line per link found inside
   * {@code h2} elements of class {@code field-content}.
   *
   * <p>Each line has the form {@code {"<count>","http://www.chi.ac.uk/<href>","<text>","0"},}
   * where {@code count} is a running number starting at 1 and {@code text} is the link markup
   * passed through {@code html2Str}. Pages that return no response entity are skipped.
   *
   * @param args unused
   * @throws Exception if a request or a parse fails
   */
  public static void main(String[] args) throws Exception {
    RequestConfig requestConfig =
        RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
    CloseableHttpClient httpclient =
        HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    try {
      int count = 1;
      for (int index = 0; index <= 16; index++) {
        HttpGet httpGet = new HttpGet(url3 + index + url4);
        HttpResponse response = httpclient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
          // Nothing to parse for this page; handing null markup to the parser would fail.
          continue;
        }
        String htmls = EntityUtils.toString(entity).replace("\t", " ");

        Parser parser = Parser.createParser(htmls, "utf-8");
        AndFilter headingFilter =
            new AndFilter(
                new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content"));
        NodeList headings = parser.extractAllNodesThatMatch(headingFilter);
        for (int k = 0; k < headings.size(); k++) {
          // Re-parse each heading on its own to pick out the links it contains.
          Parser headingParser = Parser.createParser(headings.elementAt(k).toHtml(), "utf-8");
          AndFilter linkFilter =
              new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
          NodeList links = headingParser.extractAllNodesThatMatch(linkFilter);
          for (int j = 0; j < links.size(); j++) {
            LinkTag link = (LinkTag) links.elementAt(j);
            String temp = link.toHtml();

            System.out.println(
                "{\""
                    + count
                    + "\",\"http://www.chi.ac.uk/"
                    + link.getAttribute("href")
                    + "\",\""
                    + html2Str(temp).replace("\r\n", "").trim()
                    + "\",\"0\"},");
            count++;
          }
        }
      }
    } finally {
      // Release pooled connections even when a request or parse fails.
      httpclient.close();
    }
  }
Beispiel #15
0
 /**
  * Wraps a table row node and collects a {@code Cell} for each of its {@code TableColumn}
  * children.
  *
  * @param rowNode the row tag to wrap; a row without children yields no cells
  */
 public Row(CompositeTag rowNode) {
   this.rowNode = rowNode;
   NodeList nodeList = rowNode.getChildren();
   if (nodeList == null) {
     // A tag without child nodes has no child list at all.
     return;
   }
   for (int i = 0; i < nodeList.size(); i++) {
     Node node = nodeList.elementAt(i);
     if (node instanceof TableColumn) {
       cells.add(new Cell((TableColumn) node));
     }
   }
 }
Beispiel #16
0
  /**
   * Converts a table row of exactly eight cells into an {@code LCOdds} record.
   *
   * <p>Cell 0 supplies the match code, cell 3 the off time, and cells 4 and 5 one odd each.
   *
   * @param cells the cells of one row
   * @return the parsed record, or {@code null} when the row does not have eight cells or any
   *     parsing step throws (the exception is passed to {@code warn})
   */
  private LCOdds parseRow(NodeList cells) {
    if (cells.size() != 8) {
      return null;
    }
    try {
      LCOdds odds = new LCOdds(Constants.PLAY_06_LC_2, "2,1");
      parseMatchCode(odds, cells.elementAt(0));
      parseOfftime(odds, formater, cells.elementAt(3));
      odds.addOdd(parseOdd(cells.elementAt(4)));
      odds.addOdd(parseOdd(cells.elementAt(5)));
      return odds;
    } catch (Exception e) {
      warn(log, e);
      return null;
    }
  }
Beispiel #17
0
  /**
   * Collects the links of a page that the given {@code Cobweb} accepts.
   *
   * <p>Anchor addresses are added with {@code ?}, {@code &} and {@code |} percent-encoded and
   * {@code #} removed. For tags whose text starts with {@code frame src=}, the quoted value of
   * the {@code src} attribute is added unchanged.
   *
   * @param url address of the page to scan
   * @param cobweb supplies the page encoding and decides which addresses are kept
   * @return the accepted addresses; empty when the page cannot be parsed
   */
  public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();

    try {
      Parser parser = new Parser(url);
      parser.setEncoding(cobweb.getCharSet());

      // Matches <frame> tags so that their src attribute can be extracted.
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      // Accept both <a> tags and <frame> tags.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) {
          String linkUrl = ((LinkTag) tag).getLink();
          if (cobweb.accept(linkUrl)) {
            links.add(
                linkUrl
                    .replace("?", "%3F")
                    .replace("&", "%26")
                    // '|' is 0x7C; "%124" would decode as 0x12 followed by '4'.
                    .replace("|", "%7C")
                    .replace("#", ""));
          }
        } else {
          // Extract the src value, e.g. from <frame src="test.html"/>.
          String frame = tag.getText();
          frame = frame.substring(frame.indexOf("src="));
          int end = frame.indexOf(" ");
          if (end == -1) {
            end = frame.indexOf(">");
          }
          if (end == -1) {
            // src is the last attribute and the text carries no closing bracket.
            end = frame.length();
          }
          // Skip the leading src=" (5 characters) and the closing quote.
          if (end - 1 >= 5) {
            String frameUrl = frame.substring(5, end - 1);
            if (cobweb.accept(frameUrl)) {
              links.add(frameUrl);
            }
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
Beispiel #18
0
  /** Checks that filtering on a {@code b} child selects only the link that contains one. */
  public void testChild() throws ParserException {
    final String guts =
        "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
    createParser("<html>" + guts + "</html>");

    NodeList matches =
        parser.extractAllNodesThatMatch(new HasChildFilter(new TagNameFilter("b")));

    assertEquals("only one element", 1, matches.size());
    assertType("should be LinkTag", LinkTag.class, matches.elementAt(0));
    LinkTag anchor = (LinkTag) matches.elementAt(0);
    assertEquals("three children", 3, anchor.getChildCount());
    assertSuperType("should be TagNode", Tag.class, anchor.getChildren().elementAt(0));
    Tag firstChild = (Tag) anchor.getChildren().elementAt(0);
    assertStringEquals("name", "B", firstChild.getTagName());
  }
Beispiel #19
0
 /**
  * Marks every child tag of this row with the given result by setting its {@code class}
  * attribute to the result's string form.
  *
  * @param executionResult the result to record; a row without children is left untouched
  */
 private void setExecutionResult(ExecutionResult executionResult) {
   NodeList cells = rowNode.getChildren();
   if (cells == null) {
     // A tag without child nodes has no child list at all.
     return;
   }
   for (int i = 0; i < cells.size(); i++) {
     Node cell = cells.elementAt(i);
     if (cell instanceof Tag) {
       ((Tag) cell).setAttribute("class", executionResult.toString(), '"');
     }
   }
 }
Beispiel #20
0
  /**
   * Stores in {@code eventValidation} the {@code value} attribute of the first {@code input}
   * element whose id is {@code __EVENTVALIDATION}.
   *
   * @param html page markup, parsed as gb2312
   * @throws ParserException if the markup cannot be parsed
   */
  public static void setEventValidation(String html) throws ParserException {
    Parser parser = Parser.createParser(html, "gb2312");
    NodeList matches =
        parser.parse(
            new AndFilter(
                new TagNameFilter("input"),
                new HasAttributeFilter("id", "__EVENTVALIDATION")));
    InputTag field = (InputTag) matches.elementAt(0);
    eventValidation = field.getAttribute("value");
  }
Beispiel #21
0
  /**
   * Stores in {@code viewState} the {@code value} attribute of the first {@code input} element
   * whose id is {@code __VIEWSTATE}.
   *
   * @param html page markup, parsed as gb2312
   * @throws Exception if the markup cannot be parsed
   */
  public static void setViewState(String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    NodeList matches =
        parser.parse(
            new AndFilter(
                new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE")));
    InputTag field = (InputTag) matches.elementAt(0);
    viewState = field.getAttribute("value");
  }
Beispiel #22
0
 /**
  * Wraps a table node and collects a {@code Row} for each {@code TableRow} or
  * {@code TableHeader} child.
  *
  * @param tableNode the table tag to wrap; a table without children yields no rows
  */
 public HtmlTable(TableTag tableNode) {
   this.tableNode = tableNode;
   NodeList nodeList = tableNode.getChildren();
   if (nodeList == null) {
     // A tag without child nodes has no child list at all.
     return;
   }
   for (int i = 0; i < nodeList.size(); i++) {
     Node node = nodeList.elementAt(i);
     if (node instanceof TableRow || node instanceof TableHeader) {
       rows.add(new Row((CompositeTag) node));
     }
   }
 }
Beispiel #23
0
  /** Checks that an and-filter selects only the node satisfying both child conditions. */
  public void testAnd() throws ParserException {
    final String guts =
        "<body>Now is the <a id=one><b>time</b></a> for all good <a id=two><b>men</b></a>..</body>";
    createParser("<html>" + guts + "</html>");

    HasChildFilter hasBoldChild = new HasChildFilter(new TagNameFilter("b"));
    HasChildFilter hasMenChild = new HasChildFilter(new StringFilter("men"));
    NodeList matches = parser.extractAllNodesThatMatch(new AndFilter(hasBoldChild, hasMenChild));

    assertEquals("only one element", 1, matches.size());
    assertType("should be LinkTag", LinkTag.class, matches.elementAt(0));
    LinkTag anchor = (LinkTag) matches.elementAt(0);
    assertEquals("attribute value", "two", anchor.getAttribute("id"));
  }
 /**
  * Returns the address of every link tag on a page, in document order.
  *
  * @param url address of the page to scan
  * @return the link addresses
  * @throws ParserException if the page cannot be fetched or parsed
  */
 public static List<String> getLinks(String url) throws ParserException {
   Parser htmlParser = new Parser(url);
   List<String> links = new LinkedList<String>();
   NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
   int total = anchors.size();
   for (int m = 0; m < total; m++) {
     links.add(((LinkTag) anchors.elementAt(m)).getLink());
   }
   return links;
 }
Beispiel #25
0
  /**
   * Appends detail strings extracted from a page to the list stored under {@code key} in
   * {@code detailMap}.
   *
   * <p>The table at index 5 of the page is used. For each of its rows 1 through 10, the text
   * nodes found under the second child of column 1 and of column 3 are each joined with single
   * spaces and trimmed, and the two resulting strings are appended to the list in that order.
   *
   * @param key map key whose list receives the strings
   * @param text page markup, parsed as gb2312
   * @throws Exception if parsing fails or the page lacks the expected structure
   */
  private static void addDetailToMap(String key, String text) throws Exception {
    Parser parser = Parser.createParser(text, "gb2312");
    NodeList tables = parser.parse(new TagNameFilter("table"));
    TableRow[] rows = ((TableTag) tables.elementAt(5)).getRows();

    for (int rowIndex = 1; rowIndex <= 10; rowIndex++) {
      TableColumn[] cols = rows[rowIndex].getColumns();

      StringBuffer left = new StringBuffer();
      NodeList leftSpan = cols[1].getChildren().elementAt(1).getChildren();
      for (int j = 0; j < leftSpan.size(); j++) {
        if (leftSpan.elementAt(j) instanceof TextNode) {
          left.append(leftSpan.elementAt(j).getText());
          left.append(" ");
        }
      }

      StringBuffer right = new StringBuffer();
      NodeList rightSpan = cols[3].getChildren().elementAt(1).getChildren();
      for (int j = 0; j < rightSpan.size(); j++) {
        if (rightSpan.elementAt(j) instanceof TextNode) {
          right.append(rightSpan.elementAt(j).getText());
          right.append(" ");
        }
      }

      // Fetch the list for this key, creating it on first use.
      List<String> details = detailMap.get(key);
      if (details == null) {
        details = new ArrayList<String>();
      }
      details.add(left.toString().trim());
      details.add(right.toString().trim());
      detailMap.put(key, details);
    }
  }
  /**
   * Adds the tag name of a node, and recursively the tag names of its descendants, to a set.
   * Nodes that are not tags contribute nothing.
   *
   * @param set The set to add to.
   * @param node The node to get the names from.
   */
  protected void addName(Set set, Node node) {
    if (!(node instanceof Tag)) {
      return;
    }
    set.add(((Tag) node).getTagName());

    if (!(node instanceof CompositeTag)) {
      return;
    }
    NodeList children = ((CompositeTag) node).getChildren();
    if (null == children) {
      return;
    }
    for (int i = 0; i < children.size(); i++) {
      addName(set, children.elementAt(i));
    }
  }
Beispiel #27
0
  /**
   * Parses a news page and prints its title, source, publication time and body.
   *
   * <p>The parser is reset after every extraction because a later {@code parse} call returns
   * nothing otherwise. Any exception is printed and ends the run.
   *
   * @param url address of the news page
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      parser.setEncoding("GB2312");

      // Title: text of the h1 elements.
      NodeList titleNodes = parser.parse(new TagNameFilter("h1"));
      String title = parserUtil.getNodeListText(titleNodes);
      parser.reset();
      System.out.println(title);

      // Source: span with id "media_name".
      NodeList sourceNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name")));
      String source = parserUtil.getNodeListText(sourceNodes);
      parser.reset();
      System.out.println(source);

      // Publication time: span with id "pub_date", with the date markers rewritten.
      NodeList timeNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date")));
      String rawTime = parserUtil.getNodeListText(timeNodes);
      String sourceTime =
          rawTime.replace("年", "-").replace("月", "-").replace("日", " ").replace("&nbsp;", "");
      parser.reset();
      System.out.println(sourceTime);

      // Body: div with id "artibody", with nested div children dropped from the first match.
      NodeList bodyNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")));
      NodeList bodyChildren = bodyNodes.elementAt(0).getChildren();
      bodyChildren.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div")));

      String body = parserUtil.getNodeListHTML(bodyNodes);
      System.out.println(body);
      parser.reset();

    } catch (Exception e) {
      // Covers ParserException as well; both were handled by printing the stack trace.
      e.printStackTrace();
    }
  }
Beispiel #28
0
  /** Checks that filtering by tag name finds a non-standard tag and reports its name upper-cased. */
  public void testTagName() throws ParserException {
    final String guts = "<booty>Now is the time for all good men..</booty>";
    createParser("<html>" + guts + "</html>");

    NodeList matches = parser.extractAllNodesThatMatch(new TagNameFilter("booty"));

    assertEquals("only one element", 1, matches.size());
    assertSuperType("should be Tag", Tag.class, matches.elementAt(0));
    Tag found = (Tag) matches.elementAt(0);
    assertStringEquals("name", "BOOTY", found.getTagName());
  }
Beispiel #29
0
  /**
   * Accept tags with children acceptable to the filter.
   *
   * <p>All direct children are tested first; only when none is acceptable and recursion is
   * enabled are the children themselves searched, which gives a breadth-first traversal.
   *
   * @param node The node to check.
   * @return <code>true</code> if the node has an acceptable child, <code>false</code> otherwise.
   */
  public boolean accept(Node node) {
    if (!(node instanceof CompositeTag)) {
      return false;
    }
    NodeList children = ((CompositeTag) node).getChildren();
    if (null == children) {
      return false;
    }

    // Pass 1: direct children.
    for (int i = 0; i < children.size(); i++) {
      if (getChildFilter().accept(children.elementAt(i))) {
        return true;
      }
    }

    // Pass 2: descend, but only after every direct child has been rejected.
    if (getRecursive()) {
      for (int i = 0; i < children.size(); i++) {
        if (accept(children.elementAt(i))) {
          return true;
        }
      }
    }
    return false;
  }
Beispiel #30
0
  /**
   * Returns the date text of a news page.
   *
   * <p>Every node matched by {@code dateFilter} is cast to {@code Div}; the inner text of the
   * last one is returned.
   *
   * @param dateFilter filter selecting the date element
   * @param parser parser for the news page
   * @return the inner text of the last match, or {@code null} when nothing matched or parsing
   *     failed
   */
  public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String newsDate = null;
    try {
      NodeList dateList = parser.parse(dateFilter);
      int i = 0;
      while (i < dateList.size()) {
        newsDate = ((Div) dateList.elementAt(i)).getStringText();
        i++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsDate;
  }