Ejemplos de Parser.extractAllNodesThatMatch en Java, ejemplos de org.htmlparser.Parser.extractAllNodesThatMatch en Java

Ejemplo n.º 1

0

Mostrar archivo

Archivo: PCPOPHtmlParser.java Proyecto: mo3athBaioud/bluestome

  /**
   * Collects article links from an HTML fragment into {@code LINKHASH}.
   *
   * <p>All anchors in {@code content} are extracted, their HTML is serialised and parsed a second
   * time, and every resulting link whose lower-cased URL starts with {@code pre} and whose text is
   * not "详细内容" is considered. A link is stored only when {@code articleDocCache} holds no entry
   * for its key; otherwise an info message is logged.
   *
   * @param content HTML text to scan
   * @param pre prefix that the lower-cased link URL must start with
   * @throws Exception propagated from the parser or from the helpers called here
   */
  void docByHTML(String content, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setInputHTML(content);
    parser.setEncoding("GB2312");

    NodeList anchors = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() == 0) {
      return;
    }

    // Second pass: re-parse only the serialised anchors.
    Parser anchorParser = new Parser();
    anchorParser.setInputHTML(anchors.toHtml());
    NodeList linkList = anchorParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (linkList == null || linkList.size() == 0) {
      return;
    }

    for (int idx = 0; idx < linkList.size(); idx++) {
      LinkTag link = (LinkTag) linkList.elementAt(idx);
      boolean wanted =
          link.getLink().toLowerCase().startsWith(pre)
              && !link.getLinkText().equalsIgnoreCase("详细内容");
      if (!wanted) {
        continue;
      }
      if (articleDocCache.get(getKey(link.getLink())) != null) {
        // Already cached: report it and move on.
        logger.info(">> 已存在 [" + link.getLink() + "] 地址");
        continue;
      }
      LinkBean bean = new LinkBean();
      bean.setLink(link.getLink());
      bean.setName(link.getLinkText());
      LINKHASH.put(link.getLink(), bean);
    }
  }

Ejemplo n.º 2

0

Mostrar archivo

Archivo: PCPOPHtmlParser.java Proyecto: mo3athBaioud/bluestome

  /**
   * Collects article links from the page at {@code url} into {@code LINKHASH}.
   *
   * <p>All anchors of the page are extracted, their HTML is serialised and parsed a second time,
   * and every resulting link whose lower-cased URL starts with {@code pre} and whose text is not
   * "详细内容" is stored under its URL.
   *
   * @param url address of the page to parse
   * @param pre prefix that the lower-cased link URL must start with
   * @throws Exception propagated from the parser
   */
  void doc(String url, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeList anchors = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (anchors == null || anchors.size() == 0) {
      return;
    }

    // Second pass: re-parse only the serialised anchors.
    Parser anchorParser = new Parser();
    anchorParser.setInputHTML(anchors.toHtml());
    NodeList linkList = anchorParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    if (linkList == null || linkList.size() == 0) {
      return;
    }

    for (int idx = 0; idx < linkList.size(); idx++) {
      LinkTag link = (LinkTag) linkList.elementAt(idx);
      if (!link.getLink().toLowerCase().startsWith(pre)) {
        continue;
      }
      if (link.getLinkText().equalsIgnoreCase("详细内容")) {
        continue;
      }
      LinkBean bean = new LinkBean();
      bean.setLink(link.getLink());
      bean.setName(link.getLinkText());
      LINKHASH.put(link.getLink(), bean);
    }
  }

Ejemplo n.º 3

0

Mostrar archivo

Archivo: ListNumber.java Proyecto: limingyao/Crawler

  /**
   * Downloads one page and prints the counter values that are skipped in its span list.
   *
   * <p>The page is narrowed in three parsing passes (nodes with {@code bgcolor="#DDE1FF"}, then
   * the "tr" filter, then the "span" filter). Starting at index 5 and stepping by 5, the plain
   * text of each selected node is compared with a running counter that begins at
   * {@code starNo * 100}; every counter value that does not equal the text is printed until one
   * does. Parser and I/O failures are printed as stack traces.
   *
   * @param url address of the page to download
   * @param starNo multiplied by 100 to obtain the first expected counter value
   */
  public static void dealOnePage(String url, int starNo) {
    try {
      HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
      Parser tableParser = new Parser(connection);
      NodeList tableSet =
          tableParser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));

      // NOTE(review): the one-argument HasAttributeFilter matches tags that carry an ATTRIBUTE of
      // that name, not tags with that tag name -- confirm "tr"/"span" are really meant this way.
      Parser rowParser = new Parser(new Lexer(tableSet.toHtml()));
      NodeList tdSet = rowParser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));

      Parser spanParser = new Parser(new Lexer(tdSet.toHtml()));
      PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
      factory.registerTag(new SpanTag());
      spanParser.setNodeFactory(factory);
      NodeList spanSet = spanParser.extractAllNodesThatMatch(new HasAttributeFilter("span"));

      int base = starNo * 100;
      int offset = 0;
      int position = 5;
      while (position < spanSet.size()) {
        String actual = spanSet.elementAt(position).toPlainTextString();
        String expected = String.valueOf(base + offset);
        offset++;
        // NOTE(review): this loop only ends once the counter text equals the node text.
        while (!actual.equals(expected)) {
          System.out.println(expected);
          expected = String.valueOf(base + offset);
          offset++;
        }
        position += 5;
      }
    } catch (ParserException e) {
      e.printStackTrace();
    } catch (IOException e) {
      // Also covers MalformedURLException, which is an IOException.
      e.printStackTrace();
    }
  }

Ejemplo n.º 4

0

Mostrar archivo

Archivo: RegistryArchivalUnit.java Proyecto: fkautz/cmusv-practicum

 /**
  * Derives the AU name from the first {@code <title>} element of the registry start page.
  *
  * <p>Clears the {@code recomputeRegName} flag once the page has been parsed.
  *
  * @return the title text, or {@code null} when the unit is not started, no cached URL is
  *     available, the page has no title, or parsing fails
  */
 String recomputeRegName() {
   if (!isStarted()) {
     // This may be invoked several times (mostly from logging) before enough machinery is running
     // to resolve the CuUrl below.
     return null;
   }
   try {
     CachedUrl cu = makeCachedUrl(m_registryUrl);
     if (cu == null) {
       return null;
     }
     Parser parser = new Parser(CuUrl.fromCu(cu).toString());
     NodeList titles = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
     Node[] found = titles.toNodeArray();
     recomputeRegName = false;
     if (found.length < 1) {
       return null;
     }
     // Only the first title counts.
     TitleTag first = (TitleTag) found[0];
     return first == null ? null : first.getTitle();
   } catch (MalformedURLException e) {
     log.warning("recomputeRegName", e);
     return null;
   } catch (ParserException e) {
     Throwable cause = e.getThrowable();
     if (cause instanceof FileNotFoundException) {
       // A missing file is reported without the stack trace.
       log.warning("recomputeRegName: " + cause.toString());
     } else {
       log.warning("recomputeRegName", e);
     }
     return null;
   }
 }

Ejemplo n.º 5

0

Mostrar archivo

Archivo: ParserBootsPage.java Proyecto: kanxg/kanxg

  /**
   * Downloads {@code page.url} and walks over all of its {@code <div>} elements.
   *
   * <p>The "pl_addToBasket" div is recognised, but the extraction that used to follow it is
   * disabled, so the method currently has no effect beyond fetching and parsing the page.
   *
   * @throws Exception propagated from the connection or the parser
   */
  public void checkprice() throws Exception {
    HttpURLConnection connection = (HttpURLConnection) new URL(page.url).openConnection();
    connection.setConnectTimeout(Constant.connect_timeout);
    connection.connect();

    Parser parser = new Parser(connection);
    parser.setEncoding(Constant.ENCODE);

    OrFilter divOnly = new OrFilter();
    divOnly.setPredicates(new NodeFilter[] {new NodeClassFilter(Div.class)});
    NodeList divs = parser.extractAllNodesThatMatch(divOnly);

    for (int idx = 0; idx < divs.size(); idx++) {
      Node node = divs.elementAt(idx);
      if (!(node instanceof Div)) {
        continue;
      }
      String cssClass = ((Div) node).getAttribute("class");
      if ("pl_addToBasket".equalsIgnoreCase(cssClass)) {
        // Basket block found; the former "return getName(d)" is disabled.
      }
    }
  }

Ejemplo n.º 6

0

Mostrar archivo

Archivo: AreaTest.java Proyecto: zhaoccx/LS

  /**
   * Parses a news page and prints the cleaned text of selected nodes.
   *
   * <p>Selects every node whose parent is a {@code <p class="MsoNormal">} element, prints how many
   * were found, and for each one prints the text of its last child after removing "span", the
   * full-width space and whatever {@code replaceBlank} strips. Parser failures are logged at
   * SEVERE level.
   *
   * @param url address of the news page
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      // Reminder kept from the original author: call parser.reset() before reusing the parser,
      // otherwise later extractions do not return the expected content.

      NodeFilter paragraphFilter =
          new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
      NodeList nodes = parser.extractAllNodesThatMatch(new HasParentFilter(paragraphFilter));
      System.out.println(nodes.size());

      for (int idx = 0; idx < nodes.size(); idx++) {
        String raw = nodes.elementAt(idx).getLastChild().getText();
        String stripped = raw.replaceAll("span", "").replaceAll("　", "");
        System.out.println(replaceBlank(stripped));
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
  }

Ejemplo n.º 7

0

Mostrar archivo

Archivo: getURL.java Proyecto: inverthermit/SpidersForParttime

  /**
   * Crawls listing pages 0 to 16 and prints one record per link found in the page headings.
   *
   * <p>For every page built from {@code url3 + index + url4}, each
   * {@code <h2 class="field-content">} element is re-parsed and every {@code <a href>} inside it
   * is printed as {@code {"<count>","http://www.chi.ac.uk/<href>","<text>","0"},}. Pages whose
   * response carries no entity are skipped, and the HTTP client is closed when the crawl ends,
   * whether normally or with an exception.
   *
   * @param args unused
   * @throws Exception propagated from the HTTP client or the parser
   */
  public static void main(String[] args) throws Exception {
    RequestConfig requestConfig =
        RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
    CloseableHttpClient httpclient =
        HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    try {
      int count = 1;
      for (int i = 0; i <= 16; i++) {
        HttpGet httpGet = new HttpGet(url3 + i + url4);
        HttpResponse response = httpclient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
          // Nothing to parse; passing null on to Parser.createParser would fail.
          continue;
        }
        String pageHtml = EntityUtils.toString(entity).replace("\t", " ");

        Parser parser = Parser.createParser(pageHtml, "utf-8");
        AndFilter headingFilter =
            new AndFilter(
                new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content"));
        NodeList headings = parser.extractAllNodesThatMatch(headingFilter);
        for (int k = 0; k < headings.size(); k++) {
          // Re-parse each heading on its own so that only its links are seen.
          parser = Parser.createParser(headings.elementAt(k).toHtml(), "utf-8");
          AndFilter professionNameFilter =
              new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
          NodeList anchors = parser.extractAllNodesThatMatch(professionNameFilter);
          for (int j = 0; j < anchors.size(); j++) {
            LinkTag link = (LinkTag) anchors.elementAt(j);
            String temp = link.toHtml();

            System.out.println(
                "{\""
                    + count
                    + "\",\"http://www.chi.ac.uk/"
                    + link.getAttribute("href")
                    + "\",\""
                    + html2Str(temp).replace("\r\n", "").trim()
                    + "\",\"0\"},");
            count++;
          }
        }
      }
    } finally {
      // Release the connection pool even when a request or a parse fails.
      httpclient.close();
    }
  }

Ejemplo n.º 8

0

Mostrar archivo

Archivo: HtmlParserTool.java Proyecto: edin-chou/myDemos

  /**
   * Extracts the links of a page: the targets of {@code <a>} tags and the {@code src} of tags
   * whose text starts with {@code frame src=}.
   *
   * <p>Accepted anchor URLs are stored with {@code ?}, {@code &} and {@code |} percent-encoded
   * and {@code #} removed; accepted frame URLs are stored unchanged. Parser failures are printed
   * and whatever was collected so far is returned.
   *
   * @param url address of the page to parse
   * @param cobweb supplies the page charset and decides which links are accepted
   * @return the accepted links, possibly empty
   */
  public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();

    try {
      Parser parser = new Parser(url);
      parser.setEncoding(cobweb.getCharSet());

      // Accepts nodes whose text starts with "frame src=", so that the src can be pulled out.
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      // Either an <a> tag or one of the frame nodes accepted above.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          String linkUrl = ((LinkTag) tag).getLink();
          if (cobweb.accept(linkUrl)) {
            links.add(
                linkUrl
                    .replaceAll("\\?", "\\%3F") // percent-encode
                    .replaceAll("\\&", "\\%26")
                    .replaceAll("\\|", "\\%7C") // '|' is 0x7C (decimal 124)
                    .replaceAll("\\#", ""));
          }
        } else { // frame node
          // Pull the value out of src="...", e.g. <frame src="test.html"/>.
          String frame = tag.getText();
          frame = frame.substring(frame.indexOf("src="));
          int end = frame.indexOf(" ");
          if (end == -1) {
            end = frame.indexOf(">");
          }
          if (end == -1) {
            // The tag text normally has no '>' and src may be the last attribute:
            // fall back to the end of the text, ignoring a self-closing '/'.
            end = frame.endsWith("/") ? frame.length() - 1 : frame.length();
          }
          if (end - 1 < 5) {
            continue; // no value between the quotes
          }
          String frameUrl = frame.substring(5, end - 1);
          if (cobweb.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Ejemplo n.º 9

0

Mostrar archivo

Archivo: MParseBrand.java Proyecto: user20161119/beiker-Deprecated

 /**
  * Reads the rolling-brand include file of a city and maps each brand id to its image URL.
  *
  * <p>The file name is built as {@code PATH + city + INCLUDE + fileName + STUFF}. Every
  * {@code <div class="business_list_pic">} is scanned for direct child links; the brand id is
  * taken from the link via {@code getBrandId} and the image URL from the link's child
  * {@code <img>} tags. Any exception is printed and the entries collected so far are returned.
  *
  * @param path not used by this method
  * @param city city segment of the file name
  * @param fileName name segment of the file name
  * @return brand id to image URL, in encounter order; the URL is "" when the link has no
  *     qualifying image
  */
 public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
   Map<String, String> brandMap = new LinkedHashMap<String, String>();
   try {
     StringBuilder filePath = new StringBuilder();
     filePath.append(PATH).append(city).append(INCLUDE).append(fileName).append(STUFF);

     // Parse the file and keep the <div> tags.
     Parser parser = new Parser(filePath.toString());
     NodeList classList = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
     for (int i = 0; i < classList.size(); i++) {
       Node picNode = classList.elementAt(i);
       String classStr = ((Div) picNode).getAttribute("class");
       if (!"business_list_pic".equalsIgnoreCase(classStr)) {
         continue;
       }
       NodeList hrefList = picNode.getChildren();
       if (hrefList == null) {
         continue; // getChildren() is null for a tag without children
       }
       for (int j = 0; j < hrefList.size(); j++) {
         Node hrefNode = hrefList.elementAt(j);
         if (hrefNode instanceof LinkTag) {
           // The link carries the brand id.
           String brandId = MParseBrand.getBrandId(((LinkTag) hrefNode).getLink());
           brandMap.put(brandId, findBrandImage(hrefNode.getChildren()));
         }
       }
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
   return brandMap;
 }

 /**
  * Returns the URL of the last child image that has a {@code class} attribute, taking
  * {@code src} and falling back to {@code original}; "" when there is no such image.
  */
 private static String findBrandImage(NodeList imgList) {
   // Start from "" for every link so that an image-less link does not inherit a previous URL.
   String imgStr = "";
   if (imgList == null) {
     return imgStr;
   }
   for (int k = 0; k < imgList.size(); k++) {
     Node imgNode = imgList.elementAt(k);
     if (!(imgNode instanceof ImageTag)) {
       continue;
     }
     ImageTag image = (ImageTag) imgNode;
     if (null != image.getAttribute("class")) {
       imgStr = image.getAttribute("src");
       if (null == imgStr) {
         imgStr = image.getAttribute("original");
       }
     }
   }
   return imgStr;
 }

Ejemplo n.º 10

0

Mostrar archivo

Archivo: FilterTest.java Proyecto: theZnorf/esdexercises

  /**
   * Checks regular-expression matching: the year/date pattern must select exactly one node of the
   * sample page, and that node's HTML must equal {@code target}.
   */
  public void testRegularExpression() throws Exception {
    String target =
        "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";

    String html =
        "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";

    Parser parser = new Parser(new Lexer(html));
    RegexFilter filter =
        new RegexFilter(
            "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");

    int count = 0;
    NodeIterator iterator = parser.extractAllNodesThatMatch(filter).elements();
    while (iterator.hasMoreNodes()) {
      assertEquals("text wrong", target, iterator.nextNode().toHtml());
      count++;
    }
    assertEquals("wrong count", 1, count);
  }

Ejemplo n.º 11

0

Mostrar archivo

Archivo: DistributionValidationTestUtils.java Proyecto: GayashanNA/carbon-platform-integration-utils

 /**
  * Returns the target of every {@code <a>} tag found on the page at {@code url}, in the order
  * the parser reports them.
  *
  * @param url address of the page to parse
  * @return the link targets, possibly empty
  * @throws ParserException if the page cannot be parsed
  */
 public static List<String> getLinks(String url) throws ParserException {
   Parser htmlParser = new Parser(url);
   List<String> links = new LinkedList<String>();
   NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
   int idx = 0;
   while (idx < anchors.size()) {
     links.add(((LinkTag) anchors.elementAt(idx)).getLink());
     idx++;
   }
   return links;
 }

Ejemplo n.º 12

0

Mostrar archivo

Archivo: HtmlParserTool.java Proyecto: jason440682/SearchEngine

  /**
   * Extracts the links of a page: the targets of {@code <a>} tags and the {@code src} of
   * {@code <iframe>} tags.
   *
   * <p>Parser failures are printed and whatever was collected so far is returned.
   *
   * @param url address of the page to parse
   * @param filter decides which links are accepted
   * @return the accepted links, possibly empty
   */
  public static Set<String> extracLinks(String url, LinkFilter filter) {

    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      // Accepts nodes whose text starts with "iframe" and contains a src attribute.
      NodeFilter frameFilter =
          new NodeFilter() {
            private static final long serialVersionUID = 1L;

            public boolean accept(Node node) {
              return node.getText().startsWith("iframe") && node.getText().contains("src=");
            }
          };
      // Either an <a> tag or one of the iframe nodes accepted above.
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          String linkUrl = ((LinkTag) tag).getLink();
          if (filter.accept(linkUrl)) {
            links.add(linkUrl);
          }
        } else { // iframe node
          // Take the text between src=" and the next double quote, e.g. <iframe src="test.html">.
          String frame = tag.getText();
          int start = frame.indexOf("src=\"");
          if (start == -1) {
            continue; // src is not double-quoted; nothing reliable to extract
          }
          int valueStart = start + 5; // length of src="
          int valueEnd = frame.indexOf('"', valueStart);
          if (valueEnd == -1) {
            valueEnd = frame.length(); // unterminated quote: take the rest
          }
          String frameUrl = frame.substring(valueStart, valueEnd);
          if (filter.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Ejemplo n.º 13

0

Mostrar archivo

Archivo: PCPOPHtmlParser.java Proyecto: mo3athBaioud/bluestome

  /**
   * Extracts the author text of the article at {@code url}.
   *
   * <p>First looks for a {@code <div class="otb14">} and returns its inner text. When none is
   * found the page is fetched again and the second {@code <div class="pop_2_1_2">} is used: the
   * text after its first {@code </a>} is the author, or the whole text when there is no
   * {@code </a>}.
   *
   * @param url address of the article page
   * @return the author text, or {@code null} when neither layout matches
   * @throws Exception propagated from the parser
   */
  String author(String url) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeList list =
        parser
            .extractAllNodesThatMatch(new NodeClassFilter(Div.class))
            .extractAllNodesThatMatch(new HasAttributeFilter("class", "otb14"));
    String author = null;
    if (list != null && list.size() > 0) {
      author = ((Div) list.elementAt(0)).getStringText();
    }

    if (null == author) {
      // Fallback layout: parse the page again and look at the author column.
      parser = new Parser();
      parser.setURL(url);
      parser.setEncoding("GB2312");

      NodeList list1 =
          parser
              .extractAllNodesThatMatch(new NodeClassFilter(Div.class))
              .extractAllNodesThatMatch(new HasAttributeFilter("class", "pop_2_1_2"));
      // The second match is read, so at least two matches are required.
      if (null != list1 && list1.size() > 1) {
        Div div = (Div) list1.elementAt(1);
        String tmp = div.getStringText();
        int anchorEnd = tmp.indexOf("</a>");
        // Without a closing anchor, keep the text intact instead of cutting 3 characters.
        author = anchorEnd >= 0 ? tmp.substring(anchorEnd + 4) : tmp;
        logger.debug("author:" + author);
      }
    }
    return author;
  }

Ejemplo n.º 14

0

Mostrar archivo

Archivo: ParserAmazonPage.java Proyecto: kanxg/searchafGae2

  /**
   * Downloads {@code page.url} and processes every {@code <div>} whose id starts with
   * "result_" as a product.
   *
   * <p>The id of each div is printed. For product divs an {@code AmazonProduct} is created, its
   * name is taken from the div's {@code name} attribute and {@code getPriceAndLabel} is called;
   * failures for a single product are printed and do not stop the scan. Divs without an id are
   * skipped. Any other failure ends the scan and its message is printed.
   *
   * @return always {@code false}
   */
  public boolean checkprice() {
    System.out.println("checking amazon url:" + page.url);
    try {

      URL url = new URL(page.url);
      HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
      urlConnection.setConnectTimeout(Constant.connect_timeout);

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
      System.out.println("size:" + list.size());

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);

        if (tag instanceof Div) {
          Div d = (Div) tag;
          String id = d.getAttribute("id");
          System.out.println(id);

          // A div without an id would otherwise abort the whole scan with an NPE.
          if (id != null && id.startsWith("result_")) {
            // found one product
            try {
              AmazonProduct product = new AmazonProduct();
              product.name = d.getAttribute("name");
              getPriceAndLabel(d, product);

            } catch (Exception e) {
              e.printStackTrace();
            }
          }
        }
      }

    } catch (Exception e) {
      System.out.println(e.getMessage());
    }
    return false;
  }

Ejemplo n.º 15

0

Mostrar archivo

Archivo: FilterTest.java Proyecto: theZnorf/esdexercises

  /**
   * Checks the CSS selector filter: "li + li" must match exactly two nodes of the sample page,
   * each of them an LI tag.
   */
  public void testSelectors() throws Exception {
    String html =
        "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>&gt;moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";

    Parser parser = new Parser(new Lexer(html));
    CssSelectorNodeFilter selector = new CssSelectorNodeFilter("li + li");

    int count = 0;
    NodeIterator matches = parser.extractAllNodesThatMatch(selector).elements();
    while (matches.hasMoreNodes()) {
      Tag matched = (Tag) matches.nextNode();
      assertEquals("tag name wrong", "LI", matched.getTagName());
      count++;
    }
    assertEquals("wrong count", 2, count);
  }

Ejemplo n.º 16

0

Mostrar archivo

Archivo: PCPOPHtmlParser.java Proyecto: mo3athBaioud/bluestome

  /**
   * Fetches the page at {@code url} and returns the inner text of its content div.
   *
   * <p>The content div is the first {@code <div>} with {@code id="contentDiv"}.
   *
   * @param url address of the page to parse
   * @return the div's inner text, or {@code null} when no such div exists
   * @throws Exception propagated from the parser
   */
  static String content(String url) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeList divs = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
    NodeList matches = divs.extractAllNodesThatMatch(new HasAttributeFilter("id", "contentDiv"));
    if (matches == null || matches.size() == 0) {
      return null;
    }
    return ((Div) matches.elementAt(0)).getStringText();
  }

Ejemplo n.º 17

0

Mostrar archivo

Archivo: HtmlParseUtil.java Proyecto: hjy2011/spider

  /**
   * Returns the HTML of every tag with the given name and attribute value.
   *
   * <p>Example: {@code HtmlParseUtil.getContentByTagNameAndAttribute(sourse, "div", "class",
   * "hello")} returns all div nodes that have {@code class="hello"}. Parser failures are printed
   * and the entries collected so far are returned.
   *
   * @param sourse resource string handed to the {@code Parser} constructor
   * @param tagName tag name to match
   * @param attribute attribute name to match
   * @param attributeValue value the attribute must have
   * @return the HTML of each matching node, possibly empty
   */
  public static List<String> getContentByTagNameAndAttribute(
      String sourse, String tagName, String attribute, String attributeValue) {
    List<String> list = new ArrayList<String>();
    NodeFilter and =
        new AndFilter(
            new TagNameFilter(tagName), new HasAttributeFilter(attribute, attributeValue));
    try {
      NodeList matches = new Parser(sourse).extractAllNodesThatMatch(and);
      int idx = 0;
      while (idx < matches.size()) {
        list.add(matches.elementAt(idx).toHtml());
        idx++;
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return list;
  }

Ejemplo n.º 18

0

Mostrar archivo

Archivo: SearchEngineService.java Proyecto: Genxl/ShareSystem

  /**
   * Loads the page behind an encrypted URL and stores its {@code <html>} markup in a model.
   *
   * <p>The URL is decoded with {@code SearchHelper.decrypt}. When several html nodes match, the
   * last one wins. Any failure is printed and the model is returned as filled so far.
   *
   * @param url encrypted page address
   * @return a new model, with its content set when an html node was found
   */
  public ContentModel view(String url) {
    ContentModel model = new ContentModel();
    try {
      Parser parser = new Parser();
      parser.setURL(SearchHelper.decrypt(url));
      // Re-applies the encoding the parser already reports.
      parser.setEncoding(parser.getEncoding());

      NodeList htmlNodes = parser.extractAllNodesThatMatch(new TagNameFilter("html"));
      int idx = 0;
      while (idx < htmlNodes.size()) {
        model.setContent(htmlNodes.elementAt(idx).toHtml());
        idx++;
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return model;
  }

Ejemplo n.º 19

0

Mostrar archivo

Archivo: ParserDrugstorePage.java Proyecto: kanxg/searchafGae2

  /**
   * Downloads the product page and fills in the page's title, price and stock flag.
   *
   * <p>The title comes from the {@code <title>} tag with "drugstore.com" and "|" removed; the
   * price from {@code <div id="productprice">} via {@code getPrice}; the stock flag is set when
   * {@code <div id="divAvailablity">} contains "In Stock" or "in stock".
   *
   * @throws Exception propagated from the connection or the parser
   */
  public void checkprice() throws Exception {
    String cookies = ""; // the DrugstoreLogin.getCookies() lookup is disabled

    HttpURLConnection connection = (HttpURLConnection) new URL(page.getUrl()).openConnection();
    connection.setConnectTimeout(Constant.connect_timeout);
    connection.setRequestProperty("User-Agent", "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT)");
    connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
    connection.setRequestProperty("Cookie", cookies);
    connection.connect();

    Parser parser = new Parser(connection);
    parser.setEncoding(Constant.ENCODE);

    NodeFilter availabilityDiv =
        new AndFilter(
            new NodeClassFilter(Div.class), new HasAttributeFilter("id", "divAvailablity"));
    NodeFilter priceDiv =
        new AndFilter(new NodeClassFilter(Div.class), new HasAttributeFilter("id", "productprice"));

    OrFilter wanted = new OrFilter();
    wanted.setPredicates(
        new NodeFilter[] {new NodeClassFilter(TitleTag.class), availabilityDiv, priceDiv});

    NodeList matches = parser.extractAllNodesThatMatch(wanted);
    for (int idx = 0; idx < matches.size(); idx++) {
      Node node = matches.elementAt(idx);
      if (node instanceof TitleTag) {
        String title = ((TitleTag) node).getTitle();
        page.title = title.replaceAll("drugstore.com", "").replaceAll("\\|", "").trim();
        continue;
      }
      if (!(node instanceof Div)) {
        continue;
      }
      Div div = (Div) node;
      String text = div.getStringText();
      String id = div.getAttribute("id");
      if ("productprice".equalsIgnoreCase(id)) {
        page.price = getPrice(text);
      } else if ("divAvailablity".equalsIgnoreCase(id)
          && (text.contains("In Stock") || text.contains("in stock"))) {
        page.instock = true;
      }
    }
  }

Ejemplo n.º 20

0

Mostrar archivo

Archivo: HenuParseEXP.java Proyecto: zuiyu/superdaxue

  /**
   * Splits a transcript page into individual course records.
   *
   * <p>Every table row is scanned column by column. A cell containing the marker "学年学期：" sets
   * the current school year (the 9 characters after the marker, e.g. "2012-2013") and semester
   * (one character, with "一"/"二" mapped to "1"/"2"); both are applied to all following courses.
   * A row is a course when its second cell contains "[": characters 1-8 of that cell are the
   * course code and the text from index 10 on is the name, and cells 2-8 fill credit, type,
   * learn type, check type, get type, score and remark.
   *
   * @param html markup of the transcript page
   * @return the parsed courses; empty when the tables cannot be extracted
   */
  public List<Courses> parseCourses(String html) {
    List<Courses> list = new ArrayList<Courses>();
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(new NodeClassFilter(TableTag.class));
    } catch (ParserException e) {
      e.printStackTrace();
    }
    if (nodeList == null) {
      // Extraction failed: report no courses instead of dereferencing null below.
      return list;
    }

    String schoolyear = "";
    String semester = "";
    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableRow[] rows = ((TableTag) nodeList.elementAt(i)).getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        Courses courses = null;
        boolean isCourse = false;
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();

          // Header cell: remember school year and semester for the rows that follow.
          String temp = "学年学期：";
          int start = info.indexOf(temp);
          int len = "2012-2013".length();
          if (start != -1) {
            start = start + temp.length();
            schoolyear = info.substring(start, start + len);
            // The semester is stored as a digit.
            semester = info.substring(start + len + 3, start + len + 4);
            if ("一".equals(semester)) {
              semester = "1";
            } else if ("二".equals(semester)) {
              semester = "2";
            }
          }

          if (k == 1 && info.indexOf("[") != -1) {
            // "[code]name" cell: this row describes a course.
            courses = new Courses();
            courses.setCourseCode(info.substring(1, 9));
            courses.setCoursesname(info.substring(10));
            isCourse = true;
          } else if (isCourse) {
            switch (k) {
              case 2:
                courses.setCredit(Double.parseDouble(info));
                break;
              case 3:
                courses.setType(info);
                break;
              case 4:
                courses.setLeanType(info);
                break;
              case 5:
                courses.setCheckType(info);
                break;
              case 6:
                courses.setGetType(info);
                break;
              case 7:
                courses.setScore(info);
                break;
              case 8:
                courses.setRemark(info);
                break;
              default:
                break;
            }
          }
        } // end for k
        if (courses != null) {
          courses.setSchoolYear(schoolyear);
          courses.setSemester(semester);
          list.add(courses);
        }
      } // end for j
    }
    return list;
  }

Ejemplo n.º 21

0

Mostrar archivo

Archivo: HenuParseEXP.java Proyecto: zuiyu/superdaxue

  /**
   * Parses timetable entries out of an HTML page.
   *
   * <p>Only tables whose {@code getText()} contains the marker {@code [课程号]} are read. The first
   * row of such a table is skipped. For each remaining row, column 1 is expected to hold
   * {@code [code]name}, column 3 the credit, column 4 the type, column 5 the teacher (the text
   * after {@code ]}) and column 8 a time/address description that {@code praseStr} expands; one
   * cloned {@code TimeTable} is added to the result per expanded entry.
   *
   * <p>Rows whose column 1 does not contain a usable {@code [code]} prefix are skipped instead of
   * failing with a {@code StringIndexOutOfBoundsException}.
   *
   * @param html the HTML text to parse
   * @return the parsed entries; empty when the HTML could not be parsed or holds no matching table
   */
  public List<TimeTable> parseTimeTables(String html) {
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    List<TimeTable> list = new ArrayList<TimeTable>();
    NodeFilter filter = new NodeClassFilter(TableTag.class);
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    if (nodeList == null) {
      // Extraction failed (already reported above); there is nothing to iterate over.
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableTag tag = (TableTag) nodeList.elementAt(i);
      if (tag.getText().indexOf("[课程号]") == -1) {
        continue;
      }
      TableRow[] rows = tag.getRows();
      rowLoop:
      for (int j = 1; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        TimeTable timeTable = null;
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();
          System.out.println(info + "===" + k);
          switch (k) {
            case 1:
              {
                int start = info.indexOf("[");
                int end = info.indexOf("]");
                if (end <= start) {
                  // No "[code]" prefix that substring() could slice: not a course row.
                  continue rowLoop;
                }
                timeTable = new TimeTable();
                timeTable.setCourseCode(info.substring(start + 1, end));
                timeTable.setCourseName(info.substring(end + 1));
                break;
              }
            case 3:
              timeTable.setCredit(Double.parseDouble(info));
              break;
            case 4:
              timeTable.setType(info);
              break;
            case 5:
              // Keep only what follows the closing bracket.
              timeTable.setTeacher(info.substring(info.indexOf("]") + 1));
              break;
            case 8:
              // One row may describe several time slots; emit a copy of the entry for each.
              List<TimeAndAdress> ta_list = praseStr(info);
              for (TimeAndAdress ta : ta_list) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        }
      } // end for j
    }
    return list;
  }

Ejemplo n.º 22

0

Mostrar archivo

Archivo: MaffeyArticleIteratorFactory.java Proyecto: edina/lockss-daemon

    /*
     * The full text PDF is located through the abstract page: read the citation_pdf_url meta
     * tag from the abstract html, normalize the pdf url it carries (reorder params...) and look
     * up the matching cached URL.
     */
    protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) {
      ArticleFiles articleFiles = new ArticleFiles();
      NodeList pdfMetaTags = null;

      if (absCu != null && absCu.hasContent()) {
        // TEMPORARY: the abstract CU doubles as the default full text CU in case no PDF CU
        // with content turns up; the metadata manager currently looks only at the full text
        // CU, which will change once the new metadata schema allows several CUs per article.
        articleFiles.setFullTextCu(absCu);
        articleFiles.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu);
        try {
          Stream abstractStream = new Stream(absCu.getUnfilteredInputStream());
          Page page = new Page(new InputStreamSource(abstractStream));
          Parser parser = new Parser(new Lexer(page));
          Lexer.STRICT_REMARKS = false;
          // Accept only <meta> tags named citation_pdf_url (case-insensitive).
          NodeFilter pdfMetaFilter =
              new NodeFilter() {
                public boolean accept(Node node) {
                  return node instanceof MetaTag
                      && "citation_pdf_url".equalsIgnoreCase(((MetaTag) node).getMetaTagName());
                }
              };
          pdfMetaTags = parser.extractAllNodesThatMatch(pdfMetaFilter);
        } catch (ParserException e) {
          log.debug("Unable to parse abstract page html", e);
        } catch (UnsupportedEncodingException e) {
          log.debug("Bad encoding in abstact page html", e);
        } finally {
          absCu.release();
        }
      }

      try {
        if (pdfMetaTags != null && pdfMetaTags.size() > 0) {
          MetaTag pdfMeta = (MetaTag) pdfMetaTags.elementAt(0);
          // minimal encoding keeps the URL constructor from stripping trailing spaces
          String rawPdfUrl = pdfMeta.getMetaContent();
          URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(rawPdfUrl));

          List<String> paramOrder = new ArrayList<String>();
          paramOrder.add("fileType");
          paramOrder.add("fileId");
          paramOrder.add("fileName");
          pdfUrl = reArrangeUrlParams(pdfUrl, paramOrder);

          String host = pdfUrl.getHost();
          if (!host.startsWith("www.")) {
            pdfUrl = new URL(pdfUrl.getProtocol(), "www." + host, pdfUrl.getFile());
          }

          // the URL stays encoded because that is how URLs are stored
          CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString());
          if (pdfCu != null && pdfCu.hasContent()) {
            // a PDF CU with content takes over from the abstract CU as full text
            articleFiles.setFullTextCu(pdfCu);
            articleFiles.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu);
          }
        }
      } catch (MalformedURLException e) {
        log.debug("Badly formatted pdf url link", e);
      } catch (IllegalArgumentException e) {
        log.debug("Badly formatted pdf url link", e);
      }

      return articleFiles;
    }

Ejemplo n.º 23

0

Mostrar archivo

Archivo: AnuMajorData.java Proyecto: inverthermit/SpidersForParttime

  /**
   * Downloads the page at {@code url[1]} and extracts program details from it.
   *
   * <p>The result may contain the keys {@code "School"}, {@code "Structure"},
   * {@code "Scholarship"}, {@code "Type"} and {@code "Length (months)"} when the corresponding
   * markup is found, and always contains {@code "IELTS Average Requirement"},
   * {@code "IELTS Lowest Requirement"}, {@code "Title"} and {@code "Level"}.
   *
   * <p>Any exception (network, parsing, indexing) is printed and the whole fetch is retried, with
   * no upper bound on the number of attempts. The HTTP client created for an attempt is closed
   * whether or not that attempt succeeds.
   *
   * @param url input row: {@code url[0]} is only used in the retry message, {@code url[1]} is the
   *     page address, {@code url[4]} and {@code url[2]} form the title and {@code url[3]} is the
   *     level
   * @return the extracted details
   */
  public static HashMap<String, String> SouthamptonGetDetails(String[] url) {

    while (true) {
      CloseableHttpClient httpclient = null;
      try {
        HashMap<String, String> result = new HashMap<String, String>();
        RequestConfig requestConfig =
            RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000).build();
        httpclient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();

        HttpGet httpGet = new HttpGet(url[1]);
        HttpResponse response = httpclient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
          // Without a body there is nothing to parse; fail this attempt with a clear message
          // (the catch below retries) instead of an anonymous NullPointerException.
          throw new IllegalStateException("Empty response body for " + url[1]);
        }
        String htmls = EntityUtils.toString(entity).replace("\t", " ");
        System.out.println("Got reply!");

        Parser parser = null;

        // **********************************get school**********************
        // Every "span" is rewritten to "form" before parsing, then the <form class="first-owner">
        // element is looked up.
        parser = Parser.createParser(htmls.replace("span", "form"), "utf-8");
        AndFilter SFilter =
            new AndFilter(
                new TagNameFilter("form"), new HasAttributeFilter("class", "first-owner"));
        NodeList nodes4 = parser.extractAllNodesThatMatch(SFilter);
        if (nodes4.size() > 0) {

          String school = html2Str(nodes4.elementAt(0).toHtml());
          result.put("School", school);
        }

        // **********************************get entry structure**********************

        parser = Parser.createParser(htmls, "utf-8");
        AndFilter ESFilter =
            new AndFilter(
                new TagNameFilter("div"),
                new HasAttributeFilter("class", "body__inner w-doublewide copy"));
        NodeList nodes1 = parser.extractAllNodesThatMatch(ESFilter);
        String structure = "";
        String[] ProgramURL = null;
        if (nodes1.size() > 0) {
          String AllContents = nodes1.toHtml();
          // Split the body into sections, each starting at an <h2 id=...> heading.
          String[] SP = AllContents.split("<h2 id=");
          for (int i = 1; i < SP.length; i++) {
            String row = "<h2 id=" + SP[i];
            if (row.contains("<h2 id=\"requirements\">Requirements</h2>")) // Structure
            {
              structure =
                  (html2Str(
                          row.replace("<br />", "\r\n")
                              .replace("</strong>", "")
                              .replace("<strong>", "")
                              .replace("</", "\r\n</")
                              .replace("\t", " ")
                              .replace("&amp;", " "))
                      .replace("\r\n\r\n", "\r\n"));
              structure = HTMLFilter(structure);
              result.put("Structure", structure);
            } // <a href="/program/BSC">Bachelor of Science (BSC)</a>
            else if (row.contains("<h2 id=\"relevant-degrees\">Relevant Degrees</h2>")) {
              parser = Parser.createParser(row, "utf-8");
              AndFilter ProfessionNameFilter =
                  new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
              NodeList nodes5 = parser.extractAllNodesThatMatch(ProfessionNameFilter);
              // Iterate over the links just extracted (nodes5), not the unrelated school nodes.
              for (int j = 0; j < nodes5.size(); j++) {
                LinkTag link = (LinkTag) nodes5.elementAt(j);
                if (!link.getAttribute("href").equals("#")) {
                  String code = link.getAttribute("href").replace("/program/", "");
                  ProgramURL = getProgram(code);
                  result.put("Scholarship", code);
                }
              }
            }
          }
        }

        // ****************IELTS
        result.put("IELTS Average Requirement", "6.5");
        result.put("IELTS Lowest Requirement", "6.0");

        // **************************get title & type**********************

        result.put("Title", url[4] + " " + url[2]);

        result.put("Level", url[3]);
        if (ProgramURL != null) {
          result.put("Type", ProgramURL[2]);
          result.put("Length (months)", ProgramURL[5]);
        }

        return result;
      } catch (Exception ee) {
        System.out.println("Retrying..." + url[0]);
        ee.printStackTrace();
      } finally {
        // Release the connection on every attempt so retries do not accumulate open clients.
        if (httpclient != null) {
          try {
            httpclient.close();
          } catch (Exception closeFailure) {
            closeFailure.printStackTrace();
          }
        }
      }
    }
  } // ...

Ejemplo n.º 24

0

Mostrar archivo

Archivo: LotteryDrawFetch508WorkerOfficial.java Proyecto: L-Arun/pcore

  /**
   * Parses a lottery draw result page.
   *
   * <p>The basic information is read from the first element with {@code class="tInfo"}: its first
   * four {@code class="fc-red"} children supply, in order, the phase, the sales volume, the draw
   * date and the winning numbers. The prize details are read from the first {@code <p>} inside the
   * first element with {@code class="dInfo"}: the text after the dashed separator line is split on
   * {@code &nbsp;} and consumed four cells per prize row (name, winning count, prize amount, and a
   * fourth cell that is ignored).
   *
   * @param html the page content
   * @return the parsed draw; {@code null} when the parser cannot be created, a required section
   *     is missing or the basic information fails to parse; the draw without prize details when
   *     the separator line is absent or the detail section fails to parse
   */
  @Override
  protected LotteryDraw parseLotteryDrawResult(String html) {
    LotteryType lotteryType = this.getLotteryType();
    Parser parser = null;
    try {
      parser = Parser.createParser(html, CharsetConstant.CHARSET_UTF8);
    } catch (Exception e) {
      logger.error("解析html内容出错: {}", html, e);
      return null;
    }

    LotteryDraw lotteryDraw = new LotteryDraw();
    lotteryDraw.setLotteryType(lotteryType);

    // Parse the basic information.
    try {
      NodeFilter tInfoFilter = new HasAttributeFilter("class", "tInfo");
      NodeList tInfoNodeList = parser.extractAllNodesThatMatch(tInfoFilter);
      if (tInfoNodeList.size() == 0) {
        return null;
      }
      parser.setInputHTML(tInfoNodeList.elementAt(0).toHtml());
      // Take the four red parts: phase, total sales, draw date and winning numbers, in order.
      NodeFilter redFilter = new HasAttributeFilter("class", "fc-red");
      NodeList redNodeList = parser.extractAllNodesThatMatch(redFilter);
      if (redNodeList.size() < 4) {
        logger.error("解析的内容不符合要求: {}", tInfoNodeList.elementAt(0).toHtml());
        return null;
      }
      lotteryDraw.setPhase(redNodeList.elementAt(0).toPlainTextString().trim());
      lotteryDraw.setVolumeOfSales(
          StringUtils.replace(redNodeList.elementAt(1).toPlainTextString().trim(), ",", ""));
      Date drawDate =
          CoreDateUtils.parseDate(
              redNodeList.elementAt(2).toPlainTextString().trim(), "yyyy年MM月dd日");
      if (drawDate != null) {
        lotteryDraw.setTimeDraw(CoreDateUtils.formatDateTime(drawDate));
      }
      lotteryDraw.setResult(
          StringUtils.replace(redNodeList.elementAt(3).toPlainTextString().trim(), " ", ","));
    } catch (ParserException e) {
      logger.error(e.getMessage(), e);
      return null;
    }

    // Parse the prize details.
    try {
      parser.setInputHTML(html);
      NodeFilter dInfoFilter = new HasAttributeFilter("class", "dInfo");
      NodeList dInfoNodeList = parser.extractAllNodesThatMatch(dInfoFilter);
      if (dInfoNodeList.size() == 0) {
        return null;
      }
      parser.setInputHTML(dInfoNodeList.elementAt(0).toHtml());
      NodeFilter prizeFilter = new TagNameFilter("p");
      NodeList prizeNodeList = parser.extractAllNodesThatMatch(prizeFilter);
      if (prizeNodeList.size() == 0) {
        return null;
      }
      String prizeText = prizeNodeList.elementAt(0).toPlainTextString();
      String[] splitted = prizeText.split("--------------------------------------------------");
      if (splitted.length < 2) {
        // Both placeholders are filled: the lottery name and the text that failed to split.
        logger.error("未解析到{}开奖详情: {}", lotteryType.getName(), prizeText);
        return lotteryDraw;
      }
      splitted = StringUtils.split(splitted[1].trim(), "&nbsp;");

      List<LotteryDrawPrizeItem> resultDetail = new ArrayList<LotteryDrawPrizeItem>();
      int index = -1;
      LotteryDrawPrizeItem prizeItem = null;
      for (int i = 0; i < splitted.length; i++) {
        String s = splitted[i].trim();
        if (s.length() == 0) {
          continue;
        }
        index++;
        if (index % 4 == 0) {
          // Each prize row has 4 cells; start a new item at every fourth non-empty cell.
          index = 0;
          prizeItem = new LotteryDrawPrizeItem();
          resultDetail.add(prizeItem);
        }
        switch (index) {
          case 0:
            prizeItem.setName(s);
            break;
          case 1:
            prizeItem.setWinningCount(StringUtils.replace(s, "注", ""));
            break;
          case 2:
            prizeItem.setPrizeAmount(
                CoreStringUtils.replaceAll(
                    s,
                    new String[][] {
                      {"元", ""},
                      {",", ""}
                    }));
            break;
          default:
            break;
        }
      }
      lotteryDraw.setResultDetail(resultDetail);
    } catch (ParserException e) {
      logger.error(e.getMessage(), e);
    }

    return lotteryDraw;
  }

Ejemplo n.º 25

0

Mostrar archivo

Archivo: LotteryDrawFetch513WorkerOfficial.java Proyecto: L-Arun/pcore

  /**
   * Fetches the draw result for the given phase.
   *
   * <p>When {@code phase} is null or empty, or equals the phase of the draw returned by
   * {@code nowPhaseResult()}, that draw is returned as is. Otherwise the page at
   * {@code RESULT_MORE_LOCALITY_URL} is downloaded and the first element with
   * {@code class="mytable"} is scanned, skipping its first row, for a row whose first column
   * equals {@code phase}; the second column of that row supplies the result, with spaces replaced
   * by commas.
   *
   * @param phase the phase to look up; null or empty asks for the current phase
   * @return the matching draw, or {@code null} when the page cannot be fetched or parsed, the
   *     result table is missing, or no row matches
   */
  @Override
  public LotteryDraw fetchResultDetail(String phase) {

    LotteryDraw lotteryDraw = nowPhaseResult();
    if (phase == null || "".equals(phase)) {
      return lotteryDraw;
    }
    // nowPhaseResult() may have nothing to report; only compare phases when it does.
    if (lotteryDraw != null && phase.equals(lotteryDraw.getPhase())) {
      return lotteryDraw;
    }
    lotteryDraw = null;

    String url = RESULT_MORE_LOCALITY_URL;

    String data = null;
    String pageInfo = "结果页面" + url;
    String encoding = "utf-8";
    String logHeader =
        "=="
            + lotteryScope
            + "=="
            + siteName
            + "=="
            + pageInfo
            + "==抓取=="
            + getLotteryType().getName()
            + "==";

    try {
      data = CoreFetcherUtils.URLGet(url, null, encoding);
    } catch (Exception e) {
      logger.error("获取html数据失败" + e.getMessage());
      return null;
    }

    // contains() also catches a body that starts with the 404 marker (index 0).
    if (data == null || data.isEmpty() || data.contains("404 Not Found")) {
      logger.error(logHeader + "data is null or 404 Not Found");
      return null;
    }
    Parser parser = null;
    try {
      parser = Parser.createParser(data, encoding);
    } catch (Exception e) {
      logger.error("解析html页面失败" + e.getMessage());
      return null;
    }
    NodeFilter filter = new HasAttributeFilter("class", "mytable");
    NodeList nodeList = null;

    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
      if (nodeList == null
          || nodeList.size() == 0
          || !(nodeList.elementAt(0) instanceof TableTag)) {
        logger.error(logHeader + "result table (class=mytable) not found");
        return null;
      }
      TableTag tableTag = (TableTag) nodeList.elementAt(0);
      TableRow[] tableRows = tableTag.getRows();
      for (int i = 1; i < tableRows.length; i++) {
        TableColumn[] tableColumns = tableRows[i].getColumns();
        if (tableColumns == null || tableColumns.length < 2) {
          // A usable row needs both the phase and the result column.
          continue;
        }
        String phaseTmp = tableColumns[0].toPlainTextString();
        // phase is non-empty here, so equals() also rejects null and empty cells.
        if (phase.equals(phaseTmp)) {
          lotteryDraw = new LotteryDraw();
          // Draw phase
          lotteryDraw.setPhase(phaseTmp);
          // Draw result
          String strResult = tableColumns[1].toPlainTextString();
          strResult = strResult.trim().replace(" ", ",");
          lotteryDraw.setResult(strResult);
          // Lottery type
          lotteryDraw.setLotteryType(super.getLotteryType());
          break;
        }
      }
    } catch (ParserException e) {
      logger.error("数据解析错误==" + e.getMessage(), e);
      return null;
    }
    return lotteryDraw;
  }

Ejemplo n.º 26

0

Mostrar archivo

Archivo: HtmlParserTool.java Proyecto: yuexuahandao/spiker

  /**
   * Collects the links found on a page.
   *
   * <p>Two kinds of nodes are examined: {@code LinkTag} nodes, whose {@code getLink()} value is
   * the candidate URL, and nodes whose text starts with {@code frame src=}, whose quoted
   * {@code src} value is the candidate URL. A candidate is added to the result only when
   * {@code filter} accepts the node it came from.
   *
   * @param url the address of the page to parse
   * @param filter decides which link or frame nodes contribute their URL
   * @return the accepted URLs; empty when parsing fails with a {@code ParserException}
   */
  public static Set<String> extracLinks(String url, NodeFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      parser.setEncoding("UTF-8");

      @SuppressWarnings("serial")
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);

      NodeList list = parser.extractAllNodesThatMatch(linkFilter);

      System.out.println("length=" + list.size());

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);

        if (tag instanceof LinkTag) { // <a> tag
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink(); // URL

          System.out.println("linkUrl=" + linkUrl);

          if (filter.accept(tag)) {
            links.add(linkUrl);
          }
        } else { // <frame> tag
          // Pull the link out of the src attribute, e.g. <frame src="test.html"/>.
          String frame = tag.getText();
          frame = frame.substring(frame.indexOf("src="));
          int end = frame.indexOf(" ");
          if (end == -1) {
            end = frame.indexOf(">");
          }
          if (end == -1) {
            // src is the last attribute and nothing follows it: the value runs to the end of
            // the text. Without this fallback substring() below was called with a negative end.
            end = frame.length();
          }
          if (end - 1 < 5) {
            // Too short to hold src="..." with both quotes; nothing to extract.
            continue;
          }

          // Skip the leading src=" (5 characters) and drop the closing quote.
          String frameUrl = frame.substring(5, end - 1);

          System.out.println("frameUrl=" + frameUrl);

          if (filter.accept(tag)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }

Ejemplo n.º 27

0

Mostrar archivo

Archivo: GridUtils.java Proyecto: sunbiz/dhis2

  /**
   * Creates a list of Grids based on the given HTML string. This works only for table-based HTML
   * documents.
   *
   * <p>One Grid is produced per {@code <table>} element. Within a table, rows for which
   * {@code getColumnCount} reports zero are ignored, the first remaining row becomes the grid
   * headers, and every later row becomes a grid row provided its column count equals that of the
   * header row. A cell with a {@code colspan} greater than one is padded with empty headers or
   * values.
   *
   * @param html the HTML string.
   * @return a list of Grids, or {@code null} if the HTML string is null or blank.
   */
  public static List<Grid> fromHtml(String html) throws Exception {
    if (html == null || html.trim().isEmpty()) {
      return null;
    }

    List<Grid> grids = new ArrayList<>();

    Parser parser = Parser.createParser(html, "UTF-8");

    Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray();

    for (Node t : tables) {
      Grid grid = new ListGrid();

      TableTag table = (TableTag) t;

      TableRow[] rows = table.getRows();

      Integer firstColumnCount = null;

      for (TableRow row : rows) {
        // Computed once per row and reused for the empty check, the header width and the
        // width comparison.
        int columnCount = getColumnCount(row);

        if (columnCount == 0) // Ignore if no cells
        {
          log.warn("Ignoring row with no columns");
          continue;
        }

        Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

        if (firstColumnCount == null) // First row becomes header
        {
          firstColumnCount = columnCount;

          for (Node c : cells) {
            TagNode cell = (TagNode) c;

            grid.addHeader(new GridHeader(getValue(cell), false, false));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyHeaders((colSpan - 1));
            }
          }
        } else // Rest becomes rows
        {
          if (firstColumnCount != columnCount) // Ignore
          {
            // Report the same count that was compared, so the message matches the decision.
            log.warn(
                "Ignoring row which has "
                    + columnCount
                    + " columns since table has "
                    + firstColumnCount
                    + " columns");
            continue;
          }

          grid.addRow();

          for (Node c : cells) {
            // TODO row span

            TagNode cell = (TagNode) c;

            grid.addValue(getValue(cell));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyValues((colSpan - 1));
            }
          }
        }
      }

      grids.add(grid);
    }

    return grids;
  }

Ejemplo n.º 28

0

Mostrar archivo

Archivo: HenuParse.java Proyecto: zuiyu/superdaxue

  /**
   * Parses timetable entries out of the table with {@code id="reportArea"}.
   *
   * <p>A row counts as a course row when its column 1 contains {@code [}; characters 1 to 8 of
   * that cell are taken as the course code and everything from character 10 on as the course
   * name. For course rows, column 2 supplies the credit, column 3 the type, column 4 the teacher,
   * column 5 the class id, column 6 the class number and column 11 a time/address description
   * that {@code praseStr} expands; one cloned {@code TimeTable} is added to the result per
   * expanded entry.
   *
   * <p>Bracketed cells too short to hold a code and a name are treated as non-course rows instead
   * of failing with a {@code StringIndexOutOfBoundsException}.
   *
   * @param html the HTML text to parse
   * @return the parsed entries; empty when the HTML could not be parsed or holds no matching table
   */
  public List<TimeTable> parseTimeTables(String html) {
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    List<TimeTable> list = new ArrayList<TimeTable>();
    NodeFilter tagfilter = new NodeClassFilter(TableTag.class);
    NodeFilter idFilter = new HasAttributeFilter("id", "reportArea");
    NodeFilter filter = new AndFilter(tagfilter, idFilter);
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    if (nodeList == null) {
      // Extraction failed (already reported above); there is nothing to iterate over.
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableTag tag = (TableTag) nodeList.elementAt(i);
      TableRow[] rows = tag.getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        // Stays null until column 1 identifies the row as a course row.
        TimeTable timeTable = null;

        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();
          if (k == 1) {
            // Needs at least 10 characters so both fixed-offset slices below are valid.
            if (info.indexOf("[") != -1 && info.length() >= 10) {
              timeTable = new TimeTable();
              timeTable.setCourseName(info.substring(10));
              timeTable.setCourseCode(info.substring(1, 9));
            }
            continue;
          }
          if (timeTable == null) {
            continue;
          }
          switch (k) {
            case 2:
              timeTable.setCredit(Double.parseDouble(info));
              break;
            case 3:
              timeTable.setType(info);
              break;
            case 4:
              timeTable.setTeacher(info);
              break;
            case 5:
              timeTable.setClassId(info);
              break;
            case 6:
              timeTable.setClassNum(info);
              break;
            case 11:
              // One row may describe several time slots; emit a copy of the entry for each.
              List<TimeAndAdress> ta_list = praseStr(info);
              for (TimeAndAdress ta : ta_list) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        } // end for k
      } // end for j
    }
    return list;
  }

Ejemplo n.º 29

0

Mostrar archivo

Archivo: CommonJclqScheduleFetchWorkerOfficial.java Proyecto: L-Arun/pcore

  @Override
  public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException {
    Map<String, String> headerParams = new HashMap<String, String>();
    headerParams.put("Referer", "http://info.sporttery.cn");
    headerParams.put(
        "User-Agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19");
    List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>();

    String encoding = "gbk";
    JclqScheduleItem jclqScheduleItem = null;
    String siteName = "中国竞彩网网[竞彩篮球赛程]";
    String logHeader = siteName + SCHEDULE_URL;

    try {
      String webInfo =
          CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding);
      if (webInfo == null || webInfo.indexOf("404 Not Found") > 0) {
        logger.error(logHeader + ",data is null or 404 Not Found");
        throw new FetchFailedException("404 Not Found");
      }

      Parser parser = Parser.createParser(webInfo, encoding);

      NodeList nodeList =
          parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']"));
      if (null != nodeList && nodeList.size() > 0) {
        NodeFilter tableFilter = new TagNameFilter("table");
        Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding);
        NodeList tableNodeList = parser2.extractAllNodesThatMatch(tableFilter);
        if (tableNodeList != null && tableNodeList.size() > 0) {
          TableTag catchTableTag = new TableTag();
          catchTableTag = (TableTag) tableNodeList.elementAt(0);
          if (catchTableTag != null) {
            TableRow[] catchRows = catchTableTag.getRows();
            TableColumn[] catchColumns = null;
            for (int i = 2; i < catchRows.length; i++) {
              catchColumns = catchRows[i].getColumns();
              if (catchColumns != null && catchColumns.length >= 6) {
                jclqScheduleItem = new JclqScheduleItem();

                String officialNum = catchColumns[0].toPlainTextString().trim();
                if (officialNum.length() < 5) {
                  continue;
                }

                // 先解析比赛时间
                String matchDateStr = catchColumns[3].toPlainTextString().trim();
                String[] yearStr = matchDateStr.split("-");
                if (yearStr[0].length() <= 2) {
                  matchDateStr = "20" + matchDateStr + ":00";
                } else {
                  matchDateStr = matchDateStr + ":00";
                }
                Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME);
                Calendar matchDateCalendar = Calendar.getInstance();
                matchDateCalendar.setTime(matchDate);
                matchDateCalendar.add(Calendar.MINUTE, 1);
                jclqScheduleItem.setMatchDate(matchDateCalendar.getTime());

                // 根据周几、当前时间和比赛时间计算官方发布的日期
                Calendar cd = Calendar.getInstance();
                // 将时分秒等区域清零
                cd.set(Calendar.HOUR_OF_DAY, 0);
                cd.set(Calendar.MINUTE, 0);
                cd.set(Calendar.SECOND, 0);
                cd.set(Calendar.MILLISECOND, 0);

                int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK);
                int fetchWeekDay = weekDay.get(officialNum.substring(0, 2));

                if (nowWeekDay != fetchWeekDay) {
                  int m = fetchWeekDay - nowWeekDay;
                  if (m < -1) {
                    cd.add(Calendar.DATE, m + 7);
                  } else {
                    cd.add(Calendar.DATE, m);
                  }
                }

                // 如果计算出来的日期超过了比赛时间，减去一周
                if (cd.after(matchDateCalendar)) {
                  cd.add(Calendar.DATE, -7);
                }

                // 如果计算出来的日期距离比赛时间相隔超过一周，加上一周的倍数
                // 一周的毫秒数
                long weekTimeInMillis = 3600 * 1000 * 24 * 7;
                long diffTimeInMillis = matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis();
                if (diffTimeInMillis > weekTimeInMillis) {
                  // 计算相差几周
                  int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis);
                  cd.add(Calendar.DATE, 7 * diffWeekCount);
                }

                jclqScheduleItem.setMatchNum(
                    CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd")
                        + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT
                        + officialNum.substring(2));
                jclqScheduleItem.setOfficialDate(
                    CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime())));
                Integer oNum = null;
                try {
                  oNum = Integer.valueOf(officialNum.substring(2));
                } catch (Exception e) {
                  logger.error("截取官方编码时，转换为Integer错误", e);
                }
                jclqScheduleItem.setOfficialNum(oNum);
                jclqScheduleItem.setMatchName(
                    JclqUtil.convertMatchName(
                        catchColumns[1].toPlainTextString().trim(),
                        LotteryType.JCLQ_SF,
                        FetcherType.T_PENGINEAPI));

                String team = catchColumns[2].toPlainTextString().trim();
                String[] teamStr = team.split("VS");
                jclqScheduleItem.setAwayTeam(teamStr[0].trim());
                jclqScheduleItem.setHomeTeam(teamStr[1].trim());

                if ("已开售".equals(catchColumns[4].toPlainTextString().trim())) {
                  jclqScheduleItem.setStatus(JclqRaceStatus.OPEN);
                } else {
                  jclqScheduleItem.setStatus(JclqRaceStatus.UNOPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("胜负单关") > 0) {
                  jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setDynamicSaleSfStatus(JclqDynamicSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("胜负过关") > 0) {
                  jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setStaticSaleSfStatus(JclqStaticSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负单关") > 0) {
                  jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setDynamicSaleRfsfStatus(JclqDynamicSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("让分胜负过关") > 0) {
                  jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setStaticSaleRfsfStatus(JclqStaticSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差单关") > 0) {
                  jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setDynamicSaleSfcStatus(JclqDynamicSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("胜分差过关") > 0) {
                  jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setStaticSaleSfcStatus(JclqStaticSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("大小分单关") > 0) {
                  jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setDynamicSaleDxfStatus(JclqDynamicSaleStatus.SALE_OPEN);
                }
                if (catchColumns[5].toPlainTextString().trim().indexOf("大小分过关") > 0) {
                  jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_UNOPEN);
                } else {
                  jclqScheduleItem.setStaticSaleDxfStatus(JclqStaticSaleStatus.SALE_OPEN);
                }
                jclqScheduleItemList.add(jclqScheduleItem);
              }
            } // end for catchRows
          } // end if catchTableTag!=null
        } // end if(tableNodeList!=null&&tableNodeList.size()>0)
      } else {
        logger.error(logHeader + "竞彩篮球赛程数据表格不存在，返回null");
        throw new FetchFailedException("竞彩篮球赛程数据表格不存在");
      }
    } catch (Exception e) {
      logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e);
      throw new FetchFailedException(e.getMessage());
    }
    return jclqScheduleItemList;
  }

Ejemplo n.º 30

0

Mostrar archivo

Archivo: Main.java Proyecto: damacode/MyTumblr

 /**
  * Processes one page of posts located at {@code address}.
  *
  * <p>For every post tag on the page that is not yet present in {@code post_post_hash}, the
  * pictures referenced by its photo remarks and by its photoset iframes are collected into a
  * new {@code Post}, which is then passed to {@code Main.handlePost}. Posts that are already
  * known are looked up in {@code post_post_hash} and passed to {@code handleNonDownloadPost}.
  *
  * <p>Any exception raised while processing is printed and reported through
  * {@code Main.status}; it ends processing of the current page but is not propagated.
  *
  * @param address the page URL to fetch posts from
  * @return {@code false} if the page yielded no post nodes; {@code true} otherwise, including
  *     when an error interrupted processing
  */
 private static boolean handleURL(String address) {
   Main.status(String.format("Processing page \"%s\".", address));
   try {
     // Materialize the node array once instead of once for the check and once for the loop.
     Node[] post_nodes = getPosts(address).toNodeArray();
     if (post_nodes.length == 0) {
       return false;
     }
     for (Node post_node : post_nodes) {
       if (!(post_node instanceof TagNode)) {
         continue;
       }
       TagNode post = (TagNode) post_node;
       // NOTE(review): assumes the id attribute carries a 5-character prefix before the
       // numeric post id - confirm against the markup returned by getPosts.
       Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5)));
       if (Main.post_post_hash.containsKey(new_post)) {
         new_post = post_post_hash.get(new_post);
         handleNonDownloadPost(new_post);
         continue;
       }

       // Single-photo posts: the image URL is embedded in the remark text.
       NodeList photo_posts = getPhotoPosts(post.getChildren());
       NodeList remarks = getRemarks(photo_posts);
       for (Node node : remarks.toNodeArray()) {
         Matcher matcher = lores.matcher(node.getText());
         if (!matcher.find()) {
           // Previously an unmatched remark left the URL empty and the substring call below
           // threw, which aborted the whole page. Skip just this remark instead.
           continue;
         }
         String matched = matcher.group();
         // NOTE(review): presumably the lores pattern guarantees a 17-character prefix and a
         // one-character trailing delimiter around the URL - confirm against its definition.
         String media_url = matched.substring(17, matched.length() - 1);
         String thumb = thumbnailUrlFor(media_url);
         if (thumb == null) {
           continue;
         }
         URL thumb_url = new URL(thumb);
         new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
       }

       // Photoset posts: the images live inside a separate iframe document.
       NodeList photoset_posts = getPhotosetPosts(post.getChildren());
       NodeList iframes = getIFrames(photoset_posts);
       for (Node node : iframes.toNodeArray()) {
         if (!(node instanceof TagNode)) {
           continue;
         }
         String iframe_url = ((TagNode) node).getAttribute("src");
         if (iframe_url == null) {
           // An iframe without a src cannot be fetched.
           continue;
         }
         Parser parser2 = new Parser(iframe_url);
         NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a"));
         Node[] a_array = a_list.toNodeArray();
         Node[] img_array =
             a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray();
         // An anchor without an <img> descendant makes img_array shorter than a_array;
         // bound by the shorter one so indexing can never run past the end.
         int picture_count = Math.min(a_array.length, img_array.length);
         for (int i = 0; i < picture_count; i++) {
           String media_url = ((TagNode) img_array[i]).getAttribute("src");
           String thumb = thumbnailUrlFor(media_url);
           if (thumb == null) {
             continue;
           }
           URL thumb_url = new URL(thumb);
           new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
         }
       }
       Main.handlePost(new_post);
     }
   } catch (Exception ex) {
     ex.printStackTrace();
     Main.status("Error handling post.");
   }
   return true;
 }

 /**
  * Derives the thumbnail URL for a media URL by replacing the segment between the last
  * {@code '_'} and the last {@code '.'} with {@code "_75sq"}.
  *
  * <p>Only that trailing segment is rewritten; an identical character sequence occurring
  * earlier in the URL is left untouched.
  *
  * @param media_url the full-size media URL; may be {@code null}
  * @return the thumbnail URL, or {@code null} if {@code media_url} is {@code null} or does not
  *     contain an underscore followed later by a dot
  */
 private static String thumbnailUrlFor(String media_url) {
   if (media_url == null) {
     return null;
   }
   int underscore = media_url.lastIndexOf('_');
   int dot = media_url.lastIndexOf('.');
   if (underscore < 0 || dot < underscore) {
     return null;
   }
   return media_url.substring(0, underscore) + "_75sq" + media_url.substring(dot);
 }