/**
 * Builds the option pane for the page currently shown in {@code webView}.
 *
 * <p>The page is downloaded again with jsoup, the basket table whose id is
 * {@code normal_basket_<item_id>} is located, and for every {@code span}/{@code select} pair in
 * its first cell a label and a {@code ComboBox} holding the select's options are appended to
 * {@code optionArea}. Any failure is printed and leaves the pane as it was.
 */
private void initPane() {
    try {
      Document document = Jsoup.connect(webView.getEngine().getLocation()).get();
      Element table =
          document.select("#normal_basket_" + document.select("[name=item_id]").val()).first();
      if (table == null) {
        // No basket table for this item: there are no options to render.
        return;
      }
      Element td = table.select("td").first();
      if (td == null) {
        return;
      }
      Elements spans = td.select("span");
      Elements selects = td.select("select");
      cmb = new ArrayList<ComboBox>();
      // Labels and selects are paired by position; stop at the shorter list so a stray span
      // cannot index past the last select.
      int pairCount = Math.min(spans.size(), selects.size());
      for (int i = 0; i < pairCount; i++) {
        ObservableList<ValuePair> obs = FXCollections.observableArrayList();
        for (Element option : selects.get(i).select("option")) {
          obs.add(new ValuePair("choice", option.text(), option.val()));
        }

        ComboBox<ValuePair> combo = new ComboBox<ValuePair>(obs);
        cmb.add(combo);
        optionArea.getChildren().addAll(new Text(spans.get(i).text()), combo);
      }

    } catch (Exception e) {
      // Best effort: a failed download or unexpected markup only means no options are shown.
      e.printStackTrace();
    }
  }
예제 #2
0
  /**
   * Downloads the menu page for the given number and extracts its meals.
   *
   * <p>Every {@code td[width=400]} cell marks one meal; inside that cell's parent the first
   * {@code div#ssilka} supplies the description and the last one supplies the cost, of which only
   * the part before the first {@code -} is parsed as an integer. Rows that lack these pieces or
   * whose cost is not a number are skipped.
   *
   * @param number value substituted into the {@code URL} format string
   * @return the parsed meals; empty when the page cannot be fetched or holds no meal rows
   */
  public List<MenuMeal> getMenuMeals(int number) {
    List<MenuMeal> meals = new ArrayList<>();

    Document doc;
    try {
      doc =
          Jsoup.connect(String.format(URL, number))
              .userAgent("Chrome/49.0.2623.112")
              .referrer("https://www.google.ru/")
              .timeout(7000)
              .get();
    } catch (IOException e) {
      e.printStackTrace();
      return meals;
    }

    // The attribute selector needs its closing bracket to be a well-formed query.
    for (Element element : doc.select("td[width=400]")) {
      Element parent = element.parent();
      if (parent == null) {
        continue;
      }
      Elements cells = parent.select("div[id=ssilka]");
      if (cells.isEmpty()) {
        continue;
      }

      String cost = cells.last().text();
      int dash = cost.indexOf('-');
      if (dash < 0) {
        // Without the separator there is no way to tell where the integer part ends.
        continue;
      }
      int price;
      try {
        price = Integer.parseInt(cost.substring(0, dash).trim());
      } catch (NumberFormatException e) {
        continue;
      }

      MenuMeal menuMeal = new MenuMeal();
      menuMeal.setDescription(cells.first().text());
      menuMeal.setCost(price);
      meals.add(menuMeal);
    }
    return meals;
  }
예제 #3
0
  /**
   * Looks a user up through the Shotbow forum search and reports whether the matching result
   * carries a title other than "Regular Member".
   *
   * <p>The search page is fetched first to read the {@code _xfToken} input, which is then sent
   * along with the search POST.
   *
   * @param user name to search for; compared case-insensitively with each result
   * @return {@code true} when a result with that name has a different title than
   *     "Regular Member"; {@code false} when it has that title or no result matches
   * @throws IOException if the search page cannot be fetched
   */
  public static boolean isShotbowDonor(String user) throws IOException {
    Document searchPage =
        Jsoup.connect("https://shotbow.net/forum/search").userAgent(WebUtils.USER_AGENT).get();
    String token = searchPage.select("input[name=_xfToken]").val();

    RequestSettings settings = new RequestSettings();
    settings.setGzip(false);
    settings.setUrl("https://shotbow.net/forum/search/search");
    settings.setHost("shotbow.net");
    settings.setOrigin("https://www.shotbow.net");
    settings.setReferer("https://shotbow.net/forum/portal/");
    settings.addParameter("keywords", user);
    settings.addParameter("users", "");
    settings.addParameter("date", "");
    settings.addParameter("_xfToken", token);

    Document results = Jsoup.parse(RequestUtils.excuteSpecialPost(settings));
    for (Element entry : results.select("li.userResult")) {
      String foundName = entry.select("a.username").first().text();
      if (!foundName.equalsIgnoreCase(user)) {
        continue;
      }
      // Only the first result with a matching name decides the answer.
      String title = entry.select("div.userTitle").first().text();
      return !title.equalsIgnoreCase("Regular Member");
    }
    return false;
  }
예제 #4
0
  /**
   * Parses table rows into program entries; by default the first column is parsed.
   *
   * <p>The result has one slot per input row. A slot is filled with the filtered text of the
   * program cell's {@code dt} (index 0) and {@code dd} (index 1) elements only on rows where a
   * new program cell is read; all other slots keep {@code null} values.
   *
   * @param rows source row set
   * @return program data, one {@code String[2]} per row
   */
  private static String[][] parseRows(Elements rows) {
    String[][] programs = new String[rows.size()][2];
    // Remaining row counts of the cells currently spanning downwards.
    // NOTE(review): presumably rowspan_0 tracks a leading (e.g. time-slot) column and rowspan_1
    // the program column — confirm against the source table.
    int rowspan_0 = 0;
    int rowspan_1 = 0;
    for (int i = 0; i < rows.size(); i++) {
      Element row = rows.get(i);
      try {
        Elements cells = row.children();

        if (rowspan_0 == 0) {
          // The leading cell's span has run out, so this row starts with a fresh leading cell.
          Element cell_0 = cells.get(0);
          rowspan_0 = Integer.valueOf(cell_0.attr("rowspan"));
          if (rowspan_1 == 0) {
            // The program span has run out too: the program cell is the second child.
            Element cell_1 = cells.get(1);
            rowspan_1 = Integer.valueOf(cell_1.attr("rowspan"));
            programs[i][0] = DBclass.xmlFilte(cell_1.select("dt").text());
            programs[i][1] = DBclass.xmlFilte(cell_1.select("dd").text());
          }
        } else if (rowspan_1 == 0) {
          // The leading cell still spans this row, so the program cell is the first child.
          Element cell_0 = cells.get(0);
          rowspan_1 = Integer.valueOf(cell_0.attr("rowspan"));
          programs[i][0] = DBclass.xmlFilte(cell_0.select("dt").text());
          programs[i][1] = DBclass.xmlFilte(cell_0.select("dd").text());
        }
        // One row of each active span has been consumed.
        rowspan_0--;
        rowspan_1--;
      } catch (Exception e) {
        // A missing cell or a non-numeric rowspan lands here; the row is left unfilled and,
        // because the decrements above are skipped, both counters keep their current values.
        e.printStackTrace(System.out);
      }
    }
    return programs;
  }
예제 #5
0
  /**
   * Rebuilds {@code majorList} from the course links found on the given page.
   *
   * <p>Each {@code a} directly under {@code #accordion__target-3 > div.course-listing__box}
   * becomes one entry: the title comes from its first {@code h3}, the type from its first
   * {@code p} with everything from the first {@code -} onwards removed, and the URL from its
   * {@code href}. The download is retried a limited number of times; if every attempt fails the
   * list is left empty.
   *
   * @param originalUrl address of the page listing the majors
   */
  public static void initMajorList(String originalUrl) {

    System.out.println("preparing majorList");

    // Retrying without a bound would spin forever on a permanent failure such as a changed page
    // layout or a dead URL, so give up after a few attempts.
    final int maxAttempts = 5;
    boolean finish = false;
    for (int attempt = 1; attempt <= maxAttempts && !finish; attempt++) {
      try {
        majorList.clear();
        Document doc = Jsoup.connect(originalUrl).timeout(10000).get();
        Elements es = doc.select("#accordion__target-3 > div.course-listing__box > a");
        for (Element e : es) { // major
          MajorForCollection major = new MajorForCollection();
          major.setLevel(LEVEL);
          major.setTitle(e.select("h3").get(0).text().trim());
          major.setType(e.select("p").get(0).text().replaceAll("-[\\s\\S]*", "").trim());
          major.setUrl(e.select("a").get(0).attr("href"));
          majorList.add(major);
        }
        finish = true;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    if (finish) {
      System.out.println("majorList prepared");
    } else {
      // Drop whatever the last failed attempt managed to add so callers never see a partial list.
      majorList.clear();
      System.out.println("majorList could not be prepared after " + maxAttempts + " attempts");
    }
    System.out.println("majorList size: " + majorList.size());
  }
예제 #6
0
 /**
  * Searches Amazon for a term and collects the prices of the results accepted by
  * {@code amazonReady}.
  *
  * <p>For every {@code .s-item-container} the first whitespace-separated token of the
  * {@code a-color-price} text is taken as the price, falling back to the {@code span.a-size-base}
  * text when that is empty. A result is kept only when the price contains {@code $} and
  * {@code amazonReady} accepts the result's link.
  *
  * @param doc not read; the variable is overwritten with the downloaded search page
  * @param searchTerm text appended as-is to the search URL
  * @param bestBuyInfo not read by this method
  * @return the price strings of the accepted results, in page order
  */
 public ArrayList<String> searchAmazon(
     Document doc, String searchTerm, HashMap<String, String> bestBuyInfo) {
   String url =
       "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
           + searchTerm;
   doc = jsoupConnect(url);
   ArrayList<String> matchingList = new ArrayList<String>();
   if (doc == null) {
     // NOTE(review): assumes jsoupConnect can return null when the download fails — confirm.
     return matchingList;
   }
   for (Element ele : doc.select(".s-item-container")) {
     // grab name and price
     String name = ele.select(".a-link-normal").attr("title");
     String price =
         ele.select("div div div div div div a.a-link-normal span.a-color-price")
             .text()
             .split(" ")[0];
     System.out.println(name + "\n" + price);
     // The numeric value of the price is never used, so it is no longer parsed here; a price
     // such as "$" alone used to abort the whole search with a NumberFormatException.
     if (price.equals("")) {
       price = ele.select("span.a-size-base").text().split(" ")[0];
     }
     if (price.contains("$") && amazonReady(ele.select(".a-link-normal").attr("href"), true)) {
       matchingList.add(price);
       System.out.println("yes");
     } else {
       System.out.println("no");
     }
   }
   return matchingList;
 }
예제 #7
0
 /**
  * Filters the listing entries of this site and transmits those that pass.
  *
  * <p>For every element the title, time, summary and link are read from the listing markup and
  * the linked page's {@code div.pcb} content is fetched with {@code Page.getContent}. An entry is
  * added to {@code spyHistory} and transmitted only when both {@code Transmition.contentFilter}
  * and {@code Transmition.timeFilter} accept it.
  *
  * @param tableList listing elements; cast unchecked to {@code ArrayList<Element>}
  * @param entry both sides are split on {@code ;}: the value supplies the filter words and the
  *     first segment of the key supplies the key handed to the filter and the article
  */
 @Override
 public void parsePages(ArrayList<?> tableList, Map.Entry<String, String> entry) {
   int siteType = 2;
   String[] filterWords = entry.getValue().split(";");
   String filterKey = entry.getKey().split(";")[0];
   String siteName = "全景社区";
   for (Element row : (ArrayList<Element>) tableList) {
     String headline = row.select("h3.title").select("a").text();
     String published = Subutils.getTime(row.select("p.meta").last().text());
     String digest = row.select("p.content").text();
     String link = row.select("h3.title").select("a").attr("href");
     String body = Page.getContent(link, "div.pcb", "utf-8");
     ArrayList<Integer> hits = new ArrayList<Integer>();
     if (!Transmition.contentFilter(filterWords, body, filterKey, hits)
         || !Transmition.timeFilter(published, this.spyHistory, headline)) {
       continue;
     }
     spyHistory.add(headline);
     Transmition.showDebug(
         siteType, headline, body, link, published, digest, siteName, hits.get(0));
     // Hand the article over to the transmission interface.
     Article article =
         Transmition.getArticle(
             siteType, headline, body, link, published, digest, siteName, filterKey, hits.get(0));
     Transmition.transmit(article);
   }
 }
예제 #8
0
  /**
   * Searches mudah.my for a query and converts the listed ads into items.
   *
   * <p>Mudah is not standardized, so the results will be messy when crawled. The price is taken
   * from whatever follows the last {@code RM} in the ad's title line, with spaces removed.
   *
   * @param query text appended as-is to the search URL
   * @param size initial capacity of the result list; it does not limit the number of items
   * @return one item per ad that has a title line; empty when the page has no listing
   * @throws IOException if the page cannot be fetched
   * @throws NumberFormatException if an ad's price text is not an integer
   */
  @Override
  public List<Item> parse(String query, int size) throws IOException {

    // request for a page
    Document doc =
        Jsoup.connect("http://www.mudah.my/li?q=" + query)
            .userAgent(Constant.HTTP_USER_AGENT)
            .timeout(Constant.HTTP_TIMEOUT)
            .get();

    ArrayList<Item> result = new ArrayList<Item>(size);

    Element listing = doc.select("div.listing_thumbs").first();
    if (listing == null) {
      // No listing container, e.g. the search produced no ads.
      return result;
    }

    for (Element ad : listing.select("div.list_ads")) {
      String img = "";
      Elements imgS = ad.select("div.image_thumb > a + img");
      if (!imgS.isEmpty()) { // some may not have images
        // An <img> carries its address in src; href is kept as a fallback because that is the
        // attribute this code used to ask for.
        img = imgS.first().attr("src");
        if (img.isEmpty()) {
          img = imgS.first().attr("href");
        }
      }

      Element listE = ad.select("li.listing_ads_title").first();
      if (listE == null || listE.children().isEmpty()) {
        continue;
      }
      Element link = listE.child(0);
      String title = link.text();
      String url = link.attr("href");
      String price = listE.text();
      price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", "");
      int dPrice = Integer.parseInt(price);

      result.add(new Item("Mudah", title, dPrice, img, url));
    }

    return result;
  }
예제 #9
0
 /**
  * Filters the news entries of this site and transmits those that pass.
  *
  * <p>For every element the title, date, summary and link are read from the {@code li.news_*}
  * markup and the linked page is fetched with {@code Page.getAllHtmlContent}. An entry is added
  * to {@code spyHistory} and transmitted only when both {@code Transmition.contentFilter} and
  * {@code Transmition.timeFilter} accept it.
  *
  * @param tableList listing elements; cast unchecked to {@code ArrayList<Element>}
  * @param entry both sides are split on {@code ;}: the value supplies the filter words and the
  *     first segment of the key supplies the key handed to the filter and the article
  */
 @Override
 public void parsePages(ArrayList<?> tableList, Map.Entry<String, String> entry) {
   String siteName = "中国企业新闻";
   int siteType = 4;
   String[] filterWords = entry.getValue().split(";");
   String filterKey = entry.getKey().split(";")[0];
   for (Element row : (ArrayList<Element>) tableList) {
     String headline = row.select("li.news_title").select("a").text();
     String published =
         FormatTime.getTime(row.select("li.news_other").text(), "\\d{4}-\\d{2}-\\d{2}");
     String digest = row.select("li.news_content").text();
     String link = row.select("li.news_title").select("a").attr("href");
     String body = Page.getAllHtmlContent(link);
     ArrayList<Integer> hits = new ArrayList<Integer>();
     boolean contentAccepted = Transmition.contentFilter(filterWords, body, filterKey, hits);
     // The time filter is consulted only for entries whose content already matched.
     if (!contentAccepted || !Transmition.timeFilter(published, this.spyHistory, headline)) {
       continue;
     }
     spyHistory.add(headline);
     Transmition.showDebug(
         siteType, headline, body, link, published, digest, siteName, hits.get(0));
     // Hand the article over to the transmission interface.
     Article article =
         Transmition.getArticle(
             siteType, headline, body, link, published, digest, siteName, filterKey, hits.get(0));
     Transmition.transmit(article);
   }
 }
  /**
   * Runs a search and returns the candidate results.
   *
   * <p>When the request ends up on a page whose location does not contain
   * {@code adultSearch.htm}, that page is treated as the single match and returned with its
   * {@code <title>} text as label and, if a poster path can be derived, a preview thumb.
   * Otherwise every matching row of the results table yields one result, duplicates removed.
   *
   * @param searchString complete URL of the search request
   * @return the direct hit as a one-element array, or the distinct results of the results page
   * @throws IOException if the page cannot be fetched
   */
  @Override
  public SearchResult[] getSearchResults(String searchString) throws IOException {
    Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get();
    // Read the location once and test it for null before it is dereferenced.
    String location = doc.location();
    boolean onSearchResultsPage = location != null && location.contains("adultSearch.htm");
    // found the movie without a search results page
    if (location != null && !onSearchResultsPage) {
      String idOfPage = getIDStringFromDocumentLocation(doc);
      String posterPath = getPosterPreviewPathFromIDString(idOfPage);
      Element titleElement = doc.select("title").first();
      String label = (titleElement != null) ? titleElement.text() : "";
      // Build the thumb only when there is a path to build it from.
      Thumb previewImage = (posterPath != null) ? new Thumb(posterPath) : null;
      SearchResult result = new SearchResult(location, label, previewImage);
      return new SearchResult[] {result};
    }

    Elements foundMovies = doc.select("table[width=690]:contains(Wish List) tr tbody:has(img)");
    LinkedList<SearchResult> searchList = new LinkedList<SearchResult>();

    for (Element movie : foundMovies) {
      Element link = movie.select("a").first();
      Element image = movie.select("img").first();
      if (link == null || image == null) {
        // A row without both a link and an image cannot be turned into a result.
        continue;
      }
      String urlPath = link.attr("href");
      String thumb = image.attr("src");
      String label = image.attr("alt");
      SearchResult searchResult = new SearchResult(urlPath, label, new Thumb(thumb));
      if (!searchList.contains(searchResult)) {
        searchList.add(searchResult);
      }
    }
    return searchList.toArray(new SearchResult[searchList.size()]);
  }
예제 #11
0
파일: MovieSpider.java 프로젝트: jimly/jca
 /**
  * Crawls the listing pages one after another, starting at the current value of {@code page},
  * and stores every {@code .video-item} found as a {@code Movie}.
  *
  * <p>An existing movie with the same name is updated; otherwise a new one with a generated id
  * is created. Crawling stops at the first page without any {@code .video-item}. A failure on a
  * single item is logged and does not stop the crawl.
  */
 private static void crawl() {
   // Iterate instead of recursing once per page so a long listing cannot exhaust the stack.
   while (true) {
     String url = url_tpl + (page++);
     Logger.info("正在抓取:%s", url);
     if (StringUtils.isBlank(url)) return;
     sleep();
     Document doc = Jsoup.parse(WS.url(url).get().body, url);
     Elements elements = doc.select(".video-item");
     if (elements.isEmpty()) return;
     for (Element element : elements) {
       try {
         Element link = element.select(">a").first();
         String cover = link.select("img").first().absUrl("src");
         String coverTitle = link.select(".v-update").first().html();
         String name = element.select(".v-desc .v-title a").first().html();
         Logger.info("正在抓取名称:%s", name);
         Movie movie = Movie.find("byName", name).first();
         if (movie == null) {
           movie = new Movie();
           movie.id = DBCounter.generateUniqueCounter(Movie.class) + "";
         }
         movie.name = name;
         movie.cover = cover;
         movie.cover_title = coverTitle;
         movie.details =
             getDetails(
                 movie,
                 "http://video.baidu.com/v?word=" + URLEncoder.encode("美剧 " + name, "GBK"));
         movie.save();
       } catch (Exception e) {
         // Missing markup on one item surfaces here; log it and move on to the next item.
         Logger.error(e.getMessage(), e);
       }
     }
   }
 }
예제 #12
0
파일: HtmlView.java 프로젝트: r4geee/JRHW
 /**
  * Renders the given vacancies into the HTML document returned by {@code getDocument()}.
  *
  * <p>The element with class {@code vacancy template} serves as the pattern: a copy of it with
  * the {@code style} attribute removed and the class reduced to {@code vacancy} is filled in for
  * every vacancy and inserted before the template. Rows left over from an earlier run
  * ({@code tr[class=vacancy]}) are removed first.
  *
  * @param vacancies vacancies to render, in output order
  * @return the HTML of the updated document
  * @throws IllegalStateException if the document cannot be loaded or has no template element
  */
 private String getUpdatedFileContent(List<Vacancy> vacancies) {
   try {
     Document document = getDocument();
     Element template = document.select("[class=vacancy template]").first();
     if (template == null) {
       throw new IllegalStateException("No element with class \"vacancy template\" found");
     }
     Element templateCopy = template.clone();
     templateCopy.removeAttr("style");
     templateCopy.removeAttr("class");
     templateCopy.addClass("vacancy");
     document.select("tr[class=vacancy]").remove();
     for (Vacancy vacancy : vacancies) {
       Element row = templateCopy.clone();
       row.select("[class=city]").first().text(vacancy.getCity());
       row.select("[class=companyName]").first().text(vacancy.getCompanyName());
       row.select("[class=salary]").first().text(vacancy.getSalary());
       Element titleLink = row.select("[class=title]").select("a[href]").first();
       titleLink.text(vacancy.getTitle());
       titleLink.attr("href", vacancy.getUrl());
       // Inserted rows carry only the class "vacancy", so the template stays the first match.
       template.before(row.outerHtml());
     }
     return document.html();
   } catch (IOException e) {
     e.printStackTrace();
     System.out.println("Some exception occurred");
     // There is no document to render; fail with the real cause instead of dereferencing null.
     throw new IllegalStateException("Could not load the vacancies document", e);
   }
 }
  /**
   * Downloads one news article and stores its parts through {@code savexml}.
   *
   * <p>The title is the first {@code strong} inside the scrolling container, the body is the
   * text of every {@code p} under {@code div#ozoom} joined with CRLF (and preceded by one), and
   * the images are the absolute {@code src} values of the {@code img} elements in centered table
   * cells. Any failure is printed and nothing is saved.
   *
   * @param NewsUrl source URL of the news article
   */
  public void getNewsInfo(String NewsUrl) {
    try {
      System.out.println(NewsUrl);
      Document page =
          Jsoup.connect(NewsUrl).userAgent("Mozilla").cookie("auth", "token").timeout(3000).get();
      Element container =
          page.select("div[style=height:800px; overflow-y:scroll; width:100%;]").first();

      // Article title.
      String title = container.select("strong").first().text();

      // Publication date of the article.
      String publishTime = getDate(NewsUrl);

      // Article body, one paragraph per line.
      StringBuilder body = new StringBuilder("\r\n");
      for (Element paragraph : container.select("div[id=ozoom]").select("p")) {
        body.append(paragraph.text()).append("\r\n");
      }

      // Image address list.
      List<String> imageUrls = new ArrayList<String>();
      for (Element image : container.select("td[align=center]").select("img[src]")) {
        imageUrls.add(image.attr("abs:src"));
      }

      savexml.format.source = NewsUrl;
      savexml.format.title = title;
      savexml.format.publishtime = publishTime;
      savexml.format.body = body.toString();
      savexml.format.img = imageUrls;
      savexml.save();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
예제 #14
0
  /**
   * Gets the news from "http://enib.net/" and appends them to this object's news list.
   *
   * <p>Every element with class {@code news} yields one entry: its {@code h1} text as name, its
   * {@code h2} text as information, and the paragraphs of its {@code markdown} blocks, one per
   * line, as description.
   *
   * @return this object's news list; unchanged when the page cannot be downloaded
   */
  public List<News> getNews() {
    Document doc;
    try {
      doc = Jsoup.connect("http://enib.net/").get();
    } catch (IOException e) {
      System.out.println("Can't load news");
      e.printStackTrace();
      // Nothing was downloaded, so there is nothing to parse.
      return this.news;
    }

    String lineSeparator = System.getProperty("line.separator");
    for (Element item : doc.getElementsByClass("news")) {
      String name = item.select("h1").text();
      String information = item.select("h2").text();
      StringBuilder description = new StringBuilder();
      for (Element paragraph : item.getElementsByClass("markdown").select("p")) {
        description.append(paragraph.text()).append(lineSeparator);
      }
      this.news.add(new News(name, information, description.toString()));
    }
    return this.news;
  }
    /**
     * Downloads the page at {@code params[0]} and adds one {@code Product} to {@code products}
     * for every {@code div.cell-border-css} found on it.
     *
     * <p>The name, product URL, price and image URL of the product read last remain in the
     * corresponding fields. A failed download is printed and adds nothing.
     *
     * @param params the page URL in the first position
     * @return always {@code null}
     */
    @Override
    protected String doInBackground(String... params) {
      String browserAgent =
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36";
      try {
        Document page = Jsoup.connect(params[0]).userAgent(browserAgent).timeout(9000).get();

        for (Element cell : page.select("div.cell-border-css")) {
          name = cell.select("h4.product-title").text();
          productUrl = cell.select("a.picture").get(0).absUrl("href");
          price = cell.select("span[itemprop=price]").text();
          imgUrl = cell.select("meta[itemprop=image]").attr("content");

          Log.d("INFO", name);
          products.add(new Product(name, price, imgUrl, productUrl));
        }
      } catch (IOException failure) {
        failure.printStackTrace();
      }

      return null;
    }
예제 #16
0
  /**
   * Reads a file and turns every {@code FORM} element in it into a {@code DataStructure}.
   *
   * <p>Each form is reduced to a three-level map: part {@code seqNum} → row {@code seqNum} →
   * graph {@code seqNum} → graph text. For forms "242" and "243" the rows of every part after the
   * first are merged into the map stored under part key "1" instead of being stored separately.
   *
   * @param xmlUrl path of the file to read (passed to {@code new File})
   * @param whatForm form code; only "242" and "243" change the behavior
   * @return one {@code DataStructure} per {@code FORM} element, in document order
   * @throws IOException if the file cannot be read
   */
  public ArrayList<DataStructure> parseXML(String xmlUrl, String whatForm) throws IOException {
    System.err.println("Creating an XML database");
    File file = new File(xmlUrl);
    Document document = Jsoup.parse(file, "UTF-8");

    String kopuk;
    ArrayList<DataStructure> xmlData = new ArrayList<>();
    Elements linkFrom = document.select("FORM");
    for (Element link : linkFrom) {
      Map<String, Map<String, Map<String, String>>> hashMapGlobal = new HashMap<>();
      // The form code is taken from the KO attribute when present, otherwise from KOPUK.
      if (link.hasAttr("KO")) {
        kopuk = link.attr("KO");
      } else {
        kopuk = link.attr("KOPUK");
      }
      Elements part;
      // The choice between PodPart and Part is made on the whole document, not on this form:
      // one PodPart anywhere makes every form use PodPart.
      // NOTE(review): confirm this is intended rather than link.select("PodPart").
      if (!document.select("PodPart").isEmpty()) {
        part = link.select("PodPart");
      } else {
        part = link.select("Part");
      }
      for (Element ichPart : part) {
        Map<String, Map<String, String>> hashMapPart = new HashMap<>();
        Elements rows = ichPart.select("Row");
        for (Element ichRow : rows) {
          // hashMapRow is created and cleared but never filled or read.
          Map<String, Map<String, String>> hashMapRow = new HashMap<>();
          Map<String, String> hashMapGrahp = new HashMap<>();
          Elements graph = ichRow.select("graph");
          for (Element ichGraph : graph) {
            hashMapGrahp.put(ichGraph.attr("seqNum"), ichGraph.text());
          }
          hashMapPart.put(ichRow.attr("seqNum"), hashMapGrahp);
          hashMapRow.clear();
        }
        if (Objects.equals(whatForm, "242") || Objects.equals(whatForm, "243")) {
          System.out.println("Find 242 243 ");
          if (hashMapGlobal.size() > 0) {
            System.out.println(" hashMapGlobal.size()>0 ");
            // Merge this part's graphs into the rows already stored under part key "1".
            // NOTE(review): this assumes the first part was stored under "1" and that it already
            // holds every row key seen here; otherwise one of the lookups below returns null and
            // the following put fails — confirm against the input files.
            Map<String, Map<String, String>> hashMapRowOld = hashMapGlobal.get("1");
            for (String rowKey : hashMapPart.keySet()) {
              Map<String, String> hashMapPartIN = hashMapPart.get(rowKey);
              Map<String, String> hashMapPartOldIN = hashMapRowOld.get(rowKey);
              for (String graphKey : hashMapPartIN.keySet()) {
                hashMapPartOldIN.put(graphKey, hashMapPartIN.get(graphKey));
              }
              hashMapRowOld.put(rowKey, hashMapPartOldIN);
            }
            hashMapGlobal.put("1", hashMapRowOld);
          } else hashMapGlobal.put(ichPart.attr("seqNum"), hashMapPart);

        } else {
          hashMapGlobal.put(ichPart.attr("seqNum"), hashMapPart);
        }
      }
      xmlData.add(new DataStructure(kopuk, hashMapGlobal));
    }
    return xmlData;
  }
예제 #17
0
  /**
   * Extracts the tomes and their chapters from a series page.
   *
   * <p>Every {@code span.left} inside the first {@code div.detail_list} is one chapter. Its tome
   * number is the integer following {@code "Vol "} in the nested {@code span.mr6} (0 when that is
   * absent or empty), and its chapter number is the last space-separated token of the link text.
   * Chapters sharing a tome number are attached to the same {@code Tome}.
   *
   * @param htmlDocument the series page
   * @param parent series the tomes belong to; its validity date is set to the current time
   * @return the tomes in order of first appearance, each holding its chapters
   * @throws NumberFormatException if a tome or chapter number is not numeric
   */
  @Override
  protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
    Date today = new Date();
    List<Tome> tomes = new LinkedList<>();

    Elements divChapters = htmlDocument.select("div.detail_list");
    if (!divChapters.isEmpty()) {
      for (Element span : divChapters.first().select("span.left")) {
        Element volumeLabel = span.select("span.mr6").first();
        String tomeNumberString =
            (volumeLabel == null) ? "" : StringUtils.substringAfter(volumeLabel.text(), "Vol ");
        int tomeNumber = 0;
        if (tomeNumberString != null && !tomeNumberString.trim().isEmpty()) {
          // Keep the parsed value: discarding it put every chapter into tome 0.
          tomeNumber = Integer.parseInt(tomeNumberString.trim());
        }

        Tome foundTome = null;
        for (Tome tome : tomes) {
          if (tomeNumber == tome.getNumber()) {
            foundTome = tome;
            break;
          }
        }

        if (foundTome == null) {
          foundTome = new Tome();
          foundTome.setNumber(tomeNumber);
          foundTome.setName("Tome " + tomeNumber);
          foundTome.setMustBeSaved(true);
          foundTome.setValidityDate(today);
          foundTome.setSerie(parent);
          tomes.add(foundTome);
        }

        Element link = span.select("a").first();

        Chapter chapter = new Chapter();
        chapter.setMustBeSaved(true);
        chapter.setUrl(link.attr("href"));
        String tempNumber = StringUtils.substringAfterLast(link.text(), " ");
        chapter.setNumber(Float.parseFloat(tempNumber));
        chapter.setName(span.text());
        chapter.setTome(foundTome);

        foundTome.addChapter(chapter);
      }
    }

    parent.setValidityDate(today);
    return tomes;
  }
예제 #18
0
 /**
  * Builds a {@code Post} from one story element.
  *
  * <p>The id comes from the element's {@code data-story-id} attribute; title, content and
  * description are the texts matched by the configured selectors; the URLs are whatever
  * {@code findUrls} derives from the post filled so far.
  *
  * @param element story element to read
  * @return the populated post
  * @throws NumberFormatException if the story id is missing or not a number
  */
 private Post parsePost(Element element) {
   Post parsed = new Post();

   String storyId = element.dataset().get("story-id");
   parsed.setId(Long.parseLong(storyId));

   String title = element.select(titleSelector).text();
   parsed.setTitle(title);

   String content = element.select(contentSelector).text();
   parsed.setContent(content);

   String description = element.select(descriptionSelector).text();
   parsed.setDescription(description);

   // The URLs are derived last because findUrls receives the post with its text fields set.
   parsed.setUrls(findUrls(parsed));
   return parsed;
 }
예제 #19
0
 /**
  * Fills a view model from a detail page.
  *
  * <p>A page containing {@code select#SeasonSelection} is a series: the series id is read from
  * the text after {@code SeriesID=} in the select's {@code rel} attribute and every option
  * becomes a season whose episodes are the comma-separated values of its own {@code rel}.
  * Any other page is a movie: every entry of {@code ul#HosterList} is expanded into one
  * {@code Host} per mirror, and only enabled hosts are kept. In both cases the IMDB id is set
  * from the rating link when there is one.
  *
  * @param doc the detail page
  * @param item view model to fill
  * @return {@code item}
  */
 private ViewModel parseDetail(Document doc, ViewModel item) {
   Elements seasonSelect = doc.select("select#SeasonSelection");
   if (!seasonSelect.isEmpty()) {
     item.setType(ViewModel.Type.SERIES);
     String marker = "SeriesID=";
     String rel = seasonSelect.attr("rel");
     rel = rel.substring(rel.indexOf(marker) + marker.length());
     item.setSeriesID(Integer.valueOf(rel));
     // Fill seasons and episodes
     List<Season> seasonList = new ArrayList<Season>();
     for (Element option : doc.select("select#SeasonSelection > option")) {
       Season entry = new Season();
       entry.id = Integer.valueOf(option.val());
       entry.name = option.text();
       entry.episodes = option.attr("rel").split(",");
       seasonList.add(entry);
     }
     item.setSeasons(seasonList.toArray(new Season[seasonList.size()]));
   } else {
     item.setType(ViewModel.Type.MOVIE);
     List<Host> hostlist = new ArrayList<Host>();
     for (Element host : doc.select("ul#HosterList").select("li")) {
       // The hoster id is the number following "MirStyle" in one of the class names.
       int hosterId = 0;
       for (String cssClass : host.classNames()) {
         if (cssClass.startsWith("MirStyle")) {
           hosterId = Integer.valueOf(cssClass.substring("MirStyle".length()));
         }
       }
       String name = host.select("div.Named").text();
       String count = host.select("div.Data").text();
       // The mirror total is the number between the first "/" and the next space.
       int mirrorTotal = 1;
       int slash = count.indexOf("/");
       if (slash >= 0) {
         count = count.substring(slash + 1, count.indexOf(" ", slash));
         mirrorTotal = Integer.valueOf(count);
       }
       for (int mirror = 1; mirror <= mirrorTotal; mirror++) {
         Host h = Host.selectById(hosterId);
         h.setName(name);
         h.setMirror(mirror);
         if (h.isEnabled()) {
           hostlist.add(h);
         }
       }
     }
     item.setMirrors(hostlist.toArray(new Host[hostlist.size()]));
   }

   String imdb = doc.select("div.IMDBRatingLinks > a").attr("href").trim();
   if (!TextUtils.isEmpty(imdb)) {
     item.setImdbId(imdb.replace("/", ""));
   }
   return item;
 }
예제 #20
0
  /**
   * Walks the table rows of a page and reads title, source, points, user and date markup from
   * each of them.
   *
   * <p>Extraction with jsoup is not finished: the values read are not turned into {@code Item}
   * objects, so the returned list is always empty.
   *
   * @param doc page to scan
   * @return a new, empty list
   */
  public static List<Item> extractItem(Document doc) {
    List<Item> itemList = new ArrayList<Item>();
    for (Element row : doc.select("tr")) {
      Element titleLink = row.select(".title a").first();
      if (titleLink == null) {
        continue;
      }
      String titleStr = titleLink.text().trim();
      String urlStr = titleLink.attr("href").trim();

      Element comHead = row.select(".comhead").first();
      if (comHead == null) {
        continue;
      }
      String comheadStr = comHead.text().trim();

      Element score = row.select("span[id^=score_]").first();
      if (score == null) {
        continue;
      }
      String pointsStr = score.text();
      if (pointsStr == null) {
        continue;
      }
      // The score text must consist of exactly two space-separated tokens, number first.
      String[] scoreTokens = pointsStr.split(" ");
      if (scoreTokens.length != 2) {
        continue;
      }
      int points;
      try {
        points = Integer.parseInt(scoreTokens[0]);
      } catch (NumberFormatException ignored) {
        // A non-numeric score is treated like a negative one and skips the row below.
        points = -1;
      }
      if (points < 0) {
        continue;
      }

      Element userLink = row.select("a[href^=user]").first();
      if (userLink == null) {
        continue;
      }
      String user = userLink.text().trim();

      Element dateElement = row.select(".subtext").first();
    }

    return itemList;
  }
예제 #21
0
  /**
   * Turns the category listing of a page into a three-level {@code CategoryEntity} tree and adds
   * every entity to the page.
   *
   * <p>Each {@code div.txt-list-category-v2} is a top-level category named by its {@code h3}
   * and coded by its {@code id}. Inside it, a link without {@code href} starts a second-level
   * category and a link with {@code href} is a third-level category under the most recent
   * second-level one. Links with empty text are ignored.
   *
   * @param page result holder whose resource is the parsed document
   * @throws RuntimeException if a third-level link appears before any second-level one
   */
  @Override
  public void process(ResultItems page) {
    Document doc = (Document) page.getResource();

    for (Element section : doc.select("div.txt-list-category-v2")) {
      CategoryEntity ancestor =
          new CategoryEntity()
              .setName(section.select("h3").text())
              .setSite(SiteName.Taobao)
              .setCode(section.attr("id"));
      getLogger().trace(ancestor);
      page.addItem(ancestor);

      CategoryEntity parent = null;
      for (Element link : section.select("a")) {
        boolean isGroupHeading = link.attr("href").isEmpty();
        if (isGroupHeading) {
          String name = link.text().trim();
          // Skip headings that are empty or start with a non-breaking space (code point 160).
          if (name.isEmpty() || name.charAt(0) == 160) {
            continue;
          }
          parent = new CategoryEntity().setName(name).setSite(SiteName.Taobao).setParent(ancestor);
          getLogger().trace(parent);
          page.addItem(parent);
          continue;
        }

        String url = link.absUrl("href");
        try {
          url = java.net.URLDecoder.decode(url, "utf-8");
        } catch (UnsupportedEncodingException e) {
          throw new RuntimeException(url, e);
        }
        String name = link.text().trim();
        if (name.isEmpty()) {
          continue;
        }
        CategoryEntity grand =
            new CategoryEntity()
                .setName(name)
                .setUrl(url)
                .setSite(SiteName.Taobao)
                .setParent(parent);
        if (parent == null) {
          throw new RuntimeException("no parent of " + grand);
        }
        getLogger().trace(grand);
        page.addItem(grand);
      }
    }
  }
  /**
   * Parses one sub-category page.
   *
   * <p>A page with a {@code div.products-list} is a product page: the lower-cased link of every
   * product is collected and, on the first call only, the further pages of the same category
   * are looked up. Any other page is scanned for a {@code div.category-list}; each of its
   * {@code h2.category-title} entries becomes a new category whose parent is
   * {@code subCategory}, registered in the result map under {@code subCategory}.
   *
   * @param subCategory category the page belongs to
   * @param premierAppel {@code true} on the first call for this category, which triggers the
   *     search for additional pages
   * @param sousCategorie the downloaded page
   * @return the article URLs, extra pages and nested categories that were found
   */
  private static PharmacieResultatParsingSousCategorie parserSousCategorie(
      Category subCategory, boolean premierAppel, Document sousCategorie) {
    PharmacieResultatParsingSousCategorie resultat = new PharmacieResultatParsingSousCategorie();

    Element productList = sousCategorie.select("div.products-list").first();
    if (productList != null) {
      for (Element productInfo : productList.select("div.info_product")) {
        Element productInfoLink = productInfo.select("a.name-link").first();
        if (productInfoLink != null) {
          String productInfoLinkUrl = productInfoLink.attributes().get("href").toLowerCase();
          resultat.getListeUrlArticlesTrouves().add(productInfoLinkUrl);
        }
      }
      if (premierAppel) {
        resultat
            .getListeAutrePagesAParserMemeCategorie()
            .addAll(verificationAutrePageAParser(sousCategorie));
      }
      return resultat;
    }

    Element categoryList = sousCategorie.select("div.category-list").first();
    if (categoryList == null) {
      return resultat;
    }
    for (Element nouvelleSousCategorie : categoryList.select("h2.category-title")) {
      Element categoryLink = nouvelleSousCategorie.select("a").first();
      if (categoryLink == null) {
        // A title without a link gives no URL to follow.
        continue;
      }
      String nouvelleSousCategorieLink = categoryLink.attributes().get("href").toLowerCase();
      String nouvelleSousCategorieName = nouvelleSousCategorie.text();

      // Create the new category.
      Category newCategory = new Category(nouvelleSousCategorieName, nouvelleSousCategorieLink);
      newCategory.setParentCategory(subCategory);

      // Register the same instance in both cases so every stored category has its parent set.
      if (!resultat.getMapAutresSousCategories().containsKey(subCategory)) {
        List<Category> nouvelleListeCategorie = new ArrayList<Category>();
        nouvelleListeCategorie.add(newCategory);
        resultat.getMapAutresSousCategories().put(subCategory, nouvelleListeCategorie);
      } else {
        resultat.getMapAutresSousCategories().get(subCategory).add(newCategory);
      }
    }

    return resultat;
  }
예제 #23
0
 /**
  * Converts the cards of the downloaded document into list items and refreshes the adapter.
  *
  * <p>Each card contributes the text of its title span and of its time span.
  *
  * @param s result of the background step; only passed on to the superclass
  */
 @Override
 protected void onPostExecute(String s) {
   super.onPostExecute(s);

   String cardSelector =
       "div.wrap.ch_clip._cardArea div.cds_area._infiniteCardArea div.cds._MM_CARD";
   String titleSelector = "div.cds_type.uio_thumb dl.cds_info dt.title h3 span";
   String timeSelector = "div.cds_type.uio_thumb dl.cds_info dd.meta span.time";

   for (Element card : doc.select(cardSelector)) {
     String cardTitle = card.select(titleSelector).text();
     String cardTime = card.select(timeSelector).text();
     listCardItems.add(new Item_CardList_Ted(cardTitle, cardTime));
   }
   cardItemAdapter.notifyDataSetChanged();
 }
예제 #24
0
  /**
   * Reads the news entries from the nested listing table of a page.
   *
   * <p>The rows are processed in groups of three by position: the second row of a group supplies
   * title, URL and (when present) domain, the third supplies id, points and comment count and
   * completes the entry; the first is ignored. Scanning stops at the first title row that has
   * fewer than two {@code .title} cells.
   *
   * @param doc page to scan
   * @return the entries completed before scanning stopped, in page order
   */
  public List<News> scrape(Document doc) {
    Elements rows =
        doc.select("body > center > " + "table > tbody > tr > td > " + "table > tbody > tr");

    List<News> newsList = new ArrayList<>();
    News.Builder builder = null;

    int rowIndex = 0;
    for (Element tr : rows) {
      int position = rowIndex % 3;

      if (position == 1) {
        Elements titles = tr.select(".title");
        if (titles.size() < 2) {
          // Not a regular title row: the listing has ended.
          return newsList;
        }
        builder = new News.Builder();

        Element titleCell = titles.get(1);
        Element anchor = titleCell.select("a").first();
        builder.title = anchor.text();
        builder.url = getUrl(anchor);
        Elements comhead = titleCell.select(".comhead");
        if (!comhead.isEmpty()) {
          builder.domain = extract(comhead.first().text(), DOMAIN);
        }
      } else if (position == 2) {
        assert builder != null;
        Element subtext = tr.select(".subtext").first();
        Elements links = subtext.select("a");

        // Id, points and comment count are only available when there is a second link.
        if (links.size() > 1) {
          Element comments = links.get(1);
          builder.id = getId(comments);
          builder.points = getPoints(subtext);
          builder.commentsNum = getCommentsNum(comments);
        }

        newsList.add(builder.build());
      }

      rowIndex++;
    }

    return newsList;
  }
예제 #25
0
  /**
   * Converts every {@code table} element of the given HTML into a JSON object.
   *
   * <p>Each object has two keys: {@code "data"}, an array holding one object per table row that
   * contains {@code td} cells, and {@code "date"}, the text of the element immediately preceding
   * the table (empty when there is none). Within a row object the i-th cell is stored under
   * {@code labels[i]}: a cell containing an {@code img} contributes the last {@code '/'}-separated
   * segment of the image's {@code src}; any other cell contributes its text, except that the
   * {@code "Time"} cell is cut at its first {@code '-'}. Cells beyond the end of {@code labels}
   * are ignored.
   *
   * @param html markup to parse
   * @param labels keys for the cells of a row, by column position
   * @return one JSON object per table; if a {@link JSONException} occurs, the objects completed
   *     up to that point
   */
  public JSONArray toFourDayJSON(String html, String[] labels) {
    Document doc = Jsoup.parse(html);
    JSONArray dates = new JSONArray();

    try {
      for (Element table : doc.select("table")) {
        JSONArray rowsJson = new JSONArray();

        for (Element row : table.select("tr")) {
          Elements cells = row.select("td");
          if (cells.isEmpty()) {
            continue;
          }

          JSONObject details = new JSONObject();
          // Walk the cells by position: looking each cell up again with indexOf rescans the row
          // for every cell and relies on element equality to find the right column.
          int labelled = Math.min(cells.size(), labels.length);
          for (int i = 0; i < labelled; i++) {
            Element cell = cells.get(i);
            String label = labels[i];
            Elements img = cell.select("img");

            if (img.isEmpty()) {
              String text = cell.text();
              if ("Time".equals(label)) {
                // Keep only the part before the first '-'.
                int dash = text.indexOf('-');
                details.put(label, dash >= 0 ? text.substring(0, dash) : text);
              } else {
                details.put(label, text);
              }
            } else {
              // split drops trailing empty segments, so a src made only of '/' yields no tokens.
              String[] tokens = img.get(0).attr("src").split("/");
              details.put(label, tokens.length == 0 ? "" : tokens[tokens.length - 1]);
            }
          }
          rowsJson.put(details);
        }

        Element heading = table.previousElementSibling();
        JSONObject dateItem = new JSONObject();
        dateItem.put("data", rowsJson);
        dateItem.put("date", heading != null ? heading.text() : "");
        dates.put(dateItem);
      }
    } catch (JSONException e) {
      e.printStackTrace();
    }

    return dates;
  }
예제 #26
0
  /**
   * Loads a search result page and records one entity per listed item.
   *
   * <p>For every {@code map-list-item} block the category text, the name link, the first details
   * paragraph and the {@code data-lng}/{@code data-lat} attributes are read, and the resulting
   * entity is appended to {@code pEntities}.
   *
   * @param pUrl address of the search result page to parse
   */
  private void parseSearchResults(String pUrl) {
    LOGGER.info("Started parsing: " + pUrl);

    Document page = ParserUtils.connectGetUrl(ParserUtils.getUri(pUrl).toASCIIString());
    page.setBaseUri(DEFAULT_VSP_URL);

    for (Element item : page.select("div[class*=map-list-item]")) {
      PersistentEntity entity = new PersistentEntity();
      Elements info = item.select("div[class*=info-content]");

      // The category text is the label; its part before the first '/' is the industry.
      String category = info.select("p[class*=establishment-category]").first().ownText();
      LOGGER.debug(category);
      entity.setIndustry(new Utf8(category.split("/")[0]));
      entity.setLabel(new Utf8(category));

      // The name link provides both the "same as" reference and the entity name.
      EylloLink nameLink =
          ParserUtils.detectUrl(info.select("p[class*=establishment-name]").select("a").first());
      if (nameLink != null) {
        String target = DEFAULT_VSP_URL + nameLink.getLinkHref();
        LOGGER.debug(target);
        entity.putToSameAs(new Utf8(target), new Utf8(nameLink.getLinkText()));
        entity.setName(new Utf8(nameLink.getLinkText()));
      }

      // The first details paragraph feeds the telephone (own text) and the address (full text).
      PersistentPoint point = new PersistentPoint();
      Element firstDetail = item.select("div[class*=establishment-details]").select("p").get(0);
      entity.addToTelephones(new Utf8(firstDetail.ownText()));
      point.setAddress(new Utf8(firstDetail.text()));

      String lng = item.attr("data-lng");
      String lat = item.attr("data-lat");
      if (!lng.isEmpty() && !lat.isEmpty()) {
        // Stored as [lon, lat]; this order is what GeoJSON expects.
        point.addToCoordinates(Double.parseDouble(lng));
        point.addToCoordinates(Double.parseDouble(lat));
        point.setAccuracy(EylloLocation.GEOCODER_VERIF_ACC_HIGH);
      }

      entity.setPersistentpoint(point);
      entity.addToScenarioId(getScenarioId());
      this.pEntities.add(entity);
    }

    LOGGER.info("Completed getting basic information from entities.");
  }
예제 #27
0
  /**
   * Prints heading, first two paragraphs and image URL of every list item on the fetched page and
   * downloads each image on its own thread.
   *
   * <p>Images are written below {@code f:/} under the file name taken from the end of the image
   * URL. After starting all downloads the method blocks on {@code System.in.read()}.
   *
   * @throws Exception if the page cannot be fetched or a list item lacks an expected element
   */
  public void test1() throws Exception {
    Document dom = Jsoup.connect("http://book.douban.com/latest?icn=index-latestbook-all").get();
    // jQuery-style selection: list items under #content, minus those with class "clear".
    Elements es = dom.select("#content li").not(".clear");
    for (Element item : es) {
      String title = item.select("h2").get(0).text();
      System.out.println("title:" + title);

      Elements paragraphs = item.select("p");
      String auth = paragraphs.get(0).text();
      System.out.println("auth:" + auth);
      String text = paragraphs.get(1).text();
      System.out.println(text);

      final String url = item.select("img").get(0).attr("src");
      System.out.println(url);

      new Thread() {
        @Override
        public void run() {
          HttpURLConnection con = null;
          try {
            String fileName = url.substring(url.lastIndexOf("/") + 1);
            con = (HttpURLConnection) new URL(url).openConnection();
            con.setConnectTimeout(3000);
            con.setRequestMethod("GET");
            con.setDoInput(true);
            con.connect();
            if (con.getResponseCode() == 200) {
              // try-with-resources closes both streams even when the copy fails midway.
              try (InputStream in = con.getInputStream();
                  OutputStream out = new FileOutputStream("f:/" + fileName)) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = in.read(buffer)) != -1) {
                  out.write(buffer, 0, len);
                }
              }
            }
          } catch (Exception ex) {
            ex.printStackTrace();
          } finally {
            // Release the connection on every path, not only after a successful download.
            if (con != null) {
              con.disconnect();
            }
          }
        }
      }.start();

      System.out.println("-------------------");
    }
    System.in.read();
  }
예제 #28
0
  /**
   * Selecting from the first {@code .head} element, {@code "div p .first"} must match exactly the
   * "Hello" span, while {@code "body p .first"} must match nothing.
   */
  @Test
  public void deeperDescendant() {
    String html =
        "<div class=head><p><span class=first>Hello</div><div class=head><p class=first><span>Another</span><p>Again</div>";
    Element firstHead = Jsoup.parse(html).getElementsByClass("head").first();

    Elements matches = firstHead.select("div p .first");
    assertEquals(1, matches.size());
    Element onlyMatch = matches.first();
    assertEquals("Hello", onlyMatch.text());
    assertEquals("span", onlyMatch.tagName());

    // Same query, but anchored at "body" instead of "div".
    int aboveRootCount = firstHead.select("body p .first").size();
    assertEquals(0, aboveRootCount);
  }
예제 #29
0
  /**
   * Reads the ingredient rows of the first {@code table[class=zutaten]} in the document.
   *
   * <p>A row without an amount cell yields an ingredient with an empty amount; a row without a
   * name cell is skipped.
   *
   * @param doc parsed page to read from
   * @return one {@code Ingredient} per usable {@code tr[class=ingredient]} row, in document
   *     order; empty when the document has no such table
   */
  private List<Ingredient> grabIngredients(Document doc) {
    List<Ingredient> ingredientsList = new ArrayList<Ingredient>();

    Element table = doc.select("table[class=zutaten]").first();
    if (table == null) {
      // No ingredient table in this document: nothing to collect.
      return ingredientsList;
    }

    for (Element row : table.select("tr[class=ingredient]")) {
      Element amountCell = row.select("td[class=nobr amount]").first();
      Element nameCell = row.select("td[class=name]").first();
      if (nameCell == null) {
        continue;
      }
      String amount = amountCell != null ? amountCell.text() : "";
      ingredientsList.add(new Ingredient(amount, nameCell.text()));
    }

    return ingredientsList;
  }
예제 #30
0
 /**
  * Downloads the page at {@code params[0]} and appends one entry per title/subtext pair to
  * {@code mNews}.
  *
  * <p>Every second {@code td.title} cell (the 2nd, 4th, ...) is paired, in order, with the next
  * {@code td.subtext} cell; pairing stops when either runs out, and a title cell without a link
  * is skipped. When title cells were found and the task type is {@code TYPE_REFRESH}, existing
  * entries are cleared first. The discussion URL is set only when the subtext cell holds at least
  * two links. If at least two links have an {@code href} starting with {@code /x?fnid=}, the
  * second one is stored in {@code mMoreURLPath}.
  *
  * @param params {@code params[0]} is the URL to load
  * @return {@code true} when the page was fetched and processed, {@code false} on an
  *     {@link IOException}
  */
 @Override
 protected Boolean doInBackground(String... params) {
   try {
     Document doc = Jsoup.connect(params[0]).get();
     Element body = doc.body();
     Elements titleEs = body.select("td.title");
     Elements subTitleEs = body.select("td.subtext");

     if (!titleEs.isEmpty()) {
       if (mType == TYPE_REFRESH) {
         mNews.clear();
       }

       Iterator<Element> subIt = subTitleEs.iterator();
       // Start at the second title cell and step by two; stop once no subtext cell is left to
       // pair with instead of running the iterator past its end.
       for (int i = 1; i < titleEs.size() && subIt.hasNext(); i += 2) {
         Element titleCell = titleEs.get(i);
         Element subE = subIt.next();

         Element link = titleCell.select("a").first();
         if (link == null) {
           // Nothing to build an entry from; the subtext cell was still consumed above so the
           // following pairs stay aligned.
           continue;
         }

         Elements spanTag = titleCell.select("span.comhead");
         NewEntity entity =
             new NewEntity(
                 link.attr("href"),
                 link.text(),
                 spanTag.isEmpty() ? null : spanTag.get(0).text(),
                 subE.html());

         Elements subEa = subE.select("a");
         if (subEa.size() > 1) {
           entity.setDiscussUrl(subEa.get(1).attr("href"));
         }
         // Log.i(LOG_TAG, entity.toString());
         mNews.add(entity);
       }
     }

     Elements more = doc.getElementsByAttributeValueStarting("href", "/x?fnid=");
     // The second match is read, so at least two matches are required.
     if (more.size() > 1) {
       mMoreURLPath = more.get(1).attr("href");
     }
     return true;
   } catch (IOException e) {
     Log.e(LOG_TAG, "", e);
     return false;
   }
 }