/**
 * Crawls the paged listing, one page after another, until a page yields no
 * {@code .video-item} elements.
 *
 * <p>Each page URL is built from {@code url_tpl} plus the current value of the {@code page}
 * counter, which is incremented for every page requested. For every item on a page the movie
 * with the same name is looked up (or a new one is created with an id taken from
 * {@code DBCounter}), its name, cover, cover title and details are refreshed, and it is saved.
 * A failure while handling one item is logged and does not stop the remaining items.
 *
 * <p>Pages are processed in a loop rather than by self-recursion so that the call stack does
 * not grow with the number of pages crawled.
 */
private static void crawl() {
    // The concatenated URL always contains the page number and therefore can never be blank;
    // the template itself is what has to be validated before any request is made.
    if (StringUtils.isBlank(url_tpl)) {
        return;
    }

    while (true) {
        String url = url_tpl + (page++);
        Logger.info("正在抓取:%s", url);
        sleep();

        Document doc = Jsoup.parse(WS.url(url).get().body, url);
        Elements elements = doc.select(".video-item");
        if (elements.isEmpty()) {
            // A page without items ends the crawl.
            return;
        }

        for (Element element : elements) {
            try {
                Element link = element.select(">a").first();
                String cover = link.select("img").first().absUrl("src");
                String coverTitle = link.select(".v-update").first().html();
                String detailUrl = link.absUrl("href");
                String name = element.select(".v-desc .v-title a").first().html();
                Logger.info("正在抓取名称:%s", name);

                Movie movie = Movie.find("byName", name).first();
                if (movie == null) {
                    movie = new Movie();
                    movie.id = DBCounter.generateUniqueCounter(Movie.class) + "";
                }
                movie.name = name;
                movie.cover = cover;
                movie.cover_title = coverTitle;
                // Details come from a search for the movie name, not from detailUrl.
                movie.details =
                        getDetails(
                                movie,
                                "http://video.baidu.com/v?word="
                                        + URLEncoder.encode("美剧 " + name, "GBK"));
                movie.save();
            } catch (Exception e) {
                // Best effort: a malformed item (e.g. a missing child element) must not
                // abort the rest of the page.
                Logger.error(e.getMessage(), e);
            }
        }
    }
}
private static List<MovieItem> getDetails(Movie movie, String url) { sleep(); List<MovieItem> result = new ArrayList<MovieItem>(); Logger.info("正在抓取Detail界面:%s", url); String body = WS.url(url).get().body; String rating = null; String[] blocks = StringUtils.substringsBetween(body, "T.object.extend(", ", {\"alias\":\""); if (blocks == null) { System.out.println(url); System.out.println(body); return result; } for (String block : blocks) { try { Map map = mapper.readValue(block, Map.class); Object id = map.get("id"); List<Map> sites = (List<Map>) map.get("sites"); Map site = sites.get(0); Object from = site.get("site_name"); List<Map> es = (List<Map>) site.get("episode"); if ("tudou.com".equals(site.get("site_url")) && sites.size() > 1) { // 土豆不支持html5 site = sites.get(1); from = site.get("site_name"); String b = WS.url( "http://video.baidu.com/htvplaysingles/?id=" + id + "&site=" + site.get("site_url")) .get() .body; es = (List<Map>) mapper.readValue(b, Map.class).get("videos"); } if ("tudou.com".equals(site.get("site_url"))) { return result; } List<Episode> episodes = new ArrayList<Episode>(); for (Map e : es) { String webUrl = e.get("url").toString(); if (webUrl.contains("baidu.com")) { continue; } Episode episode = new Episode(); episode.e = NumberUtils.toInt(e.get("episode").toString()); episode.url = webUrl; episode.v = ""; episodes.add(episode); } Collections.sort( episodes, new Comparator<Episode>() { public int compare(Episode o1, Episode o2) { return o1.e < o2.e ? 
-1 : 1; } }); if (rating == null && map.get("rating") != null && StringUtils.isNotBlank(map.get("rating").toString())) { rating = map.get("rating").toString(); } Object season = map.get("season"); if (season == null || "0".equals(season)) season = "1"; MovieItem item = new MovieItem(); item.brief = Objects.de4(map.get("brief"), ""); item.from = Objects.de4(from, ""); item.season = Objects.de4(season, ""); item.actors = (List<String>) map.get("actor"); item.episodes = episodes; result.add(item); } catch (IOException e) { Logger.error(e.getMessage(), e); } } if (rating == null) rating = (8 + Math.floor(RandomUtils.nextFloat() * 10) / 10) + ""; movie.rate = rating; Collections.sort( result, new Comparator<MovieItem>() { public int compare(MovieItem o1, MovieItem o2) { return NumberUtils.toInt(o1.season) < NumberUtils.toInt(o2.season) ? 1 : -1; } }); return result; }