/** * 获取页面的内容,存放到Document里。 * * @return */ private Document getHtml() { Document doc = null; if (this.url == null) { return null; } else { try { doc = Jsoup.connect(url).data("query", "Java").userAgent("Mozilla").timeout(3000).get(); } catch (IOException e) { // System.out.println("error in get html code"); logger.infoLog(url + " -- not exist!"); } } return doc; }
/** * 对页面内容进行爬取 * * @param dict * @return * @throws SQLException * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ public PersonInfo crawler() throws InstantiationException, IllegalAccessException, ClassNotFoundException, SQLException { if (this.url == null) { logger.warnLog("empty url"); return null; } // 获得页面的Html代码 Document pageDoc = this.getHtml(); // 这里要加上用户不存在的页面判断 if (pageDoc == null) { return null; } PersonInfo info = convertDoc2PersonInfo(pageDoc); if (info == null) { return null; } else { return info; } }
/** * 把网页Html代码转换为PersonInfo对象 * * @param doc * @return */ private PersonInfo convertDoc2PersonInfo(Document doc) { PersonInfo info = new PersonInfo(); if (doc == null) { return null; } Elements obssinHtml = doc.select("div[class=\"obssin\""); // 用户id info.setUserId(id); // 用户主页url info.setHomePageUrl(url); // 关注的人数 String fellowPeopleNum = getFellowPeopleNum(doc); info.setFellowPeopleNum(fellowPeopleNum); if (fellowPeopleNum == null) { logger.infoLog("User:"******": empty fellow people"); } // 被关注的人数 String fellowedPeopleNum = getFellowedPeopleNum(doc); info.setFellowedPeopleNum(fellowedPeopleNum); if (fellowedPeopleNum == null) { logger.infoLog("User:"******": empty fellowed people"); } // 常去的小组 List<String> oftenGroup = getOftenGroup(doc); info.setOftenGroup(oftenGroup); if (oftenGroup == null) { logger.infoLog("User:"******": empty often group"); } // 获取在读的书列表 List<String> readingBook = getReadingBook(obssinHtml); info.setReadingBook(readingBook); if (readingBook == null) { logger.infoLog("User:"******": empty reading book"); } // 获取想读的书列表 List<String> wantBook = getWantBook(obssinHtml); info.setWantBook(wantBook); if (wantBook == null) { logger.infoLog("User:"******": empty want book"); } // 想看的电影 List<String> wantMovie = getWantMovie(obssinHtml); info.setWantMovie(wantMovie); if (wantMovie == null) { logger.infoLog("User:"******": empty want movie"); } // 看过的电影 List<String> watchedMovie = getWatchedMovie(obssinHtml); info.setWatchedMovie(watchedMovie); if (watchedMovie == null) { logger.infoLog("User:"******": empty watched movie"); } // 在听的歌 List<String> listeningMusic = getListeningMusic(obssinHtml); info.setListeningMusic(listeningMusic); if (listeningMusic == null) { logger.infoLog("User:"******": empty listening music"); } // 想听的歌 List<String> wantMusic = getWantMusic(obssinHtml); info.setWantMusic(wantMusic); if (wantMusic == null) { logger.infoLog("User:"******": empty want music"); } // 加入时间 String joinDate = getJoinDate(doc); info.setJoinDate(joinDate); if (joinDate == null) { logger.infoLog("User:"******": empty join date"); } // 性别(暂时无法获取,设置默认值) info.setSex("n"); // 年龄(暂时无法获取,设置默认值) info.setAge("0"); // 姓名 String name = getName(doc); info.setUserName(name); if (name == null) { logger.infoLog("User:"******": empty name"); } return info; }