コード例 #1
0
 /**
  * Fetches the page at {@code url} and parses it into a jsoup {@code Document}.
  *
  * <p>The request is sent with the data parameter {@code query=Java}, the user agent
  * {@code "Mozilla"} and a timeout of 3000 (milliseconds, per jsoup's {@code timeout(int)}).
  *
  * @return the parsed document, or {@code null} when {@code url} is {@code null} or the
  *     request fails with an {@link IOException}
  */
 private Document getHtml() {
   // Nothing to fetch without a URL.
   if (this.url == null) {
     return null;
   }
   try {
     return Jsoup.connect(url)
         .data("query", "Java")
         .userAgent("Mozilla")
         .timeout(3000)
         .get();
   } catch (IOException e) {
     // Any I/O failure is reported as a missing page; the caller only sees null.
     logger.infoLog(url + " -- not exist!");
     return null;
   }
 }
コード例 #2
0
  /**
   * Crawls the page at {@code url} and converts it into a {@code PersonInfo}.
   *
   * <p>Logs a warning and returns {@code null} when {@code url} is {@code null}. Also returns
   * {@code null} when the page could not be fetched or could not be converted.
   *
   * @return the extracted person information, or {@code null} as described above
   * @throws InstantiationException declared for callers; not thrown by the code visible here
   * @throws IllegalAccessException declared for callers; not thrown by the code visible here
   * @throws ClassNotFoundException declared for callers; not thrown by the code visible here
   * @throws SQLException declared for callers; not thrown by the code visible here
   */
  public PersonInfo crawler()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException, SQLException {
    if (this.url == null) {
      logger.warnLog("empty url");
      return null;
    }

    // Fetch the page's HTML.
    final Document fetchedPage = this.getHtml();

    // TODO (carried over from the original): add detection of "user does not exist" pages here.
    return (fetchedPage == null) ? null : convertDoc2PersonInfo(fetchedPage);
  }
コード例 #3
0
  /**
   * 把网页Html代码转换为PersonInfo对象
   *
   * @param doc
   * @return
   */
  private PersonInfo convertDoc2PersonInfo(Document doc) {
    PersonInfo info = new PersonInfo();
    if (doc == null) {
      return null;
    }

    Elements obssinHtml = doc.select("div[class=\"obssin\"");

    // 用户id
    info.setUserId(id);
    // 用户主页url
    info.setHomePageUrl(url);

    // 关注的人数
    String fellowPeopleNum = getFellowPeopleNum(doc);
    info.setFellowPeopleNum(fellowPeopleNum);
    if (fellowPeopleNum == null) {
      logger.infoLog("User:"******": empty fellow people");
    }

    // 被关注的人数
    String fellowedPeopleNum = getFellowedPeopleNum(doc);
    info.setFellowedPeopleNum(fellowedPeopleNum);
    if (fellowedPeopleNum == null) {
      logger.infoLog("User:"******": empty fellowed people");
    }

    // 常去的小组
    List<String> oftenGroup = getOftenGroup(doc);
    info.setOftenGroup(oftenGroup);
    if (oftenGroup == null) {
      logger.infoLog("User:"******": empty often group");
    }

    // 获取在读的书列表
    List<String> readingBook = getReadingBook(obssinHtml);
    info.setReadingBook(readingBook);
    if (readingBook == null) {
      logger.infoLog("User:"******": empty reading book");
    }

    // 获取想读的书列表
    List<String> wantBook = getWantBook(obssinHtml);
    info.setWantBook(wantBook);
    if (wantBook == null) {
      logger.infoLog("User:"******": empty want book");
    }

    // 想看的电影
    List<String> wantMovie = getWantMovie(obssinHtml);
    info.setWantMovie(wantMovie);
    if (wantMovie == null) {
      logger.infoLog("User:"******": empty want movie");
    }

    // 看过的电影
    List<String> watchedMovie = getWatchedMovie(obssinHtml);
    info.setWatchedMovie(watchedMovie);
    if (watchedMovie == null) {
      logger.infoLog("User:"******": empty watched movie");
    }

    // 在听的歌
    List<String> listeningMusic = getListeningMusic(obssinHtml);
    info.setListeningMusic(listeningMusic);
    if (listeningMusic == null) {
      logger.infoLog("User:"******": empty listening music");
    }

    // 想听的歌
    List<String> wantMusic = getWantMusic(obssinHtml);
    info.setWantMusic(wantMusic);
    if (wantMusic == null) {
      logger.infoLog("User:"******": empty want music");
    }

    // 加入时间
    String joinDate = getJoinDate(doc);
    info.setJoinDate(joinDate);
    if (joinDate == null) {
      logger.infoLog("User:"******": empty join date");
    }
    // 性别(暂时无法获取,设置默认值)
    info.setSex("n");
    // 年龄(暂时无法获取,设置默认值)
    info.setAge("0");
    // 姓名
    String name = getName(doc);
    info.setUserName(name);
    if (name == null) {
      logger.infoLog("User:"******": empty name");
    }

    return info;
  }