Exemple #1
0
 /**
  * Parses one search-result row into a {@link Name} and appends it to {@code results}.
  *
  * <p>The thumbnail is the {@code src} of the first {@code img} inside the {@code primary_photo}
  * cell; the profile link and display name come from the first anchor of the {@code result_text}
  * cell. An optional {@code <small>} element supplies a description, with surrounding
  * parentheses stripped, of the form {@code job, title}, {@code title (year)} or {@code job}.
  *
  * <p>A {@link Reference} is only built when the description contains a link to point at. A
  * description without any link (for example a bare job) still contributes its job text instead
  * of failing with a {@link NullPointerException}.
  *
  * @param query the search query (not used by this implementation)
  * @param options the search options (not used by this implementation)
  * @param tr the table row to parse
  * @param results the list that receives the parsed entry
  */
 @Override
 protected void parseRow(
     final String query, final int options, final Element tr, final List<Name> results) {
   final String thumbnailUrl =
       tr.getElementsByAttributeValue("class", "primary_photo")
           .first()
           .getElementsByTag("img")
           .first()
           .attr("src");
   final Element r = tr.getElementsByAttributeValue("class", "result_text").first();
   final Element a = r.getElementsByTag("a").first();
   final String url = Imdb.BASE_URL + a.attr("href");
   final String name = a.ownText();
   String job = "";
   Reference ref = null;
   // first() yields null when the row has no <small> description at all.
   final Element small = r.getElementsByTag("small").first();
   if (small != null) {
     // The description may be plain text without a link, so the anchor is looked up null-safely.
     final Element refAnchor = small.getElementsByTag("a").first();
     final String refUrl = refAnchor == null ? null : Imdb.BASE_URL + refAnchor.attr("href");
     String desc = small.text();
     if (desc.startsWith("(") && desc.endsWith(")")) {
       desc = desc.substring(1, desc.length() - 1);
     }
     final int comma = desc.indexOf(',');
     if (comma != -1) {
       // "job, title": the part before the comma is the job, the rest names the reference.
       job = desc.substring(0, comma).trim();
       if (refUrl != null) {
         ref = new Reference(refUrl, desc.substring(comma + 1).trim());
       }
     } else if (!desc.matches(".+\\(\\d+\\)")) {
       // No comma and no trailing "(digits)": the whole description is the job.
       job = desc;
     } else if (refUrl != null) {
       // No comma but a trailing "(digits)": the whole description names the reference.
       ref = new Reference(refUrl, desc.trim());
     }
   }
   results.add(new Name(url, thumbnailUrl, name, job, ref));
 }
 /**
  * Reads the integer shown by the first element whose {@code data-element-term} attribute equals
  * {@code dataElementTerm}.
  *
  * <p>The number is taken from the own text of that element's first child, with commas removed
  * before parsing.
  *
  * @param ele the element to search in
  * @param dataElementTerm the attribute value to look for
  * @return the parsed number, or {@code -1} when no element matches
  * @throws NumberFormatException if the text is not a valid integer once commas are removed
  */
 private int getCount(Element ele, String dataElementTerm) throws NumberFormatException {
   final Elements matches = ele.getElementsByAttributeValue("data-element-term", dataElementTerm);
   if (matches == null || matches.isEmpty()) {
     return -1;
   }
   final String digits = matches.first().child(0).ownText().replace(",", "");
   return Integer.parseInt(digits);
 }
  /**
   * Parses the data needed to prepare a quote reply.
   *
   * <p>Reads the element with id {@code postform}: its absolute {@code action} URL, the {@code
   * value} of several named fields, and the markup of its first {@code blockquote}. Any exception
   * is printed and swallowed. Because values are only assigned after every one of them has been
   * read, a failed lookup leaves the returned object as constructed.
   *
   * @param responseBody the HTML to parse
   * @return the reply data; never {@code null}
   */
  public static PrepareQuoteReply parsePrepareQuoteReply(String responseBody) {
    final PrepareQuoteReply reply = new PrepareQuoteReply();
    try {
      final Document page = Jsoup.parse(responseBody);
      // A base URI is required for absUrl() below to resolve the form action.
      page.setBaseUri(Constants.BASE_URL);
      final Element form = page.getElementById("postform");

      // Read phase: collect every value before touching the result object.
      final String action = form.absUrl("action");
      final String formhash = namedFieldValue(form, "formhash");
      final String posttime = namedFieldValue(form, "posttime");
      final String noticeauthor = namedFieldValue(form, "noticeauthor");
      final String noticetrimstr = namedFieldValue(form, "noticetrimstr");
      final String noticeauthormsg = namedFieldValue(form, "noticeauthormsg");
      final String reppid = namedFieldValue(form, "reppid");
      final String reppost = namedFieldValue(form, "reppost");
      final String quoteBody = form.getElementsByTag("blockquote").first().toString();

      // Write phase.
      reply.setNoticeauthor(noticeauthor);
      reply.setNoticeauthormsg(noticeauthormsg);
      reply.setNoticetrimstr(noticetrimstr);
      reply.setPosttime(posttime);
      reply.setQuoteBody(quoteBody);
      reply.setReppid(reppid);
      reply.setUrl(action);
      reply.setFormhash(formhash);
      reply.setReppost(reppost);
    } catch (Exception e) {
      e.printStackTrace();
    }
    return reply;
  }

  /** Returns the {@code value} attribute of the first element in {@code form} named {@code name}. */
  private static String namedFieldValue(Element form, String name) {
    return form.getElementsByAttributeValue("name", name).first().attr("value");
  }
Exemple #4
0
  /**
   * Downloads the Douban movie front page and turns each {@code li} of the first {@code
   * ul#newcontent1} list into a {@link Movie}.
   *
   * <p>The page is requested at most twice with a 5000 ms timeout; an {@link IOException} on an
   * attempt is printed and otherwise ignored.
   *
   * @param url not used; the address that is fetched is hard-coded
   * @return the parsed movies in list order, or {@code null} when both requests failed
   */
  public static List<Movie> getIntroduction(String url) {
    Document page = null;
    // Second attempt only runs when the first one produced no document.
    for (int attempt = 0; attempt < 2 && page == null; attempt++) {
      try {
        page = Jsoup.connect("http://movie.douban.com/").timeout(5000).get();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    if (page == null) {
      return null;
    }

    List<Movie> movies = new ArrayList<Movie>();
    Elements items = page.select("ul#newcontent1").get(0).getElementsByTag("li");

    for (Element item : items) {
      Movie movie = new Movie();

      Element imgBox = item.getElementsByAttributeValue("class", "img").get(0);
      String imgBoxHtml = imgBox.html();
      movie.setImgHtml(imgBoxHtml);

      // The image address is the text between the first and the last double quote of the
      // first child's inner HTML.
      String firstChildHtml = imgBox.children().get(0).html();
      String imgUrl =
          firstChildHtml.substring(
              firstChildHtml.indexOf("\"") + 1, firstChildHtml.lastIndexOf("\""));
      movie.setImgUrl(imgUrl);

      // The movie's link runs from just after href=" to the next double quote; the search for
      // that quote begins one character past the link start.
      int hrefStart = imgBoxHtml.indexOf("href=\"") + 6;
      int hrefEnd = imgBoxHtml.indexOf('"', hrefStart + 1);
      String doubanUrl = imgBoxHtml.substring(hrefStart, hrefEnd);
      movie.setDoubanUrl(doubanUrl);

      // The id is what follows "subject/", with the final character of the link dropped.
      String id = doubanUrl.substring(doubanUrl.indexOf("subject/") + 8, doubanUrl.length() - 1);
      movie.setID(Integer.parseInt(id));

      Element intro = item.getElementsByAttributeValue("class", "intro").get(0);

      // The rating text is the third child node of the intro's second child element.
      Node rating = intro.child(1);
      movie.setDoubanRank(rating.childNodes().get(2).toString().trim());

      // Strip the rating element, then the entry's running number, so that only the title
      // and description remain. The order of these two removals matters.
      intro.child(1).remove();
      intro.childNodes().get(1).childNode(0).remove();

      // The title is the first node nested under what is now the second child node.
      List<Node> titleNodes = intro.childNodes().get(1).childNodes().get(0).childNodes();
      String title = titleNodes.get(0).toString();
      if (title.contains("&middot;")) {
        Log.e("Movie News", "tag found");
        title = title.replace("&middot;", ".");
      }
      movie.setMovieName(title);

      movie.setInfoHtml(intro.html());
      movies.add(movie);
    }
    return movies;
  }
Exemple #5
0
  /**
   * Extracts request descriptions from an API documentation page stored at {@code path}.
   *
   * <p>The {@code id} of the first {@code section} element serves as a prefix. Every {@code div}
   * whose {@code id} starts with it is read as one request: the method from the first {@code
   * pre}'s {@code data-type} attribute, the URL from the first {@code class="pln"} element, the
   * description from the first {@code class="pull-left"} element, and the parameters from the
   * rows of the first {@code table}.
   *
   * @param path location of the HTML file; it is read with the charset name {@code "UTF-8"}
   * @return one {@link RequestInfo} per matching {@code div}, in the order encountered
   */
  public static ArrayList<RequestInfo> parseApiDoc(String path) {
    Document page = Jsoup.parse(FileUtils.readToString(new File(path), "UTF-8"));
    // Umbrella id of the API group, e.g. api-Search.
    String idPrefix = page.getElementsByTag("section").get(0).attr("id");
    // Label text that is stripped out of parameter names.
    String optionalLabel = "选项";

    ArrayList<RequestInfo> endpoints = new ArrayList<RequestInfo>();
    for (Element node : page.getAllElements()) {
      String nodeId = node.attr("id");
      // Only divs whose id carries the prefix describe a single request,
      // e.g. api-Search-PostSearchCompany.
      if (!node.tagName().equals("div") || !nodeId.startsWith(idPrefix)) {
        continue;
      }

      RequestInfo endpoint = new RequestInfo();
      endpoint.setName(nodeId);
      // Method (post/get).
      endpoint.setMethod(node.getElementsByTag("pre").get(0).attr("data-type"));
      endpoint.setUrl(node.getElementsByAttributeValue("class", "pln").get(0).text());
      endpoint.setDes(node.getElementsByAttributeValue("class", "pull-left").get(0).text());

      // Request parameters: one table row per parameter.
      Element paramTable = node.getElementsByTag("table").get(0);
      ArrayList<RequestParam> params = new ArrayList<RequestParam>();
      for (Element row : paramTable.getElementsByTag("tr")) {
        Elements cells = row.getElementsByTag("td");
        if (cells.size() == 0) {
          // Rows without any td cell carry no parameter.
          continue;
        }
        // Cell 0: name (label removed), cell 1: type (e.g. String, Number, Float),
        // cell 2: description.
        String paramName = cells.get(0).text().replace(optionalLabel, "").trim();
        String paramType = cells.get(1).text();
        String paramDes = cells.get(2).text();
        params.add(new RequestParam(paramName, paramType, paramDes));
      }

      endpoint.setParams(params);
      endpoints.add(endpoint);
    }

    return endpoints;
  }
  /**
   * Reads lessons out of {@code doc} and inserts them into, or updates them in, {@code s}.
   *
   * <p>Every element carrying a {@code number} attribute is handled as one lesson cell. Its time
   * range is the text of the last {@code cell-header2} element that has a {@code style}
   * attribute, and each descendant with a {@code jsdate} attribute contributes one date. For every
   * non-empty {@code lesson-subject} element of a date, {@code s.getLessonByNumber(start, number)}
   * is consulted: when it returns {@code null} a new {@link Lesson} is filled in and passed to
   * {@code s.addLesson}; otherwise the returned lesson is modified and its {@code update()}
   * method is called.
   *
   * <p>Date/time strings are parsed with the pattern {@code dd.MM.yyyy HH:mm} in the {@code
   * Europe/Moscow} time zone.
   *
   * @param doc the document to read lesson cells from
   * @param s the schedule that is queried for existing lessons and receives new ones
   * @throws ParseException if a combined date and time string does not match the pattern
   */
  public static void getLessons(Document doc, Schedule s) throws ParseException {

    final SimpleDateFormat format = new SimpleDateFormat("dd.MM.yyyy HH:mm", Locale.ENGLISH);
    format.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Elements lessonCells = doc.getElementsByAttribute("number");

    for (Element lessonCell : lessonCells) {

      // lPrev is reset once per lesson cell, so it carries over between the dates of one cell.
      Lesson l, lPrev = null; // lPrev to handle duplicate lesson
      int sameLesson = 0; // Also to handle duplicate lesson

      int number = Integer.parseInt(lessonCell.attr("number"));
      String time = "";

      // The last cell-header2 element that has a style attribute supplies the time range text.
      Elements timeDetails = lessonCell.getElementsByClass("cell-header2");
      for (Element timeDetail : timeDetails) {
        if (timeDetail.hasAttr("style")) time = timeDetail.text();
      }

      Elements lessonCellDetails = lessonCell.getElementsByAttribute("jsdate");
      for (Element lessonCellDetail : lessonCellDetails) {

        String date = lessonCellDetail.attr("jsdate");
        // index counts the non-empty subjects already handled for this date.
        int index = 0;
        sameLesson = 0;

        for (Element subject :
            lessonCellDetail.getElementsByAttributeValue("class", "lesson-subject")) {

          if (subject == null || subject.text() == null || subject.text().length() <= 0) {
            // No lesson scheduled
            continue;
          }

          // Start time is the text before the "-" minus one more character (presumably a
          // separating space, e.g. "08:30 - 09:15" -- TODO confirm the exact format).
          // NOTE(review): if no header matched above, time is "" and this substring call throws
          // StringIndexOutOfBoundsException.
          Date start = format.parse(date + " " + time.substring(0, time.indexOf("-") - 1));
          if ((l = s.getLessonByNumber(start, number)) == null) {

            if (BuildConfig.DEBUG)
              Log.d("GshisHTMLParser", TS.get() + " getLessons() not found in db, will insert");

            l = new Lesson();
            sameLesson = 0;

            l.setStart(start);
            // Stop time starts two characters after the "-".
            l.setStop(
                format.parse(date + " " + time.substring(time.indexOf("-") + 2, time.length())));
            l.setFormId(subject.attr("id"));
            l.setFormText(subject.text());
            // NOTE(review): sameLesson was just reset to 0, so this always reads the first
            // lesson-teacher element, whereas the update branches below use index -- confirm
            // this is intended.
            l.setTeacher(
                lessonCellDetail
                    .getElementsByAttributeValue("class", "lesson-teacher")
                    .get(sameLesson)
                    .text());
            l.setNumber(number);

            s.addLesson(l);

          } else {

            if (BuildConfig.DEBUG)
              Log.d("GshisHTMLParser", TS.get() + " getLessons() found in db, will update");

            l.setFormId(subject.attr("id"));

            // Same start and number as the previously handled lesson: treat as a duplicate and
            // combine the differing subject/teacher texts via fixDuplicateString.
            if (lPrev != null && lPrev.getStart().equals(start) && lPrev.getNumber() == number) {

              if (BuildConfig.DEBUG)
                Log.d(
                    "GshisHTMLParser",
                    TS.get()
                        + " getLessons() dup = "
                        + subject.text()
                        + " index = "
                        + index
                        + " sameLesson = "
                        + sameLesson);

              sameLesson++;

              if (!lPrev.getFormText().equals(subject.text()))
                l.setFormText(fixDuplicateString(subject.text(), lPrev.getFormText(), sameLesson));

              // NOTE(review): index skips empty subjects, so it only lines up with the
              // lesson-teacher elements if empty subjects have no teacher element -- verify.
              String teacher =
                  lessonCellDetail
                      .getElementsByAttributeValue("class", "lesson-teacher")
                      .get(index)
                      .text();

              if (!lPrev.getTeacher().equals(teacher))
                l.setTeacher(fixDuplicateString(teacher, lPrev.getTeacher(), sameLesson));

            } else {

              // Not a duplicate: overwrite the stored values with the ones from the page.
              l.setNumber(number);
              l.setFormText(subject.text());
              l.setTeacher(
                  lessonCellDetail
                      .getElementsByAttributeValue("class", "lesson-teacher")
                      .get(index)
                      .text());
            }

            l.update();
          }

          lPrev = l;
          index++;
        }
      }
    }
  }