コード例 #1
0
ファイル: Names.java プロジェクト: joshvm/imdb
 /**
  * Converts one search-result table row into a {@link Name} and appends it to {@code results}.
  *
  * <p>The thumbnail is the first image of the {@code primary_photo} cell; the name and URL come
  * from the first link of the {@code result_text} cell. An optional {@code <small>} element
  * carries a description (possibly wrapped in parentheses) that is split into a job and/or a
  * {@link Reference}.
  *
  * @param query the search query (not used by this implementation)
  * @param options the search options (not used by this implementation)
  * @param tr the table row to parse
  * @param results the list that receives the parsed entry
  */
 @Override
 protected void parseRow(
     final String query, final int options, final Element tr, final List<Name> results) {
   final Element photoCell = tr.getElementsByAttributeValue("class", "primary_photo").first();
   final String thumbnailUrl = photoCell.getElementsByTag("img").first().attr("src");

   final Element textCell = tr.getElementsByAttributeValue("class", "result_text").first();
   final Element nameLink = textCell.getElementsByTag("a").first();
   final String url = Imdb.BASE_URL + nameLink.attr("href");
   final String name = nameLink.ownText();

   String job = "";
   Reference ref = null;
   final Element small = textCell.getElementsByTag("small").first();
   if (small != null) {
     final String refUrl = Imdb.BASE_URL + small.getElementsByTag("a").first().attr("href");
     String desc = small.text();
     final boolean parenthesised = desc.startsWith("(") && desc.endsWith(")");
     if (parenthesised) {
       desc = desc.substring(1, desc.length() - 1);
     }
     final int comma = desc.indexOf(',');
     if (comma >= 0) {
       // "job, reference text"
       job = desc.substring(0, comma).trim();
       ref = new Reference(refUrl, desc.substring(comma + 1).trim());
     } else if (desc.matches(".+\\(\\d+\\)")) {
       // No comma, but a trailing parenthesised number: the whole text is the reference.
       ref = new Reference(refUrl, desc.trim());
     } else {
       job = desc;
     }
   }
   results.add(new Name(url, thumbnailUrl, name, job, ref));
 }
コード例 #2
0
ファイル: ElementTest.java プロジェクト: haggisandchips/jsoup
 /** A tag name with a namespace prefix ({@code abc:def}) can be looked up by its full name. */
 @Test
 public void getNamespacedElementsByTag() {
   final String html = "<div><abc:def id=1>Hello</abc:def></div>";
   Elements matches = Jsoup.parse(html).getElementsByTag("abc:def");

   assertEquals(1, matches.size());
   assertEquals("1", matches.first().id());
   assertEquals("abc:def", matches.first().tagName());
 }
コード例 #3
0
  /**
   * Builds the tomes of a serie, together with their chapters, from a chapter-list page.
   *
   * <p>Every {@code span.left} inside the first {@code div.detail_list} is treated as one chapter.
   * The text following {@code "Vol "} in its {@code span.mr6} child is the tome number (0 when
   * there is none); chapters sharing a number are attached to the same {@link Tome}. Spans
   * without a link are skipped.
   *
   * @param htmlDocument the parsed page
   * @param parent the serie owning the tomes; its validity date is set to the current time
   * @return the tomes in order of first appearance, empty when the page has no chapter list
   * @throws NumberFormatException if a tome or chapter number is not numeric
   */
  @Override
  protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
    Date today = new Date();
    List<Tome> tomes = new LinkedList<>();

    Elements divChapters = htmlDocument.select("div.detail_list");
    if (!divChapters.isEmpty()) {
      for (Element span : divChapters.first().select("span.left")) {
        Element link = span.select("a").first();
        if (link == null) {
          // Nothing to build a chapter from.
          continue;
        }

        Element volumeLabel = span.select("span.mr6").first();
        final String tomeNumberString =
            volumeLabel == null ? "" : StringUtils.substringAfter(volumeLabel.text(), "Vol ");
        int tomeNumber = 0;
        if (tomeNumberString != null && !tomeNumberString.isEmpty()) {
          // The parsed value used to be discarded, which put every chapter into tome 0.
          tomeNumber = Integer.parseInt(tomeNumberString);
        }

        Tome foundTome = null;
        for (Tome tome : tomes) {
          if (tomeNumber == tome.getNumber()) {
            foundTome = tome;
            break;
          }
        }

        if (foundTome == null) {
          foundTome = new Tome();
          foundTome.setNumber(tomeNumber);
          foundTome.setName("Tome " + tomeNumber);
          foundTome.setMustBeSaved(true);
          foundTome.setValidityDate(today);
          foundTome.setSerie(parent);
          tomes.add(foundTome);
        }

        Chapter chapter = new Chapter();
        chapter.setMustBeSaved(true);
        chapter.setUrl(link.attr("href"));
        // The chapter number is the last space-separated token of the link text.
        String tempNumber = StringUtils.substringAfterLast(link.text(), " ");
        chapter.setNumber(Float.parseFloat(tempNumber));
        chapter.setName(span.text());
        chapter.setTome(foundTome);

        foundTome.addChapter(chapter);
      }
    }

    parent.setValidityDate(today);
    return tomes;
  }
コード例 #4
0
  /**
   * Parses the list of posts (replies) out of a thread page.
   *
   * <p>Each element with class {@code plc} is read as one post: avatar URL, author info HTML,
   * message HTML, an optional reply URL and optional image attachment HTML. A block that cannot
   * be parsed is skipped and logged.
   *
   * @param content the HTML of the page
   * @return the posts that could be parsed, in document order
   */
  public static List<Post> parsePostList(String content) {
    long start = System.currentTimeMillis();

    List<Post> posts = new ArrayList<Post>();
    Document document = Jsoup.parse(content);
    // Needed so that absUrl() can resolve relative links.
    document.setBaseUri(Constants.BASE_URL);
    for (Element plc : document.getElementsByClass("plc")) {
      try {
        Post post = new Post();
        // Avatar image
        Element avatar = plc.getElementsByClass("avatar").first();
        post.setAvatarUrl(avatar.child(0).absUrl("src"));

        String authi = plc.getElementsByClass("authi").first().html();
        Element message = plc.getElementsByClass("message").first();
        post.setContent(message.html().trim());

        // The opening post has no reply button, so the reply URL is optional.
        Element replyButton = plc.getElementsByClass("replybtn").first();
        if (replyButton != null && !replyButton.children().isEmpty()) {
          post.setReplyUrl(replyButton.child(0).absUrl("href"));
        }

        Elements imgList = plc.getElementsByClass("img_list");
        if (imgList != null && !imgList.isEmpty()) {
          post.setImgList(imgList.first().html());
        } else {
          // A single image attachment uses a different container.
          Elements imgOne = plc.getElementsByClass("img_one");
          if (imgOne != null && !imgOne.isEmpty()) {
            post.setImgList(imgOne.first().html());
          }
        }
        post.setAuthi(authi);

        posts.add(post);
      } catch (Exception e) {
        // Best effort: skip a block that does not have the expected structure, but leave a trace.
        LogMessage.i("parsePostList", "skipped unparsable post block: " + e);
      }
    }
    // Log the total parse time once, instead of after every post.
    LogMessage.i("parsePostList", "解析时间:" + (System.currentTimeMillis() - start));
    return posts;
  }
コード例 #5
0
  /**
   * Downloads the post page at {@code url} and fills {@code title}, {@code excerpt},
   * {@code content} and {@code date} from it.
   *
   * <p>The page must have exactly one {@code content} element inside {@code #main}; otherwise a
   * message is printed and the post is left untouched. Lightbox elements are replaced in place by
   * the image nodes extracted from them. The first {@code h1} becomes the title (only while the
   * title is still unset), the first {@code p} becomes the excerpt, and all children up to the
   * first one with class {@code clear} (except the title) form the content.
   *
   * @param aInConnection the connection used to fetch the page
   * @param images collection receiving the images found in the content
   * @throws IOException if the page cannot be fetched
   */
  public void download(Connection aInConnection, Collection<Image> images) throws IOException {
    aInConnection.url(url);
    Document lDocument = aInConnection.get();
    Element lMain = lDocument.getElementById("main");
    if (lMain == null) {
      System.out.println("No main section in post page " + toString());
      return;
    }

    Elements lContents = lMain.getElementsByClass("content");
    if (lContents.size() != 1) {
      // Covers both "none" and "several"; the old message claimed "more than one" for zero too.
      System.out.println(
          "Expected exactly one content in main section of post page but found "
              + lContents.size()
              + ": "
              + toString());
      return;
    }

    StringBuilder sb = new StringBuilder();
    Element lContent = lContents.first();

    collectImages(lContent, images);

    // Replace every lightbox wrapper by the image nodes it contains, at the same position.
    Elements lLightboxElements = lContent.getElementsByClass("lightbox");
    for (Element lLightboxElement : lLightboxElements) {
      Collection<Node> lImageNodes = extractImageNodes(lLightboxElement);

      Element lParent = lLightboxElement.parent();
      int i = lLightboxElement.siblingIndex();
      lParent.insertChildren(i, lImageNodes);
      lLightboxElement.remove();
    }

    for (Element lChildElement : lContent.children()) {
      if (lChildElement.hasClass("clear")) {
        // no more post content
        break;
      }

      if (title == null && lChildElement.tagName().equals("h1")) {
        // the first h1 header is the title
        title = lChildElement.html();
      } else {
        if (excerpt == null && lChildElement.tagName().equals("p")) {
          excerpt = lChildElement.text();
        }
        sb.append(lChildElement.toString());
      }
    }

    content = sb.toString();

    Element lDateElement = lContent.getElementsByClass("date").first();
    if (lDateElement != null) {
      date = new PostDate(lDateElement.html());
    }
  }
コード例 #6
0
  /**
   * Extracts an article from the page HTML, stores it through {@code dbRobot} and increments
   * {@code cur_count}.
   *
   * @param html the article page markup
   * @return always {@code null}
   */
  @Override
  public Object parseHtml2Obj(String html) {
    Document page = Jsoup.parse(html);
    Element titleElement = page.getElementById("activity-name");
    Element dateElement = page.getElementById("post-date");
    Element bodyElement = page.getElementById("essay-body");
    Elements mediaImages = page.select("#media img");
    Elements introParagraphs = page.select(".text p");

    // Fall back to a fixed label when the page has no intro paragraph.
    String intro = introParagraphs.isEmpty() ? "阅读全部" : introParagraphs.first().text();

    ArticleObj article = new ArticleObj();
    article.setFrom(account_desc);
    article.setContent(bodyElement.html());
    article.setCreatetime(dateElement.text());
    article.setTitle(titleElement.text());
    // At most 50 characters of the intro are kept; an ellipsis is always appended.
    article.setIntro(intro.substring(0, Math.min(intro.length(), 50)) + "...");
    if (!mediaImages.isEmpty()) {
      article.setPic(getSrc(mediaImages.get(0).attr("src")));
    }
    System.err.println(article.getPic());
    dbRobot.AddArticleData(article);
    cur_count++;
    return null;
  }
コード例 #7
0
ファイル: Mudah.java プロジェクト: kwanrong/comparison
  /**
   * Searches Mudah for {@code query} and converts the ads of the first result page into items.
   *
   * <p>Mudah is not standardized, so the crawled result may be messy.
   *
   * @param query the search term, appended to the search URL as given
   * @param size used only as the initial capacity of the result list
   * @return one item per ad found; empty when the page has no listing container
   * @throws IOException if the page cannot be fetched
   * @throws NumberFormatException if the text after the last "RM" of an ad is not an integer
   */
  @Override
  public List<Item> parse(String query, int size) throws IOException {

    // request for a page
    Document doc =
        Jsoup.connect("http://www.mudah.my/li?q=" + query)
            .userAgent(Constant.HTTP_USER_AGENT)
            .timeout(Constant.HTTP_TIMEOUT)
            .get();

    ArrayList<Item> result = new ArrayList<Item>(size);
    Element listing = doc.select("div.listing_thumbs").first();
    if (listing == null) {
      return result;
    }

    for (Element ad : listing.select("div.list_ads")) {
      String img = "";
      Elements imgS = ad.select("div.image_thumb > a + img");
      // Some ads have no image. The check used to be "size() < 0", which is never true.
      if (!imgS.isEmpty()) {
        // NOTE(review): this reads "href" from an <img>; "src" may have been intended - confirm.
        img = imgS.first().attr("href");
      }

      Element titleItem = ad.select("li.listing_ads_title").first();
      String title = titleItem.child(0).text();
      String url = titleItem.child(0).attr("href");
      // The price is whatever follows the last "RM" in the item text, with spaces removed.
      String price = titleItem.text();
      price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", "");
      int dPrice = Integer.parseInt(price);

      result.add(new Item("Mudah", title, dPrice, img, url));
    }

    return result;
  }
コード例 #8
0
 /**
  * Checks that the text of the element(s) selected by {@code selector} fully matches
  * {@code regexp} (compiled with {@link Pattern#MULTILINE}).
  *
  * <p>With exactly one selected element its matcher is stored in {@code lastMatch}. With any
  * other number of elements, one matching element is enough and {@code lastMatch} is untouched.
  * A violation is recorded when nothing matches.
  *
  * @param selector CSS selector of the elements to inspect
  * @param regexp regular expression the element text has to match
  * @return the wrapped check result
  */
 public boolean contentMatch(String selector, String regexp) {
   Elements selected = document().select(selector);
   Pattern pattern = Pattern.compile(regexp, Pattern.MULTILINE);

   if (selected.size() == 1) {
     lastMatch = pattern.matcher(selected.first().text());
     if (lastMatch.matches()) {
       return wrap(true);
     }
     addViolation(String.format("DOM要素 '%s' のテキストが正規表現 '%s' にマッチしません", selector, regexp));
     return wrap(false);
   }

   for (Element candidate : selected) {
     if (pattern.matcher(candidate.text()).matches()) {
       return wrap(true);
     }
   }
   addViolation(
       String.format("DOM要素 '%s' の中に、テキストが正規表現 '%s' にマッチするものが見付かりません", selector, regexp));
   return wrap(false);
 }
コード例 #9
0
  /**
   * Parses a search-result response and saves one {@code ECBean} per listed product.
   *
   * <p>The response is JSON whose {@code "value"} entry holds the HTML of the result list. The
   * category is taken from the breadcrumb of a product page and reused for the following
   * products once it is non-blank. Products without a link or with {@code pmid="0"} are skipped.
   *
   * @param result the raw JSON response
   * @param task the task being processed; its extra value is stored as the bean keyword
   * @throws Exception if parsing, fetching the product page or saving fails
   */
  @Override
  public void parse(String result, Task task) throws Exception {

    List<ECBean> beans = new ArrayList<ECBean>();
    JSONObject parseObject = JSON.parseObject(result);
    Object object = parseObject.get("value");
    Document doc = Jsoup.parse((String) object);
    Elements eles = doc.select("div.mod_search_pro");
    String categroy = "";
    for (Element element : eles) {
      ECBean bean = new ECBean("yhd");
      Element link = element.select("p.proName > a").first();
      if (link == null) {
        // No product link in this entry; previously a NullPointerException.
        continue;
      }
      String url = link.attr("href");
      String id = link.attr("pmid");
      // Skip entries without a product id
      if (id.equals("0")) {
        continue;
      }
      String name = link.attr("title");
      // Fetch the category
      if (StringUtils.isBlank(categroy)) {
        Document document = GlobalComponents.fetcher.document(url);
        Elements crumbs = document.select("div.crumb > a");
        StringBuilder sb = new StringBuilder();
        // The first and the last breadcrumb link are left out.
        for (int i = 1; i < crumbs.size() - 1; i++) {
          sb.append(crumbs.get(i).text());
        }
        // Guard against a breadcrumb with no inner links, which used to throw here.
        if (sb.length() > 0) {
          sb.deleteCharAt(sb.length() - 1);
        }
        categroy = sb.toString();
        // NOTE(review): the search-chars argument is an empty string, which makes this call a
        // no-op under Apache Commons Lang semantics; a separator character may have been lost
        // from the source - confirm.
        categroy = StringUtils.replaceChars(categroy, "", "/");
      }
      bean.setId(id);
      bean.setUrl(url);
      bean.setTitle(name);
      bean.setCategory(categroy);
      bean.setKeyword(task.getExtra());
      beans.add(bean);
    }
    log.info("fetch list:" + beans.size());
    for (ECBean bean : beans) {
      bean.saveOnNotExist();
    }
  }
コード例 #10
0
ファイル: ElementTest.java プロジェクト: haggisandchips/jsoup
 /**
  * An attribute name containing a dash ({@code http-equiv}) works in an attribute selector, and
  * the tag name in the selector restricts the match to the {@code meta} element.
  */
 @Test
 public void testGetElementsWithAttributeDash() {
   String html =
       "<meta http-equiv=content-type value=utf8 id=1> <meta name=foo content=bar id=2> <div http-equiv=content-type value=utf8 id=3>";
   Elements matches =
       Jsoup.parse(html).select("meta[http-equiv=content-type], meta[charset]");

   assertEquals(1, matches.size());
   assertEquals("1", matches.first().id());
 }
コード例 #11
0
  /**
   * Extracts one show, including the ticket prices of each performance time, and saves it.
   *
   * <p>Any exception raised while fetching or parsing is logged together with the URL; in that
   * case the show is not saved.
   *
   * @param url URL of the show page
   */
  private void extractEach(String url) {
    Show show = new Show();
    try {
      show.setAgent_id(agentID);
      Document page = getDoc(url);
      String typeKey = page.select("a.font12hui_bottom:eq(2)").text().trim();
      show.setType(typeCor.get(typeKey));
      // Show title
      show.setName(page.select("td.PERFORM_BOLD_NAME").text());
      // Show introduction
      Element introTable = page.select("body>table").get(6).select("table").get(3);
      show.setIntroduction(PubFun.cleanElement(introTable).html());
      String venueText = page.select(".font12hui:contains(演出场馆)").text();
      show.setSiteName(venueText.replace("演出场馆:", ""));
      show.setImage_path(page.select("img[width=240]").first().attr("abs:src"));

      Map<String, List<TicketPrice>> timeAndPrice = new HashMap<String, List<TicketPrice>>();
      show.setTimeAndPrice(timeAndPrice);

      Elements priceRows =
          page.select("body>table").get(6).select("table tr[id^=perform_price_line]");
      for (Element row : priceRows) {
        Elements cells = row.select("td");
        String time = cells.get(1).text();
        if (time.length() == 18) {
          // Regular time format: keep only the first 16 characters
          time = time.substring(0, 16);
        }
        List<TicketPrice> pricesForTime = new ArrayList<TicketPrice>();
        timeAndPrice.put(time, pricesForTime);

        int priceIndex = 2;
        if (cells.size() > 3) {
          // The row also has a package-ticket column; it comes first
          Element packageCell = cells.get(priceIndex);
          for (String amount : packageCell.select("span.font14lanse").text().split("\\s+")) {
            Elements links =
                packageCell.select("span.font14lanse a:matches(\\b" + amount + "\\b)");
            TicketPrice price = new TicketPrice();
            price.setMainURL(url);
            price.setPrice(amount);
            price.setExist(!links.isEmpty());
            if (price.isExist()) {
              price.setRemark(links.first().attr("title"));
            }
            pricesForTime.add(price);
          }
          priceIndex = 3;
        }

        // Regular (non-package) tickets
        Element regularCell = cells.get(priceIndex);
        for (String amount : regularCell.select("span.font14lanse").text().split("\\s+")) {
          Elements links =
              regularCell.select("span.font14lanse a:matches(\\b" + amount + "\\b)");
          TicketPrice price = new TicketPrice();
          price.setMainURL(url);
          price.setPrice(amount);
          price.setExist(!links.isEmpty());
          pricesForTime.add(price);
        }
      }
      getDao().saveShow(show);
    } catch (Exception e) {
      log.error(url, e);
    }
  }
コード例 #12
0
ファイル: SuperSelector.java プロジェクト: Alwaysmy/jcrawler
 /**
  * Handles the "first" selector part: narrows the current {@code elements} to its first element.
  *
  * @param query the selector part; must equal {@code FIRST_TAG}
  * @return a new collection holding the first current element, or an empty collection when there
  *     are no current elements
  * @throws IllegalArgumentException if {@code query} is not {@code FIRST_TAG}
  */
 private Elements parseFirst(String query) {
   if (!FIRST_TAG.equals(query)) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   Elements eles = new Elements();
   Element first = elements.first();
   // first() is null for an empty selection; do not put a null entry into the result.
   if (first != null) {
     eles.add(first);
   }
   return eles;
 }
コード例 #13
0
  /**
   * Fills {@code userprofile} from the HTML of a profile page: picture URL, location, self
   * introduction and the tweet / following / follower counts.
   *
   * <p>A text field that is not found is set to the literal string {@code "null"}; when the
   * footer with the counts is not found, all three counts are set to -1.
   *
   * @param content the profile page HTML
   * @param userprofile the profile object to fill
   */
  public void doAnylyze(String content, UserProfile userprofile) {
    Document doc = Jsoup.parse(content, "/");

    Elements pictures =
        doc.getElementsByAttributeValueContaining(
            "class", "profile-picture media-thumbnail js-nav js-tooltip");
    String pictureUrl = pictures.size() > 0 ? pictures.get(0).child(0).attr("src") : "null";

    Elements locations = doc.getElementsByAttributeValue("class", "location profile-field");
    String location =
        (locations == null || locations.size() == 0) ? "null" : locations.first().ownText();

    Elements bios = doc.getElementsByAttributeValue("class", "bio profile-field");
    String selfIntroduction = (bios == null || bios.size() == 0) ? "null" : bios.first().ownText();

    int tweet = -1;
    int following = -1;
    int follower = -1;
    Elements footers = doc.getElementsByAttributeValue("class", "default-footer");
    if (footers != null && footers.size() > 0) {
      Element footer = footers.first();
      tweet = this.getCount(footer, "tweet_stats");
      following = this.getCount(footer, "following_stats");
      follower = this.getCount(footer, "follower_stats");
    }

    userprofile.setTweet(tweet);
    userprofile.setFollower(follower);
    userprofile.setFollowing(following);
    userprofile.setPicture_url(pictureUrl);
    userprofile.setLocation(location);
    userprofile.setSelfintroduction(selfIntroduction);
  }
コード例 #14
0
  /**
   * Reads the SMS records of one result page into {@code messageList} and requests the next page
   * when the page text contains a "next page" marker.
   *
   * <p>Only table rows with exactly five cells are used. A row that fails to parse is logged and
   * skipped; a send time that cannot be parsed leaves the time {@code null}.
   *
   * @param context the response context holding the page content
   * @param page the current page number
   * @param t not used by this method
   * @param d passed through to the next-page request
   * @param dstr passed through to the next-page request
   * @param isHistory passed through to the next-page request
   */
  private void saveSmsLog(
      SimpleObject context,
      final int page,
      final int t,
      final Date d,
      final String dstr,
      final int isHistory) {
    String text = ContextUtil.getContent(context);
    Document doc = ContextUtil.getDocumentOfContent(context);
    System.out.println(doc.toString());
    // The page says that no matching data was found.
    if (text.contains("没有查找到相关数据")) {
      return;
    }

    // Selectors are configured externally.
    String tableSelector = InfoUtil.getInstance().getInfo("dx/sh", "tableSort");
    String bodySelector = InfoUtil.getInstance().getInfo("dx/sh", "tbody");
    String rowSelector = InfoUtil.getInstance().getInfo("dx/sh", "tr");
    String cellSelector = InfoUtil.getInstance().getInfo("dx/sh", "td");

    Elements tables = doc.select(tableSelector);
    if (tables == null || tables.size() == 0) {
      return;
    }

    Elements rows = tables.first().select(bodySelector).first().select(rowSelector);
    for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
      try {
        Elements cells = rows.get(rowIndex).select(cellSelector);
        if (cells.size() != 5) {
          continue;
        }
        String receiverPhone = cells.get(2).text().trim(); // other party's number
        String sentTimeText = cells.get(1).text().trim(); // send time
        String businessType = cells.get(3).text().trim(); // fee type
        String allPay = cells.get(4).text().trim(); // fee

        Date sentTime = null;
        try {
          sentTime = DateUtils.StringToDate(sentTimeText, "yyyy-MM-dd HH:mm:ss");
        } catch (Exception e) {
          e.printStackTrace();
        }

        TelcomMessage message = new TelcomMessage();
        message.setPhone(phoneNo);
        message.setId(UUID.randomUUID().toString());
        message.setBusinessType(businessType);
        message.setRecevierPhone(receiverPhone);
        message.setSentTime(sentTime);
        message.setCreateTs(new Date());
        message.setAllPay(Double.parseDouble(allPay));
        messageList.add(message);
      } catch (Exception e) {
        logger.error("saveSmsLog", e);
      }
    }

    // "下一页" is the next-page link text.
    if (text.contains("下一页")) {
      requestSmsLogService(page + 1, 1, d, dstr, isHistory);
    }
  }
コード例 #15
0
ファイル: ElementTest.java プロジェクト: haggisandchips/jsoup
  /**
   * {@code hasText()} is true for an element whose text sits in a child, true for an element with
   * its own text, and false for an empty element.
   */
  @Test
  public void testHasText() {
    Document doc = Jsoup.parse("<div><p>Hello</p><p></p></div>");
    Elements paragraphs = doc.select("p");
    Element wrapper = doc.select("div").first();

    assertTrue(wrapper.hasText());
    assertTrue(paragraphs.first().hasText());
    assertFalse(paragraphs.last().hasText());
  }
コード例 #16
0
ファイル: Design.java プロジェクト: morenos/vaadin
 /**
  * Builds a component hierarchy from a design given as an html tree.
  *
  * <p>When a component root is supplied, the components created while reading the design are
  * bound to its member fields, and it is an error if any such field stays unbound. Without a
  * component root, the root component is whatever reading the single top-level element yields
  * ({@code null} for an empty body).
  *
  * @param doc the html tree
  * @param componentRoot optional component root instance. The type must match the type of the root
  *     element in the design.
  * @param classWithFields a class (componentRoot class or a super class) with some member fields.
  *     The member fields whose type is assignable from {@link Component} are bound to fields in
  *     the design based on id/local id/caption
  * @return the design context, with its root component set
  * @throws DesignException if the body has more than one child, if a component root is given
  *     but the body is empty, if the fields cannot be introspected, or if fields remain unbound
  */
 private static DesignContext designToComponentTree(
     Document doc, Component componentRoot, Class<?> classWithFields) {
   DesignContext designContext = new DesignContext(doc);
   designContext.readPackageMappings(doc);
   // No special handling for a document without a body element - should be
   // taken care of by jsoup.
   Elements topLevel = doc.body().children();
   int topLevelCount = topLevel.size();
   if (topLevelCount > 1) {
     throw new DesignException(
         "The first level of a component hierarchy should contain at most one root component, but found "
             + topLevelCount
             + ".");
   }
   Element element = topLevelCount == 0 ? null : topLevel.first();

   if (componentRoot == null) {
     // Reading the design creates the entire component hierarchy.
     if (element != null) {
       componentRoot = designContext.readDesign(element);
     }
     designContext.setRootComponent(componentRoot);
     return designContext;
   }

   if (element == null) {
     throw new DesignException(
         "The root element cannot be null when the specified root Component is" + " not null.");
   }

   // The given root instance may have member fields that should be bound.
   final FieldBinder binder;
   try {
     binder = new FieldBinder(componentRoot, classWithFields);
   } catch (IntrospectionException e) {
     throw new DesignException("Could not bind fields of the root component", e);
   }

   // Bind every component created during the read to the fields of the root instance.
   ComponentCreationListener creationListener =
       new ComponentCreationListener() {
         @Override
         public void componentCreated(ComponentCreatedEvent event) {
           binder.bindField(event.getComponent(), event.getLocalId());
         }
       };
   designContext.addComponentCreationListener(creationListener);

   // Create the subtree below the given root.
   designContext.readDesign(element, componentRoot);

   // Every member field has to be bound by now.
   Collection<String> unboundFields = binder.getUnboundFields();
   if (!unboundFields.isEmpty()) {
     throw new DesignException("Found unbound fields from component root " + unboundFields);
   }

   // The listener is no longer needed.
   designContext.removeComponentCreationListener(creationListener);
   designContext.setRootComponent(componentRoot);
   return designContext;
 }
コード例 #17
0
 /**
  * Reads a numeric counter from the element whose {@code data-element-term} attribute equals
  * {@code dataElementTerm}.
  *
  * @param ele the element to search in
  * @param dataElementTerm the attribute value identifying the counter
  * @return the own text of the match's first child as an int, with commas removed, or -1 when
  *     no element matches
  * @throws NumberFormatException if that text is not an integer
  */
 private int getCount(Element ele, String dataElementTerm) throws NumberFormatException {
   Elements matches = ele.getElementsByAttributeValue("data-element-term", dataElementTerm);
   if (matches == null || matches.size() == 0) {
     return -1;
   }
   String digits = matches.first().child(0).ownText().replaceAll(",", "");
   return Integer.parseInt(digits);
 }
コード例 #18
0
ファイル: Configuration.java プロジェクト: nator/goose
 /**
  * Extracts the publish date from the first {@code <em>} inside {@code div[class=articleBody]}.
  *
  * <p>The text is expected in the form {@code MMM dd, yyyy}. Month names are parsed with
  * {@link java.util.Locale#ENGLISH} so that the result does not depend on the default locale of
  * the machine.
  *
  * @param rootElement the element to search in
  * @return the parsed date, or {@code null} when there is no such element or its text is not a
  *     date in the expected form
  */
 @Override
 public Date extract(Element rootElement) {
   // this belongs in a separate class as a proper Date Extractor. But...
   // for now this knows how to pull dates out of mnartists articles
   Elements body = rootElement.select("div[class=articleBody]");
   if (body.size() == 0) {
     return null;
   }
   Elements ems = body.first().select("em");
   if (ems.size() == 0) {
     return null;
   }
   String em = ems.first().text();
   String date_str = string.isNullOrEmpty(em) ? string.empty : em.trim();
   // Diagnostic output only; this is not an error condition.
   logger.debug(date_str);
   try {
     return new SimpleDateFormat("MMM dd, yyyy", java.util.Locale.ENGLISH).parse(date_str);
   } catch (java.text.ParseException e) {
     // Text is not a date in the expected form: report "no date".
     return null;
   }
 }
コード例 #19
0
 /**
  * Fetches the listing page once and keeps the first element with class
  * {@code loupan_list_none} in {@code elem}; does nothing when {@code elem} is already set.
  *
  * @throws IOException if the page cannot be read
  */
 @Before
 public void init() throws IOException {
   if (elem != null) {
     return;
   }
   URL pageUrl =
       new URL(
           "http://newhouse.hfhouse.com/HouseList/index/keyWord/%20%E4%B8%AD%E6%B5%B7%E6%BB%A8%E6%B9%96%E5%85%AC%E9%A6%86/");
   URLConnection connection = pageUrl.openConnection();
   String html = IOUtils.toString(connection.getInputStream(), "utf-8");
   elem = Jsoup.parse(html).getElementsByAttributeValue("class", "loupan_list_none").first();
 }
コード例 #20
0
ファイル: NewsScraper.java プロジェクト: scheakur/orehn
  /**
   * Scrapes the news entries from the rows of the nested listing table.
   *
   * <p>Rows are handled in groups of three by position: position 1 is the title row (title, URL
   * and optional domain), position 2 the subtext row (id, points and comment count when it has
   * at least two links), after which the entry is added. Scraping stops at the first title row
   * with fewer than two {@code .title} elements.
   *
   * @param doc the parsed page
   * @return the entries scraped before the stop condition
   */
  public List<News> scrape(Document doc) {
    Elements rows =
        doc.select("body > center > " + "table > tbody > tr > td > " + "table > tbody > tr");

    List<News> newsList = new ArrayList<>();
    News.Builder builder = null;

    for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
      Element tr = rows.get(rowIndex);
      int position = rowIndex % 3;

      if (position == 1) {
        Elements titles = tr.select(".title");
        if (titles.size() < 2) {
          // Not a title row: stop scraping altogether.
          break;
        }
        builder = new News.Builder();

        Element titleEl = titles.get(1);
        Element anchor = titleEl.select("a").first();
        builder.title = anchor.text();
        builder.url = getUrl(anchor);
        Elements comhead = titleEl.select(".comhead");
        if (comhead.size() > 0) {
          builder.domain = extract(comhead.first().text(), DOMAIN);
        }
      } else if (position == 2) {
        assert builder != null;
        Element subtext = tr.select(".subtext").first();
        Elements links = subtext.select("a");

        if (links.size() > 1) {
          Element comments = links.get(1);
          builder.id = getId(comments);
          builder.points = getPoints(subtext);
          builder.commentsNum = getCommentsNum(comments);
        }

        newsList.add(builder.build());
      }
    }

    return newsList;
  }
コード例 #21
0
ファイル: AuthTest.java プロジェクト: JamesSullivan/mamute
  /**
   * After following the redirect from the question page, the page contains an input named
   * {@code redirectUrl} whose value is the root path followed by {@code /perguntar}.
   */
  @Test
  public void should_save_url_when_redirected_to_login() {
    VRaptorTestResult navigationResult =
        createQuestionPage(navigate()).followRedirect().execute();
    navigationResult.wasStatus(200).isValid();

    String redirectUrl =
        getElementsByAttributeAndValue(navigationResult, "name", "redirectUrl")
            .first()
            .attr("value");
    String expectedUrl = rootPath(navigationResult).concat("/perguntar");

    assertThat(redirectUrl, equalTo(expectedUrl));
  }
コード例 #22
0
ファイル: ElementTest.java プロジェクト: haggisandchips/jsoup
  /**
   * Inserting cloned elements into another parent leaves the originals where they are, and a
   * change made to a clone does not show up in the original.
   */
  @Test
  public void insertChildrenAsCopy() {
    Document doc = Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>");
    Elements divs = doc.select("div");
    Element source = divs.get(0);
    Element target = divs.get(1);

    Elements copies = doc.select("p").clone();
    copies.first().text("One cloned");
    target.insertChildren(-1, copies);

    assertEquals(4, source.childNodeSize()); // not moved -- cloned
    assertEquals(2, target.childNodeSize());
    assertEquals(
        "<div id=\"1\">Text <p>One</p> Text <p>Two</p></div><div id=\"2\"><p>One cloned</p><p>Two</p></div>",
        TextUtil.stripNewlines(doc.body().html()));
  }
コード例 #23
0
 /**
  * Checks that at least one element selected by {@code selector} has text and that its trimmed
  * text equals {@code text}.
  *
  * <p>On failure a violation is recorded; with exactly one selected element the message also
  * shows the text that was actually found.
  *
  * @param selector CSS selector of the elements to inspect
  * @param text the expected text
  * @return the wrapped check result
  */
 public boolean content(String selector, String text) {
   Elements selected = document().select(selector);
   boolean found = selected.stream().anyMatch(e -> e.hasText() && e.text().trim().equals(text));
   if (found) {
     return wrap(true);
   }

   String message =
       selected.size() == 1
           ? String.format(
               "DOM要素 '%s' に文字列 '%s' がセットされているはずですが '%s' となっています",
               selector, text, selected.first().text())
           : String.format("DOM要素 '%s' で文字列 '%s' をもつものが見付かりません", selector, text);
   addViolation(message);
   return wrap(false);
 }
コード例 #24
0
 /**
  * Names the album after the text of the first {@code .albumName} element on the album page,
  * prefixed with {@code HOST} and an underscore.
  *
  * <p>The page is fetched only when {@code albumDoc} is not set yet. On any failure a warning is
  * logged and the superclass naming is used instead.
  *
  * @param url the album URL
  * @return the album title
  * @throws MalformedURLException declared for the superclass fallback
  */
 public String getAlbumTitle(URL url) throws MalformedURLException {
   try {
     // Attempt to use album title as GID
     if (albumDoc == null) {
       final String address = url.toExternalForm();
       logger.info("    Retrieving " + address);
       sendUpdate(STATUS.LOADING_RESOURCE, url.toString());
       albumDoc = Jsoup.connect(address).userAgent(USER_AGENT).timeout(TIMEOUT).get();
     }
     String albumName = albumDoc.select(".albumName").first().text();
     return HOST + "_" + albumName;
   } catch (Exception e) {
     // Fall back to default album naming convention
     logger.warn("Failed to get album title from " + url, e);
     return super.getAlbumTitle(url);
   }
 }
コード例 #25
0
  /**
   * Determines the entry type from the text of the first {@code wordType} node of the element.
   *
   * @param element the element to inspect
   * @return the type mapped to the node text in {@code ENTRY_TYPE_MAP}, or
   *     {@link EntryType#UNKNOWN} when there is no such node or the text is not mapped
   */
  private EntryType detectEntryType(@NotNull Element element) {
    Elements wordTypeNodes = element.getElementsByClass("wordType");

    if (wordTypeNodes.isEmpty()) {
      LOGGER.debug("No wordType node found - defaulting to {}", EntryType.UNKNOWN);
      return EntryType.UNKNOWN;
    }

    String wordType = wordTypeNodes.first().text();
    EntryType entryType = ENTRY_TYPE_MAP.getOrDefault(wordType, EntryType.UNKNOWN);

    if (entryType == EntryType.UNKNOWN) {
      // Log the text that could not be resolved; logging entryType would always print UNKNOWN.
      LOGGER.debug("Unable to resolve entry type \"{}\"", wordType);
    }

    return entryType;
  }
コード例 #26
0
  /**
   * Wires the URL field, the web view and the entry button.
   *
   * <p>Submitting the URL field loads its text in the web view. After a successful page load the
   * URL field shows the loaded location; for item pages the page is fetched again with jsoup and
   * the entry button is enabled unless the page has an {@code #etime} input whose value, read as
   * a long, is smaller than the current time in milliseconds.
   *
   * @param location not used by this method
   * @param resources not used by this method
   */
  @Override
  public void initialize(URL location, ResourceBundle resources) {
    urlField.setOnAction(
        event -> {
          String requested = urlField.getText();
          urlField.setText("tetetetetetetete");
          webView.getEngine().load(requested);
        });

    webView
        .getEngine()
        .getLoadWorker()
        .stateProperty()
        .addListener(
            (ov, oldState, newState) -> {
              if (newState != State.SUCCEEDED) {
                return;
              }
              String url = webView.getEngine().getLocation();
              urlField.setText(url);
              if (!Pattern.compile("http://item.rakuten.co.jp/.*").matcher(url).find()) {
                return;
              }
              try {
                Document document = Jsoup.connect(url).get();
                Elements endTime = document.select("input").select("#etime");
                // Without an #etime input the button is enabled unconditionally.
                if (endTime.size() == 0
                    || Long.parseLong(endTime.first().val()) >= new Date().getTime()) {
                  entryButton.setDisable(false);
                }
              } catch (Exception e) {
                // TODO auto-generated catch block
                e.printStackTrace();
              }
            });

    entryButton.setOnAction(
        event -> {
          urlField.setText("webView disable");
          sendEntryTaskController();
        });
  }
コード例 #27
0
ファイル: SuperSelector.java プロジェクト: Alwaysmy/jcrawler
 /**
  * Handles the "next element" selector part.
  *
  * <p>When the current {@code elements} holds exactly one element, the result is its next
  * element sibling (empty if it has none). Otherwise the current {@code elements} collection
  * itself is returned unchanged.
  *
  * @param query the selector part; must equal {@code NEXT_ELEMENT_TAG}
  * @return the resulting elements
  * @throws IllegalArgumentException if {@code query} is not {@code NEXT_ELEMENT_TAG}
  */
 private Elements parseNextElement(String query) {
   if (!NEXT_ELEMENT_TAG.equals(query)) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   if (elements.size() != 1) {
     return elements;
   }
   Elements result = new Elements();
   Element sibling = elements.first().nextElementSibling();
   if (sibling != null) {
     result.add(sibling);
   }
   return result;
 }
コード例 #28
0
ファイル: VuFind.java プロジェクト: opacapp/opacclient
  /**
   * Parses the detail page of a record into a {@link DetailledItem}.
   *
   * <p>Reads the title, the cover image, the rows of the first record table as details, and
   * then either the volumes (when the page has a {@code #Volumes} element) or the copies. Table
   * rows with fewer than two cells are skipped.
   *
   * @param id the id to set on the item
   * @param doc the parsed detail page
   * @param data the library configuration; {@code "baseurl"} is read from it
   * @return the parsed item
   * @throws OpacErrorException if the page shows an error message; the message text is passed on
   * @throws JSONException if {@code "baseurl"} cannot be read from {@code data}
   */
  static DetailledItem parseDetail(String id, Document doc, JSONObject data)
      throws OpacErrorException, JSONException {
    Elements errors = doc.select("p.error, p.errorMsg, .alert-error");
    if (errors.size() > 0) {
      throw new OpacErrorException(errors.text());
    }

    DetailledItem res = new DetailledItem();
    res.setId(id);

    Elements title = doc.select(".record h1, .record [itemprop=name], .record [property=name]");
    if (title.size() > 0) {
      res.setTitle(title.first().text());
    }
    // Only the first image whose URL contains "over" is considered; it is used as the cover
    // unless its URL also contains "Unavailable".
    for (Element img : doc.select(".record img, #cover img")) {
      String src = img.absUrl("src");
      if (src.contains("over")) {
        if (!src.contains("Unavailable")) {
          res.setCover(src);
        }
        break;
      }
    }

    Element table = doc.select(".record table").first();
    if (table != null) {
      for (Element tr : table.select("tr")) {
        if (tr.children().size() < 2) {
          // Not a description/value row.
          continue;
        }
        Element valueCell = tr.child(1);
        String text = valueCell.text();
        Elements links = valueCell.select("a");
        if (links.size() > 0) {
          String href = links.attr("href");
          // Append the link target unless it is site-relative or the text shows the base URL.
          if (!href.startsWith("/") && !text.contains(data.getString("baseurl"))) {
            text += " " + href;
          }
        }
        res.addDetail(new Detail(tr.child(0).text(), text));
      }
    }

    try {
      if (doc.select("#Volumes").size() > 0) {
        parseVolumes(res, doc, data);
      } else {
        parseCopies(res, doc, data);
      }
    } catch (JSONException e) {
      // Best effort: the item is returned without volumes/copies.
      e.printStackTrace();
    }

    return res;
  }
コード例 #29
0
ファイル: DePlHandler.java プロジェクト: loginus/herman
 /**
  * Extracts dictionary entries from a result page that contains exactly one table.
  *
  * <p>Every {@code td} with a {@code class} attribute is one word: its own text is the word, the
  * first {@code <i>} holds the meta information (empty when absent) and every {@code <b>} is one
  * meaning. One {@link DictionaryEntry} is produced per meaning.
  *
  * @param doc the parsed result page
  * @return the entries found; empty when the page does not contain exactly one table
  */
 public List<DictionaryEntry> extractEntries(Document doc) {
   Elements tables = doc.body().select("table");
   // Also covers a page without any table, which used to fail on tables.first() below.
   if (tables.size() != 1) {
     return Collections.emptyList();
   }
   List<DictionaryEntry> dictionary = new ArrayList<DictionaryEntry>();
   for (Element entry : tables.first().select("td[class]")) {
     String word = entry.ownText();
     Element metaElement = entry.select("i").first();
     String meta = metaElement == null ? "" : metaElement.text();
     for (Element meaning : entry.select("b")) {
       dictionary.add(new DictionaryEntry(word, meaning.text(), meta));
     }
   }
   return dictionary;
 }
コード例 #30
0
  /**
   * Crawls the alumni listing starting at {@code START_URL}, following "next" links, and adds
   * one {@code Alumnus} per table row to {@code alumni}.
   *
   * <p>A page without the expected table contributes no rows. Crawling ends when a page has no
   * next link or the link has no target.
   *
   * @return the number of alumni rows read
   * @throws Exception if a page cannot be fetched or a row has fewer columns than expected
   */
  private static int parseAlumni() throws Exception {
    int count = 0;

    // Processes all pages, following "next" links.
    String url = START_URL;
    while (url != null) {
      // Opens the page and extracts the HTML DOM structure into Jsoup.
      System.out.printf("Parsing %s...%n", url);
      Document doc = Jsoup.connect(url).get();
      url = null;

      // Looks for the first table in the document, where the names of the students are supposed
      // to be.
      Element table = doc.select(SELECTOR_TABLE).first();
      if (table != null) {
        for (Element row : table.select(SELECTOR_ROW)) {
          Elements columns = row.select(SELECTOR_COLUMN);
          if (columns.isEmpty()) {
            // Rows without data columns carry no alumnus.
            continue;
          }
          count++;
          Element nameCell = columns.get(COLUMN_NAME);
          Element defenseDateCell = columns.get(COLUMN_DEFENSE_DATE);
          Element levelCell = columns.get(COLUMN_LEVEL);

          // Also extracts the link to the detail page of the alumni.
          String link = baseUrl + nameCell.select(SELECTOR_DETAIL_LINK).attr(ATTRIBUTE_LINK);

          // Creates and stores the alumni in the set.
          alumni.add(new Alumnus(nameCell.text(), defenseDateCell.text(), levelCell.text(), link));
        }
      }

      // Checks if there's a next page. attr() yields "" for a missing attribute, which must not
      // be treated as a URL to fetch.
      Elements nextLinks = doc.select(SELECTOR_NEXT_LINK);
      if (!nextLinks.isEmpty()) {
        String next = nextLinks.first().attr(ATTRIBUTE_LINK);
        if (!next.isEmpty()) {
          url = next.startsWith("/") ? baseUrl + next : next;
        }
      }
    }

    return count;
  }