/**
 * Downloads a news page and stores its title, publish date, body text and image URLs through
 * {@code savexml}.
 *
 * <p>When the page cannot be fetched, does not contain the expected container or title element,
 * or cannot be saved, the problem is reported on the standard error stream and nothing is saved.
 *
 * @param NewsUrl URL of the news article to fetch
 */
public void getNewsInfo(String NewsUrl) {
    try {
      System.out.println(NewsUrl);
      Document Doc =
          Jsoup.connect(NewsUrl).userAgent("Mozilla").cookie("auth", "token").timeout(3000).get();
      Element textDIV =
          Doc.select("div[style=height:800px; overflow-y:scroll; width:100%;]").first();
      if (textDIV == null) {
        // first() yields null when nothing matches; report it instead of relying on an NPE.
        System.err.println("Article container not found in " + NewsUrl);
        return;
      }
      Element TitleEle = textDIV.select("strong").first();
      if (TitleEle == null) {
        System.err.println("Article title not found in " + NewsUrl);
        return;
      }
      String Title = TitleEle.text(); // article title

      String PublishTime = getDate(NewsUrl); // article publish date

      // Article body: one line per <p> inside #ozoom, each terminated by CRLF.
      StringBuilder Content = new StringBuilder("\r\n");
      for (Element ContentPTag : textDIV.select("div[id=ozoom]").select("p")) {
        Content.append(ContentPTag.text()).append("\r\n");
      }

      // Absolute URLs of the images found in centered table cells.
      List<String> IMGList = new ArrayList<String>();
      for (Element IMG : textDIV.select("td[align=center]").select("img[src]")) {
        IMGList.add(IMG.attr("abs:src"));
      }

      savexml.format.source = NewsUrl;
      savexml.format.title = Title;
      savexml.format.publishtime = PublishTime;
      savexml.format.body = Content.toString();
      savexml.format.img = IMGList;
      savexml.save();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  /**
   * Builds one {@link ArtifactVersionBean} for every {@code version} element of the document.
   *
   * <p>The group and artifact ids are the combined text of the {@code groupId} and
   * {@code artifactId} elements. Versions for which {@code retrieveLastUpdateDate} returns
   * {@code null} are left out.
   *
   * @param doc parsed metadata document
   * @return the beans that have a last-update timestamp; empty when the group id or the artifact
   *     id has no text
   */
  private List<ArtifactVersionBean> parseMavenMetadata(Document doc) {
    final String group = doc.getElementsByTag("groupId").text();
    final String artifact = doc.getElementsByTag("artifactId").text();
    if (!(StringUtils.hasText(group) && StringUtils.hasText(artifact))) {
      return Lists.newArrayListWithCapacity(0);
    }

    final List<ArtifactVersionBean> result = Lists.newArrayList();
    for (Element versionTag : doc.getElementsByTag("version")) {
      final String versionText = versionTag.text();

      final ArtifactVersionBean bean = new ArtifactVersionBean();
      bean.setGroupId(group);
      bean.setArtifactId(artifact);
      bean.setVersion(versionText);
      bean.setId(group + ":" + artifact + ":" + versionText);

      // The bean is only kept when a last-update date could be retrieved for it.
      final Long updatedAt = retrieveLastUpdateDate(bean);
      if (updatedAt != null) {
        bean.setTimestamp(updatedAt);
        result.add(bean);
      }
    }
    return result;
  }
Example #3
0
 /**
  * Extracts areas from the first element with class {@code subarea} in the given HTML.
  *
  * <p>The children of that element are walked starting at the second child. A {@code b} child
  * sets the letter applied to the areas that follow; an {@code a} child becomes an
  * {@link AreaVO} whose pinyin is the part of its {@code href} between index 7 and the last
  * {@code /}. Each area receives the current value of {@code index}, which is then incremented.
  *
  * @param text HTML to parse
  * @param pid parent id stored on every area
  * @return the areas found, in document order; empty when there is no {@code subarea} element
  */
 public List<AreaVO> parseMessage(String text, int pid) {
   final List<AreaVO> areas = new ArrayList<AreaVO>();
   final Elements subareas = Jsoup.parse(text).body().getElementsByClass("subarea");
   if (subareas.isEmpty()) {
     return areas;
   }

   String currentLetter = "";
   int position = 0;
   for (Element child : subareas.get(0).children()) {
     // The first child is never inspected.
     if (position++ == 0) {
       continue;
     }
     final String tag = child.tagName();
     if ("b".equals(tag)) {
       currentLetter = child.text();
     } else if ("a".equals(tag)) {
       final AreaVO area = new AreaVO();
       area.setLetter(currentLetter);
       area.setName(child.text());
       area.setOrderIdx(index);
       area.setPid(pid);

       final String href = child.attr("href");
       area.setPinyin(href.substring(7, href.lastIndexOf("/")));

       index++;
       System.out.println(area.toString());
       areas.add(area);
     }
   }
   return areas;
 }
  /**
   * Finds the selected pupil in the page, inserting every pupil that is not known yet.
   *
   * <p>Pupils are read from the {@code option} elements below the element with id
   * {@code ctl00_topMenu_pupil_drdPupils}. When the page has no such option, the single pupil is
   * taken from the first element with class {@code user-name} and the {@code value} of the
   * element with id {@code ctl00_topMenu_tbUserId}.
   *
   * @param doc parsed page
   * @return the selected pupil, never {@code null}
   * @throws ParseException if no pupil can be determined from the document
   */
  public static Pupil getSelectedPupil(Document doc) throws ParseException {

    boolean found = false;
    Pupil p, selectedP = null;

    Elements pupilSelectors =
        doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils");
    for (Element pupilSelector : pupilSelectors) {

      for (Element pupil : pupilSelector.getAllElements()) {
        if (!pupil.tagName().equals("option")) {
          continue;
        }

        String value = pupil.attr("value");

        found = true;
        if ((p = Pupil.getByFormId(value)) == null) {

          p = new Pupil(pupil.text(), value);
          long rowId = p.insert();

          if (BuildConfig.DEBUG)
            Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
        }

        if (pupil.hasAttr("selected") && pupil.attr("selected").equals("selected")) {
          selectedP = p;
        }
      }
    }

    if (!found) {

      if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " Alternative fields found!");

      Element userName = doc.getElementsByClass("user-name").first();
      Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();

      // first() yields null when nothing matches; report that as the documented parse failure
      // rather than letting a NullPointerException escape.
      if (userName == null || userId == null) {
        throw new ParseException("Pupils not found", 0);
      }

      String name = userName.text();
      String id = userId.attr("value");

      if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);

      if ((p = Pupil.getByFormId(id)) == null) {

        p = new Pupil(name, id);
        long rowId = p.insert();

        if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
      }

      selectedP = p;
    }

    // Options were present but none of them was marked as selected.
    if (selectedP == null) throw new ParseException("Pupils not found", 0);

    return selectedP;
  }
Example #5
0
  /**
   * Walks {@code spans} from left to right, appending each span's text to {@code name} and
   * recording the largest Y1 coordinate seen in {@code maxY1}.
   *
   * <p>Coordinates come from the whitespace-separated {@code title} attribute of a span: token 1
   * is X1, token 2 is Y1 and token 3 is X2. The walk stops at the first span whose X1 lies left
   * of the previous span's X2. A span whose title has too few tokens is reported on standard
   * output and skipped.
   *
   * <p>NOTE(review): the earlier description of this method spoke of the lowest Y1, whereas the
   * code keeps the largest value; confirm which is intended.
   */
  private void setMaxY1() {
    Element previousSpan = null;
    int previousX2 = 0;

    for (Element span : spans) {
      try {
        final String[] tokens = span.attr("title").split("\\s+");
        // Parse order (X1, X2, Y1) is kept so the same exception wins on malformed input.
        final int x1 = Integer.parseInt(tokens[1]);
        final int x2 = Integer.parseInt(tokens[3]);
        final int y1 = Integer.parseInt(tokens[2]);
        if (x1 < previousX2) {
          break;
        }
        name = name + span.text() + " ";
        if (maxY1 < y1) {
          this.maxY1 = y1;
        }
        previousX2 = x2;
        previousSpan = span;
      } catch (IndexOutOfBoundsException e) {
        System.out.println("This table got a weird name it raised the following error: ");
        if (previousSpan != null) {
          System.out.println(previousSpan.text());
        }
        System.out.println(e);
      }
    }
  }
  /**
   * Decides whether the page looks like an index page generated by the targeted server.
   *
   * <p>The page is recognized when the response status is 200 and one of its anchors has the same
   * text and the same absolute {@code href} as the element returned by
   * {@code getParentDirectoryElement(page)}. No remote request is made here.
   *
   * @param context scrape context (not used by this check)
   * @param page page to inspect
   * @return a result with outcome {@code RECOGNIZED_SHOULD_BE_SCRAPED} on a match, otherwise
   *     {@code UNRECOGNIZED}
   */
  @Override
  protected RemoteDetectionResult detectRemoteRepository(
      final ScrapeContext context, final Page page) {
    boolean recognized = false;

    if (page.getHttpResponse().getStatusLine().getStatusCode() == 200) {
      final Elements anchors = page.getDocument().getElementsByTag("a");
      if (!anchors.isEmpty()) {
        // The "template" parent link; the real one is usually the first anchor, but some servers
        // (HTTPD for example) put extra column-sorting links in front of it.
        final Element expected = getParentDirectoryElement(page);
        for (Element anchor : anchors) {
          final boolean sameText = expected.text().equals(anchor.text());
          if (sameText && expected.absUrl("href").equals(anchor.absUrl("href"))) {
            recognized = true;
            break;
          }
        }
      }
    }

    if (recognized) {
      return new RemoteDetectionResult(
          RemoteDetectionOutcome.RECOGNIZED_SHOULD_BE_SCRAPED,
          getTargetedServer(),
          "Remote is a generated index page of " + getTargetedServer());
    }

    // Not positive: this might be some other web server with a similar looking index page.
    return new RemoteDetectionResult(
        RemoteDetectionOutcome.UNRECOGNIZED,
        getTargetedServer(),
        "Remote is not a generated index page of " + getTargetedServer());
  }
 /**
  * Parses the home page and collects its links.
  *
  * <p>As a side effect the text of the element with id {@code xsrs} is stored in
  * {@code tonggou}. The student name is the text of the element with id {@code xhxm} up to its
  * last "同" character; every occurrence of that name in a link target is replaced by its
  * gb2312 URL-encoded form. Links whose text is "退出" and links whose target is {@code #} or
  * {@code #a} are left out.
  *
  * @param html HTML text of the home page
  * @return map from link text to rewritten link target
  * @throws Exception if the student name cannot be URL-encoded
  */
 public static Map<String, String> parseIndexHtml(String html) throws Exception {
   Map<String, String> urlMap = new HashMap<String, String>();
   Document doc = Jsoup.parse(html);
   Elements links = doc.select("a[href]");
   Element content = doc.getElementById("xhxm");
   Element tonggous = doc.getElementById("xsrs");
   tonggou = tonggous.text(); // announcement text

   String contentText = content.text();
   String studentname = contentText.substring(0, contentText.lastIndexOf("同"));
   String replacename = URLEncoder.encode(studentname, "gb2312");

   for (Element link : links) {
     String linkHref = link.attr("href");
     String linkText = link.text();
     if ("退出".equals(linkText) || "#".equals(linkHref) || "#a".equals(linkHref)) {
       continue;
     }
     // Literal replacement: replaceAll would interpret the student name as a regular expression.
     String truelinkHref = linkHref.replace(studentname, replacename);
     System.out.println(linkText + ":" + truelinkHref);
     urlMap.put(linkText, truelinkHref);
   }
   return urlMap;
 }
  /**
   * Loads the sample report and fills {@code summary_table} and {@code summary_error_table}.
   *
   * <p>Inside the element with id {@code market-feature-header}, the i-th {@code rTableHead} text
   * is mapped to the i-th {@code rTableCell} text, and the i-th {@code rTableHeadFailed} text is
   * mapped to the i-th {@code rTableCellFailed} text and to the {@code href} of the i-th anchor.
   */
  @BeforeClass
  public static void setUp() {
    Document doc = null;
    try {
      doc =
          Jsoup.parse(
              new File(
                  "src/test/java/org/jenkinsci/plugins/marketfeaturereport/market_features.html"),
              "UTF-8");
    } catch (IOException e) {
      e.printStackTrace();
    }
    assert doc != null;

    final Element content = doc.getElementById("market-feature-header");
    final Elements passedHeads = content.getElementsByClass("rTableHead");
    final Elements failedHeads = content.getElementsByClass("rTableHeadFailed");
    final Elements passedCells = content.getElementsByClass("rTableCell");
    final Elements failedCells = content.getElementsByClass("rTableCellFailed");

    for (int i = 0; i < passedHeads.size(); i++) {
      summary_table.put(passedHeads.get(i).text(), passedCells.get(i).text());
    }

    final Elements errorLinks = content.getElementsByTag("a");
    for (int i = 0; i < failedHeads.size(); i++) {
      final Element head = failedHeads.get(i);
      summary_table.put(head.text(), failedCells.get(i).text());
      summary_error_table.put(head.text(), errorLinks.get(i).attr("href"));
    }
  }
  /**
   * Builds an {@code ArticleObj} from an article page and hands it to {@code dbRobot}.
   *
   * <p>Title, creation time and content come from the elements with ids {@code activity-name},
   * {@code post-date} and {@code essay-body}. The intro is the text of the first {@code .text p}
   * element (or a fixed fallback text), cut to at most 50 characters and followed by
   * {@code "..."}. The picture is derived from the first {@code #media img}, when there is one.
   * {@code cur_count} is incremented after the article has been stored.
   *
   * @param html HTML of the article page
   * @return always {@code null}
   */
  @Override
  public Object parseHtml2Obj(String html) {
    final Document page = Jsoup.parse(html);
    final Element titleEl = page.getElementById("activity-name");
    final Element dateEl = page.getElementById("post-date");
    final Element bodyEl = page.getElementById("essay-body");
    final Elements pictures = page.select("#media img");
    final Elements introParagraphs = page.select(".text p");

    final String intro =
        introParagraphs.isEmpty() ? "阅读全部" : introParagraphs.first().text();

    final ArticleObj article = new ArticleObj();
    article.setFrom(account_desc);
    article.setContent(bodyEl.html());
    article.setCreatetime(dateEl.text());
    article.setTitle(titleEl.text());
    article.setIntro(intro.substring(0, Math.min(50, intro.length())) + "...");
    if (!pictures.isEmpty()) {
      article.setPic(getSrc(pictures.get(0).attr("src")));
    }

    System.err.println(article.getPic());
    dbRobot.AddArticleData(article);
    cur_count++;
    return null;
  }
Example #10
0
  /**
   * Reads per-day values from a saved HTML file. (Author's note: completely useless.)
   *
   * <p>Every {@code div} whose text starts with a digit and contains {@code "€ "} is split on
   * single spaces: token 0 is the 1-based slot in the result and token 2, with dots removed, is
   * the value stored there. Any exception stops the scan and is printed; a message is printed
   * when no matching {@code div} was seen.
   *
   * @param path path of the HTML file
   * @return array of 31 values; slots that were never filled are 0
   */
  public static int[] getPrice(String path) {
    final int[] month = new int[31];
    int count = 0;
    try {
      final Document doc = Jsoup.parse(new File(path), "UTF-8", "http://example.com/");
      for (Element div : doc.getElementsByTag("div")) {
        final String text = div.text();
        if (text.isEmpty() || !Character.isDigit(text.charAt(0)) || !text.contains("€ ")) {
          continue;
        }
        count++;
        final String[] tokens = text.split(" ");
        month[Integer.parseInt(tokens[0]) - 1] = Integer.parseInt(tokens[2].replace(".", ""));
      }
    } catch (Exception e) {
      System.out.println(e);
    }

    if (count == 0) {
      System.out.println("Non e' stato scaricato il file");
    }
    return month;
  }
 /**
  * Joins the text of all elements whose {@code class} attribute is exactly {@code starPicTxt},
  * {@code dlTxt clearfix} or {@code v-star-info}, visiting the classes in that order.
  *
  * @param document parsed page
  * @return the texts separated by single spaces, or an empty string when nothing matches
  */
 @Override
 public String getContentText(Document document) {
   final String[] classValues = {"starPicTxt", "dlTxt clearfix", "v-star-info"};

   final StringBuilder joined = new StringBuilder();
   for (String classValue : classValues) {
     for (Element match : document.getElementsByAttributeValue("class", classValue)) {
       joined.append(' ').append(match.text());
     }
   }

   // Every piece was prefixed with a separator; drop the leading one.
   return joined.length() == 0 ? "" : joined.substring(1);
 }
Example #12
0
    /**
     * Downloads the mess menu page and fills {@code breakfast}, {@code lunch} and {@code dinner}.
     *
     * <p>The first {@code tbody} of the page is used when {@code mess[0]} equals {@code "dh1"},
     * the second one otherwise. The children of its 2nd, 3rd and 4th {@code td} supply the
     * breakfast, lunch and dinner items.
     *
     * @param mess mess identifier in {@code mess[0]}
     * @return {@code true} when the menu was read, {@code false} on an I/O error or when an
     *     expected table or cell is missing
     */
    @Override
    protected Boolean doInBackground(String... mess) {
      try {
        final Document page = Jsoup.connect("http://messmenu.snu.in/messMenu.php").get();

        final int tableIndex = mess[0].equals("dh1") ? 0 : 1;
        final Elements cells = page.getElementsByTag("tbody").get(tableIndex).getElementsByTag("td");

        // All three cells are resolved before any list is touched.
        final Elements breakfastItems = cells.get(1).children();
        final Elements lunchItems = cells.get(2).children();
        final Elements dinnerItems = cells.get(3).children();

        for (Element item : breakfastItems) {
          breakfast.add(item.text());
        }
        for (Element item : lunchItems) {
          lunch.add(item.text());
        }
        for (Element item : dinnerItems) {
          dinner.add(item.text());
        }
        return true;
      } catch (IOException | IndexOutOfBoundsException e) {
        e.printStackTrace();
        return false;
      }
    }
Example #13
0
 /**
  * Looks up the course type through the widget described by the {@code #kw} element.
  *
  * <p>The widget URL is assembled from the {@code data-institution}, {@code data-course},
  * {@code data-orientation}, {@code data-language} and {@code data-kismode} attributes of the
  * first {@code #kw} element. The widget page is fetched with a 10 second timeout, and the type
  * is the first word of its {@code #kisWidget > div.widgetCourse > h1} heading.
  *
  * @param doc parsed course page
  * @return the type, or an empty string when the page has no {@code #kw} element, the widget has
  *     no heading, or fetching the widget fails (the failure is printed)
  */
 public static String getType(Document doc) {
   Elements widgets = doc.select("#kw");
   if (widgets.isEmpty()) {
     return "";
   }
   Element widget = widgets.get(0);

   String typeURL =
       new StringBuilder("http://widget.unistats.ac.uk/Widget/")
           .append(widget.attr("data-institution")).append('/')
           .append(widget.attr("data-course")).append('/')
           .append(widget.attr("data-orientation")).append('/')
           .append("null/")
           .append(widget.attr("data-language")).append('/')
           .append(widget.attr("data-kismode"))
           .toString();

   String type = "";
   try {
     // A single request: the former do/while around this always ran exactly once.
     Document widgetDoc = Jsoup.connect(typeURL).timeout(10000).get();
     Elements headings = widgetDoc.select("#kisWidget > div.widgetCourse > h1");
     if (!headings.isEmpty()) {
       String heading = headings.get(0).text().trim();
       int firstSpace = heading.indexOf(" ");
       type = firstSpace > 0 ? heading.substring(0, firstSpace) : heading;
     }
   } catch (Exception ex) {
     ex.printStackTrace();
   }
   return type;
 }
  /**
   * Scrapes price, model number, title and UPC of the item listed at {@code url}.
   *
   * <p>The listing page supplies {@code price}, {@code modelNumber} and {@code title}. The href
   * of the title link, passed through {@code bestBuySpecsFormatter}, locates the specification
   * page, whose first table row containing {@code UPC} supplies {@code upc}. {@code GoodSKU} is
   * {@code "true"} when the specification table has at least one row, {@code "false"} otherwise.
   *
   * @param doc ignored; it is replaced right away by the document fetched from {@code url}
   * @param url address of the listing page
   * @return the collected values keyed by name
   */
  public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
    doc = jsoupConnect(url);
    final Elements titleLink = doc.select(".list-item-info .sku-title h4 a");

    final HashMap<String, String> matchingItems = new HashMap<String, String>();
    matchingItems.put("price", doc.select(".medium-item-price").text());
    matchingItems.put(
        "modelNumber", doc.select(".list-item-info .sku-model ul .model-number").text());
    matchingItems.put("title", titleLink.text());

    final String specsUrl = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
    System.out.println(specsUrl);

    doc = jsoupConnect(specsUrl);
    final Elements specRows = doc.select("#full-specifications table tbody tr");
    for (Element row : specRows) {
      final String rowText = row.text();
      if (rowText.contains("UPC")) {
        matchingItems.put("upc", rowText.replace("UPC ", ""));
        break;
      }
    }
    matchingItems.put("GoodSKU", specRows.size() < 1 ? "false" : "true");
    doc.empty();

    return matchingItems;
  }
  /**
   * Requests the timetable of the bus behind {@code arrival}, starting now, and returns the stops
   * it lists.
   *
   * <p>The response is parsed as HTML. Every {@code li} of the first {@code busroutelist}
   * element yields one arrival: the time is the first five characters of its
   * {@code routelist-time} text, the stop name is its {@code routelist-destination} text, and the
   * stop code is the href of the destination link without the {@code /v3/uk/bus/stop/} prefix
   * and cut at the first remaining {@code /}.
   *
   * @param arrival arrival whose bus and stop select the route
   * @return one arrival per listed stop, in page order
   * @throws Exception if the request fails, the status is not 200, or the page does not have the
   *     expected structure
   */
  public List<Arrival> busTimetable(final Arrival arrival) throws Exception {
    final String stopPrefix = "/v3/uk/bus/stop/";
    final Calendar now = Calendar.getInstance(Locale.UK);

    final String routePath =
        String.format(
            "v3/uk/bus/route/%s/%s/inbound/%s/%s/%s/timetable",
            arrival.bus.operator,
            arrival.bus.route,
            arrival.stop.atcocode,
            dateFormat.format(now.getTime()),
            timeFormat.format(now.getTime()));
    final Uri url =
        Uri.parse("http://transportapi.com")
            .buildUpon()
            .path(routePath)
            .appendQueryParameter("api_key", apiKey)
            .appendQueryParameter("app_id", appId)
            .appendQueryParameter("group", "no")
            .build();
    Log.d("JSON API", String.format("Requesting %s", url));

    final HttpResponse response = http.execute(new HttpGet(url.toString()));
    final StatusLine status = response.getStatusLine();
    if (status.getStatusCode() != HttpStatus.SC_OK) {
      response.getEntity().getContent().close();
      throw new IOException(status.getReasonPhrase());
    }

    final Document doc = Jsoup.parse(EntityUtils.toString(response.getEntity()), url.toString());
    final Element stopList = doc.getElementsByClass("busroutelist").first();

    final List<Arrival> result = new ArrayList<Arrival>();
    for (Element item : stopList.getElementsByTag("li")) {
      final String timeText = item.getElementsByClass("routelist-time").first().text();
      final Time when = parseSimpleTime(timeText.substring(0, 5));

      final Element destination = item.getElementsByClass("routelist-destination").first();
      String code = destination.getElementsByTag("a").first().attr("href");
      if (code.startsWith(stopPrefix)) {
        code = code.substring(stopPrefix.length());
      }
      final int slash = code.indexOf('/');
      if (slash > 0) {
        code = code.substring(0, slash);
      }

      result.add(new Arrival(arrival.bus, new Stop(code, destination.text()), when));
    }
    return result;
  }
  /**
   * Builds the tomes of a series, with their chapters, from the chapter list of the page.
   *
   * <p>Every {@code span.left} inside the first {@code div.detail_list} describes one chapter.
   * Its tome number is the text after {@code "Vol "} in the nested {@code span.mr6}; when that
   * text is empty the chapter goes into tome 0. Chapters sharing a tome number are attached to
   * the same {@link Tome}. The validity date of {@code parent} is set to the current date.
   *
   * @param htmlDocument parsed series page
   * @param parent series the tomes belong to
   * @return the tomes in order of first appearance; empty when the page has no chapter list
   * @throws NumberFormatException if a tome or chapter number is not numeric
   */
  @Override
  protected List<Tome> parseTomes(Document htmlDocument, Serie parent) {
    Date today = new Date();
    List<Tome> tomes = new LinkedList<>();

    Elements divChapters = htmlDocument.select("div.detail_list");
    if (!divChapters.isEmpty()) {
      Elements spansLeft = divChapters.first().select("span.left");
      for (Element span : spansLeft) {
        Elements tomeNumberElements = span.select("span.mr6");
        final String tomeNumberString =
            StringUtils.substringAfter(tomeNumberElements.first().text(), "Vol ");
        int tomeNumber = 0;
        if (tomeNumberString != null && !tomeNumberString.isEmpty()) {
          // The parsed value used to be discarded, which put every chapter into tome 0.
          tomeNumber = Integer.parseInt(tomeNumberString);
        }

        // Reuse the tome when an earlier chapter already created it.
        Tome foundTome = null;
        for (Tome tome : tomes) {
          if (tomeNumber == tome.getNumber()) {
            foundTome = tome;
            break;
          }
        }

        if (foundTome == null) {
          Tome tome = new Tome();
          tome.setNumber(tomeNumber);
          tome.setName("Tome " + tomeNumber);
          tome.setMustBeSaved(true);
          tome.setValidityDate(today);
          tome.setSerie(parent);

          tomes.add(tome);
          foundTome = tome;
        }

        Element link = span.select("a").first();

        Chapter chapter = new Chapter();
        chapter.setMustBeSaved(true);
        chapter.setUrl(link.attr("href"));
        // The chapter number is the last space-separated token of the link text.
        String tempNumber = StringUtils.substringAfterLast(link.text(), " ");
        chapter.setNumber(Float.parseFloat(tempNumber));
        chapter.setName(span.text());
        chapter.setTome(foundTome);

        foundTome.addChapter(chapter);
      }
    }

    parent.setValidityDate(today);
    return tomes;
  }
  /**
   * Registers the weeks offered by the week selector on the schedule and returns the selected
   * one.
   *
   * <p>Weeks are read from the {@code option} elements below the element with id
   * {@code ctl00_body_week_drdWeeks}. An option whose {@code value} is unknown to {@code s}
   * becomes a new {@link Week}. Its start is parsed from the option text before the {@code -},
   * combined with a year taken from the schedule's form text: the part before the {@code -}
   * when the month number is above 7, the part after it otherwise. The option marked
   * {@code selected="selected"} is flagged as loaded and updated.
   *
   * @param doc parsed page
   * @param s schedule that owns the weeks
   * @return the selected week, or {@code null} when options exist but none is marked selected
   * @throws ParseException if the page has no week option or a start date cannot be parsed
   */
  public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {
    final SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
    startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Week selected = null;
    boolean anyOption = false;

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
      for (Element option : selector.getAllElements()) {
        if (!"option".equals(option.tagName())) {
          continue;
        }
        anyOption = true;

        final String label = option.text();
        final String formId = option.attr("value");

        Week current = s.getWeek(formId);
        if (current == null) {
          current = new Week();

          // Day and month of the week start, i.e. the label up to one character before the "-".
          final String begin = label.substring(0, label.indexOf("-") - 1);
          final String month = begin.substring(begin.indexOf(".") + 1);
          final boolean afterJuly = Integer.parseInt(month) > 7;

          final String yearRange = s.getFormText();
          final int dash = yearRange.indexOf("-");
          final String year =
              afterJuly ? yearRange.substring(0, dash - 1) : yearRange.substring(dash + 2);

          current.setStart(startFormat.parse(year + " " + begin));
          current.setFormText(label);
          current.setFormId(formId);

          s.addWeek(current);
        }

        if (option.hasAttr("selected") && "selected".equals(option.attr("selected"))) {
          selected = current;
          final long updated = current.setLoaded().update();

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + updated);
          }
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Weeks not found", 0);
    }
    return selected;
  }
Example #18
0
  /**
   * Parses the feed posts of an HTML page and appends them to {@code mFeedItems}.
   *
   * <p>Every {@code div.tie-box} inside the first {@code div.tie-wrapper} becomes one
   * {@code FeedItem}. The image is {@code url} followed by the {@code src} of the post's
   * {@code img.st-photo}, or {@code null} when there is no such attribute value. Any exception
   * stops the parsing and is printed; items added before the failure are kept. The adapter is
   * notified in every case.
   *
   * @param resource HTML of the feed page
   */
  private void parseFeedItem(String resource) {
    try {
      Document doc = Jsoup.parse(resource);
      Element masthead = doc.select("div.tie-wrapper").first();
      Elements feedBoxs = masthead.select("div.tie-box");

      for (Element feedPost : feedBoxs) {
        Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
        Element nameElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
        Element sourceElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
        Element timestampElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
        Elements imageElement = feedPost.select("div.tie-content img.st-photo");
        Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

        String title = titleElement.text();
        String name = nameElement.text();
        String source = sourceElement.text();
        String timestamp = timestampElement.text();

        // One line per content paragraph.
        StringBuilder content = new StringBuilder();
        for (Element paragraph : contentElements) {
          content.append(paragraph.text()).append("\n");
        }

        // Compare by value: a reference comparison against "" depends on string interning.
        String imageSrc = imageElement.attr("src");
        String image = imageSrc.isEmpty() ? null : url + imageSrc;

        FeedItem feedItem = new FeedItem();
        feedItem.setTitle(title);
        feedItem.setName(name);
        feedItem.setPostTime(timestamp);
        feedItem.setSource(source);
        feedItem.setImage(image);
        feedItem.setContent(content.toString());

        mFeedItems.add(feedItem);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    mFeedItemAdapter.notifyDataSetChanged();
  }
 /**
  * Creates a book named after the trimmed text of {@code bookElement}, with one chapter per
  * anchor found in {@code bookIndexElement}.
  *
  * <p>The text of an anchor is the chapter id and its {@code href} is the chapter page.
  *
  * @param bookElement element whose text is the book name
  * @param bookIndexElement element containing the chapter links
  * @return the populated book
  * @throws NumberFormatException if the text of an anchor is not an integer
  */
 public Book createBook(Element bookElement, Element bookIndexElement) {
   final Book result = new Book(bookElement.text().trim());
   for (Element anchor : bookIndexElement.select("a")) {
     final String page = anchor.attr("href");
     final int id = Integer.parseInt(anchor.text());
     result.addChapter(createChapter(id, page));
   }
   return result;
 }
  /**
   * Copies the title, description and link list of a right-rail list panel into the JCR node.
   *
   * <p>When the element holds more than one {@code h2}, the last {@code h2} and last {@code p}
   * are used and a mismatch note is appended to {@code sb}; otherwise the first ones are used.
   * For every {@code ul}, each {@code li} becomes a JSON string stored in the {@code listitems}
   * property of the {@code element_list_0} child node. An {@code li} without an anchor produces
   * an entry with empty link text and an empty link target. Any exception is printed and ends
   * the method.
   *
   * @param listNode node that receives the title and owns the {@code intro} and
   *     {@code element_list_0} children
   * @param rightListEle HTML element of the list panel
   * @param urlMap URL mapping handed to {@code FrameworkUtils.getLocaleReference}
   * @param locale locale handed to {@code FrameworkUtils.getLocaleReference}
   */
  public void rightRailList(
      Node listNode, Element rightListEle, Map<String, String> urlMap, String locale) {
    try {
      Element title;
      Element description;
      Elements headElements = rightListEle.getElementsByTag("h2");
      if (headElements.size() > 1) {
        title = headElements.last();
        description = rightListEle.getElementsByTag("p").last();
        sb.append("<li>Mismatch in count of list panel component in right rail.</li>");
      } else {
        title = headElements.first();
        description = rightListEle.getElementsByTag("p").first();
      }
      listNode.setProperty("title", title.text());
      javax.jcr.Node introNode = listNode.getNode("intro");
      introNode.setProperty("paragraph_rte", description.text());
      javax.jcr.Node eleListNode = listNode.getNode("element_list_0");

      Elements ulList = rightListEle.getElementsByTag("ul");
      for (Element element : ulList) {
        java.util.List<String> list = new ArrayList<String>();
        Elements menuLiList = element.getElementsByTag("li");

        for (Element li : menuLiList) {
          JSONObject jsonObjrr = new JSONObject();
          Element listItemAnchor = li.getElementsByTag("a").first();

          // An <li> may have no anchor at all; the href was read unconditionally before.
          String anchorText = "";
          String anchorHref = "";
          if (listItemAnchor != null) {
            anchorText = listItemAnchor.text();
            anchorHref = listItemAnchor.absUrl("href");
            if (StringUtil.isBlank(anchorHref)) {
              // absUrl is blank when the href cannot be made absolute; use the raw value.
              anchorHref = listItemAnchor.attr("href");
            }
          }
          // Start extracting valid href
          log.debug("Before right list LinkUrl" + anchorHref + "\n");
          anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb);
          log.debug("after right list LinkUrl" + anchorHref + "\n");
          // End extracting valid href

          jsonObjrr.put("linktext", anchorText);
          jsonObjrr.put("linkurl", anchorHref);
          jsonObjrr.put("icon", "none");
          jsonObjrr.put("size", "");
          jsonObjrr.put("description", "");
          jsonObjrr.put("openInNewWindow", "false");
          list.add(jsonObjrr.toString());
        }
        eleListNode.setProperty("listitems", list.toArray(new String[list.size()]));
      }
      log.debug("Updated title, descriptoin and linktext at " + listNode.getPath());
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Example #21
0
  /**
   * Scans the table rows of the document for item data.
   *
   * <p>For every {@code tr} the title link, the {@code .comhead} text, the score (the first of
   * exactly two space-separated tokens of the {@code span[id^=score_]} text) and the user link
   * are looked up; a row missing any of them is skipped.
   *
   * <p>NOTE(review): nothing is ever added to the returned list, so the result is always empty.
   * Building the {@code Item} from the extracted values looks unfinished; confirm the intended
   * {@code Item} construction and add it.
   *
   * @param doc parsed page
   * @return the extracted items (currently always an empty list)
   */
  public static List<Item> extractItem(Document doc) {
    List<Item> itemList = new ArrayList<Item>();
    for (Element row : doc.select("tr")) {
      Element titleElement = row.select(".title a").first();
      if (titleElement == null) {
        continue;
      }
      String titleStr = titleElement.text().trim();
      String urlStr = titleElement.attr("href").trim();

      Element comHeadElement = row.select(".comhead").first();
      if (comHeadElement == null) {
        continue;
      }
      String comheadStr = comHeadElement.text().trim();

      Element pointsElement = row.select("span[id^=score_]").first();
      if (pointsElement == null) {
        continue;
      }
      String[] pointsArr = pointsElement.text().split(" ");
      if (pointsArr.length != 2) {
        continue;
      }
      int points;
      try {
        points = Integer.parseInt(pointsArr[0]);
      } catch (NumberFormatException ignored) {
        // A non-numeric score makes the row unusable; it is skipped like a negative score.
        continue;
      }
      if (points < 0) {
        continue;
      }

      Element userElement = row.select("a[href^=user]").first();
      if (userElement == null) {
        continue;
      }
      String user = userElement.text().trim();

      Element dateElement = row.select(".subtext").first();
    }

    return itemList;
  }
  /**
   * Turns the category blocks of the page into {@code CategoryEntity} items on three levels.
   *
   * <p>Every {@code div.txt-list-category-v2} yields a top-level category, named after its
   * {@code h3} text and coded with its {@code id}. Inside it, an anchor without {@code href}
   * starts a new second-level category, unless its trimmed text is empty or begins with
   * character 160. An anchor with {@code href} becomes a third-level category below the most
   * recent second-level one, carrying the URL-decoded absolute link; it is skipped when its
   * trimmed text is empty.
   *
   * @param page result holder that supplies the document and receives the categories
   * @throws RuntimeException if a linked category appears before any second-level category, or
   *     the URL cannot be decoded
   */
  @Override
  public void process(ResultItems page) {
    final Document doc = (Document) page.getResource();

    for (Element block : doc.select("div.txt-list-category-v2")) {
      final CategoryEntity ancestor =
          new CategoryEntity()
              .setName(block.select("h3").text())
              .setSite(SiteName.Taobao)
              .setCode(block.attr("id"));
      getLogger().trace(ancestor);
      page.addItem(ancestor);

      CategoryEntity parent = null;
      for (Element anchor : block.select("a")) {
        final String name = anchor.text().trim();

        if (anchor.attr("href").isEmpty()) {
          // Heading anchor: opens a new second-level category.
          if (name.isEmpty() || name.charAt(0) == 160) {
            continue;
          }
          parent = new CategoryEntity().setName(name).setSite(SiteName.Taobao).setParent(ancestor);
          getLogger().trace(parent);
          page.addItem(parent);
          continue;
        }

        // Link anchor: the URL is decoded before the name is checked.
        String url = anchor.absUrl("href");
        try {
          url = java.net.URLDecoder.decode(url, "utf-8");
        } catch (UnsupportedEncodingException e) {
          throw new RuntimeException(url, e);
        }
        if (name.isEmpty()) {
          continue;
        }

        final CategoryEntity grand =
            new CategoryEntity()
                .setName(name)
                .setUrl(url)
                .setSite(SiteName.Taobao)
                .setParent(parent);
        if (parent == null) {
          throw new RuntimeException("no parent of " + grand);
        }
        getLogger().trace(grand);
        page.addItem(grand);
      }
    }
  }
Example #23
0
  /**
   * Fetches the historical-data page of a symbol and stores its table in HBase.
   *
   * <p>Nothing is done when {@code Hbase.getData(symbol)} is not empty. Otherwise the page is
   * requested by POST. Every {@code tr} of the first {@code tbody} becomes a JSON array of its
   * non-empty {@code td} texts; rows without any text are dropped, and the array of rows is
   * stored under the symbol. An empty response or an empty table is reported through
   * {@code WriteError}. A parsing failure adds the symbol to {@code errors} when
   * {@code handleError} is set and is reported through {@code WriteError} otherwise.
   *
   * @param symbol ticker symbol
   */
  public static void parseUSSymbols(String symbol) {
    if (!Hbase.getData(symbol).equals("")) {
      return;
    }

    final String requestUrl = "http://www.nasdaq.com/symbol/" + symbol.toLowerCase() + "/historical";
    final String result = HttpRequest.sendPost(requestUrl, length + "|false|" + symbol);
    if (result.isEmpty()) {
      WriteError(symbol);
      System.out.println(symbol + " result error");
      return;
    }

    final Document doc = Jsoup.parse(result);
    final JSONArray history = new JSONArray();
    try {
      final Elements rows = doc.getElementsByTag("tbody").get(0).getElementsByTag("tr");
      if (rows.size() == 0) {
        WriteError(symbol);
        System.out.println(symbol + " size 0");
        return;
      }

      for (Element row : rows) {
        final JSONArray day = new JSONArray();
        for (Element cell : row.getElementsByTag("td")) {
          final String cellText = cell.text();
          if (!cellText.isEmpty()) {
            day.put(cellText);
          }
        }
        if (day.length() > 0) {
          history.put(day);
        }
      }
      Hbase.addData(symbol, type, history.toString());
    } catch (Exception e) {
      if (handleError) {
        errors.add(symbol);
      } else {
        WriteError(symbol);
        System.out.println(symbol + " parsing error");
      }
    }
  }
Example #24
0
    /**
     * Downloads the Wuhan weather page in the background and collects the forecast text.
     *
     * <p>The {@code Integer} parameter corresponds to the first type argument of AsyncTask and the
     * {@code String} return value to the third. This method does not run on the UI thread, so it
     * must not set or modify UI widgets; call publishProgress to trigger onProgressUpdate when
     * the UI has to be updated.
     *
     * @param params not used
     * @return the text of every {@code oneweather} element followed by the text of every
     *     {@code fourday} element, each preceded by a comma; if the download or parsing fails,
     *     whatever had been collected up to that point (possibly the empty string)
     */
    @Override
    protected String doInBackground(Integer... params) {
      // StringBuilder avoids re-copying the accumulated text on every appended element.
      StringBuilder forecast = new StringBuilder();
      try {
        // Wuhan weather forecast page.
        Document doc = Jsoup.connect("http://tianqi.xixik.com/city/wuhan/").get();

        // Per the original author's note these entries only cover the first three days.
        for (Element element : doc.getElementsByClass("oneweather")) {
          forecast.append(',').append(element.text());
        }
        // Per the original author's note these entries cover the following seven days.
        for (Element element : doc.getElementsByClass("fourday")) {
          forecast.append(',').append(element.text());
        }
      } catch (Exception e) {
        // Best effort: keep whatever was collected and report the failure on stderr.
        e.printStackTrace();
      }
      return forecast.toString();
    }
Example #25
0
  /**
   * Converts every {@code <table>} found in the given HTML into a JSON object.
   *
   * <p>Each table produces {@code {"date": ..., "data": [...]}}. "date" is the text of the
   * element immediately preceding the table (empty string when there is none). "data" holds one
   * object per row that contains {@code <td>} cells. Inside a row, the cell at position
   * {@code i} is stored under {@code labels[i]}: a cell containing an {@code <img>} stores the
   * last '/'-separated segment of the first image's {@code src}, the cell labelled "Time" stores
   * only the text before the first '-', and any other cell stores its text.
   *
   * @param html markup to parse
   * @param labels keys for the cells of a row, by column position; cells beyond the last label
   *     are skipped
   * @return one JSON object per table in document order; if a {@code JSONException} occurs, the
   *     objects completed before it
   */
  public JSONArray toFourDayJSON(String html, String[] labels) {
    Document doc = Jsoup.parse(html);
    JSONArray dates = new JSONArray();

    try {
      for (Element table : doc.select("table")) {
        JSONArray rowsJson = new JSONArray();

        for (Element row : table.select("tr")) {
          Elements cells = row.select("td");
          if (cells.isEmpty()) {
            continue;
          }

          JSONObject details = new JSONObject();
          // Walk by position rather than cells.indexOf(cell): indexOf rescans the list for
          // every cell and depends on equals(), and an explicit bound keeps rows with more
          // cells than labels from running past the end of the labels array.
          int labelled = Math.min(cells.size(), labels.length);
          for (int i = 0; i < labelled; i++) {
            Element cell = cells.get(i);
            String label = labels[i];
            Elements images = cell.select("img");

            if (images.isEmpty()) {
              String text = cell.text();
              if ("Time".equals(label)) {
                // Keep only the part before the first '-'.
                int dash = text.indexOf('-');
                details.put(label, dash < 0 ? text : text.substring(0, dash));
              } else {
                details.put(label, text);
              }
            } else {
              // Store the image's file name; split() drops trailing empty segments, so a src
              // made only of '/' yields no tokens at all.
              String[] tokens = images.get(0).attr("src").split("/");
              details.put(label, tokens.length == 0 ? "" : tokens[tokens.length - 1]);
            }
          }
          rowsJson.put(details);
        }

        // A table that is the first child of its parent has no preceding element.
        Element heading = table.previousElementSibling();
        JSONObject dateItem = new JSONObject();
        dateItem.put("data", rowsJson);
        dateItem.put("date", heading == null ? "" : heading.text());
        dates.put(dateItem);
      }
    } catch (JSONException e) {
      e.printStackTrace();
    }

    return dates;
  }
Example #26
0
 /**
  * Fills this object's declared fields from the text of same-named tags under an element.
  *
  * <p>Only fields whose generic type prints as {@code String}, {@code Integer}, {@code Float}
  * or {@code List<String>} are considered. For the scalar types the first tag returned by
  * {@code getElementsByTag(fieldName)} supplies the value; for {@code List<String>} the text of
  * every such tag is collected. A field is left untouched when no matching tag exists. An
  * {@code IllegalAccessException} raised while assigning is printed and otherwise ignored;
  * a {@code NumberFormatException} from parsing numeric text is not caught here.
  *
  * @param element element whose tags are searched
  */
 public void dealelement(Element element) {
   for (Field field : this.getClass().getDeclaredFields()) {
     String tagName = field.getName();
     Object value = null;

     switch (field.getGenericType().toString()) {
       case "class java.lang.String": {
         Element match = element.getElementsByTag(tagName).first();
         if (match != null) {
           value = match.text();
         }
         break;
       }
       case "class java.lang.Integer": {
         Element match = element.getElementsByTag(tagName).first();
         if (match != null) {
           value = Integer.parseInt(match.text());
         }
         break;
       }
       case "class java.lang.Float": {
         Element match = element.getElementsByTag(tagName).first();
         if (match != null) {
           value = Float.parseFloat(match.text());
         }
         break;
       }
       case "java.util.List<java.lang.String>": {
         Elements matches = element.getElementsByTag(tagName);
         if (matches.size() > 0) {
           List<String> texts = new ArrayList<>();
           for (Element match : matches) {
             texts.add(match.text());
           }
           value = texts;
         }
         break;
       }
       default:
         break;
     }

     // No matching tag (or an unsupported field type): leave the field as it is.
     if (value == null) {
       continue;
     }
     try {
       field.set(this, value);
     } catch (IllegalAccessException e) {
       e.printStackTrace();
     }
   }
 }
Example #27
0
  /**
   * Handles the end of an XML element.
   *
   * <p>The buffered text in {@code temp} is stored on {@code feeds} while {@code inChannel} is
   * set, otherwise on {@code item} while {@code inItem} is set; the first open field flag
   * decides which property receives it, and that flag is cleared. Description text is passed
   * through Jsoup first so that only its plain text is stored. Independently of the flags, a
   * closing {@code channel} tag attaches the collected items to {@code feeds}, adds it to
   * {@code feedsList} and starts a new item list, and a closing {@code item} tag appends
   * {@code item} to {@code itemsList}.
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    if (inChannel) {
      // Channel-level fields; the item flags are not consulted while inChannel is set.
      if (inTitle) {
        inTitle = false;
        feeds.setTitle(temp);
      } else if (inLink) {
        inLink = false;
        feeds.setLink(temp);
      } else if (inDesc) {
        inDesc = false;
        feeds.setDescription(Jsoup.parseBodyFragment(temp).body().text());
      } else if (inLanguage) {
        inLanguage = false;
        feeds.setLanguage(temp);
      }
    } else if (inItem) {
      // Item-level fields.
      if (inTitle) {
        inTitle = false;
        item.setTitle(temp);
      } else if (inLink) {
        inLink = false;
        item.setLink(temp);
      } else if (inDesc) {
        inDesc = false;
        item.setDescription(Jsoup.parseBodyFragment(temp).body().text());
      } else if (inPubdate) {
        inPubdate = false;
        item.setPubdate(temp);
      } else if (inGuid) {
        inGuid = false;
        item.setGuid(temp);
      }
    }

    if (qName.equalsIgnoreCase("channel") && feeds != null) {
      feeds.setItems(itemsList);
      feedsList.add(feeds);
      itemsList = new ArrayList<RSSItem>();
    }
    if (qName.equalsIgnoreCase("item") && item != null) {
      itemsList.add(item);
    }
  }
Example #28
0
 /**
  * Extracts the heading text from a teaser element.
  *
  * <p>For an element carrying the class {@code m-hero__slot}, the title is the first
  * {@code h2} inside the first element with class {@code m-hero__slot-link}. For an element
  * carrying the class {@code m-entry-slot}, it is the first {@code h3}.
  *
  * <p>Missing pieces are detected with explicit null checks instead of throwing and catching
  * {@code NullPointerException}, so no stack trace is printed for the fallback case.
  *
  * @param element teaser element; may be {@code null}
  * @return the heading text, or {@code "Unknown title"} when the element is {@code null}, has
  *     neither class, or lacks the expected link/heading
  */
 String parseTitle(Element element) {
   if (element == null) {
     return "Unknown title";
   }

   Element heading = null;
   if (element.classNames().contains("m-hero__slot")) {
     Element link = element.getElementsByClass("m-hero__slot-link").first();
     if (link != null) {
       heading = link.getElementsByTag("h2").first();
     }
   } else if (element.classNames().contains("m-entry-slot")) {
     heading = element.getElementsByTag("h3").first();
   }

   return heading == null ? "Unknown title" : heading.text();
 }
  /**
   * Produces the plain text for the given top node.
   *
   * <p>Nodes with negative scores are removed first, then {@code append} collects text for the
   * nodes selected by {@code nodesToKeepCssSelector}. A trimmed result longer than 100
   * characters is returned as is. Otherwise the node's full text is used instead when nothing
   * was collected, or when the node has text and the collected text is no longer than the
   * node's own text; the chosen string is then re-parsed with Jsoup and its text returned.
   *
   * <p>NOTE(review): the original comment says P tags are turned into blank lines; that would
   * happen inside {@code append}, which is not visible here - confirm.
   */
  public String getFormattedText(Element topNode) {
    removeNodesWithNegativeScores(topNode);

    StringBuilder collected = new StringBuilder();
    append(topNode, collected, nodesToKeepCssSelector);
    String text = SHelper.innerTrim(collected.toString());
    if (text.length() > 100) {
      return text;
    }

    // No usable sub-elements: fall back to the node's complete text.
    boolean useNodeText = text.isEmpty();
    if (!useNodeText) {
      useNodeText = !topNode.text().isEmpty() && text.length() <= topNode.ownText().length();
    }
    if (useNodeText) {
      text = topNode.text();
    }

    // Re-parse the (small) snippet so that leftover HTML tags do not end up in the result.
    return Jsoup.parse(text).text();
  }
 /**
  * Finds the "player statistics" link in the current document.
  *
  * <p>Every {@code <a>} element whose text equals "player statistics" (ignoring case) is
  * tried in document order; its {@code href} is appended to {@code http://www.whoscored.com}.
  * A candidate that does not form a valid URL is logged together with the exception and the
  * search continues with the next link.
  *
  * @return the first well-formed player statistics URL, or {@code null} when there is none
  */
 public URL getPlayerStatsURL() {
   for (Element link : getDocument().getElementsByTag("a")) {
     if (!"player statistics".equalsIgnoreCase(link.text())) {
       continue;
     }
     String urlString = "http://www.whoscored.com" + link.attr("href");
     try {
       return new URL(urlString);
     } catch (MalformedURLException e) {
       // Pass the exception as the last argument so the logger records the cause as well.
       logger.error("Malformed URL exception when getting player stats URL: {}.", urlString, e);
     }
   }
   return null;
 }