Пример #1
1
 /**
  * Downloads the detail page of a deck and fills in its class cards, neutral cards and
  * description.
  *
  * <p>The page at {@code HPDeckSource.BASE_URL + hsDeck.getUrl()} is fetched with Jsoup. Card
  * cells are read from the "class-listing" and "neutral-listing" sections; for every cell the
  * text of its first link is used as the card name and the last character of the trimmed cell
  * text is stored as that name's value in the corresponding map.
  *
  * @param hsDeck the deck to populate; its URL is read and its card maps, card lists and
  *     description are overwritten
  * @param n passed through unchanged to {@code HtmlHelper.parseDescription}
  * @return the same {@code hsDeck} instance, whether or not the download succeeded
  */
 @Override
 public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
   try {
     final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

     final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
     final ArrayList<String> classNames = new ArrayList<String>();
     collectCardCells(
         page.select("section.class-listing table.listing td.col-name"),
         classHsItemMap,
         classNames);
     hsDeck.setClassHsItemMap(classHsItemMap);
     hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classNames));

     final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
     final ArrayList<String> neutralNames = new ArrayList<String>();
     collectCardCells(
         page.select("section.neutral-listing table.listing td.col-name"),
         neutralHsItemMap,
         neutralNames);
     hsDeck.setNeutralHsItemMap(neutralHsItemMap);
     hsDeck.setNeutralHsItemList(DataBaseManager.getInstance().getAllCardsByNames(neutralNames));

     hsDeck.setDescription(
         HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
     return hsDeck;
   } catch (IOException ex) {
     ex.printStackTrace();
     return hsDeck;
   }
 }

 /**
  * Reads card name cells into a name-to-value map and an ordered list of names.
  *
  * <p>Cells that contain no link or no text are skipped; previously such a cell raised an
  * unchecked index exception that was not covered by the caller's {@code IOException} handler.
  */
 private void collectCardCells(
     final Elements cells,
     final HashMap<String, String> valueByName,
     final ArrayList<String> names) {
   for (int i = 0; i < cells.size(); ++i) {
     final Elements links = cells.get(i).select("a");
     final String cellText = cells.get(i).text().trim();
     if (links.isEmpty() || cellText.isEmpty()) {
       continue;
     }
     final String name = links.get(0).text();
     valueByName.put(name, cellText.substring(cellText.length() - 1));
     names.add(name);
   }
 }
Пример #2
1
 /**
  * Determines the start date of a stock from the Sina market-history page.
  *
  * <p>The history page for the stock is requested with {@code year=1980} and {@code jidu=1}. The
  * first element whose {@code name} attribute is "year" is located, the text of its last
  * {@code option} is taken as the year, and the quarter is obtained from
  * {@code tryQuarter(stock, year)}.
  *
  * @param stock the stock whose history page is queried
  * @return the date built by {@code Date.quarterToDate}, or {@code null} when the page has no
  *     "year" element or that element has no options
  * @throws Exception if building the URI, downloading or parsing fails
  */
 public Date getStartDate(StockInfo stock) throws Exception {
   String path = "/corp/go.php/vMS_MarketHistory/stockid/" + stock.numberToString() + ".phtml";
   URI uri =
       new URIBuilder()
           .setScheme("http")
           .setHost("vip.stock.finance.sina.com.cn")
           .setPath(path)
           .setParameter("year", "1980")
           .setParameter("jidu", "1")
           .build();

   // Release the stream and the downloader even when reading or parsing throws.
   Document doc;
   DownloadHelper download = new DownloadHelper(uri);
   try {
     InputStream is = download.getInputStream();
     try {
       doc = Jsoup.parse(inputStreamToStringBuilder(is).toString());
     } finally {
       is.close();
     }
   } finally {
     download.close();
   }

   // An empty result, not null, is what a failed lookup produces, so test for emptiness.
   Elements select = doc.getElementsByAttributeValue("name", "year");
   if (select == null || select.isEmpty()) {
     return null;
   }
   Elements years = select.get(0).getElementsByTag("option");
   if (years.isEmpty()) {
     return null;
   }
   String year = years.get(years.size() - 1).text();
   return Date.quarterToDate(Integer.parseInt(year), tryQuarter(stock, year));
 }
Пример #3
0
  /**
   * Parses table rows into program entries, tracking {@code rowspan} so that cells spanning
   * several rows are accounted for.
   *
   * <p>Two counters record how many more rows the current first-column cell and the current
   * program cell still cover. A row only yields an entry when a new program cell starts in it;
   * the entry holds the filtered text of that cell's {@code dt} and {@code dd} elements.
   *
   * @param rows the source table rows
   * @return an array with one slot per row; {@code [i][0]} and {@code [i][1]} are the filtered
   *     {@code dt} and {@code dd} texts, and stay {@code null} for rows that yield no entry
   */
  private static String[][] parseRows(Elements rows) {
    String[][] programs = new String[rows.size()][2];
    // Remaining rows covered by the current column-0 cell and by the current program cell.
    int rowspan_0 = 0;
    int rowspan_1 = 0;
    for (int i = 0; i < rows.size(); i++) {
      Element row = rows.get(i);
      try {
        Elements cells = row.children();

        if (rowspan_0 == 0) {
          // A new column-0 cell starts in this row, so the program cell (if any) is at index 1.
          Element cell_0 = cells.get(0);
          rowspan_0 = Integer.valueOf(cell_0.attr("rowspan"));
          if (rowspan_1 == 0) {
            Element cell_1 = cells.get(1);
            rowspan_1 = Integer.valueOf(cell_1.attr("rowspan"));
            programs[i][0] = DBclass.xmlFilte(cell_1.select("dt").text());
            programs[i][1] = DBclass.xmlFilte(cell_1.select("dd").text());
          }
        } else if (rowspan_1 == 0) {
          // Column 0 is still covered by an earlier cell, so the program cell is the first child.
          Element cell_0 = cells.get(0);
          rowspan_1 = Integer.valueOf(cell_0.attr("rowspan"));
          programs[i][0] = DBclass.xmlFilte(cell_0.select("dt").text());
          programs[i][1] = DBclass.xmlFilte(cell_0.select("dd").text());
        }
        rowspan_0--;
        rowspan_1--;
      } catch (Exception e) {
        // Any failure (for example a missing cell or a non-numeric rowspan) is printed and the
        // loop moves on; note that the counters are NOT decremented for such a row.
        e.printStackTrace(System.out);
      }
    }
    return programs;
  }
  /**
   * Populates {@code optionArea} with one label and one combo box per option group of the page
   * currently loaded in {@code webView}.
   *
   * <p>The page is re-fetched with Jsoup. Inside the element with id
   * {@code normal_basket_<item_id>} the first {@code td} is inspected: every {@code span}
   * supplies a label and the {@code select} at the same position supplies the choices. Any
   * failure is printed and leaves the pane in whatever state had been reached.
   */
  private void initPane() {
    try {
      Document page = Jsoup.connect(webView.getEngine().getLocation()).get();
      String basketSelector = "#normal_basket_" + page.select("[name=item_id]").val();
      Element basket = page.select(basketSelector).first();
      Element firstCell = basket.select("td").first();
      Elements labels = firstCell.select("span");
      Elements dropdowns = firstCell.select("select");

      cmb = new ArrayList<ComboBox>();
      for (int i = 0; i < labels.size(); i++) {
        ObservableList<ValuePair> choices = FXCollections.observableArrayList();
        for (Element option : dropdowns.get(i).select("option")) {
          choices.add(new ValuePair("choice", option.text(), option.val()));
        }

        ComboBox<ValuePair> combo = new ComboBox<ValuePair>(choices);
        cmb.add(combo);
        optionArea.getChildren().addAll(new Text(labels.get(i).text()), combo);
      }
    } catch (Exception e) {
      // TODO auto-generated catch block
      e.printStackTrace();
    }
  }
Пример #5
0
 /**
  * Builds a chapter by downloading (or reading from cache) its page and collecting its sections.
  *
  * <p>The main body is taken from the second {@code table} of the page, or from the only table
  * when there is exactly one. For every {@code td} with class "v" in that table, the text of the
  * following sibling element is added to the chapter as a section.
  *
  * @param id identifier given to the new chapter
  * @param page page name appended to the base URL/version and to the cache path
  * @return the chapter; it has no sections when the request fails or the page has no table
  */
 public Chapter createChapter(int id, String page) {
   Chapter chapter = new Chapter(id);
   chapter.setUrl(Constants.BASE_URL + getVersion() + page);
   String cache = getCachePath() + page;
   try {
     String html = client.requestWithCache(chapter.getUrl(), cache, client.METHOD_GET, null);
     Document chapterDoc = Jsoup.parse(html);
     // Extract the content; the table list is queried once and reused.
     Elements tables = chapterDoc.select("table");
     int tableIndexOfMainBody = tables.size() == 1 ? 0 : 1;
     Element table = tables.get(tableIndexOfMainBody);
     Elements sectionElements = table.select("td[class=v]");
     logger.debug(sectionElements.size());
     for (Element tdIndex : sectionElements) {
       Element tdContent = tdIndex.nextElementSibling();
       if (tdContent == null) {
         // An index cell that is the last cell of its row has no content cell to read.
         continue;
       }
       String section = tdContent.text();
       logger.debug(section);
       chapter.addSection(section);
     }
   } catch (IOException e) {
     // Pass the exception itself so the stack trace is not lost.
     logger.error(e.getMessage(), e);
   } catch (IndexOutOfBoundsException e) {
     logger.error(e.getMessage(), e);
   }
   return chapter;
 }
Пример #6
0
  /**
   * Searches Mudah for a query and converts the first result page into items.
   *
   * <p>Mudah listings are not standardized, so the scraped values may be messy.
   *
   * @param query search text appended as-is to the search URL
   * @param size initial capacity of the result list; it does not limit the number of items
   * @return one item per listing on the page; empty when the page has no listing container
   * @throws IOException if the page cannot be fetched
   * @throws NumberFormatException if a listing's price text is not an integer
   */
  @Override
  public List<Item> parse(String query, int size) throws IOException {

    // request for a page
    Document doc =
        Jsoup.connect("http://www.mudah.my/li?q=" + query)
            .userAgent(Constant.HTTP_USER_AGENT)
            .timeout(Constant.HTTP_TIMEOUT)
            .get();

    ArrayList<Item> result = new ArrayList<Item>(size);

    Element container = doc.select("div.listing_thumbs").first();
    if (container == null) {
      // No result container on the page: nothing to parse.
      return result;
    }
    Elements listS = container.select("div.list_ads");

    for (int i = 0; i < listS.size(); i++) {
      Element list = listS.get(i);

      // Some listings have no image. The previous check (size() < 0) could never be true, so the
      // image was never read; the selector matches <img> elements, whose URL is in "src".
      String img = "";
      Elements imgS = list.select("div.image_thumb > a + img");
      if (!imgS.isEmpty()) {
        img = imgS.first().attr("src");
      }

      Element listE = list.select("li.listing_ads_title").first();
      String title = listE.child(0).text();
      String url = listE.child(0).attr("href");
      String price = listE.text();
      price = price.substring(price.lastIndexOf("RM") + 2).trim().replaceAll(" ", "");
      int dPrice = Integer.parseInt(price);

      result.add(new Item("Mudah", title, dPrice, img, url));
    }

    return result;
  }
Пример #7
0
 /**
  * Extracts areas from the first element with class "subarea" in the given HTML.
  *
  * <p>The children of that element are walked starting from the second one. A {@code b} child
  * sets the current letter; an {@code a} child becomes an area carrying the current letter, the
  * link text as name, the running {@code index} as order, the given parent id, and a pinyin value
  * cut from the link's {@code href} (from offset 7 up to the last "/").
  *
  * @param text HTML to parse
  * @param pid parent id stored on every area
  * @return the areas found, in document order; empty when there is no "subarea" element
  */
 public List<AreaVO> parseMessage(String text, int pid) {
   Elements subareas = Jsoup.parse(text).body().getElementsByClass("subarea");
   List<AreaVO> areas = new ArrayList<AreaVO>();
   if (subareas.size() == 0) {
     return areas;
   }

   Elements children = subareas.get(0).children();
   String currentLetter = "";
   for (int pos = 1; pos < children.size(); pos++) {
     Element node = children.get(pos);
     String tag = node.tagName();
     if ("b".equals(tag)) {
       currentLetter = node.text();
     } else if ("a".equals(tag)) {
       AreaVO area = new AreaVO();
       area.setLetter(currentLetter);
       area.setName(node.text());
       area.setOrderIdx(index);
       area.setPid(pid);
       String link = node.attr("href");
       area.setPinyin(link.substring(7, link.lastIndexOf("/")));
       index++;
       System.out.println(area.toString());
       areas.add(area);
     }
   }
   return areas;
 }
Пример #8
0
 /**
  * Fetches a fixed page and prints its media sources, link imports and anchors.
  *
  * @param args ignored
  * @throws IOException if the page cannot be fetched
  */
 public static void main(String[] args) throws IOException {
   final String target = "http://www.iteye.com/login";
   print("Fetching %s...", target);

   Document page = Jsoup.connect(target).get();
   Elements anchors = page.select("a[href]");
   Elements sources = page.select("[src]");
   Elements linkTags = page.select("link[href]");

   print("\nMedia: (%d)", sources.size());
   for (Element item : sources) {
     if (!item.tagName().equals("img")) {
       print(" * %s: <%s>", item.tagName(), item.attr("abs:src"));
       continue;
     }
     // Images additionally report their dimensions and a shortened alt text.
     print(
         " * %s: <%s> %sx%s (%s)",
         item.tagName(),
         item.attr("abs:src"),
         item.attr("width"),
         item.attr("height"),
         trim(item.attr("alt"), 20));
   }

   print("\nImports: (%d)", linkTags.size());
   for (Element tag : linkTags) {
     print(" * %s <%s> (%s)", tag.tagName(), tag.attr("abs:href"), tag.attr("rel"));
   }

   print("\nLinks: (%d)", anchors.size());
   for (Element anchor : anchors) {
     print(" * a: <%s>  (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
   }
 }
Пример #9
0
  /**
   * Converts one result row into a bilingual entry and adds it to the result builder.
   *
   * <p>The node must be a {@code tr} containing exactly two elements with class "words"; the
   * first is read with the source language and the second with the target language. Anything
   * else is logged as a warning and ignored.
   *
   * @param queryString the original query, forwarded to {@code processSingleNode}
   * @param entryNode the row to convert
   * @param resultBuilder receives the built entry
   * @param sourceLanguage language of the first "words" element
   * @param targetLanguage language of the second "words" element
   */
  private void processEntry(
      @NotNull String queryString,
      @NotNull Element entryNode,
      @NotNull BilingualQueryResultBuilder resultBuilder,
      @NotNull Language sourceLanguage,
      @NotNull Language targetLanguage) {
    String tagName = entryNode.tag().getName();
    if (!StringUtils.equals(tagName, "tr")) {
      LOGGER.warn("Expected <tr> tag - got <{}>", tagName);
      return;
    }

    Elements wordCells = entryNode.getElementsByClass("words");
    int cellCount = wordCells.size();
    if (cellCount != 2) {
      LOGGER.warn("Expected 2 elements with class \"words\" - got {}", cellCount);
      return;
    }

    Element sourceCell = wordCells.get(0);
    Element targetCell = wordCells.get(1);

    BilingualEntryBuilder entryBuilder = ImmutableBilingualEntry.builder();
    entryBuilder.setEntryType(detectEntryType(sourceCell));
    entryBuilder.setInputObject(processSingleNode(sourceCell, sourceLanguage, queryString));
    entryBuilder.setOutputObject(processSingleNode(targetCell, targetLanguage, queryString));
    resultBuilder.addBilingualEntry(entryBuilder.build());
  }
Пример #10
0
  /**
   * Reads the number of search results shown on a result page.
   *
   * <p>The count is the first space-separated token of the page title heading, or of the
   * alternative results header when the title heading is absent. A trailing "+" (as in "1000+")
   * and thousands separators are removed before parsing.
   *
   * @param document the parsed result page
   * @return the parsed count, or 0 when no heading is found or the token is not a number
   */
  private Integer searchResults(Document document) {
    Elements headings = document.select("h2.page-title.hidden-xs");
    if (headings.isEmpty()) {
      // Sometimes the count is rendered in a different place.
      headings = document.select("div#resultsCountHeader h1.fnt12");
    }

    String countText = headings.isEmpty() ? "0" : headings.get(0).text().split(" ")[0];

    // More than 1000 results are shown as "1000+"; drop the plus sign.
    if (countText.endsWith("+")) {
      countText = countText.substring(0, countText.length() - 1);
    }

    try {
      return Integer.parseInt(countText.replace(",", ""));
    } catch (NumberFormatException e) {
      // Covers textual results such as 'Zero' or 'Sorry, none job...'.
      System.out.println("Error parsing:" + countText);
      return 0;
    }
  }
Пример #11
0
  /**
   * Rewrites the images of an article so that they load from a fixed "640" variant, and inserts
   * the cover image declared in the page's scripts.
   *
   * <p>For every {@code img}, the {@code data-s}, {@code data-src} and {@code data-w} attributes
   * are removed and {@code max-width} is set to 640. When the image had a {@code data-src}, its
   * {@code src} becomes that value up to and including the last "/", followed by "640". For
   * every script that contains {@code var cover = "..."}, an {@code img} with that URL is
   * inserted before the first {@code div} of the document.
   *
   * @param pcont article HTML, may be {@code null}
   * @return the rewritten document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForWX(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    for (Element img : doc.select("img")) {
      String source = img.attr("data-src");
      img.removeAttr("data-s");
      img.removeAttr("data-src");
      img.removeAttr("data-w");
      if (!source.isEmpty()) {
        // Without a data-src there is nothing to derive the URL from; previously such an image
        // had its existing src replaced by the bare string "640".
        String base = source.substring(0, source.lastIndexOf("/") + 1);
        img.attr("src", base + "640");
      }
      img.attr("max-width", "640");
    }

    Elements elesrp = doc.select("script");
    Elements divs = doc.select("div");
    if (elesrp.size() > 0 && divs.size() > 0) {
      // Compiled once instead of once per script element.
      Pattern coverPattern = Pattern.compile("(?<=(var\\scover\\s=\\s\"))\\S+(?=\")");
      Element firstDiv = divs.get(0);
      for (Element ele : elesrp) {
        Matcher m = coverPattern.matcher(ele.html());
        if (m.find()) {
          firstDiv.before("<img src=\"" + m.group() + "\"/>");
        }
      }
    }
    return doc.html();
  }
Пример #12
0
  /**
   * Extracts dish type, title, cooking method, main material and preparation steps from a
   * recipe page.
   *
   * <p>Steps are looked up in the "editnew" layout first; when that yields nothing, the
   * paragraphs of the plain "edit" layout are used instead. Only blocks that have a child with
   * class "step" and non-empty text are kept.
   *
   * @param html page markup
   * @param url page address, stored on the holder and used as base URI for parsing
   * @return a holder filled with the extracted values
   */
  public Holder doParse(String html, String url) {
    Holder holder = new Holder();
    holder.url = url;

    Document doc = Jsoup.parse(html, url);

    holder.dishType =
        doc.select("body > div.main_w.clearfix > div.main.clearfix > ul > li:nth-child(5) > a")
            .text();
    holder.title =
        doc.select(
                "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info1 > h1 > a")
            .text();
    holder.method =
        doc.select(
                "body > div.main_w.clearfix > div.main.clearfix > div.cp_header.clearfix > div.cp_main_info_w > div.info2 > ul > li:nth-child(1) > a")
            .text();
    holder.mainMaterial =
        doc.select(
                "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.materials > div > div.yl.zl.clearfix > ul > li > div > h4 > a")
            .text();

    // Newer layout: each step is a "content" block inside div.editnew.edit.
    Elements stepBlocks =
        doc.select(
            "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.editnew.edit > div.content.clearfix");
    if (stepBlocks.isEmpty()) {
      // Fallback layout: steps are plain paragraphs inside div.edit.
      stepBlocks =
          doc.select(
              "body > div.main_w.clearfix > div.main.clearfix > div.cp_body.clearfix > div.cp_body_left > div.measure > div.edit > p");
    }

    for (Element block : stepBlocks) {
      if (!block.children().hasClass("step")) {
        continue;
      }
      String step = block.text();
      if (!step.isEmpty()) {
        holder.steps.add(step);
      }
    }

    return holder;
  }
Пример #13
0
  /**
   * Parses one page of the SMS log and appends a {@code TelcomMessage} per data row to
   * {@code messageList}.
   *
   * <p>Selectors for the table, body, row and cell are read from the "dx/sh" configuration. Only
   * rows with exactly five cells are used. When the page text contains a "next page" marker, the
   * following page is requested.
   *
   * @param context response holding the page content
   * @param page number of the page being parsed
   * @param t not used by this method
   * @param d forwarded to {@code requestSmsLogService}
   * @param dstr forwarded to {@code requestSmsLogService}
   * @param isHistory forwarded to {@code requestSmsLogService}
   */
  private void saveSmsLog(
      SimpleObject context,
      final int page,
      final int t,
      final Date d,
      final String dstr,
      final int isHistory) {
    String text = ContextUtil.getContent(context);
    Document doc = ContextUtil.getDocumentOfContent(context);
    System.out.println(doc.toString());
    // The page reports that no matching data was found.
    if (text.contains("没有查找到相关数据")) {
      return;
    }

    String tableSort = InfoUtil.getInstance().getInfo("dx/sh", "tableSort");
    String tbody = InfoUtil.getInstance().getInfo("dx/sh", "tbody");
    String tr = InfoUtil.getInstance().getInfo("dx/sh", "tr");
    String td = InfoUtil.getInstance().getInfo("dx/sh", "td");

    Elements tables = doc.select(tableSort);
    if (tables == null || tables.size() == 0) {
      return;
    }

    Elements rows = tables.first().select(tbody).first().select(tr);
    for (int rowIdx = 0; rowIdx < rows.size(); rowIdx++) {
      try {
        Elements cells = rows.get(rowIdx).select(td);
        if (cells.size() != 5) {
          continue;
        }
        String sentAtText = cells.get(1).text().trim(); // send time
        String receiverPhone = cells.get(2).text().trim(); // counterpart number
        String businessType = cells.get(3).text().trim(); // fee type
        String allPay = cells.get(4).text().trim(); // fee

        Date sentAt = null;
        try {
          sentAt = DateUtils.StringToDate(sentAtText, "yyyy-MM-dd HH:mm:ss");
        } catch (Exception e) {
          e.printStackTrace();
        }

        TelcomMessage message = new TelcomMessage();
        message.setPhone(phoneNo);
        message.setId(UUID.randomUUID().toString());
        message.setBusinessType(businessType); // business type: point-to-point
        message.setRecevierPhone(receiverPhone);
        message.setSentTime(sentAt);
        message.setCreateTs(new Date());
        message.setAllPay(Double.parseDouble(allPay)); // total fee
        messageList.add(message);
      } catch (Exception e) {
        logger.error("saveSmsLog", e);
      }
    }

    // A "next page" link means more rows are available.
    if (text.contains("下一页")) {
      requestSmsLogService(page + 1, 1, d, dstr, isHistory);
    }
  }
Пример #14
0
  /**
   * Parses feed posts out of an HTML page, appends them to {@code mFeedItems} and notifies the
   * adapter.
   *
   * <p>Every {@code div.tie-box} inside the first {@code div.tie-wrapper} becomes one item. The
   * content is the text of the matching content paragraphs, each followed by a newline. The
   * image is {@code url} plus the photo's {@code src}, or {@code null} when the post has no
   * photo source.
   *
   * <p>Parsing stops at the first post that throws (for example because an expected element is
   * missing); items parsed before it are kept and the adapter is still notified.
   *
   * @param resource HTML of the feed page
   */
  private void parseFeedItem(String resource) {
    try {
      Document doc = Jsoup.parse(resource);
      Element masthead = doc.select("div.tie-wrapper").first();
      Elements feedBoxs = masthead.select("div.tie-box");

      for (Element feedPost : feedBoxs) {
        Element titleElement = feedPost.select("div.tie-header h2.tie-title a").first();
        Element nameElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-name").first();
        Element sourceElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p span.user-form").first();
        Element timestampElement =
            feedPost.select("div.tie-content div.tie-user div.user-info p.tie-date").first();
        Elements imageElement = feedPost.select("div.tie-content img.st-photo");
        Elements contentElements = feedPost.select("div.tie-content p:not(.tie-date):gt(0)");

        String title = titleElement.text();
        String name = nameElement.text();
        String source = sourceElement.text();
        String timestamp = timestampElement.text();

        StringBuilder content = new StringBuilder();
        for (Element paragraph : contentElements) {
          content.append(paragraph.text()).append("\n");
        }

        // Compare by value: the previous reference comparison (!= "") does not test emptiness.
        String imageSrc = imageElement.attr("src");
        String image = imageSrc.isEmpty() ? null : url + imageSrc;

        FeedItem feedItem = new FeedItem();
        feedItem.setTitle(title);
        feedItem.setName(name);
        feedItem.setPostTime(timestamp);
        feedItem.setSource(source);
        feedItem.setImage(image);
        feedItem.setContent(content.toString());

        mFeedItems.add(feedItem);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    mFeedItemAdapter.notifyDataSetChanged();
  }
Пример #15
0
 /**
  * Builds a component hierarchy from a design given as an html tree.
  *
  * <p>When a component root is supplied, components created while reading the design are bound
  * to its member fields by id, local id and caption, and every such field must end up bound.
  *
  * @param doc the html tree
  * @param componentRoot optional root instance; its type must match the root element of the
  *     design
  * @param classWithFields the class (that of componentRoot or a super class) whose member fields
  *     assignable from {@link Component} are bound to the design
  * @return the design context, with its root component set
  * @throws DesignException if the design has more than one top-level element, if a root
  *     component is given but the design is empty, if the fields cannot be introspected, or if
  *     some fields remain unbound
  */
 private static DesignContext designToComponentTree(
     Document doc, Component componentRoot, Class<?> classWithFields) {
   DesignContext designContext = new DesignContext(doc);
   designContext.readPackageMappings(doc);

   // A document without a body element gets no special handling here; jsoup is expected to
   // take care of that.
   Elements topLevel = doc.body().children();
   int topLevelCount = topLevel.size();
   if (topLevelCount > 1) {
     throw new DesignException(
         "The first level of a component hierarchy should contain at most one root component, but found "
             + topLevelCount
             + ".");
   }
   Element element = topLevelCount == 0 ? null : topLevel.first();

   if (componentRoot == null) {
     // No root supplied: reading the design creates the entire hierarchy.
     if (element != null) {
       componentRoot = designContext.readDesign(element);
     }
   } else {
     if (element == null) {
       throw new DesignException(
           "The root element cannot be null when the specified root Component is" + " not null.");
     }

     // The supplied root may have member fields that need to be bound.
     final FieldBinder binder;
     try {
       binder = new FieldBinder(componentRoot, classWithFields);
     } catch (IntrospectionException e) {
       throw new DesignException("Could not bind fields of the root component", e);
     }

     // Bind each component to a field of the root as soon as it is created.
     ComponentCreationListener creationListener =
         new ComponentCreationListener() {
           @Override
           public void componentCreated(ComponentCreatedEvent event) {
             binder.bindField(event.getComponent(), event.getLocalId());
           }
         };
     designContext.addComponentCreationListener(creationListener);

     // Create the subtree, then verify that no member field was left unbound.
     designContext.readDesign(element, componentRoot);
     Collection<String> unboundFields = binder.getUnboundFields();
     if (!unboundFields.isEmpty()) {
       throw new DesignException("Found unbound fields from component root " + unboundFields);
     }

     // Listening is no longer needed.
     designContext.removeComponentCreationListener(creationListener);
   }

   designContext.setRootComponent(componentRoot);
   return designContext;
 }
Пример #16
0
  /**
   * Extracts the movies listed on a Douban search-result page.
   *
   * <p>Every {@code table} inside the first element with class "article" is one result. Its
   * first cell supplies the Douban URL, the numeric id taken from that URL, and the image URL;
   * its second cell supplies the name and the introduction markup.
   *
   * @param doc the parsed search-result page
   * @return one {@link Movie} per result table, or {@code null} when the page has no "article"
   *     element or that element contains no table
   */
  public static List<Movie> getSearchResult(Document doc) {
    Elements elem = doc.getElementsByAttributeValue("class", "article");
    if (elem.isEmpty()) {
      // Same "no results" answer as below, instead of an IndexOutOfBoundsException.
      return null;
    }
    Elements movies = elem.get(0).getElementsByTag("table");

    if (movies == null || movies.size() == 0) {
      return null;
    }

    final String subjectPrefix = "http://movie.douban.com/subject/";
    ArrayList<Movie> results = new ArrayList<Movie>();

    for (int i = 0; i < movies.size(); i++) {
      Movie m = new Movie();
      Elements tds = movies.get(i).getElementsByTag("td");

      String imgURL = tds.get(0).html();

      // The subject link runs from the prefix to the next double quote.
      String doubanURL = imgURL.substring(imgURL.indexOf(subjectPrefix));
      doubanURL = doubanURL.substring(0, doubanURL.indexOf("\""));
      m.setDoubanUrl(doubanURL);
      Log.d("doubanURL:", doubanURL);

      String mID = doubanURL.replace(subjectPrefix, "");
      mID = mID.replace("/", "");
      m.setID(Integer.parseInt(mID));
      Log.d("movie id", mID);

      imgURL = imgURL.substring(imgURL.indexOf("<img src=\""));
      imgURL = imgURL.replace("<img src=\"", "");
      imgURL = imgURL.substring(0, imgURL.indexOf("\""));
      m.setImgUrl(imgURL);
      Log.d("imgURL", imgURL);

      Elements title = tds.get(1).getElementsByAttributeValue("class", "pl2");
      List<Node> nodes = title.get(0).childNodes();
      List<Node> n = nodes.get(0).childNodes();
      String name = n.get(0).toString().replace("/", "");
      Log.d("movie name", name);
      m.setMovieName(name);

      Elements info = tds.get(1).getElementsByAttributeValue("class", "pl");
      String intro = info.get(0).childNodes().get(0).toString();
      m.setDirActHtml(intro);

      results.add(m);
    }

    return results;
  }
Пример #17
0
  /**
   * Converts the rows of a news listing page into news items.
   *
   * <p>Rows are processed in groups of three, counted from zero: the row at position 1 of a
   * group carries title, URL and domain; the row at position 2 carries the sub-text with id,
   * points and comment count and completes the item; the row at position 0 is ignored.
   * Processing stops at the first title row that has fewer than two ".title" elements.
   *
   * @param doc the parsed listing page
   * @return the items found, in page order
   */
  public List<News> scrape(Document doc) {
    Elements rows = doc.select("body > center > table > tbody > tr > td > table > tbody > tr");

    List<News> newsList = new ArrayList<>();
    News.Builder builder = null;
    int rowIndex = 0;

    for (Element row : rows) {
      int phase = rowIndex % 3;
      if (phase == 1) {
        Elements titles = row.select(".title");
        if (titles.size() < 2) {
          // Not a news row any more: end of the listing.
          break;
        }
        builder = new News.Builder();

        Element titleCell = titles.get(1);
        Element link = titleCell.select("a").first();
        builder.title = link.text();
        builder.url = getUrl(link);

        Elements comhead = titleCell.select(".comhead");
        if (!comhead.isEmpty()) {
          builder.domain = extract(comhead.first().text(), DOMAIN);
        }
      } else if (phase == 2) {
        assert builder != null;
        Element subtext = row.select(".subtext").first();
        Elements subLinks = subtext.select("a");

        if (subLinks.size() > 1) {
          Element comments = subLinks.get(1);
          builder.id = getId(comments);
          builder.points = getPoints(subtext);
          builder.commentsNum = getCommentsNum(comments);
        }

        newsList.add(builder.build());
      }
      rowIndex++;
    }

    return newsList;
  }
Пример #18
0
  /**
   * Removes the inline style from the list image container and from its first direct image.
   *
   * <p>Nothing is removed unless both {@code div#list_image_div} and a direct {@code img} child
   * of it are present.
   *
   * @param pcont HTML to process, may be {@code null}
   * @return the resulting document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForBundpic(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    Elements images = doc.select("div#list_image_div>img");
    Elements containers = doc.select("div#list_image_div");

    boolean hasBoth = !containers.isEmpty() && !images.isEmpty();
    if (hasBoth) {
      images.get(0).removeAttr("style");
      containers.get(0).removeAttr("style");
    }

    return doc.html();
  }
  /**
   * Evaluates the response of the fourth login step and either reports a failure, continues
   * with the SSO login, or follows the script redirect contained in the page.
   *
   * <p>A {@code form#c2000004} or a {@code form#login_form} in the response means the login
   * failed. Otherwise the target of a {@code location.replace(...)} script is extracted; when
   * there is none, the login counts as successful only if the page title is "IBM HTTP Server".
   *
   * @param context response of the previous request
   */
  private void parseLoginStep4(SimpleObject context) {
    Document doc = ContextUtil.getDocumentOfContent(context);

    // Error form: its status cell carries the message to report.
    Elements errorForm = doc.select("form#c2000004");
    if (!errorForm.isEmpty()) {
      data.put("errMsg", errorForm.select("td#status2").text());
      setStatus(STAT_STOPPED_FAIL);
      notifyStatus();
      return;
    }

    // Being shown the login form again also means failure.
    if (!doc.select("form#login_form").isEmpty()) {
      data.put("errMsg", "登录失败,请重试!");
      setStatus(STAT_STOPPED_FAIL);
      notifyStatus();
      return;
    }

    String redirectUrl =
        StringUtil.subStr(
            "<script type='text/javascript'>location.replace('",
            "');</script>",
            ContextUtil.getContent(context));

    if (!StringUtils.isBlank(redirectUrl.trim())) {
      getUrl(
          redirectUrl,
          null,
          new Object[] {UAM_CHAR_SET},
          new AbstractProcessorObserver(util, WaringConstaint.ZGDX_5) {
            @Override
            public void afterRequest(SimpleObject context) {
              setStatus(STAT_LOGIN_SUC);
              ssoLogin(context);
            }
          });
      return;
    }

    // No redirect in the page: decide by the page title.
    boolean ibmServerPage = "IBM HTTP Server".equalsIgnoreCase(doc.select("title").text());
    if (ibmServerPage) {
      setStatus(STAT_LOGIN_SUC);
      ssoLogin(context);
    } else {
      data.put("fail", true);
      setStatus(STAT_STOPPED_FAIL);
      notifyStatus();
      logger.error("Login Fail.....");
    }
  }
Пример #20
0
 /**
  * Collects the input fields of a form into a list of name/value pairs.
  *
  * <p>The form is the first element matching {@code formSelector} in the response document.
  * Every {@code input} of the form except those of type "submit" is appended to
  * {@code hiddenFormFields} with its name and value.
  *
  * <p>When the form cannot be found the error is logged and the JVM is terminated with
  * {@code System.exit(1)}.
  *
  * @param rw response whose document contains the form
  * @param hiddenFormFields list that receives the captured fields
  * @param formSelector CSS selector of the form
  * @return {@code true} when the form was found and its fields were captured; {@code false} is
  *     only reachable on the failure paths (previously {@code false} was returned on success
  *     too)
  */
 public static boolean getFormFields(
     ResponseWrapper rw, List<NameValuePairString> hiddenFormFields, String formSelector) {
   // --- analysis of the page containing the form, specific to the site
   Document doc = rw.getJSoupDocument();
   Elements els = doc.select(formSelector);
   if (els == null || els.isEmpty()) {
     log.error("unable to find form at selector: " + formSelector);
     System.exit(1);
     return false;
   }
   Element loginForm = els.get(0);
   if (loginForm == null) {
     log.error("failed to get form to analyze at: " + rw.dump());
     System.exit(1);
     return false;
   }

   for (Element e : loginForm.select("input")) {
     if (e.attr("type").equals("submit")) {
       continue;
     }
     String attrName = e.attr("name");
     String value = e.val();
     hiddenFormFields.add(new NameValuePairString(attrName, value));
     log.debug("captured form input: " + attrName + " = " + value);
   }
   return true;
 }
  /**
   * Walks the pages of the book listing and saves every product found.
   *
   * <p>Pages are requested in increasing order, starting at 1, until a page contains no element
   * with class "product". Saving is best-effort: a product that cannot be converted or stored is
   * skipped and the remaining products are still processed.
   *
   * @throws IllegalStateException if a listing page cannot be downloaded; this replaces the
   *     {@code NullPointerException} that the swallowed download error used to cause
   */
  @Scheduled(fixedDelay = 900000)
  public void loadBooksInfo() {
    int booksListPageNumber = 1;
    while (true) {
      final int pageNumber = booksListPageNumber++;
      final Document document;
      try {
        URL url =
            new URL("http://www.labirint.ru/genres/2308/?page=" + Integer.toString(pageNumber));
        document = Jsoup.parse(url, 5000);
      } catch (IOException ex) {
        // Covers MalformedURLException as well; keep the cause for diagnosis.
        throw new IllegalStateException("Failed to load books list page " + pageNumber, ex);
      }

      Elements elements = document.getElementsByClass("product");
      if (elements.isEmpty()) {
        break;
      }
      for (Element product : elements) {
        try {
          bookDao.saveBook(bookInfo(product));
        } catch (Exception ignored) {
          // Deliberately best-effort, as before: constraint violations and any other failure
          // for a single product must not stop the import of the others.
        }
      }
    }
  }
Пример #22
0
  /**
   * Fetches a fixed page and prints its title, the number of {@code meta} elements, and the
   * {@code content} attribute of every meta element whose markup mentions "keywords"
   * (case-insensitively).
   *
   * <p>Any failure is printed as a stack trace.
   */
  public static void readHead() {
    String url = "http://www.2177s.com";
    try {
      Document doc = Jsoup.connect(url).timeout(10000).get();
      String title = doc.title();
      System.out.printf("title:%s\n", title);

      Elements eles = doc.select("meta");
      System.out.println(eles.size());
      for (Element ele : eles) {
        // The former empty-bodied "if (...) ;" before this check had no effect and was removed.
        if (ele.toString().matches(".*(?i)keywords.*")) {
          System.out.println(ele.attr("content"));
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Пример #23
0
 /**
  * Crawls depth-first from the given URL until {@code MAXURL} pages have been fetched.
  *
  * <p>A URL that is already in {@code urlList} is skipped before any network access. A page
  * that cannot be fetched ends that branch. For every fetched page that contains links, its
  * words and description are stored, and each link is then crawled in turn.
  *
  * @param starturl absolute URL to start from
  * @throws IOException declared for the helpers called on each page
  * @throws SQLException if storing the page data fails
  */
 public void crawl(String starturl) throws IOException, SQLException {
   if (urlid >= MAXURL) { // base case
     return;
   }
   // Check for an already crawled URL first, so that it is not downloaded again.
   if (urlList.contains(starturl)) {
     return;
   }

   Document doc;
   try {
     doc = Jsoup.connect(starturl).get();
   } catch (IOException e) {
     // if the url is not valid, stop the crawling process
     return;
   } catch (IllegalArgumentException e) {
     System.out.println("Must supply a valid URL : " + starturl);
     return;
   }
   urlList.add(starturl);

   Elements hrefs = doc.select("a");
   urlid += 1;
   // terminate the process if there is no more link in a webpage
   if (hrefs == null || hrefs.size() == 0) {
     return;
   }

   HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl));
   insertDBWord(starturl, wordMap, urlid);
   insertDBDescription(starturl, topOneHundred(starturl), urlid);

   for (Element e : hrefs) {
     // Resolve against the page address: a relative href is rejected by Jsoup.connect.
     String href = e.attr("abs:href");
     if (href.isEmpty()) {
       continue; // anchor without a resolvable target
     }
     crawl(href); // depth first search;
   }
 }
Пример #24
0
  /**
   * Collects the news links of a category page.
   *
   * <p>Links are the anchors inside elements whose class attribute is exactly
   * "views-field views-field-title"; the first character of every {@code href} is dropped.
   *
   * @param categoryName not used by this method
   * @param categoryURL address of the category page
   * @return the links in page order, or {@code null} when the page has no such anchors or
   *     cannot be fetched
   */
  @Override
  public List<String> parseCategory(String categoryName, String categoryURL) {
    try {
      Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get();
      Elements anchors = doc.select("div[class=views-field views-field-title]").select("a");
      if (anchors == null || anchors.size() == 0) {
        return null;
      }

      List<String> linksByCategoryList = new ArrayList<String>();
      for (Element anchor : anchors) {
        linksByCategoryList.add(anchor.attr("href").substring(1));
      }
      return linksByCategoryList;
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
Пример #25
0
 /**
  * Parses the given HTML and returns the inner HTML of the elements matching {@code
  * cssSelector}.
  *
  * @param inputContent HTML text to parse
  * @return the trimmed inner HTML of the matches, or {@code null} when nothing matches
  * @throws Exception declared by the overridden method; {@code validate()} runs first
  */
 @Override
 public String fire(String inputContent) throws Exception {
   validate();
   Elements matched = Jsoup.parse(inputContent).select(cssSelector);
   if (matched == null || matched.size() == 0) {
     return null;
   }
   return matched.html().trim();
 }
 /**
  * Downloads the page at {@code url} and copies one value per configured attribute into the
  * given metadata.
  *
  * <p>The page is requested with GET or POST depending on {@code method}; POST sends {@code
  * requestData}. For every attribute whose query matches at least one element, the text of the
  * first match is run through the attribute's value mapper and stored under the attribute's
  * name. Attributes without a match are left untouched.
  *
  * @param metaData target that receives the extracted values
  * @throws MetaDataException if {@code method} is neither GET nor POST, or if the download or a
  *     value mapping fails
  */
 @Override
 public void populateMetaData(MetaData metaData) throws MetaDataException {
   try {
     final Document page;
     if (method.equals("GET")) {
       page = Jsoup.connect(url).get();
     } else if (method.equals("POST")) {
       page = Jsoup.connect(url).data(requestData).post();
     } else {
       throw new MetaDataException("Unsupported HTML access method: " + method);
     }

     for (MetaDataAttribute attribute : attributes) {
       Elements matches = page.select(attribute.getQuery());
       if (matches.isEmpty()) {
         continue; // nothing on the page for this attribute
       }
       String rawText = matches.get(0).text();
       Object mapped = attribute.getValueMapper().parse(rawText);
       metaData.put(attribute.getName(), mapped);
     }
   } catch (IOException ioFailure) {
     throw new MetaDataException(ioFailure);
   } catch (ValueMapperException mappingFailure) {
     throw new MetaDataException(mappingFailure);
   }
 }
Пример #27
0
  /**
   * Scrapes a listing page and the linked specifications page into a map of item details.
   *
   * <p>The result holds {@code price}, {@code modelNumber} and {@code title} from the listing
   * page, {@code upc} from the first specification row containing "UPC" (if any), and {@code
   * GoodSKU} set to {@code "true"} when the specifications table has at least one row.
   *
   * @param doc ignored; both pages are loaded through {@code jsoupConnect}
   * @param url URL of the listing page
   * @return the collected item details
   */
  public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
    Document listingPage = jsoupConnect(url);
    HashMap<String, String> matchingItems = new HashMap<String, String>();

    matchingItems.put("price", listingPage.select(".medium-item-price").text());
    matchingItems.put(
        "modelNumber", listingPage.select(".list-item-info .sku-model ul .model-number").text());

    // The title anchor supplies both the title text and the link to the specifications page.
    Elements titleLink = listingPage.select(".list-item-info .sku-title h4 a");
    matchingItems.put("title", titleLink.text());

    String specsUrl = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
    System.out.println(specsUrl);

    Document specsPage = jsoupConnect(specsUrl);
    Elements specRows = specsPage.select("#full-specifications table tbody tr");
    for (Element row : specRows) {
      if (!row.text().contains("UPC")) {
        continue;
      }
      matchingItems.put("upc", row.text().replace("UPC ", ""));
      break;
    }

    matchingItems.put("GoodSKU", specRows.size() < 1 ? "false" : "true");
    specsPage.empty();

    return matchingItems;
  }
    /**
     * Downloads the news page at {@code url}, concatenates the outer HTML of every element
     * matching {@code #Cnt-Main-Article-QQ P} and sends it to {@code myHandler} in a message
     * whose bundle stores the HTML under the key {@code "content"}. A toast is shown instead
     * when the network check fails or no document comes back.
     */
    @Override
    public void run() {
      if (!Utils.isNET(NewsContentActivity.this)) {
        Utils.showToast(NewsContentActivity.this, "网络不可用哦,亲!", Toast.LENGTH_SHORT);
        return;
      }

      try {
        Document page = Jsoup.connect(url).timeout(8000).get();
        if (page == null) {
          Utils.showToast(NewsContentActivity.this, "网络不给力哦,亲,请返回再进入吧!", Toast.LENGTH_SHORT);
          return;
        }

        Elements paragraphs = page.select("#Cnt-Main-Article-QQ P");
        StringBuilder html = new StringBuilder();
        int paragraphCount = paragraphs.size();
        for (int idx = 0; idx < paragraphCount; idx++) {
          html.append(paragraphs.get(idx).outerHtml());
        }
        String content = html.toString();

        Bundle payload = new Bundle();
        payload.putString("content", content);
        Log.i("content", content);

        Message message = new Message();
        message.setData(payload);
        message.what = NewsContentActivity.NEWCONTENTRECEIVED;
        myHandler.sendMessage(message);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
Пример #29
0
  /**
   * Rewrites {@code cid:} references in an HTML fragment to the URLs of the matching attachments.
   *
   * <p>Every {@code src} and {@code href} attribute whose trimmed value starts with {@code cid:}
   * (case-insensitively) is looked up in {@code attachments} by the text after the prefix. When
   * an attachment is found, the attribute is replaced with the URL of the file route for its id.
   * References without a matching attachment are left unchanged.
   *
   * @param html the HTML to process
   * @param attachments attachments keyed by content id
   * @return the inner HTML of the parsed document's body, or the whole document's HTML when the
   *     parsed document has no body element
   */
  private static String replaceCidWithAttachments(
      String html, Map<String, Attachment> attachments) {
    final String cidPrefix = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = {"src", "href"};

    for (String attrName : attrNames) {
      Elements tags = doc.select("*[" + attrName + "]");
      for (Element tag : tags) {
        String uriString = tag.attr(attrName).trim();

        // regionMatches(ignoreCase = true) compares character by character without consulting
        // the default locale. toLowerCase() would turn "CID:" into "cıd:" under a Turkish
        // locale and silently skip the reference.
        if (!uriString.regionMatches(true, 0, cidPrefix, 0, cidPrefix.length())) {
          continue;
        }

        String cid = uriString.substring(cidPrefix.length());

        // A single lookup also covers a key that is present but mapped to null.
        Attachment attachment = attachments.get(cid);
        if (attachment == null) {
          continue;
        }

        Long id = attachment.id;
        tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
      }
    }

    Elements bodies = doc.getElementsByTag("body");

    if (bodies.size() > 0) {
      return bodies.get(0).html();
    } else {
      return doc.html();
    }
  }
  /**
   * Builds the flow that polls the Evernote message source and forwards text to {@code
   * wordRequestsChannel}.
   *
   * <p>Polled collections that are empty are dropped; the rest are split into individual notes.
   * Each note is turned into the text of its single {@code en-note} element when the note has
   * content, exactly one such element exists and its text is not blank; otherwise the note's
   * title is used. Null results are filtered out before the final channel.
   *
   * @return the assembled integration flow
   */
  @Bean
  public IntegrationFlow evernoteIntegration() {
    return IntegrationFlows.from(
            this.evernoteMessageSource(),
            sourceSpec ->
                sourceSpec.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
        .channel(this.inputChannel())
        .filter(Collection.class, polled -> !polled.isEmpty())
        .split()
        .transform(
            Note.class,
            note -> {
              String enml = note.getContent();
              if (!StringUtils.isNotBlank(enml)) {
                return note.getTitle();
              }

              Elements noteRoots = Jsoup.parse(enml).select("en-note");
              if (noteRoots.size() != 1) {
                return note.getTitle();
              }

              String wordsFromNote = noteRoots.get(0).text();
              return StringUtils.isNotBlank(wordsFromNote) ? wordsFromNote : note.getTitle();
            },
            transformerSpec -> transformerSpec.requiresReply(false))
        .filter(payload -> payload != null)
        .channel(wordRequestsChannel)
        .get();
  }