예제 #1
1
 /**
  * Downloads the deck page of {@code hsDeck} and fills in its class cards, neutral cards and
  * description.
  *
  * <p>If the download fails with an {@link IOException} the stack trace is printed and the deck
  * is returned without further changes.
  *
  * @param hsDeck deck whose URL (appended to {@code HPDeckSource.BASE_URL}) is fetched and whose
  *     fields are populated
  * @param n value passed through unchanged to {@code HtmlHelper.parseDescription}
  * @return the same {@code hsDeck} instance that was passed in
  */
 @Override
 public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
   try {
     final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

     final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
     final ArrayList<String> classCardNames = new ArrayList<String>();
     collectDeckCardCells(
         page.select("section.class-listing table.listing td.col-name"),
         classHsItemMap,
         classCardNames);
     hsDeck.setClassHsItemMap(classHsItemMap);
     hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classCardNames));

     final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
     final ArrayList<String> neutralCardNames = new ArrayList<String>();
     collectDeckCardCells(
         page.select("section.neutral-listing table.listing td.col-name"),
         neutralHsItemMap,
         neutralCardNames);
     hsDeck.setNeutralHsItemMap(neutralHsItemMap);
     hsDeck.setNeutralHsItemList(
         DataBaseManager.getInstance().getAllCardsByNames(neutralCardNames));

     hsDeck.setDescription(
         HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
     return hsDeck;
   } catch (IOException ex) {
     ex.printStackTrace();
     return hsDeck;
   }
 }

 /**
  * Reads the card name (text of the first anchor) and the last character of the trimmed cell text
  * from every cell, storing name to last-character in {@code lastCharByName} and the name in
  * {@code names}.
  *
  * <p>Cells without an anchor or without any text are skipped instead of raising an
  * {@link IndexOutOfBoundsException}.
  */
 private void collectDeckCardCells(
     final Elements cells,
     final HashMap<String, String> lastCharByName,
     final ArrayList<String> names) {
   for (int i = 0; i < cells.size(); ++i) {
     final Elements anchors = cells.get(i).select("a");
     final String cellText = cells.get(i).text().trim();
     if (anchors.isEmpty() || cellText.isEmpty()) {
       continue;
     }
     final String cardName = anchors.get(0).text();
     lastCharByName.put(cardName, cellText.substring(cellText.length() - 1));
     names.add(cardName);
   }
 }
예제 #2
0
파일: Names.java 프로젝트: joshvm/imdb
 /**
  * Converts one search-result row into a {@link Name} and appends it to {@code results}.
  *
  * <p>The thumbnail comes from the image in the {@code primary_photo} cell; the link and name
  * come from the first anchor in the {@code result_text} cell. When that cell contains a
  * {@code <small>} element, its text (with surrounding parentheses removed) is split at the first
  * comma into a job and a reference title. Without a comma, text ending in a parenthesised number
  * becomes the reference title; anything else becomes the job.
  *
  * @param query not used by this method
  * @param options not used by this method
  * @param tr table row to read
  * @param results list that receives the parsed entry
  */
 @Override
 protected void parseRow(
     final String query, final int options, final Element tr, final List<Name> results) {
   final Element photoCell = tr.getElementsByAttributeValue("class", "primary_photo").first();
   final String thumbnailUrl = photoCell.getElementsByTag("img").first().attr("src");

   final Element textCell = tr.getElementsByAttributeValue("class", "result_text").first();
   final Element nameLink = textCell.getElementsByTag("a").first();
   final String url = Imdb.BASE_URL + nameLink.attr("href");
   final String name = nameLink.ownText();

   String job = "";
   Reference ref = null;
   // first() yields null for an empty selection, which covers the "no <small>" case.
   final Element small = textCell.getElementsByTag("small").first();
   if (small != null) {
     final String refUrl = Imdb.BASE_URL + small.getElementsByTag("a").first().attr("href");
     String desc = small.text();
     final boolean parenthesised = desc.startsWith("(") && desc.endsWith(")");
     if (parenthesised) {
       desc = desc.substring(1, desc.length() - 1);
     }
     final int comma = desc.indexOf(',');
     if (comma >= 0) {
       job = desc.substring(0, comma).trim();
       ref = new Reference(refUrl, desc.substring(comma + 1).trim());
     } else if (desc.matches(".+\\(\\d+\\)")) {
       // No comma present, so the whole description is the reference title.
       ref = new Reference(refUrl, desc.trim());
     } else {
       job = desc;
     }
   }
   results.add(new Name(url, thumbnailUrl, name, job, ref));
 }
  /**
   * Reads the number of people the user follows from a profile page.
   *
   * <p>Takes the text of all anchors inside the element with id {@code friend}, asks
   * {@code UtilsMethod.findFirstStringByRegex} for the first fragment matching
   * {@code 成员[0-9]+}, and removes every non-digit character from that fragment.
   *
   * @param doc parsed profile page
   * @return the digits of the matched fragment, or {@code null} when the helper finds no match
   */
  private String getFellowPeopleNum(Document doc) {
    Elements friendSection = doc.select("div[id=\"friend\"]");
    if (friendSection == null) {
      return null;
    }

    Elements friendLinks = friendSection.select("a");
    if (friendLinks == null) {
      return null;
    }

    // Number of people followed.
    String matched = UtilsMethod.findFirstStringByRegex("成员[0-9]+", friendLinks.text());
    if (matched == null) {
      return null;
    }
    return matched.replaceAll("[\\D]+", "");
  }
예제 #4
0
 /**
  * Parses {@code inputContent} as HTML and returns the inner HTML of the elements selected by
  * {@code cssSelector}.
  *
  * @param inputContent HTML text to parse
  * @return the trimmed inner HTML of the selection, or {@code null} when nothing is selected
  * @throws Exception propagated from {@code validate()} or from parsing
  */
 @Override
 public String fire(String inputContent) throws Exception {
   validate();
   Elements matches = Jsoup.parse(inputContent).select(cssSelector);
   if (matches == null || matches.size() == 0) {
     return null;
   }
   return matches.html().trim();
 }
 /**
  * Fetches the configured {@code url} with GET or POST and copies the first match of every
  * attribute query into {@code metaData}.
  *
  * <p>For each configured attribute the text of the first selected element is converted by the
  * attribute's value mapper and stored under the attribute name. Attributes whose query selects
  * nothing are left out.
  *
  * @param metaData target that receives the parsed values
  * @throws MetaDataException if {@code method} is neither {@code "GET"} nor {@code "POST"}, or
  *     wrapping an {@link IOException} or {@code ValueMapperException}
  */
 @Override
 public void populateMetaData(MetaData metaData) throws MetaDataException {
   try {
     final Document doc;
     if (method.equals("GET")) {
       doc = Jsoup.connect(url).get();
     } else if (method.equals("POST")) {
       doc = Jsoup.connect(url).data(requestData).post();
     } else {
       throw new MetaDataException("Unsupported HTML access method: " + method);
     }

     for (MetaDataAttribute attribute : attributes) {
       Elements matches = doc.select(attribute.getQuery());
       if (matches.size() == 0) {
         continue;
       }
       String rawValue = matches.get(0).text();
       Object mappedValue = attribute.getValueMapper().parse(rawValue);
       metaData.put(attribute.getName(), mappedValue);
     }
   } catch (IOException e) {
     throw new MetaDataException(e);
   } catch (ValueMapperException e) {
     throw new MetaDataException(e);
   }
 }
예제 #6
0
 /**
  * Creates a chapter and fills it with the sections found on its page.
  *
  * <p>The page is requested through {@code client.requestWithCache}. Sections are read from the
  * second table of the document, or from the only table when there is exactly one: for every
  * {@code td[class=v]} cell, the text of the next sibling element is added as a section.
  * {@link IOException} and {@link IndexOutOfBoundsException} are logged; the chapter built so far
  * is still returned.
  *
  * @param id chapter id
  * @param page page name appended to both the chapter URL and the cache path
  * @return the chapter, possibly without sections when loading failed
  */
 public Chapter createChapter(int id, String page) {
   Chapter chapter = new Chapter(id);
   chapter.setUrl(Constants.BASE_URL + getVersion() + page);
   String cache = getCachePath() + page;
   try {
     String html = client.requestWithCache(chapter.getUrl(), cache, client.METHOD_GET, null);
     // Extract the content.
     Elements tables = Jsoup.parse(html).select("table");
     int mainBodyIndex = (tables.size() == 1) ? 0 : 1;
     Element mainBody = tables.get(mainBodyIndex);
     Elements indexCells = mainBody.select("td[class=v]");
     logger.debug(indexCells.size());
     for (Element indexCell : indexCells) {
       String section = indexCell.nextElementSibling().text();
       logger.debug(section);
       chapter.addSection(section);
     }
   } catch (IOException e) {
     logger.error(e.getMessage());
   } catch (IndexOutOfBoundsException e) {
     logger.error(e.getMessage());
   }
   return chapter;
 }
  /**
   * Initializes this entry from a table row.
   *
   * <p>The show name is taken from the {@code data-sc-params} attribute in the first cell, with
   * the surrounding parameter text stripped, {@code %20} turned into dots and everything from the
   * first {@code %5B} removed. Seeds and peers are parsed from cells 4 and 5. The download link is
   * the torrent link's href without its query string, with {@code .torrent} replaced by
   * {@code /temp.torrent} and {@code http:} prepended when the link starts with {@code //}.
   *
   * @param source element whose {@code td} descendants are read
   */
  @Override
  protected void initialize(Element source) {
    Elements cells = source.getElementsByTag("td");
    Element firstCell = cells.get(0);

    String params = firstCell.select("[data-sc-params]").get(0).attr("data-sc-params");
    String name = params.replaceAll("\\{ 'name': '", "");
    name = name.replaceAll("', 'magnet':.*", "");
    name = name.replaceAll("%20", "\\.");
    name = name.replaceAll("%5B.*", "");

    initialize(ShowData.fromFilename(name));

    seeds = Integer.parseInt(cells.get(4).text());
    peers = Integer.parseInt(cells.get(5).text());

    Element torrentLink = firstCell.select("div a[title=Download torrent file]").get(0);
    String hrefWithoutQuery = torrentLink.attr("href").split("\\?")[0];
    String link = hrefWithoutQuery.replaceAll("\\.torrent", "/temp\\.torrent");
    downloadLink = link.startsWith("//") ? "http:" + link : link;
  }
    /**
     * Downloads the article at {@code url} and sends the outer HTML of its paragraphs to
     * {@code myHandler}.
     *
     * <p>Shows a toast and stops when the network is unavailable or no document is returned. The
     * collected HTML is delivered under the bundle key {@code "content"} in a message whose
     * {@code what} is {@code NEWCONTENTRECEIVED}. An {@link IOException} only prints its stack
     * trace.
     */
    @Override
    public void run() {
      if (!Utils.isNET(NewsContentActivity.this)) {
        Utils.showToast(NewsContentActivity.this, "网络不可用哦,亲!", Toast.LENGTH_SHORT);
        return;
      }

      try {
        Document doc = Jsoup.connect(url).timeout(8000).get();
        if (doc == null) {
          Utils.showToast(NewsContentActivity.this, "网络不给力哦,亲,请返回再进入吧!", Toast.LENGTH_SHORT);
          return;
        }

        Elements paragraphs = doc.select("#Cnt-Main-Article-QQ P");
        StringBuilder html = new StringBuilder();
        for (int i = 0, count = paragraphs.size(); i < count; i++) {
          html.append(paragraphs.get(i).outerHtml());
        }
        String content = html.toString();

        Bundle bundle = new Bundle();
        bundle.putString("content", content);
        Log.i("content", content);

        Message msg = new Message();
        msg.setData(bundle);
        msg.what = NewsContentActivity.NEWCONTENTRECEIVED;
        myHandler.sendMessage(msg);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
예제 #9
0
 /**
  * Downloads the page at {@code string} and passes the {@code href} of the first element with
  * class {@code result-title} to {@code readHref}.
  *
  * @param string URL of the page to download
  * @return whatever {@code readHref} returns for that link
  * @throws Exception if the download fails, no such element exists, or {@code readHref} fails
  */
 public static String getAddress(String string) throws Exception {
   Elements titles = Jsoup.connect(string).get().getElementsByClass("result-title");
   return readHref(titles.get(0).attr("href"));
 }
예제 #10
0
파일: Test.java 프로젝트: vlemonkey/Test
  /**
   * Downloads a fixed page and prints its title, the number of {@code <meta>} elements and the
   * {@code content} attribute of every meta element whose markup mentions "keywords"
   * (case-insensitive).
   *
   * <p>Any exception is caught and its stack trace printed.
   */
  public static void readHead() {
    String url = "http://www.2177s.com";
    try {
      Document doc = Jsoup.connect(url).timeout(10000).get();
      System.out.printf("title:%s\n", doc.title());

      Elements metas = doc.select("meta");
      System.out.println(metas.size());
      for (Element meta : metas) {
        if (meta.toString().matches(".*(?i)keywords.*")) {
          System.out.println(meta.attr("content"));
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  /**
   * Collects a converted clone of every {@code <img>} below {@code aInContent}.
   *
   * <p>Before conversion, images with class {@code float-left} or {@code float-right} also get
   * {@code alignleft} or {@code alignright}, and every image except the first loses its
   * {@code width} and {@code height} attributes. These changes are made on the elements of
   * {@code aInContent} themselves.
   *
   * @param aInContent element to search for images
   * @return clones of the nodes produced by {@code toNode}, in document order
   */
  private static Collection<Node> extractImageNodes(Element aInContent) {
    Collection<Node> lResult = new LinkedList<>();

    boolean lIsFirstImage = true;
    for (Element lImage : aInContent.getElementsByTag("img")) {
      if (lImage.hasClass("float-left")) {
        if (!lImage.hasClass("alignleft")) {
          lImage.addClass("alignleft");
        }
      } else if (lImage.hasClass("float-right")) {
        if (!lImage.hasClass("alignright")) {
          lImage.addClass("alignright");
        }
      }

      // Only the first image keeps its explicit dimensions.
      if (!lIsFirstImage) {
        lImage.removeAttr("width");
        lImage.removeAttr("height");
      }
      lIsFirstImage = false;

      lResult.add(toNode(lImage).clone());
    }

    return lResult;
  }
예제 #12
0
파일: Crawler.java 프로젝트: cintoros/nzApp
 /**
  * Downloads {@code Settings.HOMEPAGE} and turns every {@code <article>} into a {@link News}
  * item.
  *
  * <p>Items are numbered from 1 in document order. The image source is the part of the teaser
  * image's {@code data-srcset} before the first comma and the first space. It is {@code null}
  * when the teaser has no image, and the raw attribute value when splitting yields nothing.
  *
  * @return the collected news, or {@code null} when any exception occurs
  */
 @Override
 public Collection<News> crawl() {
   HashSet<News> news = new HashSet<>();
   try {
     Document doc = Jsoup.connect(Settings.HOMEPAGE).get();
     long counter = 0;
     for (Element article : doc.select("article")) {
       counter++;
       Elements teaserLink = article.select("a.teaser__link");

       Element img = teaserLink.select("img").first();
       String imageSrc = null;
       if (img != null) {
         imageSrc = img.attr("data-srcset");
         // Keep the raw attribute value whenever a split leaves nothing to pick from.
         String[] candidates = imageSrc.split(",");
         if (candidates.length > 0) {
           String[] parts = candidates[0].split(" ");
           if (parts.length > 0) {
             imageSrc = parts[0];
           }
         }
       }

       String title = teaserLink.select("div.title__catchline").text();
       String undertitle = teaserLink.select("div.title__name").text();
       String link = teaserLink.select("[href]").attr("href");
       news.add(new News(counter, title, undertitle, link, imageSrc, "DE"));
     }
   } catch (Exception ex) {
     System.out.println("Website not parsed!!");
     return null;
   }
   return news;
 }
예제 #13
0
 /**
  * Extracts user names from the subscription tables of an HTML page.
  *
  * <p>For every table accepted by {@code isSubscriptTable}, the text of the first {@code td} of
  * each row is collected when it is longer than one character. Rows without {@code td} cells are
  * ignored.
  *
  * @param htmlContent HTML text to parse
  * @return the collected names in document order
  */
 public Collection<String> extractSubscribedUser(final String htmlContent) {
   final List<String> users = new ArrayList<String>();
   for (final Element table : Jsoup.parse(htmlContent).getElementsByTag("table")) {
     if (!isSubscriptTable(table)) {
       continue;
     }
     for (final Element row : table.getElementsByTag("tr")) {
       final Elements cells = row.getElementsByTag("td");
       if (cells.isEmpty()) {
         continue;
       }
       final String name = cells.get(0).text();
       if (name != null && name.length() > 1) {
         logger.debug("found subscription for user: '******'");
         users.add(name);
       }
     }
   }
   logger.debug("found " + users.size() + " subscribed users in htmlcontent");
   return users;
 }
예제 #14
0
 /**
  * Crawls pages depth-first, starting at {@code starturl}.
  *
  * <p>Stops when {@code urlid} has reached {@code MAXURL}, when the URL was already visited, or
  * when the page cannot be downloaded. For every newly visited page that contains links, its
  * words and description are stored through {@code insertDBWord} and
  * {@code insertDBDescription}, and each link is then crawled recursively.
  *
  * <p>Links are followed through their absolute form ({@code abs:href}) so that relative links
  * are resolved against the page they appear on; links that cannot be resolved are skipped.
  *
  * @param starturl absolute URL of the page to crawl
  * @throws IOException declared for the content and database helpers this method calls
  * @throws SQLException declared for the database helpers this method calls
  */
 public void crawl(String starturl) throws IOException, SQLException {
   if (urlid >= MAXURL) { // base case
     return;
   }
   // Check the visited list first so an already crawled page is not downloaded again.
   if (urlList.contains(starturl)) {
     return;
   }

   Document doc;
   try {
     doc = Jsoup.connect(starturl).get();
   } catch (IOException e) {
     // if the url cannot be fetched, stop crawling this branch
     return;
   } catch (IllegalArgumentException e) {
     System.out.println("Must supply a valid URL : " + starturl);
     return;
   }
   urlList.add(starturl);

   Elements hrefs = doc.select("a");
   urlid += 1;
   // nothing more to do if the page has no links
   if (hrefs == null || hrefs.size() == 0) {
     return;
   }

   HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl));
   insertDBWord(starturl, wordMap, urlid);
   insertDBDescription(starturl, topOneHundred(starturl), urlid);

   for (Element e : hrefs) {
     // "abs:href" is empty when the link has no href or cannot be made absolute.
     String href = e.attr("abs:href");
     if (href.isEmpty()) {
       continue;
     }
     crawl(href); // depth first search
   }
 }
예제 #15
0
  /**
   * Downloads the comic image of the page at {@code arg} and saves it as a PNG file.
   *
   * <p>The image is the first {@code <img>} below the element with id {@code comic}. Its
   * {@code alt} text, with whitespace replaced by underscores, becomes the file name. The image
   * is fetched through its absolute URL so that relative and protocol-relative {@code src} values
   * work. If the image cannot be decoded, a message is printed and nothing is written. An
   * {@link IOException} only prints its stack trace.
   *
   * @param arg URL of the comic page
   */
  public static void getComic(String arg) {
    try {
      Document doc = Jsoup.connect(arg).get();

      // Select the img tag in the comic id
      Elements comicImage = doc.select("#comic img");
      String comicName = comicImage.attr("alt");
      System.out.print("\nComic Name : " + comicName);
      System.out.print("\nImage Source : " + comicImage.attr("src") + "\n\n");

      // "abs:src" resolves the source against the page URL; a bare "//host/..." value has no
      // protocol and would make new URL(...) fail.
      String imageUrl = comicImage.attr("abs:src");
      RenderedImage comic = ImageIO.read(new URL(imageUrl));
      if (comic == null) {
        // ImageIO.read returns null when no reader understands the data.
        System.err.println("Could not decode comic image: " + imageUrl);
        return;
      }

      String baseName = comicName.replaceAll("\\s", "_");
      ImageIO.write(comic, "png", new File("/home/paranoidsp/Pictures/xkcd/" + baseName + ".png"));

      // TODO: the transcript (#transcript) is unformatted, one long line of text; find a way to
      // get it as readable dialogue.
    } catch (IOException exp) {
      exp.printStackTrace();
    }
  }
  /**
   * Defines the flow that turns polled Evernote notes into word requests.
   *
   * <p>The message source is polled at a fixed rate of {@code pollIntervalInSeconds} seconds.
   * Empty collections are dropped and the rest are split into single notes. Each note is
   * transformed into the text of its {@code en-note} element when the content contains exactly
   * one such element with non-blank text, and into the note title otherwise. Non-null results are
   * sent to {@code wordRequestsChannel}.
   *
   * @return the assembled integration flow
   */
  @Bean
  public IntegrationFlow evernoteIntegration() {
    return IntegrationFlows.from(
            this.evernoteMessageSource(),
            configurer ->
                configurer.poller(Pollers.fixedRate(pollIntervalInSeconds, TimeUnit.SECONDS)))
        .channel(this.inputChannel())
        .filter(Collection.class, source -> !source.isEmpty())
        .split()
        .transform(
            Note.class,
            source -> {
              String content = source.getContent();
              if (!StringUtils.isNotBlank(content)) {
                return source.getTitle();
              }

              Elements noteElements = Jsoup.parse(content).select("en-note");
              if (noteElements.size() != 1) {
                return source.getTitle();
              }

              String wordsFromNote = noteElements.get(0).text();
              return StringUtils.isNotBlank(wordsFromNote) ? wordsFromNote : source.getTitle();
            },
            configurer -> configurer.requiresReply(false))
        .filter(source -> source != null)
        .channel(wordRequestsChannel)
        .get();
  }
예제 #17
0
  /**
   * Rewrites {@code cid:} references in {@code src} and {@code href} attributes to the URLs of
   * the matching attachments.
   *
   * <p>The {@code cid:} prefix is matched case-insensitively and independently of the default
   * locale. References without an entry in {@code attachments} are left untouched.
   *
   * @param html HTML text to process
   * @param attachments attachments keyed by content id (the part after {@code cid:})
   * @return the inner HTML of the first {@code <body>}, or the whole document's HTML when there
   *     is no body element
   */
  private static String replaceCidWithAttachments(
      String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = {"src", "href"};

    for (String attrName : attrNames) {
      for (Element tag : doc.select("*[" + attrName + "]")) {
        String uriString = tag.attr(attrName).trim();

        // toLowerCase() depends on the default locale (under Turkish rules "CID:" does not
        // lower-case to "cid:"); regionMatches with ignoreCase compares char by char instead.
        if (!uriString.regionMatches(true, 0, cidScheme, 0, cidScheme.length())) {
          continue;
        }

        Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
        if (attachment == null) {
          continue;
        }

        Long id = attachment.id;
        tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
      }
    }

    Elements bodies = doc.getElementsByTag("body");
    if (bodies.size() > 0) {
      return bodies.get(0).html();
    }
    return doc.html();
  }
예제 #18
0
  /**
   * Scrapes price, model number, title and UPC of an item from a listing page and its
   * specification page.
   *
   * <p>The listing page is loaded from {@code url}; the specification page URL is built from the
   * title link's href via {@code bestBuySpecsFormatter}. The result contains the keys
   * {@code price}, {@code modelNumber}, {@code title}, {@code upc} (only when a specification row
   * mentions "UPC") and {@code GoodSKU} ({@code "true"} when the specification table has rows,
   * {@code "false"} otherwise).
   *
   * @param doc ignored; it is replaced by the document loaded from {@code url}
   * @param url URL of the listing page
   * @return the scraped values
   */
  public HashMap<String, String> initialBestBuyScan(Document doc, String url) {
    doc = jsoupConnect(url);
    Elements titleLink = doc.select(".list-item-info .sku-title h4 a");

    HashMap<String, String> matchingItems = new HashMap<String, String>();
    matchingItems.put("price", doc.select(".medium-item-price").text());
    matchingItems.put(
        "modelNumber", doc.select(".list-item-info .sku-model ul .model-number").text());
    matchingItems.put("title", titleLink.text());

    String newURL = "http://bestbuy.com" + bestBuySpecsFormatter(titleLink.attr("href"));
    System.out.println(newURL);

    doc = jsoupConnect(newURL);
    Elements specRows = doc.select("#full-specifications table tbody tr");
    for (Element row : specRows) {
      String rowText = row.text();
      if (rowText.contains("UPC")) {
        matchingItems.put("upc", rowText.replace("UPC ", ""));
        break;
      }
    }
    matchingItems.put("GoodSKU", specRows.size() < 1 ? "false" : "true");
    doc.empty();

    return matchingItems;
  }
예제 #19
0
 /**
  * Reads the name/value rows of the {@code table#id_stats} table into {@code stat}.
  *
  * <p>The first cell of a row is the name, the second the value. Recognised names (by prefix) are
  * "Win-Loss-Void" (three dash-separated integers), "Stake avg", "Odd avg", "Staked" and
  * "Returned". Rows with fewer than two {@code td} cells, such as header rows, are skipped.
  *
  * @param doc document containing the statistics table
  * @param stat object that receives the parsed values
  */
 private static void parseStatHeaderDetails(Document doc, Statistic stat) {
   Elements statsTrs = doc.select("table#id_stats").select("tr");
   for (Element tr : statsTrs) {
     Elements tds = tr.select("td");
     if (tds.size() < 2) {
       // Not a name/value row; indexing into it would throw IndexOutOfBoundsException.
       continue;
     }
     String name = tds.get(0).text().trim();
     String value = tds.get(1).text().trim();

     if (name.startsWith("Win-Loss-Void")) {
       String[] values = value.split("-");
       if (values.length == 3) {
         stat.setWin(NumberParser.parseInt(values[0]));
         stat.setLose(NumberParser.parseInt(values[1]));
         stat.setVoid_(NumberParser.parseInt(values[2]));
       } else {
         logger.warn("Win-Loss-Void section doesn't contain 3 elements as expected");
       }
     } else if (name.startsWith("Stake avg")) {
       stat.setAvgStake(NumberParser.parseDouble(value));
     } else if (name.startsWith("Odd avg")) {
       stat.setAvgOdds(NumberParser.parseDouble(value));
     } else if (name.startsWith("Staked")) {
       stat.setStaked(NumberParser.parseDouble(value));
     } else if (name.startsWith("Returned")) {
       stat.setReturned(NumberParser.parseDouble(value));
     }
   }
 }
예제 #20
0
  /**
   * Turns a downloaded page into the product pages to visit next.
   *
   * <p>Each {@code .InfoArea a[title]} link becomes a {@code productPage} entity with the link's
   * absolute href. When the page has no such link, the source page's own URL is used for a single
   * entity. Without a document nothing is emitted.
   *
   * @param downloadResult download result holding the document and its source page
   * @return an observable over the created entities
   */
  private Observable<WebPageEntity> parseDocument(DownloadResult downloadResult) {
    Set<WebPageEntity> pages = new HashSet<>(1);

    Document document = downloadResult.getDocument();
    if (document == null) {
      return Observable.from(pages);
    }

    Elements productLinks = document.select(".InfoArea a[title]");
    if (productLinks.isEmpty()) {
      WebPageEntity page =
          new WebPageEntity(
              downloadResult.getSourcePage(),
              "",
              "productPage",
              downloadResult.getSourcePage().getUrl(),
              downloadResult.getSourcePage().getCategory());
      LOGGER.info("productPageUrl={}", page.getUrl());
      pages.add(page);
    } else {
      for (Element productLink : productLinks) {
        WebPageEntity page =
            new WebPageEntity(
                downloadResult.getSourcePage(),
                "",
                "productPage",
                productLink.attr("abs:href"),
                downloadResult.getSourcePage().getCategory());
        LOGGER.info("productPageUrl={}", page.getUrl());
        pages.add(page);
      }
    }
    return Observable.from(pages);
  }
예제 #21
0
파일: MovieSpider.java 프로젝트: jimly/jca
 /**
  * Crawls listing pages one after another until a page contains no {@code .video-item} element.
  *
  * <p>Each page URL is {@code url_tpl} followed by the current {@code page} counter, which is
  * incremented for every request. For every item the cover, cover title and name are read, the
  * movie with that name is loaded or created, its details are fetched through
  * {@code getDetails}, and the movie is saved. A failure on one item is logged and does not stop
  * the crawl.
  *
  * <p>Pages are processed in a loop rather than by recursion so that the call stack does not grow
  * with the number of pages.
  */
 private static void crawl() {
   while (true) {
     String url = url_tpl + (page++);
     Logger.info("正在抓取:%s", url);
     if (StringUtils.isBlank(url)) {
       return;
     }
     sleep();
     Document doc = Jsoup.parse(WS.url(url).get().body, url);
     Elements elements = doc.select(".video-item");
     if (elements.isEmpty()) {
       return;
     }
     for (Element element : elements) {
       try {
         Element link = element.select(">a").first();
         String cover = link.select("img").first().absUrl("src");
         String coverTitle = link.select(".v-update").first().html();
         String name = element.select(".v-desc .v-title a").first().html();
         Logger.info("正在抓取名称:%s", name);
         Movie movie = Movie.find("byName", name).first();
         if (movie == null) {
           movie = new Movie();
           movie.id = DBCounter.generateUniqueCounter(Movie.class) + "";
         }
         movie.name = name;
         movie.cover = cover;
         movie.cover_title = coverTitle;
         movie.details =
             getDetails(
                 movie, "http://video.baidu.com/v?word=" + URLEncoder.encode("美剧 " + name, "GBK"));
         movie.save();
       } catch (Exception e) {
         Logger.error(e.getMessage(), e);
       }
     }
   }
 }
예제 #22
0
  /**
   * Loads the news from "http://enib.net/" and appends them to the {@code news} list.
   *
   * <p>For every element with class {@code news}, the name is the text of its {@code h1}, the
   * information the text of its {@code h2}, and the body the paragraphs of its {@code markdown}
   * elements, each followed by the platform line separator.
   *
   * <p>When the page cannot be downloaded, the error is printed and the list is returned
   * unchanged.
   *
   * @return the {@code news} list of this object
   */
  public List<News> getNews() {
    Document doc;
    try {
      doc = Jsoup.connect("http://enib.net/").get();
    } catch (IOException e) {
      System.out.println("Can't load news");
      e.printStackTrace();
      // Without a document there is nothing to parse; continuing would dereference null.
      return this.news;
    }

    String lineSeparator = System.getProperty("line.separator");
    for (Element item : doc.getElementsByClass("news")) {
      String name = item.select("h1").text();
      String information = item.select("h2").text();

      StringBuilder body = new StringBuilder();
      for (Element paragraph : item.getElementsByClass("markdown").select("p")) {
        body.append(paragraph.text()).append(lineSeparator);
      }

      this.news.add(new News(name, information, body.toString()));
    }
    return this.news;
  }
  /**
   * Builds a news {@link Article} from a downloaded page.
   *
   * <p>The title is the text of {@code .article h1} and the content the text of
   * {@code .article_con}. The publish date is group 1 of {@code datePattern} found in the text of
   * {@code .article h2}, or today's date formatted as {@code yyyy-MM-dd} when the pattern is not
   * found.
   *
   * @param htmlObject page HTML together with its URL
   * @return the populated article
   */
  @Override
  public Article run(HtmlObject htmlObject) {
    Document doc = Jsoup.parse(htmlObject.getHtml());

    String title = doc.select(".article h1").text();
    Elements body = doc.select(".article_con");
    String content = (body != null) ? body.text() : "";

    Matcher dateMatcher = datePattern.matcher(doc.select(".article h2").text());
    String date;
    if (dateMatcher.find()) {
      date = dateMatcher.group(1);
    } else {
      date = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    Article article = new Article();
    article.setUrl(htmlObject.getUrl());
    article.setTitle(title);
    article.setContent(content);
    article.setPublishDate(date);
    article.setArticleType(ArticleType.News);
    article.setProvider("雨果网");
    return article;
  }
예제 #24
0
 /**
  * Collects the input fields of the first form matching {@code formSelector}.
  *
  * <p>Every {@code <input>} of the form except those of type {@code submit} is added to
  * {@code hiddenFormFields} as a name/value pair. When no form matches, an error is logged and
  * the JVM is terminated with exit code 1.
  *
  * <p>NOTE(review): this method returns {@code false} on every path, including after the fields
  * were collected successfully. Confirm with the callers whether {@code true} was intended for
  * success.
  *
  * @param rw response wrapper that supplies the parsed page
  * @param hiddenFormFields list that receives the captured fields
  * @param formSelector CSS selector of the form
  * @return always {@code false}
  */
 public static boolean getFormFields(
     ResponseWrapper rw, List<NameValuePairString> hiddenFormFields, String formSelector) {
   // --- analysis of the page containing the form, specific to the site
   Document doc = rw.getJSoupDocument();
   Elements forms = doc.select(formSelector); // for debugging; there should be exactly one
   if (forms == null || forms.size() <= 0) {
     log.error("unable to find form at selector: " + formSelector);
     System.exit(1);
     return false;
   }

   Element form = forms.get(0);
   if (form == null) {
     log.error("failed to get form to analyze at: " + rw.dump());
     System.exit(1);
   }

   for (Element input : form.select("input")) {
     if (input.attr("type").equals("submit")) {
       continue;
     }
     String fieldName = input.attr("name");
     hiddenFormFields.add(new NameValuePairString(fieldName, input.val()));
     log.debug("captured form input: " + fieldName + " = " + input.val());
   }
   return false;
 }
예제 #25
0
  /**
   * Collects the news links of a category page.
   *
   * <p>Links are the anchors inside {@code div[class=views-field views-field-title]}; each href
   * is stored without its first character.
   *
   * @param categoryName not used by this method
   * @param categoryURL URL of the category page
   * @return the collected links, or {@code null} when the page has no matching links or cannot be
   *     downloaded
   */
  @Override
  public List<String> parseCategory(String categoryName, String categoryURL) {
    try {
      Document doc = Jsoup.connect(categoryURL).timeout(Constants.MAX_DELAY_TIME * 1000).get();
      Elements titleLinks = doc.select("div[class=views-field views-field-title]").select("a");
      if (titleLinks == null || titleLinks.size() == 0) {
        return null;
      }

      List<String> linksByCategoryList = new ArrayList<String>();
      for (Element titleLink : titleLinks) {
        linksByCategoryList.add(titleLink.attr("href").substring(1));
      }
      return linksByCategoryList;
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
예제 #26
0
  /**
   * Queries the Correios CEP locator at {@code url} and parses the returned address.
   *
   * <p>A POST request checks that the service answers with HTTP 200; the page is then read from
   * {@code url} as ISO-8859-1 and the address is taken from the first four {@code td} cells that
   * have neither a {@code colspan} attribute nor child elements. Any failure is logged and
   * reported through an error result. The HTTP connection and the page stream are always
   * released.
   *
   * @param url locator URL including the query
   * @return a success result with the parsed cells, or a result carrying
   *     {@code Webservicecep.ERROR_CODE}
   * @throws IOException declared for callers; failures inside this method are turned into an
   *     error result instead
   * @author pulu - 09/09/2013
   */
  private Webservicecep findAddressByCepAtCorreios(String url) throws IOException {
    HttpClient httpClient = new HttpClient();
    PostMethod postMethod = new PostMethod(url);
    log.log(Level.INFO, "Querying to correios WS...");
    try {
      httpClient.executeMethod(postMethod);

      if (postMethod.getStatusCode() != HttpStatus.SC_OK) {
        return new Webservicecep(Webservicecep.ERROR_CODE);
      }

      Document doc;
      java.io.InputStream page = new URL(url).openStream();
      try {
        doc = Jsoup.parse(page, "ISO-8859-1", url);
      } finally {
        page.close();
      }
      Elements elements = doc.select("td:not([colspan]):not(:has(*))");

      return new Webservicecep(
          Webservicecep.SUCCESS_CODE,
          elements.get(3).ownText(),
          elements.get(2).ownText(),
          elements.get(1).ownText(),
          "",
          elements.get(0).ownText());
    } catch (Exception e) {
      log.log(Level.WARNING, "Failed to parse html data. Possible reason: invalid cep.", e);
      return new Webservicecep(Webservicecep.ERROR_CODE);
    } finally {
      // The connection must be handed back whether or not the request succeeded.
      postMethod.releaseConnection();
    }
  }
예제 #27
0
  /**
   * Loads all contests from the results HTML file.
   *
   * <p>Every table row without {@code th} cells becomes a {@code Concurso}: cell 0 is the contest
   * code, cells 2 to 7 are the six drawn numbers, and cell 15 equal to "SIM" marks the contest as
   * accumulated. The header cells of the second row are printed first.
   *
   * @throws IOException if the file cannot be read
   */
  private BancoMegaSena() throws IOException {
    this.concursos = new ArrayList<Concurso>();

    File input = new File("C:\\Users\\Rodrigo Lacerda\\Downloads\\D_mgsasc (1)\\d_megasc.htm");

    Elements rows = Jsoup.parse(input, "UTF-8").getElementsByTag("tr");
    System.out.println(rows.get(1).getElementsByTag("th"));

    for (Element row : rows) {
      if (!row.getElementsByTag("th").isEmpty()) {
        continue;
      }
      Elements cells = row.getElementsByTag("td");

      // Read every cell before parsing anything, in the same order as the columns.
      String codigo = cells.get(0).text();
      String[] dezenas = new String[6];
      for (int k = 0; k < dezenas.length; k++) {
        dezenas[k] = cells.get(k + 2).text();
      }
      boolean acumulado = cells.get(15).text().equals("SIM");

      Concurso concurso = new Concurso(Integer.parseInt(codigo));
      for (String dezena : dezenas) {
        concurso.addNumero(Integer.parseInt(dezena));
      }
      concurso.setAcumulado(acumulado);

      this.concursos.add(concurso);
    }
  }
  /**
   * Decides whether {@code page} is a generated index page of the targeted server.
   *
   * <p>The page is recognised when its HTTP status is 200 and one of its anchors has the same
   * text and the same absolute href as the element returned by
   * {@code getParentDirectoryElement(page)}.
   *
   * @param context not used by this method
   * @param page page to inspect
   * @return a {@code RECOGNIZED_SHOULD_BE_SCRAPED} result on a match, {@code UNRECOGNIZED}
   *     otherwise
   */
  @Override
  protected RemoteDetectionResult detectRemoteRepository(
      final ScrapeContext context, final Page page) {
    // cheap checks first, to quickly eliminate target without doing any remote requests
    if (page.getHttpResponse().getStatusLine().getStatusCode() == 200) {
      final Elements anchors = page.getDocument().getElementsByTag("a");
      if (!anchors.isEmpty()) {
        // The "template" parent link. The page's own parent link is usually the first anchor,
        // but some servers (HTTPD for example) put column-sorting links before it, so every
        // anchor is compared.
        final Element expectedParent = getParentDirectoryElement(page);
        final String expectedText = expectedParent.text();
        final String expectedTarget = expectedParent.absUrl("href");
        for (Element anchor : anchors) {
          final boolean sameLink =
              expectedText.equals(anchor.text()) && expectedTarget.equals(anchor.absUrl("href"));
          if (sameLink) {
            return new RemoteDetectionResult(
                RemoteDetectionOutcome.RECOGNIZED_SHOULD_BE_SCRAPED,
                getTargetedServer(),
                "Remote is a generated index page of " + getTargetedServer());
          }
        }
      }
    }

    // Not positive: this might be some web server with an index page similar to the targeted one.
    return new RemoteDetectionResult(
        RemoteDetectionOutcome.UNRECOGNIZED,
        getTargetedServer(),
        "Remote is not a generated index page of " + getTargetedServer());
  }
예제 #29
0
  /**
   * Downloads the menu page for {@code number} and parses its meals.
   *
   * <p>For every {@code td[width=400]} cell, the {@code div[id=ssilka]} elements of the cell's
   * parent are read: the first one is the description, and the part of the last one before the
   * first "-" is the cost. Entries without such a div or without a "-" in the cost are skipped.
   *
   * @param number value formatted into the {@code URL} template
   * @return the parsed meals; empty when the page cannot be downloaded
   * @throws NumberFormatException if a cost prefix is not an integer
   */
  public List<MenuMeal> getMenuMeals(int number) {
    List<MenuMeal> meals = new ArrayList<>();

    Document doc;
    try {
      doc =
          Jsoup.connect(String.format(URL, number))
              .userAgent("Chrome/49.0.2623.112")
              .referrer("https://www.google.ru/")
              .timeout(7000)
              .get();
    } catch (IOException e) {
      e.printStackTrace();
      return meals;
    }

    // The attribute selector needs its closing bracket; the unbalanced form is not valid
    // selector syntax.
    for (Element cell : doc.select("td[width=400]")) {
      Elements linkBlocks = cell.parent().select("div[id=ssilka]");
      if (linkBlocks.isEmpty()) {
        continue;
      }
      String cost = linkBlocks.last().text();
      int dash = cost.indexOf("-");
      if (dash < 0) {
        continue;
      }

      MenuMeal menuMeal = new MenuMeal();
      menuMeal.setDescription(linkBlocks.first().text());
      menuMeal.setCost(Integer.valueOf(cost.substring(0, dash)));
      meals.add(menuMeal);
    }
    return meals;
  }
예제 #30
0
  /**
   * Reads the number of search results from a result page.
   *
   * <p>The count is the first space-separated token of the first {@code h2.page-title.hidden-xs}
   * element, or of the first {@code div#resultsCountHeader h1.fnt12} element when the former is
   * absent. A trailing "+" (as in "1000+") and thousands separators are removed before parsing.
   *
   * @param document result page
   * @return the parsed count, or 0 when no header is found or the token is not a number
   */
  private Integer searchResults(Document document) {
    Elements headers = document.select("h2.page-title.hidden-xs");
    if (headers.size() == 0) {
      // Sometimes results come in a different place, check it
      headers = document.select("div#resultsCountHeader h1.fnt12");
    }

    String countText = headers.size() > 0 ? headers.get(0).text().split(" ")[0] : "0";

    // When the result is more than 1000 we get 1000+, so we delete the + sign
    if (countText.endsWith("+")) {
      countText = countText.substring(0, countText.length() - 1);
    }

    try {
      return Integer.parseInt(countText.replace(",", ""));
    } catch (NumberFormatException e) {
      // Covers results like 'Zero' or 'Sorry, none job...'
      System.out.println("Error parsing:" + countText);
      return 0;
    }
  }