Пример #1
1
 /**
  * Downloads the detail page of {@code hsDeck} and fills in its class cards, neutral cards and
  * description.
  *
  * <p>The page is fetched from {@code HPDeckSource.BASE_URL + hsDeck.getUrl()}. On an
  * {@link IOException} the stack trace is printed and the deck is returned with whatever was set
  * before the failure (best effort).
  *
  * @param hsDeck deck to populate; mutated in place
  * @param n value passed through unchanged to {@code HtmlHelper.parseDescription}
  * @return the same {@code hsDeck} instance
  */
 @Override
 public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
   try {
     final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

     final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
     final ArrayList<String> classCardNames = new ArrayList<String>();
     collectCardCells(
         page.select("section.class-listing table.listing td.col-name"),
         classHsItemMap,
         classCardNames);
     hsDeck.setClassHsItemMap(classHsItemMap);
     hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classCardNames));

     final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
     final ArrayList<String> neutralCardNames = new ArrayList<String>();
     collectCardCells(
         page.select("section.neutral-listing table.listing td.col-name"),
         neutralHsItemMap,
         neutralCardNames);
     hsDeck.setNeutralHsItemMap(neutralHsItemMap);
     hsDeck.setNeutralHsItemList(
         DataBaseManager.getInstance().getAllCardsByNames(neutralCardNames));

     hsDeck.setDescription(
         HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
     return hsDeck;
   } catch (IOException ex) {
     ex.printStackTrace();
     return hsDeck;
   }
 }

 /**
  * Reads card-name cells: the text of the first link is the card name, the last character of the
  * cell's trimmed text is stored as the value for that name.
  *
  * <p>Cells without a link or without any text are skipped instead of aborting the whole parse
  * with an index exception.
  *
  * @param cells the {@code td.col-name} cells of one listing
  * @param lastCharByName receives card name -> last character of the cell text
  * @param names receives the card names in document order
  */
 private void collectCardCells(
     final Elements cells,
     final HashMap<String, String> lastCharByName,
     final ArrayList<String> names) {
   for (int i = 0; i < cells.size(); ++i) {
     final Elements anchors = cells.get(i).select("a");
     final String cellText = cells.get(i).text().trim();
     if (anchors.isEmpty() || cellText.isEmpty()) {
       continue; // malformed cell: nothing usable to record
     }
     final String cardName = anchors.get(0).text();
     lastCharByName.put(cardName, cellText.substring(cellText.length() - 1));
     names.add(cardName);
   }
 }
Пример #2
0
  /**
   * Fetches {@code mapUrl} and returns the absolute targets of its {@code <a>} elements that
   * belong to the site being crawled.
   *
   * <p>A link is kept only when its absolute URL contains {@code base} (plain substring test, used
   * to drop external domains) and does not contain {@code "index"} (index pages are dropped to
   * prevent an infinite loop).
   *
   * @param mapUrl URL of the page to scan
   * @param base substring every kept link must contain
   * @return list of {@code String} URLs in document order, or {@code null} if anything failed
   *     (fetch error, invalid URL, ...); callers must check for {@code null}
   */
  public static List genSitemap(String mapUrl, String base) {
    try {
      Document doc = Jsoup.connect(mapUrl).get();
      List<String> internalLinks = new ArrayList<String>();
      for (Element link : doc.select("a")) {
        String target = link.attr("abs:href");
        // Filter while collecting instead of building the full list and pruning it twice.
        if (target.contains(base) && !target.contains("index")) {
          internalLinks.add(target);
        }
      }
      return internalLinks;
    } catch (Exception e) {
      // Deliberately silent: failure is reported to the caller through the null return value.
      return null;
    }
  }
Пример #3
0
 /**
  * Fetches {@code mapUrl} and lists the absolute {@code src} of its {@code <img>} elements.
  *
  * <p>The first entry of the result is always {@code mapUrl} itself; image sources containing
  * {@code "paypal"} are left out.
  *
  * @param mapUrl URL of the page to scan
  * @return list of {@code String}s (page URL followed by image URLs), or {@code null} if any
  *     exception occurred; the exception is printed to standard output
  */
 public static List getImgs(String mapUrl) {
   try {
     Elements imageTags = Jsoup.connect(mapUrl).get().select("img");
     List<String> result = new ArrayList<String>();
     result.add(mapUrl);
     for (int idx = 0; idx < imageTags.size(); idx++) {
       String source = imageTags.get(idx).attr("abs:src");
       if (source.contains("paypal")) {
         continue;
       }
       result.add(source);
     }
     return result;
   } catch (Exception e) {
     System.out.println(e);
     return null;
   }
 }
Пример #4
0
  /**
   * Downloads the sample table page and prints the text of the second {@code TD} cell of every
   * row of the element with id {@code datatable}.
   *
   * <p>If the page cannot be fetched the stack trace is printed and nothing else happens; if the
   * page has no {@code datatable} element a message is written to standard error.
   */
  public Scraper() {

    Document doc;

    try {
      doc =
          Jsoup.connect(
                  "http://www.geog.leeds.ac.uk/courses/other/programming/practicals/general/web/scraping-intro/table.html")
              .get();
    } catch (IOException ioe) {
      ioe.printStackTrace();
      return; // no document to parse; continuing would dereference null
    }

    Element table = doc.getElementById("datatable");
    if (table == null) {
      System.err.println("Scraper: element with id \"datatable\" not found");
      return;
    }

    for (Element row : table.getElementsByTag("TR")) {
      Elements tds = row.getElementsByTag("TD");
      // Only the second cell is printed; rows with fewer cells are skipped.
      if (tds.size() > 1) {
        System.out.println(tds.get(1).text());
      }
    }
  }
Пример #5
0
  /**
   * Scrapes one movie page plus its reviews page line by line and appends the result as a
   * {@code <movie>} XML fragment to the file {@code movies.xml}.
   *
   * <p>The movie page is processed by a line-driven state machine held in {@code count}
   * (-1 .. 14): each state waits for a marker string in the current line, emits the element for
   * the text accumulated so far and advances. The reviews page is processed by a second, two-state
   * loop. The counter {@code n}, declared outside this method, supplies the movie id and is
   * incremented on every call.
   *
   * <p>Any exception is caught and its stack trace printed; in that case nothing is written. The
   * output file and both HTTP streams are always closed, also on failure.
   *
   * @param link URL of the movie page; {@code "reviews/#type=top_critics"} is appended to it to
   *     form the reviews URL
   */
  public void CollectData(String link) {
    // Declared outside the try block so that the finally block can release them.
    BufferedWriter bw3 = null;
    BufferedReader br3 = null;
    BufferedReader br4 = null;

    try {
      // Creating an empty XML Document

      DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
      DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
      Document doc = docBuilder.newDocument();
      // create the root element and add it to the document
      Element movie = doc.createElement("movie");
      doc.appendChild(movie);
      movie.setAttribute("id", String.valueOf(n));
      n++;
      // create sub elements
      Element genres = doc.createElement("genres");
      Element actors = doc.createElement("actors");
      Element reviews = doc.createElement("reviews");

      URL movieUrl = new URL(link);
      URL reviewsURL = new URL(link + "reviews/#type=top_critics");
      // Opened in append mode: every call adds one more fragment to the same file.
      bw3 = new BufferedWriter(new FileWriter("movies.xml", true));
      int count = -1;
      br3 = new BufferedReader(new InputStreamReader(movieUrl.openStream()));
      String str2 = "";
      // Lines accumulated for the field currently being collected.
      String info = "";
      while (null != (str2 = br3.readLine())) {
        // start reading the html document
        if (str2.isEmpty()) continue;
        // State 14: cast list finished, nothing more to read from the movie page.
        if (count == 14) break;
        // State 12: skip everything until the cast heading.
        if (count == 12) {
          if (!str2.contains("<h3>Cast</h3>")) continue;
          else count++;
        }
        // State 13: one <actor> per name line until the advertisement marker ends the list.
        if (count == 13) {
          if (str2.contains(">ADVERTISEMENT</p>")) {
            count++;
            movie.appendChild(actors);
            continue;
          } else {
            if (str2.contains("itemprop=\"name\">")) {
              Element actor = doc.createElement("actor");
              actors.appendChild(actor);
              Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
              actor.appendChild(text);
            } else continue;
          }
        }

        if (count <= 11) {
          switch (count) {
            case -1:
              {
                // Wait for the og:image line and take the image URL(s) out of it.
                if (!str2.contains("property=\"og:image\"")) continue;
                else {
                  Pattern image =
                      Pattern.compile("http://.*.jpg", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
                  Matcher match = image.matcher(str2);
                  // NOTE(review): count advances once per match, so more than one match on the
                  // line would skip states - confirm this cannot happen on real pages.
                  while (match.find()) {

                    Element imageLink = doc.createElement("imageLink");
                    movie.appendChild(imageLink);
                    Text text = doc.createTextNode(match.group());
                    imageLink.appendChild(text);
                    count++;
                  }
                }
                break;
              }
            case 0:
              {
                // Title line -> <name>, with the site suffix removed.
                if (str2.contains("<title>")) {

                  Element name = doc.createElement("name");
                  movie.appendChild(name);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(str2.toString().replace(" - Rotten Tomatoes", "")).text());
                  name.appendChild(text);
                  count++;
                }
                break;
              }
            case 1:
              {
                // Rating line -> <score>.
                if (!str2.contains("itemprop=\"ratingValue\"")) break;
                else {
                  Element score = doc.createElement("score");
                  movie.appendChild(score);
                  Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
                  score.appendChild(text);
                  count++;
                }
                break;
              }
            case 2:
              {
                // Skip ahead to the description marker; the marker line itself is not stored.
                if (!str2.contains("itemprop=\"description\">")) continue;
                else count++;
                break;
              }
            case 3:
              {
                // Accumulate description lines until the duration marker, then emit <MovieInfo>.
                if (!str2.contains("itemprop=\"duration\"")) info = info.concat(str2);
                else {
                  Element MovieInfo = doc.createElement("MovieInfo");
                  movie.appendChild(MovieInfo);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  MovieInfo.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 4:
              {
                // Accumulate until the first genre marker, then emit <duration>.
                if (!str2.contains("itemprop=\"genre\"")) info = info.concat(str2);
                else {
                  Element duration = doc.createElement("duration");
                  movie.appendChild(duration);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  duration.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 5:
              {
                // The genre line that ended state 4 is still pending in info: emit it first.
                if (info.contains("itemprop=\"genre\"")) {
                  Element genre = doc.createElement("genre");
                  genres.appendChild(genre);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  genre.appendChild(text);
                  info = "";
                }
                // The director label closes the genre list.
                if (str2.contains(">Directed By:<")) {
                  count++;
                  movie.appendChild(genres);
                  continue;
                } else {

                  if (str2.contains("itemprop=\"genre\"")) {
                    Element genre = doc.createElement("genre");
                    genres.appendChild(genre);
                    Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
                    genre.appendChild(text);
                  } else continue;
                }
                break;
              }
            case 6:
              {
                // Accumulate the director until the writer label; if the theatre-release label
                // comes first there is no writer section, so state 7 is skipped.
                if (!str2.contains(">Written By:<")) {
                  if (str2.contains(">In Theaters:<")) {
                    Element director = doc.createElement("director");
                    movie.appendChild(director);
                    Text text =
                        doc.createTextNode(
                            Jsoup.parse(info.toString().replace("Directed By: ", "")).text());
                    director.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element director = doc.createElement("director");
                  movie.appendChild(director);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("Directed By: ", "")).text());
                  director.appendChild(text);
                  info = "";
                  count++;
                }
                break;
              }
            case 7:
              {
                // Accumulate the writer until the theatre-release label; if the DVD label comes
                // first, state 8 is skipped.
                if (!str2.contains(">In Theaters:<")) {
                  if (str2.contains(">On DVD:<")) {
                    Element writer = doc.createElement("writer");
                    movie.appendChild(writer);
                    Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                    writer.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element writer = doc.createElement("writer");
                  movie.appendChild(writer);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  writer.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 8:
              {
                // Accumulate the theatre release until the DVD label.
                if (!str2.contains(">On DVD:<")) info = info.concat(str2);
                else {
                  Element TheatreRelease = doc.createElement("TheatreRelease");
                  movie.appendChild(TheatreRelease);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("In Theaters:", "")).text());
                  TheatreRelease.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 9:
              {
                // Accumulate the DVD release until the box-office label; if the production
                // company comes first, state 10 is skipped.
                if (!str2.contains(">US Box Office:<")) {
                  if (str2.contains("itemprop=\"productionCompany\"")) {
                    Element DvdRelease = doc.createElement("DvdRelease");
                    movie.appendChild(DvdRelease);
                    Text text =
                        doc.createTextNode(
                            Jsoup.parse(info.toString().replace("On DVD:", "")).text());
                    DvdRelease.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element DvdRelease = doc.createElement("DvdRelease");
                  movie.appendChild(DvdRelease);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("On DVD:", "")).text());
                  DvdRelease.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 10:
              {
                // Accumulate the box-office figure until the production-company marker.
                if (!str2.contains("itemprop=\"productionCompany\"")) info = info.concat(str2);
                else {
                  Element BOCollection = doc.createElement("BOCollection");
                  movie.appendChild(BOCollection);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("US Box Office:", "")).text());
                  BOCollection.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 11:
              {
                // Accumulate the production company until the official-site link.
                if (!str2.contains(">Official Site")) info = info.concat(str2);
                else {
                  Element Production = doc.createElement("Production");
                  movie.appendChild(Production);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  Production.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }

            default:
              break;
          }
        }
      }
      br4 = new BufferedReader(new InputStreamReader(reviewsURL.openStream()));
      String str3 = "";
      String info2 = "";
      // 0 = looking for the start of a review snippet, 1 = collecting its lines.
      int count2 = 0;
      while (null != (str3 = br4.readLine())) {
        if (count2 == 0) {

          if (!str3.contains("<div class=\"reviewsnippet\">")) continue;
          else count2++;
        }
        if (count2 == 1) {
          // The snippet runs until the "small subtle" paragraph; then one <review> is emitted.
          if (!str3.contains("<p class=\"small subtle\">")) info2 = info2.concat(str3);
          else {
            Element review = doc.createElement("review");
            reviews.appendChild(review);
            Text text = doc.createTextNode(Jsoup.parse(info2.toString()).text());
            review.appendChild(text);
            info2 = "";
            count2 = 0;
          }
        }
      }
      movie.appendChild(reviews);
      TransformerFactory transfac = TransformerFactory.newInstance();
      Transformer trans = transfac.newTransformer();
      trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
      trans.setOutputProperty(OutputKeys.INDENT, "yes");

      // create string from xml tree
      StringWriter sw = new StringWriter();
      StreamResult result = new StreamResult(sw);
      DOMSource source = new DOMSource(doc);
      trans.transform(source, result);
      String xmlString = sw.toString();
      bw3.write(xmlString);
    } catch (Exception ex) {
      ex.printStackTrace();
    } finally {
      // Same closing order as before, but now also executed when scraping fails.
      closeReportingErrors(br3);
      closeReportingErrors(br4);
      closeReportingErrors(bw3);
    }
  }

  /**
   * Closes {@code resource} if it was opened; a failure to close is printed and does not prevent
   * the remaining resources from being closed.
   */
  private void closeReportingErrors(java.io.Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
Пример #6
0
 /**
  * Builds the deck-browser URL from the filters in {@code deckBrowserRequest}, downloads the
  * page and converts the rows of {@code table#decks} into {@link HSDeck} objects.
  *
  * <p>Each filter is appended as {@code "&" + <request param> + <value>} only when it is set:
  * sorting key and deck-name filter when non-null and not blank, class filter when non-empty and
  * not containing {@code HSPlayerClass.ALL} (the filter values of the single classes are summed),
  * order-by when non-null and not empty (prefixed with {@code "-"} for descending order).
  *
  * <p>The first and the last table row are not converted. On an {@link IOException} the stack
  * trace is printed and the decks collected so far are returned.
  *
  * @param deckBrowserRequest filter and ordering settings
  * @return the parsed decks; possibly empty, never {@code null}
  */
 @Override
 public List<HSDeck> getDeckListFiltered(final DeckBrowserRequest deckBrowserRequest) {
   final List<HSPlayerClass> classFilter = deckBrowserRequest.getClassFilter();
   final ArrayList<HSDeck> decks = new ArrayList<HSDeck>();
   try {
     String url = HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL;

     if (deckBrowserRequest.getSortingKey() != null
         && !deckBrowserRequest.getSortingKey().trim().isEmpty()) {
       url =
           url
               + "&"
               + HP_REQUEST_PARAMS.FILTER_OPTION.requestParam
               + deckBrowserRequest.getSortingKey();
     }

     if (deckBrowserRequest.getDeckNameFilter() != null
         && !deckBrowserRequest.getDeckNameFilter().trim().isEmpty()) {
       url =
           url
               + "&"
               + HP_REQUEST_PARAMS.FILTER_SEARCH.requestParam
               + this.constructDeckNameFilter(deckBrowserRequest.getDeckNameFilter());
     }

     if (classFilter != null
         && classFilter.size() > 0
         && !classFilter.contains(HSPlayerClass.ALL)) {
       int filterValueSum = 0;
       for (final HSPlayerClass playerClass : classFilter) {
         if (playerClass.isSingleClass()) {
           filterValueSum += playerClass.getHsFilterValue();
         }
       }
       url = url + "&" + HP_REQUEST_PARAMS.FILTER_CLASS.requestParam + filterValueSum;
     }

     if (deckBrowserRequest.getOrderBy() != null && !deckBrowserRequest.getOrderBy().isEmpty()) {
       // Descending order is requested with a leading minus sign.
       final String direction = deckBrowserRequest.isAsc() ? "" : "-";
       url =
           url
               + "&"
               + HP_REQUEST_PARAMS.FILTER_SORT.requestParam
               + direction
               + deckBrowserRequest.getOrderBy();
     }

     final Elements rows =
         Jsoup.connect(url)
             .referrer(HPDeckSource.BASE_URL + "/")
             .followRedirects(true)
             .ignoreHttpErrors(true)
             .get()
             .select("table#decks tr");

     // Row 0 and the final row are skipped.
     final int end = rows.size() - 1;
     for (int rowIndex = 1; rowIndex < end; ++rowIndex) {
       final Element row = rows.get(rowIndex);
       final Elements nameLinks = row.select("td.col-name span.tip a");
       final Elements typeCells = row.select("td.col-deck-type");
       final Elements classCells = row.select("td.col-class");
       final Elements ratingCells = row.select("td.col-ratings div.rating-sum");
       final Elements costCells = row.select("td.col-dust-cost");
       final Elements updatedCells = row.select("td.col-updated abbr");

       final HSDeck deck = new HSDeck();
       final Element nameLink = nameLinks.get(0);
       deck.setName(nameLink.text());
       deck.setUrl(nameLink.attr("href"));
       deck.setType(typeCells.get(0).text());
       deck.setPlayerClass(classCells.get(0).text());
       deck.setRating(ratingCells.get(0).text());
       deck.setCost(costCells.get(0).text());

       final Element updated = updatedCells.get(0);
       if (updated.hasAttr("data-epoch")) {
         deck.setLastUpdate(updated.attributes().get("data-epoch"));
       }
       deck.setLastUpdateAsString(updated.text());
       decks.add(deck);
     }
   } catch (IOException ex) {
     ex.printStackTrace();
   }
   return decks;
 }