/**
 * Downloads the detail page of the given deck and fills in its card listings and description.
 *
 * <p>The page at {@code HPDeckSource.BASE_URL + hsDeck.getUrl()} is fetched. The name cells of
 * the class-card table and of the neutral-card table are each turned into a name-to-count map
 * plus a list of card names; the names are resolved through
 * {@code DataBaseManager.getInstance().getAllCardsByNames(...)}. Finally the HTML of
 * {@code div.deck-description} is handed to {@code HtmlHelper.parseDescription}.
 *
 * @param hsDeck deck whose URL is fetched; it is updated in place
 * @param n value forwarded unchanged to {@code HtmlHelper.parseDescription}
 * @return the same {@code hsDeck} instance; it is returned untouched when the page download
 *     fails with an {@code IOException}
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
  try {
    final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

    final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
    final ArrayList<String> classCardNames = new ArrayList<String>();
    readCardListing(
        page.select("section.class-listing table.listing td.col-name"),
        classHsItemMap,
        classCardNames);
    hsDeck.setClassHsItemMap(classHsItemMap);
    hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classCardNames));

    final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
    final ArrayList<String> neutralCardNames = new ArrayList<String>();
    readCardListing(
        page.select("section.neutral-listing table.listing td.col-name"),
        neutralHsItemMap,
        neutralCardNames);
    hsDeck.setNeutralHsItemMap(neutralHsItemMap);
    hsDeck.setNeutralHsItemList(
        DataBaseManager.getInstance().getAllCardsByNames(neutralCardNames));

    hsDeck.setDescription(
        HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
    return hsDeck;
  } catch (IOException ex) {
    // Best effort: the caller gets the deck back without detail data.
    ex.printStackTrace();
    return hsDeck;
  }
}

/**
 * Reads the name cells of one card table into a name-to-count map and a list of names.
 *
 * <p>The card name is the text of the first anchor in the cell; the stored count is the last
 * character of the trimmed cell text (presumably the number of copies - confirm against the
 * page markup). Cells without an anchor or without any text are skipped instead of aborting
 * the whole parse with an index exception.
 *
 * @param nameCells the {@code td.col-name} cells of one listing table
 * @param countsByName receives card name to count string
 * @param names receives the card names in document order
 */
private static void readCardListing(
    final Elements nameCells,
    final HashMap<String, String> countsByName,
    final ArrayList<String> names) {
  for (int i = 0; i < nameCells.size(); ++i) {
    final Elements anchors = nameCells.get(i).select("a");
    final String cellText = nameCells.get(i).text().trim();
    if (anchors.isEmpty() || cellText.isEmpty()) {
      continue;
    }
    final String cardName = anchors.get(0).text();
    countsByName.put(cardName, cellText.substring(cellText.length() - 1));
    names.add(cardName);
  }
}
public static List genSitemap(String mapUrl, String base) { try { Document doc = Jsoup.connect(mapUrl).get(); Elements links = doc.select("a"); Elements imgs = doc.select("img"); List<String> stringLinks = new ArrayList<String>(); for (Element link : links) { stringLinks.add(link.attr("abs:href")); } Iterator<String> domIt = stringLinks.iterator(); // filter out links to external domains while (domIt.hasNext()) { String incDom = domIt.next(); boolean domTest; domTest = incDom.contains(base); if (domTest == false) { domIt.remove(); } } Iterator<String> i = stringLinks.iterator(); while (i.hasNext()) { // remove index.html from incoming links prevents infinite loop String incA = i.next(); if (incA.contains("index")) { i.remove(); } } return stringLinks; } catch (Exception e) { // System.out.println(e); return null; } }
/**
 * Downloads a page and lists the absolute source URLs of its images.
 *
 * <p>The first entry of the result is always {@code mapUrl} itself; it is followed by the
 * absolute {@code src} of every {@code img} element whose URL does not contain "paypal".
 *
 * @param mapUrl URL of the page to download
 * @return list of {@code String}s as described above, or {@code null} if anything fails (the
 *     exception is printed to standard output)
 */
public static List getImgs(String mapUrl) {
  try {
    Document page = Jsoup.connect(mapUrl).get();
    Elements imageTags = page.select("img");
    List<String> result = new ArrayList<String>();
    result.add(mapUrl);
    for (int k = 0; k < imageTags.size(); k++) {
      String source = imageTags.get(k).attr("abs:src");
      if (source.contains("paypal")) {
        continue;
      }
      result.add(source);
    }
    return result;
  } catch (Exception e) {
    System.out.println(e);
    return null;
  }
}
/**
 * Downloads the sample table page and prints the text of the second cell of every table row.
 *
 * <p>If the page cannot be downloaded, or it contains no element with id {@code datatable},
 * nothing is printed to standard output and the constructor returns normally. Previously both
 * situations ended in a {@code NullPointerException}.
 */
public Scraper() {
  Document doc;
  try {
    doc =
        Jsoup.connect(
                "http://www.geog.leeds.ac.uk/courses/other/programming/practicals/general/web/scraping-intro/table.html")
            .get();
  } catch (IOException ioe) {
    ioe.printStackTrace();
    return; // no document to scrape
  }

  Element table = doc.getElementById("datatable");
  if (table == null) {
    System.err.println("No element with id 'datatable' found in the downloaded page");
    return;
  }

  for (Element row : table.getElementsByTag("TR")) {
    Elements tds = row.getElementsByTag("TD");
    // Only the second column is of interest; rows with fewer cells are skipped.
    if (tds.size() > 1) {
      System.out.println(tds.get(1).text());
    }
  }
}
public void CollectData(String link) { try { // Creating an empty XML Document DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = dbfac.newDocumentBuilder(); Document doc = docBuilder.newDocument(); int flag = 0; // create the root element and add it to the document Element movie = doc.createElement("movie"); doc.appendChild(movie); movie.setAttribute("id", String.valueOf(n)); n++; // create sub elements Element genres = doc.createElement("genres"); Element actors = doc.createElement("actors"); Element reviews = doc.createElement("reviews"); URL movieUrl = new URL(link); URL reviewsURL = new URL(link + "reviews/#type=top_critics"); BufferedWriter bw3 = new BufferedWriter(new FileWriter("movies.xml", true)); int count = -1; String auth = ""; BufferedReader br3 = new BufferedReader(new InputStreamReader(movieUrl.openStream())); String str2 = ""; String info = ""; while (null != (str2 = br3.readLine())) { // start reading the html document if (str2.isEmpty()) continue; if (count == 14) break; if (count == 12) { if (!str2.contains("<h3>Cast</h3>")) continue; else count++; } if (count == 13) { if (str2.contains(">ADVERTISEMENT</p>")) { count++; movie.appendChild(actors); continue; } else { if (str2.contains("itemprop=\"name\">")) { Element actor = doc.createElement("actor"); actors.appendChild(actor); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); actor.appendChild(text); } else continue; } } if (count <= 11) { switch (count) { case -1: { if (!str2.contains("property=\"og:image\"")) continue; else { Pattern image = Pattern.compile("http://.*.jpg", Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher match = image.matcher(str2); while (match.find()) { Element imageLink = doc.createElement("imageLink"); movie.appendChild(imageLink); Text text = doc.createTextNode(match.group()); imageLink.appendChild(text); count++; } } break; } case 0: { if (str2.contains("<title>")) { Element name = 
doc.createElement("name"); movie.appendChild(name); Text text = doc.createTextNode( Jsoup.parse(str2.toString().replace(" - Rotten Tomatoes", "")).text()); name.appendChild(text); count++; } break; } case 1: { if (!str2.contains("itemprop=\"ratingValue\"")) break; else { Element score = doc.createElement("score"); movie.appendChild(score); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); score.appendChild(text); count++; } break; } case 2: { if (!str2.contains("itemprop=\"description\">")) continue; else count++; break; } case 3: { if (!str2.contains("itemprop=\"duration\"")) info = info.concat(str2); else { Element MovieInfo = doc.createElement("MovieInfo"); movie.appendChild(MovieInfo); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); MovieInfo.appendChild(text); info = str2; count++; } break; } case 4: { if (!str2.contains("itemprop=\"genre\"")) info = info.concat(str2); else { Element duration = doc.createElement("duration"); movie.appendChild(duration); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); duration.appendChild(text); info = str2; count++; } break; } case 5: { if (info.contains("itemprop=\"genre\"")) { Element genre = doc.createElement("genre"); genres.appendChild(genre); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); genre.appendChild(text); info = ""; } if (str2.contains(">Directed By:<")) { count++; movie.appendChild(genres); continue; } else { if (str2.contains("itemprop=\"genre\"")) { Element genre = doc.createElement("genre"); genres.appendChild(genre); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); genre.appendChild(text); } else continue; } break; } case 6: { if (!str2.contains(">Written By:<")) { if (str2.contains(">In Theaters:<")) { Element director = doc.createElement("director"); movie.appendChild(director); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("Directed By: ", "")).text()); director.appendChild(text); 
info = str2; count += 2; break; } info = info.concat(str2); } else { Element director = doc.createElement("director"); movie.appendChild(director); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("Directed By: ", "")).text()); director.appendChild(text); info = ""; count++; } break; } case 7: { if (!str2.contains(">In Theaters:<")) { if (str2.contains(">On DVD:<")) { Element writer = doc.createElement("writer"); movie.appendChild(writer); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); writer.appendChild(text); info = str2; count += 2; break; } info = info.concat(str2); } else { Element writer = doc.createElement("writer"); movie.appendChild(writer); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); writer.appendChild(text); info = str2; count++; } break; } case 8: { if (!str2.contains(">On DVD:<")) info = info.concat(str2); else { Element TheatreRelease = doc.createElement("TheatreRelease"); movie.appendChild(TheatreRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("In Theaters:", "")).text()); TheatreRelease.appendChild(text); info = str2; count++; } break; } case 9: { if (!str2.contains(">US Box Office:<")) { if (str2.contains("itemprop=\"productionCompany\"")) { Element DvdRelease = doc.createElement("DvdRelease"); movie.appendChild(DvdRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("On DVD:", "")).text()); DvdRelease.appendChild(text); info = str2; count += 2; break; } info = info.concat(str2); } else { Element DvdRelease = doc.createElement("DvdRelease"); movie.appendChild(DvdRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("On DVD:", "")).text()); DvdRelease.appendChild(text); info = str2; count++; } break; } case 10: { if (!str2.contains("itemprop=\"productionCompany\"")) info = info.concat(str2); else { Element BOCollection = doc.createElement("BOCollection"); movie.appendChild(BOCollection); Text text = 
doc.createTextNode( Jsoup.parse(info.toString().replace("US Box Office:", "")).text()); BOCollection.appendChild(text); info = str2; count++; } break; } case 11: { if (!str2.contains(">Official Site")) info = info.concat(str2); else { Element Production = doc.createElement("Production"); movie.appendChild(Production); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); Production.appendChild(text); info = str2; count++; } break; } default: break; } } } BufferedReader br4 = new BufferedReader(new InputStreamReader(reviewsURL.openStream())); String str3 = ""; String info2 = ""; int count2 = 0; while (null != (str3 = br4.readLine())) { if (count2 == 0) { if (!str3.contains("<div class=\"reviewsnippet\">")) continue; else count2++; } if (count2 == 1) { if (!str3.contains("<p class=\"small subtle\">")) info2 = info2.concat(str3); else { Element review = doc.createElement("review"); reviews.appendChild(review); Text text = doc.createTextNode(Jsoup.parse(info2.toString()).text()); review.appendChild(text); info2 = ""; count2 = 0; } } } movie.appendChild(reviews); TransformerFactory transfac = TransformerFactory.newInstance(); Transformer trans = transfac.newTransformer(); trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); trans.setOutputProperty(OutputKeys.INDENT, "yes"); // create string from xml tree StringWriter sw = new StringWriter(); StreamResult result = new StreamResult(sw); DOMSource source = new DOMSource(doc); trans.transform(source, result); String xmlString = sw.toString(); bw3.write(xmlString); br3.close(); br4.close(); bw3.close(); } catch (Exception ex) { ex.printStackTrace(); } }
/**
 * Downloads the deck listing that matches the given request and converts its rows to decks.
 *
 * <p>The request URL starts as {@code HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL}; one
 * {@code &}-prefixed parameter is appended for each of the following that is set:
 *
 * <ul>
 *   <li>sorting key (non-null, not blank),
 *   <li>deck name filter (non-null, not blank), passed through {@code constructDeckNameFilter},
 *   <li>class filter (non-null, non-empty, not containing {@code HSPlayerClass.ALL}): the sum of
 *       {@code getHsFilterValue()} over all single-class entries,
 *   <li>order-by (non-null, non-empty), prefixed with {@code "-"} unless the request is
 *       ascending.
 * </ul>
 *
 * <p>Of the rows matched by {@code table#decks tr}, the first and the last are skipped
 * (presumably header and footer rows - confirm against the page markup).
 *
 * @param deckBrowserRequest filter and sorting options
 * @return the parsed decks; empty when the download fails with an {@code IOException}
 */
@Override
public List<HSDeck> getDeckListFiltered(final DeckBrowserRequest deckBrowserRequest) {
  final List<HSPlayerClass> classFilter = deckBrowserRequest.getClassFilter();
  final ArrayList<HSDeck> decks = new ArrayList<HSDeck>();
  try {
    String url = HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL;

    if (deckBrowserRequest.getSortingKey() != null
        && !deckBrowserRequest.getSortingKey().trim().isEmpty()) {
      url =
          url
              + "&"
              + HP_REQUEST_PARAMS.FILTER_OPTION.requestParam
              + deckBrowserRequest.getSortingKey();
    }

    if (deckBrowserRequest.getDeckNameFilter() != null
        && !deckBrowserRequest.getDeckNameFilter().trim().isEmpty()) {
      url =
          url
              + "&"
              + HP_REQUEST_PARAMS.FILTER_SEARCH.requestParam
              + this.constructDeckNameFilter(deckBrowserRequest.getDeckNameFilter());
    }

    if (classFilter != null
        && classFilter.size() > 0
        && !classFilter.contains(HSPlayerClass.ALL)) {
      int classFilterSum = 0;
      for (final HSPlayerClass playerClass : classFilter) {
        if (playerClass.isSingleClass()) {
          classFilterSum += playerClass.getHsFilterValue();
        }
      }
      url = url + "&" + HP_REQUEST_PARAMS.FILTER_CLASS.requestParam + classFilterSum;
    }

    if (deckBrowserRequest.getOrderBy() != null && !deckBrowserRequest.getOrderBy().isEmpty()) {
      final String direction = deckBrowserRequest.isAsc() ? "" : "-";
      url =
          url
              + "&"
              + HP_REQUEST_PARAMS.FILTER_SORT.requestParam
              + direction
              + deckBrowserRequest.getOrderBy();
    }

    final Elements rows =
        Jsoup.connect(url)
            .referrer(HPDeckSource.BASE_URL + "/")
            .followRedirects(true)
            .ignoreHttpErrors(true)
            .get()
            .select("table#decks tr");

    final int lastRow = rows.size() - 1;
    for (int rowIndex = 1; rowIndex < lastRow; ++rowIndex) {
      final Element row = rows.get(rowIndex);
      final HSDeck deck = new HSDeck();

      final Element nameLink = row.select("td.col-name span.tip a").get(0);
      deck.setName(nameLink.text());
      deck.setUrl(nameLink.attr("href"));
      deck.setType(row.select("td.col-deck-type").get(0).text());
      deck.setPlayerClass(row.select("td.col-class").get(0).text());
      deck.setRating(row.select("td.col-ratings div.rating-sum").get(0).text());
      deck.setCost(row.select("td.col-dust-cost").get(0).text());

      final Element updated = row.select("td.col-updated abbr").get(0);
      if (updated.hasAttr("data-epoch")) {
        deck.setLastUpdate(updated.attributes().get("data-epoch"));
      }
      deck.setLastUpdateAsString(updated.text());

      decks.add(deck);
    }
  } catch (IOException ex) {
    ex.printStackTrace();
  }
  return decks;
}