/**
 * Downloads the detail page of the given deck and fills in its card data and description.
 *
 * <p>The page at {@code HPDeckSource.BASE_URL + hsDeck.getUrl()} is fetched with Jsoup. Card
 * names and counts are read from the {@code td.col-name} cells of the
 * {@code section.class-listing} and {@code section.neutral-listing} tables; the description is
 * taken from {@code div.deck-description}.
 *
 * @param hsDeck deck to populate; its URL is read and the same instance is returned
 * @param n value passed through unchanged to {@code HtmlHelper.parseDescription}
 * @return {@code hsDeck}; if an {@link IOException} occurs its stack trace is printed and the
 *     deck is returned in whatever state it had reached at that point
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
  try {
    final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

    final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
    final ArrayList<String> classCardNames = new ArrayList<String>();
    readCardCells(
        page.select("section.class-listing table.listing td.col-name"),
        classHsItemMap,
        classCardNames);
    hsDeck.setClassHsItemMap(classHsItemMap);
    hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classCardNames));

    final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
    final ArrayList<String> neutralCardNames = new ArrayList<String>();
    readCardCells(
        page.select("section.neutral-listing table.listing td.col-name"),
        neutralHsItemMap,
        neutralCardNames);
    hsDeck.setNeutralHsItemMap(neutralHsItemMap);
    hsDeck.setNeutralHsItemList(
        DataBaseManager.getInstance().getAllCardsByNames(neutralCardNames));

    hsDeck.setDescription(
        HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
    return hsDeck;
  } catch (IOException ex) {
    ex.printStackTrace();
    return hsDeck;
  }
}

/**
 * Reads one card per name cell: the text of the cell's first link is the card name and the last
 * character of the cell's trimmed text is stored as its count.
 *
 * <p>Cells without a link or without any text are skipped; indexing into them would otherwise
 * throw an unchecked exception and abort parsing of the whole page.
 */
private void readCardCells(
    final Elements nameCells,
    final HashMap<String, String> countsByName,
    final ArrayList<String> names) {
  for (int i = 0; i < nameCells.size(); ++i) {
    final Elements anchors = nameCells.get(i).select("a");
    final String cellText = nameCells.get(i).text().trim();
    if (anchors.isEmpty() || cellText.isEmpty()) {
      continue;
    }
    final String cardName = anchors.get(0).text();
    countsByName.put(cardName, cellText.substring(cellText.length() - 1));
    names.add(cardName);
  }
}
public ArrayList<String> collectLinks(String p) { ArrayList<String> PageLinks = new ArrayList<String>(); try { URL url = new URL(p); BufferedReader br3 = new BufferedReader(new InputStreamReader(url.openStream())); String str = ""; while (null != (str = br3.readLine())) { Pattern link = Pattern.compile( "<a target=\"_top\" href=\"/m/.*", Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher match = link.matcher(str); while (match.find()) { String tmp = match.group(); int start = tmp.indexOf('/'); tmp = tmp.substring(start + 1, tmp.indexOf('\"', start + 1)); if (Crawl.contains("http://www.rottentomatoes.com/" + tmp) || ToCrawl.contains("http://www.rottentomatoes.com/" + tmp) || PageLinks.contains("http://www.rottentomatoes.com/" + tmp)) continue; PageLinks.add("http://www.rottentomatoes.com/" + tmp); // bw4.write("http://www.rottentomatoes.com/"+tmp+"\r\n"); } } br3.close(); } catch (Exception ex) { ex.printStackTrace(); } return PageLinks; }
public void CrawlRT(String RTPage) throws IOException { ArrayList<String> t = new ArrayList<String>(); String crawlData; String crawlData2; String crawlData3; FileReader freader = new FileReader("Crawl.txt"); BufferedReader br = new BufferedReader(freader); FileReader freader2 = new FileReader("Tocrawl.txt"); BufferedReader br2 = new BufferedReader(freader2); FileWriter fwriter2 = new FileWriter("Tocrawl.txt", true); BufferedWriter bw2 = new BufferedWriter(fwriter2); FileWriter fwriter = new FileWriter("Crawl.txt", true); BufferedWriter bw = new BufferedWriter(fwriter); /*while(null != (crawlData2 = br.readLine())) { if(crawlData2 !=null) Crawl.add(crawlData2); } t = collectLinks(RTPage); Iterator<String> e3= t.iterator(); while(e3.hasNext()) { String ee = e3.next(); if(!Crawl.contains(ee)) { bw2.write(ee+"\r\n"); } } br.close(); br2.close(); bw.close(); bw2.close();*/ if (null == (crawlData = br.readLine())) // if(true) { // initial iteration bw.write(RTPage + "\r\n"); Crawl.add(RTPage); t = collectLinks(RTPage); ToCrawl.addAll(t); } else { // collect data from files and load to array lists while (null != (crawlData2 = br.readLine())) { if (crawlData2 != null) Crawl.add(crawlData2); } while (null != (crawlData3 = br2.readLine())) { if (crawlData3 != null) ToCrawl.add(crawlData3); } } System.out.println("Crawlled"); // Number of movies to be crawled for (int i = 0; i < 1000; i++) { if (ToCrawl.size() > 0) { Crawl.removeAll(Collections.singleton(null)); ToCrawl.removeAll(Collections.singleton(null)); String c = ToCrawl.get(0); if (Crawl.contains(c)) ToCrawl.remove(c); else { // collect links and collect data from a particular link Crawl.add(c); t = collectLinks(c); CollectData(c); ToCrawl.remove(c); Iterator<String> e3 = t.iterator(); while (e3.hasNext()) { String ee = e3.next(); if (!ToCrawl.contains(ee)) { if (!Crawl.contains(ee)) { ToCrawl.add(ee); } } } bw.write(c + "\r\n"); } } } System.out.println("To Be Crawlled"); Iterator<String> e2 = ToCrawl.iterator(); 
while (e2.hasNext()) { // write to file the movies still to be crawled. bw2.write(e2.next() + "\r\n"); } prop.setProperty("Id", Integer.toString(n)); prop.store(new FileOutputStream("config.properties"), null); br.close(); br2.close(); bw.close(); bw2.close(); }
/**
 * Downloads the deck listing for the given request and parses it into decks.
 *
 * <p>The URL starts as {@code HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL}. One
 * {@code &}-prefixed parameter is appended for each of the following that is set on the request:
 * a non-blank sorting key, a non-blank deck name filter, a non-empty class filter that does not
 * contain {@code HSPlayerClass.ALL} (sent as the sum of the filter values of its single-class
 * entries), and a non-empty order-by value (prefixed with {@code -} when the request is not
 * ascending).
 *
 * @param deckBrowserRequest filter and sorting options for the listing
 * @return the decks parsed from the rows of {@code table#decks}, in page order; if an
 *     {@link IOException} occurs its stack trace is printed and the decks gathered so far
 *     (possibly none) are returned
 */
@Override
public List<HSDeck> getDeckListFiltered(final DeckBrowserRequest deckBrowserRequest) {
  final List<HSPlayerClass> classFilter = deckBrowserRequest.getClassFilter();
  final ArrayList<HSDeck> decks = new ArrayList<HSDeck>();
  try {
    String url = HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL;

    final String sortingKey = deckBrowserRequest.getSortingKey();
    if (sortingKey != null && !sortingKey.trim().isEmpty()) {
      url += "&" + HP_REQUEST_PARAMS.FILTER_OPTION.requestParam + sortingKey;
    }

    final String nameFilter = deckBrowserRequest.getDeckNameFilter();
    if (nameFilter != null && !nameFilter.trim().isEmpty()) {
      url +=
          "&"
              + HP_REQUEST_PARAMS.FILTER_SEARCH.requestParam
              + this.constructDeckNameFilter(nameFilter);
    }

    if (classFilter != null
        && classFilter.size() > 0
        && !classFilter.contains(HSPlayerClass.ALL)) {
      int classMask = 0;
      for (final HSPlayerClass playerClass : classFilter) {
        if (playerClass.isSingleClass()) {
          classMask += playerClass.getHsFilterValue();
        }
      }
      url += "&" + HP_REQUEST_PARAMS.FILTER_CLASS.requestParam + classMask;
    }

    if (deckBrowserRequest.getOrderBy() != null && !deckBrowserRequest.getOrderBy().isEmpty()) {
      final String direction = deckBrowserRequest.isAsc() ? "" : "-";
      url +=
          "&"
              + HP_REQUEST_PARAMS.FILTER_SORT.requestParam
              + direction
              + deckBrowserRequest.getOrderBy();
    }

    final Elements rows =
        Jsoup.connect(url)
            .referrer(HPDeckSource.BASE_URL + "/")
            .followRedirects(true)
            .ignoreHttpErrors(true)
            .get()
            .select("table#decks tr");

    // The first and last rows are never parsed.
    // NOTE(review): presumably a header and a footer row - confirm against the page markup.
    for (int i = 1; i < rows.size() - 1; ++i) {
      final Element row = rows.get(i);
      final Elements nameLinks = row.select("td.col-name span.tip a");
      final Elements types = row.select("td.col-deck-type");
      final Elements classes = row.select("td.col-class");
      final Elements ratings = row.select("td.col-ratings div.rating-sum");
      final Elements costs = row.select("td.col-dust-cost");
      final Elements updates = row.select("td.col-updated abbr");

      // A row lacking any expected cell would make get(0) throw an unchecked exception and
      // discard every deck parsed so far; skip just that row instead.
      if (nameLinks.isEmpty()
          || types.isEmpty()
          || classes.isEmpty()
          || ratings.isEmpty()
          || costs.isEmpty()
          || updates.isEmpty()) {
        continue;
      }

      final Element nameLink = nameLinks.get(0);
      final Element updated = updates.get(0);

      final HSDeck hsDeck = new HSDeck();
      hsDeck.setName(nameLink.text());
      hsDeck.setUrl(nameLink.attr("href"));
      hsDeck.setType(types.get(0).text());
      hsDeck.setPlayerClass(classes.get(0).text());
      hsDeck.setRating(ratings.get(0).text());
      hsDeck.setCost(costs.get(0).text());
      if (updated.hasAttr("data-epoch")) {
        hsDeck.setLastUpdate(updated.attributes().get("data-epoch"));
      }
      hsDeck.setLastUpdateAsString(updated.text());
      decks.add(hsDeck);
    }
  } catch (IOException ex) {
    ex.printStackTrace();
  }
  return decks;
}