public static void scrapAndSaveTvDotComShows() throws IOException { // ########## get ALL tv shows from TV.com ########## TVDotComCrawler crawler = new TVDotComCrawler(); ArrayList<Show> allShows = crawler.scrapAllShows(MINIMUM_VOTES); // System.out.println(allShows.size()); writeToXmlFile(createDocumentFromShows(allShows), "allShowsTvDotCom.xml"); }
/**
 * Reloads the shows saved in {@code allShowsTvDotCom.xml}, runs the crawler's vote pass over
 * them, prints how long that pass took (in milliseconds), and saves the list as
 * {@code updatedShows.xml}.
 *
 * <p>NOTE(review): the same list is serialized after {@code addVotesInfo} returns, so that call
 * presumably updates/filters the list in place - confirm. The meaning of the literal {@code 150}
 * is defined by {@code TVDotComCrawler.addVotesInfo} and is not visible here.
 *
 * @throws JDOMException declared by this method; presumably raised while parsing the input XML
 * @throws IOException declared by this method; presumably raised by file access or crawling
 */
private static void filterShowsByVotes() throws JDOMException, IOException {
    List<Show> shows = createShowsFromXml("allShowsTvDotCom.xml");
    TVDotComCrawler tvDotCom = new TVDotComCrawler();

    // Only the vote pass is timed, not the XML read or write.
    long startedAt = System.currentTimeMillis();
    tvDotCom.addVotesInfo(shows, 150, MINIMUM_VOTES);
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println("Total time: " + elapsedMillis);

    writeToXmlFile(createDocumentFromShows(shows), "updatedShows.xml");
}
public static void main(String[] args) throws IOException, JDOMException { String lostURL = "http://www.tv.com/chuck/show/68724/summary.html?flag=1&tag=page_nav;subtabs;summary"; // "http://www.tv.com/lost/show/24313/summary.html?flag=1&tag=page_nav;subtabs;summary"; TVDotComCrawler c = new TVDotComCrawler(); String frontPage = c.getPage(lostURL); // System.out.println(frontPage); // String CAST_PATTERN = "<a href=\"(.*?)\">Cast and Crew</a>"; // String castURL = c.getMatch(CAST_PATTERN, frontPage, 1, 1); String castURL = lostURL.replace("summary.html", "cast.html"); // System.out.println("\n\n" + castURL + "\n\n"); // more than one cast page pattern /*<ul class="TAB_LINKS"> <li class="first"> <a class="selected" href="http://www.tv.com/lost/show/24313/cast.html?tag=Stars;paginator;1&pg_celebs=0">1</a> </li> <li> <a href="http://www.tv.com/lost/show/24313/cast.html?tag=Stars;paginator;2&pg_celebs=1">2</a> </li> </ul>*/ String castPage = c.getPage(castURL); System.out.println(castPage); String PAGES_PATTERN = "<ul class=\"TAB_LINKS\">.*?<a href=\"(.*?)\".*?</li>"; String otherPageURL = c.getMatch(PAGES_PATTERN, castPage, 2, 1); System.out.println(otherPageURL); // List<Show> allShows = createShowsFromXml("newUpdatedShows.xml"); // System.out.println("\n\n\n" + allShows.size() + "\n\n"); // Document result = createDocumentFromShows(allShows); // writeToXmlFile(result, "newUpdatedShows.xml"); /* long start = System.currentTimeMillis(); // ########## get ALL tv shows from XML file ########## ArrayList<Show> allShows = getShowsFromXml(); long reading = System.currentTimeMillis(); System.out.println("Reading: " + (reading - start)); TVRageCrawler crawler = new TVRageCrawler(); int count = 12122; // ################################################################################## List<Show> shows = new LinkedList<Show>(); char startingWith = 't'; String name; // HACK!!!!!!!!!!!!!!!! 
boolean add = false; for(Show show : allShows) { if(add) shows.add(show); else { if(show.getName().compareToIgnoreCase("TV Revolution") == 0) { add = true; System.out.println("YUPI!!!!!!!!!!"); } } } while(true) { //shows = getShowsStartingBy(startingWith, allShows); System.out.println("Shows starting with letter: " + startingWith); // actualiza os shows com info do TVRage for(Show show : shows) { name = show.getName().trim().toLowerCase(); if(name.length() < 1 || name.charAt(0) != startingWith) continue; // try // { try { //crawler.addSeasonsInfo(show); Document d = crawler.writeAllToFileTest(show); if(d != null) { writeToXmlFile(d, "allTvRageData\\" + count + ".xml"); count++; } } catch(Exception e) { e.printStackTrace(); } // } catch (JDOMException e){ // //e.printStackTrace(); // System.out.println("JDOM Exception! Show: " + show.getName()); // } } // escreve para ficheiro //writeToXmlFile(new Document().setRootElement(getRootElement(shows)), "showsFromTvDotComStartingWith_" + startingWith + ".xml"); if(startingWith == 'z') break; startingWith++; } // ################################################################################## // COMMENT 1 long scraping = System.currentTimeMillis(); System.out.println("Scraping:" + (scraping - reading)); // escreve para ficheiro writeToXmlFile(new Document().setRootElement(getRootElement(allShows)), "allShowsComplete.xml"); long writing = System.currentTimeMillis(); System.out.println("Writing:" + (writing - scraping)); // grava no jena for(Show show : allShows) show.persist(); long saving = System.currentTimeMillis(); System.out.println("Saving:" + (saving - writing)); // mostra dados do jena //showShows(); // END OF COMMENT 1 */ }