Exemple #1
0
 /**
  * Scrapes shows from TV.com and saves them as an XML file.
  *
  * <p>A new {@code TVDotComCrawler} is asked for all shows, with {@code MINIMUM_VOTES} passed as
  * its argument; the returned list is converted with {@code createDocumentFromShows} and written
  * to {@code allShowsTvDotCom.xml} via {@code writeToXmlFile}.
  *
  * @throws IOException propagated from the crawling or file-writing calls
  */
 public static void scrapAndSaveTvDotComShows() throws IOException {
   ArrayList<Show> scrapedShows = new TVDotComCrawler().scrapAllShows(MINIMUM_VOTES);
   writeToXmlFile(createDocumentFromShows(scrapedShows), "allShowsTvDotCom.xml");
 }
Exemple #2
0
 /**
  * Re-reads the saved TV.com shows, lets the crawler add votes information, and saves the result.
  *
  * <p>Shows are loaded from {@code allShowsTvDotCom.xml}, passed to
  * {@code TVDotComCrawler.addVotesInfo} together with {@code 150} and {@code MINIMUM_VOTES}, and
  * then written to {@code updatedShows.xml}. The wall-clock duration of the {@code addVotesInfo}
  * call, in milliseconds, is printed to standard output.
  *
  * @throws JDOMException propagated from the XML helper calls
  * @throws IOException propagated from the crawling or file I/O calls
  */
 private static void filterShowsByVotes() throws JDOMException, IOException {
   List<Show> shows = createShowsFromXml("allShowsTvDotCom.xml");
   TVDotComCrawler votesCrawler = new TVDotComCrawler();

   // Only the addVotesInfo call is timed.
   long startedAt = System.currentTimeMillis();
   votesCrawler.addVotesInfo(shows, 150, MINIMUM_VOTES);
   long elapsedMillis = System.currentTimeMillis() - startedAt;
   System.out.println("Total time: " + elapsedMillis);

   writeToXmlFile(createDocumentFromShows(shows), "updatedShows.xml");
 }
Exemple #3
0
  /**
   * Scratch entry point that probes the cast pages of a single TV.com show.
   *
   * <p>In order, it:
   *
   * <ol>
   *   <li>requests the show's summary page (the content is not used afterwards);
   *   <li>derives the cast page URL by replacing {@code summary.html} with {@code cast.html};
   *   <li>fetches the cast page and prints it to standard output;
   *   <li>asks {@code TVDotComCrawler.getMatch} for a paginator link on that page and prints the
   *       result.
   * </ol>
   *
   * @param args command-line arguments; not read
   * @throws IOException propagated from the page requests
   * @throws JDOMException declared for compatibility with the original signature
   */
  public static void main(String[] args) throws IOException, JDOMException {
    // Show under test is "Chuck". The equivalent "Lost" URL, handy for switching shows:
    // "http://www.tv.com/lost/show/24313/summary.html?flag=1&tag=page_nav;subtabs;summary"
    String summaryUrl =
        "http://www.tv.com/chuck/show/68724/summary.html?flag=1&tag=page_nav;subtabs;summary";
    TVDotComCrawler crawler = new TVDotComCrawler();

    // The summary page content is not needed below. The request itself is kept so the sequence
    // of HTTP calls stays the same.
    // NOTE(review): drop this call if getPage has no side effects worth preserving.
    crawler.getPage(summaryUrl);

    String castUrl = summaryUrl.replace("summary.html", "cast.html");
    String castPage = crawler.getPage(castUrl);
    System.out.println(castPage);

    // A cast listing can span several pages. The paginator markup looks like:
    //   <ul class="TAB_LINKS">
    //     <li class="first">
    //       <a class="selected" href=".../cast.html?tag=Stars;paginator;1&pg_celebs=0">1</a>
    //     </li>
    //     <li>
    //       <a href=".../cast.html?tag=Stars;paginator;2&pg_celebs=1">2</a>
    //     </li>
    //   </ul>
    // NOTE(review): the trailing getMatch arguments (2, 1) are presumably an occurrence index and
    // a capture-group index - confirm against TVDotComCrawler.getMatch.
    String paginatorPattern = "<ul class=\"TAB_LINKS\">.*?<a href=\"(.*?)\".*?</li>";
    String otherPageUrl = crawler.getMatch(paginatorPattern, castPage, 2, 1);
    System.out.println(otherPageUrl);
  }