/**
 * Builds a {@link Show} from the capture groups of the matcher's current match.
 *
 * @param matcher matcher positioned on a match; group 1 is used as the show link,
 *                group 2 as the name, group 3 (trimmed) as the date and group 4 as
 *                the tv.com rating
 * @return a new show populated from those four groups
 */
private Show parseShowInfo(Matcher matcher) {
    final Show parsed = new Show();

    final String title = matcher.group(2);
    parsed.setName(title);

    final String link = matcher.group(1);
    parsed.setShowLink(link);

    final String startDate = matcher.group(3).trim();
    parsed.setDate(startDate);

    final String rating = matcher.group(4);
    parsed.setTvDotComRating(rating);

    return parsed;
}
/**
 * Returns the shows whose (trimmed) name starts with the given character.
 *
 * <p>The comparison ignores case on both sides and does not depend on the default
 * locale. Shows whose name is {@code null} or blank are reported on standard output
 * and skipped.
 *
 * @param c        first character to look for (upper or lower case)
 * @param allShows shows to filter; the list itself is not modified
 * @return a new list with the matching shows, in their original order
 */
public static List<Show> getShowsStartingBy(char c, List<Show> allShows) {
    List<Show> shows = new LinkedList<Show>();
    // Normalise the requested character once so callers may pass either case.
    final char wanted = Character.toLowerCase(c);

    for (Show s : allShows) {
        String rawName = s.getName();
        // Treat a missing name like an empty one instead of failing with an NPE.
        String name = (rawName == null) ? "" : rawName.trim();
        if (name.length() < 1) {
            System.out.println("WTF???? => " + name);
            continue;
        }
        // Character.toLowerCase is locale-independent, unlike String.toLowerCase().
        if (Character.toLowerCase(name.charAt(0)) == wanted) {
            shows.add(s);
        }
    }
    return shows;
}
/**
 * Fetches the page behind the show's link and extracts its number of votes.
 *
 * @param show show whose link is fetched
 * @return the parsed vote count, or {@code 0} when {@code VOTES_NUMBER_PATTERN} yields
 *         no match or an {@link IOException} occurs (the failing link is printed to
 *         standard output in that case)
 */
private int getNumberOfVotes(Show show) {
    int votes = 0;
    try {
        final String html = getPage(show.getShowLink());
        final String rawVotes = getMatch(VOTES_NUMBER_PATTERN, html, 1, 1);
        if (rawVotes != null) {
            votes = parseNumberOfVotes(rawVotes);
        }
    } catch (IOException ioe) {
        // Best effort: report the failing link and fall through with zero votes.
        System.out.println("Error on show link: '" + show.getShowLink() + "'");
    }
    return votes;
}
public static Document createDocumentFromShows(List<Show> allShows) { Document document = new Document(); Element root = new Element("shows"); Element showElement, seasonElement, episodeElement; for (Show show : allShows) { if (show.getTvDotComVotes() < MINIMUM_VOTES) continue; showElement = new Element("show"); showElement.addContent(new Element("name").setText(show.getName())); showElement.addContent(new Element("showLink").setText(show.getShowLink())); showElement.addContent(new Element("startDate").setText(show.getDate())); showElement.addContent(new Element("tvDotComRating").setText(show.getTvDotComRating())); showElement.addContent(new Element("tvDotComVotes").setText("" + show.getTvDotComVotes())); /*<Season no="0"> <episode> <epnum>0</epnum> <seasonnum>00</seasonnum> <prodnum>4V79</prodnum> <airdate>0000-00-00</airdate> <link>http://www.tvrage.com/Buffy_The_Vampire_Slayer/episodes/329033</link> <title>Unaired Pilot</title> </episode> </Season>*/ if (show.getSeasons() == null) { root.addContent(showElement); System.out.println("Show name:" + show.getName()); continue; } for (Season season : show.getSeasons()) { seasonElement = new Element("season"); seasonElement.addContent(new Element("no").setText(season.getName())); for (Episode episode : season.getEpisodes()) { episodeElement = new Element("episode"); episodeElement.addContent(new Element("epnum").setText(episode.getNumber())); episodeElement.addContent(new Element("airdate").setText(episode.getAirDate())); // episodeElement.addContent(new Element("link").setText(episode.getNumber())); episodeElement.addContent(new Element("title").setText(episode.getTitle())); } showElement.addContent(seasonElement); } root.addContent(showElement); } document.setRootElement(root); return document; }
/** * Creates a list of shows by reading the given XML file. * * @param fileName * @return * @throws JDOMException * @throws IOException */ public static List<Show> createShowsFromXml(String fileName) throws JDOMException, IOException { List<Show> allShows = new ArrayList<Show>(); Show show; SAXBuilder builder = new SAXBuilder(); Document d = builder.build(new File(fileName)); Element root = d.getRootElement(); Element newElement; String value; // percorre todos os shows List<Element> children = (List<Element>) root.getChildren(); for (Iterator<Element> i = children.iterator(); i.hasNext(); ) { newElement = (Element) i.next(); show = new Show(); // name value = newElement.getChildText("name"); if (value != null) show.setName(value); // show link value = newElement.getChildText("showLink"); if (value != null) show.setShowLink(value); // start date value = newElement.getChildText("startDate"); if (value != null) show.setDate(value); // tv.com rating value = newElement.getChildText("tvDotComRating"); if (value != null) show.setTvDotComRating(value); // tv.com votes value = newElement.getChildText("tvDotComVotes"); if (value != null) show.setTvDotComVotes(Integer.parseInt(value)); System.out.println(newElement.getChildText("name")); // System.out.println(newElement.getChildText("showLink")); // System.out.println(newElement.getChildText("startDate")); System.out.println(newElement.getChildText("tvDotComRating")); System.out.println(newElement.getChildText("tvDotComVotes")); allShows.add(show); } return allShows; }
// vai buscar o nome e link de todas as series limitando as series com um minimo de votos public ArrayList<Show> scrapAllShows(int minimumVotes) throws IOException { long start = System.currentTimeMillis(); ArrayList<Show> shows = new ArrayList<Show>(); ArrayList<String> allURLs = createAllShowsURLs(); String page, temp; for (String s : allURLs) { page = getPage(s); temp = getMatch(PAGE_NUMBER_PATTERN, page, 1, 1); // System.out.println(s + ": " + temp); // get shows from page 1 Pattern pattern = Pattern.compile(SHOW_LISTING_PATTERN); Matcher matcher = pattern.matcher(page); System.out.println(" ### Page 1 ###"); while (matcher.find()) { Show newShow = parseShowInfo(matcher); int votes = getNumberOfVotes(newShow); if (votes < minimumVotes) continue; else System.out.println("Show: " + newShow.getName() + "\nVotes: " + votes + "\n"); // completeShowInfo(newShow); shows.add(newShow); delete++; /*System.out.println("Name: " + matcher.group(2)); System.out.println("URL: " + matcher.group(1)); System.out.println("Date: " + matcher.group(3).trim()); System.out.println("Rating: " + matcher.group(4));*/ // getSummary(matcher.group(1)); } // se há mais do que uma página if (temp != null) { // get number of pages int nPages = parseNumberOfPages(temp); System.out.println(" Number of pages: " + nPages); for (int i = 1; i < nPages; i++) { page = getPage(s + "&pg=" + i); System.out.println(" ### Page " + (i + 1) + " ###"); pattern = Pattern.compile(SHOW_LISTING_PATTERN); matcher = pattern.matcher(page); while (matcher.find()) { Show newShow = parseShowInfo(matcher); int votes = getNumberOfVotes(newShow); if (votes < minimumVotes) continue; else System.out.println("Show: " + newShow.getName() + "\nVotes: " + votes + "\n"); // completeShowInfo(newShow); shows.add(newShow); delete++; /*System.out.println("Name: " + matcher.group(2)); System.out.println("URL: " + matcher.group(1)); System.out.println("Date: " + matcher.group(3).trim()); System.out.println("Rating: " + 
matcher.group(4));*/ // getSummary(matcher.group(1)); } } } } System.out.println("Total time: " + (System.currentTimeMillis() - start)); return shows; }