/**
 * Builds a {@link Show} from the capture groups of the matcher's current match.
 *
 * <p>Group 2 becomes the name, group 1 the show link, group 3 (trimmed) the date and group 4 the
 * tv.com rating.
 *
 * @param matcher matcher whose current match exposes groups 1 to 4
 * @return a newly created show carrying the four extracted values
 */
private Show parseShowInfo(Matcher matcher) {
  // Read the groups in the same order the setters are applied below.
  final String name = matcher.group(2);
  final String link = matcher.group(1);
  final String date = matcher.group(3).trim();
  final String rating = matcher.group(4);

  final Show parsed = new Show();
  parsed.setName(name);
  parsed.setShowLink(link);
  parsed.setDate(date);
  parsed.setTvDotComRating(rating);
  return parsed;
}
Example #2
0
 /**
  * Returns the shows whose trimmed name starts with the given character, ignoring case.
  *
  * <p>Shows whose name is {@code null} or blank are reported on standard output and skipped.
  *
  * @param c the initial to look for; upper- and lower-case forms are treated alike
  * @param allShows the shows to filter
  * @return a new list with the matching shows, in the order they appear in {@code allShows}
  */
 public static List<Show> getShowsStartingBy(char c, List<Show> allShows) {
   List<Show> shows = new LinkedList<Show>();
   // Fold the requested initial once. Only the show name used to be lower-cased, so an
   // upper-case argument could never match anything.
   char wanted = Character.toLowerCase(c);
   for (Show s : allShows) {
     String rawName = s.getName();
     String name = (rawName == null) ? "" : rawName.trim();
     if (name.length() < 1) {
       System.out.println("WTF???? => " + name);
       continue;
     }
     // Character.toLowerCase does not depend on the default locale, unlike String.toLowerCase().
     if (Character.toLowerCase(name.charAt(0)) == wanted) {
       shows.add(s);
     }
   }
   return shows;
 }
  /**
   * Determines the number of votes for a show from the page behind its show link.
   *
   * <p>The page is fetched with {@code getPage}, searched with {@code VOTES_NUMBER_PATTERN}, and
   * the matched text is converted by {@code parseNumberOfVotes}.
   *
   * @param show the show whose link is fetched
   * @return the parsed vote count, or {@code 0} when nothing matched or an {@link IOException}
   *     occurred
   */
  private int getNumberOfVotes(Show show) {
    try {
      final String html = getPage(show.getShowLink());
      final String votesText = getMatch(VOTES_NUMBER_PATTERN, html, 1, 1);
      if (votesText == null) {
        // no vote count found on the page
        return 0;
      }
      return parseNumberOfVotes(votesText);
    } catch (IOException e) {
      // Best effort: report the failing link and fall back to zero votes.
      System.out.println("Error on show link: '" + show.getShowLink() + "'");
      return 0;
    }
  }
Example #4
0
  /**
   * Builds an XML document describing every show that has at least {@code MINIMUM_VOTES} votes.
   *
   * <p>The root element is {@code shows}. Each kept show becomes a {@code show} element with the
   * children {@code name}, {@code showLink}, {@code startDate}, {@code tvDotComRating} and
   * {@code tvDotComVotes}, followed by one {@code season} element per season. A season holds a
   * {@code no} child with the season name and one {@code episode} child per episode, each with
   * {@code epnum}, {@code airdate} and {@code title}.
   *
   * @param allShows the shows to export
   * @return a new document whose root element contains the exported shows
   */
  public static Document createDocumentFromShows(List<Show> allShows) {
    Document document = new Document();
    Element root = new Element("shows");

    for (Show show : allShows) {
      if (show.getTvDotComVotes() < MINIMUM_VOTES) {
        continue;
      }

      Element showElement = new Element("show");
      showElement.addContent(new Element("name").setText(show.getName()));
      showElement.addContent(new Element("showLink").setText(show.getShowLink()));
      showElement.addContent(new Element("startDate").setText(show.getDate()));
      showElement.addContent(new Element("tvDotComRating").setText(show.getTvDotComRating()));
      showElement.addContent(
          new Element("tvDotComVotes").setText(String.valueOf(show.getTvDotComVotes())));

      // Sample of the upstream season/episode layout kept from the original notes:
      //   <Season no="0">
      //     <episode>
      //       <epnum>0</epnum>
      //       <seasonnum>00</seasonnum>
      //       <prodnum>4V79</prodnum>
      //       <airdate>0000-00-00</airdate>
      //       <link>http://www.tvrage.com/Buffy_The_Vampire_Slayer/episodes/329033</link>
      //       <title>Unaired Pilot</title>
      //     </episode>
      //   </Season>

      if (show.getSeasons() == null) {
        // Show without season data: export the basic fields only and report it.
        System.out.println("Show name:" + show.getName());
      } else {
        for (Season season : show.getSeasons()) {
          Element seasonElement = new Element("season");
          seasonElement.addContent(new Element("no").setText(season.getName()));

          // Seasons can be null on a show, so guard episodes the same way.
          if (season.getEpisodes() != null) {
            for (Episode episode : season.getEpisodes()) {
              Element episodeElement = new Element("episode");
              episodeElement.addContent(new Element("epnum").setText(episode.getNumber()));
              episodeElement.addContent(new Element("airdate").setText(episode.getAirDate()));
              episodeElement.addContent(new Element("title").setText(episode.getTitle()));
              // Attach the episode to its season; it used to be built and then discarded,
              // so no episode ever reached the document.
              seasonElement.addContent(episodeElement);
            }
          }
          showElement.addContent(seasonElement);
        }
      }

      root.addContent(showElement);
    }

    document.setRootElement(root);
    return document;
  }
Example #5
0
  /**
   * Reads the given XML file and converts every child element of its root into a {@link Show}.
   *
   * <p>For each child, the texts of {@code name}, {@code showLink}, {@code startDate},
   * {@code tvDotComRating} and {@code tvDotComVotes} are copied onto the show when present; the
   * votes text is parsed with {@link Integer#parseInt(String)}. The name, rating and votes texts
   * are also printed to standard output.
   *
   * @param fileName path of the XML file to read
   * @return the shows, in document order
   * @throws JDOMException if the file cannot be parsed as XML
   * @throws IOException if the file cannot be read
   */
  public static List<Show> createShowsFromXml(String fileName) throws JDOMException, IOException {
    List<Show> result = new ArrayList<Show>();
    Document document = new SAXBuilder().build(new File(fileName));
    List<Element> showElements = (List<Element>) document.getRootElement().getChildren();

    // walk through every show element
    for (Element showElement : showElements) {
      Show current = new Show();

      String name = showElement.getChildText("name");
      if (name != null) {
        current.setName(name);
      }

      String link = showElement.getChildText("showLink");
      if (link != null) {
        current.setShowLink(link);
      }

      String startDate = showElement.getChildText("startDate");
      if (startDate != null) {
        current.setDate(startDate);
      }

      String rating = showElement.getChildText("tvDotComRating");
      if (rating != null) {
        current.setTvDotComRating(rating);
      }

      String votes = showElement.getChildText("tvDotComVotes");
      if (votes != null) {
        current.setTvDotComVotes(Integer.parseInt(votes));
      }

      // progress output: name, rating, votes (absent values print as "null")
      System.out.println(name);
      System.out.println(rating);
      System.out.println(votes);

      result.add(current);
    }
    return result;
  }
  /**
   * Fetches the name and link of every listed show, keeping only the shows that have at least
   * {@code minimumVotes} votes.
   *
   * <p>For each URL returned by {@code createAllShowsURLs()} the first listing page is scanned;
   * when {@code PAGE_NUMBER_PATTERN} matches on that page, the remaining pages are fetched by
   * appending {@code &pg=<i>} to the URL. Progress and the total elapsed time in milliseconds are
   * printed to standard output.
   *
   * @param minimumVotes shows with fewer votes than this are skipped
   * @return the shows that passed the vote filter, in the order they were encountered
   * @throws IOException if a listing page cannot be fetched
   */
  public ArrayList<Show> scrapAllShows(int minimumVotes) throws IOException {
    long start = System.currentTimeMillis();
    ArrayList<Show> shows = new ArrayList<Show>();
    // The listing expression is the same for every page, so compile it once up front.
    Pattern listingPattern = Pattern.compile(SHOW_LISTING_PATTERN);

    for (String url : createAllShowsURLs()) {
      String page = getPage(url);
      String pageCountText = getMatch(PAGE_NUMBER_PATTERN, page, 1, 1);

      // get shows from page 1
      System.out.println("\t\t### Page 1 ###");
      collectQualifyingShows(listingPattern, page, minimumVotes, shows);

      // a page-number match means there is more than one page
      if (pageCountText != null) {
        int nPages = parseNumberOfPages(pageCountText);
        System.out.println("\t\tNumber of pages: " + nPages);
        // The bare URL already delivered page 1; "&pg=i" is reported as page i + 1.
        // NOTE(review): this presumes the site's pg parameter is zero-based — confirm.
        for (int i = 1; i < nPages; i++) {
          page = getPage(url + "&pg=" + i);
          System.out.println("\t\t### Page " + (i + 1) + " ###");
          collectQualifyingShows(listingPattern, page, minimumVotes, shows);
        }
      }
    }
    System.out.println("Total time: " + (System.currentTimeMillis() - start));
    return shows;
  }

  /**
   * Parses every show listed on one page and appends those with at least {@code minimumVotes}
   * votes to {@code shows}.
   */
  private void collectQualifyingShows(
      Pattern listingPattern, String page, int minimumVotes, ArrayList<Show> shows) {
    Matcher matcher = listingPattern.matcher(page);
    while (matcher.find()) {
      Show newShow = parseShowInfo(matcher);
      int votes = getNumberOfVotes(newShow);
      if (votes < minimumVotes) {
        continue;
      }
      System.out.println("Show: " + newShow.getName() + "\nVotes: " + votes + "\n");

      // NOTE(review): the vote count is only printed, never stored on the show — confirm that
      // is intended.
      shows.add(newShow);
      // `delete` is declared outside this method; it is bumped once per accepted show.
      delete++;
    }
  }