@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); } String id = ""; if (StringUtils.isNotBlank(options.getId(providerInfo.getId()))) { id = options.getId(providerInfo.getId()); } if (StringUtils.isBlank(id) && options.getResult() != null) { if (StringUtils.isEmpty(options.getResult().getId())) { id = StrgUtils.substr(options.getResult().getUrl(), "id=(.*?)"); } else { id = options.getResult().getId(); } } // we can not scrape without zelluloid id and url if (StringUtils.isBlank(id) && StringUtils.isBlank(options.getResult().getUrl())) { throw new Exception("cannot scrape without id and url"); } String detailurl = BASE_URL + "/filme/index.php3?id=" + id; if (StringUtils.isBlank(id)) { detailurl = options.getResult().getUrl(); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); Url url; try { url = new CachedUrl(detailurl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); // parse title String title = doc.getElementsByAttributeValue("property", "og:title").attr("content").trim(); md.setTitle(title); // parse plot String plot = doc.getElementsByAttributeValue("class", "bigtext").text(); md.setPlot(plot); md.setTagline(plot.length() > 150 ? plot.substring(0, 150) : plot); // parse poster Elements el = doc.getElementsByAttributeValueStarting("src", "/images/poster"); if (el.size() == 1) { // Poster MediaArtwork ma = new MediaArtwork(providerInfo.getId(), MediaArtwork.MediaArtworkType.POSTER); ma.setPreviewUrl(BASE_URL + el.get(0).attr("src")); ma.setDefaultUrl(BASE_URL + el.get(0).attr("src")); ma.setLanguage(options.getLanguage().getLanguage()); md.addMediaArt(ma); } // parse year el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { try { md.setYear(Integer.parseInt(el.get(0).text())); } catch (Exception ignored) { } } // parse cinema release el = doc.getElementsByAttributeValueContaining("href", "?v=w"); if (el.size() > 0) { try { SimpleDateFormat sdf = new SimpleDateFormat("dd.MM.yyyy"); Date d = sdf.parse(el.get(0).text()); md.setReleaseDate(d); } catch (Exception e) { LOGGER.warn("cannot parse cinema release date: " + el.get(0).text()); } } // parse original title md.setOriginalTitle(StrgUtils.substr(doc.toString(), "Originaltitel: (.*?)\\<")); if (StringUtils.isEmpty(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // parse runtime String rt = (StrgUtils.substr(doc.toString(), "ca. (.*?) min")); if (!rt.isEmpty()) { try { md.setRuntime(Integer.valueOf(rt)); } catch (Exception e2) { LOGGER.warn("cannot convert runtime: " + rt); } } // parse genres el = doc.getElementsByAttributeValueContaining("href", "az.php3?g="); for (Element g : el) { String gid = g.attr("href").substring(g.attr("href").lastIndexOf('=') + 1); md.addGenre(getTmmGenre(gid)); } // parse cert // FSK: ab 12, $230 Mio. Budget String fsk = StrgUtils.substr(doc.toString(), "FSK: (.*?)[,<]"); if (!fsk.isEmpty()) { md.addCertification(Certification.findCertification(fsk)); } // parse rating Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable"); if (ratings.size() == 2) { // get user rating Element e = ratings.get(1); // <div>87%</div> String r = e.getElementsByTag("div").text().replace("%", ""); try { md.setRating(Float.valueOf(r) / 10); // only 0-10 } catch (Exception e2) { LOGGER.warn("cannot convert rating: " + r); } } // details page doc = null; String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id; try { url = new CachedUrl(detailsUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get details: " + e.getMessage()); } if (doc != null) { Element tab = doc.getElementById("ccdetails"); int header = 0; String lastRole = ""; for (Element tr : tab.getElementsByTag("tr")) { if (tr.toString().contains("dyngfx")) { // header gfx if (tr.toString().contains("Besetzung")) { header = 1; } else if (tr.toString().contains("Crew")) { header = 2; } else if (tr.toString().contains("Produktion")) { // company, not producers header = 3; } else if (tr.toString().contains("Verleih")) { header = 4; } else if (tr.toString().contains("Alternativtitel")) { header = 5; } continue; } else { // no header gfx, so data MediaCastMember mcm = new MediaCastMember(); el = tr.getElementsByTag("td"); if (header == 1) { // actors if (el.size() == 2) { String role = "" + el.get(0).text().trim(); // text() decodes to \u00a0 if (role.equals("\u00a0") || StringUtils.isBlank(role)) { continue; } mcm.setCharacter(role); mcm.setName(el.get(1).getElementsByTag("a").text()); mcm.setId( StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); mcm.setType(MediaCastMember.CastType.ACTOR); md.addCastMember(mcm); // parsing actor pages would we too heavy here just for actor images.. } } else if (header == 2) { // crew if (el.size() == 2) { String crewrole = el.get(0).html().trim(); mcm.setName(el.get(1).getElementsByTag("a").text()); if (crewrole.equals(" ")) { crewrole = lastRole; // pop previous } else { lastRole = crewrole; // push new } mcm.setPart(crewrole); switch (crewrole) { case "Regie": mcm.setType(MediaCastMember.CastType.DIRECTOR); break; case "Drehbuch": mcm.setType(MediaCastMember.CastType.WRITER); break; case "Produktion": mcm.setType(MediaCastMember.CastType.PRODUCER); break; default: mcm.setType(MediaCastMember.CastType.OTHER); break; } mcm.setId( StrgUtils.substr(el.get(1).getElementsByTag("a").attr("href"), "id=(\\d+)")); md.addCastMember(mcm); } } else if (header == 3) { // production md.addProductionCompany(el.get(0).text()); } } } } // get links page doc = null; String linksUrl = BASE_URL + "/filme/links.php3?id=" + id; try { url = new CachedUrl(linksUrl); in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get links page: " + e.getMessage()); } if (doc != null) { el = doc.getElementsByAttributeValueContaining("href", "german.imdb.com"); if (el != null && el.size() > 0) { String imdb = StrgUtils.substr(el.get(0).attr("href"), "(tt\\d{7})"); if (imdb.isEmpty()) { imdb = "tt" + StrgUtils.substr(el.get(0).attr("href"), "\\?(\\d+)"); } md.setId(MediaMetadata.IMDB, imdb); } } } catch (Exception e) { LOGGER.error("Error parsing " + detailurl); throw e; } return md; }
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); if (options.getMediaType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getMediaType()); } int year = 0; if (options.getYear() != 0) { year = options.getYear(); } ArrayList<MediaSearchResult> resultList = new ArrayList<>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.getQuery())) { searchTerm = options.getQuery(); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for : " + searchTerm); } else { LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); } if (doc == null || doc.text().contains("Interner Fehler")) { // FIXME: we are using the one which comes with zelluloid - NOT the global one SearchTitleWithGoogle gs = new SearchTitleWithGoogle(); List<MediaSearchResult> gr = gs.search("zelluloid.de", this.getProviderInfo(), options); for (MediaSearchResult msr : gr) { // filter google results - only movie links if (msr.getUrl().contains("/filme/index.php3")) { String id = StrgUtils.substr(msr.getUrl(), "id=(.*)"); msr.setId(id); resultList.add(msr); } } return resultList; } // only look for movie links // Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); // <TR><TD ALIGN=CENTER><IMG SRC="/gfx/icoMovie.gif" WIDTH=26 HEIGHT=26 // ALT="Film"></TD><TD><B><a // href="hit.php3?hit=3700de0676109950820a042115e98d99-movie-886-23126993-2" // class="normLight">Twelve // Monkeys</B> <nobr>(1995)</nobr></a><div class="smallBlur">R: Terry Gilliam</div></TD> Elements filme = doc.getElementsByTag("tr"); for (Element tr : filme) { // no nesting trs if (tr.getElementsByTag("tr").size() > 1) { continue; } // only tr with movie links Elements as = tr.getElementsByAttributeValueStarting("href", "hit.php3?hit="); if (as.isEmpty()) { continue; } // and only movies if (tr.text().contains("TV-Serie")) { continue; } try { Element a = as.first(); String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId(), options.getMediaType()); sr.setId(id); if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); sr.setOriginalTitle(a.getElementsByTag("span").text()); try { sr.setYear( Integer.parseInt( StrgUtils.substr( tr.getElementsByTag("nobr").text(), ".*(\\d{4}).*"))); // any 4 digit } catch (Exception ignored) { } sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names float score = MetadataUtil.calculateScore(searchTerm, sr.getTitle()); if (yearDiffers(year, sr.getYear())) { float diff = (float) Math.abs(year - sr.getYear()) / 100; LOGGER.debug( "parsed year does not match search result year - downgrading score by " + diff); score -= diff; } sr.setScore(score); } resultList.add(sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } LOGGER.debug("found " + resultList.size() + " search results"); // didn't we find anything? we may have been redirected to the details page if (resultList.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId(), options.getMediaType()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { try { msr.setYear(Integer.parseInt(el.get(0).text())); } catch (Exception ignored) { } } resultList.add(msr); } return resultList; } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }