/**
 * Imports biography details from {@code biographies.list} into the
 * {@code People} table.
 *
 * <p>The header is skipped up to the first line matching {@code ^---*$}; after
 * that the file is consumed one record at a time via
 * {@code reader.readUntil("^---*$", false, false)} until it returns
 * {@code null}. Within a record only lines of the form {@code XX:value} are
 * examined, and only these two-letter keys are used:
 * <ul>
 *   <li>{@code NM} - IMDb name, used in the {@code WHERE} clause</li>
 *   <li>{@code RN} - real name, cut to 90 characters</li>
 *   <li>{@code NK} - nick name, cut to 90 characters</li>
 *   <li>{@code DB} / {@code DD} - birth / death date, converted by
 *       {@code strToDate}</li>
 * </ul>
 * If a key occurs more than once in a record, the last occurrence wins.
 * Records without an {@code NM} line are skipped.
 *
 * @param directory directory that contains {@code biographies.list}
 * @throws IOException  if reading the list file fails
 * @throws SQLException if creating, filling or closing the batch fails
 */
private void import_biographies(File directory) throws IOException, SQLException {
    // Real and nick names longer than this are truncated before being stored.
    // NOTE(review): presumably the column width of People.real_name/nick_name - confirm.
    final int maxNameLength = 90;

    reader = new ListFileReader(new File(directory, "biographies.list"));
    try {
        reader.skipUntil("^---*$");

        Batch batch = schema.createBatch(
                "UPDATE People SET real_name = ?, nick_name = ?, "
                + "birth_date = ?, death_date = ? WHERE imdb_name = ?");
        try {
            while (true) {
                List<String> lines = reader.readUntil("^---*$", false, false);
                if (lines == null) {
                    break;
                }

                String imdb_name = null;
                String nick_name = null;
                String real_name = null;
                Date bdate = null;
                Date ddate = null;

                for (String ln : lines) {
                    ln = ln.trim();
                    // A field line needs at least "XX:"; the length check also keeps
                    // charAt(2) from throwing on one- or two-character lines.
                    if (ln.length() < 3 || ln.charAt(2) != ':') {
                        continue;
                    }

                    String key = ln.substring(0, 2);
                    String value = ln.substring(3).trim();

                    if ("NM".equals(key)) {
                        imdb_name = value;
                    } else if ("RN".equals(key)) {
                        real_name = (value.length() > maxNameLength)
                                ? value.substring(0, maxNameLength)
                                : value;
                    } else if ("NK".equals(key)) {
                        nick_name = (value.length() > maxNameLength)
                                ? value.substring(0, maxNameLength)
                                : value;
                    } else if ("DB".equals(key)) {
                        bdate = strToDate(value);
                    } else if ("DD".equals(key)) {
                        ddate = strToDate(value);
                    }
                }

                // Without a name there is no row to address in the UPDATE.
                if (imdb_name != null) {
                    batch.add(real_name, nick_name, bdate, ddate, imdb_name);
                }
            }
        } finally {
            // Close the batch even when parsing or a database call fails.
            batch.close();
        }
    } finally {
        reader.close();
    }
}
/**
 * Imports titles from {@code movies.list} into the {@code movies} table.
 *
 * <p>The header is skipped up to the {@code MOVIES LIST} line and the
 * following line of {@code =} characters. Every remaining non-empty line is
 * expected to be {@code <imdb name> TAB <year>}; lines that do not match are
 * skipped.
 *
 * <ul>
 *   <li>A name that starts with a double quote is treated as a TV show and must
 *       match {@code "name" ... {episode}}; it is stored with
 *       {@code is_film = 0}.</li>
 *   <li>Any other name must match {@code name (digits)} and is stored with
 *       {@code is_film = 1} and no episode.</li>
 *   <li>The year is stored only when it parses as an integer greater than 1900,
 *       otherwise {@code NULL} is stored.</li>
 *   <li>A row is queued only when {@code isEnglish(name)} returns true.</li>
 * </ul>
 *
 * @param directory directory that contains {@code movies.list}
 * @throws IOException  if reading the list file fails
 * @throws SQLException if creating, filling or closing the batch fails
 */
private void import_movies(File directory) throws IOException, SQLException {
    reader = new ListFileReader(new File(directory, "movies.list"));
    try {
        reader.skipUntil("^MOVIES\\s+LIST\\s*$");
        reader.skipUntil("^=+\\s*$");

        final Pattern linePattern = Pattern.compile("^(.+?)\t\\s*(.+?)$");
        final Pattern tvshowPattern = Pattern.compile("\"(.+?)\"\\s.*?\\{(.+?)\\}");
        final Pattern filmPattern = Pattern.compile("(.+?)\\s\\(\\d+\\)");

        Batch batch = schema.createBatch(
                "INSERT IGNORE INTO movies (imdb_name, is_film, name, episode, year) "
                + "VALUES (?, ?, ?, ?, ?)");
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }

                Matcher m = linePattern.matcher(line);
                if (!m.matches()) {
                    continue;
                }

                String imdb_name = m.group(1);

                // The year column may hold non-numeric text; -1 marks "unknown".
                int year;
                try {
                    year = Integer.parseInt(m.group(2));
                } catch (NumberFormatException ignored) {
                    year = -1;
                }

                String name;
                String episode = null;
                boolean tvshow = imdb_name.charAt(0) == '"';

                if (tvshow) {
                    m = tvshowPattern.matcher(imdb_name);
                    if (!m.matches()) {
                        continue;
                    }
                    name = m.group(1);
                    episode = m.group(2);
                } else {
                    m = filmPattern.matcher(imdb_name);
                    if (!m.matches()) {
                        continue;
                    }
                    name = m.group(1);
                }

                if (isEnglish(name)) {
                    batch.add(imdb_name, tvshow ? 0 : 1, name, episode,
                            (year > 1900) ? year : null);
                }
            }
        } finally {
            // Close the batch even when reading or a database call fails.
            batch.close();
        }
    } finally {
        reader.close();
    }
}
/**
 * Imports ratings from {@code ratings.list} into the {@code movies} table.
 *
 * <p>The header is skipped up to the
 * {@code New Distribution Votes Rank Title} column line. Every following
 * non-empty line must match
 * {@code <distribution> <votes> <d.d rating> <imdb name>}; other lines are
 * skipped. For each matching line an
 * {@code UPDATE movies SET rating, votes WHERE imdb_name} is queued.
 *
 * <p>A vote count or rating that cannot be parsed is written as {@code -1}
 * rather than causing the row to be dropped.
 *
 * @param directory directory that contains {@code ratings.list}
 * @throws IOException  if reading the list file fails
 * @throws SQLException if creating, filling or closing the batch fails
 */
private void import_ratings(File directory) throws IOException, SQLException {
    reader = new ListFileReader(new File(directory, "ratings.list"));
    try {
        reader.skipUntil("New\\s+Distribution\\s+Votes\\s+Rank\\s+Title");

        final Pattern linePattern =
                Pattern.compile("^.+?\\s+(\\d+)\\s+(\\d\\.\\d)\\s+(.+)$");

        Batch batch = schema.createBatch("UPDATE movies SET rating = ?, votes = ? "
                + "WHERE imdb_name = ?");
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }

                Matcher m = linePattern.matcher(line);
                if (!m.matches()) {
                    continue;
                }

                String imdb_name = m.group(3);

                int votes = -1;
                try {
                    votes = Integer.parseInt(m.group(1));
                } catch (NumberFormatException ignored) {
                    // e.g. a digit run too large for an int: keep the -1 sentinel.
                }

                double rating = -1;
                try {
                    rating = Double.parseDouble(m.group(2));
                } catch (NumberFormatException ignored) {
                    // Kept defensively; on failure the -1 sentinel is stored.
                }

                batch.add(rating, votes, imdb_name);
            }
        } finally {
            // Close the batch even when reading or a database call fails.
            batch.close();
        }
    } finally {
        reader.close();
    }
}
private void import_genres(File directory) throws IOException, SQLException { reader = new ListFileReader(new File(directory, "genres.list")); reader.skipUntil("^8: THE GENRES LIST\\s*$"); final Pattern linePattern = Pattern.compile("^(.+?)\\t\\s*(.+)$"); HashMap<String, Integer> genresMap = new HashMap<String, Integer>(); Batch batch = schema.createBatch( "INSERT IGNORE MovieGenres (movie, genre) VALUES " + "((SELECT movie_id FROM Movies WHERE imdb_name = ? LIMIT 1), ?)"); SimpleInsert insertGenre = schema.createInsert("Genres", true, "name"); SimpleQuery findGenre = schema.createQuery("genre_id", "Genres", "name = ?"); // speedup: drop the FKs Statement stmt = schema.createStatement(); try { stmt.executeUpdate("ALTER TABLE MovieGenres DROP FOREIGN KEY `mg_genre`"); } catch (SQLException ex) { } try { stmt.executeUpdate("ALTER TABLE MovieGenres DROP FOREIGN KEY `mg_movie`"); } catch (SQLException ex) { } stmt.close(); while (true) { String line = reader.readLine(); if (line == null) { break; } line = line.trim(); if (line.isEmpty()) { continue; } Matcher m = linePattern.matcher(line); if (!m.matches()) { continue; } String imdb_name = m.group(1); String genre = m.group(2); int genre_id; genre = genre.toLowerCase(); if (genresMap.containsKey(genre)) { genre_id = genresMap.get(genre); } else { genre_id = insertGenre.insert(genre); if (genre_id < 0) { genre_id = findGenre.queryGetKey(genre); } genresMap.put(genre, genre_id); } batch.add(imdb_name, genre_id); } insertGenre.close(); findGenre.close(); batch.close(); // remove anomalies and re-add the FKs debug("re-normalizing..."); stmt = schema.createStatement(); stmt.executeUpdate("DELETE FROM MovieGenres WHERE movie = 0 OR genre = 0"); stmt.executeUpdate( "ALTER TABLE MovieGenres ADD CONSTRAINT `mg_movie` " + "FOREIGN KEY (`movie`) REFERENCES `Movies` (`movie_id`) " + "ON DELETE CASCADE ON UPDATE NO ACTION, " + "ADD CONSTRAINT `mg_genre` " + "FOREIGN KEY (`genre`) REFERENCES `Genres` (`genre_id`) " + "ON DELETE 
CASCADE ON UPDATE NO ACTION"); stmt.close(); reader.close(); }