oqgraph-dev team mailing list archive
-
oqgraph-dev team
-
Mailing list archive
-
Message #00067
IMDB tools
Hi all
Here is my old IMDB conversion tool (C code), and the schema I used.
It uses only Actors and Movies, filtering out TV series and DVDs.
1484084 actors \_ actors
866799 actresses /
574006 movies
5900319 actors_movies
actors.list.gz and actresses.list.gz files from ftp://ftp.fu-berlin.de/pub/misc/movies/database/
(via http://www.imdb.com/interfaces)
My laptop is still building the actors_movies_actors table so I don't yet have a total for that, but from memory I think it was around 45 million edges.
Currently I load using MyISAM with delay key writes and large key buffer size.
We can possibly optimise the load methodology a bit further.
Cheers,
Arjen.
--
Arjen Lentz, Exec.Director @ Open Query (http://openquery.com)
Australian peace of mind for your MySQL/MariaDB infrastructure.
Follow us at http://openquery.com/blog/ & http://twitter.com/openquery
/* ================================================================
Parse IMDB .list files (actors, actresses) to SQL)
For having some more data to play with in OQGRAPH engine
Copyright (C) 2009 by Arjen Lentz (arjen@xxxxxxxxxxxxx)
GPLv2 or BSD license, you choose ;-)
================================================================
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#define MAX_LINE 1024
/*
very rough trimming of leading and trailing spaces
return new ptr to start of \0 string
*/
char *strip (char *s)
{
char *p;
while (isspace(*s))
*s++;
p = s + strlen(s);
while (p > s && isspace(*--p));
*++p = '\0';
return (s);
}/*strip()*/
char *title_suffix[] = {
" (uncredited)",
" (credit",
" (as ",
" (archive ",
" (TV)",
" (V)",
" (VG)",
" [",
" <",
NULL
};
/*
very rough string escape for SQL
(just escapes single quote)
*/
char *escape_string (char *s)
{
static char line[MAX_LINE * 2];
char *p = line;
while (*s) {
if (*s == '\'')
*p++ = '\\';
*p++ = *s++;
}
*p++ = '\0';
return (line);
}/*escape_string*/
int main (int argc, char *argv[])
{
char line[MAX_LINE+1];
char actor[MAX_LINE+1], longtitle[MAX_LINE+1], title[MAX_LINE+1];
char *p, *q;
int i;
// skip over file header
while (fgets(line,1024,stdin) && strncmp(line,"Name",4));
fgets(line,1024,stdin); // just dashes
actor[0] = '\0';
while (fgets(line,1024,stdin) && strncmp(line,"----",4)) {
if ((p = strchr(line,'\t')) == NULL) { // skip weird lines
actor[0] = '\0'; // could be end of an actor's movie list
continue;
}
p = strip(++p);
strcpy(longtitle,p);
for (i = 0; title_suffix[i]; i++) {
if ((q = strstr(p,title_suffix[i])) != NULL)
*q = '\0';
}
strcpy(title,strip(p));
if (!isspace(line[0])) { // line with actor
*p = '\0'; // \0 after actor
if ((p = strstr(line," (")) != NULL)
*p = '\0'; // no (III) etc
if ((p = strchr(line,',')) != NULL) { // last, first
*p++ = '\0'; // set p to start of first name
strcpy(actor,strip(p)); // first
strcat(actor," "); // space
strcat(actor,strip(line)); // last
while ((q = strchr(actor,'\t')) != NULL) *q = ' ';
}
else {
strcpy(actor,strip(line));
}
printf("INSERT INTO actors (name) VALUES ('%s') ON DUPLICATE KEY UPDATE actor_id = LAST_INSERT_ID(actor_id);\n",
escape_string(actor));
printf("SET @actor_id = LAST_INSERT_ID();\n");
}
else if (!actor[0])
continue;
// skip TV series/episodes and TV movies, Video only for now, and Video Games
if (strstr(longtitle," {") || strstr(longtitle," (TV)") || strstr(longtitle," (V)") || strstr(longtitle," (VG)"))
continue;
printf("INSERT INTO movies (title) VALUES ('%s') ON DUPLICATE KEY UPDATE movie_id = LAST_INSERT_ID(movie_id);\n",
escape_string(title));
printf("SET @movie_id = LAST_INSERT_ID();\n");
printf("INSERT IGNORE INTO actors_movies (actor_id,movie_id) VALUES (@actor_id,@movie_id);\n");
}
return (0);
}/*main()*/
/* end of imdb2csv.c */
-- We use MyISAM here because it's just a repeatable bulk load
-- ACID commits would just delay stuff...
-- enabling delay key write, key_buffer_size=3G
CREATE TABLE actors (
actor_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(100) NOT NULL UNIQUE
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;
CREATE TABLE movies (
movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
title VARCHAR(300) NOT NULL UNIQUE
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;
CREATE TABLE actors_movies (
actor_movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
actor_id INT UNSIGNED NOT NULL,
movie_id INT UNSIGNED NOT NULL,
UNIQUE (actor_id,movie_id),
INDEX (movie_id)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;
CREATE TABLE `actors_movies_actors` (
`actor_movie_id` INT(10) UNSIGNED NOT NULL,
`actor_id` INT(10) UNSIGNED NOT NULL,
PRIMARY KEY (`actor_movie_id`,`actor_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;
-- populate link table
-- insert into actors_movies_actors select am1.actor_movie_id,am2.actor_id from actors_movies am1 join actors_movies am2 on (am1.movie_id=am2.movie_id AND am1.actor_id != am2.actor_id);
-- populate graph table
-- insert into actors_graph (origid,destid) select am.actor_id,ama.actor_id from actors_movies_actors ama join actors_movies am using (actor_movie_id);
Follow ups