← Back to team overview

oqgraph-dev team mailing list archive

IMDB tools

 

Hi all

Here is my old IMDB conversion tool (C code), and the schema I used.
It uses only Actors and Movies, filtering out TV series and DVDs.

1484084 actors   \_ actors
866799 actresses /
574006 movies
5900319 actors_movies

actors.list.gz and actresses.list.gz files from ftp://ftp.fu-berlin.de/pub/misc/movies/database/
(via http://www.imdb.com/interfaces)

My laptop is still building the actors_movies_actors table so I don't yet have a total for that, but from memory I think it was around 45 million edges.

Currently I load using MyISAM with delay key writes and large key buffer size.
We can possibly optimise the load methodology a bit further.


Cheers,
Arjen.
-- 
Arjen Lentz, Exec.Director @ Open Query (http://openquery.com)
Australian peace of mind for your MySQL/MariaDB infrastructure.

Follow us at http://openquery.com/blog/ & http://twitter.com/openquery

/* ================================================================
   Parse IMDB .list files (actors, actresses) to SQL)
   For having some more data to play with in OQGRAPH engine
   Copyright (C) 2009 by Arjen Lentz (arjen@xxxxxxxxxxxxx)
   GPLv2 or BSD license, you choose ;-)
   ================================================================
*/

#include <stdio.h>
#include <string.h>
#include <ctype.h>


#define MAX_LINE 1024



/*
   very rough trimming of leading and trailing spaces
   return new ptr to start of \0 string
*/
char *strip (char *s)
{
	char *p;

	while (isspace(*s))
		*s++;
	p = s + strlen(s);
	while (p > s && isspace(*--p));
	*++p = '\0';

	return (s);
}/*strip()*/



char *title_suffix[] = {
	" (uncredited)",
	" (credit",
	" (as ",
	" (archive ",
	" (TV)",
	" (V)",
	" (VG)",
	" [",
	" <",
	NULL
};


/*
   very rough string escape for SQL
   (just escapes single quote)
*/
char *escape_string (char *s)
{
	static char line[MAX_LINE * 2];
	char *p = line;

	while (*s) {
		if (*s == '\'')
			*p++ = '\\';
		*p++ = *s++;
	}
	*p++ = '\0';

	return (line);
}/*escape_string*/



int main (int argc, char *argv[])
{
	char line[MAX_LINE+1];
	char actor[MAX_LINE+1], longtitle[MAX_LINE+1], title[MAX_LINE+1];
	char *p, *q;
	int i;

	// skip over file header
	while (fgets(line,1024,stdin) && strncmp(line,"Name",4));
	fgets(line,1024,stdin);	// just dashes

	actor[0] = '\0';
	while (fgets(line,1024,stdin) && strncmp(line,"----",4)) {
		if ((p = strchr(line,'\t')) == NULL) {	// skip weird lines
			actor[0] = '\0';	// could be end of an actor's movie list
			continue;
		}

		p = strip(++p);
		strcpy(longtitle,p);

		for (i = 0; title_suffix[i]; i++) {
			if ((q = strstr(p,title_suffix[i])) != NULL)
				*q = '\0';
		}
		strcpy(title,strip(p));

		if (!isspace(line[0])) {	// line with actor
			*p = '\0';		// \0 after actor
			if ((p = strstr(line," (")) != NULL)
				*p = '\0';			// no (III) etc

			if ((p = strchr(line,',')) != NULL) {	// last, first
				*p++ = '\0';	// set p to start of first name
				strcpy(actor,strip(p));		// first
				strcat(actor," ");		// space
				strcat(actor,strip(line));	// last
				while ((q = strchr(actor,'\t')) != NULL) *q = ' ';
			}
			else {
				strcpy(actor,strip(line));
			}

			printf("INSERT INTO actors (name) VALUES ('%s') ON DUPLICATE KEY UPDATE actor_id = LAST_INSERT_ID(actor_id);\n",
				escape_string(actor));
			printf("SET @actor_id = LAST_INSERT_ID();\n");
				
		}
		else if (!actor[0])
			continue;

		// skip TV series/episodes and TV movies, Video only for now, and Video Games
		if (strstr(longtitle," {") || strstr(longtitle," (TV)") || strstr(longtitle," (V)") || strstr(longtitle," (VG)"))
			continue;

		printf("INSERT INTO movies (title) VALUES ('%s') ON DUPLICATE KEY UPDATE movie_id = LAST_INSERT_ID(movie_id);\n",
			escape_string(title));
		printf("SET @movie_id = LAST_INSERT_ID();\n");

		printf("INSERT IGNORE INTO actors_movies (actor_id,movie_id) VALUES (@actor_id,@movie_id);\n");
	}

	return (0);
}/*main()*/



/* end of imdb2csv.c */
-- We use MyISAM here because it's just a repeatable bulk load
-- ACID commits would just delay stuff...
-- enabling delay key write, key_buffer_size=3G

CREATE TABLE actors (
	actor_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
	name VARCHAR(100) NOT NULL UNIQUE
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;

CREATE TABLE movies (
	movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
	title VARCHAR(300) NOT NULL UNIQUE
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;

CREATE TABLE actors_movies (
	actor_movie_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
	actor_id INT UNSIGNED NOT NULL,
	movie_id INT UNSIGNED NOT NULL,
	UNIQUE (actor_id,movie_id),
	INDEX (movie_id)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;

CREATE TABLE `actors_movies_actors` (
  `actor_movie_id` INT(10) UNSIGNED NOT NULL,
  `actor_id` INT(10) UNSIGNED NOT NULL,
  PRIMARY KEY (`actor_movie_id`,`actor_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 DELAY_KEY_WRITE=1;

-- populate link table
-- insert into actors_movies_actors select am1.actor_movie_id,am2.actor_id from actors_movies am1 join actors_movies am2 on (am1.movie_id=am2.movie_id AND am1.actor_id != am2.actor_id);

-- populate graph table
-- insert into actors_graph (origid,destid) select am.actor_id,ama.actor_id from actors_movies_actors ama join actors_movies am using (actor_movie_id);


Follow ups