Thursday, June 10, 2010

Interesting Internet Movie Database statistics - in Python

In one of my previous posts I presented how to load the database files from IMDB in the Python shell.

In the same way, not only the release year but also other information can be loaded, such as the language, genre, ratings, country, etc.

To plot graphs in Python you can use matplotlib. To use this library you will also need the numpy package.

All the functions used to extract information from the loaded database can be found at the end of the post.
You can download the full code to load the database and make the queries from: imdb.py and query.py

Let's obtain the number of movies by year:

> MbY = query.MoviesByYear(imdb.Movies)


To plot the resulting data:

> from pylab import plot,show,legend
> plot(MbY.keys(), MbY.values())
> show()




Now let's see the number of movies by country:

> MC = query.ByCountry(imdb.Movies)
> MC[0:10]
[('USA', 328177),
('UK', 64717),
('France', 38066),
('Germany', 31408),
('Japan', 28819),
('Canada', 24745),
('Italy', 23877),
('India', 23687),
('Spain', 18313),
('Mexico', 17544)]


Plot the movie count for USA by year:

> USA = query.CountryByYear(imdb.Movies, 'USA')
> plot(USA .keys(), USA .values())
> show()




Now plot more countries on the same figure:

> UK = query.CountryByYear(imdb.Movies, 'UK')
> France = query.CountryByYear(imdb.Movies, 'France')
> Germany = query.CountryByYear(imdb.Movies, 'Germany')
> Japan = query.CountryByYear(imdb.Movies, 'Japan')
> Canada = query.CountryByYear(imdb.Movies, 'Canada')
> p1=plot(UK.keys(), UK.values())
> p2=plot(France.keys(), France.values())
> p3=plot(Germany.keys(), Germany.values())
> p4=plot(Japan.keys(), Japan.values())
> p5=plot(Canada.keys(), Canada.values())
> show()
> legend( (p1, p2, p3, p4, p5), ('UK', 'France', 'Germany', 'Japan', 'Canada'), 'upper left', shadow=True)




For Germany the movie count is 0 between 1950 and 1989 because the country was divided into East and West Germany.

Now let's see the same plots for the movie count by language:

> BL = query.ByLanguage(imdb.Movies)
> BL[0:10]
[(u'English', 409215),
(u'Spanish', 50291),
(u'German', 43118),
(u'French', 35512),
(u'Japanese', 26340),
(u'Italian', 22422),
(u'Portuguese', 9902),
(u'Hindi', 8362),
(u'Dutch', 8161),
(u'Russian', 8131)]
> Eng = query.LangByYear(imdb.Movies, 'English')
> Sp = query.LangByYear(imdb.Movies, 'Spanish')
> Ger = query.LangByYear(imdb.Movies, 'German')
> Fr = query.LangByYear(imdb.Movies, 'French')
> Jp = query.LangByYear(imdb.Movies, 'Japanese')
> p1=plot(Eng.keys(), Eng.values())
> p2=plot(Sp.keys(), Sp.values())
> p3=plot(Ger.keys(), Ger.values())
> p4=plot(Fr.keys(), Fr.values())
> p5=plot(Jp.keys(), Jp.values())
> show()
> legend( (p1, p2, p3, p4, p5), ('English', 'Spanish', 'German', 'French', 'Japanese'), 'upper left', shadow=True)



Below are the simple functions used to extract the information for these statistics.


# Number of movies by year:
def MoviesByYear(i):
    """Count movies per release year.

    Args:
        i: mapping of movie key -> record (a dict). A record is counted
            only when it has a 'year' entry whose string value passes
            ``isdigit()``; every other record is skipped.

    Returns:
        dict mapping each year (as an int) to the number of records
        carrying that year. Empty when nothing qualifies.
    """
    data = {}
    # Only the records are needed; the movie keys are never used.
    for record in i.values():
        if 'year' not in record or not record['year'].isdigit():
            continue
        year = int(record['year'])
        data[year] = data.get(year, 0) + 1
    # The result must be handed back to the caller, otherwise the
    # function yields None and the counts are lost.
    return data


# Number of movies by year per country:
def CountryByYear(i, country):
    """Count the movies of one country per release year.

    Args:
        i: mapping of movie key -> record (a dict).
        country: value a record's 'country' entry must equal for the
            record to be counted.

    Returns:
        dict mapping each year (as an int) to the number of matching
        records. Records without a 'country' entry, with a different
        country, or without an all-digit 'year' string are skipped.
    """
    data = {}
    # `in` and `.values()` behave the same on Python 2 and Python 3,
    # unlike `has_key` / `iteritems`, which exist only on Python 2.
    for record in i.values():
        if 'country' not in record or record['country'] != country:
            continue
        if 'year' not in record or not record['year'].isdigit():
            continue
        year = int(record['year'])
        data[year] = data.get(year, 0) + 1
    return data


# Number of movies by language
def languagesort(x, y):
    """Old-style comparison function ordering pairs by descending count.

    Compares the second element of each pair: returns -1 when x's is
    larger (x sorts first), 1 when it is smaller, and 0 when both are
    equal. If none of the three comparisons holds, None is returned.
    """
    left, right = x[1], y[1]
    if left > right:
        outcome = -1
    elif left < right:
        outcome = 1
    elif left == right:
        outcome = 0
    else:
        # No comparison held (e.g. unordered values): no verdict.
        outcome = None
    return outcome
        
def ByLanguage(i):
    """Rank languages by the number of movies recorded for each.

    Args:
        i: mapping of movie key -> record (a dict). Records without a
            'language' entry are ignored.

    Returns:
        list of (language, count) tuples sorted by count, highest first.
        Languages with equal counts keep the order in which the tally
        dict yields them (the sort is stable).
    """
    data = {}
    for record in i.values():
        if 'language' not in record:
            continue
        language = record['language']
        data[language] = data.get(language, 0) + 1

    # A key function with reverse=True gives the same descending, stable
    # ordering as the former cmp-based sort, without relying on `cmp=`
    # or tuple-parameter lambdas (both gone in Python 3).
    return sorted(data.items(), key=lambda pair: pair[1], reverse=True)


# Number of movies by country
def ByCountry(i):
    """Rank countries by the number of movies recorded for each.

    Args:
        i: mapping of movie key -> record (a dict). Records without a
            'country' entry are ignored.

    Returns:
        list of (country, count) tuples sorted by count, highest first.
        Countries with equal counts keep the order in which the tally
        dict yields them (the sort is stable).
    """
    data = {}
    for record in i.values():
        if 'country' not in record:
            continue
        country = record['country']
        data[country] = data.get(country, 0) + 1

    # A key function with reverse=True gives the same descending, stable
    # ordering as the former cmp-based sort, without relying on `cmp=`
    # or tuple-parameter lambdas (both gone in Python 3).
    return sorted(data.items(), key=lambda pair: pair[1], reverse=True)


def LangByYear(i, lang):
    """Count the movies in one language per release year.

    Args:
        i: mapping of movie key -> record (a dict).
        lang: value a record's 'language' entry must equal for the
            record to be counted.

    Returns:
        dict mapping each year (as an int) to the number of matching
        records. Records without a 'language' entry, with a different
        language, or without an all-digit 'year' string are skipped.
    """
    data = {}
    # `in` and `.values()` behave the same on Python 2 and Python 3,
    # unlike `has_key` / `iteritems`, which exist only on Python 2.
    for record in i.values():
        if 'language' not in record or record['language'] != lang:
            continue
        if 'year' not in record or not record['year'].isdigit():
            continue
        year = int(record['year'])
        data[year] = data.get(year, 0) + 1
    return data