IMDbPY-4.9/0000755000000000000000000000000011766731642011125 5ustar rootrootIMDbPY-4.9/setup.py0000755000000000000000000002050611766731642012645 0ustar rootroot#!/usr/bin/env python import distutils.sysconfig import os import sys import ez_setup ez_setup.use_setuptools() import setuptools # version of the software; in the code repository this represents # the _next_ release. setuptools will automatically add 'dev-rREVISION'. version = '4.9' home_page = 'http://imdbpy.sf.net/' long_desc = """IMDbPY is a Python package useful to retrieve and manage the data of the IMDb movie database about movies, people, characters and companies. Platform-independent and written in pure Python (and few C lines), it can retrieve data from both the IMDb's web server and a local copy of the whole database. IMDbPY package can be very easily used by programmers and developers to provide access to the IMDb's data to their programs. Some simple example scripts - useful for the end users - are included in this package; other IMDbPY-based programs are available at the home page: %s """ % home_page dwnl_url = 'http://imdbpy.sf.net/?page=download' classifiers = """\ Development Status :: 5 - Production/Stable Environment :: Console Environment :: Web Environment Environment :: Handhelds/PDA's Intended Audience :: Developers Intended Audience :: End Users/Desktop License :: OSI Approved :: GNU General Public License (GPL) Natural Language :: English Natural Language :: Italian Natural Language :: Turkish Programming Language :: Python Programming Language :: C Operating System :: OS Independent Topic :: Database :: Front-Ends Topic :: Internet :: WWW/HTTP :: Dynamic Content :: CGI Tools/Libraries Topic :: Software Development :: Libraries :: Python Modules """ keywords = ['imdb', 'movie', 'people', 'database', 'cinema', 'film', 'person', 'cast', 'actor', 'actress', 'director', 'sql', 'character', 'company', 'package', 'plain text data files', 'keywords', 'top250', 'bottom100', 'xml'] cutils = 
setuptools.Extension('imdb.parser.sql.cutils', ['imdb/parser/sql/cutils.c']) scripts = ['./bin/get_first_movie.py', './bin/get_movie.py', './bin/search_movie.py', './bin/get_first_person.py', './bin/get_person.py', './bin/search_person.py', './bin/get_character.py', './bin/get_first_character.py', './bin/get_company.py', './bin/search_character.py', './bin/search_company.py', './bin/get_first_company.py', './bin/get_keyword.py', './bin/search_keyword.py', './bin/get_top_bottom_movies.py'] # XXX: I'm not sure that 'etc' is a good idea. Making it an absolute # path seems a recipe for a disaster (with bdist_egg, at least). data_files = [('doc', setuptools.findall('docs')), ('etc', ['docs/imdbpy.cfg'])] # Defining these 'features', it's possible to run commands like: # python ./setup.py --without-sql bdist # having (in this example) imdb.parser.sql removed. featCutils = setuptools.dist.Feature('compile the C module', standard=True, ext_modules=[cutils]) featLxml = setuptools.dist.Feature('add lxml dependency', standard=True, install_requires=['lxml']) # XXX: it seems there's no way to specify that we need EITHER # SQLObject OR SQLAlchemy. featSQLObject = setuptools.dist.Feature('add SQLObject dependency', standard=True, install_requires=['SQLObject', 'FormEncode'], require_features='sql') featSQLAlchemy = setuptools.dist.Feature('add SQLAlchemy dependency', standard=True, install_requires=['SQLAlchemy', 'sqlalchemy-migrate'], require_features='sql') sqlScripts = ['./bin/imdbpy2sql.py'] # standard=False so that it's not installed if both --without-sqlobject # and --without-sqlalchemy are specified. featSQL = setuptools.dist.Feature('access to SQL databases', standard=False, remove='imdb.parser.sql', scripts=sqlScripts) features = { 'cutils': featCutils, 'sql': featSQL, 'lxml': featLxml, 'sqlobject': featSQLObject, 'sqlalchemy': featSQLAlchemy } params = { # Meta-information. 
'name': 'IMDbPY', 'version': version, 'description': 'Python package to access the IMDb\'s database', 'long_description': long_desc, 'author': 'Davide Alberani', 'author_email': 'da@erlug.linux.it', 'contact': 'IMDbPY-devel mailing list', 'contact_email': 'imdbpy-devel@lists.sourceforge.net', 'maintainer': 'Davide Alberani', 'maintainer_email': 'da@erlug.linux.it', 'license': 'GPL', 'platforms': 'any', 'keywords': keywords, 'classifiers': filter(None, classifiers.split("\n")), 'zip_safe': False, # XXX: I guess, at least... # Download URLs. 'url': home_page, 'download_url': dwnl_url, # Scripts. 'scripts': scripts, # Documentation files. 'data_files': data_files, # C extensions. #'ext_modules': [cutils], # Requirements. XXX: maybe we can use extras_require? #'install_requires': install_requires, #'extras_require': extras_require, 'features': features, # Packages. 'packages': setuptools.find_packages() } ERR_MSG = """ ==================================================================== ERROR ===== Aaargh! An error! An error! Curse my metal body, I wasn't fast enough. It's all my fault! Anyway, if you were trying to build a package or install IMDbPY to your system, looks like we're unable to fetch or install some dependencies, or to compile the C module. The best solution is to resolve these dependencies (maybe you're not connected to Internet?) and/or install a C compiler. 
You may, however, go on without some optional pieces of IMDbPY; try re-running this script with the corresponding optional argument: --without-lxml exclude lxml (speeds up 'http') --without-cutils don't compile the C module (speeds up 'sql') --without-sqlobject exclude SQLObject (you need at least one of) --without-sqlalchemy exclude SQLAlchemy (SQLObject or SQLAlchemy,) (if you want to access a ) (local SQL database ) --without-sql no access to SQL databases (implied if both --without-sqlobject and --without-sqlalchemy are used) Example: python ./setup.py --without-lxml --without-sql install The caught exception, is re-raise below: """ REBUILDMO_DIR = os.path.join('imdb', 'locale') REBUILDMO_NAME = 'rebuildmo' def runRebuildmo(): """Call the function to rebuild the locales.""" cwd = os.getcwd() import sys path = list(sys.path) languages = [] try: import imp scriptPath = os.path.dirname(__file__) modulePath = os.path.join(cwd, scriptPath, REBUILDMO_DIR) sys.path += [modulePath, '.', cwd] modInfo = imp.find_module(REBUILDMO_NAME, [modulePath, '.', cwd]) rebuildmo = imp.load_module('rebuildmo', *modInfo) os.chdir(modulePath) languages = rebuildmo.rebuildmo() print 'Created locale for: %s.' 
% ' '.join(languages) except Exception, e: print 'ERROR: unable to rebuild .mo files; caught exception %s' % e sys.path = path os.chdir(cwd) return languages def hasCommand(): """Return true if at least one command is found on the command line.""" args = sys.argv[1:] if '--help' in args: return False if '-h' in args: return False for arg in args: if arg and not arg.startswith('-'): return True return False try: if hasCommand(): languages = runRebuildmo() else: languages = [] if languages: data_files.append((os.path.join(distutils.sysconfig.get_python_lib(), 'imdb/locale'), ['imdb/locale/imdbpy.pot'])) for lang in languages: files_found = setuptools.findall('imdb/locale/%s' % lang) if not files_found: continue base_dir = os.path.dirname(files_found[0]) data_files.append((os.path.join(distutils.sysconfig.get_python_lib(), 'imdb/locale'), ['imdb/locale/imdbpy-%s.po' % lang])) if not base_dir: continue data_files.append((os.path.join(distutils.sysconfig.get_python_lib(), base_dir), files_found)) setuptools.setup(**params) except SystemExit: print ERR_MSG raise IMDbPY-4.9/MANIFEST.in0000644000000000000000000000101211766731642012655 0ustar rootroot# # MANIFEST.in # # Manifest template for creating the Distutils source distribution. # # Comment out the "recursive-include docs" entry if you don't want # to install the documentation. recursive-include docs * recursive-include imdb/locale * global-exclude *~ prune CVS prune .svn prune .hg global-exclude CVS global-exclude .svn # Try to force the inclusion of ez_setup.py. include ez_setup.py # Uncomment the following line if you don't want to install the logo images. 
# exclude docs/*.png docs/*.xpm docs/*.bmp IMDbPY-4.9/.hgtags0000644000000000000000000000034511766731642012405 0ustar rootrootc3dba80881f0a810b3bf93051a56190b297e7a50 4.6 c8b07121469a2173a587b1a34beb4f1fecd640b6 4.7 ba221c9050599463b4b78c89a8bdada7d7aef173 4.8 e807ba790392d406018af0f98d5dad5117721a4d 4.8.1 b02c61369b27e0d5af0a755a8a2fc3355c08bb67 4.8.2 IMDbPY-4.9/docs/0000755000000000000000000000000011766731642012055 5ustar rootrootIMDbPY-4.9/docs/README.redesign0000644000000000000000000000235111766731642014535 0ustar rootroot IMDb's web site redesign ======================== On September 2010 the IMDb web pages had a major redesign. With IMDbPY 4.7 we're trying to parse the new web pages, but it will take some time before all the bugs are fixed. Any help (fixing parsers or simple bug reports) is greatly appreciated. Beware that: - the "httpThin" data access method is badly broken and probably it will not fixed. - the "mobile" data access method can be partially broken, and will be fixed: please report any problem. - some of the information in these keys could be somewhat ruined: soundtrack, awards, episodes rating, faqs. - information about series were not extensively tested. - it's possible that some information will be missing, like "in development" movies. The above problems, with the exception of "httpThin" will be fixed in future releases. Notes about the code: we have a very powerful and efficient parsing infrastructure, but after many releases and so many changes of the IMDb pages, some of the main parsers are showing their age. So, parsers for main information about movies and persons should be probably rewritten from scratch, and the same applies to helper functions like "build_person" and "build_movie" in imdb.parser.http.utils. IMDbPY-4.9/docs/imdbpyico32x32.ico0000755000000000000000000000427611766731642015246 0ustar rootroot ( @  q   7 & ! 
'$ #!(%"3&('-+*-+9#/0 ;,?Q88U<39:;;!88Y586ZK* ??8:9TXdBD,><CEHFIG@BAoACA&IFBCH fOMQOHJISPXL,PNUURTSVf;=LOM7[>XXXVNQOt)[[2VT\\0j5:[QSUT^^__,n2```a8l<VYWbd X[YrPfhgihjHaamkWhXomMefrj@}?wcqoced tr*(Qjkusewv23iljdlmVzy{zlom}|autuqtr~[oswqmddJx{y=_Zp|d\VRjjSm?ICD0&8y )6!fb Y.?}%tAtttAA22AZ2EeZZZZ2ZNq>5BEEYW]OZNZĦP/EEZbfKZN̸8ljHjwb*EE$8*'oy*U<*ZN9}'EEx˜JZNMdm`JEEA3|L7h,ZN?4 6R[EE򘳬S(jZN>DZv&"ucjEE'rp/X{DbZN+IT}C,EE[z0wKZNk: 2009-2011 Davide Alberani --> IMDbPY-4.9/docs/FAQS.txt0000644000000000000000000001363211766731642013355 0ustar rootroot IMDbPY FAQS =========== Q1: Since version 3.7, parsing the data from the IMDb web site is slow, sloow, slooow! Why? A1: if python-lxml is not installed in your system, IMDbPY uses the pure-python BeautifulSoup module as a fall-back; BeautifulSoup does an impressive job, but it can't be as fast as a parser written in C. You can install python-lxml following the instructions in the README.newparsers file. Q2: why the movieID (and other IDs) used in the 'sql' database are not the same used on the IMDb.com site? A2: first, a bit of nomenclature: we'll call "movieID" (or things like "personID", for instance of the Person class) a unique identifier used by IMDbPY to manage a single movie (or other kinds of object). We'll call "imdbID" a unique identifier used, for the same kind of data, by the IMDb.com site (i.e.: the 7-digit number in tt0094226, as seen in the URL for "The Untouchables"). Using IMDbPY to access the web ('http' and 'mobile' data access systems), movieIDs and imdbIDs are the same thing - beware that in this case a movieID is a string, with the leading zeroes. Unfortunately, populating a sql database with data from the plain text data files, we don't have access to imdbIDs - since they are not distributed at all - and so we have to made them by ourselves (they are the 'id' column in tables like 'title' or 'name'). 
This mean that these values are valid only for your current database: if you update it with a newer set of plain text data files, these IDs will surely change (and, by the way, they are integers). It's also obvious, now, that you can't exchange IDs between the 'http' (or 'mobile') data access system and 'sql', and in the same way you can't use imdbIDs with your local database or vice-versa. Q3: using a sql database, what's the imdb_id (or something like that) column in tables like 'title', 'name' and so on? A3: it's internally used by IMDbPY to remember the imdbID (the one used by the web site - accessing the database you'll use the numeric value of the 'id' column, as movieID) of a movie, once it stumbled upon. This way, if IMDbPY is asked again about the imdbID of a movie (or person, or ...), it doesn't have to contact again to the web site. Notice that you have to access the sql database using a user with write permission, to update it. As a bonus, when possible, the values of these imdbIDs are saved between updates of the sql database (using the imdbpy2sql.py script). Beware that it's tricky and not always possible, but the script does its best to succeed. Q4: but what if I really need the imdbIDs, to use my database? A4: no, you don't. Search for a title, get its information. Be happy! Q5: I have a great idea: write a script to fetch all the imdbID from the web site! Can't you do it? A5: yeah, I can. But I won't. :-) It would be somewhat easy to map every title on the web to its imdbID, but there are still a lot of problems. First of all, every user will end up doing it for its own copy of the plain text data files (and this will make the imdbpy2sql.py script painfully slow and prone to all sort of problems). Moreover, the imdbIDs are unique and never reused, true, but movie title _do_ change: to fix typos, override working titles, to cope with a new movie with the same title release in the same year (not to mention cancelled or postponed movies). 
Besides that, we'd have to do the same for persons, characters and companies. Believe me: it doesn't make sense. Work on your local database using your movieIDs (or even better: don't mind about movieIDs and think in terms of searches and Movie instances!) and retrieve the imdbID only in the rare circumstances when you really need them (see the next FAQ). Repeat with me: I DON'T NEED ALL THE imdbIDs. :-) Q6: using a sql database, how can I convert a movieID (whose value is valid only locally) to an imdbID (the ID used by the imdb.com site)? A6: various functions can be used to convert a movieID (or personID or other IDs) to the imdbID used by the web site. Example of code: from imdb import IMDb ia = IMDb('sql', uri=URI_TO_YOUR_SQL_DATABASE) movie = ia.search_movie('The Untouchables')[0] # a Movie instance. print 'The movieID for The Untouchables:', movie.movieID print 'The imdbID used by the site:', ia.get_imdbMovieID(movie.movieID) print 'Same ID, smarter function:', ia.get_imdbID(movie) It goes without saying that get_imdbMovieID has some sibling methods: get_imdbPersonID, get_imdbCompanyID and get_imdbCharacterID. Also notice that the get_imdbID method is smarter, and takes any kind of instance (the other functions need a movieID, personID, ...) Another method that will try to retrieve the imdbID is get_imdbURL, which works like get_imdbID but returns an URL. In case of problems, these methods will return None. Q7: I have a movie title (in the format used by the plain text data files) or other kind of data (like a person/character/company name) and I want to get its imdbID. How can I do? A7: the safest thing, is probably to do a normal search on IMDb (using the 'http' or 'mobile' data access system of IMDbPY) and see if the first item is the correct one. You can also try the 'title2imdbID' method (and similar) of the IMDb instance (no matter if you're using 'http', 'mobile' or 'sql'), but expect some failures - it returns None in this case. 
Q8: I have an URL (of a movie, person or something else); how can I get a Movie/Person/... instance? A8: import the imdb.helpers module and use the get_byURL function. Q9: I'm writing an interface based on IMDbPY and I have problems handling encoding, chars conversions, replacements of references and so on. A9: see the many functions in the imdb.helpers module. IMDbPY-4.9/docs/imdbpyico.xpm0000644000000000000000000001447111766731642014571 0ustar rootroot/* XPM */ static char * imdbpyico_xpm[] = { "32 32 264 2", " c None", ". c #000000", "+ c #160F00", "@ c #1C1300", "# c #6C6C6C", "$ c #8E8E8E", "% c #4A4A4A", "& c #141414", "* c #9B9B9B", "= c #282828", "- c #656565", "; c #737373", "> c #1B1B1B", ", c #222222", "' c #887F6C", ") c #251800", "! c #B1B1B1", "~ c #E9E9E9", "{ c #7A7A7A", "] c #212121", "^ c #FFFFFF", "/ c #434343", "( c #A6A6A6", "_ c #BCBCBC", ": c #2C2C2C", "< c #373737", "[ c #555555", "} c #6F6F6F", "| c #3A3A3A", "1 c #101010", "2 c #202020", "3 c #505050", "4 c #5A5A5A", "5 c #151515", "6 c #0A0A0A", "7 c #191919", "8 c #171717", "9 c #030303", "0 c #4E4E4E", "a c #7E7E6F", "b c #383800", "c c #757564", "d c #3F3F0D", "e c #848484", "f c #161603", "g c #262600", "h c #141400", "i c #2C2C00", "j c #676700", "k c #54541A", "l c #C6C6C6", "m c #595959", "n c #DDDD17", "o c #484827", "p c #CECE0F", "q c #A6A659", "r c #2F2F13", "s c #4E4E00", "t c #B1B143", "u c #00006F", "v c #B1B12C", "w c #0000FF", "x c #0B0B37", "y c #E8E8E8", "z c #424242", "A c #B1B103", "B c #606005", "C c #A6A602", "D c #B1B10B", "E c #494902", "F c #5B5B00", "G c #121208", "H c #F9F9F9", "I c #949494", "J c #616148", "K c #484800", "L c #686800", "M c #505000", "N c #747400", "O c #808000", "P c #272700", "Q c #6C6C65", "R c #BABABA", "S c #8F8F86", "T c #616109", "U c #757500", "V c #718F00", "W c #778900", "X c #20C800", "Y c #2C4B00", "Z c #777700", "` c #717100", " . c #525211", ".. c #BDBDBA", "+. c #DEDEDE", "@. c #555500", "#. c #585800", "$. c #7D7D00", "%. 
c #758B00", "&. c #60A000", "*. c #22DE00", "=. c #29D700", "-. c #00FF00", ";. c #00B100", ">. c #383821", ",. c #333C00", "'. c #627700", "). c #7C8400", "!. c #6A7200", "~. c #778600", "{. c #47B800", "]. c #55AB00", "^. c #0DE700", "/. c #065A00", "(. c #048F00", "_. c #03FC00", ":. c #00A900", "<. c #4E5700", "[. c #636307", "}. c #006F00", "|. c #33CD00", "1. c #18E700", "2. c #22DD00", "3. c #00A800", "4. c #005100", "5. c #233900", "6. c #1EE100", "7. c #18A200", "8. c #009300", "9. c #2FD000", "0. c #6F6F00", "a. c #F5F5F5", "b. c #648664", "c. c #04CD00", "d. c #3AC500", "e. c #1BE400", "f. c #277400", "g. c #000A00", "h. c #005500", "i. c #599600", "j. c #00BA00", "k. c #001800", "l. c #59A700", "m. c #23DC00", "n. c #1B0000", "o. c #C10000", "p. c #860000", "q. c #161600", "r. c #155800", "s. c #2ED100", "t. c #07BA07", "u. c #939393", "v. c #788800", "w. c #5E5E0A", "x. c #494343", "y. c #BE0000", "z. c #9A6500", "A. c #3B6500", "B. c #1CB800", "C. c #07DE00", "D. c #006400", "E. c #829882", "F. c #C2C2C2", "G. c #1E8A1E", "H. c #3BC500", "I. c #6C9300", "J. c #FFE7E7", "K. c #ED3434", "L. c #8A3E00", "M. c #08C000", "N. c #1F831F", "O. c #566956", "P. c #CECECE", "Q. c #406B38", "R. c #679E0D", "S. c #2ECA00", "T. c #7A7A00", "U. c #53531D", "V. c #CFCFCD", "W. c #FFEEEE", "X. c #FF7F7F", "Y. c #FA2828", "Z. c #858585", "`. 
c #CACACA", " + c #97978B", ".+ c #828240", "++ c #0FD600", "@+ c #1AE500", "#+ c #6D9200", "$+ c #C5C5C5", "%+ c #B4B4B4", "&+ c #EEEEEE", "*+ c #FFF9F9", "=+ c #FFE6E6", "-+ c #FF6565", ";+ c #B5B5B1", ">+ c #3D3D2C", ",+ c #17B706", "'+ c #5BA400", ")+ c #728D00", "!+ c #7B7B00", "~+ c #393900", "{+ c #AEAEAC", "]+ c #EFEFEF", "^+ c #F1F1F1", "/+ c #1B1B13", "(+ c #1E3B0B", "_+ c #48B700", ":+ c #45BA00", "<+ c #5AA500", "[+ c #434300", "}+ c #66664B", "|+ c #DDDDDD", "1+ c #6A6A51", "2+ c #565632", "3+ c #909090", "4+ c #4E4E06", "5+ c #08C303", "6+ c #06F900", "7+ c #17E800", "8+ c #424200", "9+ c #696900", "0+ c #4F4F2E", "a+ c #6D6D00", "b+ c #5F5F00", "c+ c #3B3B06", "d+ c #DFDFDF", "e+ c #50A500", "f+ c #53AC00", "g+ c #21DE00", "h+ c #537200", "i+ c #555917", "j+ c #B7B7B7", "k+ c #00F400", "l+ c #08F700", "m+ c #54AB00", "n+ c #46BA00", "o+ c #5C5C00", "p+ c #87877B", "q+ c #6C8B6C", "r+ c #005900", "s+ c #B6BAB6", "t+ c #F8F8F8", "u+ c #505B39", "v+ c #598A00", "w+ c #4AB600", "x+ c #13ED00", "y+ c #0DEC00", "z+ c #06B700", "A+ c #346A2E", "B+ c #306E2E", "C+ c #3D5B37", "D+ c #939390", "E+ c #FCFCFC", "F+ c #FBFBFB", "G+ c #F4F4F4", "H+ c #E2E2E2", "I+ c #3F7D3F", "J+ c #1B5400", "K+ c #19660D", "L+ c #ECECEC", "M+ c #0E0E0E", "N+ c #332200", "O+ c #412B00", ". . . . . . . . . . . . . . . . + @ . . . . . . . . . . . . . . ", "# . $ % & * = . - ; . > * , . # ' ) , * > . ; - . = * & % $ . # ", "! . ~ { ] ^ / . ( _ . : ^ < . ! ! . < ^ : . _ ( . / ^ ] { ~ . ! ", "[ . } | 1 { 2 . 3 4 . 5 { > . [ [ . > { 5 . 4 3 . 2 { 1 | } . [ ", "6 ] ] ] ] ] ] ] ] 7 . 8 . 9 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] 6 ", "0 ^ ^ ^ ^ ^ ^ ^ ^ a b c b d _ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ e f g h i j k l ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ m n o p q r s ! ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ m t u v w x s ! 
^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ y z A B C D E F G _ _ H ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ I J K K L M K N O P K K Q R ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ H S T U O O O O O V W X Y Z ` ...^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ +.b @.O #.$.O %.&.*.=.-.;.s O O >.^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ +.,.'.).!.~.{.].^./.(._.:.<.O O [.$ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ +.}._.|.|.1.2.3.4.5.6.7.8.9.).O 0.} ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ a.b.c.d.d.e.f.g.h.i.j.k.l.m.%.O 0.} ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ / n.o.p.q.r.s.t.u.0 e.1.v.O w.* ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ x.y.z.A.B.C.D.E.F.G.H.I.O O >.^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ J.K.L.M.M.N.O.P.Q.R.S.v.O T.U.V.^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ W.X.Y.Z.Z.Z.`. +.+++@+#+O O U.$+^ ^ ^ ^ ^ ^ %+&+^ ^ ^ 0 ", "0 ^ ^ ^ *+=+-+^ ^ ;+>+,+6.'+)+O !+~+{+]+^ ^++.~ ^ ;+/++.^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ ^ (+_+:+<+O O O [+b b }+|+1+b 2+3+4+b +.^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ e 5+6+7+O O O 8+9+O O 0.0+a+O N b+$.c+d+^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ m e+f+g+O O O O O O O O O O b+h+f+i+j+^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ m k+-.l+m+O O O n+O O O O o+p+q+r+s+^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ t+u+v+w+x+y+z+A+B+z+z+z+C+D+E+F+G+^ ^ ^ ^ ^ ^ 0 ", "0 ^ ^ ^ ^ ^ ^ ^ ^ H+I+}.J+K+3+|+|+3+3+3+L+^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 ", "6 ] ] ] ] ] ] ] ] ] M+. . 9 ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] 6 ", "[ . } | 1 { 2 . 3 4 . 5 { > . [ [ . > { 5 . 4 3 . 2 { 1 | } . [ ", "! . ~ { ] ^ / . ( _ . : ^ < . ! ! . < ^ : . _ ( . / ^ ] { ~ . ! ", "# . $ % & * = . - ; . > * , . # # . , * > . ; - . = * & % $ . # ", ". . . . . . . . . . . . . . . . N+O+. . . . . . . . . . . . . . "}; IMDbPY-4.9/docs/CONTRIBUTORS.txt0000644000000000000000000000310411766731642014551 0ustar rootroot People who contributed with a substantial amount of work and that share the copyright over some portions of the code: NAME: H. Turgut Uyar EMAIL: tekir.org> CONTRIBUTION: the whole new "http" data access system (using a DOM and XPath-based approach) is based on his work. 
The imdbpykit interface was mostly written by him and he holds the copyright over the whole code (with some portions shared with others). NAME: Giuseppe "Cowo" Corbelli EMAIL: lugbs.linux.it> CONTRIBUTION: provided a lot of code and hints to integrate IMDbPY with SQLObject, working on the imdbpy2sql.py script and the dbschema.py module. Actually, besides Turgut, Giuseppe and me, these other people are listed as developers for the IMDbPY project on sourceforge and may share copyright on some (minor) portions of the code: NAME: Alberto Malagoli CONTRIBUTION: developed the new web site, and detains the copyright of it, and provided helper functions and other code. NAME: Martin Kirst EMAIL: s1998.tu-chemnitz.de> CONTRIBUTION: has done an important refactoring of the imdbpyweb program and shares with me the copyright on the whole program. NAME: Jesper Nøhr EMAIL: noehr.org> CONTRIBUTION: provided extensive testing and some patches for the 'http' data access system. NAME: Joachim Selke EMAIL: tu-bs.de> CONTRIBUTION: many tests on IBM DB2 and work on the CSV support. NAME: Timo Schulz EMAIL: users.sourceforge.net> CONTRIBUTION: did a lot of work 'sql', DB2 and CSV support and extensive analysis aimed at diff files support. IMDbPY-4.9/docs/DISCLAIMER.txt0000644000000000000000000000104511766731642014232 0ustar rootroot DISCLAIMER ========== IMDbPY (and the author) is not affiliated with Internet Movie Database Inc. IMDb is a trademark of Internet Movie Database Inc. and all contents and data included on the IMDb's site is the property of IMDb or its content suppliers and protected by United States and international copyright laws. Please, read the IMDb's conditions of use in their website: - http://www.imdb.com/help/show_article?conditions - http://www.imdb.com/help/show_leaf?usedatasoftware - any other notice in the http://www.imdb.com/ site. 
IMDbPY-4.9/docs/imdbpyico16x16.ico0000755000000000000000000000257611766731642015253 0ustar rootrooth(  /,,,>.8///11""Y666]BBBQAALL2TMTTZZ.[['XXXdd^^^nn~Pqjppsk ooxasntm(nn2rk8oo N8Hjjjlll 8zz9ssjnn ?>tt||C".|~@yy8M%sss|Lzz8d+BFUppGtd =+rv6moRhq($zbh~ªº    J,J==J,J[[[T--W[[[[_oooZML*25(cboo__ooo;X7!/4# $`o__omha.D8Yd\]Ho__of+CNSG9:kooio__oo 6FO9Uoooo__o^QVB>)oooo__oR&%A False # 1, on, true, yes -> True # none -> None # # Other options, like defaultModFunct, must be passed by the code. # [imdbpy] ## Default. accessSystem = http ## Optional (options common to every data access system): # Activate adult searches (on, by default). #adultSearch = on # Number of results for searches (20 by default). #results = 20 # Re-raise all caught exceptions (off, by default). #reraiseExceptions = off ## Optional (options common to http and mobile data access systems): # Proxy used to access the network. If it requires authentication, # try with: http://username:password@server_address:port/ #proxy = http://localhost:8080/ # Cookies of the IMDb.com account #cookie_id = string_representing_the_cookie_id #cookie_uu = string_representing_the_cookie_uu ## Timeout for the connection to IMDb (30 seconds, by default). #timeout = 30 # Base url to access pages on the IMDb.com web server. #imdbURL_base = http://akas.imdb.com/ ## Parameters for the 'http' data access system. # Parser to use; can be a single value or a list of value separated by # a comma, to express order preference. Valid values: "lxml", "beautifulsoup" #useModule = lxml,beautifulsoup ## Parameters for the 'mobile' data access system. #accessSystem = mobile ## Parameters for the 'sql' data access system. #accessSystem = sql #uri = mysql://user:password@localhost/imdb # ORM to use; can be a single value or a list of value separated by # a comma, to express order preference. Valid values: "sqlobject", "sqlalchemy" #useORM = sqlobject,sqlalchemy ## Set the threshold for logging messages. 
# Can be one of "debug", "info", "warning", "error", "critical" (default: # "warning"). #loggingLevel = debug ## Path to a configuration file for the logging facility; # see: http://docs.python.org/library/logging.html#configuring-logging #loggingConfig = ~/.imdbpy-logger.cfg IMDbPY-4.9/docs/LICENSE.txt0000644000000000000000000000223711766731642013704 0ustar rootrootIMDbPY NOTE: see also the recommendations in the "DISCLAIMER.txt" file. NOTE: for a list of other persons who share with me the copyright over specific portions of code, see the "CONTRIBUTORS.txt" file. NOTE: IMDbPY includes an unmodified version of BeautifulSoup, renamed _bsoup.py; that code is copyrighted by its author, Leonard Richardson and is released under a New-style BSD license. Copyright 2004-2009 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA IMDbPY-4.9/docs/README.txt0000644000000000000000000002063211766731642013556 0ustar rootroot What's IMDbPY? ============== NOTE: see also the recommendations in the "DISCLAIMER.txt" file. IMDbPY is a Python package useful to retrieve and manage the data of the IMDb movie database. IMDbPY is mainly a tool intended for programmers and developers, but some example scripts are included. If you're a poor, simple, clueless user, read the "README.users" file. 
:-) Seriously: take a look at the provided example scripts even if you're a Really Mighty Programmer(tm), they should clearly show how to use IMDbPY. Other IMDbPY-based programs can be downloaded from: http://imdbpy.sourceforge.net/?page=programs If you want to develop a program/script/package/framework using the IMDbPY package, see the "README.package" file, for instructions about how to use this package. If you're installing IMDbPY in a smart phone, PDA or hand-held system, read the "README.mobile" file. If you're crazy enough and/or you've realized that your higher inspiration in life is to help the development of IMDbPY, begin reading the "README.devel" file. ;-) INSTALLATION ============ Everything you need to do is to run, as the root user, the command: # python setup.py install IMDbPY itself can be installed through easy_install and pip, with - respectively - these commands (as root): easy_install IMDbPY pip install IMDbPY Using easy_install and pip, the dependencies will be automatically satisfied. Third-party packages may be downloaded, and if not otherwise specified (see below), C extensions compiled (this means that you need the python-dev package installed). If, for some reason, it doesn't work, you can copy the "./imdb" directory in the local site-packages directory of the python major version you're using, but remember that you'll not satisfy the required dependencies and neither compile the optional C module, so use this as your very last resort. To know what major version of python you've installed, run: $ python -V It should return a string like "Python 2.6.1"; in this example the major version is "2.6". 
Now copy the "./imdb" directory: # cp -r ./imdb /usr/local/lib/python{MAJORVERSION}/site-packages/ The setup.py contains some configuration options that could be useful if you're installing IMDbPY in a system with very little hard disk space (like an handheld device) or where you've not a complete development environment available; read the "README.mobile" file. If you want to insert the content of the plain text data files into a SQL database, read the "README.sqldb" file. The whole list of command line options of the setup.py script is: --without-lxml exclude lxml (speeds up "http" considerably, so try to fix it). --without-cutils don't compile the C module (speeds up 'sql') --without-sql no access to SQL databases. If you're install 'sql', setup.py tries to install BOTH SQLObject and SQLAlchemy. In fact, having one of them will be enough. You can exclude the unwanted one with: --without-sqlobject exclude SQLObject --without-sqlalchemy exclude SQLAlchemy If you specify both, --without-sql is implied. Mercurial VERSION ================= The best thing is always to use a package for your distribution, or use easy_install or pip to install the latest release, but it goes without saying that sometimes you need the very latest version (keep in mind that the IMDb site is a moving target...). In this case, you can always use the Mercurial version, available here: http://imdbpy.sourceforge.net/?page=download#hg HELP ==== Refer to the web site http://imdbpy.sf.net/ and subscribe to the mailing list: http://imdbpy.sf.net/?page=help#ml NOTES FOR PACKAGERS =================== If you plan to package IMDbPY for your distribution/operating system, keep in mind that, while IMDbPY can works out-of-the-box, some external package may be required for certain functionality: - python-lxml: the 'http' data access system will be much faster, if it's installed. - SQLObject or SQLAlchemy: one of these is REQUIRED if you want to use the 'sql' data access system. 
All of them should probably be "recommended" (or at least "suggested") dependencies. To compile the C module, you also need the python-dev package. As of IMDbPY 4.0, the installer is based on setuptools. RECENT IMPORTANT CHANGES ======================== Since release 2.4, IMDbPY internally manages every information about movies and people using unicode strings. Please read the README.utf8 file. Since release 3.3, IMDbPY supports IMDb's character pages; see the README.currentRole file for more information. Since release 3.6, IMDbPY supports IMDb's company pages; see the README.companies file for more information. Since release 3.7, IMDbPY has moved its main parsers from a SAX-based approach to a DOM/XPath-based one; see the README.newparsers file for more information. Since release 3.8, IMDbPY supports both SQLObject and SQLAlchemy; see README.sqldb for more information. Since release 3.9 support dumping the plain text data files in CSV files; see README.sqldb for more information. Since release 4.0 it's possible to search for keywords (get keywords similar to a given one and get a list of movies for a specified keyword). See README.keywords for more information. Moreover, it's possible to get information out of Movie, Person, Character and Company instances as XML (getting a single keys or the representation of a whole object). See README.info2xml for more information. Another new feature, is the ability to get top250 and bottom100 lists; see the "TOP250 / BOTTOM100 LISTS" section of the README.package file for more information. Since release 4.1 a DTD for the XML output is available (see imdbpyXY.dtd). Other important features are locale (i18n) support (see README.locale) and support for the new style of movie titles used by IMDb (now in the "The Title" style, and no more as "Title, The"). 
FEATURES ======== So far you can search for a movie with a given title, a person with a given name, a character you've seen in a movie or a company, and retrieve information for a given movie, person, character or company; the supported data access systems are 'http' (i.e.: the data are fetched through the IMDb's web server http://akas.imdb.com) and 'sql', meaning that the data are taken from a SQL database, populated (using the imdbpy2sql.py script) with data taken from the plain text data files; see http://www.imdb.com/interfaces/ for more information. For mobile systems there's the 'mobile' data access system, useful for PDA, hand-held devices and smart phones. Another data access system is 'httpThin', which is equal to 'http' but fetch less data and so it is (or at least it tries to be) suitable for systems with limited bandwidth but normal CPU power. FEATURES OF THE HTTP DATA ACCESS SYSTEM ======================================= * Returns almost every available information about a movie, person or character. * The use of the "akas" server will provide access to a lot of AKA titles in many languages, so it's really useful if English is not your native language. * By default includes adult titles (and people who have worked only/mostly in adult movies) in the results of a title/name search; this behavior can be changed with the do_adult_search() method; please read the "README.adult" file. * You can set/use a proxy to access the web; if set, the HTTP_PROXY environment variable will be automatically used, otherwise you can set a proxy with the set_proxy() method of the class returned by the imdb.IMDb function; obviously this method is available only for the http data access system, since it's defined in the IMDbHTTPAccessSystem class of the parser.http package. Example: from imdb import IMDb i = IMDb(accessSystem='http') # the accessSystem argument is not really # needed, since "http" is the default. 
i.set_proxy('http://localhost:8080/') You can force a direct connection to the net setting the proxy to a null value (i.e.: i.set_proxy('')). FEATURES OF THE SQL DATA ACCESS SYSTEM ====================================== * Returns every information available in the plain text data files. * Every database supported by SQLObject and SQLAlchemy is available. FEATURES OF THE MOBILE DATA ACCESS SYSTEM ========================================= * Very lightweight, returns almost every needed information. * Accessories data sets (like 'goofs', 'trivia' and so on) are always available (being a subclass of the 'http' data access system). IMDbPY-4.9/docs/CREDITS.txt0000644000000000000000000002067711766731642013727 0ustar rootroot CREDITS ======= See also CONTRIBUTORS.txt for a list of the most important developers who share the copyright on some portions of the code. First of all, I want to thank all the maintainers of the packages, listed on http://imdbpy.sf.net/?page=download#otherpkg, and especially Ana Guerrero. Another big thank to the developers who used IMDbPY for their projects and researches; they can be found here: http://imdbpy.sf.net/?page=programs Other very special thanks go to some people who followed very closely the development of IMDbPY, providing hints and insights: Ori Cohen, James Rubino, Tero Saarni and Jesper Noer (for a lot of help, and also for the wonderful http://bitbucket.org) Below, a list of persons who contributed with bug reports, small patches and hints (kept in a reverse order since IMDbPY 4.5): * John Lambert, Rick Summerhill and Maciej for reports and fixes for the search query. * Kaspars "Darklow" Sprogis for an impressive amount of tests and reports about bugs parsing the plain text data files and many new ideas. * Damien Stewart for many bug reports about the Windows environment. * Vincenzo Ampolo for a bug report about the new imdbIDs save/restore queries. * Tomáš Hnyk for the idea of an option to reraise caught exceptions. 
* Emmanuel Tabard for ideas, code and testing on restoring imdbIDs. * Fabian Roth for a bug report about the new style of episodes list. * Y. Josuin for a bug report on missing info in crazy credits file. * Arfrever Frehtes Taifersar Arahesis for a patch for locales. * Gustaf Nilsson for bug reports about BeautifulSoup. * Jernej Kos for patches to handle "in production" information and birth/death years. * Saravanan Thirumuruganathan for a bug report about genres in mobile. * Paul Koan, for a bug report about DVD pages and movie references. * Greg Walters for a report about a bug with queries with too many results. * Olav Kolbu for tests and report about how the IMDb.com servers reply to queries made with and without cookies. * Jef "ofthelit", for a patch for the reduce.sh script bug reports for Windows. * Reiner Herrmann for benchmarks using SSD hard drives. * Thomas Stewart for some tests and reports about a bug with charset in the plain text data files. * Ju-Hee Bae for an important series of bug reports about the problems derived by the last IMDb's redesign. * Luis Liras and Petite Abeille for a report and a bugfix about imdbpy2sql.py used with SQLite and SQLObject. * Kevin S. Anthony for a bug report about episodes list. * Bhupinder Singh for a bug report about exception handling in Python 2.4. * Ronald Hatcher for a bug report on the GAE environment. * Ramusus for a lot of precious bug reports. * Laurent Vergne for a hint about InnoDB, MyISAM and foreign keys. * Israel Fruch for patches to support the new set of parsers. * Inf3cted MonkeY, for a bug report about 'vote details'. * Alexmipego, for suggesting to add a md5sum to titles and names. * belgabortm for a bug report about movies with multiple 'countries'. * David Kaufman for an idea to make the 'update' method more robust. * Dustin Wyatt for a bug with SQLite of Python 2.6. * Julian Scheid for bug reports about garbage in the ptdf. * Adeodato Simó for a bug report about the new imdb.com layout. 
* Josh Harding for a bug report about the new imdb.com layout. * Xavier Naidoo for a bug report about top250 and BeautifulSoup. * Basil Shubin for hints about a new helper function. * Mark Jeffery, for some help debugging a lxml bug. * Hieu Nguyen for a bug report about fetching real imdbIDs. * Rdian06 for a patch for movies without plot authors. * Tero Saarni, for the series 60 GUI and a lot of testing and debugging. * Ana Guerrero, for maintaining the official debian package. * H. Turgut Uyar for a number of bug reports and a lot of work on the test-suite. * Ori Cohen for some code and various hints. * Jesper Nøhr for a lot of testing, especially on 'sql'. * James Rubino for many bug reports. * Cesare Lasorella for a bug report about newer versions of SQLObject. * Andre LeBlanc for a bug report about airing date of tv series episodes. * aow for a note about some misleading descriptions. * Sébastien Ragons for tests and reports. * Sridhar Ratnakumar for info about PKG-INF. * neonrush for a bug parsing Malcolm McDowell filmography! * Alen Ribic for some bug reports and hints. * Joachim Selke for some bug reports with SQLAlchemy and DB2 and a lot of testing and debugging of the ibm_db driver (plus a lot of hints about how to improve the imdbpy2sql.py script). * Karl Newman for bug reports about the installer of version 4.5. * Saruke Kun and Treas0n for bug reports about 'Forbidden' errors from the imdb.com server. * Chris Thompson for some bug reports about summary() methods. * Mike Castle for performace tests with SQLite and numerous hints. * Indy (indyx) for a bug about series cast parsing using BeautifulSoup. * Yoav Aviram for a bug report about tv mini-series. * Arjan Gijsberts for a bug report and patch for a problem with movies listed in the Bottom 100. * Helio MC Pereira for a bug report about unicode. * Michael Charclo for some bug reports performing 'http' queries. * Amit Belani for bug reports about plot outline and other changes. 
* Matt Warnock for some tests with MySQL. * Mark Armendariz for a bug report about too long field in MySQL db and some tests/analyses. * Alexy Khrabrov, for a report about a subtle bug in imdbpy2sql.py. * Clark Bassett for bug reports and fixes about the imdbpy2sql.py script and the cutils.c C module. * mumas for reporting a bug in summary methods. * Ken R. Garland for a bug report about 'cover url' and a lot of other hints. * Steven Ovits for hints and tests with Microsoft SQL Server, SQLExpress and preliminary work on supporting diff files. * Fredrik Arnell for tests and bug reports about the imdbpy2sql.py script. * Arnab for a bug report in the imdbpy2sql.py script. * Elefterios Stamatogiannakis for the hint about transactions and SQLite, to obtain an impressive improvement in performances. * Jon Sabo for a bug report about unicode and the imdbpy2sql.py script and some feedback. * Andrew Pendleton for a report about a very hideous bug in the imdbpy2sql.py (garbage in the plain text data files + programming errors + utf8 strings + postgres). * Ataru Moroboshi ;-) for a bug report about role/duty and notes. * Ivan Kedrin for a bug report about the analyze_title function. * Hadley Rich for reporting bugs and providing patches for troubles parsing tv series' episodes and searching for tv series' titles. * Jamie R. Rytlewski for a suggestion about saving imbIDs in 'sql'. * Vincent Crevot, for a bug report about unicode support. * Jay Klein for a bug report and testing to fix a nasty bug in the imdbpy2sql.py script (splitting too large data sets). * Ivan Garcia for an important bug report about the use of IMDbPY within wxPython programs. * Kessia Pinheiro for a bug report about tv series list of episodes. * Michael G. Noll for a bug report and a patch to fix a bug retrieving 'plot keywords'. * Alain Michel, for a bug report about search_*.py and get_*.py scripts. 
* Martin Arpon and Andreas Schoenle for bug reports (and patches) about "runtime", "aka titles" and "production notes" information not being parsed. * none none (dclist at gmail.com) for a useful hint and code to retrieve a movie/person object, given an URL. * Sebastian Pölsterl, for a bug report about the cover url for tv (mini) series, and another one about search_* methods. * Martin Kirst for many hints and the work on the imdbpyweb program. * Julian Mayer, for a bug report and a patch about non-ascii chars. * Wim Schut and "eccentric", for bug reports and a patches about movies' cover url. * Alfio Ferrara, for a bug report about the get_first_movie.py script. * Magnus Lie Hetland for an hint about the searches in sql package. * Thomas Jadjewski for a bug report about the imdbpy2sql.py script. * Trevor MacPhail, for a bug report about search_* methods and the ParserBase.parse method. * Guillaume Wisniewski, for a bug report. * Kent Johnson, for a bug report. * Andras Bali, for the hint about the "plot outline" information. * Nick S. Novikov, who provided the Windows installer until I've managed to set up a Windows development environment. * Simone Bacciglieri, who downloaded the plain text data files for me. * Carmine Noviello, for some design hints. * "Basilius" for a bug report. * Davide for a bug report. IMDbPY-4.9/docs/README.local0000644000000000000000000000121211766731642014022 0ustar rootroot LOCAL INSTALLATION ================== Simple instruction: switch to 'sql' (see the README.sqldb file). The 'local' data access system was removed since IMDbPY 4.2, for a series of good reasons: - the moviedb program was no more distributed by IMDb. - the new format for movie titles ("The Title" instead of "Title, The") created way too many problems almost impossible to fix, since a lot of damage was done by the - never updated - moviedb program. - it was slower and less complete than 'sql'. - there were very few users of it. 
If you are veeery willing to resuscitate it, you can write in the mailing list about your crazy idea. :-) IMDbPY-4.9/docs/README.users0000644000000000000000000001033311766731642014075 0ustar rootroot IMDbPY FOR THE USERS ==================== As I've already said, IMDbPY by itself is not really useful if you're not a developer; anyway, some simple example scripts are included in the ./bin directory. You can find other IMDbPY-based programs here: http://imdbpy.sf.net/?page=programs The 'search_movie.py' script takes a single argument, which must be a title of a movie to search; if the title contains spaces or other "strange" chars, enclose the title in single or double quotes. It will print a list of imdbID and long imdb titles. The 'get_movie.py' script takes a single argument: a movieID and it will print some information about the movie. The movieID is a unique identifier for a single movie. Notice that, since IMDbPY can take its information from different sources, the movieID can take different shapes. By default IMDbPY accesses the web using its 'http' data access system, and so in this case movieID are the same imdbIDs that you can find on the web page for that movie (the same is true using the 'mobile' data access system). On the other hand, if you've configured IMDbPY to use the 'sql' data access system, the movieIDs are still unique, but totally arbitrary and not related to the imdbIDs used by the web server; this, because the list of imdbIDs is not published with the plain text data files. Also notice that these movieIDs generated by 'sql', are only valid for your current setup, and will change when you'll update your database with a new set of plain text data files. The 'get_first_movie.py' works like 'search_movie.py', but it will only print information about the first matching title. 
Now guess what 'get_person.py', 'search_person.py','get_first_person.py', 'get_character.py', 'search_character.py','get_first_character.py', 'get_company.py', 'search_company.py', 'get_first_company.py', 'search_keyword.py', 'get_keyword.py' and 'get_top_bottom_movies.py' scripts do... :-) Remember that you've to be connected to the net, if you want to use these scripts! If the HTTP_PROXY environment variable is set, the referred proxy is used. Take a look at the scripts; they're somewhat commented; maybe you can customize it... Examples: $ search_movie.py 'the passion' 20 results for "the passion": Passion of the Christ, The (2004) Passion, The (2003/I) Passion, The (1999) (TV) Patima (1975) Andrei Rublyov (1969) Passion de Jeanne d'Arc, La (1928) Passion of Darkly Noon, The (1996) Passion of Ayn Rand, The (1999) Passion Béatrice, La (1987) Passion, En (1969) Pride and the Passion, The (1957) "Charles II: The Power & the Passion" (2003) (mini) Pasión de María Elena, La (2003) Pasión según Berenice, La (1976) Passion of Rita Camilleri, The (1993) Culture, Water, Money: The Passion of the Frontier (1998) Sanguisuga conduce la danza, La (1975) Passion of John Ruskin, The (1994) Making of 'The Passion of the Christ', The (2004) (TV) Scream of the Butterfly (1965) # Here we assumes you've not configured IMDbPY to use a local SQL database, # and so we use '0133093' as the movieID. $ get_movie.py 0133093 Movie ===== Title: Matrix, The Genres: Action, Thriller, Sci-Fi. Director: Andy Wachowski (as The Wachowski Brothers), Larry Wachowski (as The Wachowski Brothers). Writer: Andy Wachowski (written by) (as The Wachowski Brothers), Larry Wachowski (written by) (as The Wachowski Brothers). Cast: Keanu Reeves (Neo (Thomas A. Anderson)), Laurence Fishburne (Morpheus), Carrie-Anne Moss (Trinity), Hugo Weaving (Agent Smith), Joe Pantoliano (Cypher (Mr. Reagan)). Runtime: 136. Country: USA. Language: English. 
Rating: 8.5 Votes: 114,264 Plot: In the near future, a computer hacker named Neo (Keanu Reeves) discovers that all life on Earth may be nothing more than an elaborate facade created by a malevolent cyber-intelligence, for the purpose of placating us while our life essence is "farmed" to fuel the Matrix's campaign of domination in the "real" world. He joins like-minded Rebel warriors Morpheus (Laurence Fishburne) and Trinity (Carrie Ann Moss) in their struggle to overthrow the Matrix. $ get_first_character.py 'Jesse James' Best match for "Jesse James" Character ===== Name: Jesse James Biography: History::Born: September 5, 1847 in Clay County, Missouri, USA Died: April 3, 1882 in St. Joseph, Missouri, USA [...] IMDbPY-4.9/docs/README.package0000644000000000000000000005710111766731642014333 0ustar rootroot IMDbPY package ============== Here you can find information useful to use IMDbPY to write your own scripts or programs. These information are far from complete: the code is the final documentation! ;-) Sections in this file: * GENERAL USAGE * THE Movie CLASS * THE Person CLASS * THE Character CLASS * THE Company CLASS * INFORMATION SETS * Person OBJECTS INSIDE A Movie CLASS AND Movie OBJECTS INSIDE A Person OBJECT * Company OBJECTS INSIDE A Movie CLASS AND Movie OBJECTS INSIDE A Company OBJECT * THE (NOT-SO-)"UNIVERSAL" '::' SEPARATOR * MOVIE TITLES AND PERSON/CHARACTER NAMES REFERENCES * EXCEPTIONS * OTHER SOURCES OF INFO UNICODE NOTICE ============== Since release 2.4, IMDbPY internally manages every information about movies and people using unicode strings. Please read the README.utf8 file. GENERAL USAGE ============= To use the IMDbPY package, you've to import the imdb package and call the IMDb function. 
The basic invocation is:
NOTE ON THE 'DEFAULT' ACCESS SYSTEM: since release 3.4, the 'imdbpy.cfg' configuration file is available, so that you can set a system-wide (or user-wide) default. The file is commented with indication of the location where it can be put, and how to modify it. Obviously, if no imdbpy.cfg file is found (or is not readable or it can't be parsed), 'http' is still considered the default. The imdb_access object has ten main methods: search_movie(title), get_movie(movieID), search_person(name), get_person(personID), search_character(name), get_character(characterID), search_company(name), get_company(companyID), search_episode() and update(MovieOrPersonObject) Methods description: search_movie(title) searches for the given title, and returns a list of Movie objects containing only basic information like the movie title and year, and with a "movieID" instance variable: - movieID is an identifier of some kind; for the sake of simplicity you can think of it as the ID used by the IMDb's web server used to univocally identify a movie (e.g.: '0094226' for Brian De Palma's "The Untouchables"), but keep in mind that it's not necessary the same ID!!! For some implementations of the "data access system" these two IDs can be the same (and this is the case of the 'http' data access system), but other "access systems" can use a totally different kind of movieID. The easier (I hope!) way to understand this is to think at the movieID of a Movie returned by the search_movie() method as the _thing_ you've to pass to the get_movie() method, so that it can retrieve info about the referred movie. So, movieID _can_ be the imdbID ('0094226') if you're accessing the web server, but with a sql installation of the IMDb database, movieID will be an integer, as read from the id columns in the database. 
search_episode(title) is identical to search_movie(), except that its tailored to search for episodes' titles; best results are expected searching for just the title of the episode, _without_ the title of the TV series. get_movie(movieID) will fetch the needed data and return a Movie object for the movie referenced by the given movieID; the Movie class can be found in the Movie module; a Movie object presents basically the same interface of a Python's dictionary, so you can access, for example, the list of actors and actress using the syntax: movieObject['cast'] The search_person(name), get_person(personID) search_character(name) get_character(characterID), search_company(name) and get_company(companyID) methods work the same way as search_movie(title) and get_movie(movieID). The search_keyword(string) method returns a list of unicode string that are valid keywords, similar to the one given. The get_keyword(keyword) method returns a list of Movie instances that are tagged with the given keyword. For more information see README.keywords. The get_imdbMovieID(movieID), get_imdbPersonID(personID), get_imdbCharacterID(characterID) and get_imdbCompanyID(companyID) take, respectively, a movieID, a personID, a movieID and a companyID and return the relative imdbID; it's safer to use the get_imdbID(MovieOrPersonOrCharacterOrCompanyObject) method. The title2imdbID(title), name2imdbID(name), character2imdbID(name) and company2imdbID(name) take, respectively, a movie title (in the plain text data files format), a person name, a character name and a company name, and return the relative imdbID; when possibile it's safer to use the get_imdbID(MovieOrPersonOrCharacterOrCompanyObject) method. These functions _always_ need to connect to the IMDb's web site, if you're not using 'http', 'httpThin' or 'mobile' data acess systems. The get_imdbID(MovieOrPersonOrCharacterOrCompanyObject) method returns the imdbID for the given Movie, Person, Character or Company object. 
The get_imdbURL(MovieOrPersonOrCharacterOrCompanyObject) method returns a string with the main IMDb URL for the given Movie, Person, Character or Company object; it tries to do its best to retrieve the URL. The update(MovieOrPersonOrCharacterOrCompanyObject) method takes an instance of a Movie, Person, Character or Company class and retrieve other available information. Remember that the search_*(txt) methods will return a list of Movie, Person, Character or Company objects with only basic information, like the movie title or the person/character name, so update() can be used to retrieve every other information. By default a "reasonable" set of information are retrieved ('main', 'filmography' and 'biography' for a Person/Character objects, 'main' and 'plot' for a Movie object and 'main' for Company objects). Example: i = IMDb() # movie_list is a list of Movie objects, with only attributes like 'title' # and 'year' defined. movie_list = i.search_movie('the passion') # the first movie in the list. first_match = movie_list[0] # only basic information like the title will be printed. print first_match.summary() # update the information for this movie. i.update(first_match) # a lot of information will be printed! print first_match.summary() # retrieve trivia information and print it. i.update(first_match, 'trivia') print m['trivia'] # retrieve both 'quotes' and 'goofs' information (with a list or tuple) i.update(m, ['quotes', 'goofs']) print m['quotes'] print m['goofs'] # retrieve every available information. i.update(m, 'all') THE Movie CLASS =============== The main use of a Movie object is to access to the info it contains with a dictionary-like interface, like "movieObject[key]" where 'key' is a string that identifies the information you want to get. I've a really bad news for you: at this time, what 'key' is, is a little unclear! In general, it's the name of the section as used by the IMDb web server to show the data. 
Where the information is a list of people with a role (an actor, a stunt, a writer, etc.) the relative section in the HTML page starts with a link to a "/Glossary/X#SectName" page; here "sectname" is used as 'key'. When the info regard companies (distributors, special effects, etc.) or the movie itself (sound mix, certifications, etc.) the section in the HTML page begins with a link to a "/List?SectName=" page, so we use "sectname" as a 'key'. The section name (the key) is always (with some minor exceptions) lowercase; underscores and minus signs are replaced with spaces. Some other keys aren't taken from the HTML page, but are defined within the Movie class. To get the complete list of keys available for a given Movie object, you can use the movieObject.keys() method (obviously only keys that refer to some existing information are defined, so a movie without an art director will raise a KeyError exception is you try movieObject['art director']); to avoid the exception, you can test if a Movie object has a given key with the has_key(key) method, or get the value with the get(key) method, which returns the value or None if the key is not found (an optional paramenter can modify the default value returned if the key isn't found). Below, a list of the main keys you can encounter, the type of the value returned by movieObject[key] and a short description/example: title; string; the "usual" title of the movie, like "The Untouchables". long imdb title; string; "Uncommon Valor (1983/II) (TV)" canonical title; string; the title in the canonical format, like "Untouchables, The". long imdb canonical title; string; "Patriot, The (2000)". year; string; the year of release or '????' if unknown. kind; string; one in ('movie', 'tv series', 'tv mini series', 'video game', 'video movie', 'tv movie', 'episode') imdbIndex; string; the roman number for movies with the same title/year. 
director; Person list; a list of director's name (e.g.: ['Brian De Palma']) cast; Person list; list of actor/actress, with the currentRole instance variable set to a Character object which describe his role/duty. cover url; string; the link to the image of the poster. writer; Person list; list of writers ['Oscar Fraley (novel)'] plot; list; list of plots and authors of the plot. rating; string; user rating on IMDb from 1 to 10 (e.g. '7.8') votes; string; number of votes (e.g. '24,101') runtimes; string list; in minutes ['119'] or something like ['USA:118', 'UK:116'] number of episodes; int; number or episodes for a series. color info; string list; ["Color (Technicolor)"] countries; string list; production's country ['USA', 'Italy'] genres; string list; one or more in (Action, Adventure, Adult, Animation, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Short, Thriller, War, Western) and other genres defined by IMDb. akas; string list; list of aka for this movie languages; string list; list of languages certificates; string list; ['UK:15', 'USA:R'] mpaa; string; the mpaa rating episodes (series only); dictionary of dictionary; one key for every season, one key for every episode in the season. number of episodes (series only); int; total number of episodes. number of seasons (series only); int; total number of seasons. series years (series only); string; range of years when the series was produced. episode of (episode only); Movie object; the parent series for an episode. season (episode only); int; the season number. episode (episode only); int; the number of the episode in the season. long imdb episode title (episode only); string; episode and series title. series title; string. canonical series title; string. 
Other keys that contain a list of Person objects are: costume designer, sound crew, crewmembers, editor, production manager, visual effects, assistant director, art department, composer, art director, cinematographer, make up, stunt performer, producer, set decorator, production designer. Other keys that contain list of companies are: production companies, special effects, sound mix, special effects companies, miscellaneous companies, distributors. Converting a title to its 'Title, The' canonical format, IMDbPY does some assumptions about what is an article and what not, and this could lead to some wrong canonical titles. For more information on this subject, see the "ARTICLES IN TITLES" section of the README.locale file. THE Person CLASS ================ It works mostly like the Movie class. :-) The Movie class defines a __contains__() method, which is used to check if a given person has worked in a given movie with the syntax: if personObject in movieObject: print '%s worked in %s' % (personObject['name'], movieObject['title']) The Person class defines a isSamePerson(otherPersonObject) method, useful to compare two person if you're not sure that both objects have retrieved complete information (e.g.: a Person object returned by a query); th syntax is: if personObject.isSamePerson(otherPersonObject): print 'they are the same person!' An analogous method is defined for the Movie class, and it's called isSameTitle(otherMovieObject) THE Character CLASS =================== It works mostly like the Person class. :-) For more information about the "currentRole" attribute, see the README.currentRole file. THE Company CLASS ================ It works mostly like the Person class. :-) The "currentRole" attribute is always None. 
INFORMATION SETS ================ Since release 1.2, it's possible to retrieve almost every piece of information about a given movie or person; this can be a problem, because (at least for the 'http' data access system) it means that a lot of web pages must be fetched and parsed, and this can be time and bandwidth consuming, especially if you're interested only in a small set of information. Now the get_person, get_movie, get_character, get_company and update methods have an optional 'info' argument, which can be set to a list of strings, each one representing an "information set". Movie/Person/Character/Company objects have, respectively, their own list of available "information sets". E.g.: the Movie class has a set called 'taglines' for the taglines of the movie, a set called 'vote details' for the number of votes for rating [1-10], demographic breakdowns and top 250 rank; the Person class has a set called 'other works' for miscellaneous works of this person and so on. By default only important information is retrieved/updated (i.e.: for a Movie object, only the 'main' and 'plot' information sets; for a Person/Character object only 'main', 'filmography', 'biography'). Example: i = imdb.IMDb(accessSystem='http') m = i.get_movie('0133093') # only default info sets are retrieved. m.has_key('demographic') # returns false, since demographic breakdowns # aren't available by default. i.update(m, info=('vote details',)) # retrieve the vote details info set. print m['demographic'] # print the demographic breakdowns. Another example: i = imdb.IMDb(accessSystem='http') # retrieve only the biography and the "other works" page: p = i.get_person('0000154', info=['biography', 'other works']) print p['salary'] print p['other works'] To see which information sets are available and what are the defaults, see the all_info and default_info instance variable of Movie, Person and Character classes. 
Each object instance of Movie, Person or Character, also has a current_info instance variable, to remember the information sets already retrieved. Beware that the information sets vary from one access system to another: not all data is accessible locally, while - for example for sql - accessing one set of data automatically means automatic access to a number of other unrelated information (without major performance drawbacks). You can get the list of available info sets with the methods: i.get_movie_infoset(), i.get_person_infoset(), i.get_character_infoset() and i.get_company_infoset(). TOP250 / BOTTOM100 LISTS ======================== Since IMDbPY 4.0, it's possible to retrieve the list of top250 and bottom100 movies. Use the get_top250_movies() and get_bottom100_movies() methods. Beware that, for 'sql', the bottom100 list is limited to the first 10 results. Person OBJECTS INSIDE A Movie CLASS AND Movie OBJECTS INSIDE A Person OBJECT ============================================================================ Parsing the information about a movie, you'll encounter a lot of references to the people who worked on it, like the cast, the director, the stunts, and so on. For people in the cast (actors/actresses), the "currentRole" instance variable is set to the name of the character they played (e.g.: "Roy Neary" for the role played by Richard Dreyfuss in Close Encounters of the Third Kind). In fact, in this case currentRole will be a Character instance. Another instance variable of a Person object is "notes", used to store miscellaneous information (like an aka name for the actor, an "uncredited" notice and so on). It's also used, for non-cast people, to describe the specific task of the person (e.g.: "assistant dialogue staff" for a person of the sound department). 
It's possible to test, with the Python "in" statement, if a person worked in a given movie, or vice-versa; the following are all valid tests: movie in person movie in character person in movie person in character character in movie character in person Considerations similar to the above ones, can be done for Character instances: please read the README.currentRole file for more information. E.g.: # retrieve data for Steven Spielberg's "Close Encounters of the Third Kind" import imdb i = imdb.IMDb(accessSystem='http') movie = i.get_movie('0075860') # Get the 7th Person object in the cast list cast = movie['cast'][6] # Will print "Warren J. Kemmerling" print cast['name'] # Will print "Wild Bill" print cast.currentRole # Will print "(as Warren Kemmerling)" print cast.notes # Get the 5th Person object in the list of writers writer = movie['writer'][4] # Will print "Steven Spielberg" print writer['name'] # Will print "written by", because that was duty of Steven Spielberg, # as a writer for the movie. print writer.notes Obviously these Person objects contain only information directly available parsing the movie pages (e.g.: the name, an imdbID, the role/duty), so if now you: print writer['actor'] to get a list of movies acted by Mel Gibson, you'll get a KeyError exception, because the Person object doesn't contain this kind of information. To gather every available information, you've to use the update() method of the IMDb class: i.update(writer) # Now it will print a list of Movie objects. print writer['actor'] The same is true parsing a person data: you'll find a list of movie he/she worked on and, for every movie, the currentRole instance variable is set to a string describing the role/duty of the considered person. 
E.g.: # Julia Roberts julia = i.get_person('0000210') # Print a list of movies she acted in and the played role, separated # by '::' print [movie['title'] + '::' + movie.currentRole for movie in julia['actress']] Here the various Movie objects only contain minimal information, like the title and the year; the latest movie with Julia Roberts: last = julia['actress'][0] # Retrieve full information i.update(last) # Print the name of the first director print last['director'][0]['name'] Company OBJECTS INSIDE A Movie CLASS AND Movie OBJECTS INSIDE A Company OBJECT ============================================================================== As for Person/Character and Movie objects, you can test - using the "in" operator - if a Company has worked on a given Movie. THE (NOT-SO-)"UNIVERSAL" '::' SEPARATOR ======================================= Sometimes I've used '::' to separate a set of different information inside a string, like the name of a company and what it has done for the movie, the information in the "Also Known As" section, and so on. It's easier to understand if you look at it; look at the output of: import imdb i = imdb.IMDb() m = i.get_movie('0094226') print m['akas'] As a rule, there's as much as one '::' separator inside a string, splitting it two logical pieces: "TEXT::NOTE". In the helpers module there's the makeTextNotes function, that can be used to create a custom function to pretty-print this kind of information. See its documentation for more info. MOVIE TITLES AND PERSON/CHARACTER NAMES REFERENCES ================================================== Sometimes in Movie, Person and Character attributes, there're strings with references to other movies or persons (e.g.: in the plot, in the biography, etc.). 
These references are stored in the Movie, Person and Character instances; in the strings you'll find values like _A Movie (2003)_ (qv) or 'A Person' (qv) or '#A Character# (qv)'; accessing these strings (like movie['plot'] or person['biography']), these strings are modified using a provided function, which must take, as arguments, the string and two dictionaries with titles and names references; by default the (qv) strings are converted to the "normal" format ("A Movie (2003)", "A Person" and "A Character"). You can find some examples of these functions in the imdb.utils module. The function used to modify the strings can be set with the defaultModFunct parameter of the IMDb class or with the modFunct parameter of the get_movie, get_person and get_character methods. E.g.: import imdb i = imdb.IMDb(defaultModFunct=imdb.utils.modHtmlLinks) Or: import imdb i = imdb.IMDb() i.get_person('0000154', modFunct=imdb.utils.modHtmlLinks) EXCEPTIONS ========== The imdb._exceptions module contains the exceptions raised by the imdb package. Every exception is a subclass of IMDbError, which is available from the imdb package. You can catch any type of errors raised by the IMDbPY package with something like: from imdb import IMDb, IMDbError try: i = IMDb() except IMDbError, err: print err try: results = i.search_person('Mel Gibson') except IMDbError, err: print err try: movie = i.get_movie('0335345') except IMDbError, err: print err OTHER SOURCES OF INFO ===================== Once the IMDbPY package is installed, you can read the docstring for packages, modules, functions, classes, objects, methods using the pydoc program; e.g.: "pydoc imdb.IMDb" will show the documentation about the imdb.IMDb class. The code contains a lot of comments, try reading it, if you can understand my English! 
IMDbPY-4.9/docs/README.sqldb0000644000000000000000000003740211766731642014047 0ustar rootroot NOTE: the imdbpy2sql.py script, used to populate a database using the data in the IMDb's plain text data files, is a critical piece of IMDbPY: it's based on an ORM to be database-independent and contains a lot of tricks to be as fast as possible; however there are huge margins for improvements; if you want to help, please read the TODO.txt file and subscribe the imdbpy-devel mailing list at: http://imdbpy.sf.net/?page=help#ml NOTE: see README.currentRole for information about characters support. SQL === Since version 2.1 it's possible to transfer the whole IMDb's database from the plain text data files into a SQL database. Starting with version 2.5 every database supported by the SQLObject Object Relational Manager can be used to store and retrieve movies and persons information. This means that MySQL, PostgreSQL, SQLite, Firebird, MAX DB, Sybase and MSSQL are supported and, as your read this text, maybe other database backends were added. Since release 3.8, SQLAlchemy (version 0.4 and 0.5) is also supported (this adds at least DB2/Informix IDS to the list of supported databases). Since release 3.9, there's a partial support to output large tables in a set of CSV (Comma Separated Values) files, to be later imported in a database. Actually only MySQL, PostgreSQL and IBM DB2 are supported. In version 4.1 the imdbpy2sql.py script has the '--fix-old-style-titles' command line argument; if used, every movie title will be converted to the new style ("The Title", instead of the old "Title, The"). This option will go away in 4.2, and is intended only to support old set of plain text data files. Since version 4.2 --fix-old-style-titles is no more needed, being turned on by default. The --do-not-fix-old-style-titles was introduced in case you want to turn it off for some strange reason. 
REQUIREMENTS ============ You need one of SQLObject or SQLAlchemy (both can be installed safely: by default IMDbPY first tries SQLObject; if not present it fall-backs to SQLAlchemy). [SQLObject] You need the SQLObject package, at least version 0.8; even better if you can download the latest SVN snapshot. SQLObject home page: http://sqlobject.org/ SVN command to download the latest development version: svn co http://svn.colorstudy.com/SQLObject/trunk SQLObject [SQLAlchemy] Support for SQLAlchemy is still in beta (please report any bug!) and a bit slower than SQLObject; anyway, you need version 0.4 or 0.5. SQLAlchemy home page: http://www.sqlalchemy.org/ SVN command to download the latest development version: svn checkout http://svn.sqlalchemy.org/sqlalchemy/trunk sqlalchemy [OTHER REQUIRED MODULES] Obviously SQLObject and SQLAlchemy can access databases only through other specific modules/packages, that you need to have installed (e.g.: 'mysql-python' for MySQL, 'psycopg' for PostgreSQL, and so on). SQL DATABASE INSTALLATION ========================= Select a mirror of the "The Plain Text Data Files" from the http://www.imdb.com/interfaces/ page and download every file in the main directory (beware that the "diffs" subdirectory contains _a lot_ of files you _don't_ need, so don't start mirroring everything!). Starting from release 2.4, you can just download the files you need, instead of every single file; the files not downloaded will be skipped. This feature is still quite untested, so please report any bug. 
Create a database named "imdb" (or whatever you like), using the tool provided by your database; as an example, for MySQL you will use the 'mysqladmin' command: # mysqladmin -p create imdb For PostgreSQL, you have to use the "createdb" command: # createdb -W imdb To create the tables and to populate the database, you must run the imdbpy2sql.py script: # imdbpy2sql.py -d /dir/with/plainTextDataFiles/ -u 'URI' Where the 'URI' argument is a string representing the connection to your database, with the schema: scheme://[user[:password]@]host[:port]/database[?parameters] Where 'scheme' is one of "sqlite", "mysql", "postgres", "firebird", "interbase", "maxdb", "sapdb", "mssql", "sybase", "ibm_db_sa". Some examples: mysql://user:password@host/database postgres://user:password@host/database mysql://host/database?debug=1 postgres:///full/path/to/socket/database postgres://host:5432/database sqlite:///full/path/to/database sqlite:/C|/full/path/to/database sqlite:/:memory: For other information you can read the SQLObject/SQLAlchemy documentation. You can force the use of SQLObject or SQLAlchemy with the '-o' command line option (i.e.: "-o sqlobject" or "-o sqlalchemy" or a list of comma separated values to specify an order of preference). TIMING ====== Performance is hugely dependent upon the underlying Python module/package used to access the database. The imdbpy2sql.py script has a number of command line arguments, useful to choose amongst presets that can improve performances, using specific database servers. The fastest database appears to be MySQL, with about 200 minutes to complete on my test system (read below). A lot of memory (RAM or swap space) is required, in the range of at least 250/500 megabytes (plus more for the database server). In the end, the database will require between 2.5GB and 5GB of disk space. There should be no difference - at insert time - between SQLObject and SQLAlchemy. 
As said, the performances varies greatly using a database server or another: MySQL, for instance, has an executemany() method of the cursor object that accept multiple data insertion with a single SQL statement; other database requires a call to the execute() method for every single row of data, and they will be much slower - from 2 to 7 times slower than MySQL. There are generic suggestions that can lead to better performances, like turning off your filesystem journaling (so it can be a good idea to remount an ext3 filesystem as ext2). Another option is the use of a ramdisk/tmpfs, if you have enough RAM. Obviously these have effect only at insert-time: during the day-to-day use, you can turn your journaling on again. You can also consider the use of the CSV output, explained below (but be sure that your database server of choice is able to import CSV files). I've done some tests, using an AMD Athlon 1800+, 1GB of RAM, over a complete plain text data files set (as of 11 Apr 2008, with more than 1.200.000 titles and over 2.200.000 names): database | time in minutes: total (insert data/create indexes) ----------------------+----------------------------------------------------- MySQL 5.0 MyISAM | 205 (160/45) MySQL 5.0 InnoDB | _untested_, see NOTES below. PostgreSQL 8.1 | 560 (530/30) SQLite 3.3 | ??? (150/???) - very slow building indexes. | Timed with the "--sqlite-transactions" command | line option; otherwise it's _really_ slow: even | 35 hours or more. SQLite 3.7 | 65/13 - with --sqlite-transactions and using a SSD hard disk SQL Server | about 3 or 4 hours. If you have different experiences, please tell me! As expected, the most important things that you can do to improve performances are: 1. use an in-memory filesystem or an SSD disk. 2. use the -c /path/to/empty/dir argument to use CSV files. 3. follow the specific notes about your database server. 
NOTES ===== [save the output] The imdbpy2sql.py will print a lot of debug information on standard output; you can save it in a file, appending (without quotes) "2>&1 | tee output.txt" [Microsoft Windows paths] It's much safer, in a Microsoft Windows environment, to use full paths for the values of the '-c' and '-d' arguments, complete with drive letter. The best thing is to use _UNIX_ path separator, and to add a leading separator. E.g.: -d C:/path/to/imdb_files/ -c C:/path/to/csv_tmp_files/ [MySQL] In general, if you get an embarrassingly high number of "TOO MANY DATA ... SPLITTING" lines, consider increasing max_allowed_packet (in the configuration of your MySQL server) to at least 8M or 16M. Otherwise, inserting the data will be very slow, and some data may be lost. [MySQL InnoDB and MyISAM] InnoDB is abysmally slow for our purposes: my suggestion is to always use MyISAM tables and - if you really want to use InnoDB - convert the tables later. The imdbpy2sql.py script provides a simple way to manage these cases, see ADVANCED FEATURES below. In my opinion, the cleaner thing to do is to set the server to use MyISAM tables or - if you can't modify the server - use the --mysql-force-myisam command line option of imdbpy2sql.py. Anyway, if you really need to use InnoDB, in the server-side settings I recommend to set innodb_file_per_table to "true". Beware that the conversion will be extremely slow (some hours), but still faster than using InnoDB from the beginning. You can use the "--mysql-innodb" command line option to force the creation of a database with MyISAM tables, converted at the end into InnoDB. [Microsoft SQL Server/SQLExpress] If you get an error about how wrong and against nature is the blasphemous act of inserting identity keys, you can try to fix it with the new custom queries support; see ADVANCED FEATURES below. As a shortcut, you can use the "--ms-sqlserver" command line option to set all the needed options. 
You probably need SQLObject 0.10 (in the svn repository, as I'm writing this). [SQLite speed-up] For some reason, SQLite is really slow, except when used with transactions; you can use the '--sqlite-transactions' command line option to obtain acceptable performances. The same command, also turns off "PRAGMA synchronous". SQLite seems to hugely benefit from the use of a non-journaling filesystem and/or of a ramdisk/tmpfs: see the generic suggestions discussed above in the TIMING section. [SQLite failure] It seems that, with older versions of the python-sqlite package, the first run may fail; if you get a DatabaseError exception saying "no such table", try running again the command with the same arguments. Double funny, uh? ;-) [data truncated] If you get an insane amount (hundreds or thousands, on various text columns) of warnings like these lines: imdbpy2sql.py:727: Warning: Data truncated for column 'person_role' at row 4979 CURS.executemany(self.sqlString, self.converter(self.values())) you probably have a problem with the configuration of your database. The error comes from strings that get cut at the first non-ASCII char (and so you're losing a lot of information). To work around this problem, you must be sure that your database server is set up properly, with the library/client configured to communicate with the server in a consistent way. E.g., for MySQL you can set: character-set-server = utf8 default-collation = utf8_unicode_ci default-character-set = utf8 or even: character-set-server = latin1 default-collation = latin1_bin default-character-set = latin1 [adult titles] Beware that, while running, the imdbpy2sql.py script will output a lot of strings containing both person names and movie titles. The script has absolutely no way to know that the processed title is an adult-only movie, so... if you leave it running and your little daughter runs to you screaming 'daddy! daddy! 
what kind of animals Rocco trains in the documentary "Rocco: Animal Trainer 17"???'... well it's not my fault! ;-) SQL USAGE ========= Now you can use IMDbPY with the database: from imdb import IMDb i = IMDb('sql', uri='YOUR_URI_STRING') resList = i.search_movie('the incredibles') for x in resList: print x ti = resList[0] i.update(ti) print ti['director'][0] and so on... The 'sql' data access system takes an optional argument, named "useORM", which can be set to a string or a list of values (the string can be a comma-separated list of items, to denote an order of preference). Valid values are "sqlobject" and "sqlalchemy". The default is ('sqlobject', 'sqlalchemy'). E.g.: i = IMDb('sql', uri='YOUR_URI_STRING', useORM='sqlalchemy,sqlobject') i = IMDb('sql', uri='YOUR_URI_STRING', useORM=['sqlalchemy', 'sqlobject']) i = IMDb('sql', uri='YOUR_URI_STRING', useORM='sqlalchemy') ADVANCED FEATURES ================= With the -e (or --execute) command line argument you can specify custom queries to be executed at certain times, with the syntax: -e "TIME:[OPTIONAL_MODIFIER:]QUERY" Where TIME is actually one of these: 'BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE', 'AFTER_CREATE', 'BEFORE_MOVIES', 'BEFORE_CAST', 'BEFORE_RESTORE', 'BEFORE_INDEXES' and 'END'. The only available OPTIONAL_MODIFIER is 'FOR_EVERY_TABLE' and it means that the QUERY command will be executed for every table in the database (so it doesn't make much sense to use it with BEGIN, BEFORE_DROP or BEFORE_CREATE time...), replacing the "%(table)s" text in the QUERY with the appropriate table name. Other available TIMEs are: 'BEFORE_MOVIES_TODB', 'AFTER_MOVIES_TODB', 'BEFORE_PERSONS_TODB', 'AFTER_PERSONS_TODB', 'BEFORE_CHARACTERS_TODB', 'AFTER_CHARACTERS_TODB', 'BEFORE_SQLDATA_TODB', 'AFTER_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB' and 'AFTER_AKAMOVIES_TODB'; they take no modifiers. Special TIMEs 'BEFORE_EVERY_TODB' and 'AFTER_EVERY_TODB' apply to every BEFORE_* and AFTER_* TIME above mentioned. 
These commands are executed before and after every _toDB() call in their respective objects (CACHE_MID, CACHE_PID and SQLData instances); the "%(table)s" text in the QUERY is replaced as above. You can specify as many -e arguments as you need, even if they refer to the same TIME: they will be executed from the first to the last. Also, always remember to correctly escape queries: after all you're passing it on the command line! E.g. (ok, quite a silly example...): -e "AFTER_CREATE:SELECT * FROM title;" The most useful case is when you want to convert the tables of a MySQL database from MyISAM to InnoDB: -e "END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;" If your system uses InnoDB by default, you can trick it with: -e "AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;" -e "END:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;" You can use the "--mysql-innodb" command line option as a shortcut of the above command. Cool, uh? Another possible use is to fix a problem with Microsoft SQLServer/SQLExpress: to prevent errors setting IDENTITY fields, you can run something like this: -e 'BEFORE_EVERY_TODB:SET IDENTITY_INSERT %(table)s ON' -e 'AFTER_EVERY_TODB:SET IDENTITY_INSERT %(table)s OFF' You can use the "--ms-sqlserver" command line option as a shortcut of the above command. To use transactions to speed-up SQLite, try: -e 'BEFORE_EVERY_TODB:BEGIN TRANSACTION;' -e 'AFTER_EVERY_TODB:COMMIT;' Which is also the same thing the command line option '--sqlite-transactions' does. CSV files ========= Keep in mind that actually only MySQL, PostgreSQL and IBM DB2 are supported. Moreover, you may run into problems (e.g.: your postgres _server_ process must have reading access to the directory you're storing the CSV files). 
To create (and import) a set of CSV files, run imdbpy2sql.py with the syntax: ./imdbpy2sql.py -d /dir/with/plainTextDataFiles/ -u URI -c /directory/where/to/store/CSVfiles The created files will be imported near the end of the imdbpy2sql.py processing; notice that after that, you can safely cancel these files. CSV partial processing ====================== It's possible, since IMDbPY 4.5, to separate the two steps involved using CSV files. With the --csv-only-write command line option the old database will be zeroed and the CSV files saved (along with imdbIDs information). Using the --csv-only-load option you can load these saved files into an existing database (this database MUST be the one left almost empty by the previous run). Beware that right now the whole procedure is not very well tested. Using both commands, on the command line you still have to specify the whole "-u URI -d /path/plainTextDataFiles/ -c /path/CSVfiles/" series of arguments. IMDbPY-4.9/docs/README.devel0000644000000000000000000003123311766731642014035 0ustar rootroot DEVELOPMENT OF IMDbPY ===================== A lot of other information useful to IMDbPY developers are available in the "README.package" file. 
Sections in this file: * STRUCTURE OF THE IMDbPY PACKAGE * GENERIC DESCRIPTION * HOW TO EXTEND STRUCTURE OF THE IMDbPY PACKAGE =============================== imdb (package) | +-> _compat +-> _exceptions +-> _logging +-> linguistics +-> Movie +-> Person +-> Character +-> Company +-> utils +-> helpers +-> parser (package) | +-> http (package) | | | +-> movieParser | +-> personParser | +-> characterParser | +-> companyParser | +-> searchMovieParser | +-> searchPersonParser | +-> searchCharacterParser | +-> searchCompanyParser | +-> searchKeywordParser | +-> topBottomParser | +-> utils | +-> bsouplxml | | | +-> _bsoup.py | +-> etree.py | +-> html.py | +-> bsoupxpath.py | +-> mobile (package) | +-> sql (package) | +-> dbschema +-> alchemyadapter +-> objectadapter +-> cutils (C module) Description: imdb (package): contains the IMDb function, the IMDbBase class and imports the IMDbError exception class. _compat: compatibility functions and class for some strange environments (internally used). _exceptions: defines the exceptions internally used. _logging: provides the logging facility used by IMDbPY. linguistics: defines some functions and data useful to smartly guess the language of a movie title (internally used). Movie: contains the Movie class, used to describe and manage a movie. Person: contains the Person class, used to describe and manage a person. Character: contains the Character class, used to describe and manage a character. Company: contains the Company, used to describe and manage a company. utils: miscellaneous utilities used by many IMDbPY modules. parser (package): a package containing a package for every data access system implemented. http (package): contains the IMDbHTTPAccessSystem class which is a subclass of the imdb.IMDbBase class; it provides the methods used to retrieve and manage data from the web server (using, in turn, the other modules in the package). It defines methods to get a movie and to search for a title. 
http.movieParser: parse html strings from the pages on the IMDb web server about a movie; returns dictionaries of {key: value} http.personParser: parse html strings from the pages on the IMDb web server about a person; returns dictionaries. http.characterParser: parse html strings from the pages on the IMDb web server about a character; returns dictionaries. http.companyParser: parse html strings from the pages on the IMDb web server about a company; returns dictionaries. http.searchMovieParser: parse an html string, result of a query for a movie title. http.searchPersonParser: parse an html string, result of a query for a person name. http.searchCharacterParser: parse an html string, result of a query for a character name. http.searchCompanyParser: parse an html string, result of a query for a company name. http.searchKeywordParser: parse an html string, result of a query for a keyword. http.topBottomParser: parse an html string, result of a query for top250 and bottom100 movies. http.utils: miscellaneous utilities used only by the http package. http.bsouplxml (package): adapter to make BeautifulSoup behave like lxml (internally, the API of lxml is always used). http.bsouplxml._bsoup: just a copy of the BeautifulSoup module, so that it's not an external dependency. http.bsouplxml.etree: adapter for the lxml.etree module. http.bsouplxml.html: adapter for the lxml.html module. http.bsouplxml.bsoupxpath: xpath support for beautifulsoup. The parser.sql package manages the access to the data in the SQL database, created with the imdbpy2sql.py script; see the README.sqldb file. The dbschema module contains tables definitions and some useful functions; The alchemyadapter adapts the SQLAlchemy ORM to the internal mechanisms of IMDbPY, and the objectadapter does the same for the SQLObject ORM (internally the API of SQLObject is always used). 
The cutils module is a C module containing C function to speed up the 'sql' data access system; if it can't be compiled, a set of fall'back functions will be used. The class in the parser.mobile package is a subclass of the one found in parser.http, with some method overridden to be many times faster (from 2 to 20 times); it's useful for systems with slow bandwidth and not much CPU power. The helpers module contains functions and other goodies not directly used by the IMDbPY package, but that can be useful to develop IMDbPY-based programs. GENERIC DESCRIPTION =================== I wanted to stay independent from the source of the data for a given movie/person/character/company, and so the imdb.IMDb function returns an instance of a class that provides specific methods to access a given data source (web server, SQL database, etc.) Unfortunately that means that the movieID in the Movie class, the personID in the Person class and the characterID in the Character class are dependent on the data access system used. So, when a Movie, a Person or a Character object is instantiated, the accessSystem instance variable is set to a string used to identify the used data access system. HOW TO EXTEND ============= To introduce a new data access system, you've to write a new package inside the "parser" package; this new package must provide a subclass of the imdb.IMDb class which must define at least the following methods: _search_movie(title) - to search for a given title; must return a list of (movieID, {movieData}) tuples. _search_episode(title) - to search for a given episode title; must return a list of (movieID, {movieData}) tuples. _search_person(name) - to search for a given name; must return a list of (movieID, {personData}) tuples. _search_character(name) - to search for a given character's name; must return a list of (characterID, {characterData}) tuples. 
_search_company(name) - to search for a given company's name; must return a list of (companyID, {companyData}) tuples. get_movie_*(movieID) - a set of methods, one for every set of information defined for a Movie object; should return a dictionary with the relative information. This dictionary can contains some optional keys: 'data': must be a dictionary with the movie info. 'titlesRefs': a dictionary of 'movie title': movieObj pairs. 'namesRefs': a dictionary of 'person name': personObj pairs. get_person_*(personID) - a set of methods, one for every set of information defined for a Person object; should return a dictionary with the relative information. get_character_*(characterID) - a set of methods, one for every set of information defined for a character object; should return a dictionary with the relative information. get_company_*(companyID) - a set of methods, one for every set of information defined for a company object; should return a dictionary with the relative information. _get_top_bottom_movies(kind) - kind can be one of 'top' and 'bottom'; returns the related list of movies. _get_keyword(keyword) - return a list of Movie objects with the given keyword. _search_keyword(key) - return a list of keywords similar to the given key. get_imdbMovieID(movieID) - must convert the given movieID to a string representing the imdbID, as used by the IMDb web server (e.g.: '0094226' for Brian De Palma's "The Untouchables"). get_imdbPersonID(personID) - must convert the given personID to a string representing the imdbID, as used by the IMDb web server (e.g.: '0000154' for "Mel Gibson"). get_imdbCharacterID(characterID) - must convert the given characterID to a string representing the imdbID, as used by the IMDb web server (e.g.: '0000001' for "Jesse James"). get_imdbCompanyID(companyID) - must convert the given companyID to a string representing the imdbID, as used by the IMDb web server (e.g.: '0071509' for "Columbia Pictures [us]"). 
_normalize_movieID(movieID) - must convert the provided movieID in a format suitable for internal use (e.g.: convert a string to a long int). NOTE: as a rule of thumb you _always_ need to provide a way to convert a "string representation of the movieID" into the internally used format, and the internally used format should _always_ be converted to a string, in a way or another. Rationale: a movieID can be passed from the command line, or from a web browser. _normalize_personID(personID) - idem. _normalize_characterID(characterID) - idem. _normalize_companyID(companyID) - idem. _get_real_movieID(movieID) - return the true movieID; useful to handle title aliases. _get_real_personID(personID) - idem. _get_real_characterID(characterID) - idem. _get_real_companyID(companyID) - idem. The class should raise the appropriate exceptions, when needed; IMDbDataAccessError must be raised when you cannot access the resource you need to retrieve movie info or you're unable to do a query (this is _not_ the case when a query returns zero matches: in this situation an empty list must be returned); IMDbParserError should be raised when an error occurred parsing some data. Now you've to modify the imdb.IMDb function so that, when the right data access system is selected with the "accessSystem" parameter, an instance of your newly created class is returned. NOTE: this is a somewhat misleading example: we already have a data access system for sql database (it's called 'sql' and it supports also MySQL, amongst other). Maybe I'll find a better example... 
E.g.: if you want to call your new data access system "mysql" (meaning that the data are stored in a mysql database), you've to add to the imdb.IMDb function something like: if accessSystem == 'mysql': from parser.mysql import IMDbMysqlAccessSystem return IMDbMysqlAccessSystem(*arguments, **keywords) where "parser.mysql" is the package you've created to access the local installation, and "IMDbMysqlAccessSystem" is the subclass of imdb.IMDbBase. Then it's possibile to use the new data access system like: from imdb import IMDb i = IMDb(accessSystem='mysql') results = i.search_movie('the matrix') print results A specific data access system implementation can defines it's own methods. As an example, the IMDbHTTPAccessSystem that is in the parser.http package defines the method set_proxy() to manage the use a web proxy; you can use it this way: from imdb import IMDb i = IMDb(accessSystem='http') # the 'accessSystem' argument is not # really needed, since "http" is the default. i.set_proxy('http://localhost:8080/') A list of special methods provided by the imdb.IMDbBase subclass, along with their description, is always available calling the get_special_methods() of the IMDb class. E.g.: i = IMDb(accessSystem='http') print i.get_special_methods() will print a dictionary with the format: {'method_name': 'method_description', ...} IMDbPY-4.9/docs/README.series0000644000000000000000000001714111766731642014232 0ustar rootrootSummary of this file: * MANAGING SERIES EPISODES * TITLES * SERIES * FULL CREDITS * RATINGS * PEOPLE * GOODIES MANAGING SERIES EPISODES ======================== Since January 2006, IMDb changed the way it handles TV episodes: now every episode is treated as full title. Starting with version 2.5, also IMDbPY supports this new behavior. TITLES ====== analyze_title() and build_title() now supports tv episodes. 
You can pass a string to the analyze_title function in the format used by the web server ("The Series" The Episode (2005)) or in the format of the plain text data files ("The Series" (2004) {The Episode (#ser.epi)}) An example of the returned dictionary: call the function: analyze_title('"The Series" The Episode (2005)') the result will be: {'kind': 'episode', # kind is set to 'episode'. 'year': '2005', # the release year of this episode. 'title': 'The Episode', # episode title 'episode of': {'kind': 'tv series', # 'episode of' will contains 'title': 'The Series'} # information about the series. } The 'episode of' key can be a dictionary or a Movie class instance with the same information. The build_title() function takes an optional argument: ptdf; is it's set to false (the default), it returns the title of the episode in the format used by the IMDb's web server ("The Series" An Episode (2006)), otherwise it uses the format used by the plain text data files (something like "The Series" (2004) {An Episode (#2.5)}) SERIES ====== You can retrieve information about seasons and episodes for a tv (mini) series: from imdb import IMDb i = IMDb() m = i.get_movie('0389564') # The 4400. m['kind'] # kind is 'tv series'. i.update(m, 'episodes') # retrieves episodes information. m['episodes'] # a dictionary with the format: # {#season_number: { # #episode_number: Movie object, # #episode_number: Movie object, # ... # }, # ... # } # season_number always starts with 1, episode_number # depends on the series' numbering schema: some series # have a 'episode 0', while others starts counting from 1. m['episodes'][1][1] # e = m['episodes'][1][2] # second episode of the first season. e['kind'] # kind is 'episode'. e['season'], e['episode'] # return 1, 2. e['episode of'] # # XXX: beware that e['episode of'] and m _are not_ the # same object, while both represents the same series. 
# This is to avoid circular references; the # e['episode of'] object only contains basics # information (title, movieID, year, ....) i.update(e) # retrieve normal information about this episode (cast, ...) e['title'] # 'The New and Improved Carl Morrissey' e['series title'] # 'The 4400' e['long imdb episode title'] # '"The 4400" The New and Improved Carl Morrissey (2004)' Summary of keys of the Movie object for a series episode: 'kind': set to 'episode'. 'episode of': set to a movie object, this is a reference to the series. 'season': an integer; the number of the season. 'episode': an integer; the number of the episode in the season. 'long imdb episode title': combines series and episode title. 'series title': title of the series. 'canonical series title': title of the series, in the canonical format. Summary of keys of the Movie object for a series: 'kind': set to 'tv series'. 'episodes': dictionary (seasons) of dictionary (episodes in the season). FULL CREDITS ============ Retrieving credits for a tv (mini) series, you may notice that many long lists (like "cast", "writers", ...) are incomplete. You can fetch the complete list of cast and crew with the "full credits" data set; e.g.: from imdb import IMDb i = IMDb() m = i.get_movie('0285331') # 24. print len(m['cast']) # wooah! Only 7 person in the cast of 24?!?! i.update(m, 'full credits') print len(m['cast']) # yup! More than 300 persons! If you prefer, you can retrieve the complete cast of every episode, keeping the lists separated for every episode; instead of retrieving the list of episodes with: i.update(m, 'episodes') use instead: i.update('episodes cast') or the equivalent: i.update(m, 'guests') Now you end up having the same information as if you have updated the 'episodes' info set, but every Movie object inside the dictionary of dictionary has the complete cast. E.g.: cast = m['episodes'][1][2]['cast'] # cast list for the second episode # of the first season. 
Beware that both 'episodes cast' and 'guests' will update the keyword 'episodes' (and not 'episodes cast' or 'guests'). RATINGS ======= You can retrieve rating information about every episode in a tv (mini) series using the 'episodes rating' data set. PEOPLE ====== You can retrieve information about single episodes acted/directed/... by a person. from imdb import IMDb i = IMDb() p = i.get_person('0005041') # Laura Innes. p['actress'][0] # # At this point you have an entry (in keys like 'actor', 'actress', # 'director', ...) for every series the person starred/worked in, but # you knows nothing about singles episodes. i.update(p, 'episodes') # updates information about single episodes. p['episodes'] # a dictionary with the format: # {: [ , , ... ], ... } er = p['actress'][0] # ER tv series. p['episodes'][er] # list of Movie objects; one for every ER episode # she starred/worked in. p['episodes'][er][0] # p['episodes'][er]['kind'] # 'episode' p['episodes'][er][0].currentRole # 'Dr. Kerry Weaver' GOODIES ======= In the imdb.helpers module there are some functions useful to manage lists of episodes: - sortedSeasons(m) returns a sorted list of seasons of the given series. E.g.: >>> from imdb import IMDb >>> i = IMDb() >>> m = i.get_movie('0411008') >>> i.update(m, 'episodes') >>> sortedSeasons(m) [1, 2] - sortedEpisodes(m, season=None) returns a sorted list of episodes of the the given series, considering only the specified season(s) (every season, if None). E.g.: >>> from imdb import IMDb >>> i = IMDb() >>> m = i.get_movie('0411008') >>> i.update(m, 'episodes') >>> sortedEpisodes(m, season=1) [, , ...] IMDbPY-4.9/docs/README.adult0000644000000000000000000000503111766731642014044 0ustar rootroot IMDbPY for (too) sensitive people ================================= Since version 2.0 (shame on me! I've noticed this only after more than a year of development!!!) 
by default adult movies are included in the result of the search_movie(), search_episode() and search_person() methods. If for some unintelligible reason you don't want classics like "Debbie Does Dallas" to show up in your list of results, you can disable this feature initializing the IMDb class with the 'adultSearch' argument set to 0 (or other "False" value). E.g.: from imdb import IMDb ia = IMDb(accessSystem='http', adultSearch=0) The behavior of a IMDb class's instance can be modified at runtime, calling the do_adult_search() method. E.g.: from imdb import IMDb # By default in the horny-mode. ia = IMDb(accessSystem='http') # Just for this example, be sure to exclude the proxy. ia.set_proxy(None) results = ia.search_movie('debby does dallas', results=5) for movie in results: print movie['long imdb title'], movie.movieID # It will print: # Debbie Does Dallas (1978) 0077415 # Debbie Does Dallas Part II (1981) 0083807 # Debbie Does Dallas: The Next Generation (1997) (V) 0160174 # Debbie Does Dallas '99 (1999) (V) 0233539 # Debbie Does Dallas 3 (1985) 0124352 # You can now revert to the old puritan behavior. ia.do_adult_search(0) results = ia.search_movie('debby does dallas', results=5) for movie in results: print movie['long imdb title'], movie.movieID # It will print only: # Pauly Does Dallas (1993) (TV) 0208347 The do_adult_search() method of the http and mobile data access system also takes another couple of arguments: "cookie_id" and "cookie_uu", so that you can select _your own_ IMDb's account; if cookie_id is set to None, no cookies are sent. These parameters can also be set in the imdbpy.cfg configuration file. For the strings to use, see your "cookie" or "cookie.txt" file. 
Obviously you need to activate the "adult movies" option for your account; see http://imdb.com/find/preferences?_adult=1 OTHER DATA ACCESS SYSTEMS ========================= Since version 2.2 every other data access system (sql) support the same behavior of the http and mobile data access systems (i.e.: you can set the 'adultSearch' argument and use the 'do_adult_search' method). Notice that for the sql data access system only results from the search_movie() and search_episode() methods are filtered: there's no easy (and fast) way to tell that an actor/actress is a porn-star. IMDbPY-4.9/docs/INSTALL.txt0000644000000000000000000000014511766731642013724 0ustar rootroot INSTALLATION ============ See the "README.txt" file. You've to read it anyway, isn't it? IMDbPY-4.9/docs/README.info2xml0000644000000000000000000000545711766731642014505 0ustar rootroot INFORMATION IN XML FORMAT ========================= Since version 4.0, IMDbPY can output information of Movie, Person, Character and Company instances in XML format. It's possible to get a single information (a key) in XML format, using the getAsXML(key) method (it will return None if the key is not found). E.g.: from imdb import IMDb ia = IMDb('http') movie = ia.get_movie(theMovieID) print movie.getAsXML('keywords') It's also possible to get a representation of a whole object, using the asXML() method: print movie.asXML() The returned strings are unicode. The _with_add_keys argument of the asXML() method can be set to False (default: True) to exclude the dynamically generated keys (like 'smart canonical title' and so on). XML FORMAT ========== Keywords are converted to tags, items in lists are enclosed in a 'item' tag. E.g.: a keyword another keyword Except when keys are known to be not fixed (e.g.: a list of keywords), in which case this schema is used: ... In general, the 'key' attribute is present whenever the used tag doesn't match the key name. 
Movie, Person, Character and Company instances are converted like that (portions enclosed in squares are optionals): A Long IMDb Movie Title (YEAR) [ Name Surname [A Note About The Person] ] [A Note About The Movie] Every 'id' can be empty. Actually the returned XML is mostly not pretty-printed. REFERENCES ========== Some text keys can contain references to other movies, persons and characters. The user can provide the defaultModFunct function (see the "MOVIE TITLES AND PERSON/CHARACTER NAMES REFERENCES" section of the README.package file), to replace these references with their own strings (e.g.: a link to a web page); it's up to the user, to be sure that the output of the defaultModFunct function is valid XML. DTD === Since version 4.1 a DTD is available; it can be found in this directory or on the web, at: http://imdbpy.sf.net/dtd/imdbpy41.dtd The version number changes with the IMDbPY version. LOCALIZATION ============ Since version 4.1 it's possible to translate the XML tags; see README.locale. FROM XML TO OBJECTS =================== Since version 4.6, you can dump the generated XML in a string or in a file, using it - later - to rebuild the original object. In the imdb.helpers module there's the parseXML() function which takes a string as input and return - if possible - an instance of the Movie, Person, Character or Company classes. IMDbPY-4.9/docs/README.newparsers0000644000000000000000000000552111766731642015130 0ustar rootroot IMDbPY'S NEW HTML PARSERS ========================= Since version 3.7, IMDbPY has moved its parsers for the HTML of the IMDb's website from a set of subclasses of SGMLParser (they were finite-states machines, being SGMLParser a SAX parser) to a set of parsers based on the libxml2 library or on the BeautifulSoup module (and so, using a DOM/XPath-based approach). The idea and the implementation of these new parsers is mostly a work of H. Turgut Uyar, and can bring to parsers that are shorter, easier to write and maybe even faster. 
The old set of parsers was removed since IMDbPY 4.0.
FORCING LXML OR BEAUTIFULSOUP ============================= By default, IMdbPY uses python-lxml, if it's installed. You can force the use of one given parser passing the 'useModule' parameter. Valid values are 'lxml' and 'BeautifulSoup'. E.g.: from imdb import IMDb ia = IMDb('http', useModule='BeautifulSoup') ... useModule can also be a list/tuple of strings, to specify the preferred order. IMDbPY-4.9/docs/README.logging0000644000000000000000000000067611766731642014373 0ustar rootroot LOGGING ======= Since version 4.4 IMDbPY provides a logging facility, using the powerful "logging" module. You can find documentation about it here: http://docs.python.org/library/logging.html By default information are logged on standard error; you can read on the module documentation how to stream them elsewhere. The default logging level is "warning"; this can be changed modifying the "loggingLevel" key of your imdbpy.cfg file. IMDbPY-4.9/docs/README.locale0000644000000000000000000000774711766731642014212 0ustar rootroot LOCALIZATION FOR IMDbPY ======================= Since version 4.1 it's easy to translate the labels that describe sets of information. LIMITATION ========== So far no internal message or exception is translated, the internationalization is limited to the "tags" returned by the getAsXML and asXML methods of the Movie, Person, Character or Company classes. Beware that in many cases these "tags" are not the same as the "keys" used to access information in the same classes, as if they are dictionaries. E.g.: you can translate "long-imdb-name" - the tag returned by the call person.getAsXML('long imdb name') - but not "long imdb name" directly. To translate keys, you can use the helpers.translateKey function in the 'helpers' module. USAGE ===== If you want to add i18n to your IMDbPY-based application, all you need to do is to switch to the 'imdbpy' text domain. E.g.: import imdb.locale # Standard gettext stuff. 
import gettext from gettext import gettext as _ # Switch to the imdbpy domain. gettext.textdomain('imdbpy') # Request a translation. print _(u'long-imdb-name') ADD A NEW LANGUAGE ================== You can (but you're not forced to) use Transifex to manage/coordinate your translations; see: http://www.transifex.net/projects/p/imdbpy/c/default/ Below, the generic instruction about how translation works. In the imdb.locale package, you'll find some scripts useful to build your own internationalization files. If you create a new translation or update an existing one, you can send it to the mailing list, for inclusion in the next releases. - the generatepot.py should be used only when the DTD is changed; it's used to create the imdbpy.pot file (the one shipped is always up-to-date). - you can copy the imdbpy.pot file to your language's .po file (e.g. imdbpy-fr.po for French) and modify it accordingly to your needs. - then you must run rebuildmo.py (which is automatically called at install time, by the setup.py script) to create the .mo files. If you need to upgrade an existing .po file, after changes to the .pot file (usually because the DTD was changed), you can use the msgmerge tool, part of the GNU gettext suite. E.g.: msgmerge -N imdbpy-fr.po imdbpy.pot > new-imdbpy-fr.po ARTICLES IN TITLES ================== Converting a title to its 'Title, The' canonical format, IMDbPY does some assumptions about what is an article and what not, and this could lead to some wrong canonical titles. E.g.: "Hard, Die" instead of "Die Hard", since 'Die' is guessed as an article (and it is, in Germany...) To solve this problem, there are other keys: "smart canonical title", "smart long imdb canonical title", "smart canonical series title", "smart canonical episode title" which can be used to do a better job converting a title into its canonical format. 
It works, but it needs to know something about articles in various languages: if you want to help, see the LANG_ARTICLES and LANG_COUNTRIES dictionaries in the 'linguistics' module. To know what the language in which a movie title is assumed to be, call its 'guessLanguage' method (it will return None, if unable to guess). If you want to force a given language instead of the guessed one, you can call its 'smartCanonicalTitle' method, setting the 'lang' argument appropriately. TITLE AKAS ========== Sometimes it's useful to manage title's AKAs knowing their languages. In the 'helpers' module there are some (hopefully) useful functions: akasLanguages(movie) - given a movie, return a list of tuples in (lang, AKA) format (lang can be None, if unable to detect). sortAKAsBySimilarity(movie, title) - sorts the AKAs on a movie considering how much they are similar to a given title (see the code for more options). getAKAsInLanguage(movie, lang) - return a list of AKAs of the movie in the given language (see the code for more options). IMDbPY-4.9/docs/GPL.txt0000644000000000000000000004311011766731642013237 0ustar rootroot GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. 
When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. 
GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. 
Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. 
However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. 
If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. 
IMDbPY-4.9/docs/imdbpyico.png0000644000000000000000000000063311766731642014544 0ustar rootrootPNG  IHDR..6PLTE?`YbKGDH pHYs  d_tIME  6%6 IDATxn -|J>Œ ~ܩyKΐ r?R|X6C]yo"D0'eu' 9~ߏKq.J4paLWIFc,!ZypSNyqJc/?\e%9oŚzR<Qnys{m [SܦrI>o:c3]>ꂲmmAĕؾ^\h8W{js]IENDB`IMDbPY-4.9/docs/TODO.txt0000644000000000000000000001355111766731642013370 0ustar rootroot TODO for IMDbPY =============== See the code, and search for XXX, FIXME and TODO. NOTE: it's always time to clean the code! [general] * improve the logging facility. * ability to silent warnings/logging. * create mobile versions for smart phones (with GUI). * Write better summary() methods for Movie, Person, Character and Company classes. * Some portions of code are poorly commented. * The documentation is written in my funny Anglo-Bolognese. * a better test-suite is really needed. * Compatibility with Python 2.2 and previous versions is no more assured for every data access system (the imdbpy2sql.py script for sure requires at least Python 2.3). Be sure to keep 2.2 compatibility at least for 'http' and 'mobile', since they are used by mobile devices. * The analyze_title/build_title functions are grown too complex and beyond their initial goals. [searches] * Support advanced query for movie titles and person/character/company names - if possible this should be available in every data access systems. [xml] * bugs of the helpers.parseXML function: * doesn't handle Movie instances as keys of a dictionary; person->episodes, character->episodes, character->quotes. * changes tuples to lists (not a big deal). * person/movieRefs are lost. * uses only BeautifulSoup; maybe it can work with lxml, too. * use the ID type, instead of CDATA, for the 'id' attribute? * keywords (and hence tags) generated by SQL may be slightly different, and must be integrate in the DTD. [Movie objects] * Define invariable names for the sections (the keys you use to access info stored in a Movie object). 
* Should a movie object automatically build a Movie object, when an 'episode of' dictionary is in the data? * Should isSameTitle() check first the accessSystem and the movieID, and use 'title' only if movieID is None? * For TV series the list of directors/writers returned by 'sql' is a long list with every single episodes listed in the 'notes' attribute (i.e.: the same person is listed more than one time, just with a different note). For 'http' and 'mobile' there's a list with one item for every person, with a long 'notes' listing every episode. It's not easy to split these information since they can contain notes ("written by", "as Aka Name", ...) * The 'laserdisc' information for 'sql' is probabily wrong: I think they merge data from different laserdisc titles. Anyway these data are no more updated by IMDb, and so... * there are links to hollywoodreporter.com that are not gathered in the "external reviews" page. [Person objects] * Define invariable names for the sections (the keys you use to access info stored in a Person object). * Should isSameName() check first the accessSystem and the personID, and use 'name' only if personID is None? * Fetching data from the web ('http' and 'mobile'), the filmography for a given person contains a list named "himself" or "herself" for movies/shows where they were not acting. In 'sql', these movies/shows are listed in the "actor" or "actress" list. Check if this is still true, with the new IMDb's schema. [Character objects] * Define invariable names for the sections (the keys you use to access info stored in a Character object). [Company objects] * Define invariable names for the sections (the keys you use to access info stored in a Company object). [http data access system] * Serious confusion about handling XML/HTML/SGML char references; there are too many fixes for special cases, and a better understanding of how lxml and BeautifulSoup behave is required. 
* If the access through the proxy fails, is it possible to automatically try without? It doesn't seem easy... * Access to the "my IMDb" functions for registered users would be really cool. * Gather more movies' data: user comments, laserdisc details, trailers, posters, photo gallery, on tv, schedule links, showtimes, message boards. * Gather more people's data: photo gallery. [httpThin data access system] * It should be made _really_ faster than 'http'. [mobile data access system] * General optimization. * Make find() methods case insensitive. [sql data access system] NOTE NOTE NOTE: this is still beta code and I'm not a database guru; moreover I'm short of time and so I will be happy to fix every bug you'll find, but if you're about to write me an email like "ehi, the database access should be faster", "the imdbpy2sql.py script must run with 64 MB of RAM and complete in 2 minutes" or "your database layout sucks: I've an idea for a better structure...", well, consider that _these_ kinds of email will be probably immediately discarded. I _know_ these are important issues, but I've neither the time nor the ability to fix these problems by myself, sorry. Obviously if you want to contribute with patches, new code, new SQL queries and new database structures you're welcome and I will be very grateful for your work. Again: if there's something that bother you, write some code. It's free software, after all. Things to do: * The imdbpy2sql.py script MUST be run on a database with empty tables; unfortunately so far a SQL installation can't be "updated" without recreating the database from scratch. IMDb releases also "diff" files to keep the plain text files updated; it would be wonderful to directly use these diff files to upgrade the SQL database, but I think this is a nearly impossible task. A lot of attempts were made in this direction, always failing. 
* There are a lot of things to do to improve SQLAlchemy support (especially in terms of performances); see FIXME/TODO/XXX notices in the code. * The pysqlite2.dbapi2.OperationalError exception is raise when SQLite is used with SQLAlchemy (but only if the --sqlite-transactions command line argument is used). * With 0.5 branch of SQLAlchemy, it seems that there are serious problems using SQLite; try switching to SQLObject, as a temporary solution. IMDbPY-4.9/docs/Changelog.txt0000644000000000000000000014000111766731642014501 0ustar rootroot Changelog for IMDbPY ==================== * What's the new in release 4.9 "Iron Sky" (15 Jun 2012) [general] - urls used to access the IMDb site can be configured. - helpers function to handle movie AKAs in various languages (code by Alberto Malagoli). - renamed the 'articles' module into 'linguistics'. - introduced the 'reraiseExceptions' option, to re-raise evey caught exception. [http] - fix for changed search parameters. - introduced a 'timeout' parameter for connections to the web server. - fix for business information. - parser for the new style of episodes list. - unicode searches handled as iso8859-1. - fix for garbage in AKA titles. [sql] - vastly improved the store/restore of imdbIDs; now it should be faster and more accurate. - now the 'name' table contains a 'gender' field that can be 'm', 'f' or NULL. - fix for nicknames. - fix for missing titles in the crazy credits file. - handled exceptions creating indexes, foreign keys and executing custom queries. - fixed creation on index for keywords. - excluded {{SUSPENDED}} titles. * What's the new in release 4.8.2 "The Big Bang Theory" (02 Nov 2011) [general] - fixed install path of locales. [http] - removed debug code. * What's the new in release 4.8 "Super" (01 Nov 2011) [general] - fix for a problem managing exceptions with Python 2.4. - converted old-style exceptions to instances. - enanchements for the reduce.sh script. 
- added notes about problems connecting to IMDb's web servers. - improvements in the parsers of movie titles. - improvements in the parser of person names. [http] - potential fix for GAE environment. - handled the new style of "in production" information. - fix for 'episodes' list. - fix for 'episodes rating'. - fix for queries that returned too many results. - fix for wrong/missing references. - removed no more available information set "amazon reviews" and "dvd". - fix for cast of tv series. - fix for title of tv series. - now the beautiful parses work again. [httpThin] - removed "httpThin", falling back to "http". [mobile] - fix for missing headshots. - fix for rating and number of votes. - fix for missing genres. - many other fixes to keep up-to-date with the IMDb site. [sql] - fix for a nasty bug parsing notes about character names. - fixes for SQLite with SQLOjbect. * What's the new in release 4.7 "Saw VI" (23 Jan 2011) [http] - first fixes for the new set of parsers. - first changes to support the new set of web pages. - fix for lists of uncategorized episodes. - fix for movies with multiple countries. - fix for the currentRole property. - more robust handling for vote details. [mobile] - first fixes for the new set of parsers. [sql] - the tables containing titles and names (and akas) now include a 'md5sum' column calculated on the "long imdb canonical title/name". * What's the new in release 4.6 "The Road" (19 Jun 2010) [general] - introduced the 'full-size cover url' and 'full-size headshot' keys for Movie, Person and Character instances. - moved the development to a Mercurial repository. - introduced the parseXML function in the imdb.helpers module. - now the asXML method can exclude dynamically generated keys. - rationalized the use of the 'logging' and 'warnings' modules. - the 'update' method no longer raises an exception, if asked for an unknown info set. [http/mobile] - removed new garbage from the imdb pages. - support new style of akas. 
- fix for the "trivia" page. - fixes for searches with too many results. [sql] - fixes for garbage in the plain text data files. - support for SQLite shipped with Python 2.6. * What's the new in release 4.5.1 "Dollhouse" (01 Mar 2010) [general] - reintroduced the ez_setup.py file. - fixes for AKAs on 'release dates'. - added the dtd. * What's the new in release 4.5 "Invictus" (28 Feb 2010) [general] - moved to setuptools 0.6c11. - trying to make the SVN release versions work fine. - http/mobile should work in GAE (Google App Engine). - added some goodies scripts, useful for programmers (see the docs/goodies directory). [http/mobile] - removed urllib-based User-Agent header. - fixes for some minor changes to IMDb's html. - fixes for garbage in movie quotes. - improvements in the handling of AKAs. [mobile] - fixe for AKAs in search results. [sql] - fixes for bugs restoring imdbIDs. - first steps to split CSV creation/insertion. * What's the new in release 4.4 "Gandhi" (06 Jan 2010) [general] - introduced a logging facility; see README.logging. - the 'http' and 'mobile' should be a lot more robust. [http] - fixes for the n-th set of changes to IMDb's HTML. - improvements to perfect-match searches. - slightly simplified the parsers for search results. [mobile] - fixes for the n-th set of changes to IMDb's HTML. - slightly simplified the parsers for search results. [sql] - movies' keywords are now correctly imported, using CSV files. - minor fixes to handle crap in the plain text data files. - removed an outdate parameter passed to SQLObject. - made imdbpy2sql.py more robust in some corner-cases. - fixes for the Windows environment. * What's the new in release 4.3 "Public Enemies" (18 Nov 2009) [general] - the installer now takes care of .mo files. - introduced, in the helpers module, the functions keyToXML and translateKey, useful to translate dictionary keys. - support for smart guessing of the language of a movie title. - updated the DTD. 
[http] - fixed a lot of bugs introduced by the new IMDb.com design. - nicer handling of HTTP 404 response code. - fixed parsers for top250 and bottom100 lists. - fixed a bug parsing AKAs. - fixed misc bugs. [mobile] - removed duplicates in list of genres. [sql] - fixed a bug in the imdbpy2sql.py script using CSV files; the 'movie_info_idx' and 'movie_keyword' were left empty/with wrong data. * What's the new in release 4.2 "Battlestar Galactica" (31 Aug 2009) [general] - the 'local' data access system is gone. See README.local. - the imdb.parser.common package was removed, and its code integrated in imdb.parser.sql and in the imdbpy2sql.py script. - fixes for the installer. - the helpers module contains the fullSizeCoverURL function, to convert a Movie, Person or Character instance (or a URL in a string) in an URL to the full-size version of its cover/headshot. Courtesy of Basil Shubin. - used a newer version of msgfmt.py, to work around a hideous bug generating locales. - minor updates to locales. - updated the DTD to version 4.2. [http] - removed garbage at the end of quotes. - fixed problems parsing company names and notes. - keys in character's quotes dictionary are now Movie instances. - fixed a bug converting entities char references (affected BeautifulSoup). - fixed a long-standing bug handling & with BeautifulSoup. - top250 is now correctly parsed by BeautifulSoup. [sql] - fixed DB2 call for loading blobs/cblobs. - information from obsolete files are now used if and only if they refer to still existing titles. - the --fix-old-style-titles argument is now obsolete. * What's the new in release 4.1 "State Of Play" (02 May 2009) [general] - DTD definition. - support for locale. - support for the new style for movie titles ("The Title" and no more "Title, The" is internally used). - minor fix to XML code to work with the test-suite. [http] - char references in the &#xHEXCODE; format are handled. - fixed a bug with movies containing '....' in titles. 
And I'm talking about Malcolm McDowell's filmography! - 'airing' contains object (so the accessSystem variable is set). - 'tv schedule' ('airing') pages of episodes can be parsed. - 'tv schedule' is now a valid alias for 'airing'. - minor fixes for empty/wrong strings. [sql] - in the database, soundex values for titles are always calculated after the article is stripped (if any). - imdbpy2sql.py has the --fix-old-style-titles option, to handle files in the old format. - fixed a bug saving imdbIDs. [local] - the 'local' data access system should be considered obsolete, and will probably be removed in the next release. * What's the new in release 4.0 "Watchmen" (12 Mar 2009) [general] - the installer is now based on setuptools. - new functions get_keyword and search_keyword to handle movie's keywords (example scripts included). - Movie/Person/... keys (and whole instances) can be converted to XML. - two new functions, get_top250_movies and get_bottom100_movies, to retrieve lists of best/worst movies (example scripts included). - searching for movies and persons - if present - the 'akas' keyword is filled, in the results. - 'quotes' for movies is now always a list of lists. - the old set of parsers (based on sgmllib.SGMLParser) are gone. - fixed limitations handling multiple roles (with notes). - fixed a bug converting somethingIDs to real imdbIDs. - fixed some summary methods. - updates to the documentation. [http] - adapted BeautifulSoup to lxml (internally, the lxml API is used). - currentRole is no longer populated, for non-cast entries (everything ends up into .notes). - fixed a bug search for too common terms. - fixed a bug identifying 'kind', searching for titles. - fixed a bug parsing airing dates. - fixed a bug searching for company names (when there's a direct hit). - fixed a bug handling multiple characters. - fixed a bug parsing episode ratings. - nicer keys for technical details. - removed the 'agent' page. 
[sql] - searching for a movie, the original titles are returned, instead of AKAs. - support for Foreign Keys. - minor changes to the db's design. - fixed a bug populating tables with SQLAlchemy. - imdbpy2sql.py shows user time and system time, along with wall time. [local] - searching for a movie, the original titles are returned, instead of AKAs. * What's the new in release 3.9 "The Strangers" (06 Jan 2009) [general] - introduced the search_episode method, to search for episodes' titles. - movie['year'] is now an integer, and no more a string. - fixed a bug parsing company names. - introduced the helpers.makeTextNotes function, useful to pretty-print strings in the 'TEXT::NOTE' format. [http] - fixed a bug regarding movies listed in the Bottom 100. - fixed bugs about tv mini-series. - fixed a bug about 'series cast' using BeautifulSoup. [sql] - fixes for DB2 (with SQLAlchemy). - improved support for movies' aka titles (for series). - made imdbpy2sql.py more robust, catching exceptions even when huge amounts of data are skipped due to errors. - introduced CSV support in the imdbpy2sql.py script. * What's the new in release 3.8 "Quattro Carogne a Malopasso" (03 Nov 2008) [http] - fixed search system for direct hits. - fixed IDs so that they always are str and not unicode. - fixed a bug about plot without authors. - for pages about a single episode of a series, "Series Crew" are now separated items. - introduced the preprocess_dom method of the DOMParserBase class. - handling rowspan for DOMHTMLAwardsParser is no more a special case. - first changes to remove old parsers. [sql] - introduced support for SQLAlchemy. [mobile] - fixed multiple 'nick names'. - added 'aspect ratio'. - fixed a "direct hit" bug searching for people. [global] - fixed search_* example scripts. - updated the documentation. * What's the new in release 3.7 "Burn After Reading" (22 Sep 2008) [http] - introduced a new set of parsers, active by default, based on DOM/XPath. 
- old parsers fixed; 'news', 'genres', 'keywords', 'ratings', 'votes', 'tech', 'taglines' and 'episodes'. [sql] - the pure python soundex function now behaves correctly. [general] - minor updates to the documentation, with an introduction to the new set of parsers and notes for packagers. * What's the new in release 3.6 "RahXephon" (08 Jun 2008) [general] - support for company objects for every data access systems. - introduced example scripts for companies. - updated the documentation. [http and mobile] - changes to support the new HTML for "plot outline" and some lists of values (languages, genres, ...) - introduced the set_cookies method to set cookies for IMDb's account and the del_cookies method to remove the use of cookies; in the imdbpy.cfg configuration file, options "cookie_id" and "cookie_uu" can be set to the appropriate values; if "cookie_id" is None, no cookies are sent. - fixed parser for 'news' pages. - fixed minor bug fetching movie/person/character references. [http] - fixed a search problem, while not using the IMDbPYweb's account. - fixed bugs searching for characters. [mobile] - fixed minor bugs parsing search results. [sql] - fixed a bug handling movieIDs, when there are some inconsistencies in the plain text data files. [local] - access to 'mpaa' and 'miscellaneous companies' information. * What's the new in release 3.5 "Blade Runner" (19 Apr 2008) [general] - first changes to work on Symbian mobile phones. - now there is an imdb.available_access_systems() function, that can be used to get a list of available data access systems. - it's possible to pass 'results' as a parameter of the imdb.IMDb function; it sets the number of results to return for queries. - fixed summary() method in Movie and Person, to correctly handle unicode chars. - the helpers.makeObject2Txt function now supports recursion over dictionaries. - cutils.c MXLINELEN increased from 512 to 1024; some critical strcpy replaced with strncpy. 
- fixed configuration parser to be compatible with Python 2.2. - updated list of articles and some stats in the comments. - documentation updated. [sql] - fixed minor bugs in imdbpy2sql.py. - restores imdbIDs for characters. - now CharactersCache honors custom queries. - the imdbpy2sql.py's --mysql-force-myisam command line option can be used to force usage of MyISAM tables on InnoDB databases. - added some warnings to the imdbpy2sql.py script. [local] - fixed a bug in the fall-back function used to scan movie titles, when the cutils module is not available. - mini biographies are cut up to 2**16-1 chars, to prevent troubles with some MySQL servers. - fixed bug in characters4local.py, dealing with some garbage in the files. * What's the new in release 3.4 "Flatliners" (16 Dec 2007) [general] - *** NOTE FOR PACKAGERS *** in the docs directory there is the "imdbpy.cfg" configuration file, which should be installed in /etc or equivalent directory; the setup.py script _doesn't_ manage its installation. - introduced a global configuration file to set IMDbPY's parameters. - supported characters using "sql" and "local" data access systems. - fixed a bug retrieving characterID from a character's name. [http] - fixed a bug in "release dates" parser. - fixed bugs in "episodes" parser. - fixed bugs reading "series years". - stricter definition for ParserBase._re_imdbIDmatch regular expression. [mobile] - fixed bugs reading "series years". - fixed bugs reading characters' filmography. [sql] - support for characters. [local] - support for characters. - introduced the characters4local.py script. * What's the new in release 3.3 "Heroes" (18 Nov 2007) [general] - first support for character pages; only for "http" and "mobile", so far. - support for multiple characters. - introduced an helper function to pretty-print objects. - added README.currentRole. - fixed minor bug in the __hash__ method of the _Container class. - fixed changes to some key names for movies. 
- introduced the search_character.py, get_character.py and get_first_character.py example scripts. [http] - full support for character pages. - fixed a bug retrieving some 'cover url'. - fixed a bug with multi-paragraphs biographies. - parsers are now instanced on demand. - accessSystem and modFunct are correctly set for every Movie, Person and Character object instanced. [mobile] - full support for character pages. [sql] - extended functionality of the custom queries support for the imdbpy2sql.py script to circumvent a problem with MS SQLServer. - introducted the "--mysql-innodb" and "--ms-sqlserver" shortcuts for the imdbpy2sql.py script. - introduced the "--sqlite-transactions" shortcut to activate transaction using SQLite which, otherwise, would have horrible performances. - fixed a minor bug with top/bottom ratings, in the imdbpy2sql.py script. [local] - filtered out some crap in the "quotes" plain text data files, which also affected sql, importing the data. * What's the new in release 3.2 "Videodrome" (25 Sep 2007) [global] - now there's an unique place where "akas.imdb.com" is set, in the main module. - introduced __version__ and VERSION in the main module. - minor improvements to the documentation. [http] - updated the main movie parser to retrieve the recently modified cast section. - updated the crazy credits parser. - fixed a bug retrieving 'cover url'. [mobile] - fixed a bug parsing people's filmography when only one duty was listed. - updated to retrieve series' creator. [sql] - added the ability to perform custom SQL queries at the command line of the imdbpy2sql.py script. - minor fixes for the imdbpy2sql.py script. * What's the new in release 3.1 "The Snake King" (18 Jul 2007) [global] - the IMDbPYweb account now returns a single item, when a search returns only one "good enough" match (this is the IMDb's default). - updated the documentation. - updated list of contributors and developers. [http] - supported the new result page for searches. 
- supported the 'synopsis' page. - supported the 'parents guide' page. - fixed a bug retrieving notes about a movie's connections. - fixed a bug for python2.2 (s60 mobile phones). - fixed a bug with 'Production Notes/Status'. - fixed a bug parsing role/duty and notes (also for httpThin). - fixed a bug retrieving user ratings. - fixed a bug (un)setting the proxy. - fixed 2 bugs in movie/person news. - fixed a bug in movie faqs. - fixed a bug in movie taglines. - fixed a bug in movie quotes. - fixed a bug in movie title, in "full cast and crew" page. - fixed 2 bugs in persons' other works. [sql] - hypothetical fix for a unicode problem in the imdbpy2sql.py script. - now the 'imdbID' fields in the Title and Name tables are restored, updating from an older version. - fixed a nasty bug handling utf-8 strings in the imdbpy2sql.py script. [mobile] - supported the new result page for searches. - fixed a bug for python2.2 (s60 mobile phones). - fixed a bug searching for persons with single match and no messages in the board. - fixed a bug parsing role/duty and notes. * What's the new in release 3.0 "Spider-Man 3" (03 May 2007) [global] - IMDbPY now works with the new IMDb's site design; a new account is used to access data; this affect a lot of code, especially in the 'http', 'httpThin' and 'mobile' data access systems. - every returned string should now be unicode; dictionary keywords are _not_ guaranteed to be unicode (but they are always 7bit strings). - fixed a bug in the __contains__ method of the Movie class. - fix in the analyze_title() function to handle malformed episode numbers. [http] - introduced the _in_content instance variable for objects instances of ParserBase, True when inside the
tag. Opening and closing this pair of tags two methods, named _begin_content() and _end_content() are called with no parameters (by default, they do nothing). - in the utils module there's the build_person function, useful to create a Person instance from the tipical formats found in the IMDb's web site. - an analogue build_movie function can be used to instance Movie objects. - inverted the getRefs default - now if not otherwise set, it's False. - added a parser for the "merchandising" ("for sale") page for persons. - the 'rating' parser now collects also 'rating' and 'votes' data. - the HTMLMovieParser class (for movies) was rewritten from zero. - the HTMLMaindetailsParser class (for persons) was rewritten from zero. - unified the "episode list" and "episodes cast" parsers. - fixed a bug parsing locations, which resulted in missing information. - locations_parser splitted from "tech" parser. - "connections" parser now handles the recently introduced notes. [http parser conversion] - these parsers worked out-of-the-box; airing, eprating, alternateversions, dvd, goofs, keywords, movie_awards, movie_faqs, person_awards, rec, releasedates, search_movie, search_person, soundclips, soundtrack, trivia, videoclips. - these parsers were fixed; amazonrev, connections, episodes, crazycredits, externalrev, misclinks, newsgrouprev, news, officialsites, otherworks, photosites, plot, quotes, ratings, sales, taglines, tech, business, literature, publicity, trivia, videoclips, maindetails, movie. [mobile] - fixed to work with the new design. - a lot of code is now shared amongst 'http' and 'mobile'. [sql] - fixes for other bugs related to unicode support. - minor changes to slightly improve performances. * What's the new in release 2.9 "Rodan! The Flying Monster" (21 Feb 2007) [global] - on 19 February IMDb has redesigned its site; this is the last IMDbPY's release to parse the "old layout" pages; from now on, the development will be geared to support the new web pages. 
See the README.redesign file for more information. - minor clean-ups and functions added to the helpers module. [http] - fixed some unicode-related problems searching for movie titles and person names; also changed the queries used to search titles/names. - fixed a bug parsing episodes for tv series. - fixed a bug retrieving movieID for tv series, searching for titles. [mobile] - fixed a problem searching exact matches (movie titles only). - fixed a bug with cast entries, after minor changes to the IMDb's web site HTML. [local and sql] - fixed a bug parsing birth/death dates and notes. [sql] - (maybe) fixed another unicode-related bug fetching data from a MySQL database. Maybe. Maybe. Maybe. * What's the new in release 2.8 "Apollo 13" (14 Dec 2006) [general] - fix for environments where sys.stdin was overridden by a custom object. [http data access system] - added support for the movies' "FAQ" page. - now the "full credits" (aka "full cast and crew") page can be parsed; it's mostly useful for tv series, because this page is complete while "combined details" contains only partial data. E.g. ia.update(tvSeries, 'full credits') - added support for the movies' "on television" (ia.update(movie, "airing")) - fixed a bug with 'miscellaneous companies'. - fixed a bug retrieving the list of episodes for tv series. - fixed a bug with tv series episodes' cast. - generic fix for XML single tags (unvalid HTML tags) like
- fixed a minor bug with 'original air date'. [sql data access system] - fix for a unicode bug with recent versions of SQLObject and MySQL. - fix for a nasty bug in imdbpy2sql.py that will show up splitting a data set too large to be sent in a single shot to the database. [mobile data access system] - fixed a bug searching titles and names, where XML char references were not converted. * What's the new in release 2.7 "Pitch Black" (26 Sep 2006) [general] - fixed search_movie.py and search_person.py scripts; now they return both the movieID/personID and the imdbID. - the IMDbPY account was configured to hide the mini-headshots. - http and mobile data access systems now try to handle queries with too many results. [http data access system] - fixed a minor bug retrieving information about persons, with movies in production. - fixed support for cast list of tv series. - fixed a bug retrieving 'plot keywords'. - some left out company credits are now properly handled. [mobile data access system] - fixed a major bug with the cast list, after the changes to the IMDb web site. - fixed support for cast list of tv series. - fixed a minor bug retrieving information about persons, with movies in production. - now every AKA title is correctly parsed. [sql data access system] - fixed a(nother) bug updating imdbID for movies and persons. - fixed a bug retrieving personID, while handling names references. [local data access system] - "where now" information now correctly handles multiple lines (also affecting the imdbpy2sql.py script). * What's the new in release 2.6 "They Live" (04 Jul 2006) [general] - renamed sortMovies to cmpMovies and sortPeople to cmpPeople; these function are now used to compare Movie/Person objects. The cmpMovies also handles tv series episodes. [http data access system] - now information about "episodes rating" are retrieved. - fixed a bug retrieving runtimes and akas information. 
- fixed an obscure bug trying an Exact Primary Title/Name search when the provided title was wrong/incomplete. - support for the new format of the "DVD details" page. [sql data access system] - now at insert-time the tables doesn't have indexes, which are added later, resulting in a huge improvement of the performances of the imdbpy2sql.py script. - searching for tv series episodes now works. - fixed a bug inserting information about top250 and bottom10 films rank. - fixed a bug sorting movies in people's filmography. - fixed a bug filtering out adult-only movies. - removed unused ForeignKeys in the dbschema module. - fixed a bug inserting data in databases that require a commit() call, after a call to executemany(). - fixed a bug inserting aka titles in database that checks for foreign keys consistency. - fixed an obscure bug splitting too huge data sets. - MoviesCache and PersonsCache are now flushed few times. - fixed a bug handling excessive recursion. - improved the exceptions handling. * What's the new in release 2.5 "Ninja Thunderbolt" (15 May 2006) [general] - support for tv series episodes; see the README.series file. - modified the DISCLAIMER.txt file to be compliant to the debian guidelines. - fixed a bug in the get_first_movie.py script. - Movie and Person instances are now hashable, so that they can be used as dictionary keys. - modified functions analyze_title and build_title to support tv episodes. - use isinstance for type checking. - minor updates to the documentation. - the imdbID for Movie and Person instances is now searched if either one of movieID/personID and title/name is provided. - introduced the isSame() method for both Movie and Person classes, useful to compare object by movieID/personID and accessSystem. - __contains__() methods are now recursive. - two new functions in the IMDbBase class, title2imdbID() and name2imdbID() are used to get the imdbID, given a movie title or person name. 
- two new functions in the helpers module, sortedSeasons() and sortedEpisodes(), useful to manage lists/dictionaries of tv series episodes. - in the helpers module, the get_byURL() function can be used to retrieve a Movie or Person object for the given URL. - renamed the "ratober" C module to "cutils". - added CONTRIBUTORS.txt file. [http data access system] - fixed a bug regarding currentRole for tv series. - fixed a bug about the "merchandising links" page. [http and mobile data access systems] - fixed a bug retrieving cover url for tv (mini) series. [mobile data access system] - fixed a bug with tv series titles. - retrieves the number of episodes for tv series. [local data access system] - new get_episodes function in the cutils/ratober C module. - search functions (both C and pure python) are now a lot faster. - updated the documentation with work-arounds to make the mkdb program works with a recent set of plain text data files. [sql data access system] - uses the SQLObject ORM to support a wide range of database engines. - added in the cutils C module the soundex() function, and a fall back Python only version in the parser.sql package. * What's the new in release 2.4 "Munich" (09 Feb 2006) [general] - strings are now unicode/utf8. - unified Movie and Person classes. - the strings used to store every kind of information about movies and person now are modified (substituting titles and names references) only when it's really needed. - speed improvements in functions modifyStrings, sortMovies, canonicalName, analyze_name, analyze_title. - performance improvements in every data access system. - removed the deepcopy of the data, updating Movie and Person information. - moved the "ratober" C module in the imdb.parser.common package, being used by both ""http" and "sql" data access systems. - C functions in the "ratober" module are always case insensitive. 
- the setup.py script contains a work-around to make installation go on even if the "ratober" C module can't be compiled (displaying a warning), since it's now optional. - minor updates to documentation, to keep it in sync with changes in the code. - the new helpers.py module contains functions useful to write IMDbPY-based programs. - new doc file README.utf8, about unicode support. [http data access system] - the ParserBase class now inherits from sgmllib.SGMLParser, instead of htmllib.HTMLParser, resulting in a little improvement in parsing speed. - fixed a bug in the parser for the "news" page for movies and persons. - removed special handlers for entity and chardefs in the HTMLMovieParser class. - fixed bugs related to non-ascii chars. - fixed a bug retrieving the URL of the cover. - fixed a nasty bug retrieving the title field. - retrieve the 'merchandising links' page. - support for the new "episodes cast" page for tv series. - fixed a horrible bug retrieving guests information for tv series. [sql data access system] - fixed the imdbpy2sql.py script, to handle files with spurious lines. - searches for names and titles are now much faster, if the imdb.parser.common.ratober C module is compiled and installed. - imdbpy2sql.py now works also on partial data (i.e. if you've not downloaded every single plain text file). - imdbpy2sql.py considers also a couple of files in the contrib directory. - searching names and titles, only the first 5 chars returned from the SOUNDEX() SQL function are compared. - should works if the database is set to unicode/utf-8. [mobile data access system] - fixed bugs related to non-ascii chars. - fixed a bug retrieving the URL of the cover. - retrieve currentRole/notes also for tv guest appearances. [local data access system] - it can work even if the "ratober" C module is not compiled; obviously the pure python substitute is painfully slow (a warning is issued). 
* What's the new in release 2.3 "Big Fish" (03 Dec 2005) [general] - uniformed numerous keys for Movie and Person objects. - 'birth name' is now always in canonical form, and 'nick names' are always normalized; these changes also affect the sql data access system. [http data access system] - removed the 'imdb mini-biography by' key; the name of the author is now prepended to the 'mini biography' key. - fixed an obscure bug using more than one access system (http in conjunction with mobile or httpThin). - fixed a bug in amazon reviews. [mobile data access system] - corrected some bugs retrieving filmography and cast list. [sql data access system] - remove 'birth name' and 'nick names' from the list of 'akas'. - in the SQL database, 'crewmembers' is now 'miscellaneous crew'. - fixed a bug retrieving "guests" for TV Series. * What's the new in release 2.2 "The Thing" (17 Oct 2005) [general] - now the Person class has a 'billingPos' instance variable used to keep record of the position of the person in the list of credits (as an example, "Laurence Fishburne" is billed in 2nd position in the cast list for the "Matrix, The (1999)" movie. - added two functions to the utils module, to sort respectively movies (by year/title/imdbIndex) and persons (by billingPos/name/imdbIndex). - every data access system support the 'adultSearch' argument and the do_adult_search() method to exclude the adult movies from your searches. By default, adult movies are always listed. - renamed the scripts, appending the ".py" extension. - added an "IMDbPY Powered" logo and a bitmap used by the Windows installer. - now Person and Movie objects always convert name/title to the canonical format (Title, The). - minor changes to the functions used to convert to "canonical format" names and titles; they should be faster and with better matches. - 'title' is the first argument, instancing a Movie object (instead of 'movieID'). 
- 'name' is the first argument, instancing a Movie object (instead of 'personID'). [http data access system] - retrieves the 'guest appearances' page for TV series. - fixed a bug retrieving newsgroup reviews urls. - fixed a bug managing non-breaking spaces (they're truly a damnation!) - fixed a bug with mini TV Series in people's biographies. - now keywords are in format 'bullet-time' and no more 'Bullet Time'. [mobile data access system] - fixed a bug with direct hits, searching for a person's name. - fixed a bug with languages and countries. [local data access system] - now cast entries are correctly sorted. - new search system; it should return better matches in less time (searching people's name is still somewhat slow); it's also possibile to search for "long imdb canonical title/name". - fixed a bug retrieving information about a movie with the same person listed more than one time in a given role/duty (e.g., the same director for different episodes of a TV series). Now it works fine and it should also be a bit faster. - 'notable tv guest appearences' in biography is now a list of Movie objects. - writers are sorted in the right order. [sql data access system] - search results are now sorted in correct order; difflib is used to calculate strings similarity. - new search SQL query and comparison algorithm; it should return much better matches. - searches for only a surname now returns much better results. - fixed a bug in the imdbpy2sql.py script; now movie quotes are correctly managed. - added another role, 'guests', for notable tv guest appearences. - writers are sorted in the right order. - put also the 'birth name' and the 'nick names' in the akanames table. * What's the new in release 2.1 "Madagascar" (30 Aug 2005) [general] - introduced the "sql data access system"; now you can transfer the whole content of the plain text data files (distributed by IMDb) into a SQL database (MySQL, so far). 
- written a tool to insert the plain text data files in a SQL database. - fixed a bug in items() and values() methods of Movie and Person classes. - unified portions of code shared between "local" and "sql". [http data access system] - fixed a bug in the search_movie() and search_person() methods. - parse the "external reviews", "newsgroup reviews", "newsgroup reviews", "misc links", "sound clips", "video clips", "amazon reviews", "news" and "photo sites" pages for movies. - parse the "news" page for persons. - fixed a bug retrieving personID and movieID within namesRefs and titlesRefs. [local data access system] - fixed a bug; 'producer' data where scanned two times. - some tags were missing for the laserdisc entries. [mobile data access system] - fixed a bug retrieving cast information (sometimes introduced with "Cast overview" and sometimes with "Credited cast"). - fixed a bug in the search_movie() and search_person() methods. * What's the new in release 2.0 "Land Of The Dead" (16 Jul 2005) [general] - WARNING! Now, using http and mobile access methods, movie/person searches will include by default adult movie titles/pornstar names. You can still deactivate this feature by setting the adultSearch argument to false, or calling the do_adult_search() method with a false value. - fixed a bug using the 'all' keyword of the 'update' method. [http data access system] - added the "recommendations" page. - the 'notes' instance variable is now correctly used to store miscellaneous information about people in non-cast roles, replacing the 'currentRole' variable. - the adultSearch initialization argument is by default true. - you can supply the proxy to use with the 'proxy' initialization argument. - retrieve the "plot outline" information. - fixed a bug in the BasicMovieParser class, due to changes in the IMDb's html. - the "rating details" parse information about the total number of voters, arithmetic mean, median and so on. 
The values are stored as integers and floats, and no more as strings. - dictionary keys in soundtrack are lowercase. - fixed a bug with empty 'location' information. [mobile data access system] - number of votes, rating and top 250 rank are now integers/floats. - retrieve the "plot outline" information. [local data access system] - number of votes, rating and top 250 rank are now integers/floats. * What's the new in release 1.9 "Ed Wood" (02 May 2005) [general] - introduced the new "mobile" data access system, useful for small systems. It should be from 2 to 20 times faster than "http" or "httpThin". - the "http", "httpThin" and "mobile" data access system can now search for adult movies. See the README.adult file. - now it should works again with python 2.0 and 2.1. - fixed a bug affecting performances/download time. - unified some keywords amongst differents data access systems. [http data access system] - fixed some bugs; now it retrieves names akas correctly. * What's the new in release 1.8 "Paths Of Glory" (24 Mar 2005) [general] - introduced a new data access system "httpThin", useful for systems with limited bandwidth and CPU power, like PDA, hand-held devices and mobile phones. - the setup.py script can be configured to not compile/install the local access system and the example scripts (useful for hand-held devices); introduced setup.cfg and MANIFEST.in files. - updated the list of articles used to manage movie titles. - removed the all_info tuples from Movie and Person classes, since the list of available info sets depends on the access system. I've added two methods to the IMDbBase class, get_movie_infoset() and get_person_infoset(). - removed the IMDbNotAvailable exception. - unified some code in methods get_movie(), get_person() and update() in IMDbBase class. - minor updates to the documentation; added a 46x46 PNG icon. - documentation for small/mobile systems. [Movie class] - renamed the m['notes'] item of Movie objects to m['episodes']. 
[Person class] - the p.__contains__(m) method can be used to check if the p Person has worked in the m Movie. [local data access system] - gather information about "laserdisc", "literature" and "business". - fixed a bug in ratober.c; now the search_name() function handles search strings already in the "Surname, Name" format. - two new methods, get_lastMovieID() and get_lastPersonID(). [http data access system] - limit the number of results for the query; this will save a lot of bandwidth. - fixed a bug retrieving the number of episodes of tv series. - now it retrieves movies information about "technical specifications", "business data", "literature", "soundtrack", "dvd" and "locations". - retrieves people information about "publicity" and "agent". * What's the new in release 1.7 "Saw" (04 Feb 2005) [general] - Person class has two new keys; 'canonical name' and 'long imdb canonical name', like "Gibson, Mel" and "Gibson, Mel (I)". - now titles and names are always internally stored in the canonical format. - search_movie() and search_person() methods return the "read" movieID or personID (handling aliases). - Movie and Person objects have a 'notes' instance attribute, used to specify comments about the role of a person in a movie. The Movie class can also contain a ['notes'] item, used to store information about the runtime; e.g. (26 episodes). - fixed minor bugs in the IMDbBase, Person and Movie classes. - some performance improvements. [http data access system] - fixed bugs retrieving the currentRole. - try to handle unicode chars; return unicode strings when required. - now the searches return also "popular titles" and "popular names" from the new IMDb's search system. [local data access system] - information about movie connections are retrieved. - support for multiple biographies. - now it works with Python 2.2 or previous versions. - fixed a minor glitch in the initialization of the ratober C module. - fixed a pair buffer overflows. 
- fixed some (very rare) infinite loops bugs. - it raises IMDbDataAccessError for (most of) I/O errors. [Movie class] - fixed a bug getting the "long imdb canonical title". * What's the new in release 1.6 "Ninja Commandments" (04 Jan 2005) [general] - now inside Movie and Person object, the text strings (biography, movie plot, etc.) contain titles and names references, like "_Movie, The (1999)_ (qv)" or "'A Person' (qv)"; these reference are transformed at access time with a user defined function. - introduced _get_real_movieID and _get_real_personID methods in the IMDbBase class, to handle title/name aliases for the local access system. - split the _normalize_id method in _normalize_movieID and _normalize_personID. - fixed some bugs. [Movie class] - now you can access the 'canonical title' and 'long imdb canonical title' attributes, to get the movie title in the format "Movie Title, The". [local data access system] - title and name aliases now work correctly. - now get_imdbMovieID and get_imdbPersonID methods should work in almost every case. - people's akas are handled. [http data access system] - now the BasicMovieParser class can correctly gather the imdbID. * What's the new in release 1.5 "The Incredibles" (23 Dec 2004) [local database] - support a local installation of the IMDb database! WOW! Now you can download the plain text data files from http://imdb.com/interfaces.html and access those information through IMDbPY! [general] - movie titles and person names are "fully normalized"; Not "Matrix, The (1999)", but "The Matrix (1999)"; Not "Cruise, Tom" but "Tom Cruise". - get_mop_infoSet() methods can now return a tuple with the dictionary data and a list of information sets they provided. [http data access system] - support for the new search system (yes, another one...) - a lot of small fixes to stay up-to-date with the html of the IMDb web server. 
- modified the personParser module so that it will no more download both "filmoyear" and "maindetails" pages; now only the latter is parsed. - movie search now correctly reports the movie year and index. - gather "locations" information about a movie. - modified the HTMLAwardsParser class so that it doesn't list empty entries. * What's the new in release 1.4 "The Village" (10 Nov 2004) [http data access system] - modified the personParser.HTMLMaindetailsParser class, because IMDb has changed the img tag for the headshot. - now 'archive footage' is handled correctly. [IMDb class] - fixed minor glitches (missing "self" parameter in a couple of methods). [misc] - now distutils installs also the example scripts in ./bin/* * What's the new in release 1.3 "House of 1000 Corpses" (6 Jul 2004) [http data access system] - modified the BasicMovieParser and BasicPersonParser classes, because IMDb has removed the "pageflicker" from the html pages. [general] - the test suite was moved outside the tgz package. * What's the new in release 1.2 "Kill Bill" (2 May 2004) [general] - now it retrieves almost every available information about movie and people! - introduced the concept of "data set", to retrieve different sets of information about a movie/person (so that it's possibile to fetch only the needed information). - introduced a test suite, using the PyUnit (unittest) module. - fixed a nasty typo; the analyze_title and build_title functions now use the strings 'tv mini series' and 'tv series' for the 'kind' key (previously the 'serie' word ws used). - new design; removed the mix-in class and used a factory pattern; imdb.IMDb is now a function, which returns an instance of a class, subclass of imdb.IMDbBase. - introduced the build_name(name_dict) function in the utils module, which takes a dictionary and build a long imdb name. - fixed bugs in the analyze_name function; now it correctly raise an IMDbParserError exception for empty/all spaces strings. 
- now the analyze_title function sets only the meaningful information (i.e.: no 'kind' or 'year' key, if they're not set) [http data access system] - removed all non-greedy regular expressions. - removed all regular expressions in the movieParser module; now self.rawdata is no more used to search "strange" matches. - introduced a ParserBase class, used as base class for the parsers. - retrieve information about the production status (pre-production, announced, in production, etc.) - mpaa is now a string. - now when an IMDbDataAccessError is raised it shows also the used proxy. - minor changes to improve performances in the handle_data method of the HTMLMovieParser class. - minor changes to achieve a major performances improvement in the BasicPersonParser class in the searchPersonParse module. [Movie class] - fixed a bug in isSameTitle method, now the accessSystem is correctly checked. - fixed some typos. [Person class] - minor changes to the isSamePerson method (now it uses the build_name function). * What's the new in release 1.1 "Gigli" (17 Apr 2004) [general] - added support for persons (search & retrieve information about people). - removed the dataSets module. - removed the MovieTitle and the SearchMovieResults classes; now information about the title is stored directly in the Movie object and the search methods return simple lists (of Movie or Person objects). - removed the IMDbTitleError exception. - added the analyze_name() function in the imdb.utils module, which returns a dictionary with the 'name' and 'imdbIndex' keys from the given long imdb name string. [http data access system] - http search uses the new search system. - moved the plotParser module content inside the movieParser module. - fixed a minor bug handling AKAs for movie titles. [IMDb class] - introduced the update(obj) method of the IMDb class, to update the information of the given object (a Movie or Person instance). 
- added the get_imdbURL(obj) method if the IMDb class, which returns the URL of the main IMDb page for the given object (a Movie or Person). - renamed the 'kind' parameter of the IMDb class to 'accessSystem'. [Movie class] - now __str__() returns only the short name; the summary() method returns a pretty-printed string for the Movie object. - persons are no more simple strings, but Person objects (the role/duty is stored in the currentRole variable of the object). - isSameTitle(obj) method to compare two Movie objects even when not all information are gathered. - new __contains__() method, to check is a given person was in a movie. [misc] - updated the documentation. - corrected some syntax/grammar errors. * What's the new in release 1.0 "Equilibrium" (01 Apr 2004) [general] - first public release. - retrieve data only from the web server. - search only for movie titles. IMDbPY-4.9/docs/README.currentRole0000644000000000000000000000732711766731642015251 0ustar rootroot THE currentRole ATTRIBUTE AND THE Character CLASS ================================================= Since version 3.3, IMDbPY supports the character pages of the IMDb database; this required some substantial changes to how actors' and acresses' roles were handled. Starting with release 3.4, "sql" data access system is supported, too - but it works a bit differently from "http" and "mobile". See "SQL" below. The currentRole instance attribute can be found in every instance of Person, Movie and Character classes, even if actually the Character never uses it. The currentRole of a Person object is set to a Character instance, inside a list of person who acted in a given movie. The currentRole of a Movie object is set to a Character instance, inside a list of movies played be given person. The currentRole of a Movie object is set to a Person instance, inside a list of movies in which a given character was portrayed. Schema: movie['cast'][0].currentRole -> a Character object. | +-> a Person object. 
person['actor'][0].currentRole -> a Character object. | +-> a Movie object. character['filmography'][0].currentRole -> a Person object. | +-> a Movie object. The roleID attribute can be used to access/set the characterID or personID instance attribute of the current currentRole. Building Movie or Person objects, you can pass the currentRole parameter and the roleID parameter (to set the ID). The currentRole parameter can be an object (Character or Person), an unicode string (in which case a Character or Person object is automatically instanced) or a list of objects or strings (to handle multiple characters played by the same actor/actress in a movie, or character played by more then a single actor/actress in the same movie). Anyway, currentRole objects (Character or Person instances) can be pretty-printed easily: calling unicode(CharacterOrPersonObject) will return a good-old-unicode string, like expected in the previous version of IMDbPY. SQL === Fetching data from the web, only characters with an active page on the web site will have their characterID; we don't have these information accessing "sql", so _every_ character will have an associated characterID. This way, every character with the same name will share the same characterID, even if - in fact - they may not be portraying the same character. GOODIES ======= To help getting the required information from Movie, Person and Character objects, in the "helpers" module there's a new factory function, makeObject2Txt, which can be used to create your pretty-printing function. It takes some optional parameters: movieTxt, personTxt, characterTxt and companyTxt; in these strings %(value)s items are replaced with object['value'] or with obj.value (if the first is not present). E.g.: import imdb myPrint = imdb.helpers.makeObject2Txt(personTxt=u'%(name)s ... %(currentRole)s') i = imdb.IMDb() m = i.get_movie('0057012') ps = m['cast'][0] print myPrint(ps) # The output will be something like: Peter Sellers ... 
Group Captain Lionel Mandrake / President Merkin Muffley / Dr. Strangelove Portions of the formatting string can be stripped conditionally: if the specified condition is false, they will be cancelled. E.g.: myPrint = imdb.helpers.makeObject2Txt(personTxt='%(long imdb name)s ... %(currentRole)s %(notes)s' Another useful argument is 'applyToValues': if set to a function, it will be applied to every value before the substitution; it can be useful to format strings for html output. IMDbPY-4.9/docs/README.mobile0000644000000000000000000000720311766731642014205 0ustar rootroot IMDbPY FOR SMALL SYSTEMS ======================== Since version 1.8, IMDbPY tries to be usable even on systems with very limited storage space, bandwidth and CPU power, like PDA, hand-held devices and mobile phones. Sections in this file: * INSTALLATION OPTIONS how to save a little space installing IMDbPY. * THE "MOBILE" DATA ACCESS SYSTEM useful for systems with very little CPU power and bandwidth. * THE "HTTPTHIN" DATA ACCESS SYSTEM for systems with normal CPU power, but insufficient bandwidth. * OTHER TIPS Please read all the following section. INSTALLATION OPTIONS ==================== You can call the setup.py script with some arguments: The --without-sql argument, if used, will excludes the parser.sql package; you don't need it if your system does not have any of the SQLObject or SQLAlchemy packages and/or you don't want to store the whole IMDb's plain text database files in a SQL database. Now, if you're installing IMDbPY (using ./setup.py install), you should take a look at some options, like "--no-compile" and "-O0" to exclude pyc and pyo files, saving hundreds of KBs. Moreover, if you're creating a package (rpm, deb or whatever), in the setup.cfg you can exclude from your package things like the documentation (more than 200Kb) and the scripts in the ./bin/ directory. 
THE "MOBILE" DATA ACCESS SYSTEM =============================== Intended to be used with PDA, smart phones and hand-held devices, the "mobile" data access system is a subclass of the default "httpThin" data access system, with some methods replaced with faster string methods, instead of the html parser. Moreover, for the movies, only the main information are retrieved (see the 'httpThin' notes). It should be, at usage time, from 2 to 20 times faster than the "http"/"httpThin" data access system. This code still needs tests on mobile phones! Please report any bugs/ideas/hints... Usage: from imdb import IMDb i = IMDb('mobile') sp = i.search_person('mel gibson', results=10) p = sp[0] i.update(p) sm = i.search_movie('mystic river', results=15) m = sm[0] i.update(m) ...and so on... A GUI for Series 60 smart phones, is available at: http://imdbpy.sourceforge.net/?page=mobile THE "HTTPTHIN" DATA ACCESS SYSTEM ================================= Instead of the default data access system ('http'), you can also use 'httpThin' (or 'webThin' or 'htmlThin'). I.e.: from imdb import IMDb i = IMDb('httpThin') sp = i.search_person('mel gibson', results=10) sm = i.search_movie('mystic river', results=15) ...and so on... The main difference is that, parsing movies' information, the "maindetails" page is parsed, in place of the "combined" page. This reduces the required bandwidth and the CPU power needed. Obviously a lot of information are lost (and only the first 15 people of the cast are listed), but it still retrieves everything you usually need (director, writer, runtime, country, language, akas, etc.) Another difference is that, if the "defaultModFuct" parameter is not provided (as default) calling the IMDb() function, no references to people or movie are collected from textual information (like the plot of a movie). 
OTHER TIPS ========== Remember that, calling the search_movie(), search_episode() and search_person() methods of the "IMDb" object, you can provide a "results" parameter, to download only a limited amount of results (20, by default). With the http, httpThin and mobile data access systems you can set a proxy with the set_proxy() method; e.g.: i = IMDb('http') i.set_proxy('http://localhost:8080/') Remember that the proxy is automatically used if the $HTTP_PROXY environment variable is set. IMDbPY-4.9/docs/imdbpyPowered.png0000644000000000000000000000476011766731642015404 0ustar rootrootPNG  IHDRd#Ua_PLTE]{`~_azR:Qª⾠ߺʺںεμζ۶ư߾ʽ޾&xD~#mC?Ʈھ۲خӢֲʳª۶ת͜zڥu0Qu1SR2v"ԦҬվѪp&6.0gmPi VVUֶjjhº濡orEkeBBBޮδRRQʳӝ{NNN֓uƈdƩJJJ޺⶘©ᒨP}HFFFνưưܐ(+sss}ںֵcb`9^^\A/Jͣv:Cu~@Yɲ}4͖tg~Fwoij~~|o\k5gnԺX|fqX^R<|QLE|FbZVc/)~njy^ӽƎ澾***222O|bzQuDHpCiFoGf+o8ezRWpQwzzxg|e[YW~j-pnkeRffecoDXgrS\:::T5bKGDH pHYs  #utIME  6XIDATxw3A;ϒB@YeIvpB 8lf]{?ɰ7t&  k BJ26F6Yi]S`k3`mY;mp v޽Ww{UU_CBQ_A֑{t_Gkj,-骫ܱ^zs!kl41w4kc:/9/[q_bO^R +I\BNjq[=*U5^T)֒u7l\'&NU[jg ;]]7s{+2'J&l"5"ItUojm~-[[m>'1ޟ/;pl˻qPŮEQh:Қ&oɪ*H'D޾֓~iH'e ?u[WFQWX(r ^Sxh.JqTY (Ip&nS/D==?i`@+u$u|ʦ B+(N^ `iUMn,$щW@ EM^}=|[ 4D JdMg=Xg]I =T}!nV_ؾzĮe{,-q?5!1`X;1(j`\1СJ-GH{߱'p~koD]F%$ {i:F TL R@u)}?؏#z][3ԼgڏZ5'`!TAboPG'} gbH? >HaeKwm$=i./8-mB cfWرOjqn-Ic @8N@`t:!w)qЋ[DvO`wbu+^\O_^?~֌e̿O '|g2iON ɲv{0nGGx勛ʕaB`k'DhKb$ M IpDU@ jHvůNc~ 7?99yȊ:xiW[#BC֖!4`l3QC5gh&06% P2bh%W/}gC:{~XODp ˡNq C'qN^{drtS{eHtA5O#c6MȀx!G>wϢ 6C-1GavAZIS c] $eE#o]WVSͶGx,|svppu=(8͕'MX_5H '9$s!,9)^גm WTSm{WBe1C \.R0F\㓐C.RS`:eN\6g M0pq"/F B!REL m6l0r0!5}T B1!b*i 3*9|B HJ0r%77(0o1/(Bzzf-!wȯ`^ܓl+fi$UGd2û?y?QFeIENDB`IMDbPY-4.9/docs/README.keywords0000644000000000000000000000243411766731642014606 0ustar rootroot KEYWORDS ======== Since version 4.0, it's possible (for every data access system) to search for movies' keywords. People's keywords are not supported. 
SEARCH FOR A KEYWORD SIMILAR TO A GIVEN STRING ============================================== The search_keyword(unicode_string) can be used to search amongst keywords: a list of keywords similar to the given string will be returned, sorted by similarity. Notice that the keywords in the returned list are plain unicode strings, and not instances of some class (like the ones returned by other search_SOMETHING methods). E.g.: from imdb import IMDb ia = IMDb('http') print ia.search_keyword(u'alabama') GET A LIST OF MOVIES FOR A GIVEN KEYWORD ======================================== To get a list of movies that are tagged with the given keyword, use the get_keyword(unicode_string) method. E.g.: from imdb import IMDb ia = IMDb('http') print ia.get_keyword(u'alabama') Beware that by default the list is limited to 100 movies, and it's not possible to get more results, using 'http'. Moreover, the lists returned using 'sql' are not sorted in any way. Another limit is that actually (as of february 2009), the IMDb's web server is unable to serve pages about non-ascii keywords. It's a known problem of their systems. IMDbPY-4.9/docs/imdbpywin.bmp0000644000000000000000000002510211766731642014557 0ustar rootrootBMB*6( & v R~VZbbBBB^J2vj"~n&r^ryRRR222r"j^~.ƶZFnbNNNjr~.jN2~~~j66j6~V^r:nX~l~JJJv~f~RV~~^^^zFvf6j^J~vƾRrb~***.JjfjBnBnRrfVZZfnN~>FFF^~zvj־fzRfffJZrRFz^frjnr^~**vƾf:::bbz^VVVRBF~f~ƶZFnB~vF"vF~jʲrrrfZ:RzfvzVzzzZFƲbbb~*>R2v:.6ʾ^jjj&^zBrbƾfbjvRfNƲj^pRnZvƾRfr~~~F~Rnn*vzRzf¾f^~:Zfv>¦nz$      #    "*! **** ** ***! *Ô* *T**Ô *Ô *ÔÔ! * ** Ô **  * * !* *)uÔÔu*uuuu*uÔÔ** *uÔÔ*uÔÔ *!6 ** ** Ô Ô *  * ** !* Ô** *!* Æ   Æ ÔÔ ** ÔÔ *T *! *   **  * !* ÔÔu *A* Æ  Æ ÔÔ Æ ** ÔÔ Æ ** *!3 Ô  ** ** Ô  T  Ô T !* ÔÔu*uÔÔu *5* uÔÔu uÔu*uÔÔu*uÔu *! ** *  * * Ô ** ** Ô ! 
ÔÔ**Ô**Ô*T* *Ô* * *!***** ** *** ** "  * *%** %*Ô*%     * *   %* ** ** ** ** uÔÔu ** *!%      ** * &%* * * ** ** * * Æ ** Æ ** *&%  *** *  **   * &% * Ô* *uu** ** ÔÔ Æ ** Æ Æu *&% ** Ô *    Ô Ô &%* ÔÔu*uTu* ** ÔÔ Æ ** Æ u *&% ** **   **   * &%* TT Æ ** ** ** * * Æ ** Æ ** *&% **  ** **  ** * &%* ÔÔu ** Ô  ** uÔÔu ** *&% *     * *  &%ÔÔ**ÔÔÔ**Ô*'% ***#****** (&) -  C     .                                                                                                                                  +  S ]JNNJ8uJ'+\JpN8uN,ƥJ  JN J78NN+ 8JNJu7'NJ88JN'J+u+NN'JJee 7e$;J  J$$e7 +zp#S'ȥpp7'SpzJ ppƫ'zppppƥSN++ezppzE_ES8 +S;pN f,+ f#$z;;+7eff$N8e$;;Nzf;f;$ƇJuJ,;f$$;ff;' +$fNNp;J  Jz$fpN,++,$_e78e$$$z;f;f$ESJu'$f;$f_ ǺF$fFNp;z'  J;;peN,+ 'S+ 7e$$f$_Ef;p+;;EEf$}Ǻ$fN C3Lff (o6$$$f;_,' "(QQňQ;pN 0ʂDyσEtWЃstЃ;Ey5WtʴɅo(,zE$f_,  ŋrԋBfu I[<[R/t׃t϶ksВʹfEߨ[55stקksߒy,b{3o ũZPe;fEe7  B\3@ ^S 9yʂ[ߨ$y[[2ЃЃ 2tyy偂y$IPo bݝvf;Nu oBr;u I[[yym2y$RsρՉ[߽[xRs$Ru[yy;.wwL j\},  ?aPP#, Ftymymyߧ󛛃Я`yt_2[[ymx߃Чϧ˽`ty2;zBd46!a 4z;puFaw ;Nu  Ik˃yy[m222קʛ˧_y[2׃tyy;.w "!U|4 ȶf+ |4 f;,J  Issyyymm`i]t[Gtss[[yyk]yG[[$4 $(&q$z'  (q_$;p7  Iϧyϧy[߃]߽mxyy[R"c)4HHffb¡dUڶfpNY¡Hqp;fu  :my[y[mm[ρߨmЛ][y_mmm[xxm2[y[y߿;E"YqO}"q"$fEq1Y"q"S_f_SJ  Oyρtʧmmkk״ymy_25xϨmmpkm2[;$E'Y}ggS7.}IwfFHY}gIB$;+  2[y[y%z[]km_2[yxӨmmp[ϛ]k[2ze'8d}S+ v){P3$f̝ݜŠ{P3ȶ;f$e+   [[y`$`[[2z"mR`y;f[[2'8o1}L'N+8ئ($;~oBǺݦS_$7 炒yy`Ay߽m7mm[y傂G_`AAxzmm_y[[炒[yy`A bf_e҈QJ>~b#bQZf$z8 IR[tyyfϒקRkG[xӨ2[2[ϹקRRk[vjXL(f$EEbHf7 jbH NzN8 I;ѿyϞf[ʒsϹѽϒϒ߽_fyk["j0Xc,f@fz ޱq_zuHp$ff' H׃n烛t%f$z߿GWktt[_߿yk׃t-lP +lCr^'C9rE+CP9eƎ$fEe y?DEЃ[S[[[[y[[xtϮ%Ѓ,2[y[[uܢFI)f#4^SJulF\'L4o-eS  [[[ykt5k嫇e'u[[[[etև=kyy'+[ʫeJ!U|VK&!vow)PeJu |\K7wdw!Xp$ffE [ʒς7yit[ϧЛk7)uyG[[77[tt[ʧЛ y[78!U(ad\U|d37  (a6ƫ+ PX!Uj_E [yy[Wג׃ks'[2m[[ [ג׃kЛ˽[ 4ĵ3dd) 4qLad4 PCd4)b֏v 4ĵP?Ne'8 2yѨ[t[[y뷨yyt[[ϧty ǢvHHdqLdd vH)8u  4 vu78 [ ϒmm[[Չ[[[[yϒmm[Չ[ wB3wddg}oB HOB)3wdH BB)\w  [A2tt222[y[[A[tt2m2[y[ BBBvwBBBvBBr WFhF(ooZ>? 
(Q>B~ XN99Zb IMDbPY-4.9/docs/goodies/0000755000000000000000000000000011766731642013506 5ustar rootrootIMDbPY-4.9/docs/goodies/README.txt0000644000000000000000000000107511766731642015207 0ustar rootroot IMDbPY's goodies ================ Useful shell scripts, especially for developers. See the comments at the top of the files for usage and configuration options. applydiffs.sh: Bash script useful apply patches to a set of IMDb's plain text data files. You can use this script to apply the diffs files distributed on a (more or less) weekly base by IMDb. reduce.sh: Bash script useful to create a "slimmed down" version of the IMDb's plain text data files. It's useful to create shorter versions of the plain text data files, to test the imdbpy2sql.py script faster. IMDbPY-4.9/docs/goodies/reduce.sh0000755000000000000000000000700511766731642015316 0ustar rootroot#!/bin/bash # # reduce.sh: Bash script useful to create a "slimmed down" version of the # IMDb's plain text data files. # # Usage: copy this script in the directory with the plain text data files; # configure the options below and run it. # # Copyright: 2009-2010 Davide Alberani # # This program is released under the terms of the GNU GPL 2 or later license. # # Cygwin packages to install (Windows): # - util-unix for rev # - gzip for gzip, zcat, zgrep # Directory with the plain text data file. ORIG_DIR="." # Directory where "reduced" files will be stored; it will be create if needed. # Beware that this directory is relative to ORIG_DIR. DEST_DIR="./partial/" # How much percentage of the original file to keep. KEEP_X_PERCENT="1" # The compression ratio of the created files. COMPRESSION="1" # - # Nothing to configure below. 
# - cd "$ORIG_DIR" mkdir -p "$DEST_DIR" DIV_BY="`expr 100 / $KEEP_X_PERCENT`" for file in *.gz do LINES="`zcat "$file" | wc -l`" CONSIDER="`expr $LINES / $DIV_BY`" FULL_CONS="$CONSIDER" CONSIDER="`expr $CONSIDER / 2`" NEWNAME="`echo "$file" | rev | cut -c 4- | rev `" # Tries to keep enough lines from the top of the file. MIN_TOP_LINES="`zgrep -n -m 1 "^-----------------------------------------" "$file" | cut -d : -f 1`" if test -z "$MIN_TOP_LINES" ; then MIN_TOP_LINES=0 fi if test "$file" == "business.list.gz" -a $MIN_TOP_LINES -lt 260 ; then MIN_TOP_LINES=260 elif test "$file" == "alternate-versions.list.gz" -a $MIN_TOP_LINES -lt 320 ; then MIN_TOP_LINES=320 elif test "$file" == "cinematographers.list.gz" -a $MIN_TOP_LINES -lt 240 ; then MIN_TOP_LINES=240 elif test "$file" == "complete-cast.list.gz" ; then MIN_TOP_LINES=140 elif test "$file" == "complete-crew.list.gz" ; then MIN_TOP_LINES=150 elif test "$file" == "composers.list.gz" -a $MIN_TOP_LINES -lt 160 ; then MIN_TOP_LINES=160 elif test "$file" == "costume-designers.list.gz" -a $MIN_TOP_LINES -lt 240 ; then MIN_TOP_LINES=240 elif test "$file" == "directors.list.gz" -a $MIN_TOP_LINES -lt 160 ; then MIN_TOP_LINES=160 elif test "$file" == "genres.list.gz" -a $MIN_TOP_LINES -lt 400 ; then MIN_TOP_LINES=400 elif test "$file" == "keywords.list.gz" -a $MIN_TOP_LINES -lt 36000 ; then MIN_TOP_LINES=36000 elif test "$file" == "literature.list.gz" -a $MIN_TOP_LINES -lt 320 ; then MIN_TOP_LINES=320 elif test "$file" == "mpaa-ratings-reasons.list.gz" -a $MIN_TOP_LINES -lt 400 ; then MIN_TOP_LINES=400 elif test "$file" == "producers.list.gz" ; then MIN_TOP_LINES=220 elif test "$file" == "production-companies.list.gz" -a $MIN_TOP_LINES -lt 270 ; then MIN_TOP_LINES=270 elif test "$file" == "production-designers.list.gz" -a $MIN_TOP_LINES -lt 240 ; then MIN_TOP_LINES=240 elif test "$file" == "ratings.list.gz" -a $MIN_TOP_LINES -lt 320 ; then MIN_TOP_LINES=320 elif test "$file" == "special-effects-companies.list.gz" -a 
$MIN_TOP_LINES -lt 320 ; then MIN_TOP_LINES=320 elif test "$file" == "sound-mix.list.gz" -a $MIN_TOP_LINES -lt 340 ; then MIN_TOP_LINES=340 elif test "$file" == "writers.list.gz" ; then MIN_TOP_LINES=400 else MIN_TOP_LINES="`expr $MIN_TOP_LINES + 60`" fi if test "$MIN_TOP_LINES" -gt "$CONSIDER" ; then TOP_CONSIDER=$MIN_TOP_LINES else TOP_CONSIDER=$CONSIDER fi HOW_MANY="`expr $TOP_CONSIDER + $CONSIDER`" echo "Processing $file [$KEEP_X_PERCENT%: $HOW_MANY lines]" zcat "$file" | head -$TOP_CONSIDER > "$DEST_DIR/$NEWNAME" zcat "$file" | tail -$CONSIDER >> "$DEST_DIR/$NEWNAME" gzip -f -$COMPRESSION "$DEST_DIR/$NEWNAME" done IMDbPY-4.9/docs/goodies/applydiffs.sh0000755000000000000000000000276611766731642016221 0ustar rootroot#!/bin/sh # # applydiffs.sh: Bash script useful apply patches to a set of # IMDb's plain text data files. # # Usage: copy this script in the directory with the plain text # data files and run it passing a list of diffs-file(s) as # arguments. # It's possible that the plain text data files will be left # in an inconsistent state, so a backup is probably a good idea. # # Copyright: 2009-2010 Davide Alberani # # This program is released under the terms of the GNU GPL 2 or later license. # if [ $# -lt 1 ] ; then echo "USAGE: $0 diffs-file [diffs-file...]" echo " Beware that diffs-file must be sorted from the older to the newer!" exit 1 fi COMPRESSION="1" ALL_DIFFS="$@" for DIFFS in $@ do rm -rf diffs echo -n "Unpacking $DIFFS..." tar xfz "$DIFFS" echo " done!" for DF in diffs/*.list do fname="`basename $DF`" if [ -f "$fname" ] ; then wasUnpacked=1 applyTo="$fname" elif [ -f "$fname.gz" ] ; then wasUnpacked=0 applyTo="$fname.gz" else echo "NOT applying: $fname doesn't exists." continue fi if [ $wasUnpacked -eq 0 ] ; then echo -n "unzipping $applyTo..." gunzip "$applyTo" echo "done!" fi echo -n "patching $fname with $DF..." patch -s "$fname" "$DF" if [ $? -ne 0 ] ; then echo "FAILED!" continue fi echo "done!" 
done echo "finished with $DIFFS" echo "" done rm -rf diffs for lfile in *.list do echo -n "gzipping $lfile..." gzip -$COMPRESSION "$lfile" echo "done!" done IMDbPY-4.9/docs/README.http0000644000000000000000000000135711766731642013721 0ustar rootroot IMDbPY HTTP CONNECTION ====================== HTTP is the default data access system of IMDbPY, meaning that by default data are requested at the IMDb web servers. For other kinds of data access, see README.sqldb and README.mobile. By default IMDbPY uses its own account to access the IMDb web server (this is done to enable searches on adult titles); if you want to uses your own account, see README.adult. CONNECTION PROBLEMS =================== It has been reported some kind of problems connecting to the IMDb servers; the problem seems to be related to the use of our cookie and the geographical location of the user. If you experience such a problem, report it and try to disable the use of the cookie (to do so, see README.adult). IMDbPY-4.9/docs/README.utf80000644000000000000000000001345611766731642013633 0ustar rootroot UNICODE SUPPORT =============== Starting with release 2.4, IMDbPY internally manages (almost) every string using unicode, with UTF-8 encoding. Since release 3.0, every string containing some sort of information is guarantee to be unicode (notable exceptions are dictionary keywords and movieID/personID, where they are stored as strings). The good: we can correctly manage "foreign" names, titles and other information. Previously every string was stored in bytecode, losing information about the original charset. Without knowing the charset, how can you know that the bytecode string 'Lina Wertm\xfcller' is west-European iso-8859-1 (and so it's "Lina Wertmüller" - if you're reading this file as UTF-8) and not Cyrillic KOI-8-R (resulting in "Lina WertmЭller")? Using unicode, you can store every human language, and show/print every char correctly, provided that your local charset (and font) is right. 
The bad: in primis, performances will suffer: IMDbPY does _a lot_ (and with _a lot_ I mean _A BLOODY DAMN LOT_) of string operations (moving, copying, splitting, searching, slicing, ...) and moving to unicode the slow down will be measurable (and probably noticeable). Moreover, every IMDbPY-base program will need to be modified, because utf-8 chars must be encoded-back to your local charset before they can be printed on screen or on files. The ugly: converting to unicode a program so huge, born without unicode support from start, is prone to errors, bugs, spontaneous combustion and eternal damnation! You can't mix bytecode strings (with unknown charset) and unicode with impunity: an exception will be raised because python doesn't know the encoding of the bytecode string, that must be explicitly specified. INPUT ===== Searching for a movie title or a person name, you (or another program) should pass a unicode string, encoded specifying your local charset. E.g., you're writing on a terminal with iso-8859-1 charset (aka latin-1): >>> from imdb import IMDb >>> ia = IMDb() >>> >>> lat1_str = 'Lina Wertm�ler' # written on a latin-1 terminal >>> utf8_str = unicode(lat1_str, 'iso-8859-1') >>> >>> results = ia.search_person(utf8_str) If you pass a string to search_person(), search_movie() or search_episode() functions, IMDbPY attempts to guess the encoding, using the sys.stdin.encoding or the value returned from the sys.getdefaultencoding function. Trust me: you want to provide an unicode string... Maybe in a future release the IMDb() function can take a "defaultInputEncoding" argument or something. OUTPUT ====== You've searched for a person or a movie, you've retrieved the information you wanted. Cool. Now you're about to print these information to the screen, or send it to a file or over a network socket. Ok, wait a minute. 
Before you proceed, you need to revert back the unicode chars to strings in the charset you will use to display/save/send it: >>> from imdb import IMDb >>> ia = IMDb() >>> >>> gmv_str = unicode('gian maria volonte', 'ascii') # optional, IT'S ascii... >>> gmv = ia.search_person(gmv_str)[0] >>> ia.update(gmv) # fetch the default set of information. >>> >>> gmv['name'] u'Gian Maria Volont\xe9' >>> >>> type(gmv['name']) >>> >>> >>> print gmv['name'] # WRONG: because if you are on an ASCII only terminal... Traceback (most recent call last): File "", line 1, in ? UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 17: ordinal not in range(128) >>> >>> print gmv['name'].encode(yourLocalEncoding, 'replace') # CORRECT. Gian Maria Volonté You have to use the encode() method of unicode strings to obtain a string suited for your local configuration. The encoding depends on your system and on what you've to do with these strings. The second (optional) argument of the encode() method specifies what to do with the unicode chars that cannot be represented in the encoding of your choice. If not specified, a UnicodeEncodeError exception is raised, so be prepared. Other values are 'ignore' to skip these chars, 'replace' to substitute these chars with question marks ('?'), 'xmlcharrefreplace' to replace the chars with XML references (e.g.: "é" for "é"). WRITING IMDbPY-based PROGRAMS ============================= In the imdb.helpers module you can find some functions useful to manage/translate unicode strings in some common situations. RULE OF THUMB ============= Always convert to/from unicode at the I/O level: at the first moment you've got some strings from the user (terminal) or the net (sockets, web forms, whatever). You need to know the encoding of the input, checking sys.stding.encoding, the LANG/LC_* environment variables, the headers of the http request and so on. 
Whenever you're outputting information about movies or persons, convert these unicode string to bytecode strings using the encoding of your output channel (terminal, net, web pages, ...) Remember: "u = unicode(string, inputEncoding)" convert your input string to unicode, "s = u.encode(outputEncoding, manageErrors)" convert unicode strings to your local environment. LINKS ===== * The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!): http://www.joelonsoftware.com/articles/Unicode.html * Python Unicode HOWTO: http://www.amk.ca/python/howto/unicode * Dive Into Python, unicode page: http://diveintopython.org/xml_processing/unicode.html * How to Use UTF-8 with Python: http://evanjones.ca/python-utf8.html * End to End Unicode Web Applications in Python: http://dalchemy.com/opensource/unicodedoc/ IMDbPY-4.9/ez_setup.py0000644000000000000000000002405511766731642013343 0ustar rootroot#!python """Bootstrap setuptools installation If you want to use setuptools in your package's setup.py, just include this file in the same directory with it, and add this to the top of your setup.py:: from ez_setup import use_setuptools use_setuptools() If you want to require a specific version of setuptools, set a download mirror, or use an alternate download directory, you can do so by supplying the appropriate options to ``use_setuptools()``. This file can also be run as a script to install or upgrade setuptools. 
""" import sys DEFAULT_VERSION = "0.6c11" DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] md5_data = { 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090', 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4', 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7', 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5', 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de', 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b', 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2', 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086', 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', 
'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', } import sys, os try: from hashlib import md5 except ImportError: from md5 import md5 def _validate_md5(egg_name, data): if egg_name in md5_data: digest = md5(data).hexdigest() if digest != md5_data[egg_name]: print >>sys.stderr, ( "md5 validation of %s failed! (Possible download problem?)" % egg_name ) sys.exit(2) return data def use_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15 ): """Automatically find/download setuptools and make it available on sys.path `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where setuptools will be downloaded, if it is not already available. If `download_delay` is specified, it should be the number of seconds that will be paused before initiating a download, should one be required. If an older version of setuptools is installed, this routine will print a message to ``sys.stderr`` and raise SystemExit in an attempt to abort the calling script. 
""" was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules def do_download(): egg = download_setuptools(version, download_base, to_dir, download_delay) sys.path.insert(0, egg) import setuptools; setuptools.bootstrap_install_from = egg try: import pkg_resources except ImportError: return do_download() try: pkg_resources.require("setuptools>="+version); return except pkg_resources.VersionConflict, e: if was_imported: print >>sys.stderr, ( "The required version of setuptools (>=%s) is not available, and\n" "can't be installed while this script is running. Please install\n" " a more recent version first, using 'easy_install -U setuptools'." "\n\n(Currently using %r)" ) % (version, e.args[0]) sys.exit(2) else: del pkg_resources, sys.modules['pkg_resources'] # reload ok return do_download() except pkg_resources.DistributionNotFound: return do_download() def download_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, delay = 15 ): """Download setuptools from a specified location and return its filename `version` should be a valid setuptools version number that is available as an egg for download under the `download_base` URL (which should end with a '/'). `to_dir` is the directory where the egg will be downloaded. `delay` is the number of seconds to pause before an actual download attempt. """ import urllib2, shutil egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) url = download_base + egg_name saveto = os.path.join(to_dir, egg_name) src = dst = None if not os.path.exists(saveto): # Avoid repeated downloads try: from distutils import log if delay: log.warn(""" --------------------------------------------------------------------------- This script requires setuptools version %s to run (even to display help). I will attempt to download it for you (from %s), but you may need to enable firewall access for this script first. I will start the download in %d seconds. 
(Note: if this machine does not have network access, please obtain the file %s and place it in this directory before rerunning this script.) ---------------------------------------------------------------------------""", version, download_base, delay, url ); from time import sleep; sleep(delay) log.warn("Downloading %s", url) src = urllib2.urlopen(url) # Read/write all in one block, so we don't create a corrupt file # if the download is interrupted. data = _validate_md5(egg_name, src.read()) dst = open(saveto,"wb"); dst.write(data) finally: if src: src.close() if dst: dst.close() return os.path.realpath(saveto) def main(argv, version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" try: import setuptools except ImportError: egg = None try: egg = download_setuptools(version, delay=0) sys.path.insert(0,egg) from setuptools.command.easy_install import main return main(list(argv)+[egg]) # we're done here finally: if egg and os.path.exists(egg): os.unlink(egg) else: if setuptools.__version__ == '0.0.1': print >>sys.stderr, ( "You have an obsolete version of setuptools installed. Please\n" "remove it from your system entirely before rerunning this script." ) sys.exit(2) req = "setuptools>="+version import pkg_resources try: pkg_resources.require(req) except pkg_resources.VersionConflict: try: from setuptools.command.easy_install import main except ImportError: from easy_install import main main(list(argv)+[download_setuptools(delay=0)]) sys.exit(0) # try to force an exit else: if argv: from setuptools.command.easy_install import main main(argv) else: print "Setuptools version",version,"or greater has been installed." 
print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' def update_md5(filenames): """Update our built-in md5 registry""" import re for name in filenames: base = os.path.basename(name) f = open(name,'rb') md5_data[base] = md5(f.read()).hexdigest() f.close() data = [" %r: %r,\n" % it for it in md5_data.items()] data.sort() repl = "".join(data) import inspect srcfile = inspect.getsourcefile(sys.modules[__name__]) f = open(srcfile, 'rb'); src = f.read(); f.close() match = re.search("\nmd5_data = {\n([^}]+)}", src) if not match: print >>sys.stderr, "Internal error!" sys.exit(2) src = src[:match.start(1)] + repl + src[match.end(1):] f = open(srcfile,'w') f.write(src) f.close() if __name__=='__main__': if len(sys.argv)>2 and sys.argv[1]=='--md5update': update_md5(sys.argv[2:]) else: main(sys.argv[1:]) IMDbPY-4.9/imdb/0000755000000000000000000000000011766731642012040 5ustar rootrootIMDbPY-4.9/imdb/_compat.py0000644000000000000000000000530111766731642014033 0ustar rootroot""" _compat module (imdb package). This module provides compatibility functions used by the imdb package to deal with unusual environments. Copyright 2008-2010 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ # TODO: now we're heavily using the 'logging' module, which was not # present in Python 2.2. 
To work in a Symbian environment, we # need to create a fake 'logging' module (its functions may call # the 'warnings' module, or do nothing at all). import os # If true, we're working on a Symbian device. if os.name == 'e32': # Replace os.path.expandvars and os.path.expanduser, if needed. def _noact(x): """Ad-hoc replacement for IMDbPY.""" return x try: os.path.expandvars except AttributeError: os.path.expandvars = _noact try: os.path.expanduser except AttributeError: os.path.expanduser = _noact # time.strptime is missing, on Symbian devices. import time try: time.strptime except AttributeError: import re _re_web_time = re.compile(r'Episode dated (\d+) (\w+) (\d+)') _re_ptdf_time = re.compile(r'\((\d+)-(\d+)-(\d+)\)') _month2digit = {'January': '1', 'February': '2', 'March': '3', 'April': '4', 'May': '5', 'June': '6', 'July': '7', 'August': '8', 'September': '9', 'October': '10', 'November': '11', 'December': '12'} def strptime(s, format): """Ad-hoc strptime replacement for IMDbPY.""" try: if format.startswith('Episode'): res = _re_web_time.findall(s)[0] return (int(res[2]), int(_month2digit[res[1]]), int(res[0]), 0, 0, 0, 0, 1, 0) else: res = _re_ptdf_time.findall(s)[0] return (int(res[0]), int(res[1]), int(res[2]), 0, 0, 0, 0, 1, 0) except: raise ValueError('error in IMDbPY\'s ad-hoc strptime!') time.strptime = strptime IMDbPY-4.9/imdb/parser/0000755000000000000000000000000011766731642013334 5ustar rootrootIMDbPY-4.9/imdb/parser/__init__.py0000644000000000000000000000200611766731642015443 0ustar rootroot""" parser package (imdb package). This package provides various parsers to access IMDb data (e.g.: a parser for the web/http interface, a parser for the SQL database interface, etc.). So far, the http/httpThin, mobile and sql parsers are implemented. 
Copyright 2004-2009 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ __all__ = ['http', 'mobile', 'sql'] IMDbPY-4.9/imdb/parser/mobile/0000755000000000000000000000000011766731642014603 5ustar rootrootIMDbPY-4.9/imdb/parser/mobile/__init__.py0000644000000000000000000011113211766731642016713 0ustar rootroot""" parser.mobile package (imdb package). This package provides the IMDbMobileAccessSystem class used to access IMDb's data for mobile systems. the imdb.IMDb function will return an instance of this class when called with the 'accessSystem' argument set to "mobile". Copyright 2005-2011 Davide Alberani This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re import logging from urllib import unquote from imdb.Movie import Movie from imdb.utils import analyze_title, analyze_name, canonicalName, \ date_and_notes from imdb._exceptions import IMDbDataAccessError from imdb.parser.http import IMDbHTTPAccessSystem from imdb.parser.http.utils import subXMLRefs, subSGMLRefs, build_person, \ build_movie, re_spaces # XXX NOTE: the first version of this module was heavily based on # regular expressions. This new version replace regexps with # find() strings' method calls; despite being less flexible, it # seems to be at least as fast and, hopefully, much more # lightweight. Yes: the regexp-based version was too heavyweight # for systems with very limited CPU power and memory footprint. re_spacessub = re_spaces.sub # Strip html. re_unhtml = re.compile(r'<.+?>') re_unhtmlsub = re_unhtml.sub # imdb person or movie ids. re_imdbID = re.compile(r'(?<=nm|tt|ch)([0-9]{7})\b') # movie AKAs. re_makas = re.compile('(

.*?

)') # Remove episode numbers. re_filmo_episodes = re.compile('
.*?
', re.M | re.I) def _unHtml(s): """Return a string without tags and no multiple spaces.""" return subSGMLRefs(re_spacessub(' ', re_unhtmlsub('', s)).strip()) _inttype = type(0) def _getTagsWith(s, cont, toClosure=False, maxRes=None): """Return the html tags in the 's' string containing the 'cont' string; if toClosure is True, everything between the opening tag and the closing tag is returned.""" lres = [] bi = s.find(cont) if bi != -1: btag = s[:bi].rfind('<') if btag != -1: if not toClosure: etag = s[bi+1:].find('>') if etag != -1: endidx = bi+2+etag lres.append(s[btag:endidx]) if maxRes is not None and len(lres) >= maxRes: return lres lres += _getTagsWith(s[endidx:], cont, toClosure=toClosure) else: spaceidx = s[btag:].find(' ') if spaceidx != -1: ctag = '' % s[btag+1:btag+spaceidx] closeidx = s[bi:].find(ctag) if closeidx != -1: endidx = bi+closeidx+len(ctag) lres.append(s[btag:endidx]) if maxRes is not None and len(lres) >= maxRes: return lres lres += _getTagsWith(s[endidx:], cont, toClosure=toClosure) return lres def _findBetween(s, begins, ends, beginindx=0, maxRes=None, lres=None): """Return the list of strings from the 's' string which are included between the 'begins' and 'ends' strings.""" if lres is None: lres = [] bi = s.find(begins, beginindx) if bi != -1: lbegins = len(begins) if isinstance(ends, (list, tuple)): eset = [s.find(end, bi+lbegins) for end in ends] eset[:] = [x for x in eset if x != -1] if not eset: ei = -1 else: ei = min(eset) else: ei = s.find(ends, bi+lbegins) if ei != -1: match = s[bi+lbegins:ei] lres.append(match) if maxRes is not None and len(lres) >= maxRes: return lres _findBetween(s, begins, ends, beginindx=ei, maxRes=maxRes, lres=lres) return lres class IMDbMobileAccessSystem(IMDbHTTPAccessSystem): """The class used to access IMDb's data through the web for mobile terminals.""" accessSystem = 'mobile' _mobile_logger = logging.getLogger('imdbpy.parser.mobile') def __init__(self, isThin=0, *arguments, **keywords): self.accessSystem 
= 'mobile' IMDbHTTPAccessSystem.__init__(self, isThin, *arguments, **keywords) def _clean_html(self, html): """Normalize the retrieve html.""" html = re_spaces.sub(' ', html) # Remove silly  » chars. html = html.replace(' »', '') return subXMLRefs(html) def _mretrieve(self, url, size=-1): """Retrieve an html page and normalize it.""" cont = self._retrieve(url, size=size) return self._clean_html(cont) def _getPersons(self, s, sep='
'): """Return a list of Person objects, from the string s; items are assumed to be separated by the sep string.""" names = s.split(sep) pl = [] plappend = pl.append counter = 1 for name in names: pid = re_imdbID.findall(name) if not pid: continue characters = _getTagsWith(name, 'class="char"', toClosure=True, maxRes=1) chpids = [] if characters: for ch in characters[0].split(' / '): chid = re_imdbID.findall(ch) if not chid: chpids.append(None) else: chpids.append(chid[-1]) if not chpids: chpids = None elif len(chpids) == 1: chpids = chpids[0] name = _unHtml(name) # Catch unclosed tags. gt_indx = name.find('>') if gt_indx != -1: name = name[gt_indx+1:].lstrip() if not name: continue if name.endswith('...'): name = name[:-3] p = build_person(name, personID=str(pid[0]), billingPos=counter, modFunct=self._defModFunct, roleID=chpids, accessSystem=self.accessSystem) plappend(p) counter += 1 return pl def _search_movie(self, title, results): ##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title}) ##params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results)) ##cont = self._mretrieve(imdbURL_search % params) cont = subXMLRefs(self._get_search_content('tt', title, results)) title = _findBetween(cont, '', '', maxRes=1) res = [] if not title: self._mobile_logger.error('no title tag searching for movie %s', title) return res tl = title[0].lower() if not tl.startswith('imdb title'): # a direct hit! title = _unHtml(title[0]) mid = None midtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1) if midtag: mid = _findBetween(midtag[0], '/title/tt', '/', maxRes=1) if not (mid and title): self._mobile_logger.error('no direct hit title/movieID for' \ ' title %s', title) return res if cont.find('TV mini-series') != -1: title += ' (mini)' res[:] = [(str(mid[0]), analyze_title(title))] else: # XXX: this results*3 prevents some recursion errors, but... # it's not exactly understandable (i.e.: why 'results' is # not enough to get all the results?) 
lis = _findBetween(cont, 'td valign="top">', '', maxRes=results*3) for li in lis: akas = re_makas.findall(li) for idx, aka in enumerate(akas): aka = aka.replace('" - ', '::', 1) aka = _unHtml(aka) if aka.startswith('aka "'): aka = aka[5:].strip() if aka[-1] == '"': aka = aka[:-1] akas[idx] = aka imdbid = re_imdbID.findall(li) li = re_makas.sub('', li) mtitle = _unHtml(li) if not (imdbid and mtitle): self._mobile_logger.debug('no title/movieID parsing' \ ' %s searching for title %s', li, title) continue mtitle = mtitle.replace('(TV mini-series)', '(mini)') resd = analyze_title(mtitle) if akas: resd['akas'] = akas res.append((str(imdbid[0]), resd)) return res def get_movie_main(self, movieID): cont = self._mretrieve(self.urls['movie_main'] % movieID + 'maindetails') title = _findBetween(cont, '', '', maxRes=1) if not title: raise IMDbDataAccessError('unable to get movieID "%s"' % movieID) title = _unHtml(title[0]) if title.endswith(' - IMDb'): title = title[:-7] if cont.find('TV mini-series') != -1: title += ' (mini)' d = analyze_title(title) kind = d.get('kind') tv_series = _findBetween(cont, 'TV Series:', '', maxRes=1) if tv_series: mid = re_imdbID.findall(tv_series[0]) else: mid = None if tv_series and mid: s_title = _unHtml(tv_series[0]) s_data = analyze_title(s_title) m = Movie(movieID=str(mid[0]), data=s_data, accessSystem=self.accessSystem, modFunct=self._defModFunct) d['kind'] = kind = u'episode' d['episode of'] = m if kind in ('tv series', 'tv mini series'): years = _findBetween(cont, '

', '

', maxRes=1) if years: years[:] = _findBetween(years[0], 'TV series', '', maxRes=1) if years: d['series years'] = years[0].strip() air_date = _findBetween(cont, 'Original Air Date:', '
', maxRes=1) if air_date: air_date = air_date[0] vi = air_date.find('(') if vi != -1: date = _unHtml(air_date[:vi]).strip() if date != '????': d['original air date'] = date air_date = air_date[vi:] season = _findBetween(air_date, 'Season', ',', maxRes=1) if season: season = season[0].strip() try: season = int(season) except: pass if season or type(season) is _inttype: d['season'] = season episode = _findBetween(air_date, 'Episode', ')', maxRes=1) if episode: episode = episode[0].strip() try: episode = int(episode) except: pass if episode or type(season) is _inttype: d['episode'] = episode direct = _findBetween(cont, '
Director', ('', '

'), maxRes=1) if direct: direct = direct[0] h5idx = direct.find('/h5>') if h5idx != -1: direct = direct[h5idx+4:] direct = self._getPersons(direct) if direct: d['director'] = direct if kind in ('tv series', 'tv mini series', 'episode'): if kind != 'episode': seasons = _findBetween(cont, 'Seasons:
', '', maxRes=1) if seasons: d['number of seasons'] = seasons[0].count('|') + 1 creator = _findBetween(cont, 'Created by', ('class="tn15more"', '', '

'), maxRes=1) if not creator: # They change 'Created by' to 'Creator' and viceversa # from time to time... # XXX: is 'Creators' also used? creator = _findBetween(cont, 'Creator:', ('class="tn15more"', '', '

'), maxRes=1) if creator: creator = creator[0] if creator.find('tn15more'): creator = '%s>' % creator creator = self._getPersons(creator) if creator: d['creator'] = creator writers = _findBetween(cont, '
Writer', ('', '

'), maxRes=1) if writers: writers = writers[0] h5idx = writers.find('/h5>') if h5idx != -1: writers = writers[h5idx+4:] writers = self._getPersons(writers) if writers: d['writer'] = writers cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1) if cvurl: cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1) if cvurl: d['cover url'] = cvurl[0] genres = _findBetween(cont, 'href="/genre/', '"') if genres: d['genres'] = list(set(genres)) ur = _findBetween(cont, 'id="star-bar-user-rate">', '', maxRes=1) if ur: rat = _findBetween(ur[0], '', '', maxRes=1) if rat: if rat: d['rating'] = rat[0].strip() else: self._mobile_logger.warn('wrong rating: %s', rat) vi = ur[0].rfind('href="ratings"') if vi != -1 and ur[0][vi+10:].find('await') == -1: try: votes = _findBetween(ur[0][vi:], "title='", " IMDb", maxRes=1) votes = int(votes[0].replace(',', '')) d['votes'] = votes except (ValueError, IndexError): self._mobile_logger.warn('wrong votes: %s', ur) top250 = _findBetween(cont, 'href="/chart/top?', '', maxRes=1) if top250: fn = top250[0].rfind('#') if fn != -1: try: td = int(top250[0][fn+1:]) d['top 250 rank'] = td except ValueError: self._mobile_logger.warn('wrong top250: %s', top250) castdata = _findBetween(cont, 'Cast overview', '', maxRes=1) if not castdata: castdata = _findBetween(cont, 'Credited cast', '', maxRes=1) if not castdata: castdata = _findBetween(cont, 'Complete credited cast', '', maxRes=1) if not castdata: castdata = _findBetween(cont, 'Series Cast Summary', '', maxRes=1) if not castdata: castdata = _findBetween(cont, 'Episode Credited cast', '', maxRes=1) if castdata: castdata = castdata[0] # Reintegrate the fist tag. fl = castdata.find('href=') if fl != -1: castdata = '') if smib != -1: smie = castdata.rfind('') if smie != -1: castdata = castdata[:smib].strip() + \ castdata[smie+18:].strip() castdata = castdata.replace('/tr> ', '', maxRes=1) if akas: # For some reason, here
is still used in place of
. akas[:] = [x for x in akas[0].split('
') if x.strip()] akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip() for x in akas] if 'See more' in akas: akas.remove('See more') akas[:] = [x for x in akas if x] if akas: d['akas'] = akas mpaa = _findBetween(cont, 'MPAA
:', '', maxRes=1) if mpaa: d['mpaa'] = _unHtml(mpaa[0]) runtimes = _findBetween(cont, 'Runtime:
', '', maxRes=1) if runtimes: runtimes = runtimes[0] runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1) for x in runtimes.split('|')] d['runtimes'] = [_unHtml(x).strip() for x in runtimes] if kind == 'episode': # number of episodes. epsn = _findBetween(cont, 'title="Full Episode List">', '', maxRes=1) if epsn: epsn = epsn[0].replace(' Episodes', '').strip() if epsn: try: epsn = int(epsn) except: self._mobile_logger.warn('wrong episodes #: %s', epsn) d['number of episodes'] = epsn country = _findBetween(cont, 'Country:', '', maxRes=1) if country: country[:] = country[0].split(' | ') country[:] = ['', '::')) for x in country] if country: d['countries'] = country lang = _findBetween(cont, 'Language:', '', maxRes=1) if lang: lang[:] = lang[0].split(' | ') lang[:] = ['', '::')) for x in lang] if lang: d['languages'] = lang col = _findBetween(cont, '"/search/title?colors=', '') if col: col[:] = col[0].split(' | ') col[:] = ['', '::')) for x in col] if col: d['color info'] = col sm = _findBetween(cont, '/search/title?sound_mixes=', '', maxRes=1) if sm: sm[:] = sm[0].split(' | ') sm[:] = ['', '::')) for x in sm] if sm: d['sound mix'] = sm cert = _findBetween(cont, 'Certification:', '', maxRes=1) if cert: cert[:] = cert[0].split(' | ') cert[:] = [_unHtml(x.replace(' ', '::')) for x in cert] if cert: d['certificates'] = cert plotoutline = _findBetween(cont, 'Plot:', [''], maxRes=1) if plotoutline: plotoutline = plotoutline[0].strip() plotoutline = plotoutline.rstrip('|').rstrip() if plotoutline: d['plot outline'] = _unHtml(plotoutline) aratio = _findBetween(cont, 'Aspect Ratio:', [''], maxRes=1) if aratio: aratio = aratio[0].strip().replace(' (', '::(', 1) if aratio: d['aspect ratio'] = _unHtml(aratio) return {'data': d} def get_movie_plot(self, movieID): cont = self._mretrieve(self.urls['movie_main'] % movieID + 'plotsummary') plot = _findBetween(cont, '

', '

') plot[:] = [_unHtml(x) for x in plot] for i in xrange(len(plot)): p = plot[i] wbyidx = p.rfind(' Written by ') if wbyidx != -1: plot[i] = '%s::%s' % \ (p[:wbyidx].rstrip(), p[wbyidx+12:].rstrip().replace('{','<').replace('}','>')) if plot: return {'data': {'plot': plot}} return {'data': {}} def _search_person(self, name, results): ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name}) ##params = 'q=%s&nm=on&mx=%s' % (urllib.quote_plus(name), str(results)) ##cont = self._mretrieve(imdbURL_search % params) cont = subXMLRefs(self._get_search_content('nm', name, results)) name = _findBetween(cont, '', '', maxRes=1) res = [] if not name: self._mobile_logger.warn('no title tag searching for name %s', name) return res nl = name[0].lower() if not nl.startswith('imdb name'): # a direct hit! name = _unHtml(name[0]) name = name.replace('- Filmography by type' , '').strip() pid = None pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1) if pidtag: pid = _findBetween(pidtag[0], '/name/nm', '/', maxRes=1) if not (pid and name): self._mobile_logger.error('no direct hit name/personID for' \ ' name %s', name) return res res[:] = [(str(pid[0]), analyze_name(name, canonical=1))] else: lis = _findBetween(cont, 'td valign="top">', '', maxRes=results*3) for li in lis: akas = _findBetween(li, '"', '"') for sep in [' aka', '
birth name']: sepIdx = li.find(sep) if sepIdx != -1: li = li[:sepIdx] pid = re_imdbID.findall(li) pname = _unHtml(li) if not (pid and pname): self._mobile_logger.debug('no name/personID parsing' \ ' %s searching for name %s', li, name) continue resd = analyze_name(pname, canonical=1) if akas: resd['akas'] = akas res.append((str(pid[0]), resd)) return res def get_person_main(self, personID, _parseChr=False): if not _parseChr: url = self.urls['person_main'] % personID + 'maindetails' else: url = self.urls['character_main'] % personID s = self._mretrieve(url) r = {} name = _findBetween(s, '', '', maxRes=1) if not name: if _parseChr: w = 'characterID' else: w = 'personID' raise IMDbDataAccessError('unable to get %s "%s"' % (w, personID)) name = _unHtml(name[0].replace(' - IMDb', '')) if _parseChr: name = name.replace('(Character)', '').strip() name = name.replace('- Filmography by type', '').strip() else: name = name.replace('- Filmography by', '').strip() r = analyze_name(name, canonical=not _parseChr) for dKind in ('Born', 'Died'): date = _findBetween(s, '%s:' % dKind.capitalize(), ('
', '

'), maxRes=1) if date: date = _unHtml(date[0]) if date: #date, notes = date_and_notes(date) # TODO: fix to handle real names. date_notes = date.split(' in ', 1) notes = u'' date = date_notes[0] if len(date_notes) == 2: notes = date_notes[1] dtitle = 'birth' if dKind == 'Died': dtitle = 'death' if date: r['%s date' % dtitle] = date if notes: r['%s notes' % dtitle] = notes akas = _findBetween(s, 'Alternate Names:', ('
', '

'), maxRes=1) if akas: akas = akas[0] if akas: akas = _unHtml(akas) if akas.find(' | ') != -1: akas = akas.split(' | ') else: akas = akas.split(' / ') if akas: r['akas'] = filter(None, [x.strip() for x in akas]) hs = _findBetween(s, "rel='image_src'", '>', maxRes=1) if not hs: hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1) if not hs: hs = _findBetween(s, '
', maxRes=1) if hs: hsl = _findBetween(hs[0], "href='", "'", maxRes=1) if not hsl: hsl = _findBetween(hs[0], 'href="', '"', maxRes=1) if hsl and 'imdb-share-logo' not in hsl[0]: r['headshot'] = hsl[0] # Build a list of tuples such [('hrefLink', 'section name')] workkind = _findBetween(s, 'id="jumpto_', '') ws = [] for work in workkind: sep = '" >' if '">' in work: sep = '">' wsplit = work.split(sep, 1) if len(wsplit) == 2: sect = wsplit[0] if '"' in sect: sect = sect[:sect.find('"')] ws.append((sect, wsplit[1].lower())) # XXX: I think "guest appearances" are gone. if s.find(' tag. if _parseChr and sect == 'filmography': inisect = s.find('
') else: inisect = s.find('',)) for m in mlist: fCB = m.find('>') if fCB != -1: m = m[fCB+1:].lstrip() m = re_filmo_episodes.sub('', m) # For every movie in the current section. movieID = re_imdbID.findall(m) if not movieID: self._mobile_logger.debug('no movieID in %s', m) continue m = m.replace('
', ' .... ', 1) if not _parseChr: chrIndx = m.find(' .... ') else: chrIndx = m.find(' Played by ') chids = [] if chrIndx != -1: chrtxt = m[chrIndx+6:] if _parseChr: chrtxt = chrtxt[5:] for ch in chrtxt.split(' / '): chid = re_imdbID.findall(ch) if not chid: chids.append(None) else: chids.append(chid[-1]) if not chids: chids = None elif len(chids) == 1: chids = chids[0] movieID = str(movieID[0]) # Search the status. stidx = m.find('') status = u'' if stidx != -1: stendidx = m.rfind('') if stendidx != -1: status = _unHtml(m[stidx+3:stendidx]) m = m.replace(m[stidx+3:stendidx], '') year = _findBetween(m, 'year_column">', '', maxRes=1) if year: year = year[0] m = m.replace('%s' % year, '') else: year = None m = _unHtml(m) if not m: self._mobile_logger.warn('no title for movieID %s', movieID) continue movie = build_movie(m, movieID=movieID, status=status, roleID=chids, modFunct=self._defModFunct, accessSystem=self.accessSystem, _parsingCharacter=_parseChr, year=year) sectName = sectName.split(':')[0] r.setdefault(sectName, []).append(movie) # If available, take the always correct name from a form. 
itag = _getTagsWith(s, 'NAME="primary"', maxRes=1) if not itag: itag = _getTagsWith(s, 'name="primary"', maxRes=1) if itag: vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1) if not vtag: vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1) if vtag: try: vtag = unquote(str(vtag[0])) vtag = unicode(vtag, 'latin_1') r.update(analyze_name(vtag)) except UnicodeEncodeError: pass return {'data': r, 'info sets': ('main', 'filmography')} def get_person_biography(self, personID): cont = self._mretrieve(self.urls['person_main'] % personID + 'bio') d = {} spouses = _findBetween(cont, 'Spouse', ('', ''), maxRes=1) if spouses: sl = [] for spouse in spouses[0].split(''): if spouse.count('') > 1: spouse = spouse.replace('', '::', 1) spouse = _unHtml(spouse) spouse = spouse.replace(':: ', '::').strip() if spouse: sl.append(spouse) if sl: d['spouse'] = sl nnames = _findBetween(cont, '
Nickname
', ('

','
'), maxRes=1) if nnames: nnames = nnames[0] if nnames: nnames = [x.strip().replace(' (', '::(', 1) for x in nnames.split('
')] if nnames: d['nick names'] = nnames misc_sects = _findBetween(cont, '
', '
') misc_sects[:] = [x.split('
') for x in misc_sects] misc_sects[:] = [x for x in misc_sects if len(x) == 2] for sect, data in misc_sects: sect = sect.lower().replace(':', '').strip() if d.has_key(sect) and sect != 'mini biography': continue elif sect in ('spouse', 'nickname'): continue if sect == 'salary': sect = 'salary history' elif sect == 'where are they now': sect = 'where now' elif sect == 'personal quotes': sect = 'quotes' data = data.replace('

', '::') data = data.replace('

', ' ') # for multi-paragraphs 'bio' data = data.replace(' ', '@@@@') data = data.replace(' ', '::') data = _unHtml(data) data = [x.strip() for x in data.split('::')] data[:] = [x.replace('@@@@', '::') for x in data if x] if sect == 'height' and data: data = data[0] elif sect == 'birth name': data = canonicalName(data[0]) elif sect == 'date of birth': date, notes = date_and_notes(data[0]) if date: d['birth date'] = date if notes: d['birth notes'] = notes continue elif sect == 'date of death': date, notes = date_and_notes(data[0]) if date: d['death date'] = date if notes: d['death notes'] = notes continue elif sect == 'mini biography': ndata = [] for bio in data: byidx = bio.rfind('IMDb Mini Biography By') if byidx != -1: bioAuth = bio[:byidx].rstrip() else: bioAuth = 'Anonymous' bio = u'%s::%s' % (bioAuth, bio[byidx+23:].lstrip()) ndata.append(bio) data[:] = ndata if 'mini biography' in d: d['mini biography'].append(ndata[0]) continue d[sect] = data return {'data': d} def _search_character(self, name, results): cont = subXMLRefs(self._get_search_content('char', name, results)) name = _findBetween(cont, '', '', maxRes=1) res = [] if not name: self._mobile_logger.error('no title tag searching character %s', name) return res nl = name[0].lower() if not (nl.startswith('imdb search') or nl.startswith('imdb search') \ or nl.startswith('imdb character')): # a direct hit! name = _unHtml(name[0]).replace('(Character)', '').strip() pid = None pidtag = _getTagsWith(cont, 'rel="canonical"', maxRes=1) if pidtag: pid = _findBetween(pidtag[0], '/character/ch', '/', maxRes=1) if not (pid and name): self._mobile_logger.error('no direct hit name/characterID for' \ ' character %s', name) return res res[:] = [(str(pid[0]), analyze_name(name))] else: sects = _findBetween(cont, 'Popular Characters', '', maxRes=results*3) sects += _findBetween(cont, 'Characters', '', maxRes=results*3) for sect in sects: lis = _findBetween(sect, '
', ('', '

'), maxRes=1) if intro: intro = _unHtml(intro[0]).strip() if intro: d['introduction'] = intro tocidx = cont.find(' 2008 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from utils import build_movie, Attribute, Extractor, DOMParserBase, \ analyze_imdbid from imdb.utils import analyze_company_name class DOMCompanyParser(DOMParserBase): """Parser for the main page of a given company. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: cparser = DOMCompanyParser() result = cparser.parse(company_html_string) """ _containsObjects = True extractors = [ Extractor(label='name', path="//title", attrs=Attribute(key='name', path="./text()", postprocess=lambda x: \ analyze_company_name(x, stripNotes=True))), Extractor(label='filmography', group="//b/a[@name]", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="../following-sibling::ol[1]/li", attrs=Attribute(key=None, multi=True, path={ 'link': "./a[1]/@href", 'title': "./a[1]/text()", 'year': "./text()[1]" }, postprocess=lambda x: build_movie(u'%s %s' % \ (x.get('title'), x.get('year').strip()), movieID=analyze_imdbid(x.get('link') or u''), _parsingCompany=True))), ] preprocessors = [ (re.compile('(\1') ] def postprocess_data(self, data): for key in data.keys(): new_key = key.replace('company', 'companies') new_key = new_key.replace('other', 'miscellaneous') new_key = new_key.replace('distributor', 'distributors') if new_key != key: data[new_key] = data[key] del data[key] return data _OBJECTS = { 'company_main_parser': ((DOMCompanyParser,), None) } IMDbPY-4.9/imdb/parser/http/movieParser.py0000644000000000000000000023147711766731642017177 0ustar rootroot""" parser.http.movieParser module (imdb package). This module provides the classes (and the instances), used to parse the IMDb pages on the akas.imdb.com server about a movie. E.g., for Brian De Palma's "The Untouchables", the referred pages would be: combined details: http://akas.imdb.com/title/tt0094226/combined plot summary: http://akas.imdb.com/title/tt0094226/plotsummary ...and so on... Copyright 2004-2012 Davide Alberani 2008 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re import urllib from imdb import imdbURL_base from imdb.Person import Person from imdb.Movie import Movie from imdb.Company import Company from imdb.utils import analyze_title, split_company_name_notes, _Container from utils import build_person, DOMParserBase, Attribute, Extractor, \ analyze_imdbid # Dictionary used to convert some section's names. _SECT_CONV = { 'directed': 'director', 'directed by': 'director', 'directors': 'director', 'editors': 'editor', 'writing credits': 'writer', 'writers': 'writer', 'produced': 'producer', 'cinematography': 'cinematographer', 'film editing': 'editor', 'casting': 'casting director', 'costume design': 'costume designer', 'makeup department': 'make up', 'production management': 'production manager', 'second unit director or assistant director': 'assistant director', 'costume and wardrobe department': 'costume department', 'sound department': 'sound crew', 'stunts': 'stunt performer', 'other crew': 'miscellaneous crew', 'also known as': 'akas', 'country': 'countries', 'runtime': 'runtimes', 'language': 'languages', 'certification': 'certificates', 'genre': 'genres', 'created': 'creator', 'creators': 'creator', 'color': 'color info', 'plot': 'plot outline', 'seasons': 'number of seasons', 'art directors': 'art direction', 'assistant directors': 'assistant director', 'set decorators': 'set decoration', 'visual effects department': 'visual effects', 'production managers': 'production manager', 'miscellaneous': 'miscellaneous crew', 'make up department': 'make up', 'plot summary': 'plot outline', 
'cinematographers': 'cinematographer', 'camera department': 'camera and electrical department', 'costume designers': 'costume designer', 'production designers': 'production design', 'production managers': 'production manager', 'music original': 'original music', 'casting directors': 'casting director', 'other companies': 'miscellaneous companies', 'producers': 'producer', 'special effects by': 'special effects department', 'special effects': 'special effects companies' } def _manageRoles(mo): """Perform some transformation on the html, so that roleIDs can be easily retrieved.""" firstHalf = mo.group(1) secondHalf = mo.group(2) newRoles = [] roles = secondHalf.split(' / ') for role in roles: role = role.strip() if not role: continue roleID = analyze_imdbid(role) if roleID is None: roleID = u'/' else: roleID += u'/' newRoles.append(u'
%s
' % \ (roleID, role.strip())) return firstHalf + u' / '.join(newRoles) + mo.group(3) _reRolesMovie = re.compile(r'(
)', re.I | re.M | re.S) def _replaceBR(mo): """Replaces
tags with '::' (useful for some akas)""" txt = mo.group(0) return txt.replace('
', '::') _reAkas = re.compile(r'
also known as:
.*?', re.I | re.M | re.S) def makeSplitter(lstrip=None, sep='|', comments=True, origNotesSep=' (', newNotesSep='::(', strip=None): """Return a splitter function suitable for a given set of data.""" def splitter(x): if not x: return x x = x.strip() if not x: return x if lstrip is not None: x = x.lstrip(lstrip).lstrip() lx = x.split(sep) lx[:] = filter(None, [j.strip() for j in lx]) if comments: lx[:] = [j.replace(origNotesSep, newNotesSep, 1) for j in lx] if strip: lx[:] = [j.strip(strip) for j in lx] return lx return splitter def _toInt(val, replace=()): """Return the value, converted to integer, or None; if present, 'replace' must be a list of tuples of values to replace.""" for before, after in replace: val = val.replace(before, after) try: return int(val) except (TypeError, ValueError): return None class DOMHTMLMovieParser(DOMParserBase): """Parser for the "combined details" (and if instance.mdparse is True also for the "main details") page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: mparser = DOMHTMLMovieParser() result = mparser.parse(combined_details_html_string) """ _containsObjects = True extractors = [Extractor(label='title', path="//h1", attrs=Attribute(key='title', path=".//text()", postprocess=analyze_title)), Extractor(label='glossarysections', group="//a[@class='glossary']", group_key="./@name", group_key_normalize=lambda x: x.replace('_', ' '), path="../../../..//tr", attrs=Attribute(key=None, multi=True, path={'person': ".//text()", 'link': "./td[1]/a[@href]/@href"}, postprocess=lambda x: \ build_person(x.get('person') or u'', personID=analyze_imdbid(x.get('link'))) )), Extractor(label='cast', path="//table[@class='cast']//tr", attrs=Attribute(key="cast", multi=True, path={'person': ".//text()", 'link': "td[2]/a/@href", 'roleID': \ "td[4]/div[@class='_imdbpyrole']/@roleid"}, postprocess=lambda x: \ build_person(x.get('person') or u'', personID=analyze_imdbid(x.get('link')), roleID=(x.get('roleID') or u'').split('/')) )), Extractor(label='genres', path="//div[@class='info']//a[starts-with(@href," \ " '/Sections/Genres')]", attrs=Attribute(key="genres", multi=True, path="./text()")), Extractor(label='h5sections', path="//div[@class='info']/h5/..", attrs=[ Attribute(key="plot summary", path="./h5[starts-with(text(), " \ "'Plot:')]/../div/text()", postprocess=lambda x: \ x.strip().rstrip('|').rstrip()), Attribute(key="aspect ratio", path="./h5[starts-with(text()," \ " 'Aspect')]/../div/text()", postprocess=lambda x: x.strip()), Attribute(key="mpaa", path="./h5/a[starts-with(text()," \ " 'MPAA')]/../../div/text()", postprocess=lambda x: x.strip()), Attribute(key="countries", path="./h5[starts-with(text(), " \ "'Countr')]/../div[@class='info-content']//text()", postprocess=makeSplitter('|')), Attribute(key="language", path="./h5[starts-with(text(), " \ "'Language')]/..//text()", postprocess=makeSplitter('Language:')), Attribute(key='color info', path="./h5[starts-with(text(), " \ "'Color')]/..//text()", 
postprocess=makeSplitter('Color:')), Attribute(key='sound mix', path="./h5[starts-with(text(), " \ "'Sound Mix')]/..//text()", postprocess=makeSplitter('Sound Mix:')), # Collects akas not encosed in tags. Attribute(key='other akas', path="./h5[starts-with(text(), " \ "'Also Known As')]/../div//text()", postprocess=makeSplitter(sep='::', origNotesSep='" - ', newNotesSep='::', strip='"')), Attribute(key='runtimes', path="./h5[starts-with(text(), " \ "'Runtime')]/../div/text()", postprocess=makeSplitter()), Attribute(key='certificates', path="./h5[starts-with(text(), " \ "'Certificat')]/..//text()", postprocess=makeSplitter('Certification:')), Attribute(key='number of seasons', path="./h5[starts-with(text(), " \ "'Seasons')]/..//text()", postprocess=lambda x: x.count('|') + 1), Attribute(key='original air date', path="./h5[starts-with(text(), " \ "'Original Air Date')]/../div/text()"), Attribute(key='tv series link', path="./h5[starts-with(text(), " \ "'TV Series')]/..//a/@href"), Attribute(key='tv series title', path="./h5[starts-with(text(), " \ "'TV Series')]/..//a/text()") ]), Extractor(label='language codes', path="//h5[starts-with(text(), 'Language')]/..//a[starts-with(@href, '/language/')]", attrs=Attribute(key='language codes', multi=True, path="./@href", postprocess=lambda x: x.split('/')[2].strip() )), Extractor(label='country codes', path="//h5[starts-with(text(), 'Country')]/..//a[starts-with(@href, '/country/')]", attrs=Attribute(key='country codes', multi=True, path="./@href", postprocess=lambda x: x.split('/')[2].strip() )), Extractor(label='creator', path="//h5[starts-with(text(), 'Creator')]/..//a", attrs=Attribute(key='creator', multi=True, path={'name': "./text()", 'link': "./@href"}, postprocess=lambda x: \ build_person(x.get('name') or u'', personID=analyze_imdbid(x.get('link'))) )), Extractor(label='thin writer', path="//h5[starts-with(text(), 'Writer')]/..//a", attrs=Attribute(key='thin writer', multi=True, path={'name': "./text()", 'link': 
"./@href"}, postprocess=lambda x: \ build_person(x.get('name') or u'', personID=analyze_imdbid(x.get('link'))) )), Extractor(label='thin director', path="//h5[starts-with(text(), 'Director')]/..//a", attrs=Attribute(key='thin director', multi=True, path={'name': "./text()", 'link': "@href"}, postprocess=lambda x: \ build_person(x.get('name') or u'', personID=analyze_imdbid(x.get('link'))) )), Extractor(label='top 250/bottom 100', path="//div[@class='starbar-special']/" \ "a[starts-with(@href, '/chart/')]", attrs=Attribute(key='top/bottom rank', path="./text()")), Extractor(label='series years', path="//div[@id='tn15title']//span" \ "[starts-with(text(), 'TV series')]", attrs=Attribute(key='series years', path="./text()", postprocess=lambda x: \ x.replace('TV series','').strip())), Extractor(label='number of episodes', path="//a[@title='Full Episode List']", attrs=Attribute(key='number of episodes', path="./text()", postprocess=lambda x: \ _toInt(x, [(' Episodes', '')]))), Extractor(label='akas', path="//i[@class='transl']", attrs=Attribute(key='akas', multi=True, path='text()', postprocess=lambda x: x.replace(' ', ' ').rstrip('-').replace('" - ', '"::', 1).strip('"').replace(' ', ' '))), Extractor(label='production notes/status', path="//h5[starts-with(text(), 'Status:')]/..//div[@class='info-content']", attrs=Attribute(key='production status', path=".//text()", postprocess=lambda x: x.strip().split('|')[0].strip().lower())), Extractor(label='production notes/status updated', path="//h5[starts-with(text(), 'Status Updated:')]/..//div[@class='info-content']", attrs=Attribute(key='production status updated', path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='production notes/comments', path="//h5[starts-with(text(), 'Comments:')]/..//div[@class='info-content']", attrs=Attribute(key='production comments', path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='production notes/note', path="//h5[starts-with(text(), 
'Note:')]/..//div[@class='info-content']", attrs=Attribute(key='production note', path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='blackcatheader', group="//b[@class='blackcatheader']", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="../ul/li", attrs=Attribute(key=None, multi=True, path={'name': "./a//text()", 'comp-link': "./a/@href", 'notes': "./text()"}, postprocess=lambda x: \ Company(name=x.get('name') or u'', companyID=analyze_imdbid(x.get('comp-link')), notes=(x.get('notes') or u'').strip()) )), Extractor(label='rating', path="//div[@class='starbar-meta']/b", attrs=Attribute(key='rating', path=".//text()")), Extractor(label='votes', path="//div[@class='starbar-meta']/a[@href]", attrs=Attribute(key='votes', path=".//text()")), Extractor(label='cover url', path="//a[@name='poster']", attrs=Attribute(key='cover url', path="./img/@src")) ] preprocessors = [ (re.compile(r'(.+?)', re.I), r'
\1'), ('Full cast and crew for
', ''), ('
', ''), ('TV mini-series', '(mini)'), (_reRolesMovie, _manageRoles), (_reAkas, _replaceBR)] def preprocess_dom(self, dom): # Handle series information. xpath = self.xpath(dom, "//b[text()='Series Crew']") if xpath: b = xpath[-1] # In doubt, take the last one. for a in self.xpath(b, "./following::h5/a[@class='glossary']"): name = a.get('name') if name: a.set('name', 'series %s' % name) # Remove links to IMDbPro. for proLink in self.xpath(dom, "//span[@class='pro-link']"): proLink.drop_tree() # Remove some 'more' links (keep others, like the one around # the number of votes). for tn15more in self.xpath(dom, "//a[@class='tn15more'][starts-with(@href, '/title/')]"): tn15more.drop_tree() return dom re_space = re.compile(r'\s+') re_airdate = re.compile(r'(.*)\s*\(season (\d+), episode (\d+)\)', re.I) def postprocess_data(self, data): # Convert section names. for sect in data.keys(): if sect in _SECT_CONV: data[_SECT_CONV[sect]] = data[sect] del data[sect] sect = _SECT_CONV[sect] # Filter out fake values. 
for key in data: value = data[key] if isinstance(value, list) and value: if isinstance(value[0], Person): data[key] = filter(lambda x: x.personID is not None, value) if isinstance(value[0], _Container): for obj in data[key]: obj.accessSystem = self._as obj.modFunct = self._modFunct if 'akas' in data or 'other akas' in data: akas = data.get('akas') or [] other_akas = data.get('other akas') or [] akas += other_akas nakas = [] for aka in akas: aka = aka.strip() if aka.endswith('" -'): aka = aka[:-3].rstrip() nakas.append(aka) if 'akas' in data: del data['akas'] if 'other akas' in data: del data['other akas'] if nakas: data['akas'] = nakas if 'runtimes' in data: data['runtimes'] = [x.replace(' min', u'') for x in data['runtimes']] if 'original air date' in data: oid = self.re_space.sub(' ', data['original air date']).strip() data['original air date'] = oid aid = self.re_airdate.findall(oid) if aid and len(aid[0]) == 3: date, season, episode = aid[0] date = date.strip() try: season = int(season) except: pass try: episode = int(episode) except: pass if date and date != '????': data['original air date'] = date else: del data['original air date'] # Handle also "episode 0". 
if season or type(season) is type(0): data['season'] = season if episode or type(season) is type(0): data['episode'] = episode for k in ('writer', 'director'): t_k = 'thin %s' % k if t_k not in data: continue if k not in data: data[k] = data[t_k] del data[t_k] if 'top/bottom rank' in data: tbVal = data['top/bottom rank'].lower() if tbVal.startswith('top'): tbKey = 'top 250 rank' tbVal = _toInt(tbVal, [('top 250: #', '')]) else: tbKey = 'bottom 100 rank' tbVal = _toInt(tbVal, [('bottom 100: #', '')]) if tbVal: data[tbKey] = tbVal del data['top/bottom rank'] if 'year' in data and data['year'] == '????': del data['year'] if 'tv series link' in data: if 'tv series title' in data: data['episode of'] = Movie(title=data['tv series title'], movieID=analyze_imdbid( data['tv series link']), accessSystem=self._as, modFunct=self._modFunct) del data['tv series title'] del data['tv series link'] if 'rating' in data: try: data['rating'] = float(data['rating'].replace('/10', '')) except (TypeError, ValueError): pass if 'votes' in data: try: votes = data['votes'].replace(',', '').replace('votes', '') data['votes'] = int(votes) except (TypeError, ValueError): pass return data def _process_plotsummary(x): """Process a plot (contributed by Rdian06).""" xauthor = x.get('author') if xauthor: xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(', '<').replace(')', '>').strip() xplot = x.get('plot', u'').strip() if xauthor: xplot += u'::%s' % xauthor return xplot class DOMHTMLPlotParser(DOMParserBase): """Parser for the "plot summary" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a 'plot' key, containing a list of string with the structure: 'summary::summary_author '. 
Example: pparser = HTMLPlotParser() result = pparser.parse(plot_summary_html_string) """ _defGetRefs = True # Notice that recently IMDb started to put the email of the # author only in the link, that we're not collecting, here. extractors = [Extractor(label='plot', path="//p[@class='plotpar']", attrs=Attribute(key='plot', multi=True, path={'plot': './text()', 'author': './i/a/text()'}, postprocess=_process_plotsummary))] def _process_award(x): award = {} award['award'] = x.get('award').strip() if not award['award']: return {} award['year'] = x.get('year').strip() if award['year'] and award['year'].isdigit(): award['year'] = int(award['year']) award['result'] = x.get('result').strip() category = x.get('category').strip() if category: award['category'] = category received_with = x.get('with') if received_with is not None: award['with'] = received_with.strip() notes = x.get('notes') if notes is not None: notes = notes.strip() if notes: award['notes'] = notes award['anchor'] = x.get('anchor') return award class DOMHTMLAwardsParser(DOMParserBase): """Parser for the "awards" page of a given person or movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: awparser = HTMLAwardsParser() result = awparser.parse(awards_html_string) """ subject = 'title' _containsObjects = True extractors = [ Extractor(label='awards', group="//table//big", group_key="./a", path="./ancestor::tr[1]/following-sibling::tr/" \ "td[last()][not(@colspan)]", attrs=Attribute(key=None, multi=True, path={ 'year': "../td[1]/a/text()", 'result': "../td[2]/b/text()", 'award': "../td[3]/text()", 'category': "./text()[1]", # FIXME: takes only the first co-recipient 'with': "./small[starts-with(text()," \ " 'Shared with:')]/following-sibling::a[1]/text()", 'notes': "./small[last()]//text()", 'anchor': ".//text()" }, postprocess=_process_award )), Extractor(label='recipients', group="//table//big", group_key="./a", path="./ancestor::tr[1]/following-sibling::tr/" \ "td[last()]/small[1]/preceding-sibling::a", attrs=Attribute(key=None, multi=True, path={ 'name': "./text()", 'link': "./@href", 'anchor': "..//text()" } )) ] preprocessors = [ (re.compile('(]*>.*?\n\n
)(.*?)( ...
)', re.I), r'\1'), (re.compile('(]*>\n\n.*?)', re.I), r'\1'), (re.compile('(]*>\n\n)
(.*?)
(.*?\n\n)(\2') ] def preprocess_dom(self, dom): """Repeat td elements according to their rowspan attributes in subsequent tr elements. """ cols = self.xpath(dom, "//td[@rowspan]") for col in cols: span = int(col.get('rowspan')) del col.attrib['rowspan'] position = len(self.xpath(col, "./preceding-sibling::td")) row = col.getparent() for tr in self.xpath(row, "./following-sibling::tr")[:span-1]: # if not cloned, child will be moved to new parent clone = self.clone(col) # XXX: beware that here we don't use an "adapted" function, # because both BeautifulSoup and lxml uses the same # "insert" method. tr.insert(position, clone) return dom def postprocess_data(self, data): if len(data) == 0: return {} nd = [] for key in data.keys(): dom = self.get_dom(key) assigner = self.xpath(dom, "//a/text()")[0] for entry in data[key]: if not entry.has_key('name'): if not entry: continue # this is an award, not a recipient entry['assigner'] = assigner.strip() # find the recipients matches = [p for p in data[key] if p.has_key('name') and (entry['anchor'] == p['anchor'])] if self.subject == 'title': recipients = [Person(name=recipient['name'], personID=analyze_imdbid(recipient['link'])) for recipient in matches] entry['to'] = recipients elif self.subject == 'name': recipients = [Movie(title=recipient['name'], movieID=analyze_imdbid(recipient['link'])) for recipient in matches] entry['for'] = recipients nd.append(entry) del entry['anchor'] return {'awards': nd} class DOMHTMLTaglinesParser(DOMParserBase): """Parser for the "taglines" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: tparser = DOMHTMLTaglinesParser() result = tparser.parse(taglines_html_string) """ extractors = [Extractor(label='taglines', path="//div[@id='tn15content']/p", attrs=Attribute(key='taglines', multi=True, path="./text()"))] class DOMHTMLKeywordsParser(DOMParserBase): """Parser for the "keywords" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: kwparser = DOMHTMLKeywordsParser() result = kwparser.parse(keywords_html_string) """ extractors = [Extractor(label='keywords', path="//a[starts-with(@href, '/keyword/')]", attrs=Attribute(key='keywords', path="./text()", multi=True, postprocess=lambda x: \ x.lower().replace(' ', '-')))] class DOMHTMLAlternateVersionsParser(DOMParserBase): """Parser for the "alternate versions" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: avparser = HTMLAlternateVersionsParser() result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True extractors = [Extractor(label='alternate versions', path="//ul[@class='trivia']/li", attrs=Attribute(key='alternate versions', multi=True, path=".//text()", postprocess=lambda x: x.strip()))] class DOMHTMLTriviaParser(DOMParserBase): """Parser for the "trivia" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: avparser = HTMLAlternateVersionsParser() result = avparser.parse(alternateversions_html_string) """ _defGetRefs = True extractors = [Extractor(label='alternate versions', path="//div[@class='sodatext']", attrs=Attribute(key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip()))] def preprocess_dom(self, dom): # Remove "link this quote" links. for qLink in self.xpath(dom, "//span[@class='linksoda']"): qLink.drop_tree() return dom class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser): kind = 'soundtrack' preprocessors = [ ('
', '\n') ] def postprocess_data(self, data): if 'soundtrack' in data: nd = [] for x in data['soundtrack']: ds = x.split('\n') title = ds[0] if title[0] == '"' and title[-1] == '"': title = title[1:-1] nds = [] newData = {} for l in ds[1:]: if ' with ' in l or ' by ' in l or ' from ' in l \ or ' of ' in l or l.startswith('From '): nds.append(l) else: if nds: nds[-1] += l else: nds.append(l) newData[title] = {} for l in nds: skip = False for sep in ('From ',): if l.startswith(sep): fdix = len(sep) kind = l[:fdix].rstrip().lower() info = l[fdix:].lstrip() newData[title][kind] = info skip = True if not skip: for sep in ' with ', ' by ', ' from ', ' of ': fdix = l.find(sep) if fdix != -1: fdix = fdix+len(sep) kind = l[:fdix].rstrip().lower() info = l[fdix:].lstrip() newData[title][kind] = info break nd.append(newData) data['soundtrack'] = nd return data class DOMHTMLCrazyCreditsParser(DOMParserBase): """Parser for the "crazy credits" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: ccparser = DOMHTMLCrazyCreditsParser() result = ccparser.parse(crazycredits_html_string) """ _defGetRefs = True extractors = [Extractor(label='crazy credits', path="//ul/li/tt", attrs=Attribute(key='crazy credits', multi=True, path=".//text()", postprocess=lambda x: \ x.replace('\n', ' ').replace(' ', ' ')))] class DOMHTMLGoofsParser(DOMParserBase): """Parser for the "goofs" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: gparser = DOMHTMLGoofsParser() result = gparser.parse(goofs_html_string) """ _defGetRefs = True extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li", attrs=Attribute(key='goofs', multi=True, path=".//text()", postprocess=lambda x: (x or u'').strip()))] class DOMHTMLQuotesParser(DOMParserBase): """Parser for the "memorable quotes" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: qparser = DOMHTMLQuotesParser() result = qparser.parse(quotes_html_string) """ _defGetRefs = True extractors = [ Extractor(label='quotes', path="//div[@class='_imdbpy']", attrs=Attribute(key='quotes', multi=True, path=".//text()", postprocess=lambda x: x.strip().replace(' \n', '::').replace('::\n', '::').replace('\n', ' '))) ] preprocessors = [ (re.compile('(
)', re.I), r'\1
'), (re.compile('
', re.I), '
'), (re.compile('
', re.I), '

'), (re.compile('', re.I|re.S), ''), # For BeautifulSoup. (re.compile('', re.I), '') ] def preprocess_dom(self, dom): # Remove "link this quote" links. for qLink in self.xpath(dom, "//p[@class='linksoda']"): qLink.drop_tree() return dom def postprocess_data(self, data): if 'quotes' not in data: return {} for idx, quote in enumerate(data['quotes']): data['quotes'][idx] = quote.split('::') return data class DOMHTMLReleaseinfoParser(DOMParserBase): """Parser for the "release dates" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rdparser = DOMHTMLReleaseinfoParser() result = rdparser.parse(releaseinfo_html_string) """ extractors = [Extractor(label='release dates', path="//th[@class='xxxx']/../../tr", attrs=Attribute(key='release dates', multi=True, path={'country': ".//td[1]//text()", 'date': ".//td[2]//text()", 'notes': ".//td[3]//text()"})), Extractor(label='akas', path="//div[@class='_imdbpy_akas']/table/tr", attrs=Attribute(key='akas', multi=True, path={'title': "./td[1]/text()", 'countries': "./td[2]/text()"}))] preprocessors = [ (re.compile('(
)', re.I | re.M | re.S), r'
\1
')] def postprocess_data(self, data): if not ('release dates' in data or 'akas' in data): return data releases = data.get('release dates') or [] rl = [] for i in releases: country = i.get('country') date = i.get('date') if not (country and date): continue country = country.strip() date = date.strip() if not (country and date): continue notes = i['notes'] info = u'%s::%s' % (country, date) if notes: info += notes rl.append(info) if releases: del data['release dates'] if rl: data['release dates'] = rl akas = data.get('akas') or [] nakas = [] for aka in akas: title = (aka.get('title') or '').strip() if not title: continue countries = (aka.get('countries') or '').split('/') if not countries: nakas.append(title) else: for country in countries: nakas.append('%s::%s' % (title, country.strip())) if akas: del data['akas'] if nakas: data['akas from release info'] = nakas return data class DOMHTMLRatingsParser(DOMParserBase): """Parser for the "user ratings" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: rparser = DOMHTMLRatingsParser() result = rparser.parse(userratings_html_string) """ re_means = re.compile('mean\s*=\s*([0-9]\.[0-9])\.\s*median\s*=\s*([0-9])', re.I) extractors = [ Extractor(label='number of votes', path="//td[b='Percentage']/../../tr", attrs=[Attribute(key='votes', multi=True, path={ 'votes': "td[1]//text()", 'ordinal': "td[3]//text()" })]), Extractor(label='mean and median', path="//p[starts-with(text(), 'Arithmetic mean')]", attrs=Attribute(key='mean and median', path="text()")), Extractor(label='rating', path="//a[starts-with(@href, '/search/title?user_rating=')]", attrs=Attribute(key='rating', path="text()")), Extractor(label='demographic voters', path="//td[b='Average']/../../tr", attrs=Attribute(key='demographic voters', multi=True, path={ 'voters': "td[1]//text()", 'votes': "td[2]//text()", 'average': "td[3]//text()" })), Extractor(label='top 250', path="//a[text()='top 250']", attrs=Attribute(key='top 250', path="./preceding-sibling::text()[1]")) ] def postprocess_data(self, data): nd = {} votes = data.get('votes', []) if votes: nd['number of votes'] = {} for i in xrange(1, 11): _ordinal = int(votes[i]['ordinal']) _strvts = votes[i]['votes'] or '0' nd['number of votes'][_ordinal] = \ int(_strvts.replace(',', '')) mean = data.get('mean and median', '') if mean: means = self.re_means.findall(mean) if means and len(means[0]) == 2: am, med = means[0] try: am = float(am) except (ValueError, OverflowError): pass if type(am) is type(1.0): nd['arithmetic mean'] = am try: med = int(med) except (ValueError, OverflowError): pass if type(med) is type(0): nd['median'] = med if 'rating' in data: nd['rating'] = float(data['rating']) dem_voters = data.get('demographic voters') if dem_voters: nd['demographic'] = {} for i in xrange(1, len(dem_voters)): if (dem_voters[i]['votes'] is not None) \ and (dem_voters[i]['votes'].strip()): nd['demographic'][dem_voters[i]['voters'].strip().lower()] \ = (int(dem_voters[i]['votes'].replace(',', '')), 
float(dem_voters[i]['average'])) if 'imdb users' in nd.get('demographic', {}): nd['votes'] = nd['demographic']['imdb users'][0] nd['demographic']['all votes'] = nd['demographic']['imdb users'] del nd['demographic']['imdb users'] top250 = data.get('top 250') if top250: sd = top250[9:] i = sd.find(' ') if i != -1: sd = sd[:i] try: sd = int(sd) except (ValueError, OverflowError): pass if type(sd) is type(0): nd['top 250 rank'] = sd return nd class DOMHTMLEpisodesRatings(DOMParserBase): """Parser for the "episode ratings ... by date" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: erparser = DOMHTMLEpisodesRatings() result = erparser.parse(eprating_html_string) """ _containsObjects = True extractors = [Extractor(label='title', path="//title", attrs=Attribute(key='title', path="./text()")), Extractor(label='ep ratings', path="//th/../..//tr", attrs=Attribute(key='episodes', multi=True, path={'nr': ".//td[1]/text()", 'ep title': ".//td[2]//text()", 'movieID': ".//td[2]/a/@href", 'rating': ".//td[3]/text()", 'votes': ".//td[4]/text()"}))] def postprocess_data(self, data): if 'title' not in data or 'episodes' not in data: return {} nd = [] title = data['title'] for i in data['episodes']: ept = i['ep title'] movieID = analyze_imdbid(i['movieID']) votes = i['votes'] rating = i['rating'] if not (ept and movieID and votes and rating): continue try: votes = int(votes.replace(',', '').replace('.', '')) except: pass try: rating = float(rating) except: pass ept = ept.strip() ept = u'%s {%s' % (title, ept) nr = i['nr'] if nr: ept += u' (#%s)' % nr.strip() ept += '}' if movieID is not None: movieID = str(movieID) m = Movie(title=ept, movieID=movieID, accessSystem=self._as, modFunct=self._modFunct) epofdict = m.get('episode of') if epofdict is not None: m['episode of'] = Movie(data=epofdict, accessSystem=self._as, modFunct=self._modFunct) 
def _normalize_href(href):
    """Make a site-relative href absolute.

    None and already-absolute ('http://...') links pass through
    untouched.
    """
    if href is None or href.lower().startswith('http://'):
        return href
    # Avoid a double slash when joining with the base URL.
    if href.startswith('/'):
        href = href[1:]
    # TODO: imdbURL_base may be set by the user!
    return '%s%s' % (imdbURL_base, href)
', '
'), # To get the movie's year. (' (', ' ('), ('\n
', ''), ('
- ', '::') ] def postprocess_data(self, data): for key in data.keys(): nl = [] for v in data[key]: title = v['title'] ts = title.split('::', 1) title = ts[0].strip() notes = u'' if len(ts) == 2: notes = ts[1].strip() m = Movie(title=title, movieID=analyze_imdbid(v['movieID']), accessSystem=self._as, notes=notes, modFunct=self._modFunct) nl.append(m) data[key] = nl if not data: return {} return {'connections': data} class DOMHTMLLocationsParser(DOMParserBase): """Parser for the "locations" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: lparser = DOMHTMLLocationsParser() result = lparser.parse(locations_html_string) """ extractors = [Extractor(label='locations', path="//dt", attrs=Attribute(key='locations', multi=True, path={'place': ".//text()", 'note': "./following-sibling::dd[1]" \ "//text()"}, postprocess=lambda x: (u'%s::%s' % ( x['place'].strip(), (x['note'] or u'').strip())).strip(':')))] class DOMHTMLTechParser(DOMParserBase): """Parser for the "technical", "business", "literature", "publicity" (for people) and "contacts (for people) pages of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = HTMLTechParser() result = tparser.parse(technical_html_string) """ kind = 'tech' extractors = [Extractor(label='tech', group="//h5", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="./following-sibling::div[1]", attrs=Attribute(key=None, path=".//text()", postprocess=lambda x: [t.strip() for t in x.split('\n') if t.strip()]))] preprocessors = [ (re.compile('(
.*?
)', re.I), r'
\1
'), (re.compile('((
|

|))\n?
(?!'), # the ones below are for the publicity parser (re.compile('

(.*?)

', re.I), r'\1
'), (re.compile('()', re.I), r'\1::'), (re.compile('()', re.I), r'\n\1'), # this is for splitting individual entries (re.compile('
', re.I), r'\n'), ] def postprocess_data(self, data): for key in data: data[key] = filter(None, data[key]) if self.kind in ('literature', 'business', 'contacts') and data: if 'screenplay/teleplay' in data: data['screenplay-teleplay'] = data['screenplay/teleplay'] del data['screenplay/teleplay'] data = {self.kind: data} else: if self.kind == 'publicity': if 'biography (print)' in data: data['biography-print'] = data['biography (print)'] del data['biography (print)'] # Tech info. for key in data.keys(): if key.startswith('film negative format'): data['film negative format'] = data[key] del data[key] elif key.startswith('film length'): data['film length'] = data[key] del data[key] return data class DOMHTMLRecParser(DOMParserBase): """Parser for the "recommendations" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: rparser = HTMLRecParser() result = rparser.parse(recommendations_html_string) """ _containsObjects = True extractors = [Extractor(label='recommendations', path="//td[@valign='middle'][1]", attrs=Attribute(key='../../tr/td[1]//text()', multi=True, path={'title': ".//text()", 'movieID': ".//a/@href"}))] def postprocess_data(self, data): for key in data.keys(): n_key = key n_keyl = n_key.lower() if n_keyl == 'suggested by the database': n_key = 'database' elif n_keyl == 'imdb users recommend': n_key = 'users' data[n_key] = [Movie(title=x['title'], movieID=analyze_imdbid(x['movieID']), accessSystem=self._as, modFunct=self._modFunct) for x in data[key]] del data[key] if data: return {'recommendations': data} return data class DOMHTMLNewsParser(DOMParserBase): """Parser for the "news" page of a given movie or person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: nwparser = DOMHTMLNewsParser() result = nwparser.parse(news_html_string) """ _defGetRefs = True extractors = [ Extractor(label='news', path="//h2", attrs=Attribute(key='news', multi=True, path={ 'title': "./text()", 'fromdate': "../following-sibling::p[1]/small//text()", # FIXME: sometimes (see The Matrix (1999))

is found # inside news text. 'body': "../following-sibling::p[2]//text()", 'link': "../..//a[text()='Permalink']/@href", 'fulllink': "../..//a[starts-with(text(), " \ "'See full article at')]/@href" }, postprocess=lambda x: { 'title': x.get('title').strip(), 'date': x.get('fromdate').split('|')[0].strip(), 'from': x.get('fromdate').split('|')[1].replace('From ', '').strip(), 'body': (x.get('body') or u'').strip(), 'link': _normalize_href(x.get('link')), 'full article link': _normalize_href(x.get('fulllink')) })) ] preprocessors = [ (re.compile('(]+>

)', re.I), r'
\1'), (re.compile('(
)', re.I), r'
\1'), (re.compile('

def _parse_review(x):
    """Normalize a single external-review entry.

    Returns a dictionary with 'title', 'link', 'review kind', 'review'
    and possibly 'review author' keys.
    """
    result = {}
    title = (x.get('title') or u'').strip()
    # Drop a trailing colon from the title/kind labels; endswith() is
    # safe on empty strings (the original title[-1] raised IndexError).
    if title.endswith(':'):
        title = title[:-1]
    result['title'] = title
    result['link'] = _normalize_href(x.get('link'))
    kind = (x.get('kind') or u'').strip()
    if kind.endswith(':'):
        kind = kind[:-1]
    result['review kind'] = kind
    # Paragraphs are separated by blank lines: keep them as newlines,
    # but join the lines inside each paragraph with a single space.
    paragraphs = (x.get('review') or u'').replace('\n\n',
                    '||').replace('\n', ' ').split('||')
    review = '\n'.join(paragraphs)
    if x.get('author') is not None:
        author = x.get('author').strip()
        # The author signature terminates the review text.
        review = review.split(author)[0].strip()
        result['review author'] = author[2:]
    if x.get('item') is not None:
        item = x.get('item').strip()
        review = review[len(item):].strip()
        review = "%s: %s" % (item, review)
    result['review'] = review
    return result
Example: sparser = DOMHTMLSeasonEpisodesParser() result = sparser.parse(episodes_html_string) """ extractors = [ Extractor(label='series link', path="//div[@class='parent']", attrs=[Attribute(key='series link', path=".//a/@href")] ), Extractor(label='series title', path="//head/meta[@property='og:title']", attrs=[Attribute(key='series title', path="./@content")] ), Extractor(label='seasons list', path="//select[@id='bySeason']//option", attrs=[Attribute(key='_seasons', multi=True, path="./@value")]), Extractor(label='selected season', path="//select[@id='bySeason']//option[@selected]", attrs=[Attribute(key='_current_season', path='./@value')]), Extractor(label='episodes', path=".", group="//div[@class='info']", group_key=".//meta/@content", group_key_normalize=lambda x: 'episode %s' % x, attrs=[Attribute(key=None, multi=True, path={ "link": ".//strong//a[@href][1]/@href", "original air date": ".//div[@class='airdate']/text()", "title": ".//strong//text()", "plot": ".//div[@class='item_description']//text()" } )] ) ] def postprocess_data(self, data): series_id = analyze_imdbid(data.get('series link')) series_title = data.get('series title', '').strip() selected_season = data.get('_current_season', 'unknown season').strip() if not (series_id and series_title): return {} series = Movie(title=series_title, movieID=str(series_id), accessSystem=self._as, modFunct=self._modFunct) if series.get('kind') == 'movie': series['kind'] = u'tv series' try: selected_season = int(selected_season) except: pass nd = {selected_season: {}} for episode_nr, episode in data.iteritems(): if not (episode and episode[0] and episode_nr.startswith('episode ')): continue episode = episode[0] episode_nr = episode_nr[8:].rstrip() try: episode_nr = int(episode_nr) except: pass episode_id = analyze_imdbid(episode.get('link' '')) episode_air_date = episode.get('original air date', '').strip() episode_title = episode.get('title', '').strip() episode_plot = episode.get('plot', '') if not (episode_nr 
and episode_id and episode_title): continue ep_obj = Movie(movieID=episode_id, title=episode_title, accessSystem=self._as, modFunct=self._modFunct) ep_obj['kind'] = u'episode' ep_obj['episode of'] = series ep_obj['season'] = selected_season ep_obj['episode'] = episode_nr if episode_air_date: ep_obj['original air date'] = episode_air_date if episode_air_date[-4:].isdigit(): ep_obj['year'] = episode_air_date[-4:] if episode_plot: ep_obj['plot'] = episode_plot nd[selected_season][episode_nr] = ep_obj _seasons = data.get('_seasons') or [] for idx, season in enumerate(_seasons): try: _seasons[idx] = int(season) except: pass return {'episodes': nd, '_seasons': _seasons, '_current_season': selected_season} def _build_episode(x): """Create a Movie object for a given series' episode.""" episode_id = analyze_imdbid(x.get('link')) episode_title = x.get('title') e = Movie(movieID=episode_id, title=episode_title) e['kind'] = u'episode' oad = x.get('oad') if oad: e['original air date'] = oad.strip() year = x.get('year') if year is not None: year = year[5:] if year == 'unknown': year = u'????' if year and year.isdigit(): year = int(year) e['year'] = year else: if oad and oad[-4:].isdigit(): e['year'] = int(oad[-4:]) epinfo = x.get('episode') if epinfo is not None: season, episode = epinfo.split(':')[0].split(',') e['season'] = int(season[7:]) e['episode'] = int(episode[8:]) else: e['season'] = 'unknown' e['episode'] = 'unknown' plot = x.get('plot') if plot: e['plot'] = plot.strip() return e class DOMHTMLEpisodesParser(DOMParserBase): """Parser for the "episode list" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: eparser = DOMHTMLEpisodesParser() result = eparser.parse(episodes_html_string) """ # XXX: no more used for the list of episodes parser, # but only for the episodes cast parser (see below). 
_containsObjects = True kind = 'episodes list' _episodes_path = "..//h4" _oad_path = "./following-sibling::span/strong[1]/text()" def _init(self): self.extractors = [ Extractor(label='series', path="//html", attrs=[Attribute(key='series title', path=".//title/text()"), Attribute(key='series movieID', path=".//h1/a[@class='main']/@href", postprocess=analyze_imdbid) ]), Extractor(label='episodes', group="//div[@class='_imdbpy']/h3", group_key="./a/@name", path=self._episodes_path, attrs=Attribute(key=None, multi=True, path={ 'link': "./a/@href", 'title': "./a/text()", 'year': "./preceding-sibling::a[1]/@name", 'episode': "./text()[1]", 'oad': self._oad_path, 'plot': "./following-sibling::text()[1]" }, postprocess=_build_episode))] if self.kind == 'episodes cast': self.extractors += [ Extractor(label='cast', group="//h4", group_key="./text()[1]", group_key_normalize=lambda x: x.strip(), path="./following-sibling::table[1]//td[@class='nm']", attrs=Attribute(key=None, multi=True, path={'person': "..//text()", 'link': "./a/@href", 'roleID': \ "../td[4]/div[@class='_imdbpyrole']/@roleid"}, postprocess=lambda x: \ build_person(x.get('person') or u'', personID=analyze_imdbid(x.get('link')), roleID=(x.get('roleID') or u'').split('/'), accessSystem=self._as, modFunct=self._modFunct))) ] preprocessors = [ (re.compile('(
\n)(

)', re.I), r'

\1
\2'), (re.compile('(

\n\n)
', re.I), r'\1'), (re.compile('

(.*?)

', re.I), r'

\1

'), (_reRolesMovie, _manageRoles), (re.compile('(

\n)(
)', re.I), r'\1\2') ] def postprocess_data(self, data): # A bit extreme? if not 'series title' in data: return {} if not 'series movieID' in data: return {} stitle = data['series title'].replace('- Episode list', '') stitle = stitle.replace('- Episodes list', '') stitle = stitle.replace('- Episode cast', '') stitle = stitle.replace('- Episodes cast', '') stitle = stitle.strip() if not stitle: return {} seriesID = data['series movieID'] if seriesID is None: return {} series = Movie(title=stitle, movieID=str(seriesID), accessSystem=self._as, modFunct=self._modFunct) nd = {} for key in data.keys(): if key.startswith('filter-season-') or key.startswith('season-'): season_key = key.replace('filter-season-', '').replace('season-', '') try: season_key = int(season_key) except: pass nd[season_key] = {} ep_counter = 1 for episode in data[key]: if not episode: continue episode_key = episode.get('episode') if episode_key is None: continue if not isinstance(episode_key, int): episode_key = ep_counter ep_counter += 1 cast_key = 'Season %s, Episode %s:' % (season_key, episode_key) if data.has_key(cast_key): cast = data[cast_key] for i in xrange(len(cast)): cast[i].billingPos = i + 1 episode['cast'] = cast episode['episode of'] = series nd[season_key][episode_key] = episode if len(nd) == 0: return {} return {'episodes': nd} class DOMHTMLEpisodesCastParser(DOMHTMLEpisodesParser): """Parser for the "episodes cast" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: eparser = DOMHTMLEpisodesParser() result = eparser.parse(episodes_html_string) """ kind = 'episodes cast' _episodes_path = "..//h4" _oad_path = "./following-sibling::b[1]/text()" class DOMHTMLFaqsParser(DOMParserBase): """Parser for the "FAQ" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. 
The final result will be a dictionary, with a key for every relevant section. Example: fparser = DOMHTMLFaqsParser() result = fparser.parse(faqs_html_string) """ _defGetRefs = True # XXX: bsoup and lxml don't match (looks like a minor issue, anyway). extractors = [ Extractor(label='faqs', path="//div[@class='section']", attrs=Attribute(key='faqs', multi=True, path={ 'question': "./h3/a/span/text()", 'answer': "../following-sibling::div[1]//text()" }, postprocess=lambda x: u'%s::%s' % (x.get('question').strip(), '\n\n'.join(x.get('answer').replace( '\n\n', '\n').strip().split('||'))))) ] preprocessors = [ (re.compile('

', re.I), r'||'), (re.compile('

(.*?)

\n', re.I), r'||\1--'), (re.compile('(.*?)', re.I), r'[spoiler]\1[/spoiler]') ] class DOMHTMLAiringParser(DOMParserBase): """Parser for the "airing" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: aparser = DOMHTMLAiringParser() result = aparser.parse(airing_html_string) """ _containsObjects = True extractors = [ Extractor(label='series title', path="//title", attrs=Attribute(key='series title', path="./text()", postprocess=lambda x: \ x.replace(' - TV schedule', u''))), Extractor(label='series id', path="//h1/a[@href]", attrs=Attribute(key='series id', path="./@href")), Extractor(label='tv airings', path="//tr[@class]", attrs=Attribute(key='airing', multi=True, path={ 'date': "./td[1]//text()", 'time': "./td[2]//text()", 'channel': "./td[3]//text()", 'link': "./td[4]/a[1]/@href", 'title': "./td[4]//text()", 'season': "./td[5]//text()", }, postprocess=lambda x: { 'date': x.get('date'), 'time': x.get('time'), 'channel': x.get('channel').strip(), 'link': x.get('link'), 'title': x.get('title'), 'season': (x.get('season') or '').strip() } )) ] def postprocess_data(self, data): if len(data) == 0: return {} seriesTitle = data['series title'] seriesID = analyze_imdbid(data['series id']) if data.has_key('airing'): for airing in data['airing']: title = airing.get('title', '').strip() if not title: epsTitle = seriesTitle if seriesID is None: continue epsID = seriesID else: epsTitle = '%s {%s}' % (data['series title'], airing['title']) epsID = analyze_imdbid(airing['link']) e = Movie(title=epsTitle, movieID=epsID) airing['episode'] = e del airing['link'] del airing['title'] if not airing['season']: del airing['season'] if 'series title' in data: del data['series title'] if 'series id' in data: del data['series id'] if 'airing' in data: data['airing'] = filter(None, data['airing']) if 'airing' not in data or not data['airing']: return 
{} return data class DOMHTMLSynopsisParser(DOMParserBase): """Parser for the "synopsis" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: sparser = HTMLSynopsisParser() result = sparser.parse(synopsis_html_string) """ extractors = [ Extractor(label='synopsis', path="//div[@class='display'][not(@style)]", attrs=Attribute(key='synopsis', path=".//text()", postprocess=lambda x: '\n\n'.join(x.strip().split('||')))) ] preprocessors = [ (re.compile('

', re.I), r'||') ] class DOMHTMLParentsGuideParser(DOMParserBase): """Parser for the "parents guide" page of a given movie. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: pgparser = HTMLParentsGuideParser() result = pgparser.parse(parentsguide_html_string) """ extractors = [ Extractor(label='parents guide', group="//div[@class='section']", group_key="./h3/a/span/text()", group_key_normalize=lambda x: x.lower(), path="../following-sibling::div[1]/p", attrs=Attribute(key=None, path=".//text()", postprocess=lambda x: [t.strip().replace('\n', ' ') for t in x.split('||') if t.strip()])) ] preprocessors = [ (re.compile('

', re.I), r'||') ] def postprocess_data(self, data): data2 = {} for key in data: if data[key]: data2[key] = data[key] if not data2: return {} return {'parents guide': data2} _OBJECTS = { 'movie_parser': ((DOMHTMLMovieParser,), None), 'plot_parser': ((DOMHTMLPlotParser,), None), 'movie_awards_parser': ((DOMHTMLAwardsParser,), None), 'taglines_parser': ((DOMHTMLTaglinesParser,), None), 'keywords_parser': ((DOMHTMLKeywordsParser,), None), 'crazycredits_parser': ((DOMHTMLCrazyCreditsParser,), None), 'goofs_parser': ((DOMHTMLGoofsParser,), None), 'alternateversions_parser': ((DOMHTMLAlternateVersionsParser,), None), 'trivia_parser': ((DOMHTMLTriviaParser,), None), 'soundtrack_parser': ((DOMHTMLSoundtrackParser,), {'kind': 'soundtrack'}), 'quotes_parser': ((DOMHTMLQuotesParser,), None), 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), 'ratings_parser': ((DOMHTMLRatingsParser,), None), 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), 'externalrev_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'external reviews'}), 'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'newsgroup reviews'}), 'misclinks_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'misc links'}), 'soundclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'sound clips'}), 'videoclips_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'video clips'}), 'photosites_parser': ((DOMHTMLOfficialsitesParser,), {'kind': 'photo sites'}), 'connections_parser': ((DOMHTMLConnectionParser,), None), 'tech_parser': ((DOMHTMLTechParser,), None), 'business_parser': ((DOMHTMLTechParser,), {'kind': 'business', '_defGetRefs': 1}), 'literature_parser': ((DOMHTMLTechParser,), {'kind': 'literature'}), 'locations_parser': ((DOMHTMLLocationsParser,), None), 'rec_parser': ((DOMHTMLRecParser,), None), 'news_parser': ((DOMHTMLNewsParser,), None), 'episodes_parser': ((DOMHTMLEpisodesParser,), None), 'season_episodes_parser': ((DOMHTMLSeasonEpisodesParser,), None), 'episodes_cast_parser': 
((DOMHTMLEpisodesCastParser,), None), 'eprating_parser': ((DOMHTMLEpisodesRatings,), None), 'movie_faqs_parser': ((DOMHTMLFaqsParser,), None), 'airing_parser': ((DOMHTMLAiringParser,), None), 'synopsis_parser': ((DOMHTMLSynopsisParser,), None), 'parentsguide_parser': ((DOMHTMLParentsGuideParser,), None) } IMDbPY-4.9/imdb/parser/http/searchMovieParser.py0000644000000000000000000001656211766731642020321 0ustar rootroot""" parser.http.searchMovieParser module (imdb package). This module provides the HTMLSearchMovieParser class (and the search_movie_parser instance), used to parse the results of a search for a given title. E.g., for when searching for the title "the passion", the parsed page would be: http://akas.imdb.com/find?q=the+passion&tt=on&mx=20 Copyright 2004-2010 Davide Alberani 2008 H. Turgut Uyar This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from imdb.utils import analyze_title, build_title from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid class DOMBasicMovieParser(DOMParserBase): """Simply get the title of a movie and the imdbID. It's used by the DOMHTMLSearchMovieParser class to return a result for a direct match (when a search on IMDb results in a single movie, the web server sends directly the movie page.""" # Stay generic enough to be used also for other DOMBasic*Parser classes. 
_titleAttrPath = ".//text()" _linkPath = "//link[@rel='canonical']" _titleFunct = lambda self, x: analyze_title(x or u'') def _init(self): self.preprocessors += [('TV mini-series', '(mini)')] self.extractors = [Extractor(label='title', path="//h1", attrs=Attribute(key='title', path=self._titleAttrPath, postprocess=self._titleFunct)), Extractor(label='link', path=self._linkPath, attrs=Attribute(key='link', path="./@href", postprocess=lambda x: \ analyze_imdbid((x or u'').replace( 'http://pro.imdb.com', '')) ))] # Remove 'More at IMDb Pro' links. preprocessors = [(re.compile(r''), ''), (re.compile(r'
< a href="')] def postprocess_data(self, data): if not 'link' in data: data = [] else: link = data.pop('link') if (link and data): data = [(link, data)] else: data = [] return data def custom_analyze_title(title): """Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)""" # XXX: very crappy. :-( nt = title.split(' ')[0] if nt: title = nt if not title: return {} return analyze_title(title) # Manage AKAs. _reAKAStitles = re.compile(r'(?:aka) "(.*?)(
|<\/td>)', re.I | re.M) class DOMHTMLSearchMovieParser(DOMParserBase): """Parse the html page that the IMDb web server shows when the "new search system" is used, for movies.""" _BaseParser = DOMBasicMovieParser _notDirectHitTitle = 'imdb title' _titleBuilder = lambda self, x: build_title(x) _linkPrefix = '/title/tt' _attrs = [Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'info': ".//text()", #'akas': ".//div[@class='_imdbpyAKA']//text()" 'akas': ".//p[@class='find-aka']//text()" }, postprocess=lambda x: ( analyze_imdbid(x.get('link') or u''), custom_analyze_title(x.get('info') or u''), x.get('akas') ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, '/title/tt')]/..", attrs=_attrs)] def _init(self): self.url = u'' def _reset(self): self.url = u'' def preprocess_string(self, html_string): if self._notDirectHitTitle in html_string[:1024].lower(): if self._linkPrefix == '/title/tt': # Only for movies. html_string = html_string.replace('(TV mini-series)', '(mini)') html_string = html_string.replace('<p class="find-aka">', '<p class="find-aka">::') #html_string = _reAKAStitles.sub( # r'<div class="_imdbpyAKA">\1::</div>\2', html_string) return html_string # Direct hit! dbme = self._BaseParser(useModule=self._useModule) res = dbme.parse(html_string, url=self.url) if not res: return u'' res = res['data'] if not (res and res[0]): return u'' link = '%s%s' % (self._linkPrefix, res[0][0]) # # Tries to cope with companies for which links to pro.imdb.com # # are missing. 
# link = self.url.replace(imdbURL_base[:-1], '') title = self._titleBuilder(res[0][1]) if not (link and title): return u'' link = link.replace('http://pro.imdb.com', '') new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link, title) return new_html def postprocess_data(self, data): if not data.has_key('data'): data['data'] = [] results = getattr(self, 'results', None) if results is not None: data['data'][:] = data['data'][:results] # Horrible hack to support AKAs. if data and data['data'] and len(data['data'][0]) == 3 and \ isinstance(data['data'][0], tuple): data['data'] = [x for x in data['data'] if x[0] and x[1]] for idx, datum in enumerate(data['data']): if not isinstance(datum, tuple): continue if not datum[0] and datum[1]: continue if datum[2] is not None: akas = filter(None, datum[2].split('::')) if self._linkPrefix == '/title/tt': akas = [a.replace('" - ', '::').rstrip() for a in akas] akas = [a.replace('aka "', '', 1).replace('aka "', '', 1).lstrip() for a in akas] datum[1]['akas'] = akas data['data'][idx] = (datum[0], datum[1]) else: data['data'][idx] = (datum[0], datum[1]) return data def add_refs(self, data): return data _OBJECTS = { 'search_movie_parser': ((DOMHTMLSearchMovieParser,), None) } ����������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/characterParser.py������������������������������������������������������0000644�0000000�0000000�00000017713�11766731642�020007� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.characterParser module (imdb package). 
This module provides the classes (and the instances), used to parse the IMDb pages on the akas.imdb.com server about a character. E.g., for "Jesse James" the referred pages would be: main details: http://www.imdb.com/character/ch0000001/ biography: http://www.imdb.com/character/ch0000001/bio ...and so on... Copyright 2007-2009 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from utils import Attribute, Extractor, DOMParserBase, build_movie, \ analyze_imdbid from personParser import DOMHTMLMaindetailsParser from imdb.Movie import Movie _personIDs = re.compile(r'/name/nm([0-9]{7})') class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser): """Parser for the "filmography" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: bparser = DOMHTMLCharacterMaindetailsParser() result = bparser.parse(character_biography_html_string) """ _containsObjects = True _film_attrs = [Attribute(key=None, multi=True, path={ 'link': "./a[1]/@href", 'title': ".//text()", 'status': "./i/a//text()", 'roleID': "./a/@href" }, postprocess=lambda x: build_movie(x.get('title') or u'', movieID=analyze_imdbid(x.get('link') or u''), roleID=_personIDs.findall(x.get('roleID') or u''), status=x.get('status') or None, _parsingCharacter=True))] extractors = [ Extractor(label='title', path="//title", attrs=Attribute(key='name', path="./text()", postprocess=lambda x: \ x.replace(' (Character)', '').replace( '- Filmography by type', '').strip())), Extractor(label='headshot', path="//a[@name='headshot']", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='akas', path="//div[h5='Alternate Names:']", attrs=Attribute(key='akas', path="./div//text()", postprocess=lambda x: x.strip().split(' / '))), Extractor(label='filmography', path="//div[@class='filmo'][not(h5)]/ol/li", attrs=_film_attrs), Extractor(label='filmography sections', group="//div[@class='filmo'][h5]", group_key="./h5/a/text()", group_key_normalize=lambda x: x.lower()[:-1], path="./ol/li", attrs=_film_attrs), ] preprocessors = [ # Check that this doesn't cut "status"... (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')] class DOMHTMLCharacterBioParser(DOMParserBase): """Parser for the "biography" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: bparser = DOMHTMLCharacterBioParser() result = bparser.parse(character_biography_html_string) """ _defGetRefs = True extractors = [ Extractor(label='introduction', path="//div[@id='_intro']", attrs=Attribute(key='introduction', path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='biography', path="//span[@class='_biography']", attrs=Attribute(key='biography', multi=True, path={ 'info': "./preceding-sibling::h4[1]//text()", 'text': ".//text()" }, postprocess=lambda x: u'%s: %s' % ( x.get('info').strip(), x.get('text').replace('\n', ' ').replace('||', '\n\n').strip()))), ] preprocessors = [ (re.compile('(<div id="swiki.2.3.1">)', re.I), r'\1<div id="_intro">'), (re.compile('(<a name="history">)\s*(<table .*?</table>)', re.I | re.DOTALL), r'</div>\2\1</a>'), (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'), (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'), (re.compile('<br/><br/>', re.I), r'||'), (re.compile('\|\|\n', re.I), r'</span>'), ] class DOMHTMLCharacterQuotesParser(DOMParserBase): """Parser for the "quotes" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: qparser = DOMHTMLCharacterQuotesParser() result = qparser.parse(character_quotes_html_string) """ _defGetRefs = True extractors = [ Extractor(label='charquotes', group="//h5", group_key="./a/text()", path="./following-sibling::div[1]", attrs=Attribute(key=None, path={'txt': ".//text()", 'movieID': ".//a[1]/@href"}, postprocess=lambda x: (analyze_imdbid(x['movieID']), x['txt'].strip().replace(': ', ': ').replace(': ', ': ').split('||')))) ] preprocessors = [ (re.compile('(</h5>)', re.I), r'\1<div>'), (re.compile('\s*<br/><br/>\s*', re.I), r'||'), (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'), (re.compile('\s*<br/>\s*', re.I), r'::') ] def postprocess_data(self, data): if not data: return {} newData = {} for title in data: movieID, quotes = data[title] if movieID is None: movie = title else: movie = Movie(title=title, movieID=movieID, accessSystem=self._as, modFunct=self._modFunct) newData[movie] = [quote.split('::') for quote in quotes] return {'quotes': newData} from personParser import DOMHTMLSeriesParser _OBJECTS = { 'character_main_parser': ((DOMHTMLCharacterMaindetailsParser,), {'kind': 'character'}), 'character_series_parser': ((DOMHTMLSeriesParser,), None), 'character_bio_parser': ((DOMHTMLCharacterBioParser,), None), 'character_quotes_parser': ((DOMHTMLCharacterQuotesParser,), None) } �����������������������������������������������������IMDbPY-4.9/imdb/parser/http/personParser.py���������������������������������������������������������0000644�0000000�0000000�00000053066�11766731642�017362� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.personParser module (imdb package). 
This module provides the classes (and the instances), used to parse the IMDb pages on the akas.imdb.com server about a person. E.g., for "Mel Gibson" the referred pages would be: categorized: http://akas.imdb.com/name/nm0000154/maindetails biography: http://akas.imdb.com/name/nm0000154/bio ...and so on... Copyright 2004-20101 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from imdb.Movie import Movie from imdb.utils import analyze_name, canonicalName, normalizeName, \ analyze_title, date_and_notes from utils import build_movie, DOMParserBase, Attribute, Extractor, \ analyze_imdbid from movieParser import _manageRoles _reRoles = re.compile(r'(<li>.*? \.\.\.\. )(.*?)(</li>|<br>)', re.I | re.M | re.S) def build_date(date): day = date.get('day') year = date.get('year') if day and year: return "%s %s" % (day, year) if day: return day if year: return year return "" class DOMHTMLMaindetailsParser(DOMParserBase): """Parser for the "categorized" (maindetails) page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: cparser = DOMHTMLMaindetailsParser() result = cparser.parse(categorized_html_string) """ _containsObjects = True _birth_attrs = [Attribute(key='birth date', path='.//time[@itemprop="birthDate"]/@datetime'), Attribute(key='birth place', path=".//a[starts-with(@href, " \ "'/search/name?birth_place=')]/text()")] _death_attrs = [Attribute(key='death date', path='.//time[@itemprop="deathDate"]/@datetime'), Attribute(key='death place', path=".//a[starts-with(@href, " \ "'/search/name?death_place=')]/text()")] _film_attrs = [Attribute(key=None, multi=True, path={ 'link': "./b/a[1]/@href", 'title': "./b/a[1]/text()", 'notes': "./b/following-sibling::text()", 'year': "./span[@class='year_column']/text()", 'status': "./a[@class='in_production']/text()", 'rolesNoChar': './/br/following-sibling::text()', 'chrRoles': "./a[@imdbpyname]/@imdbpyname", 'roleID': "./a[starts-with(@href, '/character/')]/@href" }, postprocess=lambda x: build_movie(x.get('title') or u'', year=x.get('year'), movieID=analyze_imdbid(x.get('link') or u''), rolesNoChar=(x.get('rolesNoChar') or u'').strip(), chrRoles=(x.get('chrRoles') or u'').strip(), additionalNotes=x.get('notes'), roleID=(x.get('roleID') or u''), status=x.get('status') or None))] extractors = [ Extractor(label='name', path="//h1[@class='header']", attrs=Attribute(key='name', path=".//text()", postprocess=lambda x: analyze_name(x, canonical=1))), Extractor(label='birth info', path="//div[h4='Born:']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h4='Died:']", attrs=_death_attrs), Extractor(label='headshot', path="//td[@id='img_primary']/a", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='akas', path="//div[h4='Alternate Names:']", attrs=Attribute(key='akas', path="./text()", postprocess=lambda x: x.strip().split(' '))), Extractor(label='filmography', group="//div[starts-with(@id, 'filmo-head-')]", group_key="./a[@name]/text()", group_key_normalize=lambda x: x.lower().replace(': ', ' '), 
path="./following-sibling::div[1]" \ "/div[starts-with(@class, 'filmo-row')]", attrs=_film_attrs), Extractor(label='indevelopment', path="//div[starts-with(@class,'devitem')]", attrs=Attribute(key='in development', multi=True, path={ 'link': './a/@href', 'title': './a/text()' }, postprocess=lambda x: build_movie(x.get('title') or u'', movieID=analyze_imdbid(x.get('link') or u''), roleID=(x.get('roleID') or u'').split('/'), status=x.get('status') or None))) ] preprocessors = [('<div class="clear"/> </div>', ''), ('<br/>', '<br />'), (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'), r'\1 imdbpyname="\2@@">\2</a>')] def postprocess_data(self, data): for what in 'birth date', 'death date': if what in data and not data[what]: del data[what] # XXX: the code below is for backwards compatibility # probably could be removed for key in data.keys(): if key.startswith('actor '): if not data.has_key('actor'): data['actor'] = [] data['actor'].extend(data[key]) del data[key] if key.startswith('actress '): if not data.has_key('actress'): data['actress'] = [] data['actress'].extend(data[key]) del data[key] if key.startswith('self '): if not data.has_key('self'): data['self'] = [] data['self'].extend(data[key]) del data[key] if key == 'birth place': data['birth notes'] = data[key] del data[key] if key == 'death place': data['death notes'] = data[key] del data[key] return data class DOMHTMLBioParser(DOMParserBase): """Parser for the "biography" page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. 
Example: bioparser = DOMHTMLBioParser() result = bioparser.parse(biography_html_string) """ _defGetRefs = True _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ "'/date/')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?birth_year=')]/text()" }, postprocess=build_date), Attribute(key='birth notes', path="./a[starts-with(@href, " \ "'/search/name?birth_place=')]/text()")] _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ "'/date/')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?death_date=')]/text()" }, postprocess=build_date), Attribute(key='death notes', path="./text()", # TODO: check if this slicing is always correct postprocess=lambda x: u''.join(x).strip()[2:])] extractors = [ Extractor(label='headshot', path="//a[@name='headshot']", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='birth info', path="//div[h5='Date of Birth']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h5='Date of Death']", attrs=_death_attrs), Extractor(label='nick names', path="//div[h5='Nickname']", attrs=Attribute(key='nick names', path="./text()", joiner='|', postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|') if n.strip()])), Extractor(label='birth name', path="//div[h5='Birth Name']", attrs=Attribute(key='birth name', path="./text()", postprocess=lambda x: canonicalName(x.strip()))), Extractor(label='height', path="//div[h5='Height']", attrs=Attribute(key='height', path="./text()", postprocess=lambda x: x.strip())), Extractor(label='mini biography', path="//div[h5='Mini Biography']", attrs=Attribute(key='mini biography', multi=True, path={ 'bio': "./p//text()", 'by': "./b/following-sibling::a/text()" }, postprocess=lambda x: "%s::%s" % \ (x.get('bio').strip(), (x.get('by') or u'').strip() or u'Anonymous'))), Extractor(label='spouse', path="//div[h5='Spouse']/table/tr", attrs=Attribute(key='spouse', multi=True, path={ 
'name': "./td[1]//text()", 'info': "./td[2]//text()" }, postprocess=lambda x: ("%s::%s" % \ (x.get('name').strip(), (x.get('info') or u'').strip())).strip(':'))), Extractor(label='trade mark', path="//div[h5='Trade Mark']/p", attrs=Attribute(key='trade mark', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='trivia', path="//div[h5='Trivia']/p", attrs=Attribute(key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='quotes', path="//div[h5='Personal Quotes']/p", attrs=Attribute(key='quotes', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='salary', path="//div[h5='Salary']/table/tr", attrs=Attribute(key='salary history', multi=True, path={ 'title': "./td[1]//text()", 'info': "./td[2]/text()", }, postprocess=lambda x: "%s::%s" % \ (x.get('title').strip(), x.get('info').strip()))), Extractor(label='where now', path="//div[h5='Where Are They Now']/p", attrs=Attribute(key='where now', multi=True, path=".//text()", postprocess=lambda x: x.strip())), ] preprocessors = [ (re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'), (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'), (re.compile('(<div id="tn15bot">)'), r'</div>\1'), (re.compile('\.<br><br>([^\s])', re.I), r'. \1') ] def postprocess_data(self, data): for what in 'birth date', 'death date': if what in data and not data[what]: del data[what] return data class DOMHTMLOtherWorksParser(DOMParserBase): """Parser for the "other works" and "agent" pages of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: owparser = DOMHTMLOtherWorksParser() result = owparser.parse(otherworks_html_string) """ _defGetRefs = True kind = 'other works' # XXX: looks like the 'agent' page is no more public. 
    # Pull the whole text of the div that follows the "Other works" header.
    # NOTE(review): the literal key 'self.kind' is presumably resolved at
    # runtime to the parser instance's `kind` attribute ('other works' by
    # default) -- confirm against the Attribute/DOMParserBase implementation.
    extractors = [
        Extractor(label='other works',
                  path="//h5[text()='Other works']/" \
                       "following-sibling::div[1]",
                  attrs=Attribute(key='self.kind',
                                  path=".//text()",
                                  # Entries are separated by blank lines
                                  # (created by the <br/><br/> preprocessor).
                                  postprocess=lambda x: x.strip().split('\n\n')))
    ]

    # Regexp-based HTML fix-ups applied before the DOM is built, giving the
    # XPath rules above a predictable structure to match against.
    preprocessors = [
        (re.compile('(<h5>[^<]+</h5>)', re.I),
         r'</div>\1<div class="_imdbpy">'),
        (re.compile('(</table>\n</div>\s+)</div>', re.I), r'\1'),
        (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
        # Turn paragraph breaks into the blank lines split on above.
        (re.compile('<br/><br/>', re.I), r'\n\n')
    ]


def _build_episode(link, title, minfo, role, roleA, roleAID):
    """Build a Movie object for a given episode of a series.

    link: href of the episode, used to extract the movieID.
    title: title of the episode, parsed with analyze_title().
    minfo: miscellaneous info string; may contain the air date in
           parentheses and sometimes a trailing " - role" part.
    role, roleA, roleAID: candidate character name/ID values, as
           collected by the caller's extractors (any may be None).
    """
    episode_id = analyze_imdbid(link)
    notes = u''
    minidx = minfo.find(' -')
    # Sometimes, for some unknown reason, the role is left in minfo.
    if minidx != -1:
        slfRole = minfo[minidx+3:].lstrip()
        minfo = minfo[:minidx].rstrip()
        # A trailing "(...)" in the role text is treated as notes,
        # not as part of the character name.
        if slfRole.endswith(')'):
            commidx = slfRole.rfind('(')
            if commidx != -1:
                notes = slfRole[commidx:]
                slfRole = slfRole[:commidx]
        if slfRole and role is None and roleA is None:
            role = slfRole
    eps_data = analyze_title(title)
    eps_data['kind'] = u'episode'
    # FIXME: it's wrong for multiple characters (very rare on tv series?).
    if role is None:
        role = roleA # At worse, it's None.
    if role is None:
        roleAID = None
    if roleAID is not None:
        roleAID = analyze_imdbid(roleAID)
    e = Movie(movieID=episode_id, data=eps_data, currentRole=role,
              roleID=roleAID, notes=notes)
    # XXX: are we missing some notes?
    # XXX: does it parse things as "Episode dated 12 May 2005 (12 May 2005)"?
    if minfo.startswith('('):
        pe = minfo.find(')')
        if pe != -1:
            date = minfo[1:pe]
            if date != '????':
                e['original air date'] = date
                # Derive the year from the air date when analyze_title()
                # could not find one in the title itself.
                if eps_data.get('year', '????') == '????':
                    syear = date.split()[-1]
                    if syear.isdigit():
                        e['year'] = int(syear)
    return e


class DOMHTMLSeriesParser(DOMParserBase):
    """Parser for the "by TV series" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.
    Example:
        sparser = DOMHTMLSeriesParser()
        result = sparser.parse(filmoseries_html_string)
    """
    _containsObjects = True

    # One group per series; each matched <a> inside the following <ol>
    # is turned into an episode Movie via _build_episode().
    extractors = [
            Extractor(label='series',
                        group="//div[@class='filmo']/span[1]",
                        group_key="./a[1]",
                        path="./following-sibling::ol[1]/li/a[1]",
                        attrs=Attribute(key=None, multi=True,
                            path={
                                'link': "./@href",
                                'title': "./text()",
                                'info': "./following-sibling::text()",
                                'role': "./following-sibling::i[1]/text()",
                                'roleA': "./following-sibling::a[1]/text()",
                                'roleAID': "./following-sibling::a[1]/@href"
                            },
                            postprocess=lambda x: _build_episode(x.get('link'),
                                x.get('title'),
                                (x.get('info') or u'').strip(),
                                x.get('role'),
                                x.get('roleA'), x.get('roleAID'))))
            ]

    def postprocess_data(self, data):
        """Re-key the raw {series-html-fragment: [episodes]} mapping into
        {'episodes': {series Movie: [episode Movies]}}, linking every
        episode back to its series via the 'episode of' key."""
        if len(data) == 0:
            return {}
        nd = {}
        for key in data.keys():
            # The group key is itself a small HTML fragment: parse it to
            # recover the series' link and title.
            dom = self.get_dom(key)
            link = self.xpath(dom, "//a/@href")[0]
            # [1:-1] strips the quotes around the series title.
            title = self.xpath(dom, "//a/text()")[0][1:-1]
            series = Movie(movieID=analyze_imdbid(link),
                           data=analyze_title(title),
                           accessSystem=self._as, modFunct=self._modFunct)
            nd[series] = []
            for episode in data[key]:
                # XXX: should we create a copy of 'series', to avoid
                #      circular references?
                episode['episode of'] = series
                nd[series].append(episode)
        return {'episodes': nd}


class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.
    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    # 'genres' by default; _OBJECTS below overrides it to 'keywords'
    # for the person_keywords_parser instance.
    kind = 'genres'
    _containsObjects = True

    extractors = [
            Extractor(label='genres',
                        group="//b/a[@name]/following-sibling::a[1]",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../../following-sibling::ol[1]/li//a[1]",
                        attrs=Attribute(key=None, multi=True,
                            path={
                                'link': "./@href",
                                'title': "./text()",
                                'info': "./following-sibling::text()"
                            },
                            # Keep only the part of the info before any
                            # '[' and fold it into the movie title.
                            postprocess=lambda x: \
                                    build_movie(x.get('title') + \
                                    x.get('info').split('[')[0],
                                    analyze_imdbid(x.get('link')))))
            ]

    def postprocess_data(self, data):
        """Wrap the grouped results under the parser's kind key
        ('genres' or 'keywords')."""
        if len(data) == 0:
            return {}
        return {self.kind: data}


# Parsers shared with the movie pages.
from movieParser import DOMHTMLTechParser
from movieParser import DOMHTMLOfficialsitesParser
from movieParser import DOMHTMLAwardsParser
from movieParser import DOMHTMLNewsParser

# Registry consumed by _ModuleProxy: maps each parser name to a tuple of
# ((parser class,), attribute-overrides-or-None); the overrides dict is
# applied to the instance after construction.
_OBJECTS = {
    'maindetails_parser': ((DOMHTMLMaindetailsParser,), None),
    'bio_parser': ((DOMHTMLBioParser,), None),
    'otherworks_parser': ((DOMHTMLOtherWorksParser,), None),
    #'agent_parser': ((DOMHTMLOtherWorksParser,), {'kind': 'agent'}),
    'person_officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
    'person_awards_parser': ((DOMHTMLAwardsParser,), {'subject': 'name'}),
    'publicity_parser': ((DOMHTMLTechParser,), {'kind': 'publicity'}),
    'person_series_parser': ((DOMHTMLSeriesParser,), None),
    'person_contacts_parser': ((DOMHTMLTechParser,), {'kind': 'contacts'}),
    'person_genres_parser': ((DOMHTMLPersonGenresParser,), None),
    'person_keywords_parser': ((DOMHTMLPersonGenresParser,),
                               {'kind': 'keywords'}),
    'news_parser': ((DOMHTMLNewsParser,), None),
}
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/__init__.py�������������������������������������������������������������0000644�0000000�0000000�00000110366�11766731642�016433� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http package (imdb package). This package provides the IMDbHTTPAccessSystem class used to access IMDb's data through the web interface. the imdb.IMDb function will return an instance of this class when called with the 'accessSystem' argument set to "http" or "web" or "html" (this is the default). Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import sys import socket import logging from urllib import FancyURLopener, quote_plus from codecs import lookup from imdb import IMDbBase, imdbURL_movie_main, imdbURL_person_main, \ imdbURL_character_main, imdbURL_company_main, \ imdbURL_keyword_main, imdbURL_find, imdbURL_top250, \ imdbURL_bottom100 from imdb.utils import analyze_title from imdb._exceptions import IMDbDataAccessError, IMDbParserError import searchMovieParser import searchPersonParser import searchCharacterParser import searchCompanyParser import searchKeywordParser import movieParser import personParser import characterParser import companyParser import topBottomParser # Logger for miscellaneous functions. _aux_logger = logging.getLogger('imdbpy.parser.http.aux') IN_GAE = False try: import google.appengine IN_GAE = True _aux_logger.info('IMDbPY is running in the Google App Engine environment') except ImportError: pass class _ModuleProxy: """A proxy to instantiate and access parsers.""" def __init__(self, module, defaultKeys=None, oldParsers=False, useModule=None, fallBackToNew=False): """Initialize a proxy for the given module; defaultKeys, if set, muste be a dictionary of values to set for instanced objects.""" if oldParsers or fallBackToNew: _aux_logger.warn('The old set of parsers was removed; falling ' \ 'back to the new parsers.') self.useModule = useModule if defaultKeys is None: defaultKeys = {} self._defaultKeys = defaultKeys self._module = module def __getattr__(self, name): """Called only when no look-up is found.""" _sm = self._module # Read the _OBJECTS dictionary to build the asked parser. if name in _sm._OBJECTS: _entry = _sm._OBJECTS[name] # Initialize the parser. 
kwds = {} if self.useModule: kwds = {'useModule': self.useModule} parserClass = _entry[0][0] obj = parserClass(**kwds) attrsToSet = self._defaultKeys.copy() attrsToSet.update(_entry[1] or {}) # Set attribute to the object. for key in attrsToSet: setattr(obj, key, attrsToSet[key]) setattr(self, name, obj) return obj return getattr(_sm, name) PY_VERSION = sys.version_info[:2] # The cookies for the "adult" search. # Please don't mess with these account. # Old 'IMDbPY' account. _old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1' _old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q==' # New 'IMDbPYweb' account. _cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1' _cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk=' # imdbpy2010 account. 
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI=' #_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A==' class _FakeURLOpener(object): """Fake URLOpener object, used to return empty strings instead of errors. """ def __init__(self, url, headers): self.url = url self.headers = headers def read(self, *args, **kwds): return '' def close(self, *args, **kwds): pass def info(self, *args, **kwds): return self.headers class IMDbURLopener(FancyURLopener): """Fetch web pages and handle errors.""" _logger = logging.getLogger('imdbpy.parser.http.urlopener') def __init__(self, *args, **kwargs): self._last_url = u'' FancyURLopener.__init__(self, *args, **kwargs) # Headers to add to every request. # XXX: IMDb's web server doesn't like urllib-based programs, # so lets fake to be Mozilla. # Wow! I'm shocked by my total lack of ethic! <g> for header in ('User-Agent', 'User-agent', 'user-agent'): self.del_header(header) self.set_header('User-Agent', 'Mozilla/5.0') # XXX: This class is used also to perform "Exact Primary # [Title|Name]" searches, and so by default the cookie is set. 
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu) self.set_header('Cookie', c_header) def get_proxy(self): """Return the used proxy, or an empty string.""" return self.proxies.get('http', '') def set_proxy(self, proxy): """Set the proxy.""" if not proxy: if self.proxies.has_key('http'): del self.proxies['http'] else: if not proxy.lower().startswith('http://'): proxy = 'http://%s' % proxy self.proxies['http'] = proxy def set_header(self, header, value, _overwrite=True): """Set a default header.""" if _overwrite: self.del_header(header) self.addheaders.append((header, value)) def get_header(self, header): """Return the first value of a header, or None if not present.""" for index in xrange(len(self.addheaders)): if self.addheaders[index][0] == header: return self.addheaders[index][1] return None def del_header(self, header): """Remove a default header.""" for index in xrange(len(self.addheaders)): if self.addheaders[index][0] == header: del self.addheaders[index] break def retrieve_unicode(self, url, size=-1): """Retrieves the given URL, and returns a unicode string, trying to guess the encoding of the data (assuming latin_1 by default)""" encode = None try: if size != -1: self.set_header('Range', 'bytes=0-%d' % size) uopener = self.open(url) kwds = {} if PY_VERSION > (2, 3) and not IN_GAE: kwds['size'] = size content = uopener.read(**kwds) self._last_url = uopener.url # Maybe the server is so nice to tell us the charset... server_encode = uopener.info().getparam('charset') # Otherwise, look at the content-type HTML meta tag. 
if server_encode is None and content: first_bytes = content[:512] begin_h = first_bytes.find('text/html; charset=') if begin_h != -1: end_h = first_bytes[19+begin_h:].find('"') if end_h != -1: server_encode = first_bytes[19+begin_h:19+begin_h+end_h] if server_encode: try: if lookup(server_encode): encode = server_encode except (LookupError, ValueError, TypeError): pass uopener.close() if size != -1: self.del_header('Range') self.close() except IOError, e: if size != -1: # Ensure that the Range header is removed. self.del_header('Range') raise IMDbDataAccessError({'errcode': e.errno, 'errmsg': str(e.strerror), 'url': url, 'proxy': self.get_proxy(), 'exception type': 'IOError', 'original exception': e}) if encode is None: encode = 'latin_1' # The detection of the encoding is error prone... self._logger.warn('Unable to detect the encoding of the retrieved ' 'page [%s]; falling back to default latin1.', encode) ##print unicode(content, encode, 'replace').encode('utf8') return unicode(content, encode, 'replace') def http_error_default(self, url, fp, errcode, errmsg, headers): if errcode == 404: self._logger.warn('404 code returned for %s: %s (headers: %s)', url, errmsg, headers) return _FakeURLOpener(url, headers) raise IMDbDataAccessError({'url': 'http:%s' % url, 'errcode': errcode, 'errmsg': errmsg, 'headers': headers, 'error type': 'http_error_default', 'proxy': self.get_proxy()}) def open_unknown(self, fullurl, data=None): raise IMDbDataAccessError({'fullurl': fullurl, 'data': str(data), 'error type': 'open_unknown', 'proxy': self.get_proxy()}) def open_unknown_proxy(self, proxy, fullurl, data=None): raise IMDbDataAccessError({'proxy': str(proxy), 'fullurl': fullurl, 'error type': 'open_unknown_proxy', 'data': str(data)}) class IMDbHTTPAccessSystem(IMDbBase): """The class used to access IMDb's data through the web.""" accessSystem = 'http' _http_logger = logging.getLogger('imdbpy.parser.http') def __init__(self, isThin=0, adultSearch=1, proxy=-1, oldParsers=False, 
                 fallBackToNew=False, useModule=None, cookie_id=-1,
                 timeout=30, cookie_uu=None, *arguments, **keywords):
        """Initialize the access system.

        adultSearch: include 'adult' titles in search results (default on).
        proxy: http proxy to use; the sentinel -1 leaves it unchanged.
        cookie_id/cookie_uu: account cookies; cookie_id=-1 keeps the
            defaults, None removes the cookies altogether.
        timeout: connection timeout in seconds (see set_timeout).
        isThin, oldParsers, fallBackToNew, useModule are kept only for
        backward compatibility; the 'thin' parsers and the old parser
        set were removed in IMDbPY 4.8.
        """
        IMDbBase.__init__(self, *arguments, **keywords)
        self.urlOpener = IMDbURLopener()
        # When isThin is set, we're parsing the "maindetails" page
        # of a movie (instead of the "combined" page) and movie/person
        # references are not collected if no defaultModFunct is provided.
        #
        # NOTE: httpThin was removed since IMDbPY 4.8.
        self.isThin = isThin
        self._getRefs = True
        self._mdparse = False
        if isThin:
            self._http_logger.warn('"httpThin" access system no longer ' +
                    'supported; "http" used automatically', exc_info=False)
            self.isThin = 0
        # Silently upgrade legacy 'thin' access system names.
        if self.accessSystem in ('httpThin', 'webThin', 'htmlThin'):
            self.accessSystem = 'http'
        self.set_timeout(timeout)
        self.do_adult_search(adultSearch)
        if cookie_id != -1:
            if cookie_id is None:
                self.del_cookies()
            elif cookie_uu is not None:
                self.set_cookies(cookie_id, cookie_uu)
        if proxy != -1:
            self.set_proxy(proxy)
        # useModule may be passed as a comma-separated string.
        if useModule is not None:
            if not isinstance(useModule, (list, tuple)) and ',' in useModule:
                useModule = useModule.split(',')
        # Default attributes set on every parser instance.
        _def = {'_modFunct': self._defModFunct, '_as': self.accessSystem}
        # Proxy objects.
self.smProxy = _ModuleProxy(searchMovieParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.spProxy = _ModuleProxy(searchPersonParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.scProxy = _ModuleProxy(searchCharacterParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.scompProxy = _ModuleProxy(searchCompanyParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.skProxy = _ModuleProxy(searchKeywordParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.mProxy = _ModuleProxy(movieParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.pProxy = _ModuleProxy(personParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.cProxy = _ModuleProxy(characterParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.compProxy = _ModuleProxy(companyParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) self.topBottomProxy = _ModuleProxy(topBottomParser, defaultKeys=_def, oldParsers=oldParsers, useModule=useModule, fallBackToNew=fallBackToNew) def _normalize_movieID(self, movieID): """Normalize the given movieID.""" try: return '%07d' % int(movieID) except ValueError, e: raise IMDbParserError('invalid movieID "%s": %s' % (movieID, e)) def _normalize_personID(self, personID): """Normalize the given personID.""" try: return '%07d' % int(personID) except ValueError, e: raise IMDbParserError('invalid personID "%s": %s' % (personID, e)) def _normalize_characterID(self, characterID): """Normalize the given characterID.""" try: return '%07d' % int(characterID) except ValueError, e: raise IMDbParserError('invalid characterID "%s": %s' % \ (characterID, e)) def 
_normalize_companyID(self, companyID):
        """Normalize the given companyID to the zero-padded 7-digit
        string form used by the web server; raise IMDbParserError
        if it is not an integer."""
        try:
            return '%07d' % int(companyID)
        except ValueError, e:
            raise IMDbParserError('invalid companyID "%s": %s' % \
                    (companyID, e))

    def get_imdbMovieID(self, movieID):
        """Translate a movieID in an imdbID; in this implementation
        the movieID _is_ the imdbID.
        """
        return movieID

    def get_imdbPersonID(self, personID):
        """Translate a personID in an imdbID; in this implementation
        the personID _is_ the imdbID.
        """
        return personID

    def get_imdbCharacterID(self, characterID):
        """Translate a characterID in an imdbID; in this implementation
        the characterID _is_ the imdbID.
        """
        return characterID

    def get_imdbCompanyID(self, companyID):
        """Translate a companyID in an imdbID; in this implementation
        the companyID _is_ the imdbID.
        """
        return companyID

    def get_proxy(self):
        """Return the used proxy or an empty string."""
        return self.urlOpener.get_proxy()

    def set_proxy(self, proxy):
        """Set the web proxy to use.

        It should be a string like 'http://localhost:8080/'; if the
        string is empty, no proxy will be used.
        If set, the value of the environment variable HTTP_PROXY is
        automatically used.
""" self.urlOpener.set_proxy(proxy) def set_timeout(self, timeout): """Set the default timeout, in seconds, of the connection.""" try: timeout = int(timeout) except Exception: timeout = 0 if timeout <= 0: timeout = None socket.setdefaulttimeout(timeout) def set_cookies(self, cookie_id, cookie_uu): """Set a cookie to access an IMDb's account.""" c_header = 'id=%s; uu=%s' % (cookie_id, cookie_uu) self.urlOpener.set_header('Cookie', c_header) def del_cookies(self): """Remove the used cookie.""" self.urlOpener.del_header('Cookie') def do_adult_search(self, doAdult, cookie_id=_cookie_id, cookie_uu=_cookie_uu): """If doAdult is true, 'adult' movies are included in the search results; cookie_id and cookie_uu are optional parameters to select a specific account (see your cookie or cookies.txt file.""" if doAdult: self.set_cookies(cookie_id, cookie_uu) #c_header = 'id=%s; uu=%s' % (cookie_id, cookie_uu) #self.urlOpener.set_header('Cookie', c_header) else: self.urlOpener.del_header('Cookie') def _retrieve(self, url, size=-1, _noCookies=False): """Retrieve the given URL.""" ##print url _cookies = None # XXX: quite obscene, but in some very limited # cases (/ttXXXXXXX/epdate) if the cookies # are set, a 500 error is returned. if _noCookies: _cookies = self.urlOpener.get_header('Cookie') self.del_cookies() self._http_logger.debug('fetching url %s (size: %d)', url, size) try: ret = self.urlOpener.retrieve_unicode(url, size=size) finally: if _noCookies and _cookies: self.urlOpener.set_header('Cookie', _cookies) return ret def _get_search_content(self, kind, ton, results): """Retrieve the web page for a given search. kind can be 'tt' (for titles), 'nm' (for names), 'char' (for characters) or 'co' (for companies). ton is the title or the name to search. 
results is the maximum number of results to be retrieved.""" if isinstance(ton, unicode): try: ton = ton.encode('iso8859-1') except Exception, e: try: ton = ton.encode('utf-8') except Exception, e: pass ##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results)) params = 'q=%s;s=%s;mx=%s' % (quote_plus(ton), kind, str(results)) if kind == 'ep': params = params.replace('s=ep;', 's=tt;ttype=ep;', 1) cont = self._retrieve(self.urls['find'] % params) #print 'URL:', imdbURL_find % params if cont.find('Your search returned more than') == -1 or \ cont.find("displayed the exact matches") == -1: return cont # The retrieved page contains no results, because too many # titles or names contain the string we're looking for. params = 'q=%s;ls=%s;lm=0' % (quote_plus(ton), kind) size = 131072 + results * 512 return self._retrieve(self.urls['find'] % params, size=size) def _search_movie(self, title, results): # The URL of the query. # XXX: To retrieve the complete results list: # params = urllib.urlencode({'more': 'tt', 'q': title}) ##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title}) ##params = 'q=%s&tt=on&mx=%s' % (quote_plus(title), str(results)) ##cont = self._retrieve(imdbURL_find % params) cont = self._get_search_content('tt', title, results) return self.smProxy.search_movie_parser.parse(cont, results=results)['data'] def _search_episode(self, title, results): t_dict = analyze_title(title) if t_dict['kind'] == 'episode': title = t_dict['title'] cont = self._get_search_content('ep', title, results) return self.smProxy.search_movie_parser.parse(cont, results=results)['data'] def get_movie_main(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'combined') return self.mProxy.movie_parser.parse(cont, mdparse=self._mdparse) def get_movie_full_credits(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'fullcredits') return self.mProxy.movie_parser.parse(cont) def get_movie_plot(self, movieID): cont = 
self._retrieve(self.urls['movie_main'] % movieID + 'plotsummary') return self.mProxy.plot_parser.parse(cont, getRefs=self._getRefs) def get_movie_awards(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'awards') return self.mProxy.movie_awards_parser.parse(cont) def get_movie_taglines(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'taglines') return self.mProxy.taglines_parser.parse(cont) def get_movie_keywords(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'keywords') return self.mProxy.keywords_parser.parse(cont) def get_movie_alternate_versions(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'alternateversions') return self.mProxy.alternateversions_parser.parse(cont, getRefs=self._getRefs) def get_movie_crazy_credits(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'crazycredits') return self.mProxy.crazycredits_parser.parse(cont, getRefs=self._getRefs) def get_movie_goofs(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'goofs') return self.mProxy.goofs_parser.parse(cont, getRefs=self._getRefs) def get_movie_quotes(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'quotes') return self.mProxy.quotes_parser.parse(cont, getRefs=self._getRefs) def get_movie_release_dates(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'releaseinfo') ret = self.mProxy.releasedates_parser.parse(cont) ret['info sets'] = ('release dates', 'akas') return ret get_movie_akas = get_movie_release_dates get_movie_release_info = get_movie_release_dates def get_movie_vote_details(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'ratings') return self.mProxy.ratings_parser.parse(cont) def get_movie_official_sites(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'officialsites') return self.mProxy.officialsites_parser.parse(cont) def 
get_movie_trivia(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'trivia') return self.mProxy.trivia_parser.parse(cont, getRefs=self._getRefs) def get_movie_connections(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'movieconnections') return self.mProxy.connections_parser.parse(cont) def get_movie_technical(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'technical') return self.mProxy.tech_parser.parse(cont) def get_movie_business(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'business') return self.mProxy.business_parser.parse(cont, getRefs=self._getRefs) def get_movie_literature(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'literature') return self.mProxy.literature_parser.parse(cont) def get_movie_locations(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'locations') return self.mProxy.locations_parser.parse(cont) def get_movie_soundtrack(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'soundtrack') return self.mProxy.soundtrack_parser.parse(cont) def get_movie_dvd(self, movieID): self._http_logger.warn('dvd information no longer available', exc_info=False) return {} def get_movie_recommendations(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations') return self.mProxy.rec_parser.parse(cont) def get_movie_external_reviews(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'externalreviews') return self.mProxy.externalrev_parser.parse(cont) def get_movie_newsgroup_reviews(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'newsgroupreviews') return self.mProxy.newsgrouprev_parser.parse(cont) def get_movie_misc_sites(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'miscsites') return self.mProxy.misclinks_parser.parse(cont) def get_movie_sound_clips(self, movieID): cont 
= self._retrieve(self.urls['movie_main'] % movieID + 'soundsites') return self.mProxy.soundclips_parser.parse(cont) def get_movie_video_clips(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'videosites') return self.mProxy.videoclips_parser.parse(cont) def get_movie_photo_sites(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'photosites') return self.mProxy.photosites_parser.parse(cont) def get_movie_news(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'news') return self.mProxy.news_parser.parse(cont, getRefs=self._getRefs) def get_movie_amazon_reviews(self, movieID): self._http_logger.warn('amazon review no longer available', exc_info=False) return {} def get_movie_guests(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'epcast') return self.mProxy.episodes_cast_parser.parse(cont) get_movie_episodes_cast = get_movie_guests def get_movie_merchandising_links(self, movieID): self._http_logger.warn('merchandising links no longer available', exc_info=False) return {} def _purge_seasons_data(self, data_d): if '_current_season' in data_d['data']: del data_d['data']['_current_season'] if '_seasons' in data_d['data']: del data_d['data']['_seasons'] return data_d def get_movie_episodes(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'episodes') data_d = self.mProxy.season_episodes_parser.parse(cont) if not data_d and 'data' in data_d: return {} _current_season = data_d['data'].get('_current_season', '') _seasons = data_d['data'].get('_seasons') or [] data_d = self._purge_seasons_data(data_d) data_d['data'].setdefault('episodes', {}) nr_eps = len(data_d['data']['episodes'].get(_current_season) or []) for season in _seasons: if season == _current_season: continue other_cont = self._retrieve(self.urls['movie_main'] % movieID + 'episodes?season=' + str(season)) other_d = self.mProxy.season_episodes_parser.parse(other_cont) other_d = 
self._purge_seasons_data(other_d) other_d['data'].setdefault('episodes', {}) if not (other_d and other_d['data'] and other_d['data']['episodes'][season]): continue nr_eps += len(other_d['data']['episodes'].get(season) or []) data_d['data']['episodes'][season] = other_d['data']['episodes'][season] data_d['data']['number of episodes'] = nr_eps return data_d def get_movie_episodes_rating(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'epdate', _noCookies=True) data_d = self.mProxy.eprating_parser.parse(cont) # set movie['episode of'].movieID for every episode. if data_d.get('data', {}).has_key('episodes rating'): for item in data_d['data']['episodes rating']: episode = item['episode'] episode['episode of'].movieID = movieID return data_d def get_movie_faqs(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'faq') return self.mProxy.movie_faqs_parser.parse(cont, getRefs=self._getRefs) def get_movie_airing(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'tvschedule') return self.mProxy.airing_parser.parse(cont) get_movie_tv_schedule = get_movie_airing def get_movie_synopsis(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'synopsis') return self.mProxy.synopsis_parser.parse(cont) def get_movie_parents_guide(self, movieID): cont = self._retrieve(self.urls['movie_main'] % movieID + 'parentalguide') return self.mProxy.parentsguide_parser.parse(cont) def _search_person(self, name, results): # The URL of the query. 
# XXX: To retrieve the complete results list: # params = urllib.urlencode({'more': 'nm', 'q': name}) ##params = urllib.urlencode({'nm': 'on', 'mx': str(results), 'q': name}) #params = 'q=%s&nm=on&mx=%s' % (quote_plus(name), str(results)) #cont = self._retrieve(imdbURL_find % params) cont = self._get_search_content('nm', name, results) return self.spProxy.search_person_parser.parse(cont, results=results)['data'] def get_person_main(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'maindetails') ret = self.pProxy.maindetails_parser.parse(cont) ret['info sets'] = ('main', 'filmography') return ret def get_person_filmography(self, personID): return self.get_person_main(personID) def get_person_biography(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'bio') return self.pProxy.bio_parser.parse(cont, getRefs=self._getRefs) def get_person_awards(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'awards') return self.pProxy.person_awards_parser.parse(cont) def get_person_other_works(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'otherworks') return self.pProxy.otherworks_parser.parse(cont, getRefs=self._getRefs) #def get_person_agent(self, personID): # cont = self._retrieve(self.urls['person_main'] % personID + 'agent') # return self.pProxy.agent_parser.parse(cont) def get_person_publicity(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'publicity') return self.pProxy.publicity_parser.parse(cont) def get_person_official_sites(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'officialsites') return self.pProxy.person_officialsites_parser.parse(cont) def get_person_news(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 'news') return self.pProxy.news_parser.parse(cont) def get_person_episodes(self, personID): cont = self._retrieve(self.urls['person_main'] % personID + 
                                'filmoseries')
        return self.pProxy.person_series_parser.parse(cont)

    def get_person_merchandising_links(self, personID):
        """Fetch and parse the merchandising ('for sale') page of a person."""
        cont = self._retrieve(self.urls['person_main'] % personID + 'forsale')
        return self.pProxy.sales_parser.parse(cont)

    def get_person_genres_links(self, personID):
        """Fetch and parse the filmography-by-genre page of a person."""
        cont = self._retrieve(self.urls['person_main'] % personID +
                                'filmogenre')
        return self.pProxy.person_genres_parser.parse(cont)

    def get_person_keywords_links(self, personID):
        """Fetch and parse the filmography-by-keyword page of a person."""
        cont = self._retrieve(self.urls['person_main'] % personID +
                                'filmokey')
        return self.pProxy.person_keywords_parser.parse(cont)

    def _search_character(self, name, results):
        """Search for a character name; return the parsed 'data' list."""
        cont = self._get_search_content('char', name, results)
        return self.scProxy.search_character_parser.parse(cont,
                                                    results=results)['data']

    def get_character_main(self, characterID):
        """Fetch and parse the main page of a character; the result covers
        both the 'main' and 'filmography' info sets."""
        cont = self._retrieve(self.urls['character_main'] % characterID)
        ret = self.cProxy.character_main_parser.parse(cont)
        ret['info sets'] = ('main', 'filmography')
        return ret

    # The filmography is parsed from the same page as the main details.
    get_character_filmography = get_character_main

    def get_character_biography(self, characterID):
        """Fetch and parse the biography page of a character."""
        cont = self._retrieve(self.urls['character_main'] % characterID +
                                'bio')
        return self.cProxy.character_bio_parser.parse(cont,
                                                    getRefs=self._getRefs)

    def get_character_episodes(self, characterID):
        """Fetch and parse the TV series episodes page of a character."""
        cont = self._retrieve(self.urls['character_main'] % characterID +
                                'filmoseries')
        return self.cProxy.character_series_parser.parse(cont)

    def get_character_quotes(self, characterID):
        """Fetch and parse the quotes page of a character."""
        cont = self._retrieve(self.urls['character_main'] % characterID +
                                'quotes')
        return self.cProxy.character_quotes_parser.parse(cont,
                                                    getRefs=self._getRefs)

    def _search_company(self, name, results):
        """Search for a company name; return the parsed 'data' list.
        The last fetched URL is passed along, since the parser needs it
        to tell a direct hit from a results page."""
        cont = self._get_search_content('co', name, results)
        url = self.urlOpener._last_url
        return self.scompProxy.search_company_parser.parse(cont, url=url,
                                                    results=results)['data']

    def get_company_main(self, companyID):
        """Fetch and parse the main page of a company."""
        cont = self._retrieve(self.urls['company_main'] % companyID)
        ret = self.compProxy.company_main_parser.parse(cont)
        return ret

    def
_search_keyword(self, keyword, results): # XXX: the IMDb web server seems to have some serious problem with # non-ascii keyword. # E.g.: http://akas.imdb.com/keyword/fianc%E9/ # will return a 500 Internal Server Error: Redirect Recursion. keyword = keyword.encode('utf8', 'ignore') try: cont = self._get_search_content('kw', keyword, results) except IMDbDataAccessError: self._http_logger.warn('unable to search for keyword %s', keyword, exc_info=True) return [] return self.skProxy.search_keyword_parser.parse(cont, results=results)['data'] def _get_keyword(self, keyword, results): keyword = keyword.encode('utf8', 'ignore') try: cont = self._retrieve(self.urls['keyword_main'] % keyword) except IMDbDataAccessError: self._http_logger.warn('unable to get keyword %s', keyword, exc_info=True) return [] return self.skProxy.search_moviekeyword_parser.parse(cont, results=results)['data'] def _get_top_bottom_movies(self, kind): if kind == 'top': parser = self.topBottomProxy.top250_parser url = self.urls['top250'] elif kind == 'bottom': parser = self.topBottomProxy.bottom100_parser url = self.urls['bottom100'] else: return [] cont = self._retrieve(url) return parser.parse(cont)['data'] ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/searchCompanyParser.py��������������������������������������������������0000644�0000000�0000000�00000005537�11766731642�020650� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" 
parser.http.searchCompanyParser module (imdb package). This module provides the HTMLSearchCompanyParser class (and the search_company_parser instance), used to parse the results of a search for a given company. E.g., when searching for the name "Columbia Pictures", the parsed page would be: http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures Copyright 2008-2009 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from imdb.utils import analyze_company_name, build_company_name from utils import Extractor, Attribute, analyze_imdbid from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser class DOMBasicCompanyParser(DOMBasicMovieParser): """Simply get the name of a company and the imdbID. It's used by the DOMHTMLSearchCompanyParser class to return a result for a direct match (when a search on IMDb results in a single company, the web server sends directly the company page. 
""" _titleFunct = lambda self, x: analyze_company_name(x or u'') class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser): _BaseParser = DOMBasicCompanyParser _notDirectHitTitle = '<title>imdb company' _titleBuilder = lambda self, x: build_company_name(x) _linkPrefix = '/company/co' _attrs = [Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'name': "./a[1]/text()", 'notes': "./text()[1]" }, postprocess=lambda x: ( analyze_imdbid(x.get('link')), analyze_company_name(x.get('name')+(x.get('notes') or u''), stripNotes=True) ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, " \ "'/company/co')]/..", attrs=_attrs)] _OBJECTS = { 'search_company_parser': ((DOMHTMLSearchCompanyParser,), {'kind': 'company', '_basic_parser': DOMBasicCompanyParser}) } �����������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/utils.py����������������������������������������������������������������0000644�0000000�0000000�00000103343�11766731642�016031� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.utils module (imdb package). This module provides miscellaneous utilities used by the imdb.parser.http classes. Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import re
import logging
import warnings

from imdb._exceptions import IMDbError
from imdb.utils import flatten, _Container
from imdb.Movie import Movie
from imdb.Person import Person
from imdb.Character import Character


# Year, imdbIndex and kind.
re_yearKind_index = re.compile(r'(\([0-9\?]{4}(?:/[IVXLCDM]+)?\)(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)')

# Match imdb ids in href tags
re_imdbid = re.compile(r'(title/tt|name/nm|character/ch|company/co)([0-9]+)')

def analyze_imdbid(href):
    """Return an imdbID from an URL (the digits only, as a str),
    or None if the URL doesn't contain a recognizable imdb id."""
    if not href:
        return None
    match = re_imdbid.search(href)
    if not match:
        return None
    return str(match.group(2))


# Keys whose values can contain titles/names to be turned into references.
_modify_keys = list(Movie.keys_tomodify_list) + list(Person.keys_tomodify_list)

def _putRefs(d, re_titles, re_names, re_characters, lastKey=None):
    """Iterate over the strings inside list items or dictionary values,
    substitutes movie titles and person names with the (qv) references.

    The structure is modified in place; lastKey tracks the nearest
    enclosing dictionary key, so only values under keys listed in
    _modify_keys are rewritten."""
    if isinstance(d, list):
        for i in xrange(len(d)):
            if isinstance(d[i], (unicode, str)):
                if lastKey in _modify_keys:
                    if re_names:
                        d[i] = re_names.sub(ur"'\1' (qv)", d[i])
                    if re_titles:
                        d[i] = re_titles.sub(ur'_\1_ (qv)', d[i])
                    if re_characters:
                        d[i] = re_characters.sub(ur'#\1# (qv)', d[i])
            elif isinstance(d[i], (list, dict)):
                # Recurse, keeping the key of the enclosing dictionary.
                _putRefs(d[i], re_titles, re_names, re_characters,
                        lastKey=lastKey)
    elif isinstance(d, dict):
        for k, v in d.items():
            lastKey = k
            if isinstance(v, (unicode, str)):
                if lastKey in _modify_keys:
                    if re_names:
                        d[k] = re_names.sub(ur"'\1' (qv)", v)
                    if re_titles:
                        d[k] = re_titles.sub(ur'_\1_ (qv)', v)
                    if re_characters:
                        d[k] = re_characters.sub(ur'#\1# (qv)', v)
            elif isinstance(v, (list, dict)):
                _putRefs(d[k], re_titles, re_names, re_characters,
                        lastKey=lastKey)


# Handle HTML/XML/SGML entities.
# NOTE: the order of the statements below matters; the tables are built
# incrementally and the regular expressions are compiled from them.
from htmlentitydefs import entitydefs
entitydefs = entitydefs.copy()
entitydefsget = entitydefs.get
entitydefs['nbsp'] = ' '

# Entities that must survive untouched (handled separately by the parsers).
sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\'',
                'ndash': '-'}
sgmlentityget = sgmlentity.get
_sgmlentkeys = sgmlentity.keys()

# Map both named entities and their numeric character references to the
# corresponding unicode character.
entcharrefs = {}
entcharrefsget = entcharrefs.get
for _k, _v in entitydefs.items():
    if _k in _sgmlentkeys:
        continue
    if _v[0:2] == '&#':
        dec_code = _v[1:-1]
        _v = unichr(int(_v[2:-1]))
        entcharrefs[dec_code] = _v
    else:
        dec_code = '#' + str(ord(_v))
        _v = unicode(_v, 'latin_1', 'replace')
        entcharrefs[dec_code] = _v
    entcharrefs[_k] = _v
del _sgmlentkeys, _k, _v
# Common non-breaking-space and quote variants.
entcharrefs['#160'] = u' '
entcharrefs['#xA0'] = u' '
entcharrefs['#xa0'] = u' '
entcharrefs['#XA0'] = u' '
entcharrefs['#x22'] = u'"'
entcharrefs['#X22'] = u'"'
# convert &x26; to &, to make BeautifulSoup happy; beware that this
# leaves lone '&' in the html broken, but I assume this is better than
# the contrary...
entcharrefs['#38'] = u'&'
entcharrefs['#x26'] = u'&'
# NOTE(review): duplicated assignment, kept verbatim; harmless.
entcharrefs['#x26'] = u'&'

re_entcharrefs = re.compile('&(%s|\#160|\#\d{1,5}|\#x[0-9a-f]{1,4});' %
                            '|'.join(map(re.escape, entcharrefs)), re.I)
re_entcharrefssub = re_entcharrefs.sub

sgmlentity.update(dict([('#34', u'"'), ('#38', u'&'),
                        ('#60', u'<'), ('#62', u'>'), ('#39', u"'")]))
re_sgmlref = re.compile('&(%s);' % '|'.join(map(re.escape, sgmlentity)))
re_sgmlrefsub = re_sgmlref.sub

# Matches XML-only single tags, like <br/> ; they are invalid in HTML,
# but widely used by IMDb web site.
# :-/
re_xmltags = re.compile('<([a-zA-Z]+)/>')


def _replXMLRef(match):
    """Replace the matched XML/HTML entities and references;
    replace everything except sgml entities like <, >, ..."""
    ref = match.group(1)
    value = entcharrefsget(ref)
    if value is None:
        if ref[0] == '#':
            ref_code = ref[1:]
            if ref_code in ('34', '38', '60', '62', '39'):
                # Keep the sgml-reserved codes untouched.
                return match.group(0)
            elif ref_code[0].lower() == 'x':
                #if ref[2:] == '26':
                #    # Don't convert &x26; to &, to make BeautifulSoup happy.
                #    return '&'
                return unichr(int(ref[2:], 16))
            else:
                return unichr(int(ref[1:]))
        else:
            # Unknown named entity: leave the name as-is.
            return ref
    return value

def subXMLRefs(s):
    """Return the given html string with entity and char references
    replaced."""
    return re_entcharrefssub(_replXMLRef, s)

# XXX: no more used here; move it to mobile (they are imported by helpers, too)?
def _replSGMLRefs(match):
    """Replace the matched SGML entity."""
    ref = match.group(1)
    return sgmlentityget(ref, ref)

def subSGMLRefs(s):
    """Return the given html string with sgml entity and char references
    replaced."""
    return re_sgmlrefsub(_replSGMLRefs, s)


_b_p_logger = logging.getLogger('imdbpy.parser.http.build_person')
def build_person(txt, personID=None, billingPos=None,
                roleID=None, accessSystem='http', modFunct=None):
    """Return a Person instance from the typical <tr>...</tr> strings
    found in the IMDb's web site.

    txt is "name SEP role (notes)"; SEP is '....', '...' or, lacking
    both, an inserted '...(' before the first parenthesis.  roleID may
    be a single id or a list of ids for multiple roles ('/'-separated).
    """
    notes = u''
    role = u''
    # Search the (optional) separator between name and role/notes.
    if txt.find('....') != -1:
        sep = '....'
    elif txt.find('...') != -1:
        sep = '...'
    else:
        sep = '...'
        # Replace the first parenthesis, assuming there are only
        # notes, after.
        # Rationale: no imdbIndex is (ever?) showed on the web site.
        txt = txt.replace('(', '...(', 1)
    txt_split = txt.split(sep, 1)
    name = txt_split[0].strip()
    if len(txt_split) == 2:
        role_comment = txt_split[1].strip()
        # Strip common endings.
        if role_comment[-4:] == ' and':
            role_comment = role_comment[:-4].rstrip()
        elif role_comment[-2:] == ' &':
            role_comment = role_comment[:-2].rstrip()
        elif role_comment[-6:] == '& ....':
            role_comment = role_comment[:-6].rstrip()
        # Get the notes.
        if roleID is not None:
            if not isinstance(roleID, list):
                cmt_idx = role_comment.find('(')
                if cmt_idx != -1:
                    role = role_comment[:cmt_idx].rstrip()
                    notes = role_comment[cmt_idx:]
                else:
                    # Just a role, without notes.
                    role = role_comment
            else:
                role = role_comment
        else:
            # We're managing something that doesn't have a 'role', so
            # everything are notes.
            notes = role_comment
    if role == '....':
        role = u''
    roleNotes = []
    # Manages multiple roleIDs.
    if isinstance(roleID, list):
        rolesplit = role.split('/')
        role = []
        for r in rolesplit:
            nidx = r.find('(')
            if nidx != -1:
                role.append(r[:nidx].rstrip())
                roleNotes.append(r[nidx:])
            else:
                role.append(r)
                roleNotes.append(None)
        lr = len(role)
        lrid = len(roleID)
        if lr > lrid:
            # FIX: pad roleID up to len(role).  The original computed
            # [None] * (lrid - lr), a negative count that added nothing,
            # leaving roleID shorter than roleNotes and silently dropping
            # the role notes below.
            roleID += [None] * (lr - lrid)
        elif lr < lrid:
            roleID = roleID[:lr]
        for i, rid in enumerate(roleID):
            if rid is not None:
                roleID[i] = str(rid)
        if lr == 1:
            role = role[0]
            roleID = roleID[0]
            notes = roleNotes[0] or u''
    elif roleID is not None:
        roleID = str(roleID)
    if personID is not None:
        personID = str(personID)
    if (not name) or (personID is None):
        # Set to 'debug', since build_person is expected to receive some crap.
        _b_p_logger.debug('empty name or personID for "%s"', txt)
        # XXX: return None if something strange is detected?
    person = Person(name=name, personID=personID, currentRole=role,
                    roleID=roleID, notes=notes, billingPos=billingPos,
                    modFunct=modFunct, accessSystem=accessSystem)
    if roleNotes and len(roleNotes) == len(roleID):
        # Attach the per-role notes to the matching Character objects.
        for idx, role in enumerate(person.currentRole):
            if roleNotes[idx]:
                role.notes = roleNotes[idx]
    return person

# Matches a 7-digit characterID.
_re_chrIDs = re.compile('[0-9]{7}')

_b_m_logger = logging.getLogger('imdbpy.parser.http.build_movie')
# To shrink spaces.
re_spaces = re.compile(r'\s+') def build_movie(txt, movieID=None, roleID=None, status=None, accessSystem='http', modFunct=None, _parsingCharacter=False, _parsingCompany=False, year=None, chrRoles=None, rolesNoChar=None, additionalNotes=None): """Given a string as normally seen on the "categorized" page of a person on the IMDb's web site, returns a Movie instance.""" # FIXME: Oook, lets face it: build_movie and build_person are now # two horrible sets of patches to support the new IMDb design. They # must be rewritten from scratch. if _parsingCharacter: _defSep = ' Played by ' elif _parsingCompany: _defSep = ' ... ' else: _defSep = ' .... ' title = re_spaces.sub(' ', txt).strip() # Split the role/notes from the movie title. tsplit = title.split(_defSep, 1) role = u'' notes = u'' roleNotes = [] if len(tsplit) == 2: title = tsplit[0].rstrip() role = tsplit[1].lstrip() if title[-9:] == 'TV Series': title = title[:-9].rstrip() #elif title[-7:] == '(short)': # title = title[:-7].rstrip() #elif title[-11:] == '(TV series)': # title = title[:-11].rstrip() #elif title[-10:] == '(TV movie)': # title = title[:-10].rstrip() elif title[-14:] == 'TV mini-series': title = title[:-14] + ' (mini)' if title and title.endswith(_defSep.rstrip()): title = title[:-len(_defSep)+1] # Try to understand where the movie title ends. while True: if year: break if title[-1:] != ')': # Ignore the silly "TV Series" notice. if title[-9:] == 'TV Series': title = title[:-9].rstrip() continue else: # Just a title: stop here. break # Try to match paired parentheses; yes: sometimes there are # parentheses inside comments... nidx = title.rfind('(') while (nidx != -1 and \ title[nidx:].count('(') != title[nidx:].count(')')): nidx = title[:nidx].rfind('(') # Unbalanced parentheses: stop here. if nidx == -1: break # The last item in parentheses seems to be a year: stop here. 
first4 = title[nidx+1:nidx+5] if (first4.isdigit() or first4 == '????') and \ title[nidx+5:nidx+6] in (')', '/'): break # The last item in parentheses is a known kind: stop here. if title[nidx+1:-1] in ('TV', 'V', 'mini', 'VG', 'TV movie', 'TV series', 'short'): break # Else, in parentheses there are some notes. # XXX: should the notes in the role half be kept separated # from the notes in the movie title half? if notes: notes = '%s %s' % (title[nidx:], notes) else: notes = title[nidx:] title = title[:nidx].rstrip() if year: year = year.strip() if title[-1] == ')': fpIdx = title.rfind('(') if fpIdx != -1: if notes: notes = '%s %s' % (title[fpIdx:], notes) else: notes = title[fpIdx:] title = title[:fpIdx].rstrip() title = u'%s (%s)' % (title, year) if _parsingCharacter and roleID and not role: roleID = None if not roleID: roleID = None elif len(roleID) == 1: roleID = roleID[0] if not role and chrRoles and isinstance(roleID, (str, unicode)): roleID = _re_chrIDs.findall(roleID) role = ' / '.join(filter(None, chrRoles.split('@@'))) # Manages multiple roleIDs. 
if isinstance(roleID, list): tmprole = role.split('/') role = [] for r in tmprole: nidx = r.find('(') if nidx != -1: role.append(r[:nidx].rstrip()) roleNotes.append(r[nidx:]) else: role.append(r) roleNotes.append(None) lr = len(role) lrid = len(roleID) if lr > lrid: roleID += [None] * (lrid - lr) elif lr < lrid: roleID = roleID[:lr] for i, rid in enumerate(roleID): if rid is not None: roleID[i] = str(rid) if lr == 1: role = role[0] roleID = roleID[0] elif roleID is not None: roleID = str(roleID) if movieID is not None: movieID = str(movieID) if (not title) or (movieID is None): _b_m_logger.error('empty title or movieID for "%s"', txt) if rolesNoChar: rolesNoChar = filter(None, [x.strip() for x in rolesNoChar.split('/')]) if not role: role = [] elif not isinstance(role, list): role = [role] role += rolesNoChar notes = notes.strip() if additionalNotes: additionalNotes = re_spaces.sub(' ', additionalNotes).strip() if notes: notes += u' ' notes += additionalNotes if role and isinstance(role, list) and notes.endswith(role[-1].replace('\n', ' ')): role = role[:-1] m = Movie(title=title, movieID=movieID, notes=notes, currentRole=role, roleID=roleID, roleIsPerson=_parsingCharacter, modFunct=modFunct, accessSystem=accessSystem) if roleNotes and len(roleNotes) == len(roleID): for idx, role in enumerate(m.currentRole): try: if roleNotes[idx]: role.notes = roleNotes[idx] except IndexError: break # Status can't be checked here, and must be detected by the parser. if status: m['status'] = status return m class DOMParserBase(object): """Base parser to handle HTML data from the IMDb's web server.""" _defGetRefs = False _containsObjects = False preprocessors = [] extractors = [] usingModule = None _logger = logging.getLogger('imdbpy.parser.http.domparser') def __init__(self, useModule=None): """Initialize the parser. 
        useModule can be used to force it to use 'BeautifulSoup'
        or 'lxml'; by default, it's auto-detected, using 'lxml' if
        available and falling back to 'BeautifulSoup' otherwise."""
        # Module to use.
        if useModule is None:
            useModule = ('lxml', 'BeautifulSoup')
        if not isinstance(useModule, (tuple, list)):
            useModule = [useModule]
        self._useModule = useModule
        nrMods = len(useModule)
        _gotError = False
        # Try each candidate module in order; the first importable one wins.
        for idx, mod in enumerate(useModule):
            mod = mod.strip().lower()
            try:
                if mod == 'lxml':
                    from lxml.html import fromstring
                    from lxml.etree import tostring
                    self._is_xml_unicode = False
                    self.usingModule = 'lxml'
                elif mod == 'beautifulsoup':
                    from bsouplxml.html import fromstring
                    from bsouplxml.etree import tostring
                    self._is_xml_unicode = True
                    self.usingModule = 'beautifulsoup'
                else:
                    self._logger.warn('unknown module "%s"' % mod)
                    continue
                self.fromstring = fromstring
                self._tostring = tostring
                if _gotError:
                    warnings.warn('falling back to "%s"' % mod)
                break
            except ImportError, e:
                if idx+1 >= nrMods:
                    # Raise the exception, if we don't have any more
                    # options to try.
                    raise IMDbError('unable to use any parser in %s: %s' % \
                                    (str(useModule), str(e)))
                else:
                    warnings.warn('unable to use "%s": %s' % (mod, str(e)))
                    _gotError = True
                continue
        else:
            # The for loop exhausted without a break: no usable parser.
            raise IMDbError('unable to use parsers in %s' % str(useModule))
        # Fall-back defaults.
        self._modFunct = None
        self._as = 'http'
        self._cname = self.__class__.__name__
        self._init()
        self.reset()

    def reset(self):
        """Reset the parser."""
        # Names and titles references.
        self._namesRefs = {}
        self._titlesRefs = {}
        self._charactersRefs = {}
        self._reset()

    def _init(self):
        """Subclasses can override this method, if needed."""
        pass

    def _reset(self):
        """Subclasses can override this method, if needed."""
        pass

    def parse(self, html_string, getRefs=None, **kwds):
        """Return the dictionary generated from the given html string;
        getRefs can be used to force the gathering of
        movies/persons/characters references."""
        self.reset()
        if getRefs is not None:
            self.getRefs = getRefs
        else:
            self.getRefs = self._defGetRefs
        # Useful only for the testsuite.
        if not isinstance(html_string, unicode):
            html_string = unicode(html_string, 'latin_1', 'replace')
        html_string = subXMLRefs(html_string)
        # Temporary fix: self.parse_dom must work even for empty strings.
        html_string = self.preprocess_string(html_string)
        html_string = html_string.strip()
        if self.usingModule == 'beautifulsoup':
            # tag attributes like title=""Family Guy"" will be
            # converted to title=""Family Guy"" and this confuses BeautifulSoup.
            html_string = html_string.replace('""', '"')
            # Browser-specific escapes create problems to BeautifulSoup.
            html_string = html_string.replace('<!--[if IE]>', '"')
            html_string = html_string.replace('<![endif]-->', '"')
        #print html_string.encode('utf8')
        if html_string:
            dom = self.get_dom(html_string)
            #print self.tostring(dom).encode('utf8')
            # Every phase below is best-effort: failures are logged and the
            # pipeline continues with whatever data is available.
            try:
                dom = self.preprocess_dom(dom)
            except Exception, e:
                self._logger.error('%s: caught exception preprocessing DOM',
                                    self._cname, exc_info=True)
            if self.getRefs:
                try:
                    self.gather_refs(dom)
                except Exception, e:
                    self._logger.warn('%s: unable to gather refs: %s',
                                    self._cname, exc_info=True)
            data = self.parse_dom(dom)
        else:
            data = {}
        try:
            data = self.postprocess_data(data)
        except Exception, e:
            self._logger.error('%s: caught exception postprocessing data',
                                self._cname, exc_info=True)
        if self._containsObjects:
            self.set_objects_params(data)
        data = self.add_refs(data)
        return data

    def _build_empty_dom(self):
        # Fallback DOM used when the real parse fails or returns None.
        from bsouplxml import _bsoup
        return _bsoup.BeautifulSoup('')

    def get_dom(self, html_string):
        """Return a dom object, from the given string."""
        try:
            dom = self.fromstring(html_string)
            if dom is None:
                dom = self._build_empty_dom()
                self._logger.error('%s: using a fake empty DOM', self._cname)
            return dom
        except Exception, e:
            self._logger.error('%s: caught exception parsing DOM',
                                self._cname, exc_info=True)
            return self._build_empty_dom()

    def xpath(self, element, path):
        """Return elements matching the given XPath."""
        try:
            xpath_result = element.xpath(path)
            if self._is_xml_unicode:
                return xpath_result
            # lxml can return plain str items: normalize them to unicode.
            result = []
            for item in xpath_result:
                if isinstance(item, str):
                    item = unicode(item)
                result.append(item)
            return result
        except Exception, e:
            self._logger.error('%s: caught exception extracting XPath "%s"',
                                self._cname, path, exc_info=True)
            return []

    def tostring(self, element):
        """Convert the element to a string."""
        if isinstance(element, (unicode, str)):
            return unicode(element)
        else:
            try:
                return self._tostring(element, encoding=unicode)
            except Exception, e:
                self._logger.error('%s: unable to convert to string',
                                    self._cname, exc_info=True)
                return u''
    def clone(self, element):
        """Clone an element (by serializing and re-parsing it)."""
        return self.fromstring(self.tostring(element))

    def preprocess_string(self, html_string):
        """Here we can modify the text, before it's parsed."""
        if not html_string:
            return html_string
        # Remove silly  » and – chars.
        html_string = html_string.replace(u' \xbb', u'')
        html_string = html_string.replace(u'–', u'-')
        try:
            preprocessors = self.preprocessors
        except AttributeError:
            return html_string
        # Each preprocessor is a (src, sub) pair: src can be a compiled
        # regular expression, a plain string or a callable.
        for src, sub in preprocessors:
            # re._pattern_type is present only since Python 2.5.
            if callable(getattr(src, 'sub', None)):
                html_string = src.sub(sub, html_string)
            elif isinstance(src, str):
                html_string = html_string.replace(src, sub)
            elif callable(src):
                try:
                    html_string = src(html_string)
                except Exception, e:
                    _msg = '%s: caught exception preprocessing html'
                    self._logger.error(_msg, self._cname, exc_info=True)
                    continue
        ##print html_string.encode('utf8')
        return html_string

    def gather_refs(self, dom):
        """Collect references."""
        grParser = GatherRefs(useModule=self._useModule)
        grParser._as = self._as
        grParser._modFunct = self._modFunct
        refs = grParser.parse_dom(dom)
        refs = grParser.postprocess_data(refs)
        self._namesRefs = refs['names refs']
        self._titlesRefs = refs['titles refs']
        self._charactersRefs = refs['characters refs']

    def preprocess_dom(self, dom):
        """Last chance to modify the dom, before the rules in
        self.extractors are applied by the parse_dom method."""
        return dom

    def parse_dom(self, dom):
        """Parse the given dom according to the rules specified
        in self.extractors."""
        result = {}
        for extractor in self.extractors:
            ##print extractor.label
            if extractor.group is None:
                elements = [(extractor.label, element)
                            for element in self.xpath(dom, extractor.path)]
            else:
                # Grouped extraction: the result key comes from each
                # group's group_key expression, not from the label.
                groups = self.xpath(dom, extractor.group)
                elements = []
                for group in groups:
                    group_key = self.xpath(group, extractor.group_key)
                    if not group_key:
                        continue
                    group_key = group_key[0]
                    # XXX: always tries the conversion to unicode:
                    #      BeautifulSoup.NavigableString is a subclass
                    #      of unicode, and so it's never converted.
                    group_key = self.tostring(group_key)
                    normalizer = extractor.group_key_normalize
                    if normalizer is not None:
                        if callable(normalizer):
                            try:
                                group_key = normalizer(group_key)
                            except Exception, e:
                                _m = '%s: unable to apply group_key normalizer'
                                self._logger.error(_m, self._cname,
                                                    exc_info=True)
                    group_elements = self.xpath(group, extractor.path)
                    elements.extend([(group_key, element)
                                    for element in group_elements])
            for group_key, element in elements:
                for attr in extractor.attrs:
                    # attr.path can be a dict of field->xpath, or one xpath.
                    if isinstance(attr.path, dict):
                        data = {}
                        for field in attr.path.keys():
                            path = attr.path[field]
                            value = self.xpath(element, path)
                            if not value:
                                data[field] = None
                            else:
                                # XXX: use u'' , to join?
                                data[field] = ''.join(value)
                    else:
                        data = self.xpath(element, attr.path)
                        if not data:
                            data = None
                        else:
                            data = attr.joiner.join(data)
                    if not data:
                        continue
                    attr_postprocess = attr.postprocess
                    if callable(attr_postprocess):
                        try:
                            data = attr_postprocess(data)
                        except Exception, e:
                            _m = '%s: unable to apply attr postprocess'
                            self._logger.error(_m, self._cname, exc_info=True)
                    # The output key can be literal, an xpath ('.'-prefixed)
                    # or an instance attribute ('self.'-prefixed).
                    key = attr.key
                    if key is None:
                        key = group_key
                    elif key.startswith('.'):
                        # assuming this is an xpath
                        try:
                            key = self.xpath(element, key)[0]
                        except IndexError:
                            self._logger.error('%s: XPath returned no items',
                                                self._cname, exc_info=True)
                    elif key.startswith('self.'):
                        key = getattr(self, key[5:])
                    if attr.multi:
                        if key not in result:
                            result[key] = []
                        result[key].append(data)
                    else:
                        if isinstance(data, dict):
                            result.update(data)
                        else:
                            result[key] = data
        return result

    def postprocess_data(self, data):
        """Here we can modify the data."""
        return data

    def set_objects_params(self, data):
        """Set parameters of Movie/Person/...
        instances, since they are not always set in the parser's code."""
        for obj in flatten(data, yieldDictKeys=True, scalar=_Container):
            obj.accessSystem = self._as
            obj.modFunct = self._modFunct

    def add_refs(self, data):
        """Modify data according to the expected output."""
        if self.getRefs:
            # Build one alternation regexp per reference kind; an empty
            # pattern '()' means no references of that kind were gathered.
            titl_re = ur'(%s)' % '|'.join([re.escape(x) for x
                                            in self._titlesRefs.keys()])
            if titl_re != ur'()':
                re_titles = re.compile(titl_re, re.U)
            else:
                re_titles = None
            nam_re = ur'(%s)' % '|'.join([re.escape(x) for x
                                            in self._namesRefs.keys()])
            if nam_re != ur'()':
                re_names = re.compile(nam_re, re.U)
            else:
                re_names = None
            chr_re = ur'(%s)' % '|'.join([re.escape(x) for x
                                            in self._charactersRefs.keys()])
            if chr_re != ur'()':
                re_characters = re.compile(chr_re, re.U)
            else:
                re_characters = None
            _putRefs(data, re_titles, re_names, re_characters)
        return {'data': data, 'titlesRefs': self._titlesRefs,
                'namesRefs': self._namesRefs,
                'charactersRefs': self._charactersRefs}


class Extractor(object):
    """Instruct the DOM parser about how to parse a document."""
    def __init__(self, label, path, attrs, group=None, group_key=None,
                group_key_normalize=None):
        """Initialize an Extractor object, used to instruct the DOM parser
        about how to parse a document."""
        # rarely (never?) used, mostly for debugging purposes.
        self.label = label
        self.group = group
        if group_key is None:
            self.group_key = ".//text()"
        else:
            self.group_key = group_key
        self.group_key_normalize = group_key_normalize
        self.path = path
        # A list of attributes to fetch.
        if isinstance(attrs, Attribute):
            attrs = [attrs]
        self.attrs = attrs

    def __repr__(self):
        """String representation of an Extractor object."""
        r = '<Extractor id:%s (label=%s, path=%s, attrs=%s, group=%s, ' \
            'group_key=%s group_key_normalize=%s)>' % (id(self), self.label,
                    self.path, repr(self.attrs), self.group, self.group_key,
                    self.group_key_normalize)
        return r


class Attribute(object):
    """The attribute to consider, for a given node."""
    def __init__(self, key, multi=False, path=None, joiner=None,
                postprocess=None):
        """Initialize an Attribute object, used to specify the attribute
        to consider, for a given node."""
        # The key under which information will be saved; can be a string or an
        # XPath. If None, the label of the containing extractor will be used.
        self.key = key
        self.multi = multi
        self.path = path
        if joiner is None:
            joiner = ''
        self.joiner = joiner
        # Post-process this set of information.
        self.postprocess = postprocess

    def __repr__(self):
        """String representation of an Attribute object."""
        r = '<Attribute id:%s (key=%s, multi=%s, path=%s, joiner=%s, ' \
            'postprocess=%s)>' % (id(self), self.key,
                    self.multi, repr(self.path), self.joiner,
                    repr(self.postprocess))
        return r


def _parse_ref(text, link, info):
    """Manage links to references.

    For title links, append the year/kind information found right
    after the anchor; returns a (text, link) pair."""
    if link.find('/title/tt') != -1:
        yearK = re_yearKind_index.match(info)
        if yearK and yearK.start() == 0:
            text += ' %s' % info[:yearK.end()]
    return (text.replace('\n', ' '), link)


class GatherRefs(DOMParserBase):
    """Parser used to gather references to movies, persons and characters."""
    _attrs = [Attribute(key=None, multi=True,
                path={
                    'text': './text()',
                    'link': './@href',
                    'info': './following::text()[1]'
                    },
                postprocess=lambda x: _parse_ref(x.get('text') or u'',
                                        x.get('link') or '',
                                        (x.get('info') or u'').strip()))]
    # The string-length filters keep only canonical /name/nm… and
    # /title/tt… links (i.e. links that end with the bare id).
    extractors = [
        Extractor(label='names refs',
            path="//a[starts-with(@href, '/name/nm')][string-length(@href)=16]",
            attrs=_attrs),

        Extractor(label='titles refs',
            path="//a[starts-with(@href, '/title/tt')]" \
"[string-length(@href)=17]", attrs=_attrs), Extractor(label='characters refs', path="//a[starts-with(@href, '/character/ch')]" \ "[string-length(@href)=21]", attrs=_attrs), ] def postprocess_data(self, data): result = {} for item in ('names refs', 'titles refs', 'characters refs'): result[item] = {} for k, v in data.get(item, []): k = k.strip() v = v.strip() if not (k and v): continue if not v.endswith('/'): continue imdbID = analyze_imdbid(v) if item == 'names refs': obj = Person(personID=imdbID, name=k, accessSystem=self._as, modFunct=self._modFunct) elif item == 'titles refs': obj = Movie(movieID=imdbID, title=k, accessSystem=self._as, modFunct=self._modFunct) else: obj = Character(characterID=imdbID, name=k, accessSystem=self._as, modFunct=self._modFunct) # XXX: companies aren't handled: are they ever found in text, # as links to their page? result[item][k] = obj return result def add_refs(self, data): return data ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/searchPersonParser.py���������������������������������������������������0000644�0000000�0000000�00000007236�11766731642�020506� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.searchPersonParser module (imdb package). This module provides the HTMLSearchPersonParser class (and the search_person_parser instance), used to parse the results of a search for a given person. 
E.g., when searching for the name "Mel Gibson", the parsed page would be: http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20 Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re from imdb.utils import analyze_name, build_name from utils import Extractor, Attribute, analyze_imdbid from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser def _cleanName(n): """Clean the name in a title tag.""" if not n: return u'' n = n.replace('Filmography by type for', '') # FIXME: temporary. return n class DOMBasicPersonParser(DOMBasicMovieParser): """Simply get the name of a person and the imdbID. 
It's used by the DOMHTMLSearchPersonParser class to return a result for a direct match (when a search on IMDb results in a single person, the web server sends directly the movie page.""" _titleFunct = lambda self, x: analyze_name(_cleanName(x), canonical=1) _reAKASp = re.compile(r'(?:aka|birth name) (<em>")(.*?)"(<br>|<\/em>|<\/td>)', re.I | re.M) class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser): """Parse the html page that the IMDb web server shows when the "new search system" is used, for persons.""" _BaseParser = DOMBasicPersonParser _notDirectHitTitle = '<title>imdb name' _titleBuilder = lambda self, x: build_name(x, canonical=True) _linkPrefix = '/name/nm' _attrs = [Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'name': "./a[1]/text()", 'index': "./text()[1]", 'akas': ".//div[@class='_imdbpyAKA']/text()" }, postprocess=lambda x: ( analyze_imdbid(x.get('link') or u''), analyze_name((x.get('name') or u'') + \ (x.get('index') or u''), canonical=1), x.get('akas') ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, '/name/nm')]/..", attrs=_attrs)] def preprocess_string(self, html_string): if self._notDirectHitTitle in html_string[:1024].lower(): html_string = _reAKASp.sub( r'\1<div class="_imdbpyAKA">\2::</div>\3', html_string) return DOMHTMLSearchMovieParser.preprocess_string(self, html_string) _OBJECTS = { 'search_person_parser': ((DOMHTMLSearchPersonParser,), {'kind': 'person', '_basic_parser': DOMBasicPersonParser}) } 
������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/searchKeywordParser.py��������������������������������������������������0000644�0000000�0000000�00000010353�11766731642�020656� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.searchKeywordParser module (imdb package). This module provides the HTMLSearchKeywordParser class (and the search_company_parser instance), used to parse the results of a search for a given keyword. E.g., when searching for the keyword "alabama", the parsed page would be: http://akas.imdb.com/find?s=kw;mx=20;q=alabama Copyright 2009 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from utils import Extractor, Attribute, analyze_imdbid from imdb.utils import analyze_title, analyze_company_name from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser class DOMBasicKeywordParser(DOMBasicMovieParser): """Simply get the name of a keyword. It's used by the DOMHTMLSearchKeywordParser class to return a result for a direct match (when a search on IMDb results in a single keyword, the web server sends directly the keyword page. """ # XXX: it's still to be tested! # I'm not even sure there can be a direct hit, searching for keywords. _titleFunct = lambda self, x: analyze_company_name(x or u'') class DOMHTMLSearchKeywordParser(DOMHTMLSearchMovieParser): """Parse the html page that the IMDb web server shows when the "new search system" is used, searching for keywords similar to the one given.""" _BaseParser = DOMBasicKeywordParser _notDirectHitTitle = '<title>imdb keyword' _titleBuilder = lambda self, x: x _linkPrefix = '/keyword/' _attrs = [Attribute(key='data', multi=True, path="./a[1]/text()" )] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, " \ "'/keyword/')]/..", attrs=_attrs)] def custom_analyze_title4kwd(title, yearNote, outline): """Return a dictionary with the needed info.""" title = title.strip() if not title: return {} if yearNote: yearNote = '%s)' % yearNote.split(' ')[0] title = title + ' ' + yearNote retDict = analyze_title(title) if outline: retDict['plot outline'] = outline return retDict class DOMHTMLSearchMovieKeywordParser(DOMHTMLSearchMovieParser): """Parse the html page that the IMDb web server shows when the "new search system" is used, searching for movies with the given keyword.""" _notDirectHitTitle = '<title>best' _attrs = [Attribute(key='data', multi=True, path={ 'link': 
"./a[1]/@href", 'info': "./a[1]//text()", 'ynote': "./span[@class='desc']/text()", 'outline': "./span[@class='outline']//text()" }, postprocess=lambda x: ( analyze_imdbid(x.get('link') or u''), custom_analyze_title4kwd(x.get('info') or u'', x.get('ynote') or u'', x.get('outline') or u'') ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, " \ "'/title/tt')]/..", attrs=_attrs)] _OBJECTS = { 'search_keyword_parser': ((DOMHTMLSearchKeywordParser,), {'kind': 'keyword', '_basic_parser': DOMBasicKeywordParser}), 'search_moviekeyword_parser': ((DOMHTMLSearchMovieKeywordParser,), None) } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/��������������������������������������������������������������0000755�0000000�0000000�00000000000�11766731642�016340� 5����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/etree.py������������������������������������������������������0000644�0000000�0000000�00000005250�11766731642�020020� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.bsouplxml.etree module (imdb.parser.http package). 
This module adapts the beautifulsoup interface to lxml.etree module. Copyright 2008 H. Turgut Uyar <uyar@tekir.org> 2008 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import _bsoup as BeautifulSoup from _bsoup import Tag as Element import bsoupxpath # Not directly used by IMDbPY, but do not remove: it's used by IMDbPYKit, # for example. def fromstring(xml_string): """Return a DOM representation of the string.""" # We try to not use BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES, # for convertEntities. return BeautifulSoup.BeautifulStoneSoup(xml_string, convertEntities=None).findChild(True) def tostring(element, encoding=None, pretty_print=False): """Return a string or unicode representation of an element.""" if encoding is unicode: encoding = None # For BeautifulSoup 3.1 #encArgs = {'prettyPrint': pretty_print} #if encoding is not None: # encArgs['encoding'] = encoding #return element.encode(**encArgs) return element.__str__(encoding, pretty_print) def setattribute(tag, name, value): tag[name] = value def xpath(node, expr): """Apply an xpath expression to a node. 
Return a list of nodes.""" #path = bsoupxpath.Path(expr) path = bsoupxpath.get_path(expr) return path.apply(node) # XXX: monkey patching the beautifulsoup tag class class _EverythingIsNestable(dict): """"Fake that every tag is nestable.""" def get(self, key, *args, **kwds): return [] BeautifulSoup.BeautifulStoneSoup.NESTABLE_TAGS = _EverythingIsNestable() BeautifulSoup.Tag.tag = property(fget=lambda self: self.name) BeautifulSoup.Tag.attrib = property(fget=lambda self: self) BeautifulSoup.Tag.text = property(fget=lambda self: self.string) BeautifulSoup.Tag.set = setattribute BeautifulSoup.Tag.getparent = lambda self: self.parent BeautifulSoup.Tag.drop_tree = BeautifulSoup.Tag.extract BeautifulSoup.Tag.xpath = xpath # TODO: setting the text attribute for tags ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/__init__.py���������������������������������������������������0000644�0000000�0000000�00000000000�11766731642�020437� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/html.py�������������������������������������������������������0000644�0000000�0000000�00000002227�11766731642�017661� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.bsouplxml.html module (imdb.parser.http package). This module adapts the beautifulsoup interface to lxml.html module. Copyright 2008 H. Turgut Uyar <uyar@tekir.org> 2008 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import _bsoup as BeautifulSoup def fromstring(html_string): """Return a DOM representation of the string.""" return BeautifulSoup.BeautifulSoup(html_string, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES ).findChild(True) �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/_bsoup.py�����������������������������������������������������0000644�0000000�0000000�00000230323�11766731642�020204� 
0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" imdb.parser.http._bsoup module (imdb.parser.http package). This is the BeautifulSoup.py module, not modified; it's included here so that it's not an external dependency. Beautiful Soup Elixir and Tonic "The Screen-Scraper's Friend" http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup parses a (possibly invalid) XML or HTML document into a tree representation. It provides methods and Pythonic idioms that make it easy to navigate, search, and modify the tree. A well-formed XML/HTML document yields a well-formed data structure. An ill-formed XML/HTML document yields a correspondingly ill-formed data structure. If your document is only locally well-formed, you can use this library to find and process the well-formed part of it. Beautiful Soup works with Python 2.2 and up. It has no external dependencies, but you'll have more success at converting data to UTF-8 if you also install these three packages: * chardet, for auto-detecting character encodings http://chardet.feedparser.org/ * cjkcodecs and iconv_codec, which add more encodings to the ones supported by stock Python. http://cjkpython.i18n.org/ Beautiful Soup defines classes for two main parsing strategies: * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific language that kind of looks like XML. * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid or invalid. This class has web browser-like heuristics for obtaining a sensible parse tree in the face of common HTML errors. Beautiful Soup also defines a class (UnicodeDammit) for autodetecting the encoding of an HTML or XML document, and converting it to Unicode. 
Much of this code is taken from Mark Pilgrim's Universal Feed Parser. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/documentation.html Here, have some legalese: Copyright (c) 2004-2008, Leonard Richardson All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the the Beautiful Soup Consortium and All Night Kosher Bakery nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
""" from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.0.7a" __copyright__ = "Copyright (c) 2004-2008 Leonard Richardson" __license__ = "New-style BSD" from sgmllib import SGMLParser, SGMLParseError import codecs import markupbase import types import re import sgmllib try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} try: set except NameError: from sets import Set as set #These hacks make Beautiful Soup able to parse XML with namespaces sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" # First, the classes that represent markup elements. class PageElement: """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def setup(self, parent=None, previous=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent self.previous = previous self.next = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.contents.index(self) if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: # We're replacing this element with one of its siblings. index = self.parent.contents.index(replaceWith) if index and index < myIndex: # Furthermore, it comes before this element. That # means that when we extract it, the index of this # element will change. 
myIndex = myIndex - 1 self.extract() oldParent.insert(myIndex, replaceWith) def extract(self): """Destructively rips this element out of the tree.""" if self.parent: try: self.parent.contents.remove(self) except ValueError: pass #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect #the two. lastChild = self._lastRecursiveChild() nextElement = lastChild.next if self.previous: self.previous.next = nextElement if nextElement: nextElement.previous = self.previous self.previous = None lastChild.next = None self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling if self.nextSibling: self.nextSibling.previousSibling = self.previousSibling self.previousSibling = self.nextSibling = None return self def _lastRecursiveChild(self): "Finds the last element beneath this object to be parsed." lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: lastChild = lastChild.contents[-1] return lastChild def insert(self, position, newChild): if (isinstance(newChild, basestring) or isinstance(newChild, unicode)) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent != None: # We're 'inserting' an element that's already one # of this object's children. if newChild.parent == self: index = self.find(newChild) if index and index < position: # Furthermore we're moving it further down the # list of this object's children. That means that # when we extract this element, our target index # will jump down one. 
position = position - 1 newChild.extract() newChild.parent = self previousChild = None if position == 0: newChild.previousSibling = None newChild.previous = self else: previousChild = self.contents[position-1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild newChild.previous = previousChild._lastRecursiveChild() if newChild.previous: newChild.previous.next = newChild newChildsLastElement = newChild._lastRecursiveChild() if position >= len(self.contents): newChild.nextSibling = None parent = self parentsNextSibling = None while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent if not parent: # This is the last element in the document. break if parentsNextSibling: newChildsLastElement.next = parentsNextSibling else: newChildsLastElement.next = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild if newChildsLastElement.next: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def findNext(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findAllNext, name, attrs, text, **kwargs) def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findNextSiblings, name, 
attrs, text, **kwargs) def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousGenerator, **kwargs) fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs) def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x def findParent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given criteria.""" # NOTE: We can't use _findOne because findParents takes a different # set of arguments. 
r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r def findParents(self, name=None, attrs={}, limit=None, **kwargs): """Returns the parents of this Tag that match the given criteria.""" return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) fetchParents = findParents # Compatibility with pre-3.x #These methods do the real heavy lifting. def _findOne(self, method, name, attrs, text, **kwargs): r = None l = method(name, attrs, text, 1, **kwargs) if l: r = l[0] return r def _findAll(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." if isinstance(name, SoupStrainer): strainer = name else: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) g = generator() while True: try: i = g.next() except StopIteration: break if i: found = strainer.search(i) if found: results.append(found) if limit and len(results) >= limit: break return results #These Generators can be used to navigate starting from both #NavigableStrings and Tags. def nextGenerator(self): i = self while i: i = i.next yield i def nextSiblingGenerator(self): i = self while i: i = i.nextSibling yield i def previousGenerator(self): i = self while i: i = i.previous yield i def previousSiblingGenerator(self): i = self while i: i = i.previousSibling yield i def parentGenerator(self): i = self while i: i = i.parent yield i # Utility methods def substituteEncoding(self, str, encoding=None): encoding = encoding or "utf-8" return str.replace("%SOUP-ENCODING%", encoding) def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. 
.""" if isinstance(s, unicode): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: s = unicode(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s class NavigableString(unicode, PageElement): def __new__(cls, value): """Create a new NavigableString. When unpickling a NavigableString, this method is called with the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ if isinstance(value, unicode): return unicode.__new__(cls, value) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (NavigableString.__str__(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards compatibility for Navigable*String, but for CData* it lets you get the string without the CData wrapper.""" if attr == 'string': return self else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) def __unicode__(self): return str(self).decode(DEFAULT_OUTPUT_ENCODING) def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): if encoding: return self.encode(encoding) else: return self class CData(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) class ProcessingInstruction(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): output = self if "%SOUP-ENCODING%" in output: output = self.substituteEncoding(output, encoding) return "<?%s?>" % self.toEncoding(output, encoding) class Comment(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!--%s-->" % NavigableString.__str__(self, encoding) class Declaration(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!%s>" % NavigableString.__str__(self, encoding) class Tag(PageElement): """Represents 
    a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    # The five predefined XML entities and the characters they stand for.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    # Reverse mapping: special character -> entity name.
    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))
        elif self.escapeUnrecognizedEntities:
            # NOTE(review): this branch returns the same string as the
            # else branch below; upstream BeautifulSoup 3.0.x escapes the
            # ampersand here ("&amp;%s;") -- looks like a transcription
            # loss, confirm against the upstream source.
            return u'&%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # (Python 2 tuple-parameter lambda: each attr is a (key, value) pair.)
        convert = lambda(k, val): (k,
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                          self._convertEntities,
                                          val))
        self.attrs = map(convert, self.attrs)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        # Dict-style membership test over the tag's attributes.
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        # Membership tests direct children only, not descendants.
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        # Update every occurrence of the key in the (key, value) pair list;
        # bad HTML can define the same attribute more than once.
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
            self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
            self._getAttrMap()
            if self.attrMap.has_key(key):
                del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return apply(self.findAll, args, kwargs)

    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        # soup.fooTag is shorthand for soup.find('foo'); plain soup.foo
        # also works as long as the name is not a dunder attribute.
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches angle brackets, or ampersands that are not already part of
    # a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the root document tag) render only their
            # contents, not their own markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        contents = [i for i in self.contents]
        for i in contents:
            if isinstance(i, Tag):
                i.decompose()
            else:
                i.extract()
        self.extract()

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Convenience wrapper: render with prettyPrint enabled.
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        # Pre-3.x compatibility: text-only findAll.
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        # Pre-3.x compatibility: text-only find.
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # NOTE(review): getattr(self, 'attrMap') with no default relies on
        # Tag.__getattr__ returning a falsy find() result when the
        # attribute is missing, rather than raising -- confirm intended.
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        # Yields this tag's direct children, in document order.
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        # NOTE(review): raising StopIteration inside a generator is a
        # Python 2 idiom; under PEP 479 (Python 3.7+) this would become
        # a RuntimeError.
        raise StopIteration

    def recursiveChildGenerator(self):
        # Depth-first, pre-order traversal of all descendants, using an
        # explicit stack of (tag, resume-index) pairs instead of recursion.
        stack =  [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            # Remember where to resume in the parent before
                            # descending into this child.
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
        # NOTE(review): see childGenerator about raise StopIteration.
        raise StopIteration

# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        # A bare string passed as 'attrs' is shorthand for matching on
        # the 'class' attribute.
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy before merging so the caller's dict isn't mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either a Tag object
        or a (name, attrs) pair; returns the matched object or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name gets the raw (name, attrs) data, not a Tag.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Build the attribute map lazily, only if there are
                        # attribute criteria to check.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # True matches any non-None value (e.g. "has this attribute").
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): 'markup' is a string or None at this point,
                # and strings have no has_key; this branch is inherited
                # from upstream BeautifulSoup 3.0.x -- confirm it is ever
                # reached with a dict-like markup before relying on it.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Match unicode against unicode, str against str.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # NOTE(review): list.__init__([]) initializes a throwaway list,
        # not self; harmless because self starts empty, but presumably
        # list.__init__(self) was intended -- same in upstream 3.0.x.
        list.__init__([])
        self.source = source

# Now, some helper functions.

def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    return hasattr(l, '__iter__') \
           or (type(l) in (types.ListType, types.TupleType))

def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Python 2: unicode/basestring exist.
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k,v in portion.items():
                built[k] = v
        elif isList(portion):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

# Now, the parser classes.

class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these to teach the parser about tag behavior.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex fixups applied to the markup before parsing (see __init__'s
    # markupMassage argument).
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    # Values accepted by the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    # Translation table deleting ASCII whitespace (tab, LF, FF, CR, space).
    STRIP_ASCII_SPACES = { 9: None,
                           10: None,
                           12: None,
                           13: None,
                           32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised by start_meta when the document is re-parsed with a
            # newly discovered encoding; the re-parse already finished.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        # Re-initialize both parser and tree state; the soup object
        # itself is the root tag.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes accumulated character data into the tree as an
        instance of containerClass (NavigableString by default)."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # Collapse all-whitespace text to a single space or newline,
            # unless we're inside a whitespace-preserving tag.
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)

    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instqance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Re-serialize the attrs (Python 2 tuple-parameter lambda).
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Character data is buffered until endData() flushes it.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T".
                # We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                # NOTE(review): upstream BeautifulSoup 3.0.x writes
                # "&amp;%s" here; the missing "amp;" looks like a
                # transcription loss -- confirm against upstream.
                data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Bogus declaration: treat everything from here on as
                # raw character data.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable, try
    ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML defaults: convert smart quotes to HTML entities unless the
        # caller says otherwise, and tell UnicodeDammit this is HTML.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS,
                                     'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort the current parse; _feed already reparsed
                        # the whole document with the new encoding.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

class StopParsing(Exception):
    # Control-flow exception used by start_meta to abandon a parse after
    # re-feeding the document with a newly discovered encoding.
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup or
    BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do.
Mainly I like the name.""" def popTag(self): if len(self.tagStack) > 1: tag = self.tagStack[-1] parent = self.tagStack[-2] parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and not parent.attrMap.has_key(tag.name)): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) #Enterprise class names! It has come to our attention that some people #think the names of the Beautiful Soup parser classes are too silly #and "unprofessional" for use in enterprise screen-scraping. We feel #your pain! For such-minded folk, the Beautiful Soup Consortium And #All-Night Kosher Bakery recommends renaming this file to #"RobustParser.py" (or, in cases of extreme enterprisiness, #"RobustParserBeanInterface.class") and using the following #enterprise-friendly class aliases: class RobustXMLParser(BeautifulStoneSoup): pass class RobustHTMLParser(BeautifulSoup): pass class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup): pass class RobustInsanelyWackAssHTMLParser(MinimalSoup): pass class SimplifyingSOAPParser(BeautifulSOAP): pass ###################################################### # # Bonus library: Unicode, Dammit # # This class forces XML data into a standard format (usually to UTF-8 # or Unicode). It is heavily based on code from Mark Pilgrim's # Universal Feed Parser. It does not rewrite the XML or HTML to # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi # (XML) and BeautifulSoup.start_meta (HTML). # Autodetects character encodings. # Download from http://chardet.feedparser.org/ try: import chardet # import chardet.constants # chardet.constants._debug = 1 except ImportError: chardet = None # cjkcodecs and iconv_codec make Python know about more character encodings. # Both are available from http://cjkpython.i18n.org/ # They're built in if you use Python 2.4. 
try: import cjkcodecs.aliases except ImportError: pass try: import iconv_codec except ImportError: pass class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is windows-1252, can replace MS smart quotes with their HTML or XML equivalents.""" # This dictionary maps commonly seen values for "charset" in HTML # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False): self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): self.originalEncoding = None self.unicode = unicode(markup) return u = None for proposedEncoding in overrideEncodings: u = self._convertFrom(proposedEncoding) if u: break if not u: for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) if u: break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): u = self._convertFrom(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: if not u: for proposed_encoding in ("utf-8", "windows-1252"): u = self._convertFrom(proposed_encoding) if u: break self.unicode = u if not u: self.originalEncoding = None def _subMSChar(self, orig): """Changes a MS smart quote character to an XML or HTML entity.""" sub = self.MS_CHARS.get(orig) if type(sub) == types.TupleType: if self.smartQuotesTo == 'xml': sub = '&#x%s;' % sub[1] else: sub = '&%s;' % sub[0] return sub def _convertFrom(self, proposed): proposed = self.find_codec(proposed) if not proposed or proposed in 
self.triedEncodings: return None self.triedEncodings.append(proposed) markup = self.markup # Convert smart quotes to HTML if coming from an encoding # that might have them. if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): markup = re.compile("([\x80-\x9f])").sub \ (lambda(x): self._subMSChar(x.group(1)), markup) try: # print "Trying to convert document to %s" % proposed u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed except Exception, e: # print "That didn't work!" # print e return None #print "Correct encoding: %s" % proposed return self.markup def _toUnicode(self, data, encoding): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' # strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' data = data[2:] elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': encoding = 'utf-8' data = data[3:] elif data[:4] == '\x00\x00\xfe\xff': encoding = 'utf-32be' data = data[4:] elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] newdata = unicode(data, encoding) return newdata def _detectEncoding(self, xml_data, isHTML=False): """Given a document, tries to detect its XML encoding.""" xml_encoding = sniffed_xml_encoding = None try: if xml_data[:4] == '\x4c\x6f\xa7\x94': # EBCDIC xml_data = self._ebcdic_to_ascii(xml_data) elif xml_data[:4] == '\x00\x3c\x00\x3f': # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x3f\x00': # UTF-16LE 
sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x00\x00': # UTF-32LE sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' pass except: xml_encoding_match = None xml_encoding_match = re.compile( '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) if not xml_encoding_match and isHTML: regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) xml_encoding_match = regexp.search(xml_data) if xml_encoding_match is not None: xml_encoding = xml_encoding_match.groups()[0].lower() if isHTML: self.declaredHTMLEncoding = xml_encoding if sniffed_xml_encoding and \ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): xml_encoding = sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ or (charset and self._codec(charset.replace("-", "_"))) \ or charset def 
_codec(self, charset): if not charset: return charset codec = None try: codecs.lookup(charset) codec = charset except (LookupError, ValueError): pass return codec EBCDIC_TO_ASCII_MAP = None def _ebcdic_to_ascii(self, s): c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ ''.join(map(chr, range(256))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), '\x81' : ' ', '\x82' : ('sbquo', '201A'), '\x83' : ('fnof', '192'), '\x84' : ('bdquo', '201E'), '\x85' : ('hellip', '2026'), '\x86' : ('dagger', '2020'), '\x87' : ('Dagger', '2021'), '\x88' : ('circ', '2C6'), '\x89' : ('permil', '2030'), '\x8A' : ('Scaron', '160'), '\x8B' : ('lsaquo', '2039'), '\x8C' : ('OElig', '152'), '\x8D' : '?', '\x8E' : ('#x17D', '17D'), '\x8F' : '?', '\x90' : '?', '\x91' : ('lsquo', '2018'), '\x92' : ('rsquo', '2019'), '\x93' : ('ldquo', '201C'), '\x94' : ('rdquo', '201D'), '\x95' : ('bull', '2022'), '\x96' : ('ndash', '2013'), '\x97' : ('mdash', '2014'), '\x98' : 
('tilde', '2DC'), '\x99' : ('trade', '2122'), '\x9a' : ('scaron', '161'), '\x9b' : ('rsaquo', '203A'), '\x9c' : ('oelig', '153'), '\x9d' : '?', '\x9e' : ('#x17E', '17E'), '\x9f' : ('Yuml', ''),} ####################################################################### #By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) print soup.prettify() �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/bsouplxml/bsoupxpath.py�������������������������������������������������0000644�0000000�0000000�00000034271�11766731642�021116� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.bsoupxpath module (imdb.parser.http package). This module provides XPath support for BeautifulSoup. Copyright 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ __author__ = 'H. Turgut Uyar <uyar@tekir.org>' __docformat__ = 'restructuredtext' import re import string import _bsoup as BeautifulSoup # XPath related enumerations and constants AXIS_ANCESTOR = 'ancestor' AXIS_ATTRIBUTE = 'attribute' AXIS_CHILD = 'child' AXIS_DESCENDANT = 'descendant' AXIS_FOLLOWING = 'following' AXIS_FOLLOWING_SIBLING = 'following-sibling' AXIS_PRECEDING_SIBLING = 'preceding-sibling' AXES = (AXIS_ANCESTOR, AXIS_ATTRIBUTE, AXIS_CHILD, AXIS_DESCENDANT, AXIS_FOLLOWING, AXIS_FOLLOWING_SIBLING, AXIS_PRECEDING_SIBLING) XPATH_FUNCTIONS = ('starts-with', 'string-length', 'contains') def tokenize_path(path): """Tokenize a location path into location steps. Return the list of steps. If two steps are separated by a double slash, the double slashes are part of the second step. If they are separated by only one slash, the slash is not included in any of the steps. """ # form a list of tuples that mark the start and end positions of steps separators = [] last_position = 0 i = -1 in_string = False while i < len(path) - 1: i = i + 1 if path[i] == "'": in_string = not in_string if in_string: # slashes within strings are not step separators continue if path[i] == '/': if i > 0: separators.append((last_position, i)) if (path[i+1] == '/'): last_position = i i = i + 1 else: last_position = i + 1 separators.append((last_position, len(path))) steps = [] for start, end in separators: steps.append(path[start:end]) return steps class Path: """A location path. 
""" def __init__(self, path, parse=True): self.path = path self.steps = [] if parse: if (path[0] == '/') and (path[1] != '/'): # if not on the descendant axis, remove the leading slash path = path[1:] steps = tokenize_path(path) for step in steps: self.steps.append(PathStep(step)) def apply(self, node): """Apply the path to a node. Return the resulting list of nodes. Apply the steps in the path sequentially by sending the output of each step as input to the next step. """ # FIXME: this should return a node SET, not a node LIST # or at least a list with no duplicates if self.path[0] == '/': # for an absolute path, start from the root if not isinstance(node, BeautifulSoup.Tag) \ or (node.name != '[document]'): node = node.findParent('[document]') nodes = [node] for step in self.steps: nodes = step.apply(nodes) return nodes class PathStep: """A location step in a location path. """ AXIS_PATTERN = r"""(%s)::|@""" % '|'.join(AXES) NODE_TEST_PATTERN = r"""\w+(\(\))?""" PREDICATE_PATTERN = r"""\[(.*?)\]""" LOCATION_STEP_PATTERN = r"""(%s)?(%s)((%s)*)""" \ % (AXIS_PATTERN, NODE_TEST_PATTERN, PREDICATE_PATTERN) _re_location_step = re.compile(LOCATION_STEP_PATTERN) PREDICATE_NOT_PATTERN = r"""not\((.*?)\)""" PREDICATE_AXIS_PATTERN = r"""(%s)?(%s)(='(.*?)')?""" \ % (AXIS_PATTERN, NODE_TEST_PATTERN) PREDICATE_FUNCTION_PATTERN = r"""(%s)\(([^,]+(,\s*[^,]+)*)?\)(=(.*))?""" \ % '|'.join(XPATH_FUNCTIONS) _re_predicate_not = re.compile(PREDICATE_NOT_PATTERN) _re_predicate_axis = re.compile(PREDICATE_AXIS_PATTERN) _re_predicate_function = re.compile(PREDICATE_FUNCTION_PATTERN) def __init__(self, step): self.step = step if (step == '.') or (step == '..'): return if step[:2] == '//': default_axis = AXIS_DESCENDANT step = step[2:] else: default_axis = AXIS_CHILD step_match = self._re_location_step.match(step) # determine the axis axis = step_match.group(1) if axis is None: self.axis = default_axis elif axis == '@': self.axis = AXIS_ATTRIBUTE else: self.axis = step_match.group(2) 
self.soup_args = {} self.index = None self.node_test = step_match.group(3) if self.node_test == 'text()': self.soup_args['text'] = True else: self.soup_args['name'] = self.node_test self.checkers = [] predicates = step_match.group(5) if predicates is not None: predicates = [p for p in predicates[1:-1].split('][') if p] for predicate in predicates: checker = self.__parse_predicate(predicate) if checker is not None: self.checkers.append(checker) def __parse_predicate(self, predicate): """Parse the predicate. Return a callable that can be used to filter nodes. Update `self.soup_args` to take advantage of BeautifulSoup search features. """ try: position = int(predicate) if self.axis == AXIS_DESCENDANT: return PredicateFilter('position', value=position) else: # use the search limit feature instead of a checker self.soup_args['limit'] = position self.index = position - 1 return None except ValueError: pass if predicate == "last()": self.index = -1 return None negate = self._re_predicate_not.match(predicate) if negate: predicate = negate.group(1) function_match = self._re_predicate_function.match(predicate) if function_match: name = function_match.group(1) arguments = function_match.group(2) value = function_match.group(4) if value is not None: value = function_match.group(5) return PredicateFilter(name, arguments, value) axis_match = self._re_predicate_axis.match(predicate) if axis_match: axis = axis_match.group(1) if axis is None: axis = AXIS_CHILD elif axis == '@': axis = AXIS_ATTRIBUTE if axis == AXIS_ATTRIBUTE: # use the attribute search feature instead of a checker attribute_name = axis_match.group(3) if axis_match.group(5) is not None: attribute_value = axis_match.group(6) elif not negate: attribute_value = True else: attribute_value = None if not self.soup_args.has_key('attrs'): self.soup_args['attrs'] = {} self.soup_args['attrs'][attribute_name] = attribute_value return None elif axis == AXIS_CHILD: node_test = axis_match.group(3) node_value = axis_match.group(6) 
return PredicateFilter('axis', node_test, value=node_value, negate=negate) raise NotImplementedError("This predicate is not implemented") def apply(self, nodes): """Apply the step to a list of nodes. Return the list of nodes for the next step. """ if self.step == '.': return nodes elif self.step == '..': return [node.parent for node in nodes] result = [] for node in nodes: if self.axis == AXIS_CHILD: found = node.findAll(recursive=False, **self.soup_args) elif self.axis == AXIS_DESCENDANT: found = node.findAll(recursive=True, **self.soup_args) elif self.axis == AXIS_ATTRIBUTE: try: found = [node[self.node_test]] except KeyError: found = [] elif self.axis == AXIS_FOLLOWING_SIBLING: found = node.findNextSiblings(**self.soup_args) elif self.axis == AXIS_PRECEDING_SIBLING: # TODO: make sure that the result is reverse ordered found = node.findPreviousSiblings(**self.soup_args) elif self.axis == AXIS_FOLLOWING: # find the last descendant of this node last = node while (not isinstance(last, BeautifulSoup.NavigableString)) \ and (len(last.contents) > 0): last = last.contents[-1] found = last.findAllNext(**self.soup_args) elif self.axis == AXIS_ANCESTOR: found = node.findParents(**self.soup_args) # this should only be active if there is a position predicate # and the axis is not 'descendant' if self.index is not None: if found: if len(found) > self.index: found = [found[self.index]] else: found = [] if found: for checker in self.checkers: found = filter(checker, found) result.extend(found) return result class PredicateFilter: """A callable class for filtering nodes. 
""" def __init__(self, name, arguments=None, value=None, negate=False): self.name = name self.arguments = arguments self.negate = negate if name == 'position': self.__filter = self.__position self.value = value elif name == 'axis': self.__filter = self.__axis self.node_test = arguments self.value = value elif name in ('starts-with', 'contains'): if name == 'starts-with': self.__filter = self.__starts_with else: self.__filter = self.__contains args = map(string.strip, arguments.split(',')) if args[0][0] == '@': self.arguments = (True, args[0][1:], args[1][1:-1]) else: self.arguments = (False, args[0], args[1][1:-1]) elif name == 'string-length': self.__filter = self.__string_length args = map(string.strip, arguments.split(',')) if args[0][0] == '@': self.arguments = (True, args[0][1:]) else: self.arguments = (False, args[0]) self.value = int(value) else: raise NotImplementedError("This XPath function is not implemented") def __call__(self, node): if self.negate: return not self.__filter(node) else: return self.__filter(node) def __position(self, node): if isinstance(node, BeautifulSoup.NavigableString): actual_position = len(node.findPreviousSiblings(text=True)) + 1 else: actual_position = len(node.findPreviousSiblings(node.name)) + 1 return actual_position == self.value def __axis(self, node): if self.node_test == 'text()': return node.string == self.value else: children = node.findAll(self.node_test, recursive=False) if len(children) > 0 and self.value is None: return True for child in children: if child.string == self.value: return True return False def __starts_with(self, node): if self.arguments[0]: # this is an attribute attribute_name = self.arguments[1] if node.has_key(attribute_name): first = node[attribute_name] return first.startswith(self.arguments[2]) elif self.arguments[1] == 'text()': first = node.contents and node.contents[0] if isinstance(first, BeautifulSoup.NavigableString): return first.startswith(self.arguments[2]) return False def 
__contains(self, node): if self.arguments[0]: # this is an attribute attribute_name = self.arguments[1] if node.has_key(attribute_name): first = node[attribute_name] return self.arguments[2] in first elif self.arguments[1] == 'text()': first = node.contents and node.contents[0] if isinstance(first, BeautifulSoup.NavigableString): return self.arguments[2] in first return False def __string_length(self, node): if self.arguments[0]: # this is an attribute attribute_name = self.arguments[1] if node.has_key(attribute_name): value = node[attribute_name] else: value = None elif self.arguments[1] == 'text()': value = node.string if value is not None: return len(value) == self.value return False _paths = {} _steps = {} def get_path(path): """Utility for eliminating repeated parsings of the same paths and steps. """ if not _paths.has_key(path): p = Path(path, parse=False) steps = tokenize_path(path) for step in steps: if not _steps.has_key(step): _steps[step] = PathStep(step) p.steps.append(_steps[step]) _paths[path] = p return _paths[path] ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/topBottomParser.py������������������������������������������������������0000644�0000000�0000000�00000007405�11766731642�020037� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.topBottomParser module (imdb package). 
This module provides the classes (and the instances), used to parse the lists of top 250 and bottom 100 movies. E.g.: http://akas.imdb.com/chart/top http://akas.imdb.com/chart/bottom Copyright 2009 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from imdb.utils import analyze_title from utils import DOMParserBase, Attribute, Extractor, analyze_imdbid class DOMHTMLTop250Parser(DOMParserBase): """Parser for the "top 250" page. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = DOMHTMLTop250Parser() result = tparser.parse(top250_html_string) """ label = 'top 250' ranktext = 'top 250 rank' def _init(self): self.extractors = [Extractor(label=self.label, path="//div[@id='main']//table//tr", attrs=Attribute(key=None, multi=True, path={self.ranktext: "./td[1]//text()", 'rating': "./td[2]//text()", 'title': "./td[3]//text()", 'movieID': "./td[3]//a/@href", 'votes': "./td[4]//text()" }))] def postprocess_data(self, data): if not data or self.label not in data: return [] mlist = [] data = data[self.label] # Avoid duplicates. A real fix, using XPath, is auspicabile. # XXX: probably this is no more needed. 
seenIDs = [] for d in data: if 'movieID' not in d: continue if self.ranktext not in d: continue if 'title' not in d: continue theID = analyze_imdbid(d['movieID']) if theID is None: continue theID = str(theID) if theID in seenIDs: continue seenIDs.append(theID) minfo = analyze_title(d['title']) try: minfo[self.ranktext] = int(d[self.ranktext].replace('.', '')) except: pass if 'votes' in d: try: minfo['votes'] = int(d['votes'].replace(',', '')) except: pass if 'rating' in d: try: minfo['rating'] = float(d['rating']) except: pass mlist.append((theID, minfo)) return mlist class DOMHTMLBottom100Parser(DOMHTMLTop250Parser): """Parser for the "bottom 100" page. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: tparser = DOMHTMLBottom100Parser() result = tparser.parse(bottom100_html_string) """ label = 'bottom 100' ranktext = 'bottom 100 rank' _OBJECTS = { 'top250_parser': ((DOMHTMLTop250Parser,), None), 'bottom100_parser': ((DOMHTMLBottom100Parser,), None) } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/http/searchCharacterParser.py������������������������������������������������0000644�0000000�0000000�00000005351�11766731642�021130� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.http.searchCharacterParser module (imdb package). 
This module provides the HTMLSearchCharacterParser class (and the search_character_parser instance), used to parse the results of a search for a given character. E.g., when searching for the name "Jesse James", the parsed page would be: http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James Copyright 2007-2009 Davide Alberani <da@erlug.linux.it> 2008 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from imdb.utils import analyze_name, build_name from utils import Extractor, Attribute, analyze_imdbid from searchMovieParser import DOMHTMLSearchMovieParser, DOMBasicMovieParser class DOMBasicCharacterParser(DOMBasicMovieParser): """Simply get the name of a character and the imdbID. 
It's used by the DOMHTMLSearchCharacterParser class to return a result for a direct match (when a search on IMDb results in a single character, the web server sends directly the movie page.""" _titleFunct = lambda self, x: analyze_name(x or u'', canonical=False) class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser): _BaseParser = DOMBasicCharacterParser _notDirectHitTitle = '<title>imdb search' _titleBuilder = lambda self, x: build_name(x, canonical=False) _linkPrefix = '/character/ch' _attrs = [Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'name': "./a[1]/text()" }, postprocess=lambda x: ( analyze_imdbid(x.get('link') or u''), {'name': x.get('name')} ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, " \ "'/character/ch')]/..", attrs=_attrs)] _OBJECTS = { 'search_character_parser': ((DOMHTMLSearchCharacterParser,), {'kind': 'character', '_basic_parser': DOMBasicCharacterParser}) } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/�������������������������������������������������������������������������0000755�0000000�0000000�00000000000�11766731642�014133� 5����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/__init__.py��������������������������������������������������������������0000644�0000000�0000000�00000200526�11766731642�016251� 
0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.sql package (imdb package). This package provides the IMDbSqlAccessSystem class used to access IMDb's data through a SQL database. Every database supported by the SQLObject _AND_ SQLAlchemy Object Relational Managers is available. the imdb.IMDb function will return an instance of this class when called with the 'accessSystem' argument set to "sql", "database" or "db". Copyright 2005-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ # FIXME: this whole module was written in a veeery short amount of time. # The code should be commented, rewritten and cleaned. 
# :-)

import re
import logging
from difflib import SequenceMatcher
from codecs import lookup

from imdb import IMDbBase
from imdb.utils import normalizeName, normalizeTitle, build_title, \
                        build_name, analyze_name, analyze_title, \
                        canonicalTitle, canonicalName, re_titleRef, \
                        build_company_name, re_episodes, _unicodeArticles, \
                        analyze_company_name, re_year_index, re_nameRef
from imdb.Person import Person
from imdb.Movie import Movie
from imdb.Company import Company
from imdb._exceptions import IMDbDataAccessError, IMDbError

# Logger for miscellaneous functions.
_aux_logger = logging.getLogger('imdbpy.parser.sql.aux')

# =============================
# Things that once upon a time were in imdb.parser.common.locsql.

def titleVariations(title, fromPtdf=0):
    """Build title variations useful for searches; if fromPtdf is
    true, the input is assumed to be in the plain text data files
    format."""
    if fromPtdf:
        title1 = u''
    else:
        title1 = title
    title2 = title3 = u''
    if fromPtdf or re_year_index.search(title):
        # If it appears to have a (year[/imdbIndex]) indication,
        # assume that a long imdb canonical name was provided.
        titldict = analyze_title(title, canonical=1)
        # title1: the canonical name.
        title1 = titldict['title']
        if titldict['kind'] != 'episode':
            # title3: the long imdb canonical name.
            if fromPtdf:
                title3 = title
            else:
                title3 = build_title(titldict, canonical=1, ptdf=1)
        else:
            title1 = normalizeTitle(title1)
            title3 = build_title(titldict, canonical=1, ptdf=1)
    else:
        # Just a title.
        # title1: the canonical title.
        title1 = canonicalTitle(title)
        title3 = u''
    # title2 is title1 without the article, or title1 unchanged.
if title1: title2 = title1 t2s = title2.split(u', ') if t2s[-1].lower() in _unicodeArticles: title2 = u', '.join(t2s[:-1]) _aux_logger.debug('title variations: 1:[%s] 2:[%s] 3:[%s]', title1, title2, title3) return title1, title2, title3 re_nameIndex = re.compile(r'\(([IVXLCDM]+)\)') def nameVariations(name, fromPtdf=0): """Build name variations useful for searches; if fromPtdf is true, the input is assumed to be in the plain text data files format.""" name1 = name2 = name3 = u'' if fromPtdf or re_nameIndex.search(name): # We've a name with an (imdbIndex) namedict = analyze_name(name, canonical=1) # name1 is the name in the canonical format. name1 = namedict['name'] # name3 is the canonical name with the imdbIndex. if fromPtdf: if namedict.has_key('imdbIndex'): name3 = name else: name3 = build_name(namedict, canonical=1) else: # name1 is the name in the canonical format. name1 = canonicalName(name) name3 = u'' # name2 is the name in the normal format, if it differs from name1. name2 = normalizeName(name1) if name1 == name2: name2 = u'' _aux_logger.debug('name variations: 1:[%s] 2:[%s] 3:[%s]', name1, name2, name3) return name1, name2, name3 try: from cutils import ratcliff as _ratcliff def ratcliff(s1, s2, sm): """Return the Ratcliff-Obershelp value between the two strings, using the C implementation.""" return _ratcliff(s1.encode('latin_1', 'replace'), s2.encode('latin_1', 'replace')) except ImportError: _aux_logger.warn('Unable to import the cutils.ratcliff function.' 
' Searching names and titles using the "sql"' ' data access system will be slower.') def ratcliff(s1, s2, sm): """Ratcliff-Obershelp similarity.""" STRING_MAXLENDIFFER = 0.7 s1len = len(s1) s2len = len(s2) if s1len < s2len: threshold = float(s1len) / s2len else: threshold = float(s2len) / s1len if threshold < STRING_MAXLENDIFFER: return 0.0 sm.set_seq2(s2.lower()) return sm.ratio() def merge_roles(mop): """Merge multiple roles.""" new_list = [] for m in mop: if m in new_list: keep_this = new_list[new_list.index(m)] if not isinstance(keep_this.currentRole, list): keep_this.currentRole = [keep_this.currentRole] keep_this.currentRole.append(m.currentRole) else: new_list.append(m) return new_list def scan_names(name_list, name1, name2, name3, results=0, ro_thresold=None, _scan_character=False): """Scan a list of names, searching for best matches against the given variations.""" if ro_thresold is not None: RO_THRESHOLD = ro_thresold else: RO_THRESHOLD = 0.6 sm1 = SequenceMatcher() sm2 = SequenceMatcher() sm3 = SequenceMatcher() sm1.set_seq1(name1.lower()) if name2: sm2.set_seq1(name2.lower()) if name3: sm3.set_seq1(name3.lower()) resd = {} for i, n_data in name_list: nil = n_data['name'] # XXX: on Symbian, here we get a str; not sure this is the # right place to fix it. if isinstance(nil, str): nil = unicode(nil, 'latin1', 'ignore') # Distance with the canonical name. ratios = [ratcliff(name1, nil, sm1) + 0.05] namesurname = u'' if not _scan_character: nils = nil.split(', ', 1) surname = nils[0] if len(nils) == 2: namesurname = '%s %s' % (nils[1], surname) else: nils = nil.split(' ', 1) surname = nils[-1] namesurname = nil if surname != nil: # Distance with the "Surname" in the database. ratios.append(ratcliff(name1, surname, sm1)) if not _scan_character: ratios.append(ratcliff(name1, namesurname, sm1)) if name2: ratios.append(ratcliff(name2, surname, sm2)) # Distance with the "Name Surname" in the database. 
if namesurname: ratios.append(ratcliff(name2, namesurname, sm2)) if name3: # Distance with the long imdb canonical name. ratios.append(ratcliff(name3, build_name(n_data, canonical=1), sm3) + 0.1) ratio = max(ratios) if ratio >= RO_THRESHOLD: if resd.has_key(i): if ratio > resd[i][0]: resd[i] = (ratio, (i, n_data)) else: resd[i] = (ratio, (i, n_data)) res = resd.values() res.sort() res.reverse() if results > 0: res[:] = res[:results] return res def scan_titles(titles_list, title1, title2, title3, results=0, searchingEpisode=0, onlyEpisodes=0, ro_thresold=None): """Scan a list of titles, searching for best matches against the given variations.""" if ro_thresold is not None: RO_THRESHOLD = ro_thresold else: RO_THRESHOLD = 0.6 sm1 = SequenceMatcher() sm2 = SequenceMatcher() sm3 = SequenceMatcher() sm1.set_seq1(title1.lower()) sm2.set_seq2(title2.lower()) if title3: sm3.set_seq1(title3.lower()) if title3[-1] == '}': searchingEpisode = 1 hasArt = 0 if title2 != title1: hasArt = 1 resd = {} for i, t_data in titles_list: if onlyEpisodes: if t_data.get('kind') != 'episode': continue til = t_data['title'] if til[-1] == ')': dateIdx = til.rfind('(') if dateIdx != -1: til = til[:dateIdx].rstrip() if not til: continue ratio = ratcliff(title1, til, sm1) if ratio >= RO_THRESHOLD: resd[i] = (ratio, (i, t_data)) continue if searchingEpisode: if t_data.get('kind') != 'episode': continue elif t_data.get('kind') == 'episode': continue til = t_data['title'] # XXX: on Symbian, here we get a str; not sure this is the # right place to fix it. if isinstance(til, str): til = unicode(til, 'latin1', 'ignore') # Distance with the canonical title (with or without article). # titleS -> titleR # titleS, the -> titleR, the if not searchingEpisode: til = canonicalTitle(til) ratios = [ratcliff(title1, til, sm1) + 0.05] # til2 is til without the article, if present. 
til2 = til tils = til2.split(', ') matchHasArt = 0 if tils[-1].lower() in _unicodeArticles: til2 = ', '.join(tils[:-1]) matchHasArt = 1 if hasArt and not matchHasArt: # titleS[, the] -> titleR ratios.append(ratcliff(title2, til, sm2)) elif matchHasArt and not hasArt: # titleS -> titleR[, the] ratios.append(ratcliff(title1, til2, sm1)) else: ratios = [0.0] if title3: # Distance with the long imdb canonical title. ratios.append(ratcliff(title3, build_title(t_data, canonical=1, ptdf=1), sm3) + 0.1) ratio = max(ratios) if ratio >= RO_THRESHOLD: if resd.has_key(i): if ratio > resd[i][0]: resd[i] = (ratio, (i, t_data)) else: resd[i] = (ratio, (i, t_data)) res = resd.values() res.sort() res.reverse() if results > 0: res[:] = res[:results] return res def scan_company_names(name_list, name1, results=0, ro_thresold=None): """Scan a list of company names, searching for best matches against the given name. Notice that this function takes a list of strings, and not a list of dictionaries.""" if ro_thresold is not None: RO_THRESHOLD = ro_thresold else: RO_THRESHOLD = 0.6 sm1 = SequenceMatcher() sm1.set_seq1(name1.lower()) resd = {} withoutCountry = not name1.endswith(']') for i, n in name_list: # XXX: on Symbian, here we get a str; not sure this is the # right place to fix it. if isinstance(n, str): n = unicode(n, 'latin1', 'ignore') o_name = n var = 0.0 if withoutCountry and n.endswith(']'): cidx = n.rfind('[') if cidx != -1: n = n[:cidx].rstrip() var = -0.05 # Distance with the company name. ratio = ratcliff(name1, n, sm1) + var if ratio >= RO_THRESHOLD: if resd.has_key(i): if ratio > resd[i][0]: resd[i] = (ratio, (i, analyze_company_name(o_name))) else: resd[i] = (ratio, (i, analyze_company_name(o_name))) res = resd.values() res.sort() res.reverse() if results > 0: res[:] = res[:results] return res try: from cutils import soundex except ImportError: _aux_logger.warn('Unable to import the cutils.soundex function.' 
' Searches of movie titles and person names will be' ' a bit slower.') _translate = dict(B='1', C='2', D='3', F='1', G='2', J='2', K='2', L='4', M='5', N='5', P='1', Q='2', R='6', S='2', T='3', V='1', X='2', Z='2') _translateget = _translate.get _re_non_ascii = re.compile(r'^[^a-z]*', re.I) SOUNDEX_LEN = 5 def soundex(s): """Return the soundex code for the given string.""" # Maximum length of the soundex code. s = _re_non_ascii.sub('', s) if not s: return None s = s.upper() soundCode = s[0] for c in s[1:]: cw = _translateget(c, '0') if cw != '0' and soundCode[-1] != cw: soundCode += cw return soundCode[:SOUNDEX_LEN] or None def _sortKeywords(keyword, kwds): """Sort a list of keywords, based on the searched one.""" sm = SequenceMatcher() sm.set_seq1(keyword.lower()) ratios = [(ratcliff(keyword, k, sm), k) for k in kwds] checkContained = False if len(keyword) > 4: checkContained = True for idx, data in enumerate(ratios): ratio, key = data if key.startswith(keyword): ratios[idx] = (ratio+0.5, key) elif checkContained and keyword in key: ratios[idx] = (ratio+0.3, key) ratios.sort() ratios.reverse() return [r[1] for r in ratios] def filterSimilarKeywords(keyword, kwdsIterator): """Return a sorted list of keywords similar to the one given.""" seenDict = {} kwdSndx = soundex(keyword.encode('ascii', 'ignore')) matches = [] matchesappend = matches.append checkContained = False if len(keyword) > 4: checkContained = True for movieID, key in kwdsIterator: if key in seenDict: continue seenDict[key] = None if checkContained and keyword in key: matchesappend(key) continue if kwdSndx == soundex(key.encode('ascii', 'ignore')): matchesappend(key) return _sortKeywords(keyword, matches) # ============================= _litlist = ['screenplay/teleplay', 'novel', 'adaption', 'book', 'production process protocol', 'interviews', 'printed media reviews', 'essays', 'other literature'] _litd = dict([(x, ('literature', x)) for x in _litlist]) _buslist = ['budget', 'weekend gross', 'gross', 
'opening weekend', 'rentals', 'admissions', 'filming dates', 'production dates', 'studios', 'copyright holder'] _busd = dict([(x, ('business', x)) for x in _buslist]) def _reGroupDict(d, newgr): """Regroup keys in the d dictionary in subdictionaries, based on the scheme in the newgr dictionary. E.g.: in the newgr, an entry 'LD label': ('laserdisc', 'label') tells the _reGroupDict() function to take the entry with label 'LD label' (as received from the sql database) and put it in the subsection (another dictionary) named 'laserdisc', using the key 'label'.""" r = {} newgrks = newgr.keys() for k, v in d.items(): if k in newgrks: r.setdefault(newgr[k][0], {})[newgr[k][1]] = v # A not-so-clearer version: ##r.setdefault(newgr[k][0], {}) ##r[newgr[k][0]][newgr[k][1]] = v else: r[k] = v return r def _groupListBy(l, index): """Regroup items in a list in a list of lists, grouped by the value at the given index.""" tmpd = {} for item in l: tmpd.setdefault(item[index], []).append(item) res = tmpd.values() return res def sub_dict(d, keys): """Return the subdictionary of 'd', with just the keys listed in 'keys'.""" return dict([(k, d[k]) for k in keys if k in d]) def get_movie_data(movieID, kindDict, fromAka=0, _table=None): """Return a dictionary containing data about the given movieID; if fromAka is true, the AkaTitle table is searched; _table is reserved for the imdbpy2sql.py script.""" if _table is not None: Table = _table else: if not fromAka: Table = Title else: Table = AkaTitle m = Table.get(movieID) mdict = {'title': m.title, 'kind': kindDict[m.kindID], 'year': m.productionYear, 'imdbIndex': m.imdbIndex, 'season': m.seasonNr, 'episode': m.episodeNr} if not fromAka: if m.seriesYears is not None: mdict['series years'] = unicode(m.seriesYears) if mdict['imdbIndex'] is None: del mdict['imdbIndex'] if mdict['year'] is None: del mdict['year'] else: try: mdict['year'] = int(mdict['year']) except (TypeError, ValueError): del mdict['year'] if mdict['season'] is None: del 
mdict['season'] else: try: mdict['season'] = int(mdict['season']) except: pass if mdict['episode'] is None: del mdict['episode'] else: try: mdict['episode'] = int(mdict['episode']) except: pass episodeOfID = m.episodeOfID if episodeOfID is not None: ser_dict = get_movie_data(episodeOfID, kindDict, fromAka) mdict['episode of'] = Movie(data=ser_dict, movieID=episodeOfID, accessSystem='sql') if fromAka: ser_note = AkaTitle.get(episodeOfID).note if ser_note: mdict['episode of'].notes = ser_note return mdict def _iterKeywords(results): """Iterate over (key.id, key.keyword) columns of a selection of the Keyword table.""" for key in results: yield key.id, key.keyword def getSingleInfo(table, movieID, infoType, notAList=False): """Return a dictionary in the form {infoType: infoListOrString}, retrieving a single set of information about a given movie, from the specified table.""" infoTypeID = InfoType.select(InfoType.q.info == infoType) if infoTypeID.count() == 0: return {} res = table.select(AND(table.q.movieID == movieID, table.q.infoTypeID == infoTypeID[0].id)) retList = [] for r in res: info = r.info note = r.note if note: info += u'::%s' % note retList.append(info) if not retList: return {} if not notAList: return {infoType: retList} else: return {infoType: retList[0]} def _cmpTop(a, b, what='top 250 rank'): """Compare function used to sort top 250/bottom 10 rank.""" av = int(a[1].get(what)) bv = int(b[1].get(what)) if av == bv: return 0 return (-1, 1)[av > bv] def _cmpBottom(a, b): """Compare function used to sort top 250/bottom 10 rank.""" return _cmpTop(a, b, what='bottom 10 rank') class IMDbSqlAccessSystem(IMDbBase): """The class used to access IMDb's data through a SQL database.""" accessSystem = 'sql' _sql_logger = logging.getLogger('imdbpy.parser.sql') def __init__(self, uri, adultSearch=1, useORM=None, *arguments, **keywords): """Initialize the access system.""" IMDbBase.__init__(self, *arguments, **keywords) if useORM is None: useORM = ('sqlobject', 
'sqlalchemy') if not isinstance(useORM, (tuple, list)): if ',' in useORM: useORM = useORM.split(',') else: useORM = [useORM] self.useORM = useORM nrMods = len(useORM) _gotError = False DB_TABLES = [] for idx, mod in enumerate(useORM): mod = mod.strip().lower() try: if mod == 'sqlalchemy': from alchemyadapter import getDBTables, NotFoundError, \ setConnection, AND, OR, IN, \ ISNULL, CONTAINSSTRING, toUTF8 elif mod == 'sqlobject': from objectadapter import getDBTables, NotFoundError, \ setConnection, AND, OR, IN, \ ISNULL, CONTAINSSTRING, toUTF8 else: self._sql_logger.warn('unknown module "%s"' % mod) continue self._sql_logger.info('using %s ORM', mod) # XXX: look ma'... black magic! It's used to make # TableClasses and some functions accessible # through the whole module. for k, v in [('NotFoundError', NotFoundError), ('AND', AND), ('OR', OR), ('IN', IN), ('ISNULL', ISNULL), ('CONTAINSSTRING', CONTAINSSTRING)]: globals()[k] = v self.toUTF8 = toUTF8 DB_TABLES = getDBTables(uri) for t in DB_TABLES: globals()[t._imdbpyName] = t if _gotError: self._sql_logger.warn('falling back to "%s"' % mod) break except ImportError, e: if idx+1 >= nrMods: raise IMDbError('unable to use any ORM in %s: %s' % ( str(useORM), str(e))) else: self._sql_logger.warn('unable to use "%s": %s' % (mod, str(e))) _gotError = True continue else: raise IMDbError('unable to use any ORM in %s' % str(useORM)) # Set the connection to the database. self._sql_logger.debug('connecting to %s', uri) try: self._connection = setConnection(uri, DB_TABLES) except AssertionError, e: raise IMDbDataAccessError( \ 'unable to connect to the database server; ' + \ 'complete message: "%s"' % str(e)) self.Error = self._connection.module.Error # Maps some IDs to the corresponding strings. 
self._kind = {} self._kindRev = {} self._sql_logger.debug('reading constants from the database') try: for kt in KindType.select(): self._kind[kt.id] = kt.kind self._kindRev[str(kt.kind)] = kt.id except self.Error: # NOTE: you can also get the error, but - at least with # MySQL - it also contains the password, and I don't # like the idea to print it out. raise IMDbDataAccessError( \ 'unable to connect to the database server') self._role = {} for rl in RoleType.select(): self._role[rl.id] = str(rl.role) self._info = {} self._infoRev = {} for inf in InfoType.select(): self._info[inf.id] = str(inf.info) self._infoRev[str(inf.info)] = inf.id self._compType = {} for cType in CompanyType.select(): self._compType[cType.id] = cType.kind info = [(it.id, it.info) for it in InfoType.select()] self._compcast = {} for cc in CompCastType.select(): self._compcast[cc.id] = str(cc.kind) self._link = {} for lt in LinkType.select(): self._link[lt.id] = str(lt.link) self._moviesubs = {} # Build self._moviesubs, a dictionary used to rearrange # the data structure for a movie object. 
for vid, vinfo in info: if not vinfo.startswith('LD '): continue self._moviesubs[vinfo] = ('laserdisc', vinfo[3:]) self._moviesubs.update(_litd) self._moviesubs.update(_busd) self.do_adult_search(adultSearch) def _findRefs(self, o, trefs, nrefs): """Find titles or names references in strings.""" if isinstance(o, (unicode, str)): for title in re_titleRef.findall(o): a_title = analyze_title(title, canonical=0) rtitle = build_title(a_title, ptdf=1) if trefs.has_key(rtitle): continue movieID = self._getTitleID(rtitle) if movieID is None: movieID = self._getTitleID(title) if movieID is None: continue m = Movie(title=rtitle, movieID=movieID, accessSystem=self.accessSystem) trefs[rtitle] = m rtitle2 = canonicalTitle(a_title.get('title', u'')) if rtitle2 and rtitle2 != rtitle and rtitle2 != title: trefs[rtitle2] = m if title != rtitle: trefs[title] = m for name in re_nameRef.findall(o): a_name = analyze_name(name, canonical=1) rname = build_name(a_name, canonical=1) if nrefs.has_key(rname): continue personID = self._getNameID(rname) if personID is None: personID = self._getNameID(name) if personID is None: continue p = Person(name=rname, personID=personID, accessSystem=self.accessSystem) nrefs[rname] = p rname2 = normalizeName(a_name.get('name', u'')) if rname2 and rname2 != rname: nrefs[rname2] = p if name != rname and name != rname2: nrefs[name] = p elif isinstance(o, (list, tuple)): for item in o: self._findRefs(item, trefs, nrefs) elif isinstance(o, dict): for value in o.values(): self._findRefs(value, trefs, nrefs) return (trefs, nrefs) def _extractRefs(self, o): """Scan for titles or names references in strings.""" trefs = {} nrefs = {} try: return self._findRefs(o, trefs, nrefs) except RuntimeError, e: # Symbian/python 2.2 has a poor regexp implementation. 
import warnings warnings.warn('RuntimeError in ' "imdb.parser.sql.IMDbSqlAccessSystem; " "if it's not a recursion limit exceeded and we're not " "running in a Symbian environment, it's a bug:\n%s" % e) return (trefs, nrefs) def _changeAKAencoding(self, akanotes, akatitle): """Return akatitle in the correct charset, as specified in the akanotes field; if akatitle doesn't need to be modified, return None.""" oti = akanotes.find('(original ') if oti == -1: return None ote = akanotes[oti+10:].find(' title)') if ote != -1: cs_info = akanotes[oti+10:oti+10+ote].lower().split() for e in cs_info: # excludes some strings that clearly are not encoding. if e in ('script', '', 'cyrillic', 'greek'): continue if e.startswith('iso-') and e.find('latin') != -1: e = e[4:].replace('-', '') try: lookup(e) lat1 = akatitle.encode('latin_1', 'replace') return unicode(lat1, e, 'replace') except (LookupError, ValueError, TypeError): continue return None def _buildNULLCondition(self, col, val): """Build a comparison for columns where values can be NULL.""" if val is None: return ISNULL(col) else: if isinstance(val, (int, long)): return col == val else: return col == self.toUTF8(val) def _getTitleID(self, title): """Given a long imdb canonical title, returns a movieID or None if not found.""" td = analyze_title(title) condition = None if td['kind'] == 'episode': epof = td['episode of'] seriesID = [s.id for s in Title.select( AND(Title.q.title == self.toUTF8(epof['title']), self._buildNULLCondition(Title.q.imdbIndex, epof.get('imdbIndex')), Title.q.kindID == self._kindRev[epof['kind']], self._buildNULLCondition(Title.q.productionYear, epof.get('year'))))] if seriesID: condition = AND(IN(Title.q.episodeOfID, seriesID), Title.q.title == self.toUTF8(td['title']), self._buildNULLCondition(Title.q.imdbIndex, td.get('imdbIndex')), Title.q.kindID == self._kindRev[td['kind']], self._buildNULLCondition(Title.q.productionYear, td.get('year'))) if condition is None: condition = AND(Title.q.title == 
self.toUTF8(td['title']), self._buildNULLCondition(Title.q.imdbIndex, td.get('imdbIndex')), Title.q.kindID == self._kindRev[td['kind']], self._buildNULLCondition(Title.q.productionYear, td.get('year'))) res = Title.select(condition) try: if res.count() != 1: return None except (UnicodeDecodeError, TypeError): return None return res[0].id def _getNameID(self, name): """Given a long imdb canonical name, returns a personID or None if not found.""" nd = analyze_name(name) res = Name.select(AND(Name.q.name == self.toUTF8(nd['name']), self._buildNULLCondition(Name.q.imdbIndex, nd.get('imdbIndex')))) try: c = res.count() if res.count() != 1: return None except (UnicodeDecodeError, TypeError): return None return res[0].id def _normalize_movieID(self, movieID): """Normalize the given movieID.""" try: return int(movieID) except (ValueError, OverflowError): raise IMDbError('movieID "%s" can\'t be converted to integer' % \ movieID) def _normalize_personID(self, personID): """Normalize the given personID.""" try: return int(personID) except (ValueError, OverflowError): raise IMDbError('personID "%s" can\'t be converted to integer' % \ personID) def _normalize_characterID(self, characterID): """Normalize the given characterID.""" try: return int(characterID) except (ValueError, OverflowError): raise IMDbError('characterID "%s" can\'t be converted to integer' \ % characterID) def _normalize_companyID(self, companyID): """Normalize the given companyID.""" try: return int(companyID) except (ValueError, OverflowError): raise IMDbError('companyID "%s" can\'t be converted to integer' \ % companyID) def get_imdbMovieID(self, movieID): """Translate a movieID in an imdbID. If not in the database, try an Exact Primary Title search on IMDb; return None if it's unable to get the imdbID. 
""" try: movie = Title.get(movieID) except NotFoundError: return None imdbID = movie.imdbID if imdbID is not None: return '%07d' % imdbID m_dict = get_movie_data(movie.id, self._kind) titline = build_title(m_dict, ptdf=1) imdbID = self.title2imdbID(titline) # If the imdbID was retrieved from the web and was not in the # database, update the database (ignoring errors, because it's # possibile that the current user has not update privileges). # There're times when I think I'm a genius; this one of # those times... <g> if imdbID is not None: try: movie.imdbID = int(imdbID) except: pass return imdbID def get_imdbPersonID(self, personID): """Translate a personID in an imdbID. If not in the database, try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID. """ try: person = Name.get(personID) except NotFoundError: return None imdbID = person.imdbID if imdbID is not None: return '%07d' % imdbID n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex} namline = build_name(n_dict, canonical=1) imdbID = self.name2imdbID(namline) if imdbID is not None: try: person.imdbID = int(imdbID) except: pass return imdbID def get_imdbCharacterID(self, characterID): """Translate a characterID in an imdbID. If not in the database, try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID. """ try: character = CharName.get(characterID) except NotFoundError: return None imdbID = character.imdbID if imdbID is not None: return '%07d' % imdbID n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex} namline = build_name(n_dict, canonical=1) imdbID = self.character2imdbID(namline) if imdbID is not None: try: character.imdbID = int(imdbID) except: pass return imdbID def get_imdbCompanyID(self, companyID): """Translate a companyID in an imdbID. If not in the database, try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID. 
""" try: company = CompanyName.get(companyID) except NotFoundError: return None imdbID = company.imdbID if imdbID is not None: return '%07d' % imdbID n_dict = {'name': company.name, 'country': company.countryCode} namline = build_company_name(n_dict) imdbID = self.company2imdbID(namline) if imdbID is not None: try: company.imdbID = int(imdbID) except: pass return imdbID def do_adult_search(self, doAdult): """If set to 0 or False, movies in the Adult category are not episodeOf = title_dict.get('episode of') shown in the results of a search.""" self.doAdult = doAdult def _search_movie(self, title, results, _episodes=False): title = title.strip() if not title: return [] title_dict = analyze_title(title, canonical=1) s_title = title_dict['title'] if not s_title: return [] episodeOf = title_dict.get('episode of') if episodeOf: _episodes = False s_title_split = s_title.split(', ') if len(s_title_split) > 1 and \ s_title_split[-1].lower() in _unicodeArticles: s_title_rebuilt = ', '.join(s_title_split[:-1]) if s_title_rebuilt: s_title = s_title_rebuilt #if not episodeOf: # if not _episodes: # s_title_split = s_title.split(', ') # if len(s_title_split) > 1 and \ # s_title_split[-1].lower() in _articles: # s_title_rebuilt = ', '.join(s_title_split[:-1]) # if s_title_rebuilt: # s_title = s_title_rebuilt #else: # _episodes = False if isinstance(s_title, unicode): s_title = s_title.encode('ascii', 'ignore') soundexCode = soundex(s_title) # XXX: improve the search restricting the kindID if the # "kind" of the input differs from "movie"? condition = conditionAka = None if _episodes: condition = AND(Title.q.phoneticCode == soundexCode, Title.q.kindID == self._kindRev['episode']) conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode, AkaTitle.q.kindID == self._kindRev['episode']) elif title_dict['kind'] == 'episode' and episodeOf is not None: # set canonical=0 ? Should not make much difference. 
series_title = build_title(episodeOf, canonical=1) # XXX: is it safe to get "results" results? # Too many? Too few? serRes = results if serRes < 3 or serRes > 10: serRes = 10 searchSeries = self._search_movie(series_title, serRes) seriesIDs = [result[0] for result in searchSeries] if seriesIDs: condition = AND(Title.q.phoneticCode == soundexCode, IN(Title.q.episodeOfID, seriesIDs), Title.q.kindID == self._kindRev['episode']) conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode, IN(AkaTitle.q.episodeOfID, seriesIDs), AkaTitle.q.kindID == self._kindRev['episode']) else: # XXX: bad situation: we have found no matching series; # try searching everything (both episodes and # non-episodes) for the title. condition = AND(Title.q.phoneticCode == soundexCode, IN(Title.q.episodeOfID, seriesIDs)) conditionAka = AND(AkaTitle.q.phoneticCode == soundexCode, IN(AkaTitle.q.episodeOfID, seriesIDs)) if condition is None: # XXX: excludes episodes? condition = AND(Title.q.kindID != self._kindRev['episode'], Title.q.phoneticCode == soundexCode) conditionAka = AND(AkaTitle.q.kindID != self._kindRev['episode'], AkaTitle.q.phoneticCode == soundexCode) # Up to 3 variations of the title are searched, plus the # long imdb canonical title, if provided. 
        # --- tail of _search_movie() (its "def" line is above this chunk) ---
        # Build the title variations used for fuzzy matching; episode
        # searches match against the plain title only.
        if not _episodes:
            title1, title2, title3 = titleVariations(title)
        else:
            title1 = title
            title2 = ''
            title3 = ''
        try:
            # Candidate rows from the canonical titles and from the AKAs;
            # q2 is kept separate so AKA hits can be post-processed below.
            qr = [(q.id, get_movie_data(q.id, self._kind))
                    for q in Title.select(condition)]
            q2 = [(q.movieID, get_movie_data(q.id, self._kind, fromAka=1))
                    for q in AkaTitle.select(conditionAka)]
            qr += q2
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to search the database: "%s"' % str(e))
        # Scan three times the requested number of results, so that the
        # adult-filtering and deduplication below still leave enough.
        resultsST = results * 3
        res = scan_titles(qr, title1, title2, title3, resultsST,
                            searchingEpisode=episodeOf is not None,
                            onlyEpisodes=_episodes,
                            ro_thresold=0.0)
        res[:] = [x[1] for x in res]
        if res and not self.doAdult:
            # Strip movies tagged with the 'Adult' genre.
            mids = [x[0] for x in res]
            genreID = self._infoRev['genres']
            adultlist = [al.movieID for al in MovieInfo.select(
                        AND(MovieInfo.q.infoTypeID == genreID,
                            MovieInfo.q.info == 'Adult',
                            IN(MovieInfo.q.movieID, mids)))]
            res[:] = [x for x in res if x[0] not in adultlist]
        new_res = []
        # XXX: can there be duplicates?
        # For hits coming from an AKA title, return the original title
        # and record the AKA under the 'akas' key.
        for r in res:
            if r not in q2:
                new_res.append(r)
                continue
            mdict = r[1]
            aka_title = build_title(mdict, ptdf=1)
            orig_dict = get_movie_data(r[0], self._kind)
            orig_title = build_title(orig_dict, ptdf=1)
            if aka_title == orig_title:
                new_res.append(r)
                continue
            orig_dict['akas'] = [aka_title]
            new_res.append((r[0], orig_dict))
        if results > 0: new_res[:] = new_res[:results]
        return new_res

    def _search_episode(self, title, results):
        """Search for episodes' titles; delegates to _search_movie
        with the _episodes flag set."""
        return self._search_movie(title, results, _episodes=True)

    def get_movie_main(self, movieID):
        """Return every piece of information about the given movieID,
        as a dictionary with 'data', 'titlesRefs', 'namesRefs' and
        'info sets' keys; raises IMDbDataAccessError on failure."""
        # Every movie information is retrieved from here.
        infosets = self.get_movie_infoset()
        try:
            res = get_movie_data(movieID, self._kind)
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to get movieID "%s": "%s"' % (movieID, str(e)))
        if not res:
            raise IMDbDataAccessError('unable to get movieID "%s"' % movieID)
        # Collect cast information.
        # Each entry: [personID, personRoleID, note, nrOrder, role-name].
        castdata = [[cd.personID, cd.personRoleID, cd.note, cd.nrOrder,
                    self._role[cd.roleID]]
                    for cd in CastInfo.select(CastInfo.q.movieID == movieID)]
        for p in castdata:
            person = Name.get(p[0])
            p += [person.name, person.imdbIndex]
            # Both actors and actresses are listed under 'cast'.
            if p[4] in ('actor', 'actress'):
                p[4] = 'cast'
        # Regroup by role/duty (cast, writer, director, ...)
        castdata[:] = _groupListBy(castdata, 4)
        for group in castdata:
            duty = group[0][4]
            for pdata in group:
                curRole = pdata[1]
                curRoleID = None
                if curRole is not None:
                    # personRoleID points to a CharName row.
                    robj = CharName.get(curRole)
                    curRole = robj.name
                    curRoleID = robj.id
                p = Person(personID=pdata[0], name=pdata[5],
                            currentRole=curRole or u'',
                            roleID=curRoleID,
                            notes=pdata[2] or u'',
                            accessSystem='sql')
                if pdata[6]: p['imdbIndex'] = pdata[6]
                p.billingPos = pdata[3]
                res.setdefault(duty, []).append(p)
            if duty == 'cast':
                res[duty] = merge_roles(res[duty])
            res[duty].sort()
        # Info about the movie.
        minfo = [(self._info[m.infoTypeID], m.info, m.note)
                for m in MovieInfo.select(MovieInfo.q.movieID == movieID)]
        minfo += [(self._info[m.infoTypeID], m.info, m.note)
                for m in MovieInfoIdx.select(MovieInfoIdx.q.movieID == movieID)]
        minfo += [('keywords', Keyword.get(m.keywordID).keyword, None)
                for m in MovieKeyword.select(MovieKeyword.q.movieID == movieID)]
        minfo = _groupListBy(minfo, 0)
        for group in minfo:
            sect = group[0][0]
            for mdata in group:
                data = mdata[1]
                # Notes are joined with the '::' separator, as in the
                # plain text data files.
                if mdata[2]: data += '::%s' % mdata[2]
                res.setdefault(sect, []).append(data)
        # Companies info about a movie.
        cinfo = [(self._compType[m.companyTypeID], m.companyID, m.note)
                for m in MovieCompanies.select(MovieCompanies.q.movieID == movieID)]
        cinfo = _groupListBy(cinfo, 0)
        for group in cinfo:
            sect = group[0][0]
            for mdata in group:
                cDb = CompanyName.get(mdata[1])
                cDbTxt = cDb.name
                if cDb.countryCode:
                    cDbTxt += ' %s' % cDb.countryCode
                company = Company(name=cDbTxt,
                                companyID=mdata[1],
                                notes=mdata[2] or u'',
                                accessSystem=self.accessSystem)
                res.setdefault(sect, []).append(company)
        # AKA titles.
        akat = [(get_movie_data(at.id, self._kind, fromAka=1), at.note)
                for at in AkaTitle.select(AkaTitle.q.movieID == movieID)]
        if akat:
            res['akas'] = []
            for td, note in akat:
                nt = build_title(td, ptdf=1)
                if note:
                    net = self._changeAKAencoding(note, nt)
                    if net is not None: nt = net
                    nt += '::%s' % note
                if nt not in res['akas']: res['akas'].append(nt)
        # Complete cast/crew.
        compcast = [(self._compcast[cc.subjectID], self._compcast[cc.statusID])
                for cc in CompleteCast.select(CompleteCast.q.movieID == movieID)]
        if compcast:
            for entry in compcast:
                val = unicode(entry[1])
                res[u'complete %s' % entry[0]] = val
        # Movie connections.
        mlinks = [[ml.linkedMovieID, self._link[ml.linkTypeID]]
                for ml in MovieLink.select(MovieLink.q.movieID == movieID)]
        if mlinks:
            for ml in mlinks:
                lmovieData = get_movie_data(ml[0], self._kind)
                m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql')
                ml[0] = m
            res['connections'] = {}
            mlinks[:] = _groupListBy(mlinks, 1)
            for group in mlinks:
                lt = group[0][1]
                res['connections'][lt] = [i[0] for i in group]
        # Episodes.
        episodes = {}
        eps_list = list(Title.select(Title.q.episodeOfID == movieID))
        eps_list.sort()
        if eps_list:
            ps_data = {'title': res['title'], 'kind': res['kind'],
                        'year': res.get('year'),
                        'imdbIndex': res.get('imdbIndex')}
            parentSeries = Movie(movieID=movieID, data=ps_data,
                                accessSystem='sql')
            for episode in eps_list:
                episodeID = episode.id
                episode_data = get_movie_data(episodeID, self._kind)
                m = Movie(movieID=episodeID, data=episode_data,
                            accessSystem='sql')
                m['episode of'] = parentSeries
                season = episode_data.get('season', 'UNKNOWN')
                if season not in episodes: episodes[season] = {}
                ep_number = episode_data.get('episode')
                # Episodes with no number are appended after the last
                # known episode of the season.
                if ep_number is None:
                    ep_number = max((episodes[season].keys() or [0])) + 1
                episodes[season][ep_number] = m
            res['episodes'] = episodes
            res['number of episodes'] = sum([len(x) for x in episodes.values()])
            res['number of seasons'] = len(episodes.keys())
        # Regroup laserdisc information.
        res = _reGroupDict(res, self._moviesubs)
        # Do some transformation to preserve consistency with other
        # data access systems.
        if 'quotes' in res:
            for idx, quote in enumerate(res['quotes']):
                res['quotes'][idx] = quote.split('::')
        if 'runtimes' in res and len(res['runtimes']) > 0:
            rt = res['runtimes'][0]
            episodes = re_episodes.findall(rt)
            if episodes:
                res['runtimes'][0] = re_episodes.sub('', rt)
                if res['runtimes'][0][-2:] == '::':
                    res['runtimes'][0] = res['runtimes'][0][:-2]
        if 'votes' in res:
            res['votes'] = int(res['votes'][0])
        if 'rating' in res:
            res['rating'] = float(res['rating'][0])
        if 'votes distribution' in res:
            res['votes distribution'] = res['votes distribution'][0]
        if 'mpaa' in res:
            res['mpaa'] = res['mpaa'][0]
        if 'top 250 rank' in res:
            # NOTE(review): unlike 'votes'/'rating' above, no [0] indexing
            # here — presumably int() of a one-element list fails and the
            # bare except swallows it, leaving the list value; confirm.
            try: res['top 250 rank'] = int(res['top 250 rank'])
            except: pass
        if 'bottom 10 rank' in res:
            # The key is renamed to 'bottom 100 rank' for consistency
            # with the other data access systems.
            try: res['bottom 100 rank'] = int(res['bottom 10 rank'])
            except: pass
            del res['bottom 10 rank']
        for old, new in [('guest', 'guests'), ('trademarks', 'trade-mark'),
                        ('articles', 'article'), ('pictorials', 'pictorial'),
                        ('magazine-covers', 'magazine-cover-photo')]:
            if old in res:
                res[new] = res[old]
                del res[old]
        trefs,nrefs = {}, {}
        trefs,nrefs = self._extractRefs(sub_dict(res,Movie.keys_tomodify_list))
        return {'data': res, 'titlesRefs': trefs,
                'namesRefs': nrefs,
                'info sets': infosets}

    # Just to know what kind of information are available.
    # All movie info-sets are served by get_movie_main (a single query
    # retrieves everything), hence these aliases.
    get_movie_alternate_versions = get_movie_main
    get_movie_business = get_movie_main
    get_movie_connections = get_movie_main
    get_movie_crazy_credits = get_movie_main
    get_movie_goofs = get_movie_main
    get_movie_keywords = get_movie_main
    get_movie_literature = get_movie_main
    get_movie_locations = get_movie_main
    get_movie_plot = get_movie_main
    get_movie_quotes = get_movie_main
    get_movie_release_dates = get_movie_main
    get_movie_soundtrack = get_movie_main
    get_movie_taglines = get_movie_main
    get_movie_technical = get_movie_main
    get_movie_trivia = get_movie_main
    get_movie_vote_details = get_movie_main
    get_movie_episodes = get_movie_main

    def _search_person(self, name, results):
        """Search for people by name; returns a list of
        (personID, {'name': ..., 'imdbIndex': ...}) tuples."""
        name = name.strip()
        if not name: return []
        s_name = analyze_name(name)['name']
        if not s_name: return []
        if isinstance(s_name, unicode):
            s_name = s_name.encode('ascii', 'ignore')
        # Candidates are pre-filtered in SQL by soundex code.
        soundexCode = soundex(s_name)
        name1, name2, name3 = nameVariations(name)
        # If the soundex is None, compare only with the first
        # phoneticCode column.
        if soundexCode is not None:
            condition = IN(soundexCode, [Name.q.namePcodeCf,
                                        Name.q.namePcodeNf,
                                        Name.q.surnamePcode])
            conditionAka = IN(soundexCode, [AkaName.q.namePcodeCf,
                                        AkaName.q.namePcodeNf,
                                        AkaName.q.surnamePcode])
        else:
            condition = ISNULL(Name.q.namePcodeCf)
            conditionAka = ISNULL(AkaName.q.namePcodeCf)
        try:
            # q2 (AKA-name hits) is kept separate for the
            # deduplication pass below.
            qr = [(q.id, {'name': q.name, 'imdbIndex': q.imdbIndex})
                    for q in Name.select(condition)]
            q2 = [(q.personID, {'name': q.name, 'imdbIndex': q.imdbIndex})
                    for q in AkaName.select(conditionAka)]
            qr += q2
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to search the database: "%s"' % str(e))
        res = scan_names(qr, name1, name2, name3, results)
        res[:] = [x[1] for x in res]
        # Purge empty imdbIndex.
        returnl = []
        for x in res:
            tmpd = x[1]
            if tmpd['imdbIndex'] is None:
                del tmpd['imdbIndex']
            returnl.append((x[0], tmpd))
        new_res = []
        # XXX: can there be duplicates?
        # Hits coming from an AKA name are replaced with the original
        # name, recording the AKA under the 'akas' key.
        for r in returnl:
            if r not in q2:
                new_res.append(r)
                continue
            pdict = r[1]
            aka_name = build_name(pdict, canonical=1)
            p = Name.get(r[0])
            orig_dict = {'name': p.name, 'imdbIndex': p.imdbIndex}
            if orig_dict['imdbIndex'] is None:
                del orig_dict['imdbIndex']
            orig_name = build_name(orig_dict, canonical=1)
            if aka_name == orig_name:
                new_res.append(r)
                continue
            orig_dict['akas'] = [aka_name]
            new_res.append((r[0], orig_dict))
        if results > 0: new_res[:] = new_res[:results]
        return new_res

    def get_person_main(self, personID):
        """Return every piece of information about the given personID,
        as a dictionary with 'data', 'titlesRefs', 'namesRefs' and
        'info sets' keys; raises IMDbDataAccessError on failure."""
        # Every person information is retrieved from here.
        infosets = self.get_person_infoset()
        try:
            p = Name.get(personID)
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to get personID "%s": "%s"' % (personID, str(e)))
        res = {'name': p.name, 'imdbIndex': p.imdbIndex}
        if res['imdbIndex'] is None: del res['imdbIndex']
        if not res:
            raise IMDbDataAccessError('unable to get personID "%s"' % personID)
        # Collect cast information.
        # Each entry: (movieID, personRoleID, note, role-name, movie-data).
        castdata = [(cd.movieID, cd.personRoleID, cd.note,
                    self._role[cd.roleID],
                    get_movie_data(cd.movieID, self._kind))
                for cd in CastInfo.select(CastInfo.q.personID == personID)]
        # Regroup by role/duty (cast, writer, director, ...)
        castdata[:] = _groupListBy(castdata, 3)
        episodes = {}
        seenDuties = []
        for group in castdata:
            for mdata in group:
                duty = orig_duty = group[0][3]
                if duty not in seenDuties: seenDuties.append(orig_duty)
                note = mdata[2] or u''
                if 'episode of' in mdata[4]:
                    # TV episodes are collected apart, grouped by series;
                    # the original duty is preserved inside the note.
                    duty = 'episodes'
                    if orig_duty not in ('actor', 'actress'):
                        if note: note = ' %s' % note
                        note = '[%s]%s' % (orig_duty, note)
                curRole = mdata[1]
                curRoleID = None
                if curRole is not None:
                    robj = CharName.get(curRole)
                    curRole = robj.name
                    curRoleID = robj.id
                m = Movie(movieID=mdata[0], data=mdata[4],
                            currentRole=curRole or u'',
                            roleID=curRoleID,
                            notes=note, accessSystem='sql')
                if duty != 'episodes':
                    res.setdefault(duty, []).append(m)
                else:
                    episodes.setdefault(m['episode of'], []).append(m)
        if episodes:
            # Most recent episodes first.
            for k in episodes:
                episodes[k].sort()
                episodes[k].reverse()
            res['episodes'] = episodes
        for duty in seenDuties:
            if duty in res:
                if duty in ('actor', 'actress', 'himself', 'herself',
                            'themselves'):
                    res[duty] = merge_roles(res[duty])
                res[duty].sort()
        # Info about the person.
        pinfo = [(self._info[pi.infoTypeID], pi.info, pi.note)
                for pi in PersonInfo.select(PersonInfo.q.personID == personID)]
        # Regroup by duty.
        pinfo = _groupListBy(pinfo, 0)
        for group in pinfo:
            sect = group[0][0]
            for pdata in group:
                data = pdata[1]
                if pdata[2]: data += '::%s' % pdata[2]
                res.setdefault(sect, []).append(data)
        # AKA names.
        akan = [(an.name, an.imdbIndex)
                for an in AkaName.select(AkaName.q.personID == personID)]
        if akan:
            res['akas'] = []
            for n in akan:
                nd = {'name': n[0]}
                if n[1]: nd['imdbIndex'] = n[1]
                nt = build_name(nd, canonical=1)
                res['akas'].append(nt)
        # Do some transformation to preserve consistency with other
        # data access systems.
        for key in ('birth date', 'birth notes', 'death date', 'death notes',
                    'birth name', 'height'):
            if key in res:
                res[key] = res[key][0]
        if 'guest' in res:
            res['notable tv guest appearances'] = res['guest']
            del res['guest']
        # Nick names and the birth name already shown elsewhere are
        # removed from the AKA list.
        miscnames = res.get('nick names', [])
        if 'birth name' in res: miscnames.append(res['birth name'])
        if 'akas' in res:
            for mname in miscnames:
                if mname in res['akas']:
                    res['akas'].remove(mname)
            if not res['akas']: del res['akas']
        trefs,nrefs = self._extractRefs(sub_dict(res,Person.keys_tomodify_list))
        return {'data': res, 'titlesRefs': trefs, 'namesRefs': nrefs,
                'info sets': infosets}

    # Just to know what kind of information are available.
    # All person info-sets are served by get_person_main.
    get_person_filmography = get_person_main
    get_person_biography = get_person_main
    get_person_other_works = get_person_main
    get_person_episodes = get_person_main

    def _search_character(self, name, results):
        """Search for characters by name; returns a list of
        (characterID, {'name': ..., 'imdbIndex': ...}) tuples."""
        name = name.strip()
        if not name: return []
        s_name = analyze_name(name)['name']
        if not s_name: return []
        if isinstance(s_name, unicode):
            s_name = s_name.encode('ascii', 'ignore')
        s_name = normalizeName(s_name)
        soundexCode = soundex(s_name)
        surname = s_name.split(' ')[-1]
        surnameSoundex = soundex(surname)
        # name2 is the "surname first" variation of the name.
        name2 = ''
        soundexName2 = None
        nsplit = s_name.split()
        if len(nsplit) > 1:
            name2 = '%s %s' % (nsplit[-1], ' '.join(nsplit[:-1]))
            if s_name == name2:
                name2 = ''
            else:
                soundexName2 = soundex(name2)
        # If the soundex is None, compare only with the first
        # phoneticCode column.
        # Tail of _search_character(): build the SQL pre-filter from the
        # soundex codes computed above, then fuzzy-match the candidates.
        if soundexCode is not None:
            if soundexName2 is not None:
                condition = OR(surnameSoundex == CharName.q.surnamePcode,
                            IN(CharName.q.namePcodeNf, [soundexCode,
                                                        soundexName2]),
                            IN(CharName.q.surnamePcode, [soundexCode,
                                                        soundexName2]))
            else:
                condition = OR(surnameSoundex == CharName.q.surnamePcode,
                            IN(soundexCode, [CharName.q.namePcodeNf,
                                            CharName.q.surnamePcode]))
        else:
            # NOTE(review): this branch references Name while every other
            # branch uses CharName — possibly a copy-paste slip; confirm
            # against the upstream IMDbPY repository before changing.
            condition = ISNULL(Name.q.namePcodeNf)
        try:
            qr = [(q.id, {'name': q.name, 'imdbIndex': q.imdbIndex})
                    for q in CharName.select(condition)]
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to search the database: "%s"' % str(e))
        res = scan_names(qr, s_name, name2, '', results,
                        _scan_character=True)
        res[:] = [x[1] for x in res]
        # Purge empty imdbIndex.
        returnl = []
        for x in res:
            tmpd = x[1]
            if tmpd['imdbIndex'] is None:
                del tmpd['imdbIndex']
            returnl.append((x[0], tmpd))
        return returnl

    def get_character_main(self, characterID, results=1000):
        """Return information about the given characterID (name and
        filmography, capped at `results` cast entries), as a dictionary
        with 'data' and 'info sets' keys."""
        # Every character information is retrieved from here.
        infosets = self.get_character_infoset()
        try:
            c = CharName.get(characterID)
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to get characterID "%s": "%s"' % (characterID, e))
        res = {'name': c.name, 'imdbIndex': c.imdbIndex}
        if res['imdbIndex'] is None: del res['imdbIndex']
        if not res:
            raise IMDbDataAccessError('unable to get characterID "%s"' % \
                                        characterID)
        # Collect filmography information.
        items = CastInfo.select(CastInfo.q.personRoleID == characterID)
        if results > 0: items = items[:results]
        # Only acting credits make up a character's filmography.
        filmodata = [(cd.movieID, cd.personID, cd.note,
                    get_movie_data(cd.movieID, self._kind)) for cd in items
                    if self._role[cd.roleID] in ('actor', 'actress')]
        fdata = []
        for f in filmodata:
            curRole = None
            curRoleID = f[1]
            note = f[2] or u''
            if curRoleID is not None:
                # For characters, the "role" is the person who played them.
                robj = Name.get(curRoleID)
                curRole = robj.name
            m = Movie(movieID=f[0], data=f[3],
                        currentRole=curRole or u'',
                        roleID=curRoleID, roleIsPerson=True,
                        notes=note, accessSystem='sql')
            fdata.append(m)
        fdata = merge_roles(fdata)
        fdata.sort()
        if fdata:
            res['filmography'] = fdata
        return {'data': res, 'info sets': infosets}

    # All character info-sets are served by get_character_main.
    get_character_filmography = get_character_main
    get_character_biography = get_character_main

    def _search_company(self, name, results):
        """Search for companies by name; returns a list of
        (companyID, {'name': ..., 'country': ...}) tuples."""
        name = name.strip()
        if not name: return []
        if isinstance(name, unicode):
            name = name.encode('ascii', 'ignore')
        soundexCode = soundex(name)
        # If the soundex is None, compare only with the first
        # phoneticCode column.
        if soundexCode is None:
            condition = ISNULL(CompanyName.q.namePcodeNf)
        else:
            # A name ending with ']' includes the [country] part, so it
            # is matched against the code computed with the country.
            if name.endswith(']'):
                condition = CompanyName.q.namePcodeSf == soundexCode
            else:
                condition = CompanyName.q.namePcodeNf == soundexCode
        try:
            qr = [(q.id, {'name': q.name, 'country': q.countryCode})
                    for q in CompanyName.select(condition)]
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to search the database: "%s"' % str(e))
        qr[:] = [(x[0], build_company_name(x[1])) for x in qr]
        res = scan_company_names(qr, name, results)
        res[:] = [x[1] for x in res]
        # Purge empty country keys.
        returnl = []
        for x in res:
            tmpd = x[1]
            country = tmpd.get('country')
            if country is None and 'country' in tmpd:
                del tmpd['country']
            returnl.append((x[0], tmpd))
        return returnl

    def get_company_main(self, companyID, results=0):
        """Return information about the given companyID (name, country
        and filmography grouped by company type); `results` caps the
        number of filmography entries (0 means no limit)."""
        # Every company information is retrieved from here.
        infosets = self.get_company_infoset()
        try:
            c = CompanyName.get(companyID)
        except NotFoundError, e:
            raise IMDbDataAccessError( \
                    'unable to get companyID "%s": "%s"' % (companyID, e))
        res = {'name': c.name, 'country': c.countryCode}
        if res['country'] is None: del res['country']
        if not res:
            raise IMDbDataAccessError('unable to get companyID "%s"' % \
                                        companyID)
        # Collect filmography information.
        items = MovieCompanies.select(MovieCompanies.q.companyID == companyID)
        if results > 0: items = items[:results]
        filmodata = [(cd.movieID, cd.companyID,
                    self._compType[cd.companyTypeID], cd.note,
                    get_movie_data(cd.movieID, self._kind)) for cd in items]
        filmodata = _groupListBy(filmodata, 2)
        for group in filmodata:
            ctype = group[0][2]
            # NOTE(review): the loop variables shadow the companyID
            # parameter and the outer ctype; harmless here since both are
            # re-bound to the same values, but worth cleaning up.
            for movieID, companyID, ctype, note, movieData in group:
                movie = Movie(data=movieData, movieID=movieID,
                            notes=note or u'', accessSystem=self.accessSystem)
                res.setdefault(ctype, []).append(movie)
            res.get(ctype, []).sort()
        return {'data': res, 'info sets': infosets}

    def _search_keyword(self, keyword, results):
        """Search for keywords similar to the given one; returns a list
        of keyword strings."""
        constr = OR(Keyword.q.phoneticCode ==
                    soundex(keyword.encode('ascii', 'ignore')),
                    CONTAINSSTRING(Keyword.q.keyword, self.toUTF8(keyword)))
        return filterSimilarKeywords(keyword,
                    _iterKeywords(Keyword.select(constr)))[:results]

    def _get_keyword(self, keyword, results):
        """Return up to `results` movies tagged with the given keyword,
        as (movieID, movie-data) tuples."""
        keyID = Keyword.select(Keyword.q.keyword == keyword)
        if keyID.count() == 0: return []
        keyID = keyID[0].id
        movies = MovieKeyword.select(MovieKeyword.q.keywordID ==
                                    keyID)[:results]
        return [(m.movieID, get_movie_data(m.movieID, self._kind))
                for m in movies]

    def _get_top_bottom_movies(self, kind):
        """Return the top 250 or bottom 10 movies, as a sorted list of
        (movieID, movie-data) tuples; `kind` is 'top' or 'bottom'."""
        if kind == 'top':
            kind = 'top 250 rank'
        elif kind == 'bottom':
            # Not a refuse: the plain text data files contains only
            # the bottom 10 movies.
kind = 'bottom 10 rank' else: return [] infoID = InfoType.select(InfoType.q.info == kind) if infoID.count() == 0: return [] infoID = infoID[0].id movies = MovieInfoIdx.select(MovieInfoIdx.q.infoTypeID == infoID) ml = [] for m in movies: minfo = get_movie_data(m.movieID, self._kind) for k in kind, 'votes', 'rating', 'votes distribution': valueDict = getSingleInfo(MovieInfoIdx, m.movieID, k, notAList=True) if k in (kind, 'votes') and k in valueDict: valueDict[k] = int(valueDict[k]) elif k == 'rating' and k in valueDict: valueDict[k] = float(valueDict[k]) minfo.update(valueDict) ml.append((m.movieID, minfo)) sorter = (_cmpBottom, _cmpTop)[kind == 'top 250 rank'] ml.sort(sorter) return ml def __del__(self): """Ensure that the connection is closed.""" if not hasattr(self, '_connection'): return self._sql_logger.debug('closing connection to the database') self._connection.close() ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/alchemyadapter.py��������������������������������������������������������0000644�0000000�0000000�00000045537�11766731642�017506� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.sql.alchemyadapter module (imdb.parser.sql package). This module adapts the SQLAlchemy ORM to the internal mechanism. Copyright 2008-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import re import sys import logging from sqlalchemy import * from sqlalchemy import schema try: from sqlalchemy import exc # 0.5 except ImportError: from sqlalchemy import exceptions as exc # 0.4 _alchemy_logger = logging.getLogger('imdbpy.parser.sql.alchemy') try: import migrate.changeset HAS_MC = True except ImportError: HAS_MC = False _alchemy_logger.warn('Unable to import migrate.changeset: Foreign ' \ 'Keys will not be created.') from imdb._exceptions import IMDbDataAccessError from dbschema import * # Used to convert table and column names. re_upper = re.compile(r'([A-Z])') # XXX: I'm not sure at all that this is the best method to connect # to the database and bind that connection to every table. metadata = MetaData() # Maps our placeholders to SQLAlchemy's column types. 
# Maps the schema's column-kind placeholders to SQLAlchemy column types.
MAP_COLS = {INTCOL: Integer,
            UNICODECOL: UnicodeText,
            STRINGCOL: String}


class NotFoundError(IMDbDataAccessError):
    """Exception raised when Table.get(id) returns no value."""
    pass


def _renameTable(tname):
    """Build the name of a table, as done by SQLObject.

    CamelCase is converted to lowercase_with_underscores,
    e.g. 'AkaName' -> 'aka_name'."""
    tname = re_upper.sub(r'_\1', tname)
    if tname.startswith('_'): tname = tname[1:]
    return tname.lower()

def _renameColumn(cname):
    """Build the name of a column, as done by SQLObject.

    'ID' is normalized to 'Id' first, so e.g. 'movieID' -> 'movie_id'."""
    cname = cname.replace('ID', 'Id')
    return _renameTable(cname)


class DNNameObj(object):
    """Used to access table.sqlmeta.columns[column].dbName (a string)."""
    def __init__(self, dbName):
        self.dbName = dbName

    def __repr__(self):
        return '<DNNameObj(dbName=%s) [id=%s]>' % (self.dbName, id(self))


class DNNameDict(object):
    """Used to access table.sqlmeta.columns (a dictionary)."""
    def __init__(self, colMap):
        self.colMap = colMap

    def __getitem__(self, key):
        # Wrap the mapped db column name so .dbName works as in SQLObject.
        return DNNameObj(self.colMap[key])

    def __repr__(self):
        return '<DNNameDict(colMap=%s) [id=%s]>' % (self.colMap, id(self))


class SQLMetaAdapter(object):
    """Used to access table.sqlmeta (an object with .table, .columns and
    .idName attributes)."""
    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def __getattr__(self, name):
        # NOTE(review): unknown attributes return None instead of raising
        # AttributeError; presumably callers rely on this — confirm
        # before tightening.
        if name == 'table': return getattr(self.table, name)
        if name == 'columns': return DNNameDict(self.colMap)
        if name == 'idName': return self.colMap.get('id', 'id')
        return None

    def __repr__(self):
        return '<SQLMetaAdapter(table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.table), repr(self.colMap), id(self))


class QAdapter(object):
    """Used to access table.q attribute (remapped to SQLAlchemy
    table.c)."""
    def __init__(self, table, colMap=None):
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def __getattr__(self, name):
        # SQLObject-style column names are translated to the real
        # (renamed) SQLAlchemy column attributes.
        try: return getattr(self.table.c, self.colMap[name])
        except KeyError, e: raise AttributeError("unable to get '%s'" % name)

    def __repr__(self):
        return '<QAdapter(table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.table), repr(self.colMap), id(self))


class RowAdapter(object):
    """Adapter for a SQLAlchemy RowProxy object."""
    def __init__(self, row, table, colMap=None):
        self.row = row
        # FIXME: it's OBSCENE that 'table' should be passed from
        #        TableAdapter through ResultAdapter only to land here,
        #        where it's used to directly update a row item.
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap
        self.colMapKeys = colMap.keys()

    def __getattr__(self, name):
        try: return getattr(self.row, self.colMap[name])
        except KeyError, e: raise AttributeError("unable to get '%s'" % name)

    def __setattr__(self, name, value):
        # FIXME: I can't even think about how much performances suffer,
        #        for this horrible hack (and it's used so rarely...)
        #        For sure something like a "property" to map column names
        #        to getter/setter functions would be much better, but it's
        #        not possible (or at least not easy) to build them for a
        #        single instance.
        # self.__dict__ is accessed directly to avoid recursing into
        # this very __setattr__/__getattr__ machinery.
        if name in self.__dict__.get('colMapKeys', ()):
            # Trying to update a value in the database.
            row = self.__dict__['row']
            table = self.__dict__['table']
            colMap = self.__dict__['colMap']
            params = {colMap[name]: value}
            table.update(table.c.id==row.id).execute(**params)
            # XXX: minor bug: after a value is assigned with the
            #      'rowAdapterInstance.colName = value' syntax, for some
            #      reason rowAdapterInstance.colName still returns the
            #      previous value (even if the database is updated).
            #      Fix it?  I'm not even sure it's ever used.
            return
        # For every other attribute.
        object.__setattr__(self, name, value)

    def __repr__(self):
        return '<RowAdapter(row=%s, table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.row), repr(self.table), repr(self.colMap),
                id(self))


class ResultAdapter(object):
    """Adapter for a SQLAlchemy ResultProxy object."""
    def __init__(self, result, table, colMap=None):
        self.result = result
        self.table = table
        if colMap is None:
            colMap = {}
        self.colMap = colMap

    def count(self):
        return len(self)

    def __len__(self):
        # FIXME: why sqlite returns -1? (that's wrooong!)
        if self.result.rowcount == -1:
            return 0
        return self.result.rowcount

    def __getitem__(self, key):
        res = list(self.result)[key]
        if not isinstance(key, slice):
            # A single item.
            return RowAdapter(res, self.table, colMap=self.colMap)
        else:
            # A (possible empty) list of items.
            return [RowAdapter(x, self.table, colMap=self.colMap)
                    for x in res]

    def __iter__(self):
        for item in self.result:
            yield RowAdapter(item, self.table, colMap=self.colMap)

    def __repr__(self):
        return '<ResultAdapter(result=%s, table=%s, colMap=%s) [id=%s]>' % \
                (repr(self.result), repr(self.table),
                repr(self.colMap), id(self))


class TableAdapter(object):
    """Adapter for a SQLAlchemy Table object, to mimic a SQLObject class."""
    def __init__(self, table, uri=None):
        """Initialize a TableAdapter object, building the SQLAlchemy
        Table from the internal schema description; `uri` (optional)
        is used to tweak column types for specific db servers."""
        self._imdbpySchema = table
        self._imdbpyName = table.name
        self.connectionURI = uri
        # Maps schema (SQLObject-style) column names to db column names.
        self.colMap = {}
        columns = []
        for col in table.cols:
            # Column's paramters.
            params = {'nullable': True}
            params.update(col.params)
            if col.name == 'id':
                params['primary_key'] = True
            if 'notNone' in params:
                # Translate SQLObject's notNone to SQLAlchemy's nullable.
                params['nullable'] = not params['notNone']
                del params['notNone']
            cname = _renameColumn(col.name)
            self.colMap[col.name] = cname
            colClass = MAP_COLS[col.kind]
            colKindParams = {}
            if 'length' in params:
                colKindParams['length'] = params['length']
                del params['length']
            elif colClass is UnicodeText and col.index:
                # XXX: limit length for UNICODECOLs that will have
                #      an index.
                #      this can result in name.name and title.title
                #      truncations!
                colClass = Unicode
                # Should work for most of the database servers.
                length = 511
                if self.connectionURI:
                    if self.connectionURI.startswith('mysql'):
                        # To stay compatible with MySQL 4.x.
                        length = 255
                colKindParams['length'] = length
            elif self._imdbpyName == 'PersonInfo' and col.name == 'info':
                if self.connectionURI:
                    if self.connectionURI.startswith('ibm'):
                        # There are some entries longer than 32KB.
                        colClass = CLOB
                        # I really do hope that this space isn't wasted
                        # for each other shorter entry... <g>
                        colKindParams['length'] = 68*1024
            colKind = colClass(**colKindParams)
            if 'alternateID' in params:
                # There's no need to handle them here.
                del params['alternateID']
            # Create a column.
            colObj = Column(cname, colKind, **params)
            columns.append(colObj)
        self.tableName = _renameTable(table.name)
        # Create the table.
        self.table = Table(self.tableName, metadata, *columns)
        self._ta_insert = self.table.insert()
        self._ta_select = self.table.select
        # Adapters for special attributes.
        self.q = QAdapter(self.table, colMap=self.colMap)
        self.sqlmeta = SQLMetaAdapter(self.table, colMap=self.colMap)

    def select(self, conditions=None):
        """Return a list of results."""
        result = self._ta_select(conditions).execute()
        return ResultAdapter(result, self.table, colMap=self.colMap)

    def get(self, theID):
        """Get an object given its ID; raises NotFoundError when there
        is no row with that ID."""
        result = self.select(self.table.c.id == theID)
        #if not result:
        #    raise NotFoundError, 'no data for ID %s' % theID
        # FIXME: isn't this a bit risky?  We can't check len(result),
        #        because sqlite returns -1...
        #        What about converting it to a list and getting the first
        #        item?
        try:
            return result[0]
        except KeyError:
            raise NotFoundError('no data for ID %s' % theID)

    def dropTable(self, checkfirst=True):
        """Drop the table."""
        dropParams = {'checkfirst': checkfirst}
        # Guess what?  Another work-around for a ibm_db bug.
        if self.table.bind.engine.url.drivername.startswith('ibm_db'):
            del dropParams['checkfirst']
        try:
            self.table.drop(**dropParams)
        except exc.ProgrammingError:
            # As above: re-raise the exception, but only if it's not
            # ibm_db.
            if not self.table.bind.engine.url.drivername.startswith('ibm_db'):
                raise

    def createTable(self, checkfirst=True):
        """Create the table."""
        self.table.create(checkfirst=checkfirst)
        # Create indexes for alternateID columns (other indexes will be
        # created later, at explicit request for performances reasons).
        for col in self._imdbpySchema.cols:
            if col.name == 'id':
                continue
            if col.params.get('alternateID', False):
                self._createIndex(col, checkfirst=checkfirst)

    def _createIndex(self, col, checkfirst=True):
        """Create an index for a given (schema) column."""
        # XXX: indexLen is ignored in SQLAlchemy, and that means that
        #      indexes will be over the whole 255 chars strings...
        # NOTE: don't use a dot as a separator, or DB2 will do
        #       nasty things.
        idx_name = '%s_%s' % (self.table.name, col.index or col.name)
        if checkfirst:
            for index in self.table.indexes:
                if index.name == idx_name:
                    return
        idx = Index(idx_name, getattr(self.table.c, self.colMap[col.name]))
        # XXX: beware that exc.OperationalError can be raised, is some
        #      strange circumstances; that's why the index name doesn't
        #      follow the SQLObject convention, but includes the table name:
        #      sqlite, for example, expects index names to be unique at
        #      db-level.
        try:
            idx.create()
        except exc.OperationalError, e:
            _alchemy_logger.warn('Skipping creation of the %s.%s index: %s' %
                                (self.sqlmeta.table, col.name, e))

    def addIndexes(self, ifNotExists=True):
        """Create all required indexes."""
        for col in self._imdbpySchema.cols:
            if col.index:
                self._createIndex(col, checkfirst=ifNotExists)

    def addForeignKeys(self, mapTables, ifNotExists=True):
        """Create all required foreign keys; `mapTables` maps schema
        table names to their TableAdapter instances.  A no-op when
        migrate.changeset is not available."""
        if not HAS_MC:
            return
        # It seems that there's no reason to prevent the creation of
        # indexes for columns with FK constrains: if there's already
        # an index, the FK index is not created.
        countCols = 0
        for col in self._imdbpySchema.cols:
            countCols += 1
            if not col.foreignKey:
                continue
            # foreignKey is 'TableName' or 'TableName.columnName'.
            fks = col.foreignKey.split('.', 1)
            foreignTableName = fks[0]
            if len(fks) == 2:
                foreignColName = fks[1]
            else:
                foreignColName = 'id'
            foreignColName = mapTables[foreignTableName].colMap.get(
                                foreignColName, foreignColName)
            thisColName = self.colMap.get(col.name, col.name)
            thisCol = self.table.columns[thisColName]
            foreignTable = mapTables[foreignTableName].table
            foreignCol = getattr(foreignTable.c, foreignColName)
            # Need to explicitly set an unique name, otherwise it will
            # explode, if two cols points to the same table.
            fkName = 'fk_%s_%s_%d' % (foreignTable.name, foreignColName,
                                    countCols)
            constrain = migrate.changeset.ForeignKeyConstraint([thisCol],
                                [foreignCol], name=fkName)
            try:
                constrain.create()
            except exc.OperationalError:
                continue

    def __call__(self, *args, **kwds):
        """To insert a new row with the syntax: TableClass(key=value, ...)"""
        taArgs = {}
        for key, value in kwds.items():
            # Remap keyword arguments to the real db column names.
            taArgs[self.colMap.get(key, key)] = value
        self._ta_insert.execute(*args, **taArgs)

    def __repr__(self):
        return '<TableAdapter(table=%s) [id=%s]>' % (repr(self.table),
                                                    id(self))


# Module-level "cache" for SQLObject classes, to prevent
# "Table 'tableName' is already defined for this MetaData instance" errors,
# when two or more connections to the database are made.
# XXX: is this the best way to act?
TABLES_REPOSITORY = {}

def getDBTables(uri=None):
    """Return a list of TableAdapter objects to be used to access the
    database through the SQLAlchemy ORM.  The connection uri is optional,
    and can be used to tailor the db schema to specific needs."""
    DB_TABLES = []
    for table in DB_SCHEMA:
        if table.name in TABLES_REPOSITORY:
            DB_TABLES.append(TABLES_REPOSITORY[table.name])
            continue
        tableAdapter = TableAdapter(table, uri)
        DB_TABLES.append(tableAdapter)
        TABLES_REPOSITORY[table.name] = tableAdapter
    return DB_TABLES


# Functions used to emulate SQLObject's logical operators.
def AND(*params):
    """Emulate SQLObject's AND."""
    return and_(*params)

def OR(*params):
    """Emulate SQLObject's OR."""
    return or_(*params)

def IN(item, inList):
    """Emulate SQLObject's IN.

    When `item` is a plain value (not a Column), the test is expressed
    as an OR of equality comparisons against the listed columns."""
    if not isinstance(item, schema.Column):
        return OR(*[x == item for x in inList])
    else:
        return item.in_(inList)

def ISNULL(x):
    """Emulate SQLObject's ISNULL."""
    # XXX: Should we use null()?  Can null() be a global instance?
    # XXX: Is it safe to test None with the == operator, in this case?
    # The == None comparison is intentional: SQLAlchemy overloads it
    # to generate "IS NULL".
    return x == None

def ISNOTNULL(x):
    """Emulate SQLObject's ISNOTNULL."""
    return x != None

def CONTAINSSTRING(expr, pattern):
    """Emulate SQLObject's CONTAINSSTRING."""
    # NOTE(review): `pattern` is interpolated into a LIKE expression
    # without escaping '%' or '_' wildcards it may contain.
    return expr.like('%%%s%%' % pattern)


def toUTF8(s):
    """For some strange reason, sometimes SQLObject wants utf8 strings
    instead of unicode; with SQLAlchemy we just return the unicode text."""
    return s


class _AlchemyConnection(object):
    """A proxy for the connection object, required since
    _ConnectionFairy uses __slots__."""
    def __init__(self, conn):
        self.conn = conn

    def __getattr__(self, name):
        return getattr(self.conn, name)


def setConnection(uri, tables, encoding='utf8', debug=False):
    """Set connection for every table."""
    # FIXME: why on earth MySQL requires an additional parameter,
    #        is well beyond my understanding...
    if uri.startswith('mysql'):
        if '?' in uri:
            uri += '&'
        else:
            uri += '?'
uri += 'charset=%s' % encoding params = {'encoding': encoding} if debug: params['echo'] = True if uri.startswith('ibm_db'): # Try to work-around a possible bug of the ibm_db DB2 driver. params['convert_unicode'] = True # XXX: is this the best way to connect? engine = create_engine(uri, **params) metadata.bind = engine eng_conn = engine.connect() if uri.startswith('sqlite'): major = sys.version_info[0] minor = sys.version_info[1] if major > 2 or (major == 2 and minor > 5): eng_conn.connection.connection.text_factory = str # XXX: OH MY, THAT'S A MESS! # We need to return a "connection" object, with the .dbName # attribute set to the db engine name (e.g. "mysql"), .paramstyle # set to the style of the paramters for query() calls, and the # .module attribute set to a module (?) with .OperationalError and # .IntegrityError attributes. # Another attribute of "connection" is the getConnection() function, # used to return an object with a .cursor() method. connection = _AlchemyConnection(eng_conn.connection) paramstyle = eng_conn.dialect.paramstyle connection.module = eng_conn.dialect.dbapi connection.paramstyle = paramstyle connection.getConnection = lambda: connection.connection connection.dbName = engine.url.drivername return connection �����������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/cutils.so����������������������������������������������������������������0000755�0000000�0000000�00000067465�11766731642�016026� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������ELF����������>����P������@�������8\����������@�8��@�%�"��������������������������������������������� ����������������� ����� ������������������� �����������@������@ �����@ ������������������������������������������������$�������$��������������Ptd���������������������4�������4��������������Qtd��������������������������������������������������Rtd��������� ����� ���������������������������������GNU�45A/tJ���������������� �$@ �����������������������-UqXCE칍|��������������������������������� ���������������������������������������� �������������������+��� �������������������?����������������������W����������������������P�������������������������"�������������������k����������������������y�������������������������������������������������������������������������������������������� ������������������ �h������������������ ����������������� ����������������� ������������������� ��������������__gmon_start__�_init�_fini�__cxa_finalize�_Jv_RegisterClasses�PyArg_ParseTuple�strlen�__ctype_toupper_loc�Py_BuildValue�__stack_chk_fail�__ctype_tolower_loc�strncpy�strcmp�initcutils�Py_InitModule4_64�libpthread.so.0�libc.so.6�_edata�__bss_start�_end�GLIBC_2.4�GLIBC_2.2.5�GLIBC_2.3��������������������������������������������ii ���������ui ��������ii ���������` ������������` ����� ������������������ ������������ ������ ������������������ ������������������ ������������ ������ ������������������ ������������������� ������������������� �������������������� ������������������� ������������������� ������������������� ������������������� �������������������( �������� �����������0 �������� �����������8 �������� �����������@ �������� �����������H �������� �����������H���Z����H5J �%L �@�%J �h����%B �h���%: �h���%2 �h���%* �h���%" �h���% �h���% �h���p%  
�h���`% �h ���PHHu �HtHÐU=h ��HATSubH=X ��t H= �ZH �L% �H= �L)HHH9s D��HH �AH �H9r �[A\f�����H=@ ��UHtH �HtH=' �@�ÐAWAVAUATUHSH(H9H|$H $-��H9$��III9o��I1L9t$���HD$H $1Hl$HD$����H9MkMs|ECMHEfD��HH9v_D8@HxHuEE���D:HA���A���tYfA9~H $IHN4&DH|$Ll$LH9w����IM9k1fu]H([]A\A]A^A_D��AMGT%�EtF:'uD��HA���A���s@�H(1[]A\A]A^A_HT$HH $HH|$HL$Ht$AH|$HkAH([]A\A]A^A_HR1H9$MAffff.�����ATHH5��USH ��dH%(���H$��1HT$HD$����1+��H\$1HI~5~AH0JL#fHPw HcՃDHH9uHD$HcD,�IAI!ʁ t€��DIHLD�IA)���T$A$�����H������H=���A9~<AHc tDNMcB8 ��tLc΃B ��H~HcH=?��1Ƅ4���H$��H$��dH3%(���u#H ��[]A\H=.��1qѾ���s�ATIUS1D��I,,tHU�H�E�HcL H9r[]A\ÐH\$Hl$HLd$Ll$HX��dH%(���H$(��1HT$LD$HL$H5Y��HD$����HD$����HD$����y1uAH$(��dH3%(�����H$8��H$@��L$H��L$P��HX��fH$ ��Ht$��Hl$ HIHt$��H|HHA$I!%tI��DIT$LD�IA)AU�I!%t��DIULD�IA)DAąuWH=5�����ZHHtiE9|tA*A*^ c��Zf.wIcIcHHL �H43HGd%�[*A*X^ffZqD�� ��_�A*A*^�H5i �H=g���A��11UHSHH �HtH ��HHHuH[ÐHH����������ss|O�cutils�ratcliff�soundex�Soundex code for strings.��Ratcliff-Obershelp similarity.�����������123�12��22455�12623�1�2�2����?ffffff?;4������P��������8��`�������������zR�x ��|���������BBB B(A0CA8D`R 8A0A(B BBBF | 8C0A(B BBBA | 8C0A(B BBCA ���4���������BKA GaV  AABA ��,������?����BCAA BrAB�����$�����-���^DK C 
�����,��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������� ������������� �������h������o����������������� �������������@������ ������������� ��������������������� ������������������������������������������������������������������������� ��������������o����h������o�����������o����<������o�������������������������������������������������������������������������������������������������������������������@ ���������������������������������������������������������������&������6������F����������������������` ����������������������������������� ������������������������� ���������������������������������������������������GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2�,������������� ������ ��������������������������������$��initcutils��������������I��,�� 
������-����������9���8�������7 ������# ��int�����i�����i���6���i����������1���{��x�� ��b���#�|�����# �����# �����#������# ^ �����#(�����#0`�����#8�����#@�����#HU�����#Pg��� ���#X��"��#`��$��#h ��&b���#p��*b���#tu��,p���#x��0F���#R ��1T���#e��2��#���6��#y��?{���#x��H���#��I���#��J���#��K���#��L-���#���Nb���#/��P��#� ^�� ���� ����#� ����# ��b���#������ ����� 8�����x�� ����� 8���������8���g�������������b�� Y��jN�� ��k ��#� / ��k]��#�Z��D]����E ��#�/ ��E]��#��E ��#Q��F��# ��G ��# F ��G ��#(V���K6��#0��LT��#8��M��#@{��N��#H��O��#P���P��#X ��T��#`���U��#hY��V��#pi��Z ��#x��[��#��\��#��]��#��^��#w��a��#l��di���#��f��#E��j3 ��#��m��#@���q+��#*��t ��#��xW��#��yc��#���|C��#��}O��#?��~��#��]��#����#Q ��o��#��{��# �� ��#����# ����# ����#����#<����#�����#h����#����# ����#I����# ��6��#��M���#�N����l%��;��y�����������c�� ��������������������������������N������b�������������� ��(�������3��9��b���N��N��N�������_��e����z���� ��������������� �� ���N������b������� �����I������b�������� �� ������� ����b���+��������������� ��B��H�� ��b���� ��+�����B��"��x��~�� ��������� ��{ ������ ������ ��1��� ��`y ��buf����#�obj���#len� ��# I �� ��# ��b���# ��b���#$ �����#( ����#0 �����#8 m����#@ ��y ��#H �����#X� �� �� 8����������� �� ��b��� ���� ��b���� ����� �� �� ���� ������� �� ��b��� ��������� �� ��b���3 ����������> ��D ��b���^ ���� ������8ۭ �� �����#� �����# s ����# �����# ����# .����#( l ����#0 %��n��#8 ��n��#@ E��n��#H t�����#P F��n��#X 1����#` ����#h ����#p �����#x ����# ��(��# O���n��# o��n��# ��n��# ��n��# ��n��# ����# g����# ����# ����# ������#����#����#a ����#'����#����#����#o�� ��#b�� ��# �� ��#�� ��#���n��#�8��^ ��PY ��% ����#�����#��T��#��T��#��z��# ����#(����#0 �� ��#8����#@H��T��#H�4 �� ��! ����"��#���#��#��$���#���%e ��0' ����(7��#�#��)b��#��*m��#��+��#p��, ��# ��- ��#(���. ��6��1$��*��6������ ��2B��H��T�����q��3`��f��b���������b������� ��4����������������5����6����b������������� ��7 ����8 ����9y����:����i���+�������;7��=����W������b������<y��P��=y����>����? 
����@ ��Q��A��������]�������$��B��������]�� ��� ��Y �� �� �� *��� %C�� �� &��#� �� '��# �� (b���# �� *��#���5��I�� ��( �� �� ���#�get� ��#set� ��#doc� ���# �� ���# �U��]��:�� ��*��� ,��a�� ��������������- �� ����b���������������G��y��s�G��t�G��0��I��C��Jb���a���Kb�����R-��� ��R-�����A��v�������v��� ��v��u ��v-����s������s1����s2����l1�b���l2�b���res���� ��db��� ������ ������������ st1�d�����!V��d����� st2�d�����![��d�����"a1�f���^��"a2�f�����"b1�g�����"b2�g���a��"s1�h�����"s2�h�����"max�i[�����"i�i[���R������� ������ ����������!��Џ��w ��!f��Џ�� ��"i�b��� ��"j�b���B ��"n�b��� ��#s�ӓ���w$}��Ԃ��w$��Փ��P"c�֙��� ��%` ������d ��������b����� �����&8���� ����� 8����'�� ������ ������^ ���� s1���� ��"i�b���0 ��(������b�����q���� ������ ������ ��$��!���� ��!f����' ��#s1����o#s2����o$F����o$i����w$����o)�� ������0���*��] ��*�� ��(p���+�� ��+�� ��,��a)�� ���������*6�� ��*-����(���+?��5��+J��k��+U����%p������{������-a��-l��������.��������-������w/����/J ���� ���k�� 8����$8��Ȁ�� ������[�� ���� 8����$����  �����/����/J �����% ���: ; I��$� > ��$� > ��� ��� I�� : ;�� �: ;I8 �� �: ; ��  : ; �� �: ; I8 �� I�� !�I/ ��&�I�� : ;��' I���I�� �: ; I8 ��' �� : ; ���: ;I�� : ;���< ��.: ; ' I ���: ; I��4�: ; I�� ��.? : ; ' I 4 ���: ; I��4�: ; I��.: ; ' I@�� �: ; I��!�: ; I��"4�: ; I��#4�: ; I ��$4�: ; I ��% ��&!�I/��'.: ; ' @��( U��)1RUX Y ��*�1��+4�1��,4�1 ��-4�1��..�? : ;' @ ��/4�: ; I? 
< �������� ������imdb/parser/sql�/usr/include/bits�/usr/lib/x86_64-linux-gnu/gcc/x86_64-linux-gnu/4.5.2/include�/usr/include�/usr/include/python2.7��cutils.c���string3.h���stddef.h���types.h���stdio.h���libio.h���pyport.h���object.h���methodobject.h���descrobject.h�����  �������z<�.�o.��o<�JoX<vJYnJJ�Z>kI=IKv-]o<�.tcJ.=MH0!�d<�K�-�_yt5Su-W/+?q.�.�u=;KF XV8\FxYY�J>p tv��l .wXu- JyAA�@XQ</<�f;7�<[�<B���objobjproc�nb_inplace_remainder�nb_divide�PyMethodDef�__ssize_t�tp_richcompare�nb_int�tp_dealloc�t_len�_IO_save_end�nb_nonzero�tp_as_sequence�tp_repr�strides�_IO_write_base�_lock�getbufferproc�nb_add�nb_subtract�releasebufferproc�nb_xor�tp_bases�tp_methods�_IO_save_base�tp_init�_chain�_cur_column�tp_weaklistoffset�tp_is_gc�nb_absolute�tp_name�_object�getter�tp_mro�nb_floor_divide�ternaryfunc�mp_ass_subscript�ob_refcnt�writebufferproc�nb_inplace_multiply�nb_inplace_divide�_IO_marker�cutils_methods�tp_iter�nb_inplace_or�__s2_len�__res�hashfunc�allocfunc�nb_divmod�soundTable�s_len�imdb/parser/sql/cutils.c�nb_true_divide�printfunc�_IO_FILE�smalltable�PyBufferProcs�tp_doc�unsigned char�ndim�nb_inplace_true_divide�nb_float�tp_free�sq_repeat�mp_length�Py_buffer�tp_base�nb_inplace_power�nb_remainder�bf_getwritebuffer�PyMemberDef�strncpy�ssizessizeobjargproc�_IO_lock_t�tp_hash�pyratcliff�_IO_read_ptr�_pos�stdin�getattrofunc�sq_ass_slice�soundCode�__s1_len�tp_getattro�sq_slice�_markers�reprfunc�strings_check�tp_descr_set�lenfunc�RatcliffObershelp�tp_dict�nb_negative�nb_lshift�unaryfunc�tp_traverse�newfunc�tp_as_mapping�nb_inplace_subtract�tp_setattr�nb_inplace_add�traverseproc�nb_inplace_xor�ssizessizeargfunc�closure�bf_getreadbuffer�nb_and�tp_str�format�self�_flags2�getiterfunc�_IO_read_base�sq_concat�segcountproc�_unused2�PyNumberMethods�sq_inplace_repeat�_typeobject�pArgs�tp_flags�_old_offset�tp_compare�readonly�bf_getsegcount�s2copy�long long 
int�nb_inplace_lshift�internal�ml_meth�_IO_write_end�ob_size�PyObject�tp_iternext�nb_hex�tp_clear�tp_call�bf_releasebuffer�threshold�PyCFunction�discard�inquiry�end1�end2�_IO_buf_base�suboffsets�__pad1�__pad2�__pad3�__pad4�__pad5�descrsetfunc�_sbuf�nb_positive�tp_members�pysoundex�setattrfunc�strtolower�PyMappingMethods�coercion�_mode�sq_item�tp_setattro�sq_inplace_concat�/home/da/hg/imdbpy/imdbpy�nb_invert�iternextfunc�long double�s1copy�bf_getbuffer�word�descrgetfunc�visitproc�nb_coerce�bf_getcharbuffer�mp_subscript�GNU C 4.5.2�long long unsigned int�Py_ssize_t�initproc�__off_t�nb_index�tp_alloc�nb_rshift�initcutils�nb_inplace_and�freefunc�tp_getset�tp_weaklist�_IO_backup_base�_shortbuf�nb_long�tp_as_buffer�objobjargproc�sq_ass_item�_next�__off64_t�richcmpfunc�PyGetSetDef�tp_print�tp_version_tag�tp_getattr�_IO_buf_end�__dest�tp_cache�tp_basicsize�__src�binaryfunc�short int�setter�PySequenceMethods�tp_itemsize�_vtable_offset�nb_inplace_rshift�nb_multiply�tp_as_number�nb_inplace_floor_divide�setattrofunc�getattrfunc�bufferinfo�nb_or�nb_oct�shape�ml_doc�_IO_read_end�ml_flags�tp_del�sq_contains�destructor�_fileno�tp_new�sq_length�ob_type�short unsigned int�stdout�tp_descr_get�_IO_write_ptr�nb_power�__len�charbufferproc�tp_subclasses�ml_name�tp_dictoffset�readbufferproc�����������������w���������������w���������������w���������������w ������� ��������w( ������� ��������w0 ���������������w8��������������w������� �������w8 ������ �������w0 ������ �������w( �������������w �������������w�������������w�������������w������T�������w�T������W�������w8W������X�������w0X������Z�������w(Z������\�������w \������^�������w^������`�������w`������a�������wa�������������w��������������w8�������������w0�������������w(�������������w �������������w�������������w�������������w�������������w�������������������������a��������Ua��������������������������������������z�������T�������������T������������������������?��������Q?������� 
�������V������X�������VX������a�������Qa�������������V��������������������������Q�������������V������������������������U��������RU��������������w��������������������T�������w�T������a�������a�������������w���������������������������w�����������������?�������a�����������������������[��������������_a�������������_����������������o���������������V���������������P���������������Q����������������?���������������^��������������^������L�������^a�������������^����������������?�������a��������w�a���������������R��������������R������L�������Ra������n�������R�����������������������a��������a������������������������������]��������������������L�������L������a�������a�������������������������������������������������?��������Q?�������a��������Va������������������������������p���������������q��������������������L�������L������X�������VX������a�������Qa��������������������������Q�������������V����������������?�������a��������0o���������������S���������������Y��������������S������L�������Sa�������������S�������������p�s�"�������������|�s�"�������������p�s�"�������������������������������1������3�������Y3������L�������1�����������������������������w�������������w�������������w�������������w ������\�������w\������]�������w ]������^�������w^������`�������w`������a�������wa������}�������w�����������������������������U�����������������������������T������ �������U����������������+������>�������0�������������1�������������Q������!�������Qq������x�������1����������������������>�������0Y������b�������V�������������1�������������T������!�������Tq������x�������1����������������+������1�������P1������6�������\�������������x�p�����������������G������b�������P�������������R������!�������R�����������������������������w�������������w�������������w�������������w 
�������������w�������������w�������������w�����������������������������U�������������\�����������������������������0�������������S�������������s�������������S�������������V�����������������������������w������m�������wm������p�������wp�������������w�����������������������������U�����������������������������T������&�������U�����������������������������V�����������������������������w�����������������������������\�����������������������������]�����������������������������V�����������������������������w����������������r������u�������a�������������a����������������������4�������\P�������������\�������������\����������������&�������������]�������������������������-���size_t�p���__off_t�{���__off64_t����__ssize_t����FILE�x��_IO_lock_t���_IO_marker����_IO_FILE���ssize_t� ��Py_ssize_t�%��_object�c��PyObject�n��unaryfunc���binaryfunc���ternaryfunc���inquiry���lenfunc�(��coercion�T��ssizeargfunc�z��ssizessizeargfunc���ssizeobjargproc���ssizessizeobjargproc����objobjargproc�7��readbufferproc�b��writebufferproc�m��segcountproc���charbufferproc���bufferinfo� ��Py_buffer� ��getbufferproc� ��releasebufferproc� ��objobjproc� ��visitproc�3 ��traverseproc� ��PyNumberMethods�Y ��PySequenceMethods� ��PyMappingMethods� ��PyBufferProcs���freefunc�6��destructor�T��printfunc���getattrfunc���getattrofunc���setattrfunc���setattrofunc���cmpfunc���reprfunc� 
��hashfunc�+��richcmpfunc�W��getiterfunc�c��iternextfunc�o��descrgetfunc�{��descrsetfunc���initproc���newfunc���allocfunc�N��_typeobject���PyCFunction���PyMethodDef���PyMethodDef���getter���setter�U��PyGetSetDef���������������������������������������������������������P������������������4����������������������������������P������������������4����������������������������������������������P������������������4����������������������������������������������P������������������4�����������������������.symtab�.strtab�.shstrtab�.note.gnu.build-id�.gnu.hash�.dynsym�.dynstr�.gnu.version�.gnu.version_r�.rela.dyn�.rela.plt�.init�.text�.fini�.rodata�.eh_frame_hdr�.eh_frame�.ctors�.dtors�.jcr�.dynamic�.got�.got.plt�.data�.bss�.comment�.debug_aranges�.debug_pubnames�.debug_info�.debug_abbrev�.debug_line�.debug_str�.debug_loc�.debug_pubtypes�.debug_ranges�������������������������������������������������������������������������������������������������$������������������������������.���o�������������������L�����������������������������8��� ����������@������@��������������������������������@������������� ������ �����������������������������������H���o�������<������<������(����������������������������U���o�������h������h������@����������������������������d�����������������������������������������������������n����������������������������������� �����������������x�������������������������������������������������������s������������������������������������������������������~�������������P������P������������������������������������������������h������h��������������������������������������������������������������������������������������������������������������������4�������������������������������������������@������@������D������������������������������������������ ������������������������������������������������������( �����(�������������������������������������������������8 �����8�������������������������������������������������@ �����@���������������������������������������������� 
����������������������������������������������������� �����������h������������������������������������������` �����` ��������������������� ���������������������������� ����� ������������������������������������������0��������������� ������*��������������������������������������������������� !������0����������������������������������������������������:!������!���������������������������������������������������[!�������������������������������������������������������� 7������Z�����������������������������!���������������������z9�����������������������������������-�����0���������������<������ ����������������������������8���������������������F�����������������������������������C���������������������U�����������������������������������S���������������������Y������P���������������������������������������������������Z������a���������������������������������������������������xe������8������$���;����������������� ����������������������l���������������������������������������������������������������������������������������������������������@�������������������� ��������������������<��������������������h����������������������������������������������������������� �������������������� �������������������� �P������������������� �h������������������� �����������������������������������������@�������������������� �������������������( �������������������8 �������������������@ ������������������� ������������������� �������������������` ������������������� �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������� ����������������������!��������������������� �P��������������������������������������� �������������*�����( �������������8�����8 �������������E���� �p��������������[����� ������������j����� ������������x���� ����������������������������������������  �������������������������������������8 ����������������� 
�0�������������������������������������� � ���������������� � ���������������� ������������������ � ������?����������� � ������-���������� �����`���������� �������������,����` �������������9����0 �������������F���@ �������������O���������������������a�� �������������������p�� ���������������������� �h�������������������������������������������������������������������������������"������������������������������������������� ���������������� �����������������������������������+���������������������?���������������������^��� �������������c���������������������x��� ���������������� ����������������call_gmon_start�crtstuff.c�__CTOR_LIST__�__DTOR_LIST__�__JCR_LIST__�__do_global_dtors_aux�completed.6557�dtor_idx.6559�frame_dummy�__CTOR_END__�__FRAME_END__�__JCR_END__�__do_global_ctors_aux�cutils.c�RatcliffObershelp�pysoundex�soundTable�strtolower�pyratcliff�cutils_methods�_GLOBAL_OFFSET_TABLE_�__dso_handle�__DTOR_END__�_DYNAMIC�Py_InitModule4_64�__gmon_start__�_Jv_RegisterClasses�_fini�PyArg_ParseTuple�__ctype_toupper_loc@@GLIBC_2.3�strlen@@GLIBC_2.2.5�__cxa_finalize@@GLIBC_2.2.5�Py_BuildValue�__bss_start�initcutils�__stack_chk_fail@@GLIBC_2.4�strcmp@@GLIBC_2.2.5�__ctype_tolower_loc@@GLIBC_2.3�_end�strncpy@@GLIBC_2.2.5�_edata�_init������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/cutils.c�����������������������������������������������������������������0000644�0000000�0000000�00000015756�11766731642�015620� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * cutils.c module. 
* * Miscellaneous functions to speed up the IMDbPY package. * * Contents: * - pyratcliff(): * Function that implements the Ratcliff-Obershelp comparison * amongst Python strings. * * - pysoundex(): * Return a soundex code string, for the given string. * * Copyright 2004-2009 Davide Alberani <da@erlug.linux.it> * Released under the GPL license. * * NOTE: The Ratcliff-Obershelp part was heavily based on code from the * "simil" Python module. * The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it> * and can be found here: http://spazioinwind.libero.it/montecchiani/ * It was released under the GPL license; original comments are leaved * below. * */ /*========== Ratcliff-Obershelp ==========*/ /***************************************************************************** * * Stolen code from : * * [Python-Dev] Why is soundex marked obsolete? * by Eric S. Raymond [4]esr@thyrsus.com * on Sun, 14 Jan 2001 14:09:01 -0500 * *****************************************************************************/ /***************************************************************************** * * Ratcliff-Obershelp common-subpattern similarity. * * This code first appeared in a letter to the editor in Doctor * Dobbs's Journal, 11/1988. The original article on the algorithm, * "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the * July 1988 issue (#181) but the algorithm was presented in assembly. * The main drawback of the Ratcliff-Obershelp algorithm is the cost * of the pairwise comparisons. It is significantly more expensive * than stemming, Hamming distance, soundex, and the like. * * Running time quadratic in the data size, memory usage constant. * *****************************************************************************/ #include <Python.h> #define DONTCOMPARE_NULL 0.0 #define DONTCOMPARE_SAME 1.0 #define COMPARE 2.0 #define STRING_MAXLENDIFFER 0.7 /* As of 05 Mar 2008, the longest title is ~600 chars. 
*/ #define MXLINELEN 1023 #define MAX(a,b) ((a) > (b) ? (a) : (b)) //***************************************** // preliminary check.... //***************************************** static float strings_check(char const *s, char const *t) { float threshold; // lenght difference int s_len = strlen(s); // length of s int t_len = strlen(t); // length of t // NULL strings ? if ((t_len * s_len) == 0) return (DONTCOMPARE_NULL); // the same ? if (strcmp(s, t) == 0) return (DONTCOMPARE_SAME); // string lenght difference threshold // we don't want to compare too different lenght strings ;) if (s_len < t_len) threshold = (float) s_len / (float) t_len; else threshold = (float) t_len / (float) s_len; if (threshold < STRING_MAXLENDIFFER) return (DONTCOMPARE_NULL); // proceed return (COMPARE); } static int RatcliffObershelp(char *st1, char *end1, char *st2, char *end2) { register char *a1, *a2; char *b1, *b2; char *s1 = st1, *s2 = st2; /* initializations are just to pacify GCC */ short max, i; if (end1 <= st1 || end2 <= st2) return (0); if (end1 == st1 + 1 && end2 == st2 + 1) return (0); max = 0; b1 = end1; b2 = end2; for (a1 = st1; a1 < b1; a1++) { for (a2 = st2; a2 < b2; a2++) { if (*a1 == *a2) { /* determine length of common substring */ for (i = 1; a1[i] && (a1[i] == a2[i]); i++) continue; if (i > max) { max = i; s1 = a1; s2 = a2; b1 = end1 - max; b2 = end2 - max; } } } } if (!max) return (0); max += RatcliffObershelp(s1 + max, end1, s2 + max, end2); /* rhs */ max += RatcliffObershelp(st1, s1, st2, s2); /* lhs */ return max; } static float ratcliff(char *s1, char *s2) /* compute Ratcliff-Obershelp similarity of two strings */ { int l1, l2; float res; // preliminary tests res = strings_check(s1, s2); if (res != COMPARE) return(res); l1 = strlen(s1); l2 = strlen(s2); return 2.0 * RatcliffObershelp(s1, s1 + l1, s2, s2 + l2) / (l1 + l2); } /* Change a string to lowercase. 
*/ static void strtolower(char *s1) { int i; for (i=0; i < strlen(s1); i++) s1[i] = tolower(s1[i]); } /* Ratcliff-Obershelp for two python strings; returns a python float. */ static PyObject* pyratcliff(PyObject *self, PyObject *pArgs) { char *s1 = NULL; char *s2 = NULL; PyObject *discard = NULL; char s1copy[MXLINELEN+1]; char s2copy[MXLINELEN+1]; /* The optional PyObject parameter is here to be compatible * with the pure python implementation, which uses a * difflib.SequenceMatcher object. */ if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard)) return NULL; strncpy(s1copy, s1, MXLINELEN); strncpy(s2copy, s2, MXLINELEN); /* Work on copies. */ strtolower(s1copy); strtolower(s2copy); return Py_BuildValue("f", ratcliff(s1copy, s2copy)); } /*========== soundex ==========*/ /* Max length of the soundex code to output (an uppercase char and * _at most_ 4 digits). */ #define SOUNDEX_LEN 5 /* Group Number Lookup Table */ static char soundTable[26] = { 0 /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0 /* E */, '1' /* F */, '2' /* G */, 0 /* H */, 0 /* I */, '2' /* J */, '2' /* K */, '4' /* L */, '5' /* M */, '5' /* N */, 0 /* O */, '1' /* P */, '2' /* Q */, '6' /* R */, '2' /* S */, '3' /* T */, 0 /* U */, '1' /* V */, 0 /* W */, '2' /* X */, 0 /* Y */, '2' /* Z */}; static PyObject* pysoundex(PyObject *self, PyObject *pArgs) { int i, j, n; char *s = NULL; char word[MXLINELEN+1]; char soundCode[SOUNDEX_LEN+1]; char c; if (!PyArg_ParseTuple(pArgs, "s", &s)) return NULL; j = 0; n = strlen(s); /* Convert to uppercase and exclude non-ascii chars. */ for (i = 0; i < n; i++) { c = toupper(s[i]); if (c < 91 && c > 64) { word[j] = c; j++; } } word[j] = '\0'; n = strlen(word); if (n == 0) { /* If the string is empty, returns None. */ return Py_BuildValue(""); } soundCode[0] = word[0]; /* Build the soundCode string. 
*/ j = 1; for (i = 1; j < SOUNDEX_LEN && i < n; i++) { c = soundTable[(word[i]-65)]; /* Compact zeroes and equal consecutive digits ("12234112"->"123412") */ if (c != 0 && c != soundCode[j-1]) { soundCode[j++] = c; } } soundCode[j] = '\0'; return Py_BuildValue("s", soundCode); } static PyMethodDef cutils_methods[] = { {"ratcliff", pyratcliff, METH_VARARGS, "Ratcliff-Obershelp similarity."}, {"soundex", pysoundex, METH_VARARGS, "Soundex code for strings."}, {NULL} }; void initcutils(void) { Py_InitModule("cutils", cutils_methods); } ������������������IMDbPY-4.9/imdb/parser/sql/objectadapter.py���������������������������������������������������������0000644�0000000�0000000�00000017230�11766731642�017317� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" parser.sql.objectadapter module (imdb.parser.sql package). This module adapts the SQLObject ORM to the internal mechanism. Copyright 2008-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import sys
import logging

from sqlobject import *
from sqlobject.sqlbuilder import ISNULL, ISNOTNULL, AND, OR, IN, CONTAINSSTRING

from dbschema import *

_object_logger = logging.getLogger('imdbpy.parser.sql.object')

# Maps our column-kind placeholders (from dbschema) to SQLObject's
# column classes.
MAP_COLS = {
    INTCOL: IntCol,
    UNICODECOL: UnicodeCol,
    STRINGCOL: StringCol
}


# Exception raised when Table.get(id) returns no value.
NotFoundError = SQLObjectNotFound


# class method to be added to the SQLObject class.
def addIndexes(cls, ifNotExists=True):
    """Create all required indexes, as described by the DBCol entries
    of the table's _imdbpySchema; indexes already registered on
    cls.sqlmeta are left untouched."""
    for col in cls._imdbpySchema.cols:
        if col.index:
            idxName = col.index
            colToIdx = col.name
            if col.indexLen:
                # Partial index (e.g. only the first N chars of a column).
                colToIdx = {'column': col.name, 'length': col.indexLen}
            if idxName in [i.name for i in cls.sqlmeta.indexes]:
                # Check if the index is already present.
                continue
            idx = DatabaseIndex(colToIdx, name=idxName)
            cls.sqlmeta.addIndex(idx)
    try:
        cls.createIndexes(ifNotExists)
    except dberrors.OperationalError, e:
        # NOTE(review): 'col' here is the loop variable leaked from the
        # for-loop above, so the message always names the *last* column
        # examined, not necessarily the one whose index failed.
        _object_logger.warn('Skipping creation of the %s.%s index: %s' %
                            (cls.sqlmeta.table, col.name, e))
addIndexes = classmethod(addIndexes)


# Global repository for "fake" tables with Foreign Keys - need to
# prevent troubles if addForeignKeys is called more than one time.
FAKE_TABLES_REPOSITORY = {}

def _buildFakeFKTable(cls, fakeTableName):
    """Return a "fake" SQLObject subclass mirroring cls's schema, with
    ForeignKey columns where the schema declares them; it is only used
    to have SQLObject generate the FK constraint SQL (see
    addForeignKeys below), never to store data."""
    countCols = 0
    attrs = {}
    for col in cls._imdbpySchema.cols:
        countCols += 1
        if col.name == 'id':
            continue
        if not col.foreignKey:
            # A non-foreign key column - add it as usual.
            attrs[col.name] = MAP_COLS[col.kind](**col.params)
            continue
        # XXX: Foreign Keys pointing to TableName.ColName not yet supported.
        thisColName = col.name
        if thisColName.endswith('ID'):
            # SQLObject's ForeignKey appends 'ID' itself, so strip it.
            thisColName = thisColName[:-2]
        fks = col.foreignKey.split('.', 1)
        foreignTableName = fks[0]
        if len(fks) == 2:
            foreignColName = fks[1]
        else:
            foreignColName = 'id'
        # Unused...
        #fkName = 'fk_%s_%s_%d' % (foreignTableName, foreignColName,
        #                           countCols)
        # Create a Foreign Key column, with the correct references.
        fk = ForeignKey(foreignTableName, name=thisColName, default=None)
        attrs[thisColName] = fk
    # Build a _NEW_ SQLObject subclass, with foreign keys, if needed.
    newcls = type(fakeTableName, (SQLObject,), attrs)
    return newcls

def addForeignKeys(cls, mapTables, ifNotExists=True):
    """Create all required foreign keys for this table.

    A throw-away "fake" table is built so that SQLObject generates the
    reference-constraint SQL; 'myfaketable' is then stripped from the
    generated query so it targets the real tables.
    NOTE(review): the 'mapTables' parameter is accepted for interface
    compatibility but is not used here."""
    # Do not even try, if there are no FK, in this table.
    if not filter(None, [col.foreignKey for col in cls._imdbpySchema.cols]):
        return
    fakeTableName = 'myfaketable%s' % cls.sqlmeta.table
    if fakeTableName in FAKE_TABLES_REPOSITORY:
        newcls = FAKE_TABLES_REPOSITORY[fakeTableName]
    else:
        newcls = _buildFakeFKTable(cls, fakeTableName)
        FAKE_TABLES_REPOSITORY[fakeTableName] = newcls
    # Connect the class with foreign keys.
    newcls.setConnection(cls._connection)
    for col in cls._imdbpySchema.cols:
        if col.name == 'id':
            continue
        if not col.foreignKey:
            continue
        # Get the SQL that _WOULD BE_ run, if we had to create
        # this "fake" table.
        fkQuery = newcls._connection.createReferenceConstraint(newcls,
                                newcls.sqlmeta.columns[col.name])
        if not fkQuery:
            # Probably the db doesn't support foreign keys (SQLite).
            continue
        # Remove "myfaketable" to get references to _real_ tables.
        fkQuery = fkQuery.replace('myfaketable', '')
        # Execute the query.
        newcls._connection.query(fkQuery)
    # Disconnect it.
    newcls._connection.close()
addForeignKeys = classmethod(addForeignKeys)


# Module-level "cache" for SQLObject classes, to prevent
# "class TheClass is already in the registry" errors, when
# two or more connections to the database are made.
# XXX: is this the best way to act?
TABLES_REPOSITORY = {}

def getDBTables(uri=None):
    """Return a list of classes to be used to access the database
    through the SQLObject ORM.  The connection uri is optional, and
    can be used to tailor the db schema to specific needs.

    Classes are built once per table name and cached in
    TABLES_REPOSITORY; subsequent calls return the cached classes."""
    DB_TABLES = []
    for table in DB_SCHEMA:
        if table.name in TABLES_REPOSITORY:
            DB_TABLES.append(TABLES_REPOSITORY[table.name])
            continue
        attrs = {'_imdbpyName': table.name, '_imdbpySchema': table,
                 'addIndexes': addIndexes, 'addForeignKeys': addForeignKeys}
        for col in table.cols:
            # 'id' is implicit for SQLObject classes; don't redeclare it.
            if col.name == 'id':
                continue
            attrs[col.name] = MAP_COLS[col.kind](**col.params)
        # Create a subclass of SQLObject.
        # XXX: use a metaclass?  I can't see any advantage.
        cls = type(table.name, (SQLObject,), attrs)
        DB_TABLES.append(cls)
        TABLES_REPOSITORY[table.name] = cls
    return DB_TABLES


def toUTF8(s):
    """For some strange reason, sometimes SQLObject wants utf8 strings
    instead of unicode."""
    return s.encode('utf_8')


def setConnection(uri, tables, encoding='utf8', debug=False):
    """Set connection for every table.

    *uri* -- database connection URI.
    *tables* -- list of table classes (from getDBTables).
    *encoding* -- charset used for MySQL connections.
    *debug* -- if true, SQLObject logs every query.
    Returns the connection object."""
    kw = {}
    # FIXME: it's absolutely unclear what we should do to correctly
    # support unicode in MySQL; with some versions of SQLObject,
    # it seems that setting use_unicode=1 is the _wrong_ thing to do.
    _uriLower = uri.lower()
    if _uriLower.startswith('mysql'):
        kw['use_unicode'] = 1
        #kw['sqlobject_encoding'] = encoding
        kw['charset'] = encoding
    conn = connectionForURI(uri, **kw)
    conn.debug = debug
    # XXX: doesn't work and a work-around was put in imdbpy2sql.py;
    #      is there any way to modify the text_factory parameter of
    #      a SQLite connection?
    #if uri.startswith('sqlite'):
    #    major = sys.version_info[0]
    #    minor = sys.version_info[1]
    #    if major > 2 or (major == 2 and minor > 5):
    #        sqliteConn = conn.getConnection()
    #        sqliteConn.text_factory = str
    for table in tables:
        table.setConnection(conn)
        #table.sqlmeta.cacheValues = False
        # FIXME: is it safe to set table._cacheValue to False?
Looks like # we can't retrieve correct values after an update (I think # it's never needed, but...) Anyway, these are set to False # for performance reason at insert time (see imdbpy2sql.py). table._cacheValue = False # Required by imdbpy2sql.py. conn.paramstyle = conn.module.paramstyle return conn ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/parser/sql/dbschema.py��������������������������������������������������������������0000644�0000000�0000000�00000050032�11766731642�016253� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#-*- encoding: utf-8 -*- """ parser.sql.dbschema module (imdb.parser.sql package). This module provides the schema used to describe the layout of the database used by the imdb.parser.sql package; functions to create/drop tables and indexes are also provided. Copyright 2005-2012 Davide Alberani <da@erlug.linux.it> 2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import logging _dbschema_logger = logging.getLogger('imdbpy.parser.sql.dbschema') # Placeholders for column types. INTCOL = 1 UNICODECOL = 2 STRINGCOL = 3 _strMap = {1: 'INTCOL', 2: 'UNICODECOL', 3: 'STRINGCOL'} class DBCol(object): """Define column objects.""" def __init__(self, name, kind, **params): self.name = name self.kind = kind self.index = None self.indexLen = None # If not None, two notations are accepted: 'TableName' # and 'TableName.ColName'; in the first case, 'id' is assumed # as the name of the pointed column. self.foreignKey = None if 'index' in params: self.index = params['index'] del params['index'] if 'indexLen' in params: self.indexLen = params['indexLen'] del params['indexLen'] if 'foreignKey' in params: self.foreignKey = params['foreignKey'] del params['foreignKey'] self.params = params def __str__(self): """Class representation.""" s = '<DBCol %s %s' % (self.name, _strMap[self.kind]) if self.index: s += ' INDEX' if self.indexLen: s += '[:%d]' % self.indexLen if self.foreignKey: s += ' FOREIGN' if 'default' in self.params: val = self.params['default'] if val is not None: val = '"%s"' % val s += ' DEFAULT=%s' % val for param in self.params: if param == 'default': continue s += ' %s' % param.upper() s += '>' return s def __repr__(self): """Class representation.""" s = '<DBCol(name="%s", %s' % (self.name, _strMap[self.kind]) if self.index: s += ', index="%s"' % self.index if self.indexLen: s += ', indexLen=%d' % self.indexLen if self.foreignKey: s += ', foreignKey="%s"' % self.foreignKey for param in self.params: val = self.params[param] if isinstance(val, (unicode, str)): val = u'"%s"' % val s += ', %s=%s' % (param, val) s += ')>' return s class DBTable(object): """Define table objects.""" def 
__init__(self, name, *cols, **kwds): self.name = name self.cols = cols # Default values. self.values = kwds.get('values', {}) def __str__(self): """Class representation.""" return '<DBTable %s (%d cols, %d values)>' % (self.name, len(self.cols), sum([len(v) for v in self.values.values()])) def __repr__(self): """Class representation.""" s = '<DBTable(name="%s"' % self.name col_s = ', '.join([repr(col).rstrip('>').lstrip('<') for col in self.cols]) if col_s: s += ', %s' % col_s if self.values: s += ', values=%s' % self.values s += ')>' return s # Default values to insert in some tables: {'column': (list, of, values, ...)} kindTypeDefs = {'kind': ('movie', 'tv series', 'tv movie', 'video movie', 'tv mini series', 'video game', 'episode')} companyTypeDefs = {'kind': ('distributors', 'production companies', 'special effects companies', 'miscellaneous companies')} infoTypeDefs = {'info': ('runtimes', 'color info', 'genres', 'languages', 'certificates', 'sound mix', 'tech info', 'countries', 'taglines', 'keywords', 'alternate versions', 'crazy credits', 'goofs', 'soundtrack', 'quotes', 'release dates', 'trivia', 'locations', 'mini biography', 'birth notes', 'birth date', 'height', 'death date', 'spouse', 'other works', 'birth name', 'salary history', 'nick names', 'books', 'agent address', 'biographical movies', 'portrayed in', 'where now', 'trade mark', 'interviews', 'article', 'magazine cover photo', 'pictorial', 'death notes', 'LD disc format', 'LD year', 'LD digital sound', 'LD official retail price', 'LD frequency response', 'LD pressing plant', 'LD length', 'LD language', 'LD review', 'LD spaciality', 'LD release date', 'LD production country', 'LD contrast', 'LD color rendition', 'LD picture format', 'LD video noise', 'LD video artifacts', 'LD release country', 'LD sharpness', 'LD dynamic range', 'LD audio noise', 'LD color information', 'LD group genre', 'LD quality program', 'LD close captions-teletext-ld-g', 'LD category', 'LD analog left', 'LD certification', 
'LD audio quality', 'LD video quality', 'LD aspect ratio', 'LD analog right', 'LD additional information', 'LD number of chapter stops', 'LD dialogue intellegibility', 'LD disc size', 'LD master format', 'LD subtitles', 'LD status of availablility', 'LD quality of source', 'LD number of sides', 'LD video standard', 'LD supplement', 'LD original title', 'LD sound encoding', 'LD number', 'LD label', 'LD catalog number', 'LD laserdisc title', 'screenplay-teleplay', 'novel', 'adaption', 'book', 'production process protocol', 'printed media reviews', 'essays', 'other literature', 'mpaa', 'plot', 'votes distribution', 'votes', 'rating', 'production dates', 'copyright holder', 'filming dates', 'budget', 'weekend gross', 'gross', 'opening weekend', 'rentals', 'admissions', 'studios', 'top 250 rank', 'bottom 10 rank')} compCastTypeDefs = {'kind': ('cast', 'crew', 'complete', 'complete+verified')} linkTypeDefs = {'link': ('follows', 'followed by', 'remake of', 'remade as', 'references', 'referenced in', 'spoofs', 'spoofed in', 'features', 'featured in', 'spin off from', 'spin off', 'version of', 'similar to', 'edited into', 'edited from', 'alternate language version of', 'unknown link')} roleTypeDefs = {'role': ('actor', 'actress', 'producer', 'writer', 'cinematographer', 'composer', 'costume designer', 'director', 'editor', 'miscellaneous crew', 'production designer', 'guest')} # Schema of tables in our database. # XXX: Foreign keys can be used to create constrains between tables, # but they create indexes in the database, and this # means poor performances at insert-time. DB_SCHEMA = [ DBTable('Name', # namePcodeCf is the soundex of the name in the canonical format. # namePcodeNf is the soundex of the name in the normal format, if # different from namePcodeCf. # surnamePcode is the soundex of the surname, if different from the # other two values. 
# The 'id' column is simply skipped by SQLObject (it's a default); # the alternateID attribute here will be ignored by SQLAlchemy. DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), DBCol('imdbIndex', UNICODECOL, length=12, default=None), DBCol('imdbID', INTCOL, default=None, index='idx_imdb_id'), DBCol('gender', STRINGCOL, length=1, default=None), DBCol('namePcodeCf', STRINGCOL, length=5, default=None, index='idx_pcodecf'), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), DBCol('surnamePcode', STRINGCOL, length=5, default=None, index='idx_pcode'), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('CharName', # namePcodeNf is the soundex of the name in the normal format. # surnamePcode is the soundex of the surname, if different # from namePcodeNf. DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), DBCol('imdbIndex', UNICODECOL, length=12, default=None), DBCol('imdbID', INTCOL, default=None), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), DBCol('surnamePcode', STRINGCOL, length=5, default=None, index='idx_pcode'), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('CompanyName', # namePcodeNf is the soundex of the name in the normal format. # namePcodeSf is the soundex of the name plus the country code. 
DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('name', UNICODECOL, notNone=True, index='idx_name', indexLen=6), DBCol('countryCode', UNICODECOL, length=255, default=None), DBCol('imdbID', INTCOL, default=None), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), DBCol('namePcodeSf', STRINGCOL, length=5, default=None, index='idx_pcodesf'), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('KindType', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('kind', STRINGCOL, length=15, default=None, alternateID=True), values=kindTypeDefs ), DBTable('Title', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('title', UNICODECOL, notNone=True, index='idx_title', indexLen=10), DBCol('imdbIndex', UNICODECOL, length=12, default=None), DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'), DBCol('productionYear', INTCOL, default=None), DBCol('imdbID', INTCOL, default=None, index="idx_imdb_id"), DBCol('phoneticCode', STRINGCOL, length=5, default=None, index='idx_pcode'), DBCol('episodeOfID', INTCOL, default=None, index='idx_epof', foreignKey='Title'), DBCol('seasonNr', INTCOL, default=None, index="idx_season_nr"), DBCol('episodeNr', INTCOL, default=None, index="idx_episode_nr"), # Maximum observed length is 44; 49 can store 5 comma-separated # year-year pairs. 
DBCol('seriesYears', STRINGCOL, length=49, default=None), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('CompanyType', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('kind', STRINGCOL, length=32, default=None, alternateID=True), values=companyTypeDefs ), DBTable('AkaName', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('personID', INTCOL, notNone=True, index='idx_person', foreignKey='Name'), DBCol('name', UNICODECOL, notNone=True), DBCol('imdbIndex', UNICODECOL, length=12, default=None), DBCol('namePcodeCf', STRINGCOL, length=5, default=None, index='idx_pcodecf'), DBCol('namePcodeNf', STRINGCOL, length=5, default=None, index='idx_pcodenf'), DBCol('surnamePcode', STRINGCOL, length=5, default=None, index='idx_pcode'), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('AkaTitle', # XXX: It's safer to set notNone to False, here. # alias for akas are stored completely in the AkaTitle table; # this means that episodes will set also a "tv series" alias name. # Reading the aka-title.list file it looks like there are # episode titles with aliases to different titles for both # the episode and the series title, while for just the series # there are no aliases. 
# E.g.: # aka title original title # "Series, The" (2005) {The Episode} "Other Title" (2005) {Other Title} # But there is no: # "Series, The" (2005) "Other Title" (2005) DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_movieid', foreignKey='Title'), DBCol('title', UNICODECOL, notNone=True), DBCol('imdbIndex', UNICODECOL, length=12, default=None), DBCol('kindID', INTCOL, notNone=True, foreignKey='KindType'), DBCol('productionYear', INTCOL, default=None), DBCol('phoneticCode', STRINGCOL, length=5, default=None, index='idx_pcode'), DBCol('episodeOfID', INTCOL, default=None, index='idx_epof', foreignKey='AkaTitle'), DBCol('seasonNr', INTCOL, default=None), DBCol('episodeNr', INTCOL, default=None), DBCol('note', UNICODECOL, default=None), DBCol('md5sum', STRINGCOL, length=32, default=None, index='idx_md5') ), DBTable('RoleType', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('role', STRINGCOL, length=32, notNone=True, alternateID=True), values=roleTypeDefs ), DBTable('CastInfo', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('personID', INTCOL, notNone=True, index='idx_pid', foreignKey='Name'), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('personRoleID', INTCOL, default=None, index='idx_cid', foreignKey='CharName'), DBCol('note', UNICODECOL, default=None), DBCol('nrOrder', INTCOL, default=None), DBCol('roleID', INTCOL, notNone=True, foreignKey='RoleType') ), DBTable('CompCastType', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('kind', STRINGCOL, length=32, notNone=True, alternateID=True), values=compCastTypeDefs ), DBTable('CompleteCast', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, index='idx_mid', foreignKey='Title'), DBCol('subjectID', INTCOL, notNone=True, foreignKey='CompCastType'), DBCol('statusID', INTCOL, notNone=True, foreignKey='CompCastType') ), DBTable('InfoType', DBCol('id', INTCOL, notNone=True, 
alternateID=True), DBCol('info', STRINGCOL, length=32, notNone=True, alternateID=True), values=infoTypeDefs ), DBTable('LinkType', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('link', STRINGCOL, length=32, notNone=True, alternateID=True), values=linkTypeDefs ), DBTable('Keyword', DBCol('id', INTCOL, notNone=True, alternateID=True), # XXX: can't use alternateID=True, because it would create # a UNIQUE index; unfortunately (at least with a common # collation like utf8_unicode_ci) MySQL will consider # some different keywords identical - like # "fiancée" and "fiancee". DBCol('keyword', UNICODECOL, notNone=True, index='idx_keyword', indexLen=5), DBCol('phoneticCode', STRINGCOL, length=5, default=None, index='idx_pcode') ), DBTable('MovieKeyword', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('keywordID', INTCOL, notNone=True, index='idx_keywordid', foreignKey='Keyword') ), DBTable('MovieLink', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('linkedMovieID', INTCOL, notNone=True, foreignKey='Title'), DBCol('linkTypeID', INTCOL, notNone=True, foreignKey='LinkType') ), DBTable('MovieInfo', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'), DBCol('info', UNICODECOL, notNone=True), DBCol('note', UNICODECOL, default=None) ), # This table is identical to MovieInfo, except that both 'infoTypeID' # and 'info' are indexed. 
DBTable('MovieInfoIdx', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('infoTypeID', INTCOL, notNone=True, index='idx_infotypeid', foreignKey='InfoType'), DBCol('info', UNICODECOL, notNone=True, index='idx_info', indexLen=10), DBCol('note', UNICODECOL, default=None) ), DBTable('MovieCompanies', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('movieID', INTCOL, notNone=True, index='idx_mid', foreignKey='Title'), DBCol('companyID', INTCOL, notNone=True, index='idx_cid', foreignKey='CompanyName'), DBCol('companyTypeID', INTCOL, notNone=True, foreignKey='CompanyType'), DBCol('note', UNICODECOL, default=None) ), DBTable('PersonInfo', DBCol('id', INTCOL, notNone=True, alternateID=True), DBCol('personID', INTCOL, notNone=True, index='idx_pid', foreignKey='Name'), DBCol('infoTypeID', INTCOL, notNone=True, foreignKey='InfoType'), DBCol('info', UNICODECOL, notNone=True), DBCol('note', UNICODECOL, default=None) ) ] # Functions to manage tables. def dropTables(tables, ifExists=True): """Drop the tables.""" # In reverse order (useful to avoid errors about foreign keys). DB_TABLES_DROP = list(tables) DB_TABLES_DROP.reverse() for table in DB_TABLES_DROP: _dbschema_logger.info('dropping table %s', table._imdbpyName) table.dropTable(ifExists) def createTables(tables, ifNotExists=True): """Create the tables and insert default values.""" for table in tables: # Create the table. _dbschema_logger.info('creating table %s', table._imdbpyName) table.createTable(ifNotExists) # Insert default values, if any. if table._imdbpySchema.values: _dbschema_logger.info('inserting values into table %s', table._imdbpyName) for key in table._imdbpySchema.values: for value in table._imdbpySchema.values[key]: table(**{key: unicode(value)}) def createIndexes(tables, ifNotExists=True): """Create the indexes in the database. 
Return a list of errors, if any.""" errors = [] for table in tables: _dbschema_logger.info('creating indexes for table %s', table._imdbpyName) try: table.addIndexes(ifNotExists) except Exception, e: errors.append(e) continue return errors def createForeignKeys(tables, ifNotExists=True): """Create Foreign Keys. Return a list of errors, if any.""" errors = [] mapTables = {} for table in tables: mapTables[table._imdbpyName] = table for table in tables: _dbschema_logger.info('creating foreign keys for table %s', table._imdbpyName) try: table.addForeignKeys(mapTables, ifNotExists) except Exception, e: errors.append(e) continue return errors ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/__init__.py�������������������������������������������������������������������������0000644�0000000�0000000�00000120073�11766731642�014154� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" imdb package. This package can be used to retrieve information about a movie or a person from the IMDb database. It can fetch data through different media (e.g.: the IMDb web pages, a SQL database, etc.) 
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

# Public API of the imdb package.
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
           'available_access_systems']
__version__ = VERSION = '4.9'

# Import compatibility module (importing it is enough).
import _compat
import sys, os, ConfigParser, logging
from types import MethodType
from imdb import Movie, Person, Character, Company
import imdb._logging
from imdb._exceptions import IMDbError, IMDbDataAccessError, IMDbParserError
from imdb.utils import build_title, build_name, build_company_name

_aux_logger = logging.getLogger('imdbpy.aux')

# URLs of the main pages for movies, persons, characters and queries.
imdbURL_base = 'http://akas.imdb.com/'

# NOTE: the urls below will be removed in a future version.
#       please use the values in the 'urls' attribute
#       of the IMDbBase subclass instance.
# http://akas.imdb.com/title/
imdbURL_movie_base = '%stitle/' % imdbURL_base
# http://akas.imdb.com/title/tt%s/
imdbURL_movie_main = imdbURL_movie_base + 'tt%s/'
# http://akas.imdb.com/name/
imdbURL_person_base = '%sname/' % imdbURL_base
# http://akas.imdb.com/name/nm%s/
imdbURL_person_main = imdbURL_person_base + 'nm%s/'
# http://akas.imdb.com/character/
imdbURL_character_base = '%scharacter/' % imdbURL_base
# http://akas.imdb.com/character/ch%s/
imdbURL_character_main = imdbURL_character_base + 'ch%s/'
# http://akas.imdb.com/company/
imdbURL_company_base = '%scompany/' % imdbURL_base
# http://akas.imdb.com/company/co%s/
imdbURL_company_main = imdbURL_company_base + 'co%s/'
# http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main = imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top
imdbURL_top250 = imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom
imdbURL_bottom100 = imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s
imdbURL_find = imdbURL_base + 'find?%s'

# Name of the configuration file.
confFileName = 'imdbpy.cfg'

class ConfigParserWithCase(ConfigParser.ConfigParser):
    """A case-sensitive parser for configuration files."""
    def __init__(self, defaults=None, confFile=None, *args, **kwds):
        """Initialize the parser.

        *defaults* -- defaults values.
        *confFile* -- the file (or list of files) to parse.
        If confFile is None, a list of candidate locations is built
        (current directory, home directory and system-wide paths) and
        reading stops at the first file with an [imdbpy] section."""
        ConfigParser.ConfigParser.__init__(self, defaults=defaults)
        if confFile is None:
            dotFileName = '.' + confFileName
            # Current and home directory.
            confFile = [os.path.join(os.getcwd(), confFileName),
                        os.path.join(os.getcwd(), dotFileName),
                        os.path.join(os.path.expanduser('~'), confFileName),
                        os.path.join(os.path.expanduser('~'), dotFileName)]
            if os.name == 'posix':
                sep = getattr(os.path, 'sep', '/')
                # /etc/ and /etc/conf.d/
                confFile.append(os.path.join(sep, 'etc', confFileName))
                confFile.append(os.path.join(sep, 'etc', 'conf.d',
                                            confFileName))
            else:
                # etc subdirectory of sys.prefix, for non-unix systems.
                confFile.append(os.path.join(sys.prefix, 'etc', confFileName))
        for fname in confFile:
            try:
                self.read(fname)
            except (ConfigParser.MissingSectionHeaderError,
                    ConfigParser.ParsingError), e:
                _aux_logger.warn('Troubles reading config file: %s' % e)
            # Stop at the first valid file.
            if self.has_section('imdbpy'):
                break

    def optionxform(self, optionstr):
        """Option names are case sensitive."""
        return optionstr

    def _manageValue(self, value):
        """Custom substitutions for values: booleans become Python
        booleans and the string 'none' becomes None."""
        if not isinstance(value, (str, unicode)):
            return value
        vlower = value.lower()
        if vlower in self._boolean_states:
            return self._boolean_states[vlower]
        elif vlower == 'none':
            return None
        return value

    def get(self, section, option, *args, **kwds):
        """Return the value of an option from a given section."""
        value = ConfigParser.ConfigParser.get(self, section, option,
                                            *args, **kwds)
        return self._manageValue(value)

    def items(self, section, *args, **kwds):
        """Return a list of (key, value) tuples of items of the
        given section."""
        if section != 'DEFAULT' and not self.has_section(section):
            return []
        keys = ConfigParser.ConfigParser.options(self, section)
        return [(k, self.get(section, k, *args, **kwds)) for k in keys]

    def getDict(self, section):
        """Return a dictionary of items of the specified section."""
        return dict(self.items(section))


def IMDb(accessSystem=None, *arguments, **keywords):
    """Return an instance of the appropriate class.
    The accessSystem parameter is used to specify the kind of
    the preferred access system."""
    if accessSystem is None or accessSystem in ('auto', 'config'):
        try:
            cfg_file = ConfigParserWithCase(*arguments, **keywords)
            # Parameters set by the code take precedence.
            kwds = cfg_file.getDict('imdbpy')
            if 'accessSystem' in kwds:
                accessSystem = kwds['accessSystem']
                del kwds['accessSystem']
            else:
                accessSystem = 'http'
            kwds.update(keywords)
            keywords = kwds
        except Exception, e:
            logging.getLogger('imdbpy').warn('Unable to read configuration' \
                                            ' file; complete error: %s' % e)
            # It just LOOKS LIKE a bad habit: we tried to read config
            # options from some files, but something is gone horribly
            # wrong: ignore everything and pretend we were called with
            # the 'http' accessSystem.
            accessSystem = 'http'
    if 'loggingLevel' in keywords:
        imdb._logging.setLevel(keywords['loggingLevel'])
        del keywords['loggingLevel']
    if 'loggingConfig' in keywords:
        logCfg = keywords['loggingConfig']
        del keywords['loggingConfig']
        try:
            import logging.config
            logging.config.fileConfig(os.path.expanduser(logCfg))
        except Exception, e:
            logging.getLogger('imdbpy').warn('unable to read logger ' \
                                            'config: %s' % e)
    # 'httpThin' is gone; silently fall back to plain 'http'.
    if accessSystem in ('httpThin', 'webThin', 'htmlThin'):
        logging.warn('httpThin was removed since IMDbPY 4.8')
        accessSystem = 'http'
    if accessSystem in ('http', 'web', 'html'):
        from parser.http import IMDbHTTPAccessSystem
        return IMDbHTTPAccessSystem(*arguments, **keywords)
    elif accessSystem in ('mobile',):
        from parser.mobile import IMDbMobileAccessSystem
        return IMDbMobileAccessSystem(*arguments, **keywords)
    elif accessSystem in ('local', 'files'):
        # The local access system was removed since IMDbPY 4.2.
        raise IMDbError('the local access system was removed since IMDbPY 4.2')
    elif accessSystem in ('sql', 'db', 'database'):
        try:
            from parser.sql import IMDbSqlAccessSystem
        except ImportError:
            raise IMDbError('the sql access system is not installed')
        return IMDbSqlAccessSystem(*arguments, **keywords)
    else:
        raise IMDbError('unknown kind of data access system: "%s"' \
                        % accessSystem)


def available_access_systems():
    """Return the list of available data access systems."""
    asList = []
    # XXX: trying to import modules is a good thing?
    try:
        from parser.http import IMDbHTTPAccessSystem
        asList.append('http')
    except ImportError:
        pass
    try:
        from parser.mobile import IMDbMobileAccessSystem
        asList.append('mobile')
    except ImportError:
        pass
    try:
        from parser.sql import IMDbSqlAccessSystem
        asList.append('sql')
    except ImportError:
        pass
    return asList


# XXX: I'm not sure this is a good guess.
#      I suppose that an argument of the IMDb function can be used to
#      set a default encoding for the output, and then Movie, Person and
#      Character objects can use this default encoding, returning strings.
#      Anyway, passing unicode strings to search_movie(), search_person()
#      and search_character() methods is always safer.
encoding = getattr(sys.stdin, 'encoding', '') or sys.getdefaultencoding()


class IMDbBase:
    """The base class used to search for a movie/person/character and
    to get a Movie/Person/Character object.

    This class cannot directly fetch data of any kind and so you
    have to search the "real" code into a subclass."""

    # The name of the preferred access system (MUST be overridden
    # in the subclasses).
    accessSystem = 'UNKNOWN'

    # Top-level logger for IMDbPY.
    _imdb_logger = logging.getLogger('imdbpy')

    # Whether to re-raise caught exceptions or not.
    _reraise_exceptions = False

    def __init__(self, defaultModFunct=None, results=20, keywordsResults=100,
                *arguments, **keywords):
        """Initialize the access system.
        If specified, defaultModFunct is the function used by
        default by the Person, Movie and Character objects, when
        accessing their text fields.
        """
        # The function used to output the strings that need modification (the
        # ones containing references to movie titles and person names).
        self._defModFunct = defaultModFunct
        # Number of results to get.
try: results = int(results) except (TypeError, ValueError): results = 20 if results < 1: results = 20 self._results = results try: keywordsResults = int(keywordsResults) except (TypeError, ValueError): keywordsResults = 100 if keywordsResults < 1: keywordsResults = 100 self._keywordsResults = keywordsResults self._reraise_exceptions = keywords.get('reraiseExceptions') or False self.set_imdb_urls(keywords.get('imdbURL_base') or imdbURL_base) def set_imdb_urls(self, imdbURL_base): """Set the urls used accessing the IMDb site.""" imdbURL_base = imdbURL_base.strip().strip('"\'') if not imdbURL_base.startswith('http://'): imdbURL_base = 'http://%s' % imdbURL_base if not imdbURL_base.endswith('/'): imdbURL_base = '%s/' % imdbURL_base # http://akas.imdb.com/title/ imdbURL_movie_base='%stitle/' % imdbURL_base # http://akas.imdb.com/title/tt%s/ imdbURL_movie_main=imdbURL_movie_base + 'tt%s/' # http://akas.imdb.com/name/ imdbURL_person_base='%sname/' % imdbURL_base # http://akas.imdb.com/name/nm%s/ imdbURL_person_main=imdbURL_person_base + 'nm%s/' # http://akas.imdb.com/character/ imdbURL_character_base='%scharacter/' % imdbURL_base # http://akas.imdb.com/character/ch%s/ imdbURL_character_main=imdbURL_character_base + 'ch%s/' # http://akas.imdb.com/company/ imdbURL_company_base='%scompany/' % imdbURL_base # http://akas.imdb.com/company/co%s/ imdbURL_company_main=imdbURL_company_base + 'co%s/' # http://akas.imdb.com/keyword/%s/ imdbURL_keyword_main=imdbURL_base + 'keyword/%s/' # http://akas.imdb.com/chart/top imdbURL_top250=imdbURL_base + 'chart/top', # http://akas.imdb.com/chart/bottom imdbURL_bottom100=imdbURL_base + 'chart/bottom' # http://akas.imdb.com/find?%s imdbURL_find=imdbURL_base + 'find?%s' self.urls = dict( movie_base=imdbURL_movie_base, movie_main=imdbURL_movie_main, person_base=imdbURL_person_base, person_main=imdbURL_person_main, character_base=imdbURL_character_base, character_main=imdbURL_character_main, company_base=imdbURL_company_base, 
company_main=imdbURL_company_main, keyword_main=imdbURL_keyword_main, top250=imdbURL_top250, bottom100=imdbURL_bottom100, find=imdbURL_find) def _normalize_movieID(self, movieID): """Normalize the given movieID.""" # By default, do nothing. return movieID def _normalize_personID(self, personID): """Normalize the given personID.""" # By default, do nothing. return personID def _normalize_characterID(self, characterID): """Normalize the given characterID.""" # By default, do nothing. return characterID def _normalize_companyID(self, companyID): """Normalize the given companyID.""" # By default, do nothing. return companyID def _get_real_movieID(self, movieID): """Handle title aliases.""" # By default, do nothing. return movieID def _get_real_personID(self, personID): """Handle name aliases.""" # By default, do nothing. return personID def _get_real_characterID(self, characterID): """Handle character name aliases.""" # By default, do nothing. return characterID def _get_real_companyID(self, companyID): """Handle company name aliases.""" # By default, do nothing. 
return companyID def _get_infoset(self, prefname): """Return methods with the name starting with prefname.""" infoset = [] excludes = ('%sinfoset' % prefname,) preflen = len(prefname) for name in dir(self.__class__): if name.startswith(prefname) and name not in excludes: member = getattr(self.__class__, name) if isinstance(member, MethodType): infoset.append(name[preflen:].replace('_', ' ')) return infoset def get_movie_infoset(self): """Return the list of info set available for movies.""" return self._get_infoset('get_movie_') def get_person_infoset(self): """Return the list of info set available for persons.""" return self._get_infoset('get_person_') def get_character_infoset(self): """Return the list of info set available for characters.""" return self._get_infoset('get_character_') def get_company_infoset(self): """Return the list of info set available for companies.""" return self._get_infoset('get_company_') def get_movie(self, movieID, info=Movie.Movie.default_info, modFunct=None): """Return a Movie object for the given movieID. The movieID is something used to univocally identify a movie; it can be the imdbID used by the IMDb web server, a file pointer, a line number in a file, an ID in a database, etc. info is the list of sets of information to retrieve. If specified, modFunct will be the function used by the Movie object when accessing its text fields (like 'plot').""" movieID = self._normalize_movieID(movieID) movieID = self._get_real_movieID(movieID) movie = Movie.Movie(movieID=movieID, accessSystem=self.accessSystem) modFunct = modFunct or self._defModFunct if modFunct is not None: movie.set_mod_funct(modFunct) self.update(movie, info) return movie get_episode = get_movie def _search_movie(self, title, results): """Return a list of tuples (movieID, {movieData})""" # XXX: for the real implementation, see the method of the # subclass, somewhere under the imdb.parser package. 
        raise NotImplementedError('override this method')

    def search_movie(self, title, results=None, _episodes=False):
        """Return a list of Movie objects for a query for the given title.
        The results argument is the maximum number of results to return."""
        if results is None:
            results = self._results
        try:
            results = int(results)
        except (ValueError, OverflowError):
            # NOTE(review): int(None) would raise an uncaught TypeError
            # here; __init__ catches (TypeError, ValueError) instead —
            # confirm whether the asymmetry is intentional.
            results = 20
        # XXX: I suppose it will be much safer if the user provides
        #      an unicode string... this is just a guess.
        if not isinstance(title, unicode):
            title = unicode(title, encoding, 'replace')
        if not _episodes:
            res = self._search_movie(title, results)
        else:
            res = self._search_episode(title, results)
        # Wrap the raw (movieID, movieData) tuples into Movie objects.
        return [Movie.Movie(movieID=self._get_real_movieID(mi),
                data=md, modFunct=self._defModFunct,
                accessSystem=self.accessSystem) for mi, md in res][:results]

    def _search_episode(self, title, results):
        """Return a list of tuples (movieID, {movieData})"""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def search_episode(self, title, results=None):
        """Return a list of Movie objects for a query for the given title.
        The results argument is the maximum number of results to return;
        this method searches only for titles of tv (mini) series' episodes."""
        return self.search_movie(title, results=results, _episodes=True)

    def get_person(self, personID, info=Person.Person.default_info,
                   modFunct=None):
        """Return a Person object for the given personID.

        The personID is something used to univocally identify a person;
        it can be the imdbID used by the IMDb web server, a file
        pointer, a line number in a file, an ID in a database, etc.

        info is the list of sets of information to retrieve.
        If specified, modFunct will be the function used by the Person
        object when accessing its text fields (like 'mini biography')."""
        personID = self._normalize_personID(personID)
        personID = self._get_real_personID(personID)
        person = Person.Person(personID=personID,
                               accessSystem=self.accessSystem)
        modFunct = modFunct or self._defModFunct
        if modFunct is not None:
            person.set_mod_funct(modFunct)
        self.update(person, info)
        return person

    def _search_person(self, name, results):
        """Return a list of tuples (personID, {personData})"""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def search_person(self, name, results=None):
        """Return a list of Person objects for a query for the given name.

        The results argument is the maximum number of results to return."""
        if results is None:
            results = self._results
        try:
            results = int(results)
        except (ValueError, OverflowError):
            results = 20
        # Decode byte strings with the module-level default encoding.
        if not isinstance(name, unicode):
            name = unicode(name, encoding, 'replace')
        res = self._search_person(name, results)
        # Wrap the raw (personID, personData) tuples into Person objects.
        return [Person.Person(personID=self._get_real_personID(pi),
                data=pd, modFunct=self._defModFunct,
                accessSystem=self.accessSystem) for pi, pd in res][:results]

    def get_character(self, characterID, info=Character.Character.default_info,
                      modFunct=None):
        """Return a Character object for the given characterID.

        The characterID is something used to univocally identify a character;
        it can be the imdbID used by the IMDb web server, a file
        pointer, a line number in a file, an ID in a database, etc.

        info is the list of sets of information to retrieve.
        If specified, modFunct will be the function used by the Character
        object when accessing its text fields (like 'biography')."""
        characterID = self._normalize_characterID(characterID)
        characterID = self._get_real_characterID(characterID)
        character = Character.Character(characterID=characterID,
                                        accessSystem=self.accessSystem)
        modFunct = modFunct or self._defModFunct
        if modFunct is not None:
            character.set_mod_funct(modFunct)
        self.update(character, info)
        return character

    def _search_character(self, name, results):
        """Return a list of tuples (characterID, {characterData})"""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def search_character(self, name, results=None):
        """Return a list of Character objects for a query for the given name.

        The results argument is the maximum number of results to return."""
        if results is None:
            results = self._results
        try:
            results = int(results)
        except (ValueError, OverflowError):
            results = 20
        if not isinstance(name, unicode):
            name = unicode(name, encoding, 'replace')
        res = self._search_character(name, results)
        # NOTE: the loop variable 'pi' is really a characterID; the name
        # was copied from search_person().
        return [Character.Character(characterID=self._get_real_characterID(pi),
                data=pd, modFunct=self._defModFunct,
                accessSystem=self.accessSystem) for pi, pd in res][:results]

    def get_company(self, companyID, info=Company.Company.default_info,
                    modFunct=None):
        """Return a Company object for the given companyID.

        The companyID is something used to univocally identify a company;
        it can be the imdbID used by the IMDb web server, a file
        pointer, a line number in a file, an ID in a database, etc.

        info is the list of sets of information to retrieve.
        If specified, modFunct will be the function used by the Company
        object when accessing its text fields (none, so far)."""
        companyID = self._normalize_companyID(companyID)
        companyID = self._get_real_companyID(companyID)
        company = Company.Company(companyID=companyID,
                                  accessSystem=self.accessSystem)
        modFunct = modFunct or self._defModFunct
        if modFunct is not None:
            company.set_mod_funct(modFunct)
        self.update(company, info)
        return company

    def _search_company(self, name, results):
        """Return a list of tuples (companyID, {companyData})"""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def search_company(self, name, results=None):
        """Return a list of Company objects for a query for the given name.

        The results argument is the maximum number of results to return."""
        if results is None:
            results = self._results
        try:
            results = int(results)
        except (ValueError, OverflowError):
            results = 20
        if not isinstance(name, unicode):
            name = unicode(name, encoding, 'replace')
        res = self._search_company(name, results)
        # 'pi' is really a companyID; the name was copied from
        # search_person().
        return [Company.Company(companyID=self._get_real_companyID(pi),
                data=pd, modFunct=self._defModFunct,
                accessSystem=self.accessSystem) for pi, pd in res][:results]

    def _search_keyword(self, keyword, results):
        """Return a list of 'keyword' strings."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def search_keyword(self, keyword, results=None):
        """Search for existing keywords, similar to the given one."""
        if results is None:
            # Keywords use their own (larger) default limit.
            results = self._keywordsResults
        try:
            results = int(results)
        except (ValueError, OverflowError):
            results = 100
        if not isinstance(keyword, unicode):
            keyword = unicode(keyword, encoding, 'replace')
        return self._search_keyword(keyword, results)

    def _get_keyword(self, keyword, results):
        """Return a list of tuples (movieID, {movieData})"""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def get_keyword(self, keyword, results=None):
        """Return a list of movies for the given keyword."""
        if results is None:
            results = self._keywordsResults
        try:
            results = int(results)
        except (ValueError, OverflowError):
            results = 100
        # XXX: I suppose it will be much safer if the user provides
        #      an unicode string... this is just a guess.
        if not isinstance(keyword, unicode):
            keyword = unicode(keyword, encoding, 'replace')
        res = self._get_keyword(keyword, results)
        # Wrap the raw (movieID, movieData) tuples into Movie objects.
        return [Movie.Movie(movieID=self._get_real_movieID(mi),
                data=md, modFunct=self._defModFunct,
                accessSystem=self.accessSystem) for mi, md in res][:results]

    def _get_top_bottom_movies(self, kind):
        """Return the list of the top 250 or bottom 100 movies."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        #      This method must return a list of (movieID, {movieDict})
        #      tuples.  The kind parameter can be 'top' or 'bottom'.
raise NotImplementedError('override this method') def get_top250_movies(self): """Return the list of the top 250 movies.""" res = self._get_top_bottom_movies('top') return [Movie.Movie(movieID=self._get_real_movieID(mi), data=md, modFunct=self._defModFunct, accessSystem=self.accessSystem) for mi, md in res] def get_bottom100_movies(self): """Return the list of the bottom 100 movies.""" res = self._get_top_bottom_movies('bottom') return [Movie.Movie(movieID=self._get_real_movieID(mi), data=md, modFunct=self._defModFunct, accessSystem=self.accessSystem) for mi, md in res] def new_movie(self, *arguments, **keywords): """Return a Movie object.""" # XXX: not really useful... if 'title' in keywords: if not isinstance(keywords['title'], unicode): keywords['title'] = unicode(keywords['title'], encoding, 'replace') elif len(arguments) > 1: if not isinstance(arguments[1], unicode): arguments[1] = unicode(arguments[1], encoding, 'replace') return Movie.Movie(accessSystem=self.accessSystem, *arguments, **keywords) def new_person(self, *arguments, **keywords): """Return a Person object.""" # XXX: not really useful... if 'name' in keywords: if not isinstance(keywords['name'], unicode): keywords['name'] = unicode(keywords['name'], encoding, 'replace') elif len(arguments) > 1: if not isinstance(arguments[1], unicode): arguments[1] = unicode(arguments[1], encoding, 'replace') return Person.Person(accessSystem=self.accessSystem, *arguments, **keywords) def new_character(self, *arguments, **keywords): """Return a Character object.""" # XXX: not really useful... 
if 'name' in keywords: if not isinstance(keywords['name'], unicode): keywords['name'] = unicode(keywords['name'], encoding, 'replace') elif len(arguments) > 1: if not isinstance(arguments[1], unicode): arguments[1] = unicode(arguments[1], encoding, 'replace') return Character.Character(accessSystem=self.accessSystem, *arguments, **keywords) def new_company(self, *arguments, **keywords): """Return a Company object.""" # XXX: not really useful... if 'name' in keywords: if not isinstance(keywords['name'], unicode): keywords['name'] = unicode(keywords['name'], encoding, 'replace') elif len(arguments) > 1: if not isinstance(arguments[1], unicode): arguments[1] = unicode(arguments[1], encoding, 'replace') return Company.Company(accessSystem=self.accessSystem, *arguments, **keywords) def update(self, mop, info=None, override=0): """Given a Movie, Person, Character or Company object with only partial information, retrieve the required set of information. info is the list of sets of information to retrieve. If override is set, the information are retrieved and updated even if they're already in the object.""" # XXX: should this be a method of the Movie/Person/Character/Company # classes? NO! What for instances created by external functions? mopID = None prefix = '' if isinstance(mop, Movie.Movie): mopID = mop.movieID prefix = 'movie' elif isinstance(mop, Person.Person): mopID = mop.personID prefix = 'person' elif isinstance(mop, Character.Character): mopID = mop.characterID prefix = 'character' elif isinstance(mop, Company.Company): mopID = mop.companyID prefix = 'company' else: raise IMDbError('object ' + repr(mop) + \ ' is not a Movie, Person, Character or Company instance') if mopID is None: # XXX: enough? It's obvious that there are Characters # objects without characterID, so I think they should # just do nothing, when an i.update(character) is tried. 
            if prefix == 'character':
                # Characters without an ID are silently ignored.
                return
            raise IMDbDataAccessError( \
                'the supplied object has null movieID, personID or companyID')
        # Fetch through the object's own access system, which may differ
        # from self's.
        if mop.accessSystem == self.accessSystem:
            aSystem = self
        else:
            aSystem = IMDb(mop.accessSystem)
        if info is None:
            info = mop.default_info
        elif info == 'all':
            if isinstance(mop, Movie.Movie):
                info = self.get_movie_infoset()
            elif isinstance(mop, Person.Person):
                info = self.get_person_infoset()
            elif isinstance(mop, Character.Character):
                info = self.get_character_infoset()
            else:
                info = self.get_company_infoset()
        if not isinstance(info, (tuple, list)):
            info = (info,)
        res = {}
        for i in info:
            # Skip info sets already present, unless override is set.
            if i in mop.current_info and not override:
                continue
            if not i:
                continue
            self._imdb_logger.debug('retrieving "%s" info set', i)
            # Info set 'plot' maps to method get_<prefix>_plot, and so on.
            try:
                method = getattr(aSystem, 'get_%s_%s' %
                                 (prefix, i.replace(' ', '_')))
            except AttributeError:
                self._imdb_logger.error('unknown information set "%s"', i)
                # Keeps going.
                method = lambda *x: {}
            try:
                ret = method(mopID)
            except Exception, e:
                self._imdb_logger.critical('caught an exception retrieving ' \
                                'or parsing "%s" info set for mopID ' \
                                '"%s" (accessSystem: %s)',
                                i, mopID, mop.accessSystem, exc_info=True)
                ret = {}
                # If requested by the user, reraise the exception.
                if self._reraise_exceptions:
                    raise
            keys = None
            if 'data' in ret:
                res.update(ret['data'])
                if isinstance(ret['data'], dict):
                    keys = ret['data'].keys()
            # A fetch may cover several info sets at once.
            if 'info sets' in ret:
                for ri in ret['info sets']:
                    mop.add_to_current_info(ri, keys, mainInfoset=i)
            else:
                mop.add_to_current_info(i, keys)
            if 'titlesRefs' in ret:
                mop.update_titlesRefs(ret['titlesRefs'])
            if 'namesRefs' in ret:
                mop.update_namesRefs(ret['namesRefs'])
            if 'charactersRefs' in ret:
                mop.update_charactersRefs(ret['charactersRefs'])
        mop.set_data(res, override=0)

    def get_imdbMovieID(self, movieID):
        """Translate a movieID in an imdbID (the ID used by the IMDb
        web server); must be overridden by the subclass."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def get_imdbPersonID(self, personID):
        """Translate a personID in a imdbID (the ID used by the IMDb
        web server); must be overridden by the subclass."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def get_imdbCharacterID(self, characterID):
        """Translate a characterID in a imdbID (the ID used by the IMDb
        web server); must be overridden by the subclass."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def get_imdbCompanyID(self, companyID):
        """Translate a companyID in a imdbID (the ID used by the IMDb
        web server); must be overridden by the subclass."""
        # XXX: for the real implementation, see the method of the
        #      subclass, somewhere under the imdb.parser package.
        raise NotImplementedError('override this method')

    def _searchIMDb(self, kind, ton):
        """Search the IMDb akas server for the given title or name."""
        # The Exact Primary search system has gone AWOL, so we resort
        # to the mobile search. :-/
        if not ton:
            return None
        aSystem = IMDb('mobile')
        # NOTE(review): an unknown 'kind' value leaves searchFunct
        # unbound and would raise NameError below — confirm callers only
        # ever pass 'tt', 'nm', 'char' or 'co'.
        if kind == 'tt':
            searchFunct = aSystem.search_movie
            check = 'long imdb canonical title'
        elif kind == 'nm':
            searchFunct = aSystem.search_person
            check = 'long imdb canonical name'
        elif kind == 'char':
            searchFunct = aSystem.search_character
            check = 'long imdb canonical name'
        elif kind == 'co':
            # XXX: are [COUNTRY] codes included in the results?
            searchFunct = aSystem.search_company
            check = 'long imdb name'
        try:
            searchRes = searchFunct(ton)
        except IMDbError:
            return None
        # When only one result is returned, assume it was from an
        # exact match.
        if len(searchRes) == 1:
            return searchRes[0].getID()
        for item in searchRes:
            # Return the first perfect match.
if item[check] == ton: return item.getID() return None def title2imdbID(self, title): """Translate a movie title (in the plain text data files format) to an imdbID. Try an Exact Primary Title search on IMDb; return None if it's unable to get the imdbID.""" return self._searchIMDb('tt', title) def name2imdbID(self, name): """Translate a person name in an imdbID. Try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID.""" return self._searchIMDb('tt', name) def character2imdbID(self, name): """Translate a character name in an imdbID. Try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID.""" return self._searchIMDb('char', name) def company2imdbID(self, name): """Translate a company name in an imdbID. Try an Exact Primary Name search on IMDb; return None if it's unable to get the imdbID.""" return self._searchIMDb('co', name) def get_imdbID(self, mop): """Return the imdbID for the given Movie, Person, Character or Company object.""" imdbID = None if mop.accessSystem == self.accessSystem: aSystem = self else: aSystem = IMDb(mop.accessSystem) if isinstance(mop, Movie.Movie): if mop.movieID is not None: imdbID = aSystem.get_imdbMovieID(mop.movieID) else: imdbID = aSystem.title2imdbID(build_title(mop, canonical=0, ptdf=1)) elif isinstance(mop, Person.Person): if mop.personID is not None: imdbID = aSystem.get_imdbPersonID(mop.personID) else: imdbID = aSystem.name2imdbID(build_name(mop, canonical=1)) elif isinstance(mop, Character.Character): if mop.characterID is not None: imdbID = aSystem.get_imdbCharacterID(mop.characterID) else: # canonical=0 ? 
imdbID = aSystem.character2imdbID(build_name(mop, canonical=1)) elif isinstance(mop, Company.Company): if mop.companyID is not None: imdbID = aSystem.get_imdbCompanyID(mop.companyID) else: imdbID = aSystem.company2imdbID(build_company_name(mop)) else: raise IMDbError('object ' + repr(mop) + \ ' is not a Movie, Person or Character instance') return imdbID def get_imdbURL(self, mop): """Return the main IMDb URL for the given Movie, Person, Character or Company object, or None if unable to get it.""" imdbID = self.get_imdbID(mop) if imdbID is None: return None if isinstance(mop, Movie.Movie): url_firstPart = imdbURL_movie_main elif isinstance(mop, Person.Person): url_firstPart = imdbURL_person_main elif isinstance(mop, Character.Character): url_firstPart = imdbURL_character_main elif isinstance(mop, Company.Company): url_firstPart = imdbURL_company_main else: raise IMDbError('object ' + repr(mop) + \ ' is not a Movie, Person, Character or Company instance') return url_firstPart % imdbID def get_special_methods(self): """Return the special methods defined by the subclass.""" sm_dict = {} base_methods = [] for name in dir(IMDbBase): member = getattr(IMDbBase, name) if isinstance(member, MethodType): base_methods.append(name) for name in dir(self.__class__): if name.startswith('_') or name in base_methods or \ name.startswith('get_movie_') or \ name.startswith('get_person_') or \ name.startswith('get_company_') or \ name.startswith('get_character_'): continue member = getattr(self.__class__, name) if isinstance(member, MethodType): sm_dict.update({name: member.__doc__}) return sm_dict 
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/Person.py���������������������������������������������������������������������������0000644�0000000�0000000�00000026365�11766731642�013674� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" Person module (imdb package). This module provides the Person class, used to store information about a given person. Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from copy import deepcopy from imdb.utils import analyze_name, build_name, normalizeName, \ flatten, _Container, cmpPeople class Person(_Container): """A Person. 
    Every information about a person can be accessed as:
        personObject['information']
    to get a list of the kind of information stored in a
    Person object, use the keys() method; some useful aliases
    are defined (as "biography" for the "mini biography" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'filmography', 'biography')

    # Aliases for some not-so-intuitive keys.
    # Maps the user-friendly spellings (left) to the canonical key
    # names actually stored in the data dictionary (right).
    keys_alias = {'biography': 'mini biography',
                  'bio': 'mini biography',
                  'aka': 'akas',
                  'also known as': 'akas',
                  'nick name': 'nick names',
                  'nicks': 'nick names',
                  'nickname': 'nick names',
                  'miscellaneouscrew': 'miscellaneous crew',
                  'crewmembers': 'miscellaneous crew',
                  'misc': 'miscellaneous crew',
                  'guest': 'notable tv guest appearances',
                  'guests': 'notable tv guest appearances',
                  'tv guest': 'notable tv guest appearances',
                  'guest appearances': 'notable tv guest appearances',
                  'spouses': 'spouse',
                  'salary': 'salary history',
                  'salaries': 'salary history',
                  'otherworks': 'other works',
                  "maltin's biography":
                        "biography from leonard maltin's movie encyclopedia",
                  "leonard maltin's biography":
                        "biography from leonard maltin's movie encyclopedia",
                  'real name': 'birth name',
                  'where are they now': 'where now',
                  'personal quotes': 'quotes',
                  'mini-biography author': 'imdb mini-biography by',
                  'biography author': 'imdb mini-biography by',
                  'genre': 'genres',
                  'portrayed': 'portrayed in',
                  'keys': 'keywords',
                  'trademarks': 'trade mark',
                  'trade mark': 'trade mark',
                  'trade marks': 'trade mark',
                  'trademark': 'trade mark',
                  'pictorials': 'pictorial',
                  'magazine covers': 'magazine cover photo',
                  'magazine-covers': 'magazine cover photo',
                  'tv series episodes': 'episodes',
                  'tv-series episodes': 'episodes',
                  'articles': 'article',
                  'keyword': 'keywords'}

    # 'nick names'???
keys_tomodify_list = ('mini biography', 'spouse', 'quotes', 'other works', 'salary history', 'trivia', 'trade mark', 'news', 'books', 'biographical movies', 'portrayed in', 'where now', 'interviews', 'article', "biography from leonard maltin's movie encyclopedia") cmpFunct = cmpPeople def _init(self, **kwds): """Initialize a Person object. *personID* -- the unique identifier for the person. *name* -- the name of the Person, if not in the data dictionary. *myName* -- the nickname you use for this person. *myID* -- your personal id for this person. *data* -- a dictionary used to initialize the object. *currentRole* -- a Character instance representing the current role or duty of a person in this movie, or a Person object representing the actor/actress who played a given character in a Movie. If a string is passed, an object is automatically build. *roleID* -- if available, the characterID/personID of the currentRole object. *roleIsPerson* -- when False (default) the currentRole is assumed to be a Character object, otherwise a Person. *notes* -- notes about the given person for a specific movie or role (e.g.: the alias used in the movie credits). *accessSystem* -- a string representing the data access system used. *titlesRefs* -- a dictionary with references to movies. *namesRefs* -- a dictionary with references to persons. *modFunct* -- function called returning text fields. *billingPos* -- position of this person in the credits list. """ name = kwds.get('name') if name and not self.data.has_key('name'): self.set_name(name) self.personID = kwds.get('personID', None) self.myName = kwds.get('myName', u'') self.billingPos = kwds.get('billingPos', None) def _reset(self): """Reset the Person object.""" self.personID = None self.myName = u'' self.billingPos = None def _clear(self): """Reset the dictionary.""" self.billingPos = None def set_name(self, name): """Set the name of the person.""" # XXX: convert name to unicode, if it's a plain string? 
d = analyze_name(name, canonical=1) self.data.update(d) def _additional_keys(self): """Valid keys to append to the data.keys() list.""" addkeys = [] if self.data.has_key('name'): addkeys += ['canonical name', 'long imdb name', 'long imdb canonical name'] if self.data.has_key('headshot'): addkeys += ['full-size headshot'] return addkeys def _getitem(self, key): """Handle special keys.""" if self.data.has_key('name'): if key == 'name': return normalizeName(self.data['name']) elif key == 'canonical name': return self.data['name'] elif key == 'long imdb name': return build_name(self.data, canonical=0) elif key == 'long imdb canonical name': return build_name(self.data) if key == 'full-size headshot' and self.data.has_key('headshot'): return self._re_fullsizeURL.sub('', self.data.get('headshot', '')) return None def getID(self): """Return the personID.""" return self.personID def __nonzero__(self): """The Person is "false" if the self.data does not contain a name.""" # XXX: check the name and the personID? if self.data.has_key('name'): return 1 return 0 def __contains__(self, item): """Return true if this Person has worked in the given Movie, or if the fiven Character was played by this Person.""" from Movie import Movie from Character import Character if isinstance(item, Movie): for m in flatten(self.data, yieldDictKeys=1, scalar=Movie): if item.isSame(m): return 1 elif isinstance(item, Character): for m in flatten(self.data, yieldDictKeys=1, scalar=Movie): if item.isSame(m.currentRole): return 1 return 0 def isSameName(self, other): """Return true if two persons have the same name and imdbIndex and/or personID. 
""" if not isinstance(other, self.__class__): return 0 if self.data.has_key('name') and \ other.data.has_key('name') and \ build_name(self.data, canonical=1) == \ build_name(other.data, canonical=1): return 1 if self.accessSystem == other.accessSystem and \ self.personID and self.personID == other.personID: return 1 return 0 isSamePerson = isSameName # XXX: just for backward compatiblity. def __deepcopy__(self, memo): """Return a deep copy of a Person instance.""" p = Person(name=u'', personID=self.personID, myName=self.myName, myID=self.myID, data=deepcopy(self.data, memo), currentRole=deepcopy(self.currentRole, memo), roleIsPerson=self._roleIsPerson, notes=self.notes, accessSystem=self.accessSystem, titlesRefs=deepcopy(self.titlesRefs, memo), namesRefs=deepcopy(self.namesRefs, memo), charactersRefs=deepcopy(self.charactersRefs, memo)) p.current_info = list(self.current_info) p.set_mod_funct(self.modFunct) p.billingPos = self.billingPos return p def __repr__(self): """String representation of a Person object.""" # XXX: add also currentRole and notes, if present? 
r = '<Person id:%s[%s] name:_%s_>' % (self.personID, self.accessSystem, self.get('long imdb canonical name')) if isinstance(r, unicode): r = r.encode('utf_8', 'replace') return r def __str__(self): """Simply print the short name.""" return self.get('name', u'').encode('utf_8', 'replace') def __unicode__(self): """Simply print the short title.""" return self.get('name', u'') def summary(self): """Return a string with a pretty-printed summary for the person.""" if not self: return u'' s = u'Person\n=====\nName: %s\n' % \ self.get('long imdb canonical name', u'') bdate = self.get('birth date') if bdate: s += u'Birth date: %s' % bdate bnotes = self.get('birth notes') if bnotes: s += u' (%s)' % bnotes s += u'.\n' ddate = self.get('death date') if ddate: s += u'Death date: %s' % ddate dnotes = self.get('death notes') if dnotes: s += u' (%s)' % dnotes s += u'.\n' bio = self.get('mini biography') if bio: s += u'Biography: %s\n' % bio[0] director = self.get('director') if director: d_list = [x.get('long imdb canonical title', u'') for x in director[:3]] s += u'Last movies directed: %s.\n' % u'; '.join(d_list) act = self.get('actor') or self.get('actress') if act: a_list = [x.get('long imdb canonical title', u'') for x in act[:5]] s += u'Last movies acted: %s.\n' % u'; '.join(a_list) return s ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/Movie.py����������������������������������������������������������������������������0000644�0000000�0000000�00000041601�11766731642�013473� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" Movie module (imdb package). This module provides the Movie class, used to store information about a given movie. Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from copy import deepcopy from imdb import linguistics from imdb.utils import analyze_title, build_title, canonicalTitle, \ flatten, _Container, cmpMovies class Movie(_Container): """A Movie. Every information about a movie can be accessed as: movieObject['information'] to get a list of the kind of information stored in a Movie object, use the keys() method; some useful aliases are defined (as "casting" for the "casting director" key); see the keys_alias dictionary. """ # The default sets of information retrieved. default_info = ('main', 'plot') # Aliases for some not-so-intuitive keys. 
# Body of the Movie class (the "class Movie(_Container):" header is defined
# above).  All dict.has_key() calls have been replaced with the equivalent
# "key in dict" form: has_key() is deprecated in Python 2 and removed in
# Python 3, while the "in" operator behaves identically on both.

# Aliases mapping not-so-intuitive access keys to the canonical key names
# actually stored in self.data.
keys_alias = {
    'tv schedule': 'airing',
    'user rating': 'rating',
    'plot summary': 'plot',
    'plot summaries': 'plot',
    'directed by': 'director',
    'created by': 'creator',
    'writing credits': 'writer',
    'produced by': 'producer',
    'original music by': 'original music',
    'non-original music by': 'non-original music',
    'music': 'original music',
    'cinematography by': 'cinematographer',
    'cinematography': 'cinematographer',
    'film editing by': 'editor',
    'film editing': 'editor',
    'editing': 'editor',
    'actors': 'cast',
    'actresses': 'cast',
    'casting by': 'casting director',
    'casting': 'casting director',
    'art direction by': 'art direction',
    'set decoration by': 'set decoration',
    'costume design by': 'costume designer',
    'costume design': 'costume designer',
    'makeup department': 'make up',
    'makeup': 'make up',
    'make-up': 'make up',
    'production management': 'production manager',
    'production company': 'production companies',
    'second unit director or assistant director': 'assistant director',
    'second unit director': 'assistant director',
    'sound department': 'sound crew',
    'costume and wardrobe department': 'costume department',
    'special effects by': 'special effects',
    'visual effects by': 'visual effects',
    'special effects company': 'special effects companies',
    'stunts': 'stunt performer',
    'other crew': 'miscellaneous crew',
    'misc crew': 'miscellaneous crew',
    'miscellaneouscrew': 'miscellaneous crew',
    'crewmembers': 'miscellaneous crew',
    'crew members': 'miscellaneous crew',
    'other companies': 'miscellaneous companies',
    'misc companies': 'miscellaneous companies',
    'miscellaneous company': 'miscellaneous companies',
    'misc company': 'miscellaneous companies',
    'other company': 'miscellaneous companies',
    'aka': 'akas',
    'also known as': 'akas',
    'country': 'countries',
    'production country': 'countries',
    'production countries': 'countries',
    'genre': 'genres',
    'runtime': 'runtimes',
    'lang': 'languages',
    'color': 'color info',
    'cover': 'cover url',
    'full-size cover': 'full-size cover url',
    'seasons': 'number of seasons',
    'language': 'languages',
    'certificate': 'certificates',
    'certifications': 'certificates',
    'certification': 'certificates',
    'miscellaneous links': 'misc links',
    'miscellaneous': 'misc links',
    'soundclips': 'sound clips',
    'videoclips': 'video clips',
    'photographs': 'photo sites',
    'distributor': 'distributors',
    'distribution': 'distributors',
    'distribution companies': 'distributors',
    'distribution company': 'distributors',
    'guest': 'guests',
    'guest appearances': 'guests',
    'tv guests': 'guests',
    'notable tv guest appearances': 'guests',
    'episodes cast': 'guests',
    'episodes number': 'number of episodes',
    'amazon review': 'amazon reviews',
    'merchandising': 'merchandising links',
    'merchandise': 'merchandising links',
    'sales': 'merchandising links',
    'faq': 'faqs',
    'parental guide': 'parents guide',
    'frequently asked questions': 'faqs'}

# Keys whose text values are passed through the modFunct callable.
keys_tomodify_list = ('plot', 'trivia', 'alternate versions', 'goofs',
                      'quotes', 'dvd', 'laserdisc', 'news', 'soundtrack',
                      'crazy credits', 'business', 'supplements',
                      'video review', 'faqs')

# Comparison function used to sort Movie instances (see cmpMovies in
# imdb.utils).
cmpFunct = cmpMovies

def _init(self, **kwds):
    """Initialize a Movie object.

    *movieID* -- the unique identifier for the movie.
    *title* -- the title of the Movie, if not in the data dictionary.
    *myTitle* -- your personal title for the movie.
    *myID* -- your personal identifier for the movie.
    *data* -- a dictionary used to initialize the object.
    *currentRole* -- a Character instance representing the current role
                     or duty of a person in this movie, or a Person
                     object representing the actor/actress who played
                     a given character in a Movie.  If a string is
                     passed, an object is automatically build.
    *roleID* -- if available, the characterID/personID of the currentRole
                object.
    *roleIsPerson* -- when False (default) the currentRole is assumed
                      to be a Character object, otherwise a Person.
    *notes* -- notes for the person referred in the currentRole
               attribute; e.g.: '(voice)'.
    *accessSystem* -- a string representing the data access system used.
    *titlesRefs* -- a dictionary with references to movies.
    *namesRefs* -- a dictionary with references to persons.
    *charactersRefs* -- a dictionary with references to characters.
    *modFunct* -- function called returning text fields.
    """
    title = kwds.get('title')
    # Don't overwrite a title already present in the data dictionary.
    if title and 'title' not in self.data:
        self.set_title(title)
    self.movieID = kwds.get('movieID', None)
    self.myTitle = kwds.get('myTitle', u'')

def _reset(self):
    """Reset the Movie object."""
    self.movieID = None
    self.myTitle = u''

def set_title(self, title):
    """Set the title of the movie."""
    # XXX: convert title to unicode, if it's a plain string?
    d_title = analyze_title(title)
    self.data.update(d_title)

def _additional_keys(self):
    """Valid keys to append to the data.keys() list."""
    addkeys = []
    if 'title' in self.data:
        addkeys += ['canonical title', 'long imdb title',
                    'long imdb canonical title', 'smart canonical title',
                    'smart long imdb canonical title']
    if 'episode of' in self.data:
        addkeys += ['long imdb episode title', 'series title',
                    'canonical series title', 'episode title',
                    'canonical episode title', 'smart canonical series title',
                    'smart canonical episode title']
    if 'cover url' in self.data:
        addkeys += ['full-size cover url']
    return addkeys

def guessLanguage(self):
    """Guess the language of the title of this movie; returns None
    if there are no hints."""
    lang = self.get('languages')
    if lang:
        lang = lang[0]
    else:
        # Fall back on the production country, if any.
        country = self.get('countries')
        if country:
            lang = linguistics.COUNTRY_LANG.get(country[0])
    return lang

def smartCanonicalTitle(self, title=None, lang=None):
    """Return the canonical title, guessing its language.
    The title can be forced with the 'title' argument (internally
    used) and the language can be forced with the 'lang' argument,
    otherwise it's auto-detected."""
    if title is None:
        title = self.data.get('title', u'')
    if lang is None:
        lang = self.guessLanguage()
    return canonicalTitle(title, lang=lang)

def _getitem(self, key):
    """Handle special keys."""
    if 'episode of' in self.data:
        if key == 'long imdb episode title':
            return build_title(self.data)
        elif key == 'series title':
            return self.data['episode of']['title']
        elif key == 'canonical series title':
            ser_title = self.data['episode of']['title']
            return canonicalTitle(ser_title)
        elif key == 'smart canonical series title':
            ser_title = self.data['episode of']['title']
            return self.smartCanonicalTitle(ser_title)
        elif key == 'episode title':
            return self.data.get('title', u'')
        elif key == 'canonical episode title':
            return canonicalTitle(self.data.get('title', u''))
        elif key == 'smart canonical episode title':
            return self.smartCanonicalTitle(self.data.get('title', u''))
    if 'title' in self.data:
        if key == 'title':
            return self.data['title']
        elif key == 'long imdb title':
            return build_title(self.data)
        elif key == 'canonical title':
            return canonicalTitle(self.data['title'])
        elif key == 'smart canonical title':
            return self.smartCanonicalTitle(self.data['title'])
        elif key == 'long imdb canonical title':
            return build_title(self.data, canonical=1)
        elif key == 'smart long imdb canonical title':
            return build_title(self.data, canonical=1,
                               lang=self.guessLanguage())
    if key == 'full-size cover url' and 'cover url' in self.data:
        return self._re_fullsizeURL.sub('', self.data.get('cover url', ''))
    return None

def getID(self):
    """Return the movieID."""
    return self.movieID

def __nonzero__(self):
    """The Movie is "false" if the self.data does not contain a title."""
    # XXX: check the title and the movieID?
    if 'title' in self.data:
        return 1
    return 0

def isSameTitle(self, other):
    """Return true if this and the compared object have the same
    long imdb title and/or movieID.
    """
    # XXX: obsolete?
    if not isinstance(other, self.__class__):
        return 0
    if 'title' in self.data and \
            'title' in other.data and \
            build_title(self.data, canonical=0) == \
            build_title(other.data, canonical=0):
        return 1
    if self.accessSystem == other.accessSystem and \
            self.movieID is not None and self.movieID == other.movieID:
        return 1
    return 0

isSameMovie = isSameTitle  # XXX: just for backward compatibility.

def __contains__(self, item):
    """Return true if the given Person object is listed in this Movie,
    or if the the given Character is represented in this Movie."""
    from Person import Person
    from Character import Character
    from Company import Company
    if isinstance(item, Person):
        for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                         toDescend=(list, dict, tuple, Movie)):
            if item.isSame(p):
                return 1
    elif isinstance(item, Character):
        for p in flatten(self.data, yieldDictKeys=1, scalar=Person,
                         toDescend=(list, dict, tuple, Movie)):
            if item.isSame(p.currentRole):
                return 1
    elif isinstance(item, Company):
        for c in flatten(self.data, yieldDictKeys=1, scalar=Company,
                         toDescend=(list, dict, tuple, Movie)):
            if item.isSame(c):
                return 1
    return 0

def __deepcopy__(self, memo):
    """Return a deep copy of a Movie instance."""
    m = Movie(title=u'', movieID=self.movieID, myTitle=self.myTitle,
              myID=self.myID, data=deepcopy(self.data, memo),
              currentRole=deepcopy(self.currentRole, memo),
              roleIsPerson=self._roleIsPerson,
              notes=self.notes, accessSystem=self.accessSystem,
              titlesRefs=deepcopy(self.titlesRefs, memo),
              namesRefs=deepcopy(self.namesRefs, memo),
              charactersRefs=deepcopy(self.charactersRefs, memo))
    m.current_info = list(self.current_info)
    m.set_mod_funct(self.modFunct)
    return m

def __repr__(self):
    """String representation of a Movie object."""
    # XXX: add also currentRole and notes, if present?
    # NOTE: self.has_key (from _Container) is kept here on purpose:
    # "key in self" would call __contains__ above, which tests for
    # cast membership, not for the presence of a data key.
    if self.has_key('long imdb episode title'):
        title = self.get('long imdb episode title')
    else:
        title = self.get('long imdb title')
    r = '<Movie id:%s[%s] title:_%s_>' % (self.movieID, self.accessSystem,
                                          title)
    if isinstance(r, unicode):
        r = r.encode('utf_8', 'replace')
    return r

def __str__(self):
    """Simply print the short title."""
    return self.get('title', u'').encode('utf_8', 'replace')

def __unicode__(self):
    """Simply print the short title."""
    return self.get('title', u'')

def summary(self):
    """Return a string with a pretty-printed summary for the movie."""
    if not self:
        return u''
    def _nameAndRole(personList, joiner=u', '):
        """Build a pretty string with name and role."""
        nl = []
        for person in personList:
            n = person.get('name', u'')
            if person.currentRole:
                n += u' (%s)' % person.currentRole
            nl.append(n)
        return joiner.join(nl)
    s = u'Movie\n=====\nTitle: %s\n' % \
        self.get('long imdb canonical title', u'')
    genres = self.get('genres')
    if genres:
        s += u'Genres: %s.\n' % u', '.join(genres)
    director = self.get('director')
    if director:
        s += u'Director: %s.\n' % _nameAndRole(director)
    writer = self.get('writer')
    if writer:
        s += u'Writer: %s.\n' % _nameAndRole(writer)
    cast = self.get('cast')
    if cast:
        cast = cast[:5]
        s += u'Cast: %s.\n' % _nameAndRole(cast)
    runtime = self.get('runtimes')
    if runtime:
        s += u'Runtime: %s.\n' % u', '.join(runtime)
    countries = self.get('countries')
    if countries:
        s += u'Country: %s.\n' % u', '.join(countries)
    lang = self.get('languages')
    if lang:
        s += u'Language: %s.\n' % u', '.join(lang)
    rating = self.get('rating')
    if rating:
        s += u'Rating: %s' % rating
        nr_votes = self.get('votes')
        if nr_votes:
            s += u' (%s votes)' % nr_votes
        s += u'.\n'
    plot = self.get('plot')
    if not plot:
        plot = self.get('plot summary')
        if plot:
            plot = [plot]
    if plot:
        plot = plot[0]
        # Strip the optional "::author" trailer.
        i = plot.find('::')
        if i != -1:
            plot = plot[:i]
        s += u'Plot: %s' % plot
    return s
�������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/utils.py����������������������������������������������������������������������������0000644�0000000�0000000�00000166271�11766731642�013567� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" utils module (imdb package). This module provides basic utilities for the imdb package. Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> 2009 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from __future__ import generators import re import string import logging from copy import copy, deepcopy from time import strptime, strftime from imdb import VERSION from imdb import linguistics from imdb._exceptions import IMDbParserError # Logger for imdb.utils module. 
# Logger for imdb.utils module.
_utils_logger = logging.getLogger('imdbpy.utils')

# The regular expression for the "long" year format of IMDb, like
# "(1998)" and "(1986/II)", where the optional roman number (that I call
# "imdbIndex" after the slash is used for movies with the same title
# and year of release.
# XXX: probably L, C, D and M are far too much! ;-)
re_year_index = re.compile(r'\(([0-9\?]{4}(/[IVXLCDM]+)?)\)')
re_extended_year_index = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?((?:[0-9\?]{4})(?:-[0-9\?]{4})?)(?:/([IVXLCDM]+)?)?\)')
re_remove_kind = re.compile(r'\((TV episode|TV Series|TV mini-series|TV|Video|Video Game)? ?')

# Match only the imdbIndex (for name strings).
re_index = re.compile(r'^\(([IVXLCDM]+)\)$')

# Match things inside parentheses.
re_parentheses = re.compile(r'(\(.*\))')

# Match the number of episodes.
# NOTE: now a raw string; the original plain string relied on the invalid
# escape sequences \s, \( and \d passing through unchanged.
re_episodes = re.compile(r'\s?\((\d+) episodes\)', re.I)
re_episode_info = re.compile(r'{\s*(.+?)?\s?(\([0-9\?]{4}-[0-9\?]{1,2}-[0-9\?]{1,2}\))?\s?(\(#[0-9]+\.[0-9]+\))?}')

# Common suffixes in surnames.
_sname_suffixes = ('de', 'la', 'der', 'den', 'del', 'y', 'da', 'van',
                   'e', 'von', 'the', 'di', 'du', 'el', 'al')

def canonicalName(name):
    """Return the given name in canonical "Surname, Name" format.
    It assumes that name is in the 'Name Surname' format."""
    # XXX: some statistics (as of 17 Apr 2008, over 2288622 names):
    #      - just a surname: 69476
    #      - single surname, single name: 2209656
    #      - composed surname, composed name: 9490
    #      - composed surname, single name: 67606
    #        (2: 59764, 3: 6862, 4: 728)
    #      - single surname, composed name: 242310
    #        (2: 229467, 3: 9901, 4: 2041, 5: 630)
    #      - Jr.: 8025
    # Don't convert names already in the canonical format.
    if name.find(', ') != -1:
        return name
    if isinstance(name, unicode):
        joiner = u'%s, %s'
        sur_joiner = u'%s %s'
        sur_space = u' %s'
        space = u' '
    else:
        joiner = '%s, %s'
        sur_joiner = '%s %s'
        sur_space = ' %s'
        space = ' '
    sname = name.split(' ')
    snl = len(sname)
    if snl == 2:
        # Just a name and a surname: how boring...
        name = joiner % (sname[1], sname[0])
    elif snl > 2:
        lsname = [x.lower() for x in sname]
        if snl == 3:
            _indexes = (0, snl-2)
        else:
            _indexes = (0, snl-2, snl-3)
        # Check for common surname prefixes at the beginning and near the end.
        for index in _indexes:
            if lsname[index] not in _sname_suffixes:
                continue
            try:
                # Build the surname.
                surn = sur_joiner % (sname[index], sname[index+1])
                del sname[index]
                del sname[index]
                try:
                    # Handle the "Jr." after the name.
                    if lsname[index+2].startswith('jr'):
                        surn += sur_space % sname[index]
                        del sname[index]
                except (IndexError, ValueError):
                    pass
                name = joiner % (surn, space.join(sname))
                break
            except ValueError:
                continue
        else:
            name = joiner % (sname[-1], space.join(sname[:-1]))
    return name

def normalizeName(name):
    """Return a name in the normal "Name Surname" format."""
    if isinstance(name, unicode):
        joiner = u'%s %s'
    else:
        joiner = '%s %s'
    sname = name.split(', ')
    if len(sname) == 2:
        name = joiner % (sname[1], sname[0])
    return name

def analyze_name(name, canonical=None):
    """Return a dictionary with the name and the optional imdbIndex
    keys, from the given string.

    If canonical is None (default), the name is stored in its own style.
    If canonical is True, the name is converted to canonical style.
    If canonical is False, the name is converted to normal format.

    raise an IMDbParserError exception if the name is not valid.
    """
    original_n = name
    name = name.strip()
    res = {}
    imdbIndex = ''
    opi = name.rfind('(')
    cpi = name.rfind(')')
    # Strip notes (but not if the name starts with a parenthesis).
    if opi not in (-1, 0) and cpi > opi:
        if re_index.match(name[opi:cpi+1]):
            imdbIndex = name[opi+1:cpi]
            name = name[:opi].rstrip()
        else:
            # XXX: for the birth and death dates case like " (1926-2004)"
            name = re_parentheses.sub('', name).strip()
    if not name:
        raise IMDbParserError('invalid name: "%s"' % original_n)
    if canonical is not None:
        if canonical:
            name = canonicalName(name)
        else:
            name = normalizeName(name)
    res['name'] = name
    if imdbIndex:
        res['imdbIndex'] = imdbIndex
    return res

def build_name(name_dict, canonical=None):
    """Given a dictionary that represents a "long" IMDb name,
    return a string.

    If canonical is None (default), the name is returned in the stored style.
    If canonical is True, the name is converted to canonical style.
    If canonical is False, the name is converted to normal format.
    """
    name = name_dict.get('canonical name') or name_dict.get('name', '')
    if not name:
        return ''
    if canonical is not None:
        if canonical:
            name = canonicalName(name)
        else:
            name = normalizeName(name)
    imdbIndex = name_dict.get('imdbIndex')
    if imdbIndex:
        name += ' (%s)' % imdbIndex
    return name

# XXX: here only for backward compatibility.  Find and remove any dependency.
_articles = linguistics.GENERIC_ARTICLES
_unicodeArticles = linguistics.toUnicode(_articles)
articlesDicts = linguistics.articlesDictsForLang(None)
spArticles = linguistics.spArticlesForLang(None)

def canonicalTitle(title, lang=None):
    """Return the title in the canonic format 'Movie Title, The';
    beware that it doesn't handle long imdb titles, but only the
    title portion, without year[/imdbIndex] or special markup.
    The 'lang' argument can be used to specify the language of the title.
    """
    isUnicode = isinstance(title, unicode)
    articlesDicts = linguistics.articlesDictsForLang(lang)
    try:
        # Already in canonical format: the last comma-separated chunk
        # is a known article.
        if title.split(', ')[-1].lower() in articlesDicts[isUnicode]:
            return title
    except IndexError:
        pass
    if isUnicode:
        _format = u'%s, %s'
    else:
        _format = '%s, %s'
    ltitle = title.lower()
    spArticles = linguistics.spArticlesForLang(lang)
    for article in spArticles[isUnicode]:
        if ltitle.startswith(article):
            lart = len(article)
            title = _format % (title[lart:], title[:lart])
            # Drop the trailing space of space-separated articles
            # (e.g. "The " -> ", The").
            if article[-1] == ' ':
                title = title[:-1]
            break
    return title

def normalizeTitle(title, lang=None):
    """Return the title in the normal "The Title" format;
    beware that it doesn't handle long imdb titles, but only the
    title portion, without year[/imdbIndex] or special markup.
    The 'lang' argument can be used to specify the language of the title.
    """
    isUnicode = isinstance(title, unicode)
    stitle = title.split(', ')
    articlesDicts = linguistics.articlesDictsForLang(lang)
    if len(stitle) > 1 and stitle[-1].lower() in articlesDicts[isUnicode]:
        sep = ' '
        # No separating space after articles ending with ' or -.
        if stitle[-1][-1] in ("'", '-'):
            sep = ''
        if isUnicode:
            _format = u'%s%s%s'
            _joiner = u', '
        else:
            _format = '%s%s%s'
            _joiner = ', '
        title = _format % (stitle[-1], sep, _joiner.join(stitle[:-1]))
    return title

def _split_series_episode(title):
    """Return the series and the episode titles; if this is not a
    series' episode, the returned series title is empty.
    This function recognize two different styles:
        "The Series" An Episode (2005)
        "The Series" (2004) {An Episode (2005) (#season.episode)}"""
    series_title = ''
    episode_or_year = ''
    if title[-1:] == '}':
        # Title of the episode, as in the plain text data files.
        begin_eps = title.rfind('{')
        if begin_eps == -1:
            return '', ''
        series_title = title[:begin_eps].rstrip()
        # episode_or_year is returned with the {...}
        episode_or_year = title[begin_eps:].strip()
        if episode_or_year[:12] == '{SUSPENDED}}':
            return '', ''
    # XXX: works only with tv series; it's still unclear whether
    #      IMDb will support episodes for tv mini series and tv movies...
    elif title[0:1] == '"':
        second_quot = title[1:].find('"') + 2
        if second_quot != 1:
            # a second " was found.
            episode_or_year = title[second_quot:].lstrip()
            first_char = episode_or_year[0:1]
            if not first_char:
                return '', ''
            if first_char != '(':
                # There is not a (year) but the title of the episode;
                # that means this is an episode title, as returned by
                # the web server.
                series_title = title[:second_quot]
    return series_title, episode_or_year

def is_series_episode(title):
    """Return True if 'title' is an series episode."""
    title = title.strip()
    if _split_series_episode(title)[0]:
        return 1
    return 0

def analyze_title(title, canonical=None, canonicalSeries=None,
                  canonicalEpisode=None, _emptyString=u''):
    """Analyze the given title and return a dictionary with the
    "stripped" title, the kind of the show ("movie", "tv series", etc.),
    the year of production and the optional imdbIndex (a roman number
    used to distinguish between movies with the same title and year).

    If canonical is None (default), the title is stored in its own style.
    If canonical is True, the title is converted to canonical style.
    If canonical is False, the title is converted to normal format.

    raise an IMDbParserError exception if the title is not valid.
    """
    # XXX: introduce the 'lang' argument?
    if canonical is not None:
        canonicalSeries = canonicalEpisode = canonical
    original_t = title
    result = {}
    title = title.strip()
    year = _emptyString
    kind = _emptyString
    imdbIndex = _emptyString
    series_title, episode_or_year = _split_series_episode(title)
    if series_title:
        # It's an episode of a series.
        series_d = analyze_title(series_title, canonical=canonicalSeries)
        oad = sen = ep_year = _emptyString
        # Plain text data files format.
        if episode_or_year[0:1] == '{' and episode_or_year[-1:] == '}':
            match = re_episode_info.findall(episode_or_year)
            if match:
                # Episode title, original air date and #season.episode
                episode_or_year, oad, sen = match[0]
                episode_or_year = episode_or_year.strip()
                if not oad:
                    # No year, but the title is something like (2005-04-12)
                    if episode_or_year and episode_or_year[0] == '(' and \
                            episode_or_year[-1:] == ')' and \
                            episode_or_year[1:2] != '#':
                        oad = episode_or_year
                        if oad[1:5] and oad[5:6] == '-':
                            try:
                                ep_year = int(oad[1:5])
                            except (TypeError, ValueError):
                                pass
                    if not oad and not sen and episode_or_year.startswith('(#'):
                        sen = episode_or_year
        elif episode_or_year.startswith('Episode dated'):
            oad = episode_or_year[14:]
            if oad[-4:].isdigit():
                try:
                    ep_year = int(oad[-4:])
                except (TypeError, ValueError):
                    pass
        episode_d = analyze_title(episode_or_year, canonical=canonicalEpisode)
        episode_d['kind'] = u'episode'
        episode_d['episode of'] = series_d
        if oad:
            episode_d['original air date'] = oad[1:-1]
            if ep_year and episode_d.get('year') is None:
                episode_d['year'] = ep_year
        if sen and sen[2:-1].find('.') != -1:
            seas, epn = sen[2:-1].split('.')
            if seas:
                # Set season and episode.
                # NOTE: narrowed from a bare "except:"; int() only raises
                # TypeError/ValueError, matching the handlers above.
                try:
                    seas = int(seas)
                except (TypeError, ValueError):
                    pass
                try:
                    epn = int(epn)
                except (TypeError, ValueError):
                    pass
                episode_d['season'] = seas
                if epn:
                    episode_d['episode'] = epn
        return episode_d
    # First of all, search for the kind of show.
    # XXX: Number of entries at 17 Apr 2008:
    #      movie:        379,871
    #      episode:      483,832
    #      tv movie:      61,119
    #      tv series:     44,795
    #      video movie:   57,915
    #      tv mini series: 5,497
    #      video game:     5,490
    #      More up-to-date statistics: http://us.imdb.com/database_statistics
    if title.endswith('(TV)'):
        kind = u'tv movie'
        title = title[:-4].rstrip()
    elif title.endswith('(V)'):
        kind = u'video movie'
        title = title[:-3].rstrip()
    elif title.endswith('(video)'):
        kind = u'video movie'
        title = title[:-7].rstrip()
    elif title.endswith('(mini)'):
        kind = u'tv mini series'
        title = title[:-6].rstrip()
    elif title.endswith('(VG)'):
        kind = u'video game'
        title = title[:-4].rstrip()
    # Search for the year and the optional imdbIndex (a roman number).
    yi = re_year_index.findall(title)
    if not yi:
        yi = re_extended_year_index.findall(title)
        if yi:
            yk, yiy, yii = yi[-1]
            yi = [(yiy, yii)]
            if yk == 'TV episode':
                kind = u'episode'
            elif yk == 'TV':
                kind = u'tv movie'
            elif yk == 'TV Series':
                kind = u'tv series'
            elif yk == 'Video':
                kind = u'video movie'
            elif yk == 'TV mini-series':
                kind = u'tv mini series'
            elif yk == 'Video Game':
                kind = u'video game'
            title = re_remove_kind.sub('(', title)
    if yi:
        last_yi = yi[-1]
        year = last_yi[0]
        if last_yi[1]:
            imdbIndex = last_yi[1][1:]
            year = year[:-len(imdbIndex)-1]
        i = title.rfind('(%s)' % last_yi[0])
        if i != -1:
            title = title[:i-1].rstrip()
    # This is a tv (mini) series: strip the '"' at the begin and at the end.
    # XXX: strip('"') is not used for compatibility with Python 2.0.
    if title and title[0] == title[-1] == '"':
        if not kind:
            kind = u'tv series'
        title = title[1:-1].strip()
    elif title.endswith('(TV series)'):
        kind = u'tv series'
        title = title[:-11].rstrip()
    if not title:
        raise IMDbParserError('invalid title: "%s"' % original_t)
    if canonical is not None:
        if canonical:
            title = canonicalTitle(title)
        else:
            title = normalizeTitle(title)
    # 'kind' is one in ('movie', 'episode', 'tv series', 'tv mini series',
    # 'tv movie', 'video movie', 'video game')
    result['title'] = title
    result['kind'] = kind or u'movie'
    if year and year != '????':
        if '-' in year:
            result['series years'] = year
            year = year[:4]
        try:
            result['year'] = int(year)
        except (TypeError, ValueError):
            pass
    if imdbIndex:
        result['imdbIndex'] = imdbIndex
    if isinstance(_emptyString, str):
        result['kind'] = str(kind or 'movie')
    return result

_web_format = '%d %B %Y'
_ptdf_format = '(%Y-%m-%d)'

def _convertTime(title, fromPTDFtoWEB=1, _emptyString=u''):
    """Convert a time expressed in the plain text data files, to
    the 'Episode dated ...' format used on the web site; if
    fromPTDFtoWEB is false, the inverted conversion is applied."""
    try:
        if fromPTDFtoWEB:
            from_format = _ptdf_format
            to_format = _web_format
        else:
            from_format = u'Episode dated %s' % _web_format
            to_format = _ptdf_format
        t = strptime(title, from_format)
        title = strftime(to_format, t)
        if fromPTDFtoWEB:
            # Strip a leading zero from the day number.
            if title[0] == '0':
                title = title[1:]
            title = u'Episode dated %s' % title
    except ValueError:
        pass
    if isinstance(_emptyString, str):
        try:
            title = str(title)
        except UnicodeDecodeError:
            pass
    return title

def build_title(title_dict, canonical=None, canonicalSeries=None,
                canonicalEpisode=None, ptdf=0, lang=None, _doYear=1,
                _emptyString=u''):
    """Given a dictionary that represents a "long" IMDb title,
    return a string.

    If canonical is None (default), the title is returned in the stored style.
    If canonical is True, the title is converted to canonical style.
    If canonical is False, the title is converted to normal format.

    lang can be used to specify the language of the title.
    If ptdf is true, the plain text data files format is used.
    """
    if canonical is not None:
        canonicalSeries = canonical
    pre_title = _emptyString
    kind = title_dict.get('kind')
    episode_of = title_dict.get('episode of')
    if kind == 'episode' and episode_of is not None:
        # Works with both Movie instances and plain dictionaries.
        doYear = 0
        if ptdf:
            doYear = 1
        pre_title = build_title(episode_of, canonical=canonicalSeries,
                                ptdf=0, _doYear=doYear,
                                _emptyString=_emptyString)
        ep_dict = {'title': title_dict.get('title', ''),
                   'imdbIndex': title_dict.get('imdbIndex')}
        ep_title = ep_dict['title']
        if not ptdf:
            doYear = 1
            ep_dict['year'] = title_dict.get('year', '????')
            if ep_title[0:1] == '(' and ep_title[-1:] == ')' and \
                    ep_title[1:5].isdigit():
                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=1,
                                                _emptyString=_emptyString)
        else:
            doYear = 0
            if ep_title.startswith('Episode dated'):
                ep_dict['title'] = _convertTime(ep_title, fromPTDFtoWEB=0,
                                                _emptyString=_emptyString)
        episode_title = build_title(ep_dict,
                                    canonical=canonicalEpisode, ptdf=ptdf,
                                    _doYear=doYear, _emptyString=_emptyString)
        if ptdf:
            oad = title_dict.get('original air date', _emptyString)
            if len(oad) == 10 and oad[4] == '-' and oad[7] == '-' and \
                    episode_title.find(oad) == -1:
                episode_title += ' (%s)' % oad
            seas = title_dict.get('season')
            if seas is not None:
                episode_title += ' (#%s' % seas
                episode = title_dict.get('episode')
                if episode is not None:
                    episode_title += '.%s' % episode
                episode_title += ')'
            episode_title = '{%s}' % episode_title
        return '%s %s' % (pre_title, episode_title)
    title = title_dict.get('title', '')
    if not title:
        return _emptyString
    if canonical is not None:
        if canonical:
            title = canonicalTitle(title, lang=lang)
        else:
            title = normalizeTitle(title, lang=lang)
    if pre_title:
        title = '%s %s' % (pre_title, title)
    if kind in (u'tv series', u'tv mini series'):
        title = '"%s"' % title
    if _doYear:
        imdbIndex = title_dict.get('imdbIndex')
        year = title_dict.get('year') or u'????'
        if isinstance(_emptyString, str):
            year = str(year)
        title += ' (%s' % year
        if imdbIndex:
            title += '/%s' % imdbIndex
        title += ')'
    if kind:
        if kind == 'tv movie':
            title += ' (TV)'
        elif kind == 'video movie':
            title += ' (V)'
        elif kind == 'tv mini series':
            title += ' (mini)'
        elif kind == 'video game':
            title += ' (VG)'
    return title

def split_company_name_notes(name):
    """Return two strings, the first representing the company name,
    and the other representing the (optional) notes."""
    name = name.strip()
    notes = u''
    if name.endswith(')'):
        fpidx = name.find('(')
        if fpidx != -1:
            notes = name[fpidx:]
            name = name[:fpidx].rstrip()
    return name, notes

def analyze_company_name(name, stripNotes=False):
    """Return a dictionary with the name and the optional 'country'
    keys, from the given string.
    If stripNotes is true, tries to not consider optional notes.

    raise an IMDbParserError exception if the name is not valid.
    """
    if stripNotes:
        name = split_company_name_notes(name)[0]
    o_name = name
    name = name.strip()
    country = None
    if name.endswith(']'):
        idx = name.rfind('[')
        if idx != -1:
            country = name[idx:]
            name = name[:idx].rstrip()
    if not name:
        raise IMDbParserError('invalid name: "%s"' % o_name)
    result = {'name': name}
    if country:
        result['country'] = country
    return result

def build_company_name(name_dict, _emptyString=u''):
    """Given a dictionary that represents a "long" IMDb company name,
    return a string.
    """
    name = name_dict.get('name')
    if not name:
        return _emptyString
    country = name_dict.get('country')
    if country is not None:
        name += ' %s' % country
    return name

class _LastC:
    """Size matters."""
    # Sentinel that compares greater than (or equal to, for another
    # _LastC) anything, used as a "missing value" in sort comparisons.
    def __cmp__(self, other):
        if isinstance(other, self.__class__):
            return 0
        return 1

_last = _LastC()

def cmpMovies(m1, m2):
    """Compare two movies by year, in reverse order; the imdbIndex is
    checked for movies with the same year of production and title."""
    # Sort tv series' episodes.
m1e = m1.get('episode of') m2e = m2.get('episode of') if m1e is not None and m2e is not None: cmp_series = cmpMovies(m1e, m2e) if cmp_series != 0: return cmp_series m1s = m1.get('season') m2s = m2.get('season') if m1s is not None and m2s is not None: if m1s < m2s: return 1 elif m1s > m2s: return -1 m1p = m1.get('episode') m2p = m2.get('episode') if m1p < m2p: return 1 elif m1p > m2p: return -1 try: if m1e is None: m1y = int(m1.get('year', 0)) else: m1y = int(m1e.get('year', 0)) except ValueError: m1y = 0 try: if m2e is None: m2y = int(m2.get('year', 0)) else: m2y = int(m2e.get('year', 0)) except ValueError: m2y = 0 if m1y > m2y: return -1 if m1y < m2y: return 1 # Ok, these movies have the same production year... #m1t = m1.get('canonical title', _last) #m2t = m2.get('canonical title', _last) # It should works also with normal dictionaries (returned from searches). #if m1t is _last and m2t is _last: m1t = m1.get('title', _last) m2t = m2.get('title', _last) if m1t < m2t: return -1 if m1t > m2t: return 1 # Ok, these movies have the same title... m1i = m1.get('imdbIndex', _last) m2i = m2.get('imdbIndex', _last) if m1i > m2i: return -1 if m1i < m2i: return 1 m1id = getattr(m1, 'movieID', None) # Introduce this check even for other comparisons functions? # XXX: is it safe to check without knowning the data access system? # probably not a great idea. Check for 'kind', instead? 
if m1id is not None: m2id = getattr(m2, 'movieID', None) if m1id > m2id: return -1 elif m1id < m2id: return 1 return 0 def cmpPeople(p1, p2): """Compare two people by billingPos, name and imdbIndex.""" p1b = getattr(p1, 'billingPos', None) or _last p2b = getattr(p2, 'billingPos', None) or _last if p1b > p2b: return 1 if p1b < p2b: return -1 p1n = p1.get('canonical name', _last) p2n = p2.get('canonical name', _last) if p1n is _last and p2n is _last: p1n = p1.get('name', _last) p2n = p2.get('name', _last) if p1n > p2n: return 1 if p1n < p2n: return -1 p1i = p1.get('imdbIndex', _last) p2i = p2.get('imdbIndex', _last) if p1i > p2i: return 1 if p1i < p2i: return -1 return 0 def cmpCompanies(p1, p2): """Compare two companies.""" p1n = p1.get('long imdb name', _last) p2n = p2.get('long imdb name', _last) if p1n is _last and p2n is _last: p1n = p1.get('name', _last) p2n = p2.get('name', _last) if p1n > p2n: return 1 if p1n < p2n: return -1 p1i = p1.get('country', _last) p2i = p2.get('country', _last) if p1i > p2i: return 1 if p1i < p2i: return -1 return 0 # References to titles, names and characters. # XXX: find better regexp! re_titleRef = re.compile(r'_(.+?(?: \([0-9\?]{4}(?:/[IVXLCDM]+)?\))?(?: \(mini\)| \(TV\)| \(V\)| \(VG\))?)_ \(qv\)') # FIXME: doesn't match persons with ' in the name. re_nameRef = re.compile(r"'([^']+?)' \(qv\)") # XXX: good choice? Are there characters with # in the name? re_characterRef = re.compile(r"#([^']+?)# \(qv\)") # Functions used to filter the text strings. 
def modNull(s, titlesRefs, namesRefs, charactersRefs):
    """Do nothing."""
    return s

def modClearTitleRefs(s, titlesRefs, namesRefs, charactersRefs):
    """Remove titles references."""
    return re_titleRef.sub(r'\1', s)

def modClearNameRefs(s, titlesRefs, namesRefs, charactersRefs):
    """Remove names references."""
    return re_nameRef.sub(r'\1', s)

def modClearCharacterRefs(s, titlesRefs, namesRefs, charactersRefs):
    """Remove characters references."""
    return re_characterRef.sub(r'\1', s)

def modClearRefs(s, titlesRefs, namesRefs, charactersRefs):
    """Remove titles, names and characters references."""
    # Strip titles first, then characters, then names; the reference
    # dictionaries are not needed by the modClear* functions.
    s = modClearTitleRefs(s, {}, {}, {})
    s = modClearCharacterRefs(s, {}, {}, {})
    return modClearNameRefs(s, {}, {}, {})


def modifyStrings(o, modFunct, titlesRefs, namesRefs, charactersRefs):
    """Modify a string (or string values in a dictionary or strings
    in a list), using the provided modFunct function and titlesRefs
    namesRefs and charactersRefs references dictionaries."""
    # Notice that it doesn't go any deeper than the first two levels in a list.
    if isinstance(o, (unicode, str)):
        return modFunct(o, titlesRefs, namesRefs, charactersRefs)
    elif isinstance(o, (list, tuple, dict)):
        # Copy-on-write: the container is duplicated only if (and when)
        # a string value actually has to be replaced, so unmodified
        # containers are returned as-is.
        _stillorig = 1
        if isinstance(o, (list, tuple)):
            keys = xrange(len(o))
        else:
            keys = o.keys()
        for i in keys:
            v = o[i]
            if isinstance(v, (unicode, str)):
                if _stillorig:
                    o = copy(o)
                    _stillorig = 0
                o[i] = modFunct(v, titlesRefs, namesRefs, charactersRefs)
            elif isinstance(v, (list, tuple)):
                # Second level: nested lists/tuples are modified in place.
                modifyStrings(o[i], modFunct, titlesRefs, namesRefs,
                              charactersRefs)
    return o


def date_and_notes(s):
    """Parse (birth|death) date and notes; returns a tuple in the
    form (date, notes)."""
    s = s.strip()
    if not s:
        return (u'', u'')
    notes = u''
    # The string starts with a date only if it begins with a digit, a
    # month name, or one of the 'circa'/unknown markers; otherwise the
    # whole string is treated as notes.
    if s[0].isdigit() or s.split()[0].lower() in ('c.', 'january', 'february',
                                                  'march', 'april', 'may',
                                                  'june', 'july', 'august',
                                                  'september', 'october',
                                                  'november', 'december',
                                                  'ca.', 'circa', '????,'):
        # Everything after the first comma is notes.
        i = s.find(',')
        if i != -1:
            notes = s[i+1:].strip()
            s = s[:i]
    else:
        notes = s
        s = u''
    # An unknown date is normalized to the empty string.
    if s == '????':
        s = u''
    return s, notes


class RolesList(list):
    """A list of Person or Character instances, used for the
    currentRole property."""
    def __unicode__(self):
        return u' / '.join([unicode(x) for x in self])

    def __str__(self):
        # FIXME: does it make sense at all?  Return a unicode doesn't
        # seem right, in __str__.
        return u' / '.join([unicode(x).encode('utf8') for x in self])


# Replace & with &amp;, but only if it's not already part of a charref.
#_re_amp = re.compile(r'(&)(?!\w+;)', re.I) #_re_amp = re.compile(r'(?<=\W)&(?=[^a-zA-Z0-9_#])') _re_amp = re.compile(r'&(?![^a-zA-Z0-9_#]{1,5};)') def escape4xml(value): """Escape some chars that can't be present in a XML value.""" if isinstance(value, int): value = str(value) value = _re_amp.sub('&', value) value = value.replace('"', '"').replace("'", ''') value = value.replace('<', '<').replace('>', '>') if isinstance(value, unicode): value = value.encode('ascii', 'xmlcharrefreplace') return value def _refsToReplace(value, modFunct, titlesRefs, namesRefs, charactersRefs): """Return three lists - for movie titles, persons and characters names - with two items tuples: the first item is the reference once escaped by the user-provided modFunct function, the second is the same reference un-escaped.""" mRefs = [] for refRe, refTemplate in [(re_titleRef, u'_%s_ (qv)'), (re_nameRef, u"'%s' (qv)"), (re_characterRef, u'#%s# (qv)')]: theseRefs = [] for theRef in refRe.findall(value): # refTemplate % theRef values don't change for a single # _Container instance, so this is a good candidate for a # cache or something - even if it's so rarely used that... # Moreover, it can grow - ia.update(...) - and change if # modFunct is modified. goodValue = modFunct(refTemplate % theRef, titlesRefs, namesRefs, charactersRefs) # Prevents problems with crap in plain text data files. # We should probably exclude invalid chars and string that # are too long in the re_*Ref expressions. if '_' in goodValue or len(goodValue) > 128: continue toReplace = escape4xml(goodValue) # Only the 'value' portion is replaced. 
replaceWith = goodValue.replace(theRef, escape4xml(theRef)) theseRefs.append((toReplace, replaceWith)) mRefs.append(theseRefs) return mRefs def _handleTextNotes(s): """Split text::notes strings.""" ssplit = s.split('::', 1) if len(ssplit) == 1: return s return u'%s<notes>%s</notes>' % (ssplit[0], ssplit[1]) def _normalizeValue(value, withRefs=False, modFunct=None, titlesRefs=None, namesRefs=None, charactersRefs=None): """Replace some chars that can't be present in a XML text.""" # XXX: use s.encode(encoding, 'xmlcharrefreplace') ? Probably not # a great idea: after all, returning a unicode is safe. if isinstance(value, (unicode, str)): if not withRefs: value = _handleTextNotes(escape4xml(value)) else: # Replace references that were accidentally escaped. replaceLists = _refsToReplace(value, modFunct, titlesRefs, namesRefs, charactersRefs) value = modFunct(value, titlesRefs or {}, namesRefs or {}, charactersRefs or {}) value = _handleTextNotes(escape4xml(value)) for replaceList in replaceLists: for toReplace, replaceWith in replaceList: value = value.replace(toReplace, replaceWith) else: value = unicode(value) return value def _tag4TON(ton, addAccessSystem=False, _containerOnly=False): """Build a tag for the given _Container instance; both open and close tags are returned.""" tag = ton.__class__.__name__.lower() what = 'name' if tag == 'movie': value = ton.get('long imdb title') or ton.get('title', '') what = 'title' else: value = ton.get('long imdb name') or ton.get('name', '') value = _normalizeValue(value) extras = u'' crl = ton.currentRole if crl: if not isinstance(crl, list): crl = [crl] for cr in crl: crTag = cr.__class__.__name__.lower() crValue = cr['long imdb name'] crValue = _normalizeValue(crValue) crID = cr.getID() if crID is not None: extras += u'<current-role><%s id="%s">' \ u'<name>%s</name></%s>' % (crTag, crID, crValue, crTag) else: extras += u'<current-role><%s><name>%s</name></%s>' % \ (crTag, crValue, crTag) if cr.notes: extras += 
u'<notes>%s</notes>' % _normalizeValue(cr.notes) extras += u'</current-role>' theID = ton.getID() if theID is not None: beginTag = u'<%s id="%s"' % (tag, theID) if addAccessSystem and ton.accessSystem: beginTag += ' access-system="%s"' % ton.accessSystem if not _containerOnly: beginTag += u'><%s>%s</%s>' % (what, value, what) else: beginTag += u'>' else: if not _containerOnly: beginTag = u'<%s><%s>%s</%s>' % (tag, what, value, what) else: beginTag = u'<%s>' % tag beginTag += extras if ton.notes: beginTag += u'<notes>%s</notes>' % _normalizeValue(ton.notes) return (beginTag, u'</%s>' % tag) TAGS_TO_MODIFY = { 'movie.parents-guide': ('item', True), 'movie.number-of-votes': ('item', True), 'movie.soundtrack.item': ('item', True), 'movie.quotes': ('quote', False), 'movie.quotes.quote': ('line', False), 'movie.demographic': ('item', True), 'movie.episodes': ('season', True), 'movie.episodes.season': ('episode', True), 'person.merchandising-links': ('item', True), 'person.genres': ('item', True), 'person.quotes': ('quote', False), 'person.keywords': ('item', True), 'character.quotes': ('item', True), 'character.quotes.item': ('quote', False), 'character.quotes.item.quote': ('line', False) } _allchars = string.maketrans('', '') _keepchars = _allchars.translate(_allchars, string.ascii_lowercase + '-' + string.digits) def _tagAttr(key, fullpath): """Return a tuple with a tag name and a (possibly empty) attribute, applying the conversions specified in TAGS_TO_MODIFY and checking that the tag is safe for a XML document.""" attrs = {} _escapedKey = escape4xml(key) if fullpath in TAGS_TO_MODIFY: tagName, useTitle = TAGS_TO_MODIFY[fullpath] if useTitle: attrs['key'] = _escapedKey elif not isinstance(key, unicode): if isinstance(key, str): tagName = unicode(key, 'ascii', 'ignore') else: strType = str(type(key)).replace("<type '", "").replace("'>", "") attrs['keytype'] = strType tagName = unicode(key) else: tagName = key if isinstance(key, int): attrs['keytype'] = 'int' 
origTagName = tagName tagName = tagName.lower().replace(' ', '-') tagName = str(tagName).translate(_allchars, _keepchars) if origTagName != tagName: if 'key' not in attrs: attrs['key'] = _escapedKey if (not tagName) or tagName[0].isdigit() or tagName[0] == '-': # This is a fail-safe: we should never be here, since unpredictable # keys must be listed in TAGS_TO_MODIFY. # This will proably break the DTD/schema, but at least it will # produce a valid XML. tagName = 'item' _utils_logger.error('invalid tag: %s [%s]' % (_escapedKey, fullpath)) attrs['key'] = _escapedKey return tagName, u' '.join([u'%s="%s"' % i for i in attrs.items()]) def _seq2xml(seq, _l=None, withRefs=False, modFunct=None, titlesRefs=None, namesRefs=None, charactersRefs=None, _topLevel=True, key2infoset=None, fullpath=''): """Convert a sequence or a dictionary to a list of XML unicode strings.""" if _l is None: _l = [] if isinstance(seq, dict): for key in seq: value = seq[key] if isinstance(key, _Container): # Here we're assuming that a _Container is never a top-level # key (otherwise we should handle key2infoset). openTag, closeTag = _tag4TON(key) # So that fullpath will contains something meaningful. 
tagName = key.__class__.__name__.lower() else: tagName, attrs = _tagAttr(key, fullpath) openTag = u'<%s' % tagName if attrs: openTag += ' %s' % attrs if _topLevel and key2infoset and key in key2infoset: openTag += u' infoset="%s"' % key2infoset[key] if isinstance(value, int): openTag += ' type="int"' elif isinstance(value, float): openTag += ' type="float"' openTag += u'>' closeTag = u'</%s>' % tagName _l.append(openTag) _seq2xml(value, _l, withRefs, modFunct, titlesRefs, namesRefs, charactersRefs, _topLevel=False, fullpath='%s.%s' % (fullpath, tagName)) _l.append(closeTag) elif isinstance(seq, (list, tuple)): tagName, attrs = _tagAttr('item', fullpath) beginTag = u'<%s' % tagName if attrs: beginTag += u' %s' % attrs #beginTag += u'>' closeTag = u'</%s>' % tagName for item in seq: if isinstance(item, _Container): _seq2xml(item, _l, withRefs, modFunct, titlesRefs, namesRefs, charactersRefs, _topLevel=False, fullpath='%s.%s' % (fullpath, item.__class__.__name__.lower())) else: openTag = beginTag if isinstance(item, int): openTag += ' type="int"' elif isinstance(item, float): openTag += ' type="float"' openTag += u'>' _l.append(openTag) _seq2xml(item, _l, withRefs, modFunct, titlesRefs, namesRefs, charactersRefs, _topLevel=False, fullpath='%s.%s' % (fullpath, tagName)) _l.append(closeTag) else: if isinstance(seq, _Container): _l.extend(_tag4TON(seq)) else: # Text, ints, floats and the like. _l.append(_normalizeValue(seq, withRefs=withRefs, modFunct=modFunct, titlesRefs=titlesRefs, namesRefs=namesRefs, charactersRefs=charactersRefs)) return _l _xmlHead = u"""<?xml version="1.0"?> <!DOCTYPE %s SYSTEM "http://imdbpy.sf.net/dtd/imdbpy{VERSION}.dtd"> """ _xmlHead = _xmlHead.replace('{VERSION}', VERSION.replace('.', '').split('dev')[0][:2]) class _Container(object): """Base class for Movie, Person, Character and Company classes.""" # The default sets of information retrieved. default_info = () # Aliases for some not-so-intuitive keys. 
keys_alias = {} # List of keys to modify. keys_tomodify_list = () # Function used to compare two instances of this class. cmpFunct = None # Regular expression used to build the 'full-size (headshot|cover url)'. _re_fullsizeURL = re.compile(r'\._V1\._SX(\d+)_SY(\d+)_') def __init__(self, myID=None, data=None, notes=u'', currentRole=u'', roleID=None, roleIsPerson=False, accessSystem=None, titlesRefs=None, namesRefs=None, charactersRefs=None, modFunct=None, *args, **kwds): """Initialize a Movie, Person, Character or Company object. *myID* -- your personal identifier for this object. *data* -- a dictionary used to initialize the object. *notes* -- notes for the person referred in the currentRole attribute; e.g.: '(voice)' or the alias used in the movie credits. *accessSystem* -- a string representing the data access system used. *currentRole* -- a Character instance representing the current role or duty of a person in this movie, or a Person object representing the actor/actress who played a given character in a Movie. If a string is passed, an object is automatically build. *roleID* -- if available, the characterID/personID of the currentRole object. *roleIsPerson* -- when False (default) the currentRole is assumed to be a Character object, otherwise a Person. *titlesRefs* -- a dictionary with references to movies. *namesRefs* -- a dictionary with references to persons. *charactersRefs* -- a dictionary with references to characters. *modFunct* -- function called returning text fields. 
""" self.reset() self.accessSystem = accessSystem self.myID = myID if data is None: data = {} self.set_data(data, override=1) self.notes = notes if titlesRefs is None: titlesRefs = {} self.update_titlesRefs(titlesRefs) if namesRefs is None: namesRefs = {} self.update_namesRefs(namesRefs) if charactersRefs is None: charactersRefs = {} self.update_charactersRefs(charactersRefs) self.set_mod_funct(modFunct) self.keys_tomodify = {} for item in self.keys_tomodify_list: self.keys_tomodify[item] = None self._roleIsPerson = roleIsPerson if not roleIsPerson: from imdb.Character import Character self._roleClass = Character else: from imdb.Person import Person self._roleClass = Person self.currentRole = currentRole if roleID: self.roleID = roleID self._init(*args, **kwds) def _get_roleID(self): """Return the characterID or personID of the currentRole object.""" if not self.__role: return None if isinstance(self.__role, list): return [x.getID() for x in self.__role] return self.currentRole.getID() def _set_roleID(self, roleID): """Set the characterID or personID of the currentRole object.""" if not self.__role: # XXX: needed? Just ignore it? It's probably safer to # ignore it, to prevent some bugs in the parsers. #raise IMDbError,"Can't set ID of an empty Character/Person object." 
pass if not self._roleIsPerson: if not isinstance(roleID, (list, tuple)): self.currentRole.characterID = roleID else: for index, item in enumerate(roleID): self.__role[index].characterID = item else: if not isinstance(roleID, (list, tuple)): self.currentRole.personID = roleID else: for index, item in enumerate(roleID): self.__role[index].personID = item roleID = property(_get_roleID, _set_roleID, doc="the characterID or personID of the currentRole object.") def _get_currentRole(self): """Return a Character or Person instance.""" if self.__role: return self.__role return self._roleClass(name=u'', accessSystem=self.accessSystem, modFunct=self.modFunct) def _set_currentRole(self, role): """Set self.currentRole to a Character or Person instance.""" if isinstance(role, (unicode, str)): if not role: self.__role = None else: self.__role = self._roleClass(name=role, modFunct=self.modFunct, accessSystem=self.accessSystem) elif isinstance(role, (list, tuple)): self.__role = RolesList() for item in role: if isinstance(item, (unicode, str)): self.__role.append(self._roleClass(name=item, accessSystem=self.accessSystem, modFunct=self.modFunct)) else: self.__role.append(item) if not self.__role: self.__role = None else: self.__role = role currentRole = property(_get_currentRole, _set_currentRole, doc="The role of a Person in a Movie" + \ " or the interpreter of a Character in a Movie.") def _init(self, **kwds): pass def reset(self): """Reset the object.""" self.data = {} self.myID = None self.notes = u'' self.titlesRefs = {} self.namesRefs = {} self.charactersRefs = {} self.modFunct = modClearRefs self.current_info = [] self.infoset2keys = {} self.key2infoset = {} self.__role = None self._reset() def _reset(self): pass def clear(self): """Reset the dictionary.""" self.data.clear() self.notes = u'' self.titlesRefs = {} self.namesRefs = {} self.charactersRefs = {} self.current_info = [] self.infoset2keys = {} self.key2infoset = {} self.__role = None self._clear() def _clear(self): 
pass def get_current_info(self): """Return the current set of information retrieved.""" return self.current_info def update_infoset_map(self, infoset, keys, mainInfoset): """Update the mappings between infoset and keys.""" if keys is None: keys = [] if mainInfoset is not None: theIS = mainInfoset else: theIS = infoset self.infoset2keys[theIS] = keys for key in keys: self.key2infoset[key] = theIS def set_current_info(self, ci): """Set the current set of information retrieved.""" # XXX:Remove? It's never used and there's no way to update infoset2keys. self.current_info = ci def add_to_current_info(self, val, keys=None, mainInfoset=None): """Add a set of information to the current list.""" if val not in self.current_info: self.current_info.append(val) self.update_infoset_map(val, keys, mainInfoset) def has_current_info(self, val): """Return true if the given set of information is in the list.""" return val in self.current_info def set_mod_funct(self, modFunct): """Set the fuction used to modify the strings.""" if modFunct is None: modFunct = modClearRefs self.modFunct = modFunct def update_titlesRefs(self, titlesRefs): """Update the dictionary with the references to movies.""" self.titlesRefs.update(titlesRefs) def get_titlesRefs(self): """Return the dictionary with the references to movies.""" return self.titlesRefs def update_namesRefs(self, namesRefs): """Update the dictionary with the references to names.""" self.namesRefs.update(namesRefs) def get_namesRefs(self): """Return the dictionary with the references to names.""" return self.namesRefs def update_charactersRefs(self, charactersRefs): """Update the dictionary with the references to characters.""" self.charactersRefs.update(charactersRefs) def get_charactersRefs(self): """Return the dictionary with the references to characters.""" return self.charactersRefs def set_data(self, data, override=0): """Set the movie data to the given dictionary; if 'override' is set, the previous data is removed, otherwise the 
two dictionary are merged. """ if not override: self.data.update(data) else: self.data = data def getID(self): """Return movieID, personID, characterID or companyID.""" raise NotImplementedError('override this method') def __cmp__(self, other): """Compare two Movie, Person, Character or Company objects.""" # XXX: raise an exception? if self.cmpFunct is None: return -1 if not isinstance(other, self.__class__): return -1 return self.cmpFunct(other) def __hash__(self): """Hash for this object.""" # XXX: does it always work correctly? theID = self.getID() if theID is not None and self.accessSystem not in ('UNKNOWN', None): # Handle 'http' and 'mobile' as they are the same access system. acs = self.accessSystem if acs in ('mobile', 'httpThin'): acs = 'http' # There must be some indication of the kind of the object, too. s4h = '%s:%s[%s]' % (self.__class__.__name__, theID, acs) else: s4h = repr(self) return hash(s4h) def isSame(self, other): """Return True if the two represent the same object.""" if not isinstance(other, self.__class__): return 0 if hash(self) == hash(other): return 1 return 0 def __len__(self): """Number of items in the data dictionary.""" return len(self.data) def getAsXML(self, key, _with_add_keys=True): """Return a XML representation of the specified key, or None if empty. If _with_add_keys is False, dinamically generated keys are excluded.""" # Prevent modifyStrings in __getitem__ to be called; if needed, # it will be called by the _normalizeValue function. origModFunct = self.modFunct self.modFunct = modNull # XXX: not totally sure it's a good idea, but could prevent # problems (i.e.: the returned string always contains # a DTD valid tag, and not something that can be only in # the keys_alias map). 
# --- continuation of _Container.getAsXML() (the method's "def" line is
# above this chunk): resolve key aliases, optionally skip dynamically
# generated keys, then serialize the single requested value to XML.
        key = self.keys_alias.get(key, key)
        if (not _with_add_keys) and (key in self._additional_keys()):
            # A dynamically generated key was requested but excluded:
            # restore the modification function before bailing out.
            self.modFunct = origModFunct
            return None
        try:
            withRefs = False
            if key in self.keys_tomodify and \
                    origModFunct not in (None, modNull):
                withRefs = True
            value = self.get(key)
            if value is None:
                return None
            tag = self.__class__.__name__.lower()
            return u''.join(_seq2xml({key: value}, withRefs=withRefs,
                                     modFunct=origModFunct,
                                     titlesRefs=self.titlesRefs,
                                     namesRefs=self.namesRefs,
                                     charactersRefs=self.charactersRefs,
                                     key2infoset=self.key2infoset,
                                     fullpath=tag))
        finally:
            # Always restore the original modification function.
            self.modFunct = origModFunct

    def asXML(self, _with_add_keys=True):
        """Return a XML representation of the whole object.
        If _with_add_keys is False, dynamically generated keys are
        excluded."""
        beginTag, endTag = _tag4TON(self, addAccessSystem=True,
                                    _containerOnly=True)
        resList = [beginTag]
        # Serialize every key individually and concatenate the results.
        for key in self.keys():
            value = self.getAsXML(key, _with_add_keys=_with_add_keys)
            if not value:
                continue
            resList.append(value)
        resList.append(endTag)
        head = _xmlHead % self.__class__.__name__.lower()
        return head + u''.join(resList)

    def _getitem(self, key):
        """Handle special keys.

        Meant to be overridden by subclasses; this default
        implementation handles none (always returns None)."""
        return None

    def __getitem__(self, key):
        """Return the value for a given key, checking key aliases;
        a KeyError exception is raised if the key is not found.
        """
        value = self._getitem(key)
        if value is not None:
            return value
        # Handle key aliases.
        key = self.keys_alias.get(key, key)
        rawData = self.data[key]
        # Text fields may be run through the modification function
        # (e.g. to expand references to titles/names/characters).
        if key in self.keys_tomodify and \
                self.modFunct not in (None, modNull):
            try:
                return modifyStrings(rawData, self.modFunct, self.titlesRefs,
                                     self.namesRefs, self.charactersRefs)
            except RuntimeError, e:
                # Symbian/python 2.2 has a poor regexp implementation.
                import warnings
                warnings.warn('RuntimeError in '
                              "imdb.utils._Container.__getitem__; if it's not "
                              "a recursion limit exceeded and we're not running "
                              "in a Symbian environment, it's a bug:\n%s" % e)
        return rawData

    def __setitem__(self, key, item):
        """Directly store the item with the given key."""
        self.data[key] = item

    def __delitem__(self, key):
        """Remove the given section or key."""
        # XXX: how to remove an item of a section?
        del self.data[key]

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list.

        Subclasses override this to expose dynamically generated keys."""
        return []

    def keys(self):
        """Return a list of valid keys."""
        return self.data.keys() + self._additional_keys()

    def items(self):
        """Return the items in the dictionary."""
        # Goes through self.get() so that aliases and special keys work.
        return [(k, self.get(k)) for k in self.keys()]

    # XXX: is this enough?
    #      NOTE(review): the iter* methods walk only self.data, so they do
    #      NOT include the _additional_keys() entries that keys()/items() do.
    def iteritems(self):
        return self.data.iteritems()

    def iterkeys(self):
        return self.data.iterkeys()

    def itervalues(self):
        return self.data.itervalues()

    def values(self):
        """Return the values in the dictionary."""
        return [self.get(k) for k in self.keys()]

    def has_key(self, key):
        """Return true if a given section is defined."""
        # EAFP: __getitem__ already knows about aliases and special keys.
        try:
            self.__getitem__(key)
        except KeyError:
            return 0
        return 1

    # XXX: really useful???
    #      consider also that this will confuse people who meant to
    #      call ia.update(movieObject, 'data set') instead.
    def update(self, dict):
        self.data.update(dict)

    def get(self, key, failobj=None):
        """Return the given section, or default if it's not found."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return failobj

    def setdefault(self, key, failobj=None):
        if not self.has_key(key):
            self[key] = failobj
        return self[key]

    def pop(self, key, *args):
        return self.data.pop(key, *args)

    def popitem(self):
        return self.data.popitem()

    def __repr__(self):
        """String representation of an object."""
        raise NotImplementedError('override this method')

    def __str__(self):
        """Movie title or person name."""
        raise NotImplementedError('override this method')

    def __contains__(self, key):
        raise NotImplementedError('override this method')

    def append_item(self, key, item):
        """The item is appended to the list identified by the given key."""
        self.data.setdefault(key, []).append(item)

    def set_item(self, key, item):
        """Directly store the item with the given key."""
        self.data[key] = item

    def __nonzero__(self):
        """Return true if self.data contains something."""
        if self.data:
            return 1
        return 0

    def __deepcopy__(self, memo):
        raise NotImplementedError('override this method')

    def copy(self):
        """Return a deep copy of the object itself."""
        return deepcopy(self)


def flatten(seq, toDescend=(list, dict, tuple), yieldDictKeys=0,
            onlyKeysType=(_Container,), scalar=None):
    """Iterate over nested lists and dictionaries; toDescend is a list
    or a tuple of types to be considered non-scalar; if yieldDictKeys is
    true, also dictionaries' keys are yielded; if scalar is not None, only
    items of the given type(s) are yielded."""
    if scalar is None or isinstance(seq, scalar):
        yield seq
    if isinstance(seq, toDescend):
        if isinstance(seq, (dict, _Container)):
            if yieldDictKeys:
                # Yield also the keys of the dictionary.
for key in seq.iterkeys(): for k in flatten(key, toDescend=toDescend, yieldDictKeys=yieldDictKeys, onlyKeysType=onlyKeysType, scalar=scalar): if onlyKeysType and isinstance(k, onlyKeysType): yield k for value in seq.itervalues(): for v in flatten(value, toDescend=toDescend, yieldDictKeys=yieldDictKeys, onlyKeysType=onlyKeysType, scalar=scalar): yield v elif not isinstance(seq, (str, unicode, int, float)): for item in seq: for i in flatten(item, toDescend=toDescend, yieldDictKeys=yieldDictKeys, onlyKeysType=onlyKeysType, scalar=scalar): yield i ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/Character.py������������������������������������������������������������������������0000644�0000000�0000000�00000017024�11766731642�014312� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" Character module (imdb package). This module provides the Character class, used to store information about a given character. Copyright 2007-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""

from copy import deepcopy

from imdb.utils import analyze_name, build_name, flatten, _Container, cmpPeople


class Character(_Container):
    """A Character.

    Every information about a character can be accessed as:
        characterObject['information']
    to get a list of the kind of information stored in a
    Character object, use the keys() method; some useful aliases
    are defined (as "also known as" for the "akas" key);
    see the keys_alias dictionary.
    """
    # The default sets of information retrieved.
    default_info = ('main', 'filmography', 'biography')

    # Aliases for some not-so-intuitive keys.
    keys_alias = {'mini biography': 'biography',
                  'bio': 'biography',
                  'character biography': 'biography',
                  'character biographies': 'biography',
                  'biographies': 'biography',
                  'character bio': 'biography',
                  'aka': 'akas',
                  'also known as': 'akas',
                  'alternate names': 'akas',
                  'personal quotes': 'quotes',
                  'keys': 'keywords',
                  'keyword': 'keywords'}

    # Keys whose text is run through the modification function
    # (see _Container.__getitem__).
    keys_tomodify_list = ('biography', 'quotes')

    # Comparison function used when sorting Character objects.
    cmpFunct = cmpPeople

    def _init(self, **kwds):
        """Initialize a Character object.

        *characterID* -- the unique identifier for the character.
        *name* -- the name of the Character, if not in the data dictionary.
        *myName* -- the nickname you use for this character.
        *myID* -- your personal id for this character.
        *data* -- a dictionary used to initialize the object.
        *notes* -- notes about the given character.
        *accessSystem* -- a string representing the data access system used.
        *titlesRefs* -- a dictionary with references to movies.
        *namesRefs* -- a dictionary with references to persons.
        *charactersRefs* -- a dictionary with references to characters.
        *modFunct* -- function called returning text fields.
        """
        name = kwds.get('name')
        # An explicit 'name' in data takes precedence over the keyword.
        if name and not self.data.has_key('name'):
            self.set_name(name)
        self.characterID = kwds.get('characterID', None)
        self.myName = kwds.get('myName', u'')

    def _reset(self):
        """Reset the Character object."""
        self.characterID = None
        self.myName = u''

    def set_name(self, name):
        """Set the name of the character."""
        # XXX: convert name to unicode, if it's a plain string?
        try:
            d = analyze_name(name, canonical=0)
            self.data.update(d)
        except:
            # TODO: catch only IMDbPYParserError and issue a warning.
            pass

    def _additional_keys(self):
        """Valid keys to append to the data.keys() list."""
        addkeys = []
        if self.data.has_key('name'):
            addkeys += ['long imdb name']
        if self.data.has_key('headshot'):
            addkeys += ['full-size headshot']
        return addkeys

    def _getitem(self, key):
        """Handle special keys."""
        ## XXX: can a character have an imdbIndex?
        if self.data.has_key('name'):
            if key == 'long imdb name':
                return build_name(self.data)
        if key == 'full-size headshot' and self.data.has_key('headshot'):
            # Strip the thumbnail-size suffix from the headshot URL.
            return self._re_fullsizeURL.sub('', self.data.get('headshot', ''))
        return None

    def getID(self):
        """Return the characterID."""
        return self.characterID

    def __nonzero__(self):
        """The Character is "false" if the self.data does not contain
        a name."""
        # XXX: check the name and the characterID?
        # (continuation of Character.__nonzero__, whose "def" line is just
        # above this chunk)
        if self.data.get('name'):
            return 1
        return 0

    def __contains__(self, item):
        """Return true if this Character was portrayed in the given Movie
        or it was impersonated by the given Person."""
        from Movie import Movie
        from Person import Person
        if isinstance(item, Person):
            # Walk every Movie found anywhere in self.data and check
            # whether this person plays the current role.
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m.currentRole):
                    return 1
        elif isinstance(item, Movie):
            for m in flatten(self.data, yieldDictKeys=1, scalar=Movie):
                if item.isSame(m):
                    return 1
        return 0

    def isSameName(self, other):
        """Return true if two character have the same name
        and/or characterID."""
        if not isinstance(other, self.__class__):
            return 0
        # Same (non-canonical) name?
        if self.data.has_key('name') and \
                other.data.has_key('name') and \
                build_name(self.data, canonical=0) == \
                build_name(other.data, canonical=0):
            return 1
        # Same ID within the same data access system?
        if self.accessSystem == other.accessSystem and \
                self.characterID is not None and \
                self.characterID == other.characterID:
            return 1
        return 0
    isSameCharacter = isSameName

    def __deepcopy__(self, memo):
        """Return a deep copy of a Character instance."""
        c = Character(name=u'', characterID=self.characterID,
                      myName=self.myName, myID=self.myID,
                      data=deepcopy(self.data, memo),
                      notes=self.notes, accessSystem=self.accessSystem,
                      titlesRefs=deepcopy(self.titlesRefs, memo),
                      namesRefs=deepcopy(self.namesRefs, memo),
                      charactersRefs=deepcopy(self.charactersRefs, memo))
        c.current_info = list(self.current_info)
        c.set_mod_funct(self.modFunct)
        return c

    def __repr__(self):
        """String representation of a Character object."""
        r = '<Character id:%s[%s] name:_%s_>' % (self.characterID,
                                                 self.accessSystem,
                                                 self.get('name'))
        if isinstance(r, unicode):
            r = r.encode('utf_8', 'replace')
        return r

    def __str__(self):
        """Simply print the short name."""
        return self.get('name', u'').encode('utf_8', 'replace')

    def __unicode__(self):
        """Simply print the short title."""
        return self.get('name', u'')

    def summary(self):
        """Return a string with a pretty-printed summary for the
        character."""
        if not self:
            return u''
        s =
u'Character\n=====\nName: %s\n' % \ self.get('name', u'') bio = self.get('biography') if bio: s += u'Biography: %s\n' % bio[0] filmo = self.get('filmography') if filmo: a_list = [x.get('long imdb canonical title', u'') for x in filmo[:5]] s += u'Last movies with this character: %s.\n' % u'; '.join(a_list) return s ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/_exceptions.py����������������������������������������������������������������������0000644�0000000�0000000�00000003177�11766731642�014742� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" _exceptions module (imdb package). This module provides the exception hierarchy used by the imdb package. Copyright 2004-2009 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import logging class IMDbError(Exception): """Base class for every exception raised by the imdb package.""" _logger = logging.getLogger('imdbpy') def __init__(self, *args, **kwargs): """Initialize the exception and pass the message to the log system.""" # Every raised exception also dispatch a critical log. self._logger.critical('%s exception raised; args: %s; kwds: %s', self.__class__.__name__, args, kwargs, exc_info=True) Exception.__init__(self, *args, **kwargs) class IMDbDataAccessError(IMDbError): """Exception raised when is not possible to access needed data.""" pass class IMDbParserError(IMDbError): """Exception raised when an error occurred parsing the data.""" pass �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/�����������������������������������������������������������������������������0000755�0000000�0000000�00000000000�11766731744�013302� 5����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/__init__.py������������������������������������������������������������������0000644�0000000�0000000�00000001671�11766731642�015415� 
0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" locale package (imdb package). This package provides scripts and files for internationalization of IMDbPY. Copyright 2009 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import gettext import os LOCALE_DIR = os.path.dirname(__file__) gettext.bindtextdomain('imdbpy', LOCALE_DIR) �����������������������������������������������������������������������IMDbPY-4.9/imdb/locale/generatepot.py���������������������������������������������������������������0000755�0000000�0000000�00000004474�11766731642�016202� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ generatepot.py script. This script generates the imdbpy.pot file, from the DTD. 
Copyright 2009 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""

import re
import sys
from datetime import datetime as dt

# Messages whose default English text differs from the one derived
# from the element name; currently empty.
DEFAULT_MESSAGES = {
}

# Matches element declarations in the DTD, e.g. <!ELEMENT movie ...>,
# capturing the element name.
ELEMENT_PATTERN = r"""<!ELEMENT\s+([^\s]+)"""
re_element = re.compile(ELEMENT_PATTERN)

# Header of the generated .pot file; %(now)s is the creation timestamp.
POT_HEADER_TEMPLATE = r"""# Gettext message file for imdbpy
msgid ""
msgstr ""
"Project-Id-Version: imdbpy\n"
"POT-Creation-Date: %(now)s\n"
"PO-Revision-Date: YYYY-MM-DD HH:MM+0000\n"
"Last-Translator: YOUR NAME <YOUR@EMAIL>\n"
"Language-Team: TEAM NAME <TEAM@EMAIL>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=1; plural=0;\n"
"Language-Code: en\n"
"Language-Name: English\n"
"Preferred-Encodings: utf-8\n"
"Domain: imdbpy\n"
"""

if len(sys.argv) != 2:
    print "Usage: %s dtd_file" % sys.argv[0]
    sys.exit()

dtdfilename = sys.argv[1]
dtd = open(dtdfilename).read()
elements = re_element.findall(dtd)
# Deduplicate the element names found in the DTD.
uniq = set(elements)
elements = list(uniq)

print POT_HEADER_TEMPLATE % {
    'now': dt.strftime(dt.now(), "%Y-%m-%d %H:%M+0000")
}
for element in sorted(elements):
    # Emit a "# Default: ..." comment with the English fallback text.
    if element in DEFAULT_MESSAGES:
        print '# Default: %s' % DEFAULT_MESSAGES[element]
    else:
        print '# Default: %s' % element.replace('-', ' ').capitalize()
    print 'msgid "%s"' % element
    print 'msgstr ""'
    # use this part instead of the line above to generate the po file for
English #if element in DEFAULT_MESSAGES: # print 'msgstr "%s"' % DEFAULT_MESSAGES[element] #else: # print 'msgstr "%s"' % element.replace('-', ' ').capitalize() print ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/imdbpy.pot�������������������������������������������������������������������0000644�0000000�0000000�00000044663�11766731642�015324� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Gettext message file for imdbpy msgid "" msgstr "" "Project-Id-Version: imdbpy\n" "POT-Creation-Date: 2010-03-18 14:35+0000\n" "PO-Revision-Date: YYYY-MM-DD HH:MM+0000\n" "Last-Translator: YOUR NAME <YOUR@EMAIL>\n" "Language-Team: TEAM NAME <TEAM@EMAIL>\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=1; plural=0;\n" "Language-Code: en\n" "Language-Name: English\n" "Preferred-Encodings: utf-8\n" "Domain: imdbpy\n" # Default: Actor msgid "actor" msgstr "" # Default: Actress msgid "actress" msgstr "" # Default: Adaption msgid "adaption" msgstr "" # Default: Additional information msgid "additional-information" msgstr "" # Default: Admissions msgid "admissions" msgstr "" # Default: Agent address msgid "agent-address" msgstr "" # Default: Airing msgid "airing" msgstr "" # Default: Akas msgid "akas" msgstr "" # Default: Akas from release info msgid "akas-from-release-info" msgstr "" # Default: All products msgid "all-products" msgstr "" # Default: Alternate language version of msgid "alternate-language-version-of" msgstr "" # 
Default: Alternate versions msgid "alternate-versions" msgstr "" # Default: Amazon reviews msgid "amazon-reviews" msgstr "" # Default: Analog left msgid "analog-left" msgstr "" # Default: Analog right msgid "analog-right" msgstr "" # Default: Animation department msgid "animation-department" msgstr "" # Default: Archive footage msgid "archive-footage" msgstr "" # Default: Arithmetic mean msgid "arithmetic-mean" msgstr "" # Default: Art department msgid "art-department" msgstr "" # Default: Art direction msgid "art-direction" msgstr "" # Default: Art director msgid "art-director" msgstr "" # Default: Article msgid "article" msgstr "" # Default: Asin msgid "asin" msgstr "" # Default: Aspect ratio msgid "aspect-ratio" msgstr "" # Default: Assigner msgid "assigner" msgstr "" # Default: Assistant director msgid "assistant-director" msgstr "" # Default: Auctions msgid "auctions" msgstr "" # Default: Audio noise msgid "audio-noise" msgstr "" # Default: Audio quality msgid "audio-quality" msgstr "" # Default: Award msgid "award" msgstr "" # Default: Awards msgid "awards" msgstr "" # Default: Biographical movies msgid "biographical-movies" msgstr "" # Default: Biography msgid "biography" msgstr "" # Default: Biography print msgid "biography-print" msgstr "" # Default: Birth date msgid "birth-date" msgstr "" # Default: Birth name msgid "birth-name" msgstr "" # Default: Birth notes msgid "birth-notes" msgstr "" # Default: Body msgid "body" msgstr "" # Default: Book msgid "book" msgstr "" # Default: Books msgid "books" msgstr "" # Default: Bottom 100 rank msgid "bottom-100-rank" msgstr "" # Default: Budget msgid "budget" msgstr "" # Default: Business msgid "business" msgstr "" # Default: By arrangement with msgid "by-arrangement-with" msgstr "" # Default: Camera msgid "camera" msgstr "" # Default: Camera and electrical department msgid "camera-and-electrical-department" msgstr "" # Default: Canonical episode title msgid "canonical-episode-title" msgstr "" # Default: Canonical 
name msgid "canonical-name" msgstr "" # Default: Canonical series title msgid "canonical-series-title" msgstr "" # Default: Canonical title msgid "canonical-title" msgstr "" # Default: Cast msgid "cast" msgstr "" # Default: Casting department msgid "casting-department" msgstr "" # Default: Casting director msgid "casting-director" msgstr "" # Default: Catalog number msgid "catalog-number" msgstr "" # Default: Category msgid "category" msgstr "" # Default: Certificate msgid "certificate" msgstr "" # Default: Certificates msgid "certificates" msgstr "" # Default: Certification msgid "certification" msgstr "" # Default: Channel msgid "channel" msgstr "" # Default: Character msgid "character" msgstr "" # Default: Cinematographer msgid "cinematographer" msgstr "" # Default: Cinematographic process msgid "cinematographic-process" msgstr "" # Default: Close captions teletext ld g msgid "close-captions-teletext-ld-g" msgstr "" # Default: Color info msgid "color-info" msgstr "" # Default: Color information msgid "color-information" msgstr "" # Default: Color rendition msgid "color-rendition" msgstr "" # Default: Company msgid "company" msgstr "" # Default: Complete cast msgid "complete-cast" msgstr "" # Default: Complete crew msgid "complete-crew" msgstr "" # Default: Composer msgid "composer" msgstr "" # Default: Connections msgid "connections" msgstr "" # Default: Contrast msgid "contrast" msgstr "" # Default: Copyright holder msgid "copyright-holder" msgstr "" # Default: Costume department msgid "costume-department" msgstr "" # Default: Costume designer msgid "costume-designer" msgstr "" # Default: Countries msgid "countries" msgstr "" # Default: Country msgid "country" msgstr "" # Default: Courtesy of msgid "courtesy-of" msgstr "" # Default: Cover msgid "cover" msgstr "" # Default: Cover url msgid "cover-url" msgstr "" # Default: Crazy credits msgid "crazy-credits" msgstr "" # Default: Creator msgid "creator" msgstr "" # Default: Current role msgid "current-role" msgstr 
"" # Default: Database msgid "database" msgstr "" # Default: Date msgid "date" msgstr "" # Default: Death date msgid "death-date" msgstr "" # Default: Death notes msgid "death-notes" msgstr "" # Default: Demographic msgid "demographic" msgstr "" # Default: Description msgid "description" msgstr "" # Default: Dialogue intellegibility msgid "dialogue-intellegibility" msgstr "" # Default: Digital sound msgid "digital-sound" msgstr "" # Default: Director msgid "director" msgstr "" # Default: Disc format msgid "disc-format" msgstr "" # Default: Disc size msgid "disc-size" msgstr "" # Default: Distributors msgid "distributors" msgstr "" # Default: Dvd msgid "dvd" msgstr "" # Default: Dvd features msgid "dvd-features" msgstr "" # Default: Dvd format msgid "dvd-format" msgstr "" # Default: Dvds msgid "dvds" msgstr "" # Default: Dynamic range msgid "dynamic-range" msgstr "" # Default: Edited from msgid "edited-from" msgstr "" # Default: Edited into msgid "edited-into" msgstr "" # Default: Editor msgid "editor" msgstr "" # Default: Editorial department msgid "editorial-department" msgstr "" # Default: Episode msgid "episode" msgstr "" # Default: Episode of msgid "episode-of" msgstr "" # Default: Episode title msgid "episode-title" msgstr "" # Default: Episodes msgid "episodes" msgstr "" # Default: Episodes rating msgid "episodes-rating" msgstr "" # Default: Essays msgid "essays" msgstr "" # Default: External reviews msgid "external-reviews" msgstr "" # Default: Faqs msgid "faqs" msgstr "" # Default: Feature msgid "feature" msgstr "" # Default: Featured in msgid "featured-in" msgstr "" # Default: Features msgid "features" msgstr "" # Default: Film negative format msgid "film-negative-format" msgstr "" # Default: Filming dates msgid "filming-dates" msgstr "" # Default: Filmography msgid "filmography" msgstr "" # Default: Followed by msgid "followed-by" msgstr "" # Default: Follows msgid "follows" msgstr "" # Default: For msgid "for" msgstr "" # Default: Frequency response 
msgid "frequency-response" msgstr "" # Default: From msgid "from" msgstr "" # Default: Full article link msgid "full-article-link" msgstr "" # Default: Full size cover url msgid "full-size-cover-url" msgstr "" # Default: Full size headshot msgid "full-size-headshot" msgstr "" # Default: Genres msgid "genres" msgstr "" # Default: Goofs msgid "goofs" msgstr "" # Default: Gross msgid "gross" msgstr "" # Default: Group genre msgid "group-genre" msgstr "" # Default: Headshot msgid "headshot" msgstr "" # Default: Height msgid "height" msgstr "" # Default: Imdbindex msgid "imdbindex" msgstr "" # Default: In development msgid "in-development" msgstr "" # Default: Interview msgid "interview" msgstr "" # Default: Interviews msgid "interviews" msgstr "" # Default: Introduction msgid "introduction" msgstr "" # Default: Item msgid "item" msgstr "" # Default: Keywords msgid "keywords" msgstr "" # Default: Kind msgid "kind" msgstr "" # Default: Label msgid "label" msgstr "" # Default: Laboratory msgid "laboratory" msgstr "" # Default: Language msgid "language" msgstr "" # Default: Languages msgid "languages" msgstr "" # Default: Laserdisc msgid "laserdisc" msgstr "" # Default: Laserdisc title msgid "laserdisc-title" msgstr "" # Default: Length msgid "length" msgstr "" # Default: Line msgid "line" msgstr "" # Default: Link msgid "link" msgstr "" # Default: Link text msgid "link-text" msgstr "" # Default: Literature msgid "literature" msgstr "" # Default: Locations msgid "locations" msgstr "" # Default: Long imdb canonical name msgid "long-imdb-canonical-name" msgstr "" # Default: Long imdb canonical title msgid "long-imdb-canonical-title" msgstr "" # Default: Long imdb episode title msgid "long-imdb-episode-title" msgstr "" # Default: Long imdb name msgid "long-imdb-name" msgstr "" # Default: Long imdb title msgid "long-imdb-title" msgstr "" # Default: Magazine cover photo msgid "magazine-cover-photo" msgstr "" # Default: Make up msgid "make-up" msgstr "" # Default: Master format 
msgid "master-format" msgstr "" # Default: Median msgid "median" msgstr "" # Default: Merchandising links msgid "merchandising-links" msgstr "" # Default: Mini biography msgid "mini-biography" msgstr "" # Default: Misc links msgid "misc-links" msgstr "" # Default: Miscellaneous companies msgid "miscellaneous-companies" msgstr "" # Default: Miscellaneous crew msgid "miscellaneous-crew" msgstr "" # Default: Movie msgid "movie" msgstr "" # Default: Mpaa msgid "mpaa" msgstr "" # Default: Music department msgid "music-department" msgstr "" # Default: Name msgid "name" msgstr "" # Default: News msgid "news" msgstr "" # Default: Newsgroup reviews msgid "newsgroup-reviews" msgstr "" # Default: Nick names msgid "nick-names" msgstr "" # Default: Notes msgid "notes" msgstr "" # Default: Novel msgid "novel" msgstr "" # Default: Number msgid "number" msgstr "" # Default: Number of chapter stops msgid "number-of-chapter-stops" msgstr "" # Default: Number of episodes msgid "number-of-episodes" msgstr "" # Default: Number of seasons msgid "number-of-seasons" msgstr "" # Default: Number of sides msgid "number-of-sides" msgstr "" # Default: Number of votes msgid "number-of-votes" msgstr "" # Default: Official retail price msgid "official-retail-price" msgstr "" # Default: Official sites msgid "official-sites" msgstr "" # Default: Opening weekend msgid "opening-weekend" msgstr "" # Default: Original air date msgid "original-air-date" msgstr "" # Default: Original music msgid "original-music" msgstr "" # Default: Original title msgid "original-title" msgstr "" # Default: Other literature msgid "other-literature" msgstr "" # Default: Other works msgid "other-works" msgstr "" # Default: Parents guide msgid "parents-guide" msgstr "" # Default: Performed by msgid "performed-by" msgstr "" # Default: Person msgid "person" msgstr "" # Default: Photo sites msgid "photo-sites" msgstr "" # Default: Pictorial msgid "pictorial" msgstr "" # Default: Picture format msgid "picture-format" msgstr "" 
# Default: Plot msgid "plot" msgstr "" # Default: Plot outline msgid "plot-outline" msgstr "" # Default: Portrayed in msgid "portrayed-in" msgstr "" # Default: Pressing plant msgid "pressing-plant" msgstr "" # Default: Printed film format msgid "printed-film-format" msgstr "" # Default: Printed media reviews msgid "printed-media-reviews" msgstr "" # Default: Producer msgid "producer" msgstr "" # Default: Production companies msgid "production-companies" msgstr "" # Default: Production country msgid "production-country" msgstr "" # Default: Production dates msgid "production-dates" msgstr "" # Default: Production design msgid "production-design" msgstr "" # Default: Production designer msgid "production-designer" msgstr "" # Default: Production manager msgid "production-manager" msgstr "" # Default: Production process protocol msgid "production-process-protocol" msgstr "" # Default: Quality of source msgid "quality-of-source" msgstr "" # Default: Quality program msgid "quality-program" msgstr "" # Default: Quote msgid "quote" msgstr "" # Default: Quotes msgid "quotes" msgstr "" # Default: Rating msgid "rating" msgstr "" # Default: Recommendations msgid "recommendations" msgstr "" # Default: Referenced in msgid "referenced-in" msgstr "" # Default: References msgid "references" msgstr "" # Default: Region msgid "region" msgstr "" # Default: Release country msgid "release-country" msgstr "" # Default: Release date msgid "release-date" msgstr "" # Default: Release dates msgid "release-dates" msgstr "" # Default: Remade as msgid "remade-as" msgstr "" # Default: Remake of msgid "remake-of" msgstr "" # Default: Rentals msgid "rentals" msgstr "" # Default: Result msgid "result" msgstr "" # Default: Review msgid "review" msgstr "" # Default: Review author msgid "review-author" msgstr "" # Default: Review kind msgid "review-kind" msgstr "" # Default: Runtime msgid "runtime" msgstr "" # Default: Runtimes msgid "runtimes" msgstr "" # Default: Salary history msgid 
"salary-history" msgstr "" # Default: Screenplay teleplay msgid "screenplay-teleplay" msgstr "" # Default: Season msgid "season" msgstr "" # Default: Second unit director or assistant director msgid "second-unit-director-or-assistant-director" msgstr "" # Default: Self msgid "self" msgstr "" # Default: Series animation department msgid "series-animation-department" msgstr "" # Default: Series art department msgid "series-art-department" msgstr "" # Default: Series assistant directors msgid "series-assistant-directors" msgstr "" # Default: Series camera department msgid "series-camera-department" msgstr "" # Default: Series casting department msgid "series-casting-department" msgstr "" # Default: Series cinematographers msgid "series-cinematographers" msgstr "" # Default: Series costume department msgid "series-costume-department" msgstr "" # Default: Series editorial department msgid "series-editorial-department" msgstr "" # Default: Series editors msgid "series-editors" msgstr "" # Default: Series make up department msgid "series-make-up-department" msgstr "" # Default: Series miscellaneous msgid "series-miscellaneous" msgstr "" # Default: Series music department msgid "series-music-department" msgstr "" # Default: Series producers msgid "series-producers" msgstr "" # Default: Series production designers msgid "series-production-designers" msgstr "" # Default: Series production managers msgid "series-production-managers" msgstr "" # Default: Series sound department msgid "series-sound-department" msgstr "" # Default: Series special effects department msgid "series-special-effects-department" msgstr "" # Default: Series stunts msgid "series-stunts" msgstr "" # Default: Series title msgid "series-title" msgstr "" # Default: Series transportation department msgid "series-transportation-department" msgstr "" # Default: Series visual effects department msgid "series-visual-effects-department" msgstr "" # Default: Series writers msgid "series-writers" msgstr "" # 
Default: Series years msgid "series-years" msgstr "" # Default: Set decoration msgid "set-decoration" msgstr "" # Default: Sharpness msgid "sharpness" msgstr "" # Default: Similar to msgid "similar-to" msgstr "" # Default: Smart canonical episode title msgid "smart-canonical-episode-title" msgstr "" # Default: Smart canonical series title msgid "smart-canonical-series-title" msgstr "" # Default: Smart canonical title msgid "smart-canonical-title" msgstr "" # Default: Smart long imdb canonical title msgid "smart-long-imdb-canonical-title" msgstr "" # Default: Sound clips msgid "sound-clips" msgstr "" # Default: Sound crew msgid "sound-crew" msgstr "" # Default: Sound encoding msgid "sound-encoding" msgstr "" # Default: Sound mix msgid "sound-mix" msgstr "" # Default: Soundtrack msgid "soundtrack" msgstr "" # Default: Spaciality msgid "spaciality" msgstr "" # Default: Special effects msgid "special-effects" msgstr "" # Default: Special effects companies msgid "special-effects-companies" msgstr "" # Default: Special effects department msgid "special-effects-department" msgstr "" # Default: Spin off msgid "spin-off" msgstr "" # Default: Spin off from msgid "spin-off-from" msgstr "" # Default: Spoofed in msgid "spoofed-in" msgstr "" # Default: Spoofs msgid "spoofs" msgstr "" # Default: Spouse msgid "spouse" msgstr "" # Default: Status of availablility msgid "status-of-availablility" msgstr "" # Default: Studio msgid "studio" msgstr "" # Default: Studios msgid "studios" msgstr "" # Default: Stunt performer msgid "stunt-performer" msgstr "" # Default: Stunts msgid "stunts" msgstr "" # Default: Subtitles msgid "subtitles" msgstr "" # Default: Supplement msgid "supplement" msgstr "" # Default: Supplements msgid "supplements" msgstr "" # Default: Synopsis msgid "synopsis" msgstr "" # Default: Taglines msgid "taglines" msgstr "" # Default: Tech info msgid "tech-info" msgstr "" # Default: Thanks msgid "thanks" msgstr "" # Default: Time msgid "time" msgstr "" # Default: Title 
msgid "title" msgstr "" # Default: Titles in this product msgid "titles-in-this-product" msgstr "" # Default: To msgid "to" msgstr "" # Default: Top 250 rank msgid "top-250-rank" msgstr "" # Default: Trade mark msgid "trade-mark" msgstr "" # Default: Transportation department msgid "transportation-department" msgstr "" # Default: Trivia msgid "trivia" msgstr "" # Default: Tv msgid "tv" msgstr "" # Default: Under license from msgid "under-license-from" msgstr "" # Default: Unknown link msgid "unknown-link" msgstr "" # Default: Upc msgid "upc" msgstr "" # Default: Version of msgid "version-of" msgstr "" # Default: Vhs msgid "vhs" msgstr "" # Default: Video msgid "video" msgstr "" # Default: Video artifacts msgid "video-artifacts" msgstr "" # Default: Video clips msgid "video-clips" msgstr "" # Default: Video noise msgid "video-noise" msgstr "" # Default: Video quality msgid "video-quality" msgstr "" # Default: Video standard msgid "video-standard" msgstr "" # Default: Visual effects msgid "visual-effects" msgstr "" # Default: Votes msgid "votes" msgstr "" # Default: Votes distribution msgid "votes-distribution" msgstr "" # Default: Weekend gross msgid "weekend-gross" msgstr "" # Default: Where now msgid "where-now" msgstr "" # Default: With msgid "with" msgstr "" # Default: Writer msgid "writer" msgstr "" # Default: Written by msgid "written-by" msgstr "" # Default: Year msgid "year" msgstr "" # Default: Zshops msgid "zshops" msgstr "" �����������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/imdbpy-en.po�����������������������������������������������������������������0000644�0000000�0000000�00000052726�11766731642�015537� 0����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Gettext message file for imdbpy msgid "" msgstr "" "Project-Id-Version: imdbpy\n" "POT-Creation-Date: 2009-04-16 14:27+0000\n" "PO-Revision-Date: YYYY-MM-DD HH:MM+0000\n" "Last-Translator: YOUR NAME <YOUR@EMAIL>\n" "Language-Team: TEAM NAME <TEAM@EMAIL>\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=1; plural=0;\n" "Language-Code: en\n" "Language-Name: English\n" "Preferred-Encodings: utf-8\n" "Domain: imdbpy\n" # Default: Actor msgid "actor" msgstr "Actor" # Default: Actress msgid "actress" msgstr "Actress" # Default: Adaption msgid "adaption" msgstr "Adaption" # Default: Additional information msgid "additional-information" msgstr "Additional information" # Default: Admissions msgid "admissions" msgstr "Admissions" # Default: Agent address msgid "agent-address" msgstr "Agent address" # Default: Airing msgid "airing" msgstr "Airing" # Default: Akas msgid "akas" msgstr "Akas" # Default: All products msgid "all-products" msgstr "All products" # Default: Alternate language version of msgid "alternate-language-version-of" msgstr "Alternate language version of" # Default: Alternate versions msgid "alternate-versions" msgstr "Alternate versions" # Default: Amazon reviews msgid "amazon-reviews" msgstr "Amazon reviews" # Default: Analog left msgid "analog-left" msgstr "Analog left" # Default: Analog right msgid "analog-right" msgstr "Analog right" # Default: Animation department msgid "animation-department" msgstr "Animation department" # Default: Archive footage msgid "archive-footage" msgstr "Archive footage" # Default: Arithmetic mean msgid "arithmetic-mean" msgstr "Arithmetic mean" # Default: Art department msgid "art-department" msgstr "Art 
department" # Default: Art direction msgid "art-direction" msgstr "Art direction" # Default: Art director msgid "art-director" msgstr "Art director" # Default: Article msgid "article" msgstr "Article" # Default: Asin msgid "asin" msgstr "Asin" # Default: Aspect ratio msgid "aspect-ratio" msgstr "Aspect ratio" # Default: Assigner msgid "assigner" msgstr "Assigner" # Default: Assistant director msgid "assistant-director" msgstr "Assistant director" # Default: Auctions msgid "auctions" msgstr "Auctions" # Default: Audio noise msgid "audio-noise" msgstr "Audio noise" # Default: Audio quality msgid "audio-quality" msgstr "Audio quality" # Default: Award msgid "award" msgstr "Award" # Default: Awards msgid "awards" msgstr "Awards" # Default: Biographical movies msgid "biographical-movies" msgstr "Biographical movies" # Default: Biography msgid "biography" msgstr "Biography" # Default: Biography print msgid "biography-print" msgstr "Biography print" # Default: Birth date msgid "birth-date" msgstr "Birth date" # Default: Birth name msgid "birth-name" msgstr "Birth name" # Default: Birth notes msgid "birth-notes" msgstr "Birth notes" # Default: Body msgid "body" msgstr "Body" # Default: Book msgid "book" msgstr "Book" # Default: Books msgid "books" msgstr "Books" # Default: Bottom 100 rank msgid "bottom-100-rank" msgstr "Bottom 100 rank" # Default: Budget msgid "budget" msgstr "Budget" # Default: Business msgid "business" msgstr "Business" # Default: By arrangement with msgid "by-arrangement-with" msgstr "By arrangement with" # Default: Camera msgid "camera" msgstr "Camera" # Default: Camera and electrical department msgid "camera-and-electrical-department" msgstr "Camera and electrical department" # Default: Canonical episode title msgid "canonical-episode-title" msgstr "Canonical episode title" # Default: Canonical name msgid "canonical-name" msgstr "Canonical name" # Default: Canonical series title msgid "canonical-series-title" msgstr "Canonical series title" # Default: 
Canonical title msgid "canonical-title" msgstr "Canonical title" # Default: Cast msgid "cast" msgstr "Cast" # Default: Casting department msgid "casting-department" msgstr "Casting department" # Default: Casting director msgid "casting-director" msgstr "Casting director" # Default: Catalog number msgid "catalog-number" msgstr "Catalog number" # Default: Category msgid "category" msgstr "Category" # Default: Certificate msgid "certificate" msgstr "Certificate" # Default: Certificates msgid "certificates" msgstr "Certificates" # Default: Certification msgid "certification" msgstr "Certification" # Default: Channel msgid "channel" msgstr "Channel" # Default: Character msgid "character" msgstr "Character" # Default: Cinematographer msgid "cinematographer" msgstr "Cinematographer" # Default: Cinematographic process msgid "cinematographic-process" msgstr "Cinematographic process" # Default: Close captions teletext ld g msgid "close-captions-teletext-ld-g" msgstr "Close captions teletext ld g" # Default: Color info msgid "color-info" msgstr "Color info" # Default: Color information msgid "color-information" msgstr "Color information" # Default: Color rendition msgid "color-rendition" msgstr "Color rendition" # Default: Company msgid "company" msgstr "Company" # Default: Complete cast msgid "complete-cast" msgstr "Complete cast" # Default: Complete crew msgid "complete-crew" msgstr "Complete crew" # Default: Composer msgid "composer" msgstr "Composer" # Default: Connections msgid "connections" msgstr "Connections" # Default: Contrast msgid "contrast" msgstr "Contrast" # Default: Copyright holder msgid "copyright-holder" msgstr "Copyright holder" # Default: Costume department msgid "costume-department" msgstr "Costume department" # Default: Costume designer msgid "costume-designer" msgstr "Costume designer" # Default: Countries msgid "countries" msgstr "Countries" # Default: Country msgid "country" msgstr "Country" # Default: Courtesy of msgid "courtesy-of" msgstr "Courtesy 
of" # Default: Cover msgid "cover" msgstr "Cover" # Default: Cover url msgid "cover-url" msgstr "Cover url" # Default: Crazy credits msgid "crazy-credits" msgstr "Crazy credits" # Default: Creator msgid "creator" msgstr "Creator" # Default: Current role msgid "current-role" msgstr "Current role" # Default: Database msgid "database" msgstr "Database" # Default: Date msgid "date" msgstr "Date" # Default: Death date msgid "death-date" msgstr "Death date" # Default: Death notes msgid "death-notes" msgstr "Death notes" # Default: Demographic msgid "demographic" msgstr "Demographic" # Default: Description msgid "description" msgstr "Description" # Default: Dialogue intellegibility msgid "dialogue-intellegibility" msgstr "Dialogue intellegibility" # Default: Digital sound msgid "digital-sound" msgstr "Digital sound" # Default: Director msgid "director" msgstr "Director" # Default: Disc format msgid "disc-format" msgstr "Disc format" # Default: Disc size msgid "disc-size" msgstr "Disc size" # Default: Distributors msgid "distributors" msgstr "Distributors" # Default: Dvd msgid "dvd" msgstr "Dvd" # Default: Dvd features msgid "dvd-features" msgstr "Dvd features" # Default: Dvd format msgid "dvd-format" msgstr "Dvd format" # Default: Dvds msgid "dvds" msgstr "Dvds" # Default: Dynamic range msgid "dynamic-range" msgstr "Dynamic range" # Default: Edited from msgid "edited-from" msgstr "Edited from" # Default: Edited into msgid "edited-into" msgstr "Edited into" # Default: Editor msgid "editor" msgstr "Editor" # Default: Editorial department msgid "editorial-department" msgstr "Editorial department" # Default: Episode msgid "episode" msgstr "Episode" # Default: Episode of msgid "episode-of" msgstr "Episode of" # Default: Episode title msgid "episode-title" msgstr "Episode title" # Default: Episodes msgid "episodes" msgstr "Episodes" # Default: Episodes rating msgid "episodes-rating" msgstr "Episodes rating" # Default: Essays msgid "essays" msgstr "Essays" # Default: External 
reviews msgid "external-reviews" msgstr "External reviews" # Default: Faqs msgid "faqs" msgstr "Faqs" # Default: Featured in msgid "featured-in" msgstr "Featured in" # Default: Features msgid "features" msgstr "Features" # Default: Film negative format msgid "film-negative-format" msgstr "Film negative format" # Default: Filming dates msgid "filming-dates" msgstr "Filming dates" # Default: Filmography msgid "filmography" msgstr "Filmography" # Default: Followed by msgid "followed-by" msgstr "Followed by" # Default: Follows msgid "follows" msgstr "Follows" # Default: For msgid "for" msgstr "For" # Default: Frequency response msgid "frequency-response" msgstr "Frequency response" # Default: From msgid "from" msgstr "From" # Default: Full article link msgid "full-article-link" msgstr "Full article link" # Default: Genres msgid "genres" msgstr "Genres" # Default: Goofs msgid "goofs" msgstr "Goofs" # Default: Gross msgid "gross" msgstr "Gross" # Default: Group genre msgid "group-genre" msgstr "Group genre" # Default: Headshot msgid "headshot" msgstr "Headshot" # Default: Height msgid "height" msgstr "Height" # Default: Imdbindex msgid "imdbindex" msgstr "Imdbindex" # Default: Interview msgid "interview" msgstr "Interview" # Default: Interviews msgid "interviews" msgstr "Interviews" # Default: Introduction msgid "introduction" msgstr "Introduction" # Default: Item msgid "item" msgstr "Item" # Default: Keywords msgid "keywords" msgstr "Keywords" # Default: Kind msgid "kind" msgstr "Kind" # Default: Label msgid "label" msgstr "Label" # Default: Laboratory msgid "laboratory" msgstr "Laboratory" # Default: Language msgid "language" msgstr "Language" # Default: Languages msgid "languages" msgstr "Languages" # Default: Laserdisc msgid "laserdisc" msgstr "Laserdisc" # Default: Laserdisc title msgid "laserdisc-title" msgstr "Laserdisc title" # Default: Length msgid "length" msgstr "Length" # Default: Line msgid "line" msgstr "Line" # Default: Link msgid "link" msgstr "Link" # 
Default: Link text msgid "link-text" msgstr "Link text" # Default: Literature msgid "literature" msgstr "Literature" # Default: Locations msgid "locations" msgstr "Locations" # Default: Long imdb canonical name msgid "long-imdb-canonical-name" msgstr "Long imdb canonical name" # Default: Long imdb canonical title msgid "long-imdb-canonical-title" msgstr "Long imdb canonical title" # Default: Long imdb episode title msgid "long-imdb-episode-title" msgstr "Long imdb episode title" # Default: Long imdb name msgid "long-imdb-name" msgstr "Long imdb name" # Default: Long imdb title msgid "long-imdb-title" msgstr "Long imdb title" # Default: Magazine cover photo msgid "magazine-cover-photo" msgstr "Magazine cover photo" # Default: Make up msgid "make-up" msgstr "Make up" # Default: Master format msgid "master-format" msgstr "Master format" # Default: Median msgid "median" msgstr "Median" # Default: Merchandising links msgid "merchandising-links" msgstr "Merchandising links" # Default: Mini biography msgid "mini-biography" msgstr "Mini biography" # Default: Misc links msgid "misc-links" msgstr "Misc links" # Default: Miscellaneous companies msgid "miscellaneous-companies" msgstr "Miscellaneous companies" # Default: Miscellaneous crew msgid "miscellaneous-crew" msgstr "Miscellaneous crew" # Default: Movie msgid "movie" msgstr "Movie" # Default: Mpaa msgid "mpaa" msgstr "Mpaa" # Default: Music department msgid "music-department" msgstr "Music department" # Default: Name msgid "name" msgstr "Name" # Default: News msgid "news" msgstr "News" # Default: Newsgroup reviews msgid "newsgroup-reviews" msgstr "Newsgroup reviews" # Default: Nick names msgid "nick-names" msgstr "Nick names" # Default: Notes msgid "notes" msgstr "Notes" # Default: Novel msgid "novel" msgstr "Novel" # Default: Number msgid "number" msgstr "Number" # Default: Number of chapter stops msgid "number-of-chapter-stops" msgstr "Number of chapter stops" # Default: Number of episodes msgid "number-of-episodes" 
msgstr "Number of episodes" # Default: Number of seasons msgid "number-of-seasons" msgstr "Number of seasons" # Default: Number of sides msgid "number-of-sides" msgstr "Number of sides" # Default: Number of votes msgid "number-of-votes" msgstr "Number of votes" # Default: Official retail price msgid "official-retail-price" msgstr "Official retail price" # Default: Official sites msgid "official-sites" msgstr "Official sites" # Default: Opening weekend msgid "opening-weekend" msgstr "Opening weekend" # Default: Original air date msgid "original-air-date" msgstr "Original air date" # Default: Original music msgid "original-music" msgstr "Original music" # Default: Original title msgid "original-title" msgstr "Original title" # Default: Other literature msgid "other-literature" msgstr "Other literature" # Default: Other works msgid "other-works" msgstr "Other works" # Default: Parents guide msgid "parents-guide" msgstr "Parents guide" # Default: Performed by msgid "performed-by" msgstr "Performed by" # Default: Person msgid "person" msgstr "Person" # Default: Photo sites msgid "photo-sites" msgstr "Photo sites" # Default: Pictorial msgid "pictorial" msgstr "Pictorial" # Default: Picture format msgid "picture-format" msgstr "Picture format" # Default: Plot msgid "plot" msgstr "Plot" # Default: Plot outline msgid "plot-outline" msgstr "Plot outline" # Default: Portrayed in msgid "portrayed-in" msgstr "Portrayed in" # Default: Pressing plant msgid "pressing-plant" msgstr "Pressing plant" # Default: Printed film format msgid "printed-film-format" msgstr "Printed film format" # Default: Printed media reviews msgid "printed-media-reviews" msgstr "Printed media reviews" # Default: Producer msgid "producer" msgstr "Producer" # Default: Production companies msgid "production-companies" msgstr "Production companies" # Default: Production country msgid "production-country" msgstr "Production country" # Default: Production dates msgid "production-dates" msgstr "Production dates" 
# Default: Production design msgid "production-design" msgstr "Production design" # Default: Production designer msgid "production-designer" msgstr "Production designer" # Default: Production manager msgid "production-manager" msgstr "Production manager" # Default: Production process protocol msgid "production-process-protocol" msgstr "Production process protocol" # Default: Quality of source msgid "quality-of-source" msgstr "Quality of source" # Default: Quality program msgid "quality-program" msgstr "Quality program" # Default: Quote msgid "quote" msgstr "Quote" # Default: Quotes msgid "quotes" msgstr "Quotes" # Default: Rating msgid "rating" msgstr "Rating" # Default: Recommendations msgid "recommendations" msgstr "Recommendations" # Default: Referenced in msgid "referenced-in" msgstr "Referenced in" # Default: References msgid "references" msgstr "References" # Default: Region msgid "region" msgstr "Region" # Default: Release country msgid "release-country" msgstr "Release country" # Default: Release date msgid "release-date" msgstr "Release date" # Default: Release dates msgid "release-dates" msgstr "Release dates" # Default: Remade as msgid "remade-as" msgstr "Remade as" # Default: Remake of msgid "remake-of" msgstr "Remake of" # Default: Rentals msgid "rentals" msgstr "Rentals" # Default: Result msgid "result" msgstr "Result" # Default: Review msgid "review" msgstr "Review" # Default: Review author msgid "review-author" msgstr "Review author" # Default: Review kind msgid "review-kind" msgstr "Review kind" # Default: Runtime msgid "runtime" msgstr "Runtime" # Default: Runtimes msgid "runtimes" msgstr "Runtimes" # Default: Salary history msgid "salary-history" msgstr "Salary history" # Default: Screenplay teleplay msgid "screenplay-teleplay" msgstr "Screenplay teleplay" # Default: Season msgid "season" msgstr "Season" # Default: Second unit director or assistant director msgid "second-unit-director-or-assistant-director" msgstr "Second unit director or 
assistant director" # Default: Self msgid "self" msgstr "Self" # Default: Series animation department msgid "series-animation-department" msgstr "Series animation department" # Default: Series art department msgid "series-art-department" msgstr "Series art department" # Default: Series assistant directors msgid "series-assistant-directors" msgstr "Series assistant directors" # Default: Series camera department msgid "series-camera-department" msgstr "Series camera department" # Default: Series casting department msgid "series-casting-department" msgstr "Series casting department" # Default: Series cinematographers msgid "series-cinematographers" msgstr "Series cinematographers" # Default: Series costume department msgid "series-costume-department" msgstr "Series costume department" # Default: Series editorial department msgid "series-editorial-department" msgstr "Series editorial department" # Default: Series editors msgid "series-editors" msgstr "Series editors" # Default: Series make up department msgid "series-make-up-department" msgstr "Series make up department" # Default: Series miscellaneous msgid "series-miscellaneous" msgstr "Series miscellaneous" # Default: Series music department msgid "series-music-department" msgstr "Series music department" # Default: Series producers msgid "series-producers" msgstr "Series producers" # Default: Series production designers msgid "series-production-designers" msgstr "Series production designers" # Default: Series production managers msgid "series-production-managers" msgstr "Series production managers" # Default: Series sound department msgid "series-sound-department" msgstr "Series sound department" # Default: Series special effects department msgid "series-special-effects-department" msgstr "Series special effects department" # Default: Series stunts msgid "series-stunts" msgstr "Series stunts" # Default: Series title msgid "series-title" msgstr "Series title" # Default: Series transportation department msgid 
"series-transportation-department" msgstr "Series transportation department" # Default: Series visual effects department msgid "series-visual-effects-department" msgstr "Series visual effects department" # Default: Series writers msgid "series-writers" msgstr "Series writers" # Default: Series years msgid "series-years" msgstr "Series years" # Default: Set decoration msgid "set-decoration" msgstr "Set decoration" # Default: Sharpness msgid "sharpness" msgstr "Sharpness" # Default: Similar to msgid "similar-to" msgstr "Similar to" # Default: Sound clips msgid "sound-clips" msgstr "Sound clips" # Default: Sound crew msgid "sound-crew" msgstr "Sound crew" # Default: Sound encoding msgid "sound-encoding" msgstr "Sound encoding" # Default: Sound mix msgid "sound-mix" msgstr "Sound mix" # Default: Soundtrack msgid "soundtrack" msgstr "Soundtrack" # Default: Spaciality msgid "spaciality" msgstr "Spaciality" # Default: Special effects msgid "special-effects" msgstr "Special effects" # Default: Special effects companies msgid "special-effects-companies" msgstr "Special effects companies" # Default: Special effects department msgid "special-effects-department" msgstr "Special effects department" # Default: Spin off msgid "spin-off" msgstr "Spin off" # Default: Spin off from msgid "spin-off-from" msgstr "Spin off from" # Default: Spoofed in msgid "spoofed-in" msgstr "Spoofed in" # Default: Spoofs msgid "spoofs" msgstr "Spoofs" # Default: Spouse msgid "spouse" msgstr "Spouse" # Default: Status of availablility msgid "status-of-availablility" msgstr "Status of availablility" # Default: Studio msgid "studio" msgstr "Studio" # Default: Studios msgid "studios" msgstr "Studios" # Default: Stunt performer msgid "stunt-performer" msgstr "Stunt performer" # Default: Stunts msgid "stunts" msgstr "Stunts" # Default: Subtitles msgid "subtitles" msgstr "Subtitles" # Default: Supplement msgid "supplement" msgstr "Supplement" # Default: Supplements msgid "supplements" msgstr "Supplements" # 
Default: Synopsis msgid "synopsis" msgstr "Synopsis" # Default: Taglines msgid "taglines" msgstr "Taglines" # Default: Tech info msgid "tech-info" msgstr "Tech info" # Default: Thanks msgid "thanks" msgstr "Thanks" # Default: Time msgid "time" msgstr "Time" # Default: Title msgid "title" msgstr "Title" # Default: Titles in this product msgid "titles-in-this-product" msgstr "Titles in this product" # Default: To msgid "to" msgstr "To" # Default: Top 250 rank msgid "top-250-rank" msgstr "Top 250 rank" # Default: Trade mark msgid "trade-mark" msgstr "Trade mark" # Default: Transportation department msgid "transportation-department" msgstr "Transportation department" # Default: Trivia msgid "trivia" msgstr "Trivia" # Default: Under license from msgid "under-license-from" msgstr "Under license from" # Default: Unknown link msgid "unknown-link" msgstr "Unknown link" # Default: Upc msgid "upc" msgstr "Upc" # Default: Version of msgid "version-of" msgstr "Version of" # Default: Vhs msgid "vhs" msgstr "Vhs" # Default: Video artifacts msgid "video-artifacts" msgstr "Video artifacts" # Default: Video clips msgid "video-clips" msgstr "Video clips" # Default: Video noise msgid "video-noise" msgstr "Video noise" # Default: Video quality msgid "video-quality" msgstr "Video quality" # Default: Video standard msgid "video-standard" msgstr "Video standard" # Default: Visual effects msgid "visual-effects" msgstr "Visual effects" # Default: Votes msgid "votes" msgstr "Votes" # Default: Votes distribution msgid "votes-distribution" msgstr "Votes distribution" # Default: Weekend gross msgid "weekend-gross" msgstr "Weekend gross" # Default: Where now msgid "where-now" msgstr "Where now" # Default: With msgid "with" msgstr "With" # Default: Writer msgid "writer" msgstr "Writer" # Default: Written by msgid "written-by" msgstr "Written by" # Default: Year msgid "year" msgstr "Year" # Default: Zshops msgid "zshops" msgstr "Zshops" 
������������������������������������������IMDbPY-4.9/imdb/locale/imdbpy-tr.po�����������������������������������������������������������������0000644�0000000�0000000�00000053433�11766731642�015556� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Gettext message file for imdbpy msgid "" msgstr "" "Project-Id-Version: imdbpy\n" "POT-Creation-Date: 2010-03-18 14:35+0000\n" "PO-Revision-Date: 2009-04-21 19:04+0200\n" "Last-Translator: H. Turgut Uyar <uyar@itu.edu.tr>\n" "Language-Team: IMDbPY Türkçe <uyar@itu.edu.tr>\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=1; plural=0;\n" "Language-Code: tr\n" "Language-Name: Türkçe\n" "Preferred-Encodings: utf-8\n" "Domain: imdbpy\n" # Default: Actor msgid "actor" msgstr "Oyuncu" # Default: Actress msgid "actress" msgstr "Oyuncu" # Default: Adaption msgid "adaption" msgstr "" # Default: Additional information msgid "additional-information" msgstr "Ek bilgi" # Default: Admissions msgid "admissions" msgstr "" # Default: Agent address msgid "agent-address" msgstr "" # Default: Airing msgid "airing" msgstr "Yayımlanma" # Default: Akas msgid "akas" msgstr "Diğer başlıklar" # Default: Akas from release info msgid "akas-from-release-info" msgstr "" # Default: All products msgid "all-products" msgstr "Bütün ürünler" # Default: Alternate language version of msgid "alternate-language-version-of" msgstr "" # Default: Alternate versions msgid "alternate-versions" msgstr "" # Default: Amazon reviews msgid "amazon-reviews" msgstr "Amazon eleştirileri" # Default: Analog left msgid "analog-left" msgstr "Analog sol" # Default: Analog right msgid "analog-right" 
msgstr "Analog sağ" # Default: Animation department msgid "animation-department" msgstr "Animasyon departmanı" # Default: Archive footage msgid "archive-footage" msgstr "Arşiv çekimleri" # Default: Arithmetic mean msgid "arithmetic-mean" msgstr "Aritmetik ortalama" # Default: Art department msgid "art-department" msgstr "Sanat departmanı" # Default: Art direction msgid "art-direction" msgstr "Sanat yönetmenliği" # Default: Art director msgid "art-director" msgstr "Sanat yönetmeni" # Default: Article msgid "article" msgstr "" # Default: Asin msgid "asin" msgstr "ASIN" # Default: Aspect ratio msgid "aspect-ratio" msgstr "En-boy oranı" # Default: Assigner msgid "assigner" msgstr "Veren" # Default: Assistant director msgid "assistant-director" msgstr "Yardımcı yönetmen" # Default: Auctions msgid "auctions" msgstr "Açık artırmalar" # Default: Audio noise msgid "audio-noise" msgstr "Ses gürültüsü" # Default: Audio quality msgid "audio-quality" msgstr "Ses kalitesi" # Default: Award msgid "award" msgstr "Ödül" # Default: Awards msgid "awards" msgstr "Ödüller" # Default: Biographical movies msgid "biographical-movies" msgstr "Biyografik filmler" # Default: Biography msgid "biography" msgstr "Biyografi" # Default: Biography print msgid "biography-print" msgstr "Basılı biyografi" # Default: Birth date msgid "birth-date" msgstr "Doğum tarihi" # Default: Birth name msgid "birth-name" msgstr "Asıl ismi" # Default: Birth notes msgid "birth-notes" msgstr "Doğum notları" # Default: Body msgid "body" msgstr "Metin" # Default: Book msgid "book" msgstr "Kitap" # Default: Books msgid "books" msgstr "Kitaplar" # Default: Bottom 100 rank msgid "bottom-100-rank" msgstr "En kötü 100 içindeki sırası" # Default: Budget msgid "budget" msgstr "Bütçe" # Default: Business msgid "business" msgstr "Gişe" # Default: By arrangement with msgid "by-arrangement-with" msgstr "" # Default: Camera msgid "camera" msgstr "Kamera" # Default: Camera and electrical department msgid 
"camera-and-electrical-department" msgstr "Kamera ve elektrik departmanı" # Default: Canonical episode title msgid "canonical-episode-title" msgstr "" # Default: Canonical name msgid "canonical-name" msgstr "" # Default: Canonical series title msgid "canonical-series-title" msgstr "" # Default: Canonical title msgid "canonical-title" msgstr "" # Default: Cast msgid "cast" msgstr "Oynayanlar" # Default: Casting department msgid "casting-department" msgstr "Oyuncu seçme departmanı" # Default: Casting director msgid "casting-director" msgstr "Oyuncu seçme yönetmeni" # Default: Catalog number msgid "catalog-number" msgstr "Katalog numarası" # Default: Category msgid "category" msgstr "Kategori" # Default: Certificate msgid "certificate" msgstr "Sertifika" # Default: Certificates msgid "certificates" msgstr "Sertifikalar" # Default: Certification msgid "certification" msgstr "" # Default: Channel msgid "channel" msgstr "Kanal" # Default: Character msgid "character" msgstr "Karakter" # Default: Cinematographer msgid "cinematographer" msgstr "Kameraman" # Default: Cinematographic process msgid "cinematographic-process" msgstr "" # Default: Close captions teletext ld g msgid "close-captions-teletext-ld-g" msgstr "" # Default: Color info msgid "color-info" msgstr "Renk bilgisi" # Default: Color information msgid "color-information" msgstr "Renk bilgisi" # Default: Color rendition msgid "color-rendition" msgstr "" # Default: Company msgid "company" msgstr "Şirket" # Default: Complete cast msgid "complete-cast" msgstr "Bütün oynayanlar" # Default: Complete crew msgid "complete-crew" msgstr "Bütün çalışanlar" # Default: Composer msgid "composer" msgstr "Besteci" # Default: Connections msgid "connections" msgstr "Bağlantılar" # Default: Contrast msgid "contrast" msgstr "Kontrast" # Default: Copyright holder msgid "copyright-holder" msgstr "Telif sahibi" # Default: Costume department msgid "costume-department" msgstr "Kostüm departmanı" # Default: Costume designer msgid 
"costume-designer" msgstr "Kostüm tasarımcısı" # Default: Countries msgid "countries" msgstr "Ülkeler" # Default: Country msgid "country" msgstr "Ülke" # Default: Courtesy of msgid "courtesy-of" msgstr "" # Default: Cover msgid "cover" msgstr "Poster" # Default: Cover url msgid "cover-url" msgstr "Poster adresi" # Default: Crazy credits msgid "crazy-credits" msgstr "" # Default: Creator msgid "creator" msgstr "Yaratıcı" # Default: Current role msgid "current-role" msgstr "Şimdiki rol" # Default: Database msgid "database" msgstr "Veritabanı" # Default: Date msgid "date" msgstr "Tarih" # Default: Death date msgid "death-date" msgstr "Ölüm tarihi" # Default: Death notes msgid "death-notes" msgstr "Ölüm notları" # Default: Demographic msgid "demographic" msgstr "Demografi" # Default: Description msgid "description" msgstr "Tarif" # Default: Dialogue intellegibility msgid "dialogue-intellegibility" msgstr "" # Default: Digital sound msgid "digital-sound" msgstr "Dijital ses" # Default: Director msgid "director" msgstr "Yönetmen" # Default: Disc format msgid "disc-format" msgstr "Disk formatı" # Default: Disc size msgid "disc-size" msgstr "Disk boyu" # Default: Distributors msgid "distributors" msgstr "Dağıtıcılar" # Default: Dvd msgid "dvd" msgstr "DVD" # Default: Dvd features msgid "dvd-features" msgstr "DVD özellikleri" # Default: Dvd format msgid "dvd-format" msgstr "DVD formatı" # Default: Dvds msgid "dvds" msgstr "DVD'ler" # Default: Dynamic range msgid "dynamic-range" msgstr "" # Default: Edited from msgid "edited-from" msgstr "" # Default: Edited into msgid "edited-into" msgstr "" # Default: Editor msgid "editor" msgstr "Montajcı" # Default: Editorial department msgid "editorial-department" msgstr "Montaj departmanı" # Default: Episode msgid "episode" msgstr "Bölüm" # Default: Episode of msgid "episode-of" msgstr "Dizi" # Default: Episode title msgid "episode-title" msgstr "Bölüm başlığı" # Default: Episodes msgid "episodes" msgstr "Bölümler" # Default: Episodes 
rating msgid "episodes-rating" msgstr "Bölüm puanı" # Default: Essays msgid "essays" msgstr "Denemeler" # Default: External reviews msgid "external-reviews" msgstr "Harici eleştiriler" # Default: Faqs msgid "faqs" msgstr "SSS" # Default: Feature msgid "feature" msgstr "" # Default: Featured in msgid "featured-in" msgstr "" # Default: Features msgid "features" msgstr "" # Default: Film negative format msgid "film-negative-format" msgstr "Film negatif formatı" # Default: Filming dates msgid "filming-dates" msgstr "Çekim tarihleri" # Default: Filmography msgid "filmography" msgstr "Filmografi" # Default: Followed by msgid "followed-by" msgstr "Peşinden gelen film" # Default: Follows msgid "follows" msgstr "Peşinden geldiği film" # Default: For msgid "for" msgstr "Film" # Default: Frequency response msgid "frequency-response" msgstr "" # Default: From msgid "from" msgstr "" # Default: Full article link msgid "full-article-link" msgstr "" # Default: Full size cover url msgid "full-size-cover-url" msgstr "" # Default: Full size headshot msgid "full-size-headshot" msgstr "" # Default: Genres msgid "genres" msgstr "Türler" # Default: Goofs msgid "goofs" msgstr "Hatalar" # Default: Gross msgid "gross" msgstr "Hasılat" # Default: Group genre msgid "group-genre" msgstr "" # Default: Headshot msgid "headshot" msgstr "Resim" # Default: Height msgid "height" msgstr "Boy" # Default: Imdbindex msgid "imdbindex" msgstr "" # Default: In development msgid "in-development" msgstr "" # Default: Interview msgid "interview" msgstr "Söyleşi" # Default: Interviews msgid "interviews" msgstr "Söyleşiler" # Default: Introduction msgid "introduction" msgstr "İlk filmi" # Default: Item msgid "item" msgstr "" # Default: Keywords msgid "keywords" msgstr "Anahtar sözcükler" # Default: Kind msgid "kind" msgstr "Tip" # Default: Label msgid "label" msgstr "" # Default: Laboratory msgid "laboratory" msgstr "Laboratuar" # Default: Language msgid "language" msgstr "Dil" # Default: Languages msgid 
"languages" msgstr "Diller" # Default: Laserdisc msgid "laserdisc" msgstr "Lazer Disk" # Default: Laserdisc title msgid "laserdisc-title" msgstr "" # Default: Length msgid "length" msgstr "Süre" # Default: Line msgid "line" msgstr "Replik" # Default: Link msgid "link" msgstr "Bağlantı" # Default: Link text msgid "link-text" msgstr "Bağlantı metni" # Default: Literature msgid "literature" msgstr "Edebiyat" # Default: Locations msgid "locations" msgstr "Çekim yerleri" # Default: Long imdb canonical name msgid "long-imdb-canonical-name" msgstr "" # Default: Long imdb canonical title msgid "long-imdb-canonical-title" msgstr "" # Default: Long imdb episode title msgid "long-imdb-episode-title" msgstr "IMDb uzun bölüm başlığı" # Default: Long imdb name msgid "long-imdb-name" msgstr "IMDb uzun ismi" # Default: Long imdb title msgid "long-imdb-title" msgstr "IMDb uzun başlığı" # Default: Magazine cover photo msgid "magazine-cover-photo" msgstr "Dergi kapağı resmi" # Default: Make up msgid "make-up" msgstr "Makyaj" # Default: Master format msgid "master-format" msgstr "Master format" # Default: Median msgid "median" msgstr "Orta değer" # Default: Merchandising links msgid "merchandising-links" msgstr "" # Default: Mini biography msgid "mini-biography" msgstr "Mini biyografi" # Default: Misc links msgid "misc-links" msgstr "" # Default: Miscellaneous companies msgid "miscellaneous-companies" msgstr "" # Default: Miscellaneous crew msgid "miscellaneous-crew" msgstr "" # Default: Movie msgid "movie" msgstr "Film" # Default: Mpaa msgid "mpaa" msgstr "MPAA" # Default: Music department msgid "music-department" msgstr "Müzik departmanı" # Default: Name msgid "name" msgstr "İsim" # Default: News msgid "news" msgstr "Haberler" # Default: Newsgroup reviews msgid "newsgroup-reviews" msgstr "Haber grubu eleştirileri" # Default: Nick names msgid "nick-names" msgstr "Takma isimler" # Default: Notes msgid "notes" msgstr "Notlar" # Default: Novel msgid "novel" msgstr "Roman" # Default: 
Number msgid "number" msgstr "Sayı" # Default: Number of chapter stops msgid "number-of-chapter-stops" msgstr "" # Default: Number of episodes msgid "number-of-episodes" msgstr "Bölüm sayısı" # Default: Number of seasons msgid "number-of-seasons" msgstr "Sezon sayısı" # Default: Number of sides msgid "number-of-sides" msgstr "" # Default: Number of votes msgid "number-of-votes" msgstr "Oy sayısı" # Default: Official retail price msgid "official-retail-price" msgstr "Resmi perakende satış fiyatı" # Default: Official sites msgid "official-sites" msgstr "Resmi siteler" # Default: Opening weekend msgid "opening-weekend" msgstr "Açılış haftasonu" # Default: Original air date msgid "original-air-date" msgstr "İlk yayımlanma tarihi" # Default: Original music msgid "original-music" msgstr "Orijinal müzik" # Default: Original title msgid "original-title" msgstr "" # Default: Other literature msgid "other-literature" msgstr "" # Default: Other works msgid "other-works" msgstr "Diğer çalışmalar" # Default: Parents guide msgid "parents-guide" msgstr "Ana-baba kılavuzu" # Default: Performed by msgid "performed-by" msgstr "İcra eden" # Default: Person msgid "person" msgstr "Kişi" # Default: Photo sites msgid "photo-sites" msgstr "Fotoğraf siteleri" # Default: Pictorial msgid "pictorial" msgstr "" # Default: Picture format msgid "picture-format" msgstr "Resim formatı" # Default: Plot msgid "plot" msgstr "Konu" # Default: Plot outline msgid "plot-outline" msgstr "Konu kısa özeti" # Default: Portrayed in msgid "portrayed-in" msgstr "" # Default: Pressing plant msgid "pressing-plant" msgstr "" # Default: Printed film format msgid "printed-film-format" msgstr "Basılı film formatı" # Default: Printed media reviews msgid "printed-media-reviews" msgstr "Basın eleştirileri" # Default: Producer msgid "producer" msgstr "Yapımcı" # Default: Production companies msgid "production-companies" msgstr "Yapım şirketleri" # Default: Production country msgid "production-country" msgstr "Yapımcı 
ülke" # Default: Production dates msgid "production-dates" msgstr "Yapım tarihleri" # Default: Production design msgid "production-design" msgstr "Yapım tasarımı" # Default: Production designer msgid "production-designer" msgstr "Yapım tasarımcısı" # Default: Production manager msgid "production-manager" msgstr "Yapım yöneticisi" # Default: Production process protocol msgid "production-process-protocol" msgstr "" # Default: Quality of source msgid "quality-of-source" msgstr "" # Default: Quality program msgid "quality-program" msgstr "" # Default: Quote msgid "quote" msgstr "Alıntı" # Default: Quotes msgid "quotes" msgstr "Alıntılar" # Default: Rating msgid "rating" msgstr "Puan" # Default: Recommendations msgid "recommendations" msgstr "Tavsiyeler" # Default: Referenced in msgid "referenced-in" msgstr "Gönderme yapılan filmler" # Default: References msgid "references" msgstr "Gönderme yaptığı filmler" # Default: Region msgid "region" msgstr "Bölge" # Default: Release country msgid "release-country" msgstr "" # Default: Release date msgid "release-date" msgstr "" # Default: Release dates msgid "release-dates" msgstr "" # Default: Remade as msgid "remade-as" msgstr "Yeniden çekilişi" # Default: Remake of msgid "remake-of" msgstr "Yeniden çekimi olduğu film" # Default: Rentals msgid "rentals" msgstr "Kiralamalar" # Default: Result msgid "result" msgstr "Sonuç" # Default: Review msgid "review" msgstr "Eleştiri" # Default: Review author msgid "review-author" msgstr "Eleştiri yazarı" # Default: Review kind msgid "review-kind" msgstr "Eleştiri tipi" # Default: Runtime msgid "runtime" msgstr "Süre" # Default: Runtimes msgid "runtimes" msgstr "Süreler" # Default: Salary history msgid "salary-history" msgstr "Üçret tarihçesi" # Default: Screenplay teleplay msgid "screenplay-teleplay" msgstr "Senaryo" # Default: Season msgid "season" msgstr "Sezon" # Default: Second unit director or assistant director msgid "second-unit-director-or-assistant-director" msgstr "İkinci birim 
yönetmeni ya da yardımcı yönetmen" # Default: Self msgid "self" msgstr "Kendisi" # Default: Series animation department msgid "series-animation-department" msgstr "Dizinin animasyon departmanı" # Default: Series art department msgid "series-art-department" msgstr "Dizinin sanat departmanı" # Default: Series assistant directors msgid "series-assistant-directors" msgstr "Dizinin yardımcı yönetmenleri" # Default: Series camera department msgid "series-camera-department" msgstr "Dizinin kamera departmanı" # Default: Series casting department msgid "series-casting-department" msgstr "Dizinin oyuncu seçimi departmanı" # Default: Series cinematographers msgid "series-cinematographers" msgstr "Dizinin kameramanları" # Default: Series costume department msgid "series-costume-department" msgstr "Dizinin kostüm departmanı" # Default: Series editorial department msgid "series-editorial-department" msgstr "Dizinin montaj departmanı" # Default: Series editors msgid "series-editors" msgstr "Dizinin montajcıları" # Default: Series make up department msgid "series-make-up-department" msgstr "Dizinin makyaj departmanı" # Default: Series miscellaneous msgid "series-miscellaneous" msgstr "" # Default: Series music department msgid "series-music-department" msgstr "Dizinin müzik departmanı" # Default: Series producers msgid "series-producers" msgstr "Dizinin yapımcıları" # Default: Series production designers msgid "series-production-designers" msgstr "Dizinin yapım tasarımcıları" # Default: Series production managers msgid "series-production-managers" msgstr "Dizinin yapım yöneticileri" # Default: Series sound department msgid "series-sound-department" msgstr "Dizinin ses departmanı" # Default: Series special effects department msgid "series-special-effects-department" msgstr "Dizinin özel efekt departmanı" # Default: Series stunts msgid "series-stunts" msgstr "Dizinin dublörleri" # Default: Series title msgid "series-title" msgstr "Dizinin başlığı" # Default: Series transportation 
department msgid "series-transportation-department" msgstr "Dizinin ulaşım departmanı" # Default: Series visual effects department msgid "series-visual-effects-department" msgstr "Dizinin görsel efekt departmanı" # Default: Series writers msgid "series-writers" msgstr "Dizinin yazarları" # Default: Series years msgid "series-years" msgstr "Dizinin yılları" # Default: Set decoration msgid "set-decoration" msgstr "Set dekorasyonu" # Default: Sharpness msgid "sharpness" msgstr "Keskinlik" # Default: Similar to msgid "similar-to" msgstr "Benzer" # Default: Smart canonical episode title msgid "smart-canonical-episode-title" msgstr "" # Default: Smart canonical series title msgid "smart-canonical-series-title" msgstr "" # Default: Smart canonical title msgid "smart-canonical-title" msgstr "" # Default: Smart long imdb canonical title msgid "smart-long-imdb-canonical-title" msgstr "" # Default: Sound clips msgid "sound-clips" msgstr "Ses klipleri" # Default: Sound crew msgid "sound-crew" msgstr "Ses ekibi" # Default: Sound encoding msgid "sound-encoding" msgstr "Ses kodlaması" # Default: Sound mix msgid "sound-mix" msgstr "" # Default: Soundtrack msgid "soundtrack" msgstr "Film müzikleri" # Default: Spaciality msgid "spaciality" msgstr "" # Default: Special effects msgid "special-effects" msgstr "Özel efektler" # Default: Special effects companies msgid "special-effects-companies" msgstr "Özel efekt şirketleri" # Default: Special effects department msgid "special-effects-department" msgstr "Özel efekt departmanı" # Default: Spin off msgid "spin-off" msgstr "" # Default: Spin off from msgid "spin-off-from" msgstr "" # Default: Spoofed in msgid "spoofed-in" msgstr "Dalga geçildiği filmler" # Default: Spoofs msgid "spoofs" msgstr "Dalga geçtiği filmler" # Default: Spouse msgid "spouse" msgstr "Eşi" # Default: Status of availablility msgid "status-of-availablility" msgstr "" # Default: Studio msgid "studio" msgstr "Stüdyo" # Default: Studios msgid "studios" msgstr "Stüdyolar" 
# Default: Stunt performer msgid "stunt-performer" msgstr "" # Default: Stunts msgid "stunts" msgstr "Dublörler" # Default: Subtitles msgid "subtitles" msgstr "Altyazılar" # Default: Supplement msgid "supplement" msgstr "" # Default: Supplements msgid "supplements" msgstr "" # Default: Synopsis msgid "synopsis" msgstr "Sinopsis" # Default: Taglines msgid "taglines" msgstr "Spotlar" # Default: Tech info msgid "tech-info" msgstr "Teknik bilgi" # Default: Thanks msgid "thanks" msgstr "Teşekkürler" # Default: Time msgid "time" msgstr "Zaman" # Default: Title msgid "title" msgstr "Başlık" # Default: Titles in this product msgid "titles-in-this-product" msgstr "Bu üründeki başlıklar" # Default: To msgid "to" msgstr "Alan" # Default: Top 250 rank msgid "top-250-rank" msgstr "En iyi 250 içindeki sırası" # Default: Trade mark msgid "trade-mark" msgstr "Kendine has özelliği" # Default: Transportation department msgid "transportation-department" msgstr "Ulaşım departmanı" # Default: Trivia msgid "trivia" msgstr "İlginç notlar" # Default: Tv msgid "tv" msgstr "" # Default: Under license from msgid "under-license-from" msgstr "" # Default: Unknown link msgid "unknown-link" msgstr "" # Default: Upc msgid "upc" msgstr "" # Default: Version of msgid "version-of" msgstr "" # Default: Vhs msgid "vhs" msgstr "VHS" # Default: Video msgid "video" msgstr "" # Default: Video artifacts msgid "video-artifacts" msgstr "" # Default: Video clips msgid "video-clips" msgstr "Video klipleri" # Default: Video noise msgid "video-noise" msgstr "Video gürültüsü" # Default: Video quality msgid "video-quality" msgstr "Video kalitesi" # Default: Video standard msgid "video-standard" msgstr "Video standardı" # Default: Visual effects msgid "visual-effects" msgstr "Görsel efektler" # Default: Votes msgid "votes" msgstr "Oylar" # Default: Votes distribution msgid "votes-distribution" msgstr "Oyların dağılımı" # Default: Weekend gross msgid "weekend-gross" msgstr "Haftasonu hasılatı" # Default: Where now 
msgid "where-now" msgstr "Şu anda nerede" # Default: With msgid "with" msgstr "" # Default: Writer msgid "writer" msgstr "Yazar" # Default: Written by msgid "written-by" msgstr "Yazan" # Default: Year msgid "year" msgstr "Yıl" # Default: Zshops msgid "zshops" msgstr "ZShops" �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/imdbpy-it.po�����������������������������������������������������������������0000644�0000000�0000000�00000055347�11766731642�015553� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# Gettext message file for imdbpy msgid "" msgstr "" "Project-Id-Version: imdbpy\n" "POT-Creation-Date: 2010-03-18 14:35+0000\n" "PO-Revision-Date: 2009-07-03 13:00+0000\n" "Last-Translator: Davide Alberani <da@erlug.linux.it>\n" "Language-Team: Davide Alberani <da@erlug.linux.it>\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=2; plural=(n != 1);\n" "Language-Code: it\n" "Language-Name: Italian\n" "Preferred-Encodings: utf-8\n" "Domain: imdbpy\n" # Default: Actor msgid "actor" msgstr "Attore" # Default: Actress msgid "actress" msgstr "Attrice" # Default: Adaption msgid "adaption" msgstr "Adattamento" # Default: Additional information msgid "additional-information" msgstr "Ulteriori informazioni" # Default: Admissions msgid "admissions" msgstr "Biglietti venduti" # Default: Agent address msgid "agent-address" msgstr "Indirizzo dell'agente" # Default: Airing msgid "airing" msgstr "In onda" 
# Default: Akas msgid "akas" msgstr "Alias" # Default: Akas from release info msgid "akas-from-release-info" msgstr "Alias dalle informazioni di rilascio" # Default: All products msgid "all-products" msgstr "Tutti i prodotti" # Default: Alternate language version of msgid "alternate-language-version-of" msgstr "Versione in altra lingua di" # Default: Alternate versions msgid "alternate-versions" msgstr "Versioni alternative" # Default: Amazon reviews msgid "amazon-reviews" msgstr "Recensione di Amazon" # Default: Analog left msgid "analog-left" msgstr "Analogico sinistro" # Default: Analog right msgid "analog-right" msgstr "Analogico destro" # Default: Animation department msgid "animation-department" msgstr "Dipartimento animazione" # Default: Archive footage msgid "archive-footage" msgstr "Materiale d'archivio" # Default: Arithmetic mean msgid "arithmetic-mean" msgstr "Media aritmetica" # Default: Art department msgid "art-department" msgstr "Dipartimento artistico" # Default: Art direction msgid "art-direction" msgstr "Direzione artistica" # Default: Art director msgid "art-director" msgstr "Direttore artistico" # Default: Article msgid "article" msgstr "Articolo" # Default: Asin msgid "asin" msgstr "Asin" # Default: Aspect ratio msgid "aspect-ratio" msgstr "Rapporto d'aspetto" # Default: Assigner msgid "assigner" msgstr "Assegnatario" # Default: Assistant director msgid "assistant-director" msgstr "Assistente regista" # Default: Auctions msgid "auctions" msgstr "Aste" # Default: Audio noise msgid "audio-noise" msgstr "Rumore audio" # Default: Audio quality msgid "audio-quality" msgstr "Qualità audio" # Default: Award msgid "award" msgstr "Premio" # Default: Awards msgid "awards" msgstr "Premi" # Default: Biographical movies msgid "biographical-movies" msgstr "Film biografici" # Default: Biography msgid "biography" msgstr "Biografia" # Default: Biography print msgid "biography-print" msgstr "Biografia" # Default: Birth date msgid "birth-date" msgstr "Data di 
nascita" # Default: Birth name msgid "birth-name" msgstr "Nome di nascita" # Default: Birth notes msgid "birth-notes" msgstr "Note di nascita" # Default: Body msgid "body" msgstr "Corpo" # Default: Book msgid "book" msgstr "Libro" # Default: Books msgid "books" msgstr "Libri" # Default: Bottom 100 rank msgid "bottom-100-rank" msgstr "Posizione nella bottom 100" # Default: Budget msgid "budget" msgstr "Bilancio" # Default: Business msgid "business" msgstr "Affari" # Default: By arrangement with msgid "by-arrangement-with" msgstr "Arrangiamento con" # Default: Camera msgid "camera" msgstr "Cinepresa" # Default: Camera and electrical department msgid "camera-and-electrical-department" msgstr "Cinepresa e dipartimento elettrico" # Default: Canonical episode title msgid "canonical-episode-title" msgstr "Titolo dell'episodio in forma canonica" # Default: Canonical name msgid "canonical-name" msgstr "Nome in forma canonica" # Default: Canonical series title msgid "canonical-series-title" msgstr "Titolo della serie in forma canonica" # Default: Canonical title msgid "canonical-title" msgstr "Titolo in forma canonica" # Default: Cast msgid "cast" msgstr "Cast" # Default: Casting department msgid "casting-department" msgstr "Casting" # Default: Casting director msgid "casting-director" msgstr "Direttore del casting" # Default: Catalog number msgid "catalog-number" msgstr "Numero di catalogo" # Default: Category msgid "category" msgstr "Categoria" # Default: Certificate msgid "certificate" msgstr "Certificazione" # Default: Certificates msgid "certificates" msgstr "Certificazioni" # Default: Certification msgid "certification" msgstr "Certificazioni" # Default: Channel msgid "channel" msgstr "Canale" # Default: Character msgid "character" msgstr "Personaggio" # Default: Cinematographer msgid "cinematographer" msgstr "Fotografia" # Default: Cinematographic process msgid "cinematographic-process" msgstr "Processo cinematografico" # Default: Close captions teletext ld g msgid 
"close-captions-teletext-ld-g" msgstr "" # Default: Color info msgid "color-info" msgstr "Colore" # Default: Color information msgid "color-information" msgstr "Informazioni sul colore" # Default: Color rendition msgid "color-rendition" msgstr "Resa dei colori" # Default: Company msgid "company" msgstr "Compagnia" # Default: Complete cast msgid "complete-cast" msgstr "Cast completo" # Default: Complete crew msgid "complete-crew" msgstr "Troupe completa" # Default: Composer msgid "composer" msgstr "Compositore" # Default: Connections msgid "connections" msgstr "Collegamenti" # Default: Contrast msgid "contrast" msgstr "Contrasto" # Default: Copyright holder msgid "copyright-holder" msgstr "Detentore dei diritti d'autore" # Default: Costume department msgid "costume-department" msgstr "Dipartimento costumi" # Default: Costume designer msgid "costume-designer" msgstr "Costumista" # Default: Countries msgid "countries" msgstr "Paesi" # Default: Country msgid "country" msgstr "Paese" # Default: Courtesy of msgid "courtesy-of" msgstr "Cortesia di" # Default: Cover msgid "cover" msgstr "Copertina" # Default: Cover url msgid "cover-url" msgstr "Locandina" # Default: Crazy credits msgid "crazy-credits" msgstr "Titoli pazzi" # Default: Creator msgid "creator" msgstr "Creatore" # Default: Current role msgid "current-role" msgstr "Ruolo" # Default: Database msgid "database" msgstr "Database" # Default: Date msgid "date" msgstr "Data" # Default: Death date msgid "death-date" msgstr "Data di morte" # Default: Death notes msgid "death-notes" msgstr "Note di morte" # Default: Demographic msgid "demographic" msgstr "Spaccato demografico" # Default: Description msgid "description" msgstr "Descrizione" # Default: Dialogue intellegibility msgid "dialogue-intellegibility" msgstr "Comprensibilità dei dialoghi" # Default: Digital sound msgid "digital-sound" msgstr "Suono digitale" # Default: Director msgid "director" msgstr "Regista" # Default: Disc format msgid "disc-format" msgstr 
"Formato del disco" # Default: Disc size msgid "disc-size" msgstr "Dimensione del disco" # Default: Distributors msgid "distributors" msgstr "Distributori" # Default: Dvd msgid "dvd" msgstr "Dvd" # Default: Dvd features msgid "dvd-features" msgstr "Caratteristiche del DVD" # Default: Dvd format msgid "dvd-format" msgstr "Formato del DVD" # Default: Dvds msgid "dvds" msgstr "Dvd" # Default: Dynamic range msgid "dynamic-range" msgstr "Intervallo dinamico" # Default: Edited from msgid "edited-from" msgstr "Tratto da" # Default: Edited into msgid "edited-into" msgstr "Montato in" # Default: Editor msgid "editor" msgstr "Editore" # Default: Editorial department msgid "editorial-department" msgstr "Dipartimento editoriale" # Default: Episode msgid "episode" msgstr "Episodio" # Default: Episode of msgid "episode-of" msgstr "Episodio di" # Default: Episode title msgid "episode-title" msgstr "Titolo dell'episodio" # Default: Episodes msgid "episodes" msgstr "Episodi" # Default: Episodes rating msgid "episodes-rating" msgstr "Voto degli episodi" # Default: Essays msgid "essays" msgstr "Saggi" # Default: External reviews msgid "external-reviews" msgstr "Recensioni esterne" # Default: Faqs msgid "faqs" msgstr "Domande ricorrenti" # Default: Feature msgid "feature" msgstr "Caratteristica" # Default: Featured in msgid "featured-in" msgstr "Ripreso in" # Default: Features msgid "features" msgstr "Caratteristiche" # Default: Film negative format msgid "film-negative-format" msgstr "Formato del negativo" # Default: Filming dates msgid "filming-dates" msgstr "Data delle riprese" # Default: Filmography msgid "filmography" msgstr "Filmografia" # Default: Followed by msgid "followed-by" msgstr "Seguito da" # Default: Follows msgid "follows" msgstr "Segue" # Default: For msgid "for" msgstr "Per" # Default: Frequency response msgid "frequency-response" msgstr "Frequenze di risposta" # Default: From msgid "from" msgstr "Da" # Default: Full article link msgid "full-article-link" msgstr 
"Collegamento all'articolo completo" # Default: Full size cover url msgid "full-size-cover-url" msgstr "URL della copertina nelle dimensioni originali" # Default: Full size headshot msgid "full-size-headshot" msgstr "Ritratto nelle dimensioni originali" # Default: Genres msgid "genres" msgstr "Generi" # Default: Goofs msgid "goofs" msgstr "Errori" # Default: Gross msgid "gross" msgstr "Lordo" # Default: Group genre msgid "group-genre" msgstr "" # Default: Headshot msgid "headshot" msgstr "Foto" # Default: Height msgid "height" msgstr "Altezza" # Default: Imdbindex msgid "imdbindex" msgstr "" # Default: In development msgid "in-development" msgstr "In sviluppo" # Default: Interview msgid "interview" msgstr "Intervista" # Default: Interviews msgid "interviews" msgstr "Interviste" # Default: Introduction msgid "introduction" msgstr "Introduzione" # Default: Item msgid "item" msgstr "Elemento" # Default: Keywords msgid "keywords" msgstr "Parole chiave" # Default: Kind msgid "kind" msgstr "Tipo" # Default: Label msgid "label" msgstr "Etichetta" # Default: Laboratory msgid "laboratory" msgstr "Laboratorio" # Default: Language msgid "language" msgstr "Lingua" # Default: Languages msgid "languages" msgstr "Lingue" # Default: Laserdisc msgid "laserdisc" msgstr "Laserdisc" # Default: Laserdisc title msgid "laserdisc-title" msgstr "Titolo del laserdisc" # Default: Length msgid "length" msgstr "Durata" # Default: Line msgid "line" msgstr "Battuta" # Default: Link msgid "link" msgstr "Collegamento" # Default: Link text msgid "link-text" msgstr "Testo del link" # Default: Literature msgid "literature" msgstr "Letteratura" # Default: Locations msgid "locations" msgstr "Luoghi" # Default: Long imdb canonical name msgid "long-imdb-canonical-name" msgstr "Nome canonico IMDb lungo" # Default: Long imdb canonical title msgid "long-imdb-canonical-title" msgstr "Titolo canonico IMDb lungo" # Default: Long imdb episode title msgid "long-imdb-episode-title" msgstr "Titolo dell'episodio 
canonico IMDb lungo" # Default: Long imdb name msgid "long-imdb-name" msgstr "Nome IMDb lungo" # Default: Long imdb title msgid "long-imdb-title" msgstr "Titolo IMDb lungo" # Default: Magazine cover photo msgid "magazine-cover-photo" msgstr "Foto di copertina" # Default: Make up msgid "make-up" msgstr "Trucco" # Default: Master format msgid "master-format" msgstr "Formato del master" # Default: Median msgid "median" msgstr "Mediana" # Default: Merchandising links msgid "merchandising-links" msgstr "Collegamenti al merchandising" # Default: Mini biography msgid "mini-biography" msgstr "Biografia" # Default: Misc links msgid "misc-links" msgstr "Altri collegamenti" # Default: Miscellaneous companies msgid "miscellaneous-companies" msgstr "Altre compagnie" # Default: Miscellaneous crew msgid "miscellaneous-crew" msgstr "Altra troupe" # Default: Movie msgid "movie" msgstr "Film" # Default: Mpaa msgid "mpaa" msgstr "Visto MPAA" # Default: Music department msgid "music-department" msgstr "Dipartimento musicale" # Default: Name msgid "name" msgstr "Nome" # Default: News msgid "news" msgstr "Notizie" # Default: Newsgroup reviews msgid "newsgroup-reviews" msgstr "Recensioni dai gruppi di discussione" # Default: Nick names msgid "nick-names" msgstr "Soprannomi" # Default: Notes msgid "notes" msgstr "Note" # Default: Novel msgid "novel" msgstr "Novella" # Default: Number msgid "number" msgstr "Numero" # Default: Number of chapter stops msgid "number-of-chapter-stops" msgstr "Numero di interruzioni di capitolo" # Default: Number of episodes msgid "number-of-episodes" msgstr "Numero di episodi" # Default: Number of seasons msgid "number-of-seasons" msgstr "Numero di stagioni" # Default: Number of sides msgid "number-of-sides" msgstr "Numero di lati" # Default: Number of votes msgid "number-of-votes" msgstr "Numero di voti" # Default: Official retail price msgid "official-retail-price" msgstr "Prezzo ufficiale al pubblico" # Default: Official sites msgid "official-sites" msgstr 
"Siti ufficiali" # Default: Opening weekend msgid "opening-weekend" msgstr "Weekend d'apertura" # Default: Original air date msgid "original-air-date" msgstr "Data della prima messa in onda" # Default: Original music msgid "original-music" msgstr "Musica originale" # Default: Original title msgid "original-title" msgstr "Titolo originale" # Default: Other literature msgid "other-literature" msgstr "Altre opere letterarie" # Default: Other works msgid "other-works" msgstr "Altri lavori" # Default: Parents guide msgid "parents-guide" msgstr "Guida per i genitori" # Default: Performed by msgid "performed-by" msgstr "Eseguito da" # Default: Person msgid "person" msgstr "Persona" # Default: Photo sites msgid "photo-sites" msgstr "Siti con fotografie" # Default: Pictorial msgid "pictorial" msgstr "Ritratto" # Default: Picture format msgid "picture-format" msgstr "Formato dell'immagine" # Default: Plot msgid "plot" msgstr "Trama" # Default: Plot outline msgid "plot-outline" msgstr "Trama in breve" # Default: Portrayed in msgid "portrayed-in" msgstr "Rappresentato in" # Default: Pressing plant msgid "pressing-plant" msgstr "Impianto di stampa" # Default: Printed film format msgid "printed-film-format" msgstr "Formato della pellicola" # Default: Printed media reviews msgid "printed-media-reviews" msgstr "Recensioni su carta stampata" # Default: Producer msgid "producer" msgstr "Produttore" # Default: Production companies msgid "production-companies" msgstr "Compagnie di produzione" # Default: Production country msgid "production-country" msgstr "Paese di produzione" # Default: Production dates msgid "production-dates" msgstr "Date di produzione" # Default: Production design msgid "production-design" msgstr "Design di produzione" # Default: Production designer msgid "production-designer" msgstr "Designer di produzione" # Default: Production manager msgid "production-manager" msgstr "Manager di produzione" # Default: Production process protocol msgid 
"production-process-protocol" msgstr "Controllo del processo di produzione" # Default: Quality of source msgid "quality-of-source" msgstr "Qualità dell'originale" # Default: Quality program msgid "quality-program" msgstr "Programma di Qualità" # Default: Quote msgid "quote" msgstr "Citazione" # Default: Quotes msgid "quotes" msgstr "Citazioni" # Default: Rating msgid "rating" msgstr "Voto" # Default: Recommendations msgid "recommendations" msgstr "Raccomandazioni" # Default: Referenced in msgid "referenced-in" msgstr "Citato in" # Default: References msgid "references" msgstr "Cita" # Default: Region msgid "region" msgstr "Regione" # Default: Release country msgid "release-country" msgstr "Paese d'uscita" # Default: Release date msgid "release-date" msgstr "Data d'uscita" # Default: Release dates msgid "release-dates" msgstr "Date d'uscita" # Default: Remade as msgid "remade-as" msgstr "Rifatto come" # Default: Remake of msgid "remake-of" msgstr "Rifacimento di" # Default: Rentals msgid "rentals" msgstr "Noleggi" # Default: Result msgid "result" msgstr "Risultato" # Default: Review msgid "review" msgstr "Recensione" # Default: Review author msgid "review-author" msgstr "Autore della recensione" # Default: Review kind msgid "review-kind" msgstr "Tipo di recensione" # Default: Runtime msgid "runtime" msgstr "Durata" # Default: Runtimes msgid "runtimes" msgstr "Durate" # Default: Salary history msgid "salary-history" msgstr "Stipendi" # Default: Screenplay teleplay msgid "screenplay-teleplay" msgstr "" # Default: Season msgid "season" msgstr "Stagione" # Default: Second unit director or assistant director msgid "second-unit-director-or-assistant-director" msgstr "Regista della seconda unità o aiuto regista" # Default: Self msgid "self" msgstr "Se stesso" # Default: Series animation department msgid "series-animation-department" msgstr "Dipartimento animazione della serie" # Default: Series art department msgid "series-art-department" msgstr "Dipartimento artistico 
della serie" # Default: Series assistant directors msgid "series-assistant-directors" msgstr "Assistenti registi della serie" # Default: Series camera department msgid "series-camera-department" msgstr "" # Default: Series casting department msgid "series-casting-department" msgstr "" # Default: Series cinematographers msgid "series-cinematographers" msgstr "" # Default: Series costume department msgid "series-costume-department" msgstr "" # Default: Series editorial department msgid "series-editorial-department" msgstr "" # Default: Series editors msgid "series-editors" msgstr "" # Default: Series make up department msgid "series-make-up-department" msgstr "" # Default: Series miscellaneous msgid "series-miscellaneous" msgstr "" # Default: Series music department msgid "series-music-department" msgstr "" # Default: Series producers msgid "series-producers" msgstr "" # Default: Series production designers msgid "series-production-designers" msgstr "" # Default: Series production managers msgid "series-production-managers" msgstr "" # Default: Series sound department msgid "series-sound-department" msgstr "Dipartimento sonoro della serie" # Default: Series special effects department msgid "series-special-effects-department" msgstr "Dipartimento effetti speciali della serie" # Default: Series stunts msgid "series-stunts" msgstr "Controfigure della serie" # Default: Series title msgid "series-title" msgstr "Titolo della serie" # Default: Series transportation department msgid "series-transportation-department" msgstr "" # Default: Series visual effects department msgid "series-visual-effects-department" msgstr "" # Default: Series writers msgid "series-writers" msgstr "Scrittori della serie" # Default: Series years msgid "series-years" msgstr "Anni della serie" # Default: Set decoration msgid "set-decoration" msgstr "Decorazione del set" # Default: Sharpness msgid "sharpness" msgstr "" # Default: Similar to msgid "similar-to" msgstr "Simile a" # Default: Smart 
canonical episode title msgid "smart-canonical-episode-title" msgstr "Titolo canonico intelligente dell'episodio" # Default: Smart canonical series title msgid "smart-canonical-series-title" msgstr "Titolo canonico intelligente della serie" # Default: Smart canonical title msgid "smart-canonical-title" msgstr "Titolo canonico intelligente" # Default: Smart long imdb canonical title msgid "smart-long-imdb-canonical-title" msgstr "Titolo canonico lungo intelligente" # Default: Sound clips msgid "sound-clips" msgstr "" # Default: Sound crew msgid "sound-crew" msgstr "" # Default: Sound encoding msgid "sound-encoding" msgstr "Codifica sonora" # Default: Sound mix msgid "sound-mix" msgstr "Mix audio" # Default: Soundtrack msgid "soundtrack" msgstr "Colonna sonora" # Default: Spaciality msgid "spaciality" msgstr "Specialità" # Default: Special effects msgid "special-effects" msgstr "Effetti speciali" # Default: Special effects companies msgid "special-effects-companies" msgstr "Compagnie di effetti speciali" # Default: Special effects department msgid "special-effects-department" msgstr "Dipartimento effetti speciali" # Default: Spin off msgid "spin-off" msgstr "Derivati" # Default: Spin off from msgid "spin-off-from" msgstr "Deriva da" # Default: Spoofed in msgid "spoofed-in" msgstr "Preso in giro in" # Default: Spoofs msgid "spoofs" msgstr "Prende in giro" # Default: Spouse msgid "spouse" msgstr "Coniuge" # Default: Status of availablility msgid "status-of-availablility" msgstr "Disponibilità" # Default: Studio msgid "studio" msgstr "Studio" # Default: Studios msgid "studios" msgstr "Studi" # Default: Stunt performer msgid "stunt-performer" msgstr "" # Default: Stunts msgid "stunts" msgstr "Stuntman" # Default: Subtitles msgid "subtitles" msgstr "Sottotitoli" # Default: Supplement msgid "supplement" msgstr "Extra" # Default: Supplements msgid "supplements" msgstr "Extra" # Default: Synopsis msgid "synopsis" msgstr "Compendio della trama" # Default: Taglines msgid 
"taglines" msgstr "Slogan" # Default: Tech info msgid "tech-info" msgstr "Informazioni tecniche" # Default: Thanks msgid "thanks" msgstr "Ringraziamenti" # Default: Time msgid "time" msgstr "Tempo" # Default: Title msgid "title" msgstr "Titolo" # Default: Titles in this product msgid "titles-in-this-product" msgstr "Titoli in questo prodotto" # Default: To msgid "to" msgstr "A" # Default: Top 250 rank msgid "top-250-rank" msgstr "Posizione nella top 250" # Default: Trade mark msgid "trade-mark" msgstr "Marchio registrato" # Default: Transportation department msgid "transportation-department" msgstr "Dipartimento trasporti" # Default: Trivia msgid "trivia" msgstr "Frivolezze" # Default: Tv msgid "tv" msgstr "Tv" # Default: Under license from msgid "under-license-from" msgstr "Sotto licenza da" # Default: Unknown link msgid "unknown-link" msgstr "Collegamento sconosciuto" # Default: Upc msgid "upc" msgstr "" # Default: Version of msgid "version-of" msgstr "Versione di" # Default: Vhs msgid "vhs" msgstr "VHS" # Default: Video msgid "video" msgstr "Video" # Default: Video artifacts msgid "video-artifacts" msgstr "Imperfezioni video" # Default: Video clips msgid "video-clips" msgstr "Video clips" # Default: Video noise msgid "video-noise" msgstr "Rumore video" # Default: Video quality msgid "video-quality" msgstr "Qualità video" # Default: Video standard msgid "video-standard" msgstr "Standard video" # Default: Visual effects msgid "visual-effects" msgstr "Effetti visivi" # Default: Votes msgid "votes" msgstr "Voti" # Default: Votes distribution msgid "votes-distribution" msgstr "Distribuzione dei voti" # Default: Weekend gross msgid "weekend-gross" msgstr "Lordo del primo fine settimana" # Default: Where now msgid "where-now" msgstr "Cosa sta facendo ora" # Default: With msgid "with" msgstr "Con" # Default: Writer msgid "writer" msgstr "Scrittore" # Default: Written by msgid "written-by" msgstr "Scritto da" # Default: Year msgid "year" msgstr "Anno" # Default: Zshops 
msgid "zshops" msgstr "" �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/msgfmt.py��������������������������������������������������������������������0000644�0000000�0000000�00000014601�11766731642�015150� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python # -*- coding: iso-8859-1 -*- """Generate binary message catalog from textual translation description. This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the GNU msgfmt program, however, it is a simpler implementation. Usage: msgfmt.py [OPTIONS] filename.po Options: -o file --output-file=file Specify the output file to write to. If omitted, output will go to a file named filename.mo (based off the input file name). -h --help Print this message and exit. -V --version Display version information and exit. Written by Martin v. Lwis <loewis@informatik.hu-berlin.de>, refactored / fixed by Thomas Waldmann <tw AT waldmann-edv DOT de>. 
""" import sys, os import getopt, struct, array __version__ = "1.3" class SyntaxErrorException(Exception): """raised when having trouble parsing the po file content""" pass class MsgFmt(object): """transform .po -> .mo format""" def __init__(self): self.messages = {} def make_filenames(self, filename, outfile=None): """Compute .mo name from .po name or language""" if filename.endswith('.po'): infile = filename else: infile = filename + '.po' if outfile is None: outfile = os.path.splitext(infile)[0] + '.mo' return infile, outfile def add(self, id, str, fuzzy): """Add a non-fuzzy translation to the dictionary.""" if not fuzzy and str: self.messages[id] = str def read_po(self, lines): ID = 1 STR = 2 section = None fuzzy = False line_no = 0 msgid = msgstr = '' # Parse the catalog for line in lines: line_no += 1 # If we get a comment line after a msgstr, this is a new entry if line.startswith('#') and section == STR: self.add(msgid, msgstr, fuzzy) section = None fuzzy = False # Record a fuzzy mark if line.startswith('#,') and 'fuzzy' in line: fuzzy = True # Skip comments if line.startswith('#'): continue # Now we are in a msgid section, output previous section if line.startswith('msgid'): if section == STR: self.add(msgid, msgstr, fuzzy) fuzzy = False section = ID line = line[5:] msgid = msgstr = '' # Now we are in a msgstr section elif line.startswith('msgstr'): section = STR line = line[6:] # Skip empty lines line = line.strip() if not line: continue # XXX: Does this always follow Python escape semantics? 
line = eval(line) if section == ID: msgid += line elif section == STR: msgstr += line else: raise SyntaxErrorException('Syntax error on line %d, before:\n%s' % (line_no, line)) # Add last entry if section == STR: self.add(msgid, msgstr, fuzzy) def generate_mo(self): """Return the generated output.""" keys = self.messages.keys() # the keys are sorted in the .mo file keys.sort() offsets = [] ids = '' strs = '' for id in keys: # For each string, we need size and file offset. Each string is NUL # terminated; the NUL does not count into the size. offsets.append((len(ids), len(id), len(strs), len(self.messages[id]))) ids += id + '\0' strs += self.messages[id] + '\0' output = [] # The header is 7 32-bit unsigned integers. We don't use hash tables, so # the keys start right after the index tables. # translated string. keystart = 7*4 + 16*len(keys) # and the values start after the keys valuestart = keystart + len(ids) koffsets = [] voffsets = [] # The string table first has the list of keys, then the list of values. # Each entry has first the size of the string, then the file offset. 
for o1, l1, o2, l2 in offsets: koffsets += [l1, o1 + keystart] voffsets += [l2, o2 + valuestart] offsets = koffsets + voffsets output.append(struct.pack("Iiiiiii", 0x950412deL, # Magic 0, # Version len(keys), # # of entries 7*4, # start of key index 7*4 + len(keys)*8, # start of value index 0, 0)) # size and offset of hash table output.append(array.array("i", offsets).tostring()) output.append(ids) output.append(strs) return ''.join(output) def make(filename, outfile): mf = MsgFmt() infile, outfile = mf.make_filenames(filename, outfile) try: lines = file(infile).readlines() except IOError, msg: print >> sys.stderr, msg sys.exit(1) try: mf.read_po(lines) output = mf.generate_mo() except SyntaxErrorException, msg: print >> sys.stderr, msg try: open(outfile, "wb").write(output) except IOError, msg: print >> sys.stderr, msg def usage(code, msg=''): print >> sys.stderr, __doc__ if msg: print >> sys.stderr, msg sys.exit(code) def main(): try: opts, args = getopt.getopt(sys.argv[1:], 'hVo:', ['help', 'version', 'output-file=']) except getopt.error, msg: usage(1, msg) outfile = None # parse options for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt in ('-V', '--version'): print >> sys.stderr, "msgfmt.py", __version__ sys.exit(0) elif opt in ('-o', '--output-file'): outfile = arg # do it if not args: print >> sys.stderr, 'No input file given' print >> sys.stderr, "Try `msgfmt --help' for more information." 
return for filename in args: make(filename, outfile) if __name__ == '__main__': main() �������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/locale/rebuildmo.py�����������������������������������������������������������������0000755�0000000�0000000�00000002732�11766731642�015642� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ rebuildmo.py script. This script builds the .mo files, from the .po files. Copyright 2009 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import glob import msgfmt import os #LOCALE_DIR = os.path.dirname(__file__) def rebuildmo(): lang_glob = 'imdbpy-*.po' created = [] for input_file in glob.glob(lang_glob): lang = input_file[7:-3] if not os.path.exists(lang): os.mkdir(lang) mo_dir = os.path.join(lang, 'LC_MESSAGES') if not os.path.exists(mo_dir): os.mkdir(mo_dir) output_file = os.path.join(mo_dir, 'imdbpy.mo') msgfmt.make(input_file, output_file) created.append(lang) return created if __name__ == '__main__': languages = rebuildmo() print 'Created locale for: %s.' % ' '.join(languages) ��������������������������������������IMDbPY-4.9/imdb/linguistics.py����������������������������������������������������������������������0000644�0000000�0000000�00000022004�11766731642�014745� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" linguistics module (imdb package). This module provides functions and data to handle in a smart way languages and articles (in various languages) at the beginning of movie titles. Copyright 2009-2012 Davide Alberani <da@erlug.linux.it> 2012 Alberto Malagoli <albemala AT gmail.com> 2009 H. Turgut Uyar <uyar@tekir.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ # List of generic articles used when the language of the title is unknown (or # we don't have information about articles in that language). # XXX: Managing titles in a lot of different languages, a function to recognize # an initial article can't be perfect; sometimes we'll stumble upon a short # word that is an article in some language, but it's not in another; in these # situations we have to choose if we want to interpret this little word # as an article or not (remember that we don't know what the original language # of the title was). # Example: 'en' is (I suppose) an article in Some Language. Unfortunately it # seems also to be a preposition in other languages (French?). # Running a script over the whole list of titles (and aliases), I've found # that 'en' is used as an article only 376 times, and as another thing 594 # times, so I've decided to _always_ consider 'en' as a non article. # # Here is a list of words that are _never_ considered as articles, complete # with the cound of times they are used in a way or another: # 'en' (376 vs 594), 'to' (399 vs 727), 'as' (198 vs 276), 'et' (79 vs 99), # 'des' (75 vs 150), 'al' (78 vs 304), 'ye' (14 vs 70), # 'da' (23 vs 298), "'n" (8 vs 12) # # I've left in the list 'i' (1939 vs 2151) and 'uno' (52 vs 56) # I'm not sure what '-al' is, and so I've left it out... 
# # Generic list of articles in utf-8 encoding: GENERIC_ARTICLES = ('the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les', 'i', 'o', 'ein', 'un', 'de', 'los', 'an', 'una', 'las', 'eine', 'den', 'het', 'gli', 'lo', 'os', 'ang', 'oi', 'az', 'een', 'ha-', 'det', 'ta', 'al-', 'mga', "un'", 'uno', 'ett', 'dem', 'egy', 'els', 'eines', '\xc3\x8f', '\xc3\x87', '\xc3\x94\xc3\xaf', '\xc3\x8f\xc3\xa9') # Lists of articles separated by language. If possible, the list should # be sorted by frequency (not very important, but...) # If you want to add a list of articles for another language, mail it # it at imdbpy-devel@lists.sourceforge.net; non-ascii articles must be utf-8 # encoded. LANG_ARTICLES = { 'English': ('the', 'a', 'an'), 'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'", 'uno'), 'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos', 'unas'), 'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'), 'Turkish': (), # Some languages doesn't have articles. } LANG_ARTICLESget = LANG_ARTICLES.get # Maps a language to countries where it is the main language. # If you want to add an entry for another language or country, mail it at # imdbpy-devel@lists.sourceforge.net . LANG_COUNTRIES = { 'English': ('Canada', 'Swaziland', 'Ghana', 'St. Lucia', 'Liberia', 'Jamaica', 'Bahamas', 'New Zealand', 'Lesotho', 'Kenya', 'Solomon Islands', 'United States', 'South Africa', 'St. Vincent and the Grenadines', 'Fiji', 'UK', 'Nigeria', 'Australia', 'USA', 'St. 
Kitts and Nevis', 'Belize', 'Sierra Leone', 'Gambia', 'Namibia', 'Micronesia', 'Kiribati', 'Grenada', 'Antigua and Barbuda', 'Barbados', 'Malta', 'Zimbabwe', 'Ireland', 'Uganda', 'Trinidad and Tobago', 'South Sudan', 'Guyana', 'Botswana', 'United Kingdom', 'Zambia'), 'Italian': ('Italy', 'San Marino', 'Vatican City'), 'Spanish': ('Spain', 'Mexico', 'Argentina', 'Bolivia', 'Guatemala', 'Uruguay', 'Peru', 'Cuba', 'Dominican Republic', 'Panama', 'Costa Rica', 'Ecuador', 'El Salvador', 'Chile', 'Equatorial Guinea', 'Spain', 'Colombia', 'Nicaragua', 'Venezuela', 'Honduras', 'Paraguay'), 'French': ('Cameroon', 'Burkina Faso', 'Dominica', 'Gabon', 'Monaco', 'France', "Cote d'Ivoire", 'Benin', 'Togo', 'Central African Republic', 'Mali', 'Niger', 'Congo, Republic of', 'Guinea', 'Congo, Democratic Republic of the', 'Luxembourg', 'Haiti', 'Chad', 'Burundi', 'Madagascar', 'Comoros', 'Senegal'), 'Portuguese': ('Portugal', 'Brazil', 'Sao Tome and Principe', 'Cape Verde', 'Angola', 'Mozambique', 'Guinea-Bissau'), 'German': ('Liechtenstein', 'Austria', 'West Germany', 'Switzerland', 'East Germany', 'Germany'), 'Arabic': ('Saudi Arabia', 'Kuwait', 'Jordan', 'Oman', 'Yemen', 'United Arab Emirates', 'Mauritania', 'Lebanon', 'Bahrain', 'Libya', 'Palestinian State (proposed)', 'Qatar', 'Algeria', 'Morocco', 'Iraq', 'Egypt', 'Djibouti', 'Sudan', 'Syria', 'Tunisia'), 'Turkish': ('Turkey', 'Azerbaijan'), 'Swahili': ('Tanzania',), 'Swedish': ('Sweden',), 'Icelandic': ('Iceland',), 'Estonian': ('Estonia',), 'Romanian': ('Romania',), 'Samoan': ('Samoa',), 'Slovenian': ('Slovenia',), 'Tok Pisin': ('Papua New Guinea',), 'Palauan': ('Palau',), 'Macedonian': ('Macedonia',), 'Hindi': ('India',), 'Dutch': ('Netherlands', 'Belgium', 'Suriname'), 'Marshallese': ('Marshall Islands',), 'Korean': ('Korea, North', 'Korea, South', 'North Korea', 'South Korea'), 'Vietnamese': ('Vietnam',), 'Danish': ('Denmark',), 'Khmer': ('Cambodia',), 'Lao': ('Laos',), 'Somali': ('Somalia',), 'Filipino': 
('Philippines',), 'Hungarian': ('Hungary',), 'Ukrainian': ('Ukraine',), 'Bosnian': ('Bosnia and Herzegovina',), 'Georgian': ('Georgia',), 'Lithuanian': ('Lithuania',), 'Malay': ('Brunei',), 'Tetum': ('East Timor',), 'Norwegian': ('Norway',), 'Armenian': ('Armenia',), 'Russian': ('Russia',), 'Slovak': ('Slovakia',), 'Thai': ('Thailand',), 'Croatian': ('Croatia',), 'Turkmen': ('Turkmenistan',), 'Nepali': ('Nepal',), 'Finnish': ('Finland',), 'Uzbek': ('Uzbekistan',), 'Albanian': ('Albania', 'Kosovo'), 'Hebrew': ('Israel',), 'Bulgarian': ('Bulgaria',), 'Greek': ('Cyprus', 'Greece'), 'Burmese': ('Myanmar',), 'Latvian': ('Latvia',), 'Serbian': ('Serbia',), 'Afar': ('Eritrea',), 'Catalan': ('Andorra',), 'Chinese': ('China', 'Taiwan'), 'Czech': ('Czech Republic', 'Czechoslovakia'), 'Bislama': ('Vanuatu',), 'Japanese': ('Japan',), 'Kinyarwanda': ('Rwanda',), 'Amharic': ('Ethiopia',), 'Persian': ('Afghanistan', 'Iran'), 'Tajik': ('Tajikistan',), 'Mongolian': ('Mongolia',), 'Dzongkha': ('Bhutan',), 'Urdu': ('Pakistan',), 'Polish': ('Poland',), 'Sinhala': ('Sri Lanka',), } # Maps countries to their main language. 
COUNTRY_LANG = {} for lang in LANG_COUNTRIES: for country in LANG_COUNTRIES[lang]: COUNTRY_LANG[country] = lang def toUnicode(articles): """Convert a list of articles utf-8 encoded to unicode strings.""" return tuple([art.decode('utf_8') for art in articles]) def toDicts(articles): """Given a list of utf-8 encoded articles, build two dictionary (one utf-8 encoded and another one with unicode keys) for faster matches.""" uArticles = toUnicode(articles) return dict([(x, x) for x in articles]), dict([(x, x) for x in uArticles]) def addTrailingSpace(articles): """From the given list of utf-8 encoded articles, return two lists (one utf-8 encoded and another one in unicode) where a space is added at the end - if the last char is not ' or -.""" _spArticles = [] _spUnicodeArticles = [] for article in articles: if article[-1] not in ("'", '-'): article += ' ' _spArticles.append(article) _spUnicodeArticles.append(article.decode('utf_8')) return _spArticles, _spUnicodeArticles # Caches. _ART_CACHE = {} _SP_ART_CACHE = {} def articlesDictsForLang(lang): """Return dictionaries of articles specific for the given language, or the default one if the language is not known.""" if lang in _ART_CACHE: return _ART_CACHE[lang] artDicts = toDicts(LANG_ARTICLESget(lang, GENERIC_ARTICLES)) _ART_CACHE[lang] = artDicts return artDicts def spArticlesForLang(lang): """Return lists of articles (plus optional spaces) specific for the given language, or the default one if the language is not known.""" if lang in _SP_ART_CACHE: return _SP_ART_CACHE[lang] spArticles = addTrailingSpace(LANG_ARTICLESget(lang, GENERIC_ARTICLES)) _SP_ART_CACHE[lang] = spArticles return spArticles 
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/_logging.py�������������������������������������������������������������������������0000644�0000000�0000000�00000004173�11766731642�014204� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" _logging module (imdb package). This module provides the logging facilities used by the imdb package. Copyright 2009-2010 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import logging LEVELS = {'debug': logging.DEBUG, 'info': logging.INFO, 'warn': logging.WARNING, 'warning': logging.WARNING, 'error': logging.ERROR, 'critical': logging.CRITICAL} imdbpyLogger = logging.getLogger('imdbpy') imdbpyStreamHandler = logging.StreamHandler() imdbpyFormatter = logging.Formatter('%(asctime)s %(levelname)s [%(name)s]' \ ' %(pathname)s:%(lineno)d: %(message)s') imdbpyStreamHandler.setFormatter(imdbpyFormatter) imdbpyLogger.addHandler(imdbpyStreamHandler) def setLevel(level): """Set logging level for the main logger.""" level = level.lower().strip() imdbpyLogger.setLevel(LEVELS.get(level, logging.NOTSET)) imdbpyLogger.log(imdbpyLogger.level, 'set logging threshold to "%s"', logging.getLevelName(imdbpyLogger.level)) #imdbpyLogger.setLevel(logging.DEBUG) # It can be an idea to have a single function to log and warn: #import warnings #def log_and_warn(msg, args=None, logger=None, level=None): # """Log the message and issue a warning.""" # if logger is None: # logger = imdbpyLogger # if level is None: # level = logging.WARNING # if args is None: # args = () # #warnings.warn(msg % args, stacklevel=0) # logger.log(level, msg % args) �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/Company.py��������������������������������������������������������������������������0000644�0000000�0000000�00000016331�11766731642�014024� 
0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" company module (imdb package). This module provides the company class, used to store information about a given company. Copyright 2008-2009 Davide Alberani <da@erlug.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ from copy import deepcopy from imdb.utils import analyze_company_name, build_company_name, \ flatten, _Container, cmpCompanies class Company(_Container): """A company. Every information about a company can be accessed as: companyObject['information'] to get a list of the kind of information stored in a company object, use the keys() method; some useful aliases are defined (as "also known as" for the "akas" key); see the keys_alias dictionary. """ # The default sets of information retrieved. default_info = ('main',) # Aliases for some not-so-intuitive keys. 
keys_alias = { 'distributor': 'distributors', 'special effects company': 'special effects companies', 'other company': 'miscellaneous companies', 'miscellaneous company': 'miscellaneous companies', 'other companies': 'miscellaneous companies', 'misc companies': 'miscellaneous companies', 'misc company': 'miscellaneous companies', 'production company': 'production companies'} keys_tomodify_list = () cmpFunct = cmpCompanies def _init(self, **kwds): """Initialize a company object. *companyID* -- the unique identifier for the company. *name* -- the name of the company, if not in the data dictionary. *myName* -- the nickname you use for this company. *myID* -- your personal id for this company. *data* -- a dictionary used to initialize the object. *notes* -- notes about the given company. *accessSystem* -- a string representing the data access system used. *titlesRefs* -- a dictionary with references to movies. *namesRefs* -- a dictionary with references to persons. *charactersRefs* -- a dictionary with references to companies. *modFunct* -- function called returning text fields. """ name = kwds.get('name') if name and not self.data.has_key('name'): self.set_name(name) self.companyID = kwds.get('companyID', None) self.myName = kwds.get('myName', u'') def _reset(self): """Reset the company object.""" self.companyID = None self.myName = u'' def set_name(self, name): """Set the name of the company.""" # XXX: convert name to unicode, if it's a plain string? # Company diverges a bit from other classes, being able # to directly handle its "notes". AND THAT'S PROBABLY A BAD IDEA! 
oname = name = name.strip() notes = u'' if name.endswith(')'): fparidx = name.find('(') if fparidx != -1: notes = name[fparidx:] name = name[:fparidx].rstrip() if self.notes: name = oname d = analyze_company_name(name) self.data.update(d) if notes and not self.notes: self.notes = notes def _additional_keys(self): """Valid keys to append to the data.keys() list.""" if self.data.has_key('name'): return ['long imdb name'] return [] def _getitem(self, key): """Handle special keys.""" ## XXX: can a company have an imdbIndex? if self.data.has_key('name'): if key == 'long imdb name': return build_company_name(self.data) return None def getID(self): """Return the companyID.""" return self.companyID def __nonzero__(self): """The company is "false" if the self.data does not contain a name.""" # XXX: check the name and the companyID? if self.data.get('name'): return 1 return 0 def __contains__(self, item): """Return true if this company and the given Movie are related.""" from Movie import Movie if isinstance(item, Movie): for m in flatten(self.data, yieldDictKeys=1, scalar=Movie): if item.isSame(m): return 1 return 0 def isSameName(self, other): """Return true if two company have the same name and/or companyID.""" if not isinstance(other, self.__class__): return 0 if self.data.has_key('name') and \ other.data.has_key('name') and \ build_company_name(self.data) == \ build_company_name(other.data): return 1 if self.accessSystem == other.accessSystem and \ self.companyID is not None and \ self.companyID == other.companyID: return 1 return 0 isSameCompany = isSameName def __deepcopy__(self, memo): """Return a deep copy of a company instance.""" c = Company(name=u'', companyID=self.companyID, myName=self.myName, myID=self.myID, data=deepcopy(self.data, memo), notes=self.notes, accessSystem=self.accessSystem, titlesRefs=deepcopy(self.titlesRefs, memo), namesRefs=deepcopy(self.namesRefs, memo), charactersRefs=deepcopy(self.charactersRefs, memo)) c.current_info = 
list(self.current_info) c.set_mod_funct(self.modFunct) return c def __repr__(self): """String representation of a Company object.""" r = '<Company id:%s[%s] name:_%s_>' % (self.companyID, self.accessSystem, self.get('long imdb name')) if isinstance(r, unicode): r = r.encode('utf_8', 'replace') return r def __str__(self): """Simply print the short name.""" return self.get('name', u'').encode('utf_8', 'replace') def __unicode__(self): """Simply print the short title.""" return self.get('name', u'') def summary(self): """Return a string with a pretty-printed summary for the company.""" if not self: return u'' s = u'Company\n=======\nName: %s\n' % \ self.get('name', u'') for k in ('distributor', 'production company', 'miscellaneous company', 'special effects company'): d = self.get(k, [])[:5] if not d: continue s += u'Last movies from this company (%s): %s.\n' % \ (k, u'; '.join([x.get('long imdb title', u'') for x in d])) return s �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/imdb/helpers.py��������������������������������������������������������������������������0000644�0000000�0000000�00000061011�11766731642�014053� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������""" helpers module (imdb package). This module provides functions not used directly by the imdb package, but useful for IMDbPY-based programs. 
Copyright 2006-2012 Davide Alberani <da@erlug.linux.it> 2012 Alberto Malagoli <albemala AT gmail.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ # XXX: find better names for the functions in this modules. import re import difflib from cgi import escape import gettext from gettext import gettext as _ gettext.textdomain('imdbpy') # The modClearRefs can be used to strip names and titles references from # the strings in Movie and Person objects. from imdb.utils import modClearRefs, re_titleRef, re_nameRef, \ re_characterRef, _tagAttr, _Container, TAGS_TO_MODIFY from imdb import IMDb, imdbURL_movie_base, imdbURL_person_base, \ imdbURL_character_base import imdb.locale from imdb.linguistics import COUNTRY_LANG from imdb.Movie import Movie from imdb.Person import Person from imdb.Character import Character from imdb.Company import Company from imdb.parser.http.utils import re_entcharrefssub, entcharrefs, \ subXMLRefs, subSGMLRefs from imdb.parser.http.bsouplxml.etree import BeautifulSoup # An URL, more or less. 
_re_href = re.compile(r'(http://.+?)(?=\s|$)', re.I)
_re_hrefsub = _re_href.sub


def makeCgiPrintEncoding(encoding):
    """Make a function to pretty-print strings for the web, using
    the given output encoding."""
    def cgiPrint(s):
        """HTML-escape the given string and encode it with the enclosed
        encoding, replacing chars outside the charset with XML char
        references."""
        s = escape(s, quote=1)
        if isinstance(s, unicode):
            s = s.encode(encoding, 'xmlcharrefreplace')
        return s
    return cgiPrint

# cgiPrint uses the latin_1 encoding.
cgiPrint = makeCgiPrintEncoding('latin_1')

# Regular expression for %(varname)s substitutions.
re_subst = re.compile(r'%\((.+?)\)s')
# Regular expression for <if condition>....</if condition> clauses.
re_conditional = re.compile(r'<if\s+(.+?)\s*>(.+?)</if\s+\1\s*>')


def makeTextNotes(replaceTxtNotes):
    """Create a function useful to handle text[::optional_note] values.
    replaceTxtNotes is a format string, which can include the following
    values: %(text)s and %(notes)s.
    Portions of the text can be conditionally excluded, if one of the
    values is absent.  E.g.: <if notes>[%(notes)s]</if notes> will be
    replaced with '[notes]' if notes exists, or by an empty string
    otherwise.
    The returned function is suitable be passed as applyToValues argument
    of the makeObject2Txt function."""
    def _replacer(s):
        outS = replaceTxtNotes
        if not isinstance(s, (unicode, str)):
            return s
        # Split the value in a (text, notes) pair; notes are optional.
        ssplit = s.split('::', 1)
        text = ssplit[0]
        # Used to keep track of text and note existence.
        keysDict = {}
        if text:
            keysDict['text'] = True
            outS = outS.replace('%(text)s', text)
        if len(ssplit) == 2:
            keysDict['notes'] = True
            outS = outS.replace('%(notes)s', ssplit[1])
        else:
            outS = outS.replace('%(notes)s', u'')
        def _excludeFalseConditionals(matchobj):
            # Return an empty string if the conditional is false/empty.
            if matchobj.group(1) in keysDict:
                return matchobj.group(2)
            return u''
        # Strip/keep <if ...>...</if ...> sections, repeatedly, to
        # also handle nested conditionals.
        while re_conditional.search(outS):
            outS = re_conditional.sub(_excludeFalseConditionals, outS)
        return outS
    return _replacer


def makeObject2Txt(movieTxt=None, personTxt=None, characterTxt=None,
                   companyTxt=None, joiner=' / ',
                   applyToValues=lambda x: x, _recurse=True):
    """Return a function useful to pretty-print Movie, Person,
    Character and Company instances.

    *movieTxt* -- how to format a Movie object.
    *personTxt* -- how to format a Person object.
    *characterTxt* -- how to format a Character object.
    *companyTxt* -- how to format a Company object.
    *joiner* -- string used to join a list of objects.
    *applyToValues* -- function to apply to values.
    *_recurse* -- if True (default) manage only the given object.
    """
    # Some useful defaults.
    if movieTxt is None:
        movieTxt = '%(long imdb title)s'
    if personTxt is None:
        personTxt = '%(long imdb name)s'
    if characterTxt is None:
        characterTxt = '%(long imdb name)s'
    if companyTxt is None:
        companyTxt = '%(long imdb name)s'
    def object2txt(obj, _limitRecursion=None):
        """Pretty-print objects."""
        # Prevent unlimited recursion.
        if _limitRecursion is None:
            _limitRecursion = 0
        elif _limitRecursion > 5:
            return u''
        _limitRecursion += 1
        if isinstance(obj, (list, tuple)):
            return joiner.join([object2txt(o, _limitRecursion=_limitRecursion)
                                for o in obj])
        elif isinstance(obj, dict):
            # XXX: not exactly nice, neither useful, I fear.
            return joiner.join([u'%s::%s' %
                    (object2txt(k, _limitRecursion=_limitRecursion),
                     object2txt(v, _limitRecursion=_limitRecursion))
                    for k, v in obj.items()])
        # NOTE(review): objData is populated but never read below;
        # kept for backward compatibility with the original code.
        objData = {}
        if isinstance(obj, Movie):
            objData['movieID'] = obj.movieID
            outs = movieTxt
        elif isinstance(obj, Person):
            objData['personID'] = obj.personID
            outs = personTxt
        elif isinstance(obj, Character):
            objData['characterID'] = obj.characterID
            outs = characterTxt
        elif isinstance(obj, Company):
            objData['companyID'] = obj.companyID
            outs = companyTxt
        else:
            # Not one of the known containers: return it untouched.
            return obj
        def _excludeFalseConditionals(matchobj):
            # Return an empty string if the conditional is false/empty.
            condition = matchobj.group(1)
            proceed = obj.get(condition) or getattr(obj, condition, None)
            if proceed:
                return matchobj.group(2)
            else:
                return u''
            # FIX: removed an unreachable 'return matchobj.group(2)' that
            # followed the exhaustive if/else above.
        while re_conditional.search(outs):
            outs = re_conditional.sub(_excludeFalseConditionals, outs)
        for key in re_subst.findall(outs):
            value = obj.get(key) or getattr(obj, key, None)
            if not isinstance(value, (unicode, str)):
                if not _recurse:
                    if value:
                        value = unicode(value)
                if value:
                    value = object2txt(value, _limitRecursion=_limitRecursion)
            elif value:
                value = applyToValues(unicode(value))
            if not value:
                value = u''
            elif not isinstance(value, (unicode, str)):
                value = unicode(value)
            outs = outs.replace(u'%(' + key + u')s', value)
        return outs
    return object2txt


def makeModCGILinks(movieTxt, personTxt, characterTxt=None,
                    encoding='latin_1'):
    """Make a function used to pretty-print movies and persons refereces;
    movieTxt and personTxt are the strings used for the substitutions.
    movieTxt must contains %(movieID)s and %(title)s, while personTxt
    must contains %(personID)s and %(name)s and characterTxt %(characterID)s
    and %(name)s; characterTxt is optional, for backward compatibility."""
    _cgiPrint = makeCgiPrintEncoding(encoding)
    def modCGILinks(s, titlesRefs, namesRefs, characterRefs=None):
        """Substitute movies and persons references."""
        if characterRefs is None:
            characterRefs = {}
        # XXX: look ma'... more nested scopes! <g>
        def _replaceMovie(match):
            to_replace = match.group(1)
            item = titlesRefs.get(to_replace)
            if item:
                movieID = item.movieID
                to_replace = movieTxt % {'movieID': movieID,
                                         'title': unicode(_cgiPrint(to_replace),
                                                        encoding,
                                                        'xmlcharrefreplace')}
            return to_replace
        def _replacePerson(match):
            to_replace = match.group(1)
            item = namesRefs.get(to_replace)
            if item:
                personID = item.personID
                to_replace = personTxt % {'personID': personID,
                                          'name': unicode(_cgiPrint(to_replace),
                                                        encoding,
                                                        'xmlcharrefreplace')}
            return to_replace
        def _replaceCharacter(match):
            to_replace = match.group(1)
            if characterTxt is None:
                # Backward compatibility: no character template given.
                return to_replace
            item = characterRefs.get(to_replace)
            if item:
                characterID = item.characterID
                if characterID is None:
                    return to_replace
                to_replace = characterTxt % {'characterID': characterID,
                                             'name': unicode(_cgiPrint(to_replace),
                                                            encoding,
                                                            'xmlcharrefreplace')}
            return to_replace
        # FIX: escape raw angle brackets before injecting our own HTML;
        # the original line replaced '<' with '<' (a no-op, apparently
        # the result of entity garbling).
        s = s.replace('<', '&lt;').replace('>', '&gt;')
        s = _re_hrefsub(r'<a href="\1">\1</a>', s)
        s = re_titleRef.sub(_replaceMovie, s)
        s = re_nameRef.sub(_replacePerson, s)
        s = re_characterRef.sub(_replaceCharacter, s)
        return s
    modCGILinks.movieTxt = movieTxt
    modCGILinks.personTxt = personTxt
    modCGILinks.characterTxt = characterTxt
    return modCGILinks

# links to the imdb.com web site.
# Default HTML templates pointing back to the imdb.com web site.
_movieTxt = '<a href="' + imdbURL_movie_base + 'tt%(movieID)s">%(title)s</a>'
_personTxt = '<a href="' + imdbURL_person_base + 'nm%(personID)s">%(name)s</a>'
_characterTxt = '<a href="' + imdbURL_character_base + \
                'ch%(characterID)s">%(name)s</a>'
# Pre-built reference substituters using the templates above.
modHtmlLinks = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt,
                               characterTxt=_characterTxt)
modHtmlLinksASCII = makeModCGILinks(movieTxt=_movieTxt, personTxt=_personTxt,
                                    characterTxt=_characterTxt,
                                    encoding='ascii')

# Extend the known entity references with the XML predefined entities
# (and their numeric forms), so that _every_ reference can be resolved.
everyentcharrefs = entcharrefs.copy()
for k, v in {'lt': u'<', 'gt': u'>', 'amp': u'&',
             'quot': u'"', 'apos': u'\''}.items():
    everyentcharrefs[k] = v
    everyentcharrefs['#%s' % ord(v)] = v
everyentcharrefsget = everyentcharrefs.get
# Matches any known named reference, &#160; or any short decimal reference.
re_everyentcharrefs = re.compile('&(%s|\#160|\#\d{1,5});' %
                                 '|'.join(map(re.escape, everyentcharrefs)))
re_everyentcharrefssub = re_everyentcharrefs.sub


def _replAllXMLRef(match):
    """Replace the matched XML reference."""
    ref = match.group(1)
    value = everyentcharrefsget(ref)
    if value is None:
        if ref[0] == '#':
            # Unknown numeric reference: decode it directly.
            # NOTE(review): py2 unichr; would be chr() on py3.
            return unichr(int(ref[1:]))
        else:
            # Unknown named reference: leave it as-is.
            return ref
    return value


def subXMLHTMLSGMLRefs(s):
    """Return the given string with XML/HTML/SGML entity and char references
    replaced."""
    return re_everyentcharrefssub(_replAllXMLRef, s)


def sortedSeasons(m):
    """Return a sorted list of seasons of the given series.

    NOTE(review): py2 only - dict.keys() must return a list for the
    in-place .sort() call to work."""
    seasons = m.get('episodes', {}).keys()
    seasons.sort()
    return seasons


def sortedEpisodes(m, season=None):
    """Return a sorted list of episodes of the given series,
    considering only the specified season(s) (every season, if None).

    season can be a single season number or a list/tuple of them."""
    episodes = []
    seasons = season
    if season is None:
        seasons = sortedSeasons(m)
    else:
        if not isinstance(season, (tuple, list)):
            seasons = [season]
    for s in seasons:
        # Sort episodes by their index within the season.
        eps_indx = m.get('episodes', {}).get(s, {}).keys()
        eps_indx.sort()
        for e in eps_indx:
            episodes.append(m['episodes'][s][e])
    return episodes


# Idea and portions of the code courtesy of none none (dclist at gmail.com)
# Matches an IMDb identifier inside an URL: type prefix + 7 digits.
_re_imdbIDurl = re.compile(r'\b(nm|tt|ch|co)([0-9]{7})\b')

def get_byURL(url, info=None, args=None, kwds=None):
    """Return a Movie, Person, Character or Company object for the given URL;
    info is the info set to retrieve, args and kwds are respectively a list
    and a dictionary or arguments to initialize the data access system.
    Returns None if unable to correctly parse the url; can raise
    exceptions if unable to retrieve the data."""
    if args is None:
        args = []
    if kwds is None:
        kwds = {}
    ia = IMDb(*args, **kwds)
    match = _re_imdbIDurl.search(url)
    if not match:
        return None
    # The 2-letter prefix selects the object type; the digits are the ID.
    imdbtype = match.group(1)
    imdbID = match.group(2)
    if imdbtype == 'tt':
        return ia.get_movie(imdbID, info=info)
    elif imdbtype == 'nm':
        return ia.get_person(imdbID, info=info)
    elif imdbtype == 'ch':
        return ia.get_character(imdbID, info=info)
    elif imdbtype == 'co':
        return ia.get_company(imdbID, info=info)
    return None


# Idea and portions of code courtesy of Basil Shubin.
# Beware that these information are now available directly by
# the Movie/Person/Character instances.
def fullSizeCoverURL(obj):
    """Given an URL string or a Movie, Person or Character instance,
    returns an URL to the full-size version of the cover/headshot,
    or None otherwise.  This function is obsolete: the same information
    are available as keys: 'full-size cover url' and 'full-size headshot',
    respectively for movies and persons/characters."""
    if isinstance(obj, Movie):
        coverUrl = obj.get('cover url')
    elif isinstance(obj, (Person, Character)):
        coverUrl = obj.get('headshot')
    else:
        # Assume obj is already an URL string.
        coverUrl = obj
    if not coverUrl:
        return None
    # Strip the thumbnail-size suffix from the URL.
    return _Container._re_fullsizeURL.sub('', coverUrl)


def keyToXML(key):
    """Return a key (the ones used to access information in Movie and
    other classes instances) converted to the style of the XML output."""
    return _tagAttr(key, '')[0]


def translateKey(key):
    """Translate a given key."""
    return _(keyToXML(key))


# Maps tags to classes.
_MAP_TOP_OBJ = {
    'person': Person,
    'movie': Movie,
    'character': Character,
    'company': Company
}
# Tags to be converted to lists.
# Tags whose children must be collected into a list (plus the top-level
# container tags themselves).
_TAGS_TO_LIST = dict([(x[0], None) for x in TAGS_TO_MODIFY.values()])
_TAGS_TO_LIST.update(_MAP_TOP_OBJ)


def tagToKey(tag):
    """Return the name of the tag, taking it from the 'key' attribute,
    if present (converted to int when keytype says so)."""
    keyAttr = tag.get('key')
    if keyAttr:
        if tag.get('keytype') == 'int':
            keyAttr = int(keyAttr)
        return keyAttr
    return tag.name


def _valueWithType(tag, tagValue):
    """Return tagValue, handling some type conversions (int/float,
    driven by the tag's 'type' attribute)."""
    tagType = tag.get('type')
    if tagType == 'int':
        tagValue = int(tagValue)
    elif tagType == 'float':
        tagValue = float(tagValue)
    return tagValue


# Extra tags to get (if values were not already read from title/name).
_titleTags = ('imdbindex', 'kind', 'year')
# FIX: ('imdbindex') is a plain string, not a tuple; iterating it would
# search for tags named 'i', 'm', 'd', ... instead of 'imdbindex'.
_nameTags = ('imdbindex',)
_companyTags = ('imdbindex', 'country')


def parseTags(tag, _topLevel=True, _as=None, _infoset2keys=None,
              _key2infoset=None):
    """Recursively parse a tree of tags."""
    # The returned object (usually a _Container subclass, but it can
    # be a string, an int, a float, a list or a dictionary).
    item = None
    if _infoset2keys is None:
        _infoset2keys = {}
    if _key2infoset is None:
        _key2infoset = {}
    name = tagToKey(tag)
    firstChild = tag.find(recursive=False)
    tagStr = (tag.string or u'').strip()
    if not tagStr and name == 'item':
        # Handles 'item' tags containing text and a 'notes' sub-tag.
        tagContent = tag.contents[0]
        if isinstance(tagContent, BeautifulSoup.NavigableString):
            tagStr = (unicode(tagContent) or u'').strip()
    infoset = tag.get('infoset')
    if infoset:
        # Remember which info set this key belongs to.
        _key2infoset[name] = infoset
        _infoset2keys.setdefault(infoset, []).append(name)
    # Here we use tag.name to avoid tags like <item title="company">
    if tag.name in _MAP_TOP_OBJ:
        # One of the subclasses of _Container.
        item = _MAP_TOP_OBJ[name]()
        itemAs = tag.get('access-system')
        if itemAs:
            if not _as:
                _as = itemAs
        else:
            itemAs = _as
        item.accessSystem = itemAs
        tagsToGet = []
        theID = tag.get('id')
        if name == 'movie':
            item.movieID = theID
            tagsToGet = _titleTags
            # FIX: use the non-recursive find (consistent with the
            # person/character branches); the recursive tag.title could
            # pick up a title nested in a sub-tag.
            theTitle = tag.find('title', recursive=False)
            if theTitle:
                item.set_title(theTitle.string)
                theTitle.extract()
        else:
            if name == 'person':
                item.personID = theID
                tagsToGet = _nameTags
                theName = tag.find('long imdb canonical name',
                                   recursive=False)
                if not theName:
                    theName = tag.find('name', recursive=False)
            elif name == 'character':
                item.characterID = theID
                tagsToGet = _nameTags
                theName = tag.find('name', recursive=False)
            elif name == 'company':
                item.companyID = theID
                tagsToGet = _companyTags
                theName = tag.find('name', recursive=False)
            if theName:
                item.set_name(theName.string)
            if theName:
                theName.extract()
        # Pick up extra simple tags not already set by title/name parsing.
        for t in tagsToGet:
            if t in item.data:
                continue
            dataTag = tag.find(t, recursive=False)
            if dataTag:
                item.data[tagToKey(dataTag)] = _valueWithType(dataTag,
                                                            dataTag.string)
        if tag.notes:
            item.notes = tag.notes.string
            tag.notes.extract()
        episodeOf = tag.find('episode-of', recursive=False)
        if episodeOf:
            item.data['episode of'] = parseTags(episodeOf, _topLevel=False,
                                                _as=_as,
                                                _infoset2keys=_infoset2keys,
                                                _key2infoset=_key2infoset)
            episodeOf.extract()
        cRole = tag.find('current-role', recursive=False)
        if cRole:
            cr = parseTags(cRole, _topLevel=False, _as=_as,
                           _infoset2keys=_infoset2keys,
                           _key2infoset=_key2infoset)
            item.currentRole = cr
            cRole.extract()
        # XXX: big assumption, here.  What about Movie instances used
        # as keys in dictionaries?  What about other keys (season and
        # episode number, for example?)
        if not _topLevel:
            #tag.extract()
            return item
        _adder = lambda key, value: item.data.update({key: value})
    elif tagStr:
        if tag.notes:
            notes = (tag.notes.string or u'').strip()
            if notes:
                tagStr += u'::%s' % notes
        else:
            tagStr = _valueWithType(tag, tagStr)
        return tagStr
    elif firstChild:
        firstChildName = tagToKey(firstChild)
        if firstChildName in _TAGS_TO_LIST:
            item = []
            _adder = lambda key, value: item.append(value)
        else:
            item = {}
            _adder = lambda key, value: item.update({key: value})
    else:
        item = {}
        _adder = lambda key, value: item.update({name: value})
    for subTag in tag(recursive=False):
        subTagKey = tagToKey(subTag)
        # Exclude dinamically generated keys.
        if tag.name in _MAP_TOP_OBJ and subTagKey in item._additional_keys():
            continue
        subItem = parseTags(subTag, _topLevel=False, _as=_as,
                            _infoset2keys=_infoset2keys,
                            _key2infoset=_key2infoset)
        if subItem:
            _adder(subTagKey, subItem)
    if _topLevel and name in _MAP_TOP_OBJ:
        # Add information about 'info sets', but only to the top-level object.
        item.infoset2keys = _infoset2keys
        item.key2infoset = _key2infoset
        item.current_info = _infoset2keys.keys()
    return item


def parseXML(xml):
    """Parse a XML string, returning an appropriate object (usually an
    instance of a subclass of _Container."""
    xmlObj = BeautifulSoup.BeautifulStoneSoup(xml,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES)
    if xmlObj:
        mainTag = xmlObj.find()
        if mainTag:
            return parseTags(mainTag)
    return None


# Matches a "(language title)" marker inside an AKA note.
_re_akas_lang = re.compile('(?:[(])([a-zA-Z]+?)(?: title[)])')
# Matches any parenthesized group, used to strip them from country notes.
_re_akas_country = re.compile('\(.*?\)')

# akasLanguages, sortAKAsBySimilarity and getAKAsInLanguage code
# copyright of Alberto Malagoli (refactoring by Davide Alberani).
def akasLanguages(movie):
    """Given a movie, return a list of tuples in (lang, AKA) format;
    lang can be None, if unable to detect."""
    lang_and_aka = []
    akas = set((movie.get('akas') or []) +
               (movie.get('akas from release info') or []))
    for aka in akas:
        # Split the AKA into title and notes.
        aka = aka.encode('utf8').split('::')
        # sometimes there is no countries information
        if len(aka) == 2:
            # search for something like "(... title)" where ... is a language
            language = _re_akas_lang.search(aka[1])
            if language:
                language = language.groups()[0]
            else:
                # split countries using , and keep only the first one
                # (it's sufficient)
                country = aka[1].split(',')[0]
                # remove parenthesis
                country = _re_akas_country.sub('', country).strip()
                # given the country, get corresponding language from dictionary
                language = COUNTRY_LANG.get(country)
        else:
            language = None
        lang_and_aka.append((language, aka[0].decode('utf8')))
    return lang_and_aka


def sortAKAsBySimilarity(movie, title, _titlesOnly=True, _preferredLang=None):
    """Return a list of movie AKAs, sorted by their similarity to
    the given title.
    If _titlesOnly is not True, similarity information are returned.
    If _preferredLang is specified, AKAs in the given language will get
    a higher score.
    The return is a list of title, or a list of tuples if _titlesOnly is False."""
    language = movie.guessLanguage()
    # estimate string distance between current title and given title
    m_title = movie['title'].lower()
    l_title = title.lower()
    if isinstance(l_title, unicode):
        l_title = l_title.encode('utf8')
    scores = []
    score = difflib.SequenceMatcher(None, m_title.encode('utf8'),
                                    l_title).ratio()
    # set original title and corresponding score as the best match
    # for given title
    scores.append((score, movie['title'], None))
    for language, aka in akasLanguages(movie):
        # estimate string distance between current title and given title
        m_title = aka.lower()
        if isinstance(m_title, unicode):
            m_title = m_title.encode('utf8')
        score = difflib.SequenceMatcher(None, m_title, l_title).ratio()
        # if current language is the same as the given one, increase score
        if _preferredLang and _preferredLang == language:
            score += 1
        scores.append((score, aka, language))
    scores.sort(reverse=True)
    if _titlesOnly:
        return [x[1] for x in scores]
    return scores


def getAKAsInLanguage(movie, lang, _searchedTitle=None):
    """Return a list of AKAs of a movie, in the specified language.
    If _searchedTitle is given, the AKAs are sorted by their similarity
    to it."""
    akas = []
    for language, aka in akasLanguages(movie):
        if lang == language:
            akas.append(aka)
    if _searchedTitle:
        scores = []
        if isinstance(_searchedTitle, unicode):
            _searchedTitle = _searchedTitle.encode('utf8')
        for aka in akas:
            m_aka = aka
            # FIX: the original called isinstance() without the type
            # argument, raising a TypeError.
            if isinstance(m_aka, unicode):
                m_aka = m_aka.encode('utf8')
            # FIX: the original passed two arguments to list.append and
            # never called .ratio(); append a (similarity, aka) tuple so
            # the sort below orders by similarity.
            scores.append((difflib.SequenceMatcher(None, m_aka.lower(),
                           _searchedTitle.lower()).ratio(), aka))
        scores.sort(reverse=True)
        akas = [x[1] for x in scores]
    return akas
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/setup.cfg��������������������������������������������������������������������������������0000644�0000000�0000000�00000000617�11766731642�012752� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������[egg_info] #tag_build = dev tag_date = false tag_svn_revision = false [bdist_rpm] vendor = Davide Alberani <da@erlug.linux.it> # Comment out the doc_files entry if you don't want to install # the documentation. doc_files = docs/* # Comment out the icon entry if you don't want to install the icon. icon = docs/imdbpyico.xpm [bdist_wininst] # Bitmap for the installer. 
bitmap = docs/imdbpywin.bmp �����������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/�������������������������������������������������������������������������������������0000755�0000000�0000000�00000000000�11766731642�011675� 5����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/search_movie.py����������������������������������������������������������������������0000755�0000000�0000000�00000002532�11766731642�014720� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ search_movie.py Usage: search_movie "movie title" Search for the given title and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "movie title"' % sys.argv[0] sys.exit(2) title = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() title = unicode(title, in_encoding, 'replace') try: # Do the search, and get the results (a list of Movie objects). results = i.search_movie(title) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) # Print the results. print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], title.encode(out_encoding, 'replace')) print 'movieID\t: imdbID : title' # Print the long imdb title for every movie. for movie in results: outp = u'%s\t: %s : %s' % (movie.movieID, i.get_imdbID(movie), movie['long imdb title']) print outp.encode(out_encoding, 'replace') ����������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/imdbpy2sql.py������������������������������������������������������������������������0000755�0000000�0000000�00000340161�11766731642�014345� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ imdbpy2sql.py script. This script puts the data of the plain text data files into a SQL database. Copyright 2005-2012 Davide Alberani <da@erlug.linux.it> 2006 Giuseppe "Cowo" Corbelli <cowo --> lugbs.linux.it> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA """ import os import sys import getopt import time import re import warnings import anydbm from itertools import islice, chain try: import cPickle as pickle except ImportError: import pickle try: from hashlib import md5 except ImportError: from md5 import md5 from gzip import GzipFile from types import UnicodeType from imdb.parser.sql.dbschema import * from imdb.parser.sql import get_movie_data, soundex from imdb.utils import analyze_title, analyze_name, date_and_notes, \ build_name, build_title, normalizeName, normalizeTitle, _articles, \ build_company_name, analyze_company_name, canonicalTitle from imdb._exceptions import IMDbParserError, IMDbError HELP = """imdbpy2sql.py usage: %s -d /directory/with/PlainTextDataFiles/ -u URI [-c /directory/for/CSV_files] [-o sqlobject,sqlalchemy] [-i table,dbm] [--CSV-OPTIONS] [--COMPATIBILITY-OPTIONS] # NOTE: URI is something along the line: scheme://[user[:password]@]host[:port]/database[?parameters] Examples: mysql://user:password@host/database postgres://user:password@host/database sqlite:/tmp/imdb.db sqlite:/C|/full/path/to/database # NOTE: CSV mode (-c path): A directory is used to store CSV files; on supported database servers it should be really fast. # NOTE: ORMs (-o orm): Valid options are 'sqlobject', 'sqlalchemy' or the preferred order separating the voices with a comma. # NOTE: imdbIDs store/restore (-i method): Valid options are 'table' (imdbIDs stored in a temporary table of the database) or 'dbm' (imdbIDs stored on a dbm file - this is the default if CSV is used). # NOTE: --CSV-OPTIONS can be: --csv-ext STRING files extension (.csv) --csv-only-write exit after the CSV files are written. --csv-only-load load an existing set of CSV files. 
# NOTE: --COMPATIBILITY-OPTIONS can be one of: --mysql-innodb insert data into a MySQL MyISAM db, and then convert it to InnoDB. --mysql-force-myisam force the creation of MyISAM tables. --ms-sqlserver compatibility mode for Microsoft SQL Server and SQL Express. --sqlite-transactions uses transactions, to speed-up SQLite. See README.sqldb for more information. """ % sys.argv[0] # Directory containing the IMDb's Plain Text Data Files. IMDB_PTDF_DIR = None # URI used to connect to the database. URI = None # ORM to use (list of options) and actually used (string). USE_ORM = None USED_ORM = None # List of tables of the database. DB_TABLES = [] # Max allowed recursion, inserting data. MAX_RECURSION = 10 # Method used to (re)store imdbIDs. IMDBIDS_METHOD = None # If set, this directory is used to output CSV files. CSV_DIR = None CSV_CURS = None CSV_ONLY_WRITE = False CSV_ONLY_LOAD = False CSV_EXT = '.csv' CSV_EOL = '\n' CSV_DELIMITER = ',' CSV_QUOTE = '"' CSV_ESCAPE = '"' CSV_NULL = 'NULL' CSV_QUOTEINT = False CSV_LOAD_SQL = None CSV_MYSQL = "LOAD DATA LOCAL INFILE '%(file)s' INTO TABLE `%(table)s` FIELDS TERMINATED BY '%(delimiter)s' ENCLOSED BY '%(quote)s' ESCAPED BY '%(escape)s' LINES TERMINATED BY '%(eol)s'" CSV_PGSQL = "COPY %(table)s FROM '%(file)s' WITH DELIMITER AS '%(delimiter)s' NULL AS '%(null)s' QUOTE AS '%(quote)s' ESCAPE AS '%(escape)s' CSV" CSV_DB2 = "CALL SYSPROC.ADMIN_CMD('LOAD FROM %(file)s OF del MODIFIED BY lobsinfile INSERT INTO %(table)s')" # Temporary fix for old style titles. #FIX_OLD_STYLE_TITLES = True # Store custom queries specified on the command line. CUSTOM_QUERIES = {} # Allowed time specification, for custom queries. 
ALLOWED_TIMES = ('BEGIN', 'BEFORE_DROP', 'BEFORE_CREATE', 'AFTER_CREATE', 'BEFORE_MOVIES', 'BEFORE_COMPANIES', 'BEFORE_CAST', 'BEFORE_RESTORE', 'BEFORE_INDEXES', 'END', 'BEFORE_MOVIES_TODB', 'AFTER_MOVIES_TODB', 'BEFORE_PERSONS_TODB', 'AFTER_PERSONS_TODB','BEFORE_SQLDATA_TODB', 'AFTER_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB', 'AFTER_AKAMOVIES_TODB', 'BEFORE_CHARACTERS_TODB', 'AFTER_CHARACTERS_TODB', 'BEFORE_COMPANIES_TODB', 'AFTER_COMPANIES_TODB', 'BEFORE_EVERY_TODB', 'AFTER_EVERY_TODB', 'BEFORE_CSV_LOAD', 'BEFORE_CSV_TODB', 'AFTER_CSV_TODB') # Shortcuts for some compatibility options. MYSQLFORCEMYISAM_OPTS = ['-e', 'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;'] MYSQLINNODB_OPTS = ['-e', 'AFTER_CREATE:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=MyISAM;', '-e', 'BEFORE_INDEXES:FOR_EVERY_TABLE:ALTER TABLE %(table)s ENGINE=InnoDB;'] SQLSERVER_OPTS = ['-e', 'BEFORE_EVERY_TODB:SET IDENTITY_INSERT %(table)s ON;', '-e', 'AFTER_EVERY_TODB:SET IDENTITY_INSERT %(table)s OFF;'] SQLITE_OPTS = ['-e', 'BEGIN:PRAGMA synchronous = OFF;', '-e', 'BEFORE_EVERY_TODB:BEGIN TRANSACTION;', '-e', 'AFTER_EVERY_TODB:COMMIT;', '-e', 'BEFORE_INDEXES:BEGIN TRANSACTION;', 'e', 'END:COMMIT;'] if '--mysql-innodb' in sys.argv[1:]: sys.argv += MYSQLINNODB_OPTS if '--mysql-force-myisam' in sys.argv[1:]: sys.argv += MYSQLFORCEMYISAM_OPTS if '--ms-sqlserver' in sys.argv[1:]: sys.argv += SQLSERVER_OPTS if '--sqlite-transactions' in sys.argv[1:]: sys.argv += SQLITE_OPTS # Manage arguments list. try: optlist, args = getopt.getopt(sys.argv[1:], 'u:d:e:o:c:i:h', ['uri=', 'data=', 'execute=', 'mysql-innodb', 'ms-sqlserver', 'sqlite-transactions', 'fix-old-style-titles', 'mysql-force-myisam', 'orm', 'csv-only-write', 'csv-only-load', 'csv=', 'csv-ext=', 'imdbids=', 'help']) except getopt.error, e: print 'Troubles with arguments.' 
print HELP sys.exit(2) for opt in optlist: if opt[0] in ('-d', '--data'): IMDB_PTDF_DIR = opt[1] elif opt[0] in ('-u', '--uri'): URI = opt[1] elif opt[0] in ('-c', '--csv'): CSV_DIR = opt[1] elif opt[0] == '--csv-ext': CSV_EXT = opt[1] elif opt[0] in ('-i', '--imdbids'): IMDBIDS_METHOD = opt[1] elif opt[0] in ('-e', '--execute'): if opt[1].find(':') == -1: print 'WARNING: wrong command syntax: "%s"' % opt[1] continue when, cmd = opt[1].split(':', 1) if when not in ALLOWED_TIMES: print 'WARNING: unknown time: "%s"' % when continue if when == 'BEFORE_EVERY_TODB': for nw in ('BEFORE_MOVIES_TODB', 'BEFORE_PERSONS_TODB', 'BEFORE_SQLDATA_TODB', 'BEFORE_AKAMOVIES_TODB', 'BEFORE_CHARACTERS_TODB', 'BEFORE_COMPANIES_TODB'): CUSTOM_QUERIES.setdefault(nw, []).append(cmd) elif when == 'AFTER_EVERY_TODB': for nw in ('AFTER_MOVIES_TODB', 'AFTER_PERSONS_TODB', 'AFTER_SQLDATA_TODB', 'AFTER_AKAMOVIES_TODB', 'AFTER_CHARACTERS_TODB', 'AFTER_COMPANIES_TODB'): CUSTOM_QUERIES.setdefault(nw, []).append(cmd) else: CUSTOM_QUERIES.setdefault(when, []).append(cmd) elif opt[0] in ('-o', '--orm'): USE_ORM = opt[1].split(',') elif opt[0] == '--fix-old-style-titles': warnings.warn('The --fix-old-style-titles argument is obsolete.') elif opt[0] == '--csv-only-write': CSV_ONLY_WRITE = True elif opt[0] == '--csv-only-load': CSV_ONLY_LOAD = True elif opt[0] in ('-h', '--help'): print HELP sys.exit(0) if IMDB_PTDF_DIR is None: print 'You must supply the directory with the plain text data files' print HELP sys.exit(2) if URI is None: print 'You must supply the URI for the database connection' print HELP sys.exit(2) if IMDBIDS_METHOD not in (None, 'dbm', 'table'): print 'the method to (re)store imdbIDs must be one of "dbm" or "table"' print HELP sys.exit(2) if (CSV_ONLY_WRITE or CSV_ONLY_LOAD) and not CSV_DIR: print 'You must specify the CSV directory with the -c argument' print HELP sys.exit(3) # Some warnings and notices. 
URIlower = URI.lower() if URIlower.startswith('mysql'): if '--mysql-force-myisam' in sys.argv[1:] and \ '--mysql-innodb' in sys.argv[1:]: print '\nWARNING: there is no sense in mixing the --mysql-innodb and\n'\ '--mysql-force-myisam command line options!\n' elif '--mysql-innodb' in sys.argv[1:]: print "\nNOTICE: you've specified the --mysql-innodb command line\n"\ "option; you should do this ONLY IF your system uses InnoDB\n"\ "tables or you really want to use InnoDB; if you're running\n"\ "a MyISAM-based database, please omit any option; if you\n"\ "want to force MyISAM usage on a InnoDB-based database,\n"\ "try the --mysql-force-myisam command line option, instead.\n" elif '--mysql-force-myisam' in sys.argv[1:]: print "\nNOTICE: you've specified the --mysql-force-myisam command\n"\ "line option; you should do this ONLY IF your system uses\n"\ "InnoDB tables and you want to use MyISAM tables, instead.\n" else: print "\nNOTICE: IF you're using InnoDB tables, data insertion can\n"\ "be very slow; you can switch to MyISAM tables - forcing it\n"\ "with the --mysql-force-myisam option - OR use the\n"\ "--mysql-innodb command line option, but DON'T USE these if\n"\ "you're already working on MyISAM tables, because it will\n"\ "force MySQL to use InnoDB, and performances will be poor.\n" elif URIlower.startswith('mssql') and \ '--ms-sqlserver' not in sys.argv[1:]: print "\nWARNING: you're using MS SQLServer without the --ms-sqlserver\n"\ "command line option: if something goes wrong, try using it.\n" elif URIlower.startswith('sqlite') and \ '--sqlite-transactions' not in sys.argv[1:]: print "\nWARNING: you're using SQLite without the --sqlite-transactions\n"\ "command line option: you'll have very poor performances! 
Try\n"\ "using it.\n" if ('--mysql-force-myisam' in sys.argv[1:] and not URIlower.startswith('mysql')) or ('--mysql-innodb' in sys.argv[1:] and not URIlower.startswith('mysql')) or ('--ms-sqlserver' in sys.argv[1:] and not URIlower.startswith('mssql')) or \ ('--sqlite-transactions' in sys.argv[1:] and not URIlower.startswith('sqlite')): print "\nWARNING: you've specified command line options that don't\n"\ "belong to the database server you're using: proceed at your\n"\ "own risk!\n" if CSV_DIR: if URIlower.startswith('mysql'): CSV_LOAD_SQL = CSV_MYSQL elif URIlower.startswith('postgres'): CSV_LOAD_SQL = CSV_PGSQL elif URIlower.startswith('ibm'): CSV_LOAD_SQL = CSV_DB2 CSV_NULL = '' else: print "\nERROR: importing CSV files is not supported for this database" sys.exit(3) if USE_ORM is None: USE_ORM = ('sqlobject', 'sqlalchemy') if not isinstance(USE_ORM, (tuple, list)): USE_ORM = [USE_ORM] nrMods = len(USE_ORM) _gotError = False for idx, mod in enumerate(USE_ORM): mod = mod.lower() try: if mod == 'sqlalchemy': from imdb.parser.sql.alchemyadapter import getDBTables, \ NotFoundError, setConnection, ISNOTNULL, IN elif mod == 'sqlobject': from imdb.parser.sql.objectadapter import getDBTables, \ NotFoundError, setConnection, ISNOTNULL, IN else: warnings.warn('unknown module "%s".' % mod) continue DB_TABLES = getDBTables(URI) for t in DB_TABLES: globals()[t._imdbpyName] = t if _gotError: warnings.warn('falling back to "%s".' % mod) USED_ORM = mod break except ImportError, e: if idx+1 >= nrMods: raise IMDbError('unable to use any ORM in %s: %s' % ( str(USE_ORM), str(e))) else: warnings.warn('unable to use "%s": %s' % (mod, str(e))) _gotError = True continue else: raise IMDbError('unable to use any ORM in %s' % str(USE_ORM)) #----------------------- # CSV Handling. 
class CSVCursor(object):
    """Emulate a cursor object, but instead it writes data to a set
    of CSV files."""

    def __init__(self, csvDir, csvExt=CSV_EXT, csvEOL=CSV_EOL,
                 delimeter=CSV_DELIMITER, quote=CSV_QUOTE, escape=CSV_ESCAPE,
                 null=CSV_NULL, quoteInteger=CSV_QUOTEINT):
        """Initialize a CSVCursor object; csvDir is the directory where the
        CSV files will be stored."""
        self.csvDir = csvDir
        self.csvExt = csvExt
        self.csvEOL = csvEOL
        # NOTE: 'delimeter' (sic) is the historical spelling of this
        # parameter; kept for backward compatibility with callers.
        self.delimeter = delimeter
        self.quote = quote
        self.escape = escape
        # Pre-computed replacement for a quote char inside a value.
        self.escaped = '%s%s' % (escape, quote)
        self.null = null
        self.quoteInteger = quoteInteger
        # Open file descriptors, one per table name.
        self._fdPool = {}
        # LOB (large object) file descriptors, per table name (DB2 only).
        self._lobFDPool = {}
        # Per-table auto-increment counters for synthesized 'id' columns.
        self._counters = {}

    def buildLine(self, items, tableToAddID=False, rawValues=(),
                  lobFD=None, lobFN=None):
        """Build a single text line for a set of information."""
        # FIXME: there are too many special cases to handle, and that
        #        affects performances: management of LOB files, at least,
        #        must be moved away from here.
        quote = self.quote
        escape = self.escape
        null = self.null
        escaped = self.escaped
        quoteInteger = self.quoteInteger
        if not tableToAddID:
            r = []
        else:
            # Prepend a synthesized auto-increment id for this table.
            _counters = self._counters
            r = [_counters[tableToAddID]]
            _counters[tableToAddID] += 1
        r += list(items)
        for idx, val in enumerate(r):
            if val is None:
                r[idx] = null
                continue
            if (not quoteInteger) and isinstance(val, (int, long)):
                r[idx] = str(val)
                continue
            # Column 3 is handled below, as a LOB reference.
            if lobFD and idx == 3:
                continue
            val = str(val)
            if quote:
                val = '%s%s%s' % (quote, val.replace(quote, escaped), quote)
            r[idx] = val
        # Add RawValue(s), if present.
        rinsert = r.insert
        if tableToAddID:
            shift = 1
        else:
            shift = 0
        for idx, item in rawValues:
            rinsert(idx + shift, item)
        if lobFD:
            # XXX: totally tailored to suit person_info.info column!
            # The actual text goes into the .lob side-file; the CSV cell
            # is replaced with a "filename.offset.length/" reference.
            val3 = r[3]
            val3len = len(val3 or '') or -1
            if val3len == -1:
                val3off = 0
            else:
                val3off = lobFD.tell()
            r[3] = '%s.%d.%d/' % (lobFN, val3off, val3len)
            lobFD.write(val3)
        # Build the line and add the end-of-line.
        return '%s%s' % (self.delimeter.join(r), self.csvEOL)

    def executemany(self, sqlstr, items):
        """Emulate the executemany method of a cursor, but writes the
        data in a set of CSV files."""
        # XXX: find a safer way to get the table/file name!
        tName = sqlstr.split()[2]
        lobFD = None
        lobFN = None
        doLOB = False
        # XXX: ugly special case, to create the LOB file.
        if URIlower.startswith('ibm') and tName == 'person_info':
            doLOB = True
        # Open the file descriptor or get it from the pool.
        if tName in self._fdPool:
            tFD = self._fdPool[tName]
            lobFD = self._lobFDPool.get(tName)
            lobFN = getattr(lobFD, 'name', None)
            if lobFN:
                lobFN = os.path.basename(lobFN)
        else:
            tFD = open(os.path.join(CSV_DIR, tName + self.csvExt), 'wb')
            self._fdPool[tName] = tFD
            if doLOB:
                lobFN = '%s.lob' % tName
                lobFD = open(os.path.join(CSV_DIR, lobFN), 'wb')
                self._lobFDPool[tName] = lobFD
        buildLine = self.buildLine
        tableToAddID = False
        # These tables have no natural key in the source data, so the
        # cursor must synthesize the 'id' column itself.
        if tName in ('cast_info', 'movie_info', 'person_info',
                     'movie_companies', 'movie_link', 'aka_name',
                     'complete_cast', 'movie_info_idx', 'movie_keyword'):
            tableToAddID = tName
            if tName not in self._counters:
                self._counters[tName] = 1
        # Identify if there are RawValue in the VALUES (...) portion of
        # the query (anything that is not a parameter placeholder).
        parIdx = sqlstr.rfind('(')
        rawValues = []
        vals = sqlstr[parIdx+1:-1]
        if parIdx != 0:
            vals = sqlstr[parIdx+1:-1]
            for idx, item in enumerate(vals.split(', ')):
                if item[0] in ('%', '?', ':'):
                    continue
                rawValues.append((idx, item))
        # Write these lines.
        tFD.writelines(buildLine(i, tableToAddID=tableToAddID,
                       rawValues=rawValues, lobFD=lobFD, lobFN=lobFN)
                       for i in items)
        # Flush to disk, so that no truncated entries are ever left.
        # XXX: is this a good idea?
        tFD.flush()

    def fileNames(self):
        """Return the list of file names."""
        return [fd.name for fd in self._fdPool.values()]

    def buildFakeFileNames(self):
        """Populate the self._fdPool dictionary with fake objects
        taking file names from the content of the self.csvDir
        directory."""
        # Used when only the "load CSV into the DB" phase is run, so no
        # real file descriptors were ever opened.
        class _FakeFD(object):
            pass
        for fname in os.listdir(self.csvDir):
            if not fname.endswith(CSV_EXT):
                continue
            fpath = os.path.join(self.csvDir, fname)
            if not os.path.isfile(fpath):
                continue
            fd = _FakeFD()
            fd.name = fname
            self._fdPool[fname[:-len(CSV_EXT)]] = fd

    def close(self, tName):
        """Close a given table/file."""
        if tName in self._fdPool:
            self._fdPool[tName].close()

    def closeAll(self):
        """Close all open file descriptors."""
        for fd in self._fdPool.values():
            fd.close()
        for fd in self._lobFDPool.values():
            fd.close()


def loadCSVFiles():
    """Load every CSV file into the database."""
    CSV_REPL = {'quote': CSV_QUOTE, 'delimiter': CSV_DELIMITER,
                'escape': CSV_ESCAPE, 'null': CSV_NULL, 'eol': CSV_EOL}
    for fName in CSV_CURS.fileNames():
        connectObject.commit()
        tName = os.path.basename(fName[:-len(CSV_EXT)])
        cfName = os.path.join(CSV_DIR, fName)
        CSV_REPL['file'] = cfName
        CSV_REPL['table'] = tName
        # CSV_LOAD_SQL is the server-specific bulk-load template
        # (LOAD DATA / COPY / IMPORT), selected earlier from the URI.
        sqlStr = CSV_LOAD_SQL % CSV_REPL
        print ' * LOADING CSV FILE %s...' % cfName
        sys.stdout.flush()
        executeCustomQueries('BEFORE_CSV_TODB')
        try:
            CURS.execute(sqlStr)
            try:
                res = CURS.fetchall()
                if res:
                    print 'LOADING OUTPUT:', res
            except:
                # Some drivers raise when fetching from a non-SELECT;
                # the output is purely informative, so ignore it.
                pass
        except Exception, e:
            print 'ERROR: unable to import CSV file %s: %s' % (cfName, str(e))
            continue
        connectObject.commit()
        executeCustomQueries('AFTER_CSV_TODB')


conn = setConnection(URI, DB_TABLES)
if CSV_DIR:
    # Go for a CSV ride...
    CSV_CURS = CSVCursor(CSV_DIR)

# Extract exceptions to trap.
try: OperationalError = conn.module.OperationalError except AttributeError, e: warnings.warn('Unable to import OperationalError; report this as a bug, ' \ 'since it will mask important exceptions: %s' % e) OperationalError = Exception try: IntegrityError = conn.module.IntegrityError except AttributeError, e: warnings.warn('Unable to import IntegrityError') IntegrityError = Exception connectObject = conn.getConnection() # XXX: fix for a problem that should be fixed in objectadapter.py (see it). if URI and URI.startswith('sqlite') and USED_ORM == 'sqlobject': major = sys.version_info[0] minor = sys.version_info[1] if major > 2 or (major == 2 and minor > 5): connectObject.text_factory = str # Cursor object. CURS = connectObject.cursor() # Name of the database and style of the parameters. DB_NAME = conn.dbName PARAM_STYLE = conn.paramstyle def _get_imdbids_method(): """Return the method to be used to (re)store imdbIDs (one of 'dbm' or 'table').""" if IMDBIDS_METHOD: return IMDBIDS_METHOD if CSV_DIR: return 'dbm' return 'table' def tableName(table): """Return a string with the name of the table in the current db.""" return table.sqlmeta.table def colName(table, column): """Return a string with the name of the column in the current db.""" if column == 'id': return table.sqlmeta.idName return table.sqlmeta.columns[column].dbName class RawValue(object): """String-like objects to store raw SQL parameters, that are not intended to be replaced with positional parameters, in the query.""" def __init__(self, s, v): self.string = s self.value = v def __str__(self): return self.string def _makeConvNamed(cols): """Return a function to be used to convert a list of parameters from positional style to named style (convert from a list of tuples to a list of dictionaries.""" nrCols = len(cols) def _converter(params): for paramIndex, paramSet in enumerate(params): d = {} for i in xrange(nrCols): d[cols[i]] = paramSet[i] params[paramIndex] = d return params return _converter def 
createSQLstr(table, cols, command='INSERT'): """Given a table and a list of columns returns a sql statement useful to insert a set of data in the database. Along with the string, also a function useful to convert parameters from positional to named style is returned.""" sqlstr = '%s INTO %s ' % (command, tableName(table)) colNames = [] values = [] convCols = [] count = 1 def _valStr(s, index): if DB_NAME in ('mysql', 'postgres'): return '%s' elif PARAM_STYLE == 'format': return '%s' elif PARAM_STYLE == 'qmark': return '?' elif PARAM_STYLE == 'numeric': return ':%s' % index elif PARAM_STYLE == 'named': return ':%s' % s elif PARAM_STYLE == 'pyformat': return '%(' + s + ')s' return '%s' for col in cols: if isinstance(col, RawValue): colNames.append(colName(table, col.string)) values.append(str(col.value)) elif col == 'id': colNames.append(table.sqlmeta.idName) values.append(_valStr('id', count)) convCols.append(col) count += 1 else: colNames.append(colName(table, col)) values.append(_valStr(col, count)) convCols.append(col) count += 1 sqlstr += '(%s) ' % ', '.join(colNames) sqlstr += 'VALUES (%s)' % ', '.join(values) if DB_NAME not in ('mysql', 'postgres') and \ PARAM_STYLE in ('named', 'pyformat'): converter = _makeConvNamed(convCols) else: # Return the list itself. converter = lambda x: x return sqlstr, converter def _(s, truncateAt=None): """Nicely print a string to sys.stdout, optionally truncating it a the given char.""" if not isinstance(s, UnicodeType): s = unicode(s, 'utf_8') if truncateAt is not None: s = s[:truncateAt] s = s.encode(sys.stdout.encoding or 'utf_8', 'replace') return s if not hasattr(os, 'times'): def times(): """Fake times() function.""" return (0.0, 0.0, 0.0, 0.0, 0.0) os.times = times # Show time consumed by the single function call. CTIME = int(time.time()) BEGIN_TIME = CTIME CTIMES = os.times() BEGIN_TIMES = CTIMES def _minSec(*t): """Return a tuple of (mins, secs, ...) 
- two for every item passed.""" l = [] for i in t: l.extend(divmod(int(i), 60)) return tuple(l) def t(s, sinceBegin=False): """Pretty-print timing information.""" global CTIME, CTIMES nt = int(time.time()) ntimes = os.times() if not sinceBegin: ct = CTIME cts = CTIMES else: ct = BEGIN_TIME cts = BEGIN_TIMES print '# TIME', s, \ ': %dmin, %dsec (wall) %dmin, %dsec (user) %dmin, %dsec (system)' \ % _minSec(nt-ct, ntimes[0]-cts[0], ntimes[1]-cts[1]) if not sinceBegin: CTIME = nt CTIMES = ntimes def title_soundex(title): """Return the soundex code for the given title; the (optional) starting article is pruned. It assumes to receive a title without year/imdbIndex or kind indications, but just the title string, as the one in the analyze_title(title)['title'] value.""" if not title: return None # Convert to canonical format. title = canonicalTitle(title) ts = title.split(', ') # Strip the ending article, if any. if ts[-1].lower() in _articles: title = ', '.join(ts[:-1]) return soundex(title) def name_soundexes(name, character=False): """Return three soundex codes for the given name; the name is assumed to be in the 'surname, name' format, without the imdbIndex indication, as the one in the analyze_name(name)['name'] value. The first one is the soundex of the name in the canonical format. The second is the soundex of the name in the normal format, if different from the first one. The third is the soundex of the surname, if different from the other two values.""" ##if not isinstance(name, unicode): name = unicode(name, 'utf_8') # Prune non-ascii chars from the string. 
##name = name.encode('ascii', 'ignore') if not name: return (None, None, None) s1 = soundex(name) name_normal = normalizeName(name) s2 = soundex(name_normal) if s1 == s2: s2 = None if not character: namesplit = name.split(', ') s3 = soundex(namesplit[0]) else: s3 = soundex(name.split(' ')[-1]) if s3 and s3 in (s1, s2): s3 = None return (s1, s2, s3) # Tags to identify where the meaningful data begin/end in files. MOVIES = 'movies.list.gz' MOVIES_START = ('MOVIES LIST', '===========', '') MOVIES_STOP = '--------------------------------------------------' CAST_START = ('Name', '----') CAST_STOP = '-----------------------------' RAT_START = ('MOVIE RATINGS REPORT', '', 'New Distribution Votes Rank Title') RAT_STOP = '\n' RAT_TOP250_START = ('note: for this top 250', '', 'New Distribution') RAT_BOT10_START = ('BOTTOM 10 MOVIES', '', 'New Distribution') TOPBOT_STOP = '\n' AKAT_START = ('AKA TITLES LIST', '=============', '', '', '') AKAT_IT_START = ('AKA TITLES LIST ITALIAN', '=======================', '', '') AKAT_DE_START = ('AKA TITLES LIST GERMAN', '======================', '') AKAT_ISO_START = ('AKA TITLES LIST ISO', '===================', '') AKAT_HU_START = ('AKA TITLES LIST HUNGARIAN', '=========================', '') AKAT_NO_START = ('AKA TITLES LIST NORWEGIAN', '=========================', '') AKAN_START = ('AKA NAMES LIST', '=============', '') AV_START = ('ALTERNATE VERSIONS LIST', '=======================', '', '') MINHASH_STOP = '-------------------------' GOOFS_START = ('GOOFS LIST', '==========', '') QUOTES_START = ('QUOTES LIST', '=============') CC_START = ('CRAZY CREDITS', '=============') BIO_START = ('BIOGRAPHY LIST', '==============') BUS_START = ('BUSINESS LIST', '=============', '') BUS_STOP = ' =====' CER_START = ('CERTIFICATES LIST', '=================') COL_START = ('COLOR INFO LIST', '===============') COU_START = ('COUNTRIES LIST', '==============') DIS_START = ('DISTRIBUTORS LIST', '=================', '') GEN_START = ('8: THE GENRES LIST', 
'==================', '') KEY_START = ('8: THE KEYWORDS LIST', '====================', '') LAN_START = ('LANGUAGE LIST', '=============') LOC_START = ('LOCATIONS LIST', '==============', '') MIS_START = ('MISCELLANEOUS COMPANY LIST', '==========================') PRO_START = ('PRODUCTION COMPANIES LIST', '=========================', '') RUN_START = ('RUNNING TIMES LIST', '==================') SOU_START = ('SOUND-MIX LIST', '==============') SFX_START = ('SFXCO COMPANIES LIST', '====================', '') TCN_START = ('TECHNICAL LIST', '==============', '', '') LSD_START = ('LASERDISC LIST', '==============', '------------------------') LIT_START = ('LITERATURE LIST', '===============', '') LIT_STOP = 'COPYING POLICY' LINK_START = ('MOVIE LINKS LIST', '================', '') MPAA_START = ('MPAA RATINGS REASONS LIST', '=========================') PLOT_START = ('PLOT SUMMARIES LIST', '===================', '') RELDATE_START = ('RELEASE DATES LIST', '==================') SNDT_START = ('SOUNDTRACKS LIST', '================', '', '', '') TAGL_START = ('TAG LINES LIST', '==============', '', '') TAGL_STOP = '-----------------------------------------' TRIV_START = ('FILM TRIVIA', '===========', '') COMPCAST_START = ('CAST COVERAGE TRACKING LIST', '===========================') COMPCREW_START = ('CREW COVERAGE TRACKING LIST', '===========================') COMP_STOP = '---------------' GzipFileRL = GzipFile.readline class SourceFile(GzipFile): """Instances of this class are used to read gzipped files, starting from a defined line to a (optionally) given end.""" def __init__(self, filename=None, mode=None, start=(), stop=None, pwarning=1, *args, **kwds): filename = os.path.join(IMDB_PTDF_DIR, filename) try: GzipFile.__init__(self, filename, mode, *args, **kwds) except IOError, e: if not pwarning: raise print 'WARNING WARNING WARNING' print 'WARNING unable to read the "%s" file.' 
% filename print 'WARNING The file will be skipped, and the contained' print 'WARNING information will NOT be stored in the database.' print 'WARNING Complete error: ', e # re-raise the exception. raise self.start = start for item in start: itemlen = len(item) for line in self: if line[:itemlen] == item: break self.set_stop(stop) def set_stop(self, stop): if stop is not None: self.stop = stop self.stoplen = len(self.stop) self.readline = self.readline_checkEnd else: self.readline = self.readline_NOcheckEnd def readline_NOcheckEnd(self, size=-1): line = GzipFile.readline(self, size) return unicode(line, 'latin_1').encode('utf_8') def readline_checkEnd(self, size=-1): line = GzipFile.readline(self, size) if self.stop is not None and line[:self.stoplen] == self.stop: return '' return unicode(line, 'latin_1').encode('utf_8') def getByHashSections(self): return getSectionHash(self) def getByNMMVSections(self): return getSectionNMMV(self) def getSectionHash(fp): """Return sections separated by lines starting with #.""" curSectList = [] curSectListApp = curSectList.append curTitle = '' joiner = ''.join for line in fp: if line and line[0] == '#': if curSectList and curTitle: yield curTitle, joiner(curSectList) curSectList[:] = [] curTitle = '' curTitle = line[2:] else: curSectListApp(line) if curSectList and curTitle: yield curTitle, joiner(curSectList) curSectList[:] = [] curTitle = '' NMMVSections = dict([(x, None) for x in ('MV: ', 'NM: ', 'OT: ', 'MOVI')]) def getSectionNMMV(fp): """Return sections separated by lines starting with 'NM: ', 'MV: ', 'OT: ' or 'MOVI'.""" curSectList = [] curSectListApp = curSectList.append curNMMV = '' joiner = ''.join for line in fp: if line[:4] in NMMVSections: if curSectList and curNMMV: yield curNMMV, joiner(curSectList) curSectList[:] = [] curNMMV = '' if line[:4] == 'MOVI': curNMMV = line[6:] else: curNMMV = line[4:] elif not (line and line[0] == '-'): curSectListApp(line) if curSectList and curNMMV: yield curNMMV, 
joiner(curSectList) curSectList[:] = [] curNMMV = '' def counter(initValue=1): """A counter implemented using a generator.""" i = initValue while 1: yield i i += 1 class _BaseCache(dict): """Base class for Movie and Person basic information.""" def __init__(self, d=None, flushEvery=100000): dict.__init__(self) # Flush data into the SQL database every flushEvery entries. self.flushEvery = flushEvery self._tmpDict = {} self._flushing = 0 self._deferredData = {} self._recursionLevel = 0 self._table_name = '' self._id_for_custom_q = '' if d is not None: for k, v in d.iteritems(): self[k] = v def __setitem__(self, key, counter): """Every time a key is set, its value is the counter; every flushEvery, the temporary dictionary is flushed to the database, and then zeroed.""" if counter % self.flushEvery == 0: self.flush() dict.__setitem__(self, key, counter) if not self._flushing: self._tmpDict[key] = counter else: self._deferredData[key] = counter def flush(self, quiet=0, _recursionLevel=0): """Flush to the database.""" if self._flushing: return self._flushing = 1 if _recursionLevel >= MAX_RECURSION: print 'WARNING recursion level exceded trying to flush data' print 'WARNING this batch of data is lost (%s).' % self.className self._tmpDict.clear() return if self._tmpDict: # Horrible hack to know if AFTER_%s_TODB has run. _after_has_run = False keys = {'table': self._table_name} try: executeCustomQueries('BEFORE_%s_TODB' % self._id_for_custom_q, _keys=keys, _timeit=False) self._toDB(quiet) executeCustomQueries('AFTER_%s_TODB' % self._id_for_custom_q, _keys=keys, _timeit=False) _after_has_run = True self._tmpDict.clear() except OperationalError, e: # XXX: I'm not sure this is the right thing (and way) # to proceed. if not _after_has_run: executeCustomQueries('AFTER_%s_TODB'%self._id_for_custom_q, _keys=keys, _timeit=False) # Dataset too large; split it in two and retry. # XXX: new code! # the same class instance (self) is used, instead of # creating two separated objects. 
_recursionLevel += 1 self._flushing = 0 firstHalf = {} poptmpd = self._tmpDict.popitem originalLength = len(self._tmpDict) for x in xrange(1 + originalLength/2): k, v = poptmpd() firstHalf[k] = v print ' * TOO MANY DATA (%s items in %s), recursion: %s' % \ (originalLength, self.className, _recursionLevel) print ' * SPLITTING (run 1 of 2), recursion: %s' % \ _recursionLevel self.flush(quiet=quiet, _recursionLevel=_recursionLevel) self._tmpDict = firstHalf print ' * SPLITTING (run 2 of 2), recursion: %s' % \ _recursionLevel self.flush(quiet=quiet, _recursionLevel=_recursionLevel) self._tmpDict.clear() except Exception, e: if isinstance(e, KeyboardInterrupt): raise print 'WARNING: unknown exception caught committing the data' print 'WARNING: to the database; report this as a bug, since' print 'WARNING: many data (%d items) were lost: %s' % \ (len(self._tmpDict), e) self._flushing = 0 # Flush also deferred data. if self._deferredData: self._tmpDict = self._deferredData self.flush(quiet=1) self._deferredData = {} connectObject.commit() def populate(self): """Populate the dictionary from the database.""" raise NotImplementedError def _toDB(self, quiet=0): """Write the dictionary to the database.""" raise NotImplementedError def add(self, key, miscData=None): """Insert a new key and return its value.""" c = self.counter.next() # miscData=[('a_dict', 'value')] will set self.a_dict's c key # to 'value'. if miscData is not None: for d_name, data in miscData: getattr(self, d_name)[c] = data self[key] = c return c def addUnique(self, key, miscData=None): """Insert a new key and return its value; if the key is already in the dictionary, its previous value is returned.""" if key in self: return self[key] else: return self.add(key, miscData) def fetchsome(curs, size=20000): """Yes, I've read the Python Cookbook! 
:-)""" while 1: res = curs.fetchmany(size) if not res: break for r in res: yield r class MoviesCache(_BaseCache): """Manage the movies list.""" className = 'MoviesCache' counter = counter() def __init__(self, *args, **kwds): _BaseCache.__init__(self, *args, **kwds) self.movieYear = {} self._table_name = tableName(Title) self._id_for_custom_q = 'MOVIES' self.sqlstr, self.converter = createSQLstr(Title, ('id', 'title', 'imdbIndex', 'kindID', 'productionYear', 'imdbID', 'phoneticCode', 'episodeOfID', 'seasonNr', 'episodeNr', 'seriesYears', 'md5sum')) def populate(self): print ' * POPULATING %s...' % self.className titleTbl = tableName(Title) movieidCol = colName(Title, 'id') titleCol = colName(Title, 'title') kindidCol = colName(Title, 'kindID') yearCol = colName(Title, 'productionYear') imdbindexCol = colName(Title, 'imdbIndex') episodeofidCol = colName(Title, 'episodeOfID') seasonNrCol = colName(Title, 'seasonNr') episodeNrCol = colName(Title, 'episodeNr') sqlPop = 'SELECT %s, %s, %s, %s, %s, %s, %s, %s FROM %s;' % \ (movieidCol, titleCol, kindidCol, yearCol, imdbindexCol, episodeofidCol, seasonNrCol, episodeNrCol, titleTbl) CURS.execute(sqlPop) _oldcacheValues = Title.sqlmeta.cacheValues Title.sqlmeta.cacheValues = False for x in fetchsome(CURS, self.flushEvery): mdict = {'title': x[1], 'kind': KIND_STRS[x[2]], 'year': x[3], 'imdbIndex': x[4]} if mdict['imdbIndex'] is None: del mdict['imdbIndex'] if mdict['year'] is None: del mdict['year'] else: mdict['year'] = str(mdict['year']) episodeOfID = x[5] if episodeOfID is not None: s = Title.get(episodeOfID) series_d = {'title': s.title, 'kind': str(KIND_STRS[s.kindID]), 'year': s.productionYear, 'imdbIndex': s.imdbIndex} if series_d['imdbIndex'] is None: del series_d['imdbIndex'] if series_d['year'] is None: del series_d['year'] else: series_d['year'] = str(series_d['year']) mdict['episode of'] = series_d title = build_title(mdict, ptdf=1, _emptyString='') dict.__setitem__(self, title, x[0]) self.counter = 
counter(Title.select().count() + 1) Title.sqlmeta.cacheValues = _oldcacheValues def _toDB(self, quiet=0): if not quiet: print ' * FLUSHING %s...' % self.className sys.stdout.flush() l = [] lapp = l.append tmpDictiter = self._tmpDict.iteritems for k, v in tmpDictiter(): try: t = analyze_title(k, _emptyString='') except IMDbParserError: if k and k.strip(): print 'WARNING %s._toDB() invalid title:' % self.className, print _(k) continue tget = t.get episodeOf = None kind = tget('kind') if kind == 'episode': # Series title. stitle = build_title(tget('episode of'), _emptyString='') episodeOf = self.addUnique(stitle) del t['episode of'] year = self.movieYear.get(v) if year is not None and year != '????': try: t['year'] = int(year) except ValueError: pass elif kind in ('tv series', 'tv mini series'): t['series years'] = self.movieYear.get(v) title = tget('title') soundex = title_soundex(title) lapp((v, title, tget('imdbIndex'), KIND_IDS[kind], tget('year'), None, soundex, episodeOf, tget('season'), tget('episode'), tget('series years'), md5(k).hexdigest())) self._runCommand(l) def _runCommand(self, dataList): if not CSV_DIR: CURS.executemany(self.sqlstr, self.converter(dataList)) else: CSV_CURS.executemany(self.sqlstr, dataList) def addUnique(self, key, miscData=None): """Insert a new key and return its value; if the key is already in the dictionary, its previous value is returned.""" if key.endswith('{{SUSPENDED}}'): return None # DONE: to be removed when it will be no more needed! 
#if FIX_OLD_STYLE_TITLES: # key = build_title(analyze_title(key, canonical=False, # _emptyString=''), ptdf=1, _emptyString='') if key in self: return self[key] else: return self.add(key, miscData) class PersonsCache(_BaseCache): """Manage the persons list.""" className = 'PersonsCache' counter = counter() def __init__(self, *args, **kwds): _BaseCache.__init__(self, *args, **kwds) self.personGender = {} self._table_name = tableName(Name) self._id_for_custom_q = 'PERSONS' self.sqlstr, self.converter = createSQLstr(Name, ['id', 'name', 'imdbIndex', 'imdbID', 'gender', 'namePcodeCf', 'namePcodeNf', 'surnamePcode', 'md5sum']) def populate(self): print ' * POPULATING PersonsCache...' nameTbl = tableName(Name) personidCol = colName(Name, 'id') nameCol = colName(Name, 'name') imdbindexCol = colName(Name, 'imdbIndex') CURS.execute('SELECT %s, %s, %s FROM %s;' % (personidCol, nameCol, imdbindexCol, nameTbl)) _oldcacheValues = Name.sqlmeta.cacheValues Name.sqlmeta.cacheValues = False for x in fetchsome(CURS, self.flushEvery): nd = {'name': x[1]} if x[2]: nd['imdbIndex'] = x[2] name = build_name(nd) dict.__setitem__(self, name, x[0]) self.counter = counter(Name.select().count() + 1) Name.sqlmeta.cacheValues = _oldcacheValues def _toDB(self, quiet=0): if not quiet: print ' * FLUSHING PersonsCache...' 
sys.stdout.flush() l = [] lapp = l.append tmpDictiter = self._tmpDict.iteritems for k, v in tmpDictiter(): try: t = analyze_name(k) except IMDbParserError: if k and k.strip(): print 'WARNING PersonsCache._toDB() invalid name:', _(k) continue tget = t.get name = tget('name') namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name) gender = self.personGender.get(v) lapp((v, name, tget('imdbIndex'), None, gender, namePcodeCf, namePcodeNf, surnamePcode, md5(k).hexdigest())) if not CSV_DIR: CURS.executemany(self.sqlstr, self.converter(l)) else: CSV_CURS.executemany(self.sqlstr, l) class CharactersCache(_BaseCache): """Manage the characters list.""" counter = counter() className = 'CharactersCache' def __init__(self, *args, **kwds): _BaseCache.__init__(self, *args, **kwds) self._table_name = tableName(CharName) self._id_for_custom_q = 'CHARACTERS' self.sqlstr, self.converter = createSQLstr(CharName, ['id', 'name', 'imdbIndex', 'imdbID', 'namePcodeNf', 'surnamePcode', 'md5sum']) def populate(self): print ' * POPULATING CharactersCache...' nameTbl = tableName(CharName) personidCol = colName(CharName, 'id') nameCol = colName(CharName, 'name') imdbindexCol = colName(CharName, 'imdbIndex') CURS.execute('SELECT %s, %s, %s FROM %s;' % (personidCol, nameCol, imdbindexCol, nameTbl)) _oldcacheValues = CharName.sqlmeta.cacheValues CharName.sqlmeta.cacheValues = False for x in fetchsome(CURS, self.flushEvery): nd = {'name': x[1]} if x[2]: nd['imdbIndex'] = x[2] name = build_name(nd) dict.__setitem__(self, name, x[0]) self.counter = counter(CharName.select().count() + 1) CharName.sqlmeta.cacheValues = _oldcacheValues def _toDB(self, quiet=0): if not quiet: print ' * FLUSHING CharactersCache...' 
sys.stdout.flush() l = [] lapp = l.append tmpDictiter = self._tmpDict.iteritems for k, v in tmpDictiter(): try: t = analyze_name(k) except IMDbParserError: if k and k.strip(): print 'WARNING CharactersCache._toDB() invalid name:', _(k) continue tget = t.get name = tget('name') namePcodeCf, namePcodeNf, surnamePcode = name_soundexes(name, character=True) lapp((v, name, tget('imdbIndex'), None, namePcodeCf, surnamePcode, md5(k).hexdigest())) if not CSV_DIR: CURS.executemany(self.sqlstr, self.converter(l)) else: CSV_CURS.executemany(self.sqlstr, l) class CompaniesCache(_BaseCache): """Manage the companies list.""" counter = counter() className = 'CompaniesCache' def __init__(self, *args, **kwds): _BaseCache.__init__(self, *args, **kwds) self._table_name = tableName(CompanyName) self._id_for_custom_q = 'COMPANIES' self.sqlstr, self.converter = createSQLstr(CompanyName, ['id', 'name', 'countryCode', 'imdbID', 'namePcodeNf', 'namePcodeSf', 'md5sum']) def populate(self): print ' * POPULATING CharactersCache...' nameTbl = tableName(CompanyName) companyidCol = colName(CompanyName, 'id') nameCol = colName(CompanyName, 'name') countryCodeCol = colName(CompanyName, 'countryCode') CURS.execute('SELECT %s, %s, %s FROM %s;' % (companyidCol, nameCol, countryCodeCol, nameTbl)) _oldcacheValues = CompanyName.sqlmeta.cacheValues CompanyName.sqlmeta.cacheValues = False for x in fetchsome(CURS, self.flushEvery): nd = {'name': x[1]} if x[2]: nd['country'] = x[2] name = build_company_name(nd) dict.__setitem__(self, name, x[0]) self.counter = counter(CompanyName.select().count() + 1) CompanyName.sqlmeta.cacheValues = _oldcacheValues def _toDB(self, quiet=0): if not quiet: print ' * FLUSHING CompaniesCache...' 
sys.stdout.flush() l = [] lapp = l.append tmpDictiter = self._tmpDict.iteritems for k, v in tmpDictiter(): try: t = analyze_company_name(k) except IMDbParserError: if k and k.strip(): print 'WARNING CompaniesCache._toDB() invalid name:', _(k) continue tget = t.get name = tget('name') namePcodeNf = soundex(name) namePcodeSf = None country = tget('country') if k != name: namePcodeSf = soundex(k) lapp((v, name, country, None, namePcodeNf, namePcodeSf, md5(k).hexdigest())) if not CSV_DIR: CURS.executemany(self.sqlstr, self.converter(l)) else: CSV_CURS.executemany(self.sqlstr, l) class KeywordsCache(_BaseCache): """Manage the list of keywords.""" counter = counter() className = 'KeywordsCache' def __init__(self, *args, **kwds): _BaseCache.__init__(self, *args, **kwds) self._table_name = tableName(CompanyName) self._id_for_custom_q = 'KEYWORDS' self.flushEvery = 10000 self.sqlstr, self.converter = createSQLstr(Keyword, ['id', 'keyword', 'phoneticCode']) def populate(self): print ' * POPULATING KeywordsCache...' nameTbl = tableName(CompanyName) keywordidCol = colName(Keyword, 'id') keyCol = colName(Keyword, 'name') CURS.execute('SELECT %s, %s FROM %s;' % (keywordidCol, keyCol, nameTbl)) _oldcacheValues = Keyword.sqlmeta.cacheValues Keyword.sqlmeta.cacheValues = False for x in fetchsome(CURS, self.flushEvery): dict.__setitem__(self, x[1], x[0]) self.counter = counter(Keyword.select().count() + 1) Keyword.sqlmeta.cacheValues = _oldcacheValues def _toDB(self, quiet=0): if not quiet: print ' * FLUSHING KeywordsCache...' 
class SQLData(dict):
    """Variable set of information, to be stored from time to time
    to the SQL database.

    Keys are an auto-incremented integer counter; values are the
    user-supplied row tuples (see __setitem__/add)."""
    def __init__(self, table=None, cols=None, sqlString='', converter=None,
                d={}, flushEvery=20000, counterInit=1):
        # Either an explicit INSERT statement (plus its converter) or a
        # (table, cols) pair from which one is built must be supplied.
        # NOTE(review): 'd' is a mutable default argument; it is only
        # iterated here, never mutated, so it is harmless — but a None
        # default would be safer.
        if not sqlString:
            if not (table and cols):
                raise TypeError('"table" or "cols" unspecified')
            sqlString, converter = createSQLstr(table, cols)
        elif converter is None:
            raise TypeError('"sqlString" or "converter" unspecified')
        dict.__init__(self)
        self.counterInit = counterInit
        self.counter = counterInit
        self.flushEvery = flushEvery
        self.sqlString = sqlString
        self.converter = converter
        # Tracks how many times flush() has split-and-retried.
        self._recursionLevel = 1
        self._table = table
        self._table_name = tableName(table)
        for k, v in d.items():
            self[k] = v

    def __setitem__(self, key, value):
        """The value is discarded, the counter is used as the 'real' key
        and the user's 'key' is used as its values."""
        counter = self.counter
        # Auto-flush to the database every self.flushEvery stored rows.
        if counter % self.flushEvery == 0:
            self.flush()
        dict.__setitem__(self, counter, key)
        self.counter += 1

    def add(self, key):
        # Store one row (a tuple of column values); the dict value is unused.
        self[key] = None

    def flush(self, _resetRecursion=1):
        """Write every stored row to the database; on OperationalError,
        split the batch in two halves and retry, up to MAX_RECURSION
        levels before giving up and discarding the batch."""
        if not self:
            return
        # XXX: it's safer to flush MoviesCache and PersonsCache, to preserve
        # consistency of ForeignKey, but it can also slow down everything
        # a bit...
        CACHE_MID.flush(quiet=1)
        CACHE_PID.flush(quiet=1)
        if _resetRecursion:
            self._recursionLevel = 1
        if self._recursionLevel >= MAX_RECURSION:
            # Too many split-and-retry attempts: give up on this batch.
            print 'WARNING recursion level exceded trying to flush data'
            print 'WARNING this batch of data is lost.'
            self.clear()
            self.counter = self.counterInit
            return
        keys = {'table': self._table_name}
        _after_has_run = False
        try:
            executeCustomQueries('BEFORE_SQLDATA_TODB', _keys=keys,
                                _timeit=False)
            self._toDB()
            executeCustomQueries('AFTER_SQLDATA_TODB', _keys=keys,
                                _timeit=False)
            _after_has_run = True
            self.clear()
            self.counter = self.counterInit
        except OperationalError, e:
            # Presumably the batch was too big for a single executemany:
            # halve it and retry recursively.
            if not _after_has_run:
                executeCustomQueries('AFTER_SQLDATA_TODB', _keys=keys,
                                    _timeit=False)
            print ' * TOO MANY DATA (%s items), SPLITTING (run #%d)...' % \
                    (len(self), self._recursionLevel)
            self._recursionLevel += 1
            newdata = self.__class__(table=self._table,
                                    sqlString=self.sqlString,
                                    converter=self.converter)
            newdata._recursionLevel = self._recursionLevel
            newflushEvery = self.flushEvery / 2
            if newflushEvery < 1:
                # Cannot split any further: discard the batch.
                print 'WARNING recursion level exceded trying to flush data'
                print 'WARNING this batch of data is lost.'
                self.clear()
                self.counter = self.counterInit
                return
            self.flushEvery = newflushEvery
            newdata.flushEvery = newflushEvery
            popitem = self.popitem
            dsi = dict.__setitem__
            # Move half of the rows into the new container, then flush
            # both halves (each may split again).
            for x in xrange(len(self)/2):
                k, v = popitem()
                dsi(newdata, k, v)
            newdata.flush(_resetRecursion=0)
            del newdata
            self.flush(_resetRecursion=0)
            self.clear()
            self.counter = self.counterInit
        except Exception, e:
            if isinstance(e, KeyboardInterrupt):
                raise
            # Deliberate best-effort: log and continue; the batch is lost.
            print 'WARNING: unknown exception caught committing the data'
            print 'WARNING: to the database; report this as a bug, since'
            print 'WARNING: many data (%d items) were lost: %s' % \
                    (len(self), e)
        connectObject.commit()

    def _toDB(self):
        # Execute the actual INSERTs (or write the rows to CSV files).
        print ' * FLUSHING SQLData...'
        if not CSV_DIR:
            CURS.executemany(self.sqlString, self.converter(self.values()))
        else:
            CSV_CURS.executemany(self.sqlString, self.values())
def unpack(line, headers, sep='\t'):
    """Split `line` at `sep` and map the non-empty fields onto the
    given `headers` names.

    E.g.:
      line = '      0000000124    8805   8.4  Incredibles, The (2004)'
      headers = ('votes distribution', 'votes', 'rating', 'title')
      sep = ' '
    returns:
      {'votes distribution': '0000000124', 'votes': '8805',
       'rating': '8.4', 'title': 'Incredibles, The (2004)'}

    Fields beyond the supplied headers are stored under 'itemN' keys.
    """
    result = {}
    fields = [field for field in line.split(sep) if field]
    nr_headers = len(headers)
    for idx, field in enumerate(fields):
        if idx < nr_headers:
            label = headers[idx]
        else:
            label = 'item%s' % idx
        result[label] = field.strip()
    return result
def readMovieList():
    """Read the movies.list.gz file and populate the movies cache
    (CACHE_MID), collecting the 'year' column for series/episodes."""
    try:
        mdbf = SourceFile(MOVIES, start=MOVIES_START, stop=MOVIES_STOP)
    except IOError:
        # Data file not available: nothing to do.
        return
    count = 0
    for line in mdbf:
        line_d = unpack(line, ('title', 'year'))
        title = line_d['title']
        yearData = None
        # Collect 'year' column for tv "series years" and episodes' year.
        # Series titles begin with a double quote in the plain text files.
        if title[0] == '"':
            yearData = [('movieYear', line_d['year'])]
        mid = CACHE_MID.addUnique(title, yearData)
        if mid is None:
            continue
        # Progress report every 10000 titles.
        if count % 10000 == 0:
            print 'SCANNING movies:', _(title),
            print '(movieID: %s)' % mid
        count += 1
    CACHE_MID.flush()
    CACHE_MID.movieYear.clear()
    mdbf.close()
def castLists():
    """Read files listed in the 'role' column of the 'roletypes' table,
    delegating the actual parsing of each file to doCast()."""
    rt = [(x.id, x.role) for x in RoleType.select()]
    for roleid, rolename in rt:
        # 'guest' entries are handled elsewhere (biographies file).
        if rolename == 'guest':
            continue
        # Map the role name to its data file name; most are just
        # 'ROLEs.list.gz', with two irregular plurals.
        fname = rolename
        fname = fname.replace(' ', '-')
        if fname == 'actress': fname = 'actresses.list.gz'
        elif fname == 'miscellaneous-crew': fname = 'miscellaneous.list.gz'
        else: fname = fname + 's.list.gz'
        print 'DOING', fname
        try:
            f = SourceFile(fname, start=CAST_START, stop=CAST_STOP)
        except IOError:
            # File missing: still flush the characters cache after the
            # actresses file, since actors+actresses share it.
            if rolename == 'actress':
                CACHE_CID.flush()
                if not CSV_DIR:
                    CACHE_CID.clear()
            continue
        doCast(f, roleid, rolename)
        f.close()
        if rolename == 'actress':
            CACHE_CID.flush()
            if not CSV_DIR:
                CACHE_CID.clear()
        # Timing report for this file.
        t('castLists(%s)' % rolename)
class AkasMoviesCache(MoviesCache):
    """A MoviesCache-like class used to populate the AkaTitle table."""
    className = 'AkasMoviesCache'
    counter = counter()

    def __init__(self, *args, **kdws):
        MoviesCache.__init__(self, *args, **kdws)
        self.flushEvery = 50000
        self._mapsIDsToTitles = True
        # Per-entry notes and referred-title IDs, keyed by this cache's
        # own entry IDs; filled by callers (see doAkaTitles).
        self.notes = {}
        self.ids = {}
        self._table_name = tableName(AkaTitle)
        self._id_for_custom_q = 'AKAMOVIES'
        self.sqlstr, self.converter = createSQLstr(AkaTitle, ('id',
                    'movieID', 'title', 'imdbIndex', 'kindID',
                    'productionYear', 'phoneticCode', 'episodeOfID',
                    'seasonNr', 'episodeNr', 'note', 'md5sum'))

    def flush(self, *args, **kwds):
        # Preserve consistency of ForeignKey.
        CACHE_MID.flush(quiet=1)
        super(AkasMoviesCache, self).flush(*args, **kwds)

    def _runCommand(self, dataList):
        """Rewrite the MoviesCache-shaped rows into AkaTitle rows
        (injecting the referred movieID and the note) and execute the
        INSERTs (or write the CSV rows)."""
        new_dataList = []
        new_dataListapp = new_dataList.append
        while dataList:
            item = list(dataList.pop())
            # Remove the imdbID.
            # NOTE(review): assumes index 5 of the incoming row is the
            # imdbID column of the parent MoviesCache row layout —
            # confirm against MoviesCache.sqlstr.
            del item[5]
            # id used to store this entry.
            the_id = item[0]
            # id of the referred title.
            original_title_id = self.ids.get(the_id) or 0
            new_item = [the_id, original_title_id]
            md5sum = item[-1]
            # Keep the middle columns, then append note and md5sum last,
            # matching the AkaTitle column order of self.sqlstr.
            new_item += item[1:-2]
            new_item.append(self.notes.get(the_id))
            new_item.append(md5sum)
            new_dataListapp(tuple(new_item))
        # dataList was consumed with pop(); restore original order.
        new_dataList.reverse()
        if not CSV_DIR:
            CURS.executemany(self.sqlstr, self.converter(new_dataList))
        else:
            CSV_CURS.executemany(self.sqlstr, new_dataList)
if line not in CACHE_MID: doNotAdd = True continue mid = CACHE_MID.addUnique(line) if mid is None: continue if line[0] == '"': try: titleDict = analyze_title(line, _emptyString='') except IMDbParserError: if line: print 'WARNING doAkaTitles (O) invalid title:', print _(line) continue if 'episode of' in titleDict: if obsolete: titleDict['episode of']['title'] = \ normalizeTitle(titleDict['episode of']['title']) series = build_title(titleDict['episode of'], ptdf=1, _emptyString='') seriesID = CACHE_MID.addUnique(series) if seriesID is None: continue isEpisode = True else: seriesID = None isEpisode = False else: seriesID = None isEpisode = False else: # Reading an aka title. if obsolete and doNotAdd: continue res = unpack(line.strip(), ('title', 'note')) note = res.get('note') if incontrib: if res.get('note'): note += ' ' else: note = '' if start == AKAT_HU_START: note += '(Hungary)' elif start == AKAT_NO_START: note += '(Norway)' akat = res.get('title', '') if akat[:5] == '(aka ': akat = akat[5:] if akat[-2:] in ('))', '})'): akat = akat[:-1] akat = akat.strip() if not akat: continue if obsolete: try: akatD = analyze_title(akat, _emptyString='') except IMDbParserError: if line: print 'WARNING doAkaTitles(obsol) invalid title:', print _(akat) continue akatD['title'] = normalizeTitle(akatD['title']) akat = build_title(akatD, ptdf=1, _emptyString='') if count % 10000 == 0: print 'SCANNING %s:' % fname[:-8].replace('-', ' '), print _(akat) if isEpisode and seriesID is not None: # Handle series for which only single episodes have # aliases. 
def doMovieLinks():
    """Connections between movies (follows, remake of, ...), read from
    movie-links.list.gz into the MovieLink table."""
    mid = None
    count = 0
    sqldata = SQLData(table=MovieLink,
                cols=['movieID', 'linkedMovieID', 'linkTypeID'],
                flushEvery=10000)
    try:
        fp = SourceFile('movie-links.list.gz', start=LINK_START)
    except IOError:
        return
    for line in fp:
        if line and line[0] != ' ':
            # Non-indented line: the title the following links refer to;
            # 'mid' stays set for the indented lines below it.
            if line[0] == '\n': continue
            title = line.strip()
            mid = CACHE_MID.addUnique(title)
            if mid is None:
                continue
            if count % 10000 == 0:
                print 'SCANNING movielinks:', _(title)
        else:
            # Indented line: '  (link type title)'.
            line = line.strip()
            # ASCII-fold only for matching the link-type prefix; the
            # original bytes are kept for the linked title itself.
            link_txt = unicode(line, 'utf_8').encode('ascii', 'replace')
            theid = None
            for k, lenkp1, v in MOVIELINK_IDS:
                if link_txt and link_txt[0] == '(' \
                        and link_txt[1:lenkp1+1] == k:
                    theid = v
                    break
            if theid is None: continue
            # Strip '(TYPE ' and the closing ')'.
            totitle = line[lenkp1+2:-1].strip()
            totitleid = CACHE_MID.addUnique(totitle)
            if totitleid is None:
                continue
            sqldata.add((mid, totitleid, theid))
        count += 1
    sqldata.flush()
    fp.close()
def getQuotes(lines):
    """Parse a movie's quotes.

    Returns a list of quote blocks; within a block the single speaker
    lines are joined with '::'.  Continuation lines are indented with
    two spaces; a blank line terminates the current block.
    """
    quotes = []
    current = []
    for raw in lines:
        continues_previous = (raw and raw[:2] == '  ' and current and
                              current[-1] and
                              not current[-1].endswith('::'))
        if continues_previous:
            # Indented continuation of the previous speaker line.
            text = raw.lstrip()
            if text:
                current[-1] += ' %s' % text
        elif not raw.strip():
            # Blank line: the current quote block is complete.
            if current:
                quotes.append('::'.join(current))
            current = []
        else:
            text = raw.lstrip()
            if text:
                current.append(text)
    if current:
        quotes.append('::'.join(current))
    return quotes
def getBusiness(lines):
    """Movie's business information.

    Parses the 'TAG: value' lines with _parseColonList (using the _bus
    tag map) and replaces currency codes with their symbols.
    """
    bd = _parseColonList(lines, _bus)
    for k in bd.keys():
        nv = []
        for v in bd[k]:
            # The patterns include the trailing space so 'USD 10' becomes
            # '$10'.
            # BUG FIX: the original replaced 'EUR' without the trailing
            # space, leaving a stray space before Euro amounts, unlike
            # the USD and GBP substitutions.
            v = v.replace('USD ', _usd).replace('GBP ',
                        _gbp).replace('EUR ', _eur)
            nv.append(v)
        bd[k] = nv
    return bd
def getLaserDisc(lines):
    """Laserdisc information.

    Parses the 'TAG: value' lines with the _ldk tag map and flattens
    each list of values into a single space-joined string."""
    d = _parseColonList(lines, _ldk)
    # Only values are replaced, so mutating while iterating is safe here.
    for k, v in d.iteritems():
        d[k] = ' '.join(v)
    return d
analyze_title(ton, _emptyString='') tonD['title'] = normalizeTitle(tonD['title']) ton = build_title(tonD, ptdf=1, _emptyString='') # Skips movies that are not already in the cache, since # laserdisc.list.gz is an obsolete file. if ton not in CACHE_MID: continue mopid = CACHE_MID.addUnique(ton) if mopid is None: continue else: mopid = CACHE_PID.addUnique(ton) if count % 6000 == 0: print 'SCANNING %s:' % fname[:-8].replace('-', ' '), print _(ton) d = funct(text.split('\n')) for k, v in d.iteritems(): if k != 'notable tv guest appearances': theid = INFO_TYPES.get(k) if theid is None: print 'WARNING key "%s" of ToN' % k, print _(ton), print 'not in INFO_TYPES' continue if type(v) is _ltype: for i in v: if k == 'notable tv guest appearances': # Put "guest" information in the cast table; these # are a list of Movie object (yes, imdb.Movie.Movie) # FIXME: no more used? title = i.get('long imdb canonical title') if not title: continue movieid = CACHE_MID.addUnique(title) if movieid is None: continue crole = i.currentRole if isinstance(crole, list): crole = ' / '.join([x.get('long imdb name', u'') for x in crole]) if not crole: crole = None else: crole = unicode(crole).encode('utf_8') guestdata.add((mopid, movieid, crole, i.notes or None)) continue if k in ('plot', 'mini biography'): s = i.split('::') if len(s) == 2: #if note: note += ' ' #else: note = '' #note += '(author: %s)' % s[1] note = s[1] i = s[0] if i: sqldata.add((mopid, theid, i, note)) note = None else: if v: sqldata.add((mopid, theid, v, note)) if k in ('nick names', 'birth name') and v: # Put also the birth name/nick names in the list of aliases. if k == 'birth name': realnames = [v] else: realnames = v for realname in realnames: imdbIndex = re_nameImdbIndex.findall(realname) or None if imdbIndex: imdbIndex = imdbIndex[0] realname = re_nameImdbIndex.sub('', realname) if realname: # XXX: check for duplicates? 
##if k == 'birth name': ## realname = canonicalName(realname) ##else: ## realname = normalizeName(realname) namePcodeCf, namePcodeNf, surnamePcode = \ name_soundexes(realname) akanamesdata.add((mopid, realname, imdbIndex, namePcodeCf, namePcodeNf, surnamePcode, md5(realname).hexdigest())) count += 1 if guestdata is not None: guestdata.flush() if akanamesdata is not None: akanamesdata.flush() sqldata.flush() # ============ # Code from the old 'local' data access system. def _parseList(l, prefix, mline=1): """Given a list of lines l, strips prefix and join consecutive lines with the same prefix; if mline is True, there can be multiple info with the same prefix, and the first line starts with 'prefix: * '.""" resl = [] reslapp = resl.append ltmp = [] ltmpapp = ltmp.append fistl = '%s: * ' % prefix otherl = '%s: ' % prefix if not mline: fistl = fistl[:-2] otherl = otherl[:-2] firstlen = len(fistl) otherlen = len(otherl) parsing = 0 joiner = ' '.join for line in l: if line[:firstlen] == fistl: parsing = 1 if ltmp: reslapp(joiner(ltmp)) ltmp[:] = [] data = line[firstlen:].strip() if data: ltmpapp(data) elif mline and line[:otherlen] == otherl: data = line[otherlen:].strip() if data: ltmpapp(data) else: if ltmp: reslapp(joiner(ltmp)) ltmp[:] = [] if parsing: if ltmp: reslapp(joiner(ltmp)) break return resl def _parseBioBy(l): """Return a list of biographies.""" bios = [] biosappend = bios.append tmpbio = [] tmpbioappend = tmpbio.append joiner = ' '.join for line in l: if line[:4] == 'BG: ': tmpbioappend(line[4:].strip()) elif line[:4] == 'BY: ': if tmpbio: biosappend(joiner(tmpbio) + '::' + line[4:].strip()) tmpbio[:] = [] # Cut mini biographies up to 2**16-1 chars, to prevent errors with # some MySQL versions - when used by the imdbpy2sql.py script. 
bios[:] = [bio[:65535] for bio in bios] return bios def _parseBiography(biol): """Parse the biographies.data file.""" res = {} bio = ' '.join(_parseList(biol, 'BG', mline=0)) bio = _parseBioBy(biol) if bio: res['mini biography'] = bio for x in biol: x4 = x[:4] x6 = x[:6] if x4 == 'DB: ': date, notes = date_and_notes(x[4:]) if date: res['birth date'] = date if notes: res['birth notes'] = notes elif x4 == 'DD: ': date, notes = date_and_notes(x[4:]) if date: res['death date'] = date if notes: res['death notes'] = notes elif x6 == 'SP: * ': res.setdefault('spouse', []).append(x[6:].strip()) elif x4 == 'RN: ': n = x[4:].strip() if not n: continue try: rn = build_name(analyze_name(n, canonical=1), canonical=1) res['birth name'] = rn except IMDbParserError: if line: print 'WARNING _parseBiography wrong name:', _(n) continue elif x6 == 'AT: * ': res.setdefault('article', []).append(x[6:].strip()) elif x4 == 'HT: ': res['height'] = x[4:].strip() elif x6 == 'PT: * ': res.setdefault('pictorial', []).append(x[6:].strip()) elif x6 == 'CV: * ': res.setdefault('magazine cover photo', []).append(x[6:].strip()) elif x4 == 'NK: ': res.setdefault('nick names', []).append(normalizeName(x[4:])) elif x6 == 'PI: * ': res.setdefault('portrayed in', []).append(x[6:].strip()) elif x6 == 'SA: * ': sal = x[6:].strip().replace(' -> ', '::') res.setdefault('salary history', []).append(sal) trl = _parseList(biol, 'TR') if trl: res['trivia'] = trl quotes = _parseList(biol, 'QU') if quotes: res['quotes'] = quotes otherworks = _parseList(biol, 'OW') if otherworks: res['other works'] = otherworks books = _parseList(biol, 'BO') if books: res['books'] = books agent = _parseList(biol, 'AG') if agent: res['agent address'] = agent wherenow = _parseList(biol, 'WN') if wherenow: res['where now'] = wherenow[0] biomovies = _parseList(biol, 'BT') if biomovies: res['biographical movies'] = biomovies tm = _parseList(biol, 'TM') if tm: res['trade mark'] = tm interv = _parseList(biol, 'IT') if interv: 
res['interviews'] = interv return res # ============ def doNMMVFiles(): """Files with large sections, about movies and persons.""" for fname, start, funct in [ ('biographies.list.gz', BIO_START, _parseBiography), ('business.list.gz', BUS_START, getBusiness), ('laserdisc.list.gz', LSD_START, getLaserDisc), ('literature.list.gz', LIT_START, getLiterature), ('mpaa-ratings-reasons.list.gz', MPAA_START, getMPAA), ('plot.list.gz', PLOT_START, getPlot)]: ##for fname, start, funct in [('business.list.gz',BUS_START,getBusiness)]: try: fp = SourceFile(fname, start=start) except IOError: continue if fname == 'literature.list.gz': fp.set_stop(LIT_STOP) elif fname == 'business.list.gz': fp.set_stop(BUS_STOP) nmmvFiles(fp, funct, fname) fp.close() t('doNMMVFiles(%s)' % fname[:-8].replace('-', ' ')) def doMovieCompaniesInfo(): """Files with information on a single line about movies, concerning companies.""" sqldata = SQLData(table=MovieCompanies, cols=['movieID', 'companyID', 'companyTypeID', 'note']) for dataf in (('distributors.list.gz', DIS_START), ('miscellaneous-companies.list.gz', MIS_START), ('production-companies.list.gz', PRO_START), ('special-effects-companies.list.gz', SFX_START)): try: fp = SourceFile(dataf[0], start=dataf[1]) except IOError: continue typeindex = dataf[0][:-8].replace('-', ' ') infoid = COMP_TYPES[typeindex] count = 0 for line in fp: data = unpack(line.strip(), ('title', 'company', 'note')) if 'title' not in data: continue if 'company' not in data: continue title = data['title'] company = data['company'] mid = CACHE_MID.addUnique(title) if mid is None: continue cid = CACHE_COMPID.addUnique(company) note = None if 'note' in data: note = data['note'] if count % 10000 == 0: print 'SCANNING %s:' % dataf[0][:-8].replace('-', ' '), print _(data['title']) sqldata.add((mid, cid, infoid, note)) count += 1 sqldata.flush() CACHE_COMPID.flush() fp.close() t('doMovieCompaniesInfo(%s)' % dataf[0][:-8].replace('-', ' ')) def doMiscMovieInfo(): """Files with 
information on a single line about movies.""" for dataf in (('certificates.list.gz',CER_START), ('color-info.list.gz',COL_START), ('countries.list.gz',COU_START), ('genres.list.gz',GEN_START), ('keywords.list.gz',KEY_START), ('language.list.gz',LAN_START), ('locations.list.gz',LOC_START), ('running-times.list.gz',RUN_START), ('sound-mix.list.gz',SOU_START), ('technical.list.gz',TCN_START), ('release-dates.list.gz',RELDATE_START)): try: fp = SourceFile(dataf[0], start=dataf[1]) except IOError: continue typeindex = dataf[0][:-8].replace('-', ' ') if typeindex == 'running times': typeindex = 'runtimes' elif typeindex == 'technical': typeindex = 'tech info' elif typeindex == 'language': typeindex = 'languages' if typeindex != 'keywords': sqldata = SQLData(table=MovieInfo, cols=['movieID', 'infoTypeID', 'info', 'note']) else: sqldata = SQLData(table=MovieKeyword, cols=['movieID', 'keywordID']) infoid = INFO_TYPES[typeindex] count = 0 if dataf[0] == 'locations.list.gz': sqldata.flushEvery = 10000 else: sqldata.flushEvery = 20000 for line in fp: data = unpack(line.strip(), ('title', 'info', 'note')) if 'title' not in data: continue if 'info' not in data: continue title = data['title'] mid = CACHE_MID.addUnique(title) if mid is None: continue note = None if 'note' in data: note = data['note'] if count % 10000 == 0: print 'SCANNING %s:' % dataf[0][:-8].replace('-', ' '), print _(data['title']) info = data['info'] if typeindex == 'keywords': keywordID = CACHE_KWRDID.addUnique(info) sqldata.add((mid, keywordID)) else: sqldata.add((mid, infoid, info, note)) count += 1 sqldata.flush() if typeindex == 'keywords': CACHE_KWRDID.flush() CACHE_KWRDID.clear() fp.close() t('doMiscMovieInfo(%s)' % dataf[0][:-8].replace('-', ' ')) def getRating(): """Movie's rating.""" try: fp = SourceFile('ratings.list.gz', start=RAT_START, stop=RAT_STOP) except IOError: return sqldata = SQLData(table=MovieInfoIdx, cols=['movieID', 'infoTypeID', 'info', 'note']) count = 0 for line in fp: data = 
def getPlot(lines):
    """Movie's plot.

    Each 'PL: ' line extends the current plot text; a 'BY: ' line closes
    it, appending the author after a '::' separator.  Returns a dict
    with a single 'plot' key mapped to the list of collected plots.
    """
    plots = []
    fragments = []
    for entry in lines:
        tag = entry[:4]
        if tag == 'PL: ':
            fragments.append(entry[4:])
        elif tag == 'BY: ':
            plots.append('%s::%s' % (' '.join(fragments),
                                     entry[4:].strip()))
            fragments = []
    return {'plot': plots}
except IOError: continue if fname == 'complete-cast.list.gz': obj = 'cast' else: obj = 'crew' subID = str(CCKind[obj]) sqldata = SQLData(table=CompleteCast, cols=['movieID', RawValue('subjectID', subID), 'statusID']) count = 0 for line in fp: ll = [x for x in line.split('\t') if x] if len(ll) != 2: continue title = ll[0] mid = CACHE_MID.addUnique(title) if mid is None: continue if count % 10000 == 0: print 'SCANNING %s:' % fname[:-8].replace('-', ' '), print _(title) sqldata.add((mid, CCKind[ll[1].lower().strip()])) count += 1 fp.close() sqldata.flush() # global instances CACHE_MID = MoviesCache() CACHE_PID = PersonsCache() CACHE_CID = CharactersCache() CACHE_CID.className = 'CharactersCache' CACHE_COMPID = CompaniesCache() CACHE_KWRDID = KeywordsCache() def _cmpfunc(x, y): """Sort a list of tuples, by the length of the first item (in reverse).""" lx = len(x[0]) ly = len(y[0]) if lx > ly: return -1 elif lx < ly: return 1 return 0 INFO_TYPES = {} MOVIELINK_IDS = [] KIND_IDS = {} KIND_STRS = {} CCAST_TYPES = {} COMP_TYPES = {} def readConstants(): """Read constants from the database.""" global INFO_TYPES, MOVIELINK_IDS, KIND_IDS, KIND_STRS, \ CCAST_TYPES, COMP_TYPES for x in InfoType.select(): INFO_TYPES[x.info] = x.id for x in LinkType.select(): MOVIELINK_IDS.append((x.link, len(x.link), x.id)) MOVIELINK_IDS.sort(_cmpfunc) for x in KindType.select(): KIND_IDS[x.kind] = x.id KIND_STRS[x.id] = x.kind for x in CompCastType.select(): CCAST_TYPES[x.kind] = x.id for x in CompanyType.select(): COMP_TYPES[x.kind] = x.id def _imdbIDsFileName(fname): """Return a file name, adding the optional CSV_DIR directory.""" return os.path.join(*(filter(None, [CSV_DIR, fname]))) def _countRows(tableName): """Return the number of rows in a table.""" try: CURS.execute('SELECT COUNT(*) FROM %s' % tableName) return (CURS.fetchone() or [0])[0] except Exception, e: print 'WARNING: unable to count rows of table %s: %s' % (tableName, e) return 0 def storeNotNULLimdbIDs(cls): """Store in a 
temporary table or in a dbm database a mapping between md5sum (of title or name) and imdbID, when the latter is present in the database.""" if cls is Title: cname = 'movies' elif cls is Name: cname = 'people' elif cls is CompanyName: cname = 'companies' else: cname = 'characters' table_name = tableName(cls) md5sum_col = colName(cls, 'md5sum') imdbID_col = colName(cls, 'imdbID') print 'SAVING imdbID values for %s...' % cname, sys.stdout.flush() if _get_imdbids_method() == 'table': try: try: CURS.execute('DROP TABLE %s_extract' % table_name) except: pass try: CURS.execute('SELECT * FROM %s LIMIT 1' % table_name) except Exception, e: print 'missing "%s" table (ok if this is the first run)' % table_name return query = 'CREATE TEMPORARY TABLE %s_extract AS SELECT %s, %s FROM %s WHERE %s IS NOT NULL' % \ (table_name, md5sum_col, imdbID_col, table_name, imdbID_col) CURS.execute(query) CURS.execute('CREATE INDEX %s_md5sum_idx ON %s_extract (%s)' % (table_name, table_name, md5sum_col)) CURS.execute('CREATE INDEX %s_imdbid_idx ON %s_extract (%s)' % (table_name, table_name, imdbID_col)) rows = _countRows('%s_extract' % table_name) print 'DONE! (%d entries using a temporary table)' % rows return except Exception, e: print 'WARNING: unable to store imdbIDs in a temporary table (falling back to dbm): %s' % e try: db = anydbm.open(_imdbIDsFileName('%s_imdbIDs.db' % cname), 'c') except Exception, e: print 'WARNING: unable to store imdbIDs: %s' % str(e) return try: CURS.execute('SELECT %s, %s FROM %s WHERE %s IS NOT NULL' % (md5sum_col, imdbID_col, table_name, imdbID_col)) res = CURS.fetchmany(10000) while res: db.update(dict((str(x[0]), str(x[1])) for x in res)) res = CURS.fetchmany(10000) except Exception, e: print 'SKIPPING: unable to retrieve data: %s' % e return print 'DONE! 
(%d entries)' % len(db) db.close() return def iterbatch(iterable, size): """Process an iterable 'size' items at a time.""" sourceiter = iter(iterable) while True: batchiter = islice(sourceiter, size) yield chain([batchiter.next()], batchiter) def restoreImdbIDs(cls): """Restore imdbIDs for movies, people, companies and characters.""" if cls is Title: cname = 'movies' elif cls is Name: cname = 'people' elif cls is CompanyName: cname = 'companies' else: cname = 'characters' print 'RESTORING imdbIDs values for %s...' % cname, sys.stdout.flush() table_name = tableName(cls) md5sum_col = colName(cls, 'md5sum') imdbID_col = colName(cls, 'imdbID') if _get_imdbids_method() == 'table': try: try: CURS.execute('SELECT * FROM %s_extract LIMIT 1' % table_name) except Exception, e: raise Exception('missing "%s_extract" table (ok if this is the first run)' % table_name) if DB_NAME == 'mysql': query = 'UPDATE %s INNER JOIN %s_extract USING (%s) SET %s.%s = %s_extract.%s' % \ (table_name, table_name, md5sum_col, table_name, imdbID_col, table_name, imdbID_col) else: query = 'UPDATE %s SET %s = %s_extract.%s FROM %s_extract WHERE %s.%s = %s_extract.%s' % \ (table_name, imdbID_col, table_name, imdbID_col, table_name, table_name, md5sum_col, table_name, md5sum_col) CURS.execute(query) affected_rows = 'an unknown number of' try: CURS.execute('SELECT COUNT(*) FROM %s WHERE %s IS NOT NULL' % (table_name, imdbID_col)) affected_rows = (CURS.fetchone() or [0])[0] except Exception, e: pass rows = _countRows('%s_extract' % table_name) print 'DONE! 
(restored %s entries out of %d)' % (affected_rows, rows) t('restore %s' % cname) try: CURS.execute('DROP TABLE %s_extract' % table_name) except: pass return except Exception, e: print 'WARNING: unable to restore imdbIDs using the temporary table (falling back to dbm): %s' % e try: db = anydbm.open(_imdbIDsFileName('%s_imdbIDs.db' % cname), 'r') except Exception, e: print 'WARNING: unable to restore imdbIDs (ok if this is the first run)' return count = 0 sql = "UPDATE " + table_name + " SET " + imdbID_col + \ " = CASE " + md5sum_col + " %s END WHERE " + \ md5sum_col + " IN (%s)" def _restore(query, batch): """Execute a query to restore a batch of imdbIDs""" items = list(batch) case_clause = ' '.join("WHEN '%s' THEN %s" % (k, v) for k, v in items) where_clause = ', '.join("'%s'" % x[0] for x in items) success = _executeQuery(query % (case_clause, where_clause)) if success: return len(items) return 0 for batch in iterbatch(db.iteritems(), 10000): count += _restore(sql, batch) print 'DONE! (restored %d entries out of %d)' % (count, len(db)) t('restore %s' % cname) db.close() return def restoreAll_imdbIDs(): """Restore imdbIDs for movies, persons, companies and characters.""" # Restoring imdbIDs for movies and persons (moved after the # built of indexes, so that it can take advantage of them). 
runSafely(restoreImdbIDs, 'failed to restore imdbIDs for movies', None, Title) runSafely(restoreImdbIDs, 'failed to restore imdbIDs for people', None, Name) runSafely(restoreImdbIDs, 'failed to restore imdbIDs for characters', None, CharName) runSafely(restoreImdbIDs, 'failed to restore imdbIDs for companies', None, CompanyName) def runSafely(funct, fmsg, default, *args, **kwds): """Run the function 'funct' with arguments args and kwds, catching every exception; fmsg is printed out (along with the exception message) in case of trouble; the return value of the function is returned (or 'default').""" try: return funct(*args, **kwds) except Exception, e: print 'WARNING: %s: %s' % (fmsg, e) return default def _executeQuery(query): """Execute a query on the CURS object.""" if len(query) > 60: s_query = query[:60] + '...' else: s_query = query print 'EXECUTING "%s"...' % (s_query), sys.stdout.flush() try: CURS.execute(query) print 'DONE!' return True except Exception, e: print 'FAILED (%s)!' % e return False def executeCustomQueries(when, _keys=None, _timeit=True): """Run custom queries as specified on the command line.""" if _keys is None: _keys = {} for query in CUSTOM_QUERIES.get(when, []): print 'EXECUTING "%s:%s"...' % (when, query) sys.stdout.flush() if query.startswith('FOR_EVERY_TABLE:'): query = query[16:] CURS.execute('SHOW TABLES;') tables = [x[0] for x in CURS.fetchall()] for table in tables: try: keys = {'table': table} keys.update(_keys) _executeQuery(query % keys) if _timeit: t('%s command' % when) except Exception, e: print 'FAILED (%s)!' % e continue else: try: _executeQuery(query % _keys) except Exception, e: print 'FAILED (%s)!' % e continue if _timeit: t('%s command' % when) def buildIndexesAndFK(): """Build indexes and Foreign Keys.""" executeCustomQueries('BEFORE_INDEXES') print 'building database indexes (this may take a while)' sys.stdout.flush() # Build database indexes. 
idx_errors = createIndexes(DB_TABLES) for idx_error in idx_errors: print 'ERROR caught exception creating an index: %s' % idx_error t('createIndexes()') print 'adding foreign keys (this may take a while)' sys.stdout.flush() # Add FK. fk_errors = createForeignKeys(DB_TABLES) for fk_error in fk_errors: print 'ERROR caught exception creating a foreign key: %s' % fk_error t('createForeignKeys()') def restoreCSV(): """Only restore data from a set of CSV files.""" CSV_CURS.buildFakeFileNames() print 'loading CSV files into the database' executeCustomQueries('BEFORE_CSV_LOAD') loadCSVFiles() t('loadCSVFiles()') executeCustomQueries('BEFORE_RESTORE') t('TOTAL TIME TO LOAD CSV FILES', sinceBegin=True) buildIndexesAndFK() restoreAll_imdbIDs() executeCustomQueries('END') t('FINAL', sinceBegin=True) # begin the iterations... def run(): print 'RUNNING imdbpy2sql.py using the %s ORM' % USED_ORM executeCustomQueries('BEGIN') # Storing imdbIDs for movies and persons. runSafely(storeNotNULLimdbIDs, 'failed to read imdbIDs for movies', None, Title) runSafely(storeNotNULLimdbIDs, 'failed to read imdbIDs for people', None, Name) runSafely(storeNotNULLimdbIDs, 'failed to read imdbIDs for characters', None, CharName) runSafely(storeNotNULLimdbIDs, 'failed to read imdbIDs for companies', None, CompanyName) # Truncate the current database. print 'DROPPING current database...', sys.stdout.flush() dropTables(DB_TABLES) print 'DONE!' executeCustomQueries('BEFORE_CREATE') # Rebuild the database structure. print 'CREATING new tables...', sys.stdout.flush() createTables(DB_TABLES) print 'DONE!' t('dropping and recreating the database') executeCustomQueries('AFTER_CREATE') # Read the constants. readConstants() # Populate the CACHE_MID instance. readMovieList() # Comment readMovieList() and uncomment the following two lines # to keep the current info in the name and title tables. 
##CACHE_MID.populate() t('readMovieList()') executeCustomQueries('BEFORE_COMPANIES') # distributors, miscellaneous-companies, production-companies, # special-effects-companies. ##CACHE_COMPID.populate() doMovieCompaniesInfo() # Do this now, and free some memory. CACHE_COMPID.flush() CACHE_COMPID.clear() executeCustomQueries('BEFORE_CAST') # actors, actresses, producers, writers, cinematographers, composers, # costume-designers, directors, editors, miscellaneous, # production-designers. castLists() ##CACHE_PID.populate() ##CACHE_CID.populate() # Aka names and titles. doAkaNames() t('doAkaNames()') doAkaTitles() t('doAkaTitles()') # alternate-versions, goofs, crazy-credits, quotes, soundtracks, trivia. doMinusHashFiles() t('doMinusHashFiles()') # biographies, business, laserdisc, literature, mpaa-ratings-reasons, plot. doNMMVFiles() # certificates, color-info, countries, genres, keywords, language, # locations, running-times, sound-mix, technical, release-dates. doMiscMovieInfo() # movie-links. doMovieLinks() t('doMovieLinks()') # ratings. getRating() t('getRating()') # taglines. getTaglines() t('getTaglines()') # ratings (top 250 and bottom 10 movies). getTopBottomRating() t('getTopBottomRating()') # complete-cast, complete-crew. completeCast() t('completeCast()') if CSV_DIR: CSV_CURS.closeAll() # Flush caches. 
CACHE_MID.flush() CACHE_PID.flush() CACHE_CID.flush() CACHE_MID.clear() CACHE_PID.clear() CACHE_CID.clear() t('fushing caches...') if CSV_ONLY_WRITE: t('TOTAL TIME TO WRITE CSV FILES', sinceBegin=True) executeCustomQueries('END') t('FINAL', sinceBegin=True) return if CSV_DIR: print 'loading CSV files into the database' executeCustomQueries('BEFORE_CSV_LOAD') loadCSVFiles() t('loadCSVFiles()') executeCustomQueries('BEFORE_RESTORE') t('TOTAL TIME TO INSERT/WRITE DATA', sinceBegin=True) buildIndexesAndFK() restoreAll_imdbIDs() executeCustomQueries('END') t('FINAL', sinceBegin=True) _HEARD = 0 def _kdb_handler(signum, frame): """Die gracefully.""" global _HEARD if _HEARD: print "EHI! DON'T PUSH ME! I'VE HEARD YOU THE FIRST TIME! :-)" return print 'INTERRUPT REQUEST RECEIVED FROM USER. FLUSHING CACHES...' _HEARD = 1 # XXX: trap _every_ error? try: CACHE_MID.flush() except IntegrityError: pass try: CACHE_PID.flush() except IntegrityError: pass try: CACHE_CID.flush() except IntegrityError: pass try: CACHE_COMPID.flush() except IntegrityError: pass print 'DONE! (in %d minutes, %d seconds)' % \ divmod(int(time.time())-BEGIN_TIME, 60) sys.exit() if __name__ == '__main__': try: print 'IMPORTING psyco...', sys.stdout.flush() #import DONOTIMPORTPSYCO import psyco #psyco.log() psyco.profile() print 'DONE!' 
print '' except ImportError: print 'FAILED (not a big deal, everything is alright...)' print '' import signal signal.signal(signal.SIGINT, _kdb_handler) if CSV_ONLY_LOAD: restoreCSV() else: run() ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/search_person.py���������������������������������������������������������������������0000755�0000000�0000000�00000002534�11766731642�015111� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ search_person.py Usage: search_person "person name" Search for the given name and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "person name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of Person objects). results = i.search_person(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) # Print the results. 
print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], name.encode(out_encoding, 'replace')) print 'personID\t: imdbID : name' # Print the long imdb name for every person. for person in results: outp = u'%s\t: %s : %s' % (person.personID, i.get_imdbID(person), person['long imdb name']) print outp.encode(out_encoding, 'replace') ��������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_company.py�����������������������������������������������������������������������0000755�0000000�0000000�00000002775�11766731642�014572� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_company.py Usage: get_company "companyID" Show some info about the company with the given companyID (e.g. '0071509' for "Columbia Pictures [us]", using 'http' or 'mobile'). Notice that companyID, using 'sql', are not the same IDs used on the web. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "companyID"' % sys.argv[0] sys.exit(2) companyID = sys.argv[1] i = imdb.IMDb() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() try: # Get a company object with the data about the company identified by # the given companyID. company = i.get_company(companyID) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) if not company: print 'It seems that there\'s no company with companyID "%s"' % companyID sys.exit(4) # XXX: this is the easier way to print the main info about a company; # calling the summary() method of a company object will returns a string # with the main information about the company. # Obviously it's not really meaningful if you want to know how # to access the data stored in a company object, so look below; the # commented lines show some ways to retrieve information from a # company object. print company.summary().encode(out_encoding, 'replace') ���IMDbPY-4.9/bin/get_first_movie.py�������������������������������������������������������������������0000755�0000000�0000000�00000002513�11766731642�015440� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_first_movie.py Usage: get_first_movie "movie title" Search for the given title and print the best matching result. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "movie title"' % sys.argv[0] sys.exit(2) title = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() title = unicode(title, in_encoding, 'replace') try: # Do the search, and get the results (a list of Movie objects). results = i.search_movie(title) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) if not results: print 'No matches for "%s", sorry.' % title.encode(out_encoding, 'replace') sys.exit(0) # Print only the first result. print ' Best match for "%s"' % title.encode(out_encoding, 'replace') # This is a Movie instance. movie = results[0] # So far the Movie object only contains basic information like the # title and the year; retrieve main information: i.update(movie) print movie.summary().encode(out_encoding, 'replace') �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_person.py������������������������������������������������������������������������0000755�0000000�0000000�00000005416�11766731642�014425� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_person.py Usage: get_person "personID" Show some info about the person with the given personID (e.g. '0000210' for "Julia Roberts". Notice that personID, using 'sql', are not the same IDs used on the web. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "personID"' % sys.argv[0] sys.exit(2) personID = sys.argv[1] i = imdb.IMDb() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() try: # Get a Person object with the data about the person identified by # the given personID. person = i.get_person(personID) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) if not person: print 'It seems that there\'s no person with personID "%s"' % personID sys.exit(4) # XXX: this is the easier way to print the main info about a person; # calling the summary() method of a Person object will returns a string # with the main information about the person. # Obviously it's not really meaningful if you want to know how # to access the data stored in a Person object, so look below; the # commented lines show some ways to retrieve information from a # Person object. print person.summary().encode(out_encoding, 'replace') # Show some info about the person. # This is only a short example; you can get a longer summary using # 'print person.summary()' and the complete set of information looking for # the output of the person.keys() method. #print '==== "%s" / personID: %s ====' % (person['name'], personID) # XXX: use the IMDb instance to get the IMDb web URL for the person. #imdbURL = i.get_imdbURL(person) #if imdbURL: # print 'IMDb URL: %s' % imdbURL # XXX: print the birth date and birth notes. #d_date = person.get('birth date') #if d_date: # print 'Birth date: %s' % d_date # b_notes = person.get('birth notes') # if b_notes: # print 'Birth notes: %s' % b_notes # XXX: print the last five movies he/she acted in, and the played role. #movies_acted = person.get('actor') or person.get('actress') #if movies_acted: # print 'Last roles played: ' # for movie in movies_acted[:5]: # print ' %s (in "%s")' % (movie.currentRole, movie['title']) # XXX: example of the use of information sets. 
#import random #i.update(person, info=['awards']) #awards = person.get('awards') #if awards: # rand_award = awards[random.randrange(len(awards))] # s = 'Random award: in year ' # s += rand_award.get('year', '') # s += ' %s "%s"' % (rand_award.get('result', '').lower(), # rand_award.get('award', '')) # print s ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_character.py���������������������������������������������������������������������0000755�0000000�0000000�00000003042�11766731642�015044� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_character.py Usage: get_character "characterID" Show some info about the character with the given characterID (e.g. '0000001' for "Jesse James", using 'http' or 'mobile'). Notice that characterID, using 'sql', are not the same IDs used on the web. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "characterID"' % sys.argv[0] sys.exit(2) characterID = sys.argv[1] i = imdb.IMDb() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() try: # Get a character object with the data about the character identified by # the given characterID. character = i.get_character(characterID) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) if not character: print 'It seems that there\'s no character with characterID "%s"' % characterID sys.exit(4) # XXX: this is the easier way to print the main info about a character; # calling the summary() method of a character object will returns a string # with the main information about the character. # Obviously it's not really meaningful if you want to know how # to access the data stored in a character object, so look below; the # commented lines show some ways to retrieve information from a # character object. print character.summary().encode(out_encoding, 'replace') ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_first_person.py������������������������������������������������������������������0000755�0000000�0000000�00000002477�11766731642�015640� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_first_person.py Usage: get_first_person "person name" Search for the given name and print the best matching result. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' 
sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "person name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of Person objects). results = i.search_person(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) if not results: print 'No matches for "%s", sorry.' % name.encode(out_encoding, 'replace') sys.exit(0) # Print only the first result. print ' Best match for "%s"' % name.encode(out_encoding, 'replace') # This is a Person instance. person = results[0] # So far the Person object only contains basic information like the # name; retrieve main information: i.update(person) print person.summary().encode(out_encoding, 'replace') �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/search_character.py������������������������������������������������������������������0000755�0000000�0000000�00000002605�11766731642�015536� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ search_character.py Usage: search_character "character name" Search for the given name and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' 
sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "character name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of character objects). results = i.search_character(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) # Print the results. print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], name.encode(out_encoding, 'replace')) print 'characterID\t: imdbID : name' # Print the long imdb name for every character. for character in results: outp = u'%s\t\t: %s : %s' % (character.characterID, i.get_imdbID(character), character['long imdb name']) print outp.encode(out_encoding, 'replace') ���������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_first_company.py�����������������������������������������������������������������0000755�0000000�0000000�00000002512�11766731642�015766� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_first_company.py Usage: get_first_company "company name" Search for the given name and print the best matching result. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' 
sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "company name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of company objects). results = i.search_company(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) if not results: print 'No matches for "%s", sorry.' % name.encode(out_encoding, 'replace') sys.exit(0) # Print only the first result. print ' Best match for "%s"' % name.encode(out_encoding, 'replace') # This is a company instance. company = results[0] # So far the company object only contains basic information like the # name; retrieve main information: i.update(company) print company.summary().encode(out_encoding, 'replace') ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_keyword.py�����������������������������������������������������������������������0000755�0000000�0000000�00000002445�11766731642�014602� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_keyword.py Usage: get_keyword "keyword" search for movies tagged with the given keyword and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' 
sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "keyword"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of movies). results = i.get_keyword(name, results=20) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) # Print the results. print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], name.encode(out_encoding, 'replace')) print ' : movie title' # Print the long imdb title for every movie. for idx, movie in enumerate(results): outp = u'%d: %s' % (idx+1, movie['long imdb title']) print outp.encode(out_encoding, 'replace') ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_top_bottom_movies.py�������������������������������������������������������������0000755�0000000�0000000�00000001566�11766731642�016671� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_top_bottom_movies.py Usage: get_top_bottom_movies Return top and bottom 10 movies, by ratings. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 1: print 'No arguments are required.' 
sys.exit(2) i = imdb.IMDb() top250 = i.get_top250_movies() bottom100 = i.get_bottom100_movies() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() for label, ml in [('top 10', top250[:10]), ('bottom 10', bottom100[:10])]: print '' print '%s movies' % label print 'rating\tvotes\ttitle' for movie in ml: outl = u'%s\t%s\t%s' % (movie.get('rating'), movie.get('votes'), movie['long imdb title']) print outl.encode(out_encoding, 'replace') ������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/search_keyword.py��������������������������������������������������������������������0000755�0000000�0000000�00000002417�11766731642�015267� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ search_keyword.py Usage: search_keyword "keyword" Search for keywords similar to the give one and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "keyword name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of keyword strings). results = i.search_keyword(name, results=20) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) # Print the results. 
print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], name.encode(out_encoding, 'replace')) print ' : keyword' # Print every keyword. for idx, keyword in enumerate(results): outp = u'%d: %s' % (idx+1, keyword) print outp.encode(out_encoding, 'replace') �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_first_character.py���������������������������������������������������������������0000755�0000000�0000000�00000002540�11766731642�016255� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_first_character.py Usage: get_first_character "character name" Search for the given name and print the best matching result. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "character name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of character objects). results = i.search_character(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) if not results: print 'No matches for "%s", sorry.' 
% name.encode(out_encoding, 'replace') sys.exit(0) # Print only the first result. print ' Best match for "%s"' % name.encode(out_encoding, 'replace') # This is a character instance. character = results[0] # So far the character object only contains basic information like the # name; retrieve main information: i.update(character) print character.summary().encode(out_encoding, 'replace') ����������������������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/search_company.py��������������������������������������������������������������������0000755�0000000�0000000�00000002553�11766731642�015252� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ search_company.py Usage: search_company "company name" Search for the given name and print the results. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "company name"' % sys.argv[0] sys.exit(2) name = sys.argv[1] i = imdb.IMDb() in_encoding = sys.stdin.encoding or sys.getdefaultencoding() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() name = unicode(name, in_encoding, 'replace') try: # Do the search, and get the results (a list of company objects). results = i.search_company(name) except imdb.IMDbError, e: print "Probably you're not connected to Internet. Complete error report:" print e sys.exit(3) # Print the results. 
print ' %s result%s for "%s":' % (len(results), ('', 's')[len(results) != 1], name.encode(out_encoding, 'replace')) print 'companyID\t: imdbID : name' # Print the long imdb name for every company. for company in results: outp = u'%s\t\t: %s : %s' % (company.companyID, i.get_imdbID(company), company['long imdb name']) print outp.encode(out_encoding, 'replace') �����������������������������������������������������������������������������������������������������������������������������������������������������IMDbPY-4.9/bin/get_movie.py�������������������������������������������������������������������������0000755�0000000�0000000�00000006462�11766731642�014240� 0����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/env python """ get_movie.py Usage: get_movie "movieID" Show some info about the movie with the given movieID (e.g. '0133093' for "The Matrix", using 'http' or 'mobile'). Notice that movieID, using 'sql', are not the same IDs used on the web. """ import sys # Import the IMDbPY package. try: import imdb except ImportError: print 'You bad boy! You need to install the IMDbPY package!' sys.exit(1) if len(sys.argv) != 2: print 'Only one argument is required:' print ' %s "movieID"' % sys.argv[0] sys.exit(2) movieID = sys.argv[1] i = imdb.IMDb() out_encoding = sys.stdout.encoding or sys.getdefaultencoding() try: # Get a Movie object with the data about the movie identified by # the given movieID. movie = i.get_movie(movieID) except imdb.IMDbError, e: print "Probably you're not connected to Internet. 
Complete error report:" print e sys.exit(3) if not movie: print 'It seems that there\'s no movie with movieID "%s"' % movieID sys.exit(4) # XXX: this is the easier way to print the main info about a movie; # calling the summary() method of a Movie object will returns a string # with the main information about the movie. # Obviously it's not really meaningful if you want to know how # to access the data stored in a Movie object, so look below; the # commented lines show some ways to retrieve information from a # Movie object. print movie.summary().encode(out_encoding, 'replace') # Show some info about the movie. # This is only a short example; you can get a longer summary using # 'print movie.summary()' and the complete set of information looking for # the output of the movie.keys() method. #print '==== "%s" / movieID: %s ====' % (movie['title'], movieID) # XXX: use the IMDb instance to get the IMDb web URL for the movie. #imdbURL = i.get_imdbURL(movie) #if imdbURL: # print 'IMDb URL: %s' % imdbURL # # XXX: many keys return a list of values, like "genres". #genres = movie.get('genres') #if genres: # print 'Genres: %s' % ' '.join(genres) # # XXX: even when only one value is present (e.g.: movie with only one # director), fields that can be multiple are ALWAYS a list. # Note that the 'name' variable is a Person object, but since its # __str__() method returns a string with the name, we can use it # directly, instead of name['name'] #director = movie.get('director') #if director: # print 'Director(s): ', # for name in director: # sys.stdout.write('%s ' % name) # print '' # # XXX: notice that every name in the cast is a Person object, with a # currentRole instance variable, which is a string for the played role. #cast = movie.get('cast') #if cast: # print 'Cast: ' # cast = cast[:5] # for name in cast: # print ' %s (%s)' % (name['name'], name.currentRole) # XXX: some information are not lists of strings or Person objects, but simple # strings, like 'rating'. 
#rating = movie.get('rating') #if rating: # print 'Rating: %s' % rating # XXX: an example of how to use information sets; retrieve the "trivia" # info set; check if it contains some data, select and print a # random entry. #import random #i.update(movie, info=['trivia']) #trivia = movie.get('trivia') #if trivia: # rand_trivia = trivia[random.randrange(len(trivia))] # print 'Random trivia: %s' % rand_trivia ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������